config('search.settings') ->set('index.minimum_word_size', 1) ->set('index.overlap_cjk', TRUE) ->save(); $this->refreshVariables(); // Create a string of CJK characters from various character ranges in // the Unicode tables. // Beginnings of the character ranges. $starts = [ 'CJK unified' => 0x4e00, 'CJK Ext A' => 0x3400, 'CJK Compat' => 0xf900, 'Hangul Jamo' => 0x1100, 'Hangul Ext A' => 0xa960, 'Hangul Ext B' => 0xd7b0, 'Hangul Compat' => 0x3131, 'Half non-punct 1' => 0xff21, 'Half non-punct 2' => 0xff41, 'Half non-punct 3' => 0xff66, 'Hangul Syllables' => 0xac00, 'Hiragana' => 0x3040, 'Katakana' => 0x30a1, 'Katakana Ext' => 0x31f0, 'CJK Reserve 1' => 0x20000, 'CJK Reserve 2' => 0x30000, 'Bomofo' => 0x3100, 'Bomofo Ext' => 0x31a0, 'Lisu' => 0xa4d0, 'Yi' => 0xa000, ]; // Ends of the character ranges. $ends = [ 'CJK unified' => 0x9fcf, 'CJK Ext A' => 0x4dbf, 'CJK Compat' => 0xfaff, 'Hangul Jamo' => 0x11ff, 'Hangul Ext A' => 0xa97f, 'Hangul Ext B' => 0xd7ff, 'Hangul Compat' => 0x318e, 'Half non-punct 1' => 0xff3a, 'Half non-punct 2' => 0xff5a, 'Half non-punct 3' => 0xffdc, 'Hangul Syllables' => 0xd7af, 'Hiragana' => 0x309f, 'Katakana' => 0x30ff, 'Katakana Ext' => 0x31ff, 'CJK Reserve 1' => 0x2fffd, 'CJK Reserve 2' => 0x3fffd, 'Bomofo' => 0x312f, 'Bomofo Ext' => 0x31b7, 'Lisu' => 0xa4fd, 'Yi' => 0xa48f, ]; // Generate characters consisting of starts, midpoints, and ends. $chars = []; $charcodes = []; foreach ($starts as $key => $value) { $charcodes[] = $starts[$key]; $chars[] = $this->code2utf($starts[$key]); $mid = round(0.5 * ($starts[$key] + $ends[$key])); $charcodes[] = $mid; $chars[] = $this->code2utf($mid); $charcodes[] = $ends[$key]; $chars[] = $this->code2utf($ends[$key]); } // Merge into a string and tokenize. $string = implode('', $chars); $out = trim(search_simplify($string)); $expected = Unicode::strtolower(implode(' ', $chars)); // Verify that the output matches what we expect. 
$this->assertEqual($out, $expected, 'CJK tokenizer worked on all supplied CJK characters');
  }

  /**
   * Verifies that strings of non-CJK characters are not tokenized.
   *
   * Sanity check: a run of plain Latin letters passed through
   * search_simplify() must come back unchanged, even with CJK handling
   * enabled.
   */
  public function testNoTokenizer() {
    // Set the minimum word size to 1 (to split all CJK characters) and make
    // sure CJK tokenizing is turned on.
    $this->config('search.settings')
      ->set('index.minimum_word_size', 1)
      ->set('index.overlap_cjk', TRUE)
      ->save();
    $this->refreshVariables();

    $letters = 'abcdefghijklmnopqrstuvwxyz';
    $out = trim(search_simplify($letters));

    // Pass the computed value first and the expected value second, the same
    // argument order the CJK tokenizer assertion uses.
    $this->assertEqual($out, $letters, 'Letters are not CJK tokenized');
  }

  /**
   * Encodes a Unicode code point as a UTF-8 string.
   *
   * Like the PHP chr() function, but for Unicode characters: chr() only
   * produces a single byte (values up to 255), whereas this method builds the
   * one- to four-byte UTF-8 sequence for the given code point. Adapted from
   * functions supplied in comments on several functions on php.net.
   *
   * @param int $num
   *   The code point to encode. Integral floats (for example the result of
   *   round()) are accepted as well.
   *
   * @return string
   *   The UTF-8 encoding of the code point, or an empty string when the value
   *   cannot be represented in UTF-8: negative numbers, the surrogate range
   *   U+D800..U+DFFF, and anything above U+10FFFF.
   */
  public function code2utf($num) {
    // Negative values and UTF-16 surrogates are not Unicode scalar values;
    // encoding them would yield byte sequences that are not valid UTF-8.
    if ($num < 0 || ($num >= 0xD800 && $num <= 0xDFFF)) {
      return '';
    }

    // One byte: 0xxxxxxx.
    if ($num < 0x80) {
      return chr($num);
    }

    // Two bytes: 110xxxxx 10xxxxxx.
    if ($num < 0x800) {
      return chr(($num >> 6) | 0xC0)
        . chr(($num & 0x3F) | 0x80);
    }

    // Three bytes: 1110xxxx 10xxxxxx 10xxxxxx.
    if ($num < 0x10000) {
      return chr(($num >> 12) | 0xE0)
        . chr((($num >> 6) & 0x3F) | 0x80)
        . chr(($num & 0x3F) | 0x80);
    }

    // Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx. UTF-8 stops at
    // U+10FFFF, so larger values are rejected below instead of encoded.
    if ($num <= 0x10FFFF) {
      return chr(($num >> 18) | 0xF0)
        . chr((($num >> 12) & 0x3F) | 0x80)
        . chr((($num >> 6) & 0x3F) | 0x80)
        . chr(($num & 0x3F) | 0x80);
    }

    return '';
  }

}