3 namespace Drupal\Tests\search\Functional;
4 use Drupal\Component\Utility\Unicode;
 * Tests that the CJK tokenizer works as intended.
11 class SearchTokenizerTest extends SearchTestBase {
/**
 * Verifies that strings of CJK characters are tokenized.
 *
 * The search_simplify() function does special things with numbers, symbols,
 * and punctuation. So we only test that CJK characters that are not in these
 * character classes are tokenized properly. See PREG_CLASS_CJK for more
 * information.
 */
public function testTokenizer() {
  // Set the minimum word size to 1 (to split all CJK characters) and make
  // sure CJK tokenizing is turned on.
  $this->config('search.settings')
    ->set('index.minimum_word_size', 1)
    ->set('index.overlap_cjk', TRUE)
    ->save();
  $this->refreshVariables();

  // Character ranges from the Unicode tables, each given as
  // [first code point, last code point]. Keeping both bounds in a single
  // entry guarantees every range has a matching start and end.
  $ranges = [
    'CJK unified' => [0x4e00, 0x9fcf],
    'CJK Ext A' => [0x3400, 0x4dbf],
    'CJK Compat' => [0xf900, 0xfaff],
    'Hangul Jamo' => [0x1100, 0x11ff],
    'Hangul Ext A' => [0xa960, 0xa97f],
    'Hangul Ext B' => [0xd7b0, 0xd7ff],
    'Hangul Compat' => [0x3131, 0x318e],
    'Half non-punct 1' => [0xff21, 0xff3a],
    'Half non-punct 2' => [0xff41, 0xff5a],
    'Half non-punct 3' => [0xff66, 0xffdc],
    'Hangul Syllables' => [0xac00, 0xd7af],
    'Katakana Ext' => [0x31f0, 0x31ff],
    'CJK Reserve 1' => [0x20000, 0x2fffd],
    'CJK Reserve 2' => [0x30000, 0x3fffd],
    'Bomofo Ext' => [0x31a0, 0x31b7],
  ];

  // Generate characters consisting of the start, midpoint, and end of each
  // range.
  $chars = [];
  foreach ($ranges as $range) {
    list($start, $end) = $range;
    // round() returns a float; cast it so that code2utf() performs its bit
    // arithmetic on an integer code point. The rounding itself is unchanged.
    $mid = (int) round(0.5 * ($start + $end));
    $chars[] = $this->code2utf($start);
    $chars[] = $this->code2utf($mid);
    $chars[] = $this->code2utf($end);
  }

  // Merge into a string and tokenize.
  $string = implode('', $chars);
  $out = trim(search_simplify($string));
  // The expected output is the lowercased characters separated by single
  // spaces.
  $expected = Unicode::strtolower(implode(' ', $chars));

  // Verify that the output matches what we expect.
  $this->assertEqual($out, $expected, 'CJK tokenizer worked on all supplied CJK characters');
}
/**
 * Verifies that strings of non-CJK characters are not tokenized.
 *
 * This is just a sanity check - it verifies that strings of letters are
 * not tokenized.
 */
public function testNoTokenizer() {
  // Turn CJK tokenizing on with a minimum word size of 1, the same settings
  // testTokenizer() uses, so that this test checks the letters below are left
  // intact while CJK splitting is active.
  $this->config('search.settings')
    ->set('index.minimum_word_size', 1)
    ->set('index.overlap_cjk', TRUE)
    ->save();
  $this->refreshVariables();

  $letters = 'abcdefghijklmnopqrstuvwxyz';
  $out = trim(search_simplify($letters));

  // Pass the simplified output first and the expected value second, matching
  // the argument order used by testTokenizer().
  $this->assertEqual($out, $letters, 'Letters are not CJK tokenized');
}
 * Like the PHP chr() function, but for Unicode characters.
 *
 * chr() only produces a single byte (values 0 to 255). This function converts
 * a Unicode code point to the corresponding UTF-8 encoded character. Adapted
 * from functions supplied in comments on several functions on php.net.
131 public function code2utf($num) {
137 return chr(($num >> 6) + 192) . chr(($num & 63) + 128);
141 return chr(($num >> 12) + 224) . chr((($num >> 6) & 63) + 128) . chr(($num & 63) + 128);
144 if ($num < 2097152) {
145 return chr(($num >> 18) + 240) . chr((($num >> 12) & 63) + 128) . chr((($num >> 6) & 63) + 128) . chr(($num & 63) + 128);