www.aleph1.co.uk Git - yaffs-website/blob - web/core/modules/search/tests/src/Functional/SearchTokenizerTest.php

   1 <?php
   2
   3 namespace Drupal\Tests\search\Functional;
   4 use Drupal\Component\Utility\Unicode;
   5
   6 /**
   7  * Tests that CJK tokenizer works as intended.
   8  *
   9  * @group search
  10  */
  11 class SearchTokenizerTest extends SearchTestBase {
  12
  13   /**
  14    * Verifies that strings of CJK characters are tokenized.
  15    *
  16    * The search_simplify() function does special things with numbers, symbols,
  17    * and punctuation. So we only test that CJK characters that are not in these
  18    * character classes are tokenized properly. See PREG_CLASS_CKJ for more
  19    * information.
  20    */
  21   public function testTokenizer() {
  22     // Set the minimum word size to 1 (to split all CJK characters) and make
  23     // sure CJK tokenizing is turned on.
  24     $this->config('search.settings')
  25       ->set('index.minimum_word_size', 1)
  26       ->set('index.overlap_cjk', TRUE)
  27       ->save();
  28     $this->refreshVariables();
  29
  30     // Create a string of CJK characters from various character ranges in
  31     // the Unicode tables.
  32
  33     // Beginnings of the character ranges.
  34     $starts = [
  35       'CJK unified' => 0x4e00,
  36       'CJK Ext A' => 0x3400,
  37       'CJK Compat' => 0xf900,
  38       'Hangul Jamo' => 0x1100,
  39       'Hangul Ext A' => 0xa960,
  40       'Hangul Ext B' => 0xd7b0,
  41       'Hangul Compat' => 0x3131,
  42       'Half non-punct 1' => 0xff21,
  43       'Half non-punct 2' => 0xff41,
  44       'Half non-punct 3' => 0xff66,
  45       'Hangul Syllables' => 0xac00,
  46       'Hiragana' => 0x3040,
  47       'Katakana' => 0x30a1,
  48       'Katakana Ext' => 0x31f0,
  49       'CJK Reserve 1' => 0x20000,
  50       'CJK Reserve 2' => 0x30000,
  51       'Bomofo' => 0x3100,
  52       'Bomofo Ext' => 0x31a0,
  53       'Lisu' => 0xa4d0,
  54       'Yi' => 0xa000,
  55     ];
  56
  57     // Ends of the character ranges.
  58     $ends = [
  59       'CJK unified' => 0x9fcf,
  60       'CJK Ext A' => 0x4dbf,
  61       'CJK Compat' => 0xfaff,
  62       'Hangul Jamo' => 0x11ff,
  63       'Hangul Ext A' => 0xa97f,
  64       'Hangul Ext B' => 0xd7ff,
  65       'Hangul Compat' => 0x318e,
  66       'Half non-punct 1' => 0xff3a,
  67       'Half non-punct 2' => 0xff5a,
  68       'Half non-punct 3' => 0xffdc,
  69       'Hangul Syllables' => 0xd7af,
  70       'Hiragana' => 0x309f,
  71       'Katakana' => 0x30ff,
  72       'Katakana Ext' => 0x31ff,
  73       'CJK Reserve 1' => 0x2fffd,
  74       'CJK Reserve 2' => 0x3fffd,
  75       'Bomofo' => 0x312f,
  76       'Bomofo Ext' => 0x31b7,
  77       'Lisu' => 0xa4fd,
  78       'Yi' => 0xa48f,
  79     ];
  80
  81     // Generate characters consisting of starts, midpoints, and ends.
  82     $chars = [];
  83     $charcodes = [];
  84     foreach ($starts as $key => $value) {
  85       $charcodes[] = $starts[$key];
  86       $chars[] = $this->code2utf($starts[$key]);
  87       $mid = round(0.5 * ($starts[$key] + $ends[$key]));
  88       $charcodes[] = $mid;
  89       $chars[] = $this->code2utf($mid);
  90       $charcodes[] = $ends[$key];
  91       $chars[] = $this->code2utf($ends[$key]);
  92     }
  93
  94     // Merge into a string and tokenize.
  95     $string = implode('', $chars);
  96     $out = trim(search_simplify($string));
  97     $expected = Unicode::strtolower(implode(' ', $chars));
  98
  99     // Verify that the output matches what we expect.
 100     $this->assertEqual($out, $expected, 'CJK tokenizer worked on all supplied CJK characters');
 101   }
 102
 103   /**
 104    * Verifies that strings of non-CJK characters are not tokenized.
 105    *
 106    * This is just a sanity check - it verifies that strings of letters are
 107    * not tokenized.
 108    */
 109   public function testNoTokenizer() {
 110     // Set the minimum word size to 1 (to split all CJK characters) and make
 111     // sure CJK tokenizing is turned on.
 112     $this->config('search.settings')
 113       ->set('index.minimum_word_size', 1)
 114       ->set('index.overlap_cjk', TRUE)
 115       ->save();
 116     $this->refreshVariables();
 117
 118     $letters = 'abcdefghijklmnopqrstuvwxyz';
 119     $out = trim(search_simplify($letters));
 120
 121     $this->assertEqual($letters, $out, 'Letters are not CJK tokenized');
 122   }
 123
 124   /**
 125    * Like PHP chr() function, but for unicode characters.
 126    *
 127    * chr() only works for ASCII characters up to character 255. This function
 128    * converts a number to the corresponding unicode character. Adapted from
 129    * functions supplied in comments on several functions on php.net.
 130    */
 131   public function code2utf($num) {
 132     if ($num < 128) {
 133       return chr($num);
 134     }
 135
 136     if ($num < 2048) {
 137       return chr(($num >> 6) + 192) . chr(($num & 63) + 128);
 138     }
 139
 140     if ($num < 65536) {
 141       return chr(($num >> 12) + 224) . chr((($num >> 6) & 63) + 128) . chr(($num & 63) + 128);
 142     }
 143
 144     if ($num < 2097152) {
 145       return chr(($num >> 18) + 240) . chr((($num >> 12) & 63) + 128) . chr((($num >> 6) & 63) + 128) . chr(($num & 63) + 128);
 146     }
 147
 148     return '';
 149   }
 150
 151 }