www.aleph1.co.uk Git - yaffs-website/blob - web/core/lib/Drupal/Component/Transliteration/PhpTransliteration.php

   1 <?php
   2
   3 namespace Drupal\Component\Transliteration;
   4
   5 /**
   6  * Implements transliteration without using the PECL extensions.
   7  *
   8  * Transliterations are done character-by-character, by looking up non-US-ASCII
   9  * characters in a transliteration database.
  10  *
  11  * The database comes from two types of files, both of which are searched for in
  12  * the PhpTransliteration::$dataDirectory directory. First, language-specific
  13  * overrides are searched (see PhpTransliteration::readLanguageOverrides()). If
  14  * there is no language-specific override for a character, the generic
  15  * transliteration character tables are searched (see
  16  * PhpTransliteration::readGenericData()). If looking up the character in the
  17  * generic table results in a NULL value, or an illegal character is
  18  * encountered, then a substitute character is returned.
  19  *
  20  * Some parts of this code were derived from the MediaWiki project's UtfNormal
  21  * class, Copyright © 2004 Brion Vibber <brion@pobox.com>,
  22  * http://www.mediawiki.org/
  23  */
  24 class PhpTransliteration implements TransliterationInterface {
  25
  26   /**
  27    * Directory where data for transliteration resides.
  28    *
  29    * The constructor sets this (by default) to subdirectory 'data' underneath
  30    * the directory where the class's PHP file resides.
  31    *
  32    * @var string
  33    */
  34   protected $dataDirectory;
  35
  36   /**
  37    * Associative array of language-specific character transliteration tables.
  38    *
  39    * The outermost array keys are language codes. For each language code key,
  40    * the value is an array whose keys are Unicode character codes, and whose
  41    * values are the transliterations of those characters to US-ASCII. This is
  42    * set up as needed in PhpTransliteration::replace() by calling
  43    * PhpTransliteration::readLanguageOverrides().
  44    *
  45    * @var array
  46    */
  47   protected $languageOverrides = [];
  48
  49   /**
  50    * Non-language-specific transliteration tables.
  51    *
  52    * Array whose keys are the upper two bytes of the Unicode character, and
  53    * whose values are an array of transliterations for each lower-two bytes
  54    * character code. This is set up as needed in PhpTransliteration::replace()
  55    * by calling PhpTransliteration::readGenericData().
  56    *
  57    * @var array
  58    */
  59   protected $genericMap = [];
  60
  61   /**
  62    * Constructs a transliteration object.
  63    *
  64    * @param string $data_directory
  65    *   (optional) The directory where data files reside. If omitted, defaults
  66    *   to subdirectory 'data' underneath the directory where the class's PHP
  67    *   file resides.
  68    */
  69   public function __construct($data_directory = NULL) {
  70     $this->dataDirectory = (isset($data_directory)) ? $data_directory : __DIR__ . '/data';
  71   }
  72
  73   /**
  74    * {@inheritdoc}
  75    */
  76   public function removeDiacritics($string) {
  77     $result = '';
  78
  79     foreach (preg_split('//u', $string, 0, PREG_SPLIT_NO_EMPTY) as $character) {
  80       $code = self::ordUTF8($character);
  81
  82       // These two Unicode ranges include the accented US-ASCII letters, with a
  83       // few characters that aren't accented letters mixed in. So define the
  84       // ranges and the excluded characters.
  85       $range1 = $code > 0x00bf && $code < 0x017f;
  86       $exclusions_range1 = [0x00d0, 0x00d7, 0x00f0, 0x00f7, 0x0138, 0x014a, 0x014b];
  87       $range2 = $code > 0x01cc && $code < 0x0250;
  88       $exclusions_range2 = [0x01DD, 0x01f7, 0x021c, 0x021d, 0x0220, 0x0221, 0x0241, 0x0242, 0x0245];
  89
  90       $replacement = $character;
  91       if (($range1 && !in_array($code, $exclusions_range1)) || ($range2 && !in_array($code, $exclusions_range2))) {
  92         $to_add = $this->lookupReplacement($code, 'xyz');
  93         if (strlen($to_add) === 1) {
  94           $replacement = $to_add;
  95         }
  96       }
  97
  98       $result .= $replacement;
  99     }
 100
 101     return $result;
 102   }
 103
 104   /**
 105    * {@inheritdoc}
 106    */
 107   public function transliterate($string, $langcode = 'en', $unknown_character = '?', $max_length = NULL) {
 108     $result = '';
 109     $length = 0;
 110     // Split into Unicode characters and transliterate each one.
 111     foreach (preg_split('//u', $string, 0, PREG_SPLIT_NO_EMPTY) as $character) {
 112       $code = self::ordUTF8($character);
 113       if ($code == -1) {
 114         $to_add = $unknown_character;
 115       }
 116       else {
 117         $to_add = $this->replace($code, $langcode, $unknown_character);
 118       }
 119
 120       // Check if this exceeds the maximum allowed length.
 121       if (isset($max_length)) {
 122         $length += strlen($to_add);
 123         if ($length > $max_length) {
 124           // There is no more space.
 125           return $result;
 126         }
 127       }
 128
 129       $result .= $to_add;
 130     }
 131
 132     return $result;
 133   }
 134
 135   /**
 136    * Finds the character code for a UTF-8 character: like ord() but for UTF-8.
 137    *
 138    * @param string $character
 139    *   A single UTF-8 character.
 140    *
 141    * @return int
 142    *   The character code, or -1 if an illegal character is found.
 143    */
 144   protected static function ordUTF8($character) {
 145     $first_byte = ord($character[0]);
 146
 147     if (($first_byte & 0x80) == 0) {
 148       // Single-byte form: 0xxxxxxxx.
 149       return $first_byte;
 150     }
 151     if (($first_byte & 0xe0) == 0xc0) {
 152       // Two-byte form: 110xxxxx 10xxxxxx.
 153       return (($first_byte & 0x1f) << 6) + (ord($character[1]) & 0x3f);
 154     }
 155     if (($first_byte & 0xf0) == 0xe0) {
 156       // Three-byte form: 1110xxxx 10xxxxxx 10xxxxxx.
 157       return (($first_byte & 0x0f) << 12) + ((ord($character[1]) & 0x3f) << 6) + (ord($character[2]) & 0x3f);
 158     }
 159     if (($first_byte & 0xf8) == 0xf0) {
 160       // Four-byte form: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
 161       return (($first_byte & 0x07) << 18) + ((ord($character[1]) & 0x3f) << 12) + ((ord($character[2]) & 0x3f) << 6) + (ord($character[3]) & 0x3f);
 162     }
 163
 164     // Other forms are not legal.
 165     return -1;
 166   }
 167
 168   /**
 169    * Replaces a single Unicode character using the transliteration database.
 170    *
 171    * @param int $code
 172    *   The character code of a Unicode character.
 173    * @param string $langcode
 174    *   The language code of the language the character is in.
 175    * @param string $unknown_character
 176    *   The character to substitute for characters without transliterated
 177    *   equivalents.
 178    *
 179    * @return string
 180    *   US-ASCII replacement character. If it has a mapping, it is returned;
 181    *   otherwise, $unknown_character is returned. The replacement can contain
 182    *   multiple characters.
 183    */
 184   protected function replace($code, $langcode, $unknown_character) {
 185     if ($code < 0x80) {
 186       // Already lower ASCII.
 187       return chr($code);
 188     }
 189
 190     // See if there is a language-specific override for this character.
 191     if (!isset($this->languageOverrides[$langcode])) {
 192       $this->readLanguageOverrides($langcode);
 193     }
 194     if (isset($this->languageOverrides[$langcode][$code])) {
 195       return $this->languageOverrides[$langcode][$code];
 196     }
 197
 198     return $this->lookupReplacement($code, $unknown_character);
 199   }
 200
 201   /**
 202    * Look up the generic replacement for a UTF-8 character code.
 203    *
 204    * @param $code
 205    *   The UTF-8 character code.
 206    * @param string $unknown_character
 207    *   (optional) The character to substitute for characters without entries in
 208    *   the replacement tables.
 209    *
 210    * @return string
 211    *   US-ASCII replacement characters. If it has a mapping, it is returned;
 212    *   otherwise, $unknown_character is returned. The replacement can contain
 213    *   multiple characters.
 214    */
 215   protected function lookupReplacement($code, $unknown_character = '?') {
 216     // See if there is a generic mapping for this character.
 217     $bank = $code >> 8;
 218     if (!isset($this->genericMap[$bank])) {
 219       $this->readGenericData($bank);
 220     }
 221     $code = $code & 0xff;
 222     return isset($this->genericMap[$bank][$code]) ? $this->genericMap[$bank][$code] : $unknown_character;
 223   }
 224
 225   /**
 226    * Reads in language overrides for a language code.
 227    *
 228    * The data is read from files named "$langcode.php" in
 229    * PhpTransliteration::$dataDirectory. These files should set up an array
 230    * variable $overrides with an element whose key is $langcode and whose value
 231    * is an array whose keys are character codes, and whose values are their
 232    * transliterations in this language. The character codes can be for any valid
 233    * Unicode character, independent of the number of bytes.
 234    *
 235    * @param $langcode
 236    *   Code for the language to read.
 237    */
 238   protected function readLanguageOverrides($langcode) {
 239     // Figure out the file name to use by sanitizing the language code,
 240     // just in case.
 241     $file = $this->dataDirectory . '/' . preg_replace('/[^a-zA-Z\-]/', '', $langcode) . '.php';
 242
 243     // Read in this file, which should set up a variable called $overrides,
 244     // which will be local to this function.
 245     if (is_file($file)) {
 246       include $file;
 247     }
 248     if (!isset($overrides) || !is_array($overrides)) {
 249       $overrides = [$langcode => []];
 250     }
 251     $this->languageOverrides[$langcode] = $overrides[$langcode];
 252   }
 253
 254   /**
 255    * Reads in generic transliteration data for a bank of characters.
 256    *
 257    * The data is read in from a file named "x$bank.php" (with $bank in
 258    * hexadecimal notation) in PhpTransliteration::$dataDirectory. These files
 259    * should set up a variable $bank containing an array whose numerical indices
 260    * are the remaining two bytes of the character code, and whose values are the
 261    * transliterations of these characters into US-ASCII. Note that the maximum
 262    * Unicode character that can be encoded in this way is 4 bytes.
 263    *
 264    * @param $bank
 265    *   First two bytes of the Unicode character, or 0 for the ASCII range.
 266    */
 267   protected function readGenericData($bank) {
 268     // Figure out the file name.
 269     $file = $this->dataDirectory . '/x' . sprintf('%02x', $bank) . '.php';
 270
 271     // Read in this file, which should set up a variable called $base, which
 272     // will be local to this function.
 273     if (is_file($file)) {
 274       include $file;
 275     }
 276     if (!isset($base) || !is_array($base)) {
 277       $base = [];
 278     }
 279
 280     // Save this data.
 281     $this->genericMap[$bank] = $base;
 282   }
 283
 284 }