3 namespace Drupal\porterstemmer;
6 * PHP Implementation of the Porter2 Stemming Algorithm.
8 * See http://snowball.tartarus.org/algorithms/english/stemmer.html .
13 * Computes the stem of the word.
18 public static function stem($word) {
41 // Process exceptions.
42 if (isset($exceptions[$word])) {
43 $word = $exceptions[$word];
45 elseif (strlen($word) > 2) {
46 // Only execute algorithm on words that are longer than two letters.
47 $word = self::prepare($word);
48 $word = self::step0($word);
49 $word = self::step1a($word);
50 $word = self::step1b($word);
51 $word = self::step1c($word);
52 $word = self::step2($word);
53 $word = self::step3($word);
54 $word = self::step4($word);
55 $word = self::step5($word);
57 return strtolower($word);
/**
 * Upper-cases every "y" that starts the word or directly follows a vowel.
 *
 * A single leading apostrophe is dropped first. The word is then scanned
 * from left to right and modified in place, so the vowel test for a given
 * character already sees any "Y" written for the character before it.
 *
 * @param string $word
 *   The word to prepare.
 *
 * @return string
 *   The word without a leading apostrophe and with the affected "y"
 *   characters replaced by "Y".
 */
protected static function prepare($word) {
  // Drop one apostrophe at the very start of the word.
  if (strpos($word, "'") === 0) {
    $word = substr($word, 1);
  }
  for ($index = 0; $index < strlen($word); $index++) {
    if (substr($word, $index, 1) !== 'y') {
      continue;
    }
    // Only a word-initial "y" or a "y" right after a vowel is marked.
    if ($index == 0 || self::isVowel($index - 1, $word)) {
      $word = substr_replace($word, 'Y', $index, 1);
    }
  }
  return $word;
}
/**
 * Strips the longest apostrophe suffix found at the end of the word.
 *
 * The candidates are tried from longest to shortest ("'s'", "'s", "'"), and
 * only the first one that matches is removed.
 *
 * @param string $word
 *   The word to process.
 *
 * @return string
 *   The word without the matched suffix, or the unchanged word if none of the
 *   suffixes is present.
 */
protected static function step0($word) {
  foreach (array("'s'", "'s", "'") as $suffix) {
    if (self::hasEnding($word, $suffix)) {
      return self::removeEnding($word, $suffix);
    }
  }
  return $word;
}
105 * Handles various suffixes, of which the longest is replaced.
107 * @param string $word
110 * @return string $word
113 protected static function step1a($word) {
115 if (self::hasEnding($word, 'sses')) {
116 $word = self::removeEnding($word, 'sses') . 'ss';
119 $checks = array('ied', 'ies');
120 foreach ($checks as $check) {
121 if (!$found && self::hasEnding($word, $check)) {
122 // @todo: check order here.
123 $length = strlen($word);
124 $word = self::removeEnding($word, $check);
134 if (self::hasEnding($word, 'us') || self::hasEnding($word, 'ss')) {
137 // Delete if preceding word part has a vowel not immediately before the s.
138 if (!$found && self::hasEnding($word, 's') && self::containsVowel(substr($word, 0, -2))) {
139 $word = self::removeEnding($word, 's');
145 * Handles various suffixes, of which the longest is replaced.
147 * @param string $word
150 * @return string $word
153 protected static function step1b($word) {
164 if (in_array($word, $exceptions)) {
167 $checks = array('eedly', 'eed');
168 foreach ($checks as $check) {
169 if (self::hasEnding($word, $check)) {
170 if (self::r($word, 1) !== strlen($word)) {
171 $word = self::removeEnding($word, $check) . 'ee';
176 $checks = array('ingly', 'edly', 'ing', 'ed');
177 $second_endings = array('at', 'bl', 'iz');
178 foreach ($checks as $check) {
179 // If the ending is present and the previous part contains a vowel.
180 if (self::hasEnding($word, $check) && self::containsVowel(substr($word, 0, -strlen($check)))) {
181 $word = self::removeEnding($word, $check);
182 foreach ($second_endings as $ending) {
183 if (self::hasEnding($word, $ending)) {
187 // If the word ends with a double, remove the last letter.
188 $double_removed = self::removeDoubles($word);
189 if ($double_removed != $word) {
190 $word = $double_removed;
192 elseif (self::isShort($word)) {
193 // If the word is short, add e (so hop -> hope).
/**
 * Replaces a final "y" or "Y" with "i" when it follows a non-vowel.
 *
 * Nothing is changed for words of two characters or fewer, which keeps a
 * "y" that directly follows the first letter of such a word untouched.
 *
 * @param string $word
 *   The word to process.
 *
 * @return string
 *   The word with its final "y"/"Y" replaced by "i", or the unchanged word
 *   if the conditions are not met.
 */
protected static function step1c($word) {
  $length = strlen($word);
  $ends_in_y = self::hasEnding($word, 'y') || self::hasEnding($word, 'Y');
  if (!$ends_in_y || $length <= 2 || self::isVowel($length - 2, $word)) {
    return $word;
  }
  // removeEnding() only uses the suffix length, so this also drops a "Y".
  return self::removeEnding($word, 'y') . 'i';
}
220 * Implements step 2 of the Porter2 algorithm.
222 * @param string $word
225 * @return string $word
228 protected static function step2($word) {
254 foreach ($checks as $find => $replace) {
255 if (self::hasEnding($word, $find)) {
256 if (self::inR1($word, $find)) {
257 $word = self::removeEnding($word, $find) . $replace;
262 if (self::hasEnding($word, 'li')) {
263 if (strlen($word) > 4 && self::validLi(self::charAt(-3, $word))) {
264 $word = self::removeEnding($word, 'li');
271 * Implements step 3 of the Porter2 algorithm.
273 * @param string $word
276 * @return string $word
279 protected static function step3($word) {
290 foreach ($checks as $find => $replace) {
291 if (self::hasEnding($word, $find)) {
292 if (self::inR1($word, $find)) {
293 $word = self::removeEnding($word, $find) . $replace;
298 if (self::hasEnding($word, 'ative')) {
299 if (self::inR2($word, 'ative')) {
300 $word = self::removeEnding($word, 'ative');
307 * Implements step 4 of the Porter2 algorithm.
309 * @param string $word
312 * @return string $word
315 protected static function step4($word) {
336 foreach ($checks as $check) {
337 // Among the suffixes, if found and in R2, delete.
338 if (self::hasEnding($word, $check)) {
339 if (self::inR2($word, $check)) {
340 if ($check !== 'ion' || in_array(self::charAt(-4, $word), array('s', 't'))) {
341 $word = self::removeEnding($word, $check);
/**
 * Implements step 5 of the Porter2 algorithm.
 *
 * Handles a final "e" or, if the word does not end in "e", a final "l":
 * - "e" is deleted if it lies in R2, or if it lies in R1 and is not preceded
 *   by a short syllable;
 * - "l" is deleted if it lies in R2 and the character before it is also "l".
 *
 * @param string $word
 *   The word to process.
 *
 * @return string
 *   The processed word.
 */
protected static function step5($word) {
  if (self::hasEnding($word, 'e')) {
    $deletable = self::inR2($word, 'e');
    if (!$deletable && self::inR1($word, 'e')) {
      // The syllable in front of the "e" has its vowel three from the end.
      $deletable = !self::isShortSyllable($word, strlen($word) - 3);
    }
    // A word ending in "e" never falls through to the "l" rule.
    return $deletable ? self::removeEnding($word, 'e') : $word;
  }
  if (self::hasEnding($word, 'l') && self::inR2($word, 'l') && self::charAt(-2, $word) == 'l') {
    return self::removeEnding($word, 'l');
  }
  return $word;
}
/**
 * Drops the last letter if the word ends in one of the known double letters.
 *
 * The recognised doubles are bb, dd, ff, gg, mm, nn, pp, rr and tt.
 *
 * @param string $word
 *   The word to process.
 *
 * @return string
 *   The word without its last character if it ended in one of the doubles,
 *   the unchanged word otherwise.
 */
protected static function removeDoubles($word) {
  $tail = substr($word, -2);
  $doubles = array('bb', 'dd', 'ff', 'gg', 'mm', 'nn', 'pp', 'rr', 'tt');
  if (in_array($tail, $doubles)) {
    return substr($word, 0, -1);
  }
  return $word;
}
/**
 * Tells whether the character at a given position counts as a vowel.
 *
 * The built-in vowels are a, e, i, o, u and lower-case y. The character is
 * looked up with charAt(), so negative positions and out-of-range positions
 * follow that method's rules.
 *
 * @param int $position
 *   The character's position.
 * @param string $word
 *   The word in which to check.
 * @param string[] $additional
 *   (optional) Additional characters that should count as vowels.
 *
 * @return bool
 *   TRUE if the character is a vowel, FALSE otherwise.
 */
protected static function isVowel($position, $word, $additional = array()) {
  $character = self::charAt($position, $word);
  $vowels = array('a', 'e', 'i', 'o', 'u', 'y');
  return in_array($character, $vowels) || in_array($character, $additional);
}
/**
 * Retrieves the character at the given position.
 *
 * @param int $position
 *   The 0-based index of the character. If a negative number is given, the
 *   position is counted from the end of the string, so -1 is the last
 *   character and -strlen($word) is the first one.
 * @param string $word
 *   The word from which to retrieve the character.
 *
 * @return string
 *   The character at the given position, or an empty string if the given
 *   position was illegal.
 */
protected static function charAt($position, $word) {
  $length = strlen($word);
  // Translate a position counted from the end into a 0-based index first, so
  // that the range check below treats both notations alike. Checking
  // abs($position) against the length would wrongly reject -$length, which
  // addresses the first character.
  if ($position < 0) {
    $position += $length;
  }
  if ($position < 0 || $position >= $length) {
    return '';
  }
  return $word[$position];
}
/**
 * Determines whether a short syllable is located at a given place.
 *
 * Two shapes are recognised:
 * - at position 0: a vowel directly followed by a non-vowel;
 * - at any other position: a non-vowel, then a vowel at the position itself,
 *   then a character that is neither a vowel nor "w", "x" or "Y".
 *
 * @param string $word
 *   The word to check.
 * @param int|null $position
 *   (optional) The position of the syllable's vowel. If omitted, the
 *   second-to-last character is used, so the end of the word is checked.
 *
 * @return bool
 *   TRUE if the word has a short syllable at that place, FALSE otherwise.
 */
protected static function isShortSyllable($word, $position = NULL) {
  $vowel_at = ($position === NULL) ? strlen($word) - 2 : $position;

  // Word-initial case: there is no preceding character to look at, and the
  // following character only has to be a non-vowel.
  if ($vowel_at === 0) {
    if (!self::isVowel(0, $word)) {
      return FALSE;
    }
    return !self::isVowel(1, $word);
  }

  if (self::isVowel($vowel_at - 1, $word)) {
    return FALSE;
  }
  if (!self::isVowel($vowel_at, $word)) {
    return FALSE;
  }
  // For the closing character, w, x and Y are rejected just like vowels.
  return !self::isVowel($vowel_at + 1, $word, array('w', 'x', 'Y'));
}
/**
 * Determines whether the word is short.
 *
 * A word is short if it ends in a short syllable and its R1 region is empty,
 * i.e. R1 starts at the very end of the word.
 *
 * @param string $word
 *   The word to check.
 *
 * @return bool
 *   TRUE if the word is short, FALSE otherwise.
 */
protected static function isShort($word) {
  if (!self::isShortSyllable($word)) {
    return FALSE;
  }
  return self::r($word, 1) == strlen($word);
}
478 * Determines the start of a certain "R" region.
480 * R is a region after the first non-vowel following a vowel, or end of word.
483 * (optional) 1 or 2. If 2, then calculate the R after the R1.
488 protected static function r($word, $type = 1) {
491 $inc = self::r($word, 1);
493 elseif (strlen($word) > 5) {
494 $prefix_5 = substr($word, 0, 5);
495 if ($prefix_5 === 'gener' || $prefix_5 === 'arsen') {
498 if (strlen($word) > 5 && substr($word, 0, 6) === 'commun') {
503 while ($inc <= strlen($word)) {
504 if (!self::isVowel($inc, $word) && self::isVowel($inc - 1, $word)) {
510 if (!isset($position)) {
511 $position = strlen($word);
514 // We add one, as this is the position AFTER the first non-vowel.
/**
 * Checks whether the given string is contained in R1.
 *
 * The check is a plain substring search within the part of the word that
 * starts at the R1 offset; the string may occur anywhere in that part.
 *
 * @param string $word
 *   The word whose R1 region is searched.
 * @param string $string
 *   The string to look for.
 *
 * @return bool
 *   TRUE if the string is in R1, FALSE otherwise.
 */
protected static function inR1($word, $string) {
  $region_start = self::r($word, 1);
  $region = substr($word, $region_start);
  return FALSE !== strpos($region, $string);
}
/**
 * Checks whether the given string is contained in R2.
 *
 * The check is a plain substring search within the part of the word that
 * starts at the R2 offset; the string may occur anywhere in that part.
 *
 * @param string $word
 *   The word whose R2 region is searched.
 * @param string $string
 *   The string to look for.
 *
 * @return bool
 *   TRUE if the string is in R2, FALSE otherwise.
 */
protected static function inR2($word, $string) {
  $region_start = self::r($word, 2);
  $region = substr($word, $region_start);
  return FALSE !== strpos($region, $string);
}
/**
 * Checks whether the word ends with the given string.
 *
 * @param string $word
 *   The word to check.
 * @param string $string
 *   The ending to look for.
 *
 * @return bool
 *   TRUE if the word ends with the given string, FALSE otherwise. An ending
 *   that is longer than the word never matches.
 */
protected static function hasEnding($word, $string) {
  $suffix_length = strlen($string);
  // The length guard comes first so substr_compare() is never asked to start
  // before the beginning of the word.
  return $suffix_length <= strlen($word)
    && substr_compare($word, $string, -1 * $suffix_length, $suffix_length) === 0;
}
/**
 * Cuts as many characters off the end of the word as the given string has.
 *
 * Does not check whether the ending is actually there; only the length of
 * the given string is used.
 *
 * @param string $word
 *   The word to shorten.
 * @param string $string
 *   The ending to remove.
 *
 * @return string
 *   The shortened word.
 */
protected static function removeEnding($word, $string) {
  $suffix_length = strlen($string);
  return substr($word, 0, -$suffix_length);
}
/**
 * Checks whether the given string contains a vowel.
 *
 * Every position is tested with isVowel(), so lower-case "y" counts as a
 * vowel while upper-case "Y" does not.
 *
 * @param string $string
 *   The string to check.
 *
 * @return bool
 *   TRUE if the string contains a vowel, FALSE otherwise.
 */
protected static function containsVowel($string) {
  $length = strlen($string);
  for ($index = 0; $index < $length; $index++) {
    if (self::isVowel($index, $string)) {
      return TRUE;
    }
  }
  return FALSE;
}
600 * Checks whether the given string is a valid -li prefix.
602 * @param string $string
603 * The string to check.
606 * TRUE if the given string is a valid -li prefix, FALSE otherwise.
608 protected static function validLi($string) {
609 return in_array($string, array(