www.aleph1.co.uk Git - yaffs-website/blob - vendor/nikic/php-parser/lib/PhpParser/Lexer.php

   1 <?php
   2
   3 namespace PhpParser;
   4
   5 use PhpParser\Parser\Tokens;
   6
   7 class Lexer
   8 {
   9     protected $code;
  10     protected $tokens;
  11     protected $pos;
  12     protected $line;
  13     protected $filePos;
  14     protected $prevCloseTagHasNewline;
  15
  16     protected $tokenMap;
  17     protected $dropTokens;
  18
  19     protected $usedAttributes;
  20
  21     /**
  22      * Creates a Lexer.
  23      *
  24      * @param array $options Options array. Currently only the 'usedAttributes' option is supported,
  25      *                       which is an array of attributes to add to the AST nodes. Possible
  26      *                       attributes are: 'comments', 'startLine', 'endLine', 'startTokenPos',
  27      *                       'endTokenPos', 'startFilePos', 'endFilePos'. The option defaults to the
  28      *                       first three. For more info see getNextToken() docs.
  29      */
  30     public function __construct(array $options = array()) {
  31         // map from internal tokens to PhpParser tokens
  32         $this->tokenMap = $this->createTokenMap();
  33
  34         // map of tokens to drop while lexing (the map is only used for isset lookup,
  35         // that's why the value is simply set to 1; the value is never actually used.)
  36         $this->dropTokens = array_fill_keys(
  37             array(T_WHITESPACE, T_OPEN_TAG, T_COMMENT, T_DOC_COMMENT), 1
  38         );
  39
  40         // the usedAttributes member is a map of the used attribute names to a dummy
  41         // value (here "true")
  42         $options += array(
  43             'usedAttributes' => array('comments', 'startLine', 'endLine'),
  44         );
  45         $this->usedAttributes = array_fill_keys($options['usedAttributes'], true);
  46     }
  47
  48     /**
  49      * Initializes the lexer for lexing the provided source code.
  50      *
  51      * This function does not throw if lexing errors occur. Instead, errors may be retrieved using
  52      * the getErrors() method.
  53      *
  54      * @param string $code The source code to lex
  55      * @param ErrorHandler|null $errorHandler Error handler to use for lexing errors. Defaults to
  56      *                                        ErrorHandler\Throwing
  57      */
  58     public function startLexing($code, ErrorHandler $errorHandler = null) {
  59         if (null === $errorHandler) {
  60             $errorHandler = new ErrorHandler\Throwing();
  61         }
  62
  63         $this->code = $code; // keep the code around for __halt_compiler() handling
  64         $this->pos  = -1;
  65         $this->line =  1;
  66         $this->filePos = 0;
  67
  68         // If inline HTML occurs without preceding code, treat it as if it had a leading newline.
  69         // This ensures proper composability, because having a newline is the "safe" assumption.
  70         $this->prevCloseTagHasNewline = true;
  71
  72         $scream = ini_set('xdebug.scream', '0');
  73
  74         $this->resetErrors();
  75         $this->tokens = @token_get_all($code);
  76         $this->handleErrors($errorHandler);
  77
  78         if (false !== $scream) {
  79             ini_set('xdebug.scream', $scream);
  80         }
  81     }
  82
  83     protected function resetErrors() {
  84         if (function_exists('error_clear_last')) {
  85             error_clear_last();
  86         } else {
  87             // set error_get_last() to defined state by forcing an undefined variable error
  88             set_error_handler(function() { return false; }, 0);
  89             @$undefinedVariable;
  90             restore_error_handler();
  91         }
  92     }
  93
  94     private function handleInvalidCharacterRange($start, $end, $line, ErrorHandler $errorHandler) {
  95         for ($i = $start; $i < $end; $i++) {
  96             $chr = $this->code[$i];
  97             if ($chr === 'b' || $chr === 'B') {
  98                 // HHVM does not treat b" tokens correctly, so ignore these
  99                 continue;
 100             }
 101
 102             if ($chr === "\0") {
 103                 // PHP cuts error message after null byte, so need special case
 104                 $errorMsg = 'Unexpected null byte';
 105             } else {
 106                 $errorMsg = sprintf(
 107                     'Unexpected character "%s" (ASCII %d)', $chr, ord($chr)
 108                 );
 109             }
 110
 111             $errorHandler->handleError(new Error($errorMsg, [
 112                 'startLine' => $line,
 113                 'endLine' => $line,
 114                 'startFilePos' => $i,
 115                 'endFilePos' => $i,
 116             ]));
 117         }
 118     }
 119
 120     private function isUnterminatedComment($token) {
 121         return ($token[0] === T_COMMENT || $token[0] === T_DOC_COMMENT)
 122             && substr($token[1], 0, 2) === '/*'
 123             && substr($token[1], -2) !== '*/';
 124     }
 125
 126     private function errorMayHaveOccurred() {
 127         if (defined('HHVM_VERSION')) {
 128             // In HHVM token_get_all() does not throw warnings, so we need to conservatively
 129             // assume that an error occurred
 130             return true;
 131         }
 132
 133         $error = error_get_last();
 134         return null !== $error
 135             && false === strpos($error['message'], 'Undefined variable');
 136     }
 137
 138     protected function handleErrors(ErrorHandler $errorHandler) {
 139         if (!$this->errorMayHaveOccurred()) {
 140             return;
 141         }
 142
 143         // PHP's error handling for token_get_all() is rather bad, so if we want detailed
 144         // error information we need to compute it ourselves. Invalid character errors are
 145         // detected by finding "gaps" in the token array. Unterminated comments are detected
 146         // by checking if a trailing comment has a "*/" at the end.
 147
 148         $filePos = 0;
 149         $line = 1;
 150         foreach ($this->tokens as $i => $token) {
 151             $tokenValue = \is_string($token) ? $token : $token[1];
 152             $tokenLen = \strlen($tokenValue);
 153
 154             if (substr($this->code, $filePos, $tokenLen) !== $tokenValue) {
 155                 // Something is missing, must be an invalid character
 156                 $nextFilePos = strpos($this->code, $tokenValue, $filePos);
 157                 $this->handleInvalidCharacterRange(
 158                     $filePos, $nextFilePos, $line, $errorHandler);
 159                 $filePos = $nextFilePos;
 160             }
 161
 162             $filePos += $tokenLen;
 163             $line += substr_count($tokenValue, "\n");
 164         }
 165
 166         if ($filePos !== \strlen($this->code)) {
 167             if (substr($this->code, $filePos, 2) === '/*') {
 168                 // Unlike PHP, HHVM will drop unterminated comments entirely
 169                 $comment = substr($this->code, $filePos);
 170                 $errorHandler->handleError(new Error('Unterminated comment', [
 171                     'startLine' => $line,
 172                     'endLine' => $line + substr_count($comment, "\n"),
 173                     'startFilePos' => $filePos,
 174                     'endFilePos' => $filePos + \strlen($comment),
 175                 ]));
 176
 177                 // Emulate the PHP behavior
 178                 $isDocComment = isset($comment[3]) && $comment[3] === '*';
 179                 $this->tokens[] = [$isDocComment ? T_DOC_COMMENT : T_COMMENT, $comment, $line];
 180             } else {
 181                 // Invalid characters at the end of the input
 182                 $this->handleInvalidCharacterRange(
 183                     $filePos, \strlen($this->code), $line, $errorHandler);
 184             }
 185             return;
 186         }
 187
 188         if (count($this->tokens) > 0) {
 189             // Check for unterminated comment
 190             $lastToken = $this->tokens[count($this->tokens) - 1];
 191             if ($this->isUnterminatedComment($lastToken)) {
 192                 $errorHandler->handleError(new Error('Unterminated comment', [
 193                     'startLine' => $line - substr_count($lastToken[1], "\n"),
 194                     'endLine' => $line,
 195                     'startFilePos' => $filePos - \strlen($lastToken[1]),
 196                     'endFilePos' => $filePos,
 197                 ]));
 198             }
 199         }
 200     }
 201
 202     /**
 203      * Fetches the next token.
 204      *
 205      * The available attributes are determined by the 'usedAttributes' option, which can
 206      * be specified in the constructor. The following attributes are supported:
 207      *
 208      *  * 'comments'      => Array of PhpParser\Comment or PhpParser\Comment\Doc instances,
 209      *                       representing all comments that occurred between the previous
 210      *                       non-discarded token and the current one.
 211      *  * 'startLine'     => Line in which the node starts.
 212      *  * 'endLine'       => Line in which the node ends.
 213      *  * 'startTokenPos' => Offset into the token array of the first token in the node.
 214      *  * 'endTokenPos'   => Offset into the token array of the last token in the node.
 215      *  * 'startFilePos'  => Offset into the code string of the first character that is part of the node.
 216      *  * 'endFilePos'    => Offset into the code string of the last character that is part of the node.
 217      *
 218      * @param mixed $value           Variable to store token content in
 219      * @param mixed $startAttributes Variable to store start attributes in
 220      * @param mixed $endAttributes   Variable to store end attributes in
 221      *
 222      * @return int Token id
 223      */
 224     public function getNextToken(&$value = null, &$startAttributes = null, &$endAttributes = null) {
 225         $startAttributes = array();
 226         $endAttributes   = array();
 227
 228         while (1) {
 229             if (isset($this->tokens[++$this->pos])) {
 230                 $token = $this->tokens[$this->pos];
 231             } else {
 232                 // EOF token with ID 0
 233                 $token = "\0";
 234             }
 235
 236             if (isset($this->usedAttributes['startLine'])) {
 237                 $startAttributes['startLine'] = $this->line;
 238             }
 239             if (isset($this->usedAttributes['startTokenPos'])) {
 240                 $startAttributes['startTokenPos'] = $this->pos;
 241             }
 242             if (isset($this->usedAttributes['startFilePos'])) {
 243                 $startAttributes['startFilePos'] = $this->filePos;
 244             }
 245
 246             if (\is_string($token)) {
 247                 $value = $token;
 248                 if (isset($token[1])) {
 249                     // bug in token_get_all
 250                     $this->filePos += 2;
 251                     $id = ord('"');
 252                 } else {
 253                     $this->filePos += 1;
 254                     $id = ord($token);
 255                 }
 256             } elseif (!isset($this->dropTokens[$token[0]])) {
 257                 $value = $token[1];
 258                 $id = $this->tokenMap[$token[0]];
 259                 if (T_CLOSE_TAG === $token[0]) {
 260                     $this->prevCloseTagHasNewline = false !== strpos($token[1], "\n");
 261                 } else if (T_INLINE_HTML === $token[0]) {
 262                     $startAttributes['hasLeadingNewline'] = $this->prevCloseTagHasNewline;
 263                 }
 264
 265                 $this->line += substr_count($value, "\n");
 266                 $this->filePos += \strlen($value);
 267             } else {
 268                 if (T_COMMENT === $token[0] || T_DOC_COMMENT === $token[0]) {
 269                     if (isset($this->usedAttributes['comments'])) {
 270                         $comment = T_DOC_COMMENT === $token[0]
 271                             ? new Comment\Doc($token[1], $this->line, $this->filePos)
 272                             : new Comment($token[1], $this->line, $this->filePos);
 273                         $startAttributes['comments'][] = $comment;
 274                     }
 275                 }
 276
 277                 $this->line += substr_count($token[1], "\n");
 278                 $this->filePos += \strlen($token[1]);
 279                 continue;
 280             }
 281
 282             if (isset($this->usedAttributes['endLine'])) {
 283                 $endAttributes['endLine'] = $this->line;
 284             }
 285             if (isset($this->usedAttributes['endTokenPos'])) {
 286                 $endAttributes['endTokenPos'] = $this->pos;
 287             }
 288             if (isset($this->usedAttributes['endFilePos'])) {
 289                 $endAttributes['endFilePos'] = $this->filePos - 1;
 290             }
 291
 292             return $id;
 293         }
 294
 295         throw new \RuntimeException('Reached end of lexer loop');
 296     }
 297
 298     /**
 299      * Returns the token array for current code.
 300      *
 301      * The token array is in the same format as provided by the
 302      * token_get_all() function and does not discard tokens (i.e.
 303      * whitespace and comments are included). The token position
 304      * attributes are against this token array.
 305      *
 306      * @return array Array of tokens in token_get_all() format
 307      */
 308     public function getTokens() {
 309         return $this->tokens;
 310     }
 311
 312     /**
 313      * Handles __halt_compiler() by returning the text after it.
 314      *
 315      * @return string Remaining text
 316      */
 317     public function handleHaltCompiler() {
 318         // text after T_HALT_COMPILER, still including ();
 319         $textAfter = substr($this->code, $this->filePos);
 320
 321         // ensure that it is followed by ();
 322         // this simplifies the situation, by not allowing any comments
 323         // in between of the tokens.
 324         if (!preg_match('~^\s*\(\s*\)\s*(?:;|\?>\r?\n?)~', $textAfter, $matches)) {
 325             throw new Error('__HALT_COMPILER must be followed by "();"');
 326         }
 327
 328         // prevent the lexer from returning any further tokens
 329         $this->pos = count($this->tokens);
 330
 331         // return with (); removed
 332         return (string) substr($textAfter, strlen($matches[0])); // (string) converts false to ''
 333     }
 334
 335     /**
 336      * Creates the token map.
 337      *
 338      * The token map maps the PHP internal token identifiers
 339      * to the identifiers used by the Parser. Additionally it
 340      * maps T_OPEN_TAG_WITH_ECHO to T_ECHO and T_CLOSE_TAG to ';'.
 341      *
 342      * @return array The token map
 343      */
 344     protected function createTokenMap() {
 345         $tokenMap = array();
 346
 347         // 256 is the minimum possible token number, as everything below
 348         // it is an ASCII value
 349         for ($i = 256; $i < 1000; ++$i) {
 350             if (T_DOUBLE_COLON === $i) {
 351                 // T_DOUBLE_COLON is equivalent to T_PAAMAYIM_NEKUDOTAYIM
 352                 $tokenMap[$i] = Tokens::T_PAAMAYIM_NEKUDOTAYIM;
 353             } elseif(T_OPEN_TAG_WITH_ECHO === $i) {
 354                 // T_OPEN_TAG_WITH_ECHO with dropped T_OPEN_TAG results in T_ECHO
 355                 $tokenMap[$i] = Tokens::T_ECHO;
 356             } elseif(T_CLOSE_TAG === $i) {
 357                 // T_CLOSE_TAG is equivalent to ';'
 358                 $tokenMap[$i] = ord(';');
 359             } elseif ('UNKNOWN' !== $name = token_name($i)) {
 360                 if ('T_HASHBANG' === $name) {
 361                     // HHVM uses a special token for #! hashbang lines
 362                     $tokenMap[$i] = Tokens::T_INLINE_HTML;
 363                 } else if (defined($name = Tokens::class . '::' . $name)) {
 364                     // Other tokens can be mapped directly
 365                     $tokenMap[$i] = constant($name);
 366                 }
 367             }
 368         }
 369
 370         // HHVM uses a special token for numbers that overflow to double
 371         if (defined('T_ONUMBER')) {
 372             $tokenMap[T_ONUMBER] = Tokens::T_DNUMBER;
 373         }
 374         // HHVM also has a separate token for the __COMPILER_HALT_OFFSET__ constant
 375         if (defined('T_COMPILER_HALT_OFFSET')) {
 376             $tokenMap[T_COMPILER_HALT_OFFSET] = Tokens::T_STRING;
 377         }
 378
 379         return $tokenMap;
 380     }
 381 }