--- /dev/null
+<?php
+
+/*
+ * This file is part of the Behat Gherkin.
+ * (c) Konstantin Kudryashov <ever.zet@gmail.com>
+ *
+ * For the full copyright and license information, please view the LICENSE
+ * file that was distributed with this source code.
+ */
+
+namespace Behat\Gherkin;
+
+use Behat\Gherkin\Exception\LexerException;
+use Behat\Gherkin\Keywords\KeywordsInterface;
+
+/**
+ * Gherkin lexer.
+ *
+ * @author Konstantin Kudryashov <ever.zet@gmail.com>
+ */
+class Lexer
+{
+ private $language;
+ private $lines;
+ private $linesCount;
+ private $line;
+ private $trimmedLine;
+ private $lineNumber;
+ private $eos;
+ private $keywords;
+ private $keywordsCache = array();
+ private $stepKeywordTypesCache = array();
+ private $deferredObjects = array();
+ private $deferredObjectsCount = 0;
+ private $stashedToken;
+ private $inPyString = false;
+ private $pyStringSwallow = 0;
+ private $featureStarted = false;
+ private $allowMultilineArguments = false;
+ private $allowSteps = false;
+
+ /**
+ * Initializes lexer.
+ *
+ * @param KeywordsInterface $keywords Keywords holder
+ */
+ public function __construct(KeywordsInterface $keywords)
+ {
+ $this->keywords = $keywords;
+ }
+
+ /**
+ * Sets lexer input.
+ *
+ * @param string $input Input string
+ * @param string $language Language name
+ *
+ * @throws Exception\LexerException
+ */
+ public function analyse($input, $language = 'en')
+ {
+ // try to detect unsupported encoding
+ if ('UTF-8' !== mb_detect_encoding($input, 'UTF-8', true)) {
+ throw new LexerException('Feature file is not in UTF8 encoding');
+ }
+
+ $input = strtr($input, array("\r\n" => "\n", "\r" => "\n"));
+
+ $this->lines = explode("\n", $input);
+ $this->linesCount = count($this->lines);
+ $this->line = $this->lines[0];
+ $this->lineNumber = 1;
+ $this->trimmedLine = null;
+ $this->eos = false;
+
+ $this->deferredObjects = array();
+ $this->deferredObjectsCount = 0;
+ $this->stashedToken = null;
+ $this->inPyString = false;
+ $this->pyStringSwallow = 0;
+
+ $this->featureStarted = false;
+ $this->allowMultilineArguments = false;
+ $this->allowSteps = false;
+
+ $this->keywords->setLanguage($this->language = $language);
+ $this->keywordsCache = array();
+ $this->stepKeywordTypesCache = array();
+ }
+
+ /**
+ * Returns current lexer language.
+ *
+ * @return string
+ */
+ public function getLanguage()
+ {
+ return $this->language;
+ }
+
+ /**
+ * Returns next token or previously stashed one.
+ *
+ * @return array
+ */
+ public function getAdvancedToken()
+ {
+ return $this->getStashedToken() ?: $this->getNextToken();
+ }
+
+ /**
+ * Defers token.
+ *
+ * @param array $token Token to defer
+ */
+ public function deferToken(array $token)
+ {
+ $token['deferred'] = true;
+ $this->deferredObjects[] = $token;
+ ++$this->deferredObjectsCount;
+ }
+
+ /**
+ * Predicts for number of tokens.
+ *
+ * @return array
+ */
+ public function predictToken()
+ {
+ if (null === $this->stashedToken) {
+ $this->stashedToken = $this->getNextToken();
+ }
+
+ return $this->stashedToken;
+ }
+
+ /**
+ * Constructs token with specified parameters.
+ *
+ * @param string $type Token type
+ * @param string $value Token value
+ *
+ * @return array
+ */
+ public function takeToken($type, $value = null)
+ {
+ return array(
+ 'type' => $type,
+ 'line' => $this->lineNumber,
+ 'value' => $value ?: null,
+ 'deferred' => false
+ );
+ }
+
+ /**
+ * Consumes line from input & increments line counter.
+ */
+ protected function consumeLine()
+ {
+ ++$this->lineNumber;
+
+ if (($this->lineNumber - 1) === $this->linesCount) {
+ $this->eos = true;
+
+ return;
+ }
+
+ $this->line = $this->lines[$this->lineNumber - 1];
+ $this->trimmedLine = null;
+ }
+
+ /**
+ * Returns trimmed version of line.
+ *
+ * @return string
+ */
+ protected function getTrimmedLine()
+ {
+ return null !== $this->trimmedLine ? $this->trimmedLine : $this->trimmedLine = trim($this->line);
+ }
+
+ /**
+ * Returns stashed token or null if hasn't.
+ *
+ * @return array|null
+ */
+ protected function getStashedToken()
+ {
+ $stashedToken = $this->stashedToken;
+ $this->stashedToken = null;
+
+ return $stashedToken;
+ }
+
+ /**
+ * Returns deferred token or null if hasn't.
+ *
+ * @return array|null
+ */
+ protected function getDeferredToken()
+ {
+ if (!$this->deferredObjectsCount) {
+ return null;
+ }
+
+ --$this->deferredObjectsCount;
+
+ return array_shift($this->deferredObjects);
+ }
+
+ /**
+ * Returns next token from input.
+ *
+ * @return array
+ */
+ protected function getNextToken()
+ {
+ return $this->getDeferredToken()
+ ?: $this->scanEOS()
+ ?: $this->scanLanguage()
+ ?: $this->scanComment()
+ ?: $this->scanPyStringOp()
+ ?: $this->scanPyStringContent()
+ ?: $this->scanStep()
+ ?: $this->scanScenario()
+ ?: $this->scanBackground()
+ ?: $this->scanOutline()
+ ?: $this->scanExamples()
+ ?: $this->scanFeature()
+ ?: $this->scanTags()
+ ?: $this->scanTableRow()
+ ?: $this->scanNewline()
+ ?: $this->scanText();
+ }
+
+ /**
+ * Scans for token with specified regex.
+ *
+ * @param string $regex Regular expression
+ * @param string $type Expected token type
+ *
+ * @return null|array
+ */
+ protected function scanInput($regex, $type)
+ {
+ if (!preg_match($regex, $this->line, $matches)) {
+ return null;
+ }
+
+ $token = $this->takeToken($type, $matches[1]);
+ $this->consumeLine();
+
+ return $token;
+ }
+
+ /**
+ * Scans for token with specified keywords.
+ *
+ * @param string $keywords Keywords (splitted with |)
+ * @param string $type Expected token type
+ *
+ * @return null|array
+ */
+ protected function scanInputForKeywords($keywords, $type)
+ {
+ if (!preg_match('/^(\s*)(' . $keywords . '):\s*(.*)/u', $this->line, $matches)) {
+ return null;
+ }
+
+ $token = $this->takeToken($type, $matches[3]);
+ $token['keyword'] = $matches[2];
+ $token['indent'] = mb_strlen($matches[1], 'utf8');
+
+ $this->consumeLine();
+
+ // turn off language searching
+ if ('Feature' === $type) {
+ $this->featureStarted = true;
+ }
+
+ // turn off PyString and Table searching
+ if ('Feature' === $type || 'Scenario' === $type || 'Outline' === $type) {
+ $this->allowMultilineArguments = false;
+ } elseif ('Examples' === $type) {
+ $this->allowMultilineArguments = true;
+ }
+
+ // turn on steps searching
+ if ('Scenario' === $type || 'Background' === $type || 'Outline' === $type) {
+ $this->allowSteps = true;
+ }
+
+ return $token;
+ }
+
+ /**
+ * Scans EOS from input & returns it if found.
+ *
+ * @return null|array
+ */
+ protected function scanEOS()
+ {
+ if (!$this->eos) {
+ return null;
+ }
+
+ return $this->takeToken('EOS');
+ }
+
+ /**
+ * Returns keywords for provided type.
+ *
+ * @param string $type Keyword type
+ *
+ * @return string
+ */
+ protected function getKeywords($type)
+ {
+ if (!isset($this->keywordsCache[$type])) {
+ $getter = 'get' . $type . 'Keywords';
+ $keywords = $this->keywords->$getter();
+
+ if ('Step' === $type) {
+ $padded = array();
+ foreach (explode('|', $keywords) as $keyword) {
+ $padded[] = false !== mb_strpos($keyword, '<', 0, 'utf8')
+ ? preg_quote(mb_substr($keyword, 0, -1, 'utf8'), '/') . '\s*'
+ : preg_quote($keyword, '/') . '\s+';
+ }
+
+ $keywords = implode('|', $padded);
+ }
+
+ $this->keywordsCache[$type] = $keywords;
+ }
+
+ return $this->keywordsCache[$type];
+ }
+
+ /**
+ * Scans Feature from input & returns it if found.
+ *
+ * @return null|array
+ */
+ protected function scanFeature()
+ {
+ return $this->scanInputForKeywords($this->getKeywords('Feature'), 'Feature');
+ }
+
+ /**
+ * Scans Background from input & returns it if found.
+ *
+ * @return null|array
+ */
+ protected function scanBackground()
+ {
+ return $this->scanInputForKeywords($this->getKeywords('Background'), 'Background');
+ }
+
+ /**
+ * Scans Scenario from input & returns it if found.
+ *
+ * @return null|array
+ */
+ protected function scanScenario()
+ {
+ return $this->scanInputForKeywords($this->getKeywords('Scenario'), 'Scenario');
+ }
+
+ /**
+ * Scans Scenario Outline from input & returns it if found.
+ *
+ * @return null|array
+ */
+ protected function scanOutline()
+ {
+ return $this->scanInputForKeywords($this->getKeywords('Outline'), 'Outline');
+ }
+
+ /**
+ * Scans Scenario Outline Examples from input & returns it if found.
+ *
+ * @return null|array
+ */
+ protected function scanExamples()
+ {
+ return $this->scanInputForKeywords($this->getKeywords('Examples'), 'Examples');
+ }
+
+ /**
+ * Scans Step from input & returns it if found.
+ *
+ * @return null|array
+ */
+ protected function scanStep()
+ {
+ if (!$this->allowSteps) {
+ return null;
+ }
+
+ $keywords = $this->getKeywords('Step');
+ if (!preg_match('/^\s*(' . $keywords . ')([^\s].+)/u', $this->line, $matches)) {
+ return null;
+ }
+
+ $keyword = trim($matches[1]);
+ $token = $this->takeToken('Step', $keyword);
+ $token['keyword_type'] = $this->getStepKeywordType($keyword);
+ $token['text'] = $matches[2];
+
+ $this->consumeLine();
+ $this->allowMultilineArguments = true;
+
+ return $token;
+ }
+
+ /**
+ * Scans PyString from input & returns it if found.
+ *
+ * @return null|array
+ */
+ protected function scanPyStringOp()
+ {
+ if (!$this->allowMultilineArguments) {
+ return null;
+ }
+
+ if (false === ($pos = mb_strpos($this->line, '"""', 0, 'utf8'))) {
+ return null;
+ }
+
+ $this->inPyString = !$this->inPyString;
+ $token = $this->takeToken('PyStringOp');
+ $this->pyStringSwallow = $pos;
+
+ $this->consumeLine();
+
+ return $token;
+ }
+
+ /**
+ * Scans PyString content.
+ *
+ * @return null|array
+ */
+ protected function scanPyStringContent()
+ {
+ if (!$this->inPyString) {
+ return null;
+ }
+
+ $token = $this->scanText();
+ // swallow trailing spaces
+ $token['value'] = preg_replace('/^\s{0,' . $this->pyStringSwallow . '}/u', '', $token['value']);
+
+ return $token;
+ }
+
+ /**
+ * Scans Table Row from input & returns it if found.
+ *
+ * @return null|array
+ */
+ protected function scanTableRow()
+ {
+ if (!$this->allowMultilineArguments) {
+ return null;
+ }
+
+ $line = $this->getTrimmedLine();
+ if (!isset($line[0]) || '|' !== $line[0] || '|' !== substr($line, -1)) {
+ return null;
+ }
+
+ $token = $this->takeToken('TableRow');
+ $line = mb_substr($line, 1, mb_strlen($line, 'utf8') - 2, 'utf8');
+ $columns = array_map(function ($column) {
+ return trim(str_replace('\\|', '|', $column));
+ }, preg_split('/(?<!\\\)\|/u', $line));
+ $token['columns'] = $columns;
+
+ $this->consumeLine();
+
+ return $token;
+ }
+
+ /**
+ * Scans Tags from input & returns it if found.
+ *
+ * @return null|array
+ */
+ protected function scanTags()
+ {
+ $line = $this->getTrimmedLine();
+ if (!isset($line[0]) || '@' !== $line[0]) {
+ return null;
+ }
+
+ $token = $this->takeToken('Tag');
+ $tags = explode('@', mb_substr($line, 1, mb_strlen($line, 'utf8') - 1, 'utf8'));
+ $tags = array_map('trim', $tags);
+ $token['tags'] = $tags;
+
+ $this->consumeLine();
+
+ return $token;
+ }
+
+ /**
+ * Scans Language specifier from input & returns it if found.
+ *
+ * @return null|array
+ */
+ protected function scanLanguage()
+ {
+ if ($this->featureStarted) {
+ return null;
+ }
+
+ if ($this->inPyString) {
+ return null;
+ }
+
+ if (0 !== mb_strpos(ltrim($this->line), '#', 0, 'utf8')) {
+ return null;
+ }
+
+ return $this->scanInput('/^\s*\#\s*language:\s*([\w_\-]+)\s*$/', 'Language');
+ }
+
+ /**
+ * Scans Comment from input & returns it if found.
+ *
+ * @return null|array
+ */
+ protected function scanComment()
+ {
+ if ($this->inPyString) {
+ return null;
+ }
+
+ $line = $this->getTrimmedLine();
+ if (0 !== mb_strpos($line, '#', 0, 'utf8')) {
+ return null;
+ }
+
+ $token = $this->takeToken('Comment', $line);
+ $this->consumeLine();
+
+ return $token;
+ }
+
+ /**
+ * Scans Newline from input & returns it if found.
+ *
+ * @return null|array
+ */
+ protected function scanNewline()
+ {
+ if ('' !== $this->getTrimmedLine()) {
+ return null;
+ }
+
+ $token = $this->takeToken('Newline', mb_strlen($this->line, 'utf8'));
+ $this->consumeLine();
+
+ return $token;
+ }
+
+ /**
+ * Scans text from input & returns it if found.
+ *
+ * @return null|array
+ */
+ protected function scanText()
+ {
+ $token = $this->takeToken('Text', $this->line);
+ $this->consumeLine();
+
+ return $token;
+ }
+
+ /**
+ * Returns step type keyword (Given, When, Then, etc.).
+ *
+ * @param string $native Step keyword in provided language
+ * @return string
+ */
+ private function getStepKeywordType($native)
+ {
+ // Consider "*" as a AND keyword so that it is normalized to the previous step type
+ if ('*' === $native) {
+ return 'And';
+ }
+
+ if (empty($this->stepKeywordTypesCache)) {
+ $this->stepKeywordTypesCache = array(
+ 'Given' => explode('|', $this->keywords->getGivenKeywords()),
+ 'When' => explode('|', $this->keywords->getWhenKeywords()),
+ 'Then' => explode('|', $this->keywords->getThenKeywords()),
+ 'And' => explode('|', $this->keywords->getAndKeywords()),
+ 'But' => explode('|', $this->keywords->getButKeywords())
+ );
+ }
+
+ foreach ($this->stepKeywordTypesCache as $type => $keywords) {
+ if (in_array($native, $keywords) || in_array($native . '<', $keywords)) {
+ return $type;
+ }
+ }
+
+ return 'Given';
+ }
+}