2 namespace Masterminds\HTML5\Tests\Parser;
4 use Masterminds\HTML5\Parser\UTF8Utils;
5 use Masterminds\HTML5\Parser\StringInputStream;
6 use Masterminds\HTML5\Parser\Scanner;
7 use Masterminds\HTML5\Parser\Tokenizer;
9 class TokenizerTest extends \Masterminds\HTML5\Tests\TestCase
11 // ================================================================
12 // Additional assertions.
13 // ================================================================
/**
 * Tests that an event matches both the event type and the expected value.
 *
 * When $expects is an array it is compared against the event's complete
 * data payload; any other value is compared against the first data element.
 *
 * @param string $type
 *   Expected event type.
 * @param string|array|null $expects
 *   The value expected in $event['data'][0], or (when an array) the whole
 *   of $event['data'].
 * @param array $event
 *   The recorded event; its 'name' and 'data' entries are inspected.
 */
public function assertEventEquals($type, $expects, $event)
{
    $this->assertEquals($type, $event['name'], "Event $type for " . print_r($event, true));

    if (is_array($expects)) {
        $this->assertEquals($expects, $event['data'], "Event $type should equal " . print_r($expects, true) . ": " . print_r($event, true));

        return;
    }

    // Callers assert 'eof' events with a null expectation; such events
    // presumably carry no data, so read the first element defensively rather
    // than indexing into a null/empty payload (which raises a notice or
    // warning on PHP 7.4+).
    $actual = isset($event['data'][0]) ? $event['data'][0] : null;
    $this->assertEquals($expects, $actual, "Event $type should equal $expects: " . print_r($event, true));
}
/**
 * Assert that a given event is 'error'.
 *
 * @param array $event
 *   The recorded event; only its 'name' entry is checked.
 */
public function assertEventError($event)
{
    $failureMessage = "Expected error for event: " . print_r($event, true);

    $this->assertEquals('error', $event['name'], $failureMessage);
}
/**
 * Asserts that all of the tests are good.
 *
 * Walks a map of input => expectation, parses each input and checks:
 * - the number of recorded events (only if $depth is > 0)
 * - that event 0 has the given name and matches the expectation.
 *
 * @param string $name
 *   Event name expected at position 0.
 * @param int $depth
 *   Expected number of recorded events; 0 or less skips the count check.
 * @param array $tests
 *   Map of markup to parse => expected value for event 0.
 * @param bool $debug
 *   When true, each case is printed to STDOUT before it is parsed.
 */
protected function isAllGood($name, $depth, $tests, $debug = false)
{
    $checkDepth = $depth > 0;

    foreach ($tests as $try => $expects) {
        if ($debug) {
            fprintf(STDOUT, "%s expects %s\n", $try, print_r($expects, true));
        }

        $e = $this->parse($try);

        if ($checkDepth) {
            $this->assertEquals($depth, $e->depth(), "Expected depth $depth for test $try." . print_r($e, true));
        }

        $this->assertEventEquals($name, $expects, $e->get(0));
    }
}
64 // ================================================================
66 // ================================================================
/**
 * Tokenizing an empty document must record exactly one event: 'eof'.
 */
public function testParse()
{
    list($tok, $events) = $this->createTokenizer('');

    // Run the tokenizer so the event stack is populated before inspection.
    $tok->parse();
    $e1 = $events->get(0);

    // Use the same lower-case depth() spelling as every other test here.
    $this->assertEquals(1, $events->depth());
    $this->assertEquals('eof', $e1['name']);
}
78 public function testWhitespace()
81 list ($tok, $events) = $this->createTokenizer($spaces);
85 $this->assertEquals(2, $events->depth());
87 $e1 = $events->get(0);
89 $this->assertEquals('text', $e1['name']);
90 $this->assertEquals($spaces, $e1['data'][0]);
93 public function testCharacterReference()
101 $this->isAllGood('text', 2, $good);
103 // Test with broken charref
105 $events = $this->parse($str);
106 $e1 = $events->get(0);
107 $this->assertEquals('error', $e1['name']);
110 $events = $this->parse($str);
111 $e1 = $events->get(0);
112 $this->assertEquals('error', $e1['name']);
115 $events = $this->parse($str);
116 $e1 = $events->get(0);
117 $this->assertEquals('error', $e1['name']);
119 // FIXME: Once the text processor is done, need to verify that the
120 // tokens are transformed correctly into text.
123 public function testBogusComment()
126 '</+this is a bogus comment. +>',
127 '<!+this is a bogus comment. !>',
128 '<!D OCTYPE foo bar>',
129 '<!DOCTYEP foo bar>',
130 '<![CADATA[ TEST ]]>',
131 '<![CDATA Hello ]]>',
132 '<![CDATA[ Hello [[>',
133 '<!CDATA[[ test ]]>',
135 '<![CDATA[hellooooo hello',
139 foreach ($bogus as $str) {
140 $events = $this->parse($str);
141 $this->assertEventError($events->get(0));
142 $this->assertEventEquals('comment', $str, $events->get(1));
146 public function testEndTag()
153 '</thisIsTheTagThatDoesntEndItJustGoesOnAndOnMyFriend>' => 'thisisthetagthatdoesntenditjustgoesonandonmyfriend',
154 // See 8.2.4.10, which requires this and does not say error.
157 $this->isAllGood('endTag', 2, $succeed);
159 // Recoverable failures
161 '</a class="monkey">' => 'a',
164 '</a is the loneliest letter>' => 'a',
167 foreach ($fail as $test => $result) {
168 $events = $this->parse($test);
169 $this->assertEquals(3, $events->depth());
170 // Should have triggered an error.
171 $this->assertEventError($events->get(0));
172 // Should have tried to parse anyway.
173 $this->assertEventEquals('endTag', $result, $events->get(1));
182 foreach ($comments as $test => $result) {
183 $events = $this->parse($test);
184 $this->assertEquals(3, $events->depth());
186 // Should have triggered an error.
187 $this->assertEventError($events->get(0));
189 // Should have tried to parse anyway.
190 $this->assertEventEquals('comment', $result, $events->get(1));
194 public function testComment()
197 '<!--easy-->' => 'easy',
198 '<!-- 1 > 0 -->' => ' 1 > 0 ',
199 '<!-- --$i -->' => ' --$i ',
200 '<!----$i-->' => '--$i',
201 "<!--\nHello World.\na-->" => "\nHello World.\na",
202 '<!-- <!-- -->' => ' <!-- '
204 foreach ($good as $test => $expected) {
205 $events = $this->parse($test);
206 $this->assertEventEquals('comment', $expected, $events->get(0));
211 '<!--Hello' => 'Hello',
212 "<!--\0Hello" => UTF8Utils::FFFD . 'Hello',
215 foreach ($fail as $test => $expected) {
216 $events = $this->parse($test);
217 $this->assertEquals(3, $events->depth());
218 $this->assertEventError($events->get(0));
219 $this->assertEventEquals('comment', $expected, $events->get(1));
/**
 * Well-formed CDATA sections must produce a 'cdata' event whose value is the
 * section content with internal whitespace and stray "]]" preserved; two
 * events are expected in total for each input.
 */
public function testCDATASection()
{
    $sections = array(
        '<![CDATA[ This is a test. ]]>' => ' This is a test. ',
        '<![CDATA[CDATA]]>' => 'CDATA',
        '<![CDATA[ ]] > ]]>' => ' ]] > ',
        '<![CDATA[ ]]>' => ' ',
    );

    $this->isAllGood('cdata', 2, $sections);
}
234 public function testDoctype()
237 '<!DOCTYPE html>' => array(
243 '<!doctype html>' => array(
249 '<!DocType html>' => array(
255 "<!DOCTYPE\nhtml>" => array(
261 "<!DOCTYPE\fhtml>" => array(
267 '<!DOCTYPE html PUBLIC "foo bar">' => array(
269 EventStack::DOCTYPE_PUBLIC,
273 "<!DOCTYPE html PUBLIC 'foo bar'>" => array(
275 EventStack::DOCTYPE_PUBLIC,
279 '<!DOCTYPE html PUBLIC "foo bar" >' => array(
281 EventStack::DOCTYPE_PUBLIC,
285 "<!DOCTYPE html \nPUBLIC\n'foo bar'>" => array(
287 EventStack::DOCTYPE_PUBLIC,
291 '<!DOCTYPE html SYSTEM "foo bar">' => array(
293 EventStack::DOCTYPE_SYSTEM,
297 "<!DOCTYPE html SYSTEM 'foo bar'>" => array(
299 EventStack::DOCTYPE_SYSTEM,
303 '<!DOCTYPE html SYSTEM "foo/bar" >' => array(
305 EventStack::DOCTYPE_SYSTEM,
309 "<!DOCTYPE html \nSYSTEM\n'foo bar'>" => array(
311 EventStack::DOCTYPE_SYSTEM,
316 $this->isAllGood('doctype', 2, $good);
319 '<!DOCTYPE>' => array(
321 EventStack::DOCTYPE_NONE,
325 '<!DOCTYPE >' => array(
327 EventStack::DOCTYPE_NONE,
331 '<!DOCTYPE foo' => array(
333 EventStack::DOCTYPE_NONE,
337 '<!DOCTYPE foo PUB' => array(
339 EventStack::DOCTYPE_NONE,
343 '<!DOCTYPE foo PUB>' => array(
345 EventStack::DOCTYPE_NONE,
349 '<!DOCTYPE foo PUB "Looks good">' => array(
351 EventStack::DOCTYPE_NONE,
355 '<!DOCTYPE foo SYSTME "Looks good"' => array(
357 EventStack::DOCTYPE_NONE,
362 // Can't tell whether these are ids or ID types, since the context is chopped.
363 '<!DOCTYPE foo PUBLIC' => array(
365 EventStack::DOCTYPE_NONE,
369 '<!DOCTYPE foo PUBLIC>' => array(
371 EventStack::DOCTYPE_NONE,
375 '<!DOCTYPE foo SYSTEM' => array(
377 EventStack::DOCTYPE_NONE,
381 '<!DOCTYPE foo SYSTEM>' => array(
383 EventStack::DOCTYPE_NONE,
388 '<!DOCTYPE html SYSTEM "foo bar"' => array(
390 EventStack::DOCTYPE_SYSTEM,
394 '<!DOCTYPE html SYSTEM "foo bar" more stuff>' => array(
396 EventStack::DOCTYPE_SYSTEM,
401 foreach ($bad as $test => $expects) {
402 $events = $this->parse($test);
403 // fprintf(STDOUT, $test . PHP_EOL);
404 $this->assertEquals(3, $events->depth(), "Counting events for '$test': " . print_r($events, true));
405 $this->assertEventError($events->get(0));
406 $this->assertEventEquals('doctype', $expects, $events->get(1));
410 public function testProcessorInstruction()
414 '<?hph echo "Hello World"; ?>' => array(
416 'echo "Hello World"; '
418 "<?hph \necho 'Hello World';\n?>" => array(
420 "echo 'Hello World';\n"
423 $this->isAllGood('pi', 2, $good);
427 * This tests just simple tags.
429 public function testSimpleTags()
436 "<foo\n\n\n\n>" => 'foo',
437 '<foo:bar>' => 'foo:bar'
439 $this->isAllGood('startTag', 2, $open);
445 "<foo\n\n\n\n/>" => 'foo',
446 '<foo:bar/>' => 'foo:bar'
448 foreach ($selfClose as $test => $expects) {
449 $events = $this->parse($test);
450 $this->assertEquals(3, $events->depth(), "Counting events for '$test'" . print_r($events, true));
451 $this->assertEventEquals('startTag', $expects, $events->get(0));
452 $this->assertEventEquals('endTag', $expects, $events->get(1));
462 foreach ($bad as $test => $expects) {
463 $events = $this->parse($test);
464 $this->assertEquals(3, $events->depth(), "Counting events for '$test': " . print_r($events, true));
465 $this->assertEventError($events->get(0));
466 $this->assertEventEquals('startTag', $expects, $events->get(1));
/**
 * A tag that starts directly with name="value" (no element name) is expected
 * to record three errors, then a start tag named after the text before the
 * "=", then end-of-file.
 */
public function testTagsWithAttributeAndMissingName()
{
    $cases = array(
        '<id="top_featured">' => 'id',
        '<color="white">' => 'color',
        "<class='neaktivni_stranka'>" => 'class',
        '<bgcolor="white">' => 'bgcolor',
        '<class="nom">' => 'class',
    );

    foreach ($cases as $markup => $tagName) {
        $stack = $this->parse($markup);

        // Events 0..2 are the three parse errors.
        for ($position = 0; $position < 3; ++$position) {
            $this->assertEventError($stack->get($position));
        }

        $this->assertEventEquals('startTag', $tagName, $stack->get(3));
        $this->assertEventEquals('eof', null, $stack->get(4));
    }
}
490 public function testTagNotClosedAfterTagName()
493 "<noscript<img>" => array(
497 '<center<a>' => array(
507 foreach ($cases as $html => $expected) {
508 $events = $this->parse($html);
509 $this->assertEventError($events->get(0));
510 $this->assertEventEquals('startTag', $expected[0], $events->get(1));
511 $this->assertEventEquals('startTag', $expected[1], $events->get(2));
512 $this->assertEventEquals('eof', null, $events->get(3));
515 $events = $this->parse('<span<>02</span>');
516 $this->assertEventError($events->get(0));
517 $this->assertEventEquals('startTag', 'span', $events->get(1));
518 $this->assertEventError($events->get(2));
519 $this->assertEventEquals('text', '>02', $events->get(3));
520 $this->assertEventEquals('endTag', 'span', $events->get(4));
521 $this->assertEventEquals('eof', null, $events->get(5));
523 $events = $this->parse('<p</p>');
524 $this->assertEventError($events->get(0));
525 $this->assertEventEquals('startTag', 'p', $events->get(1));
526 $this->assertEventEquals('endTag', 'p', $events->get(2));
527 $this->assertEventEquals('eof', null, $events->get(3));
529 $events = $this->parse('<strong><WordPress</strong>');
530 $this->assertEventEquals('startTag', 'strong', $events->get(0));
531 $this->assertEventError($events->get(1));
532 $this->assertEventEquals('startTag', 'wordpress', $events->get(2));
533 $this->assertEventEquals('endTag', 'strong', $events->get(3));
534 $this->assertEventEquals('eof', null, $events->get(4));
536 $events = $this->parse('<src=<a>');
537 $this->assertEventError($events->get(0));
538 $this->assertEventError($events->get(1));
539 $this->assertEventError($events->get(2));
540 $this->assertEventEquals('startTag', 'src', $events->get(3));
541 $this->assertEventEquals('startTag', 'a', $events->get(4));
542 $this->assertEventEquals('eof', null, $events->get(5));
544 $events = $this->parse('<br...<a>');
545 $this->assertEventError($events->get(0));
546 $this->assertEventEquals('startTag', 'br', $events->get(1));
547 $this->assertEventEquals('eof', null, $events->get(2));
550 public function testIllegalTagNames()
556 '<static*all>' => 'static',
558 '<st*ATTRIBUTE />' => 'st',
561 foreach ($cases as $html => $expected) {
562 $events = $this->parse($html);
563 $this->assertEventError($events->get(0));
564 $this->assertEventEquals('startTag', $expected, $events->get(1));
569 * @depends testCharacterReference
571 public function testTagAttributes()
575 '<foo bar="baz">' => array(
582 '<foo bar=" baz ">' => array(
589 "<foo bar=\"\nbaz\n\">" => array(
596 "<foo bar='baz'>" => array(
603 '<foo bar="A full sentence.">' => array(
606 'bar' => 'A full sentence.'
610 "<foo a='1' b=\"2\">" => array(
618 "<foo ns:bar='baz'>" => array(
625 "<foo a='blue&red'>" => array(
632 "<foo a='blue&red'>" => array(
639 "<foo a='blue&&&red'>" => array(
646 "<foo a='blue&&red'>" => array(
653 "<foo\nbar='baz'\n>" => array(
660 '<doe a deer>' => array(
668 '<foo bar=baz>' => array(
676 // Updated for 8.1.2.3
677 '<foo bar = "baz" >' => array(
685 // The spec allows an unquoted value '/'. This will not be a closing
687 '<foo bar=/>' => array(
694 '<foo bar=baz/>' => array(
702 $this->isAllGood('startTag', 2, $good);
704 // Self-closing tags.
706 '<foo bar="baz"/>' => array(
713 '<foo BAR="baz"/>' => array(
720 '<foo BAR="BAZ"/>' => array(
727 "<foo a='1' b=\"2\" c=3 d/>" => array(
738 $this->isAllGood('startTag', 3, $withEnd);
740 // Cause a parse error.
742 // This will emit an entity lookup failure for &+dark.
743 "<foo a='blue&+dark'>" => array(
750 '<foo bar=>' => array(
757 '<foo bar="oh' => array(
764 '<foo bar=oh">' => array(
772 // these attributes are ignored because of current implementation
773 // of method "DOMElement::setAttribute"
774 // see issue #23: https://github.com/Masterminds/html5-php/issues/23
775 '<foo b"="baz">' => array(
780 '<foo 2abc="baz">' => array(
785 '<foo ?="baz">' => array(
790 '<foo foo?bar="baz">' => array(
797 foreach ($bad as $test => $expects) {
798 $events = $this->parse($test);
799 $this->assertEquals(3, $events->depth(), "Counting events for '$test': " . print_r($events, true));
800 $this->assertEventError($events->get(0));
801 $this->assertEventEquals('startTag', $expects, $events->get(1));
804 // Cause multiple parse errors.
806 '<foo ="bar">' => array(
814 '<foo////>' => array(
819 // character "&" in unquoted attribute shouldn't cause an infinite loop
820 '<foo bar=index.php?str=1&id=29>' => array(
823 'bar' => 'index.php?str=1&id=29'
828 foreach ($reallyBad as $test => $expects) {
829 $events = $this->parse($test);
830 // fprintf(STDOUT, $test . print_r($events, true));
831 $this->assertEventError($events->get(0));
832 $this->assertEventError($events->get(1));
833 // $this->assertEventEquals('startTag', $expects, $events->get(1));
836 // Regression: Malformed elements should be detected.
837 // '<foo baz="1" <bar></foo>' => array('foo', array('baz' => '1'), false),
838 $events = $this->parse('<foo baz="1" <bar></foo>');
839 $this->assertEventError($events->get(0));
840 $this->assertEventEquals('startTag', array(
847 $this->assertEventEquals('startTag', array(
852 $this->assertEventEquals('endTag', array(
/**
 * Checks that the content of <script> and <title> is reported as plain text.
 *
 * Covers properly closed elements, unterminated elements (which must also
 * record an error), upper-case tag names, and an end tag with trailing
 * whitespace.
 */
public function testRawText()
{
    // Properly closed <script>: markup-looking content stays literal text.
    $closed = array(
        '<script>abcd efg hijk lmnop</script> ' => 'abcd efg hijk lmnop',
        '<script><not/><the/><tag></script>' => '<not/><the/><tag>',
        '<script><<<<<<<<</script>' => '<<<<<<<<',
        '<script>hello</script</script>' => 'hello</script',
        "<script>\nhello</script\n</script>" => "\nhello</script\n",
        '<script>&</script>' => '&',
        '<script><!--not a comment--></script>' => '<!--not a comment-->',
        '<script><![CDATA[not a comment]]></script>' => '<![CDATA[not a comment]]>',
    );
    foreach ($closed as $markup => $content) {
        $stack = $this->parse($markup);
        $this->assertEventEquals('startTag', 'script', $stack->get(0));
        $this->assertEventEquals('text', $content, $stack->get(1));
        $this->assertEventEquals('endTag', 'script', $stack->get(2));
    }

    // Unterminated <script>: start tag, error, remaining input as text,
    // plus one further event (four in total).
    $unterminated = array(
        '<script>&</script' => '&</script',
        '<script>Hello world' => 'Hello world',
    );
    foreach ($unterminated as $markup => $content) {
        $stack = $this->parse($markup);
        $this->assertEquals(4, $stack->depth(), "Counting events for '$markup': " . print_r($stack, true));
        $this->assertEventEquals('startTag', 'script', $stack->get(0));
        $this->assertEventError($stack->get(1));
        $this->assertEventEquals('text', $content, $stack->get(2));
    }

    // Testing case sensitivity
    $stack = $this->parse('<TITLE>a test</TITLE>');
    $this->assertEventEquals('startTag', 'title', $stack->get(0));
    $this->assertEventEquals('text', 'a test', $stack->get(1));
    $this->assertEventEquals('endTag', 'title', $stack->get(2));

    // Testing end tags with whitespaces
    $stack = $this->parse('<title>Whitespaces are tasty</title >');
    $this->assertEventEquals('startTag', 'title', $stack->get(0));
    $this->assertEventEquals('text', 'Whitespaces are tasty', $stack->get(1));
    $this->assertEventEquals('endTag', 'title', $stack->get(2));
}
/**
 * In RCDATA mode, comment-like markup inside <title> must stay literal text
 * while character references are still decoded.
 */
public function testRcdata()
{
    // The apostrophe is written as a character reference: a bare "'" would
    // end the single-quoted PHP literal, and the reference is what lets the
    // test observe decoding (the expected text below starts with "'").
    list($tok, $events) = $this->createTokenizer('<title>&#x27;<!-- not a comment --></TITLE>');
    $tok->setTextMode(\Masterminds\HTML5\Elements::TEXT_RCDATA, 'title');

    // Run the tokenizer so the event stack is populated before inspection.
    $tok->parse();

    $this->assertEventEquals('text', "'<!-- not a comment -->", $events->get(1));
}
/**
 * Checks how plain text is interleaved with tags, CDATA sections, comments
 * and character references.
 *
 * Every case asserts the total number of recorded events and then the
 * name/value of each event that precedes the final one.
 */
public function testText()
{
    // Text on both sides of a start tag.
    $events = $this->parse('a<br>b');
    $this->assertEquals(4, $events->depth(), "Events: " . print_r($events, true));
    $this->assertEventEquals('text', 'a', $events->get(0));
    $this->assertEventEquals('startTag', 'br', $events->get(1));
    $this->assertEventEquals('text', 'b', $events->get(2));

    // Text wrapped in an element.
    $events = $this->parse('<a>Test</a>');
    $this->assertEquals(4, $events->depth(), "Events: " . print_r($events, true));
    $this->assertEventEquals('startTag', 'a', $events->get(0));
    $this->assertEventEquals('text', 'Test', $events->get(1));
    $this->assertEventEquals('endTag', 'a', $events->get(2));

    // Text that is "0" (falsy in PHP) must not be dropped.
    $events = $this->parse('<p>0</p><p>1</p>');
    $this->assertEquals(7, $events->depth(), "Events: " . print_r($events, true));

    $this->assertEventEquals('startTag', 'p', $events->get(0));
    $this->assertEventEquals('text', '0', $events->get(1));
    $this->assertEventEquals('endTag', 'p', $events->get(2));

    $this->assertEventEquals('startTag', 'p', $events->get(3));
    $this->assertEventEquals('text', '1', $events->get(4));
    $this->assertEventEquals('endTag', 'p', $events->get(5));

    // Text around a CDATA section.
    $events = $this->parse('a<![CDATA[test]]>b');
    $this->assertEquals(4, $events->depth(), "Events: " . print_r($events, true));
    $this->assertEventEquals('text', 'a', $events->get(0));
    $this->assertEventEquals('cdata', 'test', $events->get(1));
    $this->assertEventEquals('text', 'b', $events->get(2));

    // Text around a comment.
    $events = $this->parse('a<!--test-->b');
    $this->assertEquals(4, $events->depth(), "Events: " . print_r($events, true));
    $this->assertEventEquals('text', 'a', $events->get(0));
    $this->assertEventEquals('comment', 'test', $events->get(1));
    $this->assertEventEquals('text', 'b', $events->get(2));

    // Character references inside text are decoded and merged into a single
    // text event. The input must be entity-encoded: feeding the already
    // decoded characters would not exercise decoding at all, and a bare
    // "&b" is not a valid reference.
    $events = $this->parse('a&amp;b');
    $this->assertEquals(2, $events->depth(), "Events: " . print_r($events, true));
    $this->assertEventEquals('text', 'a&b', $events->get(0));

    $events = $this->parse('a&sup2;b');
    $this->assertEquals(2, $events->depth(), "Events: " . print_r($events, true));
    $this->assertEventEquals('text', 'a²b', $events->get(0));
}
956 // ================================================================
957 // Utility functions.
958 // ================================================================
/**
 * Builds a tokenizer over the given markup together with the event stack
 * that will record what it emits.
 *
 * @param string $string
 *   Markup to tokenize.
 * @param bool $debug
 *   Value assigned to the scanner's debug property.
 *
 * @return array
 *   Two elements: the Tokenizer, then the EventStack it reports to.
 */
protected function createTokenizer($string, $debug = false)
{
    $scanner = new Scanner(new StringInputStream($string));
    $scanner->debug = $debug;

    $eventHandler = new EventStack();
    $tokenizer = new Tokenizer($scanner, $eventHandler);

    return array($tokenizer, $eventHandler);
}
973 public function parse($string, $debug = false)
975 list ($tok, $events) = $this->createTokenizer($string, $debug);