use strict; use warnings; require "tokeniser_auto.pl"; { package TokeniserImpl; my $Char_EOF = 0; my $inputStream; my $inputStreamPos; my @tokenStream; my $characterToken = ""; my $currentToken; sub emitToken { if (length $characterToken) { push @tokenStream, ["Character", $characterToken]; $characterToken = ""; } push @tokenStream, $_[0]; } sub reconsumeCharacter { --$inputStreamPos } sub emitCharacterToken { $characterToken .= chr($_[0]) } sub emitEOFToken { emitToken("EOF") } sub consumeCharacter() { my $c; if ($inputStreamPos >= length($inputStream)) { $c = $Char_EOF; } else { $c = ord(substr($inputStream, $inputStreamPos, 1)); } ++$inputStreamPos; return $c; } sub consumeAndEmitEntity { } sub consumeAndAppendEntity { } sub createStartTagToken { $currentToken = ["StartTag", "", [], 0] } sub createEndTagToken() { $currentToken = ["EndTag", "", [], 0] } sub createTagTokenAttribute() { push @{$currentToken->[2]}, ["", ""]; $currentToken->[3] = 0; } sub createCommentToken() { $currentToken = ["Comment", ""] } sub createDoctypeToken() { $currentToken = ["DOCTYPE", "", undef, undef, 1] } sub emitCurrentTagToken { emitToken($currentToken) } sub emitCurrentCommentToken { emitToken($currentToken) } sub emitCurrentDoctypeToken { emitToken($currentToken) } sub handleDuplicateAttributes { my $n = $currentToken->[2][-1][0]; for my $i (0..$#{$currentToken->[2]}-1) { if ($currentToken->[2][$i][0] eq $n) { parseError(); $currentToken->[3] = 1; pop @{$currentToken->[2]}; return; } } } sub appendToTagTokenName { $currentToken->[1] .= chr($_[0]) } sub appendToTagTokenAttributeName { $currentToken->[2][-1][0] .= chr($_[0]) } sub appendToTagTokenAttributeValue { if (not $currentToken->[3]) { $currentToken->[2][-1][1] .= chr($_[0]) } } sub appendToCommentToken { $currentToken->[1] .= chr($_[0]) } sub appendToDoctypeTokenName { $currentToken->[1] .= chr($_[0]) } sub appendToDoctypeTokenPubId { $currentToken->[2] .= chr($_[0]) } sub appendToDoctypeTokenSysId { $currentToken->[3] .= chr($_[0]) } sub setDoctypeTokenIncorrect { $currentToken->[4] = 0 } sub setDoctypeTokenPubIdEmpty { $currentToken->[2] = "" } sub setDoctypeTokenSysIdEmpty { $currentToken->[3] = "" } sub parseError { emitToken("ParseError") } sub parseErrorIfNonpermittedSlash { if (not (substr($inputStream, $inputStreamPos, 1) eq '>' and $currentToken->[0] eq 'StartTag' and $currentToken->[1] =~ /^(base|link|meta|hr|br|img|embed|param|area|col|input)$/)) { parseError(); } } sub parseErrorIfEndTagWithAttributes { if ($currentToken->[0] eq 'EndTag' and @{$currentToken->[2]}) { parseError(); } } sub getOldCharacter { return $_[0] >= $inputStreamPos ? $Char_EOF : ord(substr $inputStream, $inputStreamPos-$_[0]-1, 1) } sub isFollowedBy { return lc(substr $inputStream, $inputStreamPos-1, length($_[0])) eq $_[0] } sub isEndOfCData { return 0 } # XXX our ($entityNameMatch, $entityNameMatchAttr, $entityNameValues, $entityMap); sub hasConsumableEntity { my ($attr) = @_; my $s = substr($inputStream, $inputStreamPos); if ($s =~ /^([\t\u000A\u000B\r <&]|$)/) { return; } my $r; my $n; my $len; if ($s =~ /^(#([0-9]+)(;?))/) { $n = $2; $len = length($1); parseError() unless $3; } elsif ($s =~ /^(#[xX]([0-9A-Fa-f]+)(;?))/) { $n = hex($2); $len = length($1); parseError() unless $3; } elsif ($s =~ /^#/) { parseError(); return; } if (defined $n) { if (exists $entityMap->{$n}) { parseError(); $n = $entityMap->{$n}; } elsif ($n == 0 or $n > 0x10FFFF or ($n >= 0xD800 and $n <= 0xDFFF)) { parseError(); $n = 0xFFFD; } } else { if ($s =~ $entityNameMatch) { my $ent = $1; if ($ent !~ /;$/) { parseError(); if ($attr and not $s =~ $entityNameMatchAttr) { return; } } $n = $entityNameValues->{$ent}; $len = length($ent); } else { parseError(); return; } } if ($attr) { appendToTagTokenAttributeValue($n); } else { emitCharacterToken($n); } $inputStreamPos += $len; return 1; } sub run { $inputStream = $_[0]; $inputStreamPos = 0; @tokenStream = (); $inputStream =~ s/\r\n?/\n/g; # TODO (maybe): this clobbers the input argument my $nuls = $inputStream =~ s/\x{0000}/\x{FFFD}/g; parseError() for 1..$nuls; while (1) { TokeniserImpl::step(); last if @tokenStream and $tokenStream[-1] eq 'EOF'; } normalise(\@tokenStream); return \@tokenStream; } sub normalise { my ($ts) = @_; pop @$ts; for my $i (0..$#$ts) { next unless ref $ts->[$i] eq 'ARRAY'; if ($ts->[$i][0] eq 'StartTag') { $ts->[$i][2] = { map @$_, @{$ts->[$i][2]} }; pop @{$ts->[$i]}; } elsif ($ts->[$i][0] eq 'EndTag') { pop @{$ts->[$i]}; pop @{$ts->[$i]}; } elsif ($ts->[$i][0] eq 'DOCTYPE') { $ts->[$i][4] = ($ts->[$i][4] ? JSON::True() : JSON::False()); } } } } if (1) { TokeniserImpl::init(); my $tokens = TokeniserImpl::run(do { local $/; <> }); print objToJson($tokens); } else { use JSON; for my $f (<../../html5lib/testdata/tokenizer/test*.test>) { open my $fh, '<:utf8', $f or die $!; my $tests = jsonToObj(do { local $/; <$fh> }); for my $test (@{$tests->{tests}}) { TokeniserImpl::init(); my $out = TokeniserImpl::run($test->{input}); my $exp = objToJson($test->{output}); my $got = objToJson($out); if ($exp ne $got) { print "$test->{description}\n"; print "Input: $test->{input}\n"; print "Expected: $exp\n"; print "Output: $got\n\n"; } } } }