use strict;
use warnings;

# Usage: perl SCRIPT TEST_FILE
#
# Converts a test file into a JSON array printed on STDOUT.
#
# The input is a sequence of tests.  Each test is made of sections; a line
# starting with "#name" opens the section "name" and the following lines are
# its content.  A blank line inside the "document" section ends the test.
# For every test that is kept, the "data" section is fed to
# TokeniserImpl::run() and one record is emitted:
#
#   { input => [tokens], errors => [lines of "errors"], output => "document" text }
#
# Tests carrying a non-empty "document-fragment" section, tests without
# data, and tests matching the skip rules below are dropped.

my $test_file = $ARGV[0];
die "Usage: $0 TEST_FILE\n"
    unless defined $test_file and length $test_file;

# NOTE(review): a bare file name is resolved through @INC, and recent perls
# no longer put the current directory there -- confirm how this is run.
require 'tokeniser.pl';
TokeniserImpl::init();

# Test inputs that are not emitted (flagged "XXX" by the original author):
# any of the listed open tags, or a <title> element with content on the same
# line.  The title alternative must start at a literal "<title>"; without
# that anchor the pattern matches practically every input.
my $SKIPPED_MARKUP =
    qr/<(?:script|style|textarea|noframes|plaintext|isindex|iframe|xmp)|<title>(?:(?!<\/title>).+)(?:<\/title>|$)/i;

my $section;        # name of the section being filled; undef before the first header
my %test;           # section name => text collected so far for the current test
my $flushed = 0;    # true once the current test was handled and nothing new arrived
my @out;            # records to be serialised

# Render $t as a double-quoted string literal: undef becomes the bare word
# "null"; backslashes, double quotes and newlines are backslash-escaped.
# Not called from the code visible in this file; kept for external callers.
sub str {
    my ($t) = @_;
    return 'null' unless defined $t;
    $t =~ s/\\/\\\\/g;
    $t =~ s/"/\\"/g;
    $t =~ s/\n/\\n/g;
    return qq{"$t"};
}

# Feed one input line (or '' as an end-of-input marker) to the section
# parser.  Appends a record to @out whenever a test is completed and kept.
sub consume_line {
    my ($line) = @_;

    # "#name" opens a new section and discards whatever it held before.
    if ($line =~ /^#([\w-]+)/) {
        $section        = $1;
        $test{$section} = '';
        $flushed        = 0;
        return;
    }

    # Text in front of the first header belongs to no section.
    return unless defined $section;

    # Anything but a blank line inside "document" is section content.
    unless ($section eq 'document' and $line =~ /^$/) {
        $test{$section} .= $line;
        $flushed = 0;
        return;
    }

    # A blank line in "document" completes the test.  Do not handle the same
    # test twice when several terminators follow each other (consecutive
    # blank lines, or a trailing blank line plus the end-of-input marker).
    return if $flushed;
    $flushed = 1;

    if ($test{'document-fragment'} or not $test{data}) {
        %test = ();
        return;
    }

    # Work on a copy so a later re-flush of the same test sees the same data.
    my $data = $test{data};
    chomp $data;

    my $tokens = TokeniserImpl::run($data);
    my @errors = split /\n/, (defined $test{errors} ? $test{errors} : '');

    return if $data =~ $SKIPPED_MARKUP;    # XXX
    return if $data eq '<!DOCTYPE HTML><frameset>test';

    push @out, { input => [@$tokens], errors => \@errors, output => $test{document} };
    return;
}

open my $in, '<', $test_file or die "Cannot open $test_file: $!\n";
while (defined(my $line = <$in>)) {
    consume_line($line);
}
close $in;

# The last test is usually not followed by a blank line; flush it explicitly.
consume_line('');

# NOTE(review): assumes a JSON module providing objToJson() has been loaded
# (presumably by tokeniser.pl) -- confirm.
print JSON::objToJson(\@out);