#!/usr/bin/perl

# Reads the tree-construction section of the HTML5 spec
# (section-tree-construction.html), translates the prose of every insertion
# mode into OCaml data (token matches paired with command lists) printed on
# STDOUT, and writes an annotated copy of the spec to treeconstructor.xhtml
# in which every translated <dt>/<dd> carries a coloured <div> showing the
# translation (#ddd = fully translated, #faa = contains an XXX placeholder).

use strict;
use warnings;

use XML::Twig;

# Convert a spec insertion-mode name into the constructor name used in the
# generated code: upper-case the first character, then replace every
# "space + lower-case letter" with the upper-cased letter
# (e.g. "in table body" -> "InTableBody").
sub to_modename {
    my ($mode) = @_;
    my $name = ucfirst $mode;
    $name =~ s/ ([a-z])/\U$1/g;
    return $name;
}

# Shared handler for the three "If <cond>, <then> ... Otherwise <else>"
# patterns in @commands.  Like every handler in that table it is called by
# parse_cmds right after the pattern matched and reads that match's capture
# groups: $1 condition text, $2 "then" commands, $3 the optional
# " (fragment case)" marker, $4/$5/$6 whichever "otherwise" form matched.
my $if_otherwise = sub {
    my ($cond, $then, $is_fragment, $else) = ($1, $2, $3, $4 || $5 || $6);
    my @then_cmds = (($is_fragment ? ('AssertFragmentCase') : ()), parse_cmds($then));
    my @else_cmds = parse_cmds($else);
    return 'If (' . parse_expr($cond) . ', [ ' . join('; ', @then_cmds) . ' ], [ ' . join('; ', @else_cmds) . ' ])';
};

# Translation table used by parse_cmds.  Each entry is [ pattern, output ]:
#   * pattern is either a literal string (matched case-insensitively) or a
#     qr// object; both are tried at the start of the remaining text, after
#     optional whitespace;
#   * output is the generated command text for a literal pattern (an empty
#     string emits nothing), or a sub returning that text for a qr// pattern.
# '~' in a pattern stands for the paragraph separator that print_algorithm
# inserts between the <p>/<ol> children of a <dd>.
# Entries are tried in order and the first match wins, so more specific
# sentences must stay above the generic ones.
my @commands = (
    ['Insert the token\'s character into the current node.', 'AppendCharacterTokensToCurrentNode'],
    ['Insert the character into the current node.', 'AppendCharacterTokensToCurrentNode'],
    ['Append a Comment node to the current node with the data attribute set to the data given in the comment token.', 'AppendCommentTokenToCurrentNode'],
    ['Append a Comment node to the Document object with the data attribute set to the data given in the comment token.', 'AppendCommentTokenToDocument'],
    ['Append a Comment node to the first element in the stack of open elements (the html element), with the data attribute set to the data given in the comment token.', 'AppendCommentTokenToHTMLElement'],
    [qr/Process the token as if the insertion mode was "in body", with the following exception:.*/, sub { 'ReprocessWithFosteringAsIf InBody' }],
    #[qr/Process the token as (?:it would be processed )?if the insertion mode (?:had been|was) "([^"]+)"./, sub { my $t = ucfirst $1; $t =~ s/ ([a-z])/\U$1/g; "ReprocessAsIf $t" }],
    [qr/Process the token using the rules for the "([^"]+)" insertion mode\./, sub { 'ReprocessAsIf '.to_modename($1) }],
    ['Set the insertion mode to "in body" and reprocess the token.', 'SetInsertionMode InBody; ReprocessCurrentToken'],
    [qr/Switch the insertion mode to "([^"]+)", then reprocess the current token\./, sub { 'SetInsertionMode '.to_modename($1).'; ReprocessCurrentToken' }],
    ['Act as if a start tag token with the tag name "head" and no attributes had been seen, then reprocess the current token.', 'ActAsIfStartTag "head"; ReprocessCurrentToken'],
    ['Act as described in the "anything else" entry below.', 'ActAsIfAnythingElse'],
    ['Change the token\'s tag name to "img" and reprocess it. (Don\'t ask.)', 'ActAsIfStartTagName "img"'],
    ['If the stack of open elements has a p element in scope, then act as if an end tag with the tag name p had been seen.', 'If (StackHasElementInScope [Const "p"], [ ActAsIfEndTag "p" ], [ ])' ],
    # ['If the stack of open elements has in scope an element whose tag name is one of "h1", "h2", "h3", "h4", "h5", or "h6", then generate implied end tags.',
    # 'If (StackHasElementInScope [Const "h1"; Const "h2"; Const "h3"; Const "h4"; Const "h5"; Const "h6"], [ GenerateImpliedEndTags [] ], [ ])'],
    ['If the stack of open elements has a p element in scope, then generate implied end tags, except for p elements.', 'If (StackHasElementInScope [Const "p"], [ GenerateImpliedEndTags [Const "p"] ], [ ])'],
    ['If the stack of open elements has an element in scope whose tag name matches the tag name of the token, then generate implied end tags, except for elements with the same tag name as the token.', 'If (StackHasElementInScope [TagName], [ GenerateImpliedEndTags [TagName] ], [ ])'],
    ['If the stack of open elements has a p element in scope, then pop elements from this stack until the stack no longer has a p element in scope.~Otherwise, act as if a start tag with the tag name p had been seen, then reprocess the current token.', 'If (StackHasElementInScope [Const "p"], [ PopElementsFromStackUntilNo ["p"] ], [ ActAsIfStartTag "p"; ReprocessCurrentToken ])'],
    ['If the stack of open elements has an element in scope whose tag name matches the tag name of the token, then pop elements from this stack until an element with that tag name has been popped from the stack.', 'If (StackHasElementInScope [TagName], [ PopElementsFromStackUntilOneOf [TagName] ], [ ])'],
    # ['If the stack of open elements has in scope an element whose tag name is one of "h1", "h2", "h3", "h4", "h5", or "h6", then pop elements from the stack until an element with one of those tag names has been popped from the stack.',
    # 'If (StackHasElementInScope [Const "h1"; Const "h2"; Const "h3"; Const "h4"; Const "h5"; Const "h6"], [ PopElementsFromStackUntilOneOf [Const "h1"; Const "h2"; Const "h3"; Const "h4"; Const "h5"; Const "h6"] ], [ ])'],
    ['If the stack of open elements has a nobr element in scope, then this is a parse error. Act as if an end tag with the tag name nobr had been seen, then once again reconstruct the active formatting elements, if any.', 'If (StackHasElementInScope [Const "nobr"], [ ParseError "stack has nobr in scope"; ActAsIfEndTag "nobr"; ReconstructActiveFormattingElements ], [ ])' ],
    ['Pop elements from the stack of open elements until an element with the same tag name as the token has been popped from the stack.', 'PopElementsFromStackUntilOneOf [TagName]'],
    ['Pop elements from the stack of open elements until an element whose tag name is one of "h1", "h2", "h3", "h4", "h5", or "h6" has been popped from the stack.', 'PopElementsFromStackUntilOneOf [Const "h1"; Const "h2"; Const "h3"; Const "h4"; Const "h5"; Const "h6"]'],
    ['Now, if the stack of open elements has an element in scope whose tag name matches the tag name of the token, then pop elements from the stack until that element has been popped from the stack, and clear the list of active formatting elements up to the last marker.', 'If (StackHasElementInScope [TagName], [ PopElementsFromStackUntilOneOf [TagName]; ClearActiveListUpToMarker ], [ ])'],
    ['Clear the list of active formatting elements up to the last marker.', 'ClearActiveListUpToMarker'],
    ['If the next token is a U+000A LINE FEED (LF) character token, then ignore that token and move on to the next one. (Newlines at the start of pre blocks are ignored as an authoring convenience.)', 'IgnoreNextTokenIfLinefeed'],
    ['Now, if the current node is not an element with the same tag name as that of the token, then this is a parse error.~Otherwise, if the current node is an element with the same tag name as that of the token pop that element from the stack.~In any case,', 'If (CurrentNodeDoesNotHaveName TagName, [ ParseError "current node has wrong name" ], [ PopCurrentNodeFromStack ])'],
    [qr/(?:Now, )?if the current node is not an element with the same tag name as (that of )?the token, then this is a parse error\./i, sub { 'If (CurrentNodeDoesNotHaveName TagName, [ ParseError "current node has wrong name" ], [ ])' }],
    ['If the current node is not a p element, then this is a parse error.', 'If (CurrentNodeDoesNotHaveName (Const "p"), [ ParseError "current node is not p" ], [ ])'],
    [qr/If the current node is an (option|optgroup) element, act as if an end tag with the tag name "(option|optgroup)" had been seen\./, sub { 'If (CurrentNodeHasName (Const "'.$1.'"), [ ActAsIfEndTag "'.$2.'" ], [])' }],
    ['First, if the current node is an option element, and the node immediately before it in the stack of open elements is an optgroup element, then act as if an end tag with the tag name "option" had been seen.', 'If (CurrentNodeHasName (Const "option"), [ If (NodeBeforeCurrentHasName (Const "optgroup"), [ ActAsIfEndTag "option" ], []) ], [])'],
    ['If the list of active formatting elements contains an element whose tag name is "a" between the end of the list and the last marker on the list (or the start of the list if there is no marker on the list), then this is a parse error; act as if an end tag with the tag name "a" had been seen, then remove that element from the list of active formatting elements and the stack of open elements if the end tag didn\'t already remove it (it might not have if the element is not in table scope).', 'If (ListOfActiveContainsA, [ ParseError "list of active formatting elements contains a"; ActAsIfEndTag "a"; RemoveThatAElementIfNecessary ], [ ])'],
    ['Parse error.', 'ParseError "?"'],
    ['this is a parse error;', 'ParseError "?"'],
    ['this is a parse error,', 'ParseError "?"'],
    ['this is a parse error.', 'ParseError "?"'],
    ['Ignore the token.', 'IgnoreToken'],
    ['Ignore the end tag token.', 'IgnoreToken'],
    ['ignore the token with a parse error.', 'ParseError "?"; IgnoreToken'],
    ['Ignore the token', 'IgnoreToken'],
    ['Otherwise, ignore the token.', 'IgnoreToken'], # XXX - this 'otherwise' bit is wrong
    ['this is a parse error and the token must be ignored.', 'ParseError "?"; IgnoreToken'],
    ['Stop parsing.', 'StopParsing'],
    ['Generate implied end tags.', 'GenerateImpliedEndTags []'],
    ['generate implied end tags, except for elements with the same tag name as the token.', 'GenerateImpliedEndTags [TagName]'],
    ['generate implied end tags', 'GenerateImpliedEndTags []'],
    ['set the form element pointer to null.', 'SetFormElementPointerToNull'],
    ['Reconstruct the active formatting elements, if any.', 'ReconstructActiveFormattingElements'],
    ['Insert a marker at the end of the list of active formatting elements.', 'AddMarkerToActiveList'],
    ['Immediately pop the current node off the stack of open elements.', 'PopCurrentNodeFromStack'],
    ['pop that node from the stack of open elements.', 'PopCurrentNodeFromStack'],
    ['Pop that input element off the stack of open elements.', 'PopCurrentNodeFromStack'],
    [qr/Pop the current node (?:\(which will be [^)]+\) )?(off|from) the stack of open elements(; the new current node will be a head element)?\./i, sub { 'PopCurrentNodeFromStack' } ],
    ['Push the node pointed to by the head element pointer onto the stack of open elements.', 'PushHeadElementPointerOntoStack'],
    ['pop elements from this stack until an element with that tag name has been popped from the stack', 'PopElementsFromStackUntilOneOf [TagName]'],
    ['Pop elements from this stack until an element with the same tag name as the token has been popped from the stack.', 'PopElementsFromStackUntilOneOf [TagName]'],
    ['Pop elements from the stack of open elements until a select element has been popped from the stack.', 'PopElementsFromStackUntilOneOf [Const "select"]'],
    ['Pop elements from this stack until a table element has been popped from the stack.', 'PopElementsFromStackUntilOneOf [Const "table"]'],
    ['Pop elements from this stack until a caption element has been popped from the stack.', 'PopElementsFromStackUntilOneOf [Const "caption"]'],
    ['Insert an HTML element for the token. Add that element to the list of active formatting elements.', 'InsertElementAndAddToListOfActive' ],
    ['Insert an HTML element for the token, and set the form element pointer to point to the element created.', 'InsertElementAndSetFormElementPointer'],
    [qr/Insert an? \S+ element for the token(\.|, then)/, sub { 'InsertElement' }],
    ['Create an element for the token.~Set the head element pointer to this new element node.~Append the new element to the current node and push it onto the stack of open elements.', 'InsertElementAndSetHeadElementPointer'],
    ['for each attribute on the token, check to see if the attribute is already present on the body element (the second element) on the stack of open elements. If it is not, add the attribute and its corresponding value to that element.', 'MergeAttributesIntoBodyElement'],
    ['Switch the content model flag to the PLAINTEXT state.', 'SetContentModelFlag PLAINTEXT'],
    # Uses to_modename like the other mode-switching handlers instead of
    # repeating its body inline.
    [qr/(?:Change|Switch) the insertion mode to "([^"]+)"\./i, sub { 'SetInsertionMode '.to_modename($1) }],
    ['Reset the insertion mode appropriately.', 'ResetInsertionModeAppropriately'],
    ['Run the following algorithm:~Initialise node to be the current node (the bottommost node of the stack). If node is an li element, then pop all the nodes from the current node up to node, including node, then stop this algorithm. If more than one node is popped, then this is a parse error. If node is not in the formatting category, and is not in the phrasing category, and is not an address or div element, then stop this algorithm. Otherwise, set node to the previous entry in the stack of open elements and return to step 2. ~Finally, insert an HTML element for the token.', 'FixupForListElement ["li"]; InsertElement'],
    ['Run the following algorithm:~Initialise node to be the current node (the bottommost node of the stack). If node is a dd or dt element, then pop all the nodes from the current node up to node, including node, then stop this algorithm. If more than one node is popped, then this is a parse error. If node is not in the formatting category, and is not in the phrasing category, and is not an address or div element, then stop this algorithm. Otherwise, set node to the previous entry in the stack of open elements and return to step 2. ~Finally, insert an HTML element for the token.', 'FixupForListElement ["dd"; "dt"]; InsertElement'],
    ['Run the following algorithm:~Initialise node to be the current node (the bottommost node of the stack). If node has the same tag name as the end tag token, then:Generate implied end tags. If the tag name of the end tag token does not match the tag name of the current node, this is a parse error. Pop all the nodes from the current node up to node, including node, then stop this algorithm. Otherwise, if node is in neither the formatting category nor the phrasing category, then this is a parse error. Stop this algorithm. The end tag token is ignored. Set node to the previous entry in the stack of open elements. Return to step 2.', 'ApplyEndTag'],
    [qr/Follow these steps:~Let the formatting element be the last element in the list of active formatting elements .*/, sub { 'AdoptionAgency' }],
    ['If the token has an attribute called "action", set the action attribute on the resulting form element to the value of the "action" attribute of the token.', 'CopyActionAttributeToForm' ],
    [qr/If the form element pointer is not null, then associate the (\w+) element with the form element pointed to by the form element pointer\./, sub { 'If (FormElementPointerIsNotNull, [ AssociateCurrentNodeWithFormElementPointer ], [])' } ],
    ['Follow the generic RCDATA parsing algorithm.', 'GenericRCDATA'],
    ['Follow the generic CDATA parsing algorithm.', 'GenericCDATA'],
    [qr/Create an element for the token.~Mark the element as being "parser-inserted". .*/, # XXX: assumes scripting isn't supported
        sub { 'GenericCDATA' } ],
    ['Set the document to quirks mode.', 'SetCompatMode QuirksMode'],
    ['Create an element for the token. Append it to the Document object. Put this element in the stack of open elements.~If the token has an attribute "manifest", then run the application cache selection algorithm with the value of that attribute as the manifest URI. Otherwise, run the application cache selection algorithm with no manifest.', 'InsertHTMLElement'], # TODO: handle
    ['Create an HTMLElement node with the tag name html, in the HTML namespace. Append it to the Document object. Put this element in the stack of open elements.~Run the application cache selection algorithm with no manifest.', 'InsertHTMLElement'], # XXX
    ['If this start tag token was not the first start tag token, then it is a parse error.', 'If (NotFirstStartTagToken, [ ParseError "not the first start tag token" ], [])'],
    ['For each attribute on the token, check to see if the attribute is already present on the top element of the stack of open elements. If it is not, add the attribute and its corresponding value to that element.', 'MergeAttributesIntoHTMLElement'],
    ['If the parser was not originally created as part of the HTML fragment parsing algorithm (fragment case), and the current node is no longer a frameset element, then change the insertion mode to "after frameset".', 'If (ParsingFragment, [ ], [ If (CurrentNodeDoesNotHaveName (Const "frameset"), [ SetInsertionMode AfterFrameset ], []) ])'],
    ['If the element has a charset attribute, and its value is a supported encoding, and the confidence is currently tentative, then change the encoding to the encoding given by the value of the charset attribute.~Otherwise, if the element has a content attribute, and applying the algorithm for extracting an encoding from a Content-Type to its value returns a supported encoding encoding, and the confidence is currently tentative, then change the encoding to the encoding encoding.', 'CharsetEncodingStuff'],
    ['If the current node is not the root html element, then this is a parse error..', 'If (CurrentNodeDoesNotHaveName (Const "html"), [ ParseError "current node is not root html element" ], [ ])'], # XXX this doesn't require it to be root
    [qr/run these steps:(.*?)(~|$)/, sub { join '; ', parse_cmds($1) }],
    [qr/If the DOCTYPE token's name .*Then, switch the insertion mode to "before html"\./, sub { 'DoDoctypeStuff; SetInsertionMode BeforeHtml' }],
    # The next three patterns differ only in how the "then" part is
    # recognised; they all build their output with $if_otherwise.
    [qr/If ([^\.]+?), then ([^~]+?)( \(fragment case\))?[ ~]Otherwise(?:, run these steps:~(.+)|:~(.+)|,([^~]+))/, $if_otherwise],
    [qr/If ([^\.]+?), (this is a parse error[^~]+?)( \(fragment case\))?~Otherwise(?:, run these steps:~(.+)|:~(.+)|,([^~]+))/i, $if_otherwise],
    [qr/If ([^\.]+?), ([^~]+?\.)( \(fragment case\))?~Otherwise(?:, run these steps:~(.+)|:~(.+)|,([^~]+))/, $if_otherwise],
    [qr/If there is a node in the stack of open elements that is not either (((a|an|the|or the) \S+ element, )+)then this is a parse error\./i, sub { my @els = ($1 =~ / (\S+) element/g); 'If (NodeInStackIsNot [' . (join '; ', map qq{"$_"}, @els) . '], [ ParseError "stack contains a bad node" ], [ ])' }],
    # [qr/If the form element pointer is not null, ignore the token with a parse error\.~Otherwise:~([^~]+)/,
    # sub { 'If (FormElementPointerIsNotNull, [ ParseError "form element pointer is not null"; IgnoreToken ], [ ' . (join '; ', parse_cmds($1)) . ' ])' }],
    [qr/If the stack of open elements has (?:in scope an element|an element in scope) (?:with the same tag name as that|whose tag name is the same as the tag name) of the token, then ([^\.]+)\./, sub { 'If (StackHasElementInScope [TagName], [ ' . (join '; ', parse_cmds($1)) . ' ], [ ])' }],
    #[qr/If the form element pointer is not null, then ignore the token\. Otherwise: (.+)/,
    # sub { 'If (FormElementPointerIsNotNull, [ IgnoreToken ], [ ' . (join '; ', parse_cmds($1)) . ' ])' }],
    [qr/Act as if a start tag token with the tag name "(\S+)" had been seen\./, sub { 'ActAsIfStartTag "' . $1 . '"' }],
    [qr/Act as if the token had been an end tag with the tag name "(\S+)" instead\./, sub { 'ActAsIfEndTag "' . $1 . '"' }],
    [qr/Act as if an? (start|end) tag (?:token )?with the tag name "(\S+)" (?:and no attributes )?had been seen(?:, and then|, and| and|, then) reprocess the (?:current )?token\./i, sub { "ActAsIf\u$1Tag \"$2\"; ReprocessCurrentToken" }],
    [qr/Act as if an end tag (?:token )?with (?:the )?tag name "(\S+)" had been seen, (?:and )?then, if that token wasn't ignored, reprocess the current token\./, sub { 'ActAsIfEndTag "' . $1 . '"; If (GeneratedTokenWasNotIgnored, [ ReprocessCurrentToken ], [ ])' }],
    ['Act as if an end tag with the same tag name as the current node ("tbody", "tfoot", or "thead") had been seen, then reprocess the current token.', 'PopCurrentNodeFromStack; SetInsertionMode InTable; ReprocessCurrentToken'], # XXX this is totally different (but I think still valid)
    ['Clear the stack back to a table context. (See below.)', 'ClearStackToContext ["table"; "html"]'],
    ['Clear the stack back to a table body context. (See below.)', 'ClearStackToContext ["tbody"; "thead"; "tfoot"; "html"]'],
    ['Clear the stack back to a table row context. (See below.)', 'ClearStackToContext ["tr"; "html"]'],
    ['close the cell (see below) and reprocess the current token.', 'If (StackHasElementInTableScope [Const "td"], [ ActAsIfEndTag "td" ], [ ActAsIfEndTag "th" ]); ReprocessCurrentToken'],
    ['(The current node will be a tr element at this point.)', ''],
    [qr/Now, if the current node is not a (caption|table) element, then this is a parse error\./, sub { 'If (CurrentNodeDoesNotHaveName (Const "'.$1.'"), [ ParseError "current node is not '.$1.'" ], [])' }],
);

# Translation table used by parse_expr: [ condition text, generated condition ].
# The condition text must match the whole (whitespace-trimmed) input exactly.
my @exprs = (
    ['the second element on the stack of open elements is not a body element, or, if the stack of open elements has only one node on it', 'NotSecondElementIsBody'],
    #['the second element in the stack of open elements is not a body element', 'NotSecondElementIsBody'], # XXX: why does this differ from above?
    ['the stack of open elements does not have a body element in scope', 'StackNotHasElementInScope [Const "body"]'],
    ['the form element pointer is not null', 'FormElementPointerIsNotNull'],
    ['the stack of open elements has a button element in scope', 'StackHasElementInScope [Const "button"]'],
    ['the stack of open elements does not have an element in table scope with the same tag name as the token', 'StackNotHasElementInTableScope [TagName]'], # TODO: optimise this to 'select'
    ['the stack of open elements has an element in table scope with the same tag name as that of the token', 'StackHasElementInTableScope [TagName]'],
    ['the stack of open elements does not have an element in table scope with the same tag name as that of the token', 'StackNotHasElementInTableScope [TagName]'],
    ['the stack of open elements does not have an element in scope with the same tag name as that of the token', 'StackNotHasElementInScope [TagName]'],
    ['the stack of open elements does not have an element in table scope with the same tag name as that of the token (which can only happen for "tbody", "tfoot" and "thead", or, in the fragment case)', 'StackNotHasElementInTableScope [TagName]'],
    ['the stack of open elements does not have a tbody, thead, or tfoot element in table scope', 'StackNotHasElementInTableScope [Const "tbody"; Const "thead"; Const "tfoot"]'],
    ['the stack of open elements does not have a td or th element in table scope', 'StackNotHasElementInTableScope [Const "td"; Const "th"]'],
    # The condition text says "in scope", not "in table scope", so this emits
    # the InScope test like every other "in scope" entry of this table.
    ['the stack of open elements does not have an element in scope whose tag name is one of "h1", "h2", "h3", "h4", "h5", or "h6"', 'StackNotHasElementInScope [Const "h1"; Const "h2"; Const "h3"; Const "h4"; Const "h5"; Const "h6"]'],
    ['the current node is an option element', 'CurrentNodeHasName (Const "option")'],
    ['the current node is an optgroup element', 'CurrentNodeHasName (Const "optgroup")'],
    ['the current node is the root html element', 'CurrentNodeHasName (Const "html")'], # XXX this doesn't require it to be root
    ['the parser was originally created as part of the HTML fragment parsing algorithm', 'ParsingFragment'],
    # NOTE(review): the opening quote before `in table` is missing; presumably
    # this mirrors the spec text being matched -- confirm before "fixing" it,
    # because parse_expr only matches the exact string.
    ['the insertion mode is one of in table", "in caption", "in column group", "in table body", "in row", or "in cell"', 'InsertionModeIs [InTable; InCaption; InColumnGroup; InTableBody; InRow; InCell]'],
);

# Translate a run of spec prose into a list of generated command strings.
#
# Repeatedly strips leading '~' separators and then the first @commands
# pattern that matches at the start of the remaining text, collecting each
# entry's output.  Stops when only spaces/'~' remain.  If no pattern matches,
# the untranslated remainder is appended as "XXX(<text>)" and parsing stops.
#
# Handlers for qr// patterns are invoked with no arguments from inside the
# block that performed the substitution, so they can read $1, $2, ... of that
# match; keep the call in that block.
sub parse_cmds {
    my ($text) = @_;
    my $ok = 1;
    my @cmds;
    while ($text =~ /[^ ~]/) {
        $text =~ s/^~+//;
        my $done = 0;
        for my $cmd (@commands) {
            if (ref $cmd->[0] eq 'Regexp') {
                if ($text =~ s/^\s*$cmd->[0]//) {
                    push @cmds, $cmd->[1]();
                    $done = 1;
                    last;
                }
            }
            elsif ($text =~ s/^\s*\Q$cmd->[0]\E//i) {
                # An empty output means "recognised, emits nothing".
                push @cmds, $cmd->[1] if $cmd->[1];
                $done = 1;
                last;
            }
        }
        if (not $done) {
            $ok = 0;
            last;
        }
    }
    push @cmds, "XXX($text)" if not $ok;
    return @cmds;
}

# Translate the condition text of an "If ..." sentence via @exprs.
# Returns the generated condition, or "XXX(<text>)" when the text is unknown.
sub parse_expr {
    my ($text) = @_;
    for my $expr (@exprs) {
        if ($text =~ /^\s*\Q$expr->[0]\E\s*$/) {
            return $expr->[1];
        }
    }
    return "XXX($text)";
}

my $twig = XML::Twig->new;
$twig->parsefile('section-tree-construction.html');

# Collect [ mode constructor name, first <dt> of the mode's <dl> ] for every
# insertion-mode heading, walking from the element with id "the-initial"
# to each following <h5> until a heading no longer names an insertion mode.
my @modes;
my $mode_h = $twig->elt_id('the-initial');
while ($mode_h) {
    (my $t = $mode_h->text) =~ s/\s+/ /g;
    last unless $t =~ /The "([^"]+)" insertion mode/;
    my $mode_title = $1;
    warn "$mode_title\n";
    my $dl = $mode_h->next_sibling('dl')
        or die "no <dl> follows the heading of the \"$mode_title\" insertion mode\n";
    my $first_dt = $dl->first_child('dt')
        or die "the <dl> of the \"$mode_title\" insertion mode has no <dt>\n";
    push @modes, [ to_modename($mode_title), $first_dt ];
    do { $mode_h = $mode_h->next_sibling } while $mode_h and $mode_h->tag ne 'h5';
}

# NOTE(review): the span between the two markers below is damaged in the file
# as reviewed.  It held two heredocs (one printed, one assigned to $commands)
# and the declaration of @mode_names, but the heredoc openers and part of the
# text are missing.  The remains are kept exactly as found; they are NOT valid
# Perl and must be restored from an intact copy of this script.
# ---- damaged span begins ----
print < "Doctype"
 | EndOfFile -> "EOF"
 | Character -> "Character"
 | CharacterIn cs -> "Character [" ^ (String.concat ", " (List.map (Format.sprintf "%d") cs)) ^ "]"
 | CharacterNotIn cs -> "Character not [" ^ (String.concat ", " (List.map (Format.sprintf "%d") cs)) ^ "]"
 | Comment -> "Comment"
 | StartTag ts -> "Start tag [" ^ (String.concat ", " ts) ^ "]"
 | EndTag ts -> "End tag [" ^ (String.concat ", " ts) ^ "]"
 | AnyStartTag -> "Start tag"
 | AnyEndTag -> "End tag"
 | Anything -> "Anything else"

type condition =
 | CurrentNodeDoesNotHaveName of string_value
 | CurrentNodeHasName of string_value
 | FormElementPointerIsNotNull
 | GeneratedTokenWasNotIgnored
 | InsertionModeIs of insertion_mode list
 | ListOfActiveContainsA
 | MoreThanOneNodeAndSecondNotBody
 | MoreThanTwoNodesOrSecondNotBody
 | NodeBeforeCurrentHasName of string_value
 | NodeInStackIsNot of string list
 | NotFirstStartTagToken
 | NotSecondElementIsBody
 | ParsingFragment
 | StackHasElementInScope of string_value list
 | StackHasElementInTableScope of string_value list
 | StackNotHasElementInScope of string_value list
 | StackNotHasElementInTableScope of string_value list
EOT

my $commands = <[0], grep { $_->[0] ne 'common' } @modes;
# ---- damaged span ends ----

# Emit the insertion_mode type, the list of all modes and its printer.
print "and insertion_mode =\n", (map " | $_\n", @mode_names), " | VirtualMode of (token_match * command list) list\n\n";

print "let insertionModes = [\n", (map " $_;\n", @mode_names), "]\n\n";

print "let string_of_insertion_mode = function\n", (map " | $_ -> \"$_\"\n", @mode_names), " | VirtualMode _ -> \"VirtualMode\"\n\n";

# Derive string_of_command from the constructor lines ("| Name" or
# "| Name of ...") found in the text held by $commands.
print "let string_of_command = function\n";
while ($commands =~ /^ \| (\S+)( of)?/mg) {
    print " | $1 " . ($2 ? '_ ' : '') . "-> \"$1\"\n";
}
print "\n";

# print "let string_of_token_match = function\n";
# while ($token_matches =~ /^ \| (\S+)( of)?/mg) {
# print " | $1 " . ($2 ? '_ ' : '') . "-> \"$1\"\n";
# }
# print "\n";

# Print the "( token match, command list );" entries of one insertion mode
# and annotate the spec tree with the translation.
#
#   $name - mode name; used only in error messages
#   $dt   - first <dt> of the mode's <dl>
#
# The <dl> is consumed as groups of one or more <dt> followed by one <dd>.
# Every <dt> is turned into zero or more token matches, the <dd>'s <p>/<ol>
# children (those without a class attribute) are joined with '~' and handed
# to parse_cmds, and one entry is printed per recognised token match.  A
# <div> showing the result is appended to each <dt> and <dd>.
# Dies if the <dt>/<dd> alternation is broken.
sub print_algorithm {
    my ($name, $dt) = @_;
    while ($dt) {
        # Gather the run of consecutive <dt> elements that share one <dd>.
        my @dts;
        my $e = $dt;
        do {
            push @dts, $e;
            $e = $e->next_sibling;
        } while $e and $e->tag eq 'dt';

        my $dd = $e;
        die "print_algorithm($name): <dt> group is not followed by a <dd>\n"
            unless $dd and $dd->tag eq 'dd';
        $dt = $dd->next_sibling;
        die "print_algorithm($name): <dd> is followed by <" . $dt->tag . "> instead of <dt>\n"
            if $dt and $dt->tag ne 'dt';

        # $commands here is local to this iteration and unrelated to the
        # file-level $commands.
        my (@conds, $commands);
        for my $term (@dts) {
            my $text = $term->text;
            $text =~ s/\s+/ /g;
            $text =~ s/^\s+//;
            $text =~ s/\s+$//;

            my $ok = 1;
            my @c;
            if ($text =~ /^A (character|comment) token$/) {
                push @c, ucfirst $1;
            }
            elsif ($text =~ /^An end-of-file token$/) {
                push @c, 'EndOfFile';
            }
            elsif ($text =~ /^A DOCTYPE token$/) {
                push @c, 'Doctype';
            }
            elsif ($text =~ /^A character token that is one of one of (.+)$/) { # TODO: spec typo
                my $list = $1;
                my @cs = ($list =~ /U\+([A-F0-9]{4})/g);
                push @c, 'CharacterIn [' . join('; ', map "0x$_", @cs) . ']';
            }
            elsif ($text =~ /^A character token that is not one of (.+)$/) {
                my $list = $1;
                my @cs = ($list =~ /U\+([A-F0-9]{4})/g);
                push @c, 'CharacterNotIn [' . join('; ', map "0x$_", @cs) . ']';
            }
            elsif ($text =~ /^An? (start|end) tag (?:token )?whose tag name is ("[^"]+")$/) {
                push @c, ucfirst($1) . 'Tag [' . join(';', split(',', $2)) . ']';
            }
            elsif ($text =~ /^An? (start|end) tag (?:token )?whose tag name is one of: ("[^"]+"(, "[^"]+")+)$/) {
                # Splitting on ',' keeps the space after each comma, so the
                # ';'-joined result reads `"a"; "b"`.
                push @c, ucfirst($1) . 'Tag [' . join(';', split(',', $2)) . ']';
            }
            elsif ($text =~ /^A start or end tag whose tag name is one of: ("[^"]+"(, "[^"]+")+)$/) {
                my $names = join(';', split(',', $1));
                push @c, 'StartTag [' . $names . ']', 'EndTag [' . $names . ']';
            }
            elsif ($text =~ /^An?(?:y other)? (start|end) tag(?: token)?(?: not covered by the previous entries)?$/) {
                push @c, 'Any' . ucfirst($1) . 'Tag';
            }
            elsif ($text =~ /^Anything else$/) {
                push @c, 'Anything';
            }
            else {
                $ok = 0;
                push @c, "XXX($text)";
            }

            $term->insert_new_elt(last_child => div => { style => 'background: #'.($ok ? 'ddd' : 'faa').'; text-indent: 0' }, join '; ', @c);
            push @conds, @c;
        }

        # Flatten the <dd> into one string, with '~' closing each paragraph.
        my $text = '';
        for my $p ($dd->children(sub { $_[0]->tag eq 'p' or $_[0]->tag eq 'ol' })) {
            next if $p->att('class');
            $text .= $p->text . '~';
        }
        $text =~ s/\s+/ /g;
        $text =~ s/^\s+//;
        $text =~ s/\s+$//;

        my @cmds = parse_cmds($text);
        my $code = '[ ' . join('; ', @cmds) . ' ]';
        $dd->insert_new_elt(last_child => div => { style => 'background: #'.($code =~ /XXX/ ? 'faa' : 'ddd').'; text-indent: 0' }, $code);

        # Untranslated command lists are emitted as one TODO command holding
        # the partial translation as an escaped string.
        $commands = $code;
        if ($commands =~ /XXX/) {
            $commands =~ s/"/\\"/g;
            $commands = "[ TODO \"$commands\" ]";
        }

        for my $cond (grep !/XXX/, @conds) {
            # if ($cond eq 'StartTag ["base"; "link"]' and $commands = '[ InsertElement ]') {
            # $commands = '[ InsertElement; PopCurrentNodeFromStack ]'; # XXX - http://canvex.lazyilluminati.com/misc/cgi/issues.cgi/message/%3C0822044E-EA7B-43C4-A3AF-4196B09B580C%40iki.fi%3E
            # } elsif ($cond eq 'StartTag ["meta"]' and $commands eq '[ InsertElement; CharsetEncodingStuff ]') {
            # $commands = '[ InsertElement; PopCurrentNodeFromStack; CharsetEncodingStuff ]'; # XXX - as above
            # }
            print " ( $cond,\n $commands );\n";
        }
    }
}

# print "let common = [\n";
# for my $mode (@modes) {
# my ($mode_name, $mode_dt) = @$mode;
# next unless $mode_name eq 'common';
# print_algorithm($mode_name, $mode_dt);
# }
# print " ]\n\n";

# NOTE(review): the first line of the commented-out heredoc below is damaged
# in the file as reviewed (its opener and beginning are missing); kept as found.
# print < (EndOfFile, [ActAsIfStartTag "head"; ReprocessCurrentToken]) :: (transformHACK_BeforeHead cs)
# | c::cs -> c :: (transformHACK_BeforeHead cs)
# | [] -> []
#
# let rec transformHACK_InHead = function
# | (EndOfFile, cmds)::cs -> (EndOfFile, [ActAsIfEndTag "head"; ReprocessCurrentToken]) :: (transformHACK_InHead cs)
# | c::cs -> c :: (transformHACK_InHead cs)
# | [] -> []
#
# let rec transformHACK_AfterHead = function
# | (EndOfFile, cmds)::cs -> (EndOfFile, [ActAsIfStartTag "body"; ReprocessCurrentToken]) :: (transformHACK_AfterHead cs)
# | c::cs -> c :: (transformHACK_AfterHead cs)
# | [] -> []
#
# EOF

# Emit one match arm per insertion mode, each holding that mode's entries.
print "let insertionModeAlgorithm = function\n";
for my $mode (@modes) {
    my ($mode_name, $mode_dt) = @$mode;
    next if $mode_name eq 'common';
    print " | $mode_name -> ";
    #print "transformHACK_$mode_name " if $mode_name =~ /^(Before|In|After)Head$/;
    #print "common @ " if $mode_name =~ /^[A-Z]/;
    print "[\n";
    print_algorithm($mode_name, $mode_dt);
    print " ]\n\n";
}
print " | VirtualMode m -> m\n\n";

# NOTE(review): the next statement is damaged in the file as reviewed.  It was
# a printed heredoc followed by the open() that creates the handle $o for
# 'treeconstructor.xhtml'; the heredoc and the start of the open() call are
# missing.  Kept exactly as found; NOT valid Perl until restored.
# ---- damaged span begins ----
print <', 'treeconstructor.xhtml' or die $!;
# ---- damaged span ends ----

# Write the annotated spec; buffered write errors only surface at close.
print {$o} $twig->sprint;
close $o or die "close treeconstructor.xhtml: $!";