open Treeconstructor_spec open Map let traceEnabled = false let whitespaceChars = [0x0009; 0x000A; 0x000B; 0x000C; 0x0020] let transformAlgorithm mode = List.map (function (matcher, cmds) -> let rec transformCmds = function | [ActAsIfAnythingElse] -> (* Expand out the "act as described in the 'anything else' entry" blocks *) let (m, c) = List.find (function (Anything, _) -> true | _ -> false) (insertionModeAlgorithm mode) in c | [AppendCharacterTokensToCurrentNode] -> (match matcher with | CharacterIn whitespaceChars -> [AppendCharacterTokenToCurrentNodeAndSetAppendingWSCharacters] | Character -> (* TODO: show there's no other character match before this one *) [AppendCharacterTokenToCurrentNodeAndSetAppendingAllCharacters] | _ -> failwith "AppendCharacterTokensToCurrentNode in unrecognised case") | (ParseError "?")::cs -> (ParseError ((string_of_insertion_mode mode) ^ " - " ^ (string_of_token_match matcher))) :: (transformCmds cs) | (If (c, a, b))::cs -> (If (c, transformCmds a, transformCmds b))::(transformCmds cs) | c::cs -> c::(transformCmds cs) | [] -> [] in (matcher, transformCmds cmds) ) (insertionModeAlgorithm mode) type attribute = string * string type doctype = { doctypename: string; pubid: string option; sysid: string option; correct: bool } type start_tag = { start_tag_name: string; start_tag_attributes: attribute list } type end_tag = { end_tag_name: string } type token = DoctypeToken of doctype | StartTagToken of start_tag | EndTagToken of end_tag | CommentToken of string | CharacterToken of char | EndOfFileToken | ParseErrorToken let string_of_token = function | DoctypeToken _ -> "Doctype" | StartTagToken { start_tag_name = n } -> "<" ^ n ^ " ...>" | EndTagToken { end_tag_name = n } -> "" | CommentToken _ -> "Comment" | CharacterToken c -> "Char " ^ (String.make 1 c) | EndOfFileToken -> "EOF" | ParseErrorToken -> "Parse error" (* Note: Since nodes can be stored in multiple maps and lists and pointers, they are referenced by an id number *) type element_node = { name: string; attributes: (string*string) list; children: int list } (* children in reverse order *) type node = | DocumentNode of int list (* child ids, in reverse order *) | DoctypeNode of string * string * string (* name, publicId, systemId *) | CommentNode of string | TextNode of string | ElementNode of element_node type map_node = { id: int; node: node } module Int = struct type t = int let compare : int -> int -> int = compare end module ElementMap = Map.Make(Int) type state = { startTagsSeen: int; elementId: int; (* id of next element to be created *) elements: map_node ElementMap.t; openStack: int list; activeList: int list; (* -1 is marker *) parseErrors: string list; formElement: int; (* -1 means null *) headElement: int; (* -1 means null *) contextNode: int; (* -1 means null *) insertionMode: insertion_mode; compatMode: compat_mode; ignored: bool; (* true if the last was ignored *) fostering: int; (* nested depth - >= 1 if the foster-parent case is active *) genericData: int; (* -1 if not in 'generic CDATA|RCDATA parsing'; else id of node being inserted *) appendingAllCharacters: int; (* -1 if not in 'append a character' (for all chars); else id of text node being added to *) appendingWSCharacters: int; (* -1 if not in 'append a character' (for whitespace chars); else id of text node being added to *) ignoringNextCharacterIfNewline: bool; } let initialState = { startTagsSeen = 0; elementId = 1; elements = ElementMap.add 0 { id = 0; node = DocumentNode [] } ElementMap.empty; openStack = []; activeList = []; parseErrors = []; formElement = -1; headElement = -1; contextNode = -1; insertionMode = Initial; compatMode = StandardsMode; ignored = false; fostering = 0; genericData = -1; appendingAllCharacters = -1; appendingWSCharacters = -1; ignoringNextCharacterIfNewline = false; } let dumpState state = let dump = Format.sprintf "{ mode=%s; openStack=[%s]; activeList=[%s]; elements=[%s]; foster=%d }" (string_of_insertion_mode state.insertionMode) (String.concat "; " (List.map string_of_int state.openStack)) (String.concat "; " (List.map string_of_int state.activeList)) (String.concat "; " (List.map (fun n -> Format.sprintf "{%d:%s}" n.id (match n.node with | ElementNode e -> Format.sprintf "%s %s" e.name (String.concat ";" (List.map string_of_int e.children)) | TextNode t -> Format.sprintf "\"%s\"" t | _ -> "?") ) (List.rev (ElementMap.fold (fun k v a -> v::a) state.elements [])) )) state.fostering in print_string dump; print_newline () let createNode node state = (state.elementId, { state with elementId = state.elementId + 1; elements = ElementMap.add state.elementId { id = state.elementId; node = node } state.elements; }) let createCommentNode text = createNode (CommentNode text) let createTextNode text = createNode (TextNode text) let createElementNode name attrs = createNode (ElementNode { name = name; attributes = attrs; children = [] }) let createDoctypeNode name pubid sysid = createNode (DoctypeNode (name, pubid, sysid)) let currentNodeId state = match state.openStack with | [] -> failwith "error - stack of open elements is empty" | s::_ -> s let currentOrFosterNodeId state = match state.openStack with | [] -> failwith "error - stack of open elements is empty" | s::_ -> s let currentNodeElement state = match ElementMap.find (currentNodeId state) state.elements with | { node = ElementNode e } -> e | _ -> failwith "error - non-element node on stack of open elements" let currentNode state = match ElementMap.find (currentNodeId state) state.elements with | ({ node = ElementNode e } as current) -> (current, e) | _ -> failwith "error - non-element node on stack of open elements" let elementNode elements id = match ElementMap.find id elements with | { node = ElementNode e } -> e | _ -> failwith "error - non-element node on stack of open elements" let elementName elements id = (elementNode elements id).name let parentElement x elements = ElementMap.fold (fun k -> function | { node = ElementNode ({ children = cs } as e) } as n when List.mem x cs -> fun a -> Some (n, e) | _ -> fun a -> a) elements None let lastChild id elements = match ElementMap.find id elements with | { node = ElementNode { children = c::_ } } -> c | { node = ElementNode _ } -> failwith "no children for node in lastChild" | _ -> failwith "not an element in lastChild" let appendToElementNode id n state = match ElementMap.find id state.elements with | { node = ElementNode e } as context -> if state.fostering > 0 && (context.id = currentNodeId state) && (List.mem e.name ["table";"tbody";"tfoot";"thead";"tr"]) then let rec f = function | x::y::xs when elementName state.elements x = "table" -> (match parentElement x state.elements with | Some (p, e) -> let cs = (* add n into e.children just after x *) let rec f = function c::cs when c = x -> x::n::(f cs) | c::cs -> c::(f cs) | [] -> [] in f e.children in { state with elements = ElementMap.add p.id { p with node = ElementNode { e with children = cs } } state.elements } | None -> failwith "todo") | x::[] -> failwith "todo" | x::xs -> f xs | [] -> failwith "error - open stack is empty" in f state.openStack else { state with elements = ElementMap.add context.id { context with node = ElementNode { e with children = n::e.children } } state.elements } | _ -> failwith "error - non-element node used in appendToElementNode" let appendToCurrent n state = appendToElementNode (currentNodeId state) n state let appendToElementTextNotFostered id text state = match ElementMap.find id state.elements with | { node = ElementNode e } as n -> (match e.children with | c::cs -> (match ElementMap.find c state.elements with | { node = TextNode t } as n -> { state with elements = ElementMap.add n.id { n with node = TextNode (t ^ text) } state.elements } | _ -> let (tid, state) = createTextNode text state in { state with elements = ElementMap.add n.id { n with node = ElementNode { e with children = tid::e.children } } state.elements } ) | cs -> let (tid, state) = createTextNode text state in { state with elements = ElementMap.add n.id { n with node = ElementNode { e with children = tid::e.children } } state.elements } ) | _ -> failwith "error - not a childed element in appendToElementTextNotFostered" let appendCharacterToTextNode id text state = match ElementMap.find id state.elements with | { node = TextNode t } as n -> { state with elements = ElementMap.add id { n with node = TextNode (t ^ text) } state.elements } | _ -> failwith "error - not a text node in appendCharacterToTextNode" let appendCharacterToCurrentNode text state = match ElementMap.find (currentNodeId state) state.elements with | { node = ElementNode e } as context -> if state.fostering > 0 && (context.id = currentNodeId state) && (List.mem e.name ["table";"tbody";"tfoot";"thead";"tr"]) then let rec f = function | x::y::xs when elementName state.elements x = "table" -> (match parentElement x state.elements with | Some (p, e) -> let (tid, cs, state) = (* add n into e.children just after x *) let rec f = function | c::d::cs when c = x -> (match ElementMap.find d state.elements with | { node = TextNode t } as n -> (d, c::d::cs, { state with elements = ElementMap.add d { n with node = TextNode (t ^ text) } state.elements }) | _ -> let (tid, state) = createTextNode text state in (tid, c::tid::d::cs, state) ) | c::cs when c = x -> let (tid, state) = createTextNode text state in (tid, c::tid::cs, state) | c::cs -> let (tid, cs', state) = f cs in (tid, c::cs', state) | [] -> failwith "error - can't find table in its parent" in f e.children in tid, { state with elements = ElementMap.add p.id { p with node = ElementNode { e with children = cs } } state.elements } | None -> failwith "todo") | x::[] -> failwith "todo" | x::xs -> f xs | [] -> failwith "error - open stack is empty" in f state.openStack else (match e.children with | c::cs -> (match ElementMap.find c state.elements with | { node = TextNode t } as n -> c, { state with elements = ElementMap.add c { n with node = TextNode (t ^ text) } state.elements } | _ -> let (tid, state) = createTextNode text state in tid, { state with elements = ElementMap.add context.id { context with node = ElementNode { e with children = tid::e.children } } state.elements } ) | [] -> let (tid, state) = createTextNode text state in tid, { state with elements = ElementMap.add context.id { context with node = ElementNode { e with children = tid::e.children } } state.elements } ) | _ -> failwith "error - current node is not an element" let setElementChildren id cs state = match ElementMap.find id state.elements with | { node = ElementNode e } as n -> { state with elements = ElementMap.add n.id { n with node = ElementNode { e with children = cs } } state.elements } | _ -> failwith "error - non-element node used in setElementChildren" let pushToOpenStack n state = { state with openStack = n::state.openStack } let popOpenStack state = { state with openStack = List.tl state.openStack } let addToActiveList n state = { state with activeList = n::state.activeList } let addParseError e state = { state with parseErrors = e::state.parseErrors } let setFormElementPointer id state = { state with formElement = id } let setHeadElementPointer id state = { state with headElement = id } let setContextNode id state = { state with contextNode = id } let setInsertionMode mode state = { state with insertionMode = mode } let resetInsertionModeAppropriately state = let rec f = function n::ns -> (match elementName state.elements n with (* XXX: fragment case *) | "select" -> InSelect | "td" -> InCell | "th" -> InCell | "tr" -> InRow | "tbody" -> InTableBody | "thead" -> InTableBody | "tfoot" -> InTableBody | "caption" -> InCaption | "colgroup" -> InColumnGroup | "table" -> InTable | "head" -> InBody | "body" -> InBody | "frameset" -> InFrameset | "html" -> if state.headElement = -1 then BeforeHead else AfterHead | _ -> f ns) | [] -> InBody in setInsertionMode (f state.openStack) state let appendToDocument id state = match ElementMap.find 0 state.elements with | { node = DocumentNode cs } as n -> { state with elements = ElementMap.add n.id { n with node = DocumentNode (id::cs) } state.elements } | _ -> failwith "error - node 0 is not Document" let insertElementAnd token state action = match token with | StartTagToken tag -> let (id, state) = createElementNode tag.start_tag_name tag.start_tag_attributes state in let state = pushToOpenStack id (appendToCurrent id state) in action id state | _ -> failwith "error - token is incorrect type" let mergeAttributesInto id token state = let rec merge attrs = function | (ak,av)::ats -> merge (if List.exists (function (k,v) -> k = ak) attrs then attrs else (ak,av)::attrs) ats | [] -> attrs in match token with | StartTagToken tag -> (match ElementMap.find id state.elements with | { node = ElementNode ({ attributes = attrs } as e) } as n -> { state with elements = ElementMap.add n.id { n with node = ElementNode { e with attributes = merge attrs tag.start_tag_attributes } } state.elements } | _ -> failwith "error - non-element node on stack of open elements" ) | _ -> failwith "error - token is incorrect type" let mergeAttributesIntoBody token state = mergeAttributesInto (List.hd (List.tl (List.rev state.openStack))) token state let mergeAttributesIntoHTML token state = mergeAttributesInto (List.hd (List.rev state.openStack)) token state let hasElementInScope table ns els = let rec f = function | e::es -> let n = elementName els e in if List.mem n ns then true else if n = "table" then false else if (not table) && (List.mem n ["caption"; "td"; "th"; "button"; "marquee"; "object"]) then false else if n = "html" then false else f es | [] -> failwith "error - reached end of stack of open elements in hasElementInScope" in f let hasElementIdInScope id els = (* TODO: refactor *) let rec f = function | e::es -> let n = elementName els e in if e = id then true else if n = "table" then false else if List.mem n ["caption"; "td"; "th"; "button"; "marquee"; "object"] then false (* TODO: not for "in table scope" *) else if n = "html" then false else f es | [] -> failwith "error - reached end of stack of open elements in hasElementIdInScope" in f let ignoreToken state = { state with ignored = true } (* XXX need some way to reset this *) let removeElementFromParent id state = { state with elements = ElementMap.map (function | { node = ElementNode ({ children = cs } as e) } as n when List.mem id cs -> { n with node = ElementNode { e with children = List.filter (fun x -> x <> id) cs } } | n -> n ) state.elements } let rec adoptionAgency name state = let debug = false in (* 1. *) if debug then (print_string "entry: "; dumpState state); let rec f = function | [] -> addParseError "closed non-active formatting element" (ignoreToken state) | -1::_ -> addParseError "closed formatting element across marker" (ignoreToken state) | e::es when elementName state.elements e <> name -> f es | e::_ when List.mem e state.openStack && not (hasElementIdInScope e state.elements state.openStack) -> addParseError "closed formatting element is not in scope" (ignoreToken state) | e::_ when not (List.mem e state.openStack) -> addParseError "closed formatting element is not open" (ignoreToken { state with activeList = List.filter (fun e' -> e <> e') state.activeList }) (* TODO: be sure e is not repeated in activeList *) | fmt::_ -> if debug then (print_string "got fmt = "; print_int fmt; print_string ": "; dumpState state); let state = (if fmt != currentNodeId state then addParseError "closed formatting element is not current element" state else state) in (* 2. *) let rec f a = function (* find the topmost node in openStack lower than fmt and not phrasing/formatting *) | e::es when e = fmt -> a | [] -> a | e::es when not (let name = elementName state.elements e in isCategoryPhrasing name || isCategoryFormatting name) -> f (Some e) es | e::es -> f a es in match f None state.openStack with (* 3. *) | None -> let state = { state with openStack = (let rec f = function x::xs when x = fmt -> xs | x::xs -> f xs | [] -> failwith "can't find fmt in openStack" in f state.openStack); activeList = List.filter (fun n -> n <> fmt) state.activeList } in if debug then (print_string "no furthest: "; dumpState state); state | Some furthest -> (* 4. *) let ancestor = (* element immediately above fmt in openStack *) (let rec f = function | f::a::_ when f = fmt -> a | e::es -> f es | _ -> failwith "error - couldn't find fmt in open stack" in f state.openStack) in (* 5. *) let state = removeElementFromParent furthest state in if debug then (print_string "deparented furthest: "; dumpState state); (* 6-8. *) (* Reverse-engineered (and probably incorrect or buggy): * Equivalent to: * Find the range of open-stack between furthest-block and formatting-element (exclusive) * Remove all items from that range that are not in active-list. * Set bookmark to be after the first element in that open-stack range (plus formatting-element) that is also in active-list. * Fold all the elements in that range into a tree, with some cloning, then fold that into common-ancestor. *) let rec f = function | e::es when e = furthest -> let (range, after) = g es in ([e], range, after) | e::es -> let (before, range, after) = f es in (e::before, range, after) | [] -> failwith "didn't find furthest in open stack" and g = function | e::es when e = fmt -> ([], e::es) | e::es -> let (range, after) = g es in (e::range, after) | [] -> failwith "didn't find fmt in open stack" in let (before, range, after) = f state.openStack in if debug then (print_string "ranges: ["; print_string (String.concat "; " (List.map string_of_int before)); print_string "] ["; print_string (String.concat "; " (List.map string_of_int range)); print_string "] ["; print_string (String.concat "; " (List.map string_of_int after)); print_string "]\n"); let range = List.filter (fun e -> List.mem e state.activeList) range in let (lastNode, (state, range')) = List.fold_left (fun (lastNode, (state, range)) node -> let (state, range, node) = (let elNode = elementNode state.elements node in if List.length elNode.children <> 0 then let (clone, state) = createElementNode elNode.name elNode.attributes state in let f n = if n = node then clone else n in ({ state with activeList = List.map f state.activeList }, clone::range, clone) else (state, node::range, node) ) in (node, (appendToElementNode node lastNode (removeElementFromParent lastNode state), range)) ) (furthest, (state, [])) range in let range = List.rev range' in if debug then (print_string "folded: "; dumpState state); (* XXX: table fostering *) let state = appendToElementNode ancestor lastNode (removeElementFromParent lastNode state) in if debug then (print_string "appended to ancestor: "; dumpState state); let (clone, state) = (let elNode = elementNode state.elements fmt in createElementNode elNode.name elNode.attributes state) in let state = setElementChildren clone (elementNode state.elements furthest).children state in (* (clone can't be current-node, so fostering won't occur) *) let state = setElementChildren furthest [] state in let state = appendToElementNode furthest clone state in if debug then (print_string "moved to furthest: "; dumpState state); let bookmark = if range = [] then fmt else List.hd range in let state = { state with activeList = (let rec f = function (* add clone after bookmark, and delete fmt *) | n::ns when n = bookmark -> if n = fmt then clone::(f ns) else n::clone::(f ns) | n::ns when n = fmt -> f ns | n::ns -> n::(f ns) | [] -> [] in f state.activeList); openStack = (List.rev (List.tl (List.rev before))) @ [clone] @ [List.hd (List.rev before)] @ range @ (List.tl after) (* XXX very ugly *) } in if debug then (print_string "stuff: "; dumpState state); adoptionAgency name state in f state.activeList let reconstructActiveFormatting state = match state.activeList with | [] -> state | -1::_ -> state | e::es -> if List.mem e state.openStack then state else let rec f = function (* scan backwards to find the sublist up to the end-of-list or marker *) | [] -> failwith "error - reached end of open stack" | e0::[] -> ([e0], []) | e0::-1::es -> ([e0], es) | e0::e1::es -> if List.mem e1 state.openStack then ([e0], e1::es) else let (e, es') = f (e1::es) in (e0::e, es') in let (opened, rest) = f (e::es) in let (es, state) = List.fold_left (fun (es, state) e -> let n = elementNode state.elements e in let (clone, state) = createElementNode n.name n.attributes state in let state = pushToOpenStack clone (appendToCurrent clone state) in (clone::es, state) ) (rest, state) (List.rev opened) in { state with activeList = es } let rec generateImpliedEndTags except state = let name = (currentNodeElement state).name in if List.mem name ["dd"; "dt"; "li"; "p"; "tbody"; "td"; "tfoot"; "th"; "thead"; "tr"] && not (List.mem name except) then let state' = apply (EndTagToken { end_tag_name = (currentNodeElement state).name }) state.insertionMode state in generateImpliedEndTags except state' else state and apply token mode state = if state.ignoringNextCharacterIfNewline then let state = { state with ignoringNextCharacterIfNewline = false } in match token with | CharacterToken '\n' -> state | _ -> apply token state.insertionMode state else if state.genericData != -1 then match token with | CharacterToken c -> appendToElementTextNotFostered state.genericData (String.make 1 c) state | EndTagToken { end_tag_name = n } when n = (elementNode state.elements state.genericData).name -> { state with genericData = -1 } | _ -> apply token state.insertionMode (addParseError "generic CDATA/RCDATA stopped with incorrect token" { state with genericData = -1 }) else if state.appendingAllCharacters != -1 then (match token with | CharacterToken c -> appendCharacterToTextNode state.appendingAllCharacters (String.make 1 c) state | _ -> apply token state.insertionMode { state with appendingAllCharacters = -1 }) else if state.appendingWSCharacters != -1 then (match token with | CharacterToken c when List.mem (int_of_char c) whitespaceChars -> appendCharacterToTextNode state.appendingWSCharacters (String.make 1 c) state | _ -> apply token state.insertionMode { state with appendingWSCharacters = -1 }) else let value_to_string = function | Const s -> s | TagName -> (match token with | StartTagToken tag -> tag.start_tag_name | EndTagToken tag -> tag.end_tag_name | _ -> failwith "error - token is incorrect type in value_to_string" ) in let token_matches = function | (Doctype, DoctypeToken _) -> true | (EndOfFile, EndOfFileToken) -> true | (Character, CharacterToken _) -> true | (CharacterIn cs, CharacterToken c) -> List.mem (Char.code c) cs | (CharacterNotIn cs, CharacterToken c) -> not (List.mem (Char.code c) cs) | (Comment, CommentToken _) -> true | (StartTag names, StartTagToken { start_tag_name = name } ) -> List.mem name names | (EndTag names, EndTagToken { end_tag_name = name } ) -> List.mem name names | (AnyStartTag, StartTagToken _) -> true | (AnyEndTag, EndTagToken _) -> true | (Anything, _) -> true | _ -> false in let condition_matches state = function | CurrentNodeDoesNotHaveName n -> (currentNodeElement state).name <> value_to_string n | CurrentNodeHasName n -> (currentNodeElement state).name = value_to_string n | FormElementPointerIsNotNull -> state.formElement != -1 | GeneratedTokenWasNotIgnored -> true (* XXX *) | InsertionModeIs ms -> List.mem state.insertionMode ms | ListOfActiveContainsA -> let rec f = function | -1::es -> false | e::es -> if elementName state.elements e = "a" then true else f es | [] -> false in f state.activeList | MoreThanOneNodeAndSecondNotBody -> List.length state.openStack > 1 && elementName state.elements (List.nth (List.rev state.openStack) 1) <> "body" | MoreThanTwoNodesOrSecondNotBody -> List.length state.openStack > 2 || (List.length state.openStack = 2 && elementName state.elements (List.hd state.openStack) <> "body") | NodeBeforeCurrentHasName n -> (match state.openStack with | _::p::_ -> (elementNode state.elements p).name = value_to_string n | _ -> failwith "error - open stack does not have enough stuff in NodeBeforeCurrentHasName") | NodeInStackIsNot names -> List.exists (fun id -> not (List.mem (elementName state.elements id) names)) state.openStack | NotFirstStartTagToken -> state.startTagsSeen > 1 | NotSecondElementIsBody -> false (* XXX *) | ParsingFragment -> false (* TODO: implement this some time *) | StackHasElementInScope ns -> hasElementInScope false (List.map value_to_string ns) state.elements state.openStack | StackHasElementInTableScope ns -> hasElementInScope true (List.map value_to_string ns) state.elements state.openStack | StackNotHasElementInScope ns -> not (hasElementInScope false (List.map value_to_string ns) state.elements state.openStack) | StackNotHasElementInTableScope ns -> not (hasElementInScope true (List.map value_to_string ns) state.elements state.openStack) (* TODO: refactor *) in let rec execute_cmd state = function | ActAsIfEndTag name -> apply (EndTagToken { end_tag_name = name }) state.insertionMode state | ActAsIfStartTag name -> apply (StartTagToken { start_tag_name = name; start_tag_attributes = [] }) state.insertionMode state | ActAsIfStartTagName name -> (match token with StartTagToken { start_tag_attributes = attrs } -> apply (StartTagToken { start_tag_name = name; start_tag_attributes = attrs }) state.insertionMode state | _ -> failwith "error - token is incorrect type in ActAsIfStartTagName") | AddMarkerToActiveList -> { state with activeList = -1::state.activeList } | AdoptionAgency -> (match token with EndTagToken { end_tag_name = name } -> adoptionAgency name state | _ -> failwith "error - token is incorrect type in AdoptionAgency") | AppendCommentTokenToCurrentNode -> (match token with CommentToken text -> let (id, state) = createCommentNode text state in appendToCurrent id state | _ -> failwith "error - token is incorrect type in AppendCommentTokenToCurrentNode") | AppendCommentTokenToDocument -> (match token with CommentToken text -> let (id, state) = createCommentNode text state in appendToDocument id state | _ -> failwith "error - token is incorrect type in AppendCommentTokenToDocument") | AppendCommentTokenToHTMLElement -> (match token with CommentToken text -> let (id, state) = createCommentNode text state in appendToElementNode (List.hd (List.rev state.openStack)) id state | _ -> failwith "error - token is incorrect type in AppendCommentTokenToHTMLElement") | ApplyEndTag -> (match token with EndTagToken tag -> let rec f = function | n::ns -> let node = ElementMap.find n state.elements in (match node with | { id = id; node = ElementNode { name = name } } -> if name = tag.end_tag_name then let state = generateImpliedEndTags [] state in let state = (if tag.end_tag_name <> (currentNodeElement state).name then addParseError "mismatched end tag name" state else state) in let rec rest = function n::ns -> if n = id then ns else rest ns | [] -> [] in { state with openStack = rest state.openStack } else if not (isCategoryFormatting name || isCategoryPhrasing name) then let state = addParseError "invalid end tag" state in state (* TODO: mark this token as ignored? *) else f ns | _ -> failwith "error - non-element node on stack of open elements") | [] -> failwith "error - reached end of stack of open elements in ApplyEndTag" in f state.openStack | _ -> failwith "error - token is incorrect type") | AssertFragmentCase -> failwith "NYI AssertFragmentCase" | AssociateCurrentNodeWithFormElementPointer -> state (* TODO *) | CharsetEncodingStuff -> state (* TODO *) | ClearActiveListUpToMarker -> { state with activeList = (let rec f = function -1::xs -> xs | x::xs -> f xs | [] -> failwith "no marker in activeList" in f state.activeList) } | ClearStackToContext ns -> let ok e = (List.mem (elementName state.elements e) ns) in let state = (if not (ok (List.hd state.openStack)) then addParseError "popped elements when clearing stack to table context" state else state) in { state with openStack = let rec f = function e::es when ok e -> e::es | e::es -> f es | [] -> failwith "ran out of stack when clearing to table context" in f state.openStack } | DoDoctypeStuff -> (match token with DoctypeToken doctype -> let f = function Some s -> s | None -> "" in let state = (if String.lowercase doctype.doctypename <> "html" || doctype.pubid <> None || doctype.sysid <> None then addParseError "invalid doctype" state else state) in let (id, state) = createDoctypeNode doctype.doctypename (f doctype.pubid) (f doctype.sysid) state in appendToDocument id state | _ -> failwith "error - token is incorrect type in DoDoctypeStuff") | FixupForListElement names -> (match token with StartTagToken tag -> let rec f = function | (count, n::ns) -> let node = ElementMap.find n state.elements in (match node with | { id = id; node = ElementNode { name = name } } -> if List.mem name names then let state = { state with openStack = ns } in if count > 1 then addParseError "popped too much when adding list element" state else state else if not (isCategoryFormatting name || isCategoryPhrasing name || name = "address" || name = "div") then state else f (count+1, ns) | _ -> failwith "error - non-element node on stack of open elements") | (_, []) -> failwith "error - reached end of stack of open elements in ApplyEndTag" in f (1, state.openStack) | _ -> failwith "error - token is incorrect type") | GenerateImpliedEndTags except -> generateImpliedEndTags (List.map value_to_string except) state | If (cond, cmds1, cmds2) -> execute_cmds state (if condition_matches state cond then cmds1 else cmds2) | IgnoreNextTokenIfLinefeed -> { state with ignoringNextCharacterIfNewline = true } | IgnoreToken -> state (* XXX *) | InsertElement -> insertElementAnd token state (fun id state -> state) | InsertElementAndAddToListOfActive -> insertElementAnd token state addToActiveList | InsertElementAndSetFormElementPointer -> insertElementAnd token state setFormElementPointer | InsertElementAndSetHeadElementPointer -> insertElementAnd token state setHeadElementPointer | InsertHTMLElement -> let (id, state) = (createElementNode "html" [] state) in pushToOpenStack id (appendToDocument id state) | MergeAttributesIntoBodyElement -> mergeAttributesIntoBody token state | MergeAttributesIntoHTMLElement -> mergeAttributesIntoHTML token state | ParseError msg -> addParseError ("parse error: " ^ msg ^ " (token = " ^ (string_of_token token) ^ ")") state | PopCurrentNodeFromStack -> popOpenStack state | PopElementsFromStackUntilNo ns -> let rec f state = if hasElementInScope false ns state.elements state.openStack then f (popOpenStack state) else state in f state | PopElementsFromStackUntilOneOf ns -> { state with openStack = (let rec f = function x::xs when List.mem (elementName state.elements x) (List.map value_to_string ns) -> xs | x::xs -> f xs | [] -> failwith "popped until no stack left" in f state.openStack) } | PushHeadElementPointerOntoStack -> { state with openStack = state.headElement :: state.openStack } | ReconstructActiveFormattingElements -> reconstructActiveFormatting state | RemoveThatAElementIfNecessary -> (* TODO: it's not clear that this is valid *) let rec f = function | -1::es -> state | e::es -> if elementName state.elements e = "a" then { state with activeList = List.filter (fun id -> id != e) state.activeList; openStack = List.filter (fun id -> id != e) state.openStack } else f es | [] -> state in f state.activeList | ReprocessAsIf newMode -> apply token newMode state | ReprocessCurrentToken -> apply token state.insertionMode state | ReprocessWithFosteringAsIf newMode -> let state = { state with fostering = state.fostering+1 } in (* TODO: show this is valid *) let state = apply token newMode state in { state with fostering = state.fostering-1 } | ResetInsertionModeAppropriately -> resetInsertionModeAppropriately state | SetCompatMode compatMode -> { state with compatMode = compatMode } | SetContentModelFlag _ -> state (* XXX *) | SetFormElementPointerToNull -> setFormElementPointer (-1) state | SetInsertionMode newMode -> setInsertionMode newMode state | StopParsing -> state (* TODO: does this need something more interesting? *) | TODO err -> failwith ("NYI TODO " ^ err) | GenericCDATA -> (match token with | StartTagToken tag -> let (id, state) = createElementNode tag.start_tag_name tag.start_tag_attributes state in let state = appendToCurrent id state in (*XXX: setContentModelFlag CDATA*) { state with genericData = id } | _ -> failwith "error - token is incorrect type in GenericCDATA") | GenericRCDATA -> (match token with | StartTagToken tag -> let (id, state) = createElementNode tag.start_tag_name tag.start_tag_attributes state in let state = appendToCurrent id state in (*XXX: setContentModelFlag RCDATA*) { state with genericData = id } | _ -> failwith "error - token is incorrect type in GenericRCDATA") (* Removed by transforms *) | ActAsIfAnythingElse -> failwith "ActAsIfAnythingElse cannot be executed" | AppendCharacterTokensToCurrentNode -> failwith "AppendCharacterTokensToCurrentNode cannot be executed" (* Generated by transforms *) | AppendCharacterTokenToCurrentNodeAndSetAppendingAllCharacters -> (match token with CharacterToken text -> let (id, state) = appendCharacterToCurrentNode (String.make 1 text) state in { state with appendingAllCharacters = id } | _ -> failwith "error - token is incorrect type in AppendCharacterTokenToCurrentNodeAndSetAppendingAllCharacters") | AppendCharacterTokenToCurrentNodeAndSetAppendingWSCharacters -> (match token with CharacterToken text -> let (id, state) = appendCharacterToCurrentNode (String.make 1 text) state in { state with appendingWSCharacters = id } | _ -> failwith "error - token is incorrect type in AppendCharacterTokenToCurrentNodeAndSetAppendingWSCharacters") (* (These have to care about the element id, and not use current, because the first char might have been fostered and the foster flag reset before this stage) *) and execute_cmds state = function | s::ss -> (if traceEnabled then (print_string (" " ^ string_of_command s); print_newline ())); let state = execute_cmd state s in (if traceEnabled then (print_string "state: "; dumpState state)); execute_cmds state ss | [] -> state in let rec f = function | (cond, cmds)::ss -> if token_matches (cond, token) then ( if traceEnabled then (print_string (string_of_token_match cond); print_newline ()); execute_cmds state cmds ) else f ss | [] -> failwith "error - token not handled by state machine" in if traceEnabled then (print_string ("# " ^ string_of_insertion_mode mode ^ " - " ^ string_of_token token); print_newline (); print_string "^ "; dumpState state); f (transformAlgorithm mode) (* Need to count start tag tokens, but not count ones via 'reprocess' commands *) let applyRealToken token state = let state = (match token with StartTagToken _ -> { state with startTagsSeen = state.startTagsSeen + 1 } | _ -> state) in apply token state.insertionMode state let interpret ts = let rec step state = function | [] -> apply EndOfFileToken state.insertionMode state | ParseErrorToken::ts -> step (addParseError "from tokeniser" state) ts | t::ts -> step (applyRealToken t state) ts in step initialState ts