open Common open Tokeniser_spec open Tokeniser_states exception BrokenAlgorithm (* should never be raised *) exception RanOutOfCharacterStream (* not used in normal operation; for abnormal operation, add a -1 to the character stream and use tokenisePartial to catch the tokeniser state before it would consume an EOF *) type wchar = int (* 0 for EOF *) let eof = 0 (* Strings are stored as 'wchar list', in reverse (because that's easier) *) type attribute = wchar list * wchar list (* name, value *) type doctype = { doctypename: wchar list; pubid: wchar list option; sysid: wchar list option; correct: bool } type tag_type = StartTag | EndTag type tag = { tagtype: tag_type; name: wchar list; selfclosing: bool; attributes: attribute list; droppingattribute: bool } type token = DoctypeToken of doctype | TagToken of tag | CommentToken of wchar list | CharacterToken of wchar | EndOfFileToken | ParseErrorToken type tokeniser_state = { machineState: machine_state; contentModel: content_model; escapeFlag: bool; characterStream: wchar list; c0: wchar option; (* c0 = most recently consumed character, c1..c3 = previous ones *) c1: wchar option; c2: wchar option; c3: wchar option; currentToken: token option; tokenStream: token list; (* latest tokens on front of list *) (* TODO: this probably shouldn't be part of the state *) } let initialState = { machineState = DataState; contentModel = PCDATA; escapeFlag = false; characterStream = []; c0 = None; c1 = None; c2 = None; c3 = None; currentToken = None; tokenStream = []; } type tokeniser_hook = { matchedStep: (tokeniser_matcher * tokeniser_action list) -> unit } let nullHook = { matchedStep = (fun _ -> ()) } let explode str = let r = ref [] in for i = 0 to (String.length str)-1 do r := int_of_char str.[i] :: !r done; List.rev !r let isInvalidUnicode c = if c > 0x10FFFF then true else let rec f = function | (a, b)::_ when a <= c && c <= b -> true | _::rs -> f rs | [] -> false in f Entities.charRefInvalidRanges let consumeCharRef allowedChar charStream inAttribute = (* returns (newCharStream:wchar list, valueToAppend:wchar option, errors:token list) *) let f = function (0x0009::_) | (0x000A::_) | (0x000C::_) | (0x0020::_) | (0x003C::_) | (0x0026::_) | [] -> (* Nothing matched, no error *) (charStream, None, []) | (c::_) when (match allowedChar with Some a when a = c -> true | _ -> false) -> (* Nothing matched, no error *) (charStream, None, []) | (0x0023::cs) -> ( let rec consumeRestOfCharRef hex n = function (c::cs') -> ( let n' = (match n with | None -> 0 | Some n when n >= 0x110000 -> 0x110000 (* prevent integer overflow *) | Some n -> n ) in let base = (if hex then 16 else 10) in if c >= 0x0030 && c <= 0x0039 then consumeRestOfCharRef hex (Some (n'*base + c-0x0030)) cs' else if hex && (c >= 0x0061 && c <= 0x0066) then consumeRestOfCharRef hex (Some (n'*base + c-0x0061+10)) cs' else if hex && (c >= 0x0041 && c <= 0x0046) then consumeRestOfCharRef hex (Some (n'*base + c-0x0041+10)) cs' else (n, c::cs') ) | [] -> (n, []) in let (n, cs') = ( match cs with | 0x0078::cs'' | 0x0058::cs'' -> consumeRestOfCharRef true None cs'' (* hex *) | _ -> consumeRestOfCharRef false None cs (* decimal *) ) in match n with None -> (charStream, None, [ParseErrorToken]) (* no characters in range were matched *) | Some n -> let (n, errs) = if List.mem_assoc n Entities.charRefMappingTable then (List.assoc n Entities.charRefMappingTable, [ParseErrorToken]) else (n, []) in let (c, errs) = if isInvalidUnicode n then (0xFFFD, ParseErrorToken::errs) else (n, errs) in match cs' with | 0x003B::cs'' -> (cs'', Some c, errs) | _ -> (cs', Some c, ParseErrorToken::errs) ) | _ -> let sortedCharRefs = List.sort (fun (a, _) (b, _) -> compare (String.length b) (String.length a)) Entities.charRefTable in let rec startsWith = function | (0x003B::cs, [0x003B]) -> (true, true, cs) | (c::cs, n::ns) when c = n -> startsWith (cs, ns) | (c::cs, n::ns) -> (false, false, cs) | (cs, []) -> (false, true, cs) | ([], n::ns) -> (false, false, []) in let rec f = function | (name, value)::rs -> let (semicolon, ok, cs) = startsWith (charStream, explode name) in if ok then (semicolon, cs, Some value) else f rs | [] -> (false, [], None) in match f sortedCharRefs with | (semicolon, cs, Some value) -> if not semicolon && inAttribute && (match cs with c::cs' -> (0x0030 <= c && c <= 0x0039) || (0x0041 <= c && c <= 0x005A) || (0x0061 <= c && c <= 0x007A)) then (charStream, None, [ParseErrorToken]) else (cs, Some value, if semicolon then [] else [ParseErrorToken]) | (_, _, None) -> (charStream, None, [ParseErrorToken]) in match f charStream with (cs, None, errs) -> (cs, Some 0x0026, errs) | x -> x let matchString state str = let rec f = function (x::xs, y::ys) -> if x = y then f (xs, ys) else false | (_, []) -> true | ([], _) -> false in match state.c0 with None -> raise BrokenAlgorithm | Some c0 -> f (c0::state.characterStream, explode str) let matchStringCaseInsensitive state str = let rec f = function (x::xs, y::ys) -> let x' = (if x >= 0x0041 && x <= 0x005A then x + 0x0020 else x) in if x' = y then f (xs, ys) else false | (_, []) -> true | ([], _) -> false in match state.c0 with None -> raise BrokenAlgorithm | Some c0 -> f (c0 :: state.characterStream, explode str) let doesMatch state = let rec matches = function AND(a, b) -> matches a && matches b | OR(a, b) -> matches a || matches b | NOT a -> not (matches a) | IsContentModel cm -> state.contentModel = cm | IsConsumedCharacter c -> (match state.c0 with None -> raise BrokenAlgorithm | Some c0 -> c0 = c) | IsConsumedCharacterEOF -> (match state.c0 with None -> raise BrokenAlgorithm | Some c0 -> c0 = eof) | IsConsumedCharacterInRange (cl, cu) -> (match state.c0 with None -> raise BrokenAlgorithm | Some c0 -> c0 >= cl && c0 <= cu) | IsChar1 c -> (match state.c1 with Some c1 -> c = c1 | _ -> false) | IsChar2 c -> (match state.c2 with Some c2 -> c = c2 | _ -> false) | IsChar3 c -> (match state.c3 with Some c3 -> c = c3 | _ -> false) | IsEndOfCData -> false (* XXX *) | IsStartOfComment -> matchString state "--" | IsDoctype -> matchStringCaseInsensitive state "doctype" | IsPublic -> matchStringCaseInsensitive state "public" | IsSystem -> matchStringCaseInsensitive state "system" | IsEscapeFlag -> state.escapeFlag | NotYetHandled -> true in matches let rec applyAction state = function SwitchMachineState ms -> { state with machineState = ms } | SetEscapeFlag ef -> { state with escapeFlag = ef } | UnconsumeCharacter -> ( match state.c0 with Some c -> { state with characterStream = c::state.characterStream; c0 = state.c1; c1 = state.c2; c2 = None } | None -> raise BrokenAlgorithm ) | EmitConsumedCharacter -> ( match state.c0 with Some c -> { state with tokenStream = (CharacterToken c)::state.tokenStream } | None -> raise BrokenAlgorithm ) | EmitCharacter c -> { state with tokenStream = (CharacterToken c)::state.tokenStream } | EmitEOFToken -> { state with tokenStream = EndOfFileToken::state.tokenStream } | ConsumeCharacter -> let (c, cs) = (match state.characterStream with -1::_ -> raise RanOutOfCharacterStream | c'::cs' -> (c', cs') | [] -> (eof, []) ) in { state with characterStream = cs; c0 = (Some c); c1 = state.c0; c2 = state.c1; c3 = state.c2 } | ConsumeAndEmitCharRef c -> ( let (cs, ent, errs) = consumeCharRef c state.characterStream false in let ts = errs @ state.tokenStream in let ts = (match ent with Some c -> CharacterToken c :: ts | None -> ts) in { state with characterStream = cs; tokenStream = ts } ) | ConsumeAndAppendCharRefToAttributeValue c -> ( let (cs, ent, errs) = consumeCharRef c state.characterStream true in let ts = errs @ state.tokenStream in let state' = { state with characterStream = cs; tokenStream = ts } in match ent, state.currentToken with | None, _ -> state' | _, Some (TagToken { attributes = (an, av)::attrs; droppingattribute = true }) -> state' | Some c, Some (TagToken ({ attributes = (an, av)::attrs } as t)) -> { state' with currentToken = Some (TagToken { t with attributes = (an, c::av)::attrs }) } | _ -> raise BrokenAlgorithm ) | CreateStartTagToken -> { state with currentToken = Some (TagToken { tagtype = StartTag; name = []; selfclosing = false; attributes = []; droppingattribute = false }) } | CreateEndTagToken -> { state with currentToken = Some (TagToken { tagtype = EndTag; name = []; selfclosing = false; attributes = []; droppingattribute = false }) } | CreateTagTokenAttribute -> ( match state.currentToken with Some (TagToken t) -> { state with currentToken = Some (TagToken { t with attributes = ([], [])::t.attributes; droppingattribute = false }) } | _ -> raise BrokenAlgorithm ) | CreateCommentToken -> { state with currentToken = Some (CommentToken []) } | CreateDoctypeToken -> { state with currentToken = Some (DoctypeToken { doctypename = []; pubid = None; sysid = None; correct = true }) } | EmitCurrentTagToken -> ( match state.currentToken with Some (TagToken _ as t) -> { state with tokenStream = t::state.tokenStream; currentToken = None } (* XXX *) | _ -> raise BrokenAlgorithm ) | EmitCurrentCommentToken -> ( match state.currentToken with Some (CommentToken _ as t) -> { state with tokenStream = t::state.tokenStream; currentToken = None } | _ -> raise BrokenAlgorithm ) | EmitCurrentDoctypeToken -> ( match state.currentToken with Some (DoctypeToken _ as t) -> { state with tokenStream = t::state.tokenStream; currentToken = None } | _ -> raise BrokenAlgorithm ) | HandleDuplicateAttributes -> ( match state.currentToken with Some (TagToken ({ attributes = (an, [])::attrs; droppingattribute = false } as t)) -> if List.exists (fun (n,v) -> n = an) attrs then applyAction { state with currentToken = Some (TagToken { t with attributes = attrs; droppingattribute = true }) } ParseError else state | _ -> raise BrokenAlgorithm ) | AppendToTagTokenName -> ( match (state.c0, state.currentToken) with Some c, Some (TagToken t) -> { state with currentToken = Some (TagToken { t with name = c::t.name }) } (* XXX check d *) | _ -> raise BrokenAlgorithm ) | AppendToTagTokenNameLowercase -> ( match (state.c0, state.currentToken) with Some c, Some (TagToken t) -> { state with currentToken = Some (TagToken { t with name = (c+0x0020)::t.name }) } | _ -> raise BrokenAlgorithm ) | AppendToTagTokenAttributeName -> ( match (state.c0, state.currentToken) with Some c, Some (TagToken ({ attributes = (an, av)::attrs } as t)) -> { state with currentToken = Some (TagToken { t with attributes = (c::an, av)::attrs }) } | _ -> raise BrokenAlgorithm ) | AppendToTagTokenAttributeNameLowercase -> ( match (state.c0, state.currentToken) with Some c, Some (TagToken ({ attributes = (an, av)::attrs } as t)) -> { state with currentToken = Some (TagToken { t with attributes = ((c+0x0020)::an, av)::attrs }) } | _ -> raise BrokenAlgorithm ) | AppendToTagTokenAttributeValue -> ( match (state.c0, state.currentToken) with Some c, Some (TagToken { attributes = (an, av)::attrs; droppingattribute = true }) -> state | Some c, Some (TagToken ({ attributes = (an, av)::attrs } as t)) -> { state with currentToken = Some (TagToken { t with attributes = (an, c::av)::attrs }) } | _ -> raise BrokenAlgorithm ) | AppendAmpersandToTagTokenAttributeValue -> ( match state.currentToken with Some (TagToken { attributes = (an, av)::attrs; droppingattribute = true }) -> state | Some (TagToken ({ attributes = (an, av)::attrs } as t)) -> { state with currentToken = Some (TagToken { t with attributes = (an, 0x0026::av)::attrs }) } | _ -> raise BrokenAlgorithm ) | SetTagTokenSelfClosingFlag -> ( match state.currentToken with Some (TagToken t) -> { state with currentToken = Some (TagToken { t with selfclosing = true }) } | _ -> raise BrokenAlgorithm ) | AppendToCommentToken -> ( match (state.c0, state.currentToken) with Some c, Some (CommentToken cs) -> { state with currentToken = Some (CommentToken (c::cs)) } | _ -> raise BrokenAlgorithm ) | AppendHyphenToCommentToken -> ( match state.currentToken with Some (CommentToken cs) -> { state with currentToken = Some (CommentToken (0x002D::cs)) } | _ -> raise BrokenAlgorithm ) | AppendToDoctypeTokenName -> ( match (state.c0, state.currentToken) with Some c, Some (DoctypeToken t) -> { state with currentToken = Some (DoctypeToken { t with doctypename = c::t.doctypename }) } | _ -> raise BrokenAlgorithm ) | AppendToDoctypeTokenNameLowercase -> ( match (state.c0, state.currentToken) with Some c, Some (DoctypeToken t) -> { state with currentToken = Some (DoctypeToken { t with doctypename = (c+0x0020)::t.doctypename }) } | _ -> raise BrokenAlgorithm ) | AppendToDoctypeTokenPubId -> ( match (state.c0, state.currentToken) with Some c, Some (DoctypeToken t) -> ( match t.pubid with Some pubid -> { state with currentToken = Some (DoctypeToken { t with pubid = Some (c::pubid) }) } | None -> raise BrokenAlgorithm ) | _ -> raise BrokenAlgorithm ) | AppendToDoctypeTokenSysId -> ( match (state.c0, state.currentToken) with Some c, Some (DoctypeToken t) -> ( match t.sysid with Some sysid -> { state with currentToken = Some (DoctypeToken { t with sysid = Some (c::sysid) }) } | None -> raise BrokenAlgorithm ) | _ -> raise BrokenAlgorithm ) | SetDoctypeTokenIncorrect -> ( match state.currentToken with Some (DoctypeToken t) -> { state with currentToken = Some (DoctypeToken { t with correct = false }) } | _ -> raise BrokenAlgorithm ) | SetDoctypeTokenPubIdEmpty -> ( match state.currentToken with Some (DoctypeToken t) -> { state with currentToken = Some (DoctypeToken { t with pubid = Some [] }) } | _ -> raise BrokenAlgorithm ) | SetDoctypeTokenSysIdEmpty -> ( match state.currentToken with Some (DoctypeToken t) -> { state with currentToken = Some (DoctypeToken { t with sysid = Some [] }) } | _ -> raise BrokenAlgorithm ) | ParseError -> { state with tokenStream = ParseErrorToken::state.tokenStream } | ParseErrorIfEndTagWithAttributes -> ( match state.currentToken with Some (TagToken { tagtype = EndTag; attributes = attr::attrs }) -> { state with tokenStream = ParseErrorToken::state.tokenStream } | _ -> state ) | ParseErrorIfEndTagWithSelfClosing -> ( match state.currentToken with Some (TagToken { tagtype = EndTag; selfclosing = true }) -> { state with tokenStream = ParseErrorToken::state.tokenStream } | _ -> state ) let preprocessInputStream state = let rec f cs ts = function | 0x000D::0x000A::cs' -> f (0x000A :: cs) ts cs' | 0x000D::cs' -> f (0x000A :: cs) ts cs' | 0x0000::cs' -> f (0xFFFD :: cs) (ParseErrorToken :: ts) cs' | c::cs' when isInvalidUnicode c -> f (c::cs) (ParseErrorToken :: ts) cs' | c::cs' -> f (c::cs) ts cs' | [] -> (cs, ts) in let (cs, ts) = f [] state.tokenStream state.characterStream in { state with characterStream = List.rev cs; tokenStream = ts } let rec executeStep (state : tokeniser_state) (actions : tokeniser_action list) = List.fold_left applyAction state actions let advanceAlgorithm hook state = let rec nextStep state' = function (((matcher, actions) as step)::steps) -> if doesMatch state' matcher then (hook.matchedStep step; executeStep state' actions) else nextStep state' steps | [] -> raise BrokenAlgorithm in let rec findStateHandler = function (machineState, consume, steps)::hs when machineState = state.machineState -> nextStep (if consume then applyAction state ConsumeCharacter else state) steps | _::hs -> findStateHandler hs | [] -> raise BrokenAlgorithm in findStateHandler tokeniserAlgorithm let tokenise str = let rec f state = match state.tokenStream with EndOfFileToken::_ -> state | _ -> f (advanceAlgorithm nullHook state) in f (preprocessInputStream { initialState with characterStream = explode str }) let tokenise2 str = let rec f state = match state.tokenStream with EndOfFileToken::_ -> state | _ -> f (advanceAlgorithm nullHook state) in f (preprocessInputStream { initialState with characterStream = str }) let tokenisePartial hook cs = let f state = let s = ref state in (try ( while (match (!s).tokenStream with EndOfFileToken::_ -> false | _ -> true) do s := advanceAlgorithm hook !s done ) with RanOutOfCharacterStream -> ()); !s in f (preprocessInputStream { initialState with characterStream = cs }) let printJSONString str = let rec f acc = function 0x0022::cs -> f ("\\\"" :: acc) cs | 0x005C::cs -> f ("\\\\" :: acc) cs | c::cs when c >= 0x20 && c < 0x80 -> f (String.make 1 (char_of_int c) :: acc) cs | c::cs when c >= 0x10000 -> f (Printf.sprintf "\\u%04X\\u%04X" (0xD800 + ((c-0x10000) / 1024)) (0xDC00 + ((c-0x10000) mod 1024)) :: acc) cs | c::cs -> f (Printf.sprintf "\\u%04X" c :: acc) cs | [] -> acc in String.concat "" ("\"" :: f [] str) ^ "\"" let printTokenised tokenStream = let rec f acc = function CharacterToken c :: cs -> g acc [c] cs | TagToken { tagtype = StartTag; name = name; selfclosing = selfclosing; attributes = attrs } :: cs -> f (("[\"StartTag\", " ^ printJSONString name ^ ", {" ^ (String.concat ", " (List.map (fun (n,v) -> printJSONString n ^ ":" ^ printJSONString v) (List.rev attrs))) ^ "}" ^ (if selfclosing then ", true" else "") ^ "]")::acc) cs | TagToken { tagtype = EndTag; name = name; attributes = attrs } :: cs -> f (("[\"EndTag\", " ^ printJSONString name ^ "]")::acc) cs | CommentToken text :: cs -> f (("[\"Comment\", " ^ printJSONString text ^ "]")::acc) cs | DoctypeToken { doctypename=n; pubid=p; sysid=s; correct=c } :: cs -> f (("[\"DOCTYPE\", " ^ printJSONString n ^ ", " ^ (match p with Some t -> printJSONString t | None -> "null") ^ ", " ^ (match s with Some t -> printJSONString t | None -> "null") ^ ", " ^ (if c then "true" else "false") ^ "]")::acc) cs | ParseErrorToken::cs -> f ("\"ParseError\""::acc) cs | EndOfFileToken::cs -> f acc cs (* f ("[\"EndOfFile\"]"::acc) cs *) (* | EndOfFileToken::cs -> f ("[\"EndOfFile\"]"::acc) cs *) | [] -> acc and g acc cacc = function (* slurp all the character tokens into a single string *) (CharacterToken c)::cs -> g acc (c::cacc) cs | c::cs -> f (("[\"Character\", " ^ printJSONString (List.rev cacc) ^ "]")::acc) (c::cs) | [] -> ("[\"Character\", " ^ printJSONString (List.rev cacc) ^ "]")::acc in (String.concat ", " (f [] tokenStream))