open Common open Tokeniser_spec open Tokeniser_states (* Various instances of proofs can be handled as data-flow analysis (as in http://en.wikipedia.org/wiki/Data_flow_analysis etc). So, use a kind-of-generic framework to handle them. In-vals are stored as association lists keyed on machine-state - flow is always linear within a single step of the algorithm, so the values only needs to be stored for the entry points to each state. *) let dataflowAnalysis initInvals calc calcAct combine valid algorithm = let rec resultantMachineState prev = function (SwitchMachineState ms)::acts -> resultantMachineState ms acts | _::acts -> resultantMachineState prev acts | [] -> prev in let rec calcInvals invals = let nextInvals = List.sort compare (List.fold_left (fun invals' (ms, consume, steps) -> List.fold_left (fun invals'' (matcher, actions) -> let inval = calc (List.assq ms invals'') consume in let ms' = resultantMachineState ms actions in let rec f v = function act::acts -> f (calcAct v act) acts | [] -> v in (ms', combine (List.assq ms' invals'', f inval actions))::(List.remove_assq ms' invals'') ) invals' steps ) invals algorithm) in if nextInvals = invals then invals else calcInvals nextInvals in let verifyInvals invals = let errs = ref [] in List.iter (fun (ms, consume, steps) -> List.iter (fun (matcher, actions) -> let inval = calc (List.assq ms invals) consume in let rec f v = function act::acts -> errs := ((List.map (fun msg -> "In state " ^ string_of_machine_state ms ^ ": " ^ msg) (valid v act)) @ !errs); let outval = calcAct v act in f outval acts | [] -> () in f inval actions; () ) steps ) algorithm; !errs in let finalInvals = calcInvals initInvals in let errs = verifyInvals finalInvals in if List.length errs <> 0 then ( List.iter print_endline (List.rev errs); failwith "Verification failed" ); finalInvals (**** Current-token type correctness ****) (* Desired property: If the current token is modified, then it is of the type required by that modification. *) (* Define a type system that can represent various interesting properties we want to verify *) type token = TagToken | StartTagToken | EndTagToken | TagTokenWithAttributes | CommentToken | DoctypeToken | DoctypeTokenWithoutAnyId | DoctypeTokenWithOnlyPubId | DoctypeTokenWithoutSysId | DoctypeTokenWithPubId | DoctypeTokenWithSysId | NoToken let string_of_token_type = function TagToken -> "Tag" | StartTagToken -> "Start-tag" | EndTagToken -> "End-tag" | TagTokenWithAttributes -> "Tag-with-attributes" | CommentToken -> "Comment" | DoctypeToken -> "Doctype" | DoctypeTokenWithoutAnyId -> "Doctype-without-any-id" | DoctypeTokenWithOnlyPubId -> "Doctype-with-only-pub-id" | DoctypeTokenWithoutSysId -> "Doctype-without-sys-id" | DoctypeTokenWithPubId -> "Doctype-with-pub-id" | DoctypeTokenWithSysId -> "Doctype-with-sys-id" | NoToken -> "None" (* Define some type subsumption *) let isAcceptable = function (req, got) when req = got -> true | (TagToken, TagTokenWithAttributes) -> true | (TagToken, StartTagToken) -> true | (TagToken, EndTagToken) -> true | (DoctypeToken, DoctypeTokenWithoutAnyId) -> true | (DoctypeToken, DoctypeTokenWithoutSysId) -> true | (DoctypeToken, DoctypeTokenWithOnlyPubId) -> true | (DoctypeToken, DoctypeTokenWithPubId) -> true | (DoctypeToken, DoctypeTokenWithSysId) -> true | (DoctypeTokenWithPubId, DoctypeTokenWithOnlyPubId) -> true | (DoctypeTokenWithoutSysId, DoctypeTokenWithOnlyPubId) -> true | (DoctypeTokenWithoutSysId, DoctypeTokenWithoutAnyId) -> true | _ -> false (* Actions that can set the current token *) let resultantTokenType = function EmitConsumedCharacter -> Some NoToken | EmitCharacter c -> Some NoToken | EmitEOFToken -> Some NoToken | ConsumeAndEmitCharRef c -> Some NoToken | CreateStartTagToken -> Some StartTagToken | CreateEndTagToken -> Some EndTagToken | CreateTagTokenAttribute -> Some TagTokenWithAttributes | CreateCommentToken -> Some CommentToken | CreateDoctypeToken -> Some DoctypeTokenWithoutAnyId | EmitCurrentTagToken -> Some NoToken | EmitCurrentCommentToken -> Some NoToken | EmitCurrentDoctypeToken -> Some NoToken | SetDoctypeTokenPubIdEmpty -> Some DoctypeTokenWithOnlyPubId (* elsewhere we assert this never happens unless there was no sys ID already *) | SetDoctypeTokenSysIdEmpty -> Some DoctypeTokenWithSysId | _ -> None (* Actions that rely on the current token's type *) let requiredTokenType = function ConsumeAndAppendCharRefToAttributeValue c -> Some TagTokenWithAttributes | CreateTagTokenAttribute -> Some TagToken | EmitCurrentTagToken -> Some TagToken | EmitCurrentCommentToken -> Some CommentToken | EmitCurrentDoctypeToken -> Some DoctypeToken | AppendToTagTokenName -> Some TagToken | AppendToTagTokenNameLowercase -> Some TagToken | AppendToTagTokenAttributeName -> Some TagTokenWithAttributes | AppendToTagTokenAttributeNameLowercase -> Some TagTokenWithAttributes | AppendToTagTokenAttributeValue -> Some TagTokenWithAttributes | AppendAmpersandToTagTokenAttributeValue -> Some TagTokenWithAttributes | SetTagTokenSelfClosingFlag -> Some TagToken | AppendToCommentToken -> Some CommentToken | AppendHyphenToCommentToken -> Some CommentToken | AppendToDoctypeTokenName -> Some DoctypeToken | AppendToDoctypeTokenNameLowercase -> Some DoctypeToken | AppendToDoctypeTokenPubId -> Some DoctypeTokenWithPubId | AppendToDoctypeTokenSysId -> Some DoctypeTokenWithSysId | SetDoctypeTokenIncorrect -> Some DoctypeToken | SetDoctypeTokenPubIdEmpty -> Some DoctypeTokenWithoutAnyId (* require pub ID is never set after sys ID, and is only set once *) | SetDoctypeTokenSysIdEmpty -> Some DoctypeTokenWithoutSysId (* require sys ID is only set once *) | ParseErrorIfEndTagWithAttributes -> Some TagToken | ParseErrorIfEndTagWithSelfClosing -> Some TagToken (* Anything that changes the current token must only happen when there's not already a current one (else the old current token would get leaked) *) | a -> (match resultantTokenType a with Some t -> Some NoToken | None -> None) let currentTokenTypes = let calculateOutval inval consume = inval in let calculateOutvalAct inval act = (match resultantTokenType act with Some t -> [t] | None -> inval) in let combineVals = function a, b -> List.sort compare (unique (a @ b)) in let isValid inval act = (match requiredTokenType act with Some req -> List.map (fun got -> "May rely on current token being " ^ string_of_token_type req ^ " but get " ^ string_of_token_type got) (List.filter (fun got -> not (isAcceptable (req, got))) inval) | None -> [] ) in let initialInvals = (List.map (fun (ms, _, _) -> ms, if ms = DataState then [NoToken] else []) tokeniserAlgorithm) in dataflowAnalysis initialInvals calculateOutval calculateOutvalAct combineVals isValid tokeniserAlgorithm (**** Single-character reconsumption ****) (* Desired property: Any two UnconsumeCharacters have a ConsumeCharacter between them. (Useful because it means a 1-character buffer is sufficient for unconsuming.) *) type consumed_val = HasConsumed | NoConsumed let characterReconsumption = (* The outval from the beginning of a state-handler depends on whether it consumes a character immediately *) let calculateOutval inval = function true -> NoConsumed | false -> inval in (* The outval at an action depends whether it's a Consume/Unconsume *) let calculateOutvalAct inval = function UnconsumeCharacter -> HasConsumed | ConsumeCharacter -> NoConsumed | _ -> inval in (* If >0 paths into a state can have an unconsumed character, overestimate safely by saying the state always does have an unconsumed character *) let combineVals = function NoConsumed, NoConsumed -> NoConsumed | _ -> HasConsumed in (* Unconsume must not be called when there is an unconsumed character already *) let isValid inval = function UnconsumeCharacter -> (match inval with HasConsumed -> ["UnconsumeCharacter may be called while there is already an unconsumed character"] | NoConsumed -> [] ) | _ -> [] in (* Initialise each state *) let initialInvals = (List.map (fun (ms, _, _) -> ms, NoConsumed) tokeniserAlgorithm) in (* *) dataflowAnalysis initialInvals calculateOutval calculateOutvalAct combineVals isValid tokeniserAlgorithm (**** Entity semicolons ****) (* Desired property: every entity name can have a semicolon after the end of it *) let entitySemicolons = List.iter (fun (n, v) -> if not (String.contains n ';') && not (List.mem_assoc (n ^ ";") Entities.charRefTable) then (print_endline ("Entity " ^ n ^ " is defined, but not " ^ n ^ ";."); failwith "Verification failed") ) Entities.charRefTable ;; print_endline "Verification passed";; (* TODO: show termination *)