open Common open Tokeniser_spec open Tokeniser_states open Tokeniser_interp (* For each reachable machineState * contentModel * escapeFlag try to create test cases for every possible combination of matcher inputs *) type tokeniser_state = machine_state * content_model * bool (* escape flag *) type tokeniser_test_case = tokeniser_state * wchar list * (* character stream that led to this state *) token list (* output token stream *) let reachedStates : tokeniser_test_case list ref = ref [ ((DataState, PCDATA, false), [], []) ] (*let reachedStates = ref [ (CloseTagOpenState, PCDATA, false, [0x3c;0x2f]) ]*) (* Given the algorithm steps that are about to be executed, return a list of characters that are 'interesting' (i.e. going to trigger some particular cases in the algorithm) *) let relevantStrings steps = (* List.map (fun c -> [c]) (* [0x0000; 0x0009; 0x000A; 0x000B; 0x000C; 0x0020; 0x0021; 0x0022; 0x0026; 0x0027; 0x002D; 0x002F; 0x003C; 0x003D; 0x003E; 0x003F; 0x0040; 0x0041; 0x0042; 0x005A; 0x0060; 0x0061; 0x0062; 0x0079; 0x007A; 0x007B]*) [0x0020; 0x0061; 0x0062; 0x0079; 0x007A; 0x0040; 0x0041; 0x0042; 0x0059; 0x005A; 0x0021; 0x0022; 0x0026; 0x0027; 0x002D; 0x002F; 0x003C; 0x003D; 0x003E; 0x003F; 0x0060; 0x007B; 0x0000; 0x0009; 0x000A; 0x000B; 0x000C]*) let rec f = function AND(a, b) -> (f a) @ (f b) | OR(a, b) -> (f a) @ (f b) | NOT a -> f a | IsConsumedCharacter c -> [ [c]; [c-1]; [c+1] ] | IsConsumedCharacterEOF -> [ [0] ] | IsConsumedCharacterInRange (a, b) -> [ [a]; [b]; [a-1]; [a+1]; [b-1]; [b+1] ] | IsStartOfComment -> [ [0x2D]; [0x2D; 0x2D] ] | IsDoctype -> [ [0x44; 0x4F; 0x43; 0x54; 0x59; 0x50; 0x45] ] | IsPublic -> [ [0x50; 0x55; 0x42; 0x4C; 0x49; 0x43] ] | IsSystem -> [ [0x53; 0x59; 0x53; 0x54; 0x45; 0x4D] ] (* | IsDoctype -> [ [0x44; 0x4F; 0x43; 0x54; 0x59; 0x50; 0x45]; [0x64; 0x6F; 0x43; 0x74; 0x59; 0x70; 0x45] ] *) (* | IsPublic -> [ [0x50; 0x55; 0x42; 0x4C; 0x49; 0x43]; [0x70; 0x75; 0x42; 0x6C; 0x49; 0x63] ] *) (* | IsSystem -> [ [0x53; 0x59; 0x53; 0x54; 0x45; 0x4D]; [0x73; 0x59; 0x53; 0x74; 0x45; 0x4D] ] *) | _ -> [ ] in unique ( (List.map (fun c -> [c]) [ 0x0020; 0x0061; 0x0062; 0x0079; 0x007A; 0x0041; 0x0042; 0x0059; 0x005A; 0x0030; 0x0031; 0x0039; 0x0040; 0x0021; 0x0022; 0x0026; 0x0027; 0x002D; 0x002F; 0x003C; 0x003D; 0x003E; 0x003F; 0x0060; 0x007B; 0x0000; 0x0009; 0x000A; 0x000B; 0x000C; 0x100000; ]) (* [0x0020; 0x0078; 0x0021; 0x0022; 0x0026; 0x0027; 0x002D; 0x002F; 0x003C; 0x003D; 0x003E; 0x003F; 0x0000; 0x000B; 0x100000]) *) (* [0x0020; 0x0078]) *) @ List.flatten (List.map (fun (matcher, actions) -> f matcher ) steps) ) (* Remove duplicate states () *) let uniqueStates xs = let rec f us = function ((((a,b,c),_,_) as x)::xs) -> if (List.exists (function ((a',b',c'),_,_) when a=a' && b=b' && c=c' -> true | _ -> false) us) then f us xs else f (x::us) xs | [] -> us in f [] xs let printPath path = (List.fold_left (fun s c -> s ^ (if c = 0 then "" else String.make 1 (char_of_int c))) "" path) (*let testPaths = ref []*) let testCaseCost (_, path, toks) = (* (List.length path) *) (List.length toks, List.length path) (* (List.length path, List.length toks) *) ;; let iterations = 15 in let nonuniques = 1 in for i = 1 to iterations do let reached = List.map(fun ((ms, cm, ef), path, toks) -> let consume, steps = List.assoc ms (List.map (fun (a,b,c) -> a,(b,c)) tokeniserAlgorithm) in let strings = relevantStrings steps in List.map(fun string -> let path' = path @ string in let statePartial = tokenisePartial nullHook (List.rev (-1::(List.rev path'))) in let state = tokenise2 (List.rev (List.rev path')) in (statePartial.machineState, statePartial.contentModel, statePartial.escapeFlag), path', state.tokenStream ) strings ) (*!reachedStates*) (if i <= iterations-nonuniques+1 then (uniqueStates (List.stable_sort (fun a b -> compare (testCaseCost a) (testCaseCost b)) !reachedStates)) else (* !reachedStates *) unique !reachedStates ) (* (uniqueStates (List.sort (fun (_,ia,ta) (_,ib,tb) -> compare (List.length ia) (List.length ib)) !reachedStates)) *) in reachedStates := (List.flatten reached) @ !reachedStates; (*if i > 2 then reachedStates := uniqueStates (!reachedStates);*) (*reachedStates := uniqueStates (List.sort (fun (_,_,_,a) (_,_,_,b) -> compare (List.length a) (List.length b)) !reachedStates);*) (*List.iter (fun (_, _, _, path) -> testPaths := path::!testPaths) (uniqueStates !reachedStates)*) done (*;; reachedStates := uniqueStates (!reachedStates);*) (* ;; reachedStates := uniqueStates (List.stable_sort (fun a b -> compare (testCaseCost a) (testCaseCost b)) !reachedStates) *) ;; let reachedStates = ref (unique (List.map (fun (_, p, t) -> ((0,0,0), p, t)) !reachedStates)) ;; let reachedStates = ref (List.sort (fun (_,a,_) (_,b,_) -> compare a b) !reachedStates) (* TODO: minimise set of test cases *) ;; let retainUsefulPaths r n = let usefulPaths = ref [] in let totalCovered = ref [] in List.iter (fun (_, path) -> let covered = ref [] in let rememberCover = function step -> if not (List.memq step !covered) then covered := step::!covered in tokenisePartial { matchedStep = rememberCover } path; let newCovered = !totalCovered @ (List.filter (fun c -> not (List.memq c !totalCovered)) !covered) in if newCovered <> !totalCovered then (usefulPaths := path::!usefulPaths; totalCovered := newCovered) ) (List.sort (fun (_,a) (_,b) -> n * compare (List.length a) (List.length b)) r); !usefulPaths (*let reachedStates = ref (List.map (fun x -> 0,0,0,x) (retainUsefulPaths !reachedStates 1));; let reachedStates = ref (List.map (fun x -> 0,0,0,x) (retainUsefulPaths !reachedStates (-1)));;*) (* let reachedStates = ref (List.concat (List.map (fun (_, path) -> let state = tokenisePartial nullHook (List.rev (-1::(List.rev path))) in let consume, steps = List.assoc state.machineState (List.map (fun (a,b,c) -> a,(b,c)) tokeniserAlgorithm) in (*let strings = relevantStrings steps in*) let strings = List.map (fun x -> [x]) [0x0000; 0x0009; 0x000A; 0x000B; 0x000C; 0x0020; 0x0021; 0x0022; 0x0026; 0x0027; 0x002D; 0x002F; 0x003C; 0x003D; 0x003E; 0x003F; 0x0040; 0x0041; 0x0042; 0x005A; 0x0060; 0x0061; 0x0062; 0x0079; 0x007A; 0x007B] in List.map(fun string -> let path' = path @ string in (0, 0, 0, path') ) strings ) !reachedStates)) *) (* let q = ref [] ;; try while true do q := (let c = int_of_char (input_char stdin) in match c with 0 -> 0xFFFD | c -> c)::!q done with End_of_file -> () let reachedStates = ref [ 0, 0, 0, List.rev (-1::!q) ] *) (* TODO: cmdline opts to control activity *) (* let reachedStates = ref [] ;; try while true do let explode str = let r = ref [] in for i = 0 to (String.length str)-1 do r := int_of_char str.[i] :: !r done; List.rev (!r) (* -1::!r *) in reachedStates := (0, 0, 0, explode (input_line stdin))::!reachedStates done with End_of_file -> () *) let printCoverageDot algorithm = let out = open_out "tokeniser_test_coverage.dot" in let print str = output_string out (str ^ "\n") in print "digraph G {"; print "graph [ overlap=false ];"; print "node [ color=lightgrey ];"; let covered = ref [] in let rememberCover = function step -> if List.mem_assq step !covered then covered := (step, (List.assq step !covered)+1)::!covered else covered := (step, 1)::!covered in List.iter (fun (_, path, toks) -> tokenisePartial { matchedStep = rememberCover } path; () ) !reachedStates; let edges = ref [] in let nodes = ref [] in List.iter (fun (machineState, consume, steps) -> nodes := machineState :: !nodes; List.iter (fun ((matcher, actions) as step) -> let target = ref machineState in List.iter (function SwitchMachineState ms -> target := ms | _ -> ()) actions; edges := (machineState, !target, try List.assq step !covered with Not_found -> 0) :: !edges; ) steps; ) algorithm; List.iter (fun ms -> print (ms ^ " [ color=black ];") ) (unique (List.map string_of_machine_state !nodes)); let namedEdges = List.map (fun (ms1, ms2, covered) -> string_of_machine_state ms1, string_of_machine_state ms2, covered) !edges in List.iter (function ms1, ms2, covered -> print ("edge [style=\"setlinewidth(" ^ (string_of_float (0.5 +. 0.2 *. (log (1. +. (float_of_int covered))))) ^ ")\"]; " ^ ms1 ^ " -> " ^ ms2 ^ " [color=" ^ (if covered <> 0 then "black" else "red") ^ "];") ) namedEdges; print "}"; close_out out ;;printCoverageDot tokeniserAlgorithm ;; (*List.iter print_endline (unique (List.map printPath (List.map (fun (ms, cm, ef, path) -> path) !reachedStates)))*) let _ = let out = open_out "tokeniser_auto.test" in List.iter (fun ((ms, cm, ef), path, toks) -> (* print_endline (printPath path) *) (* print_endline (printPath path ^ " - " ^ string_of_machine_state ms ^ " " ^ string_of_content_model cm ^ " " ^ (if ef then "true" else "false") ^ " [" ^ printTokenised toks ^ "]") *) let pathStr = printJSONString (List.rev path) in output_string out ("{\"description\":" ^ pathStr ^ ",\n"); output_string out ("\"input\":" ^ pathStr ^ ",\n"); output_string out ("\"output\":[" ^ printTokenised toks ^ "]},\n\n"); ) !reachedStates; close_out out ;; (* List.iter (fun path -> print_endline (printPath path) ) (unique !testPaths) *)