open Tokeniser_interp open Format open Json_type (* The json library doesn't understand surrogates and throws an exception if you try using them, which is bad, so hackily replace them with raw UTF-8 bytes: *) let decode_json_surrogates str = let str = Str.global_substitute (Str.regexp "\\\\u\\([dD][89abAB][0-9a-fA-F][0-9a-fA-F]\\)\\\\u\\([dD][cdefCDEF][0-9a-fA-F][0-9a-fA-F]\\)") (fun s -> Scanf.sscanf (Str.replace_matched "\\1\\2" s) "%04x%04x" (fun hi lo -> let cp = (hi - 0xd800)*1024 + (lo - 0xdc00) + 0x10000 in let a = (cp / (64*64*64)) + 240 in let b = ((cp / (64*64)) mod 64) + 128 in let c = ((cp / 64) mod 64) + 128 in let d = (cp mod 64) + 128 in String.make 1 (char_of_int a) ^ String.make 1 (char_of_int b) ^ String.make 1 (char_of_int c) ^ String.make 1 (char_of_int d) )) str in (* Some invalid characters will break really badly, so cheat *) Str.global_replace (Str.regexp "\\\\u[fF][fF][fF][eEfF]") "\\u0001" ( Str.global_replace (Str.regexp "\\\\u[dD][89a-fA-F][0-9a-fA-F][0-9a-fA-F]") "\\u0001" str) let decode_utf8 str = let rec f = function | [] -> [] | a::rs when a < 192 -> a :: f rs | a::b::rs when a < 224 -> (a*64 + b - 0x00003080) :: f rs | a::b::c::rs when a < 240 -> (((a*64) + b)*64 + c - 0x000E2080) :: f rs | a::b::c::d::rs -> ((((a*64) + b)*64 + c)*64 + d - 0x03C82080) :: f rs | _ -> failwith "malformed utf8" in f str let wchar_list_of_string str = let r = ref [] in for i = 0 to (String.length str)-1 do r := int_of_char str.[i] :: !r done; decode_utf8 (List.rev !r) let reversed_wchar_list_of_string str = List.rev (wchar_list_of_string str) let parseTokens = let starttag name attrs flag = [ TagToken { tagtype = StartTag; name = reversed_wchar_list_of_string name; selfclosing = false; attributes = List.map (function (k, String v) -> reversed_wchar_list_of_string k, reversed_wchar_list_of_string v | _ -> failwith "invalid attribute value") attrs; droppingattribute = false; } ] in let f = function | Array [String "DOCTYPE"; String name; pubid; sysid; Bool correct] -> [ DoctypeToken { doctypename = reversed_wchar_list_of_string name; pubid = (match pubid with Null -> None | String s -> Some (reversed_wchar_list_of_string s)); sysid = (match sysid with Null -> None | String s -> Some (reversed_wchar_list_of_string s)); correct = correct } ] | Array [String "StartTag"; String name; Object attrs] -> starttag name attrs false | Array [String "StartTag"; String name; Object attrs; Bool true] -> starttag name attrs true | Array [String "EndTag"; String name] -> [ TagToken { tagtype = EndTag; name = reversed_wchar_list_of_string name; selfclosing = false; attributes = []; droppingattribute = false } ] | Array [String "Comment"; String c] -> [ CommentToken (reversed_wchar_list_of_string c) ] | Array [String "Character"; String cs] -> List.rev (List.map (fun c -> CharacterToken c) (reversed_wchar_list_of_string cs)) | String "ParseError" -> [ ParseErrorToken ] | _ -> failwith "unexpected value in 'output'" in function | Array tokens -> EndOfFileToken :: List.rev (List.flatten (List.map f tokens)) | _ -> failwith "'output' should be array of tokens'" (* Remove bits of tokens that aren't relevant for comparing test case output *) let normaliseTokens ignoreErrorOrder ts = let f = function | TagToken ({ tagtype = StartTag; attributes = attrs } as tag) -> TagToken { tag with selfclosing = false; attributes = List.sort (fun (a, _) (b, _) -> compare a b) attrs; droppingattribute = false } | TagToken ({ tagtype = EndTag } as tag) -> TagToken { tag with selfclosing = false; attributes = []; droppingattribute = false } | token -> token in let ts = List.map f ts in if ignoreErrorOrder then let (a, b) = List.partition (function ParseErrorToken -> true | _ -> false) ts in a @ b else ts let slurp filename = let file = open_in filename in let len = in_channel_length file in let buf = String.make len '?' in really_input file buf 0 len; buf let runTokeniserTests filename = let json_obj = Json_io.json_of_string (decode_json_surrogates (slurp filename)) in (match json_obj with | Object ["tests", Array tests] -> List.iter (function test -> let testData = Browse.make_table (Browse.objekt test) in let description = Browse.string (Browse.field testData "description") in let input = Browse.string (Browse.field testData "input") in let ignoreErrorOrder = (match Browse.fieldx testData "ignoreErrorOrder" with Bool true -> true | _ -> false) in let expected = normaliseTokens ignoreErrorOrder (parseTokens (Browse.field testData "output")) in let outputState = Tokeniser_interp.tokenise2 (wchar_list_of_string input) in let output = normaliseTokens ignoreErrorOrder outputState.tokenStream in if output = expected then ( print_string "OK ("; print_string description; print_string ")"; print_newline (); ) else ( print_string "Not OK ("; print_string description; print_string ")"; print_newline (); print_string "Input:\n"; print_string input; print_newline (); print_string "Expected:\n"; print_string ("[ " ^ Tokeniser_interp.printTokenised expected ^ " ]"); print_newline (); print_string "Got:\n"; print_string ("[ " ^ Tokeniser_interp.printTokenised output ^ " ]"); print_newline (); print_newline (); ) ) tests | _ -> failwith "tests should be a JSON array") let _ = Arg.parse [ ("-tokeniser", Arg.String runTokeniserTests, "run JSON test file"); ] (fun _ -> ()) "foo"