#include #include #include #include #include #include #include #include #define COLLECT_STATS typedef wchar_t Char; // TODO: sizeof(wchar_t) == 2 in MSVC, but this code assumes it can hold all 0x110000 codepoints, which is wrong typedef std::basic_string String; const Char Char_EOF = 0; // safe because all U+0000 get converted to U+FFFD before reaching the tokeniser class IInputStream { public: virtual ~IInputStream() {} virtual Char get() = 0; // must return Char_EOF on EOF virtual bool eof() = 0; // must return true after EOF has been reached (to distinguish Char_EOF from U+0000) (TODO: that's not very nice) }; class ITokenStream { public: struct IToken { virtual ~IToken() {} }; struct IDoctypeToken : public IToken { IDoctypeToken() : hasPubId(false), hasSysId(false), correct(true) {} String name, pubId, sysId; bool hasPubId, hasSysId; bool correct; }; struct ITagToken : public IToken { String name; std::vector > attributes; }; struct IStartTagToken : public ITagToken { }; struct IEndTagToken : public ITagToken { }; struct ICommentToken : public IToken { String value; }; virtual ~ITokenStream() {} virtual IDoctypeToken* createNewDoctypeToken() = 0; virtual IStartTagToken* createNewStartTagToken() = 0; virtual IEndTagToken* createNewEndTagToken() = 0; virtual ICommentToken* createNewCommentToken() = 0; virtual void emitDoctypeToken(IDoctypeToken* token) = 0; virtual int emitTagToken(ITagToken* token) = 0; // returns -1, or else a ContentModel to switch to virtual void emitCommentToken(ICommentToken* token) = 0; virtual void emitCharacterToken(Char c) = 0; virtual void emitEOFToken() = 0; virtual void emitParseErrorToken(const char* msg) = 0; }; // JSON-formatted ASCII output std::string escapeString(const String& str) { std::string s; s.reserve(str.length()); for (size_t i = 0; i < str.length(); ++i) { Char c = str[i]; if (c == '\\') s += "\\\\"; else if (c == '"') s += "\\\""; else if (c < 0x20 || c >= 0x80) { char h[16]; if (c <= 0xFFFF) sprintf(h, "\\u%04X", c); else sprintf(h, "\\u%04X\\u%04X", 0xD800 + ((c-0x10000) >> 10), 0xDC00 + ((c-0x10000) & 0x3FF)); s += h; } else s += c; } return s; } #include "tokeniser_auto.cpp" // class Tokeniser // { Char currentCharacter; MachineState machineState; ContentModel contentModel; bool escapeFlag; ITokenStream::IToken* currentToken; String lastStartTagName; bool ignoreAttributeValue; IInputStream& inputStream; char prevChar; // for ignoring the LF in a CRLF pair std::vector tempCharBuffer; // TODO: it's not entirely obvious that this is correct in the presence // of reconsumption. Also, need to be able to test this. Char backCharRing[4]; int backCharRingPos; // position for next valid character ITokenStream& tokenStream; #ifdef COLLECT_STATS std::map count_UnrecognisedEntityNames; std::map count_DuplicateAttributes; std::map count_Annotations; void annotate(const char* msg) { //++count_Annotations[msg]; } public: void dumpStats() { std::cout << "Unrecognised entity names\n"; for (std::map::iterator it = count_UnrecognisedEntityNames.begin(); it != count_UnrecognisedEntityNames.end(); ++it) std::cout << escapeString(it->first) << "\t" << it->second << "\n"; std::cout << "\n"; std::cout << "Duplicate attribute names\n"; for (std::map::iterator it = count_DuplicateAttributes.begin(); it != count_DuplicateAttributes.end(); ++it) std::cout << escapeString(it->first) << "\t" << it->second << "\n"; std::cout << "\n"; std::cout << "Annotations\n"; for (std::map::iterator it = count_Annotations.begin(); it != count_Annotations.end(); ++it) std::cout << it->first << "\t" << it->second << "\n"; std::cout << "\n"; } private: #else #define annotate(msg) #endif Char consumeCharacter() { Char c; if (tempCharBuffer.empty()) { c = inputStream.get(); if (c == 0x000A && prevChar == 0x000D) { prevChar = c; c = inputStream.get(); } else { prevChar = c; if (c == 0x0000 && !inputStream.eof()) { c = 0xFFFD; parseError("Found U+0000 in input stream"); // this is an asynchronous error - it's not // important where it ends up in the token stream } else if (c == 0x000D) { c = 0x000A; } } // TODO: it's not at all obvious that this is correct (particularly // because of reconsumption) if (contentModel == RCDATA || contentModel == CDATA) { backCharRing[backCharRingPos] = c; backCharRingPos = (backCharRingPos + 1) & 3; } } else { c = tempCharBuffer.back(); tempCharBuffer.pop_back(); } return c; } void reconsumeCharacter(Char c) { tempCharBuffer.push_back(c); } Char getOldCharacter(size_t dist) { return backCharRing[(backCharRingPos-1 - dist) & 3]; } bool isEndOfCData() { if (! (contentModel == RCDATA || contentModel == CDATA)) // TODO: some tests in the .inl are redundant with this return false; // TODO: fragment case (no last start tag token) size_t n = 0; Char c = currentCharacter; std::vector cs; while (n < lastStartTagName.size()) { Char in = (c >= 'A' && c <= 'Z' ? c + 0x0020 : c); if (in != lastStartTagName[n]) { while (! cs.empty()) { reconsumeCharacter(cs.back()); cs.pop_back(); } return false; } c = consumeCharacter(); cs.push_back(c); ++n; } bool ok = (c == 0x0009 || c == 0x000A || c == 0x000B || c == 0x000C || c == 0x0020 || c == 0x003E || c == 0x002F || c == Char_EOF); while (! cs.empty()) { reconsumeCharacter(cs.back()); cs.pop_back(); } return ok; } bool isFollowedBy(const char* str) { Char c = currentCharacter; bool ok = true; std::stack cs; while (true) { Char lc = (c >= 'A' && c <= 'Z' ? c + 0x0020 : c); if (*str != lc) { ok = false; break; } ++str; if (*str) { c = consumeCharacter(); cs.push(c); } else break; } while (! cs.empty()) { reconsumeCharacter(cs.top()); cs.pop(); } return ok; } int entityLength; int entityChar; bool hasConsumableEntity(bool isInAttribute, int allowedChar) { Char c = consumeCharacter(); switch (c) { case 0x0009: case 0x000A: case 0x000B: case 0x000C: case 0x0020: case 0x003C: case 0x0026: case Char_EOF: reconsumeCharacter(c); return false; case 0x0023: { Char c = consumeCharacter(); Char hex = 0; if (c == 0x0078 || c == 0x0058) { hex = c; c = consumeCharacter(); } entityChar = 0; bool matchedAny = false; while (true) { if (entityChar > 0x10FFFF) entityChar = 0x110000; // clamp it to an invalid value, so it doesn't overflow if (c >= 0x0030 && c <= 0x0039) entityChar = (entityChar * (hex ? 16 : 10)) + (c - 0x0030); else if (hex && c >= 0x0061 && c <= 0x0066) entityChar = (entityChar * 16) + (c - 0x0061 + 10); else if (hex && c >= 0x0041 && c <= 0x0046) entityChar = (entityChar * 16) + (c - 0x0041 + 10); else break; matchedAny = true; c = consumeCharacter(); } if (! matchedAny) { reconsumeCharacter(c); if (hex) reconsumeCharacter(hex); reconsumeCharacter(0x0023); parseError("Missing number after '&#'"); return false; } if (c != 0x003B) { reconsumeCharacter(c); parseError("Missing ';' after numeric entity"); } if (entityChar == 0 || entityChar > 0x10FFFF || (entityChar >= 0xD800 && entityChar <= 0xDFFF)) { parseError("Invalid value for numeric entity"); entityChar = 0xFFFD; return true; } Char subst = lookupCharacter(entityChar); if (subst) { parseError("Substituted value for numeric entity"); entityChar = subst; } return true; } default: { if (allowedChar != 0 && c == allowedChar) { reconsumeCharacter(c); return false; } // The entityNames vector is lexicographically sorted. For each input character, // the range of names with matching prefixes can be narrowed. If a name is an // exact prefix of the input character stream, it will be the lower bound of // one of these ranges, so it can be remembered until either a longer match is // found or no more matches are possible. std::vector cs; cs.reserve(8); size_t csl = 0; // cs.size() at point of longest match cs.push_back(c); entityChar = Char_EOF; bool isMissingSemicolon = false; const char** lower = entityNames; const char** upper = entityNames + sizeof(entityNames) / sizeof(entityNames[0]); for (int i = 0; ; ++i) { std::pair range = std::equal_range(EntityNameIterator(lower, i), EntityNameIterator(upper, i), c); if (range.first.ptr >= range.second.ptr) { // No more matches found #ifdef COLLECT_STATS if (entityChar == Char_EOF && ! isInAttribute) { // Find the whole of what looks like the entity name, so we can report it nicely while ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')) { c = consumeCharacter(); cs.push_back(c); } ++count_UnrecognisedEntityNames[String(cs.begin(), cs.end())]; } #endif // Undo any character consumption since the best entity match while (cs.size() > csl) { c = cs.back(); reconsumeCharacter(c); cs.pop_back(); } // (c is now the character just after the matched part of the entity) if (entityChar == Char_EOF) { // Didn't find anything matching at all parseError(isInAttribute ? "Unrecognised entity name (in attribute)" : "Unrecognised entity name (in text)"); return false; } // Found some prefix break; } else { // Still some possible matches lower = range.first.ptr; upper = range.second.ptr; // If one of those possibilities is an exact match for the characters we've seen // already, its next character will be \0 so it will be the range's lowest value if ((*lower)[i+1] == '\0') { // This is the longest match we've seen so far entityChar = entityValues[range.first.ptr - entityNames]; isMissingSemicolon = ((*lower)[i] != 0x003B); // If this match is good and there aren't any other possible matches, then stop now if (range.first.ptr == range.second.ptr-1) break; csl = cs.size(); } } c = consumeCharacter(); cs.push_back(c); } if (isMissingSemicolon) { parseError("Missing ';' after named entity"); // If we got here from the "No more matches found" case, c is the next character // from the matched entity part, which is what we need in the attribute case. // If we got here from the "Still some possible matches + aren't any other" case, // i.e. we found a perfect match, that must end with a semicolon, so we wouldn't // get into this is-missing-semicolon bit so it doesn't matter what c is. if (isInAttribute && ((c >= 0x0030 && c <= 0x0039) || (c >= 0x0041 && c <= 0x005A) || (c >= 0x0061 && c <= 0x007A))) { // Unconsume everything while (! cs.empty()) { reconsumeCharacter(cs.back()); cs.pop_back(); } return false; } } return true; } } } // Helper for hasConsumableEntity when looking for the range matching a certain // character - iterates over char**s but returns a single (choosable) character // from within each string instead of having to compare the whole string struct EntityNameIterator { typedef std::random_access_iterator_tag iterator_category; typedef Char value_type; typedef ptrdiff_t difference_type; typedef Char* pointer; typedef Char& reference; const char** ptr; int o; EntityNameIterator() : ptr(NULL), o(0) {} EntityNameIterator(const char** ptr, int charOffset) : ptr(ptr), o(charOffset) { } difference_type operator-(EntityNameIterator& a) const { return ptr - a.ptr; } EntityNameIterator& operator++() { ++ptr; return *this; } EntityNameIterator& operator+=(difference_type d) { ptr += d; return *this; } Char operator*() const { return (*ptr)[o]; } }; // TODO: be certain that this is always called after hasConsumableEntity succeeded, // with nothing changing the character stream in the middle, so that it doesn't // matter we're cheating in terms of when the consumption happens void consumeAndEmitEntity(int allowedChar) { emitCharacterToken(entityChar); entityChar = 0; } void consumeAndAppendEntity(int allowedChar) { appendToTagTokenAttributeValue(entityChar); entityChar = 0; } void emitEOFToken() { assert(! currentToken); tokenStream.emitEOFToken(); } void emitCharacterToken(Char c) { assert(! currentToken); tokenStream.emitCharacterToken(c); } void createStartTagToken() { assert(! currentToken); currentToken = tokenStream.createNewStartTagToken(); ((ITokenStream::ITagToken*)currentToken)->name.reserve(8); // fits ~99.7% ((ITokenStream::ITagToken*)currentToken)->attributes.reserve(6); // fits ~99.2% } void createEndTagToken() { assert(! currentToken); currentToken = tokenStream.createNewEndTagToken(); ((ITokenStream::ITagToken*)currentToken)->name.reserve(8); // fits ~99.5% } void createCommentToken() { assert(! currentToken); currentToken = tokenStream.createNewCommentToken(); } void createDoctypeToken() { assert(! currentToken); currentToken = tokenStream.createNewDoctypeToken(); } void createTagTokenAttribute() { assert(dynamic_cast(currentToken)); ((ITokenStream::ITagToken*)currentToken)->attributes.push_back(std::make_pair(String(), String())); ((ITokenStream::ITagToken*)currentToken)->attributes.back().first.reserve(12); // fits ~99.7% ((ITokenStream::ITagToken*)currentToken)->attributes.back().second.reserve(12); // fits ~53.2% (the tail on the length distribution is huge) ignoreAttributeValue = false; } void handleDuplicateAttributes() { assert(dynamic_cast(currentToken)); assert(! ((ITokenStream::ITagToken*)currentToken)->attributes.empty()); assert(! ignoreAttributeValue); std::vector >& attrs = ((ITokenStream::ITagToken*)currentToken)->attributes; for (size_t i = 0; i < attrs.size()-1; ++i) { if (attrs[i].first == attrs.back().first) { parseError("Duplicate attribute"); #ifdef COLLECT_STATS ++count_DuplicateAttributes[attrs[i].first]; #endif attrs.pop_back(); ignoreAttributeValue = true; return; } } } void appendToTagTokenAttributeName(Char c) { assert(dynamic_cast(currentToken)); assert(! ((ITokenStream::ITagToken*)currentToken)->attributes.empty()); assert(! ignoreAttributeValue); ((ITokenStream::ITagToken*)currentToken)->attributes.back().first.push_back(c); } void appendToTagTokenAttributeValue(Char c) { assert(dynamic_cast(currentToken)); assert(! ((ITokenStream::ITagToken*)currentToken)->attributes.empty()); if (! ignoreAttributeValue) ((ITokenStream::ITagToken*)currentToken)->attributes.back().second.push_back(c); } void appendToTagTokenName(Char c) { assert(dynamic_cast(currentToken)); ((ITokenStream::ITagToken*)currentToken)->name.push_back(c); } void appendToCommentToken(Char c) { assert(dynamic_cast(currentToken)); ((ITokenStream::ICommentToken*)currentToken)->value.push_back(c); } void setDoctypeTokenIncorrect() { assert(dynamic_cast(currentToken)); ((ITokenStream::IDoctypeToken*)currentToken)->correct = false; } void appendToDoctypeTokenName(Char c) { assert(dynamic_cast(currentToken)); ((ITokenStream::IDoctypeToken*)currentToken)->name.push_back(c); } void appendToDoctypeTokenPubId(Char c) { assert(dynamic_cast(currentToken)); ((ITokenStream::IDoctypeToken*)currentToken)->pubId.push_back(c); } void appendToDoctypeTokenSysId(Char c) { assert(dynamic_cast(currentToken)); ((ITokenStream::IDoctypeToken*)currentToken)->sysId.push_back(c); } void setDoctypeTokenPubIdEmpty() { assert(dynamic_cast(currentToken)); assert(((ITokenStream::IDoctypeToken*)currentToken)->pubId == L""); ((ITokenStream::IDoctypeToken*)currentToken)->hasPubId = true; } void setDoctypeTokenSysIdEmpty() { assert(dynamic_cast(currentToken)); assert(((ITokenStream::IDoctypeToken*)currentToken)->sysId == L""); ((ITokenStream::IDoctypeToken*)currentToken)->hasSysId = true; } void parseError(const char* msg) { tokenStream.emitParseErrorToken(msg); } void parseErrorIfNonpermittedSlash() { bool permitted = false; Char c = consumeCharacter(); if (c == '>') { ITokenStream::IStartTagToken* t = dynamic_cast(currentToken); if (t) { const String& name = t->name; if (name == L"base" || name == L"link" || name == L"meta" || name == L"hr" || name == L"br" || name == L"img" || name == L"embed" || name == L"param" || name == L"area" || name == L"col" || name == L"input") permitted = true; } } reconsumeCharacter(c); if (! permitted) parseError("Non-permitted character '/'"); } void parseErrorIfEndTagWithAttributes() { ITokenStream::IEndTagToken* t = dynamic_cast(currentToken); if (t && ! t->attributes.empty()) parseError("Attribute on end tag"); } void emitCurrentTagToken() { assert(dynamic_cast(currentToken)); // TODO: some of the casting (here and elsewhere) is redundant - should // perhaps split start vs end earlier ITokenStream::IStartTagToken* start = dynamic_cast(currentToken); ITokenStream::IEndTagToken* end = dynamic_cast(currentToken); if (start) lastStartTagName = start->name; if (end) contentModel = PCDATA; int newContentModel = tokenStream.emitTagToken((ITokenStream::ITagToken*)currentToken); if (newContentModel != -1) contentModel = (ContentModel)newContentModel; currentToken = NULL; } void emitCurrentCommentToken() { assert(dynamic_cast(currentToken)); tokenStream.emitCommentToken((ITokenStream::ICommentToken*)currentToken); currentToken = NULL; } void emitCurrentDoctypeToken() { assert(dynamic_cast(currentToken)); tokenStream.emitDoctypeToken((ITokenStream::IDoctypeToken*)currentToken); currentToken = NULL; } std::map counts; void count(const char* str) { ++counts[str]; } void printCounts() { for (std::map::iterator it = counts.begin(); it != counts.end(); ++it) { std::cout << it->first << " " << it->second << "\n"; } } public: Tokeniser(IInputStream& inputStream, ITokenStream& tokenStream) : inputStream(inputStream), tokenStream(tokenStream) { ignoreAttributeValue = true; currentToken = NULL; machineState = DataState; contentModel = PCDATA; escapeFlag = false; prevChar = 0; backCharRingPos = 0; backCharRing[0] = backCharRing[1] = backCharRing[2] = backCharRing[3] = Char_EOF; } void setContentModel(ContentModel cm) { contentModel = cm; } void setLastStartTagName(const String& n) { lastStartTagName = n; } void tokenise() { run(); } }; class istreamAdaptor : public IInputStream { // TODO: This assumes the input has already been correctly converted from bytes // into characters, which is not possible for anything except Latin-1 std::istream& src; // Read into a temporary buffer, because that's significantly // more efficient than using istream::get(), when reading from // files or from std::cin static const size_t bufSize = 65536; char buf[bufSize]; size_t bufPos; size_t bufEnd; bool isEof; public: istreamAdaptor(std::istream& src) : src(src), bufPos(0), bufEnd(0), isEof(false) {} virtual Char get() { if (bufPos >= bufEnd) { if (! src.good()) { isEof = true; return Char_EOF; } src.read(buf, bufSize); bufEnd = src.gcount(); bufPos = 0; if (bufEnd == 0) { isEof = true; return Char_EOF; } } return (Char)(unsigned char)buf[bufPos++]; } virtual bool eof() { return isEof; } }; // When we don't have a proper tree construction thing, it's still useful // to handle non-PCDATA sections properly int simulateContentModel(const String& name) { if (name == L"title" || name == L"textarea") return Tokeniser::RCDATA; else if (name == L"style" || name == L"script" || name == L"xmp") return Tokeniser::CDATA; else if (name == L"plaintext") return Tokeniser::PLAINTEXT; else return -1; } class TokenStream : public ITokenStream { public: virtual void begin() = 0; virtual void end() = 0; }; class TestTokenStream : public TokenStream { static const bool fakeCdata = true; struct Token { virtual ~Token() {} virtual void print(std::string& r) const = 0; }; struct DoctypeToken : public IDoctypeToken, Token { virtual void print(std::string& r) const { r = "[\"DOCTYPE\", \"" + escapeString(name) + "\", " + (hasPubId ? "\"" + escapeString(pubId) + "\"" : "null") + ", " + (hasSysId ? "\"" + escapeString(sysId) + "\"" : "null") + ", " + (correct ? "true" : "false") + "]"; } }; struct StartTagToken : public IStartTagToken { virtual void print(std::string& r) const { r = "[\"StartTag\", \""; r += escapeString(name); r += "\", {"; for (size_t i = 0; i < attributes.size(); ++i) { if (i) r += ", "; r += "\""; r += escapeString(attributes[i].first); r += "\":\""; r += escapeString(attributes[i].second); r += "\""; } r += "}]"; } }; struct EndTagToken : public IEndTagToken { virtual void print(std::string& r) const { r = "[\"EndTag\", \""; r += escapeString(name); r += "\"]"; } }; struct CommentToken : public ICommentToken, Token { virtual void print(std::string& r) const { r = "[\"Comment\", \""; r += escapeString(value); r += "\"]"; } }; // Tokens not from ITokenStream: struct CharacterToken : public Token { virtual void print(std::string& r) const { r = "[\"Character\", \""; r += escapeString(value); r += "\"]"; } String value; }; struct EOFToken : public Token { virtual void print(std::string& r) const { r = "\"EOF\""; } }; struct ParseErrorToken : public Token { virtual void print(std::string& r) const { r = "\"ParseError\""; } }; public: TestTokenStream() : tokenId(0) { } virtual void begin() { std::cout << "["; } virtual void end() { assert(coalescingChars.empty()); // must be true because EOF is the last token emitted std::cout << "]\n"; } virtual IDoctypeToken* createNewDoctypeToken() { return new DoctypeToken(); } virtual IStartTagToken* createNewStartTagToken() { return new StartTagToken(); } virtual IEndTagToken* createNewEndTagToken() { return new EndTagToken(); } virtual ICommentToken* createNewCommentToken() { return new CommentToken(); } String coalescingChars; int tokenId; std::string printBuf; // reuse the same string to avoid repeated allocations void emitToken(const Token& token) { // Flush any collected character tokens if (! coalescingChars.empty()) { CharacterToken t; t.value = coalescingChars; coalescingChars.clear(); emitToken(t); } // Skip EOF tokens because the test cases don't include them if (dynamic_cast(&token)) return; // Add the comma separator (except before the first token) if (tokenId) std::cout << ", "; token.print(printBuf); std::cout << printBuf; ++tokenId; } virtual int emitTagToken(ITagToken* token) { emitToken(*(Token*)token); IStartTagToken* start = dynamic_cast(token); int contentModel = -1; if (fakeCdata && start) contentModel = simulateContentModel(start->name); delete token; return contentModel; } virtual void emitCommentToken(ICommentToken* token) { emitToken(*(Token*)token); delete token; } virtual void emitDoctypeToken(IDoctypeToken* token) { emitToken(*(Token*)token); delete token; } virtual void emitCharacterToken(Char c) { coalescingChars += c; } virtual void emitEOFToken() { emitToken(EOFToken()); } virtual void emitParseErrorToken(const char* msg) { emitToken(ParseErrorToken()); } }; class NullTokenStream : public TokenStream { // Prepare some token objects, so we don't have to do dynamic allocation IDoctypeToken doctype; IStartTagToken startTag; IEndTagToken endTag; ICommentToken commentTag; public: virtual void begin() {} virtual void end() {} virtual IDoctypeToken* createNewDoctypeToken() { doctype = IDoctypeToken(); // reset its contents return &doctype; } virtual IStartTagToken* createNewStartTagToken() { startTag = IStartTagToken(); return &startTag; } virtual IEndTagToken* createNewEndTagToken() { endTag = IEndTagToken(); return &endTag; } virtual ICommentToken* createNewCommentToken() { commentTag = ICommentToken(); return &commentTag; } virtual void emitDoctypeToken(IDoctypeToken* token) {} virtual int emitTagToken(ITagToken* token) { return -1; } virtual void emitCommentToken(ICommentToken* token) {} virtual void emitCharacterToken(Char c) {} virtual void emitEOFToken() {} virtual void emitParseErrorToken(const char* msg) {} }; class StatsTokenStream : public NullTokenStream { typedef std::pair, std::pair > Doctype; std::map count_StartTagNames; std::map, int> count_StartTagAttributeNames; std::map, int> count_StartTagAttributeNumbers; std::map, int> count_StartTagAttributeValueLengths; std::map count_EndTagNames; std::map, int> count_EndTagAttributeNames; std::map, int> count_EndTagAttributeNumbers; std::map count_Doctypes; std::map count_ParseErrors; static const bool fakeCdata = true; public: virtual int emitTagToken(ITagToken* token) { IStartTagToken* start = dynamic_cast(token); if (start) { ++count_StartTagNames[start->name]; for (size_t i = 0; i < start->attributes.size(); ++i) { ++count_StartTagAttributeNames[std::make_pair(start->name, start->attributes[i].first)]; ++count_StartTagAttributeValueLengths[std::make_pair(start->attributes[i].first, start->attributes[i].second.size())]; } ++count_StartTagAttributeNumbers[std::make_pair(start->name, start->attributes.size())]; } IEndTagToken* end = dynamic_cast(token); if (end) { ++count_EndTagNames[end->name]; for (size_t i = 0; i < end->attributes.size(); ++i) ++count_EndTagAttributeNames[std::make_pair(end->name, end->attributes[i].first)]; ++count_EndTagAttributeNumbers[std::make_pair(end->name, end->attributes.size())]; } if (fakeCdata && start) return simulateContentModel(start->name); else return -1; } virtual void emitDoctypeToken(IDoctypeToken* token) { ++count_Doctypes[std::make_pair( std::make_pair(token->name, token->hasPubId ? token->pubId : L"@NONE@"), std::make_pair(token->hasSysId ? token->sysId : L"@NONE@", token->correct) )]; } virtual void emitParseErrorToken(const char* msg) { ++count_ParseErrors[msg]; } virtual void end() { std::cout << "Start tag names\n"; for (std::map::iterator it = count_StartTagNames.begin(); it != count_StartTagNames.end(); ++it) std::cout << escapeString(it->first) << "\t" << it->second << "\n"; std::cout << "\n"; std::cout << "End tag names\n"; for (std::map::iterator it = count_EndTagNames.begin(); it != count_EndTagNames.end(); ++it) std::cout << escapeString(it->first) << "\t" << it->second << "\n"; std::cout << "\n"; std::cout << "Start tag attribute names\n"; for (std::map, int>::iterator it = count_StartTagAttributeNames.begin(); it != count_StartTagAttributeNames.end(); ++it) std::cout << escapeString(it->first.first) << "\t" << escapeString(it->first.second) << "\t" << it->second << "\n"; std::cout << "\n"; std::cout << "End tag attribute names\n"; for (std::map, int>::iterator it = count_EndTagAttributeNames.begin(); it != count_EndTagAttributeNames.end(); ++it) std::cout << escapeString(it->first.first) << "\t" << escapeString(it->first.second) << "\t" << it->second << "\n"; std::cout << "\n"; std::cout << "Start tag attribute counts\n"; for (std::map, int>::iterator it = count_StartTagAttributeNumbers.begin(); it != count_StartTagAttributeNumbers.end(); ++it) std::cout << escapeString(it->first.first) << "\t" << it->first.second << "\t" << it->second << "\n"; std::cout << "\n"; std::cout << "End tag attribute counts\n"; for (std::map, int>::iterator it = count_EndTagAttributeNumbers.begin(); it != count_EndTagAttributeNumbers.end(); ++it) std::cout << escapeString(it->first.first) << "\t" << it->first.second << "\t" << it->second << "\n"; std::cout << "\n"; std::cout << "Start tag attribute value lengths\n"; for (std::map, int>::iterator it = count_StartTagAttributeValueLengths.begin(); it != count_StartTagAttributeValueLengths.end(); ++it) std::cout << escapeString(it->first.first) << "\t" << it->first.second << "\t" << it->second << "\n"; std::cout << "\n"; std::cout << "Doctypes\n"; for (std::map::iterator it = count_Doctypes.begin(); it != count_Doctypes.end(); ++it) std::cout << escapeString(it->first.first.first) << "\t" << escapeString(it->first.first.second) << "\t" << escapeString(it->first.second.first) << "\t" << it->first.second.second << "\t" << it->second << "\n"; std::cout << "\n"; std::cout << "Parse errors\n"; for (std::map::iterator it = count_ParseErrors.begin(); it != count_ParseErrors.end(); ++it) std::cout << it->first << "\t" << it->second << "\n"; std::cout << "\n"; } }; int main(int argc, char** argv) { std::vector args; for (int i = 1; i < argc; ++i) args.push_back(argv[i]); if (find(args.begin(), args.end(), "--help") != args.end()) { std::cout << "Arguments:\n"; std::cout << " --file=f : read from named file instead of stdin.\n"; std::cout << " --repeat=n : repeat n times.\n"; std::cout << " --json : outputs a JSON copy of the token stream.\n"; std::cout << " --stats : count various statistics about the data.\n"; std::cout << " --time : reports the time spent in the tokeniser.\n"; std::cout << " --cdata=t : start in CDATA mode, with t as the last start tag name.\n"; std::cout << " --rcdata=t : start in RCDATA mode, with t as the last start tag name.\n"; std::cout << " --plaintext : start in PLAINTEXT mode.\n"; exit(0); } std::string filename; int repeat = 1; bool reportTime = (find(args.begin(), args.end(), "--time") != args.end()); Tokeniser::ContentModel contentModel = Tokeniser::PCDATA; String lastStartTagName; for (size_t i = 0; i < args.size(); ++i) { if (args[i].find("--file=") == 0) filename = args[i].substr(7); else if (args[i].find("--repeat=") == 0) { std::stringstream s; s << args[i].substr(9); s >> repeat; } else if (args[i].find("--cdata=") == 0) { contentModel = Tokeniser::CDATA; std::string t = args[i].substr(8); lastStartTagName = String(t.begin(), t.end()); } else if (args[i].find("--rcdata=") == 0) { contentModel = Tokeniser::RCDATA; std::string t = args[i].substr(9); lastStartTagName = String(t.begin(), t.end()); } else if (args[i].find("--plaintext") == 0) { contentModel = Tokeniser::PLAINTEXT; } } for (int i = 0; i < repeat; ++i) { TokenStream* tokenStream; bool doStats = (find(args.begin(), args.end(), "--stats") != args.end()); if (find(args.begin(), args.end(), "--json") != args.end()) tokenStream = new TestTokenStream(); else if (doStats) tokenStream = new StatsTokenStream(); else tokenStream = new NullTokenStream(); bool deleteIstream; std::istream* istream; if (filename.empty()) { istream = &std::cin; deleteIstream = false; } else { istream = new std::ifstream (filename.c_str(), std::ifstream::in | std::ifstream::binary); deleteIstream = true; } timespec t0, t1; { istreamAdaptor inputStream(*istream); tokenStream->begin(); clock_gettime(CLOCK_REALTIME, &t0); Tokeniser tokeniser (inputStream, *tokenStream); tokeniser.setContentModel(contentModel); tokeniser.setLastStartTagName(lastStartTagName); tokeniser.tokenise(); clock_gettime(CLOCK_REALTIME, &t1); tokenStream->end(); #ifdef COLLECT_STATS if (doStats) tokeniser.dumpStats(); #endif } std::cout << std::flush; if (reportTime) std::cerr << "Took " << (double(t1.tv_sec)-double(t0.tv_sec) + (double(t1.tv_nsec)-double(t0.tv_nsec))/1e9) << " seconds\n"; if (deleteIstream) delete istream; delete tokenStream; } }