#include "html_parser.h" String HTMLParserAttribute::to_string() { if (single) { return attribute; } if (data.find('"' == -1)) { return attribute + "=\"" + data + "\""; } else { return attribute + "=\'" + data + "\'"; } } void HTMLParserAttribute::print() { to_string().print(); } HTMLParserAttribute::HTMLParserAttribute() { single = false; } HTMLParserAttribute::~HTMLParserAttribute() { } void HTMLParserTag::process() { if (type != HTMLParserTag::HTML_PARSER_TAG_TYPE_NONE) { return; } if (data.size() < 2) { return; } ERR_FAIL_COND(data[0] != '<'); ERR_FAIL_COND(data[data.size() - 1] != '>'); int start_index = 1; if (data[1] == '/') { ++start_index; type = HTMLParserTag::HTML_PARSER_TAG_TYPE_CLOSING_TAG; } else if (data[1] == '!') { if (data.size() < 8) { return; } //test for comment. ++start_index; if (data[2] == '-' && data[3] == '-') { type = HTMLParserTag::HTML_PARSER_TAG_TYPE_COMMENT; int comment_start_index = data.find(' ', 3); if (comment_start_index == -1) { comment_start_index = 4; } tag = data.substr(comment_start_index, comment_start_index - data.size() - 3); } if (data.size() < 11) { return; } //test for doctype. int doctype_start_index = data.find("doctype ", 2); if (doctype_start_index == -1) { return; } type = HTMLParserTag::HTML_PARSER_TAG_TYPE_DOCTYPE; tag = data.substr(doctype_start_index + 8, data.size() - doctype_start_index - 8 - 1); } else { String tag_text; if (data[data.size() - 2] == '/') { //will catch all that looks like
//tags that look like
will be caught later in a post process, in a way //which also tries to catch erroneously not closed tags that supposed to be closed type = HTMLParserTag::HTML_PARSER_TAG_TYPE_SELF_CLOSING_TAG; tag_text = data.substr(1, data.size() - 3); } else { type = HTMLParserTag::HTML_PARSER_TAG_TYPE_OPENING_TAG; tag_text = data.substr(1, data.size() - 2); } int fspc_index = tag_text.find(' '); if (fspc_index == -1) { //no args tag = tag_text; return; } //grab the tag itself tag = tag_text.substr(0, fspc_index); String args = tag_text.substr(fspc_index + 1, tag_text.size() - fspc_index - 1); parse_args(args); } int tag_end_index = data.find(' ', start_index); if (tag_end_index == -1) { //simple tag tag = data.substr(start_index, data.size() - start_index - 1); return; } } void HTMLParserTag::parse_args(const String &args) { attributes.clear(); int i = 0; while (i < args.size()) { if (args[i] == ' ') { //"trim" ++i; continue; } int equals_index = args.find('=', i); HTMLParserAttribute *a = new HTMLParserAttribute(); if (equals_index == -1) { a->attribute = args.substr(i, args.size() - i); a->single = true; attributes.push_back(a); return; } a->attribute = args.substr(i, equals_index - i); //todo //a.trim(); int next_char_index = equals_index + 1; if (next_char_index >= args.size()) { //an attribute looks like this "... attrib=" attributes.push_back(a); return; } //skip spaces while (args[next_char_index] == ' ') { ++next_char_index; if (next_char_index >= args.size()) { //an attribute looks like this "... attrib= " attributes.push_back(a); return; } } char c = args[next_char_index]; char find_char = ' '; if (c == '"' || c == '\'') { ++next_char_index; find_char = c; } int end_index = args.find(find_char, next_char_index); if (end_index == -1) { //missing closing ' or " if c is ' or " //else missing parameter a->data = args.substr(next_char_index, args.size() - next_char_index - 1); attributes.push_back(a); return; } a->data = args.substr(next_char_index, end_index - next_char_index); attributes.push_back(a); i = end_index + 1; } } String HTMLParserTag::to_string(const int level) { String s; for (int i = 0; i < level; ++i) { s += " "; } if (type == HTML_PARSER_TAG_TYPE_CONTENT) { s = data; } else if (type == HTML_PARSER_TAG_TYPE_OPENING_TAG) { int ln = level + 1; s = "<" + tag; for (int i = 0; i < attributes.size(); ++i) { s += " " + attributes[i]->to_string(); } s += ">\n"; for (int i = 0; i < tags.size(); ++i) { s += tags[i]->to_string(ln); } s += "\n"; } else if (type == HTML_PARSER_TAG_TYPE_CLOSING_TAG) { //HTMLParserTag should handle this automatically //it's here for debugging purposes though s = ""; } else if (type == HTML_PARSER_TAG_TYPE_SELF_CLOSING_TAG) { s = "<" + tag; for (int i = 0; i < attributes.size(); ++i) { s += " " + attributes[i]->to_string(); } s += "/>\n"; } else if (type == HTML_PARSER_TAG_TYPE_COMMENT) { s = "\n"; } else if (type == HTML_PARSER_TAG_TYPE_DOCTYPE) { s = data + "\n"; } return s; } void HTMLParserTag::print() { to_string().print(); } HTMLParserTag::HTMLParserTag() { type = HTMLParserTag::HTML_PARSER_TAG_TYPE_NONE; } HTMLParserTag::~HTMLParserTag() { for (int i = 0; i < tags.size(); ++i) { delete tags[i]; } for (int i = 0; i < attributes.size(); ++i) { delete attributes[i]; } } void HTMLParser::parse(const String &data) { Vector tags; //split into tags for (int i = 0; i < data.size(); ++i) { if (data[i] == '<') { for (int j = i + 1; j < data.size(); ++j) { if (data[j] == '>') { HTMLParserTag *t = new HTMLParserTag(); t->data = data.substr(i, j - i + 1); t->process(); tags.push_back(t); i = j; break; } } } else { for (int j = i + 1; j < data.size(); ++j) { if (data[j] == '<') { HTMLParserTag *t = new HTMLParserTag(); t->data = data.substr(i, j - i); t->type = HTMLParserTag::HTML_PARSER_TAG_TYPE_CONTENT; tags.push_back(t); i = j - 1; break; } } } } if (root) { delete root; } root = new HTMLParserTag(); //process tags into hierarchical order Vector tag_stack; for (int i = 0; i < tags.size(); ++i) { tags[i]->print(); } } String HTMLParser::to_string() { if (!root) { return ""; } return root->to_string(); } void HTMLParser::print() { if (root) { root->print(); } } HTMLParser::HTMLParser() { root = nullptr; } HTMLParser::~HTMLParser() { if (root) { delete root; } }