Implemented a few helpers for the HTMLParser.

This commit is contained in:
Relintai 2021-11-20 20:53:17 +01:00
parent da6d18beef
commit ff8790f81d
2 changed files with 131 additions and 28 deletions

View File

@ -8,11 +8,11 @@ bool HTMLParserAttribute::match_data(const String &d) {
return data == d; return data == d;
} }
bool HTMLParserAttribute::match_data(const Vector<String> &d) { bool HTMLParserAttribute::match_data(const Vector<String> &d) {
//todo // todo
return false; return false;
} }
bool HTMLParserAttribute::contains_data(const String &d) { bool HTMLParserAttribute::contains_data(const String &d) {
return attribute.find(d) != -1; return data.find(d) != -1;
} }
String HTMLParserAttribute::to_string() { String HTMLParserAttribute::to_string() {
@ -38,6 +38,98 @@ HTMLParserAttribute::HTMLParserAttribute() {
HTMLParserAttribute::~HTMLParserAttribute() { HTMLParserAttribute::~HTMLParserAttribute() {
} }
// Depth-first, pre-order search for a tag by name.
//
// Checks this node first; if its `tag` does not equal `t`, each entry of
// `tags` is searched recursively in index order and the first hit is returned.
//
// Returns the matching node, or nullptr when neither this node nor anything
// reachable through `tags` matches.
HTMLParserTag *HTMLParserTag::get_first(const String &t) {
	// The node itself takes priority over anything nested inside it.
	if (tag == t) {
		return this;
	}

	int child_index = 0;

	while (child_index < tags.size()) {
		HTMLParserTag *found = tags[child_index]->get_first(t);

		if (found != nullptr) {
			return found;
		}

		++child_index;
	}

	return nullptr;
}
// Depth-first, pre-order search for a tag by name and attribute.
//
// A node matches when its `tag` equals `t` and has_attribute(attrib, val)
// is true for it. Note that has_attribute(attrib, val) tests the attribute's
// data with contains_data(), so `val` is matched as a substring rather than
// by equality.
//
// Returns the first matching node (this node is checked before the entries
// of `tags`), or nullptr when nothing matches.
HTMLParserTag *HTMLParserTag::get_first(const String &t, const String &attrib, const String &val) {
	// has_attribute() is only consulted when the tag name already matches.
	if (tag == t && has_attribute(attrib, val)) {
		return this;
	}

	int child_index = 0;

	while (child_index < tags.size()) {
		HTMLParserTag *found = tags[child_index]->get_first(t, attrib, val);

		if (found != nullptr) {
			return found;
		}

		++child_index;
	}

	return nullptr;
}
// Looks up an attribute through get_attribute(attrib) and returns a copy of
// its `data`.
//
// Returns an empty string when no such attribute is found, so callers cannot
// tell a missing attribute apart from one whose data is empty.
String HTMLParserTag::get_attribute_value(const String &attrib) {
	HTMLParserAttribute *found = get_attribute(attrib);

	if (!found) {
		return "";
	}

	return found->data;
}
// Scans `attributes` in index order and returns the first entry for which
// match_attrib(attrib) is true.
//
// Returns nullptr when no entry matches. The returned pointer is the one
// stored in `attributes`; nothing is copied.
HTMLParserAttribute *HTMLParserTag::get_attribute(const String &attrib) {
	int index = 0;

	while (index < attributes.size()) {
		HTMLParserAttribute *candidate = attributes[index];

		if (candidate->match_attrib(attrib)) {
			return candidate;
		}

		++index;
	}

	return nullptr;
}
// Reports whether any entry of `attributes` satisfies match_attrib(attrib).
//
// Entries are tested in index order and the scan stops at the first match.
bool HTMLParserTag::has_attribute(const String &attrib) {
	bool found = false;
	int index = 0;

	// The flag ends the loop as soon as one entry matches.
	while (!found && index < attributes.size()) {
		if (attributes[index]->match_attrib(attrib)) {
			found = true;
		}

		++index;
	}

	return found;
}
// Scans `attributes` in index order and returns the first entry for which
// both match_attrib(attrib) and contains_data(contains_val) are true.
//
// contains_data() is a substring test on the attribute's data, so
// `contains_val` only has to occur somewhere inside it.
//
// Returns nullptr when no entry satisfies both conditions.
HTMLParserAttribute *HTMLParserTag::get_attribute(const String &attrib, const String &contains_val) {
	int index = 0;

	while (index < attributes.size()) {
		HTMLParserAttribute *candidate = attributes[index];

		// The data is only inspected for entries that passed match_attrib().
		if (candidate->match_attrib(attrib)) {
			if (candidate->contains_data(contains_val)) {
				return candidate;
			}
		}

		++index;
	}

	return nullptr;
}
bool HTMLParserTag::has_attribute(const String &attrib, const String &contains_val) {
for (int i = 0; i < attributes.size(); ++i) {
HTMLParserAttribute *a = attributes[i];
if (a->match_attrib(attrib) && a->contains_data(contains_val)) {
return true;
}
}
return false;
}
void HTMLParserTag::process() { void HTMLParserTag::process() {
if (type != HTMLParserTag::HTML_PARSER_TAG_TYPE_NONE) { if (type != HTMLParserTag::HTML_PARSER_TAG_TYPE_NONE) {
return; return;
@ -60,7 +152,7 @@ void HTMLParserTag::process() {
return; return;
} }
//test for comment. <!-- --> // test for comment. <!-- -->
++start_index; ++start_index;
if (data[2] == '-' && data[3] == '-') { if (data[2] == '-' && data[3] == '-') {
type = HTMLParserTag::HTML_PARSER_TAG_TYPE_COMMENT; type = HTMLParserTag::HTML_PARSER_TAG_TYPE_COMMENT;
@ -78,7 +170,7 @@ void HTMLParserTag::process() {
return; return;
} }
//test for doctype. <!doctype > // test for doctype. <!doctype >
int doctype_start_index = data.find("doctype ", 2); int doctype_start_index = data.find("doctype ", 2);
if (doctype_start_index == -1) { if (doctype_start_index == -1) {
@ -92,9 +184,9 @@ void HTMLParserTag::process() {
String tag_text; String tag_text;
if (data[data.size() - 2] == '/') { if (data[data.size() - 2] == '/') {
//will catch all that looks like <br/> // will catch all that looks like <br/>
//tags that look like <br> will be caught later in a post process, in a way // tags that look like <br> will be caught later in a post process, in a way
//which also tries to catch erroneously not closed tags that supposed to be closed // which also tries to catch erroneously not closed tags that supposed to be closed
type = HTMLParserTag::HTML_PARSER_TAG_TYPE_SELF_CLOSING_TAG; type = HTMLParserTag::HTML_PARSER_TAG_TYPE_SELF_CLOSING_TAG;
tag_text = data.substr(1, data.size() - 3); tag_text = data.substr(1, data.size() - 3);
@ -107,12 +199,12 @@ void HTMLParserTag::process() {
int fspc_index = tag_text.find(' '); int fspc_index = tag_text.find(' ');
if (fspc_index == -1) { if (fspc_index == -1) {
//no args // no args
tag = tag_text; tag = tag_text;
return; return;
} }
//grab the tag itself // grab the tag itself
tag = tag_text.substr(0, fspc_index); tag = tag_text.substr(0, fspc_index);
String args = tag_text.substr(fspc_index + 1, tag_text.size() - fspc_index - 1); String args = tag_text.substr(fspc_index + 1, tag_text.size() - fspc_index - 1);
@ -122,7 +214,7 @@ void HTMLParserTag::process() {
int tag_end_index = data.find(' ', start_index); int tag_end_index = data.find(' ', start_index);
if (tag_end_index == -1) { if (tag_end_index == -1) {
//simple tag // simple tag
tag = data.substr(start_index, data.size() - start_index - 1); tag = data.substr(start_index, data.size() - start_index - 1);
return; return;
} }
@ -153,23 +245,23 @@ void HTMLParserTag::parse_args(const String &args) {
a->attribute = args.substr(i, equals_index - i); a->attribute = args.substr(i, equals_index - i);
//todo // todo
//a.trim(); // a.trim();
int next_char_index = equals_index + 1; int next_char_index = equals_index + 1;
if (next_char_index >= args.size()) { if (next_char_index >= args.size()) {
//an attribute looks like this "... attrib=" // an attribute looks like this "... attrib="
attributes.push_back(a); attributes.push_back(a);
return; return;
} }
//skip spaces // skip spaces
while (args[next_char_index] == ' ') { while (args[next_char_index] == ' ') {
++next_char_index; ++next_char_index;
if (next_char_index >= args.size()) { if (next_char_index >= args.size()) {
//an attribute looks like this "... attrib= " // an attribute looks like this "... attrib= "
attributes.push_back(a); attributes.push_back(a);
return; return;
} }
@ -186,8 +278,8 @@ void HTMLParserTag::parse_args(const String &args) {
int end_index = args.find(find_char, next_char_index); int end_index = args.find(find_char, next_char_index);
if (end_index == -1) { if (end_index == -1) {
//missing closing ' or " if c is ' or " // missing closing ' or " if c is ' or "
//else missing parameter // else missing parameter
a->data = args.substr(next_char_index, args.size() - next_char_index - 1); a->data = args.substr(next_char_index, args.size() - next_char_index - 1);
attributes.push_back(a); attributes.push_back(a);
@ -236,8 +328,8 @@ String HTMLParserTag::to_string(const int level) {
s += "</" + tag + ">\n"; s += "</" + tag + ">\n";
} else if (type == HTML_PARSER_TAG_TYPE_CLOSING_TAG) { } else if (type == HTML_PARSER_TAG_TYPE_CLOSING_TAG) {
//HTMLParserTag should handle this automatically // HTMLParserTag should handle this automatically
//it's here for debugging purposes though // it's here for debugging purposes though
s += "</" + tag + "(!)>"; s += "</" + tag + "(!)>";
if (tags.size() != 0) { if (tags.size() != 0) {
@ -317,7 +409,7 @@ HTMLParserTag::~HTMLParserTag() {
void HTMLParser::parse(const String &data) { void HTMLParser::parse(const String &data) {
Vector<HTMLParserTag *> tags; Vector<HTMLParserTag *> tags;
//split into tags // split into tags
for (int i = 0; i < data.size(); ++i) { for (int i = 0; i < data.size(); ++i) {
if (data[i] == '<') { if (data[i] == '<') {
for (int j = i + 1; j < data.size(); ++j) { for (int j = i + 1; j < data.size(); ++j) {
@ -356,7 +448,7 @@ void HTMLParser::parse(const String &data) {
root = new HTMLParserTag(); root = new HTMLParserTag();
//process tags into hierarchical order // process tags into hierarchical order
Vector<HTMLParserTag *> tag_stack; Vector<HTMLParserTag *> tag_stack;
for (int i = 0; i < tags.size(); ++i) { for (int i = 0; i < tags.size(); ++i) {
HTMLParserTag *t = tags[i]; HTMLParserTag *t = tags[i];
@ -417,18 +509,18 @@ void HTMLParser::parse(const String &data) {
delete t; delete t;
tags[i] = nullptr; tags[i] = nullptr;
//ill-formed html // ill-formed html
continue; continue;
} }
//find it's pair // find it's pair
int tag_index = 0; int tag_index = 0;
for (int j = tag_stack.size() - 1; j > 0; --j) { for (int j = tag_stack.size() - 1; j > 0; --j) {
HTMLParserTag *ts = tag_stack[j]; HTMLParserTag *ts = tag_stack[j];
//we sould only have opening tags on the stack // we sould only have opening tags on the stack
if (ts->tag == t->tag) { if (ts->tag == t->tag) {
//found // found
tag_index = j; tag_index = j;
break; break;
} }
@ -436,8 +528,8 @@ void HTMLParser::parse(const String &data) {
HTMLParserTag *opening_tag = tag_stack[tag_index]; HTMLParserTag *opening_tag = tag_stack[tag_index];
//mark everything else that we found before finding the opening tag as self closing, and add them to out opening tag // mark everything else that we found before finding the opening tag as self closing, and add them to out opening tag
//If the html is ill formed, it just grabs everything from the tag stack // If the html is ill formed, it just grabs everything from the tag stack
for (int j = tag_index + 1; j < tag_stack.size(); ++j) { for (int j = tag_index + 1; j < tag_stack.size(); ++j) {
HTMLParserTag *ts = tag_stack[j]; HTMLParserTag *ts = tag_stack[j];
@ -460,7 +552,7 @@ void HTMLParser::parse(const String &data) {
} }
} }
//add everything remaining on the stack to root // add everything remaining on the stack to root
for (int i = 0; i < tag_stack.size(); ++i) { for (int i = 0; i < tag_stack.size(); ++i) {
root->tags.push_back(tag_stack[i]); root->tags.push_back(tag_stack[i]);
} }

View File

@ -42,6 +42,17 @@ public:
Vector<HTMLParserTag*> tags; Vector<HTMLParserTag*> tags;
Vector<HTMLParserAttribute*> attributes; Vector<HTMLParserAttribute*> attributes;
// Depth-first search (this node first, then `tags` in order) for the first
// tag whose name equals t. Returns nullptr when nothing matches.
HTMLParserTag *get_first(const String &t);
// Same search, but the tag must also pass has_attribute(attrib, val), i.e.
// val is matched as a substring of the attribute's data.
HTMLParserTag *get_first(const String &t, const String &attrib, const String &val);
// Returns the data of the first attribute found by get_attribute(attrib),
// or an empty string when there is none.
String get_attribute_value(const String &attrib);
// Returns the first entry of `attributes` that passes match_attrib(attrib),
// or nullptr.
HTMLParserAttribute *get_attribute(const String &attrib);
// True when at least one entry of `attributes` passes match_attrib(attrib).
bool has_attribute(const String &attrib);
// Returns the first entry of `attributes` that passes match_attrib(attrib)
// and whose data contains contains_val, or nullptr.
HTMLParserAttribute *get_attribute(const String &attrib, const String &contains_val);
// True when at least one entry of `attributes` passes match_attrib(attrib)
// and has data containing contains_val.
bool has_attribute(const String &attrib, const String &contains_val);
void process(); void process();
void parse_args(const String &args); void parse_args(const String &args);