2021-11-18 00:59:13 +01:00
|
|
|
#include "html_parser.h"
|
|
|
|
|
2021-11-18 12:03:43 +01:00
|
|
|
// Serializes the attribute back into its HTML source form.
//
// Valueless attributes (single == true) are emitted as just their name.
// Every other attribute becomes name="value"; when the value itself contains
// a double quote, single quotes are used as the delimiter instead.
String HTMLParserAttribute::to_string() {
	if (single) {
		return attribute;
	}

	// The -1 comparison has to be applied to the result of find(). It used to
	// sit inside the call, so the search never looked for the quote character.
	if (data.find('"') == -1) {
		return attribute + "=\"" + data + "\"";
	} else {
		return attribute + "=\'" + data + "\'";
	}
}
|
|
|
|
|
|
|
|
void HTMLParserAttribute::print() {
|
|
|
|
to_string().print();
|
|
|
|
}
|
|
|
|
|
2021-11-18 00:59:13 +01:00
|
|
|
// A new attribute is a regular name/value pair; HTMLParserTag::parse_args()
// sets `single` when it meets an attribute without a value.
HTMLParserAttribute::HTMLParserAttribute() :
		single(false) {
}
|
|
|
|
|
|
|
|
// The destructor performs no explicit cleanup.
HTMLParserAttribute::~HTMLParserAttribute() {}
|
|
|
|
|
|
|
|
void HTMLParserTag::process() {
|
|
|
|
if (type != HTMLParserTag::HTML_PARSER_TAG_TYPE_NONE) {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (data.size() < 2) {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
ERR_FAIL_COND(data[0] != '<');
|
|
|
|
ERR_FAIL_COND(data[data.size() - 1] != '>');
|
|
|
|
|
|
|
|
int start_index = 1;
|
|
|
|
if (data[1] == '/') {
|
|
|
|
++start_index;
|
|
|
|
|
|
|
|
type = HTMLParserTag::HTML_PARSER_TAG_TYPE_CLOSING_TAG;
|
|
|
|
} else if (data[1] == '!') {
|
|
|
|
if (data.size() < 8) {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
//test for comment. <!-- -->
|
|
|
|
++start_index;
|
|
|
|
if (data[2] == '-' && data[3] == '-') {
|
|
|
|
type = HTMLParserTag::HTML_PARSER_TAG_TYPE_COMMENT;
|
|
|
|
|
|
|
|
int comment_start_index = data.find(' ', 3);
|
|
|
|
|
|
|
|
if (comment_start_index == -1) {
|
|
|
|
comment_start_index = 4;
|
|
|
|
}
|
|
|
|
|
|
|
|
tag = data.substr(comment_start_index, comment_start_index - data.size() - 3);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (data.size() < 11) {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
//test for doctype. <!doctype >
|
|
|
|
int doctype_start_index = data.find("doctype ", 2);
|
|
|
|
|
|
|
|
if (doctype_start_index == -1) {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
type = HTMLParserTag::HTML_PARSER_TAG_TYPE_DOCTYPE;
|
|
|
|
|
|
|
|
tag = data.substr(doctype_start_index + 8, data.size() - doctype_start_index - 8 - 1);
|
|
|
|
} else {
|
|
|
|
String tag_text;
|
|
|
|
|
|
|
|
if (data[data.size() - 2] == '/') {
|
|
|
|
//will catch all that looks like <br/>
|
|
|
|
//tags that look like <br> will be caught later in a post process, in a way
|
|
|
|
//which also tries to catch erroneously not closed tags that supposed to be closed
|
|
|
|
type = HTMLParserTag::HTML_PARSER_TAG_TYPE_SELF_CLOSING_TAG;
|
|
|
|
|
|
|
|
tag_text = data.substr(1, data.size() - 3);
|
|
|
|
} else {
|
|
|
|
type = HTMLParserTag::HTML_PARSER_TAG_TYPE_OPENING_TAG;
|
|
|
|
|
|
|
|
tag_text = data.substr(1, data.size() - 2);
|
|
|
|
}
|
|
|
|
|
|
|
|
int fspc_index = tag_text.find(' ');
|
|
|
|
|
|
|
|
if (fspc_index == -1) {
|
|
|
|
//no args
|
|
|
|
tag = tag_text;
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
//grab the tag itself
|
2021-11-18 12:03:43 +01:00
|
|
|
tag = tag_text.substr(0, fspc_index);
|
2021-11-18 00:59:13 +01:00
|
|
|
|
|
|
|
String args = tag_text.substr(fspc_index + 1, tag_text.size() - fspc_index - 1);
|
|
|
|
parse_args(args);
|
|
|
|
}
|
|
|
|
|
|
|
|
int tag_end_index = data.find(' ', start_index);
|
|
|
|
|
|
|
|
if (tag_end_index == -1) {
|
|
|
|
//simple tag
|
|
|
|
tag = data.substr(start_index, data.size() - start_index - 1);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void HTMLParserTag::parse_args(const String &args) {
|
|
|
|
attributes.clear();
|
|
|
|
|
|
|
|
int i = 0;
|
|
|
|
while (i < args.size()) {
|
|
|
|
if (args[i] == ' ') {
|
|
|
|
//"trim"
|
|
|
|
++i;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
int equals_index = args.find('=', i);
|
|
|
|
|
2021-11-18 12:03:43 +01:00
|
|
|
HTMLParserAttribute *a = new HTMLParserAttribute();
|
2021-11-18 00:59:13 +01:00
|
|
|
|
|
|
|
if (equals_index == -1) {
|
2021-11-18 12:03:43 +01:00
|
|
|
a->attribute = args.substr(i, args.size() - i);
|
|
|
|
a->single = true;
|
2021-11-18 00:59:13 +01:00
|
|
|
attributes.push_back(a);
|
|
|
|
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2021-11-18 12:03:43 +01:00
|
|
|
a->attribute = args.substr(i, equals_index - i);
|
2021-11-18 00:59:13 +01:00
|
|
|
|
|
|
|
//todo
|
|
|
|
//a.trim();
|
|
|
|
|
2021-11-18 07:18:35 +01:00
|
|
|
int next_char_index = equals_index + 1;
|
2021-11-18 00:59:13 +01:00
|
|
|
|
2021-11-18 11:18:05 +01:00
|
|
|
if (next_char_index >= args.size()) {
|
|
|
|
//an attribute looks like this "... attrib="
|
|
|
|
attributes.push_back(a);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2021-11-18 00:59:13 +01:00
|
|
|
//skip spaces
|
2021-11-18 11:18:05 +01:00
|
|
|
while (args[next_char_index] == ' ') {
|
2021-11-18 00:59:13 +01:00
|
|
|
++next_char_index;
|
|
|
|
|
2021-11-18 11:18:05 +01:00
|
|
|
if (next_char_index >= args.size()) {
|
|
|
|
//an attribute looks like this "... attrib= "
|
2021-11-18 00:59:13 +01:00
|
|
|
attributes.push_back(a);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-11-18 11:18:05 +01:00
|
|
|
char c = args[next_char_index];
|
|
|
|
char find_char = ' ';
|
2021-11-18 00:59:13 +01:00
|
|
|
|
|
|
|
if (c == '"' || c == '\'') {
|
|
|
|
++next_char_index;
|
|
|
|
find_char = c;
|
|
|
|
}
|
|
|
|
|
|
|
|
int end_index = args.find(find_char, next_char_index);
|
|
|
|
|
|
|
|
if (end_index == -1) {
|
|
|
|
//missing closing ' or " if c is ' or "
|
|
|
|
//else missing parameter
|
|
|
|
|
2021-11-18 12:03:43 +01:00
|
|
|
a->data = args.substr(next_char_index, args.size() - next_char_index - 1);
|
2021-11-18 00:59:13 +01:00
|
|
|
attributes.push_back(a);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2021-11-18 12:03:43 +01:00
|
|
|
a->data = args.substr(next_char_index, end_index - next_char_index);
|
2021-11-18 00:59:13 +01:00
|
|
|
attributes.push_back(a);
|
|
|
|
|
|
|
|
i = end_index + 1;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-11-18 12:03:43 +01:00
|
|
|
// Rebuilds the HTML text for this node, followed by the text of every entry
// in `tags`.
//
// Content nodes return `data` verbatim. All other node types are rebuilt from
// the pieces extracted by process(): `tag` plus, for opening and self closing
// tags, the attribute list. Nodes whose type is none of the handled ones
// contribute only the text of their `tags` entries.
String HTMLParserTag::to_string() {
	String s;

	if (type == HTML_PARSER_TAG_TYPE_CONTENT) {
		s = data;
	} else if (type == HTML_PARSER_TAG_TYPE_OPENING_TAG || type == HTML_PARSER_TAG_TYPE_SELF_CLOSING_TAG) {
		s = "<" + tag;

		for (int i = 0; i < attributes.size(); ++i) {
			s += " " + attributes[i]->to_string();
		}

		if (type == HTML_PARSER_TAG_TYPE_SELF_CLOSING_TAG) {
			s += "/>";
		} else {
			s += ">";
		}
	} else if (type == HTML_PARSER_TAG_TYPE_CLOSING_TAG) {
		s = "</" + tag + ">";
	} else if (type == HTML_PARSER_TAG_TYPE_COMMENT) {
		// process() stores the comment text in `tag`. `data` still holds the
		// whole raw "<!-- ... -->", so wrapping it again doubled the markers.
		s = "<!-- " + tag + " -->";
	} else if (type == HTML_PARSER_TAG_TYPE_DOCTYPE) {
		// Same as above: `tag` is the doctype text, `data` is the raw tag.
		s = "<!doctype " + tag + ">";
	}

	for (int i = 0; i < tags.size(); ++i) {
		s += tags[i]->to_string();
	}

	return s;
}
|
|
|
|
void HTMLParserTag::print() {
|
|
|
|
to_string().print();
|
|
|
|
}
|
|
|
|
|
2021-11-18 00:59:13 +01:00
|
|
|
// A new node has no type yet; process() only classifies nodes whose type is
// still HTML_PARSER_TAG_TYPE_NONE.
HTMLParserTag::HTMLParserTag() :
		type(HTMLParserTag::HTML_PARSER_TAG_TYPE_NONE) {
}
|
|
|
|
|
|
|
|
// Frees the attribute objects held by this node.
//
// parse_args() allocates every entry of `attributes` with new and nothing
// else in this file releases them, so without this they leak whenever a tag
// is deleted. The entries of `tags` are left alone on purpose: nothing in
// this file establishes this node as their owner.
// NOTE(review): this assumes HTMLParserTag objects are not copied by value
// (a copy would share the pointers) - confirm against the header.
HTMLParserTag::~HTMLParserTag() {
	for (int i = 0; i < attributes.size(); ++i) {
		delete attributes[i];
	}

	attributes.clear();
}
|
|
|
|
|
|
|
|
void HTMLParser::parse(const String &data) {
|
2021-11-18 12:03:43 +01:00
|
|
|
Vector<HTMLParserTag *> tags;
|
2021-11-18 00:59:13 +01:00
|
|
|
|
2021-11-18 12:03:43 +01:00
|
|
|
//split into tags
|
2021-11-18 00:59:13 +01:00
|
|
|
for (int i = 0; i < data.size(); ++i) {
|
|
|
|
if (data[i] == '<') {
|
|
|
|
for (int j = i + 1; j < data.size(); ++j) {
|
|
|
|
if (data[j] == '>') {
|
2021-11-18 12:03:43 +01:00
|
|
|
HTMLParserTag *t = new HTMLParserTag();
|
2021-11-18 00:59:13 +01:00
|
|
|
|
2021-11-18 12:03:43 +01:00
|
|
|
t->data = data.substr(i, j - i + 1);
|
|
|
|
t->process();
|
|
|
|
t->print();
|
2021-11-18 00:59:13 +01:00
|
|
|
|
|
|
|
tags.push_back(t);
|
|
|
|
|
|
|
|
i = j;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
for (int j = i + 1; j < data.size(); ++j) {
|
|
|
|
if (data[j] == '<') {
|
2021-11-18 12:03:43 +01:00
|
|
|
HTMLParserTag *t = new HTMLParserTag();
|
2021-11-18 00:59:13 +01:00
|
|
|
|
2021-11-18 12:03:43 +01:00
|
|
|
t->data = data.substr(i, j - i);
|
|
|
|
t->type = HTMLParserTag::HTML_PARSER_TAG_TYPE_CONTENT;
|
2021-11-18 00:59:13 +01:00
|
|
|
|
|
|
|
tags.push_back(t);
|
|
|
|
|
|
|
|
i = j - 1;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2021-11-18 12:03:43 +01:00
|
|
|
|
|
|
|
//process tags into hierarchical order
|
|
|
|
//Vector<HTMLParserTag> tag_stack;
|
|
|
|
//for (int i = 0; i < tags.size(); ++i) {
|
|
|
|
//}
|
|
|
|
|
|
|
|
for (int i = 0; i < tags.size(); ++i) {
|
|
|
|
delete tags[i];
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Returns the serialized form of `html`, or an empty string when `html` is
// not set.
//
// The constructor initializes `html` to nullptr, so calling this on a fresh
// parser used to dereference a null pointer.
String HTMLParser::to_string() {
	if (html == nullptr) {
		return String();
	}

	return html->to_string();
}
|
|
|
|
// Prints `html`; does nothing when `html` is not set.
//
// The constructor initializes `html` to nullptr, so calling this on a fresh
// parser used to dereference a null pointer.
void HTMLParser::print() {
	if (html == nullptr) {
		return;
	}

	html->print();
}
|
|
|
|
|
|
|
|
// `html` starts out null; to_string() and print() forward to it.
HTMLParser::HTMLParser() :
		html(nullptr) {
}
|
|
|
|
|
|
|
|
// The destructor performs no explicit cleanup; `html` is not released here.
HTMLParser::~HTMLParser() {}
|