Fix edge cases during parsing <script> tags with in-html code.

This commit is contained in:
Relintai 2021-11-21 12:32:36 +01:00
parent 78f09b5fab
commit e32593da4c

View File

@ -208,7 +208,7 @@ void HTMLParserTag::process() {
tag = tag_text.substr(0, fspc_index); tag = tag_text.substr(0, fspc_index);
if (fspc_index + 1 == tag_text.size()) { if (fspc_index + 1 == tag_text.size()) {
//no args, but had a space like <br /> // no args, but had a space like <br />
return; return;
} }
@ -414,25 +414,103 @@ HTMLParserTag::~HTMLParserTag() {
void HTMLParser::parse(const String &data) { void HTMLParser::parse(const String &data) {
Vector<HTMLParserTag *> tags; Vector<HTMLParserTag *> tags;
// <script> content parsing is based on https://stackoverflow.com/questions/14574471/how-do-browsers-parse-a-script-tag-exactly
const int STATE_NONE = 0;
const int STATE_DATA_1 = 1;
const int STATE_DATA_2 = 2;
const int STATE_DATA_3 = 3;
int state = STATE_NONE;
// split into tags // split into tags
for (int i = 0; i < data.size(); ++i) { for (int i = 0; i < data.size(); ++i) {
if (data[i] == '<') { if (state == STATE_NONE) {
for (int j = i + 1; j < data.size(); ++j) { if (data[i] == '<') {
if (data[j] == '>') { // tag
HTMLParserTag *t = new HTMLParserTag();
t->data = data.substr(i, j - i + 1); if (data.is_word_at(i, "<script")) {
t->process(); // after the opening <script> tag, the parser goes to data1 state
state = STATE_DATA_1;
// no else, we need to process the tag istelf!
}
tags.push_back(t); for (int j = i + 1; j < data.size(); ++j) {
if (data[j] == '>') {
HTMLParserTag *t = new HTMLParserTag();
i = j; t->data = data.substr(i, j - i + 1);
break; t->process();
tags.push_back(t);
i = j;
break;
}
}
} else {
// content
for (int j = i + 1; j < data.size(); ++j) {
if (data[j] == '<') {
HTMLParserTag *t = new HTMLParserTag();
t->data = data.substr(i, j - i);
t->type = HTMLParserTag::HTML_PARSER_TAG_TYPE_CONTENT;
tags.push_back(t);
i = j - 1;
break;
}
} }
} }
} else { } else {
for (int j = i + 1; j < data.size(); ++j) { // script tag content
if (data[j] == '<') {
bool done = false;
for (int j = i; j < data.size(); ++j) {
char c = data[j];
if (c != '<' && c != '-') {
continue;
}
if (data.is_word_at(j, "-->")) {
// if --> is encountered while in any state, switch to data1 state
state = STATE_DATA_1;
continue;
}
if (state == STATE_DATA_1) {
if (data.is_word_at(j, "<!--")) {
// if <!-- is encountered while in data1 state, switch to data2 state
state = STATE_DATA_2;
} else if (data.is_word_at(j, "</script")) {
// if </script[\s/>] is encountered while in any other state (than data3), stop parsing
done = true;
}
} else if (state == STATE_DATA_2) {
if (data.is_word_at(j, "<script")) {
// if <script[\s/>] is encountered while in data2 state, switch to data3 state
state = STATE_DATA_3;
} else if (data.is_word_at(j, "</script")) {
// if </script[\s/>] is encountered while in any other state (than data3), stop parsing
done = true;
}
} else if (state == STATE_DATA_3) {
// if </script[\s/>] is encountered while in data3 state, switch to data2 state
if (data.is_word_at(j, "</script")) {
state = STATE_DATA_2;
}
}
if (done) {
state = STATE_NONE;
HTMLParserTag *t = new HTMLParserTag(); HTMLParserTag *t = new HTMLParserTag();
t->data = data.substr(i, j - i); t->data = data.substr(i, j - i);