HTMLParser and its helper classes inherit from Reference now.

This commit is contained in:
Relintai 2022-07-19 13:58:26 +02:00
parent 94f953494a
commit e4e2bea570
2 changed files with 95 additions and 101 deletions

View File

@ -1,6 +1,7 @@
#include "html_parser.h" #include "html_parser.h"
#include "core/error_macros.h" #include "core/error_macros.h"
#include "core/log/logger.h"
#include "core/print_string.h" #include "core/print_string.h"
bool HTMLParserAttribute::match_attrib(const String &attrib) { bool HTMLParserAttribute::match_attrib(const String &attrib) {
@ -17,7 +18,7 @@ bool HTMLParserAttribute::contains_data(const String &d) {
return data.find(d) != -1; return data.find(d) != -1;
} }
String HTMLParserAttribute::to_string() { String HTMLParserAttribute::convert_to_string() const {
if (single) { if (single) {
return attribute; return attribute;
} }
@ -29,8 +30,8 @@ String HTMLParserAttribute::to_string() {
} }
} }
void HTMLParserAttribute::print() { void HTMLParserAttribute::print() const {
print_verbose(to_string()); PLOG_MSG(convert_to_string());
} }
HTMLParserAttribute::HTMLParserAttribute() { HTMLParserAttribute::HTMLParserAttribute() {
@ -40,53 +41,53 @@ HTMLParserAttribute::HTMLParserAttribute() {
HTMLParserAttribute::~HTMLParserAttribute() { HTMLParserAttribute::~HTMLParserAttribute() {
} }
HTMLParserTag *HTMLParserTag::get_first(const String &t) { Ref<HTMLParserTag> HTMLParserTag::get_first(const String &t) {
if (tag == t) { if (tag == t) {
return this; return Ref<HTMLParserTag>(this);
} }
for (int i = 0; i < tags.size(); ++i) { for (int i = 0; i < tags.size(); ++i) {
HTMLParserTag *ht = tags[i]->get_first(t); Ref<HTMLParserTag> ht = tags.write[i]->get_first(t);
if (ht) { if (ht.is_valid()) {
return ht; return ht;
} }
} }
return nullptr; return Ref<HTMLParserTag>();
} }
HTMLParserTag *HTMLParserTag::get_first(const String &t, const String &attrib, const String &val) { Ref<HTMLParserTag> HTMLParserTag::get_first(const String &t, const String &attrib, const String &val) {
if (tag == t) { if (tag == t) {
if (has_attribute(attrib, val)) { if (has_attribute(attrib, val)) {
return this; return Ref<HTMLParserTag>(this);
} }
} }
for (int i = 0; i < tags.size(); ++i) { for (int i = 0; i < tags.size(); ++i) {
HTMLParserTag *ht = tags[i]->get_first(t, attrib, val); Ref<HTMLParserTag> ht = tags.write[i]->get_first(t, attrib, val);
if (ht) { if (ht.is_valid()) {
return ht; return ht;
} }
} }
return nullptr; return Ref<HTMLParserTag>();
} }
String HTMLParserTag::get_attribute_value(const String &attrib) { String HTMLParserTag::get_attribute_value(const String &attrib) {
HTMLParserAttribute *a = get_attribute(attrib); Ref<HTMLParserAttribute> a = get_attribute(attrib);
if (a) { if (a.is_valid()) {
return a->data; return a->data;
} }
return ""; return "";
} }
HTMLParserAttribute *HTMLParserTag::get_attribute(const String &attrib) { Ref<HTMLParserAttribute> HTMLParserTag::get_attribute(const String &attrib) {
for (int i = 0; i < attributes.size(); ++i) { for (int i = 0; i < attributes.size(); ++i) {
HTMLParserAttribute *a = attributes[i]; Ref<HTMLParserAttribute> a = attributes[i];
if (a->match_attrib(attrib)) { if (a->match_attrib(attrib)) {
return a; return a;
@ -98,7 +99,7 @@ HTMLParserAttribute *HTMLParserTag::get_attribute(const String &attrib) {
bool HTMLParserTag::has_attribute(const String &attrib) { bool HTMLParserTag::has_attribute(const String &attrib) {
for (int i = 0; i < attributes.size(); ++i) { for (int i = 0; i < attributes.size(); ++i) {
HTMLParserAttribute *a = attributes[i]; Ref<HTMLParserAttribute> a = attributes[i];
if (a->match_attrib(attrib)) { if (a->match_attrib(attrib)) {
return true; return true;
@ -108,9 +109,9 @@ bool HTMLParserTag::has_attribute(const String &attrib) {
return false; return false;
} }
HTMLParserAttribute *HTMLParserTag::get_attribute(const String &attrib, const String &contains_val) { Ref<HTMLParserAttribute> HTMLParserTag::get_attribute(const String &attrib, const String &contains_val) {
for (int i = 0; i < attributes.size(); ++i) { for (int i = 0; i < attributes.size(); ++i) {
HTMLParserAttribute *a = attributes[i]; Ref<HTMLParserAttribute> a = attributes[i];
if (a->match_attrib(attrib) && a->contains_data(contains_val)) { if (a->match_attrib(attrib) && a->contains_data(contains_val)) {
return a; return a;
@ -122,7 +123,7 @@ HTMLParserAttribute *HTMLParserTag::get_attribute(const String &attrib, const St
bool HTMLParserTag::has_attribute(const String &attrib, const String &contains_val) { bool HTMLParserTag::has_attribute(const String &attrib, const String &contains_val) {
for (int i = 0; i < attributes.size(); ++i) { for (int i = 0; i < attributes.size(); ++i) {
HTMLParserAttribute *a = attributes[i]; Ref<HTMLParserAttribute> a = attributes[i];
if (a->match_attrib(attrib) && a->contains_data(contains_val)) { if (a->match_attrib(attrib) && a->contains_data(contains_val)) {
return true; return true;
@ -240,7 +241,8 @@ void HTMLParserTag::parse_args(const String &args) {
int equals_index = args.find_char('=', i); int equals_index = args.find_char('=', i);
HTMLParserAttribute *a = memnew(HTMLParserAttribute); Ref<HTMLParserAttribute> a;
a.instance();
if (equals_index == -1) { if (equals_index == -1) {
a->attribute = args.substr(i, args.size() - i); a->attribute = args.substr(i, args.size() - i);
@ -300,7 +302,7 @@ void HTMLParserTag::parse_args(const String &args) {
} }
} }
String HTMLParserTag::to_string(const int level) { String HTMLParserTag::convert_to_string(const int level) const {
String s; String s;
s += String(" ").repeat(level); s += String(" ").repeat(level);
@ -313,7 +315,7 @@ String HTMLParserTag::to_string(const int level) {
s += "(!CONTENT TAG HAS TAGS!)\n"; s += "(!CONTENT TAG HAS TAGS!)\n";
for (int i = 0; i < tags.size(); ++i) { for (int i = 0; i < tags.size(); ++i) {
s += tags[i]->to_string(level + 1) + "\n"; s += tags[i]->convert_to_string(level + 1) + "\n";
} }
} }
} else if (type == HTML_PARSER_TAG_TYPE_OPENING_TAG) { } else if (type == HTML_PARSER_TAG_TYPE_OPENING_TAG) {
@ -322,13 +324,13 @@ String HTMLParserTag::to_string(const int level) {
s += "<" + tag; s += "<" + tag;
for (int i = 0; i < attributes.size(); ++i) { for (int i = 0; i < attributes.size(); ++i) {
s += " " + attributes[i]->to_string(); s += " " + attributes[i]->convert_to_string();
} }
s += ">\n"; s += ">\n";
for (int i = 0; i < tags.size(); ++i) { for (int i = 0; i < tags.size(); ++i) {
s += tags[i]->to_string(ln); s += tags[i]->convert_to_string(ln);
} }
s += String(" ").repeat(level); s += String(" ").repeat(level);
@ -344,14 +346,14 @@ String HTMLParserTag::to_string(const int level) {
s += "(!CLOSING TAG HAS TAGS!)\n"; s += "(!CLOSING TAG HAS TAGS!)\n";
for (int i = 0; i < tags.size(); ++i) { for (int i = 0; i < tags.size(); ++i) {
s += tags[i]->to_string(level + 1) + "\n"; s += tags[i]->convert_to_string(level + 1) + "\n";
} }
} }
} else if (type == HTML_PARSER_TAG_TYPE_SELF_CLOSING_TAG) { } else if (type == HTML_PARSER_TAG_TYPE_SELF_CLOSING_TAG) {
s += "<" + tag; s += "<" + tag;
for (int i = 0; i < attributes.size(); ++i) { for (int i = 0; i < attributes.size(); ++i) {
s += " " + attributes[i]->to_string(); s += " " + attributes[i]->convert_to_string();
} }
s += "/>\n"; s += "/>\n";
@ -361,7 +363,7 @@ String HTMLParserTag::to_string(const int level) {
s += "(!SELF CLOSING TAG HAS TAGS!)\n"; s += "(!SELF CLOSING TAG HAS TAGS!)\n";
for (int i = 0; i < tags.size(); ++i) { for (int i = 0; i < tags.size(); ++i) {
s += tags[i]->to_string(level + 1) + "\n"; s += tags[i]->convert_to_string(level + 1) + "\n";
} }
} }
} else if (type == HTML_PARSER_TAG_TYPE_COMMENT) { } else if (type == HTML_PARSER_TAG_TYPE_COMMENT) {
@ -372,7 +374,7 @@ String HTMLParserTag::to_string(const int level) {
s += "(!COMMENT TAG HAS TAGS!)\n"; s += "(!COMMENT TAG HAS TAGS!)\n";
for (int i = 0; i < tags.size(); ++i) { for (int i = 0; i < tags.size(); ++i) {
s += tags[i]->to_string(level + 1) + "\n"; s += tags[i]->convert_to_string(level + 1) + "\n";
} }
} }
} else if (type == HTML_PARSER_TAG_TYPE_DOCTYPE) { } else if (type == HTML_PARSER_TAG_TYPE_DOCTYPE) {
@ -383,20 +385,20 @@ String HTMLParserTag::to_string(const int level) {
s += "(!DOCTYPE TAG HAS TAGS!)\n"; s += "(!DOCTYPE TAG HAS TAGS!)\n";
for (int i = 0; i < tags.size(); ++i) { for (int i = 0; i < tags.size(); ++i) {
s += tags[i]->to_string(level + 1) + "\n"; s += tags[i]->convert_to_string(level + 1) + "\n";
} }
} }
} else if (type == HTML_PARSER_TAG_TYPE_NONE) { } else if (type == HTML_PARSER_TAG_TYPE_NONE) {
for (int i = 0; i < tags.size(); ++i) { for (int i = 0; i < tags.size(); ++i) {
s += tags[i]->to_string(level) + "\n"; s += tags[i]->convert_to_string(level) + "\n";
s += String(" ").repeat(level); s += String(" ").repeat(level);
} }
} }
return s; return s;
} }
void HTMLParserTag::print() { void HTMLParserTag::print() const {
print_verbose(to_string()); PLOG_MSG(convert_to_string());
} }
HTMLParserTag::HTMLParserTag() { HTMLParserTag::HTMLParserTag() {
@ -404,17 +406,12 @@ HTMLParserTag::HTMLParserTag() {
} }
HTMLParserTag::~HTMLParserTag() { HTMLParserTag::~HTMLParserTag() {
for (int i = 0; i < tags.size(); ++i) { tags.clear();
memdelete(tags[i]); attributes.clear();
}
for (int i = 0; i < attributes.size(); ++i) {
memdelete(attributes[i]);
}
} }
void HTMLParser::parse(const String &data) { void HTMLParser::parse(const String &data) {
Vector<HTMLParserTag *> tags; Vector<Ref<HTMLParserTag>> tags;
// <script> content parsing is based on https://stackoverflow.com/questions/14574471/how-do-browsers-parse-a-script-tag-exactly // <script> content parsing is based on https://stackoverflow.com/questions/14574471/how-do-browsers-parse-a-script-tag-exactly
const int STATE_NONE = 0; const int STATE_NONE = 0;
@ -438,7 +435,7 @@ void HTMLParser::parse(const String &data) {
for (int j = i + 1; j < data.size(); ++j) { for (int j = i + 1; j < data.size(); ++j) {
if (data[j] == '>') { if (data[j] == '>') {
HTMLParserTag *t = memnew(HTMLParserTag); Ref<HTMLParserTag> t = memnew(HTMLParserTag);
t->data = data.substr(i, j - i + 1); t->data = data.substr(i, j - i + 1);
t->process(); t->process();
@ -454,7 +451,7 @@ void HTMLParser::parse(const String &data) {
for (int j = i + 1; j < data.size(); ++j) { for (int j = i + 1; j < data.size(); ++j) {
if (data[j] == '<') { if (data[j] == '<') {
HTMLParserTag *t = memnew(HTMLParserTag); Ref<HTMLParserTag> t = memnew(HTMLParserTag);
t->data = data.substr(i, j - i); t->data = data.substr(i, j - i);
t->type = HTMLParserTag::HTML_PARSER_TAG_TYPE_CONTENT; t->type = HTMLParserTag::HTML_PARSER_TAG_TYPE_CONTENT;
@ -510,7 +507,7 @@ void HTMLParser::parse(const String &data) {
if (done) { if (done) {
state = STATE_NONE; state = STATE_NONE;
HTMLParserTag *t = memnew(HTMLParserTag); Ref<HTMLParserTag> t = memnew(HTMLParserTag);
t->data = data.substr(i, j - i); t->data = data.substr(i, j - i);
t->type = HTMLParserTag::HTML_PARSER_TAG_TYPE_CONTENT; t->type = HTMLParserTag::HTML_PARSER_TAG_TYPE_CONTENT;
@ -524,69 +521,64 @@ void HTMLParser::parse(const String &data) {
} }
} }
if (root) { root.instance();
memdelete(root);
}
root = memnew(HTMLParserTag);
// process tags into hierarchical order // process tags into hierarchical order
Vector<HTMLParserTag *> tag_stack; Vector<Ref<HTMLParserTag>> tag_stack;
for (int i = 0; i < tags.size(); ++i) { for (int i = 0; i < tags.size(); ++i) {
HTMLParserTag *t = tags[i]; Ref<HTMLParserTag> t = tags[i];
ERR_CONTINUE_MSG(t == nullptr, "HTMLParser::parse: t == nullptr!"); ERR_CONTINUE_MSG(!t.is_valid(), "HTMLParser::parse: t == nullptr!");
if (t->type == HTMLParserTag::HTML_PARSER_TAG_TYPE_NONE) { if (t->type == HTMLParserTag::HTML_PARSER_TAG_TYPE_NONE) {
ERR_PRINT("HTMLParser::parse: t->type == HTMLParserTag::HTML_PARSER_TAG_TYPE_NONE!"); ERR_PRINT("HTMLParser::parse: t->type == HTMLParserTag::HTML_PARSER_TAG_TYPE_NONE!");
memdelete(t); //memdelete(t);
tags.write[i] = nullptr; tags.write[i].unref();
continue; continue;
} else if (t->type == HTMLParserTag::HTML_PARSER_TAG_TYPE_OPENING_TAG) { } else if (t->type == HTMLParserTag::HTML_PARSER_TAG_TYPE_OPENING_TAG) {
tag_stack.push_back(t); tag_stack.push_back(t);
tags.write[i] = nullptr; tags.write[i].unref();
continue; continue;
} else if (t->type == HTMLParserTag::HTML_PARSER_TAG_TYPE_SELF_CLOSING_TAG) { } else if (t->type == HTMLParserTag::HTML_PARSER_TAG_TYPE_SELF_CLOSING_TAG) {
if (tag_stack.size() == 0) { if (tag_stack.size() == 0) {
root->tags.push_back(t); root->tags.push_back(t);
} else { } else {
tag_stack[tag_stack.size() - 1]->tags.push_back(t); tag_stack.write[tag_stack.size() - 1]->tags.push_back(t);
} }
tags.write[i] = nullptr; tags.write[i].unref();
continue; continue;
} else if (t->type == HTMLParserTag::HTML_PARSER_TAG_TYPE_CONTENT) { } else if (t->type == HTMLParserTag::HTML_PARSER_TAG_TYPE_CONTENT) {
if (tag_stack.size() == 0) { if (tag_stack.size() == 0) {
root->tags.push_back(t); root->tags.push_back(t);
} else { } else {
tag_stack[tag_stack.size() - 1]->tags.push_back(t); tag_stack.write[tag_stack.size() - 1]->tags.push_back(t);
} }
tags.write[i] = nullptr; tags.write[i].unref();
continue; continue;
} else if (t->type == HTMLParserTag::HTML_PARSER_TAG_TYPE_COMMENT) { } else if (t->type == HTMLParserTag::HTML_PARSER_TAG_TYPE_COMMENT) {
if (tag_stack.size() == 0) { if (tag_stack.size() == 0) {
root->tags.push_back(t); root->tags.push_back(t);
} else { } else {
tag_stack[tag_stack.size() - 1]->tags.push_back(t); tag_stack.write[tag_stack.size() - 1]->tags.push_back(t);
} }
tags.write[i] = nullptr; tags.write[i].unref();
continue; continue;
} else if (t->type == HTMLParserTag::HTML_PARSER_TAG_TYPE_DOCTYPE) { } else if (t->type == HTMLParserTag::HTML_PARSER_TAG_TYPE_DOCTYPE) {
if (tag_stack.size() == 0) { if (tag_stack.size() == 0) {
root->tags.push_back(t); root->tags.push_back(t);
} else { } else {
tag_stack[tag_stack.size() - 1]->tags.push_back(t); tag_stack.write[tag_stack.size() - 1]->tags.push_back(t);
} }
tags.write[i] = nullptr; tags.write[i].unref();
continue; continue;
} else if (t->type == HTMLParserTag::HTML_PARSER_TAG_TYPE_CLOSING_TAG) { } else if (t->type == HTMLParserTag::HTML_PARSER_TAG_TYPE_CLOSING_TAG) {
if (tag_stack.size() == 0) { if (tag_stack.size() == 0) {
memdelete(t); tags.write[i].unref();
tags.write[i] = nullptr;
// ill-formed html // ill-formed html
continue; continue;
@ -595,7 +587,7 @@ void HTMLParser::parse(const String &data) {
// find its pair // find its pair
int tag_index = 0; int tag_index = 0;
for (int j = tag_stack.size() - 1; j > 0; --j) { for (int j = tag_stack.size() - 1; j > 0; --j) {
HTMLParserTag *ts = tag_stack[j]; Ref<HTMLParserTag> ts = tag_stack[j];
// we should only have opening tags on the stack // we should only have opening tags on the stack
if (ts->tag == t->tag) { if (ts->tag == t->tag) {
@ -605,12 +597,12 @@ void HTMLParser::parse(const String &data) {
} }
} }
HTMLParserTag *opening_tag = tag_stack[tag_index]; Ref<HTMLParserTag> opening_tag = tag_stack[tag_index];
// mark everything else that we found before finding the opening tag as self closing, and add them to our opening tag // mark everything else that we found before finding the opening tag as self closing, and add them to our opening tag
// If the html is ill formed, it just grabs everything from the tag stack // If the html is ill formed, it just grabs everything from the tag stack
for (int j = tag_index + 1; j < tag_stack.size(); ++j) { for (int j = tag_index + 1; j < tag_stack.size(); ++j) {
HTMLParserTag *ts = tag_stack[j]; Ref<HTMLParserTag> ts = tag_stack[j];
ts->type = HTMLParserTag::HTML_PARSER_TAG_TYPE_SELF_CLOSING_TAG; ts->type = HTMLParserTag::HTML_PARSER_TAG_TYPE_SELF_CLOSING_TAG;
opening_tag->tags.push_back(ts); opening_tag->tags.push_back(ts);
@ -621,11 +613,10 @@ void HTMLParser::parse(const String &data) {
if (tag_stack.size() == 0) { if (tag_stack.size() == 0) {
root->tags.push_back(opening_tag); root->tags.push_back(opening_tag);
} else { } else {
tag_stack[tag_stack.size() - 1]->tags.push_back(opening_tag); tag_stack.write[tag_stack.size() - 1]->tags.push_back(opening_tag);
} }
memdelete(t); tags.write[i].unref();
tags.write[i] = nullptr;
continue; continue;
} }
@ -637,36 +628,31 @@ void HTMLParser::parse(const String &data) {
} }
for (int i = 0; i < tags.size(); ++i) { for (int i = 0; i < tags.size(); ++i) {
HTMLParserTag *t = tags[i]; Ref<HTMLParserTag> t = tags[i];
if (t != nullptr) { if (t.is_valid()) {
ERR_PRINT("HTMLParser::parse(const String &data): tag was not processed!\n"); ERR_PRINT("HTMLParser::parse(const String &data): tag was not processed!\n");
t->print(); t->print();
memdelete(t);
} }
} }
} }
String HTMLParser::to_string() { String HTMLParser::convert_to_string() const {
if (!root) { if (!root.is_valid()) {
return ""; return "";
} }
return root->to_string(); return root->convert_to_string();
} }
void HTMLParser::print() { void HTMLParser::print() const {
if (root) { if (root.is_valid()) {
root->print(); root->print();
} }
} }
HTMLParser::HTMLParser() { HTMLParser::HTMLParser() {
root = nullptr;
} }
HTMLParser::~HTMLParser() { HTMLParser::~HTMLParser() {
if (root) { root.unref();
memdelete(root);
}
} }

View File

@ -4,7 +4,11 @@
#include "core/ustring.h" #include "core/ustring.h"
#include "core/vector.h" #include "core/vector.h"
class HTMLParserAttribute { #include "core/reference.h"
class HTMLParserAttribute : public Reference {
GDCLASS(HTMLParserAttribute, Reference);
public: public:
String attribute; String attribute;
String data; String data;
@ -15,14 +19,16 @@ public:
bool match_data(const Vector<String> &d); bool match_data(const Vector<String> &d);
bool contains_data(const String &d); bool contains_data(const String &d);
String to_string(); String convert_to_string() const;
void print(); void print() const;
HTMLParserAttribute(); HTMLParserAttribute();
virtual ~HTMLParserAttribute(); virtual ~HTMLParserAttribute();
}; };
class HTMLParserTag { class HTMLParserTag : public Reference {
GDCLASS(HTMLParserTag, Reference);
public: public:
enum HTMLParserTagType { enum HTMLParserTagType {
HTML_PARSER_TAG_TYPE_NONE = 0, HTML_PARSER_TAG_TYPE_NONE = 0,
@ -39,39 +45,41 @@ public:
String tag; String tag;
String data; String data;
Vector<HTMLParserTag *> tags; Vector<Ref<HTMLParserTag>> tags;
Vector<HTMLParserAttribute *> attributes; Vector<Ref<HTMLParserAttribute>> attributes;
HTMLParserTag *get_first(const String &t); Ref<HTMLParserTag> get_first(const String &t);
HTMLParserTag *get_first(const String &t, const String &attrib, const String &val); Ref<HTMLParserTag> get_first(const String &t, const String &attrib, const String &val);
String get_attribute_value(const String &attrib); String get_attribute_value(const String &attrib);
HTMLParserAttribute *get_attribute(const String &attrib); Ref<HTMLParserAttribute> get_attribute(const String &attrib);
bool has_attribute(const String &attrib); bool has_attribute(const String &attrib);
HTMLParserAttribute *get_attribute(const String &attrib, const String &contains_val); Ref<HTMLParserAttribute> get_attribute(const String &attrib, const String &contains_val);
bool has_attribute(const String &attrib, const String &contains_val); bool has_attribute(const String &attrib, const String &contains_val);
void process(); void process();
void parse_args(const String &args); void parse_args(const String &args);
String to_string(const int level = 0); String convert_to_string(const int level = 0) const;
void print(); void print() const;
HTMLParserTag(); HTMLParserTag();
virtual ~HTMLParserTag(); virtual ~HTMLParserTag();
}; };
class HTMLParser { class HTMLParser : public Reference {
GDCLASS(HTMLParser, Reference);
public: public:
HTMLParserTag *root; Ref<HTMLParserTag> root;
void parse(const String &data); void parse(const String &data);
//void parse_tag(const String &data, const int index); //void parse_tag(const String &data, const int index);
String to_string(); String convert_to_string() const;
void print(); void print() const;
HTMLParser(); HTMLParser();
virtual ~HTMLParser(); virtual ~HTMLParser();