2022-06-25 01:55:54 +02:00
|
|
|
#include "html_parser.h"
|
2022-07-03 18:13:41 +02:00
|
|
|
|
2022-07-19 20:21:28 +02:00
|
|
|
#include "core/class_db.h"
|
2022-06-25 01:55:54 +02:00
|
|
|
#include "core/error_macros.h"
|
2022-07-19 13:58:26 +02:00
|
|
|
#include "core/log/logger.h"
|
2022-06-25 01:55:54 +02:00
|
|
|
|
2022-07-19 19:07:50 +02:00
|
|
|
// Returns the attribute name (the part written before '=' when serialized).
String HTMLParserAttribute::get_attribute() {
	return this->_attribute;
}
|
|
|
|
void HTMLParserAttribute::set_attribute(const String &val) {
|
|
|
|
_attribute = val;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Returns the attribute value (the part written after '=' when serialized).
String HTMLParserAttribute::get_data() {
	return this->_data;
}
|
|
|
|
void HTMLParserAttribute::set_data(const String &val) {
|
|
|
|
_data = val;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool HTMLParserAttribute::get_single() {
|
|
|
|
return _single;
|
|
|
|
}
|
|
|
|
void HTMLParserAttribute::set_single(const bool &val) {
|
|
|
|
_single = val;
|
|
|
|
}
|
|
|
|
|
2022-06-25 01:55:54 +02:00
|
|
|
bool HTMLParserAttribute::match_attrib(const String &attrib) {
|
2022-07-19 19:07:50 +02:00
|
|
|
return _attribute == attrib;
|
2022-06-25 01:55:54 +02:00
|
|
|
}
|
|
|
|
bool HTMLParserAttribute::match_data(const String &d) {
|
2022-07-19 19:07:50 +02:00
|
|
|
return _data == d;
|
2022-06-25 01:55:54 +02:00
|
|
|
}
|
2022-07-19 20:21:28 +02:00
|
|
|
// Not implemented yet: `d` is ignored and the result is always false.
// TODO: implement the matching logic.
bool HTMLParserAttribute::match_all_data(const Vector<String> &d) {
	const bool matched = false;
	return matched;
}
|
|
|
|
bool match_all_data_bind(const PoolStringArray &d) {
|
2022-06-25 01:55:54 +02:00
|
|
|
// todo
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
bool HTMLParserAttribute::contains_data(const String &d) {
|
2022-07-19 19:07:50 +02:00
|
|
|
return _data.find(d) != -1;
|
2022-06-25 01:55:54 +02:00
|
|
|
}
|
|
|
|
|
2022-07-19 13:58:26 +02:00
|
|
|
// Serializes the attribute.
//  - Valueless attributes are emitted as the bare name.
//  - Otherwise the value is wrapped in double quotes, unless the value itself
//    contains a double quote, in which case single quotes are used.
String HTMLParserAttribute::convert_to_string() const {
	if (_single) {
		return _attribute;
	}

	const bool value_has_double_quote = _data.find("\"") != -1;

	if (value_has_double_quote) {
		return _attribute + "=\'" + _data + "\'";
	}

	return _attribute + "=\"" + _data + "\"";
}
|
|
|
|
|
2022-07-19 13:58:26 +02:00
|
|
|
void HTMLParserAttribute::print() const {
|
|
|
|
PLOG_MSG(convert_to_string());
|
2022-06-25 01:55:54 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
// A new attribute starts out as a regular (name + value) attribute.
HTMLParserAttribute::HTMLParserAttribute() {
	this->_single = false;
}
|
|
|
|
|
|
|
|
// Nothing to release explicitly.
HTMLParserAttribute::~HTMLParserAttribute() {
}
|
|
|
|
|
2022-07-19 20:21:28 +02:00
|
|
|
// Registers the methods and properties of HTMLParserAttribute with ClassDB.
void HTMLParserAttribute::_bind_methods() {
	// "attribute" property.
	ClassDB::bind_method(D_METHOD("get_attribute"), &HTMLParserAttribute::get_attribute);
	ClassDB::bind_method(D_METHOD("set_attribute", "val"), &HTMLParserAttribute::set_attribute);
	ADD_PROPERTY(PropertyInfo(Variant::STRING, "attribute"), "set_attribute", "get_attribute");

	// "data" property.
	ClassDB::bind_method(D_METHOD("get_data"), &HTMLParserAttribute::get_data);
	ClassDB::bind_method(D_METHOD("set_data", "val"), &HTMLParserAttribute::set_data);
	ADD_PROPERTY(PropertyInfo(Variant::STRING, "data"), "set_data", "get_data");

	// "single" property.
	ClassDB::bind_method(D_METHOD("get_single"), &HTMLParserAttribute::get_single);
	ClassDB::bind_method(D_METHOD("set_single", "val"), &HTMLParserAttribute::set_single);
	ADD_PROPERTY(PropertyInfo(Variant::BOOL, "single"), "set_single", "get_single");

	// Matching helpers. "match_all_data" is exposed through the
	// PoolStringArray based match_all_data_bind() overload.
	ClassDB::bind_method(D_METHOD("match_attrib", "attrib"), &HTMLParserAttribute::match_attrib);
	ClassDB::bind_method(D_METHOD("match_data", "data"), &HTMLParserAttribute::match_data);
	ClassDB::bind_method(D_METHOD("match_all_data", "data"), &HTMLParserAttribute::match_all_data_bind);
	ClassDB::bind_method(D_METHOD("contains_data", "data"), &HTMLParserAttribute::contains_data);

	// Serialization / debugging.
	ClassDB::bind_method(D_METHOD("convert_to_string"), &HTMLParserAttribute::convert_to_string);
	ClassDB::bind_method(D_METHOD("print"), &HTMLParserAttribute::print);
}
|
|
|
|
|
2022-07-19 19:07:50 +02:00
|
|
|
void HTMLParserTag::add_child_tag(const Ref<HTMLParserTag> &tag) {
|
|
|
|
_tags.push_back(tag);
|
|
|
|
}
|
|
|
|
void HTMLParserTag::remote_child_tag(const int index) {
|
|
|
|
ERR_FAIL_INDEX(index, _tags.size());
|
|
|
|
|
|
|
|
_tags.remove(index);
|
|
|
|
}
|
|
|
|
// Returns the child tag at `index`.
// An out-of-range index is reported through ERR_FAIL_INDEX_V and yields an
// empty reference.
Ref<HTMLParserTag> HTMLParserTag::get_child_tag(const int index) {
	const int child_count = _tags.size();
	ERR_FAIL_INDEX_V(index, child_count, Ref<HTMLParserTag>());

	return this->_tags[index];
}
|
|
|
|
int HTMLParserTag::get_child_tag_count() const {
|
|
|
|
return _tags.size();
|
|
|
|
}
|
|
|
|
void HTMLParserTag::clear_child_tags() {
|
|
|
|
_tags.clear();
|
|
|
|
}
|
|
|
|
|
|
|
|
// Returns the direct child tags as a Vector of Variants (property getter
// for "child_tags").
Vector<Variant> HTMLParserTag::get_child_tags() {
	Vector<Variant> result;

	const int child_count = _tags.size();
	for (int idx = 0; idx < child_count; ++idx) {
		result.push_back(_tags[idx].get_ref_ptr());
	}

	return result;
}
|
|
|
|
|
|
|
|
void HTMLParserTag::set_child_tags(const Vector<Variant> &val) {
|
|
|
|
_tags.clear();
|
|
|
|
for (int i = 0; i < val.size(); i++) {
|
|
|
|
Ref<HTMLParserAttribute> e = Ref<HTMLParserAttribute>(val[i]);
|
|
|
|
_tags.push_back(e);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void HTMLParserTag::add_child_attribute(const Ref<HTMLParserAttribute> &tag) {
|
|
|
|
_attributes.push_back(tag);
|
|
|
|
}
|
|
|
|
// Removes the attribute at `index`.
// An out-of-range index is reported through ERR_FAIL_INDEX and ignored.
//
// The bound is the attribute count; the previous version validated against
// _tags.size(), which rejected valid indices or let invalid ones through
// whenever the two lists had different sizes.
void HTMLParserTag::remote_child_attribute(const int index) {
	ERR_FAIL_INDEX(index, _attributes.size());

	_attributes.remove(index);
}
|
|
|
|
// Returns the attribute at `index`.
// An out-of-range index is reported through ERR_FAIL_INDEX_V and yields an
// empty reference.
//
// The bound is the attribute count; the previous version validated against
// _tags.size(), which rejected valid indices or let invalid ones through
// whenever the two lists had different sizes.
Ref<HTMLParserAttribute> HTMLParserTag::get_child_attribute(const int index) {
	ERR_FAIL_INDEX_V(index, _attributes.size(), Ref<HTMLParserAttribute>());

	return _attributes[index];
}
|
|
|
|
int HTMLParserTag::get_child_attribute_count() const {
|
|
|
|
return _attributes.size();
|
|
|
|
}
|
|
|
|
void HTMLParserTag::clear_child_attributes() {
|
|
|
|
_attributes.clear();
|
|
|
|
}
|
|
|
|
|
|
|
|
// Returns the attributes as a Vector of Variants (property getter for
// "attributes").
Vector<Variant> HTMLParserTag::get_attributes() {
	Vector<Variant> result;

	const int attribute_count = _attributes.size();
	for (int idx = 0; idx < attribute_count; ++idx) {
		result.push_back(_attributes[idx].get_ref_ptr());
	}

	return result;
}
|
|
|
|
|
|
|
|
void HTMLParserTag::set_attributes(const Vector<Variant> &val) {
|
|
|
|
_attributes.clear();
|
|
|
|
for (int i = 0; i < val.size(); i++) {
|
|
|
|
Ref<HTMLParserAttribute> e = Ref<HTMLParserAttribute>(val[i]);
|
|
|
|
_attributes.push_back(e);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2022-07-19 13:58:26 +02:00
|
|
|
// Depth-first search for the first tag whose name equals `t`.
// The tag itself is checked first, then each child subtree in order.
// Returns an empty reference when nothing matches.
//
// Null child references (which add_child_tag() / set_child_tags() do not
// reject) are skipped instead of being dereferenced. Children are read
// through a local Ref copy rather than `_tags.write[i]`, since the search
// does not modify the child list.
Ref<HTMLParserTag> HTMLParserTag::get_first(const String &t) {
	if (_tag == t) {
		return Ref<HTMLParserTag>(this);
	}

	for (int i = 0; i < _tags.size(); ++i) {
		Ref<HTMLParserTag> child = _tags[i];

		if (!child.is_valid()) {
			continue;
		}

		Ref<HTMLParserTag> ht = child->get_first(t);

		if (ht.is_valid()) {
			return ht;
		}
	}

	return Ref<HTMLParserTag>();
}
|
|
|
|
|
2022-07-19 20:21:28 +02:00
|
|
|
// Depth-first search for the first tag whose name equals `t` and which has
// an attribute named `attrib` whose value contains `val`
// (see has_attributec()).
// The tag itself is checked first, then each child subtree in order.
// Returns an empty reference when nothing matches.
//
// Null child references (which add_child_tag() / set_child_tags() do not
// reject) are skipped instead of being dereferenced. Children are read
// through a local Ref copy rather than `_tags.write[i]`, since the search
// does not modify the child list.
Ref<HTMLParserTag> HTMLParserTag::get_firstc(const String &t, const String &attrib, const String &val) {
	if (_tag == t && has_attributec(attrib, val)) {
		return Ref<HTMLParserTag>(this);
	}

	for (int i = 0; i < _tags.size(); ++i) {
		Ref<HTMLParserTag> child = _tags[i];

		if (!child.is_valid()) {
			continue;
		}

		Ref<HTMLParserTag> ht = child->get_firstc(t, attrib, val);

		if (ht.is_valid()) {
			return ht;
		}
	}

	return Ref<HTMLParserTag>();
}
|
|
|
|
|
|
|
|
// Returns the value of the first attribute named `attrib`, or an empty
// string when this tag has no such attribute.
String HTMLParserTag::get_attribute_value(const String &attrib) {
	Ref<HTMLParserAttribute> found = get_attribute(attrib);

	if (!found.is_valid()) {
		return "";
	}

	return found->get_data();
}
|
|
|
|
|
2022-07-19 13:58:26 +02:00
|
|
|
// Returns the first attribute whose name equals `attrib`, or an empty
// reference when there is none.
//
// Null entries in the attribute list (which add_child_attribute() /
// set_attributes() do not reject) are skipped instead of being dereferenced.
Ref<HTMLParserAttribute> HTMLParserTag::get_attribute(const String &attrib) {
	for (int i = 0; i < _attributes.size(); ++i) {
		Ref<HTMLParserAttribute> a = _attributes[i];

		if (!a.is_valid()) {
			continue;
		}

		if (a->match_attrib(attrib)) {
			return a;
		}
	}

	return Ref<HTMLParserAttribute>();
}
|
|
|
|
|
|
|
|
bool HTMLParserTag::has_attribute(const String &attrib) {
|
2022-07-19 19:07:50 +02:00
|
|
|
for (int i = 0; i < _attributes.size(); ++i) {
|
|
|
|
Ref<HTMLParserAttribute> a = _attributes[i];
|
2022-06-25 01:55:54 +02:00
|
|
|
|
|
|
|
if (a->match_attrib(attrib)) {
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2022-07-19 20:21:28 +02:00
|
|
|
// Returns the first attribute whose name equals `attrib` and whose value
// contains `contains_val` as a substring, or an empty reference when there
// is none.
//
// Null entries in the attribute list (which add_child_attribute() /
// set_attributes() do not reject) are skipped instead of being dereferenced.
Ref<HTMLParserAttribute> HTMLParserTag::get_attributec(const String &attrib, const String &contains_val) {
	for (int i = 0; i < _attributes.size(); ++i) {
		Ref<HTMLParserAttribute> a = _attributes[i];

		if (!a.is_valid()) {
			continue;
		}

		if (a->match_attrib(attrib) && a->contains_data(contains_val)) {
			return a;
		}
	}

	return Ref<HTMLParserAttribute>();
}
|
|
|
|
|
2022-07-19 20:21:28 +02:00
|
|
|
bool HTMLParserTag::has_attributec(const String &attrib, const String &contains_val) {
|
2022-07-19 19:07:50 +02:00
|
|
|
for (int i = 0; i < _attributes.size(); ++i) {
|
|
|
|
Ref<HTMLParserAttribute> a = _attributes[i];
|
2022-06-25 01:55:54 +02:00
|
|
|
|
|
|
|
if (a->match_attrib(attrib) && a->contains_data(contains_val)) {
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
void HTMLParserTag::process() {
|
2022-07-19 19:07:50 +02:00
|
|
|
if (_type != HTMLParserTag::HTML_PARSER_TAG_TYPE_NONE) {
|
2022-06-25 01:55:54 +02:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2022-07-19 19:07:50 +02:00
|
|
|
if (_data.size() < 2) {
|
2022-06-25 01:55:54 +02:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2022-07-19 19:07:50 +02:00
|
|
|
ERR_FAIL_COND(_data[0] != '<');
|
|
|
|
ERR_FAIL_COND(_data[_data.size() - 1] != '>');
|
2022-06-25 01:55:54 +02:00
|
|
|
|
|
|
|
int start_index = 1;
|
2022-07-19 19:07:50 +02:00
|
|
|
if (_data[1] == '/') {
|
2022-06-25 01:55:54 +02:00
|
|
|
++start_index;
|
|
|
|
|
2022-07-19 19:07:50 +02:00
|
|
|
_type = HTMLParserTag::HTML_PARSER_TAG_TYPE_CLOSING_TAG;
|
|
|
|
} else if (_data[1] == '!') {
|
|
|
|
if (_data.size() < 8) {
|
2022-06-25 01:55:54 +02:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
// test for comment. <!-- -->
|
|
|
|
++start_index;
|
2022-07-19 19:07:50 +02:00
|
|
|
if (_data[2] == '-' && _data[3] == '-') {
|
|
|
|
_type = HTMLParserTag::HTML_PARSER_TAG_TYPE_COMMENT;
|
2022-06-25 01:55:54 +02:00
|
|
|
|
2022-07-19 19:07:50 +02:00
|
|
|
int comment_start_index = _data.find_char(' ', 3);
|
2022-06-25 01:55:54 +02:00
|
|
|
|
|
|
|
if (comment_start_index == -1) {
|
|
|
|
comment_start_index = 4;
|
|
|
|
}
|
|
|
|
|
2022-07-19 19:07:50 +02:00
|
|
|
_tag = _data.substr(comment_start_index, _data.size() - comment_start_index - 3);
|
2022-06-25 01:55:54 +02:00
|
|
|
}
|
|
|
|
|
2022-07-19 19:07:50 +02:00
|
|
|
if (_data.size() < 11) {
|
2022-06-25 01:55:54 +02:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
// test for doctype. <!doctype >
|
2022-07-19 19:07:50 +02:00
|
|
|
int doctype_start_index = _data.find("doctype ", 2);
|
2022-06-25 01:55:54 +02:00
|
|
|
|
|
|
|
if (doctype_start_index == -1) {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2022-07-19 19:07:50 +02:00
|
|
|
_type = HTMLParserTag::HTML_PARSER_TAG_TYPE_DOCTYPE;
|
2022-06-25 01:55:54 +02:00
|
|
|
|
2022-07-19 19:07:50 +02:00
|
|
|
_tag = _data.substr(doctype_start_index + 8, _data.size() - doctype_start_index - 8 - 1);
|
2022-06-25 01:55:54 +02:00
|
|
|
} else {
|
|
|
|
String tag_text;
|
|
|
|
|
2022-07-19 19:07:50 +02:00
|
|
|
if (_data[_data.size() - 2] == '/') {
|
2022-06-25 01:55:54 +02:00
|
|
|
// will catch all that looks like <br/>
|
|
|
|
// tags that look like <br> will be caught later in a post process, in a way
|
|
|
|
// which also tries to catch erroneously not closed tags that supposed to be closed
|
2022-07-19 19:07:50 +02:00
|
|
|
_type = HTMLParserTag::HTML_PARSER_TAG_TYPE_SELF_CLOSING_TAG;
|
2022-06-25 01:55:54 +02:00
|
|
|
|
2022-07-19 19:07:50 +02:00
|
|
|
tag_text = _data.substr(1, _data.size() - 3);
|
2022-06-25 01:55:54 +02:00
|
|
|
} else {
|
2022-07-19 19:07:50 +02:00
|
|
|
_type = HTMLParserTag::HTML_PARSER_TAG_TYPE_OPENING_TAG;
|
2022-06-25 01:55:54 +02:00
|
|
|
|
2022-07-19 19:07:50 +02:00
|
|
|
tag_text = _data.substr(1, _data.size() - 2);
|
2022-06-25 01:55:54 +02:00
|
|
|
}
|
|
|
|
|
2022-07-03 18:13:41 +02:00
|
|
|
int fspc_index = tag_text.find_char(' ');
|
2022-06-25 01:55:54 +02:00
|
|
|
|
|
|
|
if (fspc_index == -1) {
|
|
|
|
// no args
|
2022-07-19 19:07:50 +02:00
|
|
|
_tag = tag_text;
|
2022-06-25 01:55:54 +02:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
// grab the tag itself
|
2022-07-19 19:07:50 +02:00
|
|
|
_tag = tag_text.substr(0, fspc_index);
|
2022-06-25 01:55:54 +02:00
|
|
|
|
|
|
|
if (fspc_index + 1 == tag_text.size()) {
|
|
|
|
// no args, but had a space like <br />
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
String args = tag_text.substr(fspc_index + 1, tag_text.size() - fspc_index - 1);
|
|
|
|
parse_args(args);
|
|
|
|
}
|
|
|
|
|
2022-07-19 19:07:50 +02:00
|
|
|
int tag_end_index = _data.find_char(' ', start_index);
|
2022-06-25 01:55:54 +02:00
|
|
|
|
|
|
|
if (tag_end_index == -1) {
|
|
|
|
// simple tag
|
2022-07-19 19:07:50 +02:00
|
|
|
_tag = _data.substr(start_index, _data.size() - start_index - 1);
|
2022-06-25 01:55:54 +02:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void HTMLParserTag::parse_args(const String &args) {
|
2022-07-19 19:07:50 +02:00
|
|
|
_attributes.clear();
|
2022-06-25 01:55:54 +02:00
|
|
|
|
|
|
|
int i = 0;
|
|
|
|
while (i < args.size()) {
|
|
|
|
if (args[i] == ' ') {
|
|
|
|
//"trim"
|
|
|
|
++i;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2022-07-03 18:13:41 +02:00
|
|
|
int equals_index = args.find_char('=', i);
|
2022-06-25 01:55:54 +02:00
|
|
|
|
2022-07-19 13:58:26 +02:00
|
|
|
Ref<HTMLParserAttribute> a;
|
|
|
|
a.instance();
|
2022-06-25 01:55:54 +02:00
|
|
|
|
|
|
|
if (equals_index == -1) {
|
2022-07-19 19:07:50 +02:00
|
|
|
a->set_attribute(args.substr(i, args.size() - i));
|
|
|
|
a->set_single(true);
|
|
|
|
_attributes.push_back(a);
|
2022-06-25 01:55:54 +02:00
|
|
|
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2022-07-19 19:07:50 +02:00
|
|
|
a->set_attribute(args.substr(i, equals_index - i));
|
2022-06-25 01:55:54 +02:00
|
|
|
|
|
|
|
// todo
|
|
|
|
// a.trim();
|
|
|
|
|
|
|
|
int next_char_index = equals_index + 1;
|
|
|
|
|
|
|
|
if (next_char_index >= args.size()) {
|
|
|
|
// an attribute looks like this "... attrib="
|
2022-07-19 19:07:50 +02:00
|
|
|
_attributes.push_back(a);
|
2022-06-25 01:55:54 +02:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
// skip spaces
|
|
|
|
while (args[next_char_index] == ' ') {
|
|
|
|
++next_char_index;
|
|
|
|
|
|
|
|
if (next_char_index >= args.size()) {
|
|
|
|
// an attribute looks like this "... attrib= "
|
2022-07-19 19:07:50 +02:00
|
|
|
_attributes.push_back(a);
|
2022-06-25 01:55:54 +02:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
char c = args[next_char_index];
|
|
|
|
char find_char = ' ';
|
|
|
|
|
|
|
|
if (c == '"' || c == '\'') {
|
|
|
|
++next_char_index;
|
|
|
|
find_char = c;
|
|
|
|
}
|
|
|
|
|
2022-07-03 18:13:41 +02:00
|
|
|
int end_index = args.find_char(find_char, next_char_index);
|
2022-06-25 01:55:54 +02:00
|
|
|
|
|
|
|
if (end_index == -1) {
|
|
|
|
// missing closing ' or " if c is ' or "
|
|
|
|
// else missing parameter
|
|
|
|
|
2022-07-19 19:07:50 +02:00
|
|
|
a->set_data(args.substr(next_char_index, args.size() - next_char_index - 1));
|
|
|
|
_attributes.push_back(a);
|
2022-06-25 01:55:54 +02:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2022-07-19 19:07:50 +02:00
|
|
|
a->set_data(args.substr(next_char_index, end_index - next_char_index));
|
|
|
|
_attributes.push_back(a);
|
2022-06-25 01:55:54 +02:00
|
|
|
|
|
|
|
i = end_index + 1;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2022-07-19 13:58:26 +02:00
|
|
|
String HTMLParserTag::convert_to_string(const int level) const {
|
2022-06-25 01:55:54 +02:00
|
|
|
String s;
|
|
|
|
|
2022-07-03 18:13:41 +02:00
|
|
|
s += String(" ").repeat(level);
|
2022-06-25 01:55:54 +02:00
|
|
|
|
2022-07-19 19:07:50 +02:00
|
|
|
if (_type == HTML_PARSER_TAG_TYPE_CONTENT) {
|
|
|
|
s += _data + "\n";
|
2022-06-25 01:55:54 +02:00
|
|
|
|
2022-07-19 19:07:50 +02:00
|
|
|
if (_tags.size() != 0) {
|
2022-07-03 18:13:41 +02:00
|
|
|
s += String(" ").repeat(level);
|
2022-06-25 01:55:54 +02:00
|
|
|
s += "(!CONTENT TAG HAS TAGS!)\n";
|
|
|
|
|
2022-07-19 19:07:50 +02:00
|
|
|
for (int i = 0; i < _tags.size(); ++i) {
|
|
|
|
s += _tags[i]->convert_to_string(level + 1) + "\n";
|
2022-06-25 01:55:54 +02:00
|
|
|
}
|
|
|
|
}
|
2022-07-19 19:07:50 +02:00
|
|
|
} else if (_type == HTML_PARSER_TAG_TYPE_OPENING_TAG) {
|
2022-06-25 01:55:54 +02:00
|
|
|
int ln = level + 1;
|
|
|
|
|
2022-07-19 19:07:50 +02:00
|
|
|
s += "<" + _tag;
|
2022-06-25 01:55:54 +02:00
|
|
|
|
2022-07-19 19:07:50 +02:00
|
|
|
for (int i = 0; i < _attributes.size(); ++i) {
|
|
|
|
s += " " + _attributes[i]->convert_to_string();
|
2022-06-25 01:55:54 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
s += ">\n";
|
|
|
|
|
2022-07-19 19:07:50 +02:00
|
|
|
for (int i = 0; i < _tags.size(); ++i) {
|
|
|
|
s += _tags[i]->convert_to_string(ln);
|
2022-06-25 01:55:54 +02:00
|
|
|
}
|
|
|
|
|
2022-07-03 18:13:41 +02:00
|
|
|
s += String(" ").repeat(level);
|
2022-06-25 01:55:54 +02:00
|
|
|
|
2022-07-19 19:07:50 +02:00
|
|
|
s += "</" + _tag + ">\n";
|
|
|
|
} else if (_type == HTML_PARSER_TAG_TYPE_CLOSING_TAG) {
|
2022-06-25 01:55:54 +02:00
|
|
|
// HTMLParserTag should handle this automatically
|
|
|
|
// it's here for debugging purposes though
|
2022-07-19 19:07:50 +02:00
|
|
|
s += "</" + _tag + "(!)>";
|
2022-06-25 01:55:54 +02:00
|
|
|
|
2022-07-19 19:07:50 +02:00
|
|
|
if (_tags.size() != 0) {
|
2022-07-03 18:13:41 +02:00
|
|
|
s += String(" ").repeat(level);
|
2022-06-25 01:55:54 +02:00
|
|
|
s += "(!CLOSING TAG HAS TAGS!)\n";
|
|
|
|
|
2022-07-19 19:07:50 +02:00
|
|
|
for (int i = 0; i < _tags.size(); ++i) {
|
|
|
|
s += _tags[i]->convert_to_string(level + 1) + "\n";
|
2022-06-25 01:55:54 +02:00
|
|
|
}
|
|
|
|
}
|
2022-07-19 19:07:50 +02:00
|
|
|
} else if (_type == HTML_PARSER_TAG_TYPE_SELF_CLOSING_TAG) {
|
|
|
|
s += "<" + _tag;
|
2022-06-25 01:55:54 +02:00
|
|
|
|
2022-07-19 19:07:50 +02:00
|
|
|
for (int i = 0; i < _attributes.size(); ++i) {
|
|
|
|
s += " " + _attributes[i]->convert_to_string();
|
2022-06-25 01:55:54 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
s += "/>\n";
|
|
|
|
|
2022-07-19 19:07:50 +02:00
|
|
|
if (_tags.size() != 0) {
|
2022-07-03 18:13:41 +02:00
|
|
|
s += String(" ").repeat(level);
|
2022-06-25 01:55:54 +02:00
|
|
|
s += "(!SELF CLOSING TAG HAS TAGS!)\n";
|
|
|
|
|
2022-07-19 19:07:50 +02:00
|
|
|
for (int i = 0; i < _tags.size(); ++i) {
|
|
|
|
s += _tags[i]->convert_to_string(level + 1) + "\n";
|
2022-06-25 01:55:54 +02:00
|
|
|
}
|
|
|
|
}
|
2022-07-19 19:07:50 +02:00
|
|
|
} else if (_type == HTML_PARSER_TAG_TYPE_COMMENT) {
|
|
|
|
s += "<!-- " + _data + " -->\n";
|
2022-06-25 01:55:54 +02:00
|
|
|
|
2022-07-19 19:07:50 +02:00
|
|
|
if (_tags.size() != 0) {
|
2022-07-03 18:13:41 +02:00
|
|
|
s += String(" ").repeat(level);
|
2022-06-25 01:55:54 +02:00
|
|
|
s += "(!COMMENT TAG HAS TAGS!)\n";
|
|
|
|
|
2022-07-19 19:07:50 +02:00
|
|
|
for (int i = 0; i < _tags.size(); ++i) {
|
|
|
|
s += _tags[i]->convert_to_string(level + 1) + "\n";
|
2022-06-25 01:55:54 +02:00
|
|
|
}
|
|
|
|
}
|
2022-07-19 19:07:50 +02:00
|
|
|
} else if (_type == HTML_PARSER_TAG_TYPE_DOCTYPE) {
|
|
|
|
s += _data + "\n";
|
2022-06-25 01:55:54 +02:00
|
|
|
|
2022-07-19 19:07:50 +02:00
|
|
|
if (_tags.size() != 0) {
|
2022-07-03 18:13:41 +02:00
|
|
|
s += String(" ").repeat(level);
|
2022-06-25 01:55:54 +02:00
|
|
|
s += "(!DOCTYPE TAG HAS TAGS!)\n";
|
|
|
|
|
2022-07-19 19:07:50 +02:00
|
|
|
for (int i = 0; i < _tags.size(); ++i) {
|
|
|
|
s += _tags[i]->convert_to_string(level + 1) + "\n";
|
2022-06-25 01:55:54 +02:00
|
|
|
}
|
|
|
|
}
|
2022-07-19 19:07:50 +02:00
|
|
|
} else if (_type == HTML_PARSER_TAG_TYPE_NONE) {
|
|
|
|
for (int i = 0; i < _tags.size(); ++i) {
|
|
|
|
s += _tags[i]->convert_to_string(level) + "\n";
|
2022-07-03 18:13:41 +02:00
|
|
|
s += String(" ").repeat(level);
|
2022-06-25 01:55:54 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return s;
|
|
|
|
}
|
2022-07-19 13:58:26 +02:00
|
|
|
void HTMLParserTag::print() const {
|
|
|
|
PLOG_MSG(convert_to_string());
|
2022-06-25 01:55:54 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
// A new tag starts out untyped; process() only classifies untyped tags.
HTMLParserTag::HTMLParserTag() {
	this->_type = HTMLParserTag::HTML_PARSER_TAG_TYPE_NONE;
}
|
|
|
|
|
|
|
|
// Drops the references held on child tags, then on attributes.
HTMLParserTag::~HTMLParserTag() {
	this->_tags.clear();
	this->_attributes.clear();
}
|
|
|
|
|
2022-07-19 20:21:28 +02:00
|
|
|
void HTMLParserTag::_bind_methods() {
|
|
|
|
ClassDB::bind_method(D_METHOD("get_type"), &HTMLParserTag::get_type);
|
|
|
|
ClassDB::bind_method(D_METHOD("set_type", "val"), &HTMLParserTag::set_type);
|
|
|
|
ADD_PROPERTY(PropertyInfo(Variant::INT, "type"), "set_type", "get_type");
|
|
|
|
|
|
|
|
ClassDB::bind_method(D_METHOD("get_tag"), &HTMLParserTag::get_tag);
|
|
|
|
ClassDB::bind_method(D_METHOD("set_tag", "val"), &HTMLParserTag::set_tag);
|
|
|
|
ADD_PROPERTY(PropertyInfo(Variant::STRING, "tag"), "set_tag", "get_tag");
|
|
|
|
|
|
|
|
ClassDB::bind_method(D_METHOD("get_data"), &HTMLParserTag::get_data);
|
|
|
|
ClassDB::bind_method(D_METHOD("set_data", "val"), &HTMLParserTag::set_data);
|
|
|
|
ADD_PROPERTY(PropertyInfo(Variant::STRING, "data"), "set_data", "get_data");
|
|
|
|
|
|
|
|
ClassDB::bind_method(D_METHOD("add_child_tag", "tag"), &HTMLParserTag::add_child_tag);
|
|
|
|
ClassDB::bind_method(D_METHOD("remote_child_tag", "index"), &HTMLParserTag::remote_child_tag);
|
|
|
|
ClassDB::bind_method(D_METHOD("get_child_tag", "index"), &HTMLParserTag::get_child_tag);
|
|
|
|
ClassDB::bind_method(D_METHOD("get_child_tag_count"), &HTMLParserTag::get_child_tag_count);
|
|
|
|
ClassDB::bind_method(D_METHOD("clear_child_tags"), &HTMLParserTag::clear_child_tags);
|
|
|
|
|
|
|
|
ClassDB::bind_method(D_METHOD("get_child_tags"), &HTMLParserTag::get_child_tags);
|
|
|
|
ClassDB::bind_method(D_METHOD("set_child_tags", "val"), &HTMLParserTag::set_child_tags);
|
|
|
|
ADD_PROPERTY(PropertyInfo(Variant::ARRAY, "child_tags", PROPERTY_HINT_NONE, "17/17:HTMLParserTag", PROPERTY_USAGE_DEFAULT, "HTMLParserTag"), "set_child_tags", "get_child_tags");
|
|
|
|
|
|
|
|
ClassDB::bind_method(D_METHOD("add_child_attribute", "tag"), &HTMLParserTag::add_child_attribute);
|
|
|
|
ClassDB::bind_method(D_METHOD("remote_child_attribute", "index"), &HTMLParserTag::remote_child_attribute);
|
|
|
|
ClassDB::bind_method(D_METHOD("get_child_attribute", "index"), &HTMLParserTag::get_child_attribute);
|
|
|
|
ClassDB::bind_method(D_METHOD("get_child_attribute_count"), &HTMLParserTag::get_child_attribute_count);
|
|
|
|
ClassDB::bind_method(D_METHOD("clear_child_attributes"), &HTMLParserTag::clear_child_attributes);
|
|
|
|
|
|
|
|
ClassDB::bind_method(D_METHOD("get_attributes"), &HTMLParserTag::get_attributes);
|
|
|
|
ClassDB::bind_method(D_METHOD("set_attributes", "val"), &HTMLParserTag::set_attributes);
|
|
|
|
ADD_PROPERTY(PropertyInfo(Variant::ARRAY, "attributes", PROPERTY_HINT_NONE, "17/17:HTMLParserAttribute", PROPERTY_USAGE_DEFAULT, "HTMLParserAttribute"), "set_attributes", "get_attributes");
|
|
|
|
|
|
|
|
ClassDB::bind_method(D_METHOD("get_first", "t"), &HTMLParserTag::get_first);
|
|
|
|
ClassDB::bind_method(D_METHOD("get_firstc", "t", "attrib", "val"), &HTMLParserTag::get_firstc);
|
|
|
|
|
|
|
|
ClassDB::bind_method(D_METHOD("get_attribute_value", "attrib"), &HTMLParserTag::get_attribute_value);
|
|
|
|
|
|
|
|
ClassDB::bind_method(D_METHOD("get_attribute", "attrib"), &HTMLParserTag::get_attribute);
|
|
|
|
ClassDB::bind_method(D_METHOD("has_attribute", "attrib"), &HTMLParserTag::has_attribute);
|
|
|
|
|
|
|
|
ClassDB::bind_method(D_METHOD("get_attributec", "attrib", "contains_val"), &HTMLParserTag::get_attributec);
|
|
|
|
ClassDB::bind_method(D_METHOD("has_attributec", "attrib", "contains_val"), &HTMLParserTag::has_attributec);
|
|
|
|
|
|
|
|
ClassDB::bind_method(D_METHOD("process"), &HTMLParserTag::process);
|
|
|
|
ClassDB::bind_method(D_METHOD("parse_args", "args"), &HTMLParserTag::parse_args);
|
|
|
|
|
|
|
|
ClassDB::bind_method(D_METHOD("convert_to_string", "level"), &HTMLParserTag::convert_to_string, 0);
|
|
|
|
ClassDB::bind_method(D_METHOD("print"), &HTMLParserTag::print);
|
|
|
|
|
|
|
|
BIND_ENUM_CONSTANT(HTML_PARSER_TAG_TYPE_NONE);
|
|
|
|
BIND_ENUM_CONSTANT(HTML_PARSER_TAG_TYPE_OPENING_TAG);
|
|
|
|
BIND_ENUM_CONSTANT(HTML_PARSER_TAG_TYPE_CLOSING_TAG);
|
|
|
|
BIND_ENUM_CONSTANT(HTML_PARSER_TAG_TYPE_SELF_CLOSING_TAG);
|
|
|
|
BIND_ENUM_CONSTANT(HTML_PARSER_TAG_TYPE_COMMENT);
|
|
|
|
BIND_ENUM_CONSTANT(HTML_PARSER_TAG_TYPE_DOCTYPE);
|
|
|
|
BIND_ENUM_CONSTANT(HTML_PARSER_TAG_TYPE_CONTENT);
|
|
|
|
};
|
|
|
|
|
2022-07-19 19:07:50 +02:00
|
|
|
// Returns the root tag of the parsed document tree; parse() creates it.
Ref<HTMLParserTag> HTMLParser::get_root() {
	return this->_root;
}
|
|
|
|
|
|
|
|
void HTMLParser::parse(const String &data) {
|
2022-07-19 13:58:26 +02:00
|
|
|
Vector<Ref<HTMLParserTag>> tags;
|
2022-06-25 01:55:54 +02:00
|
|
|
|
|
|
|
// <script> content parsing is based on https://stackoverflow.com/questions/14574471/how-do-browsers-parse-a-script-tag-exactly
|
|
|
|
const int STATE_NONE = 0;
|
|
|
|
const int STATE_DATA_1 = 1;
|
|
|
|
const int STATE_DATA_2 = 2;
|
|
|
|
const int STATE_DATA_3 = 3;
|
|
|
|
|
|
|
|
int state = STATE_NONE;
|
|
|
|
|
|
|
|
// split into tags
|
|
|
|
for (int i = 0; i < data.size(); ++i) {
|
|
|
|
if (state == STATE_NONE) {
|
|
|
|
if (data[i] == '<') {
|
|
|
|
// tag
|
|
|
|
|
|
|
|
if (data.is_word_at(i, "<script")) {
|
|
|
|
// after the opening <script> tag, the parser goes to data1 state
|
|
|
|
state = STATE_DATA_1;
|
|
|
|
// no else, we need to process the tag istelf!
|
|
|
|
}
|
|
|
|
|
|
|
|
for (int j = i + 1; j < data.size(); ++j) {
|
|
|
|
if (data[j] == '>') {
|
2022-07-19 19:26:24 +02:00
|
|
|
Ref<HTMLParserTag> t;
|
|
|
|
t.instance();
|
2022-06-25 01:55:54 +02:00
|
|
|
|
2022-07-19 19:07:50 +02:00
|
|
|
t->set_data(data.substr(i, j - i + 1));
|
2022-06-25 01:55:54 +02:00
|
|
|
t->process();
|
|
|
|
|
|
|
|
tags.push_back(t);
|
|
|
|
|
|
|
|
i = j;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
// content
|
|
|
|
|
|
|
|
for (int j = i + 1; j < data.size(); ++j) {
|
|
|
|
if (data[j] == '<') {
|
2022-07-19 19:26:24 +02:00
|
|
|
Ref<HTMLParserTag> t;
|
|
|
|
t.instance();
|
2022-06-25 01:55:54 +02:00
|
|
|
|
2022-07-19 19:07:50 +02:00
|
|
|
t->set_data(data.substr(i, j - i));
|
|
|
|
t->set_type(HTMLParserTag::HTML_PARSER_TAG_TYPE_CONTENT);
|
2022-06-25 01:55:54 +02:00
|
|
|
|
|
|
|
tags.push_back(t);
|
|
|
|
|
|
|
|
i = j - 1;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
// script tag content
|
|
|
|
|
|
|
|
bool done = false;
|
|
|
|
for (int j = i; j < data.size(); ++j) {
|
|
|
|
char c = data[j];
|
|
|
|
|
|
|
|
if (c != '<' && c != '-') {
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (data.is_word_at(j, "-->")) {
|
|
|
|
// if --> is encountered while in any state, switch to data1 state
|
|
|
|
state = STATE_DATA_1;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (state == STATE_DATA_1) {
|
|
|
|
if (data.is_word_at(j, "<!--")) {
|
|
|
|
// if <!-- is encountered while in data1 state, switch to data2 state
|
|
|
|
state = STATE_DATA_2;
|
|
|
|
} else if (data.is_word_at(j, "</script")) {
|
|
|
|
// if </script[\s/>] is encountered while in any other state (than data3), stop parsing
|
|
|
|
done = true;
|
|
|
|
}
|
|
|
|
|
|
|
|
} else if (state == STATE_DATA_2) {
|
|
|
|
if (data.is_word_at(j, "<script")) {
|
|
|
|
// if <script[\s/>] is encountered while in data2 state, switch to data3 state
|
|
|
|
state = STATE_DATA_3;
|
|
|
|
} else if (data.is_word_at(j, "</script")) {
|
|
|
|
// if </script[\s/>] is encountered while in any other state (than data3), stop parsing
|
|
|
|
done = true;
|
|
|
|
}
|
|
|
|
|
|
|
|
} else if (state == STATE_DATA_3) {
|
|
|
|
// if </script[\s/>] is encountered while in data3 state, switch to data2 state
|
|
|
|
if (data.is_word_at(j, "</script")) {
|
|
|
|
state = STATE_DATA_2;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (done) {
|
|
|
|
state = STATE_NONE;
|
2022-07-19 19:26:24 +02:00
|
|
|
Ref<HTMLParserTag> t;
|
|
|
|
t.instance();
|
2022-06-25 01:55:54 +02:00
|
|
|
|
2022-07-19 19:07:50 +02:00
|
|
|
t->set_data(data.substr(i, j - i));
|
|
|
|
t->set_type(HTMLParserTag::HTML_PARSER_TAG_TYPE_CONTENT);
|
2022-06-25 01:55:54 +02:00
|
|
|
|
|
|
|
tags.push_back(t);
|
|
|
|
|
|
|
|
i = j - 1;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2022-07-19 19:07:50 +02:00
|
|
|
_root.instance();
|
2022-06-25 01:55:54 +02:00
|
|
|
|
|
|
|
// process tags into hierarchical order
|
2022-07-19 13:58:26 +02:00
|
|
|
Vector<Ref<HTMLParserTag>> tag_stack;
|
2022-06-25 01:55:54 +02:00
|
|
|
for (int i = 0; i < tags.size(); ++i) {
|
2022-07-19 13:58:26 +02:00
|
|
|
Ref<HTMLParserTag> t = tags[i];
|
2022-06-25 01:55:54 +02:00
|
|
|
|
2022-07-19 14:26:40 +02:00
|
|
|
ERR_CONTINUE(!t.is_valid());
|
2022-06-25 01:55:54 +02:00
|
|
|
|
2022-07-19 19:07:50 +02:00
|
|
|
if (t->get_type() == HTMLParserTag::HTML_PARSER_TAG_TYPE_NONE) {
|
2022-07-03 18:13:41 +02:00
|
|
|
ERR_PRINT("HTMLParser::parse: t->type == HTMLParserTag::HTML_PARSER_TAG_TYPE_NONE!");
|
2022-07-19 13:58:26 +02:00
|
|
|
//memdelete(t);
|
|
|
|
tags.write[i].unref();
|
2022-06-25 01:55:54 +02:00
|
|
|
continue;
|
2022-07-19 19:07:50 +02:00
|
|
|
} else if (t->get_type() == HTMLParserTag::HTML_PARSER_TAG_TYPE_OPENING_TAG) {
|
2022-06-25 01:55:54 +02:00
|
|
|
tag_stack.push_back(t);
|
|
|
|
|
2022-07-19 13:58:26 +02:00
|
|
|
tags.write[i].unref();
|
2022-06-25 01:55:54 +02:00
|
|
|
continue;
|
2022-07-19 19:07:50 +02:00
|
|
|
} else if (t->get_type() == HTMLParserTag::HTML_PARSER_TAG_TYPE_SELF_CLOSING_TAG) {
|
2022-06-25 01:55:54 +02:00
|
|
|
if (tag_stack.size() == 0) {
|
2022-07-19 19:07:50 +02:00
|
|
|
_root->add_child_tag(t);
|
2022-06-25 01:55:54 +02:00
|
|
|
} else {
|
2022-07-19 19:07:50 +02:00
|
|
|
tag_stack.write[tag_stack.size() - 1]->add_child_tag(t);
|
2022-06-25 01:55:54 +02:00
|
|
|
}
|
|
|
|
|
2022-07-19 13:58:26 +02:00
|
|
|
tags.write[i].unref();
|
2022-06-25 01:55:54 +02:00
|
|
|
continue;
|
2022-07-19 19:07:50 +02:00
|
|
|
} else if (t->get_type() == HTMLParserTag::HTML_PARSER_TAG_TYPE_CONTENT) {
|
2022-06-25 01:55:54 +02:00
|
|
|
if (tag_stack.size() == 0) {
|
2022-07-19 19:07:50 +02:00
|
|
|
_root->add_child_tag(t);
|
2022-06-25 01:55:54 +02:00
|
|
|
} else {
|
2022-07-19 19:07:50 +02:00
|
|
|
tag_stack.write[tag_stack.size() - 1]->add_child_tag(t);
|
2022-06-25 01:55:54 +02:00
|
|
|
}
|
|
|
|
|
2022-07-19 13:58:26 +02:00
|
|
|
tags.write[i].unref();
|
2022-06-25 01:55:54 +02:00
|
|
|
continue;
|
2022-07-19 19:07:50 +02:00
|
|
|
} else if (t->get_type() == HTMLParserTag::HTML_PARSER_TAG_TYPE_COMMENT) {
|
2022-06-25 01:55:54 +02:00
|
|
|
if (tag_stack.size() == 0) {
|
2022-07-19 19:07:50 +02:00
|
|
|
_root->add_child_tag(t);
|
2022-06-25 01:55:54 +02:00
|
|
|
} else {
|
2022-07-19 19:07:50 +02:00
|
|
|
tag_stack.write[tag_stack.size() - 1]->add_child_tag(t);
|
2022-06-25 01:55:54 +02:00
|
|
|
}
|
|
|
|
|
2022-07-19 13:58:26 +02:00
|
|
|
tags.write[i].unref();
|
2022-06-25 01:55:54 +02:00
|
|
|
continue;
|
2022-07-19 19:07:50 +02:00
|
|
|
} else if (t->get_type() == HTMLParserTag::HTML_PARSER_TAG_TYPE_DOCTYPE) {
|
2022-06-25 01:55:54 +02:00
|
|
|
if (tag_stack.size() == 0) {
|
2022-07-19 19:07:50 +02:00
|
|
|
_root->add_child_tag(t);
|
2022-06-25 01:55:54 +02:00
|
|
|
} else {
|
2022-07-19 19:07:50 +02:00
|
|
|
tag_stack.write[tag_stack.size() - 1]->add_child_tag(t);
|
2022-06-25 01:55:54 +02:00
|
|
|
}
|
|
|
|
|
2022-07-19 13:58:26 +02:00
|
|
|
tags.write[i].unref();
|
2022-06-25 01:55:54 +02:00
|
|
|
continue;
|
2022-07-19 19:07:50 +02:00
|
|
|
} else if (t->get_type() == HTMLParserTag::HTML_PARSER_TAG_TYPE_CLOSING_TAG) {
|
2022-06-25 01:55:54 +02:00
|
|
|
if (tag_stack.size() == 0) {
|
2022-07-19 13:58:26 +02:00
|
|
|
tags.write[i].unref();
|
2022-06-25 01:55:54 +02:00
|
|
|
|
|
|
|
// ill-formed html
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
// find its pair
|
|
|
|
int tag_index = 0;
|
|
|
|
for (int j = tag_stack.size() - 1; j > 0; --j) {
|
2022-07-19 13:58:26 +02:00
|
|
|
Ref<HTMLParserTag> ts = tag_stack[j];
|
2022-06-25 01:55:54 +02:00
|
|
|
|
|
|
|
// we should only have opening tags on the stack
|
2022-07-19 19:07:50 +02:00
|
|
|
if (ts->get_tag() == t->get_tag()) {
|
2022-06-25 01:55:54 +02:00
|
|
|
// found
|
|
|
|
tag_index = j;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2022-07-19 13:58:26 +02:00
|
|
|
Ref<HTMLParserTag> opening_tag = tag_stack[tag_index];
|
2022-06-25 01:55:54 +02:00
|
|
|
|
|
|
|
// mark everything else that we found before finding the opening tag as self closing, and add them to our opening tag
|
|
|
|
// If the html is ill formed, it just grabs everything from the tag stack
|
|
|
|
for (int j = tag_index + 1; j < tag_stack.size(); ++j) {
|
2022-07-19 13:58:26 +02:00
|
|
|
Ref<HTMLParserTag> ts = tag_stack[j];
|
2022-06-25 01:55:54 +02:00
|
|
|
|
2022-07-19 19:07:50 +02:00
|
|
|
ts->set_type(HTMLParserTag::HTML_PARSER_TAG_TYPE_SELF_CLOSING_TAG);
|
|
|
|
opening_tag->add_child_tag(ts);
|
2022-06-25 01:55:54 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
tag_stack.resize(tag_index);
|
|
|
|
|
|
|
|
if (tag_stack.size() == 0) {
|
2022-07-19 19:07:50 +02:00
|
|
|
_root->add_child_tag(opening_tag);
|
2022-06-25 01:55:54 +02:00
|
|
|
} else {
|
2022-07-19 19:07:50 +02:00
|
|
|
tag_stack.write[tag_stack.size() - 1]->add_child_tag(opening_tag);
|
2022-06-25 01:55:54 +02:00
|
|
|
}
|
|
|
|
|
2022-07-19 13:58:26 +02:00
|
|
|
tags.write[i].unref();
|
2022-06-25 01:55:54 +02:00
|
|
|
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2022-07-19 19:07:50 +02:00
|
|
|
// add everything remaining on the stack to _root
|
2022-06-25 01:55:54 +02:00
|
|
|
for (int i = 0; i < tag_stack.size(); ++i) {
|
2022-07-19 19:07:50 +02:00
|
|
|
_root->add_child_tag(tag_stack[i]);
|
2022-06-25 01:55:54 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
for (int i = 0; i < tags.size(); ++i) {
|
2022-07-19 13:58:26 +02:00
|
|
|
Ref<HTMLParserTag> t = tags[i];
|
2022-06-25 01:55:54 +02:00
|
|
|
|
2022-07-19 13:58:26 +02:00
|
|
|
if (t.is_valid()) {
|
2022-07-03 18:13:41 +02:00
|
|
|
ERR_PRINT("HTMLParser::parse(const String &data): tag was not processed!\n");
|
2022-06-25 01:55:54 +02:00
|
|
|
t->print();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2022-07-19 13:58:26 +02:00
|
|
|
// Serialises the parsed document by delegating to the root tag.
// Yields an empty string when the root reference is not valid.
String HTMLParser::convert_to_string() const {
	if (_root.is_valid()) {
		return _root->convert_to_string();
	}

	return "";
}
|
2022-07-19 13:58:26 +02:00
|
|
|
void HTMLParser::print() const {
|
2022-07-19 19:07:50 +02:00
|
|
|
if (_root.is_valid()) {
|
|
|
|
_root->print();
|
2022-06-25 01:55:54 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Default constructor: the body performs no explicit initialisation.
HTMLParser::HTMLParser() {
}
|
|
|
|
|
|
|
|
// Destructor: drops this parser's reference to the root tag.
HTMLParser::~HTMLParser() {
	_root.unref();
}
|
2022-07-19 20:21:28 +02:00
|
|
|
|
|
|
|
// Registers HTMLParser's methods with ClassDB under the names
// "get_root", "parse" (single argument named "data"),
// "convert_to_string" and "print".
void HTMLParser::_bind_methods() {
	ClassDB::bind_method(D_METHOD("get_root"), &HTMLParser::get_root);
	ClassDB::bind_method(D_METHOD("parse", "data"), &HTMLParser::parse);
	ClassDB::bind_method(D_METHOD("convert_to_string"), &HTMLParser::convert_to_string);
	ClassDB::bind_method(D_METHOD("print"), &HTMLParser::print);
}
|