Added a new HTMLParser class. Still WIP.

2025-04-20 01:43:12 +02:00 · 2021-11-18 00:59:13 +01:00 · 2021-11-18 00:59:13 +01:00 · 1fcc47b353
commit 1fcc47b353
parent fd7bcb43cb
4 changed files with 276 additions and 2 deletions
--- a/core/html/html_parser.cpp
+++ b/core/html/html_parser.cpp
@ -0,0 +1,220 @@
+#include "html_parser.h"
+
+HTMLParserAttribute::HTMLParserAttribute() { // Nothing to set up here; the body is intentionally empty.
+}
+
+HTMLParserAttribute::~HTMLParserAttribute() { // No explicit cleanup is performed.
+}
+
+// Classifies this node from its raw text in `data` and fills in `type`, `tag`
+// and, for opening / self closing tags, `attributes`.
+//
+// Nothing happens when `type` was already set (e.g. content nodes) or when
+// `data` holds fewer than 2 characters. `data` has to start with '<' and end
+// with '>'; anything else is reported through ERR_FAIL_COND.
+//
+// What ends up in `tag`:
+//  - closing tag      </name>        -> "name"
+//  - comment          <!--text-->    -> "text" (surrounding spaces are kept)
+//  - doctype          <!doctype x>   -> "x"
+//  - opening / self closing tag      -> the tag name without arguments
+//
+// NOTE(review): the doctype search is case sensitive, so "<!DOCTYPE html>" is
+// not recognized and keeps HTML_PARSER_TAG_TYPE_NONE -- confirm whether that
+// is intended.
+void HTMLParserTag::process() {
+	if (type != HTMLParserTag::HTML_PARSER_TAG_TYPE_NONE) {
+		return;
+	}
+
+	const int data_size = data.size();
+
+	if (data_size < 2) {
+		return;
+	}
+
+	ERR_FAIL_COND(data[0] != '<');
+	ERR_FAIL_COND(data[data_size - 1] != '>');
+
+	if (data[1] == '/') {
+		type = HTMLParserTag::HTML_PARSER_TAG_TYPE_CLOSING_TAG;
+
+		//the name starts after "</" and runs until the first space, or until the final '>'
+		int name_end_index = data.find(' ', 2);
+
+		if (name_end_index == -1) {
+			name_end_index = data_size - 1;
+		}
+
+		if (name_end_index > 2) {
+			tag = data.substr(2, name_end_index - 2);
+		}
+
+		return;
+	}
+
+	if (data[1] == '!') {
+		if (data_size < 8) {
+			return;
+		}
+
+		//test for comment. <!-- -->
+		if (data[2] == '-' && data[3] == '-') {
+			type = HTMLParserTag::HTML_PARSER_TAG_TYPE_COMMENT;
+
+			//strip the whole "-->" when it is present, else only the final '>'
+			int trailer_size = 1;
+
+			if (data[data_size - 2] == '-' && data[data_size - 3] == '-') {
+				trailer_size = 3;
+			}
+
+			//4 == length of "<!--"
+			const int comment_size = data_size - 4 - trailer_size;
+
+			if (comment_size > 0) {
+				tag = data.substr(4, comment_size);
+			}
+
+			//a comment must never be re-classified by the doctype test below
+			return;
+		}
+
+		if (data_size < 11) {
+			return;
+		}
+
+		//test for doctype. <!doctype >
+		const int doctype_start_index = data.find("doctype ", 2);
+
+		if (doctype_start_index == -1) {
+			return;
+		}
+
+		type = HTMLParserTag::HTML_PARSER_TAG_TYPE_DOCTYPE;
+
+		//8 == length of "doctype ", the trailing -1 drops the final '>'
+		const int value_start_index = doctype_start_index + 8;
+		const int value_size = data_size - value_start_index - 1;
+
+		if (value_size > 0) {
+			tag = data.substr(value_start_index, value_size);
+		}
+
+		return;
+	}
+
+	String tag_text;
+
+	if (data[data_size - 2] == '/') {
+		//will catch all that looks like <br/>
+		//tags that look like <br> will be caught later in a post process, in a way
+		//which also tries to catch erroneously not closed tags that supposed to be closed
+		type = HTMLParserTag::HTML_PARSER_TAG_TYPE_SELF_CLOSING_TAG;
+
+		//drop the leading '<' and the trailing "/>"
+		tag_text = data.substr(1, data_size - 3);
+	} else {
+		type = HTMLParserTag::HTML_PARSER_TAG_TYPE_OPENING_TAG;
+
+		//drop the leading '<' and the trailing '>'
+		tag_text = data.substr(1, data_size - 2);
+	}
+
+	const int fspc_index = tag_text.find(' ');
+
+	if (fspc_index == -1) {
+		//no args
+		tag = tag_text;
+		return;
+	}
+
+	//grab the tag itself, without the space that separates it from the args
+	if (fspc_index > 0) {
+		tag = tag_text.substr(0, fspc_index);
+	}
+
+	const int args_start_index = fspc_index + 1;
+	const int args_size = tag_text.size() - args_start_index;
+
+	//tags like "<br />" have a space, but nothing after it
+	if (args_size > 0) {
+		String args = tag_text.substr(args_start_index, args_size);
+		parse_args(args);
+	}
+}
+
+// Splits the argument part of a tag (everything after the tag name) into
+// name / value pairs and stores them in `attributes`, replacing whatever was
+// there before.
+//
+// Handled forms:
+//   name="value"   name='value'   name=value   name
+// Spaces around '=' are skipped. A value whose closing quote is missing runs
+// to the end of `args`. An attribute without a value is stored with an empty
+// `data`.
+//
+// args: the text between the tag name and the closing '>' (or "/>").
+void HTMLParserTag::parse_args(const String &args) {
+	attributes.clear();
+
+	const int args_size = args.size();
+
+	int i = 0;
+	while (i < args_size) {
+		if (args[i] == ' ') {
+			//"trim"
+			++i;
+			continue;
+		}
+
+		//the name ends at the first space or '='
+		int name_end_index = i;
+		while (name_end_index < args_size && args[name_end_index] != ' ' && args[name_end_index] != '=') {
+			++name_end_index;
+		}
+
+		HTMLParserAttribute a;
+
+		//the name can only be empty when a stray '=' is found
+		if (name_end_index > i) {
+			a.attribute = args.substr(i, name_end_index - i);
+		}
+
+		//skip spaces between the name and a possible '='
+		int cursor = name_end_index;
+		while (cursor < args_size && args[cursor] == ' ') {
+			++cursor;
+		}
+
+		if (cursor >= args_size || args[cursor] != '=') {
+			//attribute without a value, like "disabled"
+			attributes.push_back(a);
+
+			i = cursor;
+			continue;
+		}
+
+		//step over the '='
+		++cursor;
+
+		//skip spaces
+		while (cursor < args_size && args[cursor] == ' ') {
+			++cursor;
+		}
+
+		if (cursor >= args_size) {
+			//an attribute looks like this "attrib=     "
+			attributes.push_back(a);
+			return;
+		}
+
+		//quoted values end at the matching quote, unquoted ones at the next space
+		char find_char = ' ';
+		const char c = args[cursor];
+
+		if (c == '"' || c == '\'') {
+			++cursor;
+			find_char = c;
+		}
+
+		//if find_char is never found (missing closing ' or "), the value runs to the end
+		int end_index = cursor;
+		while (end_index < args_size && args[end_index] != find_char) {
+			++end_index;
+		}
+
+		//leave data empty for values like ""
+		if (end_index > cursor) {
+			a.data = args.substr(cursor, end_index - cursor);
+		}
+
+		attributes.push_back(a);
+
+		//continue after the closing quote / separating space
+		i = end_index + 1;
+	}
+}
+
+// New nodes start out unclassified; process() only does its work while the
+// type is still HTML_PARSER_TAG_TYPE_NONE.
+HTMLParserTag::HTMLParserTag() :
+		type(HTMLParserTag::HTML_PARSER_TAG_TYPE_NONE) {
+}
+
+HTMLParserTag::~HTMLParserTag() { // No explicit cleanup is performed.
+}
+
+// Splits `data` into a flat list of nodes and classifies each of them.
+//
+// Everything from a '<' up to and including the next '>' becomes a tag node
+// (classified afterwards by HTMLParserTag::process()). The text between tags
+// becomes a HTML_PARSER_TAG_TYPE_CONTENT node, including text that follows
+// the last tag. A '<' that is never closed by a '>' is kept as content
+// together with the rest of the input.
+//
+// NOTE(review): the resulting list is a local and is discarded on return;
+// nothing is written to the `html` member yet (the class is still WIP).
+//
+// data: the HTML source text to split.
+void HTMLParser::parse(const String &data) {
+	Vector<HTMLParserTag> tags;
+
+	const int data_size = data.size();
+
+	int i = 0;
+	while (i < data_size) {
+		if (data[i] == '<') {
+			int close_index = i + 1;
+			while (close_index < data_size && data[close_index] != '>') {
+				++close_index;
+			}
+
+			if (close_index < data_size) {
+				HTMLParserTag t;
+
+				t.data = data.substr(i, close_index - i + 1);
+
+				tags.push_back(t);
+
+				i = close_index + 1;
+				continue;
+			}
+
+			//unterminated tag: nothing after this can be a tag, keep the rest as content
+			HTMLParserTag t;
+
+			t.data = data.substr(i, data_size - i);
+			t.type = HTMLParserTag::HTML_PARSER_TAG_TYPE_CONTENT;
+
+			tags.push_back(t);
+
+			break;
+		}
+
+		//content runs until the next '<', or until the end of the input
+		int next_tag_index = i + 1;
+		while (next_tag_index < data_size && data[next_tag_index] != '<') {
+			++next_tag_index;
+		}
+
+		HTMLParserTag t;
+
+		t.data = data.substr(i, next_tag_index - i);
+		t.type = HTMLParserTag::HTML_PARSER_TAG_TYPE_CONTENT;
+
+		tags.push_back(t);
+
+		i = next_tag_index;
+	}
+
+	for (int j = 0; j < tags.size(); ++j) {
+		tags[j].process();
+	}
+}
+
+HTMLParser::HTMLParser() { // Nothing to set up here; the body is intentionally empty.
+}
+
+HTMLParser::~HTMLParser() { // No explicit cleanup is performed.
+}
--- a/core/html/html_parser.h
+++ b/core/html/html_parser.h
@ -0,0 +1,54 @@
+// Include guard named after this file (html_parser.h). The previous name,
+// HTML_BUILDER_H, did not match the file and would silently hide one of the
+// two headers if another header uses that same guard.
+#ifndef HTML_PARSER_H
+#define HTML_PARSER_H
+
+#include "core/string.h"
+#include "core/containers/vector.h"
+
+class HTMLParserAttribute { // One attribute of a tag, stored as a pair of strings.
+public:
+	String attribute; // Attribute name, filled in by HTMLParserTag::parse_args().
+	String data; // Attribute value; parse_args() leaves it untouched when no '=' is found.
+
+	HTMLParserAttribute();
+	virtual ~HTMLParserAttribute();
+};
+
+class HTMLParserTag { // One node produced by HTMLParser::parse(): either a tag or the text between tags.
+public:
+	enum HTMLParserTagType { // Node classification stored in `type`.
+		HTML_PARSER_TAG_TYPE_NONE = 0, // Not classified yet; process() only runs in this state.
+		HTML_PARSER_TAG_TYPE_OPENING_TAG, // Set by process() for tags that match none of the other forms.
+		HTML_PARSER_TAG_TYPE_CLOSING_TAG, // Set by process() when the text starts with "</".
+		HTML_PARSER_TAG_TYPE_SELF_CLOSING_TAG, // Set by process() when the text ends with "/>".
+		HTML_PARSER_TAG_TYPE_COMMENT, // Set by process() when the text starts with "<!--".
+		HTML_PARSER_TAG_TYPE_DOCTYPE, // Set by process() when "doctype " is found after "<!".
+		HTML_PARSER_TAG_TYPE_CONTENT // Set by HTMLParser::parse() for text found between tags.
+	};
+
+	int type; // One of the HTMLParserTagType values; the constructor sets HTML_PARSER_TAG_TYPE_NONE.
+
+	String tag; // Written by process(): the tag name, or the extracted text for comments / doctypes.
+	String data; // Raw source text of this node, assigned by HTMLParser::parse().
+
+	Vector<HTMLParserTag> tags; // Presumably the child nodes; not filled in by the code in this commit -- TODO confirm.
+	Vector<HTMLParserAttribute> attributes; // Cleared and refilled by parse_args().
+
+	void process(); // Classifies the node from `data`; returns early if `type` is already set.
+	void parse_args(const String& args); // Parses the attribute text of a tag into `attributes`.
+
+	HTMLParserTag();
+	virtual ~HTMLParserTag();
+};
+
+class HTMLParser { // Entry point for splitting HTML text into HTMLParserTag nodes (work in progress).
+public:
+	HTMLParserTag html; // Presumably the root of the parsed document; parse() does not write to it yet -- TODO confirm.
+
+	void parse(const String &data); // Splits `data` into tag / content nodes and calls process() on each.
+	//void parse_tag(const String &data, const int index);
+
+	HTMLParser();
+	virtual ~HTMLParser();
+};
+
+#endif
--- a/core/string.cpp
+++ b/core/string.cpp
@ -166,7 +166,7 @@ void String::get_substr_nt(char *into_buf, const int start_index, const int len)
 	into_buf[len + 1] = '\0';
 }

-String String::substr(const int start_index, const int len) {
+String String::substr(const int start_index, const int len) const {
 	ERR_FAIL_INDEX_V(start_index, _size, String());

 	int sil = start_index + len;
--- a/core/string.h
+++ b/core/string.h
@ -26,7 +26,7 @@ public:
 	int find(const String &val, const int from = 0) const;
 	void get_substr(char *into_buf, const int start_index, const int len);
 	void get_substr_nt(char *into_buf, const int start_index, const int len);
-	String substr(const int start_index, const int len);
+	String substr(const int start_index, const int len) const;

 	void replace_from(const int start_index, const int length, const String &with);
 	void replace(const String &find_str, const String &with);