rcpp_framework/core/html/html_parser.cpp

#include "html_parser.h"

//Serializes this attribute back into HTML source form.
//
//Valueless ("single") attributes are returned as just their name. Everything
//else becomes name="value"; when the value itself contains a double quote the
//value is wrapped in single quotes instead, so the quoting stays balanced.
String HTMLParserAttribute::to_string() {
	if (single) {
		return attribute;
	}

	//find() reports "not found" as -1 (the convention used throughout this
	//file), so the comparison has to be applied to its result. The previous
	//code compared inside the argument list, which only switched to single
	//quotes when the double quote happened to sit at index 0.
	if (data.find('"') == -1) {
		return attribute + "=\"" + data + "\"";
	}

	return attribute + "=\'" + data + "\'";
}

void HTMLParserAttribute::print() {
	to_string().print();
}

//A freshly created attribute is a regular name/value pair; `single` is only
//switched on for attributes that were parsed without a value.
HTMLParserAttribute::HTMLParserAttribute() :
		single(false) {
}

//No explicit cleanup is performed for an attribute.
HTMLParserAttribute::~HTMLParserAttribute() {}

//Classifies the raw tag text stored in `data` (expected to look like "<...>")
//and fills in `type`, `tag` and - for opening / self closing tags - the
//attribute list (through parse_args()).
//
//Nothing happens when the tag has already been classified (its type is not
//HTML_PARSER_TAG_TYPE_NONE anymore) or when `data` is shorter than 2
//characters. "<!...>" constructs that are neither a comment nor a doctype
//keep the type they had on entry.
//
//What ends up in `tag`:
// - closing tag: the name between "</" and the first space (or the '>')
// - comment:     the text between "<!--" and the closing "-->"
// - doctype:     the text after "doctype " up to the closing '>'
// - opening / self closing tag: the name up to the first space
void HTMLParserTag::process() {
	if (type != HTMLParserTag::HTML_PARSER_TAG_TYPE_NONE) {
		return;
	}

	const int data_size = data.size();

	if (data_size < 2) {
		return;
	}

	ERR_FAIL_COND(data[0] != '<');
	ERR_FAIL_COND(data[data_size - 1] != '>');

	if (data[1] == '/') {
		type = HTMLParserTag::HTML_PARSER_TAG_TYPE_CLOSING_TAG;

		//The name starts right after "</". It ends at the first space if there
		//is one (so "</div >" still yields "div"), else right before the '>'.
		const int name_start = 2;
		int name_end = data.find(' ', name_start);

		if (name_end == -1) {
			name_end = data_size - 1;
		}

		tag = data.substr(name_start, name_end - name_start);
		return;
	}

	if (data[1] == '!') {
		if (data_size < 8) {
			return;
		}

		//test for comment. <!-- -->
		if (data[2] == '-' && data[3] == '-') {
			type = HTMLParserTag::HTML_PARSER_TAG_TYPE_COMMENT;

			//Strip the trailing "-->" when it is really there, otherwise only
			//the '>' (the tag may have been cut at a '>' inside the comment).
			//data_size >= 8 guarantees these indices sit after the opening "<!--".
			int tail_size = 1;

			if (data[data_size - 2] == '-' && data[data_size - 3] == '-') {
				tail_size = 3;
			}

			//4 == length of "<!--"
			tag = data.substr(4, data_size - 4 - tail_size);

			//a comment is never a doctype, even if it contains the word
			return;
		}

		if (data_size < 11) {
			return;
		}

		//test for doctype. <!doctype >
		//The keyword is commonly written in upper case as well, so try both.
		//NOTE(review): mixed case spellings ("Doctype") are still not recognized.
		int doctype_start_index = data.find("doctype ", 2);

		if (doctype_start_index == -1) {
			doctype_start_index = data.find("DOCTYPE ", 2);
		}

		if (doctype_start_index == -1) {
			return;
		}

		type = HTMLParserTag::HTML_PARSER_TAG_TYPE_DOCTYPE;

		//8 == length of "doctype ", the final -1 drops the closing '>'
		const int value_start = doctype_start_index + 8;

		tag = data.substr(value_start, data_size - value_start - 1);
		return;
	}

	String tag_text;

	if (data[data_size - 2] == '/') {
		//will catch all that looks like <br/>
		//tags that look like <br> will be caught later in a post process, in a way
		//which also tries to catch erroneously not closed tags that supposed to be closed
		type = HTMLParserTag::HTML_PARSER_TAG_TYPE_SELF_CLOSING_TAG;

		//drop the leading '<' and the trailing "/>"
		tag_text = data.substr(1, data_size - 3);
	} else {
		type = HTMLParserTag::HTML_PARSER_TAG_TYPE_OPENING_TAG;

		//drop the leading '<' and the trailing '>'
		tag_text = data.substr(1, data_size - 2);
	}

	int fspc_index = tag_text.find(' ');

	if (fspc_index == -1) {
		//no args
		tag = tag_text;
		return;
	}

	//grab the tag itself
	tag = tag_text.substr(0, fspc_index);

	//everything after the first space is the attribute list
	String args = tag_text.substr(fspc_index + 1, tag_text.size() - fspc_index - 1);
	parse_args(args);
}

//Parses the attribute part of a tag (everything after the tag name, without
//the closing '>' or "/>") into `attributes`, replacing whatever was stored
//there before.
//
//Recognized forms, separated by spaces:
// - name            -> valueless attribute (single == true)
// - name=value      -> value runs up to the next space
// - name="value"    -> value runs up to the matching quote
// - name='value'    -> value runs up to the matching quote
//Spaces around the '=' are tolerated. A value whose closing quote is missing
//takes the rest of the input. "name=" with nothing after it produces an
//attribute with empty data.
void HTMLParserTag::parse_args(const String &args) {
	//The attributes are allocated with new below, so free the old ones before
	//dropping the pointers.
	for (int j = 0; j < attributes.size(); ++j) {
		delete attributes[j];
	}

	attributes.clear();

	const int args_size = args.size();

	int i = 0;
	while (i < args_size) {
		if (args[i] == ' ') {
			//"trim"
			++i;
			continue;
		}

		//The name ends at the first space or '='. Looking only for '=' would
		//glue a valueless attribute to the one following it
		//("disabled class=..." -> "disabled class").
		int name_end = i;
		while (name_end < args_size && args[name_end] != ' ' && args[name_end] != '=') {
			++name_end;
		}

		HTMLParserAttribute *a = new HTMLParserAttribute();
		a->attribute = args.substr(i, name_end - i);

		//skip spaces between the name and a possible '='
		int equals_index = name_end;
		while (equals_index < args_size && args[equals_index] == ' ') {
			++equals_index;
		}

		if (equals_index >= args_size || args[equals_index] != '=') {
			//no value follows, carry on with whatever comes next
			a->single = true;
			attributes.push_back(a);

			i = equals_index;
			continue;
		}

		//skip spaces between the '=' and the value
		int next_char_index = equals_index + 1;
		while (next_char_index < args_size && args[next_char_index] == ' ') {
			++next_char_index;
		}

		if (next_char_index >= args_size) {
			//an attribute looks like this "... attrib=" or "... attrib=     "
			attributes.push_back(a);
			return;
		}

		char c = args[next_char_index];
		char find_char = ' ';

		if (c == '"' || c == '\'') {
			//quoted value: starts after the quote, ends at the matching one
			++next_char_index;
			find_char = c;

			if (next_char_index >= args_size) {
				//the opening quote was the last character
				attributes.push_back(a);
				return;
			}
		}

		int end_index = args.find(find_char, next_char_index);

		if (end_index == -1) {
			//missing closing ' or " if c is ' or "
			//else the value is the last thing in the input
			//either way take everything that is left (all of it, including
			//the last character)
			a->data = args.substr(next_char_index, args_size - next_char_index);
			attributes.push_back(a);
			return;
		}

		a->data = args.substr(next_char_index, end_index - next_char_index);
		attributes.push_back(a);

		i = end_index + 1;
	}
}

//Serializes this tag, followed by all of its child tags, back into HTML text.
//
// - content:                returned as stored in `data`
// - opening / self closing: rebuilt from `tag` and `attributes`
// - closing:                rebuilt from `tag`
// - comment / doctype:      returned as stored in `data`
//Tags of any other type contribute nothing themselves (their children are
//still appended).
String HTMLParserTag::to_string() {
	String s;

	const bool is_opening = type == HTML_PARSER_TAG_TYPE_OPENING_TAG;
	const bool is_self_closing = type == HTML_PARSER_TAG_TYPE_SELF_CLOSING_TAG;

	if (type == HTML_PARSER_TAG_TYPE_CONTENT) {
		s = data;
	} else if (is_opening || is_self_closing) {
		s = "<" + tag;

		for (int i = 0; i < attributes.size(); ++i) {
			s += " " + attributes[i]->to_string();
		}

		if (is_self_closing) {
			s += "/>";
		} else {
			s += ">";
		}
	} else if (type == HTML_PARSER_TAG_TYPE_CLOSING_TAG) {
		s = "</" + tag + ">";
	} else if (type == HTML_PARSER_TAG_TYPE_COMMENT || type == HTML_PARSER_TAG_TYPE_DOCTYPE) {
		//These types are only assigned by process(), which requires `data` to
		//be the complete "<...>" text. Wrapping it in "<!-- -->" / "<!doctype >"
		//again would duplicate the delimiters, so emit it unchanged.
		s = data;
	}

	for (int i = 0; i < tags.size(); ++i) {
		s += tags[i]->to_string();
	}

	return s;
}
void HTMLParserTag::print() {
	to_string().print();
}

//A new tag starts out unclassified; process() only does its work while the
//type is still HTML_PARSER_TAG_TYPE_NONE.
HTMLParserTag::HTMLParserTag() :
		type(HTMLParserTag::HTML_PARSER_TAG_TYPE_NONE) {
}

//Frees the attributes of this tag. They are allocated with new in
//parse_args() and nothing else in this file releases them.
//
//NOTE(review): the child pointers in `tags` are left alone on purpose - this
//file never fills that list, so who owns them cannot be told from here.
//Confirm and free them here too once the hierarchy building is in place.
HTMLParserTag::~HTMLParserTag() {
	for (int i = 0; i < attributes.size(); ++i) {
		delete attributes[i];
	}

	attributes.clear();
}

//Splits `data` into a flat list of markup tags ("<...>") and the text runs
//between them.
//
//Every markup tag is classified with HTMLParserTag::process() and printed.
//Text runs become HTML_PARSER_TAG_TYPE_CONTENT tags. That includes the text
//after the last tag, and a trailing '<' that is never closed by a '>' (the
//rest of the input is then kept as text instead of being dropped).
//
//NOTE: still work in progress - the hierarchy is not built yet, so all tags
//created here are deleted again before returning and `html` is not touched.
void HTMLParser::parse(const String &data) {
	Vector<HTMLParserTag *> tags;

	const int data_size = data.size();

	//split into tags
	int i = 0;
	while (i < data_size) {
		if (data[i] == '<') {
			//look for the '>' that ends this tag
			int close_index = i + 1;
			while (close_index < data_size && data[close_index] != '>') {
				++close_index;
			}

			if (close_index < data_size) {
				HTMLParserTag *t = new HTMLParserTag();

				t->data = data.substr(i, close_index - i + 1);
				t->process();
				t->print();

				tags.push_back(t);

				i = close_index + 1;
				continue;
			}

			//no closing '>' anywhere: fall through and keep the rest as text
		}

		//a text run lasts until the next '<' or the end of the input
		//(starting the scan at i + 1 guarantees progress for a stray '<')
		int content_end = i + 1;
		while (content_end < data_size && data[content_end] != '<') {
			++content_end;
		}

		HTMLParserTag *t = new HTMLParserTag();

		t->data = data.substr(i, content_end - i);
		t->type = HTMLParserTag::HTML_PARSER_TAG_TYPE_CONTENT;

		tags.push_back(t);

		i = content_end;
	}

	//process tags into hierarchical order
	//Vector<HTMLParserTag> tag_stack;
	//for (int i = 0; i < tags.size(); ++i) {
	//}

	for (int j = 0; j < tags.size(); ++j) {
		delete tags[j];
	}
}

//Returns the serialized document, or an empty string when there is no root
//tag. The constructor leaves `html` as nullptr, so it has to be checked
//before it is dereferenced.
String HTMLParser::to_string() {
	if (!html) {
		return String();
	}

	return html->to_string();
}
//Prints the serialized document. Does nothing when there is no root tag; the
//constructor leaves `html` as nullptr, so it has to be checked before it is
//dereferenced.
void HTMLParser::print() {
	if (!html) {
		return;
	}

	html->print();
}

//A new parser has no root tag yet.
HTMLParser::HTMLParser() :
		html(nullptr) {
}

//No cleanup is performed here.
//NOTE(review): `html` is never assigned in the code visible in this file;
//once it is, decide whether the parser owns it and has to free it here.
HTMLParser::~HTMLParser() {}
Added a new HTMLParser class. Still WIP. 2021-11-18 00:59:13 +01:00			`#include "html_parser.h"`

Implemented to_string, and print for HTMLParser. also indexing fixes, and changed attributes and tags into pointers. 2021-11-18 12:03:43 +01:00			`String HTMLParserAttribute::to_string() {`
			`if (single) {`
			`return attribute;`
			`}`

			`if (data.find('"' == -1)) {`
			`return attribute + "=\"" + data + "\"";`
			`} else {`
			`return attribute + "=\'" + data + "\'";`
			`}`
			`}`

			`void HTMLParserAttribute::print() {`
			`to_string().print();`
			`}`

Added a new HTMLParser class. Still WIP. 2021-11-18 00:59:13 +01:00			`HTMLParserAttribute::HTMLParserAttribute() {`
Implemented to_string, and print for HTMLParser. also indexing fixes, and changed attributes and tags into pointers. 2021-11-18 12:03:43 +01:00			`single = false;`
Added a new HTMLParser class. Still WIP. 2021-11-18 00:59:13 +01:00			`}`

			`HTMLParserAttribute::~HTMLParserAttribute() {`
			`}`

			`void HTMLParserTag::process() {`
			`if (type != HTMLParserTag::HTML_PARSER_TAG_TYPE_NONE) {`
			`return;`
			`}`

			`if (data.size() < 2) {`
			`return;`
			`}`

			`ERR_FAIL_COND(data[0] != '<');`
			`ERR_FAIL_COND(data[data.size() - 1] != '>');`

			`int start_index = 1;`
			`if (data[1] == '/') {`
			`++start_index;`

			`type = HTMLParserTag::HTML_PARSER_TAG_TYPE_CLOSING_TAG;`
			`} else if (data[1] == '!') {`
			`if (data.size() < 8) {`
			`return;`
			`}`

			`//test for comment. <!-- -->`
			`++start_index;`
			`if (data[2] == '-' && data[3] == '-') {`
			`type = HTMLParserTag::HTML_PARSER_TAG_TYPE_COMMENT;`

			`int comment_start_index = data.find(' ', 3);`

			`if (comment_start_index == -1) {`
			`comment_start_index = 4;`
			`}`

			`tag = data.substr(comment_start_index, comment_start_index - data.size() - 3);`
			`}`

			`if (data.size() < 11) {`
			`return;`
			`}`

			`//test for doctype. <!doctype >`
			`int doctype_start_index = data.find("doctype ", 2);`

			`if (doctype_start_index == -1) {`
			`return;`
			`}`

			`type = HTMLParserTag::HTML_PARSER_TAG_TYPE_DOCTYPE;`

			`tag = data.substr(doctype_start_index + 8, data.size() - doctype_start_index - 8 - 1);`
			`} else {`
			`String tag_text;`

			`if (data[data.size() - 2] == '/') {`
			`//will catch all that looks like <br/>`
			`//tags that look like <br> will be caught later in a post process, in a way`
			`//which also tries to catch erroneously not closed tags that supposed to be closed`
			`type = HTMLParserTag::HTML_PARSER_TAG_TYPE_SELF_CLOSING_TAG;`

			`tag_text = data.substr(1, data.size() - 3);`
			`} else {`
			`type = HTMLParserTag::HTML_PARSER_TAG_TYPE_OPENING_TAG;`

			`tag_text = data.substr(1, data.size() - 2);`
			`}`

			`int fspc_index = tag_text.find(' ');`

			`if (fspc_index == -1) {`
			`//no args`
			`tag = tag_text;`
			`return;`
			`}`

			`//grab the tag itself`
Implemented to_string, and print for HTMLParser. also indexing fixes, and changed attributes and tags into pointers. 2021-11-18 12:03:43 +01:00			`tag = tag_text.substr(0, fspc_index);`
Added a new HTMLParser class. Still WIP. 2021-11-18 00:59:13 +01:00
			`String args = tag_text.substr(fspc_index + 1, tag_text.size() - fspc_index - 1);`
			`parse_args(args);`
			`}`

			`int tag_end_index = data.find(' ', start_index);`

			`if (tag_end_index == -1) {`
			`//simple tag`
			`tag = data.substr(start_index, data.size() - start_index - 1);`
			`return;`
			`}`
			`}`

			`void HTMLParserTag::parse_args(const String &args) {`
			`attributes.clear();`

			`int i = 0;`
			`while (i < args.size()) {`
			`if (args[i] == ' ') {`
			`//"trim"`
			`++i;`
			`continue;`
			`}`

			`int equals_index = args.find('=', i);`

Implemented to_string, and print for HTMLParser. also indexing fixes, and changed attributes and tags into pointers. 2021-11-18 12:03:43 +01:00			`HTMLParserAttribute *a = new HTMLParserAttribute();`
Added a new HTMLParser class. Still WIP. 2021-11-18 00:59:13 +01:00
			`if (equals_index == -1) {`
Implemented to_string, and print for HTMLParser. also indexing fixes, and changed attributes and tags into pointers. 2021-11-18 12:03:43 +01:00			`a->attribute = args.substr(i, args.size() - i);`
			`a->single = true;`
Added a new HTMLParser class. Still WIP. 2021-11-18 00:59:13 +01:00			`attributes.push_back(a);`

			`return;`
			`}`

Implemented to_string, and print for HTMLParser. also indexing fixes, and changed attributes and tags into pointers. 2021-11-18 12:03:43 +01:00			`a->attribute = args.substr(i, equals_index - i);`
Added a new HTMLParser class. Still WIP. 2021-11-18 00:59:13 +01:00
			`//todo`
			`//a.trim();`

A bit more work on fixing the html argument parser's indexing. 2021-11-18 07:18:35 +01:00			`int next_char_index = equals_index + 1;`
Added a new HTMLParser class. Still WIP. 2021-11-18 00:59:13 +01:00
Fixed the parse_args method. 2021-11-18 11:18:05 +01:00			`if (next_char_index >= args.size()) {`
			`//an attribute looks like this "... attrib="`
			`attributes.push_back(a);`
			`return;`
			`}`

Added a new HTMLParser class. Still WIP. 2021-11-18 00:59:13 +01:00			`//skip spaces`
Fixed the parse_args method. 2021-11-18 11:18:05 +01:00			`while (args[next_char_index] == ' ') {`
Added a new HTMLParser class. Still WIP. 2021-11-18 00:59:13 +01:00			`++next_char_index;`

Fixed the parse_args method. 2021-11-18 11:18:05 +01:00			`if (next_char_index >= args.size()) {`
			`//an attribute looks like this "... attrib= "`
Added a new HTMLParser class. Still WIP. 2021-11-18 00:59:13 +01:00			`attributes.push_back(a);`
			`return;`
			`}`
			`}`

Fixed the parse_args method. 2021-11-18 11:18:05 +01:00			`char c = args[next_char_index];`
			`char find_char = ' ';`
Added a new HTMLParser class. Still WIP. 2021-11-18 00:59:13 +01:00
			`if (c == '"' \|\| c == '\'') {`
			`++next_char_index;`
			`find_char = c;`
			`}`

			`int end_index = args.find(find_char, next_char_index);`

			`if (end_index == -1) {`
			`//missing closing ' or " if c is ' or "`
			`//else missing parameter`

Implemented to_string, and print for HTMLParser. also indexing fixes, and changed attributes and tags into pointers. 2021-11-18 12:03:43 +01:00			`a->data = args.substr(next_char_index, args.size() - next_char_index - 1);`
Added a new HTMLParser class. Still WIP. 2021-11-18 00:59:13 +01:00			`attributes.push_back(a);`
			`return;`
			`}`

Implemented to_string, and print for HTMLParser. also indexing fixes, and changed attributes and tags into pointers. 2021-11-18 12:03:43 +01:00			`a->data = args.substr(next_char_index, end_index - next_char_index);`
Added a new HTMLParser class. Still WIP. 2021-11-18 00:59:13 +01:00			`attributes.push_back(a);`

			`i = end_index + 1;`
			`}`
			`}`

Implemented to_string, and print for HTMLParser. also indexing fixes, and changed attributes and tags into pointers. 2021-11-18 12:03:43 +01:00			`String HTMLParserTag::to_string() {`
			`String s;`

			`if (type == HTML_PARSER_TAG_TYPE_CONTENT) {`
			`s = data;`
			`} else if (type == HTML_PARSER_TAG_TYPE_OPENING_TAG) {`
			`s = "<" + tag;`

			`for (int i = 0; i < attributes.size(); ++i) {`
			`s += " " + attributes[i]->to_string();`
			`}`

			`s += ">";`
			`} else if (type == HTML_PARSER_TAG_TYPE_CLOSING_TAG) {`
			`s = "</" + tag + ">";`
			`} else if (type == HTML_PARSER_TAG_TYPE_SELF_CLOSING_TAG) {`
			`s = "<" + tag;`

			`for (int i = 0; i < attributes.size(); ++i) {`
			`s += " " + attributes[i]->to_string();`
			`}`

			`s += "/>";`
			`} else if (type == HTML_PARSER_TAG_TYPE_COMMENT) {`
			`s = "<!-- " + data + " -->";`
			`} else if (type == HTML_PARSER_TAG_TYPE_DOCTYPE) {`
			`s = "<!doctype " + data + ">";`
			`}`

			`for (int i = 0; i < tags.size(); ++i) {`
			`s += tags[i]->to_string();`
			`}`

			`return s;`
			`}`
			`void HTMLParserTag::print() {`
			`to_string().print();`
			`}`

Added a new HTMLParser class. Still WIP. 2021-11-18 00:59:13 +01:00			`HTMLParserTag::HTMLParserTag() {`
			`type = HTMLParserTag::HTML_PARSER_TAG_TYPE_NONE;`
			`}`

			`HTMLParserTag::~HTMLParserTag() {`
			`}`

			`void HTMLParser::parse(const String &data) {`
Implemented to_string, and print for HTMLParser. also indexing fixes, and changed attributes and tags into pointers. 2021-11-18 12:03:43 +01:00			`Vector<HTMLParserTag *> tags;`
Added a new HTMLParser class. Still WIP. 2021-11-18 00:59:13 +01:00
Implemented to_string, and print for HTMLParser. also indexing fixes, and changed attributes and tags into pointers. 2021-11-18 12:03:43 +01:00			`//split into tags`
Added a new HTMLParser class. Still WIP. 2021-11-18 00:59:13 +01:00			`for (int i = 0; i < data.size(); ++i) {`
			`if (data[i] == '<') {`
			`for (int j = i + 1; j < data.size(); ++j) {`
			`if (data[j] == '>') {`
Implemented to_string, and print for HTMLParser. also indexing fixes, and changed attributes and tags into pointers. 2021-11-18 12:03:43 +01:00			`HTMLParserTag *t = new HTMLParserTag();`
Added a new HTMLParser class. Still WIP. 2021-11-18 00:59:13 +01:00
Implemented to_string, and print for HTMLParser. also indexing fixes, and changed attributes and tags into pointers. 2021-11-18 12:03:43 +01:00			`t->data = data.substr(i, j - i + 1);`
			`t->process();`
			`t->print();`
Added a new HTMLParser class. Still WIP. 2021-11-18 00:59:13 +01:00
			`tags.push_back(t);`

			`i = j;`
			`break;`
			`}`
			`}`
			`} else {`
			`for (int j = i + 1; j < data.size(); ++j) {`
			`if (data[j] == '<') {`
Implemented to_string, and print for HTMLParser. also indexing fixes, and changed attributes and tags into pointers. 2021-11-18 12:03:43 +01:00			`HTMLParserTag *t = new HTMLParserTag();`
Added a new HTMLParser class. Still WIP. 2021-11-18 00:59:13 +01:00
Implemented to_string, and print for HTMLParser. also indexing fixes, and changed attributes and tags into pointers. 2021-11-18 12:03:43 +01:00			`t->data = data.substr(i, j - i);`
			`t->type = HTMLParserTag::HTML_PARSER_TAG_TYPE_CONTENT;`
Added a new HTMLParser class. Still WIP. 2021-11-18 00:59:13 +01:00
			`tags.push_back(t);`

			`i = j - 1;`
			`break;`
			`}`
			`}`
			`}`
			`}`
Implemented to_string, and print for HTMLParser. also indexing fixes, and changed attributes and tags into pointers. 2021-11-18 12:03:43 +01:00
			`//process tags into hierarchical order`
			`//Vector<HTMLParserTag> tag_stack;`
			`//for (int i = 0; i < tags.size(); ++i) {`
			`//}`

			`for (int i = 0; i < tags.size(); ++i) {`
			`delete tags[i];`
			`}`
			`}`

			`String HTMLParser::to_string() {`
			`return html->to_string();`
			`}`
			`void HTMLParser::print() {`
			`html->print();`
Added a new HTMLParser class. Still WIP. 2021-11-18 00:59:13 +01:00			`}`

			`HTMLParser::HTMLParser() {`
Implemented to_string, and print for HTMLParser. also indexing fixes, and changed attributes and tags into pointers. 2021-11-18 12:03:43 +01:00			`html = nullptr;`
Added a new HTMLParser class. Still WIP. 2021-11-18 00:59:13 +01:00			`}`

			`HTMLParser::~HTMLParser() {`
			`}`