mirror of
https://github.com/Relintai/rcpp_framework.git
synced 2025-05-02 13:47:56 +02:00
Implemented a few helpers for the HTMLParser.
This commit is contained in:
parent
da6d18beef
commit
ff8790f81d
@ -8,11 +8,11 @@ bool HTMLParserAttribute::match_data(const String &d) {
|
|||||||
return data == d;
|
return data == d;
|
||||||
}
|
}
|
||||||
bool HTMLParserAttribute::match_data(const Vector<String> &d) {
|
bool HTMLParserAttribute::match_data(const Vector<String> &d) {
|
||||||
//todo
|
// todo
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
bool HTMLParserAttribute::contains_data(const String &d) {
|
bool HTMLParserAttribute::contains_data(const String &d) {
|
||||||
return attribute.find(d) != -1;
|
return data.find(d) != -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
String HTMLParserAttribute::to_string() {
|
String HTMLParserAttribute::to_string() {
|
||||||
@ -38,6 +38,98 @@ HTMLParserAttribute::HTMLParserAttribute() {
|
|||||||
HTMLParserAttribute::~HTMLParserAttribute() {
|
HTMLParserAttribute::~HTMLParserAttribute() {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Searches this tag and then its child tags, in order and recursively, for
// the first tag whose name equals t.
//
// t: tag name to look for.
// Returns this tag when its own name matches, otherwise the first match found
// under the children, or nullptr when nothing matches.
HTMLParserTag *HTMLParserTag::get_first(const String &t) {
	// The node itself is tested before any of its descendants.
	if (tag == t) {
		return this;
	}

	HTMLParserTag *match = nullptr;

	// Walk the children in order and stop as soon as one subtree yields a hit.
	for (int child = 0; match == nullptr && child < tags.size(); ++child) {
		match = tags[child]->get_first(t);
	}

	return match;
}
|
||||||
|
|
||||||
|
// Searches this tag and then its child tags, in order and recursively, for
// the first tag that is named t and for which has_attribute(attrib, val)
// holds.
//
// t:      tag name to look for.
// attrib: attribute name forwarded to has_attribute().
// val:    value forwarded to has_attribute(); that overload tests it with
//         contains_data(), so it is a "contains" check rather than equality.
// Returns the first matching tag, or nullptr when nothing matches.
HTMLParserTag *HTMLParserTag::get_first(const String &t, const String &attrib, const String &val) {
	// The attribute lookup only runs when the tag name already matches.
	if (tag == t && has_attribute(attrib, val)) {
		return this;
	}

	HTMLParserTag *match = nullptr;

	// Walk the children in order and stop as soon as one subtree yields a hit.
	for (int child = 0; match == nullptr && child < tags.size(); ++child) {
		match = tags[child]->get_first(t, attrib, val);
	}

	return match;
}
|
||||||
|
|
||||||
|
// Returns the data of the attribute that get_attribute(attrib) finds on this
// tag, or an empty string when no such attribute exists.
//
// attrib: attribute name to look up.
String HTMLParserTag::get_attribute_value(const String &attrib) {
	HTMLParserAttribute *found = get_attribute(attrib);

	// No such attribute: callers get an empty string instead of a null.
	if (!found) {
		return "";
	}

	return found->data;
}
|
||||||
|
|
||||||
|
// Returns the first attribute of this tag for which match_attrib(attrib)
// is true, scanning the attributes in stored order.
//
// attrib: attribute name handed to HTMLParserAttribute::match_attrib().
// Returns a pointer to the stored attribute object itself (not a copy), or
// nullptr when no attribute matches.
HTMLParserAttribute *HTMLParserTag::get_attribute(const String &attrib) {
	for (int idx = 0; idx < attributes.size(); ++idx) {
		HTMLParserAttribute *candidate = attributes[idx];

		if (!candidate->match_attrib(attrib)) {
			continue;
		}

		return candidate;
	}

	return nullptr;
}
|
||||||
|
|
||||||
|
bool HTMLParserTag::has_attribute(const String &attrib) {
|
||||||
|
for (int i = 0; i < attributes.size(); ++i) {
|
||||||
|
HTMLParserAttribute *a = attributes[i];
|
||||||
|
|
||||||
|
if (a->match_attrib(attrib)) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Returns the first attribute of this tag for which both
// match_attrib(attrib) and contains_data(contains_val) are true, scanning the
// attributes in stored order.
//
// attrib:       attribute name handed to match_attrib().
// contains_val: value handed to contains_data(), which looks for it inside
//               the attribute's data.
// Returns a pointer to the stored attribute object itself (not a copy), or
// nullptr when no attribute satisfies both checks.
HTMLParserAttribute *HTMLParserTag::get_attribute(const String &attrib, const String &contains_val) {
	for (int idx = 0; idx < attributes.size(); ++idx) {
		HTMLParserAttribute *candidate = attributes[idx];

		// contains_data() is only consulted once the name has matched.
		if (!candidate->match_attrib(attrib) || !candidate->contains_data(contains_val)) {
			continue;
		}

		return candidate;
	}

	return nullptr;
}
|
||||||
|
|
||||||
|
bool HTMLParserTag::has_attribute(const String &attrib, const String &contains_val) {
|
||||||
|
for (int i = 0; i < attributes.size(); ++i) {
|
||||||
|
HTMLParserAttribute *a = attributes[i];
|
||||||
|
|
||||||
|
if (a->match_attrib(attrib) && a->contains_data(contains_val)) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
void HTMLParserTag::process() {
|
void HTMLParserTag::process() {
|
||||||
if (type != HTMLParserTag::HTML_PARSER_TAG_TYPE_NONE) {
|
if (type != HTMLParserTag::HTML_PARSER_TAG_TYPE_NONE) {
|
||||||
return;
|
return;
|
||||||
@ -60,7 +152,7 @@ void HTMLParserTag::process() {
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
//test for comment. <!-- -->
|
// test for comment. <!-- -->
|
||||||
++start_index;
|
++start_index;
|
||||||
if (data[2] == '-' && data[3] == '-') {
|
if (data[2] == '-' && data[3] == '-') {
|
||||||
type = HTMLParserTag::HTML_PARSER_TAG_TYPE_COMMENT;
|
type = HTMLParserTag::HTML_PARSER_TAG_TYPE_COMMENT;
|
||||||
@ -78,7 +170,7 @@ void HTMLParserTag::process() {
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
//test for doctype. <!doctype >
|
// test for doctype. <!doctype >
|
||||||
int doctype_start_index = data.find("doctype ", 2);
|
int doctype_start_index = data.find("doctype ", 2);
|
||||||
|
|
||||||
if (doctype_start_index == -1) {
|
if (doctype_start_index == -1) {
|
||||||
@ -92,9 +184,9 @@ void HTMLParserTag::process() {
|
|||||||
String tag_text;
|
String tag_text;
|
||||||
|
|
||||||
if (data[data.size() - 2] == '/') {
|
if (data[data.size() - 2] == '/') {
|
||||||
//will catch all that looks like <br/>
|
// will catch all that looks like <br/>
|
||||||
//tags that look like <br> will be caught later in a post process, in a way
|
// tags that look like <br> will be caught later in a post process, in a way
|
||||||
//which also tries to catch erroneously not closed tags that supposed to be closed
|
// which also tries to catch erroneously not closed tags that supposed to be closed
|
||||||
type = HTMLParserTag::HTML_PARSER_TAG_TYPE_SELF_CLOSING_TAG;
|
type = HTMLParserTag::HTML_PARSER_TAG_TYPE_SELF_CLOSING_TAG;
|
||||||
|
|
||||||
tag_text = data.substr(1, data.size() - 3);
|
tag_text = data.substr(1, data.size() - 3);
|
||||||
@ -107,12 +199,12 @@ void HTMLParserTag::process() {
|
|||||||
int fspc_index = tag_text.find(' ');
|
int fspc_index = tag_text.find(' ');
|
||||||
|
|
||||||
if (fspc_index == -1) {
|
if (fspc_index == -1) {
|
||||||
//no args
|
// no args
|
||||||
tag = tag_text;
|
tag = tag_text;
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
//grab the tag itself
|
// grab the tag itself
|
||||||
tag = tag_text.substr(0, fspc_index);
|
tag = tag_text.substr(0, fspc_index);
|
||||||
|
|
||||||
String args = tag_text.substr(fspc_index + 1, tag_text.size() - fspc_index - 1);
|
String args = tag_text.substr(fspc_index + 1, tag_text.size() - fspc_index - 1);
|
||||||
@ -122,7 +214,7 @@ void HTMLParserTag::process() {
|
|||||||
int tag_end_index = data.find(' ', start_index);
|
int tag_end_index = data.find(' ', start_index);
|
||||||
|
|
||||||
if (tag_end_index == -1) {
|
if (tag_end_index == -1) {
|
||||||
//simple tag
|
// simple tag
|
||||||
tag = data.substr(start_index, data.size() - start_index - 1);
|
tag = data.substr(start_index, data.size() - start_index - 1);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
@ -153,23 +245,23 @@ void HTMLParserTag::parse_args(const String &args) {
|
|||||||
|
|
||||||
a->attribute = args.substr(i, equals_index - i);
|
a->attribute = args.substr(i, equals_index - i);
|
||||||
|
|
||||||
//todo
|
// todo
|
||||||
//a.trim();
|
// a.trim();
|
||||||
|
|
||||||
int next_char_index = equals_index + 1;
|
int next_char_index = equals_index + 1;
|
||||||
|
|
||||||
if (next_char_index >= args.size()) {
|
if (next_char_index >= args.size()) {
|
||||||
//an attribute looks like this "... attrib="
|
// an attribute looks like this "... attrib="
|
||||||
attributes.push_back(a);
|
attributes.push_back(a);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
//skip spaces
|
// skip spaces
|
||||||
while (args[next_char_index] == ' ') {
|
while (args[next_char_index] == ' ') {
|
||||||
++next_char_index;
|
++next_char_index;
|
||||||
|
|
||||||
if (next_char_index >= args.size()) {
|
if (next_char_index >= args.size()) {
|
||||||
//an attribute looks like this "... attrib= "
|
// an attribute looks like this "... attrib= "
|
||||||
attributes.push_back(a);
|
attributes.push_back(a);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
@ -186,8 +278,8 @@ void HTMLParserTag::parse_args(const String &args) {
|
|||||||
int end_index = args.find(find_char, next_char_index);
|
int end_index = args.find(find_char, next_char_index);
|
||||||
|
|
||||||
if (end_index == -1) {
|
if (end_index == -1) {
|
||||||
//missing closing ' or " if c is ' or "
|
// missing closing ' or " if c is ' or "
|
||||||
//else missing parameter
|
// else missing parameter
|
||||||
|
|
||||||
a->data = args.substr(next_char_index, args.size() - next_char_index - 1);
|
a->data = args.substr(next_char_index, args.size() - next_char_index - 1);
|
||||||
attributes.push_back(a);
|
attributes.push_back(a);
|
||||||
@ -236,8 +328,8 @@ String HTMLParserTag::to_string(const int level) {
|
|||||||
|
|
||||||
s += "</" + tag + ">\n";
|
s += "</" + tag + ">\n";
|
||||||
} else if (type == HTML_PARSER_TAG_TYPE_CLOSING_TAG) {
|
} else if (type == HTML_PARSER_TAG_TYPE_CLOSING_TAG) {
|
||||||
//HTMLParserTag should handle this automatically
|
// HTMLParserTag should handle this automatically
|
||||||
//it's here for debugging purposes though
|
// it's here for debugging purposes though
|
||||||
s += "</" + tag + "(!)>";
|
s += "</" + tag + "(!)>";
|
||||||
|
|
||||||
if (tags.size() != 0) {
|
if (tags.size() != 0) {
|
||||||
@ -317,7 +409,7 @@ HTMLParserTag::~HTMLParserTag() {
|
|||||||
void HTMLParser::parse(const String &data) {
|
void HTMLParser::parse(const String &data) {
|
||||||
Vector<HTMLParserTag *> tags;
|
Vector<HTMLParserTag *> tags;
|
||||||
|
|
||||||
//split into tags
|
// split into tags
|
||||||
for (int i = 0; i < data.size(); ++i) {
|
for (int i = 0; i < data.size(); ++i) {
|
||||||
if (data[i] == '<') {
|
if (data[i] == '<') {
|
||||||
for (int j = i + 1; j < data.size(); ++j) {
|
for (int j = i + 1; j < data.size(); ++j) {
|
||||||
@ -356,7 +448,7 @@ void HTMLParser::parse(const String &data) {
|
|||||||
|
|
||||||
root = new HTMLParserTag();
|
root = new HTMLParserTag();
|
||||||
|
|
||||||
//process tags into hierarchical order
|
// process tags into hierarchical order
|
||||||
Vector<HTMLParserTag *> tag_stack;
|
Vector<HTMLParserTag *> tag_stack;
|
||||||
for (int i = 0; i < tags.size(); ++i) {
|
for (int i = 0; i < tags.size(); ++i) {
|
||||||
HTMLParserTag *t = tags[i];
|
HTMLParserTag *t = tags[i];
|
||||||
@ -417,18 +509,18 @@ void HTMLParser::parse(const String &data) {
|
|||||||
delete t;
|
delete t;
|
||||||
tags[i] = nullptr;
|
tags[i] = nullptr;
|
||||||
|
|
||||||
//ill-formed html
|
// ill-formed html
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
//find it's pair
|
// find it's pair
|
||||||
int tag_index = 0;
|
int tag_index = 0;
|
||||||
for (int j = tag_stack.size() - 1; j > 0; --j) {
|
for (int j = tag_stack.size() - 1; j > 0; --j) {
|
||||||
HTMLParserTag *ts = tag_stack[j];
|
HTMLParserTag *ts = tag_stack[j];
|
||||||
|
|
||||||
//we sould only have opening tags on the stack
|
// we sould only have opening tags on the stack
|
||||||
if (ts->tag == t->tag) {
|
if (ts->tag == t->tag) {
|
||||||
//found
|
// found
|
||||||
tag_index = j;
|
tag_index = j;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
@ -436,8 +528,8 @@ void HTMLParser::parse(const String &data) {
|
|||||||
|
|
||||||
HTMLParserTag *opening_tag = tag_stack[tag_index];
|
HTMLParserTag *opening_tag = tag_stack[tag_index];
|
||||||
|
|
||||||
//mark everything else that we found before finding the opening tag as self closing, and add them to out opening tag
|
// mark everything else that we found before finding the opening tag as self closing, and add them to out opening tag
|
||||||
//If the html is ill formed, it just grabs everything from the tag stack
|
// If the html is ill formed, it just grabs everything from the tag stack
|
||||||
for (int j = tag_index + 1; j < tag_stack.size(); ++j) {
|
for (int j = tag_index + 1; j < tag_stack.size(); ++j) {
|
||||||
HTMLParserTag *ts = tag_stack[j];
|
HTMLParserTag *ts = tag_stack[j];
|
||||||
|
|
||||||
@ -460,7 +552,7 @@ void HTMLParser::parse(const String &data) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
//add everything remaining on the stack to root
|
// add everything remaining on the stack to root
|
||||||
for (int i = 0; i < tag_stack.size(); ++i) {
|
for (int i = 0; i < tag_stack.size(); ++i) {
|
||||||
root->tags.push_back(tag_stack[i]);
|
root->tags.push_back(tag_stack[i]);
|
||||||
}
|
}
|
||||||
|
@ -42,6 +42,17 @@ public:
|
|||||||
Vector<HTMLParserTag*> tags;
|
Vector<HTMLParserTag*> tags;
|
||||||
Vector<HTMLParserAttribute*> attributes;
|
Vector<HTMLParserAttribute*> attributes;
|
||||||
|
|
||||||
|
HTMLParserTag *get_first(const String &t);
|
||||||
|
HTMLParserTag *get_first(const String &t, const String &attrib, const String &val);
|
||||||
|
|
||||||
|
String get_attribute_value(const String &attrib);
|
||||||
|
|
||||||
|
HTMLParserAttribute *get_attribute(const String &attrib);
|
||||||
|
bool has_attribute(const String &attrib);
|
||||||
|
|
||||||
|
HTMLParserAttribute *get_attribute(const String &attrib, const String &contains_val);
|
||||||
|
bool has_attribute(const String &attrib, const String &contains_val);
|
||||||
|
|
||||||
void process();
|
void process();
|
||||||
void parse_args(const String &args);
|
void parse_args(const String &args);
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user