GitHub - mylogin/htmlparser: Fast and lightweight C++ HTML parser

Usage

Access nodes

// Parse a small document: a doctype, a <body> holding one <div> with text, and a comment.
html::parser p;
html::node_ptr node = p.parse(R"(<!DOCTYPE html><body><div attr="val">text</div><!--comment--></body>)");

// The `parse` method returns a root node of type html::node_t::none.
// Its children are the document's top-level nodes: here the doctype (index 0) and <body> (index 1).
assert(node->type_node == html::node_t::none);
assert(node->at(0)->type_node == html::node_t::doctype);
assert(node->at(1)->type_node == html::node_t::tag);
// <body> -> <div> -> text node
assert(node->at(1)->at(0)->at(0)->type_node == html::node_t::text);
// The comment is the second child of <body>
assert(node->at(1)->at(1)->type_node == html::node_t::comment);

std::cout << "Number of child elements: " << node->size() << std::endl << std::endl; // 2

// Iterate over the children of <body> and print the HTML of each one
std::cout << "Loop through child nodes: " << std::endl;
for(auto& n : *(node->at(1))) {
	std::cout << n->to_html() << std::endl;
}
std::cout << std::endl;

// Which member carries the data depends on the node type: `content` for doctype,
// text and comment nodes; `tag_name` and attributes (via `get_attr`) for tags.
std::cout << "Get node properties: " << std::endl;
std::cout << "DOCTYPE name: " << node->at(0)->content << std::endl; // html
std::cout << "BODY tag: " << node->at(1)->tag_name << std::endl; // body
std::cout << "Attr value: " << node->at(1)->at(0)->get_attr("attr") << std::endl; // val
std::cout << "Text node: " << node->at(1)->at(0)->at(0)->content << std::endl; // text
std::cout << "Comment: " << node->at(1)->at(1)->content << std::endl; // comment

Find nodes using `select` method

List of available selectors

html::parser p;
html::node_ptr node = p.parse(R"(<div id="my_id"><p class="my_class"></p></div>)");
// `select` returns pointers to the matching nodes.
// The selector below matches <p class="my_class"> elements inside <div id="my_id">.
std::vector<html::node*> selected = node->select("div#my_id p.my_class");
// Print the HTML of every match
for(auto elem : selected) {
	std::cout << elem->to_html() << std::endl;
}

Access nodes using callback (called when the document is parsed)

html::parser p;
// Callback registered with a selector: the selector filters which elements are reported.
p.set_callback("meta[http-equiv='Content-Type'][content*='charset=']", [](html::node& n) {
	// Only report opening tags.
	// NOTE(review): presumably the callback is also invoked for other tag types (e.g. closing tags) - confirm.
	if (n.type_node == html::node_t::tag && n.type_tag == html::tag_t::open) {
		std::cout << "Callback with selector to filter elements:" << std::endl;
		std::cout << n.to_html() << std::endl << std::endl;
	}
});
// Callback registered without a selector: the same filtering is done by hand
// on the tag name and the attribute values.
p.set_callback([](html::node& n) {
	if(n.type_node == html::node_t::tag && n.type_tag == html::tag_t::open && n.tag_name == "meta") {
		if(n.get_attr("http-equiv") == "Content-Type" && n.get_attr("content").find("charset=") != std::string::npos) {
			std::cout << "Callback without selector:" << std::endl;
			std::cout << n.to_html() << std::endl;
		}
	}
});
// The callbacks are called when the document is parsed; the returned tree is not used here.
p.parse(R"(<head><title>Title</title><meta http-equiv="Content-Type" content="text/html; charset=utf-8" /></head>)");

Manual search

std::cout << "Search `li` tags which not in `ol`:" << std::endl;
html::parser p;
html::node_ptr node = p.parse("<ul><li>li1</li><li>li2</li></ul><ol><li>li</li></ol>");
// `walk` calls the lambda for the nodes of the tree; the returned bool
// controls whether the children of the current node are scanned.
node->walk([](html::node& n) {
	if(n.type_node == html::node_t::tag && n.tag_name == "ol") {
		return false; // do not scan the child tags of <ol>
	}
	if(n.type_node == html::node_t::tag && n.tag_name == "li") {
		std::cout << n.to_html() << std::endl;
	}
	return true; // scan the child tags
});

Finding unclosed tags

html::parser p;

// Callback to handle errors: it receives the error code and the node the error relates to
p.set_callback([](html::err_t e, html::node& n) {
	if(e == html::err_t::tag_not_closed) {
		// Print the tag itself; `false` means do not output its child elements
		std::cout << "Tag not closed: " << n.to_html(' ', false);
		// Build the path to the tag by walking up through the parents and
		// prepending each tag name. The loop stops at the node that has no
		// parent, so that node's name is not part of the path.
		std::string msg;
		html::node* current = &n;
		while(current->get_parent()) {
			msg.insert(0, " " + current->tag_name);
			current = current->get_parent();
		}
		msg.insert(0, "\nPath:");
		std::cout << msg << std::endl;
	}
});
// The <a> tag in this input has no closing tag
p.parse("<div><p><a></p></div>");

Print document formatted

html::parser p;
html::node_ptr node = p.parse("<ul><li>li1</li><li>li2</li></ul><ol><li>li</li></ol>");

// `to_html` takes two arguments: the indentation character and whether to output
// child elements (the defaults are a tab character and true).
// Here a space is used for indentation and child elements are included.
std::cout << node->to_html(' ', true) << std::endl;

Print text content of a node

html::parser p;
html::node_ptr node = p.parse("<div><p><b>First</b> p</p><p><i>Second</i> p</p>Text<br />Text</div>");

// `to_text()` without arguments keeps the line breaks in the output
std::cout << "Print text with line breaks preserved:" << std::endl;
std::cout << node->to_text() << std::endl << std::endl;

// `to_text(true)` replaces the line breaks with spaces
std::cout << "Print text with line breaks replaced with spaces:" << std::endl;
std::cout << node->to_text(true) << std::endl;

Build document

std::cout << "Using helpers:" << std::endl;

// `make_node` takes the node type, then the tag name (for tags) or the text (for text nodes),
// and optionally a list of attribute name/value pairs.
html::node hdiv = html::utils::make_node(html::node_t::tag, "div");
hdiv.append(html::utils::make_node(html::node_t::text, "Link:"));
hdiv.append(html::utils::make_node(html::node_t::tag, "br"));
html::node ha = html::utils::make_node(html::node_t::tag, "a", {{"href", "https://github.com/"}, {"class", "a_class"}});
ha.append(html::utils::make_node(html::node_t::text, "Github.com"));
// Append the finished <a> to the <div> and print the result of that call as HTML
std::cout << hdiv.append(ha).to_html() << std::endl << std::endl;

std::cout << "Without helpers:" << std::endl;

// The same kind of document built by setting the node members directly.

// <div> container
html::node div;
div.type_node = html::node_t::tag;
div.tag_name = "div";

// Text node "Link:"
html::node text;
text.type_node = html::node_t::text;
text.content = "Link:";
div.append(text);

// <br>, explicitly marked as self-closing
html::node br;
br.type_node = html::node_t::tag;
br.tag_name = "br";
br.self_closing = true;
div.append(br);

// <a> with two attributes
html::node a;
a.type_node = html::node_t::tag;
a.tag_name = "a";
a.set_attr("href", "https://github.com/");
a.set_attr("class", "a_class");

// Link text, appended to <a> before <a> itself is appended to the <div>
// NOTE(review): presumably `append` copies the node, which is why <a> is completed first - confirm.
html::node a_text;
a_text.type_node = html::node_t::text;
a_text.content = "Github.com";
a.append(a_text);

div.append(a);

std::cout << div.to_html() << std::endl;

Selectors

Selector example	Description	select	callback
*	all elements	√	√
div	tag name	√	√
#id1	id="id1"	√	√
.class1	class="class1"	√	√
.class1.class2	class="class1 class2"	√	√
:first	first element	√	√
:last	last element	√	-
:eq(3)	element index = 3 (starts from 0)	√	√
:gt(3)	element index > 3 (starts from 0)	√	√
:lt(3)	element index < 3 (starts from 0)	√	√
[attr]	elements that have attribute "attr"	√	√
[attr='val']	attribute is equal to "val"	√	√
[attr!='val']	attribute is not equal to "val" or does not exist	√	√
[attr^='http:']	attribute starts with "http:"	√	√
[attr$='.jpeg']	attribute ends with ".jpeg"	√	√
[attr*='/path/']	attribute contains "/path/"	√	√
[attr~='flower']	attribute contains word "flower"	√	√
[attr|='en']	attribute equal to "en" or starting with "en-"	√	√
div#id1.class1[attr='val']	element that matches all of these selectors	√	√
p,div	element that matches any of these selectors	√	√
div p	all `<p>` elements inside `<div>` elements	√	-
div>p	all `<p>` elements where the parent is a `<div>` element	√	-
div div>p>i	combination of nested selectors	√	-

Name		Name	Last commit message	Last commit date
Latest commit History 35 Commits
example		example
test		test
CMakeLists.txt		CMakeLists.txt
LICENSE		LICENSE
README.md		README.md
html.cpp		html.cpp
html.hpp		html.hpp

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Repository files navigation

Usage

Access nodes

Find nodes using `select` method

Access nodes using callback (called when the document is parsed)

Manual search

Finding unclosed tags

Print document formatted

Print text content of a node

Build document

Selectors

About

Uh oh!

Releases

Packages

Languages

License

mylogin/htmlparser

Folders and files

Latest commit

History

Repository files navigation

Usage

Access nodes

Find nodes using select method

Access nodes using callback (called when the document is parsed)

Manual search

Finding unclosed tags

Print document formatted

Print text content of a node

Build document

Selectors

About

Topics

Resources

License

Uh oh!

Stars

Watchers

Forks

Releases

Packages 0

Languages

Find nodes using `select` method

Packages