HtmlCXX 解析html

最新推荐文章于 2024-04-15 17:36:25 发布

li_jian_xing

最新推荐文章于 2024-04-15 17:36:25 发布

阅读量1.8k

点赞数

分类专栏： C++ 文章标签： html cpp

本文链接：https://blog.csdn.net/li_jian_xing/article/details/44352805

版权

C++ 专栏收录该内容

10 篇文章 0 订阅

订阅专栏

鉴于用 XML 解析器直接解析 HTML 会发生错误（原因是 HTML 中标点符号的写法不符合 XML 的要求），无奈之下只能还是按 HTML 的语法进行解析。

在使用 htmlcxx 进行解析时，html 中的换行会导致每一行内容都多出一个空的 text 节点，所以在处理前应尽量去除所有的 [\n]、[\r\n]，再进行解析。

#include <string>
#include <iostream>
#include <fstream>
#include <sstream>
#include <cstdlib>
#include <cstdio>
//#include <unistd.h>

#include ".\html\ParserDom.h"
#include ".\html\wincstring.h"

using namespace std;
using namespace htmlcxx;

int main()
{

	//解析一段Html代码
	string html;
	fstream inf("1.xml");
	
	while (!inf.eof())
	{
		char buf[2048];
		inf.getline(buf,2048);
		html += buf;
	}
	//string html ="<html><body>hey</body></html>";

	HTML::ParserDom parser;// 可以用来将html代码转换成dom树

	tree<HTML::Node> dom = parser.parseTree(html);// 用来储存html各个节点

	std::cout << "------------------------------------------------------------------" << std::endl;
	cout<< dom << endl;// 输出整棵DOM树
	std::cout << "------------------------------------------------------------------" << std::endl;

	//输出树中所有的超链接节点
	tree<HTML::Node>::iterator it = dom.begin();
	tree<HTML::Node>::iterator end = dom.end();

	for(; it != end; ++it)
	{
		string html = it->tagName();// 获取节点的标签名

		if (it->tagName() == "table")
		{
			it->parseAttributes();// 附上节点属性
			

			// 获取class 的属性first 如果不存在为false
			if (!it->attribute("class").first) continue;

			// 获取class 的属性
			html = it->attribute("class").second;
			if(it->attribute("class").second != "TableList")
				continue;

			html = it->tagName();
			html = it->text();// 整个标签text

			//
			// 获取子标签
			tree<HTML::Node>::iterator it1 = dom.begin(it);
			tree<HTML::Node>::iterator end1 = dom.end(it);
			for (;it1!=end1;++it1)
			{
				it1->parseAttributes();
				//std::cout << it1->text() << std::endl;
				if (it1->tagName() == "tr")
				{
					// 再次获取子标签
					tree<HTML::Node>::iterator it2 = dom.begin(it1);
					tree<HTML::Node>::iterator end2 = dom.end(it1);
					for (;it2!=end2;++it2)
					{
						it2->parseAttributes();

						if ((!it2->isTag()) && (!it2->isComment()))
						{
							std::cout << it2->text() << std::endl;
						}
					}
				}
			}
		}

	}

	//输出所有的文本节点

	std::cout << "------------------------------------------------------------------" << std::endl;
	it= dom.begin();
	end= dom.end();
	for(; it != end; ++it)

	{
		html = it->text();
		if (it->isTag())// 是标签
		{
			;
		}else if(it->isComment())// 是注释
		{
			;
		}else// 是文本内容
		{
			;
		}

		// 是否为标记         是否为注释
		if ((!it->isTag()) && (!it->isComment()))
		{
			cout << it->text();

		}

	}

	cout << endl;
	std::cout << "------------------------------------------------------------------" << std::endl;

}

以下为 html 文件的一部分。以前我通过 TinyXml2 对它进行解析时会发生错误，原因就是其中的标点符号不统一。

<!-- 哈哈 --><table class="TableList" align="center" width="95%"><tr class="TableData"><td nowrap align="center">第1次登记</td><td nowrap align="center">上班登记</td><td nowrap align="center">9:00:00</td><td nowrap align="center">08:13:26</td><td nowrap align="center">已考勤 <a href="javascript:remark('1','2015-03-16 08:13:26');">说明情况2015/3/17 15:56:11</a></td></tr><tr class="TableData"><td nowrap align="center">第2次登记</td><td nowrap align="center">下班登记</td><td nowrap align="center">17:30:00</td><td nowrap align="center">18:21:38</td><td nowrap align="center">已考勤<a href="javascript:remark('2','2015-03-16 18:21:38');">说明情况</a></td></tr></table>

以下是运行截图