用htmlcxx解析从libcurl中获取到的web网页源码

最新推荐文章于 2022-08-31 15:26:34 发布

每天看一遍，防止恋爱&&堕落

最新推荐文章于 2022-08-31 15:26:34 发布

阅读量4.6k

点赞数

分类专栏： C、C++ iOS相关 VC++

本文链接：https://blog.csdn.net/zengraoli/article/details/13623701

版权

iOS相关同时被 3 个专栏收录

52 篇文章 0 订阅

订阅专栏

C、C++

51 篇文章 5 订阅

订阅专栏

VC++

17 篇文章 0 订阅

订阅专栏

0、前言：

结合上一篇blog文，已经大体知道了如何使用libcurl来获取网页的源代码（使用libcurl获取经过gzip压缩的网页文件），下来我们在配合htmlcxx，解析用libcurl获取到的网页源代码。

1、相关源代码：

封装的libcurl的类的代码，在上一篇文章中（使用libcurl获取经过gzip压缩的网页文件）；这里主要看主要部分：

// AnalysisHtml.cpp : 定义控制台应用程序的入口点。
//

#include "stdafx.h"
#include <string.h>
#include <iostream>
#include <fstream>
#include "html/ParserDom.h"
#include <stdlib.h>
#include <stdio.h>
#include "html/utils.h"
#include "GetPageByURL.h"


using namespace std;
using namespace htmlcxx;


void Init(string &page_content)
{
	GetPageByURL::Initialize();

	GetPageByURL::GetPageContent("http://www.fenzhi.com/", page_content);
}


void ClearUp()
{
	GetPageByURL::Cleanup();
}


void UseHtmlCxxAnalysisHtmlStringTestCase()
{

	//解析一段Html代码
	string html ="<html><body>测试文字</body></html>";
	//	string html ="<html><body>测试文字</html>";
	//	string html ="<html><body>测试文字</body</html>";
	//	string html ="<html>测试文字</body></html>";

	HTML::ParserDom parser;

	tree<HTML::Node> dom = parser.parseTree(html);

	//输出整棵DOM树

	cout<< dom << endl;

	//输出树中所有的超链接节点

	tree<HTML::Node>::iterator it = dom.begin();

	tree<HTML::Node>::iterator end = dom.end();

	for(; it != end; ++it)
	{
		// _stricmp()函数，在linux下用strcasecmp()函数替换
		if (_stricmp(it->tagName().c_str(), "A") == 0)
		{
			it->parseAttributes();
			cout <<it->attribute("href").second << endl;
		}
	}

	//输出所有的文本节点
	it= dom.begin();
	end= dom.end();
	for(; it != end; ++it)
	{
		if ((!it->isTag()) && (!it->isComment()))
		{
			cout << it->text();
		}
	}

	cout << endl;
}


void UseHtmlCxxAnalysisWebPageStringTestCase()
{
	string html;

	Init(html);

	HTML::ParserDom parser;

	tree<HTML::Node> dom = parser.parseTree(html);

	//输出整棵DOM树
	cout << dom << endl;

	ofstream zengraoli("e://zeng.txt");

	zengraoli << dom << endl;

	zengraoli.close();

	//输出树中所有的超链接节点

	tree<HTML::Node>::iterator it  = dom.begin();

	tree<HTML::Node>::iterator end = dom.end();

	for(; it != end; ++it)
	{
		// _stricmp()函数，在linux下用strcasecmp()函数替换
		if (_stricmp(it->tagName().c_str(), "A") == 0)
		{
			it->parseAttributes();
			cout << it->attribute("href").second << endl;
		}
	}

	// 输出所有的文本节点
	it  = dom.begin();
	end = dom.end();
	for(; it != end; ++it)
	{
		if ((!it->isTag()) && (!it->isComment()))
		{
			cout << it->text();
		}
	}

	cout << endl;

	ClearUp();
}


int _tmain(int argc, _TCHAR* argv[])
{
//	UseHtmlCxxAnalysisHtmlStringTestCase();

	UseHtmlCxxAnalysisWebPageStringTestCase();

	return 0;
}