C++ 提取网页内容系列之一

最新推荐文章于 2024-06-24 13:24:01 发布

weixin_30430169

最新推荐文章于 2024-06-24 13:24:01 发布

阅读量245

点赞数

文章标签： c/c++

原文链接：http://www.cnblogs.com/itdef/p/4171179.html

版权

标题: C++ 提取网页内容系列
作者: itdef
链接: http://www.cnblogs.com/itdef/p/4171179.html

欢迎转帖，请保持文本完整并注明出处。

要分析网页，首先需要下载网页内容。这里给出了两种方案。

一种是使用MFC自带函数

代码如下:

// Downloads the resource at strUrl through an MFC CInternetSession and writes
// every string returned by ReadString() to the file szDownloadHtmFileName.
//
// Parameters:
//   strUrl                - URL passed to CInternetSession::OpenURL.
//   szDownloadHtmFileName - path of the output file; must not be NULL.
//
// Returns 0 when the download loop finished and the output stream is still
// in a good state, -1 otherwise (NULL file name, output file could not be
// opened, OpenURL returned NULL, a CInternetException was caught, or writing
// to the output file failed). Diagnostics are printed to cerr.
//
// NOTE(review): the strings are written back-to-back with no separator. If
// ReadString() strips the line terminator, the saved file will contain the
// page as one long line - confirm this is what callers expect.
int GetHttpFileData(CString strUrl,char* szDownloadHtmFileName)
{
	int iRet = -1;

	if (szDownloadHtmFileName == NULL)
	{
		cerr << "DownloadHtmFileName is NULL" << endl;
		return iRet;
	}

	ofstream of(szDownloadHtmFileName);
	// A failed open sets failbit, not badbit, so bad() would never report it.
	if (!of.is_open())
	{
		cerr << "of create file error" << endl;
		return iRet;
	}

	CInternetSession Session("Internet Explorer", 0);
	// OpenURL returns a CStdioFile*; the concrete type depends on the URL
	// scheme, so no downcast to CHttpFile is made.
	CStdioFile *pFile = NULL;
	CString strClip;

	try
	{
		pFile = Session.OpenURL(strUrl);
		if (pFile == NULL)
		{
			cerr << __FUNCTION__ << " OpenURL failed" << endl;
		}
		else
		{
			while ( pFile->ReadString(strClip) )
			{
				of << strClip;
			}
			pFile->Close();

			// Report success only if every write reached the stream.
			if (of.good())
			{
				iRet = 0;
			}
			else
			{
				cerr << __FUNCTION__ << " write file error" << endl;
			}
		}
	}
	catch(CInternetException* pEx)
	{
		TCHAR pszError[64];
		pEx->GetErrorMessage(pszError, 64);
		cerr << __FUNCTION__ << pszError << endl;
		// MFC exceptions caught by pointer must be released by the handler.
		pEx->Delete();
		iRet = -1;
	}

	// The caller owns the object returned by OpenURL.
	delete pFile;
	Session.Close();
	of.close();

	return iRet;
}

这里我将下载的内容写入一个文件并保存到硬盘。另外需要注意的是，下载的网页文件可能采用 UTF-8 编码，这里需要将其转换为 GBK 多字节编码。

int UTF8Str2GBK(const string& strUTF8,string& strGBK)
{
	int i = MultiByteToWideChar(CP_UTF8, 0, strUTF8.c_str(), -1, NULL, 0);
	WCHAR *wsz = NULL;
	TCHAR *tsz = NULL;
	int iRet = -1;

	wsz = new WCHAR[i+1];
	if( NULL == wsz)
	{
		goto UTF8Str2GBK_EXIT;
	}
	MultiByteToWideChar(CP_UTF8, 0, strUTF8.c_str(), -1, wsz, i);

	i = WideCharToMultiByte(CP_ACP, 0, wsz, -1, NULL, 0, NULL, NULL);
	tsz = new TCHAR[i+1];
	if( NULL == tsz)
	{
		goto UTF8Str2GBK_EXIT;
	}
	WideCharToMultiByte(CP_ACP, 0, wsz, -1, tsz, i, NULL, NULL);
	
	strGBK = string(tsz);

	iRet = 0;
UTF8Str2GBK_EXIT:

	delete []wsz;
	delete []tsz;

	return iRet;
}

全部代码见 http://www.oschina.net/code/snippet_614253_43732

效果图见 http://www.cnblogs.com/itdef/p/4081963.html

转载于:https://www.cnblogs.com/itdef/p/4171179.html

weixin_30430169

关注

0
点赞
踩
1

收藏

觉得还不错? 一键收藏
0
评论
C++ 提取网页内容系列之一

标题:C++ 提取网页内容系列作者:itdef链接:http://www.cnblogs.com/itdef/p/4171179.html欢迎转帖请保持文本完整并注明出处首先分析网页就要下载网页内容这里给出了两种方案一种是使用MFC自带函数代码如下:int GetHttpFileData(CString strUrl,char* szDownloadHtm...
复制链接

扫一扫