C++爬取网页文本并生成.txt文件

最新推荐文章于 2024-04-29 11:56:38 发布

王也校长

最新推荐文章于 2024-04-29 11:56:38 发布

阅读量273

点赞数

分类专栏： C++ 文章标签： c++ 开发语言

本文链接：https://blog.csdn.net/weixin_43192020/article/details/132339031

版权

C++ 专栏收录该内容

3 篇文章 0 订阅

订阅专栏

相较于 Python 爬虫，C++ 爬虫实现起来要复杂一些。本人查阅了网上的一些资料，对用 C++ 爬取网页中的文本数据并将其保存为本地 txt 文件的方法做一点总结，参考的文章如下：

https://blog.csdn.net/weixin_50964512/article/details/123076248?spm=1001.2014.3001.5506

C++ 代码如下：

#include<Windows.h>
#include<cstdio>
#include <fstream>
#include<iostream>
#include<regex>
#include<string>
#include<vector>
#pragma comment(lib,"Urlmon.lib")
using namespace std;
/// <summary>
/// 从网页中获取指定文本
/// </summary>
/// <param name="url">网页地址</param>
/// <param name="pattern">匹配模式</param>
/// <param name="patternIndex">返回第几个()中的内容</param>
/// <param name="num">返回匹配到的数量</param>
/// <returns>返回的字符串数组</returns>
string* GetTextFromUrlA(const string& url, const string& pattern, int patternIndex, int& num)
{
	HRESULT ret = URLDownloadToFileA(NULL, url.c_str(), ".\\tmp.txt", 0, NULL); //下载网页到tmp.txt文件中
	if (ret != S_OK) { //如果下载失败返回NULL
		return NULL;
	}
	//下载成功,读取文本内容
	FILE* file;
	errno_t err = fopen_s(&file, ".\\tmp.txt", "r");
	if (err != 0) {
		return NULL;
	}
	fseek(file, 0, SEEK_END);
	int nSize = ftell(file);
	fseek(file, 0, SEEK_SET);
	std::string buf;
	buf.resize(nSize + 1);
	fread((char*)buf.c_str(), sizeof(char), nSize, file);
	fclose(file);

	//开始匹配
	regex r(pattern); //初始化匹配模式变量r
	smatch result;	  //保存匹配到的结果result
	string::const_iterator begin = buf.begin(); //获取文本开始的迭代器
	string::const_iterator end = buf.end();		//获取文本结束的迭代器
	int i = 0; //统计可以匹配到的个数
	while (regex_search(begin, end, result, r)) { //匹配成功返回true,继续下一次匹配.失败则退出循环
		i++; //匹配到一个,加一
		begin = result[0].second;  //获取当前匹配到的位置,更新匹配的开始位置
	}
	num = i;
	//知道了有多少个,分配对应内存,重新开始匹配
	begin = buf.begin();
	string* strBuf = new string[i + 1]{};
	int index = 0;
	while (regex_search(begin, end, result, r)) {
		strBuf[index++] = result[patternIndex].str();
		begin = result[0].second;
	}

	//DeleteFileA(".\\tmp.txt"); //匹配完成,可以删除下载的文件了

	return strBuf; //返回匹配到的结果
}

/// <summary>
/// Prompts for the value of the CATNR query parameter, appends it to the
/// CelesTrak gp.php URL and fetches that page through GetTextFromUrlA, which
/// stores the downloaded text in .\tmp.txt.
/// </summary>
/// <returns>0 on success; 1 if no input could be read or the page could not be fetched.</returns>
int main() {
	printf("(输入网址后五位数字）https://celestrak.org/NORAD/elements/gp.php?CATNR=:");
	std::string urlNumber;
	if (!(std::cin >> urlNumber)) {
		fprintf(stderr, "No catalog number was entered.\n");
		return 1;
	}

	// Build the full link as a std::string. The input is never copied into a
	// fixed-size buffer nor used as a printf-style format string, so neither its
	// length nor any '%' characters in it can corrupt memory.
	const std::string url = "https://celestrak.org/NORAD/elements/gp.php?CATNR=" + urlNumber;

	int nums = 0;
	std::string* str = GetTextFromUrlA(url, "<li><a href=\"(.*?)\"\\s{0,1}title=\"(.*?)\".*?><img src=\"(.*?)\" .*?</li>", 1, nums);
	if (str == NULL) {
		fprintf(stderr, "Failed to download or read %s\n", url.c_str());
		return 1;
	}

	delete[] str;
	return 0;
}

运行结果：