C/C++ 写网络爬虫：curl + gumbo 配合使用

最新推荐文章于 2024-05-26 09:36:24 发布

wu_lian_nan

最新推荐文章于 2024-05-26 09:36:24 发布

阅读量1k

点赞数 6

本文链接：https://blog.csdn.net/wu_lian_nan/article/details/90206097

版权

爬虫专栏收录该内容

0 篇文章 0 订阅

订阅专栏

是的，你没有听错，就是用 C++（或者说 C 语言）写爬虫。

其实不难。虽然没有 Python 写起来那么简单，但也没有那么复杂，毕竟好多大佬已经写好了现成的库，我们只要会用这些库就行。

网址：https://acm.sjtu.edu.cn/OnlineJudge/status

我们就爬取这个页面中评审状态表格（即 id 为 status 的表格）里每一行的所有内容。

代码如下：

#include <iostream>
#include <fstream>
#include "gumbo/Document.h"
#include "gumbo/Node.h"
#include "MyStringFormat.h"
#include "curl/curl.h"

using namespace std;

#define  URL_REFERER "https://acm.sjtu.edu.cn/OnlineJudge/"

// Parses the HTML text in `page` and prints every node matched by the
// "#status tr" selector, one output line per matched node. For each child of
// a matched node, its text is passed through MyStringFormat::UTF_82ASCII and
// written to stdout followed by two spaces.
void printFunc(string page)
{
	CDocument document;
	document.parse(page.c_str());

	CSelection rows = document.find("#status tr");
	int rowIdx = 0;
	while (rowIdx < rows.nodeNum())
	{
		// Walk the children of the current row in document order.
		int cellIdx = 0;
		while (cellIdx < rows.nodeAt(rowIdx).childNum())
		{
			CNode cell = rows.nodeAt(rowIdx).childAt(cellIdx);
			cout << MyStringFormat::UTF_82ASCII(cell.text()).c_str() << "  ";
			++cellIdx;
		}
		cout << endl;
		++rowIdx;
	}
}

// libcurl CURLOPT_WRITEFUNCTION callback: appends each received chunk to the
// std::string registered through CURLOPT_WRITEDATA.
//
// buffer - received bytes (not NUL-terminated)
// size   - size of one element in bytes
// nmemb  - number of elements; the chunk is size * nmemb bytes long
// lpVoid - the std::string* that was passed as CURLOPT_WRITEDATA
//
// Returns the number of bytes consumed, i.e. size * nmemb. libcurl treats a
// return value that differs from the chunk's byte count as a write error, so
// the count must be in bytes, not in elements. When either pointer is null
// the function returns (size_t)-1 to signal failure.
static size_t OnWriteData(void* buffer, size_t size, size_t nmemb, void* lpVoid)
{
	std::string* str = static_cast<std::string*>(lpVoid);
	if (NULL == str || NULL == buffer)
	{
		return static_cast<size_t>(-1);
	}

	const size_t totalBytes = size * nmemb;
	str->append(static_cast<const char*>(buffer), totalBytes);

	return totalBytes;
}


bool HttpRequest(const char* url,
	string& strResponse,
	bool get/* = true*/,
	const char* headers/* = NULL*/,
	const char* postdata/* = NULL*/,
	bool bReserveHeaders/* = false*/,
	int timeout/* = 10*/)
{
	CURLcode res;
	CURL* curl = curl_easy_init();
	if (NULL == curl)
	{
		return false;
	}

	curl_easy_setopt(curl, CURLOPT_URL, url);

	//响应结果中保留头部信息
	if (bReserveHeaders)
		curl_easy_setopt(curl, CURLOPT_HEADER, 1);
	curl_easy_setopt(curl, CURLOPT_COOKIEFILE, "");
	curl_easy_setopt(curl, CURLOPT_READFUNCTION, NULL);
	curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, OnWriteData);
	curl_easy_setopt(curl, CURLOPT_WRITEDATA, (void *)&strResponse);
	curl_easy_setopt(curl, CURLOPT_NOSIGNAL, 1);
	//设定为不验证证书和HOST
	//curl_easy_setopt(curl, CURLOPT_PROXY, "127.0.0.1:8888");//设置代理
	//curl_easy_setopt(curl, CURLOPT_PROXYPORT, 9999); //代理服务器端口
	curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, false);
	curl_easy_setopt(curl, CURLOPT_SSL_VERIFYHOST, false);

	//设置超时时间
	curl_easy_setopt(curl, CURLOPT_CONNECTTIMEOUT, timeout);
	curl_easy_setopt(curl, CURLOPT_TIMEOUT, timeout);
	curl_easy_setopt(curl, CURLOPT_REFERER, URL_REFERER);

	curl_easy_setopt(curl, CURLOPT_USERAGENT, "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36");
	//不设置接收的编码格式或者设置为空，libcurl会自动解压压缩的格式，如gzip
	//curl_easy_setopt(curl, CURLOPT_ACCEPT_ENCODING, "gzip, deflate, br");
	//设置hostConnection: Keep-Alive
	struct curl_slist *chunk = NULL;
	chunk = curl_slist_append(chunk, "Host: acm.sjtu.edu.cn");
	chunk = curl_slist_append(chunk, "Connection: Keep-Alive");
	curl_easy_setopt(curl, CURLOPT_HTTPHEADER, chunk);

	//添加自定义头信息
	if (headers != NULL)
	{
		chunk = curl_slist_append(chunk, headers);
		curl_easy_setopt(curl, CURLOPT_HTTPHEADER, chunk);
	}

	if (!get && postdata != NULL)
	{
		curl_easy_setopt(curl, CURLOPT_POSTFIELDS, postdata);
	}

	res = curl_easy_perform(curl);
	bool bError = false;
	if (res == CURLE_OK)
	{
		int code;
		res = curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &code);
		if (code != 200 && code != 302)
		{
			bError = true;
		}
	}
	else
	{
		bError = true;
	}

	curl_easy_cleanup(curl);

	return !bError;
}



// Fetches the OnlineJudge status page and prints the status table via
// printFunc. Returns 0 when the request succeeded and 1 when it failed.
int main(int argc, char * argv[])
{
	std::string response;
	const bool ok = HttpRequest("https://acm.sjtu.edu.cn/OnlineJudge/status", response, true, NULL, NULL, false, 10);
	if (ok)
	{
		printFunc(response);
	}
	else
	{
		// Do not parse a failed or partial response; report the failure.
		std::cerr << "HttpRequest failed: https://acm.sjtu.edu.cn/OnlineJudge/status" << std::endl;
	}

	// Runs the shell's "pause" command before exiting
	// (presumably to keep a Windows console window open — confirm).
	system("pause");
	return ok ? 0 : 1;
}

我知道，只贴出这些代码大家也没法直接运行（代码还依赖 libcurl、gumbo 的封装头文件以及 MyStringFormat.h），所以我把工程文件也发出来。为了不被大家说我骗积分，我的所有东西都贴出百度云链接。

链接：https://pan.baidu.com/s/1jBZ-6tT-4ne0uTMw4jFvKA
提取码：pmg6

喜欢的欢迎关注我的公众号

wu_lian_nan

关注

6
点赞
踩
7

收藏

觉得还不错? 一键收藏
0
评论
c\c++写网络爬虫，curl+gumbo配合使用

是的，你没有听错。就是用c++或者说c语言写爬虫。其实不难，虽然没有Python写起来那么简单。但是也不是那么复杂啦，毕竟好多大佬都写了那么多库，我们只要会用大佬写的库就行。网址：https://acm.sjtu.edu.cn/OnlineJudge/status我们就爬取这个页面的评审状态的所有内容。代码如下：#include <iostream&...
复制链接

扫一扫