爬取12306网站验证码

最新推荐文章于 2021-03-30 15:32:10 发布

wu_lian_nan

最新推荐文章于 2021-03-30 15:32:10 发布

阅读量833

点赞数 2

分类专栏：随笔　文章标签：爬虫、12306

本文链接：https://blog.csdn.net/wu_lian_nan/article/details/89092397

版权

随笔专栏收录该内容

2 篇文章 0 订阅

订阅专栏

上图就是我爬取的效果图。我这个人看别人的博客的时候第一件事情就是看看有没有图，所以我上来就把图给放出来。

我使用的语言是 C++，不要以为 C++ 就不能写爬虫。

废话不多说，直接上代码。

#define _CRT_SECURE_NO_WARNINGS
#include <iostream>
#include "json/json.h"
#include <curl/curl.h>
#include "Base64Util.h"
#include <fstream>
#include<atltime.h>
#include <sstream>
#include <io.h>
#include <direct.h>

using namespace std;

// Value sent as the HTTP Referer header on every request (passed to
// CURLOPT_REFERER in HttpRequest).
#define URL_REFERER         "https://kyfw.12306.cn/otn/index/init"
// MSVC-specific directive: asks the linker to link against libcurl.lib so it
// does not have to be listed in the project settings.
#pragma comment(lib,"libcurl.lib")

// libcurl write callback: appends every received chunk to the std::string
// that was registered through CURLOPT_WRITEDATA.
//
// buffer - chunk delivered by libcurl (not NUL-terminated)
// size   - size of one element in the chunk
// nmemb  - number of elements in the chunk
// lpVoid - user pointer; must point to the std::string that collects the body
//
// Returns the number of bytes consumed (size * nmemb). libcurl aborts the
// transfer when the returned value differs from the number of bytes it
// delivered, so 0 is returned when either pointer is null.
static size_t OnWriteData(void* buffer, size_t size, size_t nmemb, void* lpVoid)
{
	std::string* response = static_cast<std::string*>(lpVoid);
	if (NULL == response || NULL == buffer)
	{
		return 0;
	}

	const size_t bytes = size * nmemb;
	response->append(static_cast<const char*>(buffer), bytes);
	// The contract is "bytes handled", not "elements handled".
	return bytes;
}



bool base64ToJpg(string& data)
{
	Json::Reader JsonReader;
	Json::Value json_object;
	if (!JsonReader.parse(data, json_object))
		return false;
	string str = json_object.get("image", "").asCString();
	int len = str.length();
	char *buff = new char[len];
	CBase64Util::Base64_Decode(buff, str.c_str(), len, '=', len);

	char                m_szCurrVCodeName[256];
	time_t now = time(NULL);
	struct tm* tblock = localtime(&now);
	memset(m_szCurrVCodeName, 0, sizeof(m_szCurrVCodeName));
	//创建或者判断是否存在vcode 文件夹
	int ret = _access("vcode", 0);
	if (ret == -1)
	{
		//创建文件夹
		_mkdir("vcode");
	}

	sprintf(m_szCurrVCodeName, "vcode\\vcode%04d%02d%02d%02d%02d%02d.jpg",
		1900 + tblock->tm_year, 1 + tblock->tm_mon, tblock->tm_mday,
		tblock->tm_hour, tblock->tm_min, tblock->tm_sec);

	ofstream out(m_szCurrVCodeName, ios::binary | ios::out);
	out.write(buff, len);
	out.close();
	delete[] buff;
	return true;
}

bool HttpRequest(const char* url,
	string& strResponse,
	bool get/* = true*/,
	const char* headers/* = NULL*/,
	const char* postdata/* = NULL*/,
	bool bReserveHeaders/* = false*/,
	int timeout/* = 10*/)
{
	CURLcode res;
	CURL* curl = curl_easy_init();
	if (NULL == curl)
	{
		return false;
	}

	curl_easy_setopt(curl, CURLOPT_URL, url);

	//响应结果中保留头部信息
	if (bReserveHeaders)
		curl_easy_setopt(curl, CURLOPT_HEADER, 1);
	curl_easy_setopt(curl, CURLOPT_COOKIEFILE, "");
	curl_easy_setopt(curl, CURLOPT_READFUNCTION, NULL);
	curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, OnWriteData);
	curl_easy_setopt(curl, CURLOPT_WRITEDATA, (void *)&strResponse);
	curl_easy_setopt(curl, CURLOPT_NOSIGNAL, 1);
	//设定为不验证证书和HOST
	//curl_easy_setopt(curl, CURLOPT_PROXY, "127.0.0.1:8888");//设置代理
	curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, false);
	curl_easy_setopt(curl, CURLOPT_SSL_VERIFYHOST, false);

	//设置超时时间
	curl_easy_setopt(curl, CURLOPT_CONNECTTIMEOUT, timeout);
	curl_easy_setopt(curl, CURLOPT_TIMEOUT, timeout);
	curl_easy_setopt(curl, CURLOPT_REFERER, URL_REFERER);
	
	curl_easy_setopt(curl, CURLOPT_USERAGENT, "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36");
	//不设置接收的编码格式或者设置为空，libcurl会自动解压压缩的格式，如gzip
	//curl_easy_setopt(curl, CURLOPT_ACCEPT_ENCODING, "gzip, deflate, br");


	//添加自定义头信息
	if (headers != NULL)
	{
		struct curl_slist *chunk = NULL;
		chunk = curl_slist_append(chunk, headers);
		curl_easy_setopt(curl, CURLOPT_HTTPHEADER, chunk);
	}

	if (!get && postdata != NULL)
	{
		curl_easy_setopt(curl, CURLOPT_POSTFIELDS, postdata);
	}


	res = curl_easy_perform(curl);
	bool bError = false;
	if (res == CURLE_OK)
	{
		int code;
		res = curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &code);
		if (code != 200 && code != 302)
		{
			bError = true;
		}
	}
	else
	{
		bError = true;
	}

	curl_easy_cleanup(curl);

	return !bError;
}


// Asks how many captcha images to fetch, then downloads them one at a time,
// pausing between requests, until that many images have been saved.
int main()
{
	// Pause between two requests, in milliseconds.
	const int kRequestDelayMs = 4000;
	const std::string url = "https://kyfw.12306.cn/passport/captcha/captcha-image64?login_site=E&module=login&rand=sjrand&";

	int count = 0, icount = 0;
	std::cout << "请输入要下载多少张(例如:100)：";
	std::cin >> icount;

	while (count < icount)
	{
		std::string strResponse;
		SYSTEMTIME tmSys;
		GetLocalTime(&tmSys);
		CTime tm3(tmSys);
		// Millisecond timestamp appended to the URL so every request is unique.
		__int64 tmDst = __int64(tm3.GetTime()) * 1000 + tmSys.wMilliseconds;
		std::stringstream surl;
		surl << url << tmDst;
		if (!HttpRequest(surl.str().c_str(), strResponse, true, "Upgrade-Insecure-Requests: 1", NULL, false, 10))
		{
			std::cout << "超时了，或者ip被封了\n";
			// Back off before retrying; without this a request that fails
			// immediately would be repeated in a tight loop.
			Sleep(kRequestDelayMs);
			continue;
		}

		char sjbuf[64] = { 0 };

		sprintf_s(sjbuf, 64, "%d-%d-%d %d:%d:%d", tm3.GetYear(), tm3.GetMonth(), tm3.GetDay()
			, tm3.GetHour(), tm3.GetMinute(), tm3.GetSecond());

		if (base64ToJpg(strResponse))
		{
			count++;
			std::cout << "已下载" << count << "张图片----->" << "当前时间-->" << sjbuf << std::endl;
		}
		else
		{
			std::cout << "图片下载失败----->" << "当前时间-->" << sjbuf << std::endl;
		}
		Sleep(kRequestDelayMs);
	}

	system("pause");
	return 0;
}

当然，直接复制这段代码是无法编译运行的，因为我用到了 libcurl、jsoncpp 这些第三方库以及 Base64Util.h。我怕你们骂我，所以把完整工程也上传了；又怕你们骂我骗积分，所以我所有的代码工程都可以在百度云上找到。

链接：https://pan.baidu.com/s/1x4AVeJ-LogsTrRj-DQS5GQ
提取码：bvjr

希望对你们有用。

欢迎关注我们的公众号，本人知识能力有限，如果文章中有错误的地方欢迎向我反馈或者留言，十分感谢！