上图就是我爬取的效果图。 我这个人看别人的博客的时候第一件事情就是看看有没有图,所以我上来就把图给放出来。
我所使用的语言是C++,不要以为用C++就不能写爬虫了。
废话不多说,直接上代码。
#define _CRT_SECURE_NO_WARNINGS
#include <iostream>
#include "json/json.h"
#include <curl/curl.h>
#include "Base64Util.h"
#include <fstream>
#include<atltime.h>
#include <sstream>
#include <io.h>
#include <direct.h>
using namespace std;
#define URL_REFERER "https://kyfw.12306.cn/otn/index/init"
#pragma comment(lib,"libcurl.lib")
// libcurl write callback (CURLOPT_WRITEFUNCTION): appends the received chunk
// to the std::string passed through CURLOPT_WRITEDATA.
//
// buffer  - pointer to the received bytes (not NUL-terminated).
// size    - size of one element.
// nmemb   - number of elements; the chunk is size * nmemb bytes long.
// lpVoid  - user pointer; must point at a std::string.
//
// Returns the number of bytes consumed (size * nmemb). libcurl treats any
// other return value as a write error and aborts the transfer, so 0 is
// returned when either pointer is missing.
static size_t OnWriteData(void* buffer, size_t size, size_t nmemb, void* lpVoid)
{
    std::string* response = static_cast<std::string*>(lpVoid);
    if (NULL == response || NULL == buffer)
    {
        return 0;
    }

    // The callback contract is expressed in bytes, not in elements.
    const size_t total = size * nmemb;
    response->append(static_cast<const char*>(buffer), total);
    return total;
}
bool base64ToJpg(string& data)
{
Json::Reader JsonReader;
Json::Value json_object;
if (!JsonReader.parse(data, json_object))
return false;
string str = json_object.get("image", "").asCString();
int len = str.length();
char *buff = new char[len];
CBase64Util::Base64_Decode(buff, str.c_str(), len, '=', len);
char m_szCurrVCodeName[256];
time_t now = time(NULL);
struct tm* tblock = localtime(&now);
memset(m_szCurrVCodeName, 0, sizeof(m_szCurrVCodeName));
//创建或者判断是否存在vcode 文件夹
int ret = _access("vcode", 0);
if (ret == -1)
{
//创建文件夹
_mkdir("vcode");
}
sprintf(m_szCurrVCodeName, "vcode\\vcode%04d%02d%02d%02d%02d%02d.jpg",
1900 + tblock->tm_year, 1 + tblock->tm_mon, tblock->tm_mday,
tblock->tm_hour, tblock->tm_min, tblock->tm_sec);
ofstream out(m_szCurrVCodeName, ios::binary | ios::out);
out.write(buff, len);
out.close();
delete[] buff;
return true;
}
bool HttpRequest(const char* url,
string& strResponse,
bool get/* = true*/,
const char* headers/* = NULL*/,
const char* postdata/* = NULL*/,
bool bReserveHeaders/* = false*/,
int timeout/* = 10*/)
{
CURLcode res;
CURL* curl = curl_easy_init();
if (NULL == curl)
{
return false;
}
curl_easy_setopt(curl, CURLOPT_URL, url);
//响应结果中保留头部信息
if (bReserveHeaders)
curl_easy_setopt(curl, CURLOPT_HEADER, 1);
curl_easy_setopt(curl, CURLOPT_COOKIEFILE, "");
curl_easy_setopt(curl, CURLOPT_READFUNCTION, NULL);
curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, OnWriteData);
curl_easy_setopt(curl, CURLOPT_WRITEDATA, (void *)&strResponse);
curl_easy_setopt(curl, CURLOPT_NOSIGNAL, 1);
//设定为不验证证书和HOST
//curl_easy_setopt(curl, CURLOPT_PROXY, "127.0.0.1:8888");//设置代理
curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, false);
curl_easy_setopt(curl, CURLOPT_SSL_VERIFYHOST, false);
//设置超时时间
curl_easy_setopt(curl, CURLOPT_CONNECTTIMEOUT, timeout);
curl_easy_setopt(curl, CURLOPT_TIMEOUT, timeout);
curl_easy_setopt(curl, CURLOPT_REFERER, URL_REFERER);
curl_easy_setopt(curl, CURLOPT_USERAGENT, "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36");
//不设置接收的编码格式或者设置为空,libcurl会自动解压压缩的格式,如gzip
//curl_easy_setopt(curl, CURLOPT_ACCEPT_ENCODING, "gzip, deflate, br");
//添加自定义头信息
if (headers != NULL)
{
struct curl_slist *chunk = NULL;
chunk = curl_slist_append(chunk, headers);
curl_easy_setopt(curl, CURLOPT_HTTPHEADER, chunk);
}
if (!get && postdata != NULL)
{
curl_easy_setopt(curl, CURLOPT_POSTFIELDS, postdata);
}
res = curl_easy_perform(curl);
bool bError = false;
if (res == CURLE_OK)
{
int code;
res = curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &code);
if (code != 200 && code != 302)
{
bError = true;
}
}
else
{
bError = true;
}
curl_easy_cleanup(curl);
return !bError;
}
// Asks how many captcha images to download, then fetches them one at a time
// from the 12306 captcha endpoint, saving each via base64ToJpg and waiting
// four seconds between requests.
int main()
{
    // Pause between requests; also applied before retrying a failed request
    // so a timeout or ban does not turn into a tight retry loop.
    const unsigned long kRequestIntervalMs = 4000;

    int count = 0, icount = 0;
    cout << "请输入要下载多少张(例如:100):";
    cin >> icount;
    while (count < icount)
    {
        string strResponse;
        string url = "https://kyfw.12306.cn/passport/captcha/captcha-image64?login_site=E&module=login&rand=sjrand&";

        // Append the current time in milliseconds to the URL.
        SYSTEMTIME tmSys;
        GetLocalTime(&tmSys);
        CTime tm3(tmSys);
        __int64 tmDst = __int64(tm3.GetTime()) * 1000 + tmSys.wMilliseconds;
        stringstream surl;
        surl << url << tmDst;

        if (!HttpRequest(surl.str().c_str(), strResponse, true, "Upgrade-Insecure-Requests: 1", NULL, false, 10))
        {
            cout << "超时了,或者ip被封了\n";
            Sleep(kRequestIntervalMs);
            continue;
        }

        char sjbuf[64] = { 0 };
        sprintf_s(sjbuf, 64, "%d-%d-%d %d:%d:%d", tm3.GetYear(), tm3.GetMonth(), tm3.GetDay()
            , tm3.GetHour(), tm3.GetMinute(), tm3.GetSecond());

        if (base64ToJpg(strResponse))
        {
            count++;
            cout << "已下载" << count << "张图片----->" << "当前时间-->" << sjbuf << endl;
        }
        else
        {
            cout << "图片下载失败----->" << "当前时间-->" << sjbuf << endl;
        }
        Sleep(kRequestIntervalMs);
    }
    system("pause");
    return 0;
}
当然,直接复制这段代码是无法使用的,因为我用到了curl这个库。但是我怕你们骂我,所以把工程也上传了;我又怕你们骂我骗积分,所以我所有的代码工程都可以在百度云找到。
链接:https://pan.baidu.com/s/1x4AVeJ-LogsTrRj-DQS5GQ
提取码:bvjr
希望对你们有用。
欢迎关注我们的公众号,本人知识能力有限,如果文章中有错误的地方欢迎向我反馈或者留言,十分感谢!