//最近学深度学习,但是无奈很缺数据,所以就写了一个网页爬虫去爬取图像,(一个一个手动下载的话太烦了)
#define _SILENCE_STDEXT_HASH_DEPRECATION_WARNINGS
#define _WINSOCK_DEPRECATED_NO_WARNINGS
#define _CRT_SECURE_NO_WARNINGS
#include <winsock2.h>
#include <Windows.h>
#include <string>
#include <iostream>
#include <vector>
#include <process.h>
#include <WinInet.h>
#include <assert.h>
#pragma comment(lib, "Wininet.lib")
#pragma comment(lib, "ws2_32.lib")
using namespace std;
//获取网站的源码
void GetWebCode(const char* szWeb,char* szCode,int nSize,int& nLen)
{
HINTERNET hOpen = NULL, hOpenUrl = NULL;
nLen = 0;
hOpen = InternetOpen("Testing", INTERNET_OPEN_TYPE_PRECONFIG, NULL, NULL, 0);
if (hOpen)
{
hOpenUrl = InternetOpenUrl(hOpen, szWeb, NULL, 0, INTERNET_FLAG_RELOAD, 0);
if (hOpenUrl)
{
Sleep(40);
DWORD dwByteRead = 0;
if (InternetReadFile(hOpenUrl, szCode, nSize, &dwByteRead))
{
assert(dwByteRead < nSize);
nLen = dwByteRead;
}
}
}
if(hOpen)
InternetCloseHandle(hOpen);
if(hOpenUrl)
InternetCloseHandle(hOpen);
}
//Naive substring search over a raw byte buffer (buffers need not be
//NUL-terminated, so strstr cannot be used).
//szMain/nMainSize : haystack and its length in bytes.
//szSub/nSubSize   : needle and its length in bytes.
//nBegin           : index to start searching from (default 0).
//Returns the index of the first occurrence of szSub at or after nBegin,
//or -1 when not found or on invalid arguments.
int FindSubstr(const char* szMain, int nMainSize, const char* szSub, int nSubSize, int nBegin = 0)
{
	if (!szMain || !szSub)
		return -1;
	if (nMainSize <= 0 || nSubSize <= 0 || nBegin < 0)
		return -1;
	//BUG FIX: the original iterated i up to nMainSize - 1 and compared
	//szMain[i + j], reading up to nSubSize - 1 bytes past the end of the
	//haystack. Bounding i so the whole needle fits removes the overread.
	for (int i = nBegin; i + nSubSize <= nMainSize; i++)
	{
		if (memcmp(szMain + i, szSub, (size_t)nSubSize) == 0)
			return i;
	}
	return -1;
}
void GetWebHref(vector<string>& cHrefList,const char* szBuffer,int nLen)
{
const char* szSub = "href=";
const char* szSubEnd = "html";
int nSubLen = strlen(szSub), nSubEndlen = strlen(szSubEnd);
int nRet = 0, nEnd = 0;
int nIndex = 0;
do
{
nRet = FindSubstr(szBuffer, nLen, szSub, nSubLen, nEnd);
if (nRet != -1)
{
char szTemp[1024] = "0";
nEnd = FindSubstr(szBuffer, nLen, szSubEnd, nSubEndlen, nRet + nSubLen);
if (nEnd != -1)
{
strncpy(szTemp, szBuffer + nRet + nSubLen + 1, nEnd - nRet - 2);
cout << "[" << nIndex++ << "] -> " << szTemp << endl;
cHrefList.emplace_back(szTemp);
}
}
} while (nRet != -1 && nEnd != -1);
}
void GetWebJpg(vector<string>& cUrl,vector<string>& cJpg)
{
const int Size = 1024 * 40;
char szCode[Size] = "0";
const char* szPos = "paper-down";
const char* szBegin = "href=";
const char* szEnd = "jpg";
int nIndex = 0;
for (auto& it : cUrl)
{
//不会超过20张图像
for (int i = 1; i < 20; i++)
{
char szWeb[1024] = "0";
strncpy(szWeb, it.c_str(), it.size());
szWeb[it.size() - 5] = '\0';
sprintf(szWeb, "%s_%d.html", szWeb, i);
//获取网站源代码
int nLen = 0;
GetWebCode(szW