由于使用了 MFC 框架,这里只上传按钮事件处理函数和递归爬取函数的代码,其余函数均由 MFC 自动生成。注意:有的网站禁止爬虫抓取,使用前请留意并遵守网站协议。
void StartCrawing(CString In)//开始爬取
{
if (Time<100)//至多访问100个页面(根据需要调整,无上限,曾调试过250000,无异常,只是时间长)
{
Time++;
_cprintf("%d Websites Visited\n", Time);//调试时查看进度
}
else
{
return;
}
std::vector<char> WebCode;//页面源码
TCHAR SzUrl[100];
_tcscpy(SzUrl, In);
HINTERNET Net1 = InternetOpen(NULL, INTERNET_OPEN_TYPE_PRECONFIG, NULL, NULL, NULL);
if (Net1 == NULL)
{
InternetCloseHandle(Net1);
return;
}
HINTERNET Net2 = InternetOpenUrl(Net1, SzUrl, NULL, NULL, INTERNET_FLAG_NO_CACHE_WRITE, NULL);
if (Net2 == NULL)
{
InternetCloseHandle(Net2);
InternetCloseHandle(Net1);
return;
}
DWORD DwMaxDataLength = 500;
PBYTE PBuff = (PBYTE)malloc(DwMaxDataLength*sizeof(TCHAR));
if (PBuff == NULL)
{
InternetCloseHandle(Net2);
InternetCloseHandle(Net1);
return;
}
DWORD DwReadDataLength = NULL;
BOOL bReta = TRUE;
do
{
ZeroMemory(PBuff, DwMaxDataLength*sizeof(TCHAR));
bReta = InternetReadFile(Net2, PBuff, DwMaxDataLength, &DwReadDataLength);
for (DWORD dw = 0; dw < DwReadDataLength; dw++)
{
WebCode.push_back(PBuff[dw]);
}
} while (DwReadDataLength != NULL);
CString Temp;
for (std::vector<char>::iterator i = WebCode.begin(); i != WebCode.end(); i++)
{
Temp = "";
CString Front;
if (*i == 'h'&&*(i + 1) == 'r'&&*(i + 2) == 'e'&&*(i + 3) == 'f'&&*(i + 4) == '='&&*(i + 5) == '"')
{
std::vector<char>::iterator j = i + 6;
while (*j != '"')
{
Temp += *j;
j++;
}
if (Temp != "javascript:;"&&Temp != "")
{
if (Temp[0] == '/'&&Temp[1] == '/')
{
Front = "https:";
Temp = Front + Temp;
Out += Temp;
Out += '\r';
Out += '\n';
StartCrawing(Temp);
}
else if (Temp[0] == '/' || Temp[0] == '#')
{
Front = "";
int i = 0, Sum = 0;
while (1)
{
if (In[i] == '/')
{
Sum++;
}
if (Sum>2)
{
break;
}
Front += In[i];
i++;
}
Temp = Front + Temp;
Out += Temp;
Out += '\r';//这是归位符,编辑框换行要用到
Out += '\n';
StartCrawing(Temp);
}
}
}
else if (*i == 's'&&*(i + 1) == 'r'&&*(i + 2) == 'c'&&*(i + 3) == '='&&*(i + 4) == '"')
{
std::vector<char>::iterator j = i + 5;
while (*j != '"')
{
Temp += *j;
j++;
}
if (Temp != "javascript:;"&&Temp != "")
{
if (Temp[0] == '/'&&Temp[1] == '/')
{
Front = "https:";
Temp = Front + Temp;
Out += Temp;
Out += '\r';
Out += '\n';
StartCrawing(Temp);
}
else if (Temp[0] == '/' || Temp[0] == '#')
{
Front = "";
int i = 0, Sum = 0;
while (1)
{
if (Sum>2)
{
break;
}
Front += In[i];
if (In[i] == '/')
{
Sum++;
}
i++;
}
Temp = Front + Temp;
Out += Temp;
Out += '\r';
Out += '\n';
StartCrawing(Temp);
}
else if (Temp[0] == 'h'&&Temp[1] == 't'&&Temp[2] == 't'&&Temp[3] == 'p')
{
Out += Temp;
Out += '\r';
Out += '\n';
StartCrawing(Temp);
}
}
}
}
WebCode.clear();
}
void CWebCrawingDlg::OnBnClickedButton1()//开始爬取按钮事件
{
// TODO: 在此添加控件通知处理程序代码
AllocConsole();//开启控制台查看进度
CString In;
GetDlgItem(IDC_EDIT1)->GetWindowText(In);//编辑框输入第一个靶向网址
StartCrawing(In);
FreeConsole();
SetDlgItemText(IDC_EDIT3,Out);
}