吾也爱C++_C++爬虫,MFC实现,爬取网页超链接及资源链接,使用递归爬取下一层

由于使用MFC框架,这里只上传按钮响应函数和递归爬取函数的代码,其余函数均由MFC自动生成。代码中用到的 Time(已访问页面计数)和 Out(输出文本)定义在未贴出的代码中。注意:有的网站禁止爬虫抓取,爬取前请先查看该网站的 robots 协议。

void StartCrawing(CString In)//开始爬取
{
	if (Time<100)//至多访问100个页面(根据需要调整,无上限,曾调试过250000,无异常,只是时间长)
	{
		Time++;
		_cprintf("%d Websites Visited\n", Time);//调试时查看进度
	}
	else
	{
		return;
	}
	std::vector<char> WebCode;//页面源码
	TCHAR SzUrl[100];
	_tcscpy(SzUrl, In);
	HINTERNET Net1 = InternetOpen(NULL, INTERNET_OPEN_TYPE_PRECONFIG, NULL, NULL, NULL);
	if (Net1 == NULL)
	{
		InternetCloseHandle(Net1);
		return;
	}
	HINTERNET Net2 = InternetOpenUrl(Net1, SzUrl, NULL, NULL, INTERNET_FLAG_NO_CACHE_WRITE, NULL);
	if (Net2 == NULL)
	{
		InternetCloseHandle(Net2);
		InternetCloseHandle(Net1);
		return;
	}
	DWORD DwMaxDataLength = 500;
	PBYTE PBuff = (PBYTE)malloc(DwMaxDataLength*sizeof(TCHAR));
	if (PBuff == NULL)
	{
		InternetCloseHandle(Net2);
		InternetCloseHandle(Net1);
		return;
	}
	DWORD DwReadDataLength = NULL;
	BOOL bReta = TRUE;
	do
	{
		ZeroMemory(PBuff, DwMaxDataLength*sizeof(TCHAR));
		bReta = InternetReadFile(Net2, PBuff, DwMaxDataLength, &DwReadDataLength);
		for (DWORD dw = 0; dw < DwReadDataLength; dw++)
		{
			WebCode.push_back(PBuff[dw]);
		}
	} while (DwReadDataLength != NULL);
	CString Temp;
	for (std::vector<char>::iterator i = WebCode.begin(); i != WebCode.end(); i++)
	{
		Temp = "";
		CString Front;
		if (*i == 'h'&&*(i + 1) == 'r'&&*(i + 2) == 'e'&&*(i + 3) == 'f'&&*(i + 4) == '='&&*(i + 5) == '"')
		{
			std::vector<char>::iterator j = i + 6;
			while (*j != '"')
			{
				Temp += *j;
				j++;
			}
			if (Temp != "javascript:;"&&Temp != "")
			{
				if (Temp[0] == '/'&&Temp[1] == '/')
				{
					Front = "https:";
					Temp = Front + Temp;
					Out += Temp;
					Out += '\r';
					Out += '\n';
					StartCrawing(Temp);
				}
				else if (Temp[0] == '/' || Temp[0] == '#')
				{
					Front = "";
					int i = 0, Sum = 0;
					while (1)
					{
						if (In[i] == '/')
						{
							Sum++;
						}
						if (Sum>2)
						{
							break;
						}
						Front += In[i];
						i++;
					}
					Temp = Front + Temp;
					Out += Temp;
					Out += '\r';//这是归位符,编辑框换行要用到
					Out += '\n';
					StartCrawing(Temp);
				}
			}
		}
		else if (*i == 's'&&*(i + 1) == 'r'&&*(i + 2) == 'c'&&*(i + 3) == '='&&*(i + 4) == '"')
		{
			std::vector<char>::iterator j = i + 5;
			while (*j != '"')
			{
				Temp += *j;
				j++;
			}
			if (Temp != "javascript:;"&&Temp != "")
			{
				if (Temp[0] == '/'&&Temp[1] == '/')
				{
					Front = "https:";
					Temp = Front + Temp;
					Out += Temp;
					Out += '\r';
					Out += '\n';
					StartCrawing(Temp);
				}
				else if (Temp[0] == '/' || Temp[0] == '#')
				{
					Front = "";
					int i = 0, Sum = 0;
					while (1)
					{
						if (Sum>2)
						{
							break;
						}
						Front += In[i];
						if (In[i] == '/')
						{
							Sum++;
						}
						i++;
					}
					Temp = Front + Temp;
					Out += Temp;
					Out += '\r';
					Out += '\n';
					StartCrawing(Temp);
				}
				else if (Temp[0] == 'h'&&Temp[1] == 't'&&Temp[2] == 't'&&Temp[3] == 'p')
				{
					Out += Temp;
					Out += '\r';
					Out += '\n';
					StartCrawing(Temp);
				}

			}
		}
	}
	WebCode.clear();
}

void CWebCrawingDlg::OnBnClickedButton1()//开始爬取按钮事件
{
	// TODO:  在此添加控件通知处理程序代码
	AllocConsole();//开启控制台查看进度
	CString In;
	GetDlgItem(IDC_EDIT1)->GetWindowText(In);//编辑框输入第一个靶向网址
	StartCrawing(In);
	FreeConsole();
	SetDlgItemText(IDC_EDIT3,Out);
}

结果如图(为避免等待过久,截图时将访问页面数上限设为10)

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值