吾也爱C++_C++爬虫(爬取百度知道的问题和答案保存文件,问答)

代码较长,但是可复用

#include<iostream>
#include<stdlib.h>
#include<string>

#include<fstream>
#include<cassert>

#include<windows.h>
#include<Wininet.h>
#include<vector>
#include<tchar.h>
#include<ctime>
#pragma comment(lib,"Wininet.lib")

#define DEF_URL "https://zhidao.baidu.com/question/"
#define DEF_SAVEPOS "MyAnswer.txt"

// Singleton that owns the single input stream and single output stream
// used to read and write the question/answer save file.
//
// The streams are shared state: callers pair CheckIn()/CloseIn() around reads
// and CheckOut()/CloseOut() around writes. No locking is performed.
class MyFileManager
{
public:
	// Returns the process-wide instance, creating it on first use.
	static MyFileManager* GetFileManager()
	{
		if (FileManager == nullptr)
		{
			FileManager = new MyFileManager();
		}
		return FileManager;
	}

	MyFileManager(const MyFileManager&) = delete;
	MyFileManager& operator=(const MyFileManager&) = delete;

	// Opens `file` for reading (creating it if it is missing) and closes it again.
	void Read(const std::string& file)
	{
		CheckIn(file);
		CloseIn();
	}

	// Appends one character to the currently open output stream.
	// `file` is accepted for interface compatibility only; the target is
	// whatever CheckOut() opened.
	void Write(const std::string& file, char c)
	{
		(void)file;
		OutFile.put(c);
	}

	// Scans the currently open input stream for a line equal to `qua` and
	// returns the line that follows it (the stored answer).
	// Returns the fallback reply when the question is not present or has no
	// following line. `file` is accepted for interface compatibility only;
	// the stream must already have been opened with CheckIn().
	std::string Search(const std::string& file, const std::string& qua)
	{
		(void)file;
		std::string Line;
		// Stop on the stream state instead of a fixed iteration cap, so the
		// loop ends exactly at end-of-file and questions past line 100 are found.
		while (std::getline(InFile, Line))
		{
			if (Line == qua)
			{
				std::string Answer;
				if (std::getline(InFile, Answer))
				{
					return Answer;
				}
				break; // question was the last line: there is no answer to return
			}
		}
		return "人家不知道呢";
	}

	// Opens `file` on the shared input stream.
	// If the file cannot be opened it is created empty; the input stream is
	// left closed in that case, so subsequent reads yield nothing.
	void CheckIn(const std::string& file)
	{
		InFile.open(file);
		if (!InFile.is_open())
		{
			// Create the file that was actually requested.
			CreateSaveFile(file);
		}
		else if (!InFile)
		{
			// open() on a stream that is still open fails but leaves it open.
			std::cout << "存在未关闭输入流" << std::endl;
		}
	}

	// Closes the shared input stream.
	void CloseIn()
	{
		InFile.close();
	}

	// Opens `file` on the shared output stream (default mode: truncates).
	void CheckOut(const std::string& file)
	{
		OutFile.open(file);
		if (!OutFile)
		{
			std::cout << "存在未关闭输出流" << std::endl;
		}
	}

	// Closes the shared output stream.
	void CloseOut()
	{
		OutFile.close();
	}

	// Write-permission check: writing is allowed only when the first line of
	// `file` is empty (or the file is missing/unreadable).
	bool CheckPermission(const std::string& file)
	{
		CheckIn(file);
		std::string FirstLine;
		std::getline(InFile, FirstLine);
		CloseIn();
		if (FirstLine.empty())
		{
			return true;
		}
		std::cout << "存在数据,禁止写入" << std::endl;
		return false;
	}

	// Creates an empty save file. With no argument the default save location
	// is used, as before.
	void CreateSaveFile(const std::string& file = DEF_SAVEPOS)
	{
		std::ofstream Created(file);
		if (Created)
		{
			std::cout << "创造成功" << std::endl;
		}
		else
		{
			// Do not claim success when the file could not be created.
			std::cout << "创造失败" << std::endl;
		}
	}

private:
	std::ifstream InFile;   // shared input stream
	std::ofstream OutFile;  // shared output stream
	static MyFileManager* FileManager;
	MyFileManager() {}
};
MyFileManager* MyFileManager::FileManager = nullptr;

// File-manager pointer used by the rest of the program.
MyFileManager* OpenFileManager = MyFileManager::GetFileManager();

// Singleton front-end for the chat: sets the console colour and looks up
// answers in the save file through the global file manager.
class Group
{
public:
	// Lazily creates the sole Group instance and hands it back.
	static Group* GetGroupObject()
	{
		if (!group)
			group = new Group();
		return group;
	}

	// Switches the console colours via the shell `color` command.
	void SetColor()
	{
		system("color 0a");
	}

	// Returns whatever the file manager finds for `qua` in the save file.
	std::string GetGroupAnswer(std::string qua)
	{
		std::string Reply = OpenFileManager->Search(DEF_SAVEPOS, qua);
		return Reply;
	}

private:
	static Group* group;
	Group(){};
};
Group* Group::group = nullptr;

int StartCrawling(int max)//开始爬取
{
	if (!OpenFileManager->CheckPermission(DEF_SAVEPOS))
	{
		return 0;
	}
	time_t StartTime = time(0);
	std::vector<char> WebCode;//所有源码字节//使用标准库中的容器访问较快,用char*访问字符集很慢,当然效果是一样的
	OpenFileManager->CheckOut(DEF_SAVEPOS);
	int Time=0;
	for (; Time < max; Time++)
	{
		int UrlLess = 11230416+Time*10;//网址后缀
		std::string CurrtUrl = DEF_URL+std::to_string(UrlLess);
		TCHAR SzUrl[100];
		_stprintf_s(SzUrl, _T("%S"), CurrtUrl.c_str());
		HINTERNET Net1 = InternetOpen(NULL, INTERNET_OPEN_TYPE_PRECONFIG, NULL, NULL, NULL);
		if (Net1 == NULL)
		{
			InternetCloseHandle(Net1);
			return 0;
		}
		HINTERNET Net2 = InternetOpenUrl(Net1, SzUrl, NULL, NULL, INTERNET_FLAG_NO_CACHE_WRITE, NULL);
		if (Net2 == NULL)
		{
			InternetCloseHandle(Net2);
			InternetCloseHandle(Net1);
			return 0;
		}
		DWORD DwMaxDataLength = 500;
		PBYTE PBuff = (PBYTE)malloc(DwMaxDataLength*sizeof(TCHAR));
		if (PBuff == NULL)
		{
			InternetCloseHandle(Net2);
			InternetCloseHandle(Net1);
			return 0;
		}
		DWORD DwReadDataLength = NULL;
		BOOL bReta = TRUE;
		do
		{
			ZeroMemory(PBuff, DwMaxDataLength*sizeof(TCHAR));
			bReta = InternetReadFile(Net2, PBuff, DwMaxDataLength, &DwReadDataLength);
			for (DWORD dw = 0; dw < DwReadDataLength; dw++)
			{
				WebCode.push_back(PBuff[dw]);
			}
		} while (DwReadDataLength!=NULL);
		std::vector<char>::iterator i;
		for (i = WebCode.begin(); i != WebCode.end(); i++)
		{
			if (*i == 't'&&*(i + 1) == 'i'&&*(i + 2) == 't'&&*(i + 3) == 'l'&&*(i + 4) == 'e')//捕获titile//如果用string 或者字符数组可以直接用正则表达式
			{
				std::vector<char>::iterator j = i + 6;
				while (*j !='<'&&*j!='_')
				{
					if ((*j == '百'&&*(j + 1) == '度'&&*(j + 2) == '知'&&*(j + 3) == '道')||(*(j + 1) == '百'&&*(j + 2) == '度'&&*(j + 3) == '知'&&*(j + 4) == '道'))
					{
						break;
					}
					else
					{
						if (*j!='  '&&*j!='\n'){
							OpenFileManager->Write(DEF_SAVEPOS, *j);
						}
						j++;
					}
					
				}
				OpenFileManager->Write(DEF_SAVEPOS, '\n');
				break;
			}
		}
		for (i = WebCode.begin(); i != WebCode.end(); i++)
		{
			if (*i == 'a'&&*(i + 1) == 'r'&&*(i + 2) == 'r'&&*(i + 3) == 'o'&&*(i + 4) == 'w'&&*(i + 5) == 'd'&&*(i + 6) == 'o'&&*(i + 7) == 'w'&&*(i + 8) == 'n')//捕获arrowdown
			{
				std::vector<char>::iterator j = i + 32;
				while (*j != '<')
				{
					if (*j != '  '&&*j != '\n'){
						OpenFileManager->Write(DEF_SAVEPOS, *j);
					}
					j++;
				}
				OpenFileManager->Write(DEF_SAVEPOS, '\n');
				OpenFileManager->Write(DEF_SAVEPOS, '\n');
				break;
			}
		}
		if (i==WebCode.end())
		{
			char *NullAnswer = "未找到答案\n";
			while (*NullAnswer!='\0')
			{
				OpenFileManager->Write(DEF_SAVEPOS, *NullAnswer);
				NullAnswer++;
			}
		}
		WebCode.clear();//一定要清掉,要不然下一次查找会定位到本次的结果
		system("cls");
		std::cout << "已完成爬取:" << Time << "/" << max << std::endl;
		
	}
	if (Time==max)
	{
		time_t EndTime = time(0);
		system("cls");
		std::cout << "已完成100%";
		if (EndTime - StartTime>3600)
		{
			std::cout << " 耗时:" << (EndTime - StartTime) / 3600 << "h" << (EndTime - StartTime)%3600/ 60 << "min" << (EndTime - StartTime) % 3600 % 60 << std::endl;
		}
		else if (EndTime - StartTime>60)
		{
			std::cout << " 耗时:" << (EndTime - StartTime) / 60 << "min" << (EndTime - StartTime) % 60 << "s" << std::endl;
		}
		else
		{
			std::cout << " 耗时:" << EndTime - StartTime << "s" << std::endl;
		}
	}
	OpenFileManager->CloseOut();
	return 0;
}

// Entry point: crawls ten question pages into the save file, then answers
// typed questions from that file until the user enters "关闭" or input ends.
int main()
{
	Group *Talk = Group::GetGroupObject();
	Talk->SetColor();
	StartCrawling(10);
	std::string qua;
	std::cout << "简单问答(1对1)" << std::endl;
	while (true)
	{
		std::cout << "提问:";
		// Leave on end-of-input or a read error as well as on the quit
		// command; otherwise a closed stdin would loop forever.
		if (!(std::cin >> qua) || qua == "关闭")
		{
			break;
		}
		// Reopen the save file for every question so the search starts at the top.
		OpenFileManager->CheckIn(DEF_SAVEPOS);
		std::cout << "小水:";
		std::cout << (Talk->GetGroupAnswer(qua));
		std::cout << std::endl << std::endl;
		OpenFileManager->CloseIn();
	}

	return 0;
}
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值