c++编写网络爬虫

luciferau

已于 2023-11-18 13:49:53 修改

阅读量3k

点赞数 3

文章标签： c++ 爬虫

于 2023-07-05 15:20:02 首次发布

本文链接：https://blog.csdn.net/m0_72703340/article/details/131555757

版权

文章介绍了如何使用C++和EasyX库创建一个带有图形化界面的爬虫程序。首先，详细讲解了如何安装和使用EasyX来创建窗口，并调整窗口位置。接着，展示了如何设计和绘制按钮，包括拖动窗口、点击事件处理和按钮状态的改变。之后，文章提到了输入URL的功能，并创建了用于输入的对话框。最后，文章简要提及了网络连接、下载HTML内容以及遍历和处理网页中URL的初步步骤。

摘要由CSDN通过智能技术生成

c++爬虫项目

实现图形化界面UI

在这里插入图片描述

安装 EasyX（需要用到其中的 graphics.h）。我之前的文章详细写过如何安装，就是这篇文章提到的：传送门

easyx官网

创建图形化界面

#define WINDOW_WIDTH 482
#define WINDOW_HEIGHT 300
// Creates the EasyX drawing window (WINDOW_WIDTH x WINDOW_HEIGHT, opened with
// the EX_SHOWCONSOLE flag) and switches the background mode to TRANSPARENT.
void initUI()
{
	const int windowWidth = WINDOW_WIDTH;
	const int windowHeight = WINDOW_HEIGHT;

	initgraph(windowWidth, windowHeight, EX_SHOWCONSOLE);
	setbkmode(TRANSPARENT);
}

移动窗口位置

int screenWidth;
int screenHeight; // screen resolution, in pixels

// Query the screen resolution.
	screenWidth = GetSystemMetrics(SM_CXSCREEN);
	screenHeight = GetSystemMetrics(SM_CYSCREEN);
	hwnd = GetHWnd(); // handle of the current EasyX window
	// Remove the white title bar. The WS_CAPTION bits are cleared with a mask
	// rather than subtracted: subtraction only gives the right result when
	// every WS_CAPTION bit is currently set and corrupts the style otherwise.
	SetWindowLong(
		hwnd,
		GWL_STYLE, // replace the window style
		GetWindowLong(hwnd, GWL_STYLE) & ~WS_CAPTION);

// Position the window at 70% of the screen width, 100 px from the top.
// MoveWindow takes int coordinates, so the conversion is made explicit.
MoveWindow(hwnd, static_cast<int>(screenWidth * 0.7), 100, WINDOW_WIDTH, WINDOW_HEIGHT, false);

绘制UI界面

	// Move the window, then load the background image and draw it at the
	// window origin.
	// NOTE(review): the image path is an empty string in this excerpt; a later
	// listing in this article loads "./UI.jpg" - confirm which is intended.
	MoveWindow(hwnd, screenWidth * 0.7, 100, WINDOW_WIDTH, WINDOW_HEIGHT, false);
	loadimage(&imgBg, "");
	putimage(0, 0, &imgBg);

编译运行

在这里插入图片描述

更改代码

// Same call as before, but placed at 50% of the screen width and with the
// last argument (repaint) set to true.
MoveWindow(hwnd, screenWidth*0.5, 100, WINDOW_WIDTH, WINDOW_HEIGHT, true);

完美绘制。MoveWindow 的最后一个参数为 true 时，表示移动后重新绘制窗口。
在这里插入图片描述

初始化按钮

制作按钮

我们需要从背景图上扣出两个按钮一个是原本的标题栏（可以拖动程序的），另一个是程序的关闭按钮。

在这里插入图片描述

制作的比较随意。。。

最后一个按钮是进入按钮（方案放弃）

在这里插入图片描述

再次DIY

在这里插入图片描述

对按钮进行初始化

先创建按钮的类

// An image-based button: one picture per state plus its geometry.
struct Button {
	IMAGE imgNormal;  // picture used while the button is not pressed
	IMAGE imgPress;   // picture used while the button is pressed
	int width;        // button width, in pixels
	int highth;       // button height, in pixels (field name kept as in the original)
	int x;            // left edge inside the window
	int y;            // top edge inside the window
	int flag;         // integer tag supplied by the caller
	bool pressed;     // current pressed state
};

写一个初始化按钮的函数

// Fills in a Button: stores its size and tag, resets the pressed state and
// loads both state images scaled to width x highth.
// The position (x, y) is not touched here; callers assign it afterwards.
// A null `btn` is ignored.
void initButton(Button* btn, const char* normalFile, const char* pressFile,
	int width, int highth, int flag)
{
	if (btn == nullptr) {
		return;
	}

	btn->width = width;
	btn->highth = highth;
	btn->flag = flag;
	btn->pressed = false;

	loadimage(&btn->imgNormal, normalFile, width, highth, true);
	loadimage(&btn->imgPress, pressFile, width, highth, true);
}

定义三个按钮

// The three buttons of the UI.
Button btnClose; // close button
Button btnTitle; // title-bar button (used to drag the window)
Button btnEnter; // "enter" button

载入函数

	// Load and draw the background image.
	loadimage(&imgBg, "./UI.jpg");
	putimage(0, 0, &imgBg);
	// Close button: 37x29, placed at the right end of the top edge.
	initButton(&btnClose, "./normal.bmp", "./press.bmp", 37, 29, 0);
	btnClose.x = WINDOW_WIDTH - 37;
	btnClose.y = 0;
	// Title-bar button: 445x29, placed at the top-left corner.
	initButton(&btnTitle, "./title_normal.jpg", "./title_press.jpg", 445, 29, 0);
	btnTitle.x = 0;
	btnTitle.y = 0;

	// Enter button: 165x53 at (145, 172).
	initButton(&btnEnter, "./enter_normal.bmp", "./enter_press.bmp", 165, 53, 0);
	btnEnter.x = 145;
	btnEnter.y = 172;

初始化完毕（累死我了大部份时间都在绘图）；

绘制按钮

写一个绘制按钮的函数

再写一个绘制透明图片的函数（EasyX 默认不支持绘制带透明通道的图片，这里按 Alpha 混合公式逐像素合成）

// Draws the button at (x, y) using the picture that matches its pressed
// state. A null `btn` is ignored.
void drawButton(Button* btn)
{
	if (btn == nullptr) {
		return;
	}

	IMAGE* face = btn->pressed ? &btn->imgPress : &btn->imgNormal;
	drawPNG(btn->x, btn->y, face);
}

void drawPNG(int  picture_x, int picture_y, IMAGE* picture) //x为载入图片的X坐标，y为Y坐标
{

	// 变量初始化
	DWORD* dst = GetImageBuffer();    // GetImageBuffer()函数，用于获取绘图设备的显存指针，EASYX自带
	DWORD* draw = GetImageBuffer();
	DWORD* src = GetImageBuffer(picture); //获取picture的显存指针
	int picture_width = picture->getwidth(); //获取picture的宽度，EASYX自带
	int picture_height = picture->getheight(); //获取picture的高度，EASYX自带
	int graphWidth = getwidth();       //获取绘图区的宽度，EASYX自带
	int graphHeight = getheight();     //获取绘图区的高度，EASYX自带
	int dstX = 0;    //在显存里像素的角标

	// 实现透明贴图 公式： Cp=αp*FP+(1-αp)*BP ， 贝叶斯定理来进行点颜色的概率计算
	for (int iy = 0; iy < picture_height; iy++)
	{
		for (int ix = 0; ix < picture_width; ix++)
		{
			int srcX = ix + iy * picture_width; //在显存里像素的角标
			int sa = ((src[srcX] & 0xff000000) >> 24); //0xAArrggbb;AA是透明度
			int sr = ((src[srcX] & 0xff0000) >> 16); //获取RGB里的R
			int sg = ((src[srcX] & 0xff00) >> 8);   //G
			int sb = src[srcX] & 0xff;              //B
			if (ix >= 0 && ix <= graphWidth && iy >= 0 && iy <= graphHeight && dstX <= graphWidth * graphHeight)
			{
				dstX = (ix + picture_x) + (iy + picture_y) * graphWidth; //在显存里像素的角标
				int dr = ((dst[dstX] & 0xff0000) >> 16);
				int dg = ((dst[dstX] & 0xff00) >> 8);
				int db = dst[dstX] & 0xff;
				draw[dstX] = ((sr * sa / 255 + dr * (255 - sa) / 255) << 16)  //公式： Cp=αp*FP+(1-αp)*BP  ； αp=sa/255 , FP=sr , BP=dr
					| ((sg * sa / 255 + dg * (255 - sa) / 255) << 8)         //αp=sa/255 , FP=sg , BP=dg
					| (sb * sa / 255 + db * (255 - sa) / 255);              //αp=sa/255 , FP=sb , BP=db
			}
		}
	}
}

先对一些变量进行初始化

	bool titleDrag = false; // true while the title bar is being dragged
	int titleLastX; // mouse X recorded when the drag started
	int titleLastY; // mouse Y recorded when the drag started

然后写一个函数判断此时鼠标的位置是否在button上

// Reports whether the mouse position in `msg` lies inside `btn`.
// The hit area is the button rectangle shrunk by 1% of its width/height on
// every side; the edges of that shrunken rectangle count as inside.
bool checkButtonSelect(Button* btn, MOUSEMSG* msg) {
	const float margin = 0.01;

	const float left = btn->x + btn->width * margin;
	const float right = btn->x + btn->width * (1 - margin);
	const float top = btn->y + btn->highth * margin;
	const float bottom = btn->y + btn->highth * (1 - margin);

	const bool insideX = msg->x >= left && msg->x <= right;
	const bool insideY = msg->y >= top && msg->y <= bottom;
	return insideX && insideY;
}

实现鼠标消息的读取与 UI 界面的综合逻辑代码。Talk is cheap, show me the code：

// Main UI loop: reads one mouse message per iteration and updates the three
// buttons (title bar, enter, close) accordingly. Never exits on its own; the
// close button ends the process.
while (1) {
		MOUSEMSG m = GetMouseMsg();
		FlushMouseMsgBuffer(); // Required: dragging the title button quickly produces too many
		// mouse messages and the handling gets confused otherwise.
		switch (m.uMsg) {
		case WM_MOUSEMOVE:
			if (checkButtonSelect(&btnTitle, &m)) {
				if (btnTitle.pressed == true) {
					if (titleDrag == false) { // title bar was pressed and the drag is about to start
						titleLastX = m.x; // remember the starting position
						titleLastY = m.y;
						titleDrag = true;
					}
					else { // title bar is pressed and being dragged
						// offset of the mouse from the drag start
						int offX = m.x - titleLastX;
						int offY = m.y - titleLastY;
						moveWindow(hwnd, offX, offY); // move the window by that offset
					}
				}
			}
			else if (checkButtonSelect(&btnEnter, &m)) {
				// hovering over the enter button shows its pressed picture
				btnEnter.pressed = true;
				drawButton(&btnEnter);
			}
			else if (checkButtonSelect(&btnClose, &m)) {
				// hovering over the close button shows its pressed picture
				btnClose.pressed = true;
				drawButton(&btnClose);
			}
			else {
				// The mouse is over no button: restore any button it just left.
				if (btnClose.pressed == true) { // mouse left the close button
					btnClose.pressed = false;
					drawButton(&btnClose);
				}
				if (btnEnter.pressed == true) { // mouse left the enter button
					btnEnter.pressed = false;
					drawButton(&btnEnter);
				}
			}
			break;
		case WM_LBUTTONDOWN:
			if (checkButtonSelect(&btnTitle, &m)) {
				btnTitle.pressed = true; // left button pressed on the title bar
			}
			else if (checkButtonSelect(&btnClose, &m)) {
				btnClose.pressed = true;
				drawButton(&btnClose);
			}
			else if (checkButtonSelect(&btnEnter, &m)) {
				btnEnter.pressed = true;
				drawButton(&btnEnter);
			}
			break;
		case WM_LBUTTONUP:
			if (checkButtonSelect(&btnClose, &m)) {
				// close button clicked: close the graphics window and exit
				closegraph();
				exit(0);
			}
			else if (checkButtonSelect(&btnEnter, &m)) {
			// TODO: enter-button action
			}
			else if (checkButtonSelect(&btnTitle, &m))
			{
				// left button released over the title bar: end the drag
				btnTitle.pressed = false;
				titleDrag = false;
			}
			break;
		}
	}

绘制按钮

	// Draw all three buttons in their current state.
	drawButton(&btnEnter);
	drawButton(&btnClose);
	drawButton(&btnTitle);

这样差不多UI就设计完成

run一下
在这里插入图片描述

在这里插入图片描述

获取url地址（主角登场）

// Buffer that receives the URL typed by the user (1024 wide characters).
// NOTE(review): later listings in this article declare `url` as a string
// instead - confirm which declaration the final program uses.
wchar_t url[1024];

.....
// Enter-button branch: ask for the URL with the EasyX input dialog.
else if (checkButtonSelect(&btnEnter, &m)) {
			// TODO

			// read the URL - this is where the real work starts
			InputBox((LPTSTR)url, 1024, "请输入URL地址");// EasyX function that reads the user's input into url
			 
		}

InputBox函数

// EasyX input dialog (declaration as quoted from the EasyX documentation).
bool InputBox(
	LPTSTR	pString,			// buffer that receives the string typed by the user
	int		nMaxCount,			// size of that buffer, including the terminating '\0'; limits the input length
	LPCTSTR	pPrompt = NULL,		// prompt text shown in the dialog ("\n" splits lines); NULL shows none
	LPCTSTR	pTitle = NULL,		// title-bar text; NULL shows the application name
	LPCTSTR	pDefault = NULL,	// default value shown in the input area
	int		width = 0,			// dialog width without border, minimum 200 px; 0 uses the default width
	int		height = 0,			// dialog height without border; 0 = automatic height and single-line input
	bool	bHideCancelBtn = true	// true: only an OK button, ESC has no effect; false: OK and Cancel, "X"/ESC close the dialog
);

参数

pString

指定接收用户输入字符串的指针。

nMaxCount

指定 pString 指向的缓冲区的大小，该值会限制用户输入内容的长度。缓冲区的大小包括表示字符串结尾的 ‘\0’ 字符。当允许多行输入时，用户键入的回车占两个字符位置。

pPrompt

指定显示在对话框中的提示信息。提示信息中可以用“\n”分行。InputBox 的高度会随着提示信息内容的多少自动扩充。如果该值为 NULL，则不显示提示信息。

pTitle

指定 InputBox 的标题栏。如果为 NULL，将显示应用程序的名称。

pDefault

指定显示在用户输入区的默认值。

width

指定 InputBox 的宽度（不包括边框），最小为 200 像素。如果为 0，则使用默认宽度。

height

指定 InputBox 的高度（不包括边框）。如果为 0，表示自动计算高度，用户输入框只允许输入一行内容，按“回车”确认输入信息；如果大于 0，用户输入框的高度会自动拓展，同时允许输入多行内容，按“Ctrl+回车”确认输入信息。

bHideCancelBtn

指定是否隐藏取消按钮禁止用户取消输入。如果为 true(默认)，InputBox 只有一个“确定”按钮，没有“X”关闭按钮，按 ESC 无效；如果为 false，InputBox 有“确定”和“取消”按钮，允许点“X”和按 ESC 关闭窗口。

更新代码

string url;
// Receives the text typed into the dialog. This must be real storage: the
// original declared an uninitialized `char*`, so InputBox wrote through a
// wild pointer.
char URL[1024] = { 0 };

// The prompt is passed as pPrompt (3rd argument), as in the other InputBox
// calls in this article; the original passed it as pDefault, which pre-fills
// the input field with the prompt text. width/height are ints, so 0 is used
// instead of NULL. false keeps the Cancel button available.
InputBox(URL, 1024, "请输入URL地址", NULL, NULL, 0, 0, false);

将URL 转为string格式的url；

// Copy the text received from the dialog into the string url.
url = URL;

创建文件夹保存爬取资源

创建reptile();函数

// Crawler entry point (first version): asks the user for the URL through the
// EasyX input dialog and stores the answer in `url`.
void reptile()
{
	const int urlCapacity = 1024; // size passed to InputBox for the url buffer

	InputBox((LPTSTR)url, urlCapacity, "请输入URL地址");
}

优化代码

// Crawler entry point (first version): asks the user for the URL through the
// EasyX input dialog and stores the answer in `url`.
void reptile()
{
	const int urlCapacity = 1024; // size passed to InputBox for the url buffer

	InputBox((LPTSTR)url, urlCapacity, "请输入URL地址");
}
-------------------------------
.....
// Enter-button branch: hand over to the crawler entry point.
else if (checkButtonSelect(&btnEnter, &m)) {
			// TODO
		
		reptile();
			 
		}

创建文件夹

	// Create the output folders: the root first, then one sub-folder each for
	// images and videos. The return values are not checked.
	CreateDirectory("./resource", NULL);

	CreateDirectory("./resource/images", NULL);

	CreateDirectory("./resource/videos", NULL);

抓取函数

 // Crawl driver: processes URLs from a FIFO queue that is seeded with `url`.
 // Each dequeued URL is parsed with AnalysisURL; URLs that fail to parse are
 // skipped. Always returns true.
 bool startCatch(string url)
{
	queue<string> q; // URLs waiting to be processed
	q.push(url);

	while (!q.empty())
	{
		string currentUrl = q.front(); // URL handled in this iteration
		q.pop();

		// Parse the URL just taken from the queue. The original passed the
		// seed `url` here, so every iteration re-parsed the same address.
		if (false == AnalysisURL(currentUrl))
		{
			continue;
		}
	}

	return true;
}

原理图
在这里插入图片描述

包含头文件queue

#include <queue>

 //解析URL
// Splits an "http://host/path" URL into the globals sHost (host name) and
// sObject (resource path, "/" when the URL has none).
//
// Returns false, after showing a message box, when the URL does not start
// with "http://", when nothing follows the scheme, or when host/object end up
// empty. Returns true otherwise and prints both parts to the console.
bool AnalysisURL(string url) {
	// Owner window for the error dialogs.
	HWND hwndurl = GetHWnd();

	const string::size_type schemeLen = 7; // length of "http://"

	// The scheme must be the prefix of the URL. The original accepted
	// "http://" anywhere in the string but then always cut at offset 7,
	// which produced a wrong host for such input.
	if (url.compare(0, schemeLen, "http://") != 0) {
		MessageBox(hwndurl, "解析失败未找到协议", NULL, NULL);
		return false;
	}

	if (url.length() <= schemeLen)
	{
		MessageBox(hwndurl, "url长度过小", NULL, NULL);
		return false;
	}

	// Host ends at the first '/' after the scheme. size_type is used so the
	// comparison with npos is not subject to int truncation.
	const string::size_type pos = url.find('/', schemeLen);
	if (pos == string::npos)
	{
		sHost = url.substr(schemeLen);
		sObject = "/";
	}
	else {
		sHost = url.substr(schemeLen, pos - schemeLen);
		sObject = url.substr(pos);
	}

	if (sHost.empty() || sObject.empty())
	{
		MessageBox(hwndurl, "host or object is empty ", NULL, NULL);
		return false;
	}

	cout << "域名：" << sHost << endl << "资源:" << sObject << endl;

	return true;
}

 // Crawler entry point: creates the output folders, shows the "catch" screen
 // with the close button on top, reads the URL from the console and starts
 // the crawl.
 void reptile()
{
	// Output folders, root first so the sub-folders can be created inside it.
	const char* const folders[] = {
		"./resource",
		"./resource/images",
		"./resource/videos",
	};
	for (const char* folder : folders) {
		CreateDirectory(folder, NULL);
	}

	// Replace the window contents with the crawl screen.
	loadimage(&imgAnalysis, "./catch.jpg");
	putimage(0, 0, &imgAnalysis);
	drawButton(&btnClose);

	// Read the start URL from the console, then crawl it.
	cin >> url;
	startCatch(url);
}

	// Enter-button branch: tell the user (dialog + console) to type the URL
	// in the console, then run the crawler.
	else if (checkButtonSelect(&btnEnter, &m)) {
					// TODO
					MessageBox(hwndUI, "请在控制台输入URL地址", NULL, NULL);
					cout << "请输入URL" << endl;
					reptile();
				}

优化使用多线程进行逻辑判断界面操作

好处：程序可以一直循环使用，不需要重开；UI 界面一直在运行，不会卡死。

	// Excerpts showing how mainUI is started.
	// NOTE(review): mainUI contains an endless loop, so the direct call below
	// would not return before CreateThread is reached - presumably these are
	// alternative snippets rather than one sequence; confirm.
	LPVOID param = 0; 

mainUI(param);// run the UI logic directly

	DWORD  threadID = 0;
	
	// Run mainUI on its own thread; the thread id is stored in threadID.
	HANDLE handleUI = CreateThread(NULL, NULL,mainUI, NULL, NULL, &threadID);

// Forward declaration of the thread entry point.
DWORD WINAPI mainUI(LPVOID param);

// Thread entry point for the UI: reads one mouse message per iteration and
// updates the title-bar, enter and close buttons. `param` is not used.
// The loop never ends by itself; the close button exits the process, so the
// final return is not reached in practice.
DWORD WINAPI mainUI(LPVOID param)
{

		HWND hwndUI;
		hwndUI = GetHWnd();
		bool titleDrag = false; // true while the title bar is being dragged
		while (1) {
			MOUSEMSG m = GetMouseMsg();
			FlushMouseMsgBuffer(); // Required: dragging the title button quickly produces too many
			// mouse messages and the handling gets confused otherwise.
			switch (m.uMsg) {
			case WM_MOUSEMOVE:
				if (checkButtonSelect(&btnTitle, &m)) {
					if (btnTitle.pressed == true) {
						if (titleDrag == false) { // title bar was pressed and the drag is about to start
							titleLastX = m.x; // remember the starting position
							titleLastY = m.y;
							titleDrag = true;
						}
						else { // title bar is pressed and being dragged
							// offset of the mouse from the drag start
							int offX = m.x - titleLastX;
							int offY = m.y - titleLastY;
							moveWindow(hwnd, offX, offY); // move the window by that offset
						}
					}
				}
				else if (checkButtonSelect(&btnEnter, &m)) {
					
					// hovering over the enter button shows its pressed picture
					btnEnter.pressed = true;
					drawButton(&btnEnter);
				}
				else if (checkButtonSelect(&btnClose, &m)) {
					// hovering over the close button shows its pressed picture
					btnClose.pressed = true;
					drawButton(&btnClose);
				}
				else {
					// The mouse is over no button: restore any button it just left.
					if (btnClose.pressed == true) { // mouse left the close button
						btnClose.pressed = false;
						drawButton(&btnClose);
					}
					if (btnEnter.pressed == true) { // mouse left the enter button
						btnEnter.pressed = false;
						drawButton(&btnEnter);
					}
				}
				break;
			case WM_LBUTTONDOWN:
				if (checkButtonSelect(&btnTitle, &m)) {
					btnTitle.pressed = true; // left button pressed on the title bar
				}
				else if (checkButtonSelect(&btnClose, &m)) {
					btnClose.pressed = true;
					drawButton(&btnClose);
				}
				else if (checkButtonSelect(&btnEnter, &m)) {
					btnEnter.pressed = true;
					drawButton(&btnEnter);
				}
				break;
			case WM_LBUTTONUP:
				if (checkButtonSelect(&btnClose, &m)) {
					// close button clicked: close the graphics window and exit
					closegraph();
					exit(0);
				}
				else if (checkButtonSelect(&btnEnter, &m)) {
					// Enter clicked: ask for the URL on the console, then crawl.
					// reptile() is called on this thread, so no mouse messages
					// are handled until it returns.
					MessageBox(hwndUI, "请在控制台输入URL地址", NULL, NULL);
					cout << "请输入URL" << endl;
					reptile();


				}
				else if (checkButtonSelect(&btnTitle, &m))
				{
					// left button released over the title bar: end the drag
					btnTitle.pressed = false;
					titleDrag = false;
				}
				break;
			}
		}


	return NULL;
}

联网下载html

// Crawl driver: processes URLs from a FIFO queue that is seeded with `url`.
// Each dequeued URL is parsed with AnalysisURL and then connected to with
// Connect; a URL is skipped when either step fails. Always returns true.
bool startCatch(string url)
{
	queue<string> q; // URLs waiting to be processed
	q.push(url);

	while (!q.empty())
	{
		string currentUrl = q.front(); // URL handled in this iteration
		q.pop();

		// Parse the URL just taken from the queue. The original passed the
		// seed `url` here, so every iteration re-parsed the same address.
		if (false == AnalysisURL(currentUrl))
			continue;

		// Open the connection to the host found by AnalysisURL.
		if (false == Connect())
			continue;
	}

	return true;
}

bool Connect() {

	//初始化网络
	WSADATA wsadata;
	if (WSAStartup(MAKEWORD(2, 2), &wsadata))  
		return false;
	 
	//创建套接字
	sock_client = socket(AF_INET, SOCK_STREAM, 0);

	if (sock_client == INVALID_SOCKET)
	 return false;
	 

	//解析域名为IP地址
	hostent *p =  gethostbyname(sHost.c_str());

	if (p == NULL)
		return false;
	//连接web服务器
	sockaddr_in sa;
	sa.sin_family = AF_INET;
	sa.sin_port = htons(80);
	//IP地址
	memcpy(&sa.sin_port, p->h_addr, 4);

	if (connect(sock_client, (sockaddr*)&sa, sizeof(sockaddr)))
		return false;


}

获取网页

// Downloads the resource at `url` over plain HTTP and returns everything the
// server sends back (status line and headers included).
//
// Returns an empty string when the URL cannot be parsed, the connection
// cannot be opened, or the request cannot be sent.
//
// NOTE(review): sock_client is left open when this function returns; confirm
// where the socket is meant to be closed.
string GetHtml(string url)
{
	// Parse the URL into sHost / sObject.
	if (false == AnalysisURL(url))
		return "";
	if (false == Connect())
		return "";

	// Build the GET request. The request line needs a space on both sides of
	// the path ("GET /path HTTP/1.1"); the original concatenated the three
	// parts without separators.
	string info;
	info += "GET " + sObject + " HTTP/1.1\r\n";
	info += "Host: " + sHost + "\r\n";
	info += "Connection: Close\r\n\r\n";

	// The original returned `false` here, which is not a valid string value.
	if (SOCKET_ERROR == send(sock_client, info.c_str(), static_cast<int>(info.length()), 0))
		return "";

	// Read until the server closes the connection (recv returns 0) or an
	// error occurs (recv returns SOCKET_ERROR, which is negative). The
	// original treated the negative error value as "keep going" and could
	// loop forever; it also read a single byte per call.
	string html;
	char buffer[4096];
	int nRecv = 0;
	while ((nRecv = recv(sock_client, buffer, sizeof(buffer), 0)) > 0) {
		html.append(buffer, nRecv);
	}

	return html;
}

代码优化

#include <fstream>
// Receive the response, keeping it in `html` and mirroring it to ./html.txt.
char ch = 0;

fstream dataFile;   // file that stores a copy of the HTML
dataFile.open("./html.txt", ios::out);
if (!dataFile)
{
	// Receiving still continues; writes to the failed stream have no effect.
	printf("文件打开失败!\n");

}

string html;
// Stop on both "connection closed" (0) and SOCKET_ERROR (negative). The
// original loop condition treated the negative error value as true and never
// terminated after a socket error.
while (recv(sock_client, &ch, sizeof(char), 0) > 0) {
	html += ch;
	dataFile << ch;

}
dataFile.close();

运行出现错误connect连接失败；
在这里插入图片描述

更改代码（代码写错了）

	// Corrected connect check: report the result on the console and return
	// false on failure, true on success.
	if (SOCKET_ERROR == connect(sock_client, (sockaddr*)&sa, sizeof(sockaddr)) )
		{
			cout << "服务器连接失败" << endl;

			return false;
		}
		else
		{
			cout<<"服务器连接成功"<<endl;
			return true;
		}

遍历URL

cout << html << endl;

		// Collect every http:// URL that appears in the page.
		smatch mat;
		string::const_iterator start = html.begin();
		string::const_iterator end = html.end();

		// A URL runs until whitespace, a quote, an angle bracket or a parenthesis.
		regex gex("http://[^\\s'\"<>()]+");
		vector<string> vecUrl;

		while (regex_search(start,end,mat,gex))
		{
			// remember the match and continue searching right after it
			vecUrl.push_back(mat[0].str());
			start = mat[0].second;
		}

		// Download every URL that names a .jpg picture.
		// Files go to "./resource/images/", the folder reptile() creates (the
		// original wrote to the non-existent "./resource/image/"), and are
		// numbered 1.jpg, 2.jpg, ... so that each picture gets its own file
		// instead of all of them overwriting 1.jpg.
		int imageIndex = 0;
		for (size_t i = 0; i < vecUrl.size(); i++)
		{
			const string& imgUrl = vecUrl[i];
			if (imgUrl.find(".jpg") != string::npos)
			{
				string filename = "./resource/images/" + to_string(++imageIndex) + ".jpg";
				Download(imgUrl, filename);
			}
		}

下载图片

// Writes everything received on the global socket sock_client into the file
// `filename` (binary mode).
//
// Returns true when the stream ended normally and every byte was written;
// false when the file cannot be opened, a write is short, or recv reports an
// error.
//
// NOTE(review): `url` is not used - the data is read from the connection that
// is already open on sock_client, and any HTTP header sent by the server ends
// up in the file. Confirm whether this function should request `url` itself.
bool Download(string url, string filename) {

	FILE* fp = fopen(filename.c_str(), "wb");
	if (fp == NULL)
		return false; // e.g. the target folder does not exist

	char buffer[4096] = { 0 };
	int nRecv = 0;
	bool ok = true;

	// Keep the byte count returned by recv: the original never assigned it,
	// so fwrite was always asked to write 0 bytes. Stopping at <= 0 also ends
	// the loop on SOCKET_ERROR instead of spinning forever.
	while ((nRecv = recv(sock_client, buffer, sizeof(buffer), 0)) > 0)
	{
		if (fwrite(buffer, 1, nRecv, fp) != static_cast<size_t>(nRecv)) {
			ok = false; // short write
			break;
		}
	}
	if (nRecv < 0)
		ok = false; // recv failed

	fclose(fp);

	return ok;
}

至此，一个图形化界面的c++爬虫就做好了，后续还有代码优化（文件头处理，图片批量下载，代码重构）

源代码：

head.h

#pragma once
#ifndef HEAD_H    //目的是为了防止头文件重复包含
#define HEAD_H
#include <iostream>
#include <string>
#include <conio.h>
#include <queue>
#include <graphics.h>
#include <Windows.h>
#include <regex>
#pragma comment (lib,"WS2_32.lib")

#include <fstream>    //包含文件流的头文件

using namespace std;

#define WINDOW_WIDTH 482
#define WINDOW_HEIGHT 300


//定义按钮
// An image-based button: one picture per state plus its geometry.
struct Button {
	IMAGE imgNormal;  // picture used while the button is not pressed
	IMAGE imgPress;   // picture used while the button is pressed
	int width;        // button width, in pixels
	int highth;       // button height, in pixels (field name kept as in the original)
	int x;            // left edge inside the window
	int y;            // top edge inside the window
	int flag;         // integer tag supplied by the caller
	bool pressed;     // current pressed state
};


// --- UI helpers ---
// Load both state images of a button and store its size and tag.
void initButton(Button* btn, const char* normalFile, const char* pressFile, int width, int highth, int flag);
// Draw an image at (picture_x, picture_y) with per-pixel alpha blending.
void drawPNG(int  picture_x, int picture_y, IMAGE* picture);
// Report whether the mouse position in msg is inside the button.
bool checkButtonSelect(Button* btn, MOUSEMSG* msg);
// Move the window by the given offset (definition not shown in this article).
void moveWindow(HWND hwnd, int offX, int offY);
// Draw a button in its current (normal / pressed) state.
void drawButton(Button* btn);
// Create the graphics window.
void initUI();
// Mouse-message loop of the UI; has the signature of a Win32 thread entry point.
DWORD WINAPI mainUI(LPVOID param);

// --- Crawler ---
void reptile();// crawler entry point
// Process the URL queue seeded with url.
bool startCatch(string url);
// Split url into the globals sHost and sObject.
bool AnalysisURL(string url);
bool Connect();// connect to the server named by sHost
// Fetch the page at url and return the received text.
string GetHtml(string url);
// Save the data received on the socket into filename.
bool Download(string url,string filename);

// Program-wide state.
// NOTE(review): these are definitions, not declarations, so including this
// header from more than one source file would define each variable twice -
// presumably the project has a single .cpp file; confirm.
int screenWidth;  // screen width, in pixels
int screenHeight; // screen height, in pixels
HWND hwnd;// handle of the UI window
IMAGE imgBg;// background image
IMAGE imgAnalysis; // image shown while crawling
// The three buttons of the UI.
Button btnClose;
Button btnTitle;
Button btnEnter;

int titleLastX; // mouse X recorded when a title-bar drag started
int titleLastY; // mouse Y recorded when a title-bar drag started




SOCKET sock_client;// client socket

// URL being crawled and its parts
string url;
string sObject; // resource path part of the URL
string sHost;// host name


#endif