网络编程实现百度查询

编写一个程序，模拟浏览器向百度服务器发送 HTTP 请求并获取搜索结果，然后把百度服务器返回的第一页结果中的所有链接提取出来，显示在屏幕上。

头文件
Chttp.h

// Small blocking HTTP client built on a Winsock socket: it splits a URL into
// host and path, connects to the host, sends a GET request and prints the
// Baidu redirect links found in the returned page.
class Chttp
{
public:
	std::string m_host;				// host name parsed from the URL
	std::string m_path;				// resource path parsed from the URL
	SOCKET m_socket;				// socket created in the constructor and used for the request
public:
	Chttp();
	~Chttp();
	bool AnalyseURL(std::string url);					// parse the URL into m_host / m_path
	bool Connect();										// connect to the server
	bool HandleMes(std::string& html);					// send the GET request and append the response to html
	bool FetGet(std::string url, std::string& html);	// fetch the web page
	bool GetLink(std::string& html);					// print the matching links found in html
};


http.cpp

#define _WINSOCK_DEPRECATED_NO_WARNINGS
#include<iostream>
#include<WinSock2.h>
#include<regex>
#include<set>
#pragma comment(lib,"ws2_32.lib")
using namespace std;

#include "Chttp.h"

// Initialises Winsock 2.2 and creates the TCP socket used for the request.
// m_socket starts as INVALID_SOCKET and stays that way if either step fails,
// so a later connect()/send() on it reports an error instead of operating on
// an uninitialised handle.
Chttp::Chttp() : m_socket(INVALID_SOCKET)
{
	WSADATA wsadata;
	const int startupResult = WSAStartup(MAKEWORD(2, 2), &wsadata);
	if (startupResult != 0)
	{
		// WSAStartup returns the error code directly (WSAGetLastError is not usable yet).
		std::cerr << "WSAStartup failed: " << startupResult << std::endl;
		return;
	}

	m_socket = socket(AF_INET, SOCK_STREAM, 0);
	if (m_socket == INVALID_SOCKET)
		std::cerr << "socket() failed: " << WSAGetLastError() << std::endl;
}
// Calls WSACleanup to balance the WSAStartup call made in the constructor.
// NOTE(review): m_socket is not closed here; the only closesocket call in this
// file is at the end of HandleMes -- confirm the socket cannot leak when
// HandleMes is never reached.
Chttp::~Chttp()
{

	WSACleanup();
}

// Splits `url` into m_host (text between the scheme and the first '/') and
// m_path (everything from that '/' on, or "/" when the URL has no path).
//
// Only URLs that START with "http://" or "https://" are accepted; a scheme
// that merely appears later in the string (e.g. inside a query parameter) is
// no longer mistaken for the URL's own scheme.
// NOTE(review): "https://" is still accepted for compatibility, but Connect()
// always uses plain TCP on port 80 -- confirm that is intended.
//
// Returns true when a non-empty host was extracted, false otherwise.
bool Chttp::AnalyseURL(std::string url)
{
	// Reset first so a failed parse never reports the previous URL's host.
	m_host.clear();
	m_path.clear();

	static const char* const kSchemes[] = { "http://", "https://" };
	for (const char* scheme : kSchemes)
	{
		const std::string prefix(scheme);

		// compare() anchors the match at position 0, unlike find().
		if (url.compare(0, prefix.length(), prefix) != 0)
			continue;

		const size_t hostBegin = prefix.length();
		const size_t pathBegin = url.find('/', hostBegin);
		if (pathBegin == std::string::npos)
		{
			// No path component: request the site root.
			m_host = url.substr(hostBegin);
			m_path = "/";
		}
		else
		{
			m_host = url.substr(hostBegin, pathBegin - hostBegin);
			m_path = url.substr(pathBegin);
		}
		break;
	}

	return !m_host.empty();
}

bool Chttp::Connect()
{
	HOSTENT* ip;
	SOCKADDR_IN m_adr;

	ip = gethostbyname(m_host.c_str());
	if (ip == NULL)
		return false;

	memset(&m_adr, 0, sizeof(SOCKADDR_IN));
	m_adr.sin_family = AF_INET;
	m_adr.sin_port = htons(80);
	memcpy(&m_adr.sin_addr, ip->h_addr, 4);

	if (int i = connect(m_socket, (SOCKADDR*)&m_adr, sizeof(SOCKADDR)) == SOCKET_ERROR)
	{
		cout << i << endl;
		return false;
	}

	return true;
}

// Downloads the page at `url` and appends the response to `html`.
//
// Steps: parse the URL, connect to the host, then send the request and read
// the reply. Returns false (after printing a diagnostic) as soon as any step
// fails; returns true when the whole exchange succeeded.
bool Chttp::FetGet(std::string url, std::string &html)
{
	// Parse the URL into m_host / m_path.
	if (!AnalyseURL(url))
	{
		std::cout << "analyse False\n";
		return false;
	}
	std::cout << "host:" << m_host << "\tpath:" << m_path << std::endl;

	// Connect to the server. Stop here on failure: sending a request over an
	// unconnected socket cannot succeed.
	if (!Connect())
	{
		std::cout << "Connect() error !" << std::endl;
		return false;
	}

	// Send the request and collect the response.
	if (!HandleMes(html))
	{
		std::cout << "Handle Message error \t";
		return false;
	}

	return true;
}

bool Chttp::HandleMes(std::string& html)
{
	//发送get请求
	std::string get = "";
	get += "GET " + m_path + " HTTP/1.1\r\n";
	get += "HOST: " + m_host + "\r\n";
	//get += "User-Agent:Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36\r\n";
	get += "User-Agent:*\r\n";
	get += "Connection: Close\r\n";
	get += "\r\n";

	if (send(m_socket, get.c_str(), get.length(), 0) == SOCKET_ERROR)
		return false;

	//接收
	char ch = 0;
	int sum = 0;
	while (recv(m_socket, &ch, 1, 0))
	{
		sum++;
		html += ch;
	}
	std::cout << "总共接收了" << sum << "个字符.\n";

	//编码转换 防止在控制台显示乱码
	char* pszBuffer = new char[html.length() + 1];
	wchar_t* pszWideBuffer = new wchar_t[(html.length() + 1) * 2];
	memset(pszWideBuffer, 0, (html.length() + 1) * 2);
	memset(pszBuffer, 0, html.length() + 1);
	MultiByteToWideChar(CP_UTF8, 0, html.c_str(), html.length(), pszWideBuffer, (html.length() + 1) * 2);//将unicode编码,转换为宽字节
	WideCharToMultiByte(CP_ACP, 0, pszWideBuffer, wcslen(pszWideBuffer), pszBuffer, html.length() + 1, NULL, NULL);//将宽字节,转换为控制台编码
	html = pszBuffer;
	delete[] pszBuffer;
	delete[] pszWideBuffer;

	closesocket(m_socket);

	return true;
}

// Scans `html` for "http://..." URLs, keeps those that contain "link?url="
// (Baidu's redirect links rather than the final target addresses), and prints
// each distinct link on its own line in sorted order. Always returns true.
bool Chttp::GetLink(std::string& html)
{
	const std::regex linkPattern("http://[^\\s'\"<>()]+");

	// A set both removes duplicates and orders the links for output.
	std::set<std::string> redirectLinks;

	const std::sregex_iterator noMoreMatches;
	for (std::sregex_iterator match(html.cbegin(), html.cend(), linkPattern);
		match != noMoreMatches; ++match)
	{
		const std::string candidate = match->str();
		if (candidate.find("link?url=") == std::string::npos)
			continue;	// not a Baidu redirect link

		redirectLinks.insert(candidate);
	}

	for (const std::string& link : redirectLinks)
		std::cout << link << std::endl;

	return true;
}

百度查询.cpp

#define _WINSOCK_DEPRECATED_NO_WARNINGS//gethostbyname 和 inet_ntoa
#define _CRT_SECURE_NO_WARNINGS
//#include"Chttp.h"
#include<iostream>
#include<string>
#include<regex>
#include<winsock.h>
#include"Chttp.h"
#pragma comment(lib,"ws2_32.lib")
using namespace std;

bool Catch(string);

int main()
{
	string url,name;
	cout << "请输入要查询事物的名称:" << endl;
	//cout << "请输入要抓取的URL地址: " << endl;
	cin >> name;
	url = "http://www.baidu.com/s?wd=" + name;
	Catch(url);

	return 0;
}

// Fetches the page at `url` and prints the links extracted from it.
// Returns false when the page could not be fetched; otherwise returns the
// result of the link extraction.
bool Catch(string url)
{
	string html;
	Chttp http;

	// Fetch the page; without a response there is nothing to scan.
	if (!http.FetGet(url, html))
		return false;

	return http.GetLink(html);
}
  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值