编写一个程序，模拟浏览器向百度服务器发送 HTTP 请求以获取搜索结果，然后提取百度服务器返回的第一页结果中的所有链接，并显示在屏幕上。
头文件
Chttp.h
// Small blocking HTTP client built on a Winsock socket.
// Typical use: FetGet() downloads a page into a string, then GetLink()
// prints the links of interest found in that string.
class Chttp
{
public:
std::string m_host; // host name extracted from the URL by AnalyseURL()
std::string m_path; // resource path extracted from the URL by AnalyseURL()
SOCKET m_socket; // socket used for the connection to the server
public:
Chttp();
~Chttp();
bool AnalyseURL(std::string url); // parse the URL into m_host / m_path; false if no host was found
bool Connect(); // resolve m_host and connect m_socket to it
bool HandleMes(std::string& html); // send the GET request and append the server's reply to html
bool FetGet(std::string url, std::string& html); // fetch the page at url into html (parse + connect + request)
bool GetLink(std::string& html); // print the links found in html that contain "link?url="
};
http.cpp
#define _WINSOCK_DEPRECATED_NO_WARNINGS
#include<iostream>
#include<WinSock2.h>
#include<regex>
#include<set>
#include<string>
#pragma comment(lib,"ws2_32.lib")
using namespace std;
#include "Chttp.h"
// Initializes Winsock 2.2 and creates the TCP socket used for the request.
// If Winsock cannot be started, m_socket is left as INVALID_SOCKET so that
// later connect/send calls fail cleanly instead of using an undefined handle.
Chttp::Chttp()
    : m_socket(INVALID_SOCKET)
{
    WSADATA wsadata;
    const int startupResult = WSAStartup(MAKEWORD(2, 2), &wsadata);
    if (startupResult != 0)
    {
        // WSAStartup returns the error code directly (WSAGetLastError is not
        // usable before a successful startup).
        std::cout << "WSAStartup failed: " << startupResult << std::endl;
        return;
    }
    m_socket = socket(AF_INET, SOCK_STREAM, 0);
}
// Balances the WSAStartup call made in the constructor.
// The socket itself is not closed here; HandleMes() closes it after the
// response has been read.
Chttp::~Chttp()
{
    static_cast<void>(WSACleanup());
}
// Splits an absolute "http://" or "https://" URL into m_host and m_path.
//
// url     - the URL to parse; the scheme must be the very first characters.
// returns - true when a non-empty host was extracted, false otherwise
//           (m_host and m_path are left empty on failure).
//
// The host ends at the first '/' or '?'. A missing path becomes "/", and a
// query that directly follows the host is sent as "/?query". Any "#fragment"
// is dropped because fragments are never transmitted to the server.
// A ":port" suffix, if present, is not split off and stays part of m_host.
bool Chttp::AnalyseURL(std::string url)
{
    // Start from a clean state so a failed parse cannot report success by
    // reusing the host of a previously parsed URL.
    m_host.clear();
    m_path.clear();

    const size_t fragmentPos = url.find('#');
    if (fragmentPos != std::string::npos)
        url.erase(fragmentPos);

    // The scheme has to be a prefix; finding it somewhere inside the URL
    // (e.g. inside a query parameter) must not count.
    size_t hostStart = 0;
    if (url.compare(0, 7, "http://") == 0)
        hostStart = 7;
    else if (url.compare(0, 8, "https://") == 0)
        hostStart = 8;
    else
        return false;

    const size_t hostEnd = url.find_first_of("/?", hostStart);
    if (hostEnd == std::string::npos)
    {
        m_host = url.substr(hostStart);
        m_path = "/";
    }
    else
    {
        m_host = url.substr(hostStart, hostEnd - hostStart);
        m_path = url.substr(hostEnd);
        if (m_path[0] == '?')
            m_path = "/" + m_path; // request target must begin with '/'
    }

    if (m_host.empty())
    {
        m_path.clear();
        return false;
    }
    return true;
}
bool Chttp::Connect()
{
HOSTENT* ip;
SOCKADDR_IN m_adr;
ip = gethostbyname(m_host.c_str());
if (ip == NULL)
return false;
memset(&m_adr, 0, sizeof(SOCKADDR_IN));
m_adr.sin_family = AF_INET;
m_adr.sin_port = htons(80);
memcpy(&m_adr.sin_addr, ip->h_addr, 4);
if (int i = connect(m_socket, (SOCKADDR*)&m_adr, sizeof(SOCKADDR)) == SOCKET_ERROR)
{
cout << i << endl;
return false;
}
return true;
}
// Downloads the page at `url` into `html`.
//
// url     - absolute URL of the page to fetch.
// html    - receives the server's reply (appended by HandleMes).
// returns - true only if the URL was parsed, the connection was established
//           and the request/response exchange succeeded.
bool Chttp::FetGet(std::string url, std::string &html)
{
    // Parse the URL
    if (!AnalyseURL(url))
    {
        std::cout << "analyse False\n";
        return false;
    }
    std::cout << "host:" << m_host << "\tpath:" << m_path << std::endl;

    // Connect to the server; without a connection there is nothing to
    // send on, so stop here instead of falling through to the request.
    if (!Connect())
    {
        std::cout << "Connect() error !" << std::endl;
        return false;
    }

    if (!HandleMes(html))
    {
        std::cout << "Handle Message error \t";
        return false;
    }
    return true;
}
bool Chttp::HandleMes(std::string& html)
{
//发送get请求
std::string get = "";
get += "GET " + m_path + " HTTP/1.1\r\n";
get += "HOST: " + m_host + "\r\n";
//get += "User-Agent:Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36\r\n";
get += "User-Agent:*\r\n";
get += "Connection: Close\r\n";
get += "\r\n";
if (send(m_socket, get.c_str(), get.length(), 0) == SOCKET_ERROR)
return false;
//接收
char ch = 0;
int sum = 0;
while (recv(m_socket, &ch, 1, 0))
{
sum++;
html += ch;
}
std::cout << "总共接收了" << sum << "个字符.\n";
//编码转换 防止在控制台显示乱码
char* pszBuffer = new char[html.length() + 1];
wchar_t* pszWideBuffer = new wchar_t[(html.length() + 1) * 2];
memset(pszWideBuffer, 0, (html.length() + 1) * 2);
memset(pszBuffer, 0, html.length() + 1);
MultiByteToWideChar(CP_UTF8, 0, html.c_str(), html.length(), pszWideBuffer, (html.length() + 1) * 2);//将unicode编码,转换为宽字节
WideCharToMultiByte(CP_ACP, 0, pszWideBuffer, wcslen(pszWideBuffer), pszBuffer, html.length() + 1, NULL, NULL);//将宽字节,转换为控制台编码
html = pszBuffer;
delete[] pszBuffer;
delete[] pszWideBuffer;
closesocket(m_socket);
return true;
}
// Scans `html` for "http://" URLs, keeps those that contain "link?url="
// (Baidu's redirect links, not the final target addresses), and prints each
// distinct one on its own line in sorted order.
//
// html    - page text to scan; it is not modified.
// returns - always true.
bool Chttp::GetLink(std::string& html)
{
    const std::regex urlPattern("http://[^\\s'\"<>()]+");
    std::set<std::string> redirectLinks;

    const std::sregex_iterator noMoreMatches;
    for (std::sregex_iterator match(html.cbegin(), html.cend(), urlPattern);
         match != noMoreMatches;
         ++match)
    {
        const std::string candidate = match->str();
        if (candidate.find("link?url=") == std::string::npos)
            continue;
        // std::set drops duplicates and keeps the links ordered
        redirectLinks.insert(candidate);
    }

    for (const std::string& link : redirectLinks)
        std::cout << link << std::endl;

    return true;
}
百度查询.cpp
#define _WINSOCK_DEPRECATED_NO_WARNINGS//gethostbyname 和 inet_ntoa
#define _CRT_SECURE_NO_WARNINGS
//#include"Chttp.h"
#include<iostream>
#include<string>
#include<regex>
#include<winsock.h>
#include"Chttp.h"
#pragma comment(lib,"ws2_32.lib")
using namespace std;
bool Catch(string);
// Percent-encodes every byte of `text` that is not an RFC 3986 "unreserved"
// character (letters, digits, '-', '_', '.', '~') so the value can be placed
// safely inside a URL query string.
// NOTE(review): bytes are encoded exactly as read from the console; which
// character set the server expects for "wd" is not visible here - confirm.
static std::string EncodeQueryValue(const std::string& text)
{
    static const char kHexDigits[] = "0123456789ABCDEF";

    std::string encoded;
    encoded.reserve(text.size() * 3);
    for (size_t i = 0; i < text.size(); ++i)
    {
        const unsigned char byte = static_cast<unsigned char>(text[i]);
        const bool unreserved =
            (byte >= 'A' && byte <= 'Z') ||
            (byte >= 'a' && byte <= 'z') ||
            (byte >= '0' && byte <= '9') ||
            byte == '-' || byte == '_' || byte == '.' || byte == '~';
        if (unreserved)
        {
            encoded += static_cast<char>(byte);
        }
        else
        {
            encoded += '%';
            encoded += kHexDigits[byte >> 4];
            encoded += kHexDigits[byte & 0x0F];
        }
    }
    return encoded;
}

// Asks the user for a search term, builds the Baidu search URL for it and
// prints the result links.
// returns 0 when the search ran, 1 when no term was entered or the fetch
// failed.
int main()
{
    std::string name;
    std::cout << "请输入要查询事物的名称:" << std::endl;

    // Read the whole line so that terms containing spaces are not cut off
    // at the first blank.
    if (!std::getline(std::cin, name) || name.empty())
    {
        std::cout << "查询内容不能为空" << std::endl;
        return 1;
    }

    // Spaces, '&', '#', non-ASCII bytes etc. would corrupt the request line
    // if inserted verbatim, so encode the term first.
    const std::string url = "http://www.baidu.com/s?wd=" + EncodeQueryValue(name);
    return Catch(url) ? 0 : 1;
}
// Fetches the page at `url` and prints the links extracted from it.
//
// url     - absolute URL of the page to download.
// returns - false if the page could not be fetched, otherwise the result of
//           the link extraction.
bool Catch(string url)
{
    // Fetch the page
    string html;
    Chttp http;
    if (!http.FetGet(url, html))
    {
        // Nothing reliable was downloaded, so there is nothing to scan.
        return false;
    }
    //cout <<endl << html << endl;
    return http.GetLink(html);
}