我突发奇想，想用C++写一个爬虫（真该死，为啥我的想法这么丰富呢）。于是在收集了大量资料后，成功写出了同时支持HTTP和HTTPS协议请求的代码，并且能够解析服务器返回的响应内容。
用的OpenSSL和Socket实现的HTTPS请求
支持库
#include <fcntl.h>
#include <netdb.h>
#include <netinet/in.h>
#include <sys/select.h>
#include <sys/socket.h>
#include <sys/time.h>
#include <unistd.h>

#include <cerrno>
#include <chrono>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <string>
#include <vector>

#include <openssl/err.h>
#include <openssl/ssl.h>
如何实现?
首先要设计一个类,用来封装请求。
/**
 * Abstract base class that issues HTTP and HTTPS GET requests over a raw
 * socket (OpenSSL is used for the TLS layer) and can split the raw response
 * into status line, header lines and body.
 *
 * The class owns an SSL_CTX, at most one SSL object and one socket
 * descriptor, so it is neither copyable nor assignable.
 */
class HttpsAndHttpRequest
{
private:
    // Address of the server the current request connects to.
    sockaddr_in serverAddr{};
    // Descriptor of the current socket; the request functions treat 0 as "no socket".
    int socketFD = 0;
    // Timeout, in seconds, used when waiting on the socket.
    int timeOutVal;

    // Components of a URL as produced by urlAnalysis().
    struct UrlStructure
    {
        std::string agreement; // scheme, i.e. the text before "://"
        std::string host;      // host name without the port
        hostent *ip;           // result of gethostbyname(host); null when resolution failed
        std::string port;      // port as text; empty when the URL did not name one
        std::string path;      // path without its leading '/'
        std::string param;     // query string without its leading '?'
    };

    // Splits a URL into its components and resolves the host name.
    UrlStructure urlAnalysis(std::string url);

    // TLS state.
    SSL_CTX *ctx = NULL;
    SSL *ssl = NULL;

    // Raw response that analysisHttpProtocol() works on.
    std::string httpResponse;

    // One parsed line of the response (status line, header line, or the body).
    struct httpProtocol
    {
        std::string allContent; // the raw line
        std::string option;     // the field name
        std::string content;    // the field value
    };

    // When true, sockets are switched to non-blocking mode.
    bool noBlock = true;

    // Parses httpResponse into `protocol`.
    void analysisHttpProtocol();
    // Performs a plain HTTP request and returns the raw response ("" on failure).
    std::string httpRequest(std::string url);
    // Performs an HTTPS request and returns the raw response ("" on failure).
    std::string httpsRequest(std::string url);

public:
    // Reset to false by analysisHttpProtocol(); nothing else in this file reads it.
    bool useHandleProtocol = false;

    /**
     * @param timeoutSeconds timeout, in seconds, for socket waits
     * @param noBlockSet     true to use non-blocking sockets
     */
    HttpsAndHttpRequest(int timeoutSeconds = 5, bool noBlockSet = true) : timeOutVal(timeoutSeconds)
    {
        // Initialise OpenSSL and create the client context.
        SSL_library_init();
        OpenSSL_add_all_algorithms();
        SSL_load_error_strings();
        ctx = SSL_CTX_new(SSLv23_client_method());
        // Select blocking / non-blocking mode.
        noBlock = noBlockSet;
    }

    // The raw OpenSSL handles would be freed twice if the object were copied.
    HttpsAndHttpRequest(const HttpsAndHttpRequest &) = delete;
    HttpsAndHttpRequest &operator=(const HttpsAndHttpRequest &) = delete;

    /**
     * Requests `url` and returns the raw response; when `analysis` is true the
     * response is additionally parsed into `protocol`.
     */
    std::string ConnectWebsite(const std::string &url, bool analysis = false);

    // Changes the timeout (seconds).
    void setTimeoutSeconds(int time);

    // Virtual because the class is meant to be derived from (handleProtocol is
    // pure virtual); deleting a derived object through a base pointer would
    // otherwise be undefined behaviour.
    virtual ~HttpsAndHttpRequest()
    {
        if (ssl)
        {
            SSL_shutdown(ssl);
            SSL_free(ssl);
            ssl = NULL;
        }
        if (ctx)
        {
            SSL_CTX_free(ctx);
            ctx = NULL;
        }
    }

protected:
    // Parsed response lines, exposed to subclasses.
    std::vector<httpProtocol> protocol;
    // Hook for subclasses to process `protocol`.
    // NOTE(review): no call to this hook is visible in this file - confirm where it is meant to run.
    virtual void handleProtocol() = 0;
};
首先我们先解析url
/**
 * Splits `url` into scheme, host, port, path and query string and resolves
 * the host name with gethostbyname().
 *
 * Accepted shape: [scheme://]host[:port][/path][?query][#fragment]
 *
 * The returned `path` has no leading '/', `param` has no leading '?', and
 * `port` is empty when the URL names none. `ip` is null when the host is
 * empty or cannot be resolved. A fragment is discarded because it is never
 * sent to a server.
 */
HttpsAndHttpRequest::UrlStructure HttpsAndHttpRequest::urlAnalysis(std::string url)
{
    typedef std::string::size_type Pos;
    const Pos npos = std::string::npos;

    UrlStructure result;
    result.ip = NULL;

    const Pos fragmentPos = url.find('#');
    if (fragmentPos != npos)
        url.erase(fragmentPos);

    // "://" only marks a scheme when no '/' or '?' precedes it; otherwise it
    // belongs to the path or query (e.g. "a.com/p?next=http://b.com").
    Pos authorityStart = 0;
    const Pos schemeEnd = url.find("://");
    if (schemeEnd != npos && url.find_first_of("/?") > schemeEnd)
    {
        result.agreement = url.substr(0, schemeEnd);
        authorityStart = schemeEnd + 3;
    }

    // The host[:port] part ends at the first '/' or '?', whichever comes first.
    const Pos authorityEnd = url.find_first_of("/?", authorityStart);
    const std::string authority = (authorityEnd == npos)
                                      ? url.substr(authorityStart)
                                      : url.substr(authorityStart, authorityEnd - authorityStart);

    // Everything after the authority: the leading '/' is dropped, a leading '?' is kept.
    std::string target;
    if (authorityEnd != npos)
        target = (url[authorityEnd] == '/') ? url.substr(authorityEnd + 1) : url.substr(authorityEnd);

    const Pos colonPos = authority.find(':');
    if (colonPos != npos)
    {
        result.host = authority.substr(0, colonPos);
        result.port = authority.substr(colonPos + 1);
    }
    else
        result.host = authority;

    const Pos queryPos = target.find('?');
    if (queryPos != npos)
    {
        result.path = target.substr(0, queryPos);
        result.param = target.substr(queryPos + 1);
    }
    else
        result.path = target;

    // gethostbyname() returns a pointer to static storage; it stays valid only
    // until the next resolver call.
    if (!result.host.empty())
        result.ip = gethostbyname(result.host.c_str());
    return result;
}
要实现HTTPS请求,就要先实现最基本的HTTP请求,这里使用socket实现HTTP请求
std::string HttpsAndHttpRequest::httpRequest(std::string url)
{
if (socketFD)
close(socketFD);
UrlStructure urlResource = urlAnalysis(url);
if (urlResource.port.empty())
urlResource.port = "80";
if (!urlResource.ip)
return "";
socketFD = socket(AF_INET, SOCK_STREAM, 0);
if (noBlock)
{
int flags = fcntl(socketFD, F_GETFL, 0);
fcntl(socketFD, F_SETFL, flags | O_NONBLOCK);
}
// 设置连接地址
serverAddr.sin_family = AF_INET;
serverAddr.sin_port = htons(atoi(urlResource.port.c_str()));
serverAddr.sin_addr = *(in_addr *)urlResource.ip->h_addr_list[0];
// 构建http请求
std::string request = "GET /" + urlResource.path + " HTTP/1.1\r\n";
request += "Host: " + urlResource.host + "\r\n";
request += "Connection: close\r\n";
request += "\r\n";
int nRet = connect(socketFD, (sockaddr *)&serverAddr, sizeof(serverAddr));
timeval timeout;
timeout.tv_sec = timeOutVal;
fd_set wait;
FD_ZERO(&wait);
FD_SET(socketFD, &wait);
nRet = select(socketFD + 1, NULL, &wait, NULL, &timeout);
if (nRet <= 0)
return "";
// 发送HTTP请求
if (write(socketFD, request.c_str(), strlen(request.c_str())) < 0)
return "";
std::string response = "";
char buffer[1024];
int len = 0;
timeout.tv_sec = timeOutVal;
FD_ZERO(&wait);
FD_SET(socketFD, &wait);
while (true)
{
int ready = select(socketFD + 1, &wait, NULL, NULL, &timeout);
if (ready > 0)
{
if (FD_ISSET(socketFD, &wait))
{
len = read(socketFD, buffer, sizeof(buffer));
if (len > 0)
response.append(buffer, len);
else
break;
}
}
else
break;
}
close(socketFD);
return response;
}
在HTTP的基础上实现HTTPS
std::string HttpsAndHttpRequest::httpsRequest(std::string url)
{
if (socketFD)
close(socketFD);
UrlStructure urlResource = urlAnalysis(url);
if (urlResource.port.empty())
{
urlResource.port = "443";
}
if (!urlResource.ip)
{
return "";
}
socketFD = socket(AF_INET, SOCK_STREAM, 0);
if (noBlock)
{
int flags = fcntl(socketFD, F_GETFL, 0);
fcntl(socketFD, F_SETFL, flags | O_NONBLOCK);
}
// 设置连接地址
serverAddr.sin_family = AF_INET;
serverAddr.sin_port = htons(atoi(urlResource.port.c_str()));
serverAddr.sin_addr = *(in_addr *)urlResource.ip->h_addr_list[0];
// 构建http请求
std::string request = "GET /" + urlResource.path + " HTTP/1.1\r\n";
request += "Host: " + urlResource.host + "\r\n";
request += "Connection: close\r\n";
request += "\r\n";
int nRet = connect(socketFD, (sockaddr *)&serverAddr, sizeof(serverAddr));
timeval timeout;
timeout.tv_sec = timeOutVal;
fd_set wait;
FD_ZERO(&wait);
FD_SET(socketFD, &wait);
nRet = select(socketFD + 1, NULL, &wait, NULL, &timeout);
if (nRet <= 0)
return "";
SSL_CTX_set_timeout(ctx, timeOutVal);
ssl = SSL_new(ctx);
SSL_set_fd(ssl, socketFD);
timeval startTime;
gettimeofday(&startTime, NULL);
timeval nowTime;
int cutTime = 0;
while ((nRet = SSL_connect(ssl)) != 1)
{
int sslError = SSL_get_error(ssl, nRet);
if ((sslError != SSL_ERROR_WANT_READ && sslError != SSL_ERROR_WANT_WRITE) || cutTime > timeOutVal)
return "";
gettimeofday(&nowTime, NULL);
cutTime = nowTime.tv_sec - startTime.tv_sec;
usleep(100000);
}
if (SSL_write(ssl, request.c_str(), request.size()) <= 0)
{
return "";
}
std::string response = "";
char buffer[1024];
int len = 0;
FD_ZERO(&wait);
FD_SET(socketFD, &wait);
while (true)
{
int ready = select(socketFD + 1, &wait, NULL, NULL, &timeout);
if (ready > 0)
{
if (FD_ISSET(socketFD, &wait))
{
len = SSL_read(ssl, buffer, sizeof(buffer));
if (len > 0)
response.append(buffer, len);
else
break;
}
}
else
break;
}
return response;
}
然后就是解析返回的网页数据,设置超时时间
/**
 * Splits `httpResponse` into entries of `protocol`:
 *   - the first line yields {line, "request-result", <second space-separated token>},
 *     i.e. the status code of a well-formed status line;
 *   - every following header line yields {line, <text before ':'>, <text after ':'>};
 *     the value keeps any whitespace that followed the colon;
 *   - the text after the blank line yields {"request-date : ...", "request-date", <body>}.
 *
 * Does nothing when `httpResponse` is empty. When the header block is not
 * terminated by a blank line, parsing stops at the end of the text and no body
 * entry is added. `useHandleProtocol` is reset to false afterwards.
 */
void HttpsAndHttpRequest::analysisHttpProtocol()
{
    if (httpResponse.empty())
        return;
    protocol.clear();

    typedef std::string::size_type Pos;
    const Pos npos = std::string::npos;

    Pos lineStart = 0;
    for (;;)
    {
        const Pos lineEnd = httpResponse.find("\r\n", lineStart);
        const bool terminated = lineEnd != npos;
        const std::string line = terminated ? httpResponse.substr(lineStart, lineEnd - lineStart)
                                            : httpResponse.substr(lineStart);

        if (line.empty())
        {
            // A blank line separates the headers from the body. Without a
            // line break there is nothing after it, so no body is recorded
            // (adding 2 to npos would wrap around to index 1).
            if (terminated)
                protocol.push_back({"request-date : ...", "request-date", httpResponse.substr(lineEnd + 2)});
            break;
        }

        std::string option;
        std::string content;
        if (lineStart == 0)
        {
            // Status line, e.g. "HTTP/1.1 200 OK": keep the token after the first space.
            option = "request-result";
            const Pos firstSpace = line.find(' ');
            const Pos codeStart = (firstSpace == npos) ? 0 : firstSpace + 1;
            const Pos codeEnd = line.find(' ', codeStart);
            content = (codeEnd == npos) ? line.substr(codeStart) : line.substr(codeStart, codeEnd - codeStart);
        }
        else
        {
            // Header line "Name: value"; a line without ':' is stored whole in both fields.
            const Pos colon = line.find(':');
            option = line.substr(0, colon);
            content = (colon == npos) ? line : line.substr(colon + 1);
        }
        protocol.push_back({line, option, content});

        if (!terminated)
            break;
        lineStart = lineEnd + 2;
    }
    useHandleProtocol = false;
}
void HttpsAndHttpRequest::setTimeoutSeconds(int time)
{
timeOutVal = time;
}
最后再做个汇总
std::string HttpsAndHttpRequest::ConnectWebsite(const std::string &url, bool analysis)
{
UrlStructure urlResource = urlAnalysis(url);
bool NoAgreement = false;
bool existPort = true;
if (urlResource.agreement != "https" || urlResource.agreement != "http")
{
urlResource.agreement = "http";
NoAgreement = true;
}
if (urlResource.port == "")
{
existPort = false;
if (urlResource.agreement == "https")
urlResource.port = "443";
else
urlResource.port = "80";
}
std::string urlContent = urlResource.agreement + "://" + urlResource.host + ":" + urlResource.port + "/" + urlResource.path + "?" + urlResource.param;
std::string result = "";
if (NoAgreement)
{
result = httpsRequest(urlContent);
if (result.empty())
{
if (existPort)
urlContent = "http://" + urlResource.host + ":80" + "/" + urlResource.path + "?" + urlResource.param;
result = httpRequest(urlContent);
}
}
else if (urlResource.agreement == "https")
result = httpsRequest(urlContent);
else
result = httpRequest(urlContent);
if (analysis)
{
httpResponse = result;
analysisHttpProtocol();
}
return result;
}
以上就完成了一个封装好的HTTPS请求
示例
//上面的类和函数
class HttpsAndHttpRequests : public HttpsAndHttpRequest
{
public:
//重写那个虚函数
void handleProtocol()
{
//可以进行处理
}
};
int main(){
HttpsAndHttpRequests request
std::string response = request.ConnectWebsite("https://sober-up.cn/");
std::cout << response << std::endl;
}
总结
这个代码还有很多可以优化的地方,为啥不优化一下?
作者能力有限,不会优化(懒不想优化,目前够用)
有不懂的地方在评论区提出,不想写注释。