#include <cctype>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iostream>
#include <regex>
#include <set>
#include <string>
#include <winsock2.h>
#pragma comment(lib,"ws2_32.lib") // link against WinSock 2 (MSVC-specific pragma)
using namespace std;
const int MAX_BUFFER_SIZE = 1024 * 1024; // hard cap on an HTTP response (1 MiB)
const int MAX_URL_LENGTH = 1024; // URL / path buffer size, including the NUL
const int MAX_HOST_LENGTH = 64; // host-name buffer size, including the NUL
// 从URL中解析主机名和文件路径
void parse_url(const char* url, char* host, char* path)
{
char* p = strstr(url, "://");
if (p == NULL) return;
p += 3;
char* q = strstr(p, "/");
if (q == NULL) {
strcpy_s(path, MAX_URL_LENGTH, "/");
strcpy_s(host, MAX_HOST_LENGTH, p);
} else {
*q = '\0';
strcpy_s(path, MAX_URL_LENGTH, q);
strcpy_s(host, MAX_HOST_LENGTH, p);
*q = '/';
}
}
// 获取HTML源码
bool get_http_content(SOCKET s, const char* path, char* buffer, int size)
{
char request[MAX_URL_LENGTH + MAX_HOST_LENGTH + 32] = { 0 };
sprintf_s(request, MAX_URL_LENGTH + MAX_HOST_LENGTH + 32, "GET %s HTTP/1.1\r\nHost: %s\r\nConnection: close\r\n\r\n", path, buffer);
int n = send(s, request, strlen(request), 0);
if (n < 0) return false;
char recvbuf[MAX_BUFFER_SIZE] = { 0 };
n = recv(s, recvbuf, MAX_BUFFER_SIZE - 1, 0);
if (n <= 0) return false;
recvbuf[n] = '\0';
char* p = strstr(recvbuf, "\r\n\r\n");
if (p == NULL) return false;
memcpy_s(buffer, size, p + 4, n - (p - recvbuf) - 4);
return true;
}
// 获取所有URL
void get_all_urls(const char* html, const char* url, set<string>& urls)
{
regex reg("<a.*?href=\"([^\"]+)\".*?>");
smatch match;
string s(html);
while (regex_search(s, match, reg)) {
string tmp = match[1].str();
if (tmp.find("http") == 0) { // 绝对路径
urls.insert(tmp);
} else if (tmp.find("/") == 0) { // 相对路径
char host[MAX_HOST_LENGTH] = { 0 };
char path[MAX_URL_LENGTH] = { 0 };
parse_url(url, host, path);
char newurl[MAX_URL_LENGTH] = { 0 };
sprintf_s(newurl, "%s%s", host, tmp.c_str());
urls.insert(newurl);
} else { // 相对路径
char newurl[MAX_URL_LENGTH] = { 0 };
char* p = strrchr((char*)url, '/');
if (p == NULL) continue;
sprintf_s(newurl, "%.*s/%s", p - url, url, tmp.c_str());
urls.insert(newurl);
}
s = match.suffix();
}
}
// 保存文件
bool save_file(const char* url, const char* html, const char* save_dir)
{
char filename[MAX_URL_LENGTH] = { 0 };
sprintf_s(filename, "%s\\%s.html", save_dir, url);
ofstream out(filename);
if (out.fail()) return false;
out << html << endl;
out.close();
return true;
}
// 下载网页并获取所有URL
void download_and_parse(const char* url, const char* save_dir, set<string>& urls)
{
char host[MAX_HOST_LENGTH] = { 0 };
char path[MAX_URL_LENGTH] = { 0 };
parse_url(url, host, path);
WSADATA wsaData = { 0 };
int nResult = WSAStartup(MAKEWORD(2, 2), &wsaData);
if (nResult != NO_ERROR) {
cout << "WSAStartup failed with error: " << nResult << endl;
return;
}
SOCKET socket_fd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
struct hostent* server = gethostbyname(host);
if (server == NULL) {
cout << "No such host: " << host << endl;
return;
}
struct sockaddr_in serv_addr = { 0 };
serv_addr.sin_family = AF_INET;
serv_addr.sin_addr.s_addr = *((unsigned long*)server->h_addr);
serv_addr.sin_port = htons(80);
if (connect(socket_fd, (struct sockaddr*)&serv_addr, sizeof(serv_addr)) < 0) {
cout << "Connect failed" << endl;
return;
}
char request[MAX_URL_LENGTH + MAX_HOST_LENGTH + 32] = { 0 };
sprintf_s(request, MAX_URL_LENGTH + MAX_HOST_LENGTH + 32, "GET %s HTTP/1.1\r\nHost: %s\r\nConnection: close\r\n\r\n", path, host);
if (send(socket_fd, request, strlen(request), 0) < 0) {
cout << "Send failed" << endl;
return;
}
char recvbuf[MAX_BUFFER_SIZE] = { 0 };
int n = recv(socket_fd, recvbuf, MAX_BUFFER_SIZE - 1, 0);
if (n <= 0) {
cout << "Recv failed" << endl;
return;
}
recvbuf[n] = '\0';
char* p = strstr(recvbuf, "\r\n\r\n");
if (p == NULL) {
cout << "HTTP response format error" << endl;
return;
}
char html[MAX_BUFFER_SIZE] = { 0 };
memcpy_s(html, MAX_BUFFER_SIZE, p + 4, n - (p - recvbuf) - 4);
save_file(url, html, save_dir);
get_all_urls(html, url, urls);
closesocket(socket_fd);
WSACleanup();
}
// 爬取网站
void spider(const char* url, const char* save_dir, int max_depth)
{
set<string> urls;
urls.insert(url);
for (int depth = 1; depth <= max_depth; depth++) {
cout << "depth:" << depth << endl;
set<string> tmp_urls;
for (auto& u : urls) {
cout << u << endl;
download_and_parse(u.c_str(), save_dir, tmp_urls);
Sleep(100);
}
urls.clear();
urls = tmp_urls;
}
}
// Entry point. Usage: crawler [start_url] [save_dir] [max_depth]
// Omitted arguments fall back to the original hard-coded defaults, so running
// with no arguments behaves exactly as before.
// NOTE(review): the default URL is https but the crawler speaks plain HTTP on
// port 80, so TLS-only hosts will not be fetched — confirm the target.
int main(int argc, char* argv[])
{
    const char* url = (argc > 1) ? argv[1] : "https://blog.csdn.net/nav/sec";
    const char* save_dir = (argc > 2) ? argv[2] : "D:\\";
    int max_depth = (argc > 3) ? atoi(argv[3]) : 3;
    if (max_depth <= 0) max_depth = 3; // guard against non-numeric/bad input
    spider(url, save_dir, max_depth);
    return 0;
}
// C++ information gathering — trailing article metadata left over from the web
// page this code was copied from (latest recommended article published
// 2024-09-16 18:00:58); kept as a comment so the file compiles.