#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <curl/curl.h>
/*
 * libcurl write callback: copies one received chunk of the HTTP response
 * body to stdout.
 *
 * contents     - start of the received data; treated as raw bytes, so it
 *                does not need to be NUL-terminated
 * size, nmemb  - the chunk is size * nmemb bytes long
 * userp        - unused by this callback
 *
 * Returns the number of bytes actually written to stdout. libcurl treats a
 * return value different from size * nmemb as a write error and aborts the
 * transfer, so an output failure is propagated instead of being hidden.
 */
size_t write_callback(void *contents, size_t size, size_t nmemb, void *userp) {
    (void)userp;

    size_t total_size = size * nmemb;

    /*
     * fwrite is used instead of printf("%.*s", ...): the precision argument
     * of "%.*s" must be an int (passing size_t is undefined behaviour), and
     * "%s" would stop at the first NUL byte inside the body.
     */
    return fwrite(contents, 1, total_size, stdout);
}
/*
 * Fetches https://example.com with libcurl and streams the response body
 * through write_callback.
 *
 * Returns EXIT_SUCCESS when the transfer completes, EXIT_FAILURE when the
 * curl handle cannot be created or the transfer fails.
 */
int main(void) {
    CURL *curl = curl_easy_init();
    if (!curl) {
        fprintf(stderr, "curl_easy_init() failed\n");
        return EXIT_FAILURE;
    }

    // URL of the page to fetch
    curl_easy_setopt(curl, CURLOPT_URL, "https://example.com");
    // Hand every received chunk of the body to write_callback
    curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_callback);

    // Perform the request
    CURLcode res = curl_easy_perform(curl);

    // Report a failed transfer and reflect it in the exit status
    int status = EXIT_SUCCESS;
    if (res != CURLE_OK) {
        fprintf(stderr, "curl_easy_perform() failed: %s\n", curl_easy_strerror(res));
        status = EXIT_FAILURE;
    }

    // Release the curl handle
    curl_easy_cleanup(curl);
    return status;
}
与使用Python相比,用C语言编写爬虫通常更复杂一些,因为需要手动处理HTTP请求并解析HTML。此外,还需要安装并链接相应的库:上面的示例代码使用curl库发送HTTP请求,下面的示例还使用libxml2库解析HTML。
下面是一个用C语言编写的、爬取网页并解析HTML的简单示例:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <curl/curl.h>
#include <libxml/HTMLparser.h>
/*
 * libcurl write callback: copies one received chunk of the HTTP response
 * body to stdout.
 *
 * contents     - start of the received data; treated as raw bytes, so it
 *                does not need to be NUL-terminated
 * size, nmemb  - the chunk is size * nmemb bytes long
 * userp        - unused by this callback
 *
 * Returns the number of bytes actually written to stdout. libcurl treats a
 * return value different from size * nmemb as a write error and aborts the
 * transfer, so an output failure is propagated instead of being hidden.
 */
size_t write_callback(void *contents, size_t size, size_t nmemb, void *userp) {
    (void)userp;

    size_t total_size = size * nmemb;

    /*
     * fwrite is used instead of printf("%.*s", ...): the precision argument
     * of "%.*s" must be an int (passing size_t is undefined behaviour), and
     * "%s" would stop at the first NUL byte inside the body.
     */
    return fwrite(contents, 1, total_size, stdout);
}
/*
 * Demo in two independent steps:
 *   1. fetch https://example.com with libcurl, streaming the body through
 *      write_callback;
 *   2. parse a fixed in-memory HTML snippet with libxml2 and print the text
 *      of its <h1> element.
 *
 * Note that step 2 parses the hard-coded snippet, not the page fetched in
 * step 1, and runs even when step 1 fails.
 *
 * Returns EXIT_SUCCESS when both steps succeed, EXIT_FAILURE when the curl
 * handle cannot be created, the transfer fails, or the snippet cannot be
 * parsed.
 */
int main(void) {
    int status = EXIT_SUCCESS;

    CURL *curl = curl_easy_init();
    if (curl) {
        // URL of the page to fetch
        curl_easy_setopt(curl, CURLOPT_URL, "https://example.com");
        // Hand every received chunk of the body to write_callback
        curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_callback);

        // Perform the request and report a failed transfer
        CURLcode res = curl_easy_perform(curl);
        if (res != CURLE_OK) {
            fprintf(stderr, "curl_easy_perform() failed: %s\n", curl_easy_strerror(res));
            status = EXIT_FAILURE;
        }

        // Release the curl handle
        curl_easy_cleanup(curl);
    } else {
        fprintf(stderr, "curl_easy_init() failed\n");
        status = EXIT_FAILURE;
    }

    // Parse the HTML snippet
    const char *html = "<html><body><h1>Hello, World!</h1></body></html>";
    // htmlReadMemory takes the buffer size as an int
    htmlDocPtr doc = htmlReadMemory(html, (int)strlen(html), NULL, NULL, 0);
    if (!doc) {
        fprintf(stderr, "htmlReadMemory() failed\n");
        return EXIT_FAILURE;
    }

    // Walk html -> body -> h1, following only the first element child at each level
    xmlNodePtr root = xmlDocGetRootElement(doc);
    xmlNodePtr body = NULL;
    xmlNodePtr h1 = NULL;
    if (root && xmlStrcmp(root->name, (const xmlChar *)"html") == 0) {
        body = xmlFirstElementChild(root);
    }
    if (body && xmlStrcmp(body->name, (const xmlChar *)"body") == 0) {
        h1 = xmlFirstElementChild(body);
    }
    if (h1 && xmlStrcmp(h1->name, (const xmlChar *)"h1") == 0) {
        xmlChar *text = xmlNodeGetContent(h1);
        // Skip printing when no content is returned; "%s" with NULL is undefined
        if (text) {
            printf("Parsed HTML: %s\n", (const char *)text);
            xmlFree(text);
        }
    }

    // Free the parsed document
    xmlFreeDoc(doc);
    return status;
}