C语言编写的简单爬虫代码

Jiang_Immortals

于 2024-03-02 00:15:00 发布

阅读量1.1k

点赞数 13

分类专栏：C语言　文章标签：c语言、爬虫、服务器

本文链接：https://blog.csdn.net/jiang_changsheng/article/details/136382975

版权

C语言专栏收录该内容

24 篇文章 0 订阅

订阅专栏

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <curl/curl.h>

// 回调函数，用于处理HTTP响应
/**
 * libcurl write callback: forwards one chunk of the HTTP response body
 * to stdout.
 *
 * @param contents pointer to the received bytes (not NUL-terminated)
 * @param size     size of one data item, as passed by libcurl
 * @param nmemb    number of items in this chunk
 * @param userp    user pointer supplied by libcurl; unused here
 * @return number of bytes actually written to stdout. libcurl treats a
 *         value different from size * nmemb as a write error and aborts
 *         the transfer.
 */
size_t write_callback(void *contents, size_t size, size_t nmemb, void *userp) {
    (void)userp;

    size_t total_size = size * nmemb;

    /* fwrite takes a size_t length and copies every byte. The previous
     * printf("%.*s", ...) needed an int precision (passing size_t is
     * undefined behaviour) and stopped at the first NUL byte. */
    return fwrite(contents, 1, total_size, stdout);
}

int main() {
    CURL *curl;
    CURLcode res;

    curl = curl_easy_init();
    if (curl) {
        // 设置要爬取的网页URL
        curl_easy_setopt(curl, CURLOPT_URL, "https://example.com");

        // 设置回调函数
        curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_callback);

        // 执行请求
        res = curl_easy_perform(curl);

        // 检查是否请求成功
        if (res != CURLE_OK) {
            fprintf(stderr, "curl_easy_perform() failed: %s\n", curl_easy_strerror(res));
        }

        // 清理curl句柄
        curl_easy_cleanup(curl);
    }

    return 0;
}

用C语言编写爬虫相对于使用Python会更复杂一些，因为需要手动处理HTTP请求和解析HTML。此外，还需要安装并链接libcurl库（上面的示例代码使用libcurl来发送HTTP请求），编译时需要加上 -lcurl 链接选项。

下面是用C语言爬取网页并解析HTML的简单示例（使用libxml2进行解析；为简化起见，示例解析的是一段硬编码的HTML字符串，而不是上一步抓取到的响应内容）：

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <curl/curl.h>
#include <libxml/HTMLparser.h>

// 回调函数，用于处理HTTP响应
/**
 * libcurl write callback: forwards one chunk of the HTTP response body
 * to stdout.
 *
 * @param contents pointer to the received bytes (not NUL-terminated)
 * @param size     size of one data item, as passed by libcurl
 * @param nmemb    number of items in this chunk
 * @param userp    user pointer supplied by libcurl; unused here
 * @return number of bytes actually written to stdout. libcurl treats a
 *         value different from size * nmemb as a write error and aborts
 *         the transfer.
 */
size_t write_callback(void *contents, size_t size, size_t nmemb, void *userp) {
    (void)userp;

    size_t total_size = size * nmemb;

    /* fwrite takes a size_t length and copies every byte. The previous
     * printf("%.*s", ...) needed an int precision (passing size_t is
     * undefined behaviour) and stopped at the first NUL byte. */
    return fwrite(contents, 1, total_size, stdout);
}

int main() {
    CURL *curl;
    CURLcode res;

    curl = curl_easy_init();
    if (curl) {
        // 设置要爬取的网页URL
        curl_easy_setopt(curl, CURLOPT_URL, "https://example.com");

        // 设置回调函数
        curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_callback);

        // 执行请求
        res = curl_easy_perform(curl);

        // 检查是否请求成功
        if (res != CURLE_OK) {
            fprintf(stderr, "curl_easy_perform() failed: %s\n", curl_easy_strerror(res));
        }

        // 清理curl句柄
        curl_easy_cleanup(curl);
    }

    // 解析HTML
    const char *html = "<html><body><h1>Hello, World!</h1></body></html>";
    htmlDocPtr doc = htmlReadMemory(html, strlen(html), NULL, NULL, 0);
    if (doc) {
        xmlNodePtr root = xmlDocGetRootElement(doc);
        if (root && xmlStrcmp(root->name, (const xmlChar *)"html") == 0) {
            xmlNodePtr body = xmlFirstElementChild(root);
            if (body && xmlStrcmp(body->name, (const xmlChar *)"body") == 0) {
                xmlNodePtr h1 = xmlFirstElementChild(body);
                if (h1 && xmlStrcmp(h1->name, (const xmlChar *)"h1") == 0) {
                    xmlChar *text = xmlNodeGetContent(h1);
                    printf("Parsed HTML: %s\n", text);
                    xmlFree(text);
                }
            }
        }

        // 释放内存
        xmlFreeDoc(doc);
    }

    return 0;
}