界面新闻自动采集脚本PHP

最新推荐文章于 2024-08-22 09:44:18 发布

赫兹hzz

最新推荐文章于 2024-08-22 09:44:18 发布

阅读量300

点赞数 8

分类专栏：采集　文章标签：php、开发语言

本文链接：https://blog.csdn.net/qq_25989369/article/details/139307709

版权

采集专栏收录该内容

1 篇文章 0 订阅

订阅专栏

界面新闻自动采集脚本PHP

有朋友的系统需要对接一下新闻源内容，所以就有了这样一个简单的PHP脚本，分享给大家。

代码

<?php

// Category (channel) IDs used to build jiemian.com list-page URLs.
$jiemianVariables = [65, 9, 112, 62, 154, 105, 680, 472, 51, 31, 838, 851, 86, 699, 174, 32, 71];

// Pick one category ID at random.
$jiemianSection = $jiemianVariables[array_rand($jiemianVariables)];

// Build the list-page URL for the chosen category.
$jiemianurl = "https://www.jiemian.com/lists/{$jiemianSection}.html";

// Download the list page. fetchUrl() hands back curl_exec()'s result, which is
// false when the request fails, so stop here instead of parsing a non-string.
$jiemianhtml = fetchUrl($jiemianurl);
if (!is_string($jiemianhtml) || $jiemianhtml === '') {
    echo "Error: failed to fetch list page {$jiemianurl}\n";
    exit(1);
}

// Collect the article links found on the list page.
$articleUrls = parseLinks($jiemianhtml);
if (empty($articleUrls)) {
    // array_rand() must not be called on an empty array (ValueError on PHP 8).
    echo "Error: no article links found on {$jiemianurl}\n";
    exit(1);
}

// Pick one article link at random.
$randomArticleUrl = $articleUrls[array_rand($articleUrls)];

// Download the chosen article page.
$jiemianHtml = fetchUrl($randomArticleUrl);
if (!is_string($jiemianHtml) || $jiemianHtml === '') {
    echo "Error: failed to fetch article {$randomArticleUrl}\n";
    exit(1);
}

// Strip <script>/<style> elements before extraction.
$cleanDom = cleanHtml($jiemianHtml);

// Extract title and content from the CLEANED markup (previously the cleaned
// result was computed but the raw page was parsed instead). If cleaning
// produced nothing usable, fall back to the raw page.
$tempResult = jiemianContent(
    (is_string($cleanDom) && $cleanDom !== '') ? $cleanDom : $jiemianHtml
);

// Expose the extracted title and content as separate variables.
$jmtitle = $tempResult[0];
$jmcontent = $tempResult[1];

// Print the result.
echo "Title: $jmtitle\n";
echo "Content: $jmcontent\n";

/**
 * Fetch the response body of a URL with cURL, identifying as Baidu Spider.
 *
 * @param string $url     URL to request.
 * @param int    $timeout Maximum number of seconds the whole transfer may take.
 * @return string|false   The response body, or false if cURL could not be
 *                        initialised or the transfer failed.
 */
function fetchUrl($url, $timeout = 30) {
    $ch = curl_init();
    if ($ch === false) {
        return false;
    }

    curl_setopt_array($ch, [
        CURLOPT_URL            => $url,
        // NOTE(review): TLS peer verification is switched off, which allows
        // man-in-the-middle tampering. Kept as-is for compatibility with hosts
        // lacking a CA bundle; enable it once a CA bundle is available.
        CURLOPT_SSL_VERIFYPEER => false,
        // Return the body from curl_exec() instead of printing it.
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_USERAGENT      => 'Baiduspider+(+http://www.baidu.com/search/spider.htm)',
        // Without explicit limits a stalled server would hang the script
        // (cURL's default transfer timeout is "never").
        CURLOPT_CONNECTTIMEOUT => 10,
        CURLOPT_TIMEOUT        => (int) $timeout,
    ]);

    $output = curl_exec($ch);
    curl_close($ch);

    return $output;
}
/**
 * Extract article links from an HTML document.
 *
 * Every <a> element under <body> is inspected; an href is kept when it
 * contains the substring 'article' (the heuristic used to recognise article
 * pages). Duplicate hrefs are collapsed so each article appears once.
 *
 * @param string|false $html HTML markup; false/empty input yields no links.
 * @return array             List of distinct href values, in document order.
 */
function parseLinks($html) {
    // DOMDocument::loadHTML() rejects an empty source (ValueError on PHP 8,
    // which "@" cannot suppress), and a failed download arrives here as false.
    if (!is_string($html) || trim($html) === '') {
        return [];
    }

    // Collect libxml parse warnings internally instead of silencing with "@",
    // then restore whatever error mode the caller had.
    $previous = libxml_use_internal_errors(true);
    $dom = new DOMDocument;
    $dom->loadHTML($html);
    libxml_clear_errors();
    libxml_use_internal_errors($previous);

    $xpath = new DOMXPath($dom);
    $links = [];
    foreach ($xpath->query('/html/body//a') as $anchor) {
        $url = $anchor->getAttribute('href');
        if (strpos($url, 'article') !== false) { // article URLs are assumed to contain 'article'
            $links[] = $url;
        }
    }

    // The same article is often linked more than once (e.g. image + headline);
    // drop repeats and reindex so the result is a plain 0..n-1 list.
    return array_values(array_unique($links));
}

/**
 * Remove all <script> and <style> elements from an HTML document.
 *
 * @param string|false $html HTML markup; false/empty input yields ''.
 * @return string|false      The re-serialised document without script/style
 *                           elements (whatever DOMDocument::saveHTML() returns).
 */
function cleanHtml($html) {
    // DOMDocument::loadHTML() rejects an empty source (ValueError on PHP 8,
    // which "@" cannot suppress), and a failed download arrives here as false.
    if (!is_string($html) || trim($html) === '') {
        return '';
    }

    // Collect libxml parse warnings internally instead of silencing with "@",
    // then restore whatever error mode the caller had.
    $previous = libxml_use_internal_errors(true);
    $dom = new DOMDocument;
    $dom->loadHTML($html);
    libxml_clear_errors();
    libxml_use_internal_errors($previous);

    foreach (['script', 'style'] as $tagName) {
        // getElementsByTagName() returns a live list that shrinks as nodes are
        // removed, so take a snapshot before detaching anything.
        $nodes = iterator_to_array($dom->getElementsByTagName($tagName));
        foreach ($nodes as $node) {
            if ($node->parentNode !== null) {
                $node->parentNode->removeChild($node);
            }
        }
    }

    return $dom->saveHTML();
}

/**
 * Extract the page title and the article body from an HTML document.
 *
 * The title is the string value of the first <title> element. The content is
 * the serialised markup (HTML tags included) of the first <div> whose class
 * attribute contains 'article-content'; it is '' when no such <div> exists.
 *
 * @param string|false $html HTML markup; false/empty input yields ['', ''].
 * @return array             Two-element list: [title, contentHtml].
 */
function jiemianContent($html) {
    // DOMDocument::loadHTML() rejects an empty source (ValueError on PHP 8,
    // which "@" cannot suppress), and a failed download arrives here as false.
    if (!is_string($html) || trim($html) === '') {
        return ['', ''];
    }

    // Collect libxml parse warnings internally instead of silencing with "@",
    // then restore whatever error mode the caller had.
    $previous = libxml_use_internal_errors(true);
    $dom = new DOMDocument;
    $dom->loadHTML($html);
    libxml_clear_errors();
    libxml_use_internal_errors($previous);

    $xpath = new DOMXPath($dom);

    // Title: string value of the first <title> ('' when there is none).
    $title = $xpath->evaluate('string(//title)');

    // Content: first <div> whose class contains 'article-content'.
    $content = '';
    $matches = $xpath->query("//div[contains(@class, 'article-content')]");
    if ($matches->length > 0) {
        // Serialise the node itself so the inner HTML markup is preserved.
        $content = $dom->saveHTML($matches->item(0));
    }

    return [$title, $content];
}
?>