爬虫项目②（其他新闻网站的爬取）

最新推荐文章于 2024-04-15 18:07:06 发布

秃头呆脑

最新推荐文章于 2024-04-15 18:07:06 发布

阅读量854

点赞数 3

分类专栏：爬虫项目

本文链接：https://blog.csdn.net/PLH799407332/article/details/116066049

版权

爬虫项目专栏收录该内容

5 篇文章 2 订阅

订阅专栏

Web编程期中作业②

（其他新闻网站的爬取）

以新浪网为例子进行详细说明

根据不同新闻网站的特点可以对不同的新闻网站设计相应的代码

以新浪网为例子进行详细说明

获取新闻网站的名称、编码以及网址

// Site-level settings for the Sina News crawler.
var source_name = '新浪网';                  // human-readable site name stored with every record
var myEncoding = 'utf-8';                    // charset used to decode the downloaded bytes
var seedURL = "https://news.sina.com.cn/";   // front page whose links seed the crawl

进入任意新闻页面，检查对应的源代码

网页编码的获取：一个简单的方法是打开 Chrome 浏览器的开发者工具，直接在 Console 面板中输入

// Evaluate in the browser console on the article page; the returned value is the page's encoding.
document.charset

返回值就是网页的编码，这样就不需要在源码中找关于charset的信息。

定义新闻页面里具体的元素的读取方式

// jQuery-style expressions, stored as strings and eval'd by the crawler with the
// cheerio handle `$` of the current page in scope. Template literals are used so the
// inner quotes need no escaping; the leading space of some values is kept on purpose
// so the strings stay identical to the original configuration.
var seedURL_format = `$('a')`;                                                                   // every link on the seed page
var keywords_format = ` $('meta[name="keywords"]').eq(0).attr("content")`;                       // <meta name="keywords">
var title_format = ` $('meta[property="og:title"]').eq(0).attr("content")`;                      // Open Graph title
var date_format = ` $('meta[property="article:published_time"]').eq(0).attr("content")`;         // publication time
var author_format = ` $('meta[property="article:author"]').eq(0).attr("content")`;               // author
var content_format = `$('.article').text()`;                                                     // article body text
var desc_format = ` $('meta[property="og:description"]').eq(0).attr("content")`;                 // summary
var source_format = ` $('meta[property="og:url"]').eq(0).attr("content")`;                       // value stored as "source"

下图重点叙述了修改对应网站要爬取内容的过程

（示例文章链接为https://news.sina.com.cn/c/xl/2021-04-26/doc-ikmxzfmk9082123.shtml）

修改正则表达式（个人认为是本项目的重难点，学习链接如下：https://www.runoob.com/js/js-regexp.html）

根据新浪网下的文件url链接的特点，如

https://news.sina.com.cn/c/2021-04-23/doc-ikmxzfmk8552578.shtml

https://news.sina.com.cn/c/2021-04-23/doc-ikmyaawc1407626.shtml

可以判断该网站的url链接基本格式为 https://news.sina.com.cn/ 加上栏目路径（如 c，也可能是 c/xl 这样的多级路径）、“年-月-日”形式的日期标识，以及 doc- 后接 15 位字符（8 位小写字母加 7 位数字），末尾格式为 .shtml

// Matches Sina article URLs: host, one or more path segments (e.g. "c/" or "c/xl/"),
// a YYYY-MM-DD directory and a "doc-<id>.shtml" file name.
// Capture groups 1-3 are year, month and day.
// The dots are escaped so they only match a literal ".", and multi-segment section
// paths are accepted so URLs such as /c/xl/2021-04-26/doc-....shtml are not skipped.
var url_reg = /news\.sina\.com\.cn\/(?:[\w-]+\/)+(\d{4})-(\d{2})-(\d{2})\/doc-[a-z0-9]{10,}\.shtml/;
// Extracts the first YYYY-MM-DD date from a longer timestamp string.
var regExp = /(\d{4})-(\d{2})-(\d{2})/;

其余内容（防止网站屏蔽我们的爬虫、request模块异步fetch url、读取种子页面、读取新闻页面等内容）借鉴老师的代码

// Send a desktop-browser User-Agent so the target site does not block the crawler.
var headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.65 Safari/537.36'
};

/**
 * Asynchronously fetch a URL through the `myRequest` module.
 *
 * @param {string} url Address to download.
 * @param {Function} callback Passed straight to myRequest; the callers in this file
 *     use the signature (err, res, body).
 */
function request(url, callback) {
    myRequest({
        url: url,
        // null encoding: callers decode the body themselves with iconv
        encoding: null,
        headers: headers,
        // request timeout (presumably milliseconds, per the request module)
        timeout: 10000
    }, callback);
}

// Fetch the seed page, walk every <a> element on it and hand each link that matches
// url_reg to newsGet().
request(seedURL, function(err, res, body) {
    // A failed request (network error, timeout) leaves body undefined; decoding it would throw.
    if (err || !body) {
        console.log('读种子页面并转码出错：' + err);
        return;
    }

    // Decode the raw bytes with the configured encoding, then parse the HTML with cheerio.
    var html = myIconv.decode(body, myEncoding);
    var $ = myCheerio.load(html, { decodeEntities: true });

    var seedurl_news;
    try {
        // NOTE(review): eval of a configured selector expression; only safe while
        // seedURL_format is a trusted constant of this script.
        seedurl_news = eval(seedURL_format);
    } catch (ex) {
        console.log('url列表所处的html块识别出错：' + ex);
    }
    // If the selector expression failed there is nothing to iterate over.
    if (!seedurl_news) return;

    seedurl_news.each(function(i, e) { // visit every link found on the seed page
        var myURL = "";
        try {
            // Build the absolute URL of the candidate news page.
            var href = $(e).attr("href");
            if (typeof href === "undefined") { // some anchors carry no href at all
                return true;
            }
            var lowered = href.toLowerCase();
            if (lowered.indexOf('http://') >= 0 || lowered.indexOf('https://') >= 0) myURL = href; // already absolute
            else if (href.startsWith('//')) myURL = 'http:' + href; // protocol-relative link starting with "//"
            else myURL = seedURL.substr(0, seedURL.lastIndexOf('/') + 1) + href; // anything else: relative to the seed URL
        } catch (ex) { console.log('识别种子页面中的新闻链接出错：' + ex); }

        if (!url_reg.test(myURL)) return; // skip links that do not look like news articles
        newsGet(myURL); // download and parse the article page
    });
});

/**
 * Fetch one news article page, extract its fields and save them as a JSON file
 * in the current working directory.
 *
 * Each *_format variable holds a jQuery-style expression that is eval'd with the
 * cheerio handle `$` of the downloaded page in scope. An empty (or non-string)
 * format, or an expression that yields undefined/null because the element is
 * missing on this page, falls back to a default value instead of throwing.
 *
 * @param {string} myURL Absolute URL of the article page.
 */
function newsGet(myURL) {
    request(myURL, function(err, res, body) {
        // A failed request leaves body undefined; decoding it would throw.
        if (err || !body) {
            console.log('读新闻页面并转码出错：' + err);
            return;
        }

        // Decode the raw bytes with the configured encoding and parse with cheerio.
        var html_news = myIconv.decode(body, myEncoding);
        var $ = myCheerio.load(html_news, { decodeEntities: true });

        console.log("转码读取成功:" + myURL);

        // Evaluate one format expression against the page.
        // NOTE(review): eval is only acceptable because the format strings are
        // trusted constants of this script; never feed it external input.
        function extract(format, fallback, transform) {
            if (typeof format !== "string" || format === "") return fallback;
            var value = eval(format);
            if (value === undefined || value === null) return fallback;
            return transform ? transform(value) : value;
        }

        // Drops the first CRLF, as the original post-processing did.
        function stripFirstCRLF(text) {
            return String(text).replace("\r\n", "");
        }

        var today = (new Date()).toFormat("YYYY-MM-DD");

        // Build the record that will be written to disk.
        var fetch = {};
        fetch.title = "";
        fetch.content = "";
        fetch.publish_date = today;
        fetch.url = myURL;
        fetch.source_name = source_name;
        fetch.source_encoding = myEncoding; // encoding used for decoding
        fetch.crawltime = new Date();

        fetch.keywords = extract(keywords_format, source_name); // no keywords: use the site name
        fetch.title = extract(title_format, "");

        // Publication date: keep today's date unless the page yields a parseable one.
        var rawDate = extract(date_format, today);
        console.log('date: ' + rawDate);
        console.log(myURL);
        var dateMatch = regExp.exec(String(rawDate));
        if (dateMatch) {
            // The 年/月/日 replacements only matter when regExp is configured for Chinese dates.
            var normalized = dateMatch[0].replace('年', '-').replace('月', '-').replace('日', '');
            fetch.publish_date = new Date(normalized).toFormat("YYYY-MM-DD");
        }

        fetch.author = extract(author_format, source_name);

        // Remove a "\r\n<author>" line from the body text.
        fetch.content = extract(content_format, "", function(text) {
            return String(text).replace("\r\n" + fetch.author, "");
        });

        fetch.source = extract(source_format, fetch.source_name, stripFirstCRLF);
        fetch.desc = extract(desc_format, fetch.title, stripFirstCRLF);

        var filename = source_name + "_" + today +
            "_" + myURL.substr(myURL.lastIndexOf('/') + 1) + ".json";
        // Store the record as JSON.
        fs.writeFileSync(filename, JSON.stringify(fetch));
    });
}

爬取的结果如下：

这是具体的爬取内容示例：

根据不同新闻网站的特点可以对不同的新闻网站设计相应的代码

网易新闻网

// NetEase News (news.163.com) crawl configuration.
var source_name = "网易新闻";
var myEncoding = "GBK";
var seedURL = 'https://news.163.com/';

// jQuery-style expressions, stored as strings and eval'd by the crawler with the
// cheerio handle `$` of the current page in scope.
var seedURL_format = "$('a')";
var keywords_format = " $('meta[name=\"keywords\"]').eq(0).attr(\"content\")";
var title_format = " $('meta[property=\"og:title\"]').eq(0).attr(\"content\")";
// Try .post_time_source first and fall back to #ptime when it yields an empty string.
// Both alternatives must live inside ONE string: joining two string literals with the
// bitwise `|` operator evaluates to the number 0, which silently disables date extraction.
var date_format = "$('.post_time_source').text() || $('#ptime').text()";
var author_format = "$('meta[name=\"author\"]').eq(0).attr(\"content\")";
var content_format = "$('#endText').text()";
var desc_format = " $('meta[name=\"description\"]').eq(0).attr(\"content\")";
var source_format = "$('#ne_article_source').text()";
// Article URLs look like /<2 digits>/<4 digits>/<2 digits>/<10-30 word chars>.html;
// the dot is escaped so it only matches a literal ".".
var url_reg = /\/(\d{2})\/(\d{4})\/(\d{2})\/(\w{10,30})\.html/;

var fs = require('fs');
var myRequest = require('request');
var myCheerio = require('cheerio');
var myIconv = require('iconv-lite');
require('date-utils');

其他代码内容参考老师的代码，这里不再赘述

爬取的结果：

东方财富网

// Eastmoney (www.eastmoney.com) crawl configuration.
var source_name="东方财富网";
var myEncoding = "utf-8";
var seedURL = 'https://www.eastmoney.com/';

// jQuery-style expressions, stored as strings and eval'd by the crawler with the
// cheerio handle `$` of the current page in scope.
var seedURL_format ="$('a')";
var keywords_format = " $('meta[name=\"keywords\"]').eq(0).attr(\"content\")";
var title_format = "$('title').text()";
var date_format = "$('.time').text()";
var author_format = "$('.author').text()";
var content_format = "$('#ContentBody').text()";
var desc_format = " $('meta[name=\"description\"]').eq(0).attr(\"content\")";
// Select the first element that actually carries a data-source attribute. The previous
// selector 'source data-source' looked for a <data-source> tag nested in <source>, so
// .attr("data-source") always returned undefined.
// NOTE(review): presumably the attribute sits on the article's source <div> — confirm
// against a live page.
var source_format = "$('[data-source]').eq(0).attr(\"data-source\")";
// Article URLs end in an 18-digit id followed by ".html"; the dot is escaped so it
// only matches a literal ".".
var url_reg = /\/(\d{18})\.html/;

其他代码内容参考老师的代码

爬取的结果：

中证网

// Site-level settings for the China Securities Journal (cs.com.cn) crawler.
var source_name = '中证网';               // human-readable site name stored with every record
var seedURL = "http://www.cs.com.cn/";    // front page whose links seed the crawl
var myEncoding = 'gbk';                   // charset used to decode the downloaded bytes

var mysql = require('./mysql.js');

// jQuery-style expressions for cs.com.cn, stored as strings and eval'd by the crawler
// with the cheerio handle `$` of the current page in scope.
var seedURL_format="$('a')";
var keywords_format = " $('meta[name=\"Keywords\"]').eq(0).attr(\"content\")";
var title_format = "$('title').text()";
var desc_format = " $('meta[name=\"Description\"]').eq(0).attr(\"content\")";
// Date, author and source are all read from the <p>/<em> children of the .info block.
var date_format = "$('.info').find('p').eq(0).next().find('em').eq(0).text()";
var author_format = "$('.info').find('p').eq(0).text()";
var source_format = "$('.info').find('p').eq(0).next().find('em').eq(0).next().text()";
var content_format = "$('.article','div').text()";
// Article URLs look like /<6 digits>/t<8 digits>_<7 digits>.html; the dot is escaped
// so it only matches a literal ".".
var url_reg = /\/(\d{6})\/t(\d{8})_(\d{7})\.html/;

注意事项

在上述代码的执行过程中可能对部分爬取的页面会出现未定义的数据，如下图所示

可以采取以下的方法进行修正（红色部分为新增加的代码）

秃头呆脑

关注

3
点赞
踩
11

收藏

觉得还不错? 一键收藏
0
评论
爬虫项目②（其他新闻网站的爬取）

Web编程第一个实验项目实验名称：新闻爬虫及爬取结果的查询网站核心需求：（1）、选取3-5个代表性的新闻网站（比如新浪新闻、网易新闻等，或者某个垂直领域权威性的网站比如经济领域的雪球财经、东方财富等，或者体育领域的腾讯体育、虎扑体育等等）建立爬虫，针对不同网站的新闻页面进行分析，爬取出编码、标题、作者、时间、关键词、摘...
复制链接

扫一扫