第一次爬虫大作业
一.基础爬虫代码(老师的)
/* Crawler configuration for chinanews.com (teacher's sample code). */
var source_name = "中国新闻网";
var domain = 'http://www.chinanews.com/';
var myEncoding = "utf-8";
var seedURL = 'http://www.chinanews.com/';

// Cheerio expressions, evaluated later via eval(), that extract each field
// from an article page.
var seedURL_format = "$('a')";
var keywords_format = " $('meta[name=\"keywords\"]').eq(0).attr(\"content\")";
var title_format = "$('title').text()";
var date_format = "$('#pubtime_baidu').text()";
var author_format = "$('#editor_baidu').text()";
var content_format = "$('.left_zw').text()";
var desc_format = " $('meta[name=\"description\"]').eq(0).attr(\"content\")";
var source_format = "$('#source_baidu').text()";

// Article URLs look like /2021/04-25/1234567.shtml.
// FIX: escape the dot before "shtml" — unescaped it matched any character
// (e.g. "1234567Xshtml"), letting non-article URLs through.
var url_reg = /\/(\d{4})\/(\d{2})-(\d{2})\/(\d{7})\.shtml/;
// Matches dates such as 2021-04-25, 21/4/25 or 2021年4月25日.
var regExp = /((\d{4}|\d{2})(\-|\/|\.)\d{1,2}\3\d{1,2})|(\d{4}年\d{1,2}月\d{1,2}日)/

var fs = require('fs');
var myRequest = require('request')
var myCheerio = require('cheerio')
var myIconv = require('iconv-lite')
require('date-utils');

// Browser-like User-Agent so the site does not block the crawler.
var headers = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.65 Safari/537.36'
}
//request模块异步fetch url
// Fetch `url` asynchronously through the request module and hand the
// response to `callback(err, res, body)`. `encoding: null` keeps the body
// as a raw Buffer so iconv-lite can decode it later.
function request(url, callback) {
myRequest({
url: url,
encoding: null,
//proxy: 'http://x.x.x.x:8080',
headers: headers,
timeout: 10000
}, callback);
}
// Crawl the seed page, collect every candidate link, and fetch the ones
// whose URL matches the article pattern.
request(seedURL, function(err, res, body) {
// FIX: bail out on a network error instead of decoding an undefined body.
if (err || !body) {
console.log('种子页面请求失败:' + err);
return;
}
var html = myIconv.decode(body, myEncoding);
var $ = myCheerio.load(html, { decodeEntities: true });
var seedurl_news;
try {
seedurl_news = eval(seedURL_format);
//console.log(seedurl_news);
} catch (e) { console.log('url列表所处的html块识别出错:' + e) };
seedurl_news.each(function(i, e) {
var myURL = "";
try {
var href = $(e).attr("href");
// FIX: anchors without an href used to throw on href.toLowerCase().
if (!href) return;
if (href.toLowerCase().indexOf('http://') >= 0) myURL = href;
else if (href.startsWith('//')) myURL = 'http:' + href;
else myURL = seedURL.substr(0, seedURL.lastIndexOf('/') + 1) + href;
} catch (e) { console.log('识别种子页面中的新闻链接出错:' + e) }
if (!url_reg.test(myURL)) return; // keep only article-shaped URLs
newsGet(myURL);
});
});
// Download one article page, extract the configured fields via eval()'d
// cheerio expressions, and save the record to
// <source>_<today>_<article-id>.json.
function newsGet(myURL) {
request(myURL, function(err, res, body) {
// FIX: a failed request used to crash myIconv.decode on undefined.
if (err || !body) {
console.log('新闻页面请求失败:' + myURL);
return;
}
var html_news = myIconv.decode(body, myEncoding);
var $ = myCheerio.load(html_news, { decodeEntities: true });
var myhtml = html_news; // FIX: was an implicit global
console.log("转码读取成功:" + myURL);

var fetch = {};
fetch.title = "";
fetch.content = "";
fetch.publish_date = (new Date()).toFormat("YYYY-MM-DD");
fetch.url = myURL;
fetch.source_name = source_name;
fetch.source_encoding = myEncoding;
fetch.crawltime = new Date();

if (keywords_format == "") fetch.keywords = source_name; // no keywords: fall back to source name
else fetch.keywords = eval(keywords_format);
if (title_format == "") fetch.title = ""
else fetch.title = eval(title_format);
if (date_format != "") fetch.publish_date = eval(date_format);
console.log('date: ' + fetch.publish_date);

// FIX: regExp.exec() returns null when no date is found; the old code
// dereferenced [0] unconditionally and crashed. Fall back to today.
var dateMatch = regExp.exec(fetch.publish_date);
if (dateMatch) {
fetch.publish_date = dateMatch[0];
fetch.publish_date = fetch.publish_date.replace('年', '-')
fetch.publish_date = fetch.publish_date.replace('月', '-')
fetch.publish_date = fetch.publish_date.replace('日', '')
fetch.publish_date = new Date(fetch.publish_date).toFormat("YYYY-MM-DD");
} else {
fetch.publish_date = (new Date()).toFormat("YYYY-MM-DD");
}

if (author_format == "") fetch.author = source_name;
else fetch.author = eval(author_format);
if (content_format == "") fetch.content = "";
else fetch.content = eval(content_format).replace("\r\n" + fetch.author, "");
if (source_format == "") fetch.source = fetch.source_name;
else fetch.source = eval(source_format).replace("\r\n", "");
if (desc_format == "") fetch.desc = fetch.title;
else fetch.desc = eval(desc_format).replace("\r\n", "");

var filename = source_name + "_" + (new Date()).toFormat("YYYY-MM-DD") +
"_" + myURL.substr(myURL.lastIndexOf('/') + 1) + ".json";
fs.writeFileSync(filename, JSON.stringify(fetch));
});
}
二.爬虫代码
不得不说这个爬虫代码非常折磨人。我最初爬的是新浪竞技风暴:老师的代码有点长且比较复杂,而我刚好在B站上看了一个视频,里面的代码很简单,于是就用了下面这段代码。这段代码爬第一页时很成功,但是第一页里面只有七十条数据,不满足Web编程作业的要求,后来我又改回了老师的代码。
下面是b站代码:
var https = require("https"); // https module (the target site is https-only)
var cheerio = require("cheerio");
var fs = require("fs");
var uri = "https://sports.sina.com.cn/nba/1.shtml";

// Minimal https fetcher: buffers the whole response body as a string and
// passes it to `cb` when the response ends.
function httpsGet(uri, cb) {
var html = "";
https.get(uri, function(res) {
res.on("data", function(chunk) {
html += chunk; // accumulate body chunks
});
res.on("end", function() {
cb(html);
});
}).on("error", function(e) {
console.log(e.message);
});
// FIX: the old `return html` always returned "" because the request is
// asynchronous; the result is only ever delivered through `cb`.
}
// Crawl the seed list page, then fetch every linked article and dump its
// body text to a numbered .txt file.
httpsGet(uri, function(html) {
var $ = cheerio.load(html);
$("#S_Cont_11 a").each(function(index) {
var newsUri = $(this).attr("href");
// FIX: skip anchors without an href, and normalise protocol-relative
// ("//...") and http:// links so https.get() can fetch them — this is
// the http/https mismatch the original ran into.
if (!newsUri) return;
if (newsUri.indexOf('//') === 0) newsUri = 'https:' + newsUri;
else if (newsUri.indexOf('http://') === 0) newsUri = 'https://' + newsUri.substr(7);
httpsGet(newsUri, function(body) {
var jq = cheerio.load(body);
fs.writeFile('./.vscode/内容文档/' + index + '.txt', jq("#artibody").text(), function(err) {
if (err) {
// FIX: err.messsage was a typo and always logged undefined
return console.log(err.message);
}
console.log("完成");
});
});
});
});
这个只爬了新闻的内容部分,还没来得及改成标题,时间,作者,内容的形式就发现它不行了(捂脸哭)。
当我爬第二页时:
是http和https的缘故吗?但我改完是这样:
我又和B站视频对照了一下:那位老师用 "#right a" 爬了全部6页的内容,而我用 "#S_Cont_11 a" 只能爬到一页。
令人困惑,这是为什么呢?
下面是在老师给出代码基础上改的
先
// Quick smoke test: fetch the seed page once and print the raw HTML so the
// page structure can be inspected.
var myRequest = require('request')
var myCheerio = require('cheerio')
var myURL = 'https://sports.sina.com.cn/nba/1.shtml'

// Thin wrapper around the request module (raw Buffer body, no headers).
function request(url, callback) {
var options = { url: url, encoding: null, headers: null };
myRequest(options, callback);
}

request(myURL, function (err, res, body) {
var page = myCheerio.load(body, { decodeEntities: false });
console.log(page.html());
})
得出:
完整代码:
// Crawler configuration for 新浪竞技风暴 (Sina sports).
var myRequest = require('request');
var myIconv = require('iconv-lite');
var myCheerio = require('cheerio');
var mysql = require('./mysql.js'); // project-local MySQL helper

var source_name = "新浪竞技风暴";
var domain = 'https://sports.sina.com.cn/nba/1.shtml';
var seedURL = 'https://sports.sina.com.cn/nba/1.shtml';
// FIX: myEncoding was declared twice with the same value; keep one.
var myEncoding = "utf-8";
// Browser-like User-Agent so the site does not block the crawler.
var headers = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.65 Safari/537.36'
}

// Fetch `url` through the request module and pass the raw Buffer body to
// `callback(err, res, body)`; character decoding is left to iconv-lite.
function request(url, callback) {
myRequest({
url: url,
encoding: null,
//proxy: 'http://x.x.x.x:8080',
headers: headers,
timeout: 10000 //
}, callback);
};
seedget();

// Fetch the seed list page, extract every news link under #right, and crawl
// each article with newsget().
function seedget() {
request(seedURL, function(err, res, body) {
// FIX: guard network failures before decoding an undefined body.
if (err || !body) {
console.log('种子页面请求失败:' + err);
return;
}
var html = myIconv.decode(body, myEncoding);
var $ = myCheerio.load(html, { decodeEntities: true });
var seedurl_news;
try {
// FIX: select the <a> elements inside #right — the old code selected
// the #right container itself (so attr("href") was undefined) and
// wrapped the cheerio call in a pointless eval().
seedurl_news = $("#right a");
} catch (e) { console.log('url列表所处的html块识别出错:' + e) };
seedurl_news.each(function(i, e) {
var myURL = "";
try {
var href = $(e).attr("href");
if (!href) return; // anchor without a link
console.log(href);
// FIX: resolve the href properly — appending it to the full seed
// page URL ('.../1.shtml' + href) produced invalid addresses.
if (href.startsWith('http')) myURL = href;
else if (href.startsWith('//')) myURL = 'https:' + href;
else myURL = seedURL.substr(0, seedURL.lastIndexOf('/') + 1) + href;
} catch (e) { console.log('识别种子页面中的新闻链接出错:' + e) }
if (myURL === "") return;
newsget(myURL)
});
});
}
// Download one article page, extract its title and meta description, and
// insert the record into the MySQL `fetches` table (parameterised query).
function newsget(myURL) {
request(myURL, function(err, res, body) {
// FIX: guard network failures before decoding an undefined body.
if (err || !body) {
console.log('新闻页面请求失败:' + myURL);
return;
}
var html_news = myIconv.decode(body, myEncoding);
var $ = myCheerio.load(html_news, { decodeEntities: true });
console.log("转码读取成功:" + myURL);

var fetch = {};
fetch.url = myURL;
fetch.title = $("title").text();
// the meta description doubles as the article "content" here
fetch.content = $('meta[name="description"]').attr("content");

var fetchadd = 'INSERT INTO fetches(url,title,content )VALUES(?,?,?)';
var fetchadd_params = [fetch.url, fetch.title, fetch.content];
mysql.query(fetchadd, fetchadd_params, function(qerr, vals, fields) {
if (qerr) console.log('入库出错:' + qerr); // FIX: errors were silently dropped
});
});
}
还有mysql.js等,npm install ***等
三.运行并存入数据库中及.txt文件
建表是在命令提示符里
txt格式是这样的:
四.前端,后端——用网页发送请求到后端查询
前端示例:
<!DOCTYPE html>
<html>
<body>
<!-- Simple search form: submits the title keyword as a GET parameter to the
     Node backend listening on 127.0.0.1:8080. -->
<form action="http://127.0.0.1:8080/7.02.html" method="GET">
<br> 标题:<input type="text" name="title">
<input type="submit" value="Submit">
</form>
<script>
</script>
</body>
</html>
后端示例:
// Minimal query backend: serves static files, and when a ?title= parameter
// is present, looks up matching rows in the `fetches` table (results are
// logged to the console, matching the original behaviour).
var http = require('http');
var fs = require('fs');
var url = require('url');
var mysql = require('./mysql.js');

http.createServer(function(request, response) {
var pathname = url.parse(request.url).pathname;
var params = url.parse(request.url, true).query;
fs.readFile(pathname.substr(1), function(err, data) {
response.writeHead(200, { 'Content-Type': 'text/html; charset=utf-8' });
if ((params.title === undefined) && (data !== undefined))
response.write(data.toString()); // no query: serve the requested file
else {
response.write(JSON.stringify(params));
// FIX: the title used to be concatenated straight into the SQL string
// (SQL injection); use a ? placeholder like the crawler's INSERT does.
var select_Sql = "select title,author,publish_date from fetches where title like ?";
mysql.query(select_Sql, ['%' + params.title + '%'], function(qerr, vals, fields) {
console.log(vals);
});
}
response.end();
});
}).listen(8080);
console.log('Server running at http://127.0.0.1:8080/');
五.总结
我遇到的问题:
1.刚开始用B站代码爬新浪竞技风暴时只能爬第一页(应该是http和https的问题吧?),共70条数据;后来改成其他新闻网站也不太成功。其实我觉得新浪竞技风暴(哈登·休斯顿火箭_NBA|NBA直播|新浪竞技风暴_新浪网 (sina.com.cn))挺好用的,里面基本没什么图文。
2.后来用老师的,主要是mysql这里问题比较多,npm install,建表等让我很心累。
3.就是用网页发送请求到后端查询,html这些,虽然老师给了示例,我还是搞了很长时间。