第一次爬虫大作业

第一次爬虫大作业

新闻爬虫及爬取结果的查询网站
核心需求:
1 、选取 3-5 个代表性的新闻网站(比如新浪新闻、网易新闻等,或者某个垂直领域权威性的网站比如经济领域的雪球财经、东方财富等,或者体育领域的腾讯体育、虎扑体育等等)建立爬虫,针对不同网站的新闻页面进行分析,爬取出编码、标题、作者、时间、关键词、摘要、内容、来源等结构化信息,存储在数据库中。
2 、建立网站提供对爬取内容的分项全文搜索,给出所查关键词的时间热度分析。
技术要求:
1 、必须采用 Node.JS 实现网络爬虫
•2、必须采用Node.JS实现查询网站后端,HTML+JS实现前端(尽量不要使用任何前后端框架)
参考资料:菜鸟教程,node.js中文网,CSDN博客,PPT以及B站

一.基础爬虫代码(老师的)

// Crawler configuration for chinanews.com (teacher's sample crawler).
var source_name = "中国新闻网";
var domain = 'http://www.chinanews.com/';
var myEncoding = "utf-8";
var seedURL = 'http://www.chinanews.com/';

// Per-site extraction rules: each *_format string is a cheerio expression
// that is eval()'d against the loaded page's `$` in newsGet().
var seedURL_format = "$('a')";
var keywords_format = " $('meta[name=\"keywords\"]').eq(0).attr(\"content\")";
var title_format = "$('title').text()";
var date_format = "$('#pubtime_baidu').text()";
var author_format = "$('#editor_baidu').text()";
var content_format = "$('.left_zw').text()";
var desc_format = " $('meta[name=\"description\"]').eq(0).attr(\"content\")";
var source_format = "$('#source_baidu').text()";
// Article URL pattern: /YYYY/MM-DD/NNNNNNN.shtml
// FIX: the dot before "shtml" was unescaped and matched any character.
var url_reg = /\/(\d{4})\/(\d{2})-(\d{2})\/(\d{7})\.shtml/;
// Publish-date pattern: "2020-03-04" / "2020/03/04" / "2020.03.04"
// (\3 backreference keeps the separator consistent) or "2020年3月4日".
var regExp = /((\d{4}|\d{2})(\-|\/|\.)\d{1,2}\3\d{1,2})|(\d{4}年\d{1,2}月\d{1,2}日)/

// Module dependencies: file I/O, HTTP client, HTML parsing, charset
// decoding; date-utils monkey-patches Date with toFormat().
var fs = require('fs'); 
var myRequest = require('request')
var myCheerio = require('cheerio')
var myIconv = require('iconv-lite')
require('date-utils');

// Browser-like User-Agent so the site does not block the crawler.
var headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.65 Safari/537.36'
}

//request模块异步fetch url
function request(url, callback) {
    var options = {
        url: url,
        encoding: null,
        //proxy: 'http://x.x.x.x:8080',
        headers: headers,
        timeout: 10000 
    }
    myRequest(options, callback)
}

// Crawl the seed page, resolve every candidate link to an absolute URL and
// fetch the ones that look like news-article pages.
request(seedURL, function(err, res, body) {
    if (err || !body) {
        // FIX: the original decoded `body` unconditionally and crashed on
        // network errors (body undefined).
        console.log('url列表所处的html块识别出错:' + err);
        return;
    }

    var html = myIconv.decode(body, myEncoding);
    var $ = myCheerio.load(html, { decodeEntities: true });
    var seedurl_news;

    try {
        // NOTE(review): eval() of a configuration string; tolerable only
        // because seedURL_format is a trusted constant, never user input.
        seedurl_news = eval(seedURL_format);
    } catch (e) {
        console.log('url列表所处的html块识别出错:' + e);
        return;
    }

    seedurl_news.each(function(i, e) {
        var myURL = "";
        try {
            var href = $(e).attr("href");
            if (!href) return; // anchors without an href attribute
            var lower = href.toLowerCase();
            // FIX: also accept https:// links (original only checked for
            // 'http://' and mangled https URLs in the relative branch),
            // and anchor the match at position 0.
            if (lower.indexOf('http://') === 0 || lower.indexOf('https://') === 0)
                myURL = href;                  // already absolute
            else if (href.startsWith('//'))
                myURL = 'http:' + href;        // protocol-relative
            else
                myURL = seedURL.substr(0, seedURL.lastIndexOf('/') + 1) + href; // relative

        } catch (e) { console.log('识别种子页面中的新闻链接出错:' + e) }

        if (!url_reg.test(myURL)) return; // keep only article-shaped URLs
        newsGet(myURL);
    });
});

// Fetch one article page, extract the structured fields configured by the
// *_format expressions, and persist the record as a JSON file.
function newsGet(myURL) {
    request(myURL, function(err, res, body) {
        if (err || !body) {
            // FIX: guard against failed requests instead of crashing in decode.
            console.log('新闻页面请求失败:' + myURL);
            return;
        }
        var html_news = myIconv.decode(body, myEncoding);
        var $ = myCheerio.load(html_news, { decodeEntities: true });
        myhtml = html_news; // NOTE(review): implicit global, kept as-is — confirm nothing else reads it

        console.log("转码读取成功:" + myURL);

        var fetch = {};
        fetch.title = "";
        fetch.content = "";
        fetch.publish_date = (new Date()).toFormat("YYYY-MM-DD"); // fallback: crawl date

        fetch.url = myURL;
        fetch.source_name = source_name;
        fetch.source_encoding = myEncoding;
        fetch.crawltime = new Date();

        // Each *_format is an eval()'d cheerio expression; an empty format
        // string means "not configured" and selects a fallback value.
        if (keywords_format == "") fetch.keywords = source_name; // no keywords -> use source name
        else fetch.keywords = eval(keywords_format);

        if (title_format == "") fetch.title = ""
        else fetch.title = eval(title_format);

        if (date_format != "") fetch.publish_date = eval(date_format);
        console.log('date: ' + fetch.publish_date);
        // FIX: regExp.exec() returns null when the page has no recognizable
        // date; the original dereferenced [0] unconditionally and threw.
        var dateMatch = regExp.exec(fetch.publish_date);
        if (dateMatch) {
            fetch.publish_date = dateMatch[0];
            fetch.publish_date = fetch.publish_date.replace('年', '-')
            fetch.publish_date = fetch.publish_date.replace('月', '-')
            fetch.publish_date = fetch.publish_date.replace('日', '')
            fetch.publish_date = new Date(fetch.publish_date).toFormat("YYYY-MM-DD");
        } else {
            fetch.publish_date = (new Date()).toFormat("YYYY-MM-DD"); // keep crawl-date fallback
        }

        if (author_format == "") fetch.author = source_name;
        else fetch.author = eval(author_format);

        if (content_format == "") fetch.content = "";
        else fetch.content = eval(content_format).replace("\r\n" + fetch.author, "");

        if (source_format == "") fetch.source = fetch.source_name;
        else fetch.source = eval(source_format).replace("\r\n", "");

        if (desc_format == "") fetch.desc = fetch.title;
        else fetch.desc = eval(desc_format).replace("\r\n", "");

        // One JSON file per article: <source>_<crawl date>_<article file>.json
        var filename = source_name + "_" + (new Date()).toFormat("YYYY-MM-DD") +
            "_" + myURL.substr(myURL.lastIndexOf('/') + 1) + ".json";

        fs.writeFileSync(filename, JSON.stringify(fetch));
    });
}

二.爬虫代码

         不得不说这个爬虫代码是如此的折磨人。我刚开始爬新浪竞技风暴,因为老师的代码有点长且比较复杂,而我刚好在B站上看了一个视频,里面的代码很简单,就用了下面这个代码。这个代码爬第一页的时候很成功,但是第一页里面只有七十条数据,不满足Web编程作业要求,后来我又改回了老师的代码。

         下面是b站代码:

var https = require("https");// HTTPS client module (the target site is https)
var cheerio = require("cheerio");
var fs = require("fs");

var uri = "https://sports.sina.com.cn/nba/1.shtml"; // seed list page

// GET `uri` over HTTPS, buffer the whole response body as a string and pass
// it to `cb`. Errors are logged and `cb` is simply never invoked.
function httpsGet(uri, cb) {

    var html = "";

    https.get(uri, function(res) {

        res.on("data", function(chunk) {
            html += chunk; // accumulate response chunks
        });

        res.on("end", function() {
            cb(html); // full body available only here
        });

    }).on("error", function(e) {
        console.log(e.message);
    });
    // FIX: removed `return html;` — the request is asynchronous, so that
    // statement always returned "" and misled callers into thinking the
    // function was synchronous; the result is delivered via `cb` only.
}

// Crawl the seed list page, then fetch each linked article and save its
// body text to a numbered .txt file.
httpsGet(uri, function(html) {

    var $ = cheerio.load(html);

    $("#S_Cont_11 a").each(function(index) {

        var newsUri = $(this).attr("href");
        if (!newsUri) return; // skip anchors without an href
        // FIX: sina pages use protocol-relative links ("//sports.sina...");
        // https.get() cannot fetch those, so prepend the scheme. This is the
        // likely cause of the "second page fails" problem described above.
        if (newsUri.indexOf("//") === 0) newsUri = "https:" + newsUri;

        httpsGet(newsUri, function(body) {

            var jq = cheerio.load(body);

            fs.writeFile('./.vscode/内容文档/' + index + '.txt', jq("#artibody").text(), function(err) {
                if (err) {
                    // FIX: was `err.messsage` (typo) — always logged undefined.
                    return console.log(err.message);
                }
                console.log("完成");
            });
        });
    });
});

          这个只爬了新闻的内容部分,还没来得及改成标题,时间,作者,内容的形式就发现它不行了(捂脸哭)。

          当我爬第二页时:

是http和https的缘故吗?但我改完是这样:

我又和b站视频上对照了一下,那个老师是爬了全部6页的内容,用的选择器是"#right a",结果我用"#S_Cont_11 a"只能爬一页。

困惑,why?

下面是在老师给出代码基础上改的

var myRequest = require('request')
var myCheerio = require('cheerio')
var myURL = 'https://sports.sina.com.cn/nba/1.shtml'

// Thin wrapper over the request module: fetch `url` as a raw Buffer with no
// extra headers and hand the response to `callback`.
function request(url, callback) {
    myRequest({ url: url, encoding: null, headers: null }, callback);
}

// Fetch the seed page and dump the parsed document back out to stdout.
request(myURL, function (err, res, body) {
    var $ = myCheerio.load(body, { decodeEntities: false });
    console.log($.html());
})

得出:

完整代码:

// Dependencies and configuration for the 新浪竞技风暴 (Sina NBA) crawler.
var myRequest = require('request');
var myIconv = require('iconv-lite');
var myCheerio = require('cheerio');
var mysql = require('./mysql.js'); // local helper wrapping the mysql driver
var source_name = "新浪竞技风暴";
var domain = 'https://sports.sina.com.cn/nba/1.shtml';
// FIX: myEncoding was declared twice with the same value; keep a single one.
var myEncoding = "utf-8";
var seedURL = 'https://sports.sina.com.cn/nba/1.shtml';

// Fetch `url` via the request module using the shared browser headers and a
// 10-second timeout; the body stays a raw Buffer for iconv-lite to decode.
function request(url, callback) {
    myRequest(
        {
            url: url,
            encoding: null,
            headers: headers,
            timeout: 10000
        },
        callback
    );
};

// Pretend to be a regular desktop browser so the site does not reject us.
var headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.65 Safari/537.36'
}

// Kick off the crawl from the seed page.
seedget();

// Crawl the seed list page and dispatch every article link to newsget().
function seedget() {
    request('https://sports.sina.com.cn/nba/1.shtml', function(err, res, body) {
        if (err || !body) {
            // FIX: guard failed requests instead of crashing in decode.
            console.log('url列表所处的html块识别出错:' + err);
            return;
        }

        var html = myIconv.decode(body, myEncoding);

        var $ = myCheerio.load(html, { decodeEntities: true });
        var seedurl_news;
        try {
            // FIX: was eval($("#right")), which (a) eval'd a jQuery object for
            // no reason and (b) selected the container <div> itself, which has
            // no href — so no article link was ever found. Select the anchors
            // inside it instead.
            seedurl_news = $("#right a");
        } catch (e) { console.log('url列表所处的html块识别出错:' + e) };

        seedurl_news.each(function(i, e) {
            var myURL = "";
            try {
                var href = $(e).attr("href");
                if (!href) return; // anchor without href: nothing to crawl
                console.log(href);

                // FIX: the original prefixed every href with the full seed
                // *file* URL ('https://.../nba/1.shtml' + href), producing
                // invalid links; resolve relative/protocol-relative properly.
                if (href.toLowerCase().indexOf('http') === 0)
                    myURL = href;                                // already absolute
                else if (href.indexOf('//') === 0)
                    myURL = 'https:' + href;                     // protocol-relative
                else
                    myURL = 'https://sports.sina.com.cn' + href; // site-relative
            } catch (e) { console.log('识别种子页面中的新闻链接出错:' + e) }

            newsget(myURL)
        });
    });
}
// Fetch one article page, extract title + description, and insert the
// record into the MySQL `fetches` table (parameterized query).
function newsget(myURL) {
    request(myURL, function(err, res, body) {
        if (err || !body) {
            // FIX: guard failed requests instead of crashing in decode.
            console.log('新闻页面请求失败:' + myURL);
            return;
        }
        var html_news = myIconv.decode(body, myEncoding);
        var $ = myCheerio.load(html_news, { decodeEntities: true });
        // FIX: dropped `myhtml = html_news;` — it created an implicit global
        // that nothing in this script reads.

        console.log("转码读取成功:" + myURL);

        var fetch = {};
        fetch.url = myURL;
        fetch.title = $("title").text();
        // FIX: attr() returns undefined when the meta tag is missing; fall
        // back to "" so the DB insert never receives undefined.
        fetch.content = $('meta[name="description"]').attr("content") || "";

        var fetchadd = 'INSERT INTO fetches(url,title,content )VALUES(?,?,?)';
        var fetchadd_params = [fetch.url, fetch.title, fetch.content];
        // FIX: the empty callback silently swallowed insert errors; log them.
        mysql.query(fetchadd, fetchadd_params, function(qerr, vals, fields) {
            if (qerr) console.log('插入数据库出错:' + qerr);
        });
    });
}

还有mysql.js等,npm install ***等

三.运行并存入数据库中及.txt文件

建表是在命令提示符里

txt格式是这样的:

四.前端,后端——用网页发送请求到后端查询

前端示例:

<!DOCTYPE html>
<html>
 
<body>
    <!-- Search form: submits ?title=keyword as a GET request to the Node
         backend on port 8080, which looks the keyword up in the `fetches`
         table. -->
    <form action="http://127.0.0.1:8080/7.02.html" method="GET">
        <br> 标题:<input type="text" name="title">
        <input type="submit" value="Submit">
    </form>
    <script>
    </script>
</body>
 
</html>

后端示例:

var http = require('http');
var fs = require('fs');
var url = require('url');
var mysql = require('./mysql.js');

// Minimal query server: with no ?title= parameter it serves the requested
// file; with one it searches the `fetches` table and returns the matches.
http.createServer(function(request, response) {
    var pathname = url.parse(request.url).pathname;
    var params = url.parse(request.url, true).query;

    if (params.title === undefined) {
        fs.readFile(pathname.substr(1), function(err, data) {
            if (err || data === undefined) {
                // FIX: a read error previously fell through to the query
                // branch or crashed on data.toString(); answer 404 instead.
                response.writeHead(404, { 'Content-Type': 'text/html; charset=utf-8' });
                response.end('404 Not Found');
                return;
            }
            response.writeHead(200, { 'Content-Type': 'text/html; charset=utf-8' });
            response.end(data.toString());
        });
        return;
    }

    // FIX: the title used to be concatenated straight into the SQL string
    // (SQL injection); bind it as a parameter instead, same 3-arg
    // mysql.query(sql, params, cb) form the crawler already uses.
    var select_Sql = "select title,author,publish_date from fetches where title like ?";
    mysql.query(select_Sql, ['%' + params.title + '%'], function(qerr, vals, fields) {
        console.log(vals);
        response.writeHead(200, { 'Content-Type': 'text/html; charset=utf-8' });
        response.write(JSON.stringify(params));
        // FIX: the original called response.end() before the query callback
        // ran, so the results only ever reached the server console, never
        // the browser; send them in the response here.
        response.end(JSON.stringify(vals));
    });
}).listen(8080);
console.log('Server running at http://127.0.0.1:8080/');

五.总结

我遇到的问题:

1.刚开始用b站代码爬新浪竞技风暴时只能爬第一页(应该是http和https的问题吧?),70条数据,后来改成其他新闻网站也不大成功,其实我觉得新浪竞技风暴(哈登·休斯顿火箭_NBA|NBA直播|新浪竞技风暴_新浪网 (sina.com.cn))挺好用的,里面基本没什么图文

2.后来用老师的,主要是mysql这里问题比较多,npm install,建表等让我很心累

3.就是用网页发送请求到后端查询,html这些,虽然老师给了示例,我还是搞了很长时间。

  • 1
    点赞
  • 10
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值