Node.js爬虫项目示例1
1.核心需求
- 选取3-5个新闻网站建立爬虫,针对不同网站的新闻页面进行分析,爬取出编码、标题、作者、时间、关键词、摘要、内容、来源等结构化信息,存储在数据库中。
- 建立网站提供对爬取内容的分项全文搜索,给出所查关键词的时间热度分析。
2.技术要求
- 采用Node.js实现网络爬虫
- 采用Node.js实现查询网站后端,HTML+JS实现前端(尽量不要使用任何前后端框架)
3.使用的模块库等工具
- Cheerio
解析文档中信息
- Request
爬取文档信息,并将其存储于本地变量中
(参考资料:https://www.npmjs.com/package/request ;注意 requests.readthedocs.io 是 Python 的 requests 库文档,与 Node.js 的 request 模块无关)
- mysql
创建数据库crawl,创建表fetches
(参考资料:https://www.runoob.com/mysql/mysql-tutorial.html)
- iconv-lite
实现编码转换
(参考资料:https://blog.csdn.net/xixi880928/article/details/51918400)
(以上四个模块都要先安装才能使用:在 cmd 中执行 npm install 模块名)
初始引入必须模块
var fs = require('fs');
var myRequest = require('request')
var myCheerio = require('cheerio')
var myIconv = require('iconv-lite')
require('date-utils');
以下是一个代码示例
var source_name = "网易新闻";
var domain = 'https://news.163.com/';
var myEncoding = "utf-8";
var seedURL = 'https://news.163.com/';
var seedURL_format = "$('a')";
var keywords_format = " $('meta[name=\"keywords\"]').eq(0).attr(\"content\")";
var title_format = "$('title').text()";
var date_format = "$('#pubtime_baidu').text()";
var author_format = "$('#editor_baidu').text()";
var content_format = "$('.left_zw').text()";
var desc_format = " $('meta[name=\"description\"]').eq(0).attr(\"content\")";
var source_format = "$('#source_baidu').text()";
var url_reg = /\/(\d{4})\/(\d{2})-(\d{2})\/(\d{7}).shtml/;
var regExp = /((\d{4}|\d{2})(\-|\/|\.)\d{1,2}\3\d{1,2})|(\d{4}年\d{1,2}月\d{1,2}日)/
var fs = require('fs');
var myRequest = require('request')
var myCheerio = require('cheerio')
var myIconv = require('iconv-lite')
require('date-utils');
//防止网站屏蔽我们的爬虫
var headers = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.65 Safari/537.36'
}
//request模块异步fetch url
function request(url, callback) {
var options = {
url: url,
encoding: null,
//proxy: 'http://x.x.x.x:8080',
headers: headers,
timeout: 10000 //
}
myRequest(options, callback)
}
request(seedURL, function(err, res, body) { //读取种子页面
// try {
//用iconv转换编码
var html = myIconv.decode(body, myEncoding);
//console.log(html);
//准备用cheerio解析html
var $ = myCheerio.load(html, { decodeEntities: true });
// } catch (e) { console.log('读种子页面并转码出错:' + e) };
var seedurl_news;
try {
seedurl_news = eval(seedURL_format);
//console.log(seedurl_news);
} catch (e) { console.log('url列表所处的html块识别出错:' + e) };
seedurl_news.each(function(i, e) { //遍历种子页面里所有的a链接
var myURL = "";
try {
//得到具体新闻url
var href = "";
href = $(e).attr("href");
if (href.toLowerCase().indexOf('http://') >= 0) myURL = href; //http://开头的
else if (href.startsWith('//')) myURL = 'http:' + href; 开头的
else myURL = seedURL.substr(0, seedURL.lastIndexOf('/') + 1) + href; //其他
} catch (e) { console.log('识别种子页面中的新闻链接出错:' + e) }
if (!url_reg.test(myURL)) return; //检验是否符合新闻url的正则表达式
//console.log(myURL);
newsGet(myURL); //读取新闻页面
});
});
function newsGet(myURL) { //读取新闻页面
request(myURL, function(err, res, body) { //读取新闻页面
//try {
var html_news = myIconv.decode(body, myEncoding); //用iconv转换编码
//console.log(html_news);
//准备用cheerio解析html_news
var $ = myCheerio.load(html_news, { decodeEntities: true });
myhtml = html_news;
//} catch (e) { console.log('读新闻页面并转码出错:' + e);};
console.log("转码读取成功:" + myURL);
//动态执行format字符串,构建json对象准备写入文件或数据库
var fetch = {};
fetch.title = "";
fetch.content = "";
fetch.publish_date = (new Date()).toFormat("YYYY-MM-DD");
//fetch.html = myhtml;
fetch.url = myURL;
fetch.source_name = source_name;
fetch.source_encoding = myEncoding; //编码
fetch.crawltime = new Date();
if (keywords_format == "") fetch.keywords = source_name; // eval(keywords_format); //没有关键词就用sourcename
else fetch.keywords = eval(keywords_format);
if (title_format == "") fetch.title = ""
else fetch.title = eval(title_format); //标题
if (date_format != "") fetch.publish_date = eval(date_format); //刊登日期
console.log('date: ' + fetch.publish_date);
fetch.publish_date = regExp.exec(fetch.publish_date)[0];
fetch.publish_date = fetch.publish_date.replace('年', '-')
fetch.publish_date = fetch.publish_date.replace('月', '-')
fetch.publish_date = fetch.publish_date.replace('日', '')
fetch.publish_date = new Date(fetch.publish_date).toFormat("YYYY-MM-DD");
if (author_format == "") fetch.author = source_name; //eval(author_format); //作者
else fetch.author = eval(author_format);
if (content_format == "") fetch.content = "";
else fetch.content = eval(content_format).replace("\r\n" + fetch.author, ""); //内容,是否要去掉作者信息自行决定
if (source_format == "") fetch.source = fetch.source_name;
else fetch.source = eval(source_format).replace("\r\n", ""); //来源
if (desc_format == "") fetch.desc = fetch.title;
else fetch.desc = eval(desc_format).replace("\r\n", ""); //摘要
var filename = source_name + "_" + (new Date()).toFormat("YYYY-MM-DD") +
"_" + myURL.substr(myURL.lastIndexOf('/') + 1) + ".json";
存储json
fs.writeFileSync(filename, JSON.stringify(fetch));
});
}
(由于本身的能力不足,最终我的代码也有很大一部分是基于老师的代码进行的尝试。)
运行结果:
4.mysql查询已爬取的数据
var mysql = require('./mysql.js');
var title = '创造';
var select_Sql = "select title,author,publish_date from fetches where title like '%" + title + "%'";//固定格式
mysql.query(select_Sql, function(qerr, vals, fields) {
console.log(vals);
});
当时我电脑上的mysql下载一直都有问题,用cmd方法安装的时候会出现很多error,密码也没有给我,问了老师,查了很多博客也没发现解决的方法……
5.用网页发送请求到后端查询
(参考资料:https://www.runoob.com/mysql/mysql-select-query.html)
用js进行查询:
前端的代码示例:
<!DOCTYPE html>
<html>
<body>
<form action="http://127.0.0.1:8080/7.02.html" method="GET">
<br> 标题:<input type="text" name="title">
<input type="submit" value="Submit">
</form>
<script>
</script>
</body>
</html>
此为后端的创建代码示例:
var http = require('http');
var fs = require('fs');
var url = require('url');
var mysql = require('./mysql.js');
http.createServer(function(request, response) {
var pathname = url.parse(request.url).pathname;
var params = url.parse(request.url, true).query;
fs.readFile(pathname.substr(1), function(err, data) {
response.writeHead(200, { 'Content-Type': 'text/html; charset=utf-8' });
if ((params.title === undefined) && (data !== undefined))
response.write(data.toString());
else {
response.write(JSON.stringify(params));
var select_Sql = "select title,author,publish_date from fetches where title like '%" +
params.title + "%'";
mysql.query(select_Sql, function(qerr, vals, fields) {
console.log(vals);
});
}
response.end();
});
}).listen(8080);
console.log('Server running at http://127.0.0.1:8080/');
接着我尝试着加入定时功能和查重功能
代码示例:
var rule = new schedule.RecurrenceRule();
var times = [0, 12];
var times2 = 5;
rule.hours = times;
rule.minutes = times2;
//定时执行httpGet()函数
schedule.scheduleJob(rule, function() {
seedget();
});
下一步是通过express框架构建网站对mysql进行访问,这一步使得对网站的访问变得简便。
(参考资料:https://www.jianshu.com/p/229162404823
http://www.360doc.com/content/18/0422/07/32517277_747703104.shtml )
这一步不知道为什么我的代码一直有错误,书写也比较混乱,希望参考资料能对你们有所帮助。
代码示例:(前端网页代码)
<!DOCTYPE html>
<html>
<body>
<form action="http://127.0.0.1:8080/process_get" method="GET">
<br> 标题:<input type="text" name="title">
<input type="submit" value="Submit">
</form>
<script>
</script>
</body>
</html>
(后端代码)
var express = require('express');
var mysql = require('./mysql.js')
var app = express();
//app.use(express.static('public'));
app.get('/7.03.html', function(req, res) {
res.sendFile(__dirname + "/" + "7.03.html");
})
app.get('/7.04.html', function(req, res) {
res.sendFile(__dirname + "/" + "7.04.html");
})
app.get('/process_get', function(req, res) {
res.writeHead(200, { 'Content-Type': 'text/html;charset=utf-8' }); //设置res编码为utf-8
//sql字符串和参数
var fetchSql = "select url,source_name,title,author,publish_date from fetches where title like '%" +
req.query.title + "%'";
mysql.query(fetchSql, function(err, result, fields) {
console.log(result);
res.end(JSON.stringify(result));
});
})
var server = app.listen(8080, function() {
console.log("访问地址为 http://127.0.0.1:8080/7.03.html")
})