Web大作业.爬虫

前言

大一暑假web编程考试内容为大作业形式,主要包括爬虫+网页,本文主要介绍爬虫部分。


一、使用语言以及各种包的简介

  • 主要使用Node.js          ------------一个基于V8引擎的JavaScript运行时环境
  • request                        ------------主要用于发送http请求获取网页源代码
  • cheerio                        ------------是JQuery的一个子集,用于抓取各种网页元素
  • iconv-lite                      ------------字符串编码转换工具,可用于将爬取到的网页解码
  • node-schedule            -------------定时执行代码的工具
  • mysql                           -------------数据库连接
  • date-utils                      -------------日期工具

二、实现步骤

1.创建数据库连接文件

文件名为mysql.js

// Database access layer: one shared MySQL connection pool plus two
// query helpers (with and without bound parameters).
var mysql = require('mysql');

// Pool of reusable connections to the local `news` database.
var db = mysql.createPool({
    host: '127.0.0.1',
    user: 'root',
    password: '123456',
    database: 'news',
});

// Run `sql` with placeholder values `sqlparam`; callback(err, rows, fields).
var query = function(sql, sqlparam, callback) {
    db.getConnection(function(err, conn) {
        if (err) {
            // Could not obtain a connection from the pool.
            callback(err, null, null);
            return;
        }
        conn.query(sql, sqlparam, function(qerr, vals, fields) {
            conn.release(); // hand the connection back to the pool
            callback(qerr, vals, fields); // event-driven callback
        });
    });
};

// Run `sql` with no bound parameters; callback(err, rows, fields).
var query_noparam = function(sql, callback) {
    db.getConnection(function(err, conn) {
        if (err) {
            // Could not obtain a connection from the pool.
            callback(err, null, null);
            return;
        }
        conn.query(sql, function(qerr, vals, fields) {
            conn.release(); // hand the connection back to the pool
            callback(qerr, vals, fields); // event-driven callback
        });
    });
};
exports.query = query;
exports.query_noparam = query_noparam;

2.导入各种包

const db = require('./mysql');
const schedule = require('node-schedule');
const myRequest = require('request');
const myCheerio = require('cheerio');
const myIconv = require('iconv-lite');
require('date-utils');

 3.获取网页源代码

// Default request headers: spoof a desktop Chrome UA so the site
// serves the normal desktop page.
var headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.65 Safari/537.36'
}

// Fetch `url` and invoke callback(err, response, rawBody).
// encoding:null keeps the body a raw Buffer so iconv can decode it later.
function request(url, callback) {
    var reqOptions = {
        url: url,
        encoding: null,  // return a raw Buffer, not a decoded string
        headers: headers,
        timeout: 50000   // 50 s request timeout
    };
    myRequest(reqOptions, callback);
}

//执行以下函数便可以从data中获取网页源代码
request(seedURL, (err, res, data)=>{}

4.利用iconv-lite解码

data是request函数中获得的data,myEncoding是utf-8

var html = myIconv.decode(data, myEncoding);

5.利用cheerio将要爬取的内容取出

先设定好每个内容要取出时执行的字符串

// Cheerio selector expressions stored as strings; each one is executed
// later with eval() after `$` has been loaded with the page HTML.
var seedURL_format = "$('a')"; // all anchors on the seed page
var keywords_format = " $('meta[name=\"keywords\"]').eq(0).attr(\"content\")"; // meta keywords
var title_format = "$('title').text()"; // page title
var date_format = "$('#pubtime_baidu').text()"; // publish-time element
var author_format = "$('#editor_baidu').text()"; // editor/author element
var content_format = "$('.left_zw').text()"; // article body container
var desc_format = "$('meta[name=\"description\"]').eq(0).attr(\"content\")"; // meta description
var source_format = "$('#source_baidu').text()"; // article source element

 在需要时先用以下代码加载html

var $ = myCheerio.load(html);

然后直接

var title = eval(title_format);

就可以获得相应的内容(eval函数将执行括号内的字符串内容)

6.保存到数据库

获得所有内容后,利用数据库插入指令将每条新闻插入。 

// Parameterized INSERT: the ? placeholders let the mysql driver escape
// values, preventing SQL injection from page content.
var fetchAddSql = 'INSERT INTO fetches(url,source_name,source_encoding,title,' +
            'keywords,author,publish_date,crawltime,content,description,crawl_web) VALUES(?,?,?,?,?,?,?,?,?,?,?)';
// Values in the same order as the column list; toFormat comes from date-utils.
var fetchAddSql_Params = [fetch.url, fetch.source_name, fetch.source_encoding,
            fetch.title, fetch.keywords, fetch.author, fetch.publish_date,
            fetch.crawltime.toFormat("YYYY-MM-DD HH24:MI:SS"), fetch.content, fetch.desc, source_name];
db.query(fetchAddSql, fetchAddSql_Params, function(qerr, vals, fields) {
            if (qerr) {
                console.log(qerr); // log insert failures but keep crawling
            }
            }); // write to MySQL
        

 7.设置定时执行

将在每天0点5分与12点5分执行一次

//! Scheduled execution: run the crawler twice a day, at 00:05 and 12:05.
var times = [0, 12]; // hours of day to run
var times2 = 5;      // minute within each hour
var rule = new schedule.RecurrenceRule();
rule.hour = times;
rule.minute = times2;

// seedget() is the crawl entry point.
schedule.scheduleJob(rule, () => seedget());

8.源代码

mysql文件

// mysql.js — database access module: a shared connection pool plus two
// query helpers (with and without bound parameters).
var mysql = require('mysql');
// Connection pool for the local `news` database.
var db = mysql.createPool({
    host: '127.0.0.1',
    user: 'root',
    password: '123456',
    database: 'news',
});

// Run a parameterized query; callback(err, rows, fields).
var query = function(sql, sqlparam, callback) {
    db.getConnection(function(err, conn) {
        if (err) {
            callback(err, null, null);
        } else {
            conn.query(sql, sqlparam, function(qerr, vals, fields) {
                // db.releaseConnection(conn);
                conn.release(); // return the connection to the pool
                callback(qerr, vals, fields); // event-driven callback
            });
        }
    });
};
// Run a query with no bound parameters; callback(err, rows, fields).
var query_noparam = function(sql, callback) {
    db.getConnection(function(err, conn) {
        if (err) {
            callback(err, null, null);
        } else {
            conn.query(sql, function(qerr, vals, fields) {
                conn.release(); // return the connection to the pool
                callback(qerr, vals, fields); // event-driven callback
            });
        }
    });
};
exports.query = query;
exports.query_noparam = query_noparam;




const db = require('./mysql');
const schedule = require('node-schedule');
const myRequest = require('request');
const myCheerio = require('cheerio');
const myIconv = require('iconv-lite');
require('date-utils');

// Crawl configuration for chinanews.com.
var source_name = "中国新闻网"; // site display name stored with each row
var myEncoding = "utf-8"; // charset of the target pages
var seedURL = 'http://www.chinanews.com/';

// Cheerio selector expressions kept as strings; evaluated later with eval()
// once `$` holds the loaded page.
var seedURL_format = "$('a')";
var keywords_format = " $('meta[name=\"keywords\"]').eq(0).attr(\"content\")";
var title_format = "$('title').text()";
var date_format = "$('#pubtime_baidu').text()";
var author_format = "$('#editor_baidu').text()";
var content_format = "$('.left_zw').text()";
var desc_format = "$('meta[name=\"description\"]').eq(0).attr(\"content\")";
var source_format = "$('#source_baidu').text()";
// Article URLs look like /2020/07-12/1234567.shtml.
// FIX: escape the dot — the original pattern `.shtml` matched ANY character
// before "shtml", letting non-article URLs slip through.
var url_reg = /\/(\d{4})\/(\d{2})-(\d{2})\/(\d{7})\.shtml/;
// Matches "YYYY-MM-DD" / "YY.M.D" (same separator enforced via \3)
// or the Chinese form "YYYY年M月D日".
var regExp = /((\d{4}|\d{2})(\-|\/|\.)\d{1,2}\3\d{1,2})|(\d{4}年\d{1,2}月\d{1,2}日)/;

// Spoofed desktop-browser User-Agent so the site serves the normal page.
var headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.65 Safari/537.36'
}

// Fetch `url`; callback(err, response, rawBody).
// encoding:null keeps the body a raw Buffer for iconv decoding later.
function request(url, callback) {
    var options = {
        url: url,
        encoding: null, // raw Buffer, not a decoded string
        headers: headers,
        timeout: 50000 // 50 s request timeout
    }
    myRequest(options, callback)
}

//! Scheduled execution: crawl twice a day at 00:05 and 12:05.
var rule = new schedule.RecurrenceRule();
var times = [0, 12]; // hours of day to run (twice daily)
var times2 = 5; // minute within each hour
rule.hour = times;
rule.minute = times2;

// Invoke the crawl entry point seedget() on the schedule above.
schedule.scheduleJob(rule, function() {
    seedget();
});

// Crawl the seed page: collect every anchor, normalize each href to an
// absolute URL, keep only article-shaped URLs, and dispatch newsGet()
// for URLs not already stored in the `fetches` table.
function seedget(){
    request(seedURL, (err, res, data)=>{
        // FIX: the original never checked `err`; on a failed request `data`
        // is undefined and myIconv.decode() would throw.
        if (err || !data) {
            console.log('种子页面请求出错:' + err);
            return;
        }
        let j = 0;
        var html = myIconv.decode(data, myEncoding); // decode raw bytes with the configured charset
        var $ = myCheerio.load(html);

        var seedurl_news;
        try {
            // NOTE: eval of a hard-coded selector string — not user input.
            seedurl_news = eval(seedURL_format);
        } catch (e) { console.log('url列表所处的html块识别出错:' + e) };
        // FIX: guard — if the selector failed, .each() on undefined would throw.
        if (!seedurl_news) return;

        seedurl_news.each(function(i, e) {
            var myURL = "";
            try {
                var href = $(e).attr("href");
                if (href)
                {
                    if (href.toLowerCase().indexOf('http://') >= 0 || href.toLowerCase().indexOf('https://') >= 0) myURL = href; // already absolute
                    else if (href.startsWith('//')) myURL = 'http:' + href; // protocol-relative
                    else myURL = seedURL.substring(0, seedURL.lastIndexOf('/') + 1) + href; // relative to seed
                }
            } catch (e) { console.log('识别种子页面中的新闻链接出错:' + e) }
            if (myURL.indexOf('iframe') > 0) return; // skip embedded frames
            if (myURL.indexOf('8426100') > 0) return; // skip known bad article id
            if (!url_reg.test(myURL)) return; // keep only article-shaped URLs

            var fetch_url_Sql = 'select url from fetches where url=?';
            var fetch_url_Sql_Params = [myURL];

            db.query(fetch_url_Sql, fetch_url_Sql_Params, function(qerr, vals, fields) {
                console.log("here!!!" + j++)
                // FIX: the original read vals.length without checking qerr;
                // on a query error `vals` is undefined and this crashed.
                if (qerr) {
                    console.log(qerr);
                    return;
                }
                if (vals.length > 0) {
                    console.log('URL duplicate!')
                } else newsGet(myURL); // fetch and store the new article
            });
        });
    })
}


// Fetch one news article page, extract fields with the *_format selector
// strings (executed via eval against `$`), and insert one row into the
// `fetches` table when a publish date could be determined.
function newsGet(myURL) { // read one news page
    request(myURL, function(err, res, body) { // download the article page
        // NOTE(review): `err` is never checked here; on a failed request
        // `body` may be undefined and decode() throws outside the try below.
        var html_news = myIconv.decode(body, myEncoding); // convert bytes to text with iconv
        var $ = myCheerio.load(html_news);
        // NOTE(review): implicit global (no var/let) — kept as-is.
        myhtml = html_news;
        var fetch = {};
        fetch.title = "";
        fetch.content = "";
        // Default publish date: today. toFormat() is added to Date by date-utils.
        fetch.publish_date = (new Date()).toFormat("YYYY-MM-DD");
        //fetch.html = myhtml;
        fetch.url = myURL;
        fetch.source_name = source_name;
        fetch.source_encoding = myEncoding; // charset used to decode the page
        fetch.crawltime = new Date();

        try 
        {
        // An empty *_format string means "no selector": fall back to a default.
        if (keywords_format == "") fetch.keywords = source_name; // eval(keywords_format);  // no keywords selector: use source name
        else fetch.keywords = eval(keywords_format);

        if (title_format == "") fetch.title = ""
        else fetch.title = eval(title_format); // page title

        if (date_format != "") fetch.publish_date = eval(date_format); // publish date

        if (fetch.publish_date)
        {
            // Pull the first date-looking substring, then normalize the
            // Chinese year/month/day markers into a dash-separated date.
            fetch.publish_date = regExp.exec(fetch.publish_date)[0];
            fetch.publish_date = fetch.publish_date.replace('年', '-');
            fetch.publish_date = fetch.publish_date.replace('月', '-');
            fetch.publish_date = fetch.publish_date.replace('日', '');
            fetch.publish_date = new Date(fetch.publish_date).toFormat("YYYY-MM-DD");
        }
        else 
        {
            fetch.publish_date = ''
        }

        if (author_format == "") fetch.author = source_name; //eval(author_format);  // no author selector: use source name
        else fetch.author = eval(author_format);

        if (content_format == "") fetch.content = "";
        else fetch.content = eval(content_format).replace("\r\n" + fetch.author, "").replace("\n", "").trim(); // body text; stripping the author line is optional

        if (source_format == "") fetch.source = fetch.source_name;
        else fetch.source = eval(source_format).replace("\r\n", ""); // article source

        if (desc_format == "") fetch.desc = fetch.title;
        else fetch.desc = eval(desc_format); // summary/description
        if (fetch.desc) fetch.desc = fetch.desc.replace("\r\n", "");
        else fetch.desc = '';

        // Parameterized INSERT; column order matches the params array below.
        var fetchAddSql = 'INSERT INTO fetches(url,source_name,source_encoding,title,' +
            'keywords,author,publish_date,crawltime,content,description,crawl_web) VALUES(?,?,?,?,?,?,?,?,?,?,?)';
        var fetchAddSql_Params = [fetch.url, fetch.source_name, fetch.source_encoding,
            fetch.title, fetch.keywords, fetch.author, fetch.publish_date,
            fetch.crawltime.toFormat("YYYY-MM-DD HH24:MI:SS"), fetch.content, fetch.desc, source_name
        ];
        // Only store articles whose publish date could be determined.
        if (fetch.publish_date !== '')
        {
            db.query(fetchAddSql, fetchAddSql_Params, function(qerr, vals, fields) {
            if (qerr) {
                console.log(qerr); // log insert failure
            }
            }); // write to MySQL
        }
    }catch(e) {
        console.log('读取或存储某项内容出错' + e)
    }
    });
}


总结

如果想要爬取其他网站,需要将各个format字符串修改一下。不同网站形式不同,需要好好研究才能做好爬虫。

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值