1.新闻网站爬虫的实现
以新浪新闻为例,新闻页面的 HTML 按照下图所示的格式编码:
'use strict';
var fs = require('fs');
var myRequest = require('request');
var myCheerio = require('cheerio');
var myIconv = require('iconv-lite');
require('date-utils');
var mysql = require('./mysql.js');
// ---- Crawler configuration for the Sina News site ----

// Human-readable name of the news source.
var source_name = "新浪新闻";
// Site root of the crawled domain.
var domain = 'https://news.sina.com.cn/';
// Character encoding used to decode the fetched seed page body.
var myEncoding = "utf-8";
// Seed page whose links are scanned for article URLs.
var seedURL = 'https://news.sina.com.cn/';

// The *_format values are cheerio selector expressions stored as source text.
// seedURL_format is run through eval() with `$` bound to the loaded seed page.
// NOTE(review): the remaining formats are presumably eval'd the same way
// against each article page — confirm in the article-fetching code.
// Because these strings are eval'd, they must remain trusted, hard-coded
// constants and must never be built from external input.
var seedURL_format = "$('a')";
var keywords_format = " $('meta[name=\"keywords\"]').eq(0).attr(\"content\")";
var source_format = " $('meta[name=\"mediaid\"]').eq(0).attr(\"content\")";
var title_format = "$('meta[property=\"og:title\"]').eq(0).attr(\"content\")";
var date_format = "$('meta[property=\"article:published_time\"]').eq(0).attr(\"content\")";
var author_format = "$('meta[property=\"article:author\"]').eq(0).attr(\"content\")";
var desc_format = " $('meta[property=\"og:description\"]').eq(0).attr(\"content\")";
var content_format = "$('.article').text()";

// A link counts as a news article when it ends in "<digits>.shtml".
// The dot is escaped so it matches a literal "." only; unescaped, it would
// match any character and accept URLs such as ".../123xshtml".
var url_reg = /[0-9]+\.shtml$/;
根据各字段(关键词、来源、标题、日期、作者、摘要、正文)所在的 HTML 标签,分别设置对应的提取表达式(format)。
function seedget() {
request(seedURL, function (err, res, body) { //读取种子页面
// try {
//用iconv转换编码
var html = myIconv.decode(body, myEncoding);
//console.log(html);
//准备用cheerio解析html
var $ = myCheerio.load(html, { decodeEntities: true });
// } catch (e) { console.log('读种子页面并转码出错:' + e) };
var seedurl_news;
try {
seedurl_news = eval(seedURL_format);
} catch (e) { console.log('url列表所处的html块识别出错:' + e) };
seedurl_news.each(function (i, e) { //遍历种子页面里所有的a链接
var myURL = "";
try {
//得到具体新闻url
var href = "";
href = $(e).attr("href");
if (href == undefined) return;
if (href.toLowerCase().indexOf('http://') >= 0) myURL = href; //http://开头的
else if (href.startsWith('//')) myURL = 'http:' + href; 开头的
else myURL = seedURL.substr(0, seedURL.lastIndexOf('/') + 1) + href; //其他
} catch (e) { console.log('识别种子页面中的新闻链接出错:' + e) }
if (!url_reg.test(myURL)) return; //检验是否符合新闻url的正则表达式
//console.log('get:'+myURL);
var fetch_url_Sql = 'select url from fetches_test where url=?';
var fetch_url_Sql_Params = [myURL];
mysql.query(fetch_url_Sql, fetch_url_Sql_Params, function (qerr, vals, fields) {