Node.js实现网络新闻爬虫及搜索功能(三)
系列文章查看不到可能是CSDN审核原因,可以在我的知乎专栏看到所有文章:https://www.zhihu.com/column/c_1370026160999415808
项目要求
一、爬虫部分
1、完成目标网站的网页分析和爬虫设计。
2、爬取不少于100条数据(每条数据包括7个字段,新闻关键词、新闻标题、新闻日期、新闻作者、新闻来源、新闻摘要、新闻内容),并存储在数据库中。
二、搜索网站部分
1、完成对数据库中爬取新闻内容和标题的搜索功能,搜索结果以表格形式展示在前端页面中。
2、完成对搜索内容的时间热度分析,使用表格展示爬取数据内容中每一天包含搜索内容的条数。
本文是该项目第三部分:改写爬虫为定时爬取
二、定时爬虫改写
经过该系列文章(一)和(二),我们已经拥有了两个Node.js爬虫。由于新闻网站会不断推送新的新闻,我们需要设置一个定时任务,在指定的时间点自动调用爬虫,从而持续爬取更新的新闻。
1. 回顾已有代码
crawler_163.js
var crawler_request = require('request');
var crawler_iconv = require('iconv-lite');
var crawler_cheerio = require('cheerio');
require('date-utils');
// 连接数据库
var crawler_sql = require("./crawler_sql.js");
// var crawler_sql = require("mysql");
// var pool = crawler_sql.createPool({
// host: '127.0.0.1',
// user: 'root',
// password: 'root',
// database: 'crawl'
// });
// var query = function(sql, sqlparam, callback) {
// pool.getConnection(function(err, conn) {
// if (err) {
// callback(err, null, null);
// } else {
// conn.query(sql, sqlparam, function(qerr, vals, fields) {
// conn.release(); //释放连接
// callback(qerr, vals, fields); //事件驱动回调
// });
// }
// });
// };
// var query_noparam = function(sql, callback) {
// pool.getConnection(function(err, conn) {
// if (err) {
// callback(err, null, null);
// } else {
// conn.query(sql, function(qerr, vals, fields) {
// conn.release(); //释放连接
// callback(qerr, vals, fields); //事件驱动回调
// });
// }
// });
// };
// exports.query = query;
// exports.query_noparam = query_noparam;
// 爬取网页首页
/**
 * Fetch `url` through the shared `crawler_request` client.
 *
 * The request is sent with `encoding: null` (the callers decode the body
 * themselves with iconv-lite), a desktop Chrome User-Agent header and a
 * timeout value of 10000.
 *
 * @param {string} url - Address of the page to download.
 * @param {Function} callback - Handed to `crawler_request` unchanged; the
 *     callers in this file use the signature (err, res, body).
 */
function request(url, callback) {
    var browser_headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.65 Safari/537.36'
    };
    crawler_request({
        url: url,
        encoding: null,
        headers: browser_headers,
        timeout: 10000
    }, callback);
}
// Front page the crawl starts from; crawler() reads this variable.
var crawler_url = 'https://news.163.com/';
// Run a single crawl as soon as the script is loaded.
crawler();
/**
 * Download the front page at `crawler_url`, pick out every hyperlink that
 * looks like a 163.com news article and pass the ones that are not yet in the
 * `news` table to `crawler_news_url`.
 *
 * Takes no arguments and returns nothing; all work happens in callbacks.
 * A failed download, a decoding failure or a failed database lookup is logged
 * and ends the affected step, so nothing throws inside a callback (an
 * uncaught exception there would terminate the Node.js process).
 */
function crawler() {
    // Article pages look like https://www.163.com/news/article/G8HQOAKE0001899O.html
    var news_reg = /\/news\/article\/([a-zA-Z0-9]{16}).html/;
    // Articles whose id ends in 0001982T are skipped on purpose.
    var news_reg_special = /\/news\/article\/([a-zA-Z0-9]{8})0001982T.html/;
    var news_search_sql = 'select url from news where url=?';

    request(crawler_url, function(err, res, body) {
        // Without a body there is nothing to parse.
        if (err || !body) {
            console.log('首页请求失败:' + err);
            return;
        }

        var $;
        try {
            var url_html = crawler_iconv.decode(body, 'UTF-8');
            $ = crawler_cheerio.load(url_html, { decodeEntities: true });
        } catch (e) {
            console.log('页面解码错误:' + e);
            return;
        }

        // Directory part of the start URL, used to resolve relative links.
        var base_url = crawler_url.substr(0, crawler_url.lastIndexOf('/') + 1);
        // The front page repeats links; look each article up only once per run.
        var seen_urls = new Set();

        $('a').each(function(i, e) {
            var url_href = $(e).attr('href');
            if (typeof url_href !== 'string') {
                // Anchor without an href: nothing to crawl.
                return;
            }
            url_href = url_href.trim();

            var news_url;
            if (/^https?:\/\//i.test(url_href)) {
                // Only a leading scheme makes the link absolute.
                news_url = url_href;
            } else if (url_href.startsWith('//')) {
                news_url = 'https:' + url_href;
            } else {
                news_url = base_url + url_href;
            }

            if (!news_reg.test(news_url) || news_reg_special.test(news_url)) {
                console.log('新闻链接不符合格式!');
                return;
            }
            if (seen_urls.has(news_url)) {
                return;
            }
            seen_urls.add(news_url);

            crawler_sql.query(news_search_sql, [news_url], function(qerr, vals, fields) {
                if (qerr || !vals) {
                    // A failed lookup yields no rows object; skip this link.
                    console.log(qerr);
                    return;
                }
                if (vals.length > 0) {
                    console.log('该新闻页面已被爬取!');
                } else {
                    crawler_news_url(news_url);
                }
            });
        });
    });
}
// 爬取新闻链接
/**
 * Download one 163.com article page, extract its fields and insert them into
 * the `news` table.
 *
 * Extracted fields: keywords, title, publish date, author, source, summary
 * and content. A field that cannot be found keeps its default ('' or, for the
 * date, the current time). The row is only written when some article content
 * was found.
 *
 * @param {string} news_url - Absolute URL of the article page.
 */
function crawler_news_url(news_url) {
    // Removes every whitespace character, line breaks included.
    var strip_whitespace = function(text) {
        return text.replace(/[\r\n\s]/g, "");
    };

    request(news_url, function(err, res, body) {
        // Without a body there is nothing to extract.
        if (err || !body) {
            console.log('新闻页面请求失败:' + news_url + ' ' + err);
            return;
        }

        var $;
        try {
            var url_html = crawler_iconv.decode(body, 'UTF-8');
            $ = crawler_cheerio.load(url_html, { decodeEntities: true });
        } catch (e) {
            console.log('页面解码错误:' + e);
            return;
        }

        var news = {};
        // date-utils tokens: HH24 = 24-hour clock, MI = minutes, SS = seconds.
        // (MM is the month, so "HH:MM:SS" would not produce a time of day.)
        news.crawler_time = (new Date()).toFormat("YYYY-MM-DD HH24:MI:SS");
        news.url = news_url;
        news.url_encoding = 'UTF-8';
        news.keywords = '';
        news.title = '';
        news.date = new Date();
        news.author = '';
        news.source = '';
        news.summary = '';
        news.content = '';

        // Keywords
        try {
            var keywords = $('meta[name="keywords"]').eq(0).attr("content");
            if (keywords) {
                news.keywords = keywords;
            }
        } catch (e) {
            console.log('新闻关键词获取错误:' + e);
        }

        // Title
        try {
            news.title = strip_whitespace($('title').text());
        } catch (e) {
            console.log('新闻标题获取错误:' + e);
        }

        // Publish time; the default (now) is kept when the attribute is absent.
        try {
            var publish_time = $('#ne_wrap').eq(0).attr("data-publishtime");
            if (publish_time) {
                news.date = publish_time;
            }
        } catch (e) {
            console.log('新闻日期获取错误:' + e);
        }

        // Author: the alt text of the first .icon element, unless that is the
        // generic 'netease', in which case the editor name is taken from the
        // .post_author text.
        try {
            var icon_alt = $('.icon').eq(0).attr("alt");
            if (icon_alt == 'netease') {
                var author_text = strip_whitespace($('.post_author').text()).replace("本文来源:", "");
                var author_match = /责任编辑:.+_/.exec(author_text);
                news.author = author_match ?
                    author_match[0].replace("责任编辑:", "").replace("_", "") :
                    author_text;
            } else if (icon_alt) {
                news.author = icon_alt;
            }
        } catch (e) {
            console.log('新闻作者获取错误:' + e);
        }

        // Source: text of the first child of .post_info; when that is only the
        // '举报' link, fall back to the leading text node and cut everything
        // up to and including "来源:".
        try {
            news.source = strip_whitespace($('.post_info').children(':first').text());
            if (news.source == '举报') {
                news.source = strip_whitespace($('.post_info').prop('firstChild').nodeValue);
                var source_prefix = /.+来源:/.exec(news.source);
                if (source_prefix) {
                    news.source = news.source.replace(source_prefix[0], "");
                }
            }
        } catch (e) {
            console.log('新闻来源获取错误:' + e);
        }

        // Summary
        try {
            var description = $('meta[name="description"]').eq(0).attr("content");
            if (description) {
                news.summary = strip_whitespace(description);
            }
        } catch (e) {
            console.log('新闻摘要获取错误:' + e);
        }

        // Content
        try {
            news.content = strip_whitespace($('.post_body').text());
        } catch (e) {
            console.log('新闻内容获取错误:' + e);
        }

        console.log(JSON.stringify(news));

        // Store the article only when content was found.
        if (news.content != '') {
            var news_add_sql = 'INSERT INTO news(url, source, url_encoding, title, keywords, author, date, crawler_time, summary, content) VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?)';
            var news_add = [news.url, news.source, news.url_encoding,
                news.title, news.keywords, news.author, news.date,
                news.crawler_time, news.summary, news.content
            ];
            crawler_sql.query(news_add_sql, news_add, function(qerr, vals, fields) {
                if (qerr) {
                    console.log(qerr);
                }
            });
        }
    });
}
crawler_sina.js
var crawler_request = require('request');
var crawler_iconv = require('iconv-lite');
var crawler_cheerio = require('cheerio');
require('date-utils');
// 连接数据库
var crawler_sql = require("./crawler_sql.js");
// 爬取网页首页
/**
 * Fetch `url` through the shared `crawler_request` client.
 *
 * The request is sent with `encoding: null` (the callers decode the body
 * themselves with iconv-lite), a desktop Chrome User-Agent header and a
 * timeout value of 10000.
 *
 * @param {string} url - Address of the page to download.
 * @param {Function} callback - Handed to `crawler_request` unchanged; the
 *     callers in this file use the signature (err, res, body).
 */
function request(url, callback) {
    var browser_headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.65 Safari/537.36'
    };
    crawler_request({
        url: url,
        encoding: null,
        headers: browser_headers,
        timeout: 10000
    }, callback);
}
// Front page the crawl starts from; crawler() reads this variable.
var crawler_url = 'https://news.sina.com.cn/';
// Run a single crawl as soon as the script is loaded.
crawler();
/**
 * Download the front page at `crawler_url`, pick out every hyperlink that
 * looks like a Sina news article and pass the ones that are not yet in the
 * `news` table to `crawler_news_url`.
 *
 * Takes no arguments and returns nothing; all work happens in callbacks.
 * A failed download, a decoding failure or a failed database lookup is logged
 * and ends the affected step, so nothing throws inside a callback (an
 * uncaught exception there would terminate the Node.js process).
 */
function crawler() {
    // Article pages look like
    // https://news.sina.com.cn/c/xl/2021-04-29/doc-ikmyaawc2421496.shtml
    var news_reg = /\/(\d{4})-(\d{2})-(\d{2})\/doc-([a-zA-Z0-9]{15}).shtml/;
    var news_search_sql = 'select url from news where url=?';

    request(crawler_url, function(err, res, body) {
        // Without a body there is nothing to parse.
        if (err || !body) {
            console.log('首页请求失败:' + err);
            return;
        }

        var $;
        try {
            var url_html = crawler_iconv.decode(body, 'UTF-8');
            $ = crawler_cheerio.load(url_html, { decodeEntities: true });
        } catch (e) {
            console.log('页面解码错误:' + e);
            return;
        }

        // Directory part of the start URL, used to resolve relative links.
        var base_url = crawler_url.substr(0, crawler_url.lastIndexOf('/') + 1);
        // The front page repeats links; look each article up only once per run.
        var seen_urls = new Set();

        $('a').each(function(i, e) {
            var url_href = $(e).attr('href');
            if (typeof url_href !== 'string') {
                // Anchor without an href: nothing to crawl.
                return;
            }
            url_href = url_href.trim();

            var news_url;
            if (/^https?:\/\//i.test(url_href)) {
                // Only a leading scheme makes the link absolute.
                news_url = url_href;
            } else if (url_href.startsWith('//')) {
                news_url = 'https:' + url_href;
            } else {
                news_url = base_url + url_href;
            }

            if (!news_reg.test(news_url)) {
                console.log('新闻链接不符合格式!');
                return;
            }
            if (seen_urls.has(news_url)) {
                return;
            }
            seen_urls.add(news_url);

            crawler_sql.query(news_search_sql, [news_url], function(qerr, vals, fields) {
                if (qerr || !vals) {
                    // A failed lookup yields no rows object; skip this link.
                    console.log(qerr);
                    return;
                }
                if (vals.length > 0) {
                    console.log('该新闻页面已被爬取!');
                } else {
                    crawler_news_url(news_url);
                }
            });
        });
    });
}
// 爬取新闻链接
/**
 * Download one Sina article page, extract its fields and insert them into the
 * `news` table.
 *
 * Extracted fields: keywords, title, publish date, author, source, summary
 * and content. A field that cannot be found keeps its default ('' or, for the
 * date, the current time); an empty author falls back to the source. The row
 * is only written when both content and source were found.
 *
 * @param {string} news_url - Absolute URL of the article page.
 */
function crawler_news_url(news_url) {
    // Removes every whitespace character, line breaks included.
    var strip_whitespace = function(text) {
        return text.replace(/[\r\n\s]/g, "");
    };

    request(news_url, function(err, res, body) {
        // Without a body there is nothing to extract.
        if (err || !body) {
            console.log('新闻页面请求失败:' + news_url + ' ' + err);
            return;
        }

        var $;
        try {
            var url_html = crawler_iconv.decode(body, 'UTF-8');
            $ = crawler_cheerio.load(url_html, { decodeEntities: true });
        } catch (e) {
            console.log('页面解码错误:' + e);
            return;
        }

        var news = {};
        // date-utils tokens: HH24 = 24-hour clock, MI = minutes, SS = seconds.
        // (MM is the month, so "HH:MM:SS" would not produce a time of day.)
        news.crawler_time = (new Date()).toFormat("YYYY-MM-DD HH24:MI:SS");
        news.url = news_url;
        news.url_encoding = 'UTF-8';
        news.keywords = '';
        news.title = '';
        news.date = new Date();
        news.author = '';
        news.source = '';
        news.summary = '';
        news.content = '';

        // Keywords
        try {
            var keywords = $('meta[name="keywords"]').eq(0).attr("content");
            if (keywords) {
                news.keywords = keywords;
            }
        } catch (e) {
            console.log('新闻关键词获取错误:' + e);
        }

        // Title
        try {
            news.title = strip_whitespace($('title').text());
        } catch (e) {
            console.log('新闻标题获取错误:' + e);
        }

        // Publish time: "2021年04月29日 ..." becomes "2021-04-29 ...".
        // The default (now) is kept when the page has no .date text.
        try {
            var date_text = $('.date').text();
            if (date_text) {
                news.date = date_text.replace('年', '-').replace('月', '-').replace('日', '');
            }
        } catch (e) {
            console.log('新闻日期获取错误:' + e);
        }

        // Author
        try {
            news.author = $('.show_author').text().replace("责任编辑:", "");
        } catch (e) {
            console.log('新闻作者获取错误:' + e);
        }

        // Source; stays '' when the meta tag is absent so the guard below
        // skips the insert.
        try {
            var media_id = $('meta[name="mediaid"]').eq(0).attr("content");
            if (media_id) {
                news.source = media_id;
            }
        } catch (e) {
            console.log('新闻来源获取错误:' + e);
        }

        // Summary: description meta first, og:description as fallback. A
        // missing tag counts as empty so the fallback is still tried.
        try {
            news.summary = strip_whitespace($('meta[name="description"]').eq(0).attr("content") || '');
            if (news.summary == "") {
                news.summary = strip_whitespace($('meta[property="og:description"]').eq(0).attr("content") || '');
            }
        } catch (e) {
            console.log('新闻摘要获取错误:' + e);
        }

        // Content: .article first, #article_content as fallback.
        try {
            news.content = strip_whitespace($('.article').text());
            if (news.content == "") {
                news.content = strip_whitespace($('#article_content').text());
            }
        } catch (e) {
            console.log('新闻内容获取错误:' + e);
        }

        console.log(JSON.stringify(news));

        // Articles without a named editor are attributed to their source.
        if (news.author == '' || news.author == null) {
            news.author = news.source;
        }

        // Store the article only when both content and source were found.
        if (news.content != '' && news.source != '') {
            var news_add_sql = 'INSERT INTO news(url, source, url_encoding, title, keywords, author, date, crawler_time, summary, content) VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?)';
            var news_add = [news.url, news.source, news.url_encoding,
                news.title, news.keywords, news.author, news.date,
                news.crawler_time, news.summary, news.content
            ];
            crawler_sql.query(news_add_sql, news_add, function(qerr, vals, fields) {
                if (qerr) {
                    console.log(qerr);
                }
            });
        }
    });
}
2. 引入相关包
新建crawler_schedule_163.js和crawler_schedule_sina.js文件。
这两个文件分别由crawler_163.js和crawler_sina.js经过相同的改写得到。
引入需要的相关包node-schedule:
// 定时执行
var crawler_schedule = require('node-schedule');
建立定时规则:
// Recurrence rule that decides when the scheduled crawl fires.
var crawler_rule = new crawler_schedule.RecurrenceRule();
// Disabled alternatives: restrict the rule to given hours / a given minute.
// crawler_rule.hour = [0, 12];
// crawler_rule.minute = 5;
// Only the second field is set; the article describes this as firing at
// second 0 of every minute.
crawler_rule.second = 0;
其中,.hour属性用于设定执行爬虫的小时,如此处的[0, 12]表示在0时和12时执行爬虫;.minute属性用于设定执行爬虫的分钟,如此处的5表示在每小时的第5分钟执行爬虫;.second属性用于设定执行爬虫的秒,如此处的0表示在每分钟的第0秒执行爬虫。上面的代码只设置了.second = 0(.hour和.minute两行已被注释),因此爬虫每分钟执行一次。
将原来调用爬虫的主函数注释掉:
// crawler();
改为:
// Register crawler() as the job to run each time crawler_rule matches.
crawler_schedule.scheduleJob(crawler_rule, function() {
crawler();
});
3. 定时爬虫代码
crawler_schedule_163.js
var crawler_request = require('request');
var crawler_iconv = require('iconv-lite');
var crawler_cheerio = require('cheerio');
require('date-utils');
// 连接数据库
var crawler_sql = require("./crawler_sql.js");
// 定时执行
var crawler_schedule = require('node-schedule');
// Recurrence rule that decides when the scheduled crawl fires.
var crawler_rule = new crawler_schedule.RecurrenceRule();
// Disabled alternatives: restrict the rule to given hours / a given minute.
// crawler_rule.hour = [0, 12];
// crawler_rule.minute = 5;
// Only the second field is set; the article describes this as firing at
// second 0 of every minute.
crawler_rule.second = 0;
// Register crawler() as the job to run each time crawler_rule matches.
crawler_schedule.scheduleJob(crawler_rule, function() {
crawler();
});
// 定时爬取网页首页
/**
 * Fetch `url` through the shared `crawler_request` client.
 *
 * The request is sent with `encoding: null` (the callers decode the body
 * themselves with iconv-lite), a desktop Chrome User-Agent header and a
 * timeout value of 10000.
 *
 * @param {string} url - Address of the page to download.
 * @param {Function} callback - Handed to `crawler_request` unchanged; the
 *     callers in this file use the signature (err, res, body).
 */
function request(url, callback) {
    var browser_headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.65 Safari/537.36'
    };
    crawler_request({
        url: url,
        encoding: null,
        headers: browser_headers,
        timeout: 10000
    }, callback);
}
// Front page the crawl starts from; crawler() reads this variable.
var crawler_url = 'https://news.163.com/';
/**
 * Download the front page at `crawler_url`, pick out every hyperlink that
 * looks like a 163.com news article and pass the ones that are not yet in the
 * `news` table to `crawler_news_url`.
 *
 * Takes no arguments and returns nothing; all work happens in callbacks.
 * A failed download, a decoding failure or a failed database lookup is logged
 * and ends the affected step, so nothing throws inside a callback (an
 * uncaught exception there would terminate the process and with it every
 * later scheduled run).
 */
function crawler() {
    // Article pages look like https://www.163.com/news/article/G8HQOAKE0001899O.html
    var news_reg = /\/news\/article\/([a-zA-Z0-9]{16}).html/;
    // Articles whose id ends in 0001982T are skipped on purpose.
    var news_reg_special = /\/news\/article\/([a-zA-Z0-9]{8})0001982T.html/;
    var news_search_sql = 'select url from news where url=?';

    request(crawler_url, function(err, res, body) {
        // Without a body there is nothing to parse.
        if (err || !body) {
            console.log('首页请求失败:' + err);
            return;
        }

        var $;
        try {
            var url_html = crawler_iconv.decode(body, 'UTF-8');
            $ = crawler_cheerio.load(url_html, { decodeEntities: true });
        } catch (e) {
            console.log('页面解码错误:' + e);
            return;
        }

        // Directory part of the start URL, used to resolve relative links.
        var base_url = crawler_url.substr(0, crawler_url.lastIndexOf('/') + 1);
        // The front page repeats links; look each article up only once per run.
        var seen_urls = new Set();

        $('a').each(function(i, e) {
            var url_href = $(e).attr('href');
            if (typeof url_href !== 'string') {
                // Anchor without an href: nothing to crawl.
                return;
            }
            url_href = url_href.trim();

            var news_url;
            if (/^https?:\/\//i.test(url_href)) {
                // Only a leading scheme makes the link absolute.
                news_url = url_href;
            } else if (url_href.startsWith('//')) {
                news_url = 'https:' + url_href;
            } else {
                news_url = base_url + url_href;
            }

            if (!news_reg.test(news_url) || news_reg_special.test(news_url)) {
                console.log('新闻链接不符合格式!');
                return;
            }
            if (seen_urls.has(news_url)) {
                return;
            }
            seen_urls.add(news_url);

            crawler_sql.query(news_search_sql, [news_url], function(qerr, vals, fields) {
                if (qerr || !vals) {
                    // A failed lookup yields no rows object; skip this link.
                    console.log(qerr);
                    return;
                }
                if (vals.length > 0) {
                    console.log('该新闻页面已被爬取!');
                } else {
                    crawler_news_url(news_url);
                }
            });
        });
    });
}
// 爬取新闻链接
/**
 * Download one 163.com article page, extract its fields and insert them into
 * the `news` table.
 *
 * Extracted fields: keywords, title, publish date, author, source, summary
 * and content. A field that cannot be found keeps its default ('' or, for the
 * date, the current time). The row is only written when some article content
 * was found.
 *
 * @param {string} news_url - Absolute URL of the article page.
 */
function crawler_news_url(news_url) {
    // Removes every whitespace character, line breaks included.
    var strip_whitespace = function(text) {
        return text.replace(/[\r\n\s]/g, "");
    };

    request(news_url, function(err, res, body) {
        // Without a body there is nothing to extract.
        if (err || !body) {
            console.log('新闻页面请求失败:' + news_url + ' ' + err);
            return;
        }

        var $;
        try {
            var url_html = crawler_iconv.decode(body, 'UTF-8');
            $ = crawler_cheerio.load(url_html, { decodeEntities: true });
        } catch (e) {
            console.log('页面解码错误:' + e);
            return;
        }

        var news = {};
        // date-utils tokens: HH24 = 24-hour clock, MI = minutes, SS = seconds.
        // (MM is the month, so "HH:MM:SS" would not produce a time of day.)
        news.crawler_time = (new Date()).toFormat("YYYY-MM-DD HH24:MI:SS");
        news.url = news_url;
        news.url_encoding = 'UTF-8';
        news.keywords = '';
        news.title = '';
        news.date = new Date();
        news.author = '';
        news.source = '';
        news.summary = '';
        news.content = '';

        // Keywords
        try {
            var keywords = $('meta[name="keywords"]').eq(0).attr("content");
            if (keywords) {
                news.keywords = keywords;
            }
        } catch (e) {
            console.log('新闻关键词获取错误:' + e);
        }

        // Title
        try {
            news.title = strip_whitespace($('title').text());
        } catch (e) {
            console.log('新闻标题获取错误:' + e);
        }

        // Publish time; the default (now) is kept when the attribute is absent.
        try {
            var publish_time = $('#ne_wrap').eq(0).attr("data-publishtime");
            if (publish_time) {
                news.date = publish_time;
            }
        } catch (e) {
            console.log('新闻日期获取错误:' + e);
        }

        // Author: the alt text of the first .icon element, unless that is the
        // generic 'netease', in which case the editor name is taken from the
        // .post_author text.
        try {
            var icon_alt = $('.icon').eq(0).attr("alt");
            if (icon_alt == 'netease') {
                var author_text = strip_whitespace($('.post_author').text()).replace("本文来源:", "");
                var author_match = /责任编辑:.+_/.exec(author_text);
                news.author = author_match ?
                    author_match[0].replace("责任编辑:", "").replace("_", "") :
                    author_text;
            } else if (icon_alt) {
                news.author = icon_alt;
            }
        } catch (e) {
            console.log('新闻作者获取错误:' + e);
        }

        // Source: text of the first child of .post_info; when that is only the
        // '举报' link, fall back to the leading text node and cut everything
        // up to and including "来源:".
        try {
            news.source = strip_whitespace($('.post_info').children(':first').text());
            if (news.source == '举报') {
                news.source = strip_whitespace($('.post_info').prop('firstChild').nodeValue);
                var source_prefix = /.+来源:/.exec(news.source);
                if (source_prefix) {
                    news.source = news.source.replace(source_prefix[0], "");
                }
            }
        } catch (e) {
            console.log('新闻来源获取错误:' + e);
        }

        // Summary
        try {
            var description = $('meta[name="description"]').eq(0).attr("content");
            if (description) {
                news.summary = strip_whitespace(description);
            }
        } catch (e) {
            console.log('新闻摘要获取错误:' + e);
        }

        // Content
        try {
            news.content = strip_whitespace($('.post_body').text());
        } catch (e) {
            console.log('新闻内容获取错误:' + e);
        }

        console.log(JSON.stringify(news));

        // Store the article only when content was found.
        if (news.content != '') {
            var news_add_sql = 'INSERT INTO news(url, source, url_encoding, title, keywords, author, date, crawler_time, summary, content) VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?)';
            var news_add = [news.url, news.source, news.url_encoding,
                news.title, news.keywords, news.author, news.date,
                news.crawler_time, news.summary, news.content
            ];
            crawler_sql.query(news_add_sql, news_add, function(qerr, vals, fields) {
                if (qerr) {
                    console.log(qerr);
                }
            });
        }
    });
}
crawler_schedule_sina.js
var crawler_request = require('request');
var crawler_iconv = require('iconv-lite');
var crawler_cheerio = require('cheerio');
require('date-utils');
// 连接数据库
var crawler_sql = require("./crawler_sql.js");
// 定时执行
var crawler_schedule = require('node-schedule');
// Recurrence rule that decides when the scheduled crawl fires.
var crawler_rule = new crawler_schedule.RecurrenceRule();
// Disabled alternatives: restrict the rule to given hours / a given minute.
// crawler_rule.hour = [0, 12];
// crawler_rule.minute = 5;
// Only the second field is set; the article describes this as firing at
// second 0 of every minute.
crawler_rule.second = 0;
// Register crawler() as the job to run each time crawler_rule matches.
crawler_schedule.scheduleJob(crawler_rule, function() {
crawler();
});
// 定时爬取网页首页
/**
 * Fetch `url` through the shared `crawler_request` client.
 *
 * The request is sent with `encoding: null` (the callers decode the body
 * themselves with iconv-lite), a desktop Chrome User-Agent header and a
 * timeout value of 10000.
 *
 * @param {string} url - Address of the page to download.
 * @param {Function} callback - Handed to `crawler_request` unchanged; the
 *     callers in this file use the signature (err, res, body).
 */
function request(url, callback) {
    var browser_headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.65 Safari/537.36'
    };
    crawler_request({
        url: url,
        encoding: null,
        headers: browser_headers,
        timeout: 10000
    }, callback);
}
// Front page the crawl starts from; crawler() reads this variable.
var crawler_url = 'https://news.sina.com.cn/';
/**
 * Download the front page at `crawler_url`, pick out every hyperlink that
 * looks like a Sina news article and pass the ones that are not yet in the
 * `news` table to `crawler_news_url`.
 *
 * Takes no arguments and returns nothing; all work happens in callbacks.
 * A failed download, a decoding failure or a failed database lookup is logged
 * and ends the affected step, so nothing throws inside a callback (an
 * uncaught exception there would terminate the process and with it every
 * later scheduled run).
 */
function crawler() {
    // Article pages look like
    // https://news.sina.com.cn/c/xl/2021-04-29/doc-ikmyaawc2421496.shtml
    var news_reg = /\/(\d{4})-(\d{2})-(\d{2})\/doc-([a-zA-Z0-9]{15}).shtml/;
    var news_search_sql = 'select url from news where url=?';

    request(crawler_url, function(err, res, body) {
        // Without a body there is nothing to parse.
        if (err || !body) {
            console.log('首页请求失败:' + err);
            return;
        }

        var $;
        try {
            var url_html = crawler_iconv.decode(body, 'UTF-8');
            $ = crawler_cheerio.load(url_html, { decodeEntities: true });
        } catch (e) {
            console.log('页面解码错误:' + e);
            return;
        }

        // Directory part of the start URL, used to resolve relative links.
        var base_url = crawler_url.substr(0, crawler_url.lastIndexOf('/') + 1);
        // The front page repeats links; look each article up only once per run.
        var seen_urls = new Set();

        $('a').each(function(i, e) {
            var url_href = $(e).attr('href');
            if (typeof url_href !== 'string') {
                // Anchor without an href: nothing to crawl.
                return;
            }
            url_href = url_href.trim();

            var news_url;
            if (/^https?:\/\//i.test(url_href)) {
                // Only a leading scheme makes the link absolute.
                news_url = url_href;
            } else if (url_href.startsWith('//')) {
                news_url = 'https:' + url_href;
            } else {
                news_url = base_url + url_href;
            }

            if (!news_reg.test(news_url)) {
                console.log('新闻链接不符合格式!');
                return;
            }
            if (seen_urls.has(news_url)) {
                return;
            }
            seen_urls.add(news_url);

            crawler_sql.query(news_search_sql, [news_url], function(qerr, vals, fields) {
                if (qerr || !vals) {
                    // A failed lookup yields no rows object; skip this link.
                    console.log(qerr);
                    return;
                }
                if (vals.length > 0) {
                    console.log('该新闻页面已被爬取!');
                } else {
                    crawler_news_url(news_url);
                }
            });
        });
    });
}
// 爬取新闻链接
/**
 * Download one Sina article page, extract its fields and insert them into the
 * `news` table.
 *
 * Extracted fields: keywords, title, publish date, author, source, summary
 * and content. A field that cannot be found keeps its default ('' or, for the
 * date, the current time); an empty author falls back to the source. The row
 * is only written when both content and source were found.
 *
 * @param {string} news_url - Absolute URL of the article page.
 */
function crawler_news_url(news_url) {
    // Removes every whitespace character, line breaks included.
    var strip_whitespace = function(text) {
        return text.replace(/[\r\n\s]/g, "");
    };

    request(news_url, function(err, res, body) {
        // Without a body there is nothing to extract.
        if (err || !body) {
            console.log('新闻页面请求失败:' + news_url + ' ' + err);
            return;
        }

        var $;
        try {
            var url_html = crawler_iconv.decode(body, 'UTF-8');
            $ = crawler_cheerio.load(url_html, { decodeEntities: true });
        } catch (e) {
            console.log('页面解码错误:' + e);
            return;
        }

        var news = {};
        // date-utils tokens: HH24 = 24-hour clock, MI = minutes, SS = seconds.
        // (MM is the month, so "HH:MM:SS" would not produce a time of day.)
        news.crawler_time = (new Date()).toFormat("YYYY-MM-DD HH24:MI:SS");
        news.url = news_url;
        news.url_encoding = 'UTF-8';
        news.keywords = '';
        news.title = '';
        news.date = new Date();
        news.author = '';
        news.source = '';
        news.summary = '';
        news.content = '';

        // Keywords
        try {
            var keywords = $('meta[name="keywords"]').eq(0).attr("content");
            if (keywords) {
                news.keywords = keywords;
            }
        } catch (e) {
            console.log('新闻关键词获取错误:' + e);
        }

        // Title
        try {
            news.title = strip_whitespace($('title').text());
        } catch (e) {
            console.log('新闻标题获取错误:' + e);
        }

        // Publish time: "2021年04月29日 ..." becomes "2021-04-29 ...".
        // The default (now) is kept when the page has no .date text.
        try {
            var date_text = $('.date').text();
            if (date_text) {
                news.date = date_text.replace('年', '-').replace('月', '-').replace('日', '');
            }
        } catch (e) {
            console.log('新闻日期获取错误:' + e);
        }

        // Author
        try {
            news.author = $('.show_author').text().replace("责任编辑:", "");
        } catch (e) {
            console.log('新闻作者获取错误:' + e);
        }

        // Source; stays '' when the meta tag is absent so the guard below
        // skips the insert.
        try {
            var media_id = $('meta[name="mediaid"]').eq(0).attr("content");
            if (media_id) {
                news.source = media_id;
            }
        } catch (e) {
            console.log('新闻来源获取错误:' + e);
        }

        // Summary: description meta first, og:description as fallback. A
        // missing tag counts as empty so the fallback is still tried.
        try {
            news.summary = strip_whitespace($('meta[name="description"]').eq(0).attr("content") || '');
            if (news.summary == "") {
                news.summary = strip_whitespace($('meta[property="og:description"]').eq(0).attr("content") || '');
            }
        } catch (e) {
            console.log('新闻摘要获取错误:' + e);
        }

        // Content: .article first, #article_content as fallback.
        try {
            news.content = strip_whitespace($('.article').text());
            if (news.content == "") {
                news.content = strip_whitespace($('#article_content').text());
            }
        } catch (e) {
            console.log('新闻内容获取错误:' + e);
        }

        console.log(JSON.stringify(news));

        // Articles without a named editor are attributed to their source.
        if (news.author == '' || news.author == null) {
            news.author = news.source;
        }

        // Store the article only when both content and source were found.
        if (news.content != '' && news.source != '') {
            var news_add_sql = 'INSERT INTO news(url, source, url_encoding, title, keywords, author, date, crawler_time, summary, content) VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?)';
            var news_add = [news.url, news.source, news.url_encoding,
                news.title, news.keywords, news.author, news.date,
                news.crawler_time, news.summary, news.content
            ];
            crawler_sql.query(news_add_sql, news_add, function(qerr, vals, fields) {
                if (qerr) {
                    console.log(qerr);
                }
            });
        }
    });
}