引入模块
var fs=require('fs'); //文件操作
var request=require('request');
var cheerio=require('cheerio');
var iconv=require('iconv-lite');
var mysql=require('./mysql.js'); //Mysql使用
var schedule=require('node-schedule'); //定时操作
var async=require('async');
//控制并发数,避免被屏蔽;此处只是提及,不一定使用,具体用法见async.mapLimit。
require('date-utils');
mysql.js
var mysql = require("mysql");
// Shared MySQL connection pool used by the query helpers in this module.
// Each setting may be overridden with an environment variable so that
// credentials need not be committed with the source; when a variable is
// not set, the original local-development value is used unchanged.
var pool = mysql.createPool({
    host: process.env.MYSQL_HOST !== undefined ? process.env.MYSQL_HOST : '127.0.0.1',
    user: process.env.MYSQL_USER !== undefined ? process.env.MYSQL_USER : 'root',
    password: process.env.MYSQL_PASSWORD !== undefined ? process.env.MYSQL_PASSWORD : 'root',
    // Database that holds the `fetches` table on the local MySQL server.
    database: process.env.MYSQL_DATABASE !== undefined ? process.env.MYSQL_DATABASE : 'crawl'
});
/**
 * Run a SQL statement with bound parameters on a pooled connection.
 *
 * @param {string} sql - SQL text, typically containing `?` placeholders.
 * @param {Array} sqlparam - Values passed to the driver alongside `sql`.
 * @param {Function} callback - Invoked as (err, rows, fields). When no
 *   connection can be obtained it is invoked as (err, null, null).
 */
var query = function(sql, sqlparam, callback) {
    pool.getConnection(function onConnection(connErr, connection) {
        if (connErr) {
            callback(connErr, null, null);
            return;
        }
        connection.query(sql, sqlparam, function onResult(queryErr, rows, fieldInfo) {
            // Give the connection back to the pool before notifying the caller.
            connection.release();
            callback(queryErr, rows, fieldInfo);
        });
    });
};
/**
 * Run a SQL statement that needs no bound parameters on a pooled connection.
 *
 * @param {string} sql - Complete SQL text.
 * @param {Function} callback - Invoked as (err, rows, fields). When no
 *   connection can be obtained it is invoked as (err, null, null).
 */
var query_noparam = function(sql, callback) {
    pool.getConnection(function onConnection(connErr, connection) {
        if (connErr) {
            callback(connErr, null, null);
            return;
        }
        connection.query(sql, function onResult(queryErr, rows, fieldInfo) {
            // Give the connection back to the pool before notifying the caller.
            connection.release();
            callback(queryErr, rows, fieldInfo);
        });
    });
};
// Public surface of this module: both query helpers, exported under their own names.
Object.assign(exports, {
    query: query,
    query_noparam: query_noparam
});
mysql.js内的内容不进行解析.
表单
-- One row per crawled article page.
CREATE TABLE `fetches` (
-- Auto-incrementing surrogate key (also the primary key below).
`id_fetches` int(11) NOT NULL AUTO_INCREMENT,
-- Page address; the url_UNIQUE key below forbids storing the same URL twice.
`url` varchar(200) DEFAULT NULL,
`source_name` varchar(200) DEFAULT NULL,
`source_encoding` varchar(45) DEFAULT NULL,
`title` varchar(200) DEFAULT NULL,
`keywords` varchar(200) DEFAULT NULL,
`author` varchar(200) DEFAULT NULL,
-- Declared as a DATE here, so inserted values must be valid dates.
`publish_date` date DEFAULT NULL,
-- Time at which the crawler fetched the page (supplied by the crawler).
`crawltime` datetime DEFAULT NULL,
`content` longtext,
-- Filled in by MySQL when the row is inserted.
`createtime` datetime DEFAULT CURRENT_TIMESTAMP,
PRIMARY KEY (`id_fetches`),
-- NOTE(review): the primary key already enforces uniqueness of id_fetches,
-- so this extra unique key looks redundant — confirm before removing.
UNIQUE KEY `id_fetches_UNIQUE` (`id_fetches`),
UNIQUE KEY `url_UNIQUE` (`url`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
'id_fetches' int(11) NOT NULL AUTO_INCREMENT
//11为整数的显示宽度,不影响可存储的取值范围;
//varchar(n)括号中的数字n含义不同,表示该列最多可存放n个字符。
'content' longtext
//内容类型;
'createtime' datetime DEFAULT CURRENT_TIMESTAMP
//创建时间;
UNIQUE KEY 'id_fetches_UNIQUE' ('id_fetches'),
UNIQUE KEY 'url_UNIQUE' ('url')
//id_fetches和url在表中不可重复.
具体操作
进入服务器:
输入mysql -u root -p:
建立数据库:
注意,我们可以发现,mysql没有反应,这是因为每个指令后须有一个’;’,即分号;
输入create database crawl;后,输入use crawl,回车,再复制以下代码,按鼠标右键,粘贴至命令行:
-- One row per crawled article page (version to paste into the mysql client).
CREATE TABLE `fetches` (
-- Auto-incrementing surrogate key (also the primary key below).
`id_fetches` int(11) NOT NULL AUTO_INCREMENT,
-- Page address; the url_UNIQUE key below forbids storing the same URL twice.
`url` varchar(200) DEFAULT NULL,
`source_name` varchar(200) DEFAULT NULL,
`source_encoding` varchar(45) DEFAULT NULL,
`title` varchar(200) DEFAULT NULL,
`keywords` varchar(200) DEFAULT NULL,
`author` varchar(200) DEFAULT NULL,
-- Stored as free text (up to 50 characters) rather than as a DATE.
`publish_date` varchar(50) DEFAULT NULL,
-- Time at which the crawler fetched the page (supplied by the crawler).
`crawltime` datetime DEFAULT NULL,
`content` longtext,
-- Filled in by MySQL when the row is inserted.
`createtime` datetime DEFAULT CURRENT_TIMESTAMP,
PRIMARY KEY (`id_fetches`),
-- NOTE(review): the primary key already enforces uniqueness of id_fetches,
-- so this extra unique key looks redundant — confirm before removing.
UNIQUE KEY `id_fetches_UNIQUE` (`id_fetches`),
UNIQUE KEY `url_UNIQUE` (`url`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
我们也可以利用Mysql图形化工具:
定位信息
// Site profile for the Eastmoney portal.
// Each *_format value is the text of a jQuery-style expression; the crawler
// passes it to eval() with cheerio's `$` in scope to extract one field.
var source_name = '东方财富网';
var rooturl = 'https://www.eastmoney.com/';
var myEncoding = 'utf-8';
var seedurl_format = `$('a')`;
var keywords_format = ` $('meta[name="keywords"]').eq(0).attr("content")`;
var title_format = `$('title').text()`;
var desc_format = ` $('meta[name="description"]').eq(0).attr("content")`;
var date_format = `$('.time').text()`; // a leading "." selects by class
var author_format = `$('.author').text()`;
// NOTE(review): '.source data-source' selects a <data-source> element inside
// .source; an attribute selector may have been intended — confirm.
var source_format = `$('.source data-source').eq(0).attr("data-source")`;
var content_format = `$('#ContentBody').text()`; // a leading "#" selects by id
// Article pages: a slash, 18 digits, any one character, then "html".
var url_reg = /\/(\d{18}).html/;
seedurl:
keywords&desc:
寻找网页正则表达式:
其他就不详细讲了.
定时模块
// Recurrence for the crawler: hours 0, 6, 12 and 18, at minute 30 of each.
var times = [0, 6, 12, 18]; // hours of the day
var times2 = 30;            // minute within each of those hours
var rule = new schedule.RecurrenceRule();
rule.hour = times;
rule.minute = times2;
// Every time the rule fires, start a crawl of the seed page.
schedule.scheduleJob(rule, function runScheduledCrawl() {
    rootget();
});
设置请求头
/**
 * Fetch a URL through the `request` library using the crawler's standard options.
 *
 * @param {string} starturl - Address to download.
 * @param {Function} callback - Passed straight to `request`; receives (err, res, body).
 */
function myrequest(starturl, callback) {
    // Browser-like User-Agent so the site is less likely to block the crawler.
    var browserHeaders = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.93 Safari/537.36"
    };
    request({
        url: starturl,
        // Must stay null (not utf8): the body is decoded later with iconv.
        encoding: null,
        headers: browserHeaders,
        // Give up on requests that take longer than this.
        timeout: 10000
    }, callback);
}
获取新闻页面
/**
 * Crawl the seed page (`rooturl`), collect every link whose URL matches
 * `url_reg`, and pass each URL that is not yet stored in `fetches` to `newsget`.
 *
 * Failures (request error, empty body, bad seed expression, database error)
 * are logged and end the affected step instead of throwing.
 */
function rootget(){
    myrequest(rooturl, function(err, res, body){
        // A failed or timed-out request delivers no Buffer, and iconv.decode
        // throws when its first argument is not one.
        if (err || !Buffer.isBuffer(body)) {
            console.log('种子页面请求失败:' + (err || '响应体为空'));
            return;
        }
        // Convert the raw bytes to text using the site's encoding.
        var html = iconv.decode(body, myEncoding);
        var $ = cheerio.load(html, {
            decodeEntities: true
        });
        var seedurl_news;
        try {
            // seedurl_format is developer-written configuration, not page data;
            // it is evaluated here so that it can see `$` (and `html`).
            seedurl_news = eval(seedurl_format);
        } catch (e) {
            console.log('url列表所处的html块识别出错:' + e);
            return;
        }
        // Nothing to iterate when the expression produced no selection.
        if (!seedurl_news || typeof seedurl_news.each !== 'function') return;

        seedurl_news.each(function(i, element){
            var myurl = "";
            try {
                // Work out the absolute URL of this link.
                var href = $(element).attr("href");
                if (href == null) return;
                var lowerHref = href.toLowerCase();
                if (lowerHref.startsWith('http://') || lowerHref.startsWith('https://')) {
                    // Already absolute (https links used to be treated as relative).
                    myurl = href;
                } else if (lowerHref.startsWith('//')) {
                    myurl = 'http:' + href;
                } else {
                    myurl = rooturl.substr(0, rooturl.lastIndexOf('/') + 1) + href;
                }
            } catch (linkErr) {
                console.log('识别种子页面中的新闻链接出错:' + linkErr);
            }
            // Keep only links that look like the wanted article pages.
            if (!url_reg.test(myurl)) return;

            // Look the URL up in `fetches`; only unknown URLs are crawled.
            var fetch_url_Sql = 'select url from fetches where url=?';
            var fetch_url_Sql_Params = [myurl];
            mysql.query(fetch_url_Sql, fetch_url_Sql_Params, function(qerr, vals, fields){
                if (qerr) {
                    // On error `vals` is null, so it must not be dereferenced.
                    console.log(qerr);
                    return;
                }
                if (vals.length > 0) {
                    console.log('URL duplicate!');
                } else {
                    newsget(myurl);
                }
            });
        });
    });
}
获取信息
小心:'\r'是回车(carriage return),使光标回到行首;
'\n'是换行(line feed),使光标下移一行。
Unix系统里,每行结尾只有“<换行>”,即“\n”;
Windows系统里面,每行结尾是“<回车><换行>”,即“\r\n”;
Mac系统里,每行结尾是“<回车>”,即“\r”。
/**
 * Download one article page, extract its fields with the configured
 * *_format expressions and insert a row into `fetches`.
 *
 * Pages whose keywords expression yields nothing (e.g. non-existent pages)
 * are not stored. A failed request is logged and skipped.
 *
 * @param {string} myurl - Absolute URL of the article page.
 */
function newsget(myurl){
    myrequest(myurl, function(err, res, body){
        // A failed or timed-out request delivers no Buffer, and iconv.decode
        // throws when its first argument is not one.
        if (err || !Buffer.isBuffer(body)) {
            console.log('新闻页面请求失败:' + myurl + ' ' + (err || '响应体为空'));
            return;
        }
        var html_news = iconv.decode(body, myEncoding);
        var $ = cheerio.load(html_news, {decodeEntities: true});
        console.log("转码读取成功:" + myurl);

        // Record being assembled. The *_format strings are developer-written
        // configuration and are eval()'d here so that they can see `$`.
        var fetch = {};
        fetch.title = "";
        fetch.content = "";
        fetch.publish_date = "";
        fetch.url = myurl;
        fetch.source_name = source_name;
        fetch.source_encoding = myEncoding;
        fetch.crawltime = new Date(); // time of this crawl

        if (keywords_format === "") fetch.keywords = source_name;
        else fetch.keywords = eval(keywords_format);

        if (title_format !== "") fetch.title = eval(title_format);

        if (date_format !== "") fetch.publish_date = eval(date_format);

        if (author_format === "") fetch.author = fetch.source_name;
        else fetch.author = eval(author_format);

        // Strip the author line from the body text. Only "\n" + author is
        // removed, so pages using other line endings keep that line.
        if (content_format !== "") fetch.content = eval(content_format).replace("\n" + fetch.author, "");

        // An empty or missing source expression falls back to the site name,
        // in the same way as the other optional expressions.
        if (source_format == null || source_format === "") fetch.source = fetch.source_name;
        else fetch.source = eval(source_format);
        // replace() must not be called on null/undefined.
        if (fetch.source != null) fetch.source = fetch.source.replace("\n", "");
        else fetch.source = fetch.source_name;

        if (desc_format === "") fetch.desc = fetch.title;
        else fetch.desc = eval(desc_format);
        // Real pages may lack a description, in which case the title is used.
        if (fetch.desc != null) fetch.desc = fetch.desc.replace("\n", "");
        else fetch.desc = fetch.title;

        // Non-existent pages have no keywords; do not store them.
        if (fetch.keywords == null) return;

        var fetchAddSql = 'INSERT INTO fetches(url,source_name,source_encoding,title,' +
            'keywords,author,publish_date,crawltime,content) VALUES(?,?,?,?,?,?,?,?,?)';
        var fetchAddSql_Params = [fetch.url, fetch.source_name, fetch.source_encoding,
            fetch.title, fetch.keywords, fetch.author, fetch.publish_date,
            fetch.crawltime.toFormat("YYYY-MM-DD HH24:MI:SS"), fetch.content];
        mysql.query(fetchAddSql, fetchAddSql_Params, function(qerr, vals, fields) {
            if (qerr) console.log(qerr);
        });
    });
}
对其他网站的尝试
我们只需修改以下部分:
// Site profile for cs.com.cn.
// Each *_format value is the text of a jQuery-style expression; the crawler
// passes it to eval() with cheerio's `$` in scope to extract one field.
var source_name = '中证网';
var rooturl = 'http://www.cs.com.cn/';
// This site is served as gbk; decoding it as utf-8 garbles the text.
var myEncoding = 'gbk';
var seedurl_format = `$('a')`;
// This site capitalises the meta names: "Keywords" and "Description".
var keywords_format = ` $('meta[name="Keywords"]').eq(0).attr("content")`;
var title_format = `$('title').text()`;
var desc_format = ` $('meta[name="Description"]').eq(0).attr("content")`;
var date_format = `$('.info').find('p').eq(0).next().find('em').eq(0).text()`;
var author_format = `$('.info').find('p').eq(0).text()`;
var source_format = `$('.info').find('p').eq(0).next().find('em').eq(0).next().text()`;
var content_format = `$('.article','div').text()`;
// Article pages: /<6 digits>/t<8 digits>_<7 digits>, any one character, then "html".
var url_reg = /\/(\d{6})\/t(\d{8})_(\d{7}).html/;
除此之外不需要其他修改
// Site profile for xueqiu.com.
// Each *_format value is the text of a jQuery-style expression; the crawler
// passes it to eval() with cheerio's `$` in scope to extract one field.
var source_name = '雪球网';
var rooturl = 'http://xueqiu.com/';
var myEncoding = 'utf-8';
var seedurl_format = `$('a')`;
var keywords_format = ` $('meta[name="keywords"]').eq(0).attr("content")`;
var title_format = `$('.article__bd__title').text()`;
var desc_format = ` $('meta[name="description"]').eq(0).attr("content")`;
var date_format = `$('.edit-time','a').text()`;
var author_format = `$('.article__bd__detail','div').find('p').eq(0).text()`;
var source_format = `$('.article__bd__from').find('a').eq(0).text()`;
var content_format = `$('.article__bd__detail').text()`;
// Article pages: 10 digits, a slash, then 9 digits.
var url_reg = /(\d{10})\/(\d{9})/;
实际操作中发现存入fetches的url是不可访问的,
e.g.http://xueqiu.com//8975058799/146518100
正确的是http://xueqiu.com/8975058799/146518100
方法一(不可行):
var rooturl='http://xueqiu.com';
出现'buf'错误;
方法二:在rootget try语句中加入:
href=$(e).attr("href");
temp=href;
href=temp.substr(1,href.length);
又发现雪球上有友站的网页,html结构不同,爬取的content为"",且它的不存在页面的keywords为'404_雪球',我们可以将其排除.
只需改为
if(fetch.keywords!='404_雪球'&&fetch.content!="")
原为
if(fetch.keywords!=undefined)
同时发现雪球网的发布时间是"修改于XXXX时间前",没有什么意义,将publish_date设为空.
Linux服务器后台运行代码
附注
接下来应该是建立网站,有时间会修改下爬虫代码.
更新部分
发现雪球网中有部分无标题的记录,然而在后面的文章里是根据标题(title)搜索,这个需要修改下,避免无法使用的record占用空间.
我们先任意挑几个无title的record,进入它的页面观察内容
此处选择三个链接为例子:
https://xueqiu.com/3015848521/147228841
https://xueqiu.com/1325053376/147231489
https://xueqiu.com/7076673824/147233528
发现均为内容较短的讨论贴(或者是广告)
没有爬取的意义
所以我打算删除这部分内容
进入mysql-front,点击title下空值的一栏,右击点击过滤>title='',这时候就集合了所有title为空的record,点开左上角编辑栏,全选删除即可
成功删除
然后在雪球网爬虫代码中
if(fetch.keywords!=undefined)
改为
if(fetch.keywords!=undefined&&fetch.title!='')
解决问题!
2020-6-18:
重新更改代码:
var fs=require('fs'); //文件操作
var request=require('request');
var cheerio=require('cheerio');
var iconv=require('iconv-lite');
var mysql=require('./mysql.js'); //Mysql使用
var schedule=require('node-schedule'); //定时操作
var async=require('async'); //控制开发数,避免屏蔽
require('date-utils');
// Site profile for the Eastmoney portal.
// Each *_format value is the text of a jQuery-style expression; the crawler
// passes it to eval() with cheerio's `$` in scope to extract one field.
var source_name = '东方财富网';
var rooturl = 'https://www.eastmoney.com/';
var myEncoding = 'utf-8';
var seedurl_format = `$('a')`;
var keywords_format = ` $('meta[name="keywords"]').eq(0).attr("content")`;
var title_format = `$('title').text()`;
var desc_format = ` $('meta[name="description"]').eq(0).attr("content")`;
var date_format = `$('.time').text()`;
var author_format = `$('.author').text()`;
// NOTE(review): '.source data-source' selects a <data-source> element inside
// .source; an attribute selector may have been intended — confirm.
var source_format = `$('.source data-source').eq(0).attr("data-source")`;
var content_format = `$('#ContentBody').text()`;
// Article pages: a slash, 18 digits, any one character, then "html".
var url_reg = /\/(\d{18}).html/;
// Finds a date inside free text: year, month and day joined by one repeated
// separator (-, / or .), or written with the 年/月/日 suffixes.
var regExp = /((\d{4}|\d{2})(\-|\/|\.)\d{1,2}\3\d{1,2})|(\d{4}年\d{1,2}月\d{1,2}日)/;
// Recurrence for the crawler: six runs a day, at hours 0, 4, 8, 12, 16 and 20,
// each at minute 30.
var times = [0, 4, 8, 12, 16, 20]; // hours of the day
var times2 = 30;                   // minute within each of those hours
var rule = new schedule.RecurrenceRule();
rule.hour = times;
rule.minute = times2;
// Every time the rule fires, start a crawl of the seed page.
schedule.scheduleJob(rule, function runScheduledCrawl() {
    rootget();
});
/**
 * Fetch a URL through the `request` library using the crawler's standard options.
 *
 * @param {string} starturl - Address to download.
 * @param {Function} callback - Passed straight to `request`; receives (err, res, body).
 */
function myrequest(starturl, callback) {
    // Browser-like User-Agent sent with every request.
    var browserHeaders = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.93 Safari/537.36"
    };
    request({
        url: starturl,
        // null keeps the body undecoded; callers decode it with iconv.
        encoding: null,
        headers: browserHeaders,
        // NOTE(review): 1000 here versus 10000 in the earlier listing — confirm which is intended.
        timeout: 1000
    }, callback);
}
/**
 * Crawl the seed page (`rooturl`), collect every link whose URL matches
 * `url_reg`, and pass each URL that is not yet stored in `fetches` to `newsget`.
 *
 * Failures (request error, empty body, bad seed expression, database error)
 * are logged and end the affected step instead of throwing.
 */
function rootget(){
    myrequest(rooturl, function(err, res, body){
        // A failed or timed-out request delivers no Buffer, and iconv.decode
        // throws when its first argument is not one.
        if (err || !Buffer.isBuffer(body)) {
            console.log('种子页面请求失败:' + (err || '响应体为空'));
            return;
        }
        // Convert the raw bytes to text using the site's encoding.
        var html = iconv.decode(body, myEncoding);
        var $ = cheerio.load(html, {
            decodeEntities: true
        });
        var seedurl_news;
        try {
            // seedurl_format is developer-written configuration, not page data;
            // it is evaluated here so that it can see `$` (and `html`).
            seedurl_news = eval(seedurl_format);
        } catch (e) {
            console.log('url列表所处的html块识别出错:' + e);
            return;
        }
        // Nothing to iterate when the expression produced no selection.
        if (!seedurl_news || typeof seedurl_news.each !== 'function') return;

        seedurl_news.each(function(i, element){
            var myurl = "";
            try {
                // Work out the absolute URL of this link.
                var href = $(element).attr("href");
                if (href == null) return;
                var lowerHref = href.toLowerCase();
                if (lowerHref.startsWith('http://') || lowerHref.startsWith('https://')) {
                    // Already absolute (https links used to be treated as relative).
                    myurl = href;
                } else if (lowerHref.startsWith('//')) {
                    myurl = 'http:' + href;
                } else {
                    myurl = rooturl.substr(0, rooturl.lastIndexOf('/') + 1) + href;
                }
            } catch (linkErr) {
                console.log('识别种子页面中的新闻链接出错:' + linkErr);
            }
            // Keep only links that look like the wanted article pages.
            if (!url_reg.test(myurl)) return;

            // Look the URL up in `fetches`; only unknown URLs are crawled.
            var fetch_url_Sql = 'select url from fetches where url=?';
            var fetch_url_Sql_Params = [myurl];
            mysql.query(fetch_url_Sql, fetch_url_Sql_Params, function(qerr, vals, fields){
                if (qerr) {
                    // On error `vals` is null, so it must not be dereferenced.
                    console.log(qerr);
                    return;
                }
                if (vals.length > 0) {
                    console.log('URL duplicate!');
                } else {
                    newsget(myurl);
                }
            });
        });
    });
}
/**
 * Download one article page, extract its fields with the configured
 * *_format expressions, normalise the publish date to YYYY-MM-DD and
 * insert a row into `fetches`.
 *
 * A failed request is logged and skipped. A page whose date text contains
 * no recognisable date is stored with an empty publish_date.
 *
 * @param {string} myurl - Absolute URL of the article page.
 */
function newsget(myurl){
    // Reduce the first date found in rawText to 'YYYY-MM-DD'. Returns ""
    // when regExp finds no date. The string is built from the matched digits
    // directly, so the result does not depend on Date parsing or the timezone.
    function toDateString(rawText){
        var found = regExp.exec(String(rawText));
        if (found === null) return "";
        // found[0] looks like 2020-6-18, 2020/6/18, 2020.6.18 or 2020年6月18日.
        var parts = found[0].split(/\D+/).filter(function(piece){ return piece !== ""; });
        // regExp also accepts two-digit years; they are read as 20YY here.
        var year = parts[0].length === 2 ? '20' + parts[0] : parts[0];
        return year + '-' + ('0' + parts[1]).slice(-2) + '-' + ('0' + parts[2]).slice(-2);
    }

    myrequest(myurl, function(err, res, body){
        // iconv.decode requires a Buffer as its first argument; a failed
        // request delivers none.
        if (err || !Buffer.isBuffer(body)) {
            console.log('新闻页面请求失败:' + myurl + ' ' + (err || '响应体为空'));
            return;
        }
        var html_news = iconv.decode(body, myEncoding);
        var $ = cheerio.load(html_news, {decodeEntities: true});
        console.log("转码读取成功:" + myurl);

        // Record being assembled. The *_format strings are developer-written
        // configuration and are eval()'d here so that they can see `$`.
        var fetch = {};
        fetch.title = "";
        fetch.content = "";
        fetch.publish_date = "";
        fetch.url = myurl;
        fetch.source_name = source_name;
        fetch.source_encoding = myEncoding;
        fetch.crawltime = new Date(); // time of this crawl

        if (keywords_format === "") fetch.keywords = source_name;
        else fetch.keywords = eval(keywords_format);
        console.log("keywords" + fetch.keywords);

        if (title_format !== "") fetch.title = eval(title_format);
        console.log("title" + fetch.title);

        if (date_format !== "") fetch.publish_date = eval(date_format);
        console.log('date: ' + fetch.publish_date);
        if (fetch.keywords != null && fetch.publish_date !== "") {
            fetch.publish_date = toDateString(fetch.publish_date);
        }

        if (author_format === "") fetch.author = fetch.source_name;
        else fetch.author = eval(author_format);
        console.log("author" + fetch.author);

        // Strip the author line from the body text. Only "\n" + author is
        // removed, so pages using other line endings keep that line.
        if (content_format !== "") fetch.content = eval(content_format).replace("\n" + fetch.author, "");
        console.log("content" + fetch.content);

        // An empty or missing source expression falls back to the site name,
        // in the same way as the other optional expressions.
        if (source_format == null || source_format === "") fetch.source = fetch.source_name;
        else fetch.source = eval(source_format);
        // replace() must not be called on null/undefined.
        if (fetch.source != null) fetch.source = fetch.source.replace("\n", "");
        else fetch.source = fetch.source_name;
        console.log("source" + fetch.source);

        if (desc_format === "") fetch.desc = fetch.title;
        else fetch.desc = eval(desc_format);
        if (fetch.desc != null) fetch.desc = fetch.desc.replace("\n", "");
        else fetch.desc = fetch.title;
        console.log("desc" + fetch.desc);

        var fetchAddSql = 'INSERT INTO fetches(url,source_name,source_encoding,title,' +
            'keywords,author,publish_date,crawltime,content) VALUES(?,?,?,?,?,?,?,?,?)';
        var fetchAddSql_Params = [fetch.url, fetch.source_name, fetch.source_encoding,
            fetch.title, fetch.keywords, fetch.author, fetch.publish_date,
            fetch.crawltime.toFormat("YYYY-MM-DD HH24:MI:SS"), fetch.content];
        mysql.query(fetchAddSql, fetchAddSql_Params, function(qerr, vals, fields) {
            if (qerr) console.log(qerr);
        });
    });
}
关键点:
1.function newsget(myurl){
myrequest(myurl,function(err,res,body){ //发送请求
if(Buffer.isBuffer(body)){
加上这个,避免这种错误:The “buf” argument must be an instance of Buffer, TypedArray, or DataView.
原因:iconv的decode的第一个参数必须为buffer。
2.if(fetch.keywords!=undefined && fetch.publish_date!=""){
fetch.publish_date = regExp.exec(fetch.publish_date)[0];
fetch.publish_date = fetch.publish_date.replace('年', '-')
fetch.publish_date = fetch.publish_date.replace('月', '-')
fetch.publish_date = fetch.publish_date.replace('日', '')
fetch.publish_date = new Date(fetch.publish_date).toFormat("YYYY-MM-DD");
}
这是为了第2个作业而更改。