配置文件说明:
/root/Spider/nodejs-server-server的project.json
{
"open_allbaidu_hot": 1, //是否爬取所有百度热点信息
"open_allwangyi_hot":1,//是否爬取所有网易热点信息
"open_alljinritoujiao_hot":1,//是否爬取今日头条热点信息
"open_allweibo_hot":1,//是否爬取微博热点信息
"open_allxinlang_hot":1,//是否爬取新浪热点信息
"open_allbaidutieba_hot":1,//是否爬取百度贴吧热点信息
"open_baidu_hot_people":1,//是否爬取百度热点明星
"open_weibo_hot_people":1,//是否爬取微博热点明星
"timeInterval":1,//设置爬虫时间间隔,单位为小时
"db_user":"root",//数据库用户名
"password":"233233",//密码
"database":"nodejs"//数据库名字
}
主类别表:
mid int 类别id,主键
mclassname varchar(20) 主类名
次类别表:
Cid int 类别id,主键
Cclassname varchar(20) 次类名
成员表:
Id int 成员id,主键
Name varchar(20) 名字
mClass int 所属主类名id,外键
CClass int 所属次类名id,外键
Source varchar(50) 来源网站
Description varchar(2000) 信息描述
Hot varchar(5) 热度指数
Time varchar(20) 获取时间
登录 MySQL 查看结果:
重点内容
// Scrape one Baidu "people" ranking board (bmsites[mxn]), insert every entry
// into the MySQL `Member` table and cache the resulting list as JSON via
// client.set().
//
// NOTE(review): `mxn`, `http`, `BufferHelper`, `iconv`, `mysql`, `project`,
// `client` and `redis` are not defined in this excerpt; they must be supplied
// by the surrounding code. `client` is presumably a Redis client — confirm.

// Board/category names. Not referenced in this excerpt; presumably
// index-aligned with bmsites (both lists have 19 entries) — confirm.
var sources = [
    "热点人物", "娱乐名人", "女演员", "男演员", "演员", "女歌手", "男歌手",
    "歌手", "名家人物", "主持人", "体坛人物", "美女", "帅哥", "选秀歌手",
    "欧美明星", "财经人物", "互联网人物", "历史人物", "公益明星"
];

// One ranking-board URL per category.
var bmsites = [
    "http://top.baidu.com/buzz?b=258&c=9&fr=topcategory_c9",
    "http://top.baidu.com/buzz?b=618&c=9&fr=topbuzz_b258_c9",
    "http://top.baidu.com/buzz?b=18&c=9&fr=topbuzz_b18_c9",
    "http://top.baidu.com/buzz?b=17&c=9&fr=topbuzz_b18_c9",
    "http://top.baidu.com/buzz?b=1395&c=9&fr=topbuzz_b17_c9",
    "http://top.baidu.com/buzz?b=16&c=9&fr=topbuzz_b1395_c9",
    "http://top.baidu.com/buzz?b=15&c=9&fr=topbuzz_b16_c9",
    "http://top.baidu.com/buzz?b=1396&c=9&fr=topbuzz_b15_c9",
    "http://top.baidu.com/buzz?b=260&c=9&fr=topbuzz_b1396_c9",
    "http://top.baidu.com/buzz?b=454&c=9&fr=topbuzz_b260_c9",
    "http://top.baidu.com/buzz?b=255&c=9&fr=topbuzz_b454_c9",
    "http://top.baidu.com/buzz?b=3&c=9&fr=topbuzz_b255_c9",
    "http://top.baidu.com/buzz?b=22&c=9&fr=topbuzz_b3_c9",
    "http://top.baidu.com/buzz?b=493&c=9&fr=topbuzz_b22_c9",
    "http://top.baidu.com/buzz?b=491&c=9&fr=topbuzz_b493_c9",
    "http://top.baidu.com/buzz?b=261&c=9&fr=topbuzz_b491_c9",
    "http://top.baidu.com/buzz?b=257&c=9&fr=topbuzz_b261_c9",
    "http://top.baidu.com/buzz?b=259&c=9&fr=topbuzz_b257_c9",
    "http://top.baidu.com/buzz?b=612&c=9&fr=topbuzz_b259_c9"
];

// Entries collected from the page: { name, hot }.
var bmingxingentity = [];
var examples = {};
var mxurl = bmsites[mxn];
console.log(bmsites.length);

var req = http.get(mxurl, function (res) {
    var buffer = new BufferHelper();
    res.on('data', function (data) {
        buffer.concat(data);
    }).on('end', function () {
        // The response body is decoded as GBK before matching.
        var str = iconv.decode(buffer.toBuffer(), 'gbk');
        try {
            // Names and heat values are matched independently and paired by
            // index. The match results get their own names so they no longer
            // shadow the HTTP response `res`.
            var nameMatches = str.match(/href_top.*>.*\</gm);
            var hotMatches = str.match(/span class=\"icon\-.*>.*\</gm);
            if (nameMatches === null || hotMatches === null) {
                // Previously this case failed with a swallowed TypeError and
                // wrote nothing; keep "write nothing" but say so.
                console.log('No ranking entries found at ' + mxurl);
                return;
            }
            // Only pair up as many entries as both lists provide.
            var count = Math.min(nameMatches.length, hotMatches.length);

            var CID = mxn + 1;
            var now = new Date();
            // Date#getMonth() is zero-based, so add 1 to get the calendar month.
            var time = now.getFullYear() + "-" + (now.getMonth() + 1) + "-" + now.getDate() +
                " " + now.getHours() + ":" + now.getMinutes();

            var conn = mysql.createConnection({
                host: 'localhost',
                user: project.db_user,
                password: project.password,
                database: project.database,
                port: 3306
            });
            console.log("begin to connect");
            conn.connect();
            try {
                for (var i = 0; i < count; i++) {
                    // Take the ">...<" part of each match and strip one "<"
                    // and one ">" delimiter.
                    var name = nameMatches[i].match(/\>.*\</gm)[0].replace("<", "").replace(">", "");
                    var hot = hotMatches[i].match(/\>.*\</gm)[0].replace("<", "").replace(">", "");

                    // Scraped text is untrusted: pass it through placeholders
                    // instead of concatenating it into the SQL string.
                    conn.query(
                        "insert into Member (name,hot,mid,source,cid,time) values (?,?,?,?,?,?)",
                        [name, parseFloat(hot), 1, "baidu", CID, time],
                        function (err) {
                            if (err) {
                                console.log('ClientConnectionReady Error: ' + err.message);
                            }
                        }
                    );
                    bmingxingentity.push({
                        'name': name,
                        'hot': parseInt(hot, 10)
                    });
                }
            } finally {
                // Always release the connection, even if an entry fails to parse.
                conn.end();
            }

            examples['application/json'] = bmingxingentity;
            console.log(10000);
            var data = JSON.stringify(bmingxingentity, null, 2);
            client.set('Hot4' + (mxn + 10000), data, redis.print);
        } catch (e) {
            // Best-effort scrape: report the failure instead of hiding it.
            console.log('Failed to process ' + mxurl + ': ' + e.message);
        }
    }).on('close', function () {
        console.log('Close recevied!');
    });
});
req.on('error', function (error) {
    console.log('Request failed for ' + mxurl + ': ' + error.message);
});
}