// Slightly improved over the earlier crawler; the changed parts are commented below.
// get/post requests are all asynchronous, so responses do not arrive in for-loop order.
var http = require('http');
var fs = require('fs');
var cheerio = require('cheerio');
// BUGFIX: `request` was declared twice (require, then re-assigned via
// defaults); merge into one declaration. The cookie jar keeps the login
// session alive across subsequent requests.
var request = require('request').defaults({jar: true});

var all = "";      // accumulated scrape results, flushed to disk at the end
var i = 14001;     // first user id (start page)
var num = 16000;   // last user id (end page)
// The scraped data is saved into a .txt file with fs.
// Log in first; the crawl only starts after a successful login response
// (the cookie jar configured on `request` keeps the session alive).
// NOTE(review): myemail and mypassword are not defined anywhere in this
// file — they must be declared elsewhere or this throws a ReferenceError
// at startup. TODO confirm.
request.post('http://hnust.hunbys.com/index.php/home/Public/mlogin.html', function (error, response, body) {
  if (!error && response.statusCode === 200) {
    console.log('登录成功!');
    start("http://hnust.hunbys.com/index.php/course/Index/userinfo/id/" + i + ".html");
  }
}).form({email: myemail, password: mypassword}); // login credentials
/**
 * Fetch one user-info page, scrape the profile fields into the global
 * `all` buffer, and immediately issue the request for the next page id.
 * Because request.get is asynchronous, responses may arrive out of order
 * relative to the page ids (this is why the id is captured in `a`).
 * @param {string} url - user-info page to fetch
 */
function start(url) {
  // Freeze the current page id in `a`; the shared counter `i` keeps
  // advancing while the async callback below is still pending.
  (function (a) {
    i++;
    request.get(url, function (error, response, body) {
      if (!error && response.statusCode === 200) {
        var $ = cheerio.load(body); // parse the HTML with cheerio
        var name = $(".pull-left h2").text().trim().substr(0, 4);
        var xueyuan = $(".zy_course_rl").children().eq(0).text().substr(17, 15).trim(); // college
        var three = $(".zy_course_rl").children().eq(1).text().trim();
        // BUGFIX: these three indexes were implicit globals; concurrent
        // callbacks could clobber each other's values. Declare them locally.
        var indexm = three.indexOf("专业:");
        var indexl = three.indexOf("|");
        var zhuanye = three.slice(indexm, indexl).trim(); // major
        var indexbj = three.indexOf("班级:");
        var banji = three.substr(indexbj, 10).trim(); // class (scraped but unused below)
        var sfz = $(".zy_course_rl").children().eq(2).text().trim().substr(3, 18); // id number
        var dh = $(".zy_course_rl").children().eq(3).text().trim(); // phone
        all += a + "姓名:" + name + " " + xueyuan + " " + zhuanye + " " + sfz + " " + dh + "\n";
        // BUGFIX: was `if (i=num)` — an assignment, always truthy, that also
        // reset the shared counter. By the time responses arrive, the
        // synchronous crawl loop below has already pushed `i` past `num`,
        // so each completed response re-saves the accumulated buffer.
        if (i >= num) {
          console.log(all);
          savedContent($, i, all);
        }
      }
    });
    // The next request is issued right away (not from inside the callback),
    // so all pages up to `num` are requested in one synchronous burst.
    var nextLink = "http://hnust.hunbys.com/index.php/course/Index/userinfo/id/" + i + ".html";
    if (i <= num) {
      start(nextLink);
    }
  })(i);
}
/**
 * Persist scraped text to ./test/<i>.txt.
 * @param {object} $ - cheerio instance (unused; kept for caller compatibility)
 * @param {number} i - page id, used to name the output file
 * @param {string} news_title - text content to write
 */
function savedContent($, i, news_title) {
  // BUGFIX: write the content that was passed in instead of reaching for
  // the global `all` (the caller passes `all`, so behavior is unchanged,
  // but the function no longer depends on a hidden global).
  fs.writeFile('./test/' + i + '.txt', news_title, 'utf-8', function (err) {
    if (err) {
      console.log(err);
    }
  });
}