var https = require('https');
var iconv = require('iconv-lite');
var cheerio = require('cheerio');
var mysql = require('mysql');
// Not referenced by any other statement in the visible source.
var text_url = 'https://www.zhihu.com/question/36260262/answer/67686600';
// Origin prepended to every Baike path this script requests: the search URL,
// the final redirect target, and the related-entry links found on a page.
var baseurl = 'https://baike.baidu.com';
// Create the MySQL connection shared by the SELECT and INSERT queries in req_text.
// NOTE(review): `useConnectionPooling` does not appear among the connection options
// documented by the `mysql` package — confirm whether it has any effect here.
var connection = mysql.createConnection({
host : 'localhost',
user : 'root',
password : '12345678',
database : 'testmysql',
useConnectionPooling: true
});
// Request options passed to every https.get call in this file: a GET with a
// desktop Chrome User-Agent header. The proxy and cookie entries are disabled.
var options = {
// proxy:'1.197.16.3',
method: 'GET',
headers: {
"User-Agent": 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36',
// "cookie": '__cfduid=dd8936888a6f1979387fac18b0e7cb1781563936793; PHPSESSID=g66j025p81urea4b35ku32alv4; Hm_lvt_2d527f7b5c8cdde7b45e368a84b53fe8=1564483203,1564484204,1564484208,1564484213; Hm_lpvt_2d527f7b5c8cdde7b45e368a84b53fe8=1565190149'
}
};
/**
 * Starts one HTTPS GET per URL and scrapes an entry summary from each page.
 *
 * Fetching is deliberately best-effort: a request that cannot be started,
 * errors, stalls, or returns a page that cannot be parsed yields an entry
 * whose four fields are all empty strings, so `Promise.all` over the
 * returned promises never rejects because of a single bad page.
 *
 * @param {string[]} urls - Absolute https URLs of the pages to fetch.
 * @param {number} [timeoutMs=5000] - Socket inactivity limit per request, in
 *   milliseconds; a request that stays idle this long is abandoned.
 * @returns {Promise<Object>[]} One promise per URL, in input order. Each
 *   resolves to `{name, class, image, summary}` and none of them rejects.
 */
function batchFetch(urls, timeoutMs = 5000) {
  // A fresh placeholder per failure, with the same keys as a successful entry.
  const emptyEntry = () => ({'name': '', 'class': '', 'image': '', 'summary': ''});

  return urls.map((url) => new Promise((resolve) => {
    // Resolving twice is a no-op, so every failure path can call this freely.
    const giveUp = () => resolve(emptyEntry());

    try {
      const req = https.get(url, options, (res) => {
        const chunks = [];
        res.on('data', (chunk) => {
          chunks.push(chunk);
        });
        // A connection dropped mid-body never emits 'end'; settle here instead.
        res.on('error', giveUp);
        res.on('end', () => {
          try {
            const html = iconv.decode(Buffer.concat(chunks), 'utf-8');
            const $ = cheerio.load(html);
            const title = $('.lemmaWgt-lemmaTitle-title');
            resolve({
              'name': title.find('h1').text(),
              'class': title.find('h2').text(),
              // attr() yields undefined when the page has no summary picture.
              'image': $('.side-content .summary-pic a').find('img').attr('src') || '',
              'summary': $('.lemma-summary').text()
            });
          } catch (e) {
            giveUp();
          }
        });
      });
      req.on('error', giveUp);
      req.setTimeout(timeoutMs, () => {
        giveUp();
        req.destroy();
      });
    } catch (e) {
      // https.get throws synchronously for an unparsable or non-https URL.
      giveUp();
    }
  }));
}
/**
 * Resolves a search URL to its entry page and scrapes that page plus the
 * related entries it links to.
 *
 * The lookup takes three sequential GETs: the search URL, the
 * protocol-relative `Location` it answers with, and then `baseurl` plus the
 * `Location` of that second response. The third response is parsed for the
 * entry itself, and every `.cmn-clearfix .item` link on it is fetched through
 * `batchFetch`.
 *
 * @param {string} url - Search URL to start from.
 * @param {number} [timeoutMs=5000] - Overall time limit for the whole lookup,
 *   in milliseconds.
 * @returns {Promise<Object[]>} Resolves to an array of
 *   `{name, class, image, summary}` objects, the looked-up entry first.
 *   Rejects with `0` on any request error, on a response without a usable
 *   `Location` header, when the first redirect is an absolute URL, or when
 *   the time limit expires.
 */
function reqinfo(url, timeoutMs = 5000) {
  return new Promise(function (resolve, reject) {
    let settled = false;
    let watchdog = null;
    let activeReq = null;

    // Settles the promise at most once and always stops the watchdog, so a
    // finished lookup leaves no timer keeping the process alive.
    function finish(settle, value) {
      if (settled) {
        return;
      }
      settled = true;
      clearTimeout(watchdog);
      settle(value);
    }

    function fail() {
      finish(reject, 0);
    }

    // Issues one GET; any error on the request or response fails the lookup.
    function get(target, onResponse) {
      if (settled) {
        return;
      }
      try {
        activeReq = https.get(target, options, (res) => {
          res.on('error', fail);
          try {
            onResponse(res);
          } catch (e) {
            res.resume();
            fail();
          }
        });
        activeReq.on('error', fail);
      } catch (e) {
        // https.get throws synchronously for an unparsable or non-https URL.
        fail();
      }
    }

    // Returns the Location header of a redirect hop, or null when it is absent.
    function takeLocation(res) {
      // The body of a redirect hop is not needed; drain it to free the socket.
      res.resume();
      const location = res.headers.location;
      return typeof location === 'string' ? location : null;
    }

    // Parses the entry page and resolves with it plus its related entries.
    function scrapeEntryPage(html) {
      const $ = cheerio.load(html);
      const title = $('.lemmaWgt-lemmaTitle-title');
      const result = [{
        'name': title.find('h1').text(),
        'class': title.find('h2').text(),
        // attr() yields undefined when the page has no summary picture.
        'image': $('.side-content .summary-pic a').find('img').attr('src') || '',
        'summary': $('.lemma-summary').text()
      }];

      const urls = [];
      $('.cmn-clearfix .item').each((index, element) => {
        const href = $(element).find('a').attr('href');
        if (href != null) {
          urls.push(baseurl + href);
        }
      });

      if (urls.length === 0) {
        finish(resolve, result);
        return;
      }
      Promise.all(batchFetch(urls)).then(function (related) {
        finish(resolve, result.concat(related));
      }, function () {
        // Related entries are optional; keep the main entry if they fail.
        finish(resolve, result);
      });
    }

    watchdog = setTimeout(() => {
      fail();
      if (activeReq) {
        activeReq.destroy();
      }
    }, timeoutMs);

    get(url, (searchRes) => {
      const firstHop = takeLocation(searchRes);
      // Only a protocol-relative redirect is followed; an absolute one is a failure.
      if (firstHop === null || firstHop.startsWith('http')) {
        fail();
        return;
      }
      get('https:' + firstHop, (redirectRes) => {
        const secondHop = takeLocation(redirectRes);
        if (secondHop === null) {
          fail();
          return;
        }
        get(baseurl + secondHop, (pageRes) => {
          const chunks = [];
          pageRes.on('data', (chunk) => {
            chunks.push(chunk);
          });
          pageRes.on('end', () => {
            try {
              scrapeEntryPage(iconv.decode(Buffer.concat(chunks), 'utf-8'));
            } catch (e) {
              fail();
            }
          });
        });
      });
    });
  });
}
// Not referenced by any other statement in the visible source.
var errornum = 0;
// Running value written to the `Id` column; incremented once per queued INSERT.
var Bindex = 0;

/**
 * Processes the `zhihuNum` row at the given offset, then moves on to the next.
 *
 * The row's `name` is looked up through `reqinfo`, and every entry returned
 * is inserted into `zhihuFinish`. Processing continues with `offindex + 1`
 * whether or not the lookup succeeded. The connection is closed when the
 * SELECT fails or returns no row.
 *
 * @param {number} offindex - Zero-based row offset into `zhihuNum`.
 * @returns {void}
 */
function req_text(offindex) {
  console.log('index', offindex);

  // Runs the next row, pausing 10 s first whenever its index is a multiple of
  // `pauseEvery`.
  function continueFrom(nextIndex, pauseEvery) {
    if (nextIndex % pauseEvery === 0) {
      setTimeout(function () {
        req_text(nextIndex);
      }, 10000);
    } else {
      req_text(nextIndex);
    }
  }

  // The offset is bound as a placeholder instead of being spliced into the SQL text.
  var sql = 'SELECT * from zhihuNum limit ?,1';
  connection.query(sql, [Number(offindex)], function (err, rows) {
    if (err) {
      console.log('[SELECT ERROR] - ', err.message);
      connection.end();
      return;
    }
    if (rows.length === 0) {
      // Past the last row: nothing left to process.
      connection.end();
      return;
    }

    var row = rows[0];
    // Encode only the search term. encodeURI over the whole URL leaves
    // '#', '&', '+', '=' and '?' untouched, which corrupts names such as "C#".
    var url = baseurl + '/search/word?word=' + encodeURIComponent(row.name);

    reqinfo(url).then(function (data) {
      var class_num = data.length;
      var addSql = 'INSERT INTO zhihuFinish(Id, class_num, num, name, class, image, summary) VALUES(?,?,?,?,?,?,?)';
      for (var item of data) {
        var addSqlParams = [Bindex, class_num, row.num, item.name, item.class, item.image, item.summary];
        Bindex++;
        // Queries are queued asynchronously; a failed insert is logged and skipped.
        connection.query(addSql, addSqlParams, function (insertErr) {
          if (insertErr) {
            console.log('[INSERT ERROR] - ', insertErr.message);
          }
        });
      }
      continueFrom(offindex + 1, 10);
    }, function () {
      // NOTE(review): failures pause every 100 rows while successes pause
      // every 10 — kept as in the original; confirm this is intended.
      continueFrom(offindex + 1, 100);
    });
  });
}
// Script entry point: connect to MySQL, then start processing at row offset 0.
connection.connect();
req_text(0);
遇到问题比较多。
1、在百度百科中搜索中文时,url中会多出一个数字。我找了页面中的js文件,没找到这个数字是怎么得来的(我曾经爬过谷歌翻译,谷歌翻译页面中有一个js,就是根据输入字符串得到一个tk值)。
所以最后我发现可以在响应头中得到这一个数字。
除此之外,url中的中文经过url编码后得到的字串和最后请求中的url也是不一样的,最终的url依然是从响应头中获得。
所以,一共经过了三层get请求才得到最终所需要的值。
2、本系列的第二篇讲过使用promise将异步变为同步,但如果完全按照第二篇里讲过的做法,就会失去js异步的强大功能,所以这里可以使用promise的all方法。本篇文章中的batchFetch就是将传过去的url数组变为promise数组,然后使用promise.all方法异步执行。
3、经过百度查阅资料,js的请求限时是120s。我在运行该js文件过程中,经常遇到代码卡死,应该是请求过程中出错,导致一直等到120s结束才进行下一步。所以,我在promise中使用一个setTimeout,设置时间是5000ms,当5000ms过去后,执行回调函数调用reject,使promise进入rejected状态。