爬取 URL 的库:request.js
const urllib=require('url');
const pathlib=require('path');
const http=require('http');
const https=require('https');
const assert=require('assert');
/**
 * Perform a single HTTP/HTTPS GET request without following redirects.
 *
 * @param {string} url - Absolute URL; only `http:` and `https:` are supported.
 * @param {Object} [headers] - Request headers (for example a User-Agent that
 *   identifies the client).
 * @returns {Promise<{status: number, body: (Buffer|null), headers: Object}>}
 *   Resolves with `status: 200` and the complete body for any 2xx or 304
 *   response (the status is normalized so callers can test for one success
 *   value), or with the real status and `body: null` for a 301/302 redirect.
 *   Rejects with an Error carrying `status`, `body` and `headers` for any
 *   other status code, and with the underlying error when the request or the
 *   response stream fails.
 * @throws {Error} Synchronously, when the protocol is neither http nor https.
 */
function requestUrl(url, headers) {
  const urlObj = urllib.parse(url);

  // Pick the transport module matching the URL scheme.
  let httpMod;
  if (urlObj.protocol === 'http:') {
    httpMod = http;
  } else if (urlObj.protocol === 'https:') {
    httpMod = https;
  } else {
    throw new Error(`协议无法识别: ${urlObj.protocol}`);
  }

  return new Promise((resolve, reject) => {
    const req = httpMod.request({
      // Use hostname + port: the parsed `host` includes ":port", which must
      // not be handed to the resolver as if it were a host name.
      hostname: urlObj.hostname,
      port: urlObj.port,
      path: urlObj.path,
      headers,
    }, (res) => {
      const { statusCode } = res;
      const isSuccess = (statusCode >= 200 && statusCode < 300) || statusCode === 304;

      if (isSuccess) {
        const chunks = [];
        res.on('data', (chunk) => {
          chunks.push(chunk);
        });
        res.on('end', () => {
          resolve({
            status: 200,
            body: Buffer.concat(chunks),
            headers: res.headers,
          });
        });
        // A connection dropped mid-body must not leave the promise pending.
        res.on('error', reject);
        return;
      }

      // The body is not needed on the remaining paths; drain it so the
      // underlying socket is released.
      res.resume();

      if (statusCode === 301 || statusCode === 302) {
        resolve({
          status: statusCode,
          body: null,
          headers: res.headers,
        });
        return;
      }

      // Reject with a real Error (for a stack trace) that still exposes the
      // fields of the plain object that used to be the rejection value.
      const failure = new Error(`Request failed with status ${statusCode}: ${url}`);
      failure.status = statusCode;
      failure.body = null;
      failure.headers = res.headers;
      reject(failure);
    });

    // Network-level failures (DNS, refused connection, TLS, ...) must settle
    // the promise; merely logging them would leave callers waiting forever.
    req.on('error', reject);

    // No request body is sent (the default method is GET); start the request.
    req.end();
  });
}
module.exports=async (url, reqHeaders)=>{
try{
while(1){
let {status, body, headers}=await requestUrl(url, reqHeaders);
//console.log(status, url);
if(status==200){
return {body, headers};
}else{
assert(status==301 || status==302);
assert(headers.location);
url=headers.location;
}
}
}catch(e){
console.log(e);
}
};
调用这个库进行抓取:
const request=require('./libs/request');
const fs=require('fs');
// Demo: download one page with the request library and save it to disk.
(async () => {
  try {
    const result = await request('http://www.zhihu.com/');

    // Guard against an empty result so a failed fetch does not surface as a
    // destructuring TypeError.
    if (!result) {
      return;
    }

    fs.writeFile('tmp/zhihu.html', result.body, (err) => {
      // Report only real failures; on success `err` is null.
      if (err) {
        console.log(err);
      }
    });
  } catch (e) {
    console.log(e);
  }
})();