// Node.js web crawler demo.
// Install the required package dependencies yourself!
// Crawling approaches:
// Method 1: request data directly with axios, if the site provides an API.
// Method 2: fetch the page with `request`, then parse the page markup.
// Method 3: load the page fully with puppeteer, then analyze the rendered DOM.
const axios = require("axios");
const request = require('request')
const url = 'https://juejin.cn/frontend';
let cheerio = require('cheerio');
/**
 * Method 1: hit Juejin's recommendation API directly via axios.
 * Logs the HTTP status and response payload on success; logs (does not
 * rethrow) network/HTTP errors. Returns a Promise that always resolves.
 */
async function axiosFun() {
  const url = 'https://api.juejin.cn/user_api/v1/author/recommend?aid=2608&uuid=6937585905940088351&category_id=&cursor=0&limit=20';
  try {
    const res = await axios.get(url);
    console.log('res', res.status, res.data);
  } catch (error) {
    console.error('err', error);
  }
}
// axiosFun()
/**
 * Method 2: fetch the listing page with `request`, parse every anchor with
 * cheerio, and write the collected { name, href } pairs to images.md as JSON.
 * On a network error nothing is parsed or written (previously `body` was
 * `undefined` on error and cheerio.load would throw).
 */
function requestFun() {
  request(url, (err, res, body) => {
    console.log('err', err);
    if (err || !body) {
      // Bail out early: without a response body there is nothing to parse.
      return;
    }
    console.log('body', body);
    const $ = cheerio.load(body, { ignoreWhitespace: true });
    console.log('object', $('title').text());
    const anchors = $('body').find('a');
    console.log('arr', $(anchors[0]).attr('href'), $(anchors[0]).text());
    // Keep only anchors that have visible text; record text + target URL.
    const rel = [];
    anchors.each((i, el) => {
      const text = $(el).text();
      if (text) {
        rel.push({
          name: text,
          href: $(el).attr('href'),
        });
      }
    });
    console.log('rel', rel.length);
    const fs = require('fs');
    fs.writeFileSync('images.md', JSON.stringify(rel, null, 2));
  });
}
// requestFun()
/**
 * Method 3: render the page with puppeteer (handles client-side rendering),
 * scrape title/href from every `a.title` anchor, append a timestamp marker
 * entry, and write the result to webUrl.json.
 * The browser is always closed via try/finally, even if scraping fails
 * (previously a thrown error leaked the browser process).
 */
async function puppeteerFun() {
  const puppeteer = require('puppeteer');
  const browser = await puppeteer.launch({ headless: false });
  try {
    const page = await browser.newPage();
    // networkidle2: wait until network activity has (nearly) stopped so the
    // client-rendered article list is present in the DOM before scraping.
    await page.goto(url, {
      waitUntil: 'networkidle2',
    });
    // Callback runs inside the page context; it must return plain JSON data.
    const arr = await page.$$eval('a.title', (elements) =>
      elements.map((item) => ({
        title: item.innerHTML,
        href: item.href,
      }))
    );
    console.log('arr', arr);
    // Trailing marker entry recording when this scrape ran.
    arr.push({
      name: '日期',
      date: new Date().toLocaleString(),
    });
    const fs = require('fs');
    fs.writeFileSync('webUrl.json', JSON.stringify(arr, null, 2));
  } finally {
    await browser.close();
  }
}
// Top-level entry point: attach a rejection handler so the promise never floats.
puppeteerFun().catch((err) => console.error('puppeteer crawl failed', err));