puppeteer 爬虫实例(两种循环抓取列表信息的方法)

const puppeteer = require('puppeteer');

/**
 * Method one: collect the issue links with spread + Array.prototype.map.
 *
 * Runs inside the browser page (it is passed to page.evaluate), so it must
 * not reference any variable from the Node.js side.
 *
 * @returns {{url: string, title: string}[]} One entry per matched anchor.
 */
function collectWithMap() {
  // Spread the NodeList into a real array so that .map() is available.
  const anchors = [
    ...document.querySelectorAll('#bd > div.issues.container > ol > li> a'),
  ];
  return anchors.map((el) => ({
    url: el.href.trim(),
    title: el.innerText,
  }));
}

/**
 * Method two: collect the same kind of list with an explicit for...of loop.
 *
 * Runs inside the browser page (it is passed to page.evaluate), so it must
 * not reference any variable from the Node.js side.
 * NOTE(review): assumes every matched <li> contains both an <a> and a
 * `.date` element; a missing one makes this throw — confirm against the
 * page markup.
 *
 * @returns {{title: string, url: string, date: string}[]} One entry per <li>.
 */
function collectWithLoop() {
  const data = [];
  // Every list item of the issue list.
  const elements = document.querySelectorAll('.issue-list li');
  for (const element of elements) {
    const link = element.querySelector('a');
    const title = link.innerHTML;
    const url = link.href; // the anchor's resolved href
    const date = element.querySelector('.date').innerHTML;
    data.push({ title, url, date });
  }
  return data;
}

/**
 * Opens the page in a mobile-sized viewport, extracts the issue list and
 * prints it to stdout.
 */
async function main() {
  const browser = await puppeteer.launch({
    headless: false,
    devtools: true,
    timeout: 0, // launch timeout defaults to 30000 ms; 0 disables it
  });

  try {
    const page = await browser.newPage();

    // Configure the viewport before navigating; setViewport is asynchronous
    // and must be awaited so it cannot race with the page load.
    await page.setViewport({
      width: 360,
      height: 640,
      isMobile: true,
    });

    await page.goto('https://weekly.75team.com/', {
      waitUntil: 'networkidle2', // continue only once the network is idle
    });

    // Swap in collectWithLoop to try the second extraction method.
    const result = await page.evaluate(collectWithMap);
    console.log(result); // print the extracted array
  } finally {
    // Always release the browser, otherwise the Node process never exits.
    // Comment this out temporarily to keep the window open for debugging.
    await browser.close();
  }
}

main().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});

输出结果:
[ { url: 'https://weekly.75team.com/issue250.html',
    title: '奇舞周刊第250期' },
  { url: 'https://weekly.75team.com/issue249.html',
    title: '奇舞周刊第249期' },
  { url: 'https://weekly.75team.com/issue248.html',
    title: '奇舞周刊第248期' },
  { url: 'https://weekly.75team.com/issue247.html',
    title: '奇舞周刊第247期' },
  { url: 'https://weekly.75team.com/issue246.html',
    title: '奇舞周刊第246期' },
  { url: 'https://weekly.75team.com/issue245.html',
    title: '奇舞周刊第245期' },
  { url: 'https://weekly.75team.com/issue244.html',
    title: '奇舞周刊第244期' } ]

  • 0
    点赞
  • 5
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值