pyppeteer实战



二、使用步骤

1.引入库

代码如下(示例):

from pyppeteer import launch
import asyncio
import logging
from pyppeteer.errors import TimeoutError
from pyppeteer_stealth import stealth
import time
from fake_useragent import UserAgent
import json
from os import makedirs
from os.path import exists

# Timestamped INFO-level logging for progress/error messages below.
logging.basicConfig(level=logging.INFO,format='%(asctime)s - %(levelname)s:%(message)s')
# Paginated listing URL template of the demo scrape-target site.
index_url='https://spa2.scrape.center/page/{page}'
# Selector-wait timeout in seconds (multiplied by 1000 where pyppeteer wants ms).
timeout=10
# How many listing pages to crawl.
total_page=1

# Browser window / viewport size in pixels.
window_width,window_height=1366,768
# Directory where one JSON file per movie is written.
result_dir='results'
exists(result_dir) or makedirs(result_dir)
# Headful browser (visible window) — easier to debug, slower to run.
headless=False
# Shared browser and page handles, populated by init().
# NOTE(review): 'brower' is a typo for 'browser'; kept as-is since other code references it.
brower,tab=None,None

# NOTE(review): this semaphore is created but never acquired anywhere in this file.
sem = asyncio.Semaphore(5)
async def init():
    """Launch Chromium and prepare a stealth-patched page bound to the module globals."""
    global brower,tab
    launch_args = [
        '--disable-infobars',
        f'--window-size={window_width},{window_height}',
    ]
    brower = await launch(headless=headless, args=launch_args)
    tab = await brower.newPage()
    # Apply stealth evasions first, then size the viewport, as in the original flow.
    await stealth(tab)
    await tab.setViewport({'width': window_width, 'height': window_height})

async def scrape_api(url,select):
    """Navigate the shared tab to *url* and wait until *select* matches.

    A fresh random User-Agent is set before each navigation. On timeout the
    error is logged and the function returns normally, so callers must be
    prepared for the page to be in an incomplete state.
    """
    logging.info('scraping %s',url)
    try:
        # Rotate the User-Agent per request to look less like a bot.
        await tab.setUserAgent(UserAgent().random)
        await tab.goto(url)
        # `timeout` is in seconds; pyppeteer expects milliseconds.
        await tab.waitForSelector(select,options={
            'timeout':timeout*1000
        })
    except TimeoutError:
        logging.error('error occurred while scraping %s',url)
async def scrape_index(page):
    """Open listing page number *page* and wait for the movie-name links to render."""
    url=index_url.format(page=page)
    await scrape_api(url,'.item .name')
async def parse_index():
    """Return the detail-page URLs of every movie on the current listing page."""
    extract_hrefs = 'no=>no.map(n=>n.href)'
    links = await tab.JJeval('.item .name', extract_hrefs)
    return links
async def scrape_detail(url):
    """Open a movie detail page and wait for its title (the h2 element) to render."""
    await scrape_api(url,'h2')
async def parse_deatil():
    """Extract movie fields from the currently-loaded detail page.

    NOTE(review): the name is a typo for ``parse_detail``; kept unchanged so
    existing callers keep working.

    Returns:
        dict with keys url, name, categories, cover, score and drama.
    """
    url=tab.url
    name=await tab.Jeval('h2','node=>node.innerText')
    categories=await tab.JJeval('.categories button span','nodes=>nodes.map(node=>node.innerText)')
    cover=await tab.Jeval('.cover','node=>node.src')
    score=await tab.Jeval('.score','node=>node.innerText')
    drama=await tab.Jeval('.drama p','node=>node.innerText')
    return {
        'url':url,
        'name':name,
        'categories':categories,
        'cover':cover,
        'score':score,
        'drama':drama,
    }
async def save_data(data):
    """Write *data* to results/<name>.json, pretty-printed UTF-8 JSON."""
    logging.info('saving %s',data)
    name=data.get('name')
    data_path = f'{result_dir}/{name}.json'
    # Context manager fixes the original's leaked file handle: the file is
    # now closed even if json.dump raises.
    with open(data_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
async def main():
    """Crawl every listing page, then every detail page, saving one JSON per movie."""
    await init()
    try:
        for page in range(1, total_page + 1):
            await scrape_index(page)
            for detail_url in await parse_index():
                await scrape_detail(detail_url)
                record = await parse_deatil()
                await save_data(record)
    finally:
        # Always shut the browser down, even when a scrape step fails.
        await brower.close()
if __name__ == '__main__':
    # asyncio.run() replaces the deprecated
    # asyncio.get_event_loop().run_until_complete() pattern (deprecated since
    # Python 3.10) and also closes the loop cleanly when main() returns.
    asyncio.run(main())

总结

以上就是今天要讲的内容。本文仅仅简单介绍了 pyppeteer 的基本使用方法。

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值