二、使用步骤
1.引入库
代码如下(示例):
from pyppeteer import launch import asyncio import logging from pyppeteer.errors import TimeoutError from pyppeteer_stealth import stealth import time from fake_useragent import UserAgent import json from os import makedirs from os.path import exists
# Log INFO and above with a timestamp so scraping progress is visible.
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s:%(message)s')

# Listing-page URL template; `page` is filled in per request.
index_url = 'https://spa2.scrape.center/page/{page}'
# Per-navigation wait limit in seconds (converted to ms for pyppeteer's
# waitForSelector below).
timeout = 10
# Number of listing pages to crawl.
total_page = 1
# Browser window / viewport size in pixels.
window_width, window_height = 1366, 768
result_dir = 'results'
# exist_ok avoids the check-then-create race of `exists(...) or makedirs(...)`.
makedirs(result_dir, exist_ok=True)

headless = False
browser, tab = None, None
# NOTE(review): this semaphore is never acquired anywhere in the script —
# scraping is effectively serial. Kept so the module surface is unchanged;
# confirm before removing.
sem = asyncio.Semaphore(5)


async def init():
    """Launch the browser, open one shared tab and apply stealth patches.

    The tab is stored in a module-level global and reused by every other
    coroutine in this script.
    """
    global browser, tab
    browser = await launch(headless=headless,
                           args=['--disable-infobars',
                                 f'--window-size={window_width},{window_height}'])
    tab = await browser.newPage()
    await stealth(tab)
    # Keep the viewport in sync with the OS window size requested above.
    await tab.setViewport({'width': window_width, 'height': window_height})


async def scrape_api(url, selector):
    """Navigate the shared tab to `url` and wait until `selector` appears.

    A fresh random User-Agent is set before every navigation. On timeout the
    error is logged and the function returns normally (best-effort scraping);
    callers must tolerate a partially loaded page.
    """
    logging.info('scraping %s', url)
    try:
        await tab.setUserAgent(UserAgent().random)
        await tab.goto(url)
        await tab.waitForSelector(selector, options={'timeout': timeout * 1000})
    except TimeoutError:
        logging.error('error occurred while scraping %s', url)


async def scrape_index(page):
    """Load one listing page and wait for the movie-name links to render."""
    url = index_url.format(page=page)
    await scrape_api(url, '.item .name')


async def parse_index():
    """Return the detail-page URLs of every movie on the current listing page."""
    return await tab.JJeval('.item .name', 'nodes => nodes.map(node => node.href)')


async def scrape_detail(url):
    """Load one detail page and wait for the title (h2) to render."""
    await scrape_api(url, 'h2')


async def parse_detail():
    """Extract one movie record from the currently loaded detail page."""
    url = tab.url
    name = await tab.Jeval('h2', 'node => node.innerText')
    categories = await tab.JJeval('.categories button span',
                                  'nodes => nodes.map(node => node.innerText)')
    cover = await tab.Jeval('.cover', 'node => node.src')
    score = await tab.Jeval('.score', 'node => node.innerText')
    drama = await tab.Jeval('.drama p', 'node => node.innerText')
    return {
        'url': url,
        'name': name,
        'categories': categories,
        'cover': cover,
        'score': score,
        'drama': drama,
    }


async def save_data(data):
    """Persist one movie record as pretty-printed UTF-8 JSON under results/."""
    logging.info('saving %s', data)
    name = data.get('name')
    data_path = f'{result_dir}/{name}.json'
    # Fix: the original passed an anonymous open() to json.dump, leaking the
    # file handle; `with` guarantees the file is flushed and closed.
    with open(data_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)


async def main():
    """Scrape every listing page, then every detail page it links to."""
    await init()
    try:
        for page in range(1, total_page + 1):
            await scrape_index(page)
            detail_urls = await parse_index()
            for detail_url in detail_urls:
                await scrape_detail(detail_url)
                data = await parse_detail()
                await save_data(data)
    finally:
        # Always close the browser, even if scraping raised.
        await browser.close()


if __name__ == '__main__':
    asyncio.get_event_loop().run_until_complete(main())
总结
以上就是今天要讲的内容。本文仅仅简单介绍了 pyppeteer 的基本使用方法。