# pyppeteer 实战 (pyppeteer in practice)

import logging
import asyncio
from pyppeteer import launch
from pyppeteer.errors import TimeoutError
from motor.motor_asyncio import AsyncIOMotorClient

# --- Database setup ---
# Connection details for the MongoDB instance that stores the scraped movies.
motor_connect_string = 'mongodb://localhost:27017'
mongodb_name = 'movie'          # fixed typo: was 'momgodb_name'
mongo_collection_name = 'data'

client = AsyncIOMotorClient(motor_connect_string)
db = client[mongodb_name]
collection = db[mongo_collection_name]

logging.basicConfig(level=logging.INFO, format='%(levelname)s:%(message)s')

# --- Crawler configuration ---
# Listing-page URL template; {page} is filled in by scrape_page().
index_url = 'https://spa2.scrape.center/page/{page}'
total_page = 10      # number of listing pages to crawl
timeout = 10         # selector wait timeout, in seconds
window_width, window_height = 1366, 768    # browser window / viewport size
headless = False     # run with a visible browser window

# Shared browser/page handles, populated by init().
browser, tap = None, None

async def init():
    """Launch the browser and open one page sized to the configured viewport."""
    global browser, tap
    launch_args = [
        f'--window-size={window_width},{window_height}',
        '--disable-infobars',
    ]
    browser = await launch(headless=headless, args=launch_args)
    tap = await browser.newPage()
    await tap.setViewport({'width': window_width, 'height': window_height})

async def scrape_index(url, selector):
    """Navigate to *url* and wait until *selector* renders; log a timeout as an error."""
    logging.info('爬取 %s', url)
    try:
        await tap.goto(url)
        # waitForSelector takes milliseconds; module-level timeout is in seconds.
        await tap.waitForSelector(selector, options={'timeout': timeout * 1000})
    except TimeoutError:
        logging.error('错误 %s', url)

async def scrape_page(page):
    """Open listing page number *page* and wait for the movie-name links."""
    page_url = index_url.format(page=page)
    await scrape_index(page_url, selector='.item .name')

async def parse_index():
    """Collect the detail-page URLs from the currently loaded listing page."""
    extract_hrefs = 'nodes => nodes.map(node => node.href)'
    return await tap.querySelectorAllEval('.item .name', extract_hrefs)

async def scrape_detail(url):
    """Open a movie detail page and wait for its name block to render."""
    await scrape_index(url, selector='.item .name')

async def parse_detail():
    """Extract the movie fields from the currently loaded detail page.

    Returns a dict with url, name, categories, cover, score and drama,
    ready to be upserted into MongoDB by main().
    """
    url = tap.url
    name = await tap.querySelectorEval('.item .name h2', 'node=>node.innerText')
    categories = await tap.querySelectorAllEval(
        '.categories button span',
        'nodes => nodes.map(node => node.innerText)')
    cover = await tap.querySelectorEval('.cover', 'node=>node.src')
    score = await tap.querySelectorEval('.score', 'node=>node.innerText')
    drama = await tap.querySelectorEval('.drama p', 'node=>node.innerText')
    return {
        'url': url,
        'name': name,
        # Fixed key typo: was 'catagories'; matches the '.categories' selector.
        'categories': categories,
        'cover': cover,
        'score': score,
        'drama': drama,
    }

async def main():
    """Drive the crawl: visit every listing page, then every detail page on it."""
    await init()
    try:
        for page in range(1, total_page + 1):
            await scrape_page(page)
            for detail_url in await parse_index():
                await scrape_detail(detail_url)
                data = await parse_detail()
                # Upsert keyed on the movie name so reruns do not duplicate rows.
                await collection.update_one(
                    {'name': data.get('name')}, {'$set': data}, upsert=True)
    finally:
        # Always release the browser, even if a page errors mid-crawl.
        await browser.close()

if __name__ == '__main__':
    # asyncio.run() (3.7+) creates and closes its own event loop, replacing
    # the deprecated get_event_loop().run_until_complete() pattern.
    asyncio.run(main())

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值