python async + pyppeteer 并发

爬虫相关的全程实战详细操作过程,全部经过实战验证,记录相对还算详细。
殇夜00
# coding=utf-8

import asyncio, time
import pyppeteer
from collections import namedtuple

Response = namedtuple("rs", "title url html cookies headers history status")


async def get_html(url, timeout=10):
    """Load *url* in a visible (non-headless) Chromium and print the page title.

    Args:
        url: The page to navigate to.
        timeout: Navigation timeout in seconds (pyppeteer's default is 30 s).
    """
    print('---0')
    browser = await pyppeteer.launch(headless=False, args=['--no-sandbox'])
    try:
        page = await browser.newPage()
        # pyppeteer expects the navigation timeout in milliseconds.
        await page.goto(url, options={'timeout': int(timeout * 1000)})
        # Fetch the rendered HTML; currently unused but kept for parity with
        # the (disabled) Response-building code this demo was based on.
        await page.content()
        title = await page.title()
        print('title = 0', title)
    finally:
        # Always release the Chromium process, even if navigation fails.
        await browser.close()
async def get_html_1(url, timeout=10):
    """Load *url* in headless Chromium and print the page title.

    Args:
        url: The page to navigate to.
        timeout: Navigation timeout in seconds (pyppeteer's default is 30 s).
    """
    print('---1')
    browser = await pyppeteer.launch(headless=True, args=['--no-sandbox'])
    try:
        page = await browser.newPage()
        # pyppeteer expects the navigation timeout in milliseconds.
        await page.goto(url, options={'timeout': int(timeout * 1000)})
        await page.content()
        title = await page.title()
        print('title 1= ', title)
    finally:
        # BUG FIX: the original never closed the browser, leaking a Chromium
        # process on every call. Close it unconditionally.
        await browser.close()
async def get_html_2(url, timeout=10):
    """Load *url* in headless Chromium and print the page title.

    Args:
        url: The page to navigate to.
        timeout: Navigation timeout in seconds (pyppeteer's default is 30 s).
    """
    print('---2')
    browser = await pyppeteer.launch(headless=True, args=['--no-sandbox'])
    try:
        page = await browser.newPage()
        # pyppeteer expects the navigation timeout in milliseconds.
        await page.goto(url, options={'timeout': int(timeout * 1000)})
        await page.content()
        title = await page.title()
        print('title 2= ', title)
    finally:
        # BUG FIX: the original never closed the browser, leaking a Chromium
        # process on every call. Close it unconditionally.
        await browser.close()
if __name__ == '__main__':
    s_time = time.time()
    # The same URL five times, purely to exercise concurrent fetching.
    url_list = ["http://www.10086.cn/index/tj/index_220_220.html"] * 5

    loop = asyncio.get_event_loop()

    # Launch all fetches concurrently. asyncio.wait() with bare coroutine
    # objects is deprecated since Python 3.8 and removed in 3.11;
    # asyncio.gather() schedules them and also propagates exceptions.
    tasks = [get_html(url) for url in url_list]
    loop.run_until_complete(asyncio.gather(*tasks))

    loop.close()
    print('耗时:', time.time() - s_time)
©️2020 CSDN 皮肤主题: 技术黑板 设计师: CSDN官方博客 返回首页
实付0元
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、C币套餐、付费专栏及课程。

余额充值