pyppeteer(多页面并发)+异步 实战教程

以下为我自己写的项目的爬虫部分核心代码:

import asyncio
import random

from pyppeteer import launch

import city
import parm
import SQLite_Pool
import GraphicCode
import SliderCode
from Log import log

ip_arr = SQLite_Pool.find_all('ip_white')  # proxy pool; crawl() uses element [0] of a randomly chosen row as the proxy
result_3 = []  # filled by crawl(): one `results` entry is appended per processed route (3 entries, or empty)
# each `results` entry is a list of flight tuples, or [("破解验证码失败!", "", "", "", "", "")] when captcha solving failed
requests_num = 1  # how many times a browser has been launched so far


async def crawl(url_5):
    """Crawl Ctrip one-way flight listings for every route in ``url_5``, one page after another.

    A single browser is launched (with a random proxy from ``ip_arr`` and a
    random User-Agent) for the first route and reused for the following ones.
    For each route, one entry is appended to the module-level ``result_3``:
    the list returned by ``data_analysis`` on success, ``""`` when the proxy
    was given up on, or ``[("破解验证码失败!", "", "", "", "", "")]`` when
    captcha solving failed.

    Args:
        url_5: Sequence of routes. Each item is indexable with
            ``[0]`` departure city and ``[1]`` arrival city (both keys of
            ``city.all_cities``) and ``[2]`` the departure date string that
            is put into the URL.

    Returns:
        None. Results are delivered through the global ``result_3``.

    Side effects:
        Mutates the globals ``result_3`` and ``requests_num``; sleeps 20
        seconds after every processed route to limit the request rate.
    """
    global result_3, requests_num
    proxy = random.choice(ip_arr)[0]  # random proxy for this browser session
    log.info('选择的代理:' + proxy)
    ua = parm.ua_pool().user_agent()  # random User-Agent
    num = 0  # number of routes started in this call; 1 means "first page of this browser"
    browser = None
    yanzheng, yanzheng_num, run = False, 0, True  # captcha mode switch / attempts / keep-going flag
    for i, route in enumerate(url_5):
        if not run:
            break
        dcity_code = city.all_cities.get(route[0]).lower()
        acity_code = city.all_cities.get(route[1]).lower()
        date = route[2]
        url = ('https://flights.ctrip.com/online/list/oneway-' + dcity_code + '-' + acity_code
               + '?_=1&depdate=' + date + '&cabin=Y_S_C_F')
        # Both failure counters start at 1, so "> 3" trips on the third failure.
        flag, empty_fail, ip_fail, results, img_id = True, 1, 1, "", ""
        num += 1
        if num == 1:
            browser = await launch(parm.start_parm(proxy, ua))  # start Chromium
        while flag:  # retry the same route so a flaky proxy does not reduce the number of pages crawled
            page = await browser.newPage()
            # Anti-detection script; it is re-run automatically on every new document of this page.
            await page.evaluateOnNewDocument(parm.js_text)
            await page.setViewport(viewport={'width': parm.width, 'height': parm.height})
            try:
                await page.goto(url=url, options={'timeout': 1000 * 30})  # 30 s navigation timeout
            except Exception as e:
                ip_fail += 1  # guard against retrying forever
                await page.close()  # a fresh page is opened on every retry, so never keep the failed one
                if ip_fail > 3:  # proxy failed repeatedly: it is probably dead (or the network is bad)
                    await browser.close()
                    browser = None  # already closed; skip the final close below
                    log.error('使用的代理ip多次爬取失败!可能该ip已经失效!或网络不好!' + str(e))
                    if requests_num <= 5:
                        requests_num += 1
                        # Restart with a new proxy/browser for this route and all later ones.
                        # Must be awaited: calling run_until_complete() here would raise
                        # "This event loop is already running".
                        await crawl(url_5[i:])
                        log.error("继续获取后续url的航班信息!")
                        # The recursive call already recorded results for url_5[i:], so return
                        # without appending this route's empty placeholder a second time.
                        return
                    # Browser restarted too many times without finishing: give up.
                    run = False
                    flag = False
            else:
                if yanzheng:  # captcha-solving mode
                    yanzheng_num += 1
                    log.info("开启破解验证码!")
                    await asyncio.sleep(3)
                    await page.evaluate('window.scrollTo(0,0)')  # back to the top of the page
                    await SliderCode.slider_main(page)  # slider captcha
                    img_id = await GraphicCode.verify_main(page)  # click-the-image captcha
                elif num == 1:  # first page of a fresh browser: dismiss the confirm dialog if it is shown
                    await asyncio.sleep(3)
                    enter_botton = await page.xpath('//*[@id="outerContainer"]/div/div[3]/div/button')
                    await asyncio.sleep(1)
                    if enter_botton:
                        await enter_botton[0].click()
                await asyncio.sleep(4)
                height = 0
                for _ in range(30):  # scroll step by step like a person so all Ajax rows get loaded
                    height += 300
                    await page.evaluate('window.scrollTo(0,{})'.format(height))
                    await asyncio.sleep(0.3)
                await page.evaluate('window.scrollBy(0, document.body.scrollHeight)')  # jump to the bottom
                await asyncio.sleep(1)  # let the dynamic data finish loading

                div_list = await page.xpath('//*[@id="__next"]/div[2]/div/div[3]/div[3]/div[2]/span/div')
                if div_list:  # flight rows found
                    results = await data_analysis(div_list)
                    flag = False  # done with this route
                else:
                    empty_fail += 1  # guard against retrying forever
                    if empty_fail > 3:
                        # Repeatedly empty pages most likely mean the crawler was detected,
                        # so switch to captcha-solving mode.
                        if yanzheng and img_id:
                            # Captcha mode was already on and the page is still empty:
                            # report the image as mis-recognised.
                            GraphicCode.chaojiying.ReportError(img_id)
                        if yanzheng_num >= 2:  # too many captcha attempts: give up entirely
                            flag = False
                            run = False
                            results = [("破解验证码失败!", "", "", "", "", "")]
                            log.info("破解验证码失败!")
                        yanzheng = True
                await page.close()  # always release the page, whether it succeeded or will be retried

        result_3.append(results)  # flight list, "" or the captcha-failure placeholder
        await asyncio.sleep(20)  # rate limit: one route every 20 seconds
    if browser is not None:
        try:
            await browser.close()
        except Exception as e:
            print(e)  # ignore failures to remove the temporary user-data directory


# Data parsing
async def _last_text(node, xpath):
    """Return the ``textContent`` of the last element matching ``xpath`` under ``node``, or "" if none match."""
    matches = await node.xpath(xpath)
    if not matches:
        return ""
    return await (await matches[-1].getProperty('textContent')).jsonValue()


async def data_analysis(div_list):
    """Turn the flight-row elements of a result page into a list of flight tuples.

    Args:
        div_list: Element handles, one per row; each must offer an awaitable
            ``xpath()`` whose results offer ``getProperty('textContent')``.

    Returns:
        A list of ``(price, airline, time, way, airports)`` string tuples,
        ordered as: direct flights, stop-over flights, transfer flights,
        through flights. The last row classified as direct is dropped.
        An empty ``div_list`` yields ``[]``.
    """
    oneway_list = []  # direct flights
    stopway_list = []  # flights with a stop-over
    changeway_list = []  # transfer flights
    passway_list = []  # through flights
    for div in div_list:  # each row's basic flight information lives in one div
        price = await _last_text(div, './/span[@class="price"]|'
                                      './/div[@class="flight-seats"]/div[1]//div[@class="price-box"]/div[1]')
        depart_time = await _last_text(div, './/div[@class="depart-box"]/div[@class="time"]')
        depart_site = await _last_text(div, './/div[@class="depart-box"]/div[@class="airport"]')
        arrive_time = await _last_text(div, './/div[@class="arrive-box"]/div[@class="time"]')
        arrive_site = await _last_text(div, './/div[@class="arrive-box"]/div[@class="airport"]')
        airline = await _last_text(div, './/div[@class="flight-airline"]/div[1]/span')  # airline name
        plane = await _last_text(div, './/div[@class="flight-airline"]/div[2]/div[1]/span[1]')  # flight number
        # second flight number of an international transfer
        gj_plane2 = await _last_text(div, './/div[@class="flight-airline"]/div[2]/div[2]/span[1]')
        # airline + flight number of the first / second leg of a transfer
        airline1 = await _last_text(div, './/div[@class="flight-airline"]/div[1]/div[1]')
        airline2 = await _last_text(div, './/div[@class="flight-airline"]/div[2]/div[1]')

        change_way1 = await _last_text(div, './/div[@class="horizontal-center"]/div[1]')  # transfer info 1
        change_way2 = await _last_text(div, './/div[@class="horizontal-center"]/div[2]')  # transfer info 2
        stop_message = await _last_text(div, './/div[@class="arrow-box"]//span[@class="high-light"]')  # stop-over
        pass_message1 = await _last_text(
            div, './/div[@class="arrow-box"]//div[@class="horizontal-center"]/div[1]/span')  # through info 1
        pass_message2 = await _last_text(
            div, './/div[@class="arrow-box"]//div[@class="horizontal-center"]/div[2]/span')  # through info 2

        if '¥' in price:
            # Keep what follows the currency sign; text without one is left
            # untouched instead of raising IndexError.
            price = price.split('¥')[1]
        plane = plane.split('\xa0')[0]  # keep only the part before the first non-breaking space
        airline = airline + plane
        airline1 = airline1.split('\xa0')[0]
        airline2 = airline2.split('\xa0')[0]
        timex = depart_time + '——>' + arrive_time
        sitex = depart_site + '—>' + arrive_site
        if "转" in change_way1 or "转" in change_way2:  # transfer flight
            way = '中转' + ' ' + change_way1 + change_way2
            if airline1:
                airlinex = airline1 + '转' + airline2
            else:  # no first-leg text: treated as an international transfer
                airlinex = airline + '转' + gj_plane2.split('\xa0')[0]
            changeway_list.append((price, airlinex, timex, way, sitex))
        elif stop_message:  # stop-over flight
            stopway_list.append((price, airline, timex, '经停' + stop_message, sitex))
        elif pass_message1 or pass_message2:  # through flight
            way = '通程' + ' ' + pass_message1 + pass_message2
            passway_list.append((price, airline, timex, way, sitex))
        else:  # direct flight
            oneway_list.append((price, airline, timex, "直达", sitex))
    # Drop the last "direct" entry — presumably the trailing div is not a real
    # flight row (TODO confirm). Guarded so a page without direct rows does not
    # raise IndexError.
    if oneway_list:
        oneway_list.pop()
    return oneway_list + stopway_list + changeway_list + passway_list

(原创公开项目,版权本人所有!)

66495e9992c34aae8c5f5fd03c18dd8c.png

 

 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

钢铁の洪流

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值