以下为我自己写的项目的爬虫部分核心代码:
import asyncio
import random
from pyppeteer import launch
import city
import parm
import SQLite_Pool
import GraphicCode
import SliderCode
from Log import log
ip_arr = SQLite_Pool.find_all('ip_white') # Proxy pool; crawl() picks a random row and uses its first item as the proxy
result_3 = [] # Shared output list crawl() appends to; ends up two-dimensional (3 results) or stays empty
# Each appended results value is a list of flight tuples, or [("破解验证码失败!", "", "", "", "", "")]
requests_num = 1 # Number of times the browser has been launched
async def _close_quietly(target):
    """Close a pyppeteer page or browser, printing (not raising) any failure."""
    if target is None:
        return
    try:
        await target.close()
    except Exception as e:
        # e.g. Chromium could not remove its temporary user-data directory.
        print(e)


async def _scroll_through_page(page):
    """Scroll down step by step so the Ajax-loaded rows of the page get rendered."""
    height = 0
    for _ in range(30):  # imitate a human scrolling in 300px steps
        height += 300
        await page.evaluate('window.scrollTo(0,{})'.format(height))
        await asyncio.sleep(0.3)
    await page.evaluate('window.scrollBy(0, document.body.scrollHeight)')  # jump to the very bottom
    await asyncio.sleep(1)  # give the dynamic data time to finish loading


async def crawl(url_5):
    """Fetch the flight list page of every entry in ``url_5``, one page at a time.

    One browser is launched with a randomly chosen proxy and user agent and is
    reused for all entries.  For each entry the parsed result is appended to
    the module-level ``result_3``: either the list returned by
    ``data_analysis`` or ``[("破解验证码失败!", "", "", "", "", "")]`` when the
    page stayed empty even after two captcha-solving attempts (crawling then
    stops).

    If a page cannot be opened three times in a row the proxy is considered
    dead: the browser is closed and, while ``requests_num`` is at most 5, the
    remaining entries are handed to a fresh ``crawl`` call (new proxy, new
    browser).  Nothing is appended to ``result_3`` for the entry this call
    gave up on.

    Args:
        url_5: sequence of entries where item 0 is the departure city name,
            item 1 the arrival city name (both looked up in
            ``city.all_cities``) and item 2 the departure date string placed
            in the URL.

    Returns:
        None.  Results are delivered through ``result_3``.
    """
    global result_3, requests_num
    proxy = random.choice(ip_arr)[0]  # random proxy for this browser session
    log.info('选择的代理:' + proxy)
    ua = parm.ua_pool().user_agent()  # random user agent
    browser = None
    yanzheng, yanzheng_num = False, 0  # captcha mode switch / number of captcha attempts
    try:
        for i, entry in enumerate(url_5):
            dcity_code = city.all_cities.get(entry[0]).lower()
            acity_code = city.all_cities.get(entry[1]).lower()
            date = entry[2]
            url = ('https://flights.ctrip.com/online/list/oneway-' + dcity_code + '-' + acity_code
                   + '?_=1&depdate=' + date + '&cabin=Y_S_C_F')
            empty_fail, ip_fail, results, img_id = 1, 1, "", ""
            if i == 0:
                browser = await launch(parm.start_parm(proxy, ua))  # start Chromium
            give_up = False
            while True:  # retry the same URL so a flaky proxy does not reduce the page count
                page = await browser.newPage()
                # Anti-detection script; it is re-run automatically on every navigation of this page.
                await page.evaluateOnNewDocument(parm.js_text)
                await page.setViewport(viewport={'width': parm.width, 'height': parm.height})
                try:
                    await page.goto(url=url, options={'timeout': 1000 * 30})  # 30s timeout
                except Exception as e:
                    await _close_quietly(page)  # do not leak one tab per failed attempt
                    ip_fail += 1
                    if ip_fail <= 3:
                        continue
                    # Three failures in a row: treat the proxy as dead.
                    await _close_quietly(browser)
                    browser = None  # already closed; keeps the finally block from closing it again
                    log.error('使用的代理ip多次爬取失败!可能该ip已经失效!或网络不好!' + str(e))
                    if requests_num <= 5:
                        requests_num += 1
                        # We are already inside the running event loop, so the follow-up
                        # crawl must be awaited; run_until_complete() would raise here.
                        await crawl(url_5[i:])
                        log.error("继续获取后续url的航班信息!")
                    # The recursive call (if any) handled the remaining entries.
                    return

                if yanzheng:  # captcha-solving mode
                    yanzheng_num += 1
                    log.info("开启破解验证码!")
                    await asyncio.sleep(3)
                    await page.evaluate('window.scrollTo(0,0)')  # back to the top of the page
                    await SliderCode.slider_main(page)  # slider captcha
                    img_id = await GraphicCode.verify_main(page)  # click-the-picture captcha
                elif i == 0:  # the first page shown by a fresh browser has a confirm button
                    await asyncio.sleep(3)
                    enter_botton = await page.xpath('//*[@id="outerContainer"]/div/div[3]/div/button')
                    await asyncio.sleep(1)
                    if enter_botton:
                        await enter_botton[0].click()
                await asyncio.sleep(4)
                await _scroll_through_page(page)
                div_list = await page.xpath('//*[@id="__next"]/div[2]/div/div[3]/div[3]/div[2]/span/div')
                if div_list:  # flight rows found
                    results = await data_analysis(div_list)
                    await page.close()
                    break

                empty_fail += 1
                if empty_fail > 3:
                    # Repeatedly empty pages most likely mean the crawler was detected.
                    if yanzheng and img_id:
                        # Captcha mode was already on and the page is still empty:
                        # report the wrongly recognised picture.
                        GraphicCode.chaojiying.ReportError(img_id)
                    if yanzheng_num >= 2:  # two captcha attempts failed: stop crawling
                        await page.close()
                        results = [("破解验证码失败!", "", "", "", "", "")]
                        log.info("破解验证码失败!")
                        give_up = True
                        break
                    yanzheng = True  # switch captcha mode on for the next attempts
                await _close_quietly(page)  # this attempt is over; the retry opens a new tab

            result_3.append(results)
            if give_up:
                break
            await asyncio.sleep(20)  # rate limit: one page every 20 seconds
    finally:
        await _close_quietly(browser)
# Data parsing
# Relative XPath of every text field read from one flight row.
_FLIGHT_FIELD_XPATHS = (
    ('price', './/span[@class="price"]|'
              './/div[@class="flight-seats"]/div[1]//div[@class="price-box"]/div[1]'),  # price
    ('depart_time', './/div[@class="depart-box"]/div[@class="time"]'),  # departure time
    ('depart_site', './/div[@class="depart-box"]/div[@class="airport"]'),  # departure airport
    ('arrive_time', './/div[@class="arrive-box"]/div[@class="time"]'),  # arrival time
    ('arrive_site', './/div[@class="arrive-box"]/div[@class="airport"]'),  # arrival airport
    ('airline', './/div[@class="flight-airline"]/div[1]/span'),  # airline
    ('plane', './/div[@class="flight-airline"]/div[2]/div[1]/span[1]'),  # flight number
    ('gj_plane2', './/div[@class="flight-airline"]/div[2]/div[2]/span[1]'),  # 2nd flight number, international transfer
    ('airline1', './/div[@class="flight-airline"]/div[1]/div[1]'),  # transfer leg 1: airline + flight number
    ('airline2', './/div[@class="flight-airline"]/div[2]/div[1]'),  # transfer leg 2: airline + flight number
    ('change_way1', './/div[@class="horizontal-center"]/div[1]'),  # transfer info 1
    ('change_way2', './/div[@class="horizontal-center"]/div[2]'),  # transfer info 2
    ('stop_message', './/div[@class="arrow-box"]//span[@class="high-light"]'),  # stopover info
    ('pass_message1', './/div[@class="arrow-box"]//div[@class="horizontal-center"]/div[1]/span'),  # through-flight info 1
    ('pass_message2', './/div[@class="arrow-box"]//div[@class="horizontal-center"]/div[2]/span'),  # through-flight info 2
)


async def _last_text(div, xpath):
    """Return the textContent of the last node matching ``xpath`` under ``div``, or "" if none."""
    matches = await div.xpath(xpath)
    if not matches:
        return ""
    prop = await matches[-1].getProperty('textContent')
    return await prop.jsonValue()


async def data_analysis(div_list):
    """Turn the row elements of a flight list page into flight tuples.

    Every element of ``div_list`` is queried with the XPaths in
    ``_FLIGHT_FIELD_XPATHS`` (missing fields become "") and classified as a
    transfer, stopover, through or direct flight.  After all rows are
    processed the last entry of the direct-flight list is dropped, as the
    original code did; this is now skipped when that list is empty instead of
    raising ``IndexError``.

    Args:
        div_list: element handles offering awaitable ``xpath()``; the handles
            it returns must offer ``getProperty('textContent')`` whose result
            has ``jsonValue()``.

    Returns:
        A list of ``(price, airline, time, way, airports)`` string tuples,
        ordered direct, stopover, transfer, through.
    """
    oneway_list = []  # direct flights
    stopway_list = []  # stopover flights
    changeway_list = []  # transfer flights
    passway_list = []  # through flights
    for div in div_list:  # each div holds the basic information of one flight row
        field = {}
        for name, xpath in _FLIGHT_FIELD_XPATHS:
            field[name] = await _last_text(div, xpath)

        price = field['price']
        price_parts = price.split('¥')
        if len(price_parts) > 1:  # keep the text as-is when it carries no currency sign
            price = price_parts[1]
        # Only the part before the first non-breaking space is kept.
        plane = field['plane'].split('\xa0')[0]
        airline = field['airline'] + plane
        airline1 = field['airline1'].split('\xa0')[0]
        airline2 = field['airline2'].split('\xa0')[0]
        timex = field['depart_time'] + '——>' + field['arrive_time']
        sitex = field['depart_site'] + '—>' + field['arrive_site']
        change_way1, change_way2 = field['change_way1'], field['change_way2']
        stop_message = field['stop_message']
        pass_message1, pass_message2 = field['pass_message1'], field['pass_message2']

        if "转" in change_way1 or "转" in change_way2:  # transfer
            way = '中转' + ' ' + change_way1 + change_way2
            if airline1:
                airlinex = airline1 + '转' + airline2
            else:  # no leg-1 text: international transfer layout
                airlinex = airline + '转' + field['gj_plane2'].split('\xa0')[0]
            changeway_list.append((price, airlinex, timex, way, sitex))
        elif stop_message:  # stopover
            stopway_list.append((price, airline, timex, '经停' + stop_message, sitex))
        elif pass_message1 or pass_message2:  # through flight
            way = '通程' + ' ' + pass_message1 + pass_message2
            passway_list.append((price, airline, timex, way, sitex))
        else:  # direct
            oneway_list.append((price, airline, timex, "直达", sitex))

    if oneway_list:
        oneway_list.pop()  # drop the last direct-flight entry
    return oneway_list + stopway_list + changeway_list + passway_list
(原创公开项目,版权本人所有!)