国内机票预订APP携程处于垄断地位,但是携程有反爬虫策略,对于密集的查询请求会要求验证。验证操作有两种:一种是拖动滑块验证,一种是点选中文验证,selenium+webdriver可以轻松绕过这一反爬虫设置。
重点是中文点选问题,涉及到中文识别OCR技术,笔者曾经使用过阿里云中文识别API,识别率较高,但是比较贵,到开源社区逛了一圈后,发现chineseocr_lite这么一款轻量级OCR项目,实在是雪中送炭,所以本文的技术重点就是python3 + selenium + chromedriver + chineseocr_lite。
整体项目代码,有需要的童鞋可自取:
https://github.com/ag-niemin/ctrip
对于拖动滑块的破解,很简单,网上有很多帖子可以参考,大多都是使用selenium模拟仿真操作;复杂的是中文点选验证,大致分为三步:
1)识别目标中文字符;
2)识别点选区的中文字符及坐标位置;
3)按照目标中文字符顺序,依次点击中文字符坐标位置;
废话不多说,先上破解代码:
# -*- coding: utf-8 -*-
import os
import sys
import time
import logging
from selenium.webdriver.common.action_chains import ActionChains
sys.path.append(os.getcwd())
from chineseocr_lite import ocr
import importlib
importlib.reload(sys)
logging.basicConfig(level=logging.INFO,
filename='selenium.log',
filemode='a')
# Crack Ctrip's slider (drag) captcha.
def crack_slide_verification(browser, url):
    """Drag the slider verification button all the way to the right.

    Args:
        browser: selenium webdriver currently showing the slider captcha.
        url: page url, passed through for logging / chaining.

    Returns:
        (driver, url) so the next cracking step can be chained.
    """
    driver = browser
    slider_btn = driver.find_element_by_xpath('//*[@id="J_slider_verification_qwewq"]/div[1]/div[2]')
    if slider_btn:
        logging.info(url + u' drag slider button')
        # BUGFIX: build ONE chain and perform it ONCE. ActionChains.perform()
        # does not clear the queued actions, so the original code's second
        # perform() replayed click_and_hold again. Also use release() with no
        # element: release(slider_btn) would first move back to the button,
        # undoing the 280px drag.
        actions = ActionChains(driver)
        actions.click_and_hold(slider_btn).move_by_offset(280, 0).release().perform()
        # driver.save_screenshot('screenshot-verify.png')
    return driver, url
# Crack Ctrip's Chinese click-captcha: OCR the target phrase and the
# candidate characters, then map each target character to its position.
def crack_ocr_verification(browser, url):
    """OCR both captcha images and locate each target character.

    Args:
        browser: selenium webdriver currently showing the click-captcha.
        url: page url, passed through for logging / chaining.

    Returns:
        (driver, url, characters, characters_pos) where ``characters`` is the
        ordered list of target characters and ``characters_pos`` holds one
        position dict per located character. When OCR failed to locate every
        target, the two lists differ in length and the caller
        (fresh_verification) refreshes the captcha and retries.
    """
    driver = browser
    # OCR the small "target phrase" image (the characters to click, in order).
    dest_img_url = driver.find_element_by_xpath('//*[@id="J_slider_verification_qwewq-choose"]/div[2]/div[1]/img').get_attribute('src')
    dest_img_res = ocr.resultBase64(dest_img_url)
    # BUGFIX: initialise so an empty OCR result no longer raises NameError
    # below. NOTE(review): the loop keeps only the LAST recognised line --
    # presumably the target image holds a single line; confirm.
    dest_img_characters = ''
    for dest_img_character in dest_img_res:
        dest_img_characters = dest_img_character['word']
        logging.info(url + u' dest characters: ' + dest_img_characters)
    characters = list(dest_img_characters)
    # OCR the big candidate image: every recognised character plus position.
    sele_img_url = driver.find_element_by_xpath('//*[@id="J_slider_verification_qwewq-choose"]/div[2]/div[3]/img').get_attribute('src')
    sele_img_res = ocr.resultBase64(sele_img_url)
    sele_characters = [r['word'] for r in sele_img_res]
    sele_characters_pos = [r['pos'] for r in sele_img_res]
    logging.info(url + u' candidate characters: ' + ' '.join(sele_characters))
    # Map each target character to its first occurrence among the candidates.
    characters_pos = []
    for c in characters:
        for i in range(0, len(sele_characters)):
            if sele_characters[i] == c:
                characters_pos.append(sele_characters_pos[i])
                # BUGFIX: without this break a duplicated candidate character
                # appended two positions, making len(characters_pos) !=
                # len(characters) forever and looping the refresh step.
                break
    return driver, url, characters, characters_pos
# Refresh Ctrip's Chinese click-captcha until OCR locates every character.
def fresh_verification(browser, url, characters, characters_pos):
    """Keep refreshing the click-captcha until every target character has a
    position, then return the (driver, url, characters, characters_pos)
    tuple with matching-length lists.
    """
    driver = browser
    # Loop while at least one target character is missing a position;
    # each pass loads a fresh captcha image and re-runs the OCR step.
    while len(characters_pos) != len(characters):
        refresh_link = driver.find_element_by_xpath('//*[@id="J_slider_verification_qwewq-choose"]/div[2]/div[4]/div/a')
        refresh_link.click()
        driver, url, characters, characters_pos = crack_ocr_verification(driver, url)
    # driver.save_screenshot('screenshot-verify.png')
    return driver, url, characters, characters_pos
# Click the characters of Ctrip's Chinese click-captcha, then submit.
def click_verification(browser, url, characters, characters_pos):
    """Click every target character at its OCR-reported position and submit.

    Args:
        browser: selenium webdriver currently showing the click-captcha.
        url: page url, used only for logging.
        characters: ordered target characters to click.
        characters_pos: one {'x': ..., 'y': ...} dict per character.

    Returns:
        The driver after submitting. NOTE(review): if the two lists differ in
        length the while body never runs and the function implicitly returns
        None -- callers appear to guarantee equal lengths via
        fresh_verification first; confirm.
    """
    driver = browser
    actions = ActionChains(driver)
    # The while condition guards a single pass: the body always returns, so
    # this behaves like an "if lengths match" check, not a loop.
    while (len(characters_pos) == len(characters)):
        cpt_big_img = driver.find_element_by_class_name("cpt-big-img")
        for i in range(0,len(characters)):
            logging.info(url + u' click ' + characters[i] + u' located (' + str(characters_pos[i]['x']) + ',' + str(characters_pos[i]['y']) + ')')
            # Reset the pointer to the image origin, then offset to the
            # character. NOTE(review): the same ActionChains object is
            # perform()ed repeatedly without reset_actions(); selenium's
            # perform() does not clear the queue, so earlier moves are
            # replayed each time -- apparently tolerated here, but fragile.
            actions.move_to_element_with_offset(cpt_big_img,0,0).perform()
            actions.move_by_offset(characters_pos[i]['x'],characters_pos[i]['y']).click().perform()
            time.sleep(2)
        # driver.save_screenshot('screenshot-click.png')
        # Submit the clicked captcha.
        cpt_choose_submit = driver.find_element_by_xpath('//*[@id="J_slider_verification_qwewq-choose"]/div[2]/div[4]/a')
        cpt_choose_submit.click()
        # driver.save_screenshot('screenshot-submit.png')
        return driver
# Check whether the click-captcha passed; redo the whole crack cycle if not.
def check_verification(browser, url):
    """Block until the captcha status text reports success, then re-trigger
    the flight search and return the driver.
    """
    driver = browser
    status_span = driver.find_element_by_xpath('//*[@id="J_slider_verification_qwewq"]/div[1]/div[3]/div/span')
    # Until the status text contains the success marker, re-run the full
    # OCR -> refresh -> click pipeline against a fresh captcha.
    while u'校验成功' not in status_span.text:
        driver, url, chars, positions = crack_ocr_verification(driver, url)
        driver, url, chars, positions = fresh_verification(driver, url, chars, positions)
        driver = click_verification(driver, url, chars, positions)
    logging.info(url + ' ' + status_span.text)
    # Click "search again" so the flight results reload after verification.
    research_btn = driver.find_element_by_xpath('//*[@id="app"]/div/div[2]/div/div[2]/div/div[2]/div/button')
    research_btn.click()
    # driver.save_screenshot('screenshot-search.png')
    time.sleep(2)
    return driver
接下来的重点是爬虫,将chromedriver中的html元素标签使用xpath提取出来并解析,这一步,写过scrapy和requests爬虫的童鞋们肯定非常熟悉,就不做过多赘述。
废话不多说,直接贴出爬虫代码,笔者列了大约15条航线18天的机票价格:
# -*- coding: utf-8 -*-
import time
import sys
import os
sys.path.append(os.getcwd())
import datetime
import logging
from lxml import etree
from selenium.webdriver import Chrome
from selenium.webdriver.chrome.options import Options
from data import t_market_airticket_day
from OracleUtils import Oracle
import crack as crack
import importlib
importlib.reload(sys)
logging.basicConfig(level=logging.INFO,
filename='selenium.log',
filemode='a')
class selenium_ctrip(object):
    """Scrape one-way domestic air fares from Ctrip and load them into Oracle.

    ``get_ctrip_data`` builds one search url per (route, days-ahead) pair,
    drives a headless Chrome through each url, cracks the slider / click
    captcha when it appears (via the ``crack`` module), scrolls until all
    flight cards are rendered, and parses them with lxml.
    ``load_ctrip_data`` bulk-loads the parsed records into Oracle.
    """

    # Path to the bundled chromedriver executable (Windows build).
    BROWSER_PATH = os.path.dirname(__file__) + '/browser/chromedriver.exe'
    # Desktop Chrome UA string so the headless browser looks like a real one.
    USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36'
    # NOTE(review): database credentials are hard-coded in source -- move
    # them to a config file or environment variables.
    DATABASE = 'oracle://stg:stg123@10.6.0.94:1521/?service_name=db'
    # City code -> Chinese city name.
    city_dict_en = {
        'BJS': "北京",
        'SHA': "上海",
        'SZX': "深圳",
        'HGH': "杭州",
        'CTU': "成都",
        'SIA': "西安",
        'CAN': "广州"
    }
    # Reverse mapping: Chinese city name -> city code.
    city_dict_cn = {v: k for k, v in city_dict_en.items()}
    # Routes to crawl, each as a "FROM-TO" code pair (lower-cased in the url).
    city_list = [
        city_dict_cn["北京"] + '-' + city_dict_cn["上海"],
        city_dict_cn["北京"] + '-' + city_dict_cn["深圳"],
        city_dict_cn["北京"] + '-' + city_dict_cn["杭州"],
        city_dict_cn["北京"] + '-' + city_dict_cn["成都"],
        city_dict_cn["上海"] + '-' + city_dict_cn["深圳"],
        city_dict_cn["上海"] + '-' + city_dict_cn["成都"],
        city_dict_cn["上海"] + '-' + city_dict_cn["西安"],
        city_dict_cn["深圳"] + '-' + city_dict_cn["杭州"],
        city_dict_cn["深圳"] + '-' + city_dict_cn["成都"],
        city_dict_cn["深圳"] + '-' + city_dict_cn["西安"],
        city_dict_cn["北京"] + '-' + city_dict_cn["广州"],
        city_dict_cn["上海"] + '-' + city_dict_cn["广州"],
        city_dict_cn["成都"] + '-' + city_dict_cn["广州"],
        city_dict_cn["杭州"] + '-' + city_dict_cn["广州"],
        city_dict_cn["西安"] + '-' + city_dict_cn["广州"],
    ]
    # Days ahead to query: every day for the next 10, then progressively
    # sparser samples out to 180 days.
    date_list = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 30, 40, 50, 60, 120, 180]

    @staticmethod
    def _scroll_to_bottom(driver):
        """Scroll the results page until the flight-card count stops growing.

        Ctrip lazy-loads cards on scroll; keep scrolling in bursts of 10 and
        compare counts before/after until no new cards appear.
        """
        s = 0
        t = 1
        while s < t:
            for _ in range(10):  # scroll down 10 times
                driver.execute_script("var q=document.documentElement.scrollTop=10000")
            elements = driver.find_elements_by_xpath('//div[@class="search_box search_box_tag search_box_light Label_Flight"]')
            s = len(elements)
            for _ in range(10):  # 10 more; loop again if the count grew
                driver.execute_script("var q=document.documentElement.scrollTop=10000")
            elements = driver.find_elements_by_xpath('//div[@class="search_box search_box_tag search_box_light Label_Flight"]')
            t = len(elements)

    def get_ctrip_data(self):
        """Crawl every route/date url and return the parsed flight records.

        Returns:
            dict with keys 'scan_date' (YYYY-MM-DD str), 'scan_hour' (HH str)
            and 'flights' (list of per-flight dicts, one per parsed card).
        """
        scan_date = time.strftime('%Y-%m-%d', time.localtime(time.time()))
        scan_hour = time.strftime('%H', time.localtime(time.time()))
        # BUGFIX: accumulate across ALL urls. The original reset ``items``
        # inside the url loop, so only the last url's flights survived into
        # the returned dict (everything else was silently dropped).
        items = []
        # Always true as written; kept as a hook to restrict crawling hours.
        if int(scan_hour) >= 0 and int(scan_hour) <= 23:
            request_urls = []
            for city_li in self.city_list:
                for i in self.date_list:
                    today = datetime.date.today()
                    sp_date = today + datetime.timedelta(days=i)
                    st_date = str(sp_date)[0:10]
                    request_url = "https://flights.ctrip.com/itinerary/oneway/" + city_li.lower() + "?date=" + st_date
                    request_urls.append(request_url)
            options = Options()
            options.add_argument('--headless')  # no visible browser window
            options.add_argument('--no-sandbox')
            options.add_argument('--disable-gpu')  # disable GPU acceleration
            options.add_argument("--user-agent=%s" % self.USER_AGENT)  # spoof UA
            options.add_argument('--log-level=3')  # selenium is very chatty otherwise
            options.add_argument('--start-maximized')
            options.add_argument('--disable-infobars')  # hide "controlled by automation" bar
            # options.add_argument('--blink-settings=imagesEnabled=false')  # skip images
            options.add_experimental_option('excludeSwitches', ['enable-logging'])
            driver = Chrome(executable_path=self.BROWSER_PATH, chrome_options=options)
            for url in request_urls:
                driver.get(url)
                try:
                    # Captcha present: crack the slider, then the Chinese
                    # click-captcha, then confirm success.
                    if driver.find_element_by_xpath('//*[@id="J_slider_verification_qwewq"]/div[1]/div[2]'):
                        driver, url = crack.crack_slide_verification(driver, url)
                        driver, url, characters, characters_pos = crack.crack_ocr_verification(driver, url)
                        driver, url, characters, characters_pos = crack.fresh_verification(driver, url, characters, characters_pos)
                        driver = crack.click_verification(driver, url, characters, characters_pos)
                        driver = crack.check_verification(driver, url)
                    self._scroll_to_bottom(driver)
                except Exception:
                    # No captcha: find_element raised NoSuchElementException.
                    # (Narrowed from a bare except so Ctrl-C still works.)
                    self._scroll_to_bottom(driver)
                driver.implicitly_wait(2)
                # driver.save_screenshot('screenshot-result.png')
                html = driver.page_source
                rbody = etree.HTML(html, parser=etree.HTMLParser(encoding='utf-8'))
                res = rbody.xpath('//div[@class="search_box search_box_tag search_box_light Label_Flight"]')
                if res:
                    logging.info(url + ' selenium chrome scraped %s records' % str(len(res)))
                    # Hoisted out of the per-card loop: these depend only on url.
                    st_date = url[-10:]  # trailing "YYYY-MM-DD" of "?date=..."
                    city_li = url.replace('https://flights.ctrip.com/itinerary/oneway/', '')[0:7].upper()
                    startcity = self.city_dict_en[city_li[0:city_li.index('-')]]
                    stopcity = self.city_dict_en[city_li[city_li.index('-') + 1:]]
                    for r in res:
                        startairport = r.xpath('./div[1]/div[1]/div[@class="inb right"]/div[@class="airport"]//text()')[0]
                        starttime = r.xpath('./div[1]/div[1]/div[@class="inb right"]/div[@class="time_box"]/strong[1]/text()')[0]
                        stopairport = r.xpath('./div[1]/div[1]/div[@class="inb left"]/div[@class="airport"]//text()')[0]
                        stoptime = r.xpath('./div[1]/div[1]/div[@class="inb left"]/div[@class="time_box"]/strong[1]/text()')[0]
                        airline = r.xpath('./div[1]/div[1]/div[@class="inb logo"]/div[1]/div[1]/span[1]/span[1]/strong[1]/text()')[0]
                        airtype = r.xpath('./div[1]/div[1]/div[@class="inb logo"]/div[1]/div[1]/span[1]/span[1]/span[1]/text()')[0]
                        # Discounted cards carry an extra "lowest_price" class.
                        if r.xpath('./div[1]/div[1]/div[@class="inb price child_price lowest_price"]/div[1]/span[@class="base_price02"]/text()'):
                            price = r.xpath('./div[1]/div[1]/div[@class="inb price child_price lowest_price"]/div[1]/span[@class="base_price02"]/text()')[0]
                            class_discount = r.xpath('./div[1]/div[1]/div[@class="inb price child_price lowest_price"]/div[1]/div[@class="flight_price_tips"]/div[1]/span[1]/text()')[0]
                        else:
                            price = r.xpath('./div[1]/div[1]/div[@class="inb price child_price"]/div[1]/span[@class="base_price02"]/text()')[0]
                            class_discount = r.xpath('./div[1]/div[1]/div[@class="inb price child_price"]/div[1]/div[@class="flight_price_tips"]/div[1]/span[1]/text()')[0]
                        # Split "经济舱8.5折" style text into cabin class + discount.
                        classgrade = class_discount[0:class_discount.index(u'舱') + 1]
                        discount = class_discount.replace(classgrade, '') or u'全价'  # empty => full fare
                        start_dt = datetime.datetime.strptime(st_date + ' ' + starttime, '%Y-%m-%d %H:%M')
                        stop_dt = datetime.datetime.strptime(st_date + ' ' + stoptime, '%Y-%m-%d %H:%M')
                        if int(starttime[0:2]) > int(stoptime[0:2]):
                            # Arrival hour before departure hour: overnight
                            # flight, arrival is on the next day.
                            stop_dt = stop_dt + datetime.timedelta(days=1)
                        item = {
                            'scan_date': datetime.datetime.strptime(str(scan_date), '%Y-%m-%d'),
                            'scan_hour': str(scan_hour),
                            'start_city': startcity,
                            'stop_city': stopcity,
                            'start_airport': startairport,
                            'start_time': start_dt,
                            'stop_airport': stopairport,
                            'stop_time': stop_dt,
                            'airline': airline,
                            'air_type': airtype,
                            'source': url,
                            'low_price': price,
                            'discount': discount,
                            'class_grade': classgrade,
                        }
                        items.append(item)
                else:
                    # driver.save_screenshot('screenshot-failure.png')
                    logging.info(url + " selenium chrome failure, failure")
            driver.quit()
        # Moved outside the hour gate: the original returned None when the
        # (currently unreachable) gate was closed, crashing load_ctrip_data.
        res = {'scan_date': scan_date,
               'scan_hour': scan_hour,
               'flights': items}
        return res

    def load_ctrip_data(self, seleres):
        """Replace-and-insert the scraped flights into Oracle.

        Deletes any rows already loaded for the same (scan_date, scan_hour),
        then bulk-inserts every record in ``seleres['flights']``.

        Args:
            seleres: the dict returned by get_ctrip_data().
        """
        table = t_market_airticket_day()
        self.table_name = table.table_name
        self.column_list = table.column_list
        orcl = Oracle()
        scan_date = datetime.datetime.strptime(str(seleres['scan_date']), '%Y-%m-%d')
        scan_hour = seleres['scan_hour']
        deleteValues = [[scan_date, scan_hour]]
        # Item-dict keys in the exact column order expected by the table.
        field_order = ['scan_date', 'scan_hour', 'start_city', 'stop_city',
                       'start_airport', 'start_time', 'stop_airport', 'stop_time',
                       'airline', 'air_type', 'class_grade', 'low_price',
                       'discount', 'source']
        insertValues = [[item[f] for f in field_order] for item in seleres['flights']]
        # Positional bind placeholders ":1,:2,..." for every table column.
        value_orders = ','.join(':' + str(i) for i in range(1, len(self.column_list) + 1))
        insertsql = "insert into %s(%s) values(%s)" % (self.table_name, ','.join(self.column_list), value_orders)
        deletesql = "delete from %s where scan_date=:1 and scan_hour=:2" % (self.table_name)
        orcl.batchinsert_ex(deletesql, deleteValues, insertsql, insertValues)
if __name__ == '__main__':
    # Crawl all configured routes/dates, then persist the results to Oracle.
    scraper = selenium_ctrip()
    scraped = scraper.get_ctrip_data()
    scraper.load_ctrip_data(scraped)
主体代码已经完成,笔者的整体项目是将爬取的数据直接载入Oracle数据库,读者可根据自身需求自行完成剩余代码。
chineseocr_lite对于分散字体的识别准确率并不是那么高,但在每次打开浏览器后成功识别一次即可,最终运行效果如下:
Connected to pydev debugger (build 192.6817.19)
INFO:root:https://flights.ctrip.com/itinerary/oneway/bjs-sha?date=2020-05-14 drag slider button
INFO:root:https://flights.ctrip.com/itinerary/oneway/bjs-sha?date=2020-05-14 dest characters: 香糊丽舍
INFO:root:https://flights.ctrip.com/itinerary/oneway/bjs-sha?date=2020-05-14 candidate characters: 糊 E 香 含 畅 在
INFO:root:https://flights.ctrip.com/itinerary/oneway/bjs-sha?date=2020-05-14 dest characters: 四川盆地
INFO:root:https://flights.ctrip.com/itinerary/oneway/bjs-sha?date=2020-05-14 candidate characters: 所 州 盆 四 地 责 I
INFO:root:https://flights.ctrip.com/itinerary/oneway/bjs-sha?date=2020-05-14 dest characters: 好阿灿歌
INFO:root:https://flights.ctrip.com/itinerary/oneway/bjs-sha?date=2020-05-14 candidate characters: 展 法 京 区 西 系 岛
INFO:root:https://flights.ctrip.com/itinerary/oneway/bjs-sha?date=2020-05-14 dest characters: 复口大学
INFO:root:https://flights.ctrip.com/itinerary/oneway/bjs-sha?date=2020-05-14 candidate characters: 大 资 十 快友 巴 色 想
INFO:root:https://flights.ctrip.com/itinerary/oneway/bjs-sha?date=2020-05-14 dest characters: 洛带古镇
INFO:root:https://flights.ctrip.com/itinerary/oneway/bjs-sha?date=2020-05-14 candidate characters: 带 古 异 作 镇 洛 托
INFO:root:https://flights.ctrip.com/itinerary/oneway/bjs-sha?date=2020-05-14 click 洛 located (67,164)
INFO:root:https://flights.ctrip.com/itinerary/oneway/bjs-sha?date=2020-05-14 click 带 located (240,48)
INFO:root:https://flights.ctrip.com/itinerary/oneway/bjs-sha?date=2020-05-14 click 古 located (127,48)
INFO:root:https://flights.ctrip.com/itinerary/oneway/bjs-sha?date=2020-05-14 click 镇 located (206,124)
INFO:root:https://flights.ctrip.com/itinerary/oneway/bjs-sha?date=2020-05-14 校验成功,通过!
INFO:root:https://flights.ctrip.com/itinerary/oneway/bjs-sha?date=2020-05-14 selenium chrome scraped 26 records
INFO:root:https://flights.ctrip.com/itinerary/oneway/bjs-sha?date=2020-05-15 selenium chrome scraped 25 records
INFO:root:https://flights.ctrip.com/itinerary/oneway/bjs-sha?date=2020-05-16 selenium chrome scraped 23 records
INFO:root:https://flights.ctrip.com/itinerary/oneway/bjs-sha?date=2020-05-17 selenium chrome scraped 25 records
INFO:root:https://flights.ctrip.com/itinerary/oneway/bjs-sha?date=2020-05-18 selenium chrome scraped 37 records