本文介绍如何利用Python和Selenium自动化抓取同程旅行网站的飞机票信息,并将数据存储为Excel文件。代码简单高效,适合初学者和自动化测试工程师快速上手。
一、功能概述
链接:上海到北京机票预订 - 上海到北京机票预约 - 同程机票预订(脚本中使用的地址:https://www.ly.com/flights/itinerary/oneway/SHA-PEK?date=2024-12-01)
通过自动化脚本抓取飞机票信息,可以获取指定日期、出发地和目的地的航班列表,包括航空公司、起飞时间、到达时间、机场信息及票价。此功能适用于多种场景,例如数据分析、价格监控等。
二、工具及依赖
在实现该功能前,请确保以下工具和依赖已安装:
- Python (推荐3.7以上版本)
- Selenium (用于浏览器操作)
- lxml (用于解析HTML)
- pandas (用于处理数据)
- openpyxl (pandas 写入 .xlsx 文件时使用的引擎)
- Chrome浏览器 和 ChromeDriver
安装依赖库:
pip install selenium lxml pandas openpyxl
三、核心代码解析
1. WebDriver 初始化
通过 get_driver 函数创建并配置 WebDriver 实例,设置浏览器参数和隐式等待时间。
def get_driver(executable_path="chromedriver"):
    """Create a Chrome WebDriver that ignores certificate errors.

    :param executable_path: path (or name resolvable on PATH) of the
        chromedriver binary.
    :return: a Chrome WebDriver instance with a 10-second implicit wait.
    """
    # Imported locally so this snippet is self-contained.
    from selenium.webdriver.chrome.service import Service

    options = webdriver.ChromeOptions()
    options.add_argument('--ignore-certificate-errors')  # ignore certificate errors
    try:
        # Selenium 4 takes the driver path through a Service object; the
        # ``executable_path`` keyword was removed in Selenium 4.10.
        driver = webdriver.Chrome(service=Service(executable_path), options=options)
    except TypeError:
        # Older Selenium releases do not accept ``service=``.
        driver = webdriver.Chrome(options=options, executable_path=executable_path)
    driver.implicitly_wait(10)
    return driver
2. 元素操作函数
元素点击
element_click 函数实现了带超时重试机制的元素点击操作。通过 WebDriverWait 等待元素可点击。
def element_click(driver, xpath, msg='', timeout=10):
    """Click the element at *xpath*, retrying until *timeout* seconds elapse.

    :param driver: WebDriver used to locate the element.
    :param xpath: XPath expression of the element to click.
    :param msg: label used in the timeout error message.
    :param timeout: overall time budget in seconds for all attempts.
    :raises Exception: the last error seen while waiting or clicking, or
        ``TimeoutError`` if no attempt could be made (``timeout <= 0``).
    """
    error = TimeoutError(f'{msg} 超时')
    # One shared deadline: previously every retry started a fresh
    # ``timeout``-second wait, so the worst case was ~timeout * (timeout + 1) s.
    deadline = time.monotonic() + timeout
    remaining = timeout
    while remaining > 0:
        try:
            wait = WebDriverWait(driver, remaining)
            element = wait.until(EC.element_to_be_clickable(('xpath', xpath)))
            element.click()
            return
        except Exception as e:
            # Any failure is treated as retryable (e.g. the wait timing out
            # or the click itself failing); keep the latest one to re-raise.
            error = e
            time.sleep(min(1, max(0, deadline - time.monotonic())))
        remaining = deadline - time.monotonic()
    raise error
输入文本
send_keys 函数清除输入框内容并输入新的值,确保输入准确无误。
def send_keys(driver, xpath, value):
    """Replace the content of the input located by *xpath* with *value*.

    :param driver: WebDriver used to locate the input element.
    :param xpath: XPath expression of the input element.
    :param value: text typed into the input after it has been emptied.
    """
    field = driver.find_element('xpath', xpath)
    # Three separate key sequences: select all, delete, then type the value.
    for keys in ((Keys.CONTROL, 'a'), (Keys.DELETE,), (value,)):
        field.send_keys(*keys)
3. 数据解析与存储
HTML 数据提取
get_info 函数利用 lxml 解析页面内容,提取航班信息:
- 航空公司名称
- 起飞/到达时间和机场
- 价格
def get_info(html):
    """Extract flight rows from the result page HTML.

    :param html: page source of the flight list page.
    :return: list of dicts with the keys ``airlines``, ``start_time``,
        ``start_airport``, ``end_time``, ``end_airport`` and ``price``.
    """
    html_content = etree.HTML(html)
    flights = html_content.xpath('//div[@class="flight-lists-container"]/div')
    data = []
    for flight in flights:
        # Run each query once per row instead of once per field.
        names = flight.xpath('.//p/text()')
        times = flight.xpath('.//strong/text()')
        ems = flight.xpath('.//em/text()')
        # Skip child divs that do not carry a complete flight record
        # instead of failing with IndexError.
        if not names or len(times) < 2 or len(ems) < 3:
            continue
        data.append({
            'airlines': names[0].strip(),
            'start_time': times[0],
            'start_airport': ems[0],
            'end_time': times[1],
            'end_airport': ems[1],
            'price': ems[2],
        })
    # Return the rows so the caller can save them.
    return data
数据保存为Excel
通过 pandas 将抓取的数据保存为 Excel 文件:
# Destination workbook.
# NOTE(review): 'c:' has no trailing separator, so on Windows this is
# relative to the current directory of drive C — confirm intended location.
path = Path(r'c:') / 'flight_info.xlsx'
# One row per flight dict; the DataFrame index is not written out.
df = pd.DataFrame(data)
df.to_excel(path, index=False)
四、完整代码
以下是完整代码,可直接复制运行:
"""
抓取同程旅行飞机票信息
"""
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from lxml import etree
from pathlib import Path
def get_driver(executable_path="chromedriver"):
    """Create a Chrome WebDriver that ignores certificate errors.

    :param executable_path: path (or name resolvable on PATH) of the
        chromedriver binary.
    :return: a Chrome WebDriver instance with a 10-second implicit wait.
    """
    # Imported locally so the file's import block does not need to change.
    from selenium.webdriver.chrome.service import Service

    options = webdriver.ChromeOptions()
    options.add_argument('--ignore-certificate-errors')
    try:
        # Selenium 4 takes the driver path through a Service object; the
        # ``executable_path`` keyword was removed in Selenium 4.10.
        driver = webdriver.Chrome(service=Service(executable_path), options=options)
    except TypeError:
        # Older Selenium releases do not accept ``service=``.
        driver = webdriver.Chrome(options=options, executable_path=executable_path)
    driver.implicitly_wait(10)
    return driver
def element_click(driver, xpath, msg='', timeout=10):
    """Click the element at *xpath*, retrying until *timeout* seconds elapse.

    :param driver: WebDriver used to locate the element.
    :param xpath: XPath expression of the element to click.
    :param msg: label used in the timeout error message.
    :param timeout: overall time budget in seconds for all attempts.
    :raises Exception: the last error seen while waiting or clicking, or
        ``TimeoutError`` if no attempt could be made (``timeout <= 0``).
    """
    error = TimeoutError(f'{msg} 超时')
    # One shared deadline: previously every retry started a fresh
    # ``timeout``-second wait, so the worst case was ~timeout * (timeout + 1) s.
    deadline = time.monotonic() + timeout
    remaining = timeout
    while remaining > 0:
        try:
            wait = WebDriverWait(driver, remaining)
            element = wait.until(EC.element_to_be_clickable(('xpath', xpath)))
            element.click()
            return
        except Exception as e:
            # Any failure is treated as retryable (e.g. the wait timing out
            # or the click itself failing); keep the latest one to re-raise.
            error = e
            time.sleep(min(1, max(0, deadline - time.monotonic())))
        remaining = deadline - time.monotonic()
    raise error
def send_keys(driver, xpath, value):
    """Replace the content of the input located by *xpath* with *value*.

    :param driver: WebDriver used to locate the input element.
    :param xpath: XPath expression of the input element.
    :param value: text typed into the input after it has been emptied.
    """
    field = driver.find_element('xpath', xpath)
    # Three separate key sequences: select all, delete, then type the value.
    for keys in ((Keys.CONTROL, 'a'), (Keys.DELETE,), (value,)):
        field.send_keys(*keys)
def _parse_flight(flight):
    """Return the field dict for one flight row, or None if it is incomplete."""
    # Run each query once per row instead of once per field.
    names = flight.xpath('.//p/text()')
    times = flight.xpath('.//strong/text()')
    ems = flight.xpath('.//em/text()')
    if not names or len(times) < 2 or len(ems) < 3:
        return None
    return {
        # Stored stripped so the saved value matches what is printed.
        'airlines': names[0].strip(),
        'start_time': times[0],
        'start_airport': ems[0],
        'end_time': times[1],
        'end_airport': ems[1],
        'price': ems[2],
    }


def get_info(html, path=None):
    """Parse flight rows from the page HTML and save them to an Excel file.

    Prints the number of candidate rows, one tab-separated line per parsed
    flight, and the output path.

    :param html: page source of the flight list page.
    :param path: optional output file; defaults to the original hard-coded
        ``Path(r'c:', 'flight_info.xlsx')``.
    :return: the ``Path`` the workbook was written to.
    """
    html_content = etree.HTML(html)
    flights = html_content.xpath('//div[@class="flight-lists-container"]/div')
    print(len(flights))
    data = []
    for flight in flights:
        info = _parse_flight(flight)
        if info is None:
            # Skip child divs without a complete flight record rather than
            # failing with IndexError.
            continue
        print(
            info['airlines'],
            f"{info['start_time']} {info['start_airport']}",
            f"{info['end_time']} {info['end_airport']}",
            info['price'],
            sep='\t',
        )
        data.append(info)
    df = pd.DataFrame(data)
    # NOTE(review): the default 'c:' has no trailing separator, so on Windows
    # it is relative to the current directory of drive C — confirm intended.
    path = Path(r'c:', 'flight_info.xlsx') if path is None else Path(path)
    print(path)
    if path.exists():
        path.unlink()
    # startrow=4 leaves the first four worksheet rows empty above the header.
    df.to_excel(path, index=False, startrow=4)
    return path
if __name__ == '__main__':
    # Open the one-way search page, retype the cities, run the search and
    # export the resulting flight list.
    driver = get_driver()
    try:
        go_date = '2024-12-01'
        # NOTE(review): the URL preselects SHA-PEK while the destination typed
        # below is '沈阳' — confirm which route is intended.
        url = f"https://www.ly.com/flights/itinerary/oneway/SHA-PEK?date={go_date}"
        driver.get(url)
        element_click(driver, '//*[@id="__layout"]/div/section/div/div[1]/div[1]/div/span', '点击单程')
        element_click(driver, '//*[@id="__layout"]//ul/li[2]', '点击往返')
        from_city = '上海'
        time.sleep(0.5)
        send_keys(driver, '//*[@id="__layout"]/div/section/div/div[1]/div[2]/div[1]/div[1]/span/div/input', from_city)
        to = '沈阳'
        time.sleep(0.5)
        send_keys(driver, '//*[@id="__layout"]/div/section/div/div[1]/div[2]/div[1]/div[2]/span/div/input', to)
        time.sleep(1)
        element_click(driver, '//*[@id="__layout"]/div/section/div/div[1]/div[3]/a', '点击搜索按钮')
        # Fixed pause before reading the page source after the search click.
        time.sleep(2)
        path = get_info(driver.page_source)
        print(path)
    finally:
        # Always close the browser, even when a step above raises.
        driver.quit()
五、运行效果
- 自动打开浏览器,通过URL参数指定出发日期,并填写出发地和目的地信息。
- 抓取航班信息并生成如下Excel文件:
六、注意事项
- 确保页面结构未改变:网页更新可能导致XPath失效。
- 确保ChromeDriver版本匹配:与Chrome浏览器保持一致。
- 避免过于频繁的抓取:尊重目标网站的使用规则。
通过本篇教程,您可以轻松实现航班信息的自动化抓取,助力数据分析和决策。