爬取厦门航空
接口地址:“https://csapi.xiamenair.com/offer/api/v1/offer/shopping”
1.浏览器抓包
请求头设置几个主要的参数
"headers": {
"Channel": "PCWEB",
"Content-Type": "application/json;charset=UTF-8",
"Cookie": "CS_SESSION_ID=e3b2482b-56be-4905-9a73-c67694e8c98d; gr_user_id=578b4c2a-1d4f-4901-8e0d-b9092107b2f9; b014f44b281415f1_gr_session_id=23e53fa9-1526-4960-ae35-ce941edbb1e5; b014f44b281415f1_gr_session_id_sent_vst=23e53fa9-1526-4960-ae35-ce941edbb1e5; _ga=GA1.2.46659139.1697422796; _gid=GA1.2.791524916.1697422796; _gat_UA-96517318-2=1; _ga_7GZXM749KM=GS1.2.1697422796.1.0.1697422812.44.0.0",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.27"
},
请求参数分析
"data": {
"shoppingPreference": {
"connectionPreference": {
"maxConnectionQuantity": -1
},
"flightPreference": {
"cabinCombineMode": "Cabin",
"lowestFare": "true",
"carrierPreferences": [
{
"carrierCode": "TKT_SHOPPING_CARRIERS_DS"
}
]
},
"accountCodeSlogan": "",
"pricingMode": "Money"
},
"cabinClasses": [
"Economy",
"Business",
"First"
],
"passengerCount": {
"adult": 1,
"child": 1,
"infant": 1
},
"itineraries": [
{
"departureDate": "2023-10-23",
"origin": {
"airport": {
"code": "CKG"
}
},
"destination": {
"airport": {
"code": "PKX"
}
},
"segments": []
}
]
},
可以通过设置origin下面的code参数设置出发地,通过设置destination下面的code参数设置目的地,通过departureDate设置出发日期。
2.第一次请求
发起Post请求,注意事项,踩坑笔记:数据要用json参数接收,不能用params和data接收
# POST the payload as a JSON body (requests' json= argument — the API rejects
# params=/data=); returns the parsed JSON dict on HTTP 200, otherwise prints
# the failure and returns None.
def request_api(url, params, header):
try:
response = requests.post(url, json=params, headers=header)
if response.status_code == 200:
return response.json()
else:
print(f"请求失败,状态码:{response.status_code}")
return None
except requests.exceptions.RequestException as e:
print(f"请求出现异常:{e}")
return None
加载配置文件,从文件加载出发地,目的地和日期三个参数,对请求数据进行构造。
data_config, df = get_config()
data = data_config["data"]
headers = data_config["headers"]
url = "https://csapi.xiamenair.com/offer/api/v1/offer/shopping"
data['itineraries'][0]['origin']['airport']['code'] = df.iloc[i, 0]
data['itineraries'][0]['destination']['airport']['code'] = df.iloc[i, 1]
data['itineraries'][0]['departureDate'] = k
# 发送请求并获取结果 key odInfos offers responseId refData
result = request_api(url, data, headers)["result"]
分析发现,此时请求的数据结果只有简单的航班信息,而没有详细的航班信息,因此需要再一次发起请求
3.再次抓包
请求头不需要重新设置,但请求数据需要重新设置:请求数据中的segments字段(第一次请求时为空)此时需要填充。
而通过分析发现,segments字段在第一次请求的结果里面以一个数组的形式进行返回,但是我们请求的参数是每次一个segment,因此我们需要以遍历的方式每次加入,参数处理
for key, val in enumerate(result['refData']['segments']):
segment = result['refData']['segments'][val]
data['itineraries'][0]['segments'].append(segment)
x = request_api(url, data, headers)["result"]
data['itineraries'][0]['segments'] = data['itineraries'][0]['segments'][:-1] # 更新表头,每次只能加一个,因此需要弹出上一次的元素
4.提取元素
通过对接口二次请求的方式,可以获取完整的数据,根据自己的需求取出元素
# Copy the fields of interest from one cabin-offer record (*data*) and the
# i-th flight of the first response (*result*) into the out-param *dict*.
# NOTE(review): the parameter named `dict` shadows the builtin — rename if
# keyword callers are ruled out.
def get_data(data,dict,result,i):
dict['paxSegmentIds'] = data['paxSegmentIds'][0]
dict['arrival_time'] = result['odInfos'][0]['flightInfos'][i]['arrival']['aircraftScheduledDateTime']
dict['departure_time'] = result['odInfos'][0]['flightInfos'][i]['departure']['aircraftScheduledDateTime']
dict['baseAmount'] = data['baseAmount']
dict['totalAmount'] = data['totalAmount']
dict['cabinCodes'] = data['cabinCodes'][0]
dict['cabinTypes'] = data['cabinTypes'][0]
dict['loyaltyAward_totalAmount'] = data['loyaltyAward']['totalAmount']
5.数据保存
将每次提取的元素存入一个字典,此时需要将json保存为excel表格的形式,先保存为json,再保存为excel
try:
with open("../数据存放/厦门航空结果存放/res_dict.json", "w", encoding='utf-8') as f:
json.dump(res_dict, f, ensure_ascii=False)
pd.read_json('../数据存放/厦门航空结果存放/res_dict.json', orient='records').T.to_excel("res_dict.xlsx", index=False)
except:
print("保存失败,您的文件可能已经被打开!保存为其他文件res_dict_1!!")
pd.read_json('res_dict_1.json', orient='records').T.to_excel("res_dict_1.xlsx", index=False)
此时一个单机版的请求已经结束,由于两次循环,请求的速度比较慢,因此,我们需要加入线程池,进行异步处理
6.加入线程池
将前面的单线程爬虫封装为一个方法,通过参数的方式将地点和日期输入,封装结果如下
# One scraping task: request the route/date overview, then re-request once per
# segment to obtain detailed cabin offers; results go into the global res_dict.
# i: row of the origin->destination table; k: departure date "YYYY-MM-DD".
def job(i,k):
global index
data_config, df = get_config()
data = data_config["data"]
headers = data_config["headers"]
url = "https://csapi.xiamenair.com/offer/api/v1/offer/shopping"
data['itineraries'][0]['origin']['airport']['code'] = df.iloc[i, 0]
data['itineraries'][0]['destination']['airport']['code'] = df.iloc[i, 1]
data['itineraries'][0]['departureDate'] = k
# send the request and fetch the result (keys: odInfos offers responseId refData)
result = request_api(url, data, headers)["result"]
if len(result['odInfos']) == 0:
return
segment = result['refData']['segments']
data['shoppingPreference']['flightPreference']['lowestFare'] = False
for key, val in enumerate(result['refData']['segments']):
segment = result['refData']['segments'][val]
data['itineraries'][0]['segments'].append(segment)
x = request_api(url, data, headers)["result"]
data['itineraries'][0]['segments'] = data['itineraries'][0]['segments'][:-1] # the API takes one segment per request — pop the one just sent
data1 = x['odInfos'][0]['flightInfos'][0]['paxCabins']['ADT1']
# detailed cabin information for the adult passenger
# NOTE(review): this inner `i` shadows the route-index parameter `i`
for i in range(len(data1)):
temp = {}
get_data(data1[i], temp, result, key)
res_dict[index] = temp
index += 1
print(temp)
其中,参数i是出发地—>目的地的行,k是存入的日期,多线程的请求网络爬虫
data_config, df = get_config()
date = get_date(10)
max_workers = 20
t = []
threadPool = ThreadPoolExecutor(max_workers) # 创建最大线程数为max_workers的线程池
# 若不需要获取返回值,则可不需要下面两行代码
for i in t:
print(i.result()) # 获取每个任务的返回值,result()会阻塞主线程
for k in date:
for i in range(len(df)):
threadPool.submit(job,i,k)
threadPool.shutdown()
完整代码如下:
import pandas as pd
import requests
import json
from datetime import date
from concurrent.futures import ThreadPoolExecutor # 导入ThreadPoolExecutor模块
# Shared scraper state — mutated from worker threads in job().
index = 0  # next insertion key for res_dict
res_dict = {}  # index -> extracted cabin-offer record
# (debug leftover) save the request payload as JSON:
# with open("data.json","w",encoding='utf-8') as f :
# json.dump(data,f,ensure_ascii=False)
# print(segment)
#返回gap天内的日期
def get_date(gap):
    """Return the next *gap* calendar dates (today inclusive) as "YYYY-MM-DD" strings."""
    start = date.today()
    return [d.strftime("%Y-%m-%d") for d in pd.date_range(start=start, periods=gap)]
#加载配置文件,请求参数和请求头
def get_config():
    """Load the request config and the route table.

    Returns:
        tuple: (dict parsed from 厦门航空.json with the "data"/"headers"
                request template, DataFrame of origin/destination airport
                codes read from 航班号.xlsx).
    """
    # Explicit encoding: the config contains Chinese text, and the platform
    # default (e.g. GBK on Chinese Windows) would fail or garble it.
    with open('../数据存放/厦门航空/厦门航空.json', encoding='utf-8') as f:
        data = json.load(f)
    df = pd.read_excel("../数据存放/厦门航空/航班号.xlsx")
    return data, df
#发起请求
def request_api(url, params, header, timeout=30):
    """POST *params* as a JSON body to *url* and return the decoded response.

    The API only accepts the payload via requests' ``json=`` argument,
    not ``params=`` or ``data=``.

    Args:
        url: endpoint URL.
        params: dict serialized as the JSON request body.
        header: dict of HTTP headers.
        timeout: seconds before the request is aborted. New keyword with a
            default, so existing callers are unaffected; without it a stalled
            connection would hang a worker thread forever.

    Returns:
        Parsed JSON (dict) on HTTP 200, otherwise None.
    """
    # Keep only the line that can raise inside the try.
    try:
        response = requests.post(url, json=params, headers=header, timeout=timeout)
    except requests.exceptions.RequestException as e:
        print(f"请求出现异常:{e}")
        return None
    if response.status_code == 200:
        return response.json()
    print(f"请求失败,状态码:{response.status_code}")
    return None
#发起请求,需要发两次
def spider():
    """Fan one `job` per (route, date) out over a thread pool, then persist results.

    Workers accumulate into the module-level ``res_dict``, which is dumped to
    JSON and converted to an Excel sheet.
    """
    _, df = get_config()
    # Renamed from `date`: the original local shadowed the imported datetime.date.
    dates = get_date(10)
    max_workers = 20
    futures = []
    with ThreadPoolExecutor(max_workers) as pool:  # shutdown(wait=True) on exit
        for k in dates:
            for i in range(len(df)):
                futures.append(pool.submit(job, i, k))
    # Surface worker exceptions. The original iterated an always-empty list
    # *before* submitting anything, so failures were silently dropped.
    for fut in futures:
        try:
            fut.result()
        except Exception as e:
            print(f"任务执行失败:{e}")
    try:
        with open("../数据存放/厦门航空结果存放/res_dict.json", "w", encoding='utf-8') as f:
            json.dump(res_dict, f, ensure_ascii=False)
        pd.read_json('../数据存放/厦门航空结果存放/res_dict.json', orient='records').T.to_excel("res_dict.xlsx", index=False)
    except Exception:
        # Excel export fails if res_dict.xlsx is open in Excel; fall back to a
        # second file. The original fallback read res_dict_1.json without ever
        # writing it, so the fallback itself crashed — write the JSON first.
        print("保存失败,您的文件可能已经被打开!保存为其他文件res_dict_1!!")
        with open('res_dict_1.json', 'w', encoding='utf-8') as f:
            json.dump(res_dict, f, ensure_ascii=False)
        pd.read_json('res_dict_1.json', orient='records').T.to_excel("res_dict_1.xlsx", index=False)
def job(i, k):
    """Scrape one route/date pair.

    First request retrieves the flight overview; the API then requires one
    request per segment for detailed cabin offers, so each segment is appended
    to the request body, sent, and popped again. Extracted records are stored
    in the module-level ``res_dict``.

    Args:
        i: row index into the route table (column 0 = origin code,
           column 1 = destination code).
        k: departure date string "YYYY-MM-DD".
    """
    global index
    data_config, df = get_config()
    data = data_config["data"]
    headers = data_config["headers"]
    url = "https://csapi.xiamenair.com/offer/api/v1/offer/shopping"
    data['itineraries'][0]['origin']['airport']['code'] = df.iloc[i, 0]
    data['itineraries'][0]['destination']['airport']['code'] = df.iloc[i, 1]
    data['itineraries'][0]['departureDate'] = k
    # First request: overview only (result keys: odInfos offers responseId refData).
    first = request_api(url, data, headers)
    if first is None:
        # Network error / non-200: the original crashed with TypeError here.
        return
    result = first["result"]
    if len(result['odInfos']) == 0:
        return
    data['shoppingPreference']['flightPreference']['lowestFare'] = False
    for key, seg_id in enumerate(result['refData']['segments']):
        # One segment per request: append, send, then pop it again below.
        data['itineraries'][0]['segments'].append(result['refData']['segments'][seg_id])
        detail = request_api(url, data, headers)
        data['itineraries'][0]['segments'] = data['itineraries'][0]['segments'][:-1]
        if detail is None:
            continue  # skip this segment instead of killing the whole task
        cabins = detail["result"]['odInfos'][0]['flightInfos'][0]['paxCabins']['ADT1']
        # Detailed cabin information for the adult passenger.
        for j in range(len(cabins)):  # renamed from `i`, which shadowed the route index
            temp = {}
            get_data(cabins[j], temp, result, key)
            # NOTE(review): `index`/`res_dict` are shared across worker threads
            # without a lock — `index += 1` can race; confirm acceptable or
            # guard with threading.Lock.
            res_dict[index] = temp
            index += 1
            print(temp)
#数据提取
def get_data(data, dict, result, i):
    """Copy the fields of interest for one cabin offer into *dict* (in place).

    *data* is a single cabin-offer record, *result* the first-response body,
    and *i* indexes the flight within ``result['odInfos'][0]['flightInfos']``.
    """
    flight = result['odInfos'][0]['flightInfos'][i]
    dict.update({
        'paxSegmentIds': data['paxSegmentIds'][0],
        'arrival_time': flight['arrival']['aircraftScheduledDateTime'],
        'departure_time': flight['departure']['aircraftScheduledDateTime'],
        'baseAmount': data['baseAmount'],
        'totalAmount': data['totalAmount'],
        'cabinCodes': data['cabinCodes'][0],
        'cabinTypes': data['cabinTypes'][0],
        'loyaltyAward_totalAmount': data['loyaltyAward']['totalAmount'],
    })
# Script entry point: run the full scraping pipeline.
if __name__ == '__main__':
spider()