Today we'll write a crawler that checks 12306 for remaining train tickets.
First, open 12306 in Chrome and inspect the requests sent during a ticket query.
After clicking the query button, the page navigates to a results page.
Open the element inspector there and refresh the page.
In the Network tab, you'll find the request that returns the train information.
Right-click it and copy it as cURL.
Paste the copied cURL command into Postman.
Click Code, and Postman generates Python code that sends the same request.
Copy that code into your usual IDE (I'm using PyCharm here) and examine the request.
Notice that in this GET request, the departure and destination are given as three-letter city codes, and these codes are defined by 12306 itself.
A lookup table for self-defined codes like these is usually kept in one of the site's JS files.
In 12306's case, the city-to-code table is hidden in the JS file station_name.js.
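That file packs every station into one giant string of '|'-separated records. Abridged, its body looks roughly like this (the short codes and ordering here are illustrative, not exact):

var station_names = '@bjb|北京北|VAP|beijingbei|bjb|0@bjd|北京东|BOP|beijingdong|bjd|1...';

The regex in the function below simply captures every "Chinese name|UPPERCASE code" pair out of that string.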
def get_citycode():
    # URL of the 12306 JS file that maps city names to city codes
    url = 'https://kyfw.12306.cn/otn/resources/js/framework/station_name.js?station_version=1.9018'
    requests.packages.urllib3.disable_warnings()
    res = requests.get(url, verify=False)
    pattern = r'([\u4e00-\u9fa5]+)\|([A-Z]+)'
    res_data = re.findall(pattern, res.text)
    station = dict(res_data)
    # print(station)
    return station
The code above handles the mapping between city names and their city codes.
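As a quick sanity check (the expected values below come straight from the cookies in the captured request, where 北京 maps to BJP and 开封 to KFF):

station = get_citycode()
print(station['北京'])  # BJP
print(station['开封'])  # KFF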
Next, here's the complete code.
# -*- coding: utf-8 -*-
import requests
import re


def get_citycode():
    # URL of the 12306 JS file that maps city names to city codes
    url = 'https://kyfw.12306.cn/otn/resources/js/framework/station_name.js?station_version=1.9018'
    requests.packages.urllib3.disable_warnings()
    res = requests.get(url, verify=False)
    pattern = r'([\u4e00-\u9fa5]+)\|([A-Z]+)'
    res_data = re.findall(pattern, res.text)
    station = dict(res_data)
    # print(station)
    return station


def get_shift_info(city_data, start_city, arrive_city, departure_date):
    start = city_data[start_city]
    arrive = city_data[arrive_city]
    url = "https://kyfw.12306.cn/otn/leftTicket/queryT"
    querystring = {"leftTicketDTO.train_date": departure_date, "leftTicketDTO.from_station": start,
                   "leftTicketDTO.to_station": arrive, "purpose_codes": "ADULT"}
    # Headers copied from the captured request; the Cookie comes from my own
    # session and will expire, so refresh it from your browser if the call fails.
    headers = {
        'Pragma': "no-cache",
        'Sec-Fetch-Site': "same-origin",
        'Accept-Encoding': "gzip, deflate, br",
        'Accept-Language': "zh-CN,zh;q=0.9",
        'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36",
        'Sec-Fetch-Mode': "cors",
        'Accept': "*/*",
        'Cache-Control': "no-cache",
        'X-Requested-With': "XMLHttpRequest",
        'Cookie': "JSESSIONID=1E9640C8A90B1F22E28A099EF398F3A2; _jc_save_fromStation=%u5317%u4EAC%2CBJP; _jc_save_wfdc_flag=dc; ten_js_key=2A%2BPoX%2F3brjJiKPjxa3Z4OtTyvAUGZtB; ten_key=2UCPq3YempVQhOyCKr83AD/HKI9dbxP7; RAIL_EXPIRATION=1567694595905; RAIL_DEVICEID=Ku9FxKj4_L0_UKpMtlU5fF2eVhtkY-6yhqmyE9MYOg4FN89T9vH0tFpkgr159qxtRIBqYe3xM_vRLSXAAoaPak1Qx7O87rGQ2AGfeqCEWZvI-3zCF9zrz9TiXXd9S-1Km3XY6WEoGSJq4y47UwJ05BpcS1udWqk0; BIGipServerpassport=988283146.50215.0000; route=6f50b51faa11b987e576cdb301e545c4; BIGipServerpool_passport=250413578.50215.0000; _jc_save_toDate=2019-09-03; BIGipServerotn=1189609738.24610.0000; _jc_save_toStation=%u5F00%u5C01%2CKFF; _jc_save_fromDate=2019-09-03",
        'Connection': "keep-alive",
        'If-Modified-Since': "0",
        'Referer': "https://kyfw.12306.cn/otn/leftTicket/init?linktypeid=dc&fs=%E5%8C%97%E4%BA%AC,BJP&ts=%E5%BC%80%E5%B0%81,KFF&date=2019-09-03&flag=N,N,Y",
        'Host': "kyfw.12306.cn"
    }
    response = requests.request("GET", url, headers=headers, params=querystring).json()
    return response
# Parse each train into a dict: {"serial_number": '', "ying_zuo": '', "wu_zuo": ''}
def analytical_divisions(banci_list):
    banci_list = banci_list['data']['result']
    # print(banci_list)
    trains = []
    for banci in banci_list:
        item = banci.split('|')
        trains.append(item)
    zuowei_list = []
    for zuowei in trains:
        # Field positions follow my captured response; 12306 has changed this
        # '|'-separated format over time, so verify the indices against yours.
        zuowei_dict = {
            "serial_number": zuowei[3],  # train number
            "ying_zuo": zuowei[26],      # hard seat
            "wu_zuo": zuowei[29]         # no seat (standing)
        }
        zuowei_list.append(zuowei_dict)
    return zuowei_list
if __name__ == '__main__':
    departure_date = '2019-09-30'  # required format: YYYY-MM-DD, e.g. 2019-09-03
    start_city = '北京'
    arrive_city = '秦皇岛'
    city_data = get_citycode()
    banci_list = get_shift_info(city_data, start_city, arrive_city, departure_date)
    a = analytical_divisions(banci_list)
    print(a)
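When the request succeeds, this prints a list of dicts, one per train. As a small extension, here's a sketch that keeps only trains with hard seats still available; it assumes 12306 reports availability as '有' (available), '无' (none), a seat count, or an empty string when a train doesn't offer that class:

for train in a:
    # keep only trains that still have hard seats
    if train['ying_zuo'] not in ('', '无'):
        print(train['serial_number'], train['ying_zuo'])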