爬取12306车次信息
之前在b站上做了一个爬取12306车票信息的小demo,有人私信我问能不能爬取某一天的全部车次信息(差不多是一年前的东西了),突然想到十一不知道去哪里玩,12306又不给力,不能展示所有的车次信息,这次我们就来做一下试试
首先,找到所有火车站的信息url(不知道为啥写在js上,这玩意不应该写在json上吗)找到url:https://www.12306.cn/index/script/core/common/station_name_v10042.js
import requests
import json
# URL of the JavaScript file that holds 12306's station-name table.
NAME_URL = 'https://www.12306.cn/index/script/core/common/station_name_v10042.js'
def get_respose(url, timeout=10):
    """Fetch ``url`` with an HTTP GET and return the response decoded as UTF-8.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The ``requests.Response`` on success, or ``None`` when the request
        fails or the server answers with an error status.
    """
    try:
        r = requests.get(url, timeout=timeout)
        r.raise_for_status()
    except requests.RequestException:
        # Swallow only network/HTTP failures; a bare ``except`` would also
        # hide KeyboardInterrupt and genuine programming errors.
        return None
    r.encoding = 'utf-8'
    return r
def get_station_name_list():
    """Download the station-name script and print its first 1000 characters.

    Raises:
        RuntimeError: If the download fails.
    """
    r = get_respose(NAME_URL)
    if r is None:
        # get_respose reports every request failure as None.
        raise RuntimeError('failed to download station list: ' + NAME_URL)
    print(r.text[:1000])
# Entry point: run the station-file download demo when executed as a script.
if __name__ == "__main__":
    get_station_name_list()
爬完之后发现这东西挺有意思,用@做的分割符
我们解析一下,存好各类信息,可以发现一共有2881个车站(我们的祖国真的强大啊)
import requests
import json
import re
# URL of the JavaScript file that holds 12306's station-name table.
NAME_URL = 'https://www.12306.cn/index/script/core/common/station_name_v10042.js'
def get_respose(url, timeout=10):
    """Fetch ``url`` with an HTTP GET and return the response decoded as UTF-8.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The ``requests.Response`` on success, or ``None`` when the request
        fails or the server answers with an error status.
    """
    try:
        r = requests.get(url, timeout=timeout)
        r.raise_for_status()
    except requests.RequestException:
        # Swallow only network/HTTP failures; a bare ``except`` would also
        # hide KeyboardInterrupt and genuine programming errors.
        return None
    r.encoding = 'utf-8'
    return r
def get_station_name_list():
    """Download 12306's station table and split it into raw station records.

    The text between the first pair of single quotes in the script is taken
    as the payload and split on the ``@`` separator.

    Returns:
        A list of raw station records, one string per station.

    Raises:
        RuntimeError: If the download fails or no quoted payload is found.
    """
    r = get_respose(NAME_URL)
    if r is None:
        # get_respose reports every request failure as None.
        raise RuntimeError('failed to download station list: ' + NAME_URL)
    match = re.search(r'\'(.*?)\'', r.text)
    if match is None:
        raise RuntimeError('station list payload not found in response')
    # The payload begins with a separator, so drop the first character
    # before splitting; empty fragments are discarded.
    return [record for record in match.group(1)[1:].split('@') if record]
# Entry point: report how many station records were parsed.
if __name__ == "__main__":
    records = get_station_name_list()
    record_count = len(records)
    print(record_count)
我们点击查询的时候,会发现url上出现了几个字段,分别包含出发地和目的地的名称,这显然是通过字段构造的url
然后我们找到12306放置车次信息的url
这里有个坑,网站会根据请求头的cookie字段判断你是不是robot,所以要自己加上,cookie就自己构造吧,内部字段随便写(我要是12306就再做一个cookie的正则匹配,防止这种情况)
import requests
import json
import re
# URL of the JavaScript file that holds 12306's station-name table.
NAME_URL = 'https://www.12306.cn/index/script/core/common/station_name_v10042.js'
# Endpoint queried for remaining-ticket information.
TICKET_URL = 'https://kyfw.12306.cn/otn/leftTicket/query'
# Request headers carrying a hand-made Cookie filled with placeholder values.
# NOTE(review): per the accompanying article the site inspects the Cookie
# header to detect robots -- confirm these fields are still sufficient.
HEADERS = {'Cookie': 'RAIL_EXPIRATION=0; RAIL_DEVICEID=a; _jc_save_fromStation=a; _jc_save_toStation=a; _jc_save_fromDate=0; _jc_save_toDate=0; _jc_save_wfdc_flag=a'}
def get_respose(url, timeout=10):
    """Fetch ``url`` with an HTTP GET and return the response decoded as UTF-8.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The ``requests.Response`` on success, or ``None`` when the request
        fails or the server answers with an error status.
    """
    try:
        r = requests.get(url, timeout=timeout)
        r.raise_for_status()
    except requests.RequestException:
        # Swallow only network/HTTP failures; a bare ``except`` would also
        # hide KeyboardInterrupt and genuine programming errors.
        return None
    r.encoding = 'utf-8'
    return r
def get_station_name_list():
    """Download 12306's station table and split it into raw station records.

    The text between the first pair of single quotes in the script is taken
    as the payload and split on the ``@`` separator.

    Returns:
        A list of raw station records, one string per station.

    Raises:
        RuntimeError: If the download fails or no quoted payload is found.
    """
    r = get_respose(NAME_URL)
    if r is None:
        # get_respose reports every request failure as None.
        raise RuntimeError('failed to download station list: ' + NAME_URL)
    match = re.search(r'\'(.*?)\'', r.text)
    if match is None:
        raise RuntimeError('station list payload not found in response')
    # The payload begins with a separator, so drop the first character
    # before splitting; empty fragments are discarded.
    return [record for record in match.group(1)[1:].split('@') if record]
def get_left_ticket(date, from_station, to_station, al_or_st='ADULT', timeout=10):
    """Query the left-ticket endpoint for one route and print the decoded JSON.

    Args:
        date: Travel date string, e.g. ``'2019-10-14'``.
        from_station: Departure station code, e.g. ``'BJP'``.
        to_station: Arrival station code, e.g. ``'HBB'``.
        al_or_st: Ticket type sent as ``purpose_codes``: adult or student
            ticket; defaults to adult.
        timeout: Seconds to wait for the server before giving up.
    """
    params = {
        'leftTicketDTO.train_date': date,
        'leftTicketDTO.from_station': from_station,
        'leftTicketDTO.to_station': to_station,
        'purpose_codes': al_or_st,
    }
    r = requests.get(TICKET_URL, headers=HEADERS, params=params, timeout=timeout)
    # Named ``payload`` so the imported ``json`` module is not shadowed.
    payload = r.json()
    print(payload)
# Entry point: load the station records, then query one sample route
# (BJP -> HBB on 2019-10-14) and let get_left_ticket print the result.
if __name__ == "__main__":
    stations = get_station_name_list()
    get_left_ticket(date='2019-10-14', from_station='BJP', to_station='HBB')
华丽丽丽的分割线-------------------------
(机器学习搞的我头晕,模式识别那边又想不到项目的落地点,这一天好烦哦)
具体的url提取和信息筛选我不写了,这里直接给代码
# -*- coding: utf-8 -*-
"""
Created on Sun Oct 13 14:47:11 2019
@author: wwe
"""
import requests
import json
import re
# URL of the JavaScript file that holds 12306's station-name table.
NAME_URL = 'https://www.12306.cn/index/script/core/common/station_name_v10042.js'
# Endpoint queried for remaining-ticket information.
TICKET_URL = 'https://kyfw.12306.cn/otn/leftTicket/query'
# Endpoint queried for the ticket prices of a single train.
PRICE_URL = 'https://kyfw.12306.cn/otn/leftTicket/queryTicketPrice'
# Request headers carrying a hand-made Cookie filled with placeholder values.
# NOTE(review): per the accompanying article the site inspects the Cookie
# header to detect robots -- confirm these fields are still sufficient.
HEADERS = {'Cookie': 'RAIL_EXPIRATION=0; RAIL_DEVICEID=a; _jc_save_fromStation=a; _jc_save_toStation=a; _jc_save_fromDate=0; _jc_save_toDate=0; _jc_save_wfdc_flag=a'}
def get_respose(url, timeout=10):
    """Fetch ``url`` with an HTTP GET and return the response decoded as UTF-8.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The ``requests.Response`` on success, or ``None`` when the request
        fails or the server answers with an error status.
    """
    try:
        r = requests.get(url, timeout=timeout)
        r.raise_for_status()
    except requests.RequestException:
        # Swallow only network/HTTP failures; a bare ``except`` would also
        # hide KeyboardInterrupt and genuine programming errors.
        return None
    r.encoding = 'utf-8'
    return r
def get_station_name_list():
    """Download 12306's station table and index it by station name.

    The text between the first pair of single quotes in the script is taken
    as the payload, split into records on ``@`` and into fields on ``|``.

    Returns:
        A ``(stations, station_list)`` tuple. ``stations`` maps a station
        name to ``[short_, tag, pinyin, short, station_no]``;
        ``station_list`` holds ``[name, short_, tag, pinyin, short,
        station_no]`` rows in the order they appear in the payload.

    Raises:
        RuntimeError: If the download fails or no quoted payload is found.
    """
    r = get_respose(NAME_URL)
    if r is None:
        # get_respose reports every request failure as None.
        raise RuntimeError('failed to download station list: ' + NAME_URL)
    match = re.search(r'\'(.*?)\'', r.text)
    if match is None:
        raise RuntimeError('station list payload not found in response')

    stations = {}
    station_list = []
    # The payload begins with a separator, so drop the first character
    # before splitting.
    for station in match.group(1)[1:].split('@'):
        fields = station.split("|")
        if len(fields) < 6:
            # Skip blank or truncated records instead of failing to unpack.
            continue
        # Only the first six fields are used; extra trailing fields are
        # tolerated.
        short_, name, tag, pinyin, short, station_no = fields[:6]
        stations[name] = [short_, tag, pinyin, short, station_no]
        station_list.append([name, short_, tag, pinyin, short, station_no])
    return stations, station_list
def get_left_ticket(date, from_station, to_station, al_or_st='ADULT', timeout=10):
    """Query the left-ticket endpoint for one route and return its train rows.

    Args:
        date: Travel date string, e.g. ``'2019-10-30'``.
        from_station: Departure station code.
        to_station: Arrival station code.
        al_or_st: Ticket type sent as ``purpose_codes``: adult or student
            ticket; defaults to adult.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list with one entry per train. Each entry is the ``|``-separated
        result string split into fields, with the first two fields dropped.
        Empty when the response carries no ``data.result`` list.
    """
    params = {
        'leftTicketDTO.train_date': date,
        'leftTicketDTO.from_station': from_station,
        'leftTicketDTO.to_station': to_station,
        'purpose_codes': al_or_st,
    }
    r = requests.get(TICKET_URL, headers=HEADERS, params=params, timeout=timeout)
    payload = r.json()
    # Responses without the expected {'data': {'result': [...]}} shape are
    # treated as "no trains" rather than raising KeyError/TypeError.
    data = payload.get('data') if isinstance(payload, dict) else None
    results = data.get('result') if isinstance(data, dict) else None
    return [info.split('|')[2:] for info in results or []]
def get_ticket_price(train_no, from_station_no, to_station_no, seat_types,
                     train_date, timeout=10):
    """Query the ticket-price endpoint for one train and return the JSON.

    Args:
        train_no: Sent as the ``train_no`` query parameter.
        from_station_no: Sent as the ``from_station_no`` query parameter.
        to_station_no: Sent as the ``to_station_no`` query parameter.
        seat_types: Sent as the ``seat_types`` query parameter.
        train_date: Sent as the ``train_date`` query parameter.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON body of the response.
    """
    params = {
        "train_no": train_no,
        "from_station_no": from_station_no,
        "to_station_no": to_station_no,
        "seat_types": seat_types,
        "train_date": train_date,
    }
    r = requests.get(PRICE_URL, headers=HEADERS, params=params, timeout=timeout)
    return r.json()
def _has_tickets(remaining):
    """Return True when an availability field reports purchasable seats."""
    return remaining is not None and remaining != "" and remaining != "无"


def main():
    """Print trains, seat availability and prices from one station to all others.

    Asks for the departure station name, then queries every other station
    in the station table for a fixed date and prints each train found.
    """
    import time  # local import: only this script driver needs it

    # Pause between consecutive requests so the scan does not hammer the
    # server.
    request_delay = 1.0

    # (printed label, index into a ticket row, key in the price payload).
    # NOTE(review): 'A1' as the hard-seat price key replaces the original
    # 'WZ' (the no-seat key, apparently copy-pasted) -- confirm against a
    # live response.
    seat_classes = [
        ("商务", 30, 'A9'),
        ("一等座", 29, 'M'),
        ("二等座", 28, 'O'),
        ("高级软卧", 19, 'A6'),
        ("软卧", 21, 'A4'),
        ("硬卧", 26, 'A3'),
        ("硬座", 27, 'A1'),
        ("无座", 24, 'WZ'),
    ]

    stations, station_rows = get_station_name_list()
    origin_name = input("起始位置:")
    if origin_name not in stations:
        raise SystemExit("未知车站: " + origin_name)
    origin_code = stations[origin_name][1]
    date = '2019-10-30'

    for name, _, dest_code, _, _, _ in station_rows:
        if name == origin_name:
            continue
        print('到达车站', name)
        all_info = get_left_ticket(date, origin_code, dest_code)
        time.sleep(request_delay)
        for ticket_info in all_info:
            if len(ticket_info) < 34:
                # Row too short to hold the fields read below.
                continue
            train_no = ticket_info[0]
            train_name = ticket_info[1]
            from_time = ticket_info[6]
            to_time = ticket_info[7]
            from_station_no = ticket_info[14]
            to_station_no = ticket_info[15]
            seat_types = ticket_info[33]

            price_payload = get_ticket_price(
                train_no, from_station_no, to_station_no, seat_types, date)
            time.sleep(request_delay)
            data = price_payload.get('data') if isinstance(price_payload, dict) else None
            prices = data if isinstance(data, dict) else {}

            print('车次', train_name)
            print("出发时间", from_time)
            print("到达时间", to_time)
            for label, index, price_key in seat_classes:
                remaining = ticket_info[index]
                if _has_tickets(remaining):
                    # A missing price prints as an empty string instead of
                    # aborting the whole scan with KeyError.
                    print(label, remaining, prices.get(price_key, ''))
            print("-----------------------------------------")


if __name__ == '__main__':
    main()
请注意!如果我们有缘你看到了这份代码,一定一定记得在循环上增加延时暂停,别给人家服务器造成压力!正所谓爬虫的盗亦有道!