火车车次信息爬取

爬取12306车次信息

之前在 B 站上做了一个爬取 12306 车票信息的小 demo(几乎是快一年前的东西了),有人私信问我能不能爬取某一天的全部车次信息。突然想到十一不知道去哪里玩,12306 又不给力,不能展示所有的车次信息,这次我们就来试着做一下。
在这里插入图片描述
首先,找到存放所有火车站信息的 url(不知道为啥写在 js 里,这玩意不应该放在 json 里吗)。找到的 url:https://www.12306.cn/index/script/core/common/station_name_v10042.js

import requests
import json

NAME_URL = 'https://www.12306.cn/index/script/core/common/station_name_v10042.js'



def get_respose(url, timeout=10):
    """Fetch ``url`` with an HTTP GET and return the response.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the script forever.

    Returns:
        The ``requests.Response`` with its encoding forced to UTF-8, or
        ``None`` when the request fails or the server answers with an HTTP
        error status.
    """
    try:
        r = requests.get(url, timeout=timeout)
        r.raise_for_status()
    except requests.RequestException:
        # Only network/HTTP failures mean "no response"; unrelated errors
        # (typos, KeyboardInterrupt, ...) are no longer silently swallowed.
        return None
    r.encoding = 'utf-8'
    return r
    
def get_station_name_list():
    """Download the station-name script and print its first 1000 characters.

    Raises:
        RuntimeError: If the script at ``NAME_URL`` could not be downloaded.
    """
    r = get_respose(NAME_URL)
    if r is None:
        # get_respose reports failure as None; fail with a clear message
        # instead of an AttributeError on ``None.text``.
        raise RuntimeError('failed to download station list from ' + NAME_URL)
    print(r.text[:1000])
    
if __name__ == "__main__":
    # Script entry point: dump the beginning of the station file so its
    # format can be inspected by eye.
    get_station_name_list()

爬完之后发现这东西挺有意思,用 @ 作分隔符
在这里插入图片描述
我们解析一下,存好各类信息,可以发现一共有 2881 个车站(我们的祖国真的强大啊)

import requests
import json
import re

NAME_URL = 'https://www.12306.cn/index/script/core/common/station_name_v10042.js'



def get_respose(url, timeout=10):
    """Fetch ``url`` with an HTTP GET and return the response.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the script forever.

    Returns:
        The ``requests.Response`` with its encoding forced to UTF-8, or
        ``None`` when the request fails or the server answers with an HTTP
        error status.
    """
    try:
        r = requests.get(url, timeout=timeout)
        r.raise_for_status()
    except requests.RequestException:
        # Only network/HTTP failures mean "no response"; unrelated errors
        # (typos, KeyboardInterrupt, ...) are no longer silently swallowed.
        return None
    r.encoding = 'utf-8'
    return r
    
def get_station_name_list():
    """Download the station script and return its '@'-separated entries.

    The script at ``NAME_URL`` carries one single-quoted string; that string
    is split on '@' and the non-empty pieces are returned unparsed.

    Returns:
        A list of raw station entry strings.

    Raises:
        RuntimeError: If the download fails or no quoted payload is found.
    """
    r = get_respose(NAME_URL)
    if r is None:
        # get_respose reports failure as None.
        raise RuntimeError('failed to download station list from ' + NAME_URL)
    match = re.search(r"'(.*?)'", r.text)
    if match is None:
        raise RuntimeError('no quoted station data found in ' + NAME_URL)
    # Dropping empty pieces removes the blank entry produced by the leading
    # '@' without blindly cutting the first character of the payload.
    return [entry for entry in match.group(1).split('@') if entry]
    
    
if __name__ == "__main__":
    # Script entry point: fetch the raw station entries, then report how
    # many stations the file contains.
    station_list = get_station_name_list()
    print(len(station_list))

我们点击查询的时候,会发现 url 上多出了几个字段,分别包含出发地和目的地的名称,这显然是用字段拼接构造出来的 url
在这里插入图片描述
然后我们找到 12306 放置车次信息的 url
在这里插入图片描述
这里有个坑:网站会根据请求头的 cookie 字段判断你是不是机器人(robot),所以要自己加上 cookie。cookie 就自己构造吧,内部字段随便写(我要是 12306 就再做一个 cookie 的正则匹配,防止这种情况)

import requests
import json
import re

NAME_URL = 'https://www.12306.cn/index/script/core/common/station_name_v10042.js'
TICKET_URL = 'https://kyfw.12306.cn/otn/leftTicket/query'
HEADERS = {'Cookie': 'RAIL_EXPIRATION=0; RAIL_DEVICEID=a; _jc_save_fromStation=a; _jc_save_toStation=a; _jc_save_fromDate=0; _jc_save_toDate=0; _jc_save_wfdc_flag=a'}

def get_respose(url, timeout=10):
    """Fetch ``url`` with an HTTP GET and return the response.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the script forever.

    Returns:
        The ``requests.Response`` with its encoding forced to UTF-8, or
        ``None`` when the request fails or the server answers with an HTTP
        error status.
    """
    try:
        r = requests.get(url, timeout=timeout)
        r.raise_for_status()
    except requests.RequestException:
        # Only network/HTTP failures mean "no response"; unrelated errors
        # (typos, KeyboardInterrupt, ...) are no longer silently swallowed.
        return None
    r.encoding = 'utf-8'
    return r
    
def get_station_name_list():
    """Download the station script and return its '@'-separated entries.

    The script at ``NAME_URL`` carries one single-quoted string; that string
    is split on '@' and the non-empty pieces are returned unparsed.

    Returns:
        A list of raw station entry strings.

    Raises:
        RuntimeError: If the download fails or no quoted payload is found.
    """
    r = get_respose(NAME_URL)
    if r is None:
        # get_respose reports failure as None.
        raise RuntimeError('failed to download station list from ' + NAME_URL)
    match = re.search(r"'(.*?)'", r.text)
    if match is None:
        raise RuntimeError('no quoted station data found in ' + NAME_URL)
    # Dropping empty pieces removes the blank entry produced by the leading
    # '@' without blindly cutting the first character of the payload.
    return [entry for entry in match.group(1).split('@') if entry]
    
def get_left_ticket(date, from_station, to_station, al_or_st='ADULT', timeout=10):
    """Query the remaining-ticket endpoint and print the decoded JSON reply.

    Args:
        date: Travel date sent as ``leftTicketDTO.train_date``
            (the script passes e.g. ``'2019-10-14'``).
        from_station: Departure station code (e.g. ``'BJP'``).
        to_station: Arrival station code (e.g. ``'HBB'``).
        al_or_st: Value for ``purpose_codes``; chooses between student and
            adult tickets and defaults to adult (``'ADULT'``).
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.RequestException: On network failure, timeout or an HTTP
            error status.
        ValueError: If the response body is not valid JSON.
    """
    params = {
        'leftTicketDTO.train_date': date,
        'leftTicketDTO.from_station': from_station,
        'leftTicketDTO.to_station': to_station,
        'purpose_codes': al_or_st,
    }
    r = requests.get(TICKET_URL, headers=HEADERS, params=params, timeout=timeout)
    # Surface HTTP errors as such instead of as a confusing JSON decode error.
    r.raise_for_status()
    # Named ``payload`` so the imported ``json`` module is not shadowed.
    payload = r.json()
    print(payload)
if __name__ == "__main__":
    # Script entry point: load the station table, then run one sample query
    # between two station codes.
    station_list = get_station_name_list()
    get_left_ticket(date='2019-10-14', from_station='BJP', to_station='HBB')

华丽丽丽的分割线-------------------------
(机器学习搞的我头晕,模式识别那边又想不到项目的落地点,这一天好烦哦)
具体的url提取和信息筛选我不写了,这里直接给代码

# -*- coding: utf-8 -*-
"""
Created on Sun Oct 13 14:47:11 2019

@author: wwe
"""

import requests
import json
import re

NAME_URL = 'https://www.12306.cn/index/script/core/common/station_name_v10042.js'
TICKET_URL = 'https://kyfw.12306.cn/otn/leftTicket/query'
PRICE_URL = 'https://kyfw.12306.cn/otn/leftTicket/queryTicketPrice'
HEADERS = {'Cookie': 'RAIL_EXPIRATION=0; RAIL_DEVICEID=a; _jc_save_fromStation=a; _jc_save_toStation=a; _jc_save_fromDate=0; _jc_save_toDate=0; _jc_save_wfdc_flag=a'}

def get_respose(url, timeout=10):
    """Fetch ``url`` with an HTTP GET and return the response.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the script forever.

    Returns:
        The ``requests.Response`` with its encoding forced to UTF-8, or
        ``None`` when the request fails or the server answers with an HTTP
        error status.
    """
    try:
        r = requests.get(url, timeout=timeout)
        r.raise_for_status()
    except requests.RequestException:
        # Only network/HTTP failures mean "no response"; unrelated errors
        # (typos, KeyboardInterrupt, ...) are no longer silently swallowed.
        return None
    r.encoding = 'utf-8'
    return r
    
def get_station_name_list():
    """Download and parse the 12306 station table.

    The script at ``NAME_URL`` embeds one single-quoted string of
    '@'-separated entries, each made of '|'-separated fields. The first six
    fields of every entry are read as
    ``short_, name, tag, pinyin, short, station_no``.

    Returns:
        A ``(stations, station_list)`` tuple. ``stations`` maps each station
        name to ``[short_, tag, pinyin, short, station_no]``;
        ``station_list`` holds ``[name, short_, tag, pinyin, short,
        station_no]`` rows in file order.

    Raises:
        RuntimeError: If the download fails or no quoted payload is found.
    """
    r = get_respose(NAME_URL)
    if r is None:
        # get_respose reports failure as None.
        raise RuntimeError('failed to download station list from ' + NAME_URL)
    match = re.search(r"'(.*?)'", r.text)
    if match is None:
        raise RuntimeError('no quoted station data found in ' + NAME_URL)

    stations = {}
    station_list = []
    for entry in match.group(1).split('@'):
        fields = entry.split('|')
        if len(fields) < 6:
            # Skips the empty piece in front of the leading '@' as well as
            # any truncated entry, instead of crashing on the unpack below.
            continue
        # Extra trailing fields (if the file format grows) are ignored.
        short_, name, tag, pinyin, short, station_no = fields[:6]
        stations[name] = [short_, tag, pinyin, short, station_no]
        station_list.append([name, short_, tag, pinyin, short, station_no])
    return stations, station_list
    
def get_left_ticket(date, from_station, to_station, al_or_st='ADULT', timeout=10):
    """Query the remaining-ticket endpoint for one station pair.

    Args:
        date: Travel date sent as ``leftTicketDTO.train_date``
            (the script passes e.g. ``'2019-10-30'``).
        from_station: Departure station code.
        to_station: Arrival station code.
        al_or_st: Value for ``purpose_codes``; chooses between student and
            adult tickets and defaults to adult (``'ADULT'``).
        timeout: Seconds to wait for the server before giving up.

    Returns:
        One list per train: the '|'-separated fields of the result row with
        the first two fields dropped. Empty when the reply carries no
        ``data``/``result``.

    Raises:
        requests.RequestException: On network failure, timeout or an HTTP
            error status.
        ValueError: If the response body is not valid JSON.
    """
    params = {
        'leftTicketDTO.train_date': date,
        'leftTicketDTO.from_station': from_station,
        'leftTicketDTO.to_station': to_station,
        'purpose_codes': al_or_st,
    }
    r = requests.get(TICKET_URL, headers=HEADERS, params=params, timeout=timeout)
    # Surface HTTP errors as such instead of as a confusing JSON decode error.
    r.raise_for_status()
    payload = r.json()
    # A reply without data/result is treated as "no trains" so one odd
    # station pair does not abort a crawl over every station.
    rows = (payload.get('data') or {}).get('result') or []
    return [row.split('|')[2:] for row in rows]

def get_ticket_price(train_no, from_station_no, to_station_no, seat_types, train_date, timeout=10):
    """Query the ticket-price endpoint for one train segment.

    Each argument is forwarded unchanged as the query parameter of the same
    name.

    Args:
        train_no: Train identifier taken from a ticket query row.
        from_station_no: Departure stop number from the same row.
        to_station_no: Arrival stop number from the same row.
        seat_types: Seat-type string from the same row.
        train_date: Travel date (the script passes e.g. ``'2019-10-30'``).
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON reply.

    Raises:
        requests.RequestException: On network failure, timeout or an HTTP
            error status.
        ValueError: If the response body is not valid JSON.
    """
    params = {
        "train_no": train_no,
        "from_station_no": from_station_no,
        "to_station_no": to_station_no,
        "seat_types": seat_types,
        "train_date": train_date,
    }
    r = requests.get(PRICE_URL, headers=HEADERS, params=params, timeout=timeout)
    # Surface HTTP errors as such instead of as a confusing JSON decode error.
    r.raise_for_status()
    return r.json()

    

if __name__ == '__main__':
    # Script entry point: for a user-chosen departure station, query every
    # other station on a fixed date and print trains, remaining seats and
    # prices.
    import time  # only the crawl loop needs it

    # Seconds to pause after every request so the crawl does not put
    # pressure on the 12306 servers.
    REQUEST_DELAY = 1

    # (printed label, index into ticket_info, key in the price payload)
    # NOTE(review): 硬座 reads the same 'WZ' price key as 无座, exactly as the
    # original code did - confirm whether a dedicated key was intended.
    SEAT_FIELDS = [
        ("商务", 30, 'A9'),
        ("一等座", 29, 'M'),
        ("二等座", 28, 'O'),
        ("高级软卧", 19, 'A6'),
        ("软卧", 21, 'A4'),
        ("硬卧", 26, 'A3'),
        ("硬座", 27, 'WZ'),
        ("无座", 24, 'WZ'),
    ]

    stations, station_rows = get_station_name_list()
    from_station = input("起始位置:")
    if from_station not in stations:
        # Clear message instead of a bare KeyError for a mistyped name.
        raise SystemExit("未找到车站: " + from_station)
    _, ftag, _, _, _ = stations[from_station]
    date = '2019-10-30'

    for name, _, ttag, _, _, _ in station_rows:
        if name == from_station:
            continue
        print('到达车站', name)
        all_info = get_left_ticket(date, ftag, ttag)
        time.sleep(REQUEST_DELAY)
        for ticket_info in all_info:
            # ``from_station`` is deliberately not reassigned in this loop:
            # the departure filter above must keep comparing against the
            # name the user typed.
            train_no = ticket_info[0]
            train_name = ticket_info[1]
            from_time = ticket_info[6]
            to_time = ticket_info[7]
            from_station_no = ticket_info[14]
            to_station_no = ticket_info[15]
            seat_types = ticket_info[33]

            r = get_ticket_price(train_no, from_station_no, to_station_no, seat_types, date)
            time.sleep(REQUEST_DELAY)
            # A missing price table or seat key prints an empty price
            # instead of aborting the whole crawl.
            prices = r.get('data') or {}

            print('车次', train_name)
            print("出发时间", from_time)
            print("到达时间", to_time)
            for label, index, price_key in SEAT_FIELDS:
                remaining = ticket_info[index]
                # Skip seat classes that are absent ("") or sold out ("无").
                if remaining and remaining != "无":
                    print(label, remaining, prices.get(price_key, ''))
            print("-----------------------------------------")

    
    
    

请注意!如果有缘让你看到了这份代码,一定一定记得在循环里增加延时暂停,别给人家服务器造成压力!正所谓爬虫也要盗亦有道!
在这里插入图片描述

  • 6
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值