Python爬虫自己写项目之:爬取火车站的时刻表和票务信息

目标

给定站点信息(始发站和终点站)和发车日期(格式为 YYYY-MM-DD),得到相应的车次时刻与票务信息表格

代码


from selenium import webdriver
import sys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.select import Select
import re
from lxml import etree
import pandas as pd
from prettytable import PrettyTable




def driver_construction(driver_path=r'C:\Users\dell\Anaconda3\Scripts\chromedriver.exe'):
    """Create and return a Chrome WebDriver.

    Args:
        driver_path: Filesystem path of the chromedriver executable. The
            default is the location that used to be hard-coded in the body,
            so calling this function without arguments behaves as before.

    Returns:
        The object returned by ``webdriver.Chrome(driver_path)``.
    """
    # NOTE(review): the path is passed positionally, as in the original code;
    # newer Selenium releases expect a Service object instead -- confirm
    # against the installed Selenium version.
    return webdriver.Chrome(driver_path)

def parse_argv(argv_lst):
    """Extract the two stations and the travel date from command-line arguments.

    Args:
        argv_lst: Argument list in ``sys.argv`` layout:
            ``[script, departure_station, destination, "YYYY-MM-DD"]``.
            Month and day may be written with or without a leading zero.

    Returns:
        A tuple ``(departure_station, destination, (year, month, day))`` where
        ``year`` is the four-digit year string, ``month`` is the Chinese month
        name (``"一月"`` .. ``"十二月"``) and ``day`` is the day of the month as
        a string without a leading zero.

    Raises:
        ValueError: If fewer than three arguments follow the script name, or
            the date is not ``YYYY-MM-DD`` with month 1-12 and day 1-31.
    """
    if len(argv_lst) < 4:
        raise ValueError(
            "expected arguments: <departure_station> <destination> <YYYY-MM-DD>"
        )
    departure_station, destination, departure_time = argv_lst[1:4]

    time_match = re.fullmatch(r'([0-9]{4})-([0-9]{1,2})-([0-9]{1,2})', departure_time)
    if time_match is None:
        raise ValueError(
            "departure date must look like YYYY-MM-DD, got %r" % departure_time
        )
    year = time_match.group(1)
    month_number = int(time_match.group(2))
    day_number = int(time_match.group(3))
    if not 1 <= month_number <= 12:
        raise ValueError("month out of range in %r" % departure_time)
    if not 1 <= day_number <= 31:
        raise ValueError("day out of range in %r" % departure_time)

    # Month names in the form returned to the caller; index 0 is January.
    month_names = (
        "一月", "二月", "三月", "四月", "五月", "六月",
        "七月", "八月", "九月", "十月", "十一月", "十二月",
    )
    month = month_names[month_number - 1]
    # Going through int() drops a leading zero ("05" -> "5").
    day = str(day_number)

    return departure_station, destination, (year, month, day)




def input_information_request_page(driver, departure_station,destination, time):
    """Drive the 12306 home page search form and return the result page HTML.

    The stations and the date are chosen by clicking the page's own picker
    widgets; nothing is typed into the text boxes.

    Args:
        driver: Selenium WebDriver used to control the browser.
        departure_station: Departure station name; matched against the
            ``title`` attribute of an ``<li>`` in the station picker.
        destination: Destination station name; matched the same way.
        time: ``(year, month, day)`` tuple. ``year`` and ``month`` are matched
            against the text of ``<li>`` entries in the date picker; ``day``
            is converted with ``int()`` before being matched against a
            ``<div>``'s text.

    Returns:
        ``driver.page_source`` of the last window in ``driver.window_handles``
        after the search button has been clicked.
    """
    driver.get("https://www.12306.cn/index/")

    year, month, day = time

    # Departure station: open the picker, then click the matching entry.
    driver.find_element(By.ID, "fromStationText").click()
    driver.find_element(By.XPATH, '//li[@title="%s"]' % departure_station).click()

    # Destination station: same two-step selection.
    driver.find_element(By.ID, "toStationText").click()
    driver.find_element(By.XPATH, '//li[@title="%s"]' % destination).click()

    # Travel date: open the calendar, pick the year, then the month, then the day.
    driver.find_element(By.ID, "train_date").click()
    driver.find_element(By.XPATH, '//div[@class="year"]//input[@type="text"]').click()
    driver.find_element(By.XPATH, '//li[text()="%s"]' % year).click()
    driver.find_element(By.XPATH, '//div[@class="month"]//input[@type="text"]').click()
    driver.find_element(By.XPATH, '//li[text()="%s"]' % month).click()
    driver.find_element(By.XPATH, '//div[text()=%d]' % int(day)).click()

    # Click the search button.
    driver.find_element(By.ID, "search_one").click()

    # Move the driver to the most recently listed window before reading it.
    driver.switch_to.window(driver.window_handles[-1])

    # NOTE(review): the source is read immediately after switching windows;
    # if the result rows load asynchronously an explicit wait may be needed.
    return driver.page_source


def attain_infos_from_page_source(page_source):
    """Collect one row of text cells per train number found in the page HTML.

    Args:
        page_source: HTML text of the search result page.

    Returns:
        A dict mapping each train number to a list of 18 strings (spaces
        removed, left-justified to width 6). Train numbers whose combined
        cell count is not exactly 18 are left out. The extra key ``"title"``
        holds a tuple of the 18 column names, padded the same way.
    """
    tree = etree.HTML(page_source)
    train_numbers = tree.xpath('//a[@title="点击查看停靠站信息"]/text()')
    print(train_numbers)

    rows = {}
    for number in train_numbers:
        head = tree.xpath('//tr[contains(@id,"%s")]//text()' % number)
        if head:
            # Discard the text node at index 1 twice, then keep the first
            # seven of what is left.
            head.pop(1)
            head.pop(1)
            head = head[:7]
        seats = list(tree.xpath('//td[contains(@hbid,"%s")]//text()' % number))

        cells = [text.replace(' ', '').ljust(6) for text in head + seats]
        if len(cells) != 18:
            # Only rows with the full set of 18 cells fit the table header.
            continue
        rows[number] = cells

    headers = [
        "车次", "出发站", "到达站", "出发时间", "到达时间", "历时", "到达状态",
        "商务座", "一等座", "二等座", "高级软卧", "软卧", "动卧", "硬卧",
        "软座", "硬座", "无座", "其他",
    ]
    rows["title"] = tuple(name.ljust(6) for name in headers)
    return rows



def print_form(diction):
    """Build a PrettyTable from the scraped rows.

    Args:
        diction: Mapping whose ``"title"`` entry holds the column names and
            whose every other value is one table row.

    Returns:
        A ``PrettyTable`` whose field names are ``diction["title"]`` and which
        contains one row per remaining entry, in the dict's iteration order.

    Raises:
        KeyError: If ``diction`` has no ``"title"`` entry.
    """
    tab = PrettyTable()
    tab.field_names = diction["title"]
    # Skip the header entry while iterating instead of deleting it, so the
    # caller's dict is left unmodified.
    for key, value in diction.items():
        if key != "title":
            tab.add_row(value)
    return tab



if __name__ == '__main__':
    argv_lst = sys.argv
    # Parse the arguments first so that bad input fails before a browser
    # window is launched.
    departure_station, destination, time = parse_argv(argv_lst)
    print(departure_station, destination, time)

    driver = driver_construction()
    try:
        page_source = input_information_request_page(driver, departure_station, destination, time)
    finally:
        # Always close the browser session, even if the page interaction
        # raised, so no browser/chromedriver process is left behind.
        driver.quit()

    diction = attain_infos_from_page_source(page_source)
    tab = print_form(diction)
    print(tab)
    # Also save the rendered table as UTF-8 text next to the script's
    # working directory.
    with open('table.txt','w',encoding='utf-8') as f:
        f.write(str(tab))

结果

在这里插入图片描述

评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

暖仔会飞

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值