from selenium import webdriver
from lxml import etree
import time
# Start a Chrome browser session. This runs at import time; the resulting
# module-level `driver` is the handle sendKey() uses for every interaction.
driver = webdriver.Chrome()
# Address of the 12306 ticket query page that the script automates.
url = 'https://kyfw.12306.cn/otn/leftTicket/init'
# Load the page so the search form exists before sendKey() starts clicking.
driver.get(url)
def sendKey():
    """Simulate a ticket search in the browser and pass the result to the parser.

    Uses the module-level ``driver``: clicks through the departure-station,
    destination-station and travel-date pickers, submits the query, waits for
    the result table to load and finally hands ``driver.page_source`` to
    ``spider_page``.

    Elements are located with ``driver.find_element("xpath", ...)`` because
    the ``find_element_by_xpath`` shortcut was removed in Selenium 4.3; the
    generic form is accepted by both Selenium 3 and Selenium 4.

    Nothing is caught here, so any exception raised by the selenium calls
    (for example when an element cannot be found) propagates to the caller.
    """
    # Each entry is (xpath to click, seconds to pause afterwards). The fixed
    # pauses give the page's pop-up widgets time to render before the next
    # click; the order of the steps matters.
    steps = (
        # Open the departure-station picker.
        ('//*[@id="fromStationText"]', 1),
        # First entry of the hot-city list (Beijing, per the original note).
        ('//*[@id="ul_list1"]/li[1]', 1),
        # Open the destination-station picker.
        ('//*[@id="toStationText"]', 1),
        # Second entry of the hot-city list (Shanghai, per the original note).
        ('//*[@id="ul_list1"]/li[2]', 1.1),
        # Open the travel-date calendar.
        ('//*[@id="train_date"]', 1),
        # Calendar cell for the 23rd. NOTE(review): this absolute path is
        # tied to the current page layout and to a hard-coded day.
        ('/html/body/div[34]/div[1]/div[2]/div[23]/div', 1.1),
        # Submit the query; the longer pause waits for the results to load.
        ('//*[@id="query_ticket"]', 4),
    )
    for xpath, pause in steps:
        # "xpath" is the locator strategy string (the value of By.XPATH).
        driver.find_element("xpath", xpath).click()
        time.sleep(pause)

    # Hand the rendered page source over to the parser.
    spider_page(driver.page_source)
def _first_text(node, path, default=''):
    """Return the first non-blank text matched by *path* under *node*.

    The text is stripped of surrounding whitespace. *default* is returned
    when the XPath matches nothing (or only blank text), so a missing node
    no longer raises IndexError.
    """
    for text in node.xpath(path):
        text = text.strip()
        if text:
            return text
    return default


def _seat_text(row, column, default='--'):
    """Return the availability text shown in ``td[column]`` of a result row.

    The value may sit directly inside the cell or inside a nested ``div``;
    the direct text is preferred and the ``div`` text is the fallback.
    *default* is used when the cell carries no text at all.
    """
    cell = './td[%d]' % column
    direct = _first_text(row, cell + '/text()')
    if direct:
        return direct
    return _first_text(row, cell + '/div/text()', default)


def spider_page(html):
    """Parse the query-result page and print one line per train.

    Args:
        html: Page source of the result page as a string.

    For every row of the ``queryLeftTable`` table that contains a train name
    the function prints the train, departure/arrival stations, departure and
    arrival times, and the availability of the three seat classes read from
    columns 2-4. Rows without a train name are skipped, because the table
    also contains auxiliary ``tr`` elements.

    Seat availability is read as the actual cell text instead of being
    guessed from the length of a list's repr, so a cell showing e.g. ``--``
    is no longer reported as ``候补``, and a missing node yields a
    placeholder instead of aborting the whole run with IndexError.
    """
    tree = etree.HTML(html)  # parse the page
    for row in tree.xpath('//*[@id="queryLeftTable"]/tr'):
        # Train name(s); kept as the list of matches, as originally printed.
        train = row.xpath('./td/div/div/div/a/text()')
        if not train:
            # Not a train row (the table interleaves two kinds of tr).
            continue

        star = _first_text(row, './td/div/div[2]/strong[1]/text()')  # departure station
        end = _first_text(row, './td/div/div[2]/strong[2]/text()')  # arrival station

        # Departure time, tagged as leaving on the queried day.
        starTime = _first_text(row, './td/div/div[3]/strong[1]/text()') + "(当天出发)"

        # Arrival time followed by the arrival-day note shown next to it.
        endTime = _first_text(row, './td/div/div[3]/strong[2]/text()')
        dayTime = _first_text(row, './td/div/div[4]/span/text()')
        arrive = endTime + ("(" + dayTime + ")")

        # Seat classes, one table column each.
        best = "特等座--" + _seat_text(row, 2)
        frist = "一等座--" + _seat_text(row, 3)
        second = "二等座--" + _seat_text(row, 4)

        print(train, "\t", star, "---", end, "\t", starTime, "---", arrive,
              "\t", best, "\t", frist, "\t", second)
if __name__ == '__main__':
    # Run the scrape only when executed as a script, and always shut the
    # browser down afterwards so that no Chrome/chromedriver process is left
    # running, even when the search or the parsing step raises.
    try:
        sendKey()
    finally:
        driver.quit()
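# Output: (the sample output was not included with this listing)
# Only part of the information is scraped here as a test; the remaining
# fields still need to be added.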