Python爬取武汉地铁信息
话不多说,直接上代码
from lxml import etree
import urllib3.request
import pandas as pd
# Turn off urllib3's warning output for the whole process before any request is made.
urllib3.disable_warnings()
def getTree(url):
    """Download ``url`` and return it as an lxml HTML tree ready for XPath queries.

    The response body is decoded with ``bytes.decode()``'s default codec
    before being handed to ``etree.HTML``.
    """
    http = urllib3.PoolManager()
    html_text = http.request('GET', url).data.decode()
    return etree.HTML(html_text)
def page(url):
    """Fetch a single line page and pull out its header texts and station names.

    Args:
        url: URL of one metro line's detail page.

    Returns:
        A ``(metro_name, metro_stations)`` tuple holding the two XPath result
        lists, or ``None`` when fetching or parsing the page raised an
        exception. Callers must check for ``None`` before unpacking.
    """
    import logging  # local import: keeps this block independent of the file header

    try:
        tree = getTree(url)
        metro_name = tree.xpath('//div[@class="ib-hd lm-hd"]/*/text()')
        metro_stations = tree.xpath('//ul[@class="clear"]/li/a[@class="cl-station"]/text()')
    except Exception:
        # Scraping stays best-effort (one bad page must not abort the run),
        # but the failure is logged with its traceback instead of vanishing.
        logging.getLogger(__name__).warning("failed to scrape %s", url, exc_info=True)
        return None
    return metro_name, metro_stations
def getEntrance(homeUrl):
    """Scrape the line index at ``homeUrl``, visit every line page, and export to Excel.

    For each link found on the index page, ``page()`` is called to collect the
    line's header text and station list. The combined table is written to
    ``data.xls`` in the current working directory and ``finished!`` is printed.

    Args:
        homeUrl: URL of the index page listing all lines.

    Returns:
        None. The result is the spreadsheet written to disk.

    Raises:
        IndexError: if the index page has no ``div`` with class ``ib-box``.
        ValueError: raised by pandas if the scraped columns differ in length.
    """
    from urllib.parse import urljoin  # local import: keeps this block self-contained

    tree = getTree(homeUrl)
    div = tree.xpath('//div[@class="ib-box"]')[0]
    # NOTE(review): these expressions start with '//', which lxml evaluates
    # against the whole document rather than relative to `div`. They are left
    # as-is because the stride-4 slicing below was presumably tuned to this
    # output; switching to './/' needs checking against the live page.
    line_name = div.xpath('//ul/li/a/text()')
    page_links = div.xpath('//ul/li/a/@href')
    line_info = div.xpath('//ul/li/div//text()')
    # The text nodes are consumed in groups of four: offset 1 of each group is
    # used as the running time and offset 3 as the update time.
    run_time = line_info[1::4]
    update_time = line_info[3::4]

    metro_counts = []
    metro_stations = []
    for page_link in page_links:
        # urljoin resolves relative, root-relative and absolute hrefs against
        # the index URL instead of gluing them onto a hard-coded host.
        result = page(urljoin(homeUrl, page_link))
        if result is None:
            # page() returns None on failure; add placeholders so every
            # column keeps one entry per link and the rows stay aligned.
            metro_counts.append(None)
            metro_stations.append([])
            continue
        metro_count, metro_station = result
        # The second header text node is the one exported; tolerate a short header.
        metro_counts.append(metro_count[1] if len(metro_count) > 1 else None)
        metro_stations.append(metro_station)

    # Combine the per-line columns into one pandas DataFrame.
    data = {
        'line_name': line_name,
        'run_time': run_time,
        'update_time': update_time,
        'metro_count': metro_counts,
        'metro_stations': metro_stations,
    }
    df = pd.DataFrame(data)
    # NOTE(review): pandas 2.0 removed the xlwt writer, so '.xls' output may
    # fail on newer pandas. Confirm the target version before renaming the file.
    df.to_excel('data.xls')
    print('finished!')
# Index page that getEntrance() starts from when the file is run as a script.
homeUrl = 'https://dt.8684.cn/wh_list_time'
# Run the scraper only when executed directly, not when imported.
if __name__ == '__main__':
    getEntrance(homeUrl)