Python爬取武汉地铁信息

最新推荐文章于 2023-03-14 09:00:10 发布

shigen01

最新推荐文章于 2023-03-14 09:00:10 发布

阅读量333

点赞数

分类专栏：网络爬虫 python 文章标签： python xpath

本文链接：https://blog.csdn.net/weixin_55768452/article/details/115287090

版权

python 同时被 2 个专栏收录

15 篇文章 0 订阅

订阅专栏

网络爬虫

4 篇文章 0 订阅

订阅专栏

Python爬取武汉地铁信息

话不多说，直接上代码

import logging

import pandas as pd
import urllib3.request
from lxml import etree
# Suppress the warnings that urllib3 would otherwise emit while requesting pages.
urllib3.disable_warnings()


# Build the object that the XPath queries are run against.
def getTree(url):
    """Download *url* and parse the response body as HTML.

    Args:
        url: Address of the page to fetch with a GET request.

    Returns:
        The element tree produced by ``etree.HTML`` for the decoded body.
    """
    http = urllib3.PoolManager()
    # ``decode()`` with no argument uses UTF-8.
    html_text = http.request('GET', url).data.decode()
    return etree.HTML(html_text)


def page(url):
    """Scrape the header texts and station names from one line's detail page.

    Args:
        url: Absolute URL of the detail page.

    Returns:
        A ``(metro_name, metro_stations)`` tuple of string lists on success:
        ``metro_name`` holds the text nodes of the child elements of the
        ``ib-hd lm-hd`` div, ``metro_stations`` the texts of the
        ``cl-station`` links. ``None`` when fetching or parsing fails.
    """
    try:
        tree = getTree(url)
        metro_name = tree.xpath('//div[@class="ib-hd lm-hd"]/*/text()')
        metro_stations = tree.xpath(
            '//ul[@class="clear"]/li/a[@class="cl-station"]/text()')
    except Exception:
        # Best-effort scrape: keep the "None on failure" contract, but record
        # the cause instead of discarding it silently.
        logging.getLogger(__name__).warning(
            "failed to scrape %s", url, exc_info=True)
        return None
    return metro_name, metro_stations


def getEntrance(homeUrl, output_path='data.xls'):
    """Scrape the line index page plus every linked line page into a spreadsheet.

    Args:
        homeUrl: URL of the index page that links to the per-line pages.
        output_path: File the resulting table is written to with
            ``DataFrame.to_excel``. Defaults to ``'data.xls'`` as before;
            NOTE(review): recent pandas releases appear to have dropped the
            ``.xls`` writer, so pass a ``.xlsx`` path there -- confirm against
            the installed pandas version.

    Raises:
        IndexError: If the index page has no ``div`` with class ``ib-box``.
        ValueError: Raised by pandas if the scraped columns differ in length.
    """
    tree = getTree(homeUrl)
    div = tree.xpath('//div[@class="ib-box"]')[0]
    # NOTE(review): these expressions start with '//', so they are evaluated
    # against the whole document rather than only inside `div`. Left as-is
    # because the slicing below depends on the node order they produce.
    line_name = div.xpath('//ul/li/a/text()')
    page_links = div.xpath('//ul/li/a/@href')
    line_info = div.xpath('//ul/li/div//text()')
    # line_info is consumed in groups of four text nodes: the second is used
    # as the run time and the fourth as the update time.
    run_time = line_info[1::4]
    update_time = line_info[3::4]

    metro_counts = []
    metro_stations = []
    for page_link in page_links:
        result = page("https://dt.8684.cn/" + page_link)
        # page() returns None when a line page could not be scraped; keep the
        # row with placeholders so one bad page does not abort the whole run.
        metro_count, metro_station = result if result is not None else ([], [])
        metro_counts.append(metro_count[1] if len(metro_count) > 1 else None)
        metro_stations.append(metro_station)

    # Combine the per-line columns into one table.
    data = {
        'line_name': line_name,
        'run_time': run_time,
        'update_time': update_time,
        'metro_count': metro_counts,
        'metro_stations': metro_stations,
    }
    df = pd.DataFrame(data)
    df.to_excel(output_path)
    print('finished!')



# Entry URL handed to getEntrance when this file is run as a script.
homeUrl = 'https://dt.8684.cn/wh_list_time'

# Only start scraping when executed directly, not when imported.
if __name__ == '__main__':
    getEntrance(homeUrl)