import requests
import time
from lxml import etree
def analysis_html(address, pattern):
    """Fetch *address*, parse it as HTML, and select nodes matching *pattern*.

    Side effect: rebinds the module-level ``trs`` to the matched node list
    (the other crawler functions read that global). Also returns the list,
    which is a backward-compatible addition so callers can avoid the global.

    :param address: absolute URL to fetch (uses module-level ``headers``).
    :param pattern: XPath expression applied to the parsed document.
    :return: list of matched lxml nodes.
    """
    global trs
    response = requests.get(address, headers=headers)
    # Force decoding; the site does not always declare its charset reliably.
    response.encoding = 'utf-8'
    html = etree.HTML(response.text)
    trs = html.xpath(pattern)
    return trs
def make_url(url_before, pattern, node=None):
    """Build an absolute URL by appending the first XPath match to *url_before*.

    :param url_before: URL prefix (e.g. the site base address).
    :param pattern: XPath expression yielding at least one string (an @href).
    :param node: element to query; when omitted, falls back to the
        module-level ``tr`` set by the crawl loops (original behavior,
        kept for backward compatibility).
    :return: concatenated URL string.
    :raises IndexError: if *pattern* matches nothing.
    """
    target = tr if node is None else node
    return url_before + target.xpath(pattern)[0]
def province():
    """Crawl the 2021 index page and descend into every province.

    Iterates the province <td> cells, prints each province name, and hands
    control to ``city`` for each province page. ``tr`` is deliberately
    declared global because ``make_url`` reads it from module scope.
    """
    global tr
    analysis_html(f'{main_address}index.html', '//tr[@class="provincetr"]/td')
    # trs[:-1]: the last cell is skipped — NOTE(review): presumably it is not
    # a province link on the live page; confirm before changing.
    for tr in trs[:-1]:
        # Renamed from `province` to avoid shadowing this function.
        province_name = tr.xpath('./a/text()')[0]
        province_url = make_url(main_address, './a/@href')
        print(province_name)
        city(province_url, province_name)
        time.sleep(1)  # throttle requests to be polite to the server
def city(province_url, province):
    """Crawl one province page and descend into each of its cities.

    :param province_url: absolute URL of the province listing page.
    :param province: province name, forwarded to ``country`` for CSV output.
    """
    analysis_html(province_url, '//tr[@class="citytr"]')
    for tr in trs:
        # Renamed from `city` to avoid shadowing this function.
        city_name = tr.xpath('./td[2]/a/text()')[0]
        # Build the city URL from this row's own node. `make_url` cannot be
        # used here without passing the node explicitly: it reads the global
        # `tr` bound by province(), not this loop's local `tr`.
        page = tr.xpath('./td[1]/a/@href')[0]
        city_url = main_address + page
        city_id = tr.xpath('./td[1]/a/text()')[0]
        country(city_url, city_name, city_id, province, province_url)
        time.sleep(1)  # throttle requests to be polite to the server
def country(city_url, city, city_id, province, province_url):
    """Crawl one city page and append one CSV line per county to global ``f``.

    :param city_url: absolute URL of the city's county listing page.
    :param city: city name written to the CSV row.
    :param city_id: city statistical code written to the CSV row.
    :param province: province name written to the CSV row.
    :param province_url: unused; kept for interface compatibility.
    """
    analysis_html(city_url, '//tr[@class="countytr"]')
    for tr in trs:
        # Some rows (e.g. district headers) have no <a>, so the xpath result
        # is empty. Catch only that case instead of a bare `except: pass`,
        # which also silently swallowed write errors.
        try:
            country_name = tr.xpath('./td[2]/a/text()')[0]
            country_id = tr.xpath('./td[1]/a/text()')[0]
        except IndexError:
            continue
        f.write(f'{country_name},{country_id}\t,{city},{city_id}\t,{province}\n')
        time.sleep(1)  # throttle; only counts rows actually written
# Request headers: desktop browser UA plus the cookie values the
# stats.gov.cn site expects; used by analysis_html().
headers = {
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.204 Safari/537.36',
'Cookie':'AD_RS_COOKIE=20080918; _trs_uv=kahvgie3_6_fc6v'
}
# Base URL of the 2021 administrative-division code listings.
main_address = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2021/"
# Open the output CSV in append mode (filename means "province-city-county");
# `f` is read as a global by country(). province() drives the whole crawl.
with open(r'省市区.csv', 'a',encoding='utf-8') as f:
    province()
# Results are saved to the CSV file above; this example uses the 2021 dataset.