The code below was co-written with ChatGPT.
Shared ChatGPT Q&A transcript
Source: National Bureau of Statistics, 2023 national administrative division codes
import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import urlparse, unquote, urljoin
def fetch_data(url, encoding='utf-8'):
    response = requests.get(url, timeout=30)  # avoid hanging on a dead connection
    if response.status_code == 200:
        response.encoding = encoding
        return response.text
    else:
        print(f"Failed to fetch data. Status code: {response.status_code}")
        return None
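# Note: rapid repeated requests to stats.gov.cn may get throttled or dropped.
# A minimal retry-with-backoff sketch (fetch_with_retry and its parameters are
# my own addition, not part of the original script); swap it in for fetch_data
# wherever fetches prove flaky.
import time

def fetch_with_retry(url, encoding='utf-8', retries=3, delay=1.0):
    for attempt in range(retries):
        html = fetch_data(url, encoding)
        if html is not None:
            return html
        time.sleep(delay * (attempt + 1))  # back off a little more each attempt
    return None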
def parse_html(html, url):
    soup = BeautifulSoup(html, 'html.parser')
    data = []
    # Adjust the selectors below to match the actual page structure
    provinces = soup.select('tr.provincetr a')
    for province in provinces:
        province_name = province.text.strip()
        province_code = province['href'].split('.')[0]
        # Only Shandong (37) is crawled here to keep the run short;
        # remove this check to crawl every province
        if province_code == '37':
            # Print province-level data
            print(f"Province: {province_name} ({province_code})")
            # Fetch city-level data
            city_url_base = get_parent_folder_path(url)
            city_url = f"{city_url_base}/{province_code}.html"
            city_html = fetch_data(city_url)
            if city_html:
                city_data = parse_city_html(
                    city_html, province_code, city_url)
                data.extend(city_data)
    return data
def parse_city_html(html, province_code, url):
    soup = BeautifulSoup(html, 'html.parser')
    city_data = []
    # Adjust the selectors below to match the actual page structure
    cities = soup.select('tr.citytr')
    for city in cities:
        city_info = city.select('td')
        city_code = city_info[0].text.strip()
        city_name = city_info[1].text.strip()
        # Print city-level data
        print(f"  City: {city_name} ({city_code})")
        # Append the row to the result list
        city_data.append({
            'ProvinceCode': province_code,
            'ProvinceName': '',
            'CityCode': city_code,
            'CityName': city_name,
            # Add further fields as needed
        })
        # Fetch county-level data
        if city_info[0].find('a') and city_info[0].find('a').has_attr('href'):
            index = city_info[0].find('a')['href']
            county_url_base = get_parent_folder_path(url)
            county_url = f"{county_url_base}/{index}"
            county_html = fetch_data(county_url)
            if county_html:
                county_data = parse_county_html(
                    county_html, city_code, county_url)
                city_data.extend(county_data)
    return city_data
def parse_county_html(html, city_code, url):
    soup = BeautifulSoup(html, 'html.parser')
    county_data = []
    # Adjust the selectors below to match the actual page structure
    counties = soup.select('tr.countytr')
    for county in counties:
        county_info = county.select('td')
        county_code = county_info[0].text.strip()
        county_name = county_info[1].text.strip()
        # Print county-level data
        print(f"   County: {county_name} ({county_code})")
        # Append the row to the result list
        county_data.append({
            'CityCode': city_code,
            'CityName': '',
            'CountyCode': county_code,
            'CountyName': county_name,
            # Add further fields as needed
        })
        # Fetch town-level data
        if county_info[0].find('a') and county_info[0].find('a').has_attr('href'):
            index = county_info[0].find('a')['href']
            town_url_base = get_parent_folder_path(url)
            town_url = f"{town_url_base}/{index}"
            town_html = fetch_data(town_url)
            if town_html:
                # Pass the county code down so town rows reference their parent county
                town_data = parse_town_html(
                    town_html, county_code, town_url)
                county_data.extend(town_data)
    return county_data
def parse_town_html(html, county_code, url):
    soup = BeautifulSoup(html, 'html.parser')
    town_data = []
    # Adjust the selectors below to match the actual page structure
    towns = soup.select('tr.towntr')
    for town in towns:
        town_info = town.select('td')
        town_code = town_info[0].text.strip()
        town_name = town_info[1].text.strip()
        # Print town-level data
        print(f"    Town: {town_name} ({town_code})")
        # Append the row to the result list
        town_data.append({
            'CountyCode': county_code,
            'CountyName': '',
            'TownCode': town_code,
            'TownName': town_name,
            # Add further fields as needed
        })
        # Fetch village-level data
        if town_info[0].find('a') and town_info[0].find('a').has_attr('href'):
            index = town_info[0].find('a')['href']
            village_url_base = get_parent_folder_path(url)
            village_url = f"{village_url_base}/{index}"
            village_html = fetch_data(village_url)
            if village_html:
                village_data = parse_village_html(
                    village_html, town_code)
                town_data.extend(village_data)
    return town_data
def parse_village_html(html, town_code):
    soup = BeautifulSoup(html, 'html.parser')
    village_data = []
    # Adjust the selectors below to match the actual page structure
    villages = soup.select('tr.villagetr')
    for village in villages:
        village_info = village.select('td')
        village_code = village_info[0].text.strip()
        # td[1] holds the urban-rural classification code; the name is in td[2]
        village_name = village_info[2].text.strip()
        # Print village-level data
        print(f"     Village: {village_name} ({village_code})")
        # Append the row to the result list
        village_data.append({
            'TownCode': town_code,
            'TownName': '',
            'VillageCode': village_code,
            'VillageName': village_name,
            # Add further fields as needed
        })
    return village_data
def get_parent_folder_path(current_url):
    # Parse the URL
    parsed_url = urlparse(current_url)
    # Take the path component and decode any percent-escapes
    path = unquote(parsed_url.path)
    # Drop the trailing file name to get the parent folder path
    parent_folder_path = '/'.join(path.split('/')[:-1])
    # Resolve that path against the current page's URL to make it absolute
    absolute_parent_folder_path = urljoin(current_url, parent_folder_path)
    return absolute_parent_folder_path
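# A quick sanity check of get_parent_folder_path (the URL below is the 2023
# index page; the assertions just illustrate the expected behaviour). Note
# that urljoin(url, href) alone would resolve the site's relative hrefs the
# same way, so this helper mainly exists for readability.
def _demo_parent_folder_path():
    base = 'https://www.stats.gov.cn/sj/tjbz/tjyqhdmhcxhfdm/2023/index.html'
    assert get_parent_folder_path(base) == \
        'https://www.stats.gov.cn/sj/tjbz/tjyqhdmhcxhfdm/2023'
    assert urljoin(base, '37.html') == \
        'https://www.stats.gov.cn/sj/tjbz/tjyqhdmhcxhfdm/2023/37.html'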
def save_to_excel(data, filename='administrative_divisions.xlsx'):
    df = pd.DataFrame(data)
    df.to_excel(filename, index=False)
    print(f"Data saved to {filename}")
def main():
    url_base = 'https://www.stats.gov.cn/sj/tjbz/tjyqhdmhcxhfdm/2023/'
    url = f"{url_base}index.html"
    html = fetch_data(url)
    if html:
        data = parse_html(html, url)
        save_to_excel(data)

if __name__ == "__main__":
    main()