采集源来自2023年国家统计局的数据,链接如下:
2023年统计用区划代码和城乡划分代码
JSON文件下载地址
链接: https://pan.baidu.com/s/1s3kw-ZKSa5dORdgsRNpESA?pwd=37fn 提取码: 37fn
采集程序如下:
# 2024年7月12日采集的数据
import requests
from bs4 import BeautifulSoup
import json
# Root of the 2023 statistical region / urban-rural division code pages
# on stats.gov.cn; all province/city/county hrefs are relative to this.
base_url = "https://www.stats.gov.cn/sj/tjbz/tjyqhdmhcxhfdm/2023"
# Index page listing all province-level links.
provinces_url = f"{base_url}/index.html"
def fetch_soup(url, timeout=30):
    """Fetch *url* and return it parsed as a BeautifulSoup tree.

    Args:
        url: Absolute URL of the stats.gov.cn page to fetch.
        timeout: Seconds to wait for the server; without a timeout,
            ``requests.get`` can block forever on a stalled connection.

    Returns:
        BeautifulSoup document parsed with the stdlib ``html.parser``.
    """
    response = requests.get(url, timeout=timeout)
    # Force UTF-8 so the Chinese region names decode correctly,
    # regardless of what charset the server advertises.
    response.encoding = 'utf-8'
    return BeautifulSoup(response.text, 'html.parser')
def parse_province(province_tag):
    """Extract (code, name, href) from one province ``<a>`` tag.

    The href looks like ``"11.html"``; its numeric prefix padded with
    four zeros gives the six-digit province-level region code.
    """
    href = province_tag['href']
    name = province_tag.text.strip()
    code = f"{href.split('.')[0]}0000"
    return code, name, href
def parse_city(city_tag):
    """Extract (code, name, href) from one ``<tr class="citytr">`` row.

    Returns ``(None, None, None)`` for rows lacking the expected two
    ``<td>`` cells; ``href`` is ``None`` when the code cell carries no
    usable link (some cities have no drill-down page).
    """
    cells = city_tag.find_all('td')
    if len(cells) < 2:
        print(f"Unexpected city_tag structure: {city_tag}")
        return None, None, None
    code_cell, name_cell = cells[0], cells[1]
    # First four digits of the 12-digit code, padded to city level.
    city_id = code_cell.text.strip()[:4] + '00'
    city_name = name_cell.text.strip()
    link = code_cell.find('a')
    city_href = link['href'] if link and 'href' in link.attrs else None
    if not city_href:
        print(f"No href found in city_tag: {city_tag}")
    return city_id, city_name, city_href
def parse_area(area_tag):
    """Extract (code, name) from one ``<tr class="countytr">`` row.

    Returns ``(None, None)`` when the row lacks the expected two
    ``<td>`` cells.
    """
    cells = area_tag.find_all('td')
    if len(cells) < 2:
        print(f"Unexpected area_tag structure: {area_tag}")
        return None, None
    # County level keeps the first six digits of the 12-digit code.
    return cells[0].text.strip()[:6], cells[1].text.strip()
def fetch_province_data():
    """Scrape the province index page and build the full region tree.

    Returns:
        list[dict]: one dict per province with keys ``ssqid``,
        ``ssqname``, ``ssqename`` (left empty) and ``city`` (the nested
        city/area data from :func:`fetch_city_data`).
    """
    provinces_data = []
    soup = fetch_soup(provinces_url)
    # BUG FIX: the old code used soup.find_all('a', href=True), which
    # also matched navigation/footer anchors on the index page and
    # produced bogus "provinces". Only anchors inside
    # <tr class="provincetr"> rows are real province links.
    for row in soup.find_all('tr', class_='provincetr'):
        for province_tag in row.find_all('a', href=True):
            province_id, province_name, province_href = parse_province(province_tag)
            print(f"Parsing province: {province_name}")
            provinces_data.append({
                "ssqid": province_id,
                "ssqname": province_name,
                "ssqename": "",
                "city": fetch_city_data(province_href),
            })
    return provinces_data
def fetch_city_data(province_href):
    """Scrape one province page and return its list of city dicts.

    Args:
        province_href: Province page href relative to ``base_url``,
            e.g. ``"11.html"``.

    Returns:
        list[dict]: city dicts with nested ``area`` lists; cities whose
        row carries no drill-down link get an empty ``area`` list.
    """
    cities_data = []
    soup = fetch_soup(f"{base_url}/{province_href}")
    for city_tag in soup.find_all('tr', class_='citytr'):
        city_id, city_name, city_href = parse_city(city_tag)
        if not (city_id and city_name):
            # Malformed row; parse_city already printed a diagnostic.
            continue
        if city_href:
            print(f"Parsing city: {city_name}")
            areas = fetch_area_data(city_href)
        else:
            print(f"Skipping city without href: {city_name}")
            areas = []
        # Single construction site replaces the two duplicated dict
        # literals of the original branches.
        cities_data.append({
            "ssqid": city_id,
            "ssqname": city_name,
            "ssqename": "",
            "area": areas,
        })
    return cities_data
def fetch_area_data(city_href):
    """Scrape one city page and return its county/district dicts.

    Args:
        city_href: City page href relative to ``base_url``,
            e.g. ``"11/1101.html"``.

    Returns:
        list[dict]: dicts with ``ssqid``, ``ssqname`` and an empty
        ``ssqename`` for each county-level row on the page.
    """
    areas_data = []
    soup = fetch_soup(f"{base_url}/{city_href}")
    # NOTE(review): only 'countytr' rows are collected; cities that
    # list towns directly (class 'towntr') would yield an empty list
    # here — confirm whether that level is needed.
    for area_tag in soup.find_all('tr', class_='countytr'):
        area_id, area_name = parse_area(area_tag)
        if area_id and area_name:
            # Removed the original's area_name.encode('utf-8')
            # .decode('utf-8'): a str->bytes->str round-trip is a no-op.
            print(f"Parsing area: {area_name}")
            areas_data.append({
                "ssqid": area_id,
                "ssqname": area_name,
                "ssqename": "",
            })
    return areas_data
if __name__ == "__main__":
    # Guarded entry point: importing this module no longer kicks off a
    # full crawl of the stats.gov.cn site.
    provinces_data = fetch_province_data()
    # Dump the whole tree as pretty-printed UTF-8 JSON; ensure_ascii
    # off keeps the Chinese names readable in the output file.
    with open('china_provinces.json', 'w', encoding='utf-8') as f:
        json.dump(provinces_data, f, ensure_ascii=False, indent=4)
    print("Data fetched and saved to china_provinces.json")
采集结果如下: