import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import urlparse, unquote, urljoin
def fetch_data(url, encoding='utf-8', timeout=30):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.
        encoding: Encoding applied to the response before reading its text.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The page text, or ``None`` when the request fails or the server
        answers with a status code other than 200.
    """
    try:
        # Without a timeout a stalled server would block the crawl forever.
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Report network failures the same way as bad status codes so that
        # callers only ever have to check for ``None``.
        print(f"Failed to fetch data. Error: {exc}")
        return None
    if response.status_code != 200:
        print(f"Failed to fetch data. Status code: {response.status_code}")
        return None
    # Set the encoding before reading ``text`` so the body is decoded with it.
    response.encoding = encoding
    return response.text
def parse_html(html, url):
    """Parse the index page and crawl the province whose code is ``37``.

    Args:
        html: Markup of the index page.
        url: Address the markup was fetched from; used to build the
            address of the province page.

    Returns:
        A list with the city rows returned by ``parse_city_html`` for the
        selected province. The list is empty when the province is not
        listed or its page cannot be fetched.
    """
    soup = BeautifulSoup(html, 'html.parser')
    data = []
    # Adjust the selector below to the actual page structure.
    # Only anchors that carry an href can be followed.
    for province in soup.select('tr.provincetr a[href]'):
        province_name = province.text.strip()
        province_code = province['href'].split('.')[0]
        if province_code != '37':
            continue
        # Report the province-level entry.
        print(f"Province: {province_name} ({province_code})")
        # Fetch and parse the city-level page of this province.
        city_url_base = get_parent_folder_path(url)
        city_url = f"{city_url_base}/{province_code}.html"
        city_html = fetch_data(city_url)
        if city_html:
            # Keep the parsed rows; previously they were dropped and the
            # function always returned an empty list.
            data.extend(parse_city_html(city_html, province_code, city_url))
    return data
def parse_city_html(html, province_code, url):
    """Parse a province page into city rows and crawl each city's counties.

    Args:
        html: Markup of the province page.
        province_code: Code of the province the cities belong to.
        url: Address the markup was fetched from; used to resolve the
            links to the county pages.

    Returns:
        A list of dicts with the keys ``ProvinceCode``, ``ProvinceName``
        (always empty), ``CityCode`` and ``CityName``, one per city row.
    """
    soup = BeautifulSoup(html, 'html.parser')
    city_data = []
    # Adjust the selector below to the actual page structure.
    for city in soup.select('tr.citytr'):
        city_info = city.select('td')
        if len(city_info) < 2:
            # A row without both a code cell and a name cell cannot be read.
            continue
        city_code = city_info[0].text.strip()
        city_name = city_info[1].text.strip()
        # Report the city-level entry.
        print(f" City: {city_name} ({city_code})")
        # Add the row to the result list.
        city_data.append({
            'ProvinceCode': province_code,
            'ProvinceName': '',
            'CityCode': city_code,
            'CityName': city_name,
            # Add further fields here as needed.
        })
        # Follow the link in the code cell to the county-level page.
        link = city_info[0].find('a')
        if link is None or not link.has_attr('href'):
            continue
        countie_url_base = get_parent_folder_path(url)
        countie_url = f"{countie_url_base}/{link['href']}"
        countie_html = fetch_data(countie_url)
        if countie_html:
            # The nested crawl is run for its printed output; the rows it
            # returns are not merged into this level's result.
            parse_countie_html(countie_html, city_code, countie_url)
    return city_data
def parse_countie_html(html, city_code, url):
    """Parse a city page into county rows and crawl each county's towns.

    Args:
        html: Markup of the city page.
        city_code: Code of the city the counties belong to.
        url: Address the markup was fetched from; used to resolve the
            links to the town pages.

    Returns:
        A list of dicts with the keys ``CityCode``, ``CityName`` (always
        empty), ``CountieCode`` and ``CountieName``, one per county row.
    """
    soup = BeautifulSoup(html, 'html.parser')
    countie_data = []
    # Adjust the selector below to the actual page structure.
    for countie in soup.select('tr.countytr'):
        countie_info = countie.select('td')
        if len(countie_info) < 2:
            # A row without both a code cell and a name cell cannot be read.
            continue
        countie_code = countie_info[0].text.strip()
        countie_name = countie_info[1].text.strip()
        # Report the county-level entry.
        print(f" countie: {countie_name} ({countie_code})")
        # Add the row to the result list.
        countie_data.append({
            'CityCode': city_code,
            'CityName': '',
            'CountieCode': countie_code,
            'CountieName': countie_name,
            # Add further fields here as needed.
        })
        # Follow the link in the code cell to the town-level page.
        link = countie_info[0].find('a')
        if link is None or not link.has_attr('href'):
            continue
        town_url_base = get_parent_folder_path(url)
        town_url = f"{town_url_base}/{link['href']}"
        town_html = fetch_data(town_url)
        if town_html:
            # Pass the county code (not the city code): the town parser
            # records it as the parent of every town row. The returned rows
            # are not merged into this level's result.
            parse_town_html(town_html, countie_code, town_url)
    return countie_data
def parse_town_html(html, countie_code, url):
    """Parse a county page into town rows and crawl each town's villages.

    Args:
        html: Markup of the county page.
        countie_code: Code of the county the towns belong to.
        url: Address the markup was fetched from; used to resolve the
            links to the village pages.

    Returns:
        A list of dicts with the keys ``CountieCode``, ``CountieName``
        (always empty), ``TownCode`` and ``TowneName``, one per town row.
    """
    soup = BeautifulSoup(html, 'html.parser')
    town_data = []
    # Adjust the selector below to the actual page structure.
    for town in soup.select('tr.towntr'):
        town_info = town.select('td')
        if len(town_info) < 2:
            # A row without both a code cell and a name cell cannot be read.
            continue
        town_code = town_info[0].text.strip()
        town_name = town_info[1].text.strip()
        # Report the town-level entry.
        print(f" town: {town_name} ({town_code})")
        # Add the row to the result list.
        town_data.append({
            'CountieCode': countie_code,
            'CountieName': '',
            'TownCode': town_code,
            'TowneName': town_name,
            # Add further fields here as needed.
        })
        # Follow the link in the code cell to the village-level page.
        link = town_info[0].find('a')
        if link is None or not link.has_attr('href'):
            continue
        village_url_base = get_parent_folder_path(url)
        village_url = f"{village_url_base}/{link['href']}"
        village_html = fetch_data(village_url)
        if village_html:
            # The nested crawl is run for its printed output; the rows it
            # returns are not merged into this level's result.
            parse_village_html(village_html, town_code)
    return town_data
def parse_village_html(html, town_code):
    """Parse a town page into village rows.

    Args:
        html: Markup of the town page.
        town_code: Code of the town the villages belong to.

    Returns:
        A list of dicts with the keys ``TownCode``, ``TowneName`` (always
        empty), ``VillageCode`` and ``VillageName``, one per village row.
    """
    soup = BeautifulSoup(html, 'html.parser')
    village_data = []
    # Adjust the selector below to the actual page structure.
    for village in soup.select('tr.villagetr'):
        village_info = village.select('td')
        if len(village_info) < 3:
            # The name is read from the third cell, so shorter rows are
            # skipped instead of raising IndexError.
            continue
        village_code = village_info[0].text.strip()
        village_name = village_info[2].text.strip()
        # Report the village-level entry.
        print(f" village: {village_name} ({village_code})")
        # Add the row to the result list.
        village_data.append({
            'TownCode': town_code,
            'TowneName': '',
            'VillageCode': village_code,
            'VillageName': village_name,
            # Add further fields here as needed.
        })
    return village_data
def get_parent_folder_path(current_url):
    """Return the URL of the folder that contains *current_url*.

    The last path segment is removed together with any parameters, query
    string and fragment. The result has no trailing slash, so callers can
    append ``"/<name>"`` to address a sibling resource.

    The path is kept percent-encoded: decoding it first would turn encoded
    delimiters such as ``%3F`` into real ones and change the URL structure.

    Args:
        current_url: Address of a page.

    Returns:
        The address of the enclosing folder.
    """
    parsed_url = urlparse(current_url)
    # Everything before the last "/" is the folder; a path without any
    # slash (or an empty path) has the empty string as its folder.
    parent_folder_path = parsed_url.path.rpartition('/')[0]
    # Rebuild from the components instead of joining onto current_url, so a
    # root-level file yields the bare site address rather than itself.
    parent_url = parsed_url._replace(
        path=parent_folder_path, params='', query='', fragment='',
    )
    return parent_url.geturl()
def save_to_excel(data, filename='administrative_divisions.xlsx'):
    """Write *data* to an Excel file and print a confirmation.

    Args:
        data: Anything ``pandas.DataFrame`` accepts, e.g. a list of dicts.
        filename: Path of the workbook to write.
    """
    frame = pd.DataFrame(data)
    # index=False keeps the DataFrame index out of the sheet.
    frame.to_excel(filename, index=False)
    print(f"Data saved to {filename}")
def main():
    """Fetch the 2023 index page, crawl it and save the parsed rows.

    Nothing is written when the index page cannot be fetched or when the
    crawl yields no rows.
    """
    url_base = 'https://www.stats.gov.cn/sj/tjbz/tjyqhdmhcxhfdm/2023/'
    url = f"{url_base}index.html"
    html = fetch_data(url)
    if not html:
        return
    data = parse_html(html, url)
    # The parsed rows used to be discarded; persist them, but do not create
    # an empty workbook when nothing was collected.
    if data:
        save_to_excel(data)
if __name__ == "__main__":