import re
import ssl
import urllib.request
import time
import sys
import requests
from urllib.parse import urljoin
INDEX_URL = "http://www.stats.gov.cn/sj/tjbz/tjyqhdmhcxhfdm/2023/"
def get_province_code():
    """
    Fetch every province from the index page and recurse into its cities.

    Prints nothing itself; the SQL output is produced by the deeper levels.
    :return: None
    """
    # Download the province index page (TLS verification intentionally off,
    # matching the rest of this scraper).
    html = urllib.request.urlopen(
        INDEX_URL + "index.html",
        context=ssl._create_unverified_context(),
    ).read().decode("utf-8")
    # Each match is (relative page url, province name).
    provinces = re.findall(r"<td><a href=\"(.*?)\">(.*?)<br /></a></td>", html)
    for page, province_name in provinces:
        # Province adcode: numeric page name padded with zeros to 12 digits.
        province_code = page.replace(".html", "") + "0" * 10
        get_city_code(page, province_code, province_name)
        # Be polite to the server between province requests.
        time.sleep(1)
def get_city_code(province_url, parent_code, parent_name):
    """
    Fetch city names + codes for one province, then recurse into areas.

    :param province_url: province page path relative to INDEX_URL
    :param parent_code: adcode of the parent province
    :param parent_name: name of the parent province
    :return: None
    """
    html = urllib.request.urlopen(
        INDEX_URL + province_url,
        context=ssl._create_unverified_context(),
    ).read().decode("utf-8")
    # Groups: (city page url, city code, duplicate url, city name).
    pattern = (r"<tr class=\"citytr\"><td><a href=\"(.*?)\">(.*?)</a></td>"
               r"<td><a href=\"(.*?)\">(.*?)</a></td></tr>")
    for city_url, city_code, _dup_url, city_name in re.findall(pattern, html):
        # Normalize placeholder rows used by municipalities.
        if "市辖区" in city_name:
            city_name = parent_name
        elif "县" == city_name:
            city_name = parent_name + "郊县"
        if "直辖县级行政区划" in city_name:
            city_code = parent_code
            city_name = parent_name
        get_area_code(city_url, city_code, city_name)
        # Throttle between city requests.
        time.sleep(1)
def get_area_code(city_url, parent_code, parent_name):
    """
    Fetch county/district names + codes for one city and print SQL inserts.

    Municipality pages have no county level and list towns ("towntr")
    directly; those are emitted at level 4 instead of level 3.

    :param city_url: city page path relative to INDEX_URL
    :param parent_code: adcode of the parent city
    :param parent_name: name of the parent city
    :return: None
    """
    html = urllib.request.urlopen(
        INDEX_URL + city_url,
        context=ssl._create_unverified_context(),
    ).read().decode("utf-8")
    county_rows = re.findall(
        r"<tr class=\"countytr\"><td>(.*?)</td><td>(.*?)</td></tr>", html)
    if not county_rows:
        # Fall back to the town level (direct-administered municipalities).
        town_pattern = (r"<tr class=\"towntr\"><td><a href=\"(.*?)\">(.*?)</a></td>"
                        r"<td><a href=\"(.*?)\">(.*?)</a></td></tr>")
        for _town_url, town_code, _dup_url, town_name in re.findall(town_pattern, html):
            print(f"insert into t_city_code(parent_adcode,name,short_name,adcode,level) VALUES('{parent_code}','{town_name}','{town_name}','{town_code}','4');")
    anchor = r"<a href=\"(.*?)\">(.*?)</a>"
    for code_cell, name_cell in county_rows:
        url = ""
        code = ""
        if "href" in code_cell:
            # Linked row: both cells wrap their text in an <a> element.
            url, code = re.findall(anchor, code_cell)[0]
            name = re.findall(anchor, name_cell)[0][1]
        else:
            # Plain row: cells hold the code and name directly.
            code = code_cell
            name = name_cell
        if "市辖区" == name:
            name = parent_name + name
        print(f"insert into t_city_code(parent_adcode,name,short_name,adcode,level) VALUES('{parent_code}','{name}','{name}','{code}','3');")
def get_street_code(area_url, parent_code):
    """
    Fetch street/town names + codes for one county and print SQL inserts.

    :param area_url: county page file name; its first two code digits
        (area_url[3:5]) form the sub-directory on the server
    :param parent_code: adcode of the parent county
    :return: None
    """
    page = urllib.request.urlopen(
        INDEX_URL + area_url[3:5] + "/" + area_url,
        context=ssl._create_unverified_context(),
    ).read().decode("utf-8")
    # Groups: (street page url, street code, duplicate url, street name).
    pattern = (r"<tr class=\"towntr\"><td><a href=\"(.*?)\">(.*?)</a></td>"
               r"<td><a href=\"(.*?)\">(.*?)</a></td></tr>")
    for _street_url, street_code, _dup_url, street_name in re.findall(pattern, page):
        print(f"insert into t_city_code(parent_adcode,name,short_name,adcode,level) VALUES('{parent_code}','{street_name}','{street_name}','{street_code}','4');")
def get_community_code(street_url):
    """
    Fetch community/village names + codes for one street and print them.

    :param street_url: street page file name; digits [3:5] and [5:7] of the
        name form the two nested sub-directories on the server
    :return: None
    """
    page = urllib.request.urlopen(
        INDEX_URL + street_url[3:5] + "/" + street_url[5:7] + "/" + street_url,
        context=ssl._create_unverified_context(),
    ).read().decode("utf-8")
    # Groups: (community code, urban/rural classification code, name).
    rows = re.findall(
        r"<tr class=\"villagetr\"><td>(.*?)</td><td>(.*?)</td><td>(.*?)</td></tr>",
        page)
    for community_code, _classification, community_name in rows:
        print("社区:%s 代码:%s" % (community_name, community_code))
def get_redirected_url(response):
    """
    Resolve a redirect target from *response*.

    :param response: object with a ``headers`` mapping and a ``url`` attribute
    :return: the Location header joined against the response URL, or None
        when no (non-empty) Location header is present
    """
    target = response.headers.get('Location')
    return urljoin(response.url, target) if target else None
def crawl_url(url):
    """
    Fetch *url* and print its body, manually following 302 redirect chains.

    URLs are recorded in the module-level ``visited_urls`` set so that a
    redirect loop cannot recurse forever.

    :param url: absolute URL to fetch
    :return: None
    """
    try:
        # Record BEFORE fetching so a cycle (a -> b -> a) terminates.
        visited_urls.add(url)
        # allow_redirects=False is required: by default requests follows the
        # redirect itself, so status_code would never be 302 and the manual
        # redirect branch below would be unreachable.
        response = requests.get(url, allow_redirects=False)
        if response.status_code == 302:
            redirected_url = get_redirected_url(response)
            if redirected_url and redirected_url not in visited_urls:
                crawl_url(redirected_url)
        else:
            print(response.text)
    except requests.exceptions.RequestException as e:
        print(f"请求出错: {e}")
# Smoke-test the crawler.
# NOTE(review): this issues a live HTTP request at import time — consider
# moving it under the `if __name__ == "__main__":` guard.
visited_urls = set()  # tracks URLs already visited (read by crawl_url)
crawl_url('http://example.com')
def main():
    """
    Entry point: crawl the full province/city/county hierarchy and print
    the generated SQL insert statements.

    To capture the output in a file instead of the console, redirect
    ``sys.stdout`` (or run the script with shell redirection) before the
    crawl starts.
    :return: None
    """
    # The previous revision kept an unused `temp = sys.stdout` plus
    # commented-out file-redirection code; both removed as dead code.
    get_province_code()
if __name__ == "__main__":
main()
# Python crawler: fetches administrative division codes from the National
# Bureau of Statistics of China website. First published 2024-02-02 13:45:47.