废话少说,直接上代码:
#!/usr/bin/python
#coding:utf-8
import requests
import time
from bs4 import BeautifulSoup
# Base URL of the National Bureau of Statistics pages listing the 2019
# administrative-division codes; every page path below is appended to it.
link = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/'

# Browser-like request headers sent with every page fetch.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
    'Accept-Encoding': 'gzip, deflate',
}
def getprovincetr(url = 'index.html'):
    """Fetch the province index page and crawl every province found on it.

    For each province link the function appends one INSERT statement to
    ``area.sql`` and then calls ``getcitytr`` for that province, so rows are
    written parent-first.

    Args:
        url: Page path appended to the module-level ``link`` base URL.

    Returns:
        dict: Maps each province href (with '.html' removed) to a node
        ``{'name', 'id', 'pid', 'child'}`` where ``pid`` is 0 and ``child``
        is whatever ``getcitytr`` returned for that province.
    """
    response = requests.get(link + url, headers=headers)
    # The pages are decoded as GBK before parsing.
    response.encoding = 'gbk'
    page = BeautifulSoup(response.text, "html.parser")

    # Flatten every <a> inside the rows tagged with the 'provincetr' class.
    anchors = [
        anchor
        for row in page.find_all('tr', class_='provincetr')
        for anchor in row.find_all('a')
    ]

    provinces = {}
    for anchor in anchors:
        # Links whose text is purely numeric are skipped; only name links are kept.
        if anchor.text.isdigit():
            continue

        code = anchor['href'].replace('.html', '')
        node = {'name': anchor.text, 'id': code, 'pid': 0, 'child': {}}

        # Append the province row before descending, so parents precede children.
        values = "','".join([node['id'], str(node['pid']), node['name']])
        with open('area.sql', 'a+', encoding='utf-8') as sql_file:
            sql_file.write("INSERT INTO area (id,pid,name) VALUES ('" + values + "')\n")

        provinces[code] = {
            'name': anchor.text,
            'id': code,
            'pid': 0,
            'child': getcitytr(node),
        }
    return provinces
def getcitytr(pid = {'name': '广东省', 'id': '44', 'pid': '', 'child': {}}):
    """Fetch one province page and crawl every city listed on it.

    Each city gets one INSERT statement appended to ``area.sql`` and is then
    handed to ``getcountytr``.

    Args:
        pid: Province node; only its ``'id'`` key is read here. The default
            value is never modified by this function.

    Returns:
        dict: Maps each city href (with '.html' removed, still carrying the
        ``'<province id>/'`` prefix) to a node ``{'name', 'id', 'pid',
        'child'}``; ``id`` is the href with that prefix removed, ``pid`` is
        the province id and ``child`` is the result of ``getcountytr``.
    """
    province_id = pid['id']

    response = requests.get(link + province_id + '.html', headers=headers)
    response.encoding = 'gbk'
    page = BeautifulSoup(response.text, "html.parser")

    cities = {}
    for row in page.find_all('tr', class_='citytr'):
        for anchor in row.find_all('a'):
            # Links whose text is purely numeric are skipped; only name links are kept.
            if anchor.text.isdigit():
                continue

            href_path = anchor['href'].replace('.html', '')
            city_id = href_path.replace(province_id + '/', '')
            node = {'name': anchor.text, 'id': city_id, 'pid': province_id, 'child': {}}

            # Append the city row before descending into its counties.
            values = "','".join([node['id'], str(node['pid']), node['name']])
            with open('area.sql', 'a+', encoding='utf-8') as sql_file:
                sql_file.write("INSERT INTO area (id,pid,name) VALUES ('" + values + "')\n")

            cities[href_path] = {
                'name': anchor.text,
                'id': city_id,
                'pid': province_id,
                'child': getcountytr(node),
            }
    return cities
def getcountytr(pid = {'name': '东莞市', 'id': '4420', 'pid': '44', 'child': {}}):
    """Fetch one city page and crawl its districts/counties.

    Each county gets one INSERT statement appended to ``area.sql`` and is then
    handed to ``gettowntr``. When the page has no ``countytr`` rows (the
    original author's note mentions cities such as Dongguan that have no
    district level), a single placeholder district with id
    ``<city id> + '01'`` is written instead and the towns are crawled
    directly from the city page.

    Args:
        pid: City node; the keys ``'name'``, ``'id'`` and ``'pid'`` (province
            id) are read. The default value is never modified here.

    Returns:
        dict: With county rows present, maps each county href (with '.html'
        removed) to ``{'name', 'id', 'pid', 'child'}`` where ``id`` is the
        last path segment of the href, ``pid`` is the city id and ``child``
        is the result of ``gettowntr``. Otherwise a one-entry dict keyed by
        the city id holding the placeholder district node.
    """
    city_id = pid['id']

    response = requests.get(link + pid['pid'] + '/' + city_id + '.html', headers=headers)
    response.encoding = 'gbk'
    page = BeautifulSoup(response.text, "html.parser")
    rows = page.find_all('tr', class_='countytr')

    if not rows:
        # City without a county level: insert a placeholder district.
        # The row must carry the same synthetic id that the returned node and
        # the town rows (written by gettowntr) use as their parent; writing
        # the bare city id here would duplicate the city's own row and make
        # the row its own parent.
        district_id = city_id + '01'
        with open('area.sql', 'a+', encoding='utf-8') as sql_file:
            sql_file.write("INSERT INTO area (id,pid,name) VALUES ('"+district_id+"','"+str(city_id)+"','"+pid['name']+"')\n")

        # gettowntr detects this case by id == pid, so the node passed down
        # keeps the city id in both fields.
        city_marker = {'name': pid['name'], 'id': city_id, 'pid': city_id, 'child': {}}
        return {
            city_id: {
                'name': pid['name'],
                'id': district_id,
                'pid': city_id,
                'child': gettowntr(city_marker),
            }
        }

    counties = {}
    for row in rows:
        for anchor in row.find_all('a'):
            # Links whose text is purely numeric are skipped; only name links are kept.
            if anchor.text.isdigit():
                continue

            href_path = anchor['href'].replace('.html', '')
            # Take the last path segment as the county id. Deriving the
            # directory prefix with str.replace on the city id would strip
            # every occurrence of the province code, not only the leading one.
            county_id = href_path.rsplit('/', 1)[-1]
            node = {'name': anchor.text, 'id': county_id, 'pid': city_id, 'child': {}}

            # Append the county row before descending into its towns.
            with open('area.sql', 'a+', encoding='utf-8') as sql_file:
                sql_file.write("INSERT INTO area (id,pid,name) VALUES ('"+node['id']+"','"+str(node['pid'])+"','"+node['name']+"')\n")

            counties[href_path] = {
                'name': anchor.text,
                'id': county_id,
                'pid': city_id,
                'child': gettowntr(node),
            }
    return counties
def gettowntr(pid = {'name': '临西县', 'id': '130535', 'pid': '1305', 'child': {}}):
    """Fetch one county page and record its towns / sub-districts.

    One INSERT statement per town is appended to ``area.sql``. This is the
    deepest level crawled; no further function is called from here.

    Args:
        pid: County node; the keys ``'id'`` and ``'pid'`` are read. A node
            whose ``'id'`` equals its ``'pid'`` marks a city without a county
            level: the towns are then read from the city page and attached to
            the synthetic parent id ``<city id> + '01'``. The argument is
            only read, never modified.

    Returns:
        dict: Maps each town href (with '.html' removed) to
        ``{'name', 'id', 'pid'}`` where ``id`` is the href without its first
        three characters and ``pid`` is the parent id described above.
    """
    is_countyless_city = pid['id'] == pid['pid']
    if is_countyless_city:
        # Towns hang directly off the city page: <province>/<city>.html
        url = link + pid['pid'][0:2] + '/' + pid['id'] + '.html'
        # Keep the synthetic parent id in a local instead of writing it back
        # into the caller's dict.
        parent_id = pid['pid'] + '01'
    else:
        # Regular county page: <province>/<city suffix>/<county>.html
        url = link + pid['pid'][0:2] + '/' + pid['pid'][2:4] + '/' + pid['id'] + '.html'
        parent_id = pid['id']

    response = requests.get(url, headers=headers)
    response.encoding = 'gbk'
    page = BeautifulSoup(response.text, "html.parser")

    towns = {}
    sql_lines = []
    for row in page.find_all('tr', class_='towntr'):
        for anchor in row.find_all('a'):
            # Links whose text is purely numeric are skipped; only name links are kept.
            if anchor.text.isdigit():
                continue

            href_path = anchor['href'].replace('.html', '')
            # Drops the first three characters of the href
            # (presumably a 'NN/' directory prefix - verify against the pages).
            town_id = href_path[3:]
            towns[href_path] = {'name': anchor.text, 'id': town_id, 'pid': parent_id}
            sql_lines.append("INSERT INTO area (id,pid,name) VALUES ('"+town_id+"','"+str(parent_id)+"','"+anchor.text+"')\n")

    # Nothing else writes to the file while this loop runs, so the rows can be
    # appended in one go; the file is left untouched when no town was found.
    if sql_lines:
        with open('area.sql', 'a+', encoding='utf-8') as sql_file:
            sql_file.writelines(sql_lines)
    return towns
# Entry point: uncomment the call below to start the crawl from the province index.
# getprovincetr()
print('run finish')
# Debug aid for use inside a fetch function:
#   print(r.content.decode("utf-8", "ignore"))
# CSS class of the table rows on the source pages, by administrative level:
#   provincetr => province
#   citytr     => city
#   countytr   => district / county
#   towntr     => town / sub-district
#   villagetr  => village committee
获取的四级联动:https://download.csdn.net/download/Yel_Liang/12658808