Python 获取 区域 四级联动 爬虫源码 (国家统计局 2019)

废话少说,直接上代码:

#!/usr/bin/python
#coding:utf-8
import requests
import time
from bs4 import BeautifulSoup

# Base URL of the National Bureau of Statistics 2019 administrative-division pages.
link = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/'
# Browser-like request headers passed to requests.get.
headers = {
	'User-Agent': (
		'Mozilla/5.0 (Windows NT 6.3; WOW64) '
		'AppleWebKit/537.36 (KHTML, like Gecko) '
		'Chrome/75.0.3770.100 Safari/537.36'
	),
	'Accept-Encoding': 'gzip, deflate',
}

def getprovincetr(url = 'index.html'):
	"""Crawl the province index page and, via getcitytr, the levels below it.

	Every province link found on the page is appended to ``area.sql`` as an
	INSERT statement with parent id 0, then expanded by calling getcitytr.

	Args:
		url: Page name appended to ``link``; defaults to ``'index.html'``.

	Returns:
		A dict keyed by province id (the link's href with ``.html`` removed).
		Each value is ``{'name', 'id', 'pid': 0, 'child'}`` where ``child``
		is the value returned by getcitytr for that province.

	Raises:
		requests.RequestException: if the page cannot be fetched (network
			failure, timeout, or an HTTP error status).
	"""
	# A timeout keeps one stalled connection from hanging the whole crawl.
	r = requests.get(link + url, headers=headers, timeout=30)
	# Stop on an HTTP error instead of parsing an error page as "no provinces".
	r.raise_for_status()
	# Decode the response body as GBK.
	r.encoding = 'gbk'
	soup = BeautifulSoup(r.text, "html.parser")

	provincetr_dist = {}
	for row in soup.find_all('tr', class_='provincetr'):
		for anchor in row.find_all('a'):
			# Links whose text is purely numeric are not province names.
			if anchor.text.isdigit():
				continue
			ids = anchor['href'].replace('.html', '')
			province = {'name': anchor.text, 'id': ids, 'pid': 0, 'child': {}}
			# Double single quotes so a scraped value cannot break the SQL literal.
			values = "','".join(
				str(v).replace("'", "''")
				for v in (province['id'], province['pid'], province['name'])
			)
			# Re-open in append mode per row: getcitytr appends to the same
			# file, and closing before the call keeps rows in crawl order.
			with open('area.sql', 'a+', encoding='utf-8') as f:
				f.write("INSERT INTO area (id,pid,name) VALUES ('" + values + "')\n")
			# Descend into the city level while building this province's entry.
			provincetr_dist[ids] = {'name': anchor.text, 'id': ids, 'pid': 0, 'child': getcitytr(province)}
	return provincetr_dist

def getcitytr(pid =  {'name': '广东省', 'id': '44', 'pid': '', 'child': {}}):
	"""Crawl one province page and, via getcountytr, the levels below it.

	Every city link found on the page is appended to ``area.sql`` as an
	INSERT statement whose parent id is the province id, then expanded by
	calling getcountytr.

	Args:
		pid: Province record; ``pid['id']`` is used both for the page URL
			(``link + id + '.html'``) and as the parent id of each city.
			The default is a record for 广东省 (id '44'). The dict is only
			read, never modified.

	Returns:
		A dict keyed by the city link's href with ``.html`` removed. Each
		value is ``{'name', 'id', 'pid', 'child'}`` where ``child`` is the
		value returned by getcountytr for that city.

	Raises:
		requests.RequestException: if the page cannot be fetched (network
			failure, timeout, or an HTTP error status).
	"""
	# A timeout keeps one stalled connection from hanging the whole crawl.
	r = requests.get(link + pid['id'] + '.html', headers=headers, timeout=30)
	# Stop on an HTTP error instead of parsing an error page as "no cities".
	r.raise_for_status()
	r.encoding = 'gbk'
	soup = BeautifulSoup(r.text, "html.parser")

	citytr_dist = {}
	# City rows carry the 'citytr' class.
	for row in soup.find_all('tr', class_='citytr'):
		for anchor in row.find_all('a'):
			# Links whose text is purely numeric are not city names.
			if anchor.text.isdigit():
				continue
			ids = anchor['href'].replace('.html', '')
			# The city id is the last path segment of the href (presumably of
			# the form '<province>/<city>' -- verify against the live page).
			city_id = ids.rsplit('/', 1)[-1]
			temp = {'name': anchor.text, 'id': city_id, 'pid': pid['id'], 'child': {}}
			# Double single quotes so a scraped value cannot break the SQL literal.
			values = "','".join(
				str(v).replace("'", "''")
				for v in (temp['id'], temp['pid'], temp['name'])
			)
			# Re-open in append mode per row: getcountytr appends to the same
			# file, and closing before the call keeps rows in crawl order.
			with open('area.sql', 'a+', encoding='utf-8') as f:
				f.write("INSERT INTO area (id,pid,name) VALUES ('" + values + "')\n")
			citytr_dist[ids] = {'name': anchor.text, 'id': city_id, 'pid': pid['id'], 'child': getcountytr(temp)}
	return citytr_dist

def getcountytr(pid =  {'name': '东莞市', 'id': '4420', 'pid': '44', 'child': {}}):
	"""Crawl one city page and, via gettowntr, the town level below it.

	If the page lists counties, each county link is appended to ``area.sql``
	with the city id as parent and expanded by gettowntr. If the page has no
	county rows, a single synthetic county (city id + '01', named after the
	city) is written instead, and the city page itself is crawled for towns.

	Args:
		pid: City record. ``pid['pid']`` (province id) and ``pid['id']``
			(city id) form the page URL ``link + pid + '/' + id + '.html'``.
			The default is a record for 东莞市. The dict is only read.

	Returns:
		A dict of county entries ``{'name', 'id', 'pid', 'child'}``. With
		counties present it is keyed by each link's href without ``.html``;
		otherwise it holds one entry keyed by the city id whose ``id`` is the
		synthetic ``city id + '01'``.

	Raises:
		requests.RequestException: if the page cannot be fetched (network
			failure, timeout, or an HTTP error status).
	"""
	# A timeout keeps one stalled connection from hanging the whole crawl.
	r = requests.get(link + pid['pid'] + '/' + pid['id'] + '.html', headers=headers, timeout=30)
	# An unchecked error page has no county rows and would be mistaken for a
	# city without counties, so fail here instead.
	r.raise_for_status()
	r.encoding = 'gbk'
	soup = BeautifulSoup(r.text, "html.parser")
	# County rows carry the 'countytr' class.
	countytr = soup.find_all('tr', class_='countytr')

	if not countytr:
		# Cities without a county level: insert a placeholder county so the
		# hierarchy keeps four levels.
		synthetic_id = pid['id'] + '01'
		# Write the placeholder under the same id that the returned entry and
		# gettowntr's town rows use as parent; writing the city id again would
		# duplicate the city row and make it its own parent.
		values = "','".join(
			str(v).replace("'", "''")
			for v in (synthetic_id, pid['id'], pid['name'])
		)
		with open('area.sql', 'a+', encoding='utf-8') as f:
			f.write("INSERT INTO area (id,pid,name) VALUES ('" + values + "')\n")
		# gettowntr recognises this case by id == pid, so both stay the city id.
		temp = {'name': pid['name'], 'id': pid['id'], 'pid': pid['id'], 'child': {}}
		return {pid['id']: {'name': pid['name'], 'id': synthetic_id, 'pid': pid['id'], 'child': gettowntr(temp)}}

	countytr_dist = {}
	for row in countytr:
		for anchor in row.find_all('a'):
			# Links whose text is purely numeric are not county names.
			if anchor.text.isdigit():
				continue
			ids = anchor['href'].replace('.html', '')
			# Take the last path segment rather than string-replacing a prefix
			# derived from the city and province codes, which yields an empty
			# prefix when the city code repeats the province code.
			county_id = ids.rsplit('/', 1)[-1]
			temp = {'name': anchor.text, 'id': county_id, 'pid': pid['id'], 'child': {}}
			# Double single quotes so a scraped value cannot break the SQL literal.
			values = "','".join(
				str(v).replace("'", "''")
				for v in (temp['id'], temp['pid'], temp['name'])
			)
			# Re-open in append mode per row: gettowntr appends to the same
			# file, and closing before the call keeps rows in crawl order.
			with open('area.sql', 'a+', encoding='utf-8') as f:
				f.write("INSERT INTO area (id,pid,name) VALUES ('" + values + "')\n")
			countytr_dist[ids] = {'name': anchor.text, 'id': county_id, 'pid': pid['id'], 'child': gettowntr(temp)}
	return countytr_dist

def gettowntr(pid =  {'name': '临西县', 'id': '130535', 'pid': '1305', 'child': {}}):
	"""Crawl one county page and record its towns / sub-districts.

	Every town link found on the page is appended to ``area.sql`` as an
	INSERT statement and added to the returned dict.

	Args:
		pid: County record. Normally ``pid['pid']`` is the city id and
			``pid['id']`` the county id, giving the URL
			``link + pid[0:2] + '/' + pid[2:4] + '/' + id + '.html'``.
			When ``pid['id'] == pid['pid']`` (the record getcountytr passes
			for a city without counties) the page is
			``link + pid[0:2] + '/' + id + '.html'`` and towns are attached
			to the synthetic county id ``pid['pid'] + '01'``.
			The default is a record for 临西县. The dict is only read.

	Returns:
		A dict keyed by each town link's href without ``.html``; each value
		is ``{'name', 'id', 'pid'}``.

	Raises:
		requests.RequestException: if the page cannot be fetched (network
			failure, timeout, or an HTTP error status).
	"""
	province_code = pid['pid'][0:2]
	if pid['id'] == pid['pid']:
		# City without a county level: towns are listed on the city page.
		urls = link + province_code + '/' + pid['id'] + '.html'
		# Compute the parent id locally instead of overwriting pid['id'], so
		# the caller's dict (or the shared default argument) is left intact.
		parent_id = pid['pid'] + '01'
	else:
		urls = link + province_code + '/' + pid['pid'][2:4] + '/' + pid['id'] + '.html'
		parent_id = pid['id']

	# A timeout keeps one stalled connection from hanging the whole crawl.
	r = requests.get(urls, headers=headers, timeout=30)
	# Stop on an HTTP error instead of parsing an error page as "no towns".
	r.raise_for_status()
	r.encoding = 'gbk'
	soup = BeautifulSoup(r.text, "html.parser")

	towntr_dist = {}
	# Town rows carry the 'towntr' class.
	for row in soup.find_all('tr', class_='towntr'):
		for anchor in row.find_all('a'):
			# Links whose text is purely numeric are not town names.
			if anchor.text.isdigit():
				continue
			ids = anchor['href'].replace('.html', '')
			# The town id is the last path segment of the href; this does not
			# assume the directory prefix is exactly two characters plus '/'.
			town_id = ids.rsplit('/', 1)[-1]
			towntr_dist[ids] = {'name': anchor.text, 'id': town_id, 'pid': parent_id}
			# Double single quotes so a scraped value cannot break the SQL literal.
			values = "','".join(
				str(v).replace("'", "''")
				for v in (town_id, parent_id, anchor.text)
			)
			with open('area.sql', 'a+', encoding='utf-8') as f:
				f.write("INSERT INTO area (id,pid,name) VALUES ('" + values + "')\n")
	return towntr_dist

# The crawl entry point is left commented out; enable the call below to run it.
# getprovincetr()
print('run finish')



# print(r.content.decode("utf-8","ignore"))

# provincetr=>省
# citytr=>市
# countytr=>区
# towntr=>街道
# villagetr=>村委会

获取的四级联动:https://download.csdn.net/download/Yel_Liang/12658808

 

 

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值