强烈不建议直接抓取: 如果因频繁请求导致对方服务异常, 可能需要承担相应的责任, 请务必慎重. 推荐直接在某宝上购买现成的数据表. 本文仅供学习使用.
行政区域地址:
http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2021/index.html
MySQL表结构
-- One row per administrative division; the crawler below stores every level
-- (province, city, county, town, village) in this single table.
-- `parent_code` links a row to its parent; provinces are inserted with '0'.
-- Column order matters: the crawler inserts positionally (code, name, parent_code).
CREATE TABLE `region` (
`code` varchar(32) NOT NULL COMMENT '行政编码',
`name` varchar(128) NOT NULL COMMENT '名称',
`parent_code` varchar(32) NOT NULL COMMENT '父级行政编码',
PRIMARY KEY (`code`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci COMMENT='行政划分区域';
代码
抓取过程中请求可能卡住或失败, 因此每一级在继续向下抓取之前, 都会先查询数据库: 如果该编码在表中已经存在下级记录, 就直接跳过, 避免重复处理. 中途失败后重新运行即可从断点继续.
import re
import requests as req
from bs4 import BeautifulSoup
from region_reptile.region_db import RegionDB
# Base URL of the 2021 division-code pages; every page href is appended to it.
city_Prefix = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2021/'
# Shared database helper; the crawl functions use its query() and save_all().
region_db = RegionDB()
# 从首页获取省份列表
def province_list():
    """Crawl the index page, descend into every province, then save the provinces.

    Each province is stored as ``[code, name, '0']`` ('0' marks "no parent").
    Before descending, the database is asked whether rows with this province
    as ``parent_code`` already exist; if so the subtree is skipped, which is
    what lets an interrupted run be restarted.

    Raises:
        requests.RequestException: on timeout or a non-2xx response.
        RuntimeError: if the page does not contain the province table.
    """
    # A timeout keeps a stalled connection from hanging the whole crawl.
    data = req.get(f'{city_Prefix}index.html', timeout=30)
    data.raise_for_status()
    data.encoding = 'utf-8'
    table = BeautifulSoup(data.text, 'html.parser').find('table', 'provincetable')
    if table is None:
        raise RuntimeError('provincetable not found in index.html')

    array = []
    for province in table.select('tr.provincetr > td'):
        print(province.text)
        link = province.find('a')
        if link is None:
            # Padding cell without a link: nothing to record.
            continue
        href = link.get('href')
        c_array = [href.replace('.html', ''), link.text, '0']
        array.append(c_array)
        # NOTE(review): presumably query() returns None when no row matches;
        # confirm against RegionDB.
        if region_db.query(f'select code from region where parent_code={c_array[0]}') is not None:
            continue
        city_list(c_array[0], href)
    # Parents are written after their subtrees have been processed.
    region_db.save_all("insert ignore into region values(%s, %s, %s)", array)
# 根据省份获取市列表
def city_list(province_id, province_href):
    """Crawl one province page, descend into every city, then save the cities.

    Args:
        province_id: Code of the parent province, stored as ``parent_code``.
        province_href: Page path of the province relative to ``city_Prefix``
            (e.g. ``'41.html'``).

    Raises:
        requests.RequestException: on timeout or a non-2xx response.
        RuntimeError: if the page does not contain the city table.
    """
    # A timeout keeps a stalled connection from hanging the whole crawl.
    data = req.get(f'{city_Prefix}{province_href}', timeout=30)
    data.raise_for_status()
    data.encoding = 'utf-8'
    table = BeautifulSoup(data.text, 'html.parser').find('table', 'citytable')
    if table is None:
        raise RuntimeError(f'citytable not found in {province_href}')

    array = []
    # The second cell of each row holds the name; its link carries the code.
    for city in table.select('tr.citytr > td:nth-of-type(2)'):
        print(city.text)
        link = city.find('a')
        if link is None:
            # No link means no sub-page and no href to derive the code from.
            continue
        href = link.get('href')
        # href looks like '41/4109.html'; the file name without suffix is the code.
        c_array = [href.split('/')[1].replace('.html', ''), link.text, province_id]
        array.append(c_array)
        # Skip subtrees whose children are already stored (resume support).
        if region_db.query(f'select code from region where parent_code={c_array[0]}') is not None:
            continue
        county_list(c_array[0], build_href(province_href, href))
    region_db.save_all("insert ignore into region values(%s, %s, %s)", array)
# 根据市获取区县列表
def county_list(city_id, city_href):
    """Crawl one city page, descend into every county, then save the counties.

    Rows whose name cell has no link are printed but not stored.

    Args:
        city_id: Code of the parent city, stored as ``parent_code``.
        city_href: Page path of the city relative to ``city_Prefix``
            (e.g. ``'41/4109.html'``).

    Raises:
        requests.RequestException: on timeout or a non-2xx response.
        RuntimeError: if the page contains neither a county nor a town table.
    """
    # A timeout keeps a stalled connection from hanging the whole crawl.
    data = req.get(f'{city_Prefix}{city_href}', timeout=30)
    data.raise_for_status()
    data.encoding = 'utf-8'
    soup = BeautifulSoup(data.text, 'html.parser')
    table = soup.find('table', 'countytable')
    if table is None:
        # NOTE(review): some cities appear to have no county level and list
        # towns directly on this page; confirm against the live site. In that
        # case the towns are stored with the city as their parent.
        if soup.find('table', 'towntable') is not None:
            town_list(city_id, city_href)
            return
        raise RuntimeError(f'countytable not found in {city_href}')

    array = []
    for county in table.select('tr.countytr > td:nth-of-type(2)'):
        print(county.text)
        link = county.find('a')
        if link is None:
            continue
        href = link.get('href')
        # href looks like '09/410923.html'; the file name without suffix is the code.
        c_array = [href.split('/')[1].replace('.html', ''), link.text, city_id]
        array.append(c_array)
        # Skip subtrees whose children are already stored (resume support).
        if region_db.query(f'select code from region where parent_code={c_array[0]}') is not None:
            continue
        town_list(c_array[0], build_href(city_href, href))
    region_db.save_all("insert ignore into region values(%s, %s, %s)", array)
# 根据区县获取乡镇列表
def town_list(country_id, country_href):
    """Crawl one county page, descend into every town, then save the towns.

    Args:
        country_id: Code of the parent county, stored as ``parent_code``.
        country_href: Page path of the county relative to ``city_Prefix``
            (e.g. ``'41/09/410923.html'``).

    Raises:
        requests.RequestException: on timeout or a non-2xx response.
        RuntimeError: if the page does not contain the town table.
    """
    # A timeout keeps a stalled connection from hanging the whole crawl.
    data = req.get(f'{city_Prefix}{country_href}', timeout=30)
    data.raise_for_status()
    data.encoding = 'utf-8'
    table = BeautifulSoup(data.text, 'html.parser').find('table', 'towntable')
    if table is None:
        raise RuntimeError(f'towntable not found in {country_href}')

    array = []
    for town in table.select('tr.towntr > td:nth-of-type(2)'):
        print(town.text)
        link = town.find('a')
        if link is None:
            # No link means no sub-page and no href to derive the code from.
            continue
        href = link.get('href')
        # href looks like '23/410923206.html'; the file name without suffix is the code.
        c_array = [href.split('/')[1].replace('.html', ''), link.text, country_id]
        array.append(c_array)
        # Skip towns whose villages are already stored (resume support).
        if region_db.query(f'select code from region where parent_code={c_array[0]}') is not None:
            continue
        village_list(c_array[0], build_href(country_href, href))
    region_db.save_all("insert ignore into region values(%s, %s, %s)", array)
# 根据乡镇获取街道列表
def village_list(town_id, town_href):
    """Crawl one town page and save its villages (the leaf level).

    Each village row is stored as ``[code, name, town_id]``, where the code
    is the text of the first cell and the name the text of the third cell.

    Args:
        town_id: Code of the parent town, stored as ``parent_code``.
        town_href: Page path of the town relative to ``city_Prefix``
            (e.g. ``'41/09/23/410923206.html'``).

    Raises:
        requests.RequestException: on timeout or a non-2xx response.
        RuntimeError: if the page does not contain the village table.
    """
    # A timeout keeps a stalled connection from hanging the whole crawl.
    data = req.get(f'{city_Prefix}{town_href}', timeout=30)
    data.raise_for_status()
    data.encoding = 'utf-8'
    table = BeautifulSoup(data.text, 'html.parser').find('table', 'villagetable')
    if table is None:
        raise RuntimeError(f'villagetable not found in {town_href}')

    array = []
    for village in table.select('tr.villagetr'):
        print(village.text)
        code = village.select('td:nth-of-type(1)')[0].text
        name = village.select('td:nth-of-type(3)')[0].text
        array.append([code, name, town_id])
    region_db.save_all("insert ignore into region values(%s, %s, %s)", array)
def build_href(p_href, c_href):
    """Resolve a child page href against the href of the page that linked to it.

    The trailing ``<digits>.html`` file name of ``p_href`` is removed, leaving
    its directory part, and ``c_href`` is appended.

    Args:
        p_href: Parent page path relative to the site root, e.g. ``'41/4109.html'``.
        c_href: Child href as written on the parent page, e.g. ``'09/410923.html'``.

    Returns:
        The child path relative to the site root, e.g. ``'41/09/410923.html'``.
    """
    # Escaped dot and end anchor: strip only the final file name, never an
    # earlier substring that merely ends in "html".
    return re.sub(r'[0-9]*\.html$', '', p_href) + c_href
# Debug entry points: uncomment one call to crawl a single subtree instead of
# the whole country. Arguments are (parent code, page path relative to city_Prefix).
# city_list('41', '41.html')
# county_list('4109', '41/4109.html')
# town_list('410923', '41/09/410923.html')
# village_list('410923206', '41/09/23/410923206.html')
# NOTE(review): the id below (110101) does not match the page (110102) — confirm intent.
# town_list('110101', '11/01/110102.html')
# Full crawl, starting from the index page. Runs on import of this module.
province_list()
运行结果
北京市
天津市
市辖区
和平区
...
造甲城镇
120117109201121造甲城村委会
120117109202220大王台村委会
120117109203220冯家台村委会
120117109204220付家台村委会
120117109205220东小王台村委会
120117109206220西小王台村委会
120117109207220田辛庄村委会
120117109208220赵温庄村委会