使用Python获取最新的行政区划(省市区)数据
一.说明
1.此数据从 国家统计局 获取
2.此脚本仅仅包含省市区,如需乡镇村,只需按照格式,自己做匹配开发就行了
3.py3
二.脚本
1.python脚本
import requests
from bs4 import BeautifulSoup
import pymysql
import time
"""
从国家统计局爬取省市区数据
"""
class Administrative(object):
def __init__(self):
self.db = pymysql.connect("localhost", "root", "123456", "test", charset="utf8mb4")
self.main()
self.db.close()
def main(self):
# 年份
year = 2019
base_url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/%s/' % year
sql = "insert into dict_administrative_area (area_code,area_name,parent_code,parent_id,area_level) values (%s,%s,%s,%s,%s)"
trs = self.get_response(base_url, 'provincetr')
for tr in trs: # 循环每一行
for td in tr: # 循环每个省
if td.a is None:
continue
href_url = td.a.get('href')
province_name = td.a.get_text()
province_code = str(href_url.split(".")[0]) + "0000000000";
province_url = base_url + href_url
# print(province_code)
print(province_name)
# print(province_url)
# 插入省份数据并获取主键
province_data = [province_code, province_name, '0', 0, 1]
province_id = self.connect_mysql(sql, province_data)
trs = self.get_response(province_url, None)
for tr in trs[1:]: # 循环每个市
city_code = tr.find_all('td')[0].string
city_name = tr.find_all('td')[1].string
# 插入城市数据并获取主键
city_data = [city_code, city_name, province_code, province_id, 2]
city_id = self.connect_mysql(sql, city_data)
city_url = base_url + tr.find_all('td')[1].a.get('href')
trs = self.get_response(city_url, None)
for tr in trs[1:]: # 循环每个区县
county_code = tr.find_all('td')[0].string
county_name = tr.find_all('td')[1].string
# 插入区县数据并获取主键
county_data = [county_code, county_name, city_code, city_id, 3]
county_id = self.connect_mysql(sql, county_data)
# time.sleep(1)
time.sleep(1)
time.sleep(1)
@staticmethod
def get_response(url, attr):
response = requests.get(url)
response.encoding = 'gb2312' # 编码转换
soup = BeautifulSoup(response.text, features="html.parser")
table = soup.find_all('tbody')[1].tbody.tbody.table
if attr:
trs = table.find_all('tr', attrs={'class': attr})
else:
trs = table.find_all('tr')
return trs
def connect_mysql(self, sql, data):
cursor = self.db.cursor()
try:
result = None
if data:
if isinstance(data[0], list):
cursor.executemany(sql, data)
result = self.db.insert_id()
print()
else:
cursor.execute(sql, data)
result = self.db.insert_id()
else:
cursor.execute(sql)
cursor.fetchall()
result = self.db.insert_id()
except Exception as e:
print(e)
self.db.rollback();
finally:
cursor.close()
self.db.commit(); # 提交操作
return result
if __name__ == '__main__':
Administrative()
2.数据库schema.sql
CREATE TABLE `dict_administrative_area` (
`id` bigint(20) NOT NULL AUTO_INCREMENT,
`area_code` varchar(100) DEFAULT NULL COMMENT '地区编码',
`area_name` varchar(100) DEFAULT NULL COMMENT '地区名称',
`area_level` smallint(6) DEFAULT NULL COMMENT '地区等级: 1 省 2 市 3 区 4 区县 5 乡镇 6 村/社区',
`parent_code` varchar(100) DEFAULT '' COMMENT '上级编码',
`parent_id` bigint(20) DEFAULT '0' COMMENT '上级ID',
`status` tinyint(1) DEFAULT '1' COMMENT '状态 0 禁用 1 启用',
`gmt_create` timestamp NULL DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
`gmt_modify` timestamp NULL DEFAULT NULL COMMENT '修改时间',
`create_by` bigint(20) DEFAULT '0' COMMENT '创建人',
`modify_by` bigint(20) DEFAULT '0' COMMENT '修改人',
`active` tinyint(1) DEFAULT '1' COMMENT '是否有效 0 否 1 是',
PRIMARY KEY (`id`) USING BTREE
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='地区';