前言
获取最新的省市县乡村五级字典数据代码
免责声明:爬取数据造成任何问题,概不负责,本文只做技术分享和学习。
网络上能找到的代码大多只爬取到三级(省、市、县),而且请求超时后没有做任何错误处理;我在这里补全了五级,并加上了超时重试(最多三次)。
def get_response(self, url, attr):
    """Download *url* and return the ``<tr>`` rows of its data table.

    The request is attempted up to three times. A failed attempt is a
    network error or timeout, an HTTP error status, or a page whose
    markup does not contain the expected nested table.

    :param url: address of the page to fetch.
    :param attr: value of the ``class`` attribute the rows must carry;
        a falsy value selects every row of the table.
    :return: the matching rows, or ``None`` when all three attempts failed.
    """
    for _ in range(3):
        try:
            response = requests.get(url, timeout=5)
            # Treat 4xx/5xx answers as failures so that they are retried too.
            response.raise_for_status()
            response.encoding = 'utf-8'  # force the text decoding
            soup = BeautifulSoup(response.text, 'lxml')
            table = soup.find_all('tbody')[1].tbody.tbody.table
            if attr:
                return table.find_all('tr', attrs={'class': attr})
            return table.find_all('tr')
        except requests.exceptions.RequestException:
            print("请求超时")
        except (IndexError, AttributeError):
            # The page was delivered but does not have the expected layout.
            print("页面结构异常")
    # Every attempt failed; callers must be prepared for None.
    return None
一、运行环境
Python3.9
用到的Python第三方库(需要安装到所使用的Python Interpreter中)
requests、BeautifulSoup、pymysql、lxml
二、Python代码
# This is a sample Python script.
# Press ⌃R to execute it or replace it with your code.
# Press Double ⇧ to search everywhere for classes, files, tool windows, actions, and settings.
import requests
from bs4 import BeautifulSoup
import pymysql
import time
class Administrative(object):
    """Crawl the five-level administrative division tables into MySQL.

    Creating an instance does all the work: it connects to the database,
    walks the province -> city -> county -> town -> village pages starting
    from the 2021 index page, inserts one row per division and finally
    closes the connection.
    """

    def __init__(self):
        # NOTE(review): host and credentials are hard-coded; consider reading
        # them from configuration before sharing this script.
        self.db = pymysql.connect(host="127.0.0.1", port=3306,
                                  database="jkgs",
                                  charset="utf8mb4", user="root",
                                  password="Smile2022")
        try:
            self.main()
        finally:
            # Release the connection even when the crawl aborts half-way.
            self.db.close()

    def main(self):
        """Walk the index page and store every division below it.

        Each stored row is ``[area_code, area_name, parent_code, level]``
        with level 1 (province) to 5 (village); provinces use ``'0'`` as
        their parent code.
        """
        base_url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2021/'
        sql = "insert into area_2021 (area_code,area_name,parent_code,level) values (%s,%s,%s,%s)"
        province_rows = self.get_response(base_url, 'provincetr')
        if province_rows is None:
            # Without the index page there is nothing to crawl.
            print(base_url)
            return
        for province_row in province_rows:  # one table row holds several provinces
            for cell in province_row.find_all('td'):
                link = cell.a
                href = link.get('href') if link is not None else None
                if not href:
                    # Padding cell without a province link.
                    continue
                province_code = href.replace('.html', '')
                province_name = self._text(link)
                self._save(sql, [province_code, province_name, '0', 1])
                self._crawl_cities(sql, base_url, base_url + href, province_code)

    def _crawl_cities(self, sql, base_url, province_url, province_code):
        """Store the cities listed on *province_url* and descend into each."""
        # County links are resolved against the province directory.
        county_root = base_url + province_code + '/'
        for city_code, city_name, href in self._linked_rows(province_url):
            self._save(sql, [city_code, city_name, province_code, 2])
            self._crawl_counties(sql, county_root, base_url + href, city_code)

    def _crawl_counties(self, sql, county_root, city_url, city_code):
        """Store the counties listed on *city_url* and descend into each."""
        for county_code, county_name, href in self._linked_rows(city_url):
            self._save(sql, [county_code, county_name, city_code, 3])
            self._crawl_towns(sql, county_root + href, county_code)

    def _crawl_towns(self, sql, county_url, county_code):
        """Store the towns listed on *county_url* and descend into each."""
        # Town links are relative to the directory of the county page.
        town_root = county_url[:county_url.rfind('/') + 1]
        for town_code, town_name, href in self._linked_rows(county_url):
            self._save(sql, [town_code, town_name, county_code, 4])
            self._crawl_villages(sql, town_root + href, town_code)

    def _crawl_villages(self, sql, town_url, town_code):
        """Store the villages listed on *town_url* (the last level)."""
        for row in self._fetch_rows(town_url):
            cells = row.find_all('td')
            if len(cells) < 3:
                continue
            # Village rows keep the name in the third cell, not the second.
            self._save(sql, [self._text(cells[0]), self._text(cells[2]), town_code, 5])

    def _linked_rows(self, url):
        """Yield ``(code, name, href)`` for each data row of *url* that links to a sub-page.

        Rows whose name cell carries no link cannot be followed and are
        skipped, as are rows with fewer than two cells.
        """
        for row in self._fetch_rows(url):
            cells = row.find_all('td')
            if len(cells) < 2 or cells[1].a is None:
                continue
            href = cells[1].a.get('href')
            if not href:
                continue
            yield self._text(cells[0]), self._text(cells[1]), href

    def _fetch_rows(self, url):
        """Return the rows of *url* without the header row; ``[]`` if the page failed."""
        rows = self.get_response(url, None)
        if rows is None:
            # Report the page that could not be loaded and carry on.
            print(url)
            return []
        return rows[1:]

    @staticmethod
    def _text(tag):
        """Return the visible text of *tag* as a plain, whitespace-trimmed ``str``."""
        return tag.get_text(strip=True)

    def _save(self, sql, row):
        """Print *row* as progress output and insert it with *sql*."""
        print(row)
        self.connect_mysql(sql, [row])

    def get_response(self, url, attr):
        """Download *url* and return the ``<tr>`` rows of its data table.

        The request is attempted up to three times. A failed attempt is a
        network error or timeout, an HTTP error status, or a page whose
        markup does not contain the expected nested table.

        :param url: address of the page to fetch.
        :param attr: value of the ``class`` attribute the rows must carry;
            a falsy value selects every row of the table.
        :return: the matching rows, or ``None`` when all three attempts failed.
        """
        for _ in range(3):
            try:
                response = requests.get(url, timeout=5)
                # Treat 4xx/5xx answers as failures so that they are retried too.
                response.raise_for_status()
                response.encoding = 'utf-8'  # force the text decoding
                soup = BeautifulSoup(response.text, 'lxml')
                table = soup.find_all('tbody')[1].tbody.tbody.table
                if attr:
                    return table.find_all('tr', attrs={'class': attr})
                return table.find_all('tr')
            except requests.exceptions.RequestException:
                print("请求超时")
            except (IndexError, AttributeError):
                # The page was delivered but does not have the expected layout.
                print("页面结构异常")
        # Every attempt failed; callers must be prepared for None.
        return None

    def connect_mysql(self, sql, data):
        """Run *sql* on the open connection and commit it.

        :param sql: statement to execute.
        :param data: ``None``/empty for a statement without parameters, one
            flat parameter sequence for a single execution, or a list of
            parameter lists/tuples for a batch execution.
        :return: whatever ``cursor.fetchall()`` yields on success, ``None``
            when the statement failed (the error is printed and the
            transaction rolled back).
        """
        result = None
        cursor = self.db.cursor()
        try:
            if not data:
                cursor.execute(sql)
            elif isinstance(data[0], (list, tuple)):
                cursor.executemany(sql, data)
            else:
                cursor.execute(sql, data)
            result = cursor.fetchall()
            # Commit only when the statement went through.
            self.db.commit()
        except Exception as e:
            print(e)
            self.db.rollback()
            result = None
        finally:
            # No return here: a return inside finally would also swallow
            # KeyboardInterrupt and SystemExit.
            cursor.close()
        return result
# Script entry point: building the object runs the whole crawl.
if __name__ == "__main__":
    Administrative()
三、数据表SQL
行政区划表
-- Target table of the crawler's INSERT statement (schema `jkgs`, table `area_2021`).
-- The crawler fills area_code, area_name, parent_code and level;
-- `status` is not part of that INSERT and therefore takes its default '1'.
-- area_code is the primary key, so inserting the same code twice is rejected.
CREATE TABLE `jkgs`.`area_2021` (
`area_code` varchar(32) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NOT NULL COMMENT '行政区划编码',
`area_name` varchar(200) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NOT NULL COMMENT '行政区划名称',
-- Code of the parent division; the crawler writes '0' for provinces.
`parent_code` varchar(32) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NOT NULL COMMENT '父级行政区划编码',
-- 1 = province, 2 = city, 3 = county, 4 = town, 5 = village (values written by the crawler).
`level` int NOT NULL COMMENT '级别',
`status` varchar(2) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NOT NULL DEFAULT '1' COMMENT '状态值',
PRIMARY KEY (`area_code`) USING BTREE
) ENGINE = InnoDB CHARACTER SET = utf8mb4 COLLATE = utf8mb4_0900_ai_ci ROW_FORMAT = Dynamic;