前言
获取最新的省市区县字典数据代码
免责声明:本文仅用于技术分享和学习,因爬取数据造成的任何问题,作者概不负责。
一、Python
import requests
from bs4 import BeautifulSoup
import pymysql
import time
class Administrative(object):
    """Scrape province / city / county division codes and store them in MySQL.

    Instantiating the class opens a MySQL connection, runs the whole crawl via
    ``main()`` and closes the connection again, so ``Administrative()`` is the
    complete program.
    """

    # Index page of the listing; the year is part of the path.
    BASE_URL = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2021/'

    # Seconds to wait for the remote site. Without a timeout a stalled
    # connection would block the crawl indefinitely.
    REQUEST_TIMEOUT = 10

    # Parameterized insert used for every batch of scraped rows.
    INSERT_SQL = ("insert into china (province_name,city_code,city_name,county_code,county_name) "
                  "values (%s,%s,%s,%s,%s)")

    def __init__(self):
        """Connect to the local ``county_db`` database, crawl, then disconnect."""
        self.db = pymysql.connect(host="127.0.0.1", port=3306,
                                  database="county_db",
                                  charset="utf8mb4", user="root",
                                  password="admin")
        try:
            self.main()
        finally:
            # Close the connection even when the crawl aborts with an error.
            self.db.close()

    def main(self):
        """Walk the province index and insert one batch per index-table row."""
        base_url = self.BASE_URL
        for province_row in self.get_response(base_url, 'provincetr'):  # each index row
            datas = []
            # Direct <td> children only: skips whitespace/text nodes in the row.
            for province_cell in province_row.find_all('td', recursive=False):  # each province
                province_link = province_cell.a
                if province_link is None:
                    # Cell without a link carries no province to follow.
                    continue
                province_name = province_link.get_text()
                province_url = base_url + province_link.get('href')
                print(province_name)
                datas.extend(self._collect_province(base_url, province_name, province_url))
            if datas:
                # An empty batch would execute the SQL with unbound %s placeholders.
                self.connect_mysql(self.INSERT_SQL, datas)

    def _collect_province(self, base_url, province_name, province_url):
        """Return ``[province_name, city_code, city_name, county_code, county_name]`` rows.

        One row is produced per entry of every city page linked from
        ``province_url``.
        """
        rows = []
        # The first row of each listing is skipped (presumably the table header).
        for city_row in self.get_response(province_url, None)[1:]:  # each city
            city_cells = city_row.find_all('td')
            if len(city_cells) < 2 or city_cells[1].a is None:
                # No name cell or no link: there is no county page to fetch.
                continue
            city_code = city_cells[0].get_text()
            city_name = city_cells[1].get_text()
            city_url = base_url + city_cells[1].a.get('href')
            for county_row in self.get_response(city_url, None)[1:]:  # each county
                county_cells = county_row.find_all('td')
                if len(county_cells) < 2:
                    continue
                # get_text() equals .string for a single text node but never
                # yields None, which the NOT NULL columns would reject.
                data = [province_name, city_code, city_name,
                        county_cells[0].get_text(), county_cells[1].get_text()]
                print(data)
                rows.append(data)
            time.sleep(1)  # throttle: one city page per second
        return rows

    def get_response(self, url, attr):
        """Fetch ``url`` and return the ``<tr>`` elements of its data table.

        :param url: page to download.
        :param attr: CSS class the rows must have; a falsy value returns every row.
        :returns: the result of ``find_all('tr', ...)`` on the located table.
        :raises requests.RequestException: on timeout, connection failure or
            a non-success HTTP status.
        """
        response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        response.raise_for_status()  # fail here rather than while parsing an error page
        response.encoding = 'utf-8'  # force the decoding used for response.text
        soup = BeautifulSoup(response.text, 'lxml')
        # Fixed path into the page's nested layout tables.
        table = soup.find_all('tbody')[1].tbody.tbody.table
        if attr:
            return table.find_all('tr', attrs={'class': attr})
        return table.find_all('tr')

    def connect_mysql(self, sql, data):
        """Execute ``sql`` on the open connection and return the fetched rows.

        :param sql: statement to run, using ``%s`` placeholders when ``data`` is given.
        :param data: ``None``/empty for no parameters, a flat sequence for one
            statement, or a sequence of lists/tuples for a batch (``executemany``).
        :returns: ``cursor.fetchall()`` on success, ``None`` when the statement
            failed (the error is printed and the transaction rolled back).
        """
        cursor = self.db.cursor()
        try:
            if data:
                if isinstance(data[0], (list, tuple)):
                    cursor.executemany(sql, data)
                else:
                    cursor.execute(sql, data)
            else:
                cursor.execute(sql)
            result = cursor.fetchall()
        except Exception as e:
            # Best effort: report, undo the failed batch and let the crawl go on.
            print(e)
            self.db.rollback()
            return None
        else:
            # Commit only on the success path, never after a rollback.
            self.db.commit()
            return result
        finally:
            cursor.close()
if __name__ == "__main__":
    # Constructing the class connects to MySQL, runs the crawl and closes
    # the connection, so no further calls are needed.
    Administrative()
二、数据表SQL
-- Destination table for the scraper's INSERT: one row per scraped county entry.
-- Columns take the table's default character set and collation, so no
-- per-column CHARACTER SET / COLLATE clauses are needed.
CREATE TABLE `china` (
  `province_name` VARCHAR(200) NOT NULL,
  `city_code`     VARCHAR(32)  NOT NULL,
  `city_name`     VARCHAR(200) NOT NULL,
  `county_code`   VARCHAR(32)  NOT NULL,
  `county_name`   VARCHAR(32)  NOT NULL,
  PRIMARY KEY (`city_code`, `county_code`)
)
  ENGINE = InnoDB
  DEFAULT CHARSET = utf8mb4
  COLLATE = utf8mb4_0900_ai_ci;
总结
亲测可用。修改 base_url 中的年份(例如 2021),即可获取对应年份的最新数据!