### 源代码参考
源代码: pedll/pcloud-cdn - Gitee.com
源文章:Wiki - Gitee.com
### 数据库
数据库: localhost:3306 账号: test 密码: test (本机单独测试库)
表格:见附件 sys_area.sql
SET NAMES utf8mb4;
SET FOREIGN_KEY_CHECKS = 0;
-- ----------------------------
-- Table structure for sys_area
-- ----------------------------
DROP TABLE IF EXISTS `sys_area`;
CREATE TABLE `sys_area` (
`AREA_ID` varchar(50) COLLATE utf8mb4_bin NOT NULL COMMENT '主键ID',
`AREA_NAME` varchar(50) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '区域名称',
`AREA_CODE` varchar(50) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '区域编码',
`AREA_LVL` varchar(5) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '区域级别',
`LANG_KEY` varchar(50) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '区域国际化语言KEY',
`LOCALE_NAME` varchar(50) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '本地语言',
`TIME_ZONE` tinyint(4) DEFAULT NULL COMMENT '市区',
`AREA_STATE` varchar(5) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '国家码状态',
`SORT_ODR` smallint(6) DEFAULT NULL COMMENT '显示序列排名',
`PAR_AREA_ID` varchar(50) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '上级ID',
`CRT_UID` varchar(45) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '创建用ID, 默认 999, 系统创建',
`CRT_DATE` datetime DEFAULT NULL COMMENT '创建日期',
`UPD_UID` varchar(45) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '更新用户ID',
`UPD_DATE` datetime DEFAULT NULL COMMENT '更新日期',
`DEL_STATE` varchar(5) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '删除状态 ',
PRIMARY KEY (`AREA_ID`) USING BTREE
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_bin;
SET FOREIGN_KEY_CHECKS = 1;
### 下载规则
从省份 -> 城市 -> 县/区 -> 镇/街道办事处 -> 乡/社区
省份编码: 86 + 省份编码(2位)
城市编码:86 + 省份编码(2位) + 城市编码(2位)
县/区编码: 86 + 省份编码(2位) + 城市编码(2位)+ 区/县编码(2位)
镇/街道办编码: 86 + 省份编码(2位) + 城市编码(2位)+ 区/县编码(2位)+ 镇/街道办编码(3位)
乡/社区编码:86 + 省份编码(2位) + 城市编码(2位)+ 区/县编码(2位)+ 镇/街道办编码(3位) + 乡/社区编码(3位)
### 数据说明
合计耗时 10小时,67W条记录,100MB存储空间,具体见源代码附件zip文件
### 性能说明
此不考虑性能,不考虑架构等,只需要将数据下载并插入到数据库中
### 代码
# !/usr/bin/env python3
# -*- coding: UTF-8 -*-
# @Author : johnli
# @Time: 02/11/2021 21:33
# The estimate time is 10 hours, 669707 records in 100MB
import logging
import random
import time
import pymysql
import requests
from lxml import etree
statsBaseUrl = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/'
headers = {
# 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36 '
'User-Agent': 'Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)'
}
country_code = "86"
encoding_type = "gbk"
logging.basicConfig(level=logging.INFO)
do_cycle_parsing = True
do_cycle_sleep = False
cycle_idx = 0
def cycle_sleep():
if do_cycle_sleep:
# time.sleep(random.randint(0, 5))
time.sleep(random.randint(0, 2))
def get_province(country_url): # 省 11
if country_url.endswith("index.html"):
country_url = country_url[: len(country_url) - len("index.html")]
else:
if country_url.endswith("/"):
pass
else:
country_url += "/"
response = requests.get(country_url, headers=headers)
response.encoding = encoding_type
text = response.text
# print(text)
html = etree.HTML(text)
trs = html.xpath('//tr[@class="provincetr"]/td')
province_cycle = 1
for tr in trs:
sys_area = {"country_url": country_url}
try:
province_name = tr.xpath('./a/text()')[0]
province_page = tr.xpath('./a/@href')[0]
province_code = province_page[0: len(province_page) - len(".html")]
province_url = statsBaseUrl + province_page
sys_area["province_name"] = province_name
sys_area["province_code"] = province_code
sys_area["province_page"] = province_page
sys_area["province_url"] = province_url
sys_area["province_cycle"] = province_cycle
# print(sys_area)
if do_cycle_parsing:
get_city(sys_area)
time.sleep(10)
except IndexError:
# 32 位空,预留台湾省
# print("Unknown exception: {}".format(sys.exc_info()[0]))
pass
finally:
province_cycle += 1
def get_city(sys_area): # 市,1101
print("city:", sys_area)
province_url = sys_area["province_url"]
cycle_sleep()
response = requests.get(province_url, headers=headers)
response.encoding = encoding_type
text = response.text
# print(text)
html = etree.HTML(text)
trs = html.xpath('//tr[@class="citytr"]')
city_cycle = 1
for tr in trs:
try:
# city_code = city_page[len(province_code) + 1: len(city_page) - len(".html")]
city_code = tr.xpath('./td[1]/a/text()')[0]
city_name = tr.xpath('./td[2]/a/text()')[0]
city_page = tr.xpath('./td[1]/a/@href')[0] # 42/4201.html
city_url = statsBaseUrl
if city_url.endswith("/"):
city_url += city_page
else:
city_url += ("/" + city_page)
sys_area["city_name"] = city_name
sys_area["city_code"] = city_code
sys_area["city_page"] = city_page
sys_area["city_url"] = city_url
sys_area["city_cycle"] = city_cycle
# print(sys_area)
if do_cycle_parsing:
get_district(sys_area)
finally:
city_cycle += 1
def get_district(sys_area): # 区/县 110101
city_url = sys_area["city_url"]
province_code = sys_area["province_code"]
cycle_sleep()
response = requests.get(city_url, headers=headers)
response.encoding = encoding_type
text = response.text
# print(text)
html = etree.HTML(text)
trs = html.xpath('//tr[@class="countytr"]')
district_cycle = 1
for tr in trs:
try:
district_name = tr.xpath('./td[2]/a/text()')[0]
district_code = tr.xpath('./td[1]/a/text()')[0]
district_page = tr.xpath('./td[1]/a/@href')[0] # 11/1101.html
district_url = statsBaseUrl + province_code + "/" + district_page
sys_area["district_name"] = district_name
sys_area["district_code"] = district_code
sys_area["district_page"] = district_page
sys_area["district_url"] = district_url
sys_area["district_cycle"] = district_cycle
# print(sys_area)
if do_cycle_parsing:
get_town(sys_area)
except IndexError:
# 市下面可能没有区
district_name = tr.xpath('./td[2]/text()')[0]
district_code = tr.xpath('./td[1]/text()')[0]
sys_area["district_name"] = district_name
sys_area["district_code"] = district_code
sys_area["district_page"] = ""
sys_area["district_url"] = ""
sys_area["district_cycle"] = district_cycle
add_to_db(sys_area)
finally:
district_cycle += 1
def get_town(sys_area): # 街道/镇
country_url = sys_area["country_url"]
province_code = sys_area["province_code"]
city_code = sys_area["city_code"]
district_url = sys_area["district_url"]
cycle_sleep()
response = requests.get(district_url, headers=headers)
response.encoding = encoding_type
text = response.text
# print(text)
html = etree.HTML(text)
trs = html.xpath('//tr[@class="towntr"]')
town_cycle = 1
for tr in trs:
try:
town_code = tr.xpath('./td[1]/a/text()')[0]
town_name = tr.xpath('./td[2]/a/text()')[0]
town_page = tr.xpath('./td[1]/a/@href')[0]
# http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/42/08/22/420822100.html
town_url = country_url + province_code + "/" + city_code[2:4] + "/" + town_page
sys_area["town_name"] = town_name
sys_area["town_code"] = town_code
sys_area["town_page"] = town_page
sys_area["town_url"] = town_url
sys_area["town_url"] = town_url
sys_area["town_cycle"] = town_cycle
# print(sys_area)
if do_cycle_parsing:
get_village(sys_area)
finally:
town_cycle += 1
def get_village(sys_area):
town_url = sys_area["town_url"]
cycle_sleep()
response = requests.get(town_url, headers=headers)
response.encoding = encoding_type
text = response.text
# print(text)
html = etree.HTML(text)
trs = html.xpath('//tr[@class="villagetr"]')
village_cycle = 1
for tr in trs:
try:
# 统计用区划代码 / 城乡分类代码 / 名称
village_name = tr.xpath('./td[3]/text()')[0]
village_code = tr.xpath('./td[1]/text()')[0]
sys_area["village_name"] = village_name
sys_area["village_code"] = village_code
sys_area["village_cycle"] = village_cycle
add_to_db(sys_area)
finally:
village_cycle += 1
global cycle_idx
cycle_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
print("%s[%d] %s" % (cycle_time, cycle_idx, sys_area))
cycle_idx += 1
def add_to_db(sys_area):
db = pymysql.connect(host="localhost", user="test", password="test", database="test")
cursor = db.cursor()
# province_name province_code
# city_name city_code
# district_name district_code
# town_name town_code
# village_name village_code
crt_date = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
check_sql = "SELECT COUNT(1) AS TOTAL FROM SYS_AREA WHERE AREA_ID = '%s'"
ins_sql = "INSERT INTO SYS_AREA(AREA_ID,AREA_NAME,AREA_CODE,AREA_LVL, LANG_KEY, LOCALE_NAME,TIME_ZONE,AREA_STATE," \
"SORT_ODR,PAR_AREA_ID,CRT_UID,CRT_DATE,DEL_STATE) VALUES \
('%s','%s','%s','%s','%s','%s','%s','%s',%d,'%s', '%s', '%s', '%s')"
if "province_code" in sys_area:
cursor.execute(check_sql % (country_code + sys_area["province_code"]))
cnt = cursor.fetchone()
if cnt[0] < 1:
province_sys_area = (
country_code + sys_area["province_code"], # AREA_ID
sys_area["province_name"], # AREA_NAME
country_code + sys_area["province_code"], # AREA_CODE
"2", # AREA_LVL
"sys.country.cn." + country_code + sys_area["province_code"], # LANG_KEY
sys_area["province_name"], "8", "1", # LOCALE_NAME,TIME_ZONE,AREA_STATE
int(sys_area["province_cycle"]), # SORT_ODR
country_code, # PAR_AREA_ID
"1000", crt_date, "0" # CRT_UID,CRT_DATE,DEL_STATE
)
print(province_sys_area)
cursor.execute(ins_sql % province_sys_area)
if "city_code" in sys_area:
cursor.execute(check_sql % (country_code + sys_area["city_code"][0:4]))
cnt = cursor.fetchone()
if cnt[0] < 1:
city_sys_area = (
country_code + sys_area["city_code"][0:4], # AREA_ID 城市用4位 86 + 2位省份 + 2位城市
sys_area["city_name"], # AREA_NAME
country_code + sys_area["city_code"][0:4], # AREA_CODE
"3", # AREA_LVL
"sys.country.cn." + country_code + sys_area["city_code"][0:4], # LANG_KEY
sys_area["city_name"], "8", "1", # LOCALE_NAME,TIME_ZONE,AREA_STATE
int(sys_area["city_cycle"]), # SORT_ODR
country_code + sys_area["province_code"], # PAR_AREA_ID
"1000", crt_date, "0" # CRT_UID,CRT_DATE,DEL_STATE
)
cursor.execute(ins_sql % city_sys_area)
if "district_code" in sys_area:
cursor.execute(check_sql % (country_code + sys_area["district_code"][0:6]))
cnt = cursor.fetchone()
if cnt[0] < 1:
district_sys_area = (
country_code + sys_area["district_code"][0:6], # AREA_ID
sys_area["district_name"], # AREA_NAME
country_code + sys_area["district_code"][0:6], # AREA_CODE
"4", # AREA_LVL
"sys.country.cn." + country_code + sys_area["district_code"][0:6], # LANG_KEY
sys_area["district_name"], "8", "1", # LOCALE_NAME,TIME_ZONE,AREA_STATE
int(sys_area["district_cycle"]), # SORT_ODR
country_code + sys_area["city_code"][0:4], # PAR_AREA_ID
"1000", crt_date, "0" # CRT_UID,CRT_DATE,DEL_STATE
)
cursor.execute(ins_sql % district_sys_area)
if "town_code" in sys_area:
cursor.execute(check_sql % (country_code + sys_area["town_code"][0:9]))
cnt = cursor.fetchone()
if cnt[0] < 1:
town_sys_area = (
country_code + sys_area["town_code"][0:9], # AREA_ID
sys_area["town_name"], # AREA_NAME
country_code + sys_area["town_code"][0:9], # AREA_CODE
"5", # AREA_LVL
"sys.country.cn." + country_code + sys_area["town_code"][0:9], # LANG_KEY
sys_area["town_name"], "8", "1", # LOCALE_NAME,TIME_ZONE,AREA_STATE
int(sys_area["town_cycle"]), # SORT_ODR
country_code + sys_area["district_code"][0:6], # PAR_AREA_ID
"1000", crt_date, "0" # CRT_UID,CRT_DATE,DEL_STATE
)
cursor.execute(ins_sql % town_sys_area)
if "village_code" in sys_area:
cursor.execute(check_sql % (country_code + sys_area["village_code"]))
cnt = cursor.fetchone()
if cnt[0] < 1:
village_sys_area = (
country_code + sys_area["village_code"], # AREA_ID
sys_area["village_name"], # AREA_NAME
country_code + sys_area["village_code"], # AREA_CODE
"6", # AREA_LVL
"sys.country.cn." + country_code + sys_area["village_code"], # LANG_KEY
sys_area["village_name"], "8", "1", # LOCALE_NAME,TIME_ZONE,AREA_STATE
int(sys_area["village_cycle"]), # SORT_ODR
country_code + sys_area["town_code"][0:9], # PAR_AREA_ID
"1000", crt_date, "0" # CRT_UID,CRT_DATE,DEL_STATE
)
cursor.execute(ins_sql % village_sys_area)
db.commit()
cursor.close()
db.close()
#
#
# # check area id exists
# area_ids = [sys_area["province_code"], sys_area["city_code"], sys_area["district_code"], sys_area["town_code"],
# sys_area["village_code"]]
# area_in_ids = ','.join(("'%s'" % item for item in area_ids))
# check_sql = "SELECT AREA_ID FROM SYS_AREA WHERE AREA_ID In (%s)" % ','.join(['%s']) * len(area_ids)
# cursor.execute(check_sql, area_in_ids)
# area_id_results = cursor.fetchall()
# presence_values = [province_sys_area, city_sys_area, district_sys_area, town_sys_area, village_sys_area]
# dst_values = []
# presence_flags = [False, False, False, False, False]
# for row in area_id_results:
# area_id = row[0]
# has_area_id = False
# if area_id == country_code + sys_area["province_code"]:
# presence_flags[0] = True
# continue
# elif area_id == country_code + sys_area["city_code"]:
# presence_flags[1] = True
# continue
# elif area_id == country_code + sys_area["district_code"]:
# presence_flags[2] = True
# continue
# elif area_id == country_code + sys_area["town_code"]:
# presence_flags[3] = True
# continue
# elif area_id == country_code + sys_area["village_code"]:
# presence_flags[4] = True
# continue
# presence_idx = 0
# for flag in presence_flags:
# if not flag:
# dst_values.append(presence_values[presence_idx])
# presence_idx += 1
#
# # ins
# batch_ins_sql = "INSERT INTO SYS_AREA(AREA_ID,AREA_NAME,AREA_CODE,AREA_LVL, LANG_KEY, LOCALE_NAME,TIME_ZONE,AREA_STATE," \
# "SORT_ODR,PAR_AREA_ID,CRT_UID,CRT_DATE,DEL_STATE) VALUES \
# ('%s','%s','%s','%s','%s','%s','%s','%s',%d,'%s', '%s', '%s', '%s')"
# cursor.executemany(batch_ins_sql, dst_values)
if __name__ == '__main__':
# db = pymysql.connect(host="localhost", user="test", password="test", database="test")
# cursor = db.cursor()
# crt_date = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# trunc_sql = "TRUNCATE TABLE SYS_AREA"
# ins_sql = "INSERT INTO SYS_AREA(AREA_ID,AREA_NAME,AREA_CODE,AREA_LVL, LANG_KEY, LOCALE_NAME,TIME_ZONE,AREA_STATE," \
# "SORT_ODR,PAR_AREA_ID,CRT_UID,CRT_DATE,DEL_STATE) VALUES \
# ('%s','%s','%s','%s','%s','%s','%s','%s',%d,'%s','%s','%s','%s')" % \
# ("86", "中国", "86", "1", "sys.country.cn", "中国", "8", "1", 1, "0", "1000", crt_date, "0")
#
# try:
# cursor.execute(trunc_sql)
# cursor.execute(ins_sql)
# db.commit()
# except Exception as e:
# print(e)
# sys.exit(1)
# # print("Unknown exception: {}".format(sys.exc_info()[1]))
# finally:
# cursor.close()
# db.close()
get_province(statsBaseUrl)
### 特别说明
本次试验主要研究Python脚本爬虫能力,若有不合规行为,请联系本人删除,感谢!