国内行政区划-六级目录-Python爬虫 研究

### 源代码参考

源代码:  pedll/pcloud-cdn - Gitee.com

源文章:Wiki - Gitee.com


### 数据库

数据库: localhost:3306 账号: test 密码: test (本机单独测试库)
表格:见附件 sys_area.sql

SET NAMES utf8mb4;
SET FOREIGN_KEY_CHECKS = 0;

-- ----------------------------
-- Table structure for sys_area
-- ----------------------------
DROP TABLE IF EXISTS `sys_area`;
CREATE TABLE `sys_area` (
  `AREA_ID` varchar(50) COLLATE utf8mb4_bin NOT NULL COMMENT '主键ID',
  `AREA_NAME` varchar(50) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '区域名称',
  `AREA_CODE` varchar(50) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '区域编码',
  `AREA_LVL` varchar(5) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '区域级别',
  `LANG_KEY` varchar(50) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '区域国际化语言KEY',
  `LOCALE_NAME` varchar(50) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '本地语言',
  `TIME_ZONE` tinyint(4) DEFAULT NULL COMMENT '市区',
  `AREA_STATE` varchar(5) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '国家码状态',
  `SORT_ODR` smallint(6) DEFAULT NULL COMMENT '显示序列排名',
  `PAR_AREA_ID` varchar(50) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '上级ID',
  `CRT_UID` varchar(45) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '创建用ID, 默认 999, 系统创建',
  `CRT_DATE` datetime DEFAULT NULL COMMENT '创建日期',
  `UPD_UID` varchar(45) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '更新用户ID',
  `UPD_DATE` datetime DEFAULT NULL COMMENT '更新日期',
  `DEL_STATE` varchar(5) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '删除状态 ',
  PRIMARY KEY (`AREA_ID`) USING BTREE
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_bin;

SET FOREIGN_KEY_CHECKS = 1;

### 下载规则
从省份 -> 城市 -> 县/区 -> 镇/街道办事处 -> 乡/社区
省份编码: 86 + 省份编码(2位)
城市编码:86 + 省份编码(2位) + 城市编码(2位)
县/区编码: 86 + 省份编码(2位) + 城市编码(2位)+ 区/县编码(2位)
镇/街道办编码: 86 + 省份编码(2位) + 城市编码(2位)+ 区/县编码(2位)+ 镇/街道办编码(3位)
乡/社区编码:86 + 省份编码(2位) + 城市编码(2位)+ 区/县编码(2位)+ 镇/街道办编码(3位) + 乡/社区编码(3位)


### 数据说明
合计耗时 10小时,67W条记录,100MB存储空间,具体见源代码附件zip文件

### 性能说明
此不考虑性能,不考虑架构等,只需要将数据下载并插入到数据库中


### 代码

# !/usr/bin/env python3
# -*- coding: UTF-8 -*-
# @Author : johnli
# @Time: 02/11/2021 21:33
# The estimate time is 10 hours, 669707 records in 100MB

import logging
import random
import time

import pymysql
import requests
from lxml import etree

statsBaseUrl = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/'
headers = {
    # 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36 '
    'User-Agent': 'Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)'
}
country_code = "86"
encoding_type = "gbk"
logging.basicConfig(level=logging.INFO)
do_cycle_parsing = True
do_cycle_sleep = False
cycle_idx = 0


def cycle_sleep():
    if do_cycle_sleep:
        # time.sleep(random.randint(0, 5))
        time.sleep(random.randint(0, 2))


def get_province(country_url):  # 省 11
    if country_url.endswith("index.html"):
        country_url = country_url[: len(country_url) - len("index.html")]
    else:
        if country_url.endswith("/"):
            pass
        else:
            country_url += "/"

    response = requests.get(country_url, headers=headers)
    response.encoding = encoding_type
    text = response.text

    # print(text)
    html = etree.HTML(text)
    trs = html.xpath('//tr[@class="provincetr"]/td')
    province_cycle = 1
    for tr in trs:
        sys_area = {"country_url": country_url}
        try:
            province_name = tr.xpath('./a/text()')[0]
            province_page = tr.xpath('./a/@href')[0]
            province_code = province_page[0: len(province_page) - len(".html")]
            province_url = statsBaseUrl + province_page
            sys_area["province_name"] = province_name
            sys_area["province_code"] = province_code
            sys_area["province_page"] = province_page
            sys_area["province_url"] = province_url
            sys_area["province_cycle"] = province_cycle
            # print(sys_area)
            if do_cycle_parsing:
                get_city(sys_area)
            time.sleep(10)
        except IndexError:
            # 32 位空,预留台湾省
            # print("Unknown exception: {}".format(sys.exc_info()[0]))
            pass
        finally:
            province_cycle += 1


def get_city(sys_area):  # 市,1101
    print("city:", sys_area)
    province_url = sys_area["province_url"]
    cycle_sleep()
    response = requests.get(province_url, headers=headers)
    response.encoding = encoding_type
    text = response.text
    # print(text)
    html = etree.HTML(text)
    trs = html.xpath('//tr[@class="citytr"]')
    city_cycle = 1
    for tr in trs:
        try:
            # city_code = city_page[len(province_code) + 1: len(city_page) - len(".html")]
            city_code = tr.xpath('./td[1]/a/text()')[0]
            city_name = tr.xpath('./td[2]/a/text()')[0]
            city_page = tr.xpath('./td[1]/a/@href')[0]  # 42/4201.html
            city_url = statsBaseUrl
            if city_url.endswith("/"):
                city_url += city_page
            else:
                city_url += ("/" + city_page)
            sys_area["city_name"] = city_name
            sys_area["city_code"] = city_code
            sys_area["city_page"] = city_page
            sys_area["city_url"] = city_url
            sys_area["city_cycle"] = city_cycle
            # print(sys_area)
            if do_cycle_parsing:
                get_district(sys_area)
        finally:
            city_cycle += 1


def get_district(sys_area):  # 区/县 110101
    city_url = sys_area["city_url"]
    province_code = sys_area["province_code"]
    cycle_sleep()
    response = requests.get(city_url, headers=headers)
    response.encoding = encoding_type
    text = response.text
    # print(text)
    html = etree.HTML(text)
    trs = html.xpath('//tr[@class="countytr"]')
    district_cycle = 1
    for tr in trs:
        try:
            district_name = tr.xpath('./td[2]/a/text()')[0]
            district_code = tr.xpath('./td[1]/a/text()')[0]
            district_page = tr.xpath('./td[1]/a/@href')[0]  # 11/1101.html
            district_url = statsBaseUrl + province_code + "/" + district_page
            sys_area["district_name"] = district_name
            sys_area["district_code"] = district_code
            sys_area["district_page"] = district_page
            sys_area["district_url"] = district_url
            sys_area["district_cycle"] = district_cycle
            # print(sys_area)
            if do_cycle_parsing:
                get_town(sys_area)
        except IndexError:
            # 市下面可能没有区
            district_name = tr.xpath('./td[2]/text()')[0]
            district_code = tr.xpath('./td[1]/text()')[0]
            sys_area["district_name"] = district_name
            sys_area["district_code"] = district_code
            sys_area["district_page"] = ""
            sys_area["district_url"] = ""
            sys_area["district_cycle"] = district_cycle

            add_to_db(sys_area)
        finally:
            district_cycle += 1


def get_town(sys_area):  # 街道/镇
    country_url = sys_area["country_url"]
    province_code = sys_area["province_code"]
    city_code = sys_area["city_code"]
    district_url = sys_area["district_url"]
    cycle_sleep()
    response = requests.get(district_url, headers=headers)
    response.encoding = encoding_type
    text = response.text
    # print(text)
    html = etree.HTML(text)
    trs = html.xpath('//tr[@class="towntr"]')
    town_cycle = 1
    for tr in trs:
        try:
            town_code = tr.xpath('./td[1]/a/text()')[0]
            town_name = tr.xpath('./td[2]/a/text()')[0]
            town_page = tr.xpath('./td[1]/a/@href')[0]
            # http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/42/08/22/420822100.html
            town_url = country_url + province_code + "/" + city_code[2:4] + "/" + town_page

            sys_area["town_name"] = town_name
            sys_area["town_code"] = town_code
            sys_area["town_page"] = town_page
            sys_area["town_url"] = town_url
            sys_area["town_url"] = town_url
            sys_area["town_cycle"] = town_cycle
            # print(sys_area)
            if do_cycle_parsing:
                get_village(sys_area)
        finally:
            town_cycle += 1


def get_village(sys_area):
    town_url = sys_area["town_url"]
    cycle_sleep()
    response = requests.get(town_url, headers=headers)
    response.encoding = encoding_type
    text = response.text
    # print(text)
    html = etree.HTML(text)
    trs = html.xpath('//tr[@class="villagetr"]')
    village_cycle = 1
    for tr in trs:
        try:
            # 统计用区划代码 / 城乡分类代码 / 名称
            village_name = tr.xpath('./td[3]/text()')[0]
            village_code = tr.xpath('./td[1]/text()')[0]
            sys_area["village_name"] = village_name
            sys_area["village_code"] = village_code
            sys_area["village_cycle"] = village_cycle
            add_to_db(sys_area)
        finally:
            village_cycle += 1

            global cycle_idx
            cycle_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            print("%s[%d] %s" % (cycle_time, cycle_idx, sys_area))
            cycle_idx += 1


def add_to_db(sys_area):
    db = pymysql.connect(host="localhost", user="test", password="test", database="test")
    cursor = db.cursor()
    # province_name province_code
    # city_name city_code
    # district_name district_code
    # town_name town_code
    # village_name village_code

    crt_date = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    check_sql = "SELECT COUNT(1) AS TOTAL FROM SYS_AREA WHERE AREA_ID = '%s'"
    ins_sql = "INSERT INTO SYS_AREA(AREA_ID,AREA_NAME,AREA_CODE,AREA_LVL, LANG_KEY, LOCALE_NAME,TIME_ZONE,AREA_STATE," \
              "SORT_ODR,PAR_AREA_ID,CRT_UID,CRT_DATE,DEL_STATE) VALUES \
              ('%s','%s','%s','%s','%s','%s','%s','%s',%d,'%s', '%s', '%s', '%s')"

    if "province_code" in sys_area:
        cursor.execute(check_sql % (country_code + sys_area["province_code"]))
        cnt = cursor.fetchone()
        if cnt[0] < 1:
            province_sys_area = (
                country_code + sys_area["province_code"],  # AREA_ID
                sys_area["province_name"],  # AREA_NAME
                country_code + sys_area["province_code"],  # AREA_CODE
                "2",  # AREA_LVL
                "sys.country.cn." + country_code + sys_area["province_code"],  # LANG_KEY
                sys_area["province_name"], "8", "1",  # LOCALE_NAME,TIME_ZONE,AREA_STATE
                int(sys_area["province_cycle"]),  # SORT_ODR
                country_code,  # PAR_AREA_ID
                "1000", crt_date, "0"  # CRT_UID,CRT_DATE,DEL_STATE
            )
            print(province_sys_area)
            cursor.execute(ins_sql % province_sys_area)

    if "city_code" in sys_area:
        cursor.execute(check_sql % (country_code + sys_area["city_code"][0:4]))
        cnt = cursor.fetchone()
        if cnt[0] < 1:
            city_sys_area = (
                country_code + sys_area["city_code"][0:4],  # AREA_ID 城市用4位 86 + 2位省份 + 2位城市
                sys_area["city_name"],  # AREA_NAME
                country_code + sys_area["city_code"][0:4],  # AREA_CODE
                "3",  # AREA_LVL
                "sys.country.cn." + country_code + sys_area["city_code"][0:4],  # LANG_KEY
                sys_area["city_name"], "8", "1",  # LOCALE_NAME,TIME_ZONE,AREA_STATE
                int(sys_area["city_cycle"]),  # SORT_ODR
                country_code + sys_area["province_code"],  # PAR_AREA_ID
                "1000", crt_date, "0"  # CRT_UID,CRT_DATE,DEL_STATE
            )
            cursor.execute(ins_sql % city_sys_area)

    if "district_code" in sys_area:
        cursor.execute(check_sql % (country_code + sys_area["district_code"][0:6]))
        cnt = cursor.fetchone()
        if cnt[0] < 1:
            district_sys_area = (
                country_code + sys_area["district_code"][0:6],  # AREA_ID
                sys_area["district_name"],  # AREA_NAME
                country_code + sys_area["district_code"][0:6],  # AREA_CODE
                "4",  # AREA_LVL
                "sys.country.cn." + country_code + sys_area["district_code"][0:6],  # LANG_KEY
                sys_area["district_name"], "8", "1",  # LOCALE_NAME,TIME_ZONE,AREA_STATE
                int(sys_area["district_cycle"]),  # SORT_ODR
                country_code + sys_area["city_code"][0:4],  # PAR_AREA_ID
                "1000", crt_date, "0"  # CRT_UID,CRT_DATE,DEL_STATE
            )
            cursor.execute(ins_sql % district_sys_area)

    if "town_code" in sys_area:
        cursor.execute(check_sql % (country_code + sys_area["town_code"][0:9]))
        cnt = cursor.fetchone()
        if cnt[0] < 1:
            town_sys_area = (
                country_code + sys_area["town_code"][0:9],  # AREA_ID
                sys_area["town_name"],  # AREA_NAME
                country_code + sys_area["town_code"][0:9],  # AREA_CODE
                "5",  # AREA_LVL
                "sys.country.cn." + country_code + sys_area["town_code"][0:9],  # LANG_KEY
                sys_area["town_name"], "8", "1",  # LOCALE_NAME,TIME_ZONE,AREA_STATE
                int(sys_area["town_cycle"]),  # SORT_ODR
                country_code + sys_area["district_code"][0:6],  # PAR_AREA_ID
                "1000", crt_date, "0"  # CRT_UID,CRT_DATE,DEL_STATE
            )
            cursor.execute(ins_sql % town_sys_area)

    if "village_code" in sys_area:
        cursor.execute(check_sql % (country_code + sys_area["village_code"]))
        cnt = cursor.fetchone()
        if cnt[0] < 1:
            village_sys_area = (
                country_code + sys_area["village_code"],  # AREA_ID
                sys_area["village_name"],  # AREA_NAME
                country_code + sys_area["village_code"],  # AREA_CODE
                "6",  # AREA_LVL
                "sys.country.cn." + country_code + sys_area["village_code"],  # LANG_KEY
                sys_area["village_name"], "8", "1",  # LOCALE_NAME,TIME_ZONE,AREA_STATE
                int(sys_area["village_cycle"]),  # SORT_ODR
                country_code + sys_area["town_code"][0:9],  # PAR_AREA_ID
                "1000", crt_date, "0"  # CRT_UID,CRT_DATE,DEL_STATE
            )
            cursor.execute(ins_sql % village_sys_area)
    db.commit()

    cursor.close()
    db.close()
    #
    #
    # # check area id exists
    # area_ids = [sys_area["province_code"], sys_area["city_code"], sys_area["district_code"], sys_area["town_code"],
    #             sys_area["village_code"]]
    # area_in_ids = ','.join(("'%s'" % item for item in area_ids))
    # check_sql = "SELECT AREA_ID FROM SYS_AREA WHERE AREA_ID In (%s)" % ','.join(['%s']) * len(area_ids)
    # cursor.execute(check_sql, area_in_ids)
    # area_id_results = cursor.fetchall()
    # presence_values = [province_sys_area, city_sys_area, district_sys_area, town_sys_area, village_sys_area]
    # dst_values = []
    # presence_flags = [False, False, False, False, False]
    # for row in area_id_results:
    #     area_id = row[0]
    #     has_area_id = False
    #     if area_id == country_code + sys_area["province_code"]:
    #         presence_flags[0] = True
    #         continue
    #     elif area_id == country_code + sys_area["city_code"]:
    #         presence_flags[1] = True
    #         continue
    #     elif area_id == country_code + sys_area["district_code"]:
    #         presence_flags[2] = True
    #         continue
    #     elif area_id == country_code + sys_area["town_code"]:
    #         presence_flags[3] = True
    #         continue
    #     elif area_id == country_code + sys_area["village_code"]:
    #         presence_flags[4] = True
    #         continue
    # presence_idx = 0
    # for flag in presence_flags:
    #     if not flag:
    #         dst_values.append(presence_values[presence_idx])
    #     presence_idx += 1
    #
    # # ins
    # batch_ins_sql = "INSERT INTO SYS_AREA(AREA_ID,AREA_NAME,AREA_CODE,AREA_LVL, LANG_KEY, LOCALE_NAME,TIME_ZONE,AREA_STATE," \
    #                 "SORT_ODR,PAR_AREA_ID,CRT_UID,CRT_DATE,DEL_STATE) VALUES \
    #                 ('%s','%s','%s','%s','%s','%s','%s','%s',%d,'%s', '%s', '%s', '%s')"
    # cursor.executemany(batch_ins_sql, dst_values)


if __name__ == '__main__':
    # db = pymysql.connect(host="localhost", user="test", password="test", database="test")
    # cursor = db.cursor()

    # crt_date = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())

    # trunc_sql = "TRUNCATE TABLE SYS_AREA"
    # ins_sql = "INSERT INTO SYS_AREA(AREA_ID,AREA_NAME,AREA_CODE,AREA_LVL, LANG_KEY, LOCALE_NAME,TIME_ZONE,AREA_STATE," \
    #           "SORT_ODR,PAR_AREA_ID,CRT_UID,CRT_DATE,DEL_STATE) VALUES \
    #           ('%s','%s','%s','%s','%s','%s','%s','%s',%d,'%s','%s','%s','%s')" % \
    #           ("86", "中国", "86", "1", "sys.country.cn", "中国", "8", "1", 1, "0", "1000", crt_date, "0")
    #
    # try:
    #     cursor.execute(trunc_sql)
    #     cursor.execute(ins_sql)
    #     db.commit()
    # except Exception as e:
    #     print(e)
    #     sys.exit(1)
    #     # print("Unknown exception: {}".format(sys.exc_info()[1]))
    # finally:
    #     cursor.close()
    #     db.close()

    get_province(statsBaseUrl)


### 特别说明
本次试验主要研究Python脚本爬虫能力,若有不合规行为,请联系本人删除,感谢!

  • 1
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值