2023年行政区

jyvan

已于 2024-08-17 16:58:10 修改

阅读量75

点赞数 3

分类专栏： Python mysql 文章标签： python 开发语言

于 2024-08-17 16:17:27 首次发布

本文链接：https://blog.csdn.net/qq_25296245/article/details/141282439

版权

Python 同时被 2 个专栏收录

4 篇文章 0 订阅

订阅专栏

mysql

3 篇文章 0 订阅

订阅专栏

获取最新的行政区划数据

使用 Python 获取

# !/usr/bin/env python3
# -*- coding: utf-8 -*-
from mysql_handler import MysqlHandler
from mysql_handler_sub import MysqlHandlerSub
from html_downloader import HtmlDownloader
from html_parser import HtmlParser
import traceback
import json 
class CodeSpider(object):
    """Crawl the 2023 statistical division pages and store every entry.

    The crawl descends five levels (province, city, county, town, village).
    For each level one page is downloaded through ``self.html_downloader``,
    parsed by the matching ``self.html_parser`` method into
    ``(name, url, code)`` triples, and each triple is written with
    ``self.mysql_handler.insert(level, name, code, parent_code, parent_id)``.
    The value returned by ``insert`` is passed on as ``parent_id`` of the
    next level.
    """

    # Statistical division codes are 12 digits long; the province level only
    # yields the leading digits, so it is right-padded with zeros.
    AREA_CODE_LENGTH = 12

    def __init__(self):
        # Collaborators: storage, page download and page parsing.
        self.mysql_handler = MysqlHandler()
        # NOTE(review): MysqlHandlerSub is imported too and was previously
        # swapped in here; presumably an alternative backend -- confirm.
        # self.mysql_handler = MysqlHandlerSub()
        self.html_downloader = HtmlDownloader()
        self.html_parser = HtmlParser()
        # Entry page of the crawl.
        self.root_url = 'http://www.stats.gov.cn/sj/tjbz/tjyqhdmhcxhfdm/2023/index.html'
        # Prefix used to build the URL of every lower-level page.
        self.split_url = 'http://www.stats.gov.cn/sj/tjbz/tjyqhdmhcxhfdm/2023/'
        self.province_array = [11,12,13]

    def craw(self):
        """Run the whole crawl, starting from the province index."""
        self.get_Province_list()

    def get_Province_list(self):
        """Download the root page, store each province and crawl its cities."""
        html_content = self.html_downloader.download(self.root_url)
        # Second argument: base URL handed to the parser for URL joining.
        self.province_url_list = self.html_parser.province_parser(html_content, self.split_url)
        for province_name, province_url, province_code in self.province_url_list:
            print(province_name, province_url, province_code)
            # Pad to the full code length so the province row (and the
            # parent_code stored on its cities) matches the 12-digit codes
            # used by every other level.
            area_code = str(province_code).ljust(self.AREA_CODE_LENGTH, '0')
            province_id = self.mysql_handler.insert(1, province_name, area_code, 0, 0)
            self.get_City_list(province_code, area_code, province_id)

    def get_City_list(self, code, parent_code, parent_id):
        """Store the cities of one province and crawl their counties.

        Args:
            code: Province code used as the page name (``<code>.html``).
            parent_code: Code stored as ``parent_code`` on each city row.
            parent_id: Row id of the province, as returned by ``insert``.
        """
        print('获取-市')
        print(code, parent_code, parent_id)
        # The parser may hand back a non-string code; URL building needs str.
        code = str(code)
        downloading_url = self.split_url + code + '.html'
        html_content = self.html_downloader.download(downloading_url)
        self.city_url_list = self.html_parser.city_parser(html_content, self.split_url)
        for city_name, city_url, city_code in self.city_url_list:
            city_id = self.mysql_handler.insert(2, city_name, city_code, parent_code, parent_id)
            if city_url is None:
                # Entry without a link: nothing below it to crawl.
                continue
            self.get_county_list(city_url, code, city_code, city_id)

    def get_county_list(self, _url, code, parent_code, parent_id):
        """Store the counties of one city and crawl their towns.

        Args:
            _url: City page URL relative to ``self.split_url``.
            code: Province code; first path segment of lower-level pages.
            parent_code: Code stored as ``parent_code`` on each county row.
            parent_id: Row id of the city, as returned by ``insert``.
        """
        print('获取 - 县级市')
        print(_url, code, parent_code, parent_id)
        code = str(code)
        downloading_url = self.split_url + _url
        html_content = self.html_downloader.download(downloading_url)
        self.county_url_list = self.html_parser.county_parser(html_content, self.split_url + code + "/")
        for county_name, county_url, county_code in self.county_url_list:
            county_id = self.mysql_handler.insert(3, county_name, county_code, parent_code, parent_id)
            if county_url is None:
                continue
            # The first path segment of the county URL is needed again when
            # building village page URLs.
            code1 = county_url.split('/')
            self.get_town_list(county_url, code, code1[0], county_code, county_id)

    def get_town_list(self, _url, code, code1, parent_code, parent_id):
        """Store the towns of one county and crawl their villages.

        Args:
            _url: County page URL relative to ``split_url + code + '/'``.
            code: Province code; first path segment of the page URL.
            code1: First path segment of the county URL, forwarded to the
                village level.
            parent_code: Code stored as ``parent_code`` on each town row.
            parent_id: Row id of the county, as returned by ``insert``.
        """
        print('获取乡镇/街道')
        print(_url, code, code1, parent_id)
        code = str(code)
        province_base = self.split_url + code + '/'
        downloading_url = province_base + _url
        html_content = self.html_downloader.download(downloading_url)
        self.town_url_list = self.html_parser.town_parser(html_content, province_base)

        for town_name, town_url, town_code in self.town_url_list:
            town_id = self.mysql_handler.insert(4, town_name, town_code, parent_code, parent_id)
            if town_url is None:
                continue
            self.get_village_list(town_url, code, code1, town_code, town_id)

    def get_village_list(self, _url, code, code1, parent_code, parent_id):
        """Store the villages of one town; this is the last crawl level.

        Args:
            _url: Town page URL relative to ``split_url + code/code1/``.
            code: Province code; first path segment of the page URL.
            code1: Second path segment of the page URL.
            parent_code: Code stored as ``parent_code`` on each village row.
            parent_id: Row id of the town, as returned by ``insert``.
        """
        print('获取村/居委会')
        print(_url, code, code1, parent_id)
        code = str(code)
        downloading_url = self.split_url + code + '/' + str(code1) + '/' + _url
        html_content = self.html_downloader.download(downloading_url)
        # NOTE(review): split_url already ends with '/', so this base holds a
        # double slash. Kept as-is because village_parser's use of it is not
        # visible here -- confirm before changing.
        self.village_url_list = self.html_parser.village_parser(html_content, self.split_url+"/")
        for village_name, village_url, village_code in self.village_url_list:
            # The village link is not followed; only name and code are stored.
            self.mysql_handler.insert(5, village_name, village_code, parent_code, parent_id)
if __name__ == '__main__':
    # Script entry point: build the spider and run the full crawl.
    CodeSpider().craw()