2023年行政区

获取最新的行政区划数据(数据来源:国家统计局 2023 年统计用区划代码和城乡划分代码)。

使用 Python 抓取:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from mysql_handler import MysqlHandler
from mysql_handler_sub import MysqlHandlerSub
from html_downloader import HtmlDownloader
from html_parser import HtmlParser
import traceback
import json 
class CodeSpider(object):
    """Crawl the 2023 administrative-division pages and store every level.

    The crawl walks five levels in order -- province (1), city (2),
    county (3), town/street (4) and village/committee (5) -- and calls
    ``self.mysql_handler.insert(level, name, code, parent_code, parent_id)``
    once per entry.  The value returned by ``insert`` is forwarded to the
    next level as ``parent_id``.

    Every ``*_parser`` result is iterated as ``(name, url, code)`` triples;
    an entry whose ``url`` is ``None`` is stored but not descended into.
    """

    def __init__(self):
        # Collaborators: storage, page download and page parsing.
        # (The original author left MysqlHandlerSub() as a commented-out
        # alternative for the storage handler.)
        self.mysql_handler = MysqlHandler()
        self.html_downloader = HtmlDownloader()
        self.html_parser = HtmlParser()
        # Entry page of the crawl.
        self.root_url = 'http://www.stats.gov.cn/sj/tjbz/tjyqhdmhcxhfdm/2023/index.html'
        # Prefix used to build the URL of every lower-level page.
        self.split_url = 'http://www.stats.gov.cn/sj/tjbz/tjyqhdmhcxhfdm/2023/'
        # NOTE(review): not read anywhere in this class -- presumably meant
        # as a province filter; confirm before removing.
        self.province_array = [11,12,13]

    def craw(self):
        """Run the full crawl, starting from the province index page."""
        self.get_Province_list()

    def get_Province_list(self):
        """Download the root page, store each province, then crawl its cities."""
        html_content = self.html_downloader.download(self.root_url)
        # Arguments: the HTML to parse, and the base URL handed to the parser.
        self.province_url_list = self.html_parser.province_parser(html_content, self.split_url)
        for province_name, province_url, province_code in self.province_url_list:
            print(province_name, province_url, province_code)
            # NOTE(review): nine zeros are appended here; with a two-digit
            # province code this yields 11 characters -- confirm that this
            # is the intended code length.
            area_code = str(province_code) + '000000000'
            province_id = self.mysql_handler.insert(1, province_name, area_code, 0, 0)
            self.get_City_list(province_code, area_code, province_id)

    # Cities
    def get_City_list(self,code, parent_code, parent_id):
        """Store the cities listed on province page ``<code>.html``.

        Args:
            code: Province code used as the page name; converted with
                ``str()`` so an int code no longer breaks URL building.
            parent_code: Area code of the province, stored with each city.
            parent_id: Value ``insert`` returned for the province.
        """
        # The province level already treats the code as possibly non-string
        # (it wraps it in str()); normalise it before concatenating.
        code = str(code)
        print('获取-市')
        print(code, parent_code, parent_id)
        downloading_url = self.split_url + code + '.html'
        html_content = self.html_downloader.download(downloading_url)
        self.city_url_list = self.html_parser.city_parser(html_content, self.split_url)
        for city_name, city_url, city_code in self.city_url_list:
            city_id = self.mysql_handler.insert(2, city_name, city_code, parent_code, parent_id)
            if city_url is None:
                # No sub-page to follow for this city.
                continue
            self.get_county_list(city_url, code, city_code, city_id)

    # Counties
    def get_county_list(self,_url,code, parent_code, parent_id):
        """Store the counties listed on a city page and crawl their towns.

        Args:
            _url: City page path, appended directly to ``self.split_url``.
            code: Province code; used as the directory of the county pages.
            parent_code: Code of the city, stored with each county.
            parent_id: Value ``insert`` returned for the city.
        """
        code = str(code)
        print('获取 - 县级市')
        print(_url, code, parent_code, parent_id)
        downloading_url = self.split_url + _url
        html_content = self.html_downloader.download(downloading_url)
        self.county_url_list = self.html_parser.county_parser(html_content, self.split_url + code + "/")
        for county_name, county_url, county_code in self.county_url_list:
            county_id = self.mysql_handler.insert(3, county_name, county_code, parent_code, parent_id)
            if county_url is None:
                continue
            # The first path segment of the county link is reused later to
            # build the village page URLs.
            county_dir = county_url.split('/')[0]
            self.get_town_list(county_url, code, county_dir, county_code, county_id)

    def get_town_list(self,_url,code, code1,parent_code, parent_id):
        """Store the towns/streets listed on a county page and crawl villages.

        Args:
            _url: County page path, relative to ``split_url + code + '/'``.
            code: Province code (directory of the county pages).
            code1: First path segment of the county link; forwarded to the
                village level.
            parent_code: Code of the county, stored with each town.
            parent_id: Value ``insert`` returned for the county.
        """
        code = str(code)
        code1 = str(code1)
        print('获取乡镇/街道')
        print(_url, code, code1, parent_id)
        province_base = self.split_url + code + '/'
        downloading_url = province_base + _url
        html_content = self.html_downloader.download(downloading_url)
        self.town_url_list = self.html_parser.town_parser(html_content, province_base)

        for town_name, town_url, town_code in self.town_url_list:
            town_id = self.mysql_handler.insert(4, town_name, town_code, parent_code, parent_id)
            if town_url is None:
                continue
            self.get_village_list(town_url, code, code1, town_code, town_id)

    def get_village_list(self,_url,code, code1,parent_code, parent_id):
        """Store the villages/committees listed on a town page (last level).

        Args:
            _url: Town page path, relative to
                ``split_url + code + '/' + code1 + '/'``.
            code: Province code.
            code1: First path segment of the county link.
            parent_code: Code of the town, stored with each village.
            parent_id: Value ``insert`` returned for the town.
        """
        code = str(code)
        code1 = str(code1)
        print('获取村/居委会')
        print(_url, code, code1, parent_id)
        downloading_url = self.split_url + code + '/' + code1 + '/' + _url
        html_content = self.html_downloader.download(downloading_url)
        # NOTE(review): split_url already ends with '/', so this base has a
        # double slash.  Left unchanged because the parsed links are not
        # followed at this level -- confirm against village_parser.
        self.village_url_list = self.html_parser.village_parser(html_content, self.split_url+"/")
        for village_name, village_url, village_code in self.village_url_list:
            self.mysql_handler.insert(5, village_name, village_code, parent_code, parent_id)
if __name__ == '__main__':
    # Script entry point: build the spider and run the whole crawl.
    spider = CodeSpider()
    spider.craw()
  • 3
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

jyvan

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值