直接爬取各省市区的行政代码编号

今天的工作内容,需要获取某省级所有的行政区域编码,由于数据量太多,又懒得逐条整理,索性花费一点时间,写了一个爬虫。

由于又懒得定位某省的,索性全国的编码都获取下来,至于剩下的查看某省份的信息,那就交给其他同事好了。

再懒,也得说一下代码的结构,这是乌龟的屁股(规定),O(∩_∩)O~

一、爬取的内容以csv文件存储。

二、爬取的层数:

  1层:爬取省份的信息

  2层:爬取市的信息

  3层:爬取区的信息

 

下面直接上代码:

# coding=utf-8
#auth = 'carl_DJ'

import requests
from lxml  import etree
import csv,time
import  pandas as pd
from queue import Queue
from threading import Thread

#获取网页数据

def getUrl(url,num_retries = 5):
    """Download ``url`` and return the response body decoded as GBK.

    Args:
        url: Address of the page to fetch.
        num_retries: Number of additional attempts made after a failed
            request before giving up.

    Returns:
        The decoded page text, or ``None`` when the first attempt and
        every retry failed.
    """
    headers = {
        'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36"}
    while True:
        try:
            # A timeout keeps a stalled connection from blocking forever.
            response = requests.get(url, headers=headers, timeout=30)
        except requests.RequestException as e:
            # The original caught ``Ellipsis``, which is not an exception
            # class, so this retry path could never run.
            if num_retries <= 0:
                print("retry fail!")
                print("errors:%s %s" % (e, url))
                return None
            time.sleep(5)
            print(url)
            print("request fail,retry!")
            num_retries -= 1
        else:
            # The response is decoded as GBK rather than the guessed encoding.
            response.encoding = 'GBK'
            return response.text


#获取省级代码函数
def getProvice(url):
    """Collect the name and page link of every province on the index page.

    Args:
        url: Address of the index page that lists the provinces.

    Returns:
        A list of ``{'name': ..., 'link': ...}`` dicts, one per anchor
        found in the ``provincetr`` table rows. The list is empty when
        ``getUrl`` returns no data for ``url``.
    """
    from urllib.parse import urljoin

    data = getUrl(url)
    if not data:
        # Nothing was downloaded, so there is nothing to parse.
        return []

    selector = etree.HTML(data)
    provice = []
    for row in selector.xpath('//tr[@class="provincetr"]'):
        names = row.xpath('td/a/text()')
        links = row.xpath('td/a/@href')
        for name, link in zip(names, links):
            # Resolve the relative href against the page it came from
            # instead of slicing a fixed number of characters off ``url``.
            provice.append({'name': name, 'link': urljoin(url, link)})
    return provice

#获取市级代码函数
def getCity(url_list):
    """Collect code, name and page link of every city on the given pages.

    Args:
        url_list: Iterable of province page URLs.

    Returns:
        A list of ``{'name': ..., 'code': ..., 'link': ...}`` dicts built
        from the ``citytr`` table rows of all pages, in input order.
        Pages for which ``getUrl`` returns no data are skipped.
    """
    from urllib.parse import urljoin

    city_all = []
    for url in url_list:
        data = getUrl(url)
        if not data:
            # Report the gap instead of crashing in the HTML parser.
            print("skip page without data: %s" % url)
            continue

        selector = etree.HTML(data)
        for row in selector.xpath('//tr[@class="citytr"]'):
            codes = row.xpath('td[1]/a/text()')
            links = row.xpath('td[1]/a/@href')
            names = row.xpath('td[2]/a/text()')
            for code, link, name in zip(codes, links, names):
                # Resolve the relative href against the province page URL
                # rather than relying on a fixed-length slice of ``url``.
                city_all.append(
                    {'name': name, 'code': code, 'link': urljoin(url, link)})
    return city_all


#获取区级代码函数 --- 多线程
def getCounty(url_list):
    """Collect code, name and page link of every county, using threads.

    Args:
        url_list: Iterable of city page URLs.

    Returns:
        A list of ``{'code': ..., 'link': ..., 'name': ...}`` dicts built
        from the ``countytr`` table rows of all pages. Worker threads
        append to the list as they finish, so the order is not tied to
        the order of ``url_list``. Pages for which ``getUrl`` returns no
        data are skipped.
    """
    from queue import Empty
    from urllib.parse import urljoin

    queue_county = Queue()  # pending page URLs shared by the workers
    thread_num = 10         # number of worker threads
    county = []             # result records, filled by all workers

    for url in url_list:
        queue_county.put(url)

    def getData():
        while True:
            try:
                # A non-blocking get avoids the empty()/get() race, where a
                # worker could block forever after another one took the
                # last item, leaving join() hanging.
                url = queue_county.get_nowait()
            except Empty:
                return

            data = getUrl(url=url)
            if not data:
                # Report the gap instead of crashing in the HTML parser.
                print("skip page without data: %s" % url)
                continue

            selector = etree.HTML(data)
            for row in selector.xpath('//tr[@class="countytr"]'):
                codes = row.xpath('td[1]/a/text()')
                links = row.xpath('td[1]/a/@href')
                names = row.xpath('td[2]/a/text()')
                for code, link, name in zip(codes, links, names):
                    # Resolve the relative href against the city page URL
                    # rather than relying on a fixed-length slice.
                    county.append(
                        {'code': code, 'link': urljoin(url, link), 'name': name})

    workers = [Thread(target=getData) for _ in range(thread_num)]
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()

    return county


  #省级信息获取
# url = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/index.html"
# pro = getProvice(url)
# Province level: scrape the index page, print a frame summary, save as CSV.
pro = getProvice("http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/index.html")
df_province = pd.DataFrame(data=pro)
df_province.info()
# Comma separator and a header row are the to_csv defaults.
df_province.to_csv('province.csv', index=False)

# City level: follow every province link.
city = getCity(df_province['link'])
df_city = pd.DataFrame(data=city)
df_city.info()
df_city.to_csv('city.csv', index=False)

# County level: follow every city link.
county = getCounty(df_city['link'])
df_county = pd.DataFrame(data=county)
df_county.info()
df_county.to_csv('county.csv', index=False)


  • 3
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论
从国家统计局抓取的地图省市区代码和城划分代码(最新2020/06/03),共596071条数据。来源于国家统计局http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/。 数据结构: CREATE TABLE `area` ( `areaid` varchar(255) COLLATE utf8_unicode_ci NOT NULL, `area_name` varchar(255) COLLATE utf8_unicode_ci DEFAULT NULL, `fatherid` varchar(255) COLLATE utf8_unicode_ci DEFAULT NULL, `area_type` int(255) DEFAULT NULL COMMENT '区域代码:\r\n100 :城镇,110:城区,111 :主城区,112 :城乡结合区,120 :镇区,121 :镇中心区,122:镇乡结合区,123:特殊区域200 :乡村,210:乡中心区,220:村庄\r\n\r\n', `is_delete` int(255) DEFAULT '0', PRIMARY KEY (`areaid`) ) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci; 部分数据: INSERT INTO `area` VALUES ('110000000000','北京市',NULL,NULL,0); INSERT INTO `area` VALUES ('110100000000','市辖区','110000000000',NULL,0); INSERT INTO `area` VALUES ('110101000000','东城区','110100000000',NULL,0); INSERT INTO `area` VALUES ('110101001000','东华门街道','110101000000',NULL,0); INSERT INTO `area` VALUES ('110101001001','多福巷社区居委会','110101001000',111,0); INSERT INTO `area` VALUES ('110101001002','银闸社区居委会','110101001000',111,0); INSERT INTO `area` VALUES ('110101001005','东厂社区居委会','110101001000',111,0); INSERT INTO `area` VALUES ('110101001006','智德社区居委会','110101001000',111,0); INSERT INTO `area` VALUES ('110101001007','南池子社区居委会','110101001000',111,0); INSERT INTO `area` VALUES ('110101001008','黄图岗社区居委会','110101001000',111,0); INSERT INTO `area` VALUES ('110101001009','灯市口社区居委会','110101001000',111,0); INSERT INTO `area` VALUES ('110101001010','正义路社区居委会','110101001000',111,0); INSERT INTO `area` VALUES ('110101001011','甘雨社区居委会','110101001000',111,0); INSERT INTO `area` VALUES ('110101001013','台基厂社区居委会','110101001000',111,0); INSERT INTO `area` VALUES ('110101001014','韶九社区居委会','110101001000',111,0); INSERT INTO `area` VALUES ('110101001015','王府井社区居委会','110101001000',111,0); INSERT INTO `area` VALUES ('110101002000','景山街道','110101000000',NULL,0); INSERT INTO `area` VALUES ('110101002001','隆福寺社区居委会','110101002000',111,0); INSERT INTO `area` VALUES ('110101002002','吉祥社区居
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

Carl_奕然

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值