国家统计局 2020年11月06日最新全国省、市、区/县、镇/街道、村/社区五级地区数据

数据源:http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/ 根据国家统计局2020年11月06日最新数据

没开多线程,到乡镇/街道 四级地址大概耗时2分钟左右 ,到5级大概耗时1个多小时 

省/自治区/直辖市:31 条
市:342 条
区/县:2993 条
镇/街道:41613 条
村/社区:633980 条
总计:678959 条

​​​​​​​需要的可以下载数据:https://download.csdn.net/download/z_wen_quan/20603110

import requests
import time
from bs4 import BeautifulSoup

time_start=time.time()
agent = {
    'Accept-Language':'zh-CN,zh;q=0.9',
    'User-Agent': 'Mozilla/5.0 (iPad; CPU OS 11_0 like Mac OS X) AppleWebKit/604.1.34 (KHTML, like Gecko) Version/11.0 Mobile/15A5341f Safari/604.1',
}
def rget_html(url):
    req = requests.get(url,headers=agent)
    req.encoding = 'GBK' #中文解码
    return req.text

choose_ls=[depth*2 if depth<=3 else 3*(depth-1) for depth in range(1,6)]  #根据深度大小取12位代码前**位 [2, 4, 6, 9, 12]

match_table_class=['provincetable','citytable','countytable','towntable','villagetable']
match_tr_class=['provincetr','citytr','countytr','towntr','villagetr']
init_url='http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/34.html'

url_list={init_url:('','')} 
depth=0
max_depth= 5 #可选,1-5分别表示省、市、区/县、乡镇/街道、村/居委会
position_list={}
while depth < max_depth:
    next_url_list={}
    for url in url_list:
        
        change_url ='/'.join(url.split('/')[:-1])+'/'
        code_index=url_list[url][0]+'-' if depth!=0 else url_list[url][0]
        name_index=url_list[url][1]+'-' if depth!=0 else url_list[url][1]

        while True:
            # 获取数据
            try:
                req = requests.get(url,headers=agent)
                req.encoding = 'GBK' #中文解码,不要用req.encoding=req.apparent_encoding,这样识别出来的req.encoding='gb2312',有好多复杂汉字解不出码
                text = req.text
                position = BeautifulSoup(text, 'lxml')
                special_sigh=0
                if match_table_class[depth] in text:
                    match_table = position.find(class_=match_table_class[depth])
                    if depth !=0:
                        tr_list = match_table.find_all('tr',class_=match_tr_class[depth])
                        break
                    else:
                        tr_list = match_table.find_all('td')
                        break
                else:
                    #  东莞、中山、儋州等缺县级单位,放到下个节点存储
                    search = 0
                    for level in range(depth,5):#东莞、中山、儋州缺县级单位,因此需要进行识别并放入下一节点存储
                        if match_table_class[level] in text:
                            special_sigh = 1
                            search = 1
                            next_url_list[url] = (code_index,name_index)
                            print('---------特殊区划转存下级---:%s'%url_list[url][1])
                            break
                    if search:
                        break
                    else:
                        print('服务器繁忙-----:%s'%url_list[url][1])
                        time.sleep(3)
            except requests.RequestException as e:
                print('服务器繁忙-----:%s'%url_list[url][1])
                time.sleep(3)

        total_count = 0
        if special_sigh != 1:
            if depth !=0:
                for tr in tr_list:
                    if tr.find_all('a'):
                        a_list = tr.find_all('a')
                        href = a_list[0].get('href')
                        code = a_list[0].text
                        if depth ==4:
                            name = a_list[2].text
                        else:
                            name = a_list[1].text
                        next_url_list[change_url+href] = (code_index+code[:choose_ls[depth]],name_index+name)
                        total_count+=1
                        # print('在路径-%s-处'%(name_index+name))
                    else :
                        # 有些市辖区数据 直接存储  河北省-唐山市-市辖区
                        td_list = tr.find_all('td')
                        code = td_list[0].text
                        if depth ==4:
                            name = td_list[2].text
                        else:
                            name = td_list[1].text
                        if depth ==2:
                            print('---------市辖区直接存储-%s-'%(name_index+name))
                        position_list[code_index+code[:choose_ls[depth]]] = name_index+name
                        total_count+=1
            else:
                for each in tr_list:
                    if each.find('a'):
                        href = each.find('a').get('href')
                        name = each.find('a').text
                        code = href.split('.html')[0]
                        next_url_list[change_url+href] = (code_index+code[:choose_ls[depth]],name_index+name)
                        total_count+=1
            print('已爬取-%d-个,在路径-%s-处'%(total_count+1,url_list[url][1]))
    
    depth+=1
    url_list = next_url_list

position_list.update(url_list)

def decompose(each):
    if type(position_list[each])==tuple:
        codelist=position_list[each][0].split('-')
        namelist=position_list[each][1].split('-')
    else:
        codelist=each.split('-')
        namelist=position_list[each].split('-')
    if len(codelist)<depth:
        for i in range(len(codelist),depth):
            codelist.append('')
            namelist.append('')
    ziplist=list(zip(codelist,namelist))
    return [i for j in ziplist for i in j]

sort_name=['province','city','county','town','village']
real_column=[(sort_name[each]+'_code',sort_name[each]) for each in range(depth)]
column=[i for each in real_column for i in each]

book=open('./position/position.csv','w',encoding='utf-8')
book.write(','.join(column)+'\n')
for position in position_list:
    flatten=decompose(position)
    book.write(','.join(flatten)+'\n')
book.close()

print('共爬取-%d-个'%(len(position_list)))
print('用时%d分%d秒'%divmod((time.time())-time_start,60))

  • 5
    点赞
  • 13
    收藏
    觉得还不错? 一键收藏
  • 2
    评论
评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值