Barring any fresh data, this should be the last post in the National Bureau of Statistics series.
The approach is essentially the same root-node/leaf-node idea used in the earlier NBS crawlers. Code first; a detailed explanation will follow once I have thought it through~ (the code carries some inline comments).
Crawling all the way down to the village level took about an hour and a half (no proxy IPs or multiprocessing, and the final pass has to visit 40,000+ town-level pages, which is the slow part), yielding 630,000+ village records. Figures floating around online range from 640,000 to 740,000, so not only do they disagree with my count, they don't even agree with each other, and I have no idea why qaq
(Update: this blogger's post https://blog.csdn.net/xuemu2008/article/details/110262257 reports exactly the same number of records as this code produces, so the count here should be correct.)
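Aside: since the bottleneck is fetching the 40,000+ town pages one by one, the requests could in principle be parallelized. Below is a minimal sketch using a thread pool; fetch_page, fetch_level and the worker count are my own illustrative names and are not part of the script further down, which stays single-threaded.

import requests
from concurrent.futures import ThreadPoolExecutor, as_completed

MAX_WORKERS = 8  # assumption: a modest pool, to stay polite to the server

def fetch_page(url, headers):
    # hypothetical helper: fetch and decode one page (retries omitted for brevity)
    req = requests.get(url, headers=headers, timeout=10)
    req.encoding = 'GBK'
    return url, req.text

def fetch_level(urls, headers):
    # fetch all pages of one depth level concurrently and return {url: html}
    results = {}
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as pool:
        futures = [pool.submit(fetch_page, u, headers) for u in urls]
        for fut in as_completed(futures):
            url, text = fut.result()
            results[url] = text
    return results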
import requests
import re
import xlsxwriter
import time
time_start = time.time()
agent = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'}
choose_ls = [depth*2 if depth <= 3 else 3*(depth-1) for depth in range(1, 6)]  # how many leading digits of the 12-digit code to keep at each depth: 2/4/6/9/12
match_level = ['provincetr', 'citytr', 'countytr', 'towntr', 'villagetr']
initurl = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/index.html'
total_dict = {}
depth = 0
each_root = {initurl: ('', '')}
max_depth = 5  # optional; 1-5 stand for province/prefecture/county/town/village, so the crawl collects every leaf within this depth plus the root nodes at this depth
while depth < max_depth:
    total_count = 0
    next_root = {}
    for url in each_root:
        code_join = each_root[url][0] + '-' if depth != 0 else each_root[url][0]
        zone_join = each_root[url][1] + '-' if depth != 0 else each_root[url][1]
        change_root = '/'.join(url.split('/')[:-1]) + '/'
        while True:  # retry until a recognizable table is extracted
            try:
                req = requests.get(url, headers=agent)
                req.encoding = 'GBK'  # decode the Chinese pages; don't use req.encoding=req.apparent_encoding, which guesses 'gb2312' and fails on many less common characters
                text = req.text
                text = text.replace('\n', '\\n')  # '.' in the regexes below doesn't match newlines, so flatten the page to a single line first
                special_sigh = False
                if match_level[depth] in text:
                    match_text = re.findall(r"class='%s'>(.*?)</table" % match_level[depth], text)[0]
                    break
                else:
                    search = False
                    for level in range(depth, 5):  # Dongguan, Zhongshan and Danzhou have no county level, so look for a deeper table and defer this URL to the next depth
                        if match_level[level] in text:
                            match_text = re.findall(r"class='%s'>(.*?)</table" % match_level[level], text)[0]
                            search = True
                            special_sigh = True
                            print('特殊区划:%s' % each_root[url][1])
                            break
                    if search:
                        break
                    else:
                        print('服务器繁忙')
                        time.sleep(2)
            except Exception:
                print('服务器繁忙')
                time.sleep(2)
        if special_sigh:
            next_root[url] = (code_join, zone_join)  # re-queue the same URL one level deeper
        else:
            if depth != 0:
                has_tree = re.findall(r"href='(.*?)'>(\d+?)<.*?html'>(.*?)</a></td></tr>", match_text)
            else:
                base_tree = re.findall(r"href='(.*?)'>(.*?)<br/", match_text)
                has_tree = [(each[0], each[0].split('.html')[0], each[1]) for each in base_tree]
            base_no = re.findall(r"td>(\d+?)</td><td>(.*?)</td></tr>", match_text)
            no_tree = [(each[0], re.findall(r'<td>(.+)', each[1])[0] if 'td' in each[1] else each[1]) for each in base_no]
            for each in has_tree:  # entries with links become roots for the next depth
                each_dir = change_root + each[0]
                next_root[each_dir] = (code_join + each[1][:choose_ls[depth]], zone_join + each[2])
                if depth == 3:
                    if (total_count+1) % 100 == 0:
                        print('已爬取%d个,在路径%s处' % (total_count+1, zone_join+each[2]))
                else:
                    print('在路径%s处' % (zone_join+each[2]))
            if no_tree:
                for each in no_tree:  # entries without links are finished leaves
                    total_dict[code_join+each[0][:choose_ls[depth]]] = zone_join+each[1]
                    if depth == 4:
                        if (total_count+1) % 800 == 0:
                            print('已爬取%d个,在路径%s处' % (total_count+1, zone_join+each[1]))
                    else:
                        print('已获取路径%s' % (zone_join+each[1]))
                    total_count += 1
    depth += 1
    each_root = next_root
def decompose(each):
    # total_dict holds two shapes of entry: leaf records (code string -> name string)
    # and the final-depth roots merged in below (url -> (code, name) tuple)
    if isinstance(total_dict[each], tuple):
        codelist = total_dict[each][0].split('-')
        namelist = total_dict[each][1].split('-')
    else:
        codelist = each.split('-')
        namelist = total_dict[each].split('-')
    if len(codelist) < depth:  # pad shallow leaves so every row has the same width
        for i in range(len(codelist), depth):
            codelist.append('')
            namelist.append('')
    ziplist = list(zip(codelist, namelist))
    return [i for j in ziplist for i in j]
sort_name = ['省级', '地级', '县级', '乡级', '村级']
real_column = [(sort_name[each]+'代码', sort_name[each]+'名称') for each in range(depth)]
flat_col = [i for each in real_column for i in each]
total_dict.update(each_root)
if depth <= 3:  # county level and above is small (around 3,000 rows), so Excel works fine
    wk = xlsxwriter.Workbook('五级联动.xlsx')
    sh = wk.add_worksheet('sheet1')
    for each in range(2*depth):
        sh.write(0, each, flat_col[each])
    totalrow = 1
    for each in total_dict:
        flatlist = decompose(each)
        for i in range(2*depth):
            sh.write(totalrow, i, flatlist[i])
        totalrow += 1
    wk.close()
else:  # below county level the data is far larger and Excel has no advantage, so write a csv instead
    book = open('五级联动.csv', 'w', encoding='utf-8')
    book.write(','.join(flat_col)+'\n')
    for each in total_dict:
        flatten = decompose(each)
        book.write(','.join(flatten)+'\n')
    book.close()
time_end = time.time()
rest_second = time_end - time_start
print('用时%d分%d秒' % divmod(rest_second, 60))
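As a quick sanity check on the 630,000+ figure mentioned at the top, the finished csv can be re-read and the rows that actually carry a village-level entry counted. A small sketch of such a check, assuming the header written by the script above; this snippet is my own addition, not part of the crawler:

import pandas as pd

df = pd.read_csv('五级联动.csv', dtype=str)
# rows padded with empty strings by decompose() show up as NaN here,
# so non-null 村级名称 cells correspond to actual village records
village_rows = df['村级名称'].notna().sum()
print('village-level records: %d' % village_rows)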
After sorting the village-level data with pandas sort_values, the result is as shown in the figure:
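The sort itself is a one-liner; roughly what was run, assuming the same csv and column header as above:

import pandas as pd

df = pd.read_csv('五级联动.csv', dtype=str)
df = df.sort_values('村级代码')  # order by the 12-digit village code
print(df.head())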