Barring any fresh data, this should be the last post in the National Bureau of Statistics series.
The approach is essentially the same root-node/leaf-node idea used in the earlier NBS crawlers. Code first; a detailed explanation will follow once I have thought it through~ (the code carries some inline comments).
Crawling all the way down to the village level took about an hour and a half (no proxy IPs or multiprocessing, and the final pass has to visit 40,000+ town-level pages, which is the slow part), yielding 630,000+ village records. Figures floating around online range from 640,000 to 740,000, so not only do they disagree with my count, they don't even agree with each other, and I have no idea why qaq
(Update: this blogger's post https://blog.csdn.net/xuemu2008/article/details/110262257 reports exactly the same number of records as this code produces, so the count here should be correct.)
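Aside: since the bottleneck is fetching the 40,000+ town pages one by one, the requests could in principle be parallelized. Below is a minimal sketch using a thread pool; fetch_page, fetch_level and the worker count are my own illustrative names and are not part of the script further down, which stays single-threaded.

import requests
from concurrent.futures import ThreadPoolExecutor, as_completed

MAX_WORKERS = 8  # assumption: a modest pool, to stay polite to the server

def fetch_page(url, headers):
    # hypothetical helper: fetch and decode one page (retries omitted for brevity)
    req = requests.get(url, headers=headers, timeout=10)
    req.encoding = 'GBK'
    return url, req.text

def fetch_level(urls, headers):
    # fetch all pages of one depth level concurrently and return {url: html}
    results = {}
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as pool:
        futures = [pool.submit(fetch_page, u, headers) for u in urls]
        for fut in as_completed(futures):
            url, text = fut.result()
            results[url] = text
    return results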
import requests
import re
import xlsxwriter
import time
time_start = time.time()
agent = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'}
choose_ls = [depth*2 if depth <= 3 else 3*(depth-1) for depth in range(1, 6)]  # how many leading digits of the 12-digit code to keep at each depth: 2/4/6/9/12
match_level = ['provincetr', 'citytr', 'countytr', 'towntr', 'villagetr']
initurl = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/index.html'
total_dict = {}
depth = 0
each_root = {initurl: ('', '')}
max_depth = 5  # optional; 1-5 stand for province/prefecture/county/town/village, so the crawl collects every leaf within this depth plus the root nodes at this depth
while depth < max_depth:
    total_count = 0
    next_root = {}
    for url in each_root:
        code_join = each_root[url][0] + '-' if depth != 0 else each_root[url][0]
        zone_join = each_root[url][1] + '-' if depth != 0 else each_root[url][1]
        change_root = '/'.join(url.split('/')[:-1]) + '/'
        while True:  # retry until a recognizable table is extracted
            try:
                req = requests.get(url, headers=agent)
                req.encoding = 'GBK'  # decode the Chinese pages; don't use req.encoding=req.apparent_encoding, which guesses 'gb2312' and fails on many less common characters
                text = req.text
                text = text.replace('\n', '\\n')  # '.' in the regexes below doesn't match newlines, so flatten the page to a single line first
                special_sigh = False
                if match_level[depth] in text:
                    match_text = re.findall(r"class='%s'>(.*?)</table" % match_level[depth], text)[0]
                    break
                else:
                    search = False
                    for level in range(depth, 5):  # Dongguan, Zhongshan and Danzhou have no county level, so look for a deeper table and defer this URL to the next depth
                        if match_level[level] in text:
                            match_text = re.findall(r"class='%s'>(.*?)</table" % match_level[level], text)[0]
                            search = True
                            special_sigh = True
                            print('特殊区划:%s' % each_root[url][1])
                            break
                    if search:
                        break
                    else:
                        print('服务器繁忙')
                        time.sleep(2)
            except Exception:
                print('服务器繁忙')
                time.sleep(2)
        if special_sigh:
            next_root[url] = (code_join, zone_join)  # re-queue the same URL one level deeper
        else:
            if depth != 0:
                has_tree = re.findall(r"href='(.*?)'>(\d+?)<.*?html'>(.*?)</a></td></tr>", match_text)
            else:
                base_tree = re.findall(r"href='(.*?)'>(.*?)<br/", match_text)
                has_tree = [(each[0], each[0].split('.html')[0], each[1]) for each in base_tree]
            base_no = re.findall(r"td>(\d+?)</td><td>(.*?)</td></tr>", match_text)
            no_tree = [(each[0], re.findall(r'<td>(.+)', each[1])[0] if 'td' in each[1] else each[1]) for each in base_no]
            for each in has_tree:  # entries with links become roots for the next depth
                each_dir = change_root + each[0]
                next_root[each_dir] = (code_join + each[1][:choose_ls[depth]], zone_join + each[2])
                if depth == 3:
                    if (total_count+1) % 100 == 0:
                        print('已爬取%d个,在路径%s处' % (total_count+1, zone_join+each[2]))
                else:
                    print('在路径%s处' % (zone_join+each[2]))
            if no_tree:
                for each in no_tree:  # entries without links are finished leaves
                    total_dict[code_join+each[0][:choose_ls[depth]]] = zone_join+each[1]
                    if depth == 4:
                        if (total_count+1) % 800 == 0:
                            print('已爬取%d个,在路径%s处' % (total_count+1, zone_join+each[1]))
                    else:
                        print('已获取路径%s' % (zone_join+each[1]))
                    total_count += 1
    depth += 1
    each_root = next_root
def decompose(each):
    # total_dict holds two shapes of entry: leaf records (code string -> name string)
    # and the final-depth roots merged in below (url -> (code, name) tuple)
    if isinstance(total_dict[each], tuple):
        codelist = total_dict[each][0].split('-')
        namelist = total_dict[each][1].split('-')
    else:
        codelist = each.split('-')
        namelist = total_dict[each].split('-')
    if len(codelist) < depth:  # pad shallow leaves so every row has the same width
        for i in range(len(codelist), depth):
            codelist.append('')
            namelist.append('')
    ziplist = list(zip(codelist, namelist))
    return [i for j in ziplist for i in j]
sort_name = ['省级', '地级', '县级', '乡级', '村级']
real_column = [(sort_name[each]+'代码', sort_name[each]+'名称') for each in range(depth)]
flat_col = [i for each in real_column for i in each]
total_dict.update(each_root)
if depth <= 3:  # county level and above is small (around 3,000 rows), so Excel works fine
    wk = xlsxwriter.Workbook('五级联动.xlsx')
    sh = wk.add_worksheet('sheet1')
    for each in range(2*depth):
        sh.write(0, each, flat_col[each])
    totalrow = 1
    for each in total_dict:
        flatlist = decompose(each)
        for i in range(2*depth):
            sh.write(totalrow, i, flatlist[i])
        totalrow += 1
    wk.close()
else:  # below county level the data is far larger and Excel has no advantage, so write a csv instead
    book = open('五级联动.csv', 'w', encoding='utf-8')
    book.write(','.join(flat_col)+'\n')
    for each in total_dict:
        flatten = decompose(each)
        book.write(','.join(flatten)+'\n')
    book.close()
time_end = time.time()
rest_second = time_end - time_start
print('用时%d分%d秒' % divmod(rest_second, 60))
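As a quick sanity check on the 630,000+ figure mentioned at the top, the finished csv can be re-read and the rows that actually carry a village-level entry counted. A small sketch of such a check, assuming the header written by the script above; this snippet is my own addition, not part of the crawler:

import pandas as pd

df = pd.read_csv('五级联动.csv', dtype=str)
# rows padded with empty strings by decompose() show up as NaN here,
# so non-null 村级名称 cells correspond to actual village records
village_rows = df['村级名称'].notna().sum()
print('village-level records: %d' % village_rows)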
After sorting the village-level data with pandas sort_values, the result is as shown in the figure:
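The sort itself is a one-liner; roughly what was run, assuming the same csv and column header as above:

import pandas as pd

df = pd.read_csv('五级联动.csv', dtype=str)
df = df.sort_values('村级代码')  # order by the 12-digit village code
print(df.head())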