Python 爬虫 中国行政区划信息爬取 (初学者)
背景
业务部门需要更新最新的全国区划信息数据,建立基础数据库,权威数据当然是国家统计局的官方数据, http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/
这里要做的,就是将其爬取下来。
环境准备
我们使用python工具爬取数据,并将其保存为Excel:
- python环境 ,略过;
- 相关依赖
requests、BeautifulSoup、pandas、threading、os
;
requests 用于web请求,并获取页面数据;
BeautifulSoup 提取页面数据;
pandas 数据分析,此处仅仅用来方便数据导出;
threading 多线程爬取;
代码片段
1、定义地址信息对象
封装解析后的数据,areainfo
class areainfo():
    """One administrative-division record parsed from the NBS site."""

    def __init__(self):
        # All fields are strings and start empty; the parser fills them in.
        self.areacode = ''    # division code
        self.areaname = ''    # division name
        self.parentcode = ''  # parent division code
        self.leve = ''        # level "1".."5"
        self.href = ''        # relative link to the child page

    def as_dict(self):
        """Return the record as a plain dict (row for the DataFrame export)."""
        # Attribute insertion order matches the original literal's key order.
        return dict(vars(self))
2、地址解析对象
将整个地址解析方法封装为一个类,包含 web请求、web解析等方法
2.1 获取web信息
def getUrl(self, url):
    """Fetch *url* and return a parsed BeautifulSoup document.

    Returns None on any request/parse failure; the failing URL is appended
    to err.log so the missing data can be re-fetched later.
    """
    try:
        headers = {
            # Desktop browser UA: the site may serve different markup to bots.
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36'}
        # A timeout is essential for a crawler: without one, a single
        # stalled connection hangs the worker thread forever.
        resp = requests.get(url, headers=headers, timeout=30)
        resp.encoding = 'gbk'  # stats.gov.cn pages are GBK-encoded
        return BeautifulSoup(resp.text, "html.parser")
    # Record the failed request.
    except Exception as e:
        print(e)
        # "a" appends, so failures from earlier runs are preserved.
        with open('err.log', "a") as file:
            file.write(url + "\n")
        return None
该处将异常的请求存到err.log文件中,以便于后期读取异常链接,补充丢失数据。
2.2 web信息解析
# classname: CSS class of the target <tr> rows; parnetcode: parent division
# code; leve: level string for the rows being parsed.
def initAreainfo(self, url, classname, parnetcode, leve):
    """Fetch *url* and parse every <tr class=*classname*> row into areainfo records.

    Returns a list of areainfo objects, or None when the page could not
    be fetched.
    """
    print( "页面便签 %s -- 地址等级 %s --- url %s \n" % (classname,leve,url))
    soup = self.getUrl(url)
    if soup is None:
        return None
    rows = soup.find_all(name='tr', attrs={"class": classname})
    entities = []  # renamed from `list`, which shadowed the builtin
    for row in rows:
        entity = areainfo()
        entity.leve = leve
        entity.parentcode = parnetcode
        links = row.find_all('a')
        # Require BOTH links (code + name).  The original `len(group) > 0`
        # guard raised IndexError on a row with exactly one <a>; such rows
        # now fall through to the <td>-cell branch.
        if len(links) > 1:
            entity.href = links[0]['href']
            entity.areacode = links[0].string
            entity.areaname = links[1].string
        else:
            entity.href = ''
            tds = row.find_all('td')
            if len(tds) == 2:
                entity.areacode = tds[0].string
                entity.areaname = tds[1].string
            if len(tds) == 3:
                # Village rows: the middle cell is the urban-rural
                # classification code, which is skipped here.
                entity.areacode = tds[0].string
                entity.areaname = tds[2].string
        entities.append(entity)
    return entities
网页中,每一层级区划信息的标签不同,可使用浏览器F12进入调试模式识别。BeautifulSoup 通过对标签class提取,获取需要的区划信息数据。
eg
2.3 区划信息提取
各等级区划信息提取,分别调用2.2的方法进行解析。每个方法返回地址list
'''
获取一级省份
'''
def getPronvice(self):
    """Parse the index page and return the level-1 provinces, or None on failure."""
    soup = self.getUrl(self.base)
    if soup is None:
        return None
    provinceList = []
    # Province rows carry the "provincetr" class on the index page.
    for row in soup.find_all(name='tr', attrs={"class": "provincetr"}):
        for link in row.find_all('a'):
            province = areainfo()
            province.href = link['href']
            province.areaname = link.get_text()
            # e.g. "11.html" -> "110000": extend the 2-digit prefix to a
            # full 6-digit division code.
            province.areacode = link['href'].replace(".html", "0000")
            province.parentcode = "0"
            province.leve = "1"
            print(province.__dict__)
            provinceList.append(province)
    return provinceList
'''
获取二级城市
'''
def getCity(self, parent):
    """Return the level-2 cities under *parent* (a province record)."""
    return self.initAreainfo(self.base + parent.href, "citytr", parent.areacode, "2")
'''
获取三级城市
'''
def getCounty(self, parent):
    """Return the level-3 counties under *parent* (a city record)."""
    return self.initAreainfo(self.base + parent.href, "countytr", parent.areacode, "3")
'''
获取四级地址
'''
def getTown(self, parent):
    """Return the level-4 towns under *parent*, or None when it has no child page."""
    if parent.href == '':
        return None
    # Town pages live under a "<province>/" sub-directory of the base URL.
    page = self.base + parent.areacode[0:2] + '/' + parent.href
    return self.initAreainfo(page, "towntr", parent.areacode, "4")
'''
获取五级地址
'''
def getVillagetr(self, parent):
    """Return the level-5 villages under *parent*, or None when it has no child page."""
    if parent.href == '':
        return None
    # Village pages live under "<province>/<city>/" sub-directories.
    page = (self.base + parent.areacode[0:2] + '/'
            + parent.areacode[2:4] + '/' + parent.href)
    return self.initAreainfo(page, "villagetr", parent.areacode, "5")
2.4 省份数据封装
获取一个省下边所有地址数据
'''
通过省份获取该省份下所有地址信息
'''
def finAllPronvinceCity(self, pro, dir):
    """Crawl every division under province *pro* and export it to <dir>/<name>.xlsx.

    pro: the province areainfo record; dir: output directory (created on
    demand).  Runs as one worker thread per province.
    """
    listall = [pro]
    citylist = self.getCity(pro)
    if citylist is None:
        # Province page failed to load (already recorded in err.log);
        # still export the province row itself instead of crashing.
        citylist = []
    for city in citylist:
        listall.append(city)
        conlist = self.getCounty(city)
        if conlist is not None:
            for county in conlist:
                listall.append(county)
                townlist = self.getTown(county)
                if townlist is not None:
                    for town in townlist:
                        listall.append(town)
                        villagelist = self.getVillagetr(town)
                        if villagelist is not None:
                            listall.extend(villagelist)
    df = pd.DataFrame([x.as_dict() for x in listall])
    # exist_ok avoids a race when several province threads create the
    # directory at the same time.
    os.makedirs(dir, exist_ok=True)
    filepath = os.path.join(dir, pro.areaname + '.xlsx')
    # The context manager saves and closes the workbook;
    # ExcelWriter.save() was removed in pandas 2.0.
    with pd.ExcelWriter(filepath) as writer:
        df.to_excel(writer, float_format='%.5f')
2.5 线程封装
'''
异步调用
'''
def ruanthread(self):
    """Start one crawler thread per province (asynchronous fan-out)."""
    provinces = self.getPronvice()
    if provinces is None:
        # Index page unreachable; the URL is already in err.log.  The
        # original crashed here iterating None.
        return
    for province in provinces:
        threading.Thread(target=self.finAllPronvinceCity,
                         args=(province, 'F://areainfo')).start()
2.6 万能的MAIN
if __name__ == '__main__':
    # Use a distinct variable name: the original `china_city = china_city()`
    # rebound the class name to an instance, making the class unreachable.
    crawler = china_city()
    crawler.ruanthread()
2.7 补充-err.log 数据处理
构建新的方法,仅仅解析区划信息。该方法不太完善,仅参考
def getCityOnly(self, url, str, leve):
    """Re-parse one failed URL; the parent code is left empty.

    NOTE(review): the parameter name `str` shadows the builtin; it is kept
    unchanged because it is part of the method's visible interface.
    """
    return self.initAreainfo(url, str, "", leve)
输出数据
def errFileRe(self):
    """Re-crawl every URL recorded in err.log and return the parsed records."""
    listother = []
    with open('err.log', "r") as file:
        for line in file:
            # Skip blank lines.
            if line.isspace():
                continue
            url = line.replace("\n", '')
            parsed = self.getCityOnly(url, "villagetr", "5")
            # The fetch may fail again; the original crashed calling
            # extend(None) in that case.
            if parsed is not None:
                listother.extend(parsed)
    return listother
跑起来
运行日志
导出数据列表
数据格式
err.log日志:
完整代码
附上完整代码
import requests
from bs4 import BeautifulSoup
import pandas as pd
import threading
import os
class areainfo():
    """Value object for one administrative-division row."""

    def __init__(self):
        # Every field defaults to the empty string; the crawler assigns them.
        self.areacode = ''    # division code
        self.areaname = ''    # division name
        self.parentcode = ''  # parent division code
        self.leve = ''        # level "1".."5"
        self.href = ''        # relative link to the child page

    def as_dict(self):
        """Export the record as a dict row for pandas."""
        # vars() yields the same keys, in attribute insertion order.
        return dict(vars(self))
class china_city():
    """Crawler for the NBS 2020 administrative-division pages.

    Walks the five-level hierarchy (province -> city -> county -> town ->
    village) and exports one Excel file per province.
    """

    def __init__(self):
        # Index page of the 2020 dataset; all hrefs on the site are
        # relative to this base URL.
        self.base = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/'

    def getUrl(self, url):
        """Fetch *url* and return a BeautifulSoup document, or None on failure.

        Failed URLs are appended to err.log so they can be retried later.
        """
        try:
            headers = {
                'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36'}
            # A timeout is essential: without one a stalled connection
            # hangs the crawling thread forever.
            resp = requests.get(url, headers=headers, timeout=30)
            resp.encoding = 'gbk'  # the pages are GBK-encoded
            return BeautifulSoup(resp.text, "html.parser")
        # Record the failed request.
        except Exception as e:
            print(e)
            # "a" appends, preserving failures from earlier runs.
            with open('err.log', "a") as file:
                file.write(url + "\n")
            return None

    def getPronvice(self):
        """Parse the index page and return the level-1 provinces, or None."""
        soup = self.getUrl(self.base)
        if soup is None:
            return None
        provinceList = []
        for row in soup.find_all(name='tr', attrs={"class": "provincetr"}):
            for link in row.find_all('a'):
                province = areainfo()
                province.href = link['href']
                province.areaname = link.get_text()
                # "11.html" -> "110000": extend the 2-digit prefix to a
                # full 6-digit division code.
                province.areacode = link['href'].replace(".html", "0000")
                province.parentcode = "0"
                province.leve = "1"
                print(province.__dict__)
                provinceList.append(province)
        return provinceList

    def getCity(self, parent):
        """Level-2 cities under a province record."""
        return self.initAreainfo(self.base + parent.href, "citytr", parent.areacode, "2")

    def getCounty(self, parent):
        """Level-3 counties under a city record."""
        return self.initAreainfo(self.base + parent.href, "countytr", parent.areacode, "3")

    def getTown(self, parent):
        """Level-4 towns under a county, or None when it has no child page."""
        if parent.href == '':
            return None
        # Town pages live under a "<province>/" sub-directory.
        url = self.base + parent.areacode[0:2] + '/' + parent.href
        return self.initAreainfo(url, "towntr", parent.areacode, "4")

    def getVillagetr(self, parent):
        """Level-5 villages under a town, or None when it has no child page."""
        if parent.href == '':
            return None
        # Village pages live under "<province>/<city>/" sub-directories.
        url = (self.base + parent.areacode[0:2] + '/'
               + parent.areacode[2:4] + '/' + parent.href)
        return self.initAreainfo(url, "villagetr", parent.areacode, "5")

    def initAreainfo(self, url, classname, parnetcode, leve):
        """Parse every <tr class=*classname*> row of *url* into areainfo records.

        Returns a list of areainfo objects, or None when the page could
        not be fetched.
        """
        print( "页面便签 %s -- 地址等级 %s --- url %s \n" % (classname,leve,url))
        soup = self.getUrl(url)
        if soup is None:
            return None
        entities = []  # renamed from `list`, which shadowed the builtin
        for row in soup.find_all(name='tr', attrs={"class": classname}):
            entity = areainfo()
            entity.leve = leve
            entity.parentcode = parnetcode
            links = row.find_all('a')
            # Require BOTH links (code + name); the original `len(group) > 0`
            # guard raised IndexError on rows with exactly one <a>.
            if len(links) > 1:
                entity.href = links[0]['href']
                entity.areacode = links[0].string
                entity.areaname = links[1].string
            else:
                entity.href = ''
                tds = row.find_all('td')
                if len(tds) == 2:
                    entity.areacode = tds[0].string
                    entity.areaname = tds[1].string
                if len(tds) == 3:
                    # Village rows: the middle cell is the urban-rural
                    # classification code, which is skipped.
                    entity.areacode = tds[0].string
                    entity.areaname = tds[2].string
            entities.append(entity)
        return entities

    def finAllPronvinceCity(self, pro, dir):
        """Crawl every division under province *pro*; write <dir>/<name>.xlsx."""
        listall = [pro]
        citylist = self.getCity(pro)
        if citylist is None:
            # Province page failed (already in err.log); export the
            # province row itself instead of crashing on None.
            citylist = []
        for city in citylist:
            listall.append(city)
            conlist = self.getCounty(city)
            if conlist is not None:
                for county in conlist:
                    listall.append(county)
                    townlist = self.getTown(county)
                    if townlist is not None:
                        for town in townlist:
                            listall.append(town)
                            villagelist = self.getVillagetr(town)
                            if villagelist is not None:
                                listall.extend(villagelist)
        df = pd.DataFrame([x.as_dict() for x in listall])
        # exist_ok avoids a race when several province threads create the
        # directory simultaneously.
        os.makedirs(dir, exist_ok=True)
        filepath = os.path.join(dir, pro.areaname + '.xlsx')
        # The context manager saves and closes the workbook;
        # ExcelWriter.save() was removed in pandas 2.0.
        with pd.ExcelWriter(filepath) as writer:
            df.to_excel(writer, float_format='%.5f')

    def ruanthread(self):
        """Start one crawler thread per province (asynchronous fan-out)."""
        provinces = self.getPronvice()
        if provinces is None:
            # Index page unreachable; nothing to do (URL is in err.log).
            return
        for province in provinces:
            threading.Thread(target=self.finAllPronvinceCity,
                             args=(province, 'F://areainfo')).start()
if __name__ == '__main__':
    # Use a distinct variable name: the original `china_city = china_city()`
    # rebound the class name to an instance, shadowing the class itself.
    crawler = china_city()
    crawler.ruanthread()
第一个爬虫程序,感谢交流,评论。