Python 爬虫 中国行政区划信息爬取 (初学者)

1 篇文章 0 订阅
1 篇文章 0 订阅

背景

业务部门需要更新最新的全国区划信息数据,建立基础数据库,权威数据当然是国家统计局的官方数据, http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/
在这里插入图片描述
这里要做的,就是将其爬取下来。

环境准备

我们使用python工具爬取数据,并将其保存为Excel:

  1. python环境 ,略过;
  2. 相关依赖requests、BeautifulSoup、pandas、threading、os
    requests 用于web请求,并获取页面数据;
    BeautifulSoup 提取页面数据;
    pandas 数据分析,此处仅仅用来方便数据导出;
    threading 多线程爬取;
    os 目录创建与文件路径处理;

代码片段

1、定义地址信息对象

封装解析后的数据,areainfo

class areainfo():
    """Value object for one administrative-division record."""

    def __init__(self):
        # All fields default to empty strings; the parser fills them in.
        self.areacode = ''    # division code
        self.areaname = ''    # division name
        self.parentcode = ''  # code of the parent division
        self.leve = ''        # address level ("1" province ... "5" village)
        self.href = ''        # relative link to the child listing page

    def as_dict(self):
        """Return the record as a plain dict (handy for DataFrame export)."""
        return dict(vars(self))

2、地址解析对象

将整个地址解析方法封装为一个类,包含 web请求、web解析等方法
在这里插入图片描述

2.1 获取web信息

    def getUrl(self,url):
       """Fetch *url* and return a parsed BeautifulSoup document.

       On any request/parsing error the URL is appended to err.log so the
       missing page can be re-crawled later, and None is returned.
       """
       try:
           headers = {
               'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36'}
           resp = requests.get(url, headers=headers)
           resp.encoding = 'gbk'  # force GBK decoding — assumed site encoding, not UTF-8
           text = resp.text
           soup = BeautifulSoup(text, "html.parser")
           return  soup
       # log the failed request so it can be retried later
       except  Exception  as e:
           print(e)
           with open('err.log', "a") as file:  # "a" appends; "w" would overwrite on every run
               file.write(url  + "\n")

           return  None

该处将异常的请求存到err.log文件中,以便于后期读取异常链接,补充丢失数据。

2.2 web信息解析

      # classname: <tr> CSS class marking division rows on this page,
      # parnetcode: code of the parent division, leve: level string for the rows.
      def initAreainfo(self, url, classname, parnetcode, leve):
          """Parse one listing page into a list of areainfo records.

          Returns a (possibly empty) list of areainfo, or None when the
          page could not be fetched.
          """
          print("页面便签 %s -- 地址等级 %s --- url  %s  \n" % (classname, leve, url))
          soup = self.getUrl(url)
          if soup is None:
              return None

          records = []
          # Each division is one <tr class=classname> row.
          for row in soup.find_all(name='tr', attrs={"class": classname}):
              anchors = row.find_all('a')
              entity = areainfo()
              entity.leve = leve
              entity.parentcode = parnetcode
              if len(anchors) > 0:
                  # Linked row: first <a> carries the code, second the name.
                  entity.href = anchors[0]['href']
                  entity.areacode = anchors[0].string
                  entity.areaname = anchors[1].string
              else:
                  # Leaf row without links: read the plain <td> cells.
                  cells = row.find_all('td')
                  entity.href = ''
                  if len(cells) == 2:
                      entity.areacode = cells[0].string
                      entity.areaname = cells[1].string
                  if len(cells) == 3:
                      # 3-cell rows skip the middle cell (presumably a
                      # classification code) — TODO confirm against the site.
                      entity.areacode = cells[0].string
                      entity.areaname = cells[2].string
                      entity.parentcode = parnetcode
              records.append(entity)
          return records

网页中,每一层级区划信息的标签不同,可使用浏览器F12进入调试模式识别。BeautifulSoup 通过对标签class提取,获取需要的区划信息数据。
例如:通过 class='citytr' 提取城市信息数据。

2.3 区划信息提取

各等级区划信息提取,分别调用2.2的方法进行解析。每个方法返回地址list

    def getPronvice(self):
        """Fetch the level-1 provinces from the index page (None on failure)."""
        soup = self.getUrl(self.base)
        if soup is None:
            return None

        provinces = []
        # One <a> per province inside <tr class="provincetr"> rows.
        for row in soup.find_all(name='tr', attrs={"class": "provincetr"}):
            for anchor in row.find_all('a'):
                province = areainfo()
                province.href = anchor['href']
                province.areaname = anchor.get_text()
                # e.g. "11.html" -> code "110000"
                province.areacode = anchor['href'].replace(".html", "0000")
                province.parentcode = "0"
                province.leve = "1"
                print(province.__dict__)
                provinces.append(province)
        return provinces

    def getCity(self, parent):
        """Fetch the level-2 cities below province *parent*."""
        return self.initAreainfo(self.base + parent.href, "citytr",
                                 parent.areacode, "2")

    def getCounty(self, parent):
        """Fetch the level-3 counties below city *parent*."""
        return self.initAreainfo(self.base + parent.href, "countytr",
                                 parent.areacode, "3")

    def getTown(self, parent):
        """Fetch the level-4 towns below county *parent*.

        Returns None when the county row had no link of its own.
        """
        if parent.href == '':
            return None
        # Town pages live under a <2-digit province prefix>/ subdirectory.
        url = self.base + parent.areacode[0:2] + '/' + parent.href
        return self.initAreainfo(url, "towntr", parent.areacode, "4")

    def getVillagetr(self, parent):
        """Fetch the level-5 villages below town *parent*.

        Returns None when the town row had no link of its own.
        """
        if parent.href == '':
            return None
        # Village pages live under <province>/<city-digits>/ subdirectories.
        url = (self.base + parent.areacode[0:2] + '/' +
               parent.areacode[2:4] + '/' + parent.href)
        return self.initAreainfo(url, "villagetr", parent.areacode, "5")

2.4 省份数据封装

获取一个省下边所有地址数据

    def finAllPronvinceCity(self, pro, dir):
        """Crawl everything below province *pro* and write <dir>/<name>.xlsx."""
        listall = [pro]
        # Every getter may return None when its page failed to load, so
        # guard each level with `or []` before iterating (the original
        # crashed with TypeError when getCity failed).
        for city in self.getCity(pro) or []:
            listall.append(city)
            for county in self.getCounty(city) or []:
                listall.append(county)
                for town in self.getTown(county) or []:
                    listall.append(town)
                    listall.extend(self.getVillagetr(town) or [])
        df = pd.DataFrame([x.as_dict() for x in listall])
        os.makedirs(dir, exist_ok=True)
        filepath = os.path.join(dir, pro.areaname + '.xlsx')
        # The context manager saves and closes the workbook; ExcelWriter.save()
        # was removed in pandas 2.0.
        with pd.ExcelWriter(filepath) as writer:
            df.to_excel(writer, float_format='%.5f')

2.5 线程封装

   def ruanthread(self):
       """Crawl all provinces concurrently, one worker thread per province."""
       provinces = self.getPronvice()
       if provinces is None:
           # The index page itself failed to load; nothing to crawl.
           return
       for province in provinces:
           threading.Thread(target=self.finAllPronvinceCity,
                            args=(province, 'F://areainfo')).start()

2.6 万能的MAIN

  if __name__ == '__main__':
      # Kick off the crawl for every province.
      crawler = china_city()
      crawler.ruanthread()

2.7 补充-err.log 数据处理

构建新的方法,仅仅解析区划信息。该方法不太完善,仅供参考。

  def getCityOnly(self, url, str, leve):
      """Parse a single listing page with row class *str* at level *leve*.

      The parent code is left blank — used when re-crawling err.log URLs
      whose parent is unknown.
      """
      return self.initAreainfo(url, str, "", leve)

输出数据

def errFileRe(self):
    """Re-crawl every URL recorded in err.log and return the parsed records.

    Note: as written this assumes every logged URL is a village page
    ("villagetr") — TODO generalize per level.
    """
    listother = []
    with open('err.log', "r") as file:
        for line in file:
            # Skip the blank lines left between logged URLs.
            if line.isspace():
                continue
            url = line.replace("\n", '')
            records = self.getCityOnly(url, "villagetr", "5")
            # getCityOnly returns None when the fetch fails again; the
            # original called extend(None) and raised TypeError here.
            if records is not None:
                listother.extend(records)
    return listother

跑起来

运行日志
在这里插入图片描述
导出数据列表
在这里插入图片描述
数据格式
在这里插入图片描述
err.log日志:
在这里插入图片描述

完整代码

附上完整代码

import  requests
from bs4 import BeautifulSoup
import pandas as pd
import threading
import os

class areainfo():
    """One administrative-division record produced by the crawler."""

    def __init__(self):
        # Fields start empty and are populated while parsing the pages.
        self.areacode = ''    # division code
        self.areaname = ''    # division name
        self.parentcode = ''  # code of the parent division
        self.leve = ''        # level: "1" province ... "5" village
        self.href = ''        # relative link to the child listing page

    def as_dict(self):
        """Return the record as a plain dict for DataFrame export."""
        return dict(vars(self))
class china_city():
    """Crawler for the NBS 2020 China administrative-division listings.

    Walks the hierarchy province -> city -> county -> town -> village and
    exports one Excel file per province.  URLs that fail to load are
    appended to err.log so they can be re-crawled later.
    """

    def __init__(self):
        # Root of the 2020 division pages on stats.gov.cn.
        self.base = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/'

    def getUrl(self, url):
        """Fetch *url* and return a parsed BeautifulSoup document.

        Returns None on any failure; the failing URL is appended to
        err.log so the missing page can be fetched again later.
        """
        try:
            headers = {
                'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36'}
            # timeout keeps a stalled request from hanging a worker thread forever
            resp = requests.get(url, headers=headers, timeout=30)
            resp.encoding = 'gbk'  # force GBK decoding — assumed site encoding, not UTF-8
            return BeautifulSoup(resp.text, "html.parser")
        # log the failed request so it can be retried later
        except Exception as e:
            print(e)
            with open('err.log', "a") as file:  # "a" appends; "w" would overwrite each run
                file.write(url + "\n")
            return None

    def getPronvice(self):
        """Fetch the level-1 provinces from the index page (None on failure)."""
        soup = self.getUrl(self.base)
        if soup is None:
            return None

        provinces = []
        # One <a> per province inside <tr class="provincetr"> rows.
        for row in soup.find_all(name='tr', attrs={"class": "provincetr"}):
            for anchor in row.find_all('a'):
                province = areainfo()
                province.href = anchor['href']
                province.areaname = anchor.get_text()
                # "11.html" -> "110000": expand the 2-digit prefix to a full code
                province.areacode = anchor['href'].replace(".html", "0000")
                province.parentcode = "0"
                province.leve = "1"
                print(province.__dict__)
                provinces.append(province)
        return provinces

    def getCity(self, parent):
        """Fetch the level-2 cities below province *parent*."""
        return self.initAreainfo(self.base + parent.href, "citytr",
                                 parent.areacode, "2")

    def getCounty(self, parent):
        """Fetch the level-3 counties below city *parent*."""
        return self.initAreainfo(self.base + parent.href, "countytr",
                                 parent.areacode, "3")

    def getTown(self, parent):
        """Fetch the level-4 towns below county *parent*.

        Returns None when the county row had no link of its own.
        """
        if parent.href == '':
            return None
        # Town pages live under a <2-digit province prefix>/ subdirectory.
        url = self.base + parent.areacode[0:2] + '/' + parent.href
        return self.initAreainfo(url, "towntr", parent.areacode, "4")

    def getVillagetr(self, parent):
        """Fetch the level-5 villages below town *parent*.

        Returns None when the town row had no link of its own.
        """
        if parent.href == '':
            return None
        # Village pages live under <province>/<city-digits>/ subdirectories.
        url = (self.base + parent.areacode[0:2] + '/' +
               parent.areacode[2:4] + '/' + parent.href)
        return self.initAreainfo(url, "villagetr", parent.areacode, "5")

    def initAreainfo(self, url, classname, parnetcode, leve):
        """Parse one listing page into a list of areainfo records.

        classname -- the <tr> CSS class marking division rows on this page
        parnetcode -- code of the parent division
        leve -- level string ("2".."5") stamped on every record

        Returns a (possibly empty) list, or None when the fetch failed.
        """
        print("页面便签 %s -- 地址等级 %s --- url  %s  \n" % (classname, leve, url))
        soup = self.getUrl(url)
        if soup is None:
            return None

        records = []
        for row in soup.find_all(name='tr', attrs={"class": classname}):
            anchors = row.find_all('a')
            entity = areainfo()
            entity.leve = leve
            entity.parentcode = parnetcode
            if len(anchors) > 0:
                # Linked row: first <a> carries the code, second the name.
                entity.href = anchors[0]['href']
                entity.areacode = anchors[0].string
                entity.areaname = anchors[1].string
            else:
                # Leaf row without links: read the plain <td> cells.
                cells = row.find_all('td')
                entity.href = ''
                if len(cells) == 2:
                    entity.areacode = cells[0].string
                    entity.areaname = cells[1].string
                if len(cells) == 3:
                    # 3-cell rows skip the middle cell (presumably a
                    # classification code) — TODO confirm against the site.
                    entity.areacode = cells[0].string
                    entity.areaname = cells[2].string
                    entity.parentcode = parnetcode
            records.append(entity)
        return records

    def finAllPronvinceCity(self, pro, dir):
        """Crawl everything below province *pro* and write <dir>/<name>.xlsx."""
        listall = [pro]
        # Every getter may return None when its page failed to load, so
        # guard each level with `or []` before iterating (the original
        # crashed with TypeError when getCity failed).
        for city in self.getCity(pro) or []:
            listall.append(city)
            for county in self.getCounty(city) or []:
                listall.append(county)
                for town in self.getTown(county) or []:
                    listall.append(town)
                    listall.extend(self.getVillagetr(town) or [])
        df = pd.DataFrame([x.as_dict() for x in listall])
        os.makedirs(dir, exist_ok=True)
        filepath = os.path.join(dir, pro.areaname + '.xlsx')
        # The context manager saves and closes the workbook; ExcelWriter.save()
        # was removed in pandas 2.0.
        with pd.ExcelWriter(filepath) as writer:
            df.to_excel(writer, float_format='%.5f')

    def ruanthread(self):
        """Crawl all provinces concurrently, one worker thread per province."""
        provinces = self.getPronvice()
        if provinces is None:
            # The index page itself failed to load; nothing to crawl.
            return
        for province in provinces:
            threading.Thread(target=self.finAllPronvinceCity,
                             args=(province, 'F://areainfo')).start()



if __name__ == '__main__':
    # Launch the crawl (one worker thread per province).
    crawler = china_city()
    crawler.ruanthread()

第一个爬虫程序,感谢交流,评论。

  • 14
    点赞
  • 53
    收藏
    觉得还不错? 一键收藏
  • 7
    评论
评论 7
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值