python爬虫爬取行政区划代码

最新推荐文章于 2024-06-17 15:07:45 发布

Pompey_hpy

最新推荐文章于 2024-06-17 15:07:45 发布

阅读量1.6k

点赞数 5

分类专栏： python 文章标签： python 爬虫 大数据 行政区划

本文链接：https://blog.csdn.net/qq_34651764/article/details/111559974

版权

python 专栏收录该内容

1 篇文章 0 订阅

订阅专栏

import requests
import xlsxwriter
import os
from bs4 import BeautifulSoup
import time
import logging
# Configure the root logger: records are written to the file 'new.log'.
logging.basicConfig(level=logging.DEBUG,  # minimum level that gets recorded
                    filename='new.log',
                    filemode='a',  # 'a' appends to the log file; 'w' would overwrite it on every run ('a' is also the default)
                    format='%(asctime)s - %(pathname)s[line:%(lineno)d] - %(levelname)s: %(message)s'  # log record layout
                    )

# Base URL that every page request is built from
urlPrefix = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/'
# Rows collected during the crawl; each row follows the column order of `fields`
dataArr = []
# Column headers: division code, division name, parent code, parent name, year, level
fields = ['行政区划代码', '行政区划名称', '所属行政区划代码', '所属行政区划名称', '年份', '级别']


# Fetch a page and parse it
def getHTML(url, encoding, max_attempts=10, timeout=10):
    """Download *url* and return it parsed with BeautifulSoup ('lxml' parser).

    The request is retried up to *max_attempts* times. A retry is triggered
    by a ``requests`` exception (connection error, timeout, ...) or by any
    HTTP status other than 200. Between attempts the function sleeps for a
    linearly growing number of seconds (1s, 2s, 3s, ...).

    Args:
        url: Absolute URL of the page to fetch.
        encoding: Text encoding used to decode the response body.
        max_attempts: Maximum number of requests to send before giving up.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.

    Returns:
        The parsed ``BeautifulSoup`` document, or ``-1`` when every attempt
        failed (most likely a network problem or a blocked IP).
    """
    for attempt in range(1, max_attempts + 1):
        print('第%s次请求:' % attempt, url)
        try:
            # Without a timeout a stalled connection would block forever.
            r = requests.get(url, timeout=timeout)
            r.encoding = encoding
            if r.status_code == 200:
                return BeautifulSoup(r.text, 'lxml')
            logging.warning('第%s次请求返回状态码 %s: %s', attempt, r.status_code, url)
        except requests.exceptions.RequestException as exc:
            logging.warning('第%s次请求异常: %s (%s)', attempt, url, exc)
        # Back off before the next attempt; no point sleeping after the last.
        if attempt < max_attempts:
            time.sleep(1 * attempt)
    return -1  # every attempt failed


# Export rows to an Excel workbook
def write_data_to_excel(fields, fileName, data):
    """Write a header row plus data rows to ``./xlsx/<fileName>.xlsx``.

    Args:
        fields: Sequence of column titles, written to row 1.
        fileName: Workbook name without directory or extension.
        data: Iterable of rows; each row is a sequence of cell values and is
            written starting at column A, from row 2 downwards.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs('./xlsx', exist_ok=True)

    file_name = "./xlsx/" + fileName + ".xlsx"
    # The context manager closes the workbook even if a write raises.
    with xlsxwriter.Workbook(file_name) as workbook:
        worksheet = workbook.add_worksheet('sheet1')
        worksheet.write_row('A1', fields)
        for row_number, row_data in enumerate(data, start=2):
            worksheet.write_row('A' + str(row_number), row_data)

# Walk the province-level index page
def getProvincetr(targetProvince='新疆维吾尔自治区'):
    """Crawl one province (or all of them) and export each to its own workbook.

    The index page is fetched, and for every matching province link the
    province row is recorded, the lower levels are crawled through
    ``getCity``, and the collected rows are exported with
    ``write_data_to_excel`` under the province's name.

    Args:
        targetProvince: Exact link text of the province to crawl. The site
            throttles frequent crawling, so the default handles a single
            province per run. Pass ``None`` to crawl every province.
    """
    prov = getHTML(urlPrefix + 'index.html', 'GBK')
    if prov == -1:
        print('省级数据请求失败')
        logging.error('省级数据请求失败:'+urlPrefix + 'index.html')
        return

    for item in prov.select('tr.provincetr a'):
        if targetProvince is not None and item.text != targetProvince:
            continue
        print("**********************["+item.text+"]抓取开始**********************")
        urlSuffix = item.get('href')  # relative URL of the province page
        provincetrCode = urlSuffix.split('.')[0]  # division code = href up to the first '.'
        provincetrName = item.text  # division name
        # Start from an empty buffer so this workbook only contains rows of
        # the current province, not those left over from an earlier one.
        dataArr.clear()
        provArr = [provincetrCode, provincetrName, "", "", "2020", "1"]
        print(provArr)
        dataArr.append(provArr)
        # Crawl the levels below this province
        getCity(urlSuffix, provincetrCode, provincetrName)
        # Export this province to its own Excel file
        write_data_to_excel(fields, provincetrName, dataArr)
        print("**********************[" + item.text + "]抓取结束**********************")


# Walk the city-level rows of a province page.
#   urlSuffix: relative URL of the page; provincetrCode: province code; provincetrName: province name
def getCity(urlSuffix, provincetrCode, provincetrName):
    """Record every ``tr.citytr`` row and descend into rows that carry a link.

    Each row is appended to ``dataArr`` as
    ``[code, name, provincetrCode, provincetrName, "2020", "2"]``. Rows that
    contain an ``<a>`` element are followed through ``getCounty``.
    """
    page = getHTML(urlPrefix + urlSuffix, 'GBK')
    if page == -1:
        print('市级数据请求失败')
        logging.error('市级数据请求失败:' + urlPrefix + urlSuffix)
        return

    for tr in page.select('tr.citytr'):
        linked = len(tr.select('a')) > 0
        cells = tr.select('td')
        cityCode = cells[0].text  # division code
        cityName = cells[1].text  # division name
        # Only linked rows have a sub-page to follow.
        citySuffix = cells[1].a.get('href') if linked else None
        cityArr = [cityCode, cityName, provincetrCode, provincetrName, "2020", "2"]
        print(cityArr)
        dataArr.append(cityArr)
        if linked:
            getCounty(citySuffix, cityCode, cityName)


# Walk the county-level rows of a city page.
#   citySuffix: relative URL of the page; cityCode: city code; cityName: city name
def getCounty(citySuffix, cityCode, cityName):
    """Record every ``tr.countytr`` row and descend into rows that carry a link.

    Each row is appended to ``dataArr`` as
    ``[code, name, cityCode, cityName, "2020", "3"]``. Rows that contain an
    ``<a>`` element are followed through ``getTown``.
    """
    soup = getHTML(urlPrefix + citySuffix, 'GBK')
    if soup == -1:
        print('区县级数据请求失败')
        logging.error('区县级数据请求失败:' + urlPrefix + citySuffix)
        return

    for row in soup.select('tr.countytr'):
        has_link = bool(len(row.select('a')))
        tds = row.select('td')
        countyCode = tds[0].text  # division code
        countyName = tds[1].text  # division name
        countySuffix = None
        if has_link:
            countySuffix = tds[1].a.get('href')  # relative URL of the sub-page
        countyArr = [countyCode, countyName, cityCode, cityName, "2020", "3"]
        print(countyArr)
        dataArr.append(countyArr)
        if has_link:
            # Crawl the towns below this county
            getTown(countySuffix, countyCode, countyName)


# Walk the town-level rows of a county page.
#   countySuffix: relative URL of the page; countyCode: county code; countyName: county name
def getTown(countySuffix, countyCode, countyName):
    """Record every ``tr.towntr`` row and descend into rows that carry a link.

    The page URL is ``urlPrefix`` + first two characters of ``countyCode`` +
    '/' + ``countySuffix``. Each row is appended to ``dataArr`` as
    ``[code, name, countyCode, countyName, "2020", "4"]``. Rows that contain
    an ``<a>`` element are followed through ``getVillage``.
    """
    townUrl = urlPrefix + countyCode[0:2] + '/' + countySuffix
    doc = getHTML(townUrl, 'GBK')
    if doc == -1:
        print('乡镇级数据请求失败')
        logging.error('乡镇级数据请求失败:' + townUrl)
        return

    for entry in doc.select('tr.towntr'):
        columns = entry.select('td')
        if entry.select('a'):
            townCode = columns[0].text  # division code
            townName = columns[1].text  # division name
            townSuffix = columns[1].a.get('href')  # relative URL of the sub-page
            dataArr.append([townCode, townName, countyCode, countyName, "2020", "4"])
            # Crawl the villages below this town
            getVillage(townSuffix, townCode, townName)
        else:
            dataArr.append([columns[0].text, columns[1].text, countyCode, countyName, "2020", "4"])

# Walk the village-level rows of a town page.
#   townSuffix: relative URL of the page; townCode: town code; townName: town name
def getVillage(townSuffix, townCode, townName):
    """Record every ``tr.villagetr`` row that has no link.

    The page URL is ``urlPrefix`` + ``townCode[0:2]`` + '/' +
    ``townCode[2:4]`` + '/' + ``townSuffix``. For each row without an
    ``<a>`` element, the first cell is taken as the code and the third cell
    as the name, and ``[code, name, townCode, townName, "2020", "5"]`` is
    appended to ``dataArr``. Rows containing a link are skipped.
    """
    villageUrl = urlPrefix + townCode[0:2] + '/' + townCode[2:4] + '/' + townSuffix
    doc = getHTML(villageUrl, 'GBK')
    if doc == -1:
        print('街道级数据请求失败')
        logging.error('街道级数据请求失败:' + villageUrl)
        return

    for tr in doc.select('tr.villagetr'):
        if tr.select('a'):
            continue
        tds = tr.select('td')
        villageCode = tds[0].text  # division code (first cell)
        villageName = tds[2].text  # division name (third cell)
        dataArr.append([villageCode, villageName, townCode, townName, "2020", "5"])


# Start the crawl at the province level only when run as a script (not on import).
if __name__ == "__main__":
    getProvincetr()

Pompey_hpy

关注

5
点赞
踩
9

收藏

觉得还不错? 一键收藏
3
评论
python爬虫爬取行政区划代码

import requests import xlsxwriter import os from bs4 import BeautifulSoup import time import logging logging.basicConfig(level=logging.DEBUG, # 控制台打印的日志级别 filename='new.log', filemode='a', # 模式，有w和a，w就是写模式，每次都会重新写…
复制链接

扫一扫