import requests
import xlsxwriter
import os
from bs4 import BeautifulSoup
import time
import logging
# Send every record from DEBUG upwards to new.log. Mode 'a' appends across
# runs; 'w' would truncate the log each time the script starts.
logging.basicConfig(
    filename='new.log',
    filemode='a',
    level=logging.DEBUG,
    format='%(asctime)s - %(pathname)s[line:%(lineno)d] - %(levelname)s: %(message)s',
)

# Base URL that every scraped page address is appended to.
urlPrefix = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/'

# Shared accumulator: the scraping functions append one row (list) per division.
dataArr = []

# Header row of the exported workbook: division code, division name,
# parent code, parent name, year, level.
fields = [
    '行政区划代码',
    '行政区划名称',
    '所属行政区划代码',
    '所属行政区划名称',
    '年份',
    '级别',
]
# Fetch a web page and parse it
def getHTML(url, encoding, retries=10, timeout=10):
    """Download *url* and parse it with BeautifulSoup, retrying on failure.

    Args:
        url: Address of the page to fetch.
        encoding: Text encoding used to decode the response body.
        retries: Maximum number of attempts (the original hard-coded 10).
        timeout: Seconds to wait for the server on each attempt. Without a
            timeout a stalled connection would block forever and the retry
            loop would never get a chance to run.

    Returns:
        A BeautifulSoup document (lxml parser) for the first HTTP 200
        response, or -1 when every attempt failed. Callers compare the
        result against -1, so that sentinel is kept.
    """
    for attempt in range(1, retries + 1):
        print('第%s次请求:' % attempt, url)
        try:
            response = requests.get(url, timeout=timeout)
        except requests.exceptions.RequestException:
            # Connection errors and timeouts are treated like a bad status.
            response = None

        if response is not None and response.status_code == 200:
            response.encoding = encoding
            return BeautifulSoup(response.text, 'lxml')

        # Linearly growing pause before the next attempt (2s, 3s, ...);
        # no point in sleeping once the attempts are used up.
        if attempt < retries:
            time.sleep(attempt + 1)

    # All attempts failed: most likely a network problem or a blocked IP.
    return -1
# Export the collected rows to an Excel file
def write_data_to_excel(fields, fileName, data):
    """Write a header row plus data rows to ./xlsx/<fileName>.xlsx.

    Args:
        fields: Sequence of column titles written to the first row.
        fileName: Workbook name without directory or extension.
        data: Iterable of row sequences, written below the header in order.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs('./xlsx', exist_ok=True)
    file_name = "./xlsx/" + fileName + ".xlsx"

    # The context manager closes (and thereby saves) the workbook even when
    # writing a row raises.
    with xlsxwriter.Workbook(file_name) as workbook:
        worksheet = workbook.add_worksheet('sheet1')
        # Rows and columns are zero-indexed here: (0, 0) is cell A1.
        worksheet.write_row(0, 0, fields)
        for row_index, row_data in enumerate(data, start=1):
            worksheet.write_row(row_index, 0, row_data)
# Walk the province-level data
def getProvincetr(provinceName='新疆维吾尔自治区'):
    """Scrape one province (or all of them) and export each to its own workbook.

    Args:
        provinceName: Link text of the province to scrape. The default keeps
            the original single-province behaviour, which the original
            comment explains as a way to avoid being rate limited. Pass None
            to scrape every province listed on the index page.

    For each selected province the shared ``dataArr`` is emptied, filled by
    the nested city/county/town/village scrapers and then written to an
    .xlsx file named after the province.
    """
    indexUrl = urlPrefix + 'index.html'
    prov = getHTML(indexUrl, 'GBK')
    if prov == -1:
        print('省级数据请求失败')
        logging.error('省级数据请求失败:' + indexUrl)
        return

    for item in prov.select('tr.provincetr a'):
        provincetrName = item.text  # division name
        if provinceName is not None and provincetrName != provinceName:
            continue

        print("**********************[" + provincetrName + "]抓取开始**********************")
        urlSuffix = item.get('href')  # relative address of the province page
        provincetrCode = urlSuffix.split('.')[0]  # division code

        # Start from an empty accumulator so this workbook does not also
        # contain rows left over from a previous province or earlier call.
        dataArr.clear()

        provArr = [provincetrCode, provincetrName, "", "", "2020", "1"]
        print(provArr)
        dataArr.append(provArr)

        # Collect all lower levels, then export this province.
        getCity(urlSuffix, provincetrCode, provincetrName)
        write_data_to_excel(fields, provincetrName, dataArr)
        print("**********************[" + provincetrName + "]抓取结束**********************")
# Walk the city-level data. urlSuffix: URL suffix; provincetrCode: province code; provincetrName: province name
def getCity(urlSuffix, provincetrCode, provincetrName):
    """Record every city row of a province page and descend into its counties.

    Args:
        urlSuffix: Address of the province page, relative to ``urlPrefix``.
        provincetrCode: Code of the parent province.
        provincetrName: Name of the parent province.

    Each ``tr.citytr`` row is appended to ``dataArr`` as a level "2" record.
    Rows that contain a link are followed via ``getCounty``.
    """
    pageUrl = urlPrefix + urlSuffix
    page = getHTML(pageUrl, 'GBK')
    if page == -1:
        print('市级数据请求失败')
        logging.error('市级数据请求失败:' + pageUrl)
        return

    for row in page.select('tr.citytr'):
        hasLink = bool(row.select('a'))
        cells = row.select('td')
        code = cells[0].text  # division code
        name = cells[1].text  # division name
        # Resolve the link before recording the row, as the original did.
        href = cells[1].a.get('href') if hasLink else None

        record = [code, name, provincetrCode, provincetrName, "2020", "2"]
        print(record)
        dataArr.append(record)

        if hasLink:
            getCounty(href, code, name)
# Walk the county-level data. citySuffix: URL suffix; cityCode: city code; cityName: city name
def getCounty(citySuffix, cityCode, cityName):
    """Record every county row of a city page and descend into its towns.

    Args:
        citySuffix: Address of the city page, relative to ``urlPrefix``.
        cityCode: Code of the parent city.
        cityName: Name of the parent city.

    Each ``tr.countytr`` row is appended to ``dataArr`` as a level "3"
    record. Rows that contain a link are followed via ``getTown``.
    """
    pageUrl = urlPrefix + citySuffix
    page = getHTML(pageUrl, 'GBK')
    if page == -1:
        print('区县级数据请求失败')
        logging.error('区县级数据请求失败:' + pageUrl)
        return

    # Iterate over the county rows.
    for row in page.select('tr.countytr'):
        hasLink = bool(row.select('a'))
        cells = row.select('td')
        code = cells[0].text  # division code
        name = cells[1].text  # division name
        # Resolve the link before recording the row, as the original did.
        href = cells[1].a.get('href') if hasLink else None

        record = [code, name, cityCode, cityName, "2020", "3"]
        print(record)
        dataArr.append(record)

        if hasLink:
            getTown(href, code, name)
# Walk the town-level data. countySuffix: URL suffix; countyCode: county code; countyName: county name
def getTown(countySuffix, countyCode, countyName):
    """Record every town row of a county page and descend into its villages.

    Args:
        countySuffix: Address of the county page as found in the parent
            page's link; it is prefixed with the first two characters of
            ``countyCode`` to build the full URL.
        countyCode: Code of the parent county.
        countyName: Name of the parent county.

    Each ``tr.towntr`` row is appended to ``dataArr`` as a level "4" record.
    Rows that contain a link are followed via ``getVillage``.
    """
    pageUrl = urlPrefix + countyCode[0:2] + '/' + countySuffix
    page = getHTML(pageUrl, 'GBK')
    if page == -1:
        print('乡镇级数据请求失败')
        logging.error('乡镇级数据请求失败:' + pageUrl)
        return

    # Iterate over the town rows.
    for row in page.select('tr.towntr'):
        hasLink = bool(row.select('a'))
        cells = row.select('td')
        code = cells[0].text  # division code
        name = cells[1].text  # division name
        # Resolve the link before recording the row, as the original did.
        href = cells[1].a.get('href') if hasLink else None

        dataArr.append([code, name, countyCode, countyName, "2020", "4"])

        if hasLink:
            getVillage(href, code, name)
# Walk the village-level data. townSuffix: URL suffix; townCode: town code; townName: town name
def getVillage(townSuffix, townCode, townName):
    """Record every village row of a town page (the deepest level scraped).

    Args:
        townSuffix: Address of the town page as found in the parent page's
            link; it is prefixed with characters 0-2 and 2-4 of ``townCode``
            (as two path segments) to build the full URL.
        townCode: Code of the parent town.
        townName: Name of the parent town.

    Each ``tr.villagetr`` row without a link is appended to ``dataArr`` as a
    level "5" record. Rows that contain a link are ignored.
    """
    pageUrl = urlPrefix + townCode[0:2] + '/' + townCode[2:4] + '/' + townSuffix
    page = getHTML(pageUrl, 'GBK')
    if page == -1:
        print('街道级数据请求失败')
        logging.error('街道级数据请求失败:' + pageUrl)
        return

    # Iterate over the village rows.
    for row in page.select('tr.villagetr'):
        if row.select('a'):
            continue
        cells = row.select('td')
        code = cells[0].text  # division code
        name = cells[2].text  # division name is read from the third cell here
        dataArr.append([code, name, townCode, townName, "2020", "5"])
if __name__ == "__main__":
    # Script entry point: start the scrape at the province level.
    getProvincetr()