from lxml import etree
import urllib
import urllib.request
import xlwt
import pandas as pd
from pyecharts import Geo
import matplotlib.pyplot as plt
import matplotlib as mpl
def getpage(url, timeout=30):
    """Download *url* and return its body decoded as GBK text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait on blocking network operations; forwarded
            to ``urllib.request.urlopen``.

    Returns:
        The response body decoded with the ``gbk`` codec.

    Raises:
        urllib.error.URLError: If the request cannot be completed.
        UnicodeDecodeError: If the body is not valid GBK.
    """
    req = urllib.request.Request(url)
    # Send a browser-like User-Agent (substitute your own if needed).
    req.add_header('User-Agent', "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36")
    # The context manager closes the connection even if read/decode raises.
    with urllib.request.urlopen(req, timeout=timeout) as response:
        return response.read().decode("gbk")
def getdata(data):
    """Extract the page's list text and group it into rows of five items.

    Args:
        data: HTML source of the ranking page.

    Returns:
        A list of lists. Each inner list holds up to five consecutive text
        nodes (the last one may be shorter if the total is not a multiple
        of five). Presumably the five items are order, city, province, AQI
        and quality, matching the header written by ``writeExcel``.
    """
    tree = etree.HTML(data)
    # XPath: every text node beneath an <li> whose id is not "tr-fixed".
    texts = tree.xpath('//li[@id!="tr-fixed"]//text()')
    # Step through the flat list five entries at a time.
    return [texts[pos:pos + 5] for pos in range(0, len(texts), 5)]
# Write the scraped rows to an Excel workbook
def writeExcel(AQI, path="E:\\python\\aqi.xls"):
    """Save the scraped rows to an ``.xls`` workbook.

    The sheet is named 'The AQI'. Row 0 holds the column titles and each
    entry of *AQI* is written to the following rows, one cell per item.

    Args:
        AQI: Iterable of rows, each an iterable of cell values.
        path: Destination file. Defaults to the location the script's main
            section reads back from.
    """
    workbook = xlwt.Workbook()
    sheet = workbook.add_sheet('The AQI', cell_overwrite_ok=True)
    header = ['order', 'city', 'province', 'AQI', 'quality']
    for col, title in enumerate(header):
        sheet.write(0, col, title)
    # Data rows start at 1 because row 0 is the header.
    for row_no, record in enumerate(AQI, start=1):
        for col, cell in enumerate(record):
            sheet.write(row_no, col, cell)
    workbook.save(path)
if __name__ == "__main__":
    # Scrape the ranking page and persist it as a spreadsheet.
    url = "http://tianqi.2345.com/air-rank.htm"
    data = getpage(url)
    AQI = getdata(data)
    writeExcel(AQI)

    # Read the spreadsheet back so pandas can parse the columns.
    fbook = pd.read_excel("E:\\python\\aqi.xls", 0)
    city = [str(each) for each in fbook['city']]
    value = list(fbook['AQI'])

    # Position of the first row rated "中度污染". The row position is used
    # (not the 'order' column) because it indexes the city/value lists.
    # None means no such row exists; the original code raised NameError
    # in that case.
    # NOTE(review): plotting everything from this row onward assumes the
    # sheet is ordered from best to worst air quality -- confirm.
    index = next(
        (pos for pos, quality in enumerate(fbook['quality'])
         if quality == "中度污染"),
        None,
    )

    # Nationwide scatter map of the AQI values.
    geo = Geo("全国空气质量指数", "Data from AQI", title_color="#fff", width=1000, height=600,
              background_color='#404a59')
    geo.add("空气质量指数", city, value, visual_range=[1, 60], maptype='china', type='effectScatter',
            visual_text_color="#fff", effect_scale=5, symbol_size=15, is_visualmap=True, is_random=True,
            is_roam=False)
    geo.render(path="全国空气质量指数.html")

    if index is None:
        print("No city is rated '中度污染'; skipping the bar chart.")
    else:
        # Configure fonts and tick style before any artist is created.
        mpl.rcParams["font.sans-serif"] = ["KaiTi"]
        mpl.rcParams["axes.unicode_minus"] = False
        plt.rcParams['xtick.direction'] = 'in'
        plt.rcParams['ytick.direction'] = 'in'

        fig = plt.figure()
        font = {"size": 15}
        ax = fig.add_subplot(111)
        # Thicken all four axis borders.
        for side in ('bottom', 'left', 'right', 'top'):
            ax.spines[side].set_linewidth(2)

        rects = ax.bar(range(len(city) - index), value[index:], width=0.08,
                       tick_label=city[index:], color='m')
        plt.tick_params(labelsize=15)
        # Label each bar with its value, just above the top of the bar.
        for rect in rects:
            height = rect.get_height()
            ax.text(rect.get_width() / 2.0 + rect.get_x() - 0.04, height, height)
        plt.xlabel('城市', font)
        plt.ylabel('空气质量指数', font)
        plt.title("中度污染、重度污染、严重污染城市", font)
        plt.show()
# 爬取到的大数据如下所示: