天气数据的爬虫获取与建模分析

import requests # 导入request库
from bs4 import BeautifulSoup 
import pandas as pd 
url = "https://lishi.tianqi.com/beijing/202305.html"

# Send a browser-like User-Agent so the site does not reject the request as a bot.
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36 Edg/117.0.2045.36'}
# Fetch the monthly history page; the timeout keeps the script from hanging
# forever if the server never answers.
html = requests.get(url, headers=headers, timeout=30)
html.encoding = 'utf-8'  # decode the response body as UTF-8
print(html.text)

# Parse the HTML.
soup = BeautifulSoup(html.text, 'lxml')
# Select the <ul class="thrui"> block(s) that hold the daily weather entries.
weather_list = soup.select('ul[class="thrui"]')
print(weather_list)

# Collect one row per day into an initially empty DataFrame.
df_empty = pd.DataFrame(columns=['date', 'maxTem', 'minTem', 'weather', 'wind'])
for weather in weather_list:
    for li in weather.select('li'):
        # Each <div> inside the <li> holds one field of the daily record.
        shuchu = [div1.string for div1 in li.select('div')]
        # Append exactly once, and only when the <li> produced data: the last
        # <li> has no <div>s, and assigning an empty list to a row raises.
        if shuchu:
            df_empty.loc[len(df_empty)] = shuchu
print(df_empty)
# NOTE(review): the file name says 202011 while the URL above is 202305 — confirm.
df_empty.to_excel('01_weather202011.xlsx')  # save as Excel

3. 抓取历史天气数据

from bs4 import BeautifulSoup 
import requests 
import pandas as pd 
import time
def get_url(city, start_year=2011, end_year=2022):
    """Build the list of monthly history-page URLs for a city.

    Args:
        city: City slug used in the URL path, e.g. "beijing".
        start_year: First year to include (default 2011).
        end_year: Last year to include, inclusive (default 2022).

    Returns:
        A list with one URL per month, January to December of every year
        from start_year to end_year, in chronological order. The list is
        empty when end_year is smaller than start_year.
    """
    url_list = []
    for year in range(start_year, end_year + 1):
        for month in range(1, 13):
            # The page name is the year followed by the zero-padded month.
            url_list.append(
                "http://lishi.tianqi.com/{}/{}{:02d}.html".format(city, year, month)
            )
    return url_list
 
def get_weather_month(url, timeout=30):
    """Download one monthly history page and return its daily records.

    Args:
        url: Address of a monthly history page (see get_url).
        timeout: Seconds to wait for the server before requests raises
            instead of blocking indefinitely.

    Returns:
        A DataFrame with the columns date, maxTem, minTem, weather and wind.
        Each row holds the `.string` of every <div> of one <li> found inside
        the page's <ul class="thrui"> block(s); <li> items without any <div>
        are skipped.
    """
    # Send a browser-like User-Agent so the site does not reject the request as a bot.
    headers = {'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36 Edg/117.0.2045.36'}
    df_empty = pd.DataFrame(columns=['date','maxTem','minTem','weather','wind'])
    html = requests.get(url, headers=headers, timeout=timeout)
    html.encoding = 'utf-8'  # decode the response body as UTF-8
    soup = BeautifulSoup(html.text, 'lxml')
    # Select the block(s) that hold the month's daily entries.
    weather_list = soup.select('ul[class="thrui"]')
    for weather in weather_list:
        for li in weather.select('li'):
            shuchu = [div1.string for div1 in li.select('div')]
            # The last <li> has no <div>s; assigning an empty list to a row
            # raises, so skip it.
            if not shuchu:
                continue
            df_empty.loc[len(df_empty)] = shuchu
    return df_empty
if __name__ == '__main__':  # script entry point
    import os  # local import: only needed to prepare the output directory

    city = "beijing"
    output_path = 'demoData/02_all_weather.xlsx'
    # Create the output directory before crawling so the final save cannot
    # fail on a missing folder after all pages have been downloaded.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    all_url = get_url(city)
    monthly_frames = []
    for url in all_url:
        every_month_weather = get_weather_month(url)
        monthly_frames.append(every_month_weather)
        # Short progress line instead of dumping the whole table each month.
        print(url, len(every_month_weather))
        time.sleep(3)  # pause between requests to avoid being blocked by the server

    # Concatenate once at the end; concatenating inside the loop re-copies all
    # previously collected rows on every iteration.
    if monthly_frames:
        all_weather = pd.concat(monthly_frames, ignore_index=True)
    else:
        all_weather = pd.DataFrame(columns=['date','maxTem','minTem', 'weather','wind'])
    print(all_weather)
    all_weather.to_excel(output_path)  # save as Excel

4. 天气数据可视化

4.1 查看数据基本信息

import pandas as pd # 导入pandas库
# Load the scraped history data.
df = pd.read_excel('demoData/02_all_weather.xlsx',sheet_name='Sheet1',index_col=0)
print(df.head(5))
print(df.shape)
print(df.dtypes)
# info() writes its report to stdout itself and returns None, so wrapping it
# in print() would only add a stray "None" line.
df.info()
print(df.isnull().sum())  # number of missing values per column
print(df.describe(include='all'))

4.2 数据格式变换

import pandas as pd #导入pandas库
import numpy as np
# Load the scraped history data.
df = pd.read_excel('demoData/02_all_weather.xlsx',sheet_name='Sheet1',index_col=0)

# 1. Date column: split on whitespace and keep only the first piece.
df_date = df['date'].str.split(expand=True)
df['date'] = df_date.get(0)

# 2. Temperatures as numbers.
# Max temperature: strip the unit sign and convert.
df['maxTem'] = df['maxTem'].str.replace('℃', '', regex=False).astype(np.float64)
# Min temperature: pull the number out with a regular expression. The optional
# leading '-' matters: without it sub-zero values would be read as positive.
df['minTem'] = df['minTem'].str.extract(r'(-?\d+)', expand=False).astype(np.float64)
print(df.head(5))

# 3. Wind column: split on whitespace into direction and speed.
df_wind = df['wind'].str.split(expand=True)
df_wind_split = df_wind.rename(columns={0:'wind_direction',1:'wind_speed'})
print(df_wind_split.head())

# 4. Attach the new wind columns and drop the original combined column.
df2 = pd.concat([df,df_wind_split],axis=1)
df_new = df2.drop('wind',axis=1)
print(df_new.describe())
df_new.to_excel('demoData/03_newDate.xlsx')  # save as Excel

4.3 气温走势的折线图

import pandas as pd
import matplotlib.pyplot as plt
# Load the cleaned data and plot the daily minimum temperature as a line.
df_new = pd.read_excel('demoData/03_newDate.xlsx',sheet_name='Sheet1',index_col=0)
df_new.index = df_new['date']  # use the date as the index (x axis)
df_new['minTem'].plot(figsize=(20, 10))
plt.xlabel('date',fontsize=20)
plt.ylabel('temperature',fontsize=20)
# The data set is scraped for 2011 through 2022, so the title states that range.
plt.title('Min Temperature(2011-2022)',fontsize=20)
plt.tick_params(labelsize=15)
plt.show()

4.4 历年气温对比图

import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
# Load the cleaned data.
df_new = pd.read_excel(r'demoData/03_newDate.xlsx',sheet_name='Sheet1',index_col=0)
# Split each date only at the first '-' so the result has exactly two parts:
# the year and the remaining month-day part. An unlimited split would yield
# year, month and day, leaving just the month in the 'month-day' column.
df_year = df_new['date'].str.split('-', n=1, expand=True)
df_year_month = df_year.rename(columns={0:'year',1:'month-day'})
df2 = pd.concat([df_new,df_year_month],axis=1)
# One row per month-day, one column per year, values = max temperature.
df3 = pd.pivot_table(df2,values='maxTem',index='month-day',columns='year')
df3.info()  # info() prints by itself and returns None
df3.to_excel(r'demoData/04_yearData.xlsx')  # save as Excel
# Font file used for the Chinese axis labels and title.
myfont=FontProperties(fname = "C:/Windows/Fonts/STFANGSO.TTF",size=25)
df3.plot(figsize=(20, 10),fontsize=15)
plt.xlabel('日期（月-日）',fontproperties=myfont)
plt.ylabel('气温（℃）',fontproperties=myfont)
plt.title('最高气温对比图(2011-2022)',fontproperties=myfont, fontsize=20)
plt.legend(fontsize=15,markerscale=15)
plt.tick_params(labelsize=20)
plt.show()

4.5 天气情况的柱状图

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Load the cleaned data and index it by date.
df_new = pd.read_excel(
    r'demoData/03_newDate.xlsx',
    sheet_name='Sheet1',
    index_col=0,
)
df_new.index = df_new['date']

# Keep the rows of 2020 and count how often each weather type occurs.
df_2020 = df_new.loc["2020-01-01":"2020-12-31"]
df_weather_2020 = df_2020.weather.value_counts()

# Global seaborn settings: font scale and the SimHei font.
sns.set(font_scale=2, font='SimHei')
plt.figure(figsize=(20, 10))
# Bar chart of the counts, drawn in blue.
df_weather_2020.plot(kind='bar', color='blue')
plt.ylabel('天数', fontsize=20)  # y-axis label
# Rotate the tick labels by 90 degrees and shrink them so they do not overlap.
plt.xticks(rotation=90, fontsize=14)
plt.show()

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Load the cleaned data and index it by date.
df_new = pd.read_excel(
    r'demoData/03_newDate.xlsx',
    sheet_name='Sheet1',
    index_col=0,
)
df_new.index = df_new['date']

# Keep the rows of 2020 and count how often each weather type occurs.
df_2020 = df_new.loc["2020-01-01":"2020-12-31"]
df_weather_2020 = df_2020.weather.value_counts()

# Stack the characters of every tick label vertically (instead of rotating the
# text) by putting a line break after each character.
label_list = df_weather_2020.index.tolist()
label = [''.join(char + "\n" for char in name) for name in label_list]

# Global seaborn settings: font scale, SimHei font and a white background.
sns.set(font_scale=3.5, font='SimHei', style='white')
plt.figure(figsize=(40, 20))
plt.bar(label, df_weather_2020.values, color='black')
# y-axis label written top-to-bottom: one character per line, text unrotated.
plt.ylabel('天\n数', rotation=360)
plt.show()

4.6 风向占比的饼图

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Load the cleaned data.
df_new = pd.read_excel(
    'demoData/03_newDate.xlsx',
    sheet_name='Sheet1',
    index_col=0,
)

# Number of days per wind direction; the index supplies the slice labels.
sizes = df_new.wind_direction.value_counts()
labels = sizes.index

sns.set(font_scale=3, font='SimHei')  # font scale and the SimHei font
plt.style.use('grayscale')  # draw in grayscale
plt.figure(figsize=(25, 16))
# autopct prints each share as a percentage; textprops sets the text colour.
plt.pie(
    sizes,
    labels=labels,
    autopct='%1.1f%%',
    textprops={'color': "w"},
)
# Legend: anchored at (0.8, 1.1), font size 20, entries laid out in two columns.
plt.legend(bbox_to_anchor=(0.8, 1.1), fontsize=20, ncol=2)
plt.axis('equal')
plt.show()

4.7 使用windrose库绘制风玫瑰图

作为气象专业常见的图表之一，风玫瑰图主要用来统计一段时间内风向、风速发生的频率，其花瓣越长表示该风向的频率越高，单个花瓣上的不同颜色表示风速的分布情况。

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from windrose import plot_windrose
# Load the cleaned data and index it by date.
df_all = pd.read_excel(r'demoData/03_newDate.xlsx',sheet_name='Sheet1',index_col=0)
df_all.index = df_all['date']
# Only 2018-2020 is processed here (the original note says wind directions
# are labelled inconsistently in the earlier years).
data = df_all["2018-01-01":"2020-12-31"]

# Numeric encoding of the textual wind direction (degrees) and wind force.
wind_value_mapping = {
    '无持续风向':0,'东北风':45,'东南风':135,'东南偏东风':135,'西南风':225,'西北风':315,
    '西北偏北风':360,'北风':360,'东风':90,'南风':180,'西风':270,
    '微风':0,'1级':1,'2级':2,'小于3级':2.5,'3级':3,'3～4级':3.5,'4级':4,
    '5级':5,'6级':6,'7级':7,
}
# replace() returns a new frame; this avoids mutating a column slice of `data`
# in place (which triggers pandas' chained-assignment warning).
df2 = data[['wind_speed','wind_direction']].replace(wind_value_mapping)
df3 = df2.astype(float)
# Rename the columns to the names the windrose calls below use.
df = df3.rename(columns={'wind_direction':'direction','wind_speed':'speed'})
df.info()  # info() prints by itself and returns None

# One data set per year for the three wind roses.
df_2018 = df["2018-01-01":"2018-12-31"]
df_2019 = df["2019-01-01":"2019-12-31"]
df_2020 = df["2020-01-01":"2020-12-31"]

# Three wind roses side by side, one per year.
fg = plt.figure(figsize=(30, 10))
ax1 = fg.add_subplot(1,3,1, projection='windrose')
ax1.bar(df_2018.direction, df_2018.speed, normed=True, opening=0.8, edgecolor='white')
ax1.set_legend(bbox_to_anchor=(-0.4,0),fontsize=20)
# NOTE(review): this second call may rebuild the legend and discard the
# anchor/font size set just above — confirm which placement is intended.
ax1.set_legend(loc="lower left")
ax1.set_title('2018', pad = 20)
ax2 = fg.add_subplot(1,3,2, projection='windrose')
ax2.bar(df_2019.direction, df_2019.speed, normed=True, opening=0.8, edgecolor='white')
ax2.set_title('2019', pad = 20)
ax3 = fg.add_subplot(1,3,3, projection='windrose')
ax3.bar(df_2020.direction, df_2020.speed, normed=True, opening=0.8, edgecolor='white')
ax3.set_title('2020', pad = 20)
plt.show()

# Stand-alone wind rose for 2018.
plot_windrose(df_2018, kind='bar')

# Wind-speed distribution ('pdf' plot) for 2018 with bins from 0.1 in steps of 1.
plot_windrose(df_2018, kind='pdf', bins=np.arange(0.1,6,1))

5. 机器学习在天气预报中的应用

天气预报的制作和发布是非常复杂的过程。现行的天气预报发布一般包括数据收集、数据分析、预报会商、产品发布等环节，其中涉及大气运动方程计算、经验总结等内容，这属于另一个领域，本案例不做过多介绍。

本部分将尝试使用机器学习中的回归算法，对北京的次日最高气温进行预测，并评估模型的性能。

5.1 线性回归基本概念

可以将一元线性回归理解为：给定自变量 $x$，对其与因变量 $y$ 之间的关系进行建模。一元线性回归方程可以定义为：$y = ax + b$

一元线性回归可以理解为通过已知的样本点找到最佳拟合直线的过程。要最佳地拟合已知数据，可以采用最小二乘法等方法。

多元线性回归与一元线性回归类似，只是多元线性回归需要添加预测变量的数量及其相应的系数。

5.2 使用一元线性回归预测气温

本部分主要依据前面处理后的数据集进行建模分析，包括：

1) 生成次日最高气温数据，即预测气温的实际值（因变量 y ）

2) 观察当日最高气温（ x ）与次日最高气温（ y ）之间的关系

3) 利用 sklearn 库实现线性回归

1) 生成次日最高气温

import pandas as pd
# 1. Fill the gaps in the calendar.
df_new = pd.read_excel(
    'demoData/03_newDate.xlsx',
    sheet_name='Sheet1',
    index_col=0,
)
# Turn the date strings into real datetimes and use them as the index.
df_new.index = pd.to_datetime(df_new.date, format="%Y-%m-%d")
# Every calendar day from 2011-01-01 to 2022-12-31.
new_time = pd.date_range('2011-01-01', '2022-12-31')
# Re-index on the full calendar: days absent from the data become NaN rows.
df_date_new = df_new.reindex(new_time)
df_date_new.to_excel('demoData/05_missDate.xlsx')

# 2. Next-day max temperature (the value to be predicted).
df = df_date_new.reset_index()  # the dates move into an 'index' column
# Row i receives the max temperature of row i + 1; the last day has no
# following day and therefore stays NaN.
df['maxPre'] = df['maxTem'].shift(-1)
df.to_excel('demoData/06_preDate.xlsx')

2) 观察当日最高气温与次日最高气温之间的关系

import pandas as pd
import matplotlib.pyplot as plt
# 1. Data preparation
df_pre = pd.read_excel(
    'demoData/06_preDate.xlsx',
    sheet_name='Sheet1',
    index_col=0,
)
df_pre.index = df_pre['index']  # use the date column as the index
# Keep the two columns needed below and drop rows where either is missing.
df0 = df_pre[['maxTem', 'maxPre']]
df = df0.dropna()

# 2. Plot: one panel per year on a 4 x 3 grid
fig = plt.figure(figsize=(12, 30))
year = range(2011, 2023)  # 2011 through 2022
for j, i in enumerate(year, start=1):
    one_year = df.loc[str(i)]  # all rows of year i
    plt.subplot(4, 3, j)
    plt.plot(one_year.maxTem, one_year.maxPre, 'o')
    plt.xlabel('max Temperature')
    plt.title(str(i))
plt.subplots_adjust(wspace=0.3, hspace=0.4)  # spacing between the panels
plt.savefig('观察当日最高气温与次日最高气温之间的关系.png', bbox_inches='tight')
plt.show()

3) 一元线性回归程序的实现

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
import statsmodels.formula.api as smf

# 1. Data preparation
df_pre = pd.read_excel('demoData/06_preDate.xlsx', sheet_name='Sheet1', index_col=0)
df_pre.index = df_pre['index']  # use the date column as the index
df = df_pre[['maxTem', 'maxPre']]

# 2. Training set (2011-2020) and test set (2021-2022), rows with gaps removed
df_train = df["2011-01-01":"2020-12-31"].dropna()
df_test = df["2021-01-01":"2022-12-31"].dropna()
X_train = df_train[['maxTem']]
Y_train = df_train[['maxPre']]
X_test = df_test[['maxTem']]
# Independent copy: a prediction column is added to Y_test in step 6, and
# writing into a slice of df_test triggers pandas' chained-assignment warning.
Y_test = df_test[['maxPre']].copy()

# 3. Fit the model
slr = LinearRegression()
slr.fit(X_train, Y_train)
# Print the fitted line.
print("The linear model is: y = {:.5} + {:.5}x".format(slr.intercept_[0], slr.coef_[0][0]))

# 4. Scatter plot of the training data with the fitted line
sns.set(font_scale=2, font='SimHei')
fig = plt.figure(figsize=(14, 14))
plt.scatter(X_train, Y_train, c='blue')
plt.plot(X_train, slr.predict(X_train), c='red', linewidth=3)
plt.xlabel("当日最高气温")
plt.ylabel("第二日最高气温")
plt.savefig('散点图以及拟合直线.png',bbox_inches = 'tight')
plt.show()

# 5. OLS summaries
formula = "maxPre ~ maxTem"
lm_train = smf.ols(formula, df_train).fit()
print(lm_train.summary())
# NOTE(review): this fits a separate OLS model on the test data; it is not an
# evaluation of `slr` (trained on 2011-2020) against the test set.
lm_test = smf.ols(formula, df_test).fit()
print(lm_test.summary())

# 6. Predict on the test set and plot actual vs. predicted values
pre_X_test = slr.predict(X_test)
# predict() returns one column per target (Y_train has a single column);
# flatten it to one value per row before storing it as a column.
Y_test['pre'] = pre_X_test.ravel()
Y_test.plot(subplots=True, figsize=(20, 15))
plt.legend(loc='best')
plt.xlabel('日期')
plt.ylabel('气温')
plt.savefig('将测试数据带入模型，进行预测，并可视化.png',bbox_inches = 'tight')
plt.show()

5.3 使用多元线性回归预测气温

1) 分析各个维度之间的关系，进而确定自变量因子

2) 利用选定的自变量进行多元线性回归建模

1) 分析天气数据各个维度之间的关系

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import warnings
warnings.filterwarnings("ignore")  # silence all warnings for this cell

# 1. Data preparation
df_pre = pd.read_excel('demoData/06_preDate.xlsx', sheet_name='Sheet1', index_col=0)
df_pre.index = df_pre['index']  # use the date column as the index

# Independent copy of the 2017-2022 rows: columns are overwritten below, and
# assigning into a slice of df_pre is a chained assignment.
df1 = df_pre["2017-01-01":"2022-12-31"].copy()

# Wind direction as degrees, following the wind-rose convention. '东南风' and
# '南风' are included so that those days are not turned into NaN by map() and
# then silently removed by dropna().
wind_direction_mapping = {
    '无持续风向': 0, '微风': 0, '东北风': 45, '东南风': 135, '东南偏东风': 135,
    '南风': 180, '西南风': 225, '西北风': 315, '西北偏北风': 315, '北风': 0,
    '东风': 90, '西风': 270
}
df1['wind_direction'] = df1['wind_direction'].map(wind_direction_mapping)

# Wind force as a number. '3级' and '7级' are included for the same reason.
wind_speed_mapping = {
    '微风': 0.5, '0级': 0, '1级': 1, '2级': 2, '小于3级': 2.5, '3级': 3,
    '3～4级': 3.5, '4级': 4, '4～5级': 4.5, '5级': 5, '6级': 6, '7级': 7
}
df1['wind_speed'] = df1['wind_speed'].map(wind_speed_mapping)

# Rows with any missing value (including labels not covered by the mappings)
# are removed.
df2 = df1.dropna()

df3 = df2[['maxTem', 'minTem', 'wind_direction', 'wind_speed', 'maxPre']].astype(float)

# 2. Pairwise scatter plots, with Chinese display names for the columns
df3.rename(columns={
    "maxTem": "最高气温（℃）",
    "minTem": "最低气温(℃)",
    "wind_direction": "风向",
    "wind_speed": "风速等级",
    "maxPre": "预测气温(℃)"
}, inplace=True)

sns.set(font_scale=2, font='SimHei', style='white')
sns.pairplot(df3, plot_kws=dict(s=20, color="black"), diag_kws=dict(color="black"))

# 3. Correlation heat map
# corrcoef treats each row as one variable, hence the transpose.
datacor = np.corrcoef(df3.values.T)
datacor = pd.DataFrame(data=datacor, columns=df3.columns, index=df3.columns)

plt.figure(figsize=(15, 15))
ax = sns.heatmap(datacor, square=True, annot=True, fmt=".3f", linewidths=.5, cmap="YlGnBu", cbar=True)
plt.show()

2) 多元线性回归建模

import pandas as pd
from sklearn.linear_model import LinearRegression
import statsmodels.formula.api as smf
import numpy as np
import matplotlib.pyplot as plt
# 1. Data preparation
df_pre = pd.read_excel('demoData/06_preDate.xlsx',sheet_name='Sheet1',index_col=0)
df_pre.index = df_pre['index']  # use the date column as the index

# 2. Training set (2011-2020) and test set (2021-2022), rows with gaps removed
df_train = df_pre["2011-01-01":"2020-12-31"].dropna()
df_test = df_pre["2021-01-01":"2022-12-31"].dropna()
X_train = df_train[['maxTem','minTem']]
Y_train = df_train[['maxPre']]
X_test = df_test[['maxTem','minTem']]
Y_test = df_test[['maxPre']]

# 3. Fit the model
slr = LinearRegression()
slr.fit(X_train,Y_train)
# Print the fitted regression equation.
print("The linear model is: Y = {:.4} + {:.4}*maxTem + {:.4}*minTem".format(
 slr.intercept_[0],slr.coef_[0][0], slr.coef_[0][1]))

# 4. Fitted plane in 3-D
# Figure.gca() no longer accepts keyword arguments in current Matplotlib, so
# the 3-D axes are created with add_subplot().
fig1 = plt.figure(figsize=(10,10))
ax = fig1.add_subplot(projection='3d')
x1=x2=np.arange(0,40)  # integers 0..39
x1, x2 = np.meshgrid(x1, x2)  # coordinate grid
# Fitted plane y = w0 + w1*x1 + w2*x2
y=slr.intercept_[0] + slr.coef_[0][0]*x1 + slr.coef_[0][1]*x2
ax.plot_surface(x1,x2,y)
plt.show()

# 5. OLS summary
formula="maxPre ~ maxTem + minTem"
# NOTE(review): this fits a separate OLS model on the test data; it is not an
# evaluation of `slr` (trained on 2011-2020) against the test set.
lm_test = smf.ols(formula,df_test).fit()
print(lm_test.summary())

# 6. Test-set predictions: 3-D scatter of actual vs. predicted temperature
pre_X_test =slr.predict(X_test)
# One-dimensional z values for the scatter calls below.
actual_z = Y_test['maxPre']
predicted_z = pre_X_test.ravel()
fig2 = plt.figure(figsize=(10,10))
bx = fig2.add_subplot(projection='3d')
bx.scatter(X_test['maxTem'], X_test['minTem'],actual_z,s=30,c='blue', marker='o')  # actual
bx.scatter(X_test['maxTem'], X_test['minTem'],predicted_z,s=100,c='r', marker='+')  # predicted
bx.set_xlabel('maxTem')
bx.set_ylabel('minTem')
bx.set_zlabel('actual/predict')
plt.show()

# 7. Same scatter together with the fitted plane (reuses the grid from step 4)
fig3=plt.figure(figsize=(10,10))
cx = fig3.add_subplot(projection='3d')
cx.plot_surface(x1,x2,y)
cx.scatter(X_test['maxTem'], X_test['minTem'],actual_z,s=30,c='blue', marker='o')  # actual
cx.scatter(X_test['maxTem'], X_test['minTem'],predicted_z,s=100,c='r', marker='+')  # predicted
plt.show()