1 概述
本文分析城市天气数据,包含气温、湿度、气压、纬度等关系,数据来源链接:https://pan.baidu.com/s/12NUbEwhFgfcY4yb6xVOfvg
提取码:fa4u
2 humidity湿度分析
2.1 基础分析(最大值、最小值、间隔)
import pandas as pd
# Basic per-city statistics for the humidity table: missing values, mean,
# and range (max - min), each followed by the city where it is largest.

# Load the table. The first column is excluded from the per-city statistics
# below (presumably it is the timestamp column -- confirm against the CSV).
data = pd.read_csv("humidity.csv")

# Summary statistics and overall shape.
print(data.describe())
print(data.shape)

# First five rows of the whole table, then of the Vancouver column alone.
print(data.head())
print(data["Vancouver"].head())

# Every column after the first one is treated as one city's readings.
city_values = data.iloc[:, 1:]

# Missing values per city, and the city with the most of them:
#   isna()     -> True where a value is missing
#   sum()      -> number of missing values per column
#   argmax()   -> position of the largest count
#   index[...] -> the column (city) name at that position
null_sum = city_values.isna().sum()
print(null_sum)
print(null_sum.index[null_sum.argmax()])

# Mean per city, and the city with the highest mean.
city_mean = city_values.mean()
print(city_mean)
print(city_mean.index[city_mean.argmax()])

# Range (max - min) per city, and the city with the widest range.
city_interval = city_values.max() - city_values.min()
print(city_interval)
print(city_interval.index[city_interval.argmax()])
2.2 使用时间序列进行分析asfreq、resample、rolling
import pandas as pd
import matplotlib.pyplot as plt
# Time-series walkthrough on the humidity table: Timestamp, Period,
# date_range, asfreq, shift, resample and rolling.

# 1. Load the data with the "datetime" column parsed and used as the index.
data = pd.read_csv(
    "humidity.csv",
    index_col="datetime",
    parse_dates=["datetime"],
)
print(data.head())

# 2. A single timestamp: year, month, day, hour, minute, second.
# Exercise: print your own birthday, the three days of the college entrance
# exam, and 2010-2-18 12:2:59.
timestamp = pd.Timestamp(
    year=2022, month=12, day=6, hour=12, minute=50, second=2
)
print(timestamp)
print(type(timestamp))

# 3. A period with daily frequency; show where it starts and ends.
period = pd.Period('2017-01', freq="d")
print(period.start_time)
print(period.end_time)

# 4. A range of timestamps between two dates at a fixed step.
# Other spans (2010, 2022) and steps (1d, 5d, 1m, 1y, 5min, 15min, 30min)
# are worth trying here.
dr1 = pd.date_range(start="2018-1-1", end="2018-1-7", freq="15min")
print(dr1)
print(len(dr1))

# 5. Re-sample one city onto a monthly grid.
vancouver = data.Vancouver
print(vancouver.head(20))
# Per the original author's note: 'm' = month end, 'ms' = month start.
# NOTE(review): frequency alias spellings have changed across pandas
# releases -- confirm '1m' still means month-end on the installed version.
vm = vancouver.asfreq('1m')

# 6. shift(): the same series moved forward by one step.
vm_5 = vm.shift(1)
print(vm_5.head(20))

# Plot the monthly series next to its shifted copy.
vm.plot(legend=True)
vm_5.plot(legend=True)
plt.legend(["vm", "vm_5"])
plt.show()

# Save the monthly series together with copies shifted by 1 and 3 steps.
new_data = vm.to_frame()
new_data["vm_1"] = vm.shift(1)
new_data["vm_3"] = vm.shift(3)
new_data.to_csv("new_vancouver.csv")

# Daily grid, reused by both aggregations below.
vd = vancouver.asfreq("1d")

# resample: one mean per 3-day bin.
vd_mean = vd.resample('3d').mean()
print(vd.head(10))
print(vd_mean)

# rolling: mean over a 3-day window ending at each row.
vd_mean = vd.rolling('3d').mean()
print(vd.head(10))
print(vd_mean)
2.3 取多个城市进行绘制
# Plot resampled mean humidity for several cities.
# Relies on `data` (the datetime-indexed humidity table) and `plt` from the
# preceding time-series section.

# Describe the data by a statistic (the mean) over a time window.
Las_Vegas = data.loc[:, "Las Vegas"]
print(Las_Vegas.head(5))  # first 5 rows
Las_Vegas_mean = Las_Vegas.resample("1d").mean()
print(Las_Vegas_mean.head(5))
Las_Vegas_mean.plot(legend=True)

# NOTE(review): Vancouver is averaged over 15-day bins while Las Vegas uses
# 1-day bins, so the two curves are smoothed differently -- confirm this is
# intended.
Vancouver = data.loc[:, "Vancouver"]
print(Vancouver.head(5))  # first 5 rows
Vancouver_mean = Vancouver.resample("15d").mean()
print(Vancouver_mean.head(5))
Vancouver_mean.plot(legend=True)

plt.legend(["Las Vegas", "Vancouver"])
plt.show()

# Humidity of the first five cities, averaged over 30-day bins, on one figure.
plt.figure()
first_cities = data.columns[:5]
for x in first_cities:
    # The loop body must be indented: each city is resampled and drawn in
    # turn, and the legend/show calls run once after the loop.
    x_mean = data.loc[:, x].resample("30d").mean()
    x_mean.plot(legend=True)
plt.legend(first_cities)
plt.show()
2.4 绘制频率分布直方图
# Distribution of humidity for two cities: histogram, then kernel density.
# Relies on `data` (the datetime-indexed humidity table) and `plt` from the
# preceding sections.

# Frequency histogram.
# Each histogram carries its own label and the legend is built from those
# labels. Passing only a list of names to plt.legend() pairs them with the
# first artists on the axes in order -- here the first two bars of the first
# histogram -- so both entries would show the same colour.
Las_Vegas = data.loc[:, "Las Vegas"]
Las_Vegas.hist(bins=200, label="Las Vegas")
Vancouver = data.loc[:, "Vancouver"]
Vancouver.hist(bins=200, label="Vancouver")
plt.legend()
plt.show()

# Kernel density estimate.
# Each call draws one line, so the labels below match the lines in draw order.
Las_Vegas = data.loc[:, "Las Vegas"]
Las_Vegas.plot(kind='kde')
Vancouver = data.loc[:, "Vancouver"]
Vancouver.plot(kind='kde')
plt.legend(["Las Vegas", "Vancouver"])
plt.show()
3 风向分析-绘制雷达图
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
# Radar (polar) chart of how often each wind-direction value occurs in
# Vancouver, sampled once per day.
data = pd.read_csv(
    "weather/wind_direction.csv",
    index_col="datetime",
    parse_dates=["datetime"],
)
Vancouver = data.Vancouver.asfreq("1d")

# Occurrences of each distinct direction value, ordered by the value itself.
vancouver_index_counts = Vancouver.value_counts().sort_index()
labels = np.array(vancouver_index_counts.index)
# `data` is rebound from the loaded table to the array of counts, as in the
# original script.
data = np.array(vancouver_index_counts.values)

# One evenly spaced angle per distinct value.
# NOTE(review): the angle is the value's rank, not the direction itself; if
# the values are bearings in degrees, np.deg2rad(labels) would place each
# count at its true bearing -- confirm the intended layout.
angles = np.linspace(0, 2 * np.pi, len(labels), endpoint=False)

# Repeat the first point at the end so the outline closes.
data = np.concatenate((data, [data[0]]))
angles = np.concatenate((angles, [angles[0]]))

fig = plt.figure()
# polar=True gives a polar axes, which is what makes angular grid labels
# (set_thetagrids) available.
ax = fig.add_subplot(111, polar=True)
# 'ro-' = red circle markers joined by a solid line; line width 2.
ax.plot(angles, data, 'ro-', linewidth=2)
# Angular tick labels are not set: the original author disabled the
# set_thetagrids(angles * 180 / np.pi, labels) call because it raised an
# error on their machine. NOTE(review): after the closing point is appended,
# `angles` has one more entry than `labels`, which may be the cause.

# The chart shows wind-direction counts, so the title says "wind direction"
# (the original title said "temperature change").
ax.set_title("风向雷达图", va='bottom')
ax.grid(True)  # draw the concentric grid circles
plt.show()
4 气温-气压相关性分析(负相关)
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
# Correlation between temperature and pressure, first for Vancouver and then
# for every city, followed by the share of cities with a negative coefficient.


def _temp_press_corr(temp, press):
    """Return the Pearson correlation between a temperature and a pressure series.

    The two series are joined side by side on their index, gaps are
    forward-filled, and rows that still lack a value (gaps before the first
    reading) are dropped before the coefficient is computed.
    """
    paired = pd.concat([temp, press], axis=1)
    paired.columns = ["temp", "press"]
    # DataFrame.ffill() replaces fillna(method="ffill"), whose `method`
    # argument is deprecated in recent pandas releases.
    paired = paired.ffill().dropna()
    return np.corrcoef(np.array(paired.temp), np.array(paired.press))[0][1]


# 1. Load both tables with the "datetime" column parsed and used as the index.
data_tem = pd.read_csv(
    "weather/temperature.csv",
    index_col="datetime",
    parse_dates=["datetime"],
)
data_pre = pd.read_csv(
    "weather/pressure.csv",
    index_col="datetime",
    parse_dates=["datetime"],
)

# 2. Vancouver on its own.
corr = _temp_press_corr(data_tem.Vancouver, data_pre.Vancouver)
print(corr)

# 3. Every city listed in the pressure table, rounded to 4 decimals.
temp_press_list = []
for x in data_pre.columns:
    corr = _temp_press_corr(data_tem.loc[:, x], data_pre.loc[:, x])
    temp_press_list.append(round(corr, 4))

# 4. Fraction of cities whose coefficient is below zero.
negative_count = sum(1 for c in temp_press_list if c < 0)
print(negative_count / len(temp_press_list))
Vancouver气温-气压相关系数:-0.11716415412170467
所有城市中相关系数小于0的城市所占比例:0.9722222222222222
5 纬度-气温相关性分析(负相关)
5.1 绘制位置图像,并分析迈阿密-温哥华两个城市
import pandas as pd
import matplotlib.pyplot as plt
# Scatter the cities by longitude/latitude with their names, then compare
# the temperature curves of Miami and Vancouver.

# 1. Load the city attributes and plot every city as a red dot.
data = pd.read_csv("weather/city_attributes.csv")
x = data.Longitude
y = data.Latitude
city_name = data.City
plt.plot(x, y, 'ro')
for x1, y1, s1 in zip(x, y, city_name):
    # The loop body must be indented: one text label per city at its
    # coordinates; grid() and show() run once after the loop.
    plt.text(x1, y1, s1)
plt.grid()
plt.show()

# 2. Mean temperature curves for Miami and Vancouver.
data = pd.read_csv(
    "weather/temperature.csv",
    index_col="datetime",
    parse_dates=["datetime"],
)
# NOTE(review): Miami is averaged over 1-day bins and Vancouver over 15-day
# bins, so the two curves are smoothed differently -- confirm this is
# intended.
Miami = data.loc[:, "Miami"]
Miami_mean = Miami.resample("1d").mean()
Miami_mean.plot(legend=True)
Vancouver = data.loc[:, "Vancouver"]
Vancouver_mean = Vancouver.resample("15d").mean()
Vancouver_mean.plot(legend=True)
plt.legend(["Miami", "Vancouver"])
plt.show()
5.2 分析所有城市
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
# Correlation between each city's latitude and its mean temperature.

# 1. Load the temperature table (indexed by datetime) and the city
# attributes (indexed by city name).
data_temp = pd.read_csv(
    "weather/temperature.csv",
    index_col="datetime",
    parse_dates=["datetime"],
)
data_city = pd.read_csv(
    "weather/city_attributes.csv",
    index_col="City",
)

# 2. Mean temperature per city.
# DataFrame.mean() returns one value per column. np.mean(DataFrame) forwards
# axis=None, which pandas 2.x reduces to a single scalar, and the concat
# below would then fail.
city_Latitude = data_city.Latitude
city_temp = data_temp.mean()
# Both series are labelled by city name, so concat lines them up by city.
city_Lat_temp = pd.concat([city_Latitude, city_temp], axis=1)
city_Lat_temp.columns = ["Latitude", "temp"]

# 3. Scatter plot and correlation coefficient.
v1 = np.array(city_Lat_temp.Latitude)
v2 = np.array(city_Lat_temp.temp)
plt.plot(v1, v2, 'ro')
plt.show()
corr = np.corrcoef(v1, v2)[0][1]
print(corr)
纬度-气温相关系数:-0.9014470840303698