在本部分 将展示线性回归预测的具体实现
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import getdata
from scipy.stats import pearsonr
import warnings
import math
from scipy.stats import pearsonr
from scipy.stats import pearsonr
def data_processing(a,b):
plt.style.use('seaborn')
data2 = getdata(a,b);
print(data2.head(3))
#cel(excel_name, sheet_name=2, na_values=na_values)
# data3.head(3)
# # 数据清洗
# ## 缺失行补齐算法(不补齐会造成很严重的后果)
temp_df = data2.copy()
lack_list = list(set(time_list)-set(data2['last_update']))
lack_df = pd.DataFrame(lack_list,columns=['last_update'])
temp_df = pd.concat([temp_df,lack_df])
temp_df.sort_values("last_update",inplace=True)
temp_df.index = list(range(len(temp_df)))
temp_df.shape
data2 = temp_df
data2.describe().to_csv('describe1.csv')
data2.describe()
data2.info()
data2.isnull().sum().to_csv('info.csv')
data2.isnull().sum()
# ### 线性插值
data2.iloc[:,2:] = data2.iloc[:,2:].interpolate()
data2.isnull().sum()
# ## 异常值
# ### 消除负值,不低于边界点0
data2 = data2.applymap(lambda v: max(0,v) if isinstance(v, float) else v)
data2.describe().to_csv('describe2.csv')
data2.describe()
# # 特征展示
# ## 分布图
warnings.filterwarnings("ignore")
def fkdeplot(vec, name):
sns.kdeplot(vec)
plt.title(name, fontsize=12)
plt.show()
# for i in range(10):
# fkdeplot(data2.iloc[:,i+2], 'Data')
fkdeplot(data2.iloc[:,4].values, 'SO2')
fkdeplot(data2.iloc[:,5].values, 'NO2')
fkdeplot(data2.iloc[:,3].values, 'PM10')
fkdeplot(data2.iloc[:,2].values, 'Pm2.5')
fkdeplot(data2.iloc[:,7].values, 'O3')
fkdeplot(data2.iloc[:,6].values, 'CO')
fkdeplot(data2.iloc[:,10].values, 'Temperature')
fkdeplot(data2.iloc[:,13].values, 'Humidity')
fkdeplot(data2.iloc[:,12].values, 'Pressure')
fkdeplot(data2.iloc[:,15].values, 'Wind Speed')
# ## AQI分布图
def compute_AQI(a):
print(a)
#a=np.array([n_SO2,n_NO2,n_PM10,n_PM25,n_O3,n_CO])
T=np.array([[0,50,150,475,800,1600,2100,2620],[0,40,80,180,280,565,750,940],[0,50,150,250,350,420,500,600],[0,35,75,115,150,250,350,500],[0,100,160,215,265,800,0,0],[0,2,4,14,24,36,48,60]])
I=np.array([0,50,100,150,200,300,400,500])
BP_LO = []
BP_Hi=[]
IAQI_LO=[]
IAQI_Hi=[]
IAQI=np.zeros((1,6))
result=[]
for j in range(6):
for i in range(1,8):
if a[j] <= T[j, i]:
print(a[i])
BP_LO.append(T[j, i-1])
BP_Hi.append(T[j, i])
IAQI_LO.append(I[i-1])
IAQI_Hi.append(I[i])
break
else:
continue
IAQI[0, j] = (IAQI_Hi[j] - IAQI_LO[j]) / (BP_Hi[j] - BP_LO[j]) * (a[j] - BP_LO[j]) + IAQI_LO[j]
result.append(math.ceil(IAQI[0, j]))
AQI=np.max(result)
c=np.argmax(result)
name=["SO2", "NO2", "PM10", "PM25", "O3", "CO"]
# if AQI<=50:
# print("AQI为%d" %(AQI))
# print("当天无首要污染物")
# else:
# print("AQI为%d" % (AQI))
# print('首要污染物为%s' % (name[c]))
return AQI
for inst in data2.iloc[:,2:8].values:
print('!!!')
print(inst)
AQI = [compute_AQI(inst) for inst in data2.iloc[:,2:8].values]
data2['AQI'] = AQI
fkdeplot(data2.iloc[:, 1].values, 'AQI')
def fplot(vec, name):
plt.plot(vec)
plt.title(name)
plt.show()
show_num = 24
fplot(data2.iloc[:show_num,4], 'SO2')
fplot(data2.iloc[:show_num,5], 'NO2')
fplot(data2.iloc[:show_num,3], 'PM10')
fplot(data2.iloc[:show_num,2], 'Pm2.5')
fplot(data2.iloc[:show_num,7], 'O3')
fplot(data2.iloc[:show_num,6], 'CO')
fplot(data2.iloc[:show_num,10], 'Temperature')
fplot(data2.iloc[:show_num,13], 'Humidity')
fplot(data2.iloc[:show_num,12], 'Pressure')
fplot(data2.iloc[:show_num,15], 'Wind Speed')
# ## 展示AQI的前6个时刻
fplot(data2.iloc[:show_num,-1], 'AQI')
# # 相关性分析(AQI与气象条件)
# pearson相关性
corr = data2.loc[:, ['temperature', 'humidity', 'pressure', 'wind_speed', 'aqi']].corr()
corr.columns = ['temperature', 'pressure', 'humidity', 'wind_speed', 'aqi']
corr.index = corr.columns
sns.heatmap(corr, annot=True)
AQI = data2.loc[:, 'aqi']
# # 相关性分析(AQI与高级气象条件)
# ## k小时变压
warm_up = 20
high_level_data = []
# In[310]:
k = 2
def stat_k(pressure_k):
# temp = pressure_k[-1]-pressure_k[0]
# temp = np.var(pressure_k)
temp = np.mean(pressure_k)
# np.std(pressure_k)/np.mean(pressure_k)
return temp
pressure_k = []
temp_data = data2.iloc[:, 12].values
print(temp_data)
print('total number', len(temp_data))
for i in range(len(temp_data) - k + 1):
pressure_k.append(temp_data[i:i + k])
pressure_k = np.array(pressure_k)
print('data.shape', pressure_k.shape)
print('total number', len(pressure_k))
pressure_k = [stat_k(inst) for inst in pressure_k]
fkdeplot(pressure_k, 'pressure-%d' % k)
fplot(pressure_k[:show_num], 'pressure-%d' % k)
high_level_data.append(pressure_k[warm_up - k:])
# In[311]:
from scipy.stats import pearsonr
def pressure_k_peasonr(k):
pressure_k = []
temp_data = data2.iloc[:, 12].values
# print('total number', len(temp_data))
for i in range(len(temp_data) - k + 1):
pressure_k.append(temp_data[i:i + k])
pressure_k = np.array(pressure_k)
# print('data.shape', pressure_k.shape)
# print('total number', len(pressure_k))
pressure_k = [stat_k(inst) for inst in pressure_k]
# print(len(AQI[k-1:]), len(pressure_k))
temp_pearsonr = pearsonr(AQI[k - 1:], pressure_k)[0]
# print(temp_pearsonr)
return temp_pearsonr
k_list = []
pearsonr_list = []
for k in range(10):
temp_k = k + 5
temp_pearsonr = pressure_k_peasonr(temp_k)
k_list.append(temp_k)
pearsonr_list.append(temp_pearsonr)
plt.plot(k_list, pearsonr_list)
plt.xlabel('K')
plt.ylabel('pearsonr correlation')
plt.title('pressure-k')
plt.show()
idx = np.argmax(np.abs(pearsonr_list))
print('最大(绝对)相关性', pearsonr_list[idx])
print('最大相关性对应k', k_list[idx])
# ## k小时变温
k = 2
def stat_k(pressure_k):
temp = pressure_k[-1] - pressure_k[0]
# temp = np.var(pressure_k)
# temp = np.mean(pressure_k)
# np.std(pressure_k)/np.mean(pressure_k)
return temp
pressure_k = []
temp_data = data2.iloc[:, 10].values
print('total number', len(temp_data))
for i in range(len(temp_data) - k + 1):
pressure_k.append(temp_data[i:i + k])
pressure_k = np.array(pressure_k)
print('data.shape', pressure_k.shape)
print('total number', len(pressure_k))
pressure_k = [stat_k(inst) for inst in pressure_k]
fplot(pressure_k[:show_num], 'temperatyre-%d' % k)
fkdeplot(pressure_k, 'temperatyre-%d' % k)
high_level_data.append(pressure_k[warm_up - k:])
from scipy.stats import pearsonr
def pressure_k_peasonr(k):
pressure_k = []
temp_data = data2.iloc[:, 10].values
# print('total number', len(temp_data))
for i in range(len(temp_data) - k + 1):
pressure_k.append(temp_data[i:i + k])
pressure_k = np.array(pressure_k)
# print('data.shape', pressure_k.shape)
# print('total number', len(pressure_k))
pressure_k = [stat_k(inst) for inst in pressure_k]
# print(len(AQI[k-1:]), len(pressure_k))
temp_pearsonr = pearsonr(AQI[k - 1:], pressure_k)[0]
# print(temp_pearsonr)
return temp_pearsonr
k_list = []
pearsonr_list = []
for k in range(10):
temp_k = k + 2
temp_pearsonr = pressure_k_peasonr(temp_k)
k_list.append(temp_k)
pearsonr_list.append(temp_pearsonr)
plt.plot(k_list, pearsonr_list)
plt.xlabel('K')
plt.ylabel('pearsonr correlation')
plt.title('temperature-k')
plt.show()
idx = np.argmax(np.abs(pearsonr_list))
print('最大(绝对)相关性', pearsonr_list[idx])
print('最大相关性对应k', k_list[idx])
# ## k小时平均风速
# In[314]:
k = 3
def stat_k(pressure_k):
temp = np.mean(pressure_k)
return temp
pressure_k = []
temp_data = data2.iloc[:, 15].values
print('total number', len(temp_data))
for i in range(len(temp_data) - k + 1):
pressure_k.append(temp_data[i:i + k])
pressure_k = np.array(pressure_k)
print('data.shape', pressure_k.shape)
print('total number', len(pressure_k))
pressure_k = [stat_k(inst) for inst in pressure_k]
fkdeplot(pressure_k, 'pressure-%d' % k)
fplot(pressure_k[:show_num], 'pressure-%d' % k)
high_level_data.append(pressure_k[warm_up - k:])
# In[315]:
def pressure_k_peasonr(k):
pressure_k = []
temp_data = data2.iloc[:, 15].values
# print('total number', len(temp_data))
for i in range(len(temp_data) - k + 1):
pressure_k.append(temp_data[i:i + k])
pressure_k = np.array(pressure_k)
# print('data.shape', pressure_k.shape)
# print('total number', len(pressure_k))
pressure_k = [stat_k(inst) for inst in pressure_k]
# print(len(AQI[k-1:]), len(pressure_k))
temp_pearsonr = pearsonr(AQI[k - 1:], pressure_k)[0]
# print(temp_pearsonr)
return temp_pearsonr
k_list = []
pearsonr_list = []
for k in range(10):
temp_k = k + 2
temp_pearsonr = pressure_k_peasonr(temp_k)
k_list.append(temp_k)
pearsonr_list.append(temp_pearsonr)
plt.plot(k_list, pearsonr_list)
plt.xlabel('K')
plt.ylabel('pearsonr correlation')
plt.title('wind-speed-k')
plt.show()
idx = np.argmax(np.abs(pearsonr_list))
print('最大(绝对)相关性', pearsonr_list[idx])
print('最大相关性对应k', k_list[idx])
# ## k小时平均湿度
# In[316]:
k = 2
def stat_k(pressure_k):
temp = np.mean(pressure_k)
return temp
pressure_k = []
temp_data = data2.iloc[:, 13].values
print(1212121211)
print(temp_data)
print('total number', len(temp_data))
for i in range(len(temp_data) - k + 1):
pressure_k.append(temp_data[i:i + k])
pressure_k = np.array(pressure_k)
print('data.shape', pressure_k.shape)
print('total number', len(pressure_k))
pressure_k = [stat_k(inst) for inst in pressure_k]
fkdeplot(pressure_k, 'pressure-%d' % k)
fplot(pressure_k[:show_num], 'pressure-%d' % k)
high_level_data.append(pressure_k[warm_up - k:])
def pressure_k_peasonr(k):
pressure_k = []
temp_data = data2.iloc[:, 13].values
# print('total number', len(temp_data))
for i in range(len(temp_data) - k + 1):
pressure_k.append(temp_data[i:i + k])
pressure_k = np.array(pressure_k)
# print('data.shape', pressure_k.shape)
# print('total number', len(pressure_k))
pressure_k = [stat_k(inst) for inst in pressure_k]
# print(len(AQI[k-1:]), len(pressure_k))
temp_pearsonr = pearsonr(AQI[k - 1:], pressure_k)[0]
# print(temp_pearsonr)
return temp_pearsonr
k_list = []
pearsonr_list = []
for k in range(10):
temp_k = k + 2
temp_pearsonr = pressure_k_peasonr(temp_k)
k_list.append(temp_k)
pearsonr_list.append(temp_pearsonr)
plt.plot(k_list, pearsonr_list)
plt.xlabel('K')
plt.ylabel('pearsonr correlation')
plt.title('humidity-k')
plt.show()
idx = np.argmax(np.abs(pearsonr_list))
print('最大(绝对)相关性', pearsonr_list[idx])
print('最大相关性对应k', k_list[idx])
"""
# ## k小时风向一致性
# In[318]:
from itertools import combinations
k = 182
def angle_minute_drift(alpha, beta):
if max(alpha, beta)-min(alpha, beta)<180:
result = max(alpha, beta)-min(alpha, beta)
else:
result = min(alpha, beta)-max(alpha, beta)+360
return result
def drift_direction(pressure_k):
combine_list = list(combinations(pressure_k, 2))
return np.array([angle_minute_drift(i, j) for i,j in combine_list])
def stat_k(pressure_k):
#pressure_k = drift_direction(pressure_k)
temp = np.var(pressure_k)
return temp
pressure_k = []
temp_data = data2.iloc[:,12].values
print('total number', len(temp_data))
for i in range(len(temp_data)-k+1):
pressure_k.append(temp_data[i:i+k])
pressure_k = np.array(pressure_k)
print('data.shape', pressure_k.shape)
print('total number', len(pressure_k))
pressure_k = [stat_k(inst) for inst in pressure_k]
fkdeplot(pressure_k, 'pressure-%d'%k)
fplot(pressure_k[:show_num], 'pressure-%d'%k)
high_level_data.append(pressure_k[warm_up-k:])
# In[319]:
from scipy.stats import pearsonr
def pressure_k_peasonr(k):
pressure_k = []
temp_data = data2.iloc[:,12].values
#print('total number', len(temp_data))
for i in range(len(temp_data)-k+1):
pressure_k.append(temp_data[i:i+k])
pressure_k = np.array(pressure_k)
#print('data.shape', pressure_k.shape)
#print('total number', len(pressure_k))
pressure_k = [stat_k(inst) for inst in pressure_k]
#print(len(AQI[k-1:]), len(pressure_k))
temp_pearsonr = pearsonr(AQI[k-1:],pressure_k)[0]
#print(temp_pearsonr)
return temp_pearsonr
k_list = []
pearsonr_list = []
for k in range(24):
temp_k = k*10+2
temp_pearsonr = pressure_k_peasonr(temp_k)
k_list.append(temp_k)
pearsonr_list.append(temp_pearsonr)
plt.plot(k_list, pearsonr_list)
plt.xlabel('K')
plt.ylabel('pearsonr correlation')
plt.title('wind-direction-k')
plt.show()
idx = np.argmax(np.abs(pearsonr_list))
print('最大(绝对)相关性',pearsonr_list[idx])
print('最大相关性对应k',k_list[idx])
"""
# # SWI静稳天气指数
# ## 数据准备,得按照刚刚的结果抽取高级气象条件
high_level_data.append(AQI[warm_up - 1:])
high_level_data = np.array(high_level_data)
high_level_data.shape
high_level_data = pd.DataFrame(high_level_data).T
high_level_data.columns = ['temperature', 'pressure', 'humidity', 'wind_speed', 'aqi']
high_level_data
# # 高级气象条件与AQI相关性
corr = high_level_data.corr()
corr.to_csv('corr2.csv')
corr
corr.columns = ['temperature', 'pressure', 'humidity', 'wind_speed', 'aqi']
corr.index = corr.columns
sns.heatmap(corr, annot=True)
plt.scatter(x=high_level_data['pressure'], y=high_level_data['aqi'])
plt.ylabel('AQI')
plt.xlabel('Pressure')
plt.show()
plt.scatter(x=high_level_data['wind_speed'], y=high_level_data['aqi'])
plt.ylabel('AQI')
plt.xlabel('Wind Speed')
plt.show()
# ## 归一化
from sklearn.preprocessing import MinMaxScaler
y = high_level_data.iloc[:, -1]
x = high_level_data.iloc[:, :-1]
#print('1111')
#print(y)
#print(x)
# minmaxscaler
mm_x = MinMaxScaler()
mm_y = MinMaxScaler()
x = mm_x.fit_transform(x)
y = mm_y.fit_transform(np.reshape(y.values, (-1, 1)))
return x,y
# ## 划分训练集测试集
# In[327]:
cut = 20
x_train, y_train, x_test, y_test = x[:-cut], y[:-cut], x[-cut:], y[-cut:]
print('!!!')
print(x_train)
print('@@@@@')
print(y_train)
print('x_train.shape', x_train.shape)
print('x_test.shape', x_test.shape)
print('y_train.shape', y_train.shape)
print('y_test.shape', y_test.shape)
# ## 线性回归
from sklearn.linear_model import LinearRegression
clf = LinearRegression()
clf.fit(x_train, y_train)
clf.coef_
clf.intercept_
plt.figure(figsize=(6,4))
plt.bar(['Pressure','Temperature','Wind Speed','Humidity','Wind Direction'],clf.coef_[0,:])
plt.title('coef')
plt.show()
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from math import sqrt
def mape(y_true, y_pred):
n = len(y_true)
mape = sum(np.abs((y_true - y_pred) / y_true)) / n * 100
return mape
def easy_result(y_train, y_train_predict, train_index, model_index, show_num):
# 画图进行展示
y_train_predict = np.reshape(y_train_predict, (-1, 1))
y_train_predict = mm_y.inverse_transform(y_train_predict)
y_train_predict = y_train_predict[:, 0]
y_train = np.reshape(y_train, (-1, 1))
y_train = mm_y.inverse_transform(y_train)
y_train = y_train[:, 0]
plt.figure(figsize=(10, 2))
plt.plot(y_train[-show_num:])
plt.plot(y_train_predict[-show_num:])
plt.legend(('real', 'predict'), fontsize='15')
plt.title("%s Data" % train_index, fontsize='20') # 添加标题
plt.show()
print('\n')
plot_begin, plot_end = min(min(y_train), min(y_train_predict)), max(max(y_train), max(y_train_predict))
plot_x = np.linspace(plot_begin, plot_end, 10)
plt.figure(figsize=(5, 5))
plt.plot(plot_x, plot_x)
plt.plot(y_train, y_train_predict, 'o')
plt.title("%s Data" % train_index, fontsize='20') # 添加标题
plt.show()
# 输出结果
print('%s上的MAE/RMSE/MAPE/R^2' % train_index)
print(mean_absolute_error(y_train, y_train_predict))
print(np.sqrt(mean_squared_error(y_train, y_train_predict)))
print(mape(y_train, y_train_predict))
print(r2_score(y_train, y_train_predict))
y_train_predict = clf.predict(x_train) # 预测结果
easy_result(y_train, y_train_predict, 'Train', 'Linear Model', 100) # 输出评价指标
y_test_predict = clf.predict(x_test) # 预测结果
easy_result(y_test, y_test_predict, 'Test', 'Linear Model', 100) # 输出评价指标
# ## 岭回归 为了避免多重共线
from sklearn.linear_model import Ridge
clf = Ridge(alpha=0.1)
clf.fit(x_train, y_train)
plt.bar(['Pressure', 'Temperature', 'Wind Speed', 'Humidity', 'Wind Direction'], clf.coef_[0, :])
plt.title('coef')
plt.show()
clf.coef_
clf.intercept_
y_train_predict = clf.predict(x_train) # 预测结果
easy_result(y_train, y_train_predict, 'Train', 'Linear Model', 100) # 输出评价指标
y_test_predict = clf.predict(x_test) # 预测结果
easy_result(y_test, y_test_predict, 'Test', 'Linear Model', 100) # 输出评价指标
# ## SWI静稳天气指数
yy = clf.predict(x)
yy = np.reshape(yy, (-1, 1))
# y= mm_y.inverse_transform(y)
# minmaxscaler
mm_yy = MinMaxScaler()
yy = mm_yy.fit_transform(np.reshape(yy, (-1, 1)))
yy = yy[:, 0]
fkdeplot(yy, 'SWI')
# # 对SWI指数进行聚类得到天气类别
# ### GMM
from sklearn.mixture import GaussianMixture as GMM
c = 3
temp = yy.reshape(-1, 1)
gmm = GMM(n_components=c).fit(temp)
labels = gmm.predict(temp)
plt.scatter(temp[:, 0], temp[:, 0], c=labels, s=50, cmap='viridis')
result = pd.DataFrame(np.array([temp[:, 0], labels]).T)
result
# ## 分界点
result.groupby(1).describe().to_csv('gmm_bound.csv')
result.groupby(1).describe()
df0, df1, df2 = result.groupby(1)
# fkdeplot(df0[-1][0], '1')
# fkdeplot(df1[-1][0], '2')
# fkdeplot(df2[-1][0], '-1')
sns.kdeplot(df0[-1][0])
sns.kdeplot(df1[-1][0])
sns.kdeplot(df2[-1][0])
plt.title('distribution', fontsize=12)
plt.show()
# ### KMeans
from sklearn.cluster import KMeans
c = 3
temp = np.array([yy, yy]).T
kmeans_model = KMeans(n_clusters=c, random_state=1).fit(temp)
labels = kmeans_model.labels_
plt.scatter(temp[:, 0], temp[:, 1], c=labels, s=50, cmap='viridis')
# ## Case Study
show_num = 100
plt.figure(figsize=(10, 2))
plt.plot(yy[-show_num:], label='SWI')
plt.plot(y[-show_num:, 0], label='AQI')
plt.axhline(0.42, linestyle="dotted", linewidth=1, color='r')
plt.axhline(0.60, linestyle="dotted", linewidth=1, color='r')
plt.legend()
plt.show()
show_num = 0
plt.figure(figsize=(10, 2))
plt.plot(yy[-show_num:], label='SWI')
plt.axhline(0.42, linestyle="dotted", linewidth=1, color='r')
plt.axhline(0.60, linestyle="dotted", linewidth=1, color='r')
plt.legend()
plt.show()
show_num = 1000
plt.figure(figsize=(10, 2))
plt.plot(yy[-show_num:], label='SWI')
plt.axhline(0.42, linestyle="dotted", linewidth=1, color='r')
plt.axhline(0.60, linestyle="dotted", linewidth=1, color='r')
plt.legend()
plt.show()
# # 各个因子分类别统计
result_stat = high_level_data.iloc[:, :-1]
result_stat['classes'] = result[1]
result_stat
result_stat.groupby('classes').describe().to_csv('factor_stat.csv')
result_stat.groupby('classes').describe()
temp1, temp2, temp3 = result_stat.groupby('classes')
sns.kdeplot(temp1[-1]['Wind Speed'])
sns.kdeplot(temp2[-1]['Wind Speed'])
sns.kdeplot(temp3[-1]['Wind Speed'])
plt.title('Wind Speed Distribution', fontsize=12)
plt.show()
sns.kdeplot(temp1[-1]['Humidity'])
sns.kdeplot(temp2[-1]['Humidity'])
sns.kdeplot(temp3[-1]['Humidity'])
plt.title('Humidity Distribution', fontsize=12)
plt.show()
sns.kdeplot(temp1[-1]['Temperature'])
sns.kdeplot(temp2[-1]['Temperature'])
sns.kdeplot(temp3[-1]['Temperature'])
plt.title('Temperature Distribution', fontsize=12)
plt.show()
sns.kdeplot(temp1[-1]['Pressure'])
sns.kdeplot(temp2[-1]['Pressure'])
sns.kdeplot(temp3[-1]['Pressure'])
plt.title('Pressure Distribution', fontsize=12)
plt.show()
"""