import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
from tensorflow.keras import Model
from tensorflow.keras.layers import Input,LSTM,Dropout,Dense,Activation,Conv1D,AveragePooling1D,Bidirectional,Add
from tensorflow.keras.optimizers import RMSprop,Adam
from tensorflow.keras import optimizers
from tensorflow.keras.callbacks import ModelCheckpoint,EarlyStopping
from sklearn.preprocessing import LabelEncoder
# # 转换流水线
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn import metrics
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error,r2_score,mean_absolute_error
pd.set_option('display.max_rows', None)#显示全部行
pd.set_option('display.max_columns', None)#显示全部列
# 加载数据
WEATHER_PATH = './datasets'
def load_weather_data(weather_path):
csv_path = os.path.join(weather_path,"weather.csv")
return pd.read_csv(csv_path)
#查看数据
weather = load_weather_data(WEATHER_PATH)
print(weather.columns)
print(weather.info())
print(weather.describe())
# 数据初步清理
# 处理时间属性
def Clean_weather(weather):
# print(weather["cloud"].describe())
date = pd.to_datetime(weather['date'].apply(lambda x: str(x))) #取值替换将其转换为字符串
weather['year'] =date.dt.year #提取年
weather['month'] = date.dt.month #提取月
weather['week'] = date.dt.weekofyear #提取周
weather['quarter'] = date.dt.to_period('Q').astype('str')[:-2].apply(lambda x:x[-1]).astype('int') #将日期以季度为单位
weather['day'] = date.dt.dayofyear #提取天
# 将无效数据替换成NAN
weather[weather==999999] = np.NaN
weather[weather==999990] = np.NaN
# 去掉wind_direction和phenomenon属性
#weather = weather.drop(["wind_direction","phenomenon"],axis=1)
return weather
weather = Clean_weather(weather)
weather.to_csv('./datasets/clean_weather.csv',index=False)
# 数据可视化
plt.rcParams['font.family'] = ['SimHei'] #用来正常显示中文标签
plt.rcParams['axes.unicode_minus'] = False #用来正常显示负号
# 绘制某个属性关于季度和地理位置的变化情况
def plot_of_quarter_city(X,attribe):
# 将属性按照季度和城市分组
df_group = X[attribe].groupby([X['quarter'],X['city']]).mean()
# 对多级索引中的city进行重命名
#df_group = df_group.rename({'重庆市':'chongqing','北京市':'beijing','上海市':'shanghai'},axis='index')
# 将series 按照city展开成一个表格
pop_df = df_group.unstack()
return pop_df
print(weather.describe())
# 每个城市在每个季度的可见度
df_vb_group1 = plot_of_quarter_city(weather,'visibility')
# 对多级索引重新命名
df_vb_group1.plot(kind='bar')
plt.title("可见度变化情况")
plt.show()
# 绘制降雨量
df_rain20_group = plot_of_quarter_city(weather,'rain20')
#df_rain20_group.plot(kind='bar')
plt.title("20时降雨量")
df_rain08_group = plot_of_quarter_city(weather,'rain08')
df_rain08_group.plot(kind='bar')
plt.title("08时降雨量")
plt.show()
# 温度变化情况
df_tempature_group = plot_of_quarter_city(weather,'temperature')
df_tempature_group.plot(kind='bar')
plt.title("温度变化情况")
plt.show()
# 湿度变化情况
df_humidity_group = plot_of_quarter_city(weather,'humidity')
df_humidity_group.plot(kind='bar')
plt.title("湿度变化情况")
plt.show()
# 气压变化情况
df_pressure_group = plot_of_quarter_city(weather,'pressure')
df_pressure_group.plot(kind='bar')
plt.title("气压变化情况")
plt.show()
# 风速变化情况
df_windspeed_group = plot_of_quarter_city(weather,'wind_speed')
df_windspeed_group.plot(kind='bar')
plt.title("风速变化情况")
plt.show()
# 云量变化情况
df_cloud_group = plot_of_quarter_city(weather,'cloud')
df_cloud_group.plot(kind='bar')
plt.title("云量变化情况")
plt.show()
# 各个属性相互之间的相关性
from pandas.plotting import scatter_matrix
attributes = ['wind_speed','temperature', 'humidity', 'rain20', 'cloud', 'visibility']
scatter_matrix(weather[attributes],figsize=(12,8))
plt.show()
#生成数据集
## 标签具有单个属性
# 选择窗口 单个属性
def windows_select(data_perpared,feature_size,squence_length,temper_dim):
features = []
labels = []
for i in range(data_perpared.shape[0]-squence_length):
x = np.array(data_perpared[i:i+squence_length,:]).flatten()
y = data_perpared[i+squence_length,temper_dim] # 第temper_dim维是温度
features.append(x)
labels.append(y)
features = np.array(features)
labels = np.array(labels)
return features,labels
def shuffle_data(features,labels): #将序列中的所有数据随机排序
# shuffle data
shuffle_indicies = np.random.permutation(features.shape[0])
features = features[shuffle_indicies,:]
labels = labels[shuffle_indicies]
return features,labels
# 数据预处理
def get_data(data_perpared,is_joint=False,squence_length=20,feature_size=7,temper_dim=2,train_ratio=0.8,validate_ratio=0.1):
if not is_joint:
data_perpared = data_perpared[:,:feature_size] # 取前面7维的数据
features,labels =windows_select(data_perpared,feature_size,squence_length,temper_dim)
features,labels = shuffle_data(features,labels)
else:
data_perpared[0] = data_perpared[0][:,:feature_size]
features,labels = windows_select(data_perpared[0],feature_size,squence_length,temper_dim)
for data in data_perpared[1:]:
data = data[:,:feature_size] # 取前面7维的数据
# 对一个地级市内的数据进行分组
f,l =windows_select(data,feature_size,squence_length,temper_dim)
features = np.vstack((features,f)) # 垂直拼接
labels = np.hstack((labels,l)) # 水平拼接
features,labels = shuffle_data(features,labels)
print("features.shape",features.shape)
print("labels.shape",labels.shape)
# train samples
train_row = round(features.shape[0]*train_ratio)
validate_num = round(features.shape[0]*validate_ratio)
test_num = features.shape[0]-train_row-validate_num
x_train = np.reshape(features[:train_row,:],(train_row,squence_length,feature_size))
y_train = np.reshape(labels[:train_row],(train_row,-1))
# validation samples
x_val = np.reshape(features[train_row:train_row+validate_num,:],(validate_num,squence_length,feature_size))
y_val = np.reshape(labels[train_row:train_row+validate_num],(validate_num,-1))
# test samples
x_test = np.reshape(features[train_row+validate_num:,:],(test_num,squence_length,feature_size))
y_test = np.reshape(labels[train_row+validate_num:],(test_num,-1))
print("train_samples:",x_train.shape,y_train.shape)
print("validate_samples:",x_val.shape,y_val.shape)
print("test_samples:",x_test.shape,y_test.shape)
return (x_train,y_train,x_val,y_val,x_test,y_test)
### 标签具有多个属性
# 选择窗口 多个属性
def windows_select_multi(data_perpared,feature_size,squence_length,temper_dim):
features = []
labels = []
for i in range(data_perpared.shape[0]-squence_length):
x = np.array(data_perpared[i:i+squence_length,:]).flatten()
y = data_perpared[i+squence_length,:temper_dim] # 前temper_dim维
features.append(x)
labels.append(y)
features = np.array(features)
labels = np.array(labels)
return features,labels
def shuffle_data_multi(features,labels):
# shuffle data
shuffle_indicies = np.random.permutation(features.shape[0])
features = features[shuffle_indicies,:]
labels = labels[shuffle_indicies,:]
return features,labels
# 数据预处理
def get_data_multi(data_perpared,is_joint=False,squence_length=20,feature_size=26,temper_dim=11,train_ratio=0.8,validate_ratio=0.1):
# data_perpared 是二维数据
if not is_joint:
data_perpared = data_perpared[:,:feature_size] # 取前面7维的数据
features,labels =windows_select_multi(data_perpared,feature_size,squence_length,temper_dim)
features,labels = shuffle_data_multi(features,labels)
# data_perpared 是三维数据,(county_num,feature_size,temper_dim)
else:
data_perpared[0] = data_perpared[0][:,:feature_size]
features,labels = windows_select_multi(data_perpared[0],feature_size,squence_length,temper_dim)
for data in data_perpared[1:]:
data = data[:,:feature_size] # 取前面7维的数据
# 对一个地级市内的数据进行分组
f,l =windows_select_multi(data,feature_size,squence_length,temper_dim)
features = np.vstack((features,f)) # 垂直拼接
labels = np.vstack((labels,l)) # 垂直拼接
features,labels = shuffle_data_multi(features,labels)
print("features.shape",features.shape)
print("labels.shape",labels.shape)
# train samples
train_row = round(features.shape[0]*train_ratio)
validate_num = round(features.shape[0]*validate_ratio)
test_num = features.shape[0]-train_row-validate_num
x_train = np.reshape(features[:train_row,:],(train_row,squence_length,feature_size))
y_train = np.reshape(labels[:train_row,:],(train_row,temper_dim))
# validation samples
x_val = np.reshape(features[train_row:train_row+validate_num,:],(validate_num,squence_length,feature_size))
y_val = np.reshape(labels[train_row:train_row+validate_num,:],(validate_num,temper_dim))
# test samples
x_test = np.reshape(features[train_row+validate_num:,:],(test_num,squence_length,feature_size))
y_test = np.reshape(labels[train_row+validate_num:,:],(test_num,temper_dim))
print("train_samples:",x_train.shape,y_train.shape)
print("validate_samples:",x_val.shape,y_val.shape)
print("test_samples:",x_test.shape,y_test.shape)
return (x_train,y_train,x_val,y_val,x_test,y_test)
# 模型设计
# 多对一
def BLSTM_model(input_shape):
x_in = Input(input_shape,name='input')
x = Conv1D(64,2,padding='same',name='conv1')(x_in)
x = AveragePooling1D(2,name='apool1')(x)
x1 = Bidirectional(LSTM(80,go_backwards=False),name='forward_lstm')(x)
x1 = Dropout(0.3,name='drop1')(x1)
x2 = Bidirectional(LSTM(80,go_backwards=True),name='backward_lstm')(x)
x2 = Dropout(0.3,name='drop2')(x2)
x = Add(name='add')([x1,x2])
x = Dense(100,activation='relu',name='dense1')(x)
x = Dropout(0.3,name='drop3')(x)
x = Dense(50,activation='relu',name='dense2')(x) # 70
x = Dropout(0.2,name='drop4')(x)
x = Dense(10,activation='relu',name='dense3')(x) # 20
x = Dropout(0.1,name='drop5')(x)
x = Dense(1,activation='sigmoid',name='dense4')(x) #linear
return Model(x_in,x,name='BLSTM')
# 多对多
def BLSTM_model_multi(input_shape,classes):
x_in = Input(input_shape,name='input')
x = Conv1D(64,2,padding='same',name='conv1')(x_in)
x = AveragePooling1D(2,name='apool1')(x)
x1 = Bidirectional(LSTM(80,go_backwards=False),name='forward_lstm')(x)
x1 = Dropout(0.3,name='drop1')(x1)
x2 = Bidirectional(LSTM(80,go_backwards=True),name='backward_lstm')(x)
x2 = Dropout(0.3,name='drop2')(x2)
x = Add(name='add')([x1,x2])
x = Dense(100,activation='relu',name='dense1')(x)
x = Dropout(0.3,name='drop3')(x)
x = Dense(50,activation='relu',name='dense2')(x) # 70
x = Dropout(0.2,name='drop4')(x)
x = Dense(10,activation='relu',name='dense3')(x) # 20
x = Dropout(0.1,name='drop5')(x)
x = Dense(classes,activation='sigmoid',name='dense4')(x) #linear
return Model(x_in,x,name='BLSTM')
#回归模型的训练和测试
# 评价指标
def evaluate_metrics(y_test, y_pred):
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
print("mse: ", mse)
print("mae: ", mae)
print('rmse: ', rmse)
print("r2_score:", r2)
model_path = './checkpoint/weights-temp_acc=0.97.h5'
# 测试模型
def test(model, x_test, y_test):
model.load_weights(model_path)
# predict
y_pred = model.predict(x_test)
# 评价指标
evaluate_metrics(y_test, y_pred)
# plot
y_pred = y_pred.flatten()
y_test = y_test.flatten()
return y_pred, y_test
# 训练和预测
def train_and_predict(x_train,y_train,x_val,y_val,x_test,y_test,is_train=True):
input_shape = (x_train.shape[1],x_train.shape[2])
# train model
model = BLSTM_model(input_shape)
model.compile(loss="mse",optimizer=Adam())
print(model.summary())
if is_train:
cb_ckpt = ModelCheckpoint('./weights.{epoch:02d}-{val_loss:.2f}.h5',monitor='val_loss',
save_best_only=True,mode='min',period=20)
history = model.fit(x_train,y_train,batch_size=32,epochs=100,
validation_data=(x_val,y_val),shuffle=True,
callbacks=[cb_ckpt,
EarlyStopping(monitor='val_loss',patience=10)])
plt.plot(history.history['loss'],label='train')
plt.plot(history.history['val_loss'],label='test')
plt.legend()
plt.title("损失值对比图")
plt.show()
else:
y_pred,y_test = test(model,x_test,y_test)
return y_pred,y_test
# 在本数据集上测试回归模型 (预测温度)
num_pipeline = Pipeline([
('imputer',SimpleImputer(strategy="median")),# 处理缺失值
('std_scaler',MinMaxScaler()),# 最大最小归一化
])
# # 对weather 进行数据清理
num_weather = weather.drop(["date",'city','county','station','year','phenomenon'],axis=1)
num_attribs = list(num_weather.columns)
# 处理数值和文本属性的流水线
full_pipeline = ColumnTransformer([
("num",num_pipeline,num_attribs),
("city",OneHotEncoder(),['city']),
])
data_perpared = full_pipeline.fit_transform(weather)
weather_clean = pd.DataFrame(data_perpared,columns=num_attribs+['shanghai','beijing', 'chongqin'])
print(weather_clean.columns)
weather_clean[['date','county']] = weather[['date','county']]
def generate_features(weather_clean):
# 按照时间和地区进行排序
sortweather = weather_clean.sort_values(by=['county','date'],ascending=(False,True))
data_perpared_countys = []
# 按照地级市进行分组
for name,group in sortweather.groupby('county'):
group = group.drop(['date','county'],axis=1)
group_array = np.array(group)
data_perpared_countys.append(group_array)
return data_perpared_countys
data_perpared_countys = generate_features(weather_clean)
print(len(data_perpared_countys))
# 训练模型
# data_perpared_countys:处理后气象数据(数组),squence_length:窗口大小,temper_dim:温度的维度, feature_size: 特征数
# load data
x_train,y_train,x_val,y_val,x_test,y_test = get_data(data_perpared_countys,True,30,26,2)
#train_and_predict(x_train,y_train,x_val,y_val,x_test,y_test,is_train=True)
# 测试模型
model_path = './checkpoint/weights-temp_acc=0.97.h5'
print(data_perpared_countys[0].shape)
y_pred,y_test = train_and_predict(x_train,y_train,x_val,y_val,x_test,y_test,False)
# 绘制温度变化图形
# # 保留温度的最大最小值
temper_max = np.max(weather['temperature'])
temper_min = np.min(weather['temperature'])
# 还原温度值
y_pred_orignal = np.array(y_pred)*(temper_max-temper_min)+temper_min
y_test_orignal = np.array(y_test)*(temper_max-temper_min)+temper_min
evaluate_metrics(y_pred_orignal,y_test_orignal)
fig = plt.figure(figsize=(12,8))
plt.plot(y_test,label='y_test',alpha = 0.3)
plt.plot(y_pred,label='y_pred',alpha = 0.3)
plt.title("温度值对比")
plt.legend()
plt.show()
fig1 = plt.figure(figsize=(12,8))
plt.plot(y_pred_orignal,label='y_pred',alpha = 0.3)
plt.plot(y_test_orignal,label='y_test',alpha = 0.3)
plt.title("原始温度值对比")
plt.legend()
plt.show()
# 预测可见度
corr_matrix = weather_clean.corr()
print(corr_matrix['visibility'].sort_values(ascending=False)) #各种天气情况可见度排名
## 训练模型
# load data
x_train,y_train,x_val,y_val,x_test,y_test = get_data(data_perpared_countys,True,30,26,7)
#train_and_predict(x_train,y_train,x_val,y_val,x_test,y_test,is_train=True)
## 测试模型
model_path = './checkpoint/weights-visibility_acc=0.78.h5'
y_pred,y_test = train_and_predict(x_train,y_train,x_val,y_val,x_test,y_test,False)
# 绘制可见度变化图形
# # 保留可见度的最大最小值
visbi_max = np.max(weather['visibility'])
visbi_min = np.min(weather['visibility'])
# 还原温度值
y_pred_orignal = np.array(y_pred)*(visbi_max-visbi_min)+visbi_min
y_test_orignal = np.array(y_test)*(visbi_max-visbi_min)+visbi_min
evaluate_metrics(y_pred_orignal,y_test_orignal)
fig = plt.figure(figsize=(12,8))
plt.plot(y_test,label='y_test',alpha = 0.3)
plt.plot(y_pred,label='y_pred',alpha = 0.3)
plt.title("可见度对比")
plt.legend()
plt.show()
fig1 = plt.figure(figsize=(12,8))
plt.plot(y_pred_orignal,label='y_pred',alpha = 0.3)
plt.plot(y_test_orignal,label='y_test',alpha = 0.3)
plt.title("原始可见度对比")
plt.legend()
plt.show()
# 预测多个数值属性
print(weather_clean.columns)
# 训练和预测
# data_perpared:处理后气象数据(数组),squence_length:窗口大小,temper_dim:温度的维度, feature_size: 特征数
def train_and_predict_multi(x_train, y_train, x_val, y_val, x_test, y_test, is_train=True):
input_shape = (x_train.shape[1], x_train.shape[2])
# train model
model = BLSTM_model_multi(input_shape, y_train.shape[1])
model.compile(loss="mse", optimizer=Adam())
print(model.summary())
if is_train:
cb_ckpt = ModelCheckpoint('./weights.{epoch:02d}-{val_loss:.2f}.h5', monitor='val_loss',
save_best_only=True, mode='min', period=20)
history = model.fit(x_train, y_train, batch_size=32, epochs=100,
validation_data=(x_val, y_val), shuffle=True,
callbacks=[cb_ckpt,
EarlyStopping(monitor='val_loss', patience=10)])
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='test')
plt.title("损失值对比图")
plt.legend()
plt.show()
else:
y_pred, y_test = test(model, x_test, y_test)
return y_pred, y_test
# load data
x_train,y_train,x_val,y_val,x_test,y_test = get_data_multi(data_perpared_countys,True,30,26,7)
# 训练模型
# train_and_predict_multi(x_train,y_train,x_val,y_val,x_test,y_test,True)
# 测试模型
model_path = './checkpoint/weights-multi_acc=0.54.h5'
print(data_perpared_countys[0].shape)
y_pred,y_test = train_and_predict_multi(x_train,y_train,x_val,y_val,x_test,y_test,False)
#多分类模型的训练和测试
# 调整列的顺序
weather_clean_class = weather_clean[['sunny', 'cloudy', 'rain', 'fog', 'haze', 'dust',
'thunder', 'lightning','snow', 'hail', 'wind','pressure', 'wind_speed', 'temperature', 'humidity', 'rain20', 'rain08',
'cloud', 'visibility','month', 'week','quarter', 'day', 'shanghai', 'beijing', 'chongqin', 'date', 'county']]
# 生成特征列表
data_perpared_countys = generate_features(weather_clean_class)
classes = ['sunny', 'cloudy', 'rain', 'fog', 'haze', 'dust','thunder', 'lightning','snow', 'hail', 'wind']
# 测试模型
model_path = ''
def test_classify(model,x_test,y_test,classes):
model.load_weights(model_path)
y_pred = model.predict(x_test)
rocauc = metrics.roc_auc_score(y_test,y_pred)
prauc = metrics.average_precision_score(y_test,y_pred,average='macro')
print(f'ROC-AUC score={rocauc:.6f}')
print(f'Prauc score={prauc:.6f}')
y_prod = (y_pred > 0.5).astype(np.float32)
acc = metrics.accuracy_score(y_test,y_prod)
f1 = metrics.f1_score(y_test,y_prod,average='samples')
print(f'acc score={acc:.6f}')
print(f'f1 score={f1:.6f}')
# 计算每个类的准确率
for i,cls in enumerate(classes):
cls_rocauc = metrics.roc_auc_score(y_test[:,i],y_pred[:,i])
cls_prauc = metrics.average_precision_score(y_test[:,i],y_pred[:,i])
cls_acc = metrics.accuracy_score(y_test[:,i],y_prod[:,i])
cls_f1 = metrics.f1_score(y_test[:,i],y_prod[:,i])
print(f"[{i:2} {cls:10}] rocauc={cls_rocauc:.4f} prauc={cls_prauc:.4f} acc={cls_acc:4f} f1={cls_f1:.4f}")
return y_pred,y_test
# 训练和预测
# data_perpared:处理后气象数据(数组),squence_length:窗口大小,temper_dim:气象的维度, feature_size: 特征数
def train_and_predict_classify(x_train,y_train,x_val,y_val,x_test,y_test,classes,is_train=True):
# input_shape = (squence_length,feature_size)
input_shape = (x_train.shape[1],x_train.shape[2])
# train model
model = BLSTM_model_multi(input_shape,y_train.shape[1])
model.compile(loss="binary_crossentropy",optimizer="rmsprop",metrics=['accuracy'])
print(model.summary())
if is_train:
cb_ckpt = ModelCheckpoint('./weights.{epoch:02d}-{val_loss:.2f}.h5',monitor='val_loss',
save_best_only=True,mode='min',period=20)
history = model.fit(x_train,y_train,batch_size=32,epochs=100,
validation_data=(x_val,y_val),shuffle=True,
callbacks=[cb_ckpt,
EarlyStopping(monitor='val_loss',patience=10)])
plt.plot(history.history['loss'],label='train')
plt.plot(history.history['val_loss'],label='test')
plt.title("损失值对比图")
plt.legend()
plt.show()
else:
y_pred,y_test = test_classify(model,x_test,y_test,classes)
return y_pred,y_test
# load data
x_train,y_train,x_val,y_val,x_test,y_test = get_data_multi(data_perpared_countys,True,30,26,11)
# 训练模型
# train_and_predict_classify(x_train,y_train,x_val,y_val,x_test,y_test,True)
model_path = './checkpoint/weights_classify_auc=0.85.h5'
y_pred,y_test = train_and_predict_classify(x_train,y_train,x_val,y_val,x_test,y_test,classes,False)
基于LSTM的气候变化预测源码
最新推荐文章于 2024-10-05 08:57:27 发布