使用UCI空气质量数据集为例,原始数据下载地址
使用前一小时数据预测后1小时(Timestep=1)
参考:https://blog.csdn.net/tMb8Z9Vdm66wH68VX1/article/details/78463811
特征列表:
Column Name | Meaning | Column Name | Meaning |
---|---|---|---|
NO | 行号 | TEMP | 温度 |
year | 年份 | PRES | 气压 |
month | 月份 | cbwd | 组合风向 |
day | 日 | Iws | 累计风速 |
hour | 时 | Is | 累积降雪时间 |
pm2.5 | PM2.5 浓度(label) | Ir | 累积降雨时间 |
DEWP | 露点 | | |
数据加载和预处理
import os
import sys
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import tensorflow as tf #tf版本为1.13
from tensorflow import keras
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings('ignore')
# Load the raw UCI Beijing PM2.5 dataset.
data = pd.read_csv('./PRSA_data.csv')
data.head()
# Combine the split year/month/day/hour columns into one hourly PeriodIndex.
date = pd.PeriodIndex(year=data["year"], month=data["month"], day=data["day"], hour=data['hour'], freq="H")
data['date'] = date
# Drop the now-redundant row-number and time-component columns in one call
# (the original looped over them with repeated in-place drops).
data.drop(columns=['No', 'year', 'month', 'day', 'hour'], inplace=True)
data.set_index('date', inplace=True)
# Rename to short English column names; order must match the CSV layout.
data.columns = ['pollution', 'dew', 'temp', 'press', 'wnd_dir', 'wnd_spd', 'snow', 'rain']
data.head()
# Fill missing PM2.5 readings with 0.
# FIX: plain assignment instead of chained `fillna(..., inplace=True)` on a
# column selection, which is unreliable (and deprecated under pandas
# copy-on-write) because it may operate on a temporary copy.
data['pollution'] = data['pollution'].fillna(0)
# The first 24 hours carry no valid PM2.5 label — drop the first day.
data = data[24:]
data.head()
data.to_csv('pollution.csv', index=None)
数据可视化
# Quick visual sanity check: one stacked subplot per numeric series.
plt_cols = ['pollution', 'dew', 'temp', 'press', 'wnd_spd', 'snow', 'rain']
values = data[plt_cols].values
plt.figure(figsize=(12, 10))
for idx, name in enumerate(plt_cols):
    plt.subplot(len(plt_cols), 1, idx + 1)
    plt.plot(values[:, idx])
    # Put the series name at mid-height on the right edge of each panel.
    plt.title(name, y=0.5, loc='right')
plt.show()
Demo1
# Build the supervised-learning frame for the LSTM: the label `y` is the
# NEXT hour's pollution value (timestep = 1).
new_data = data.copy()
new_data['y'] = new_data['pollution'].shift(-1)  # next-hour PM2.5 as label
new_data = new_data[:-1]  # last row has no next-hour label — drop it
# Integer-encode the categorical wind direction.
# FIX: the original encoded data['wnd_dir'] instead of new_data['wnd_dir'],
# leaving string values in new_data so the MinMaxScaler loop below crashed.
lbl = LabelEncoder()
new_data['wnd_dir'] = lbl.fit_transform(new_data['wnd_dir'])
# Scale every column to [0, 1] — including the label y, matching the
# reference blog post.
# FIX: the original loop header had a stray ']' (syntax error).
for col in new_data.columns:
    mms = MinMaxScaler()
    new_data[col] = mms.fit_transform(new_data[col].values.reshape(-1, 1))
new_data.head()
# Separate the label from the features, then split chronologically:
# the first full year is the training set, the remainder is the test set
# (train_test_split with shuffling would also work).
y = new_data.pop('y')
X = new_data
train_size = 365 * 24
X_train, X_test = X[:train_size], X[train_size:]
y_train, y_test = y[:train_size], y[train_size:]
# Reshape features to the (samples, timesteps, features) layout an LSTM expects.
X_train = X_train.values.reshape(-1, 1, X_train.shape[1])
X_test = X_test.values.reshape(-1, 1, X_test.shape[1])
print("X_train:", X_train.shape, "y_train:", y_train.shape)  # X_train: (8760, 1, 8) y_train: (8760,)
print("X_test:", X_test.shape, "y_test:", y_test.shape)  # X_test: (35039, 1, 8) y_test: (35039,)
构建训练模型
# Same hyper-parameters as the reference blog post: a single 50-unit LSTM
# feeding one linear output unit, MAE loss, Adam optimizer.
from tensorflow import keras

model = keras.models.Sequential([
    keras.layers.LSTM(50, input_shape=(X_train.shape[1], X_train.shape[2])),
    keras.layers.Dense(1),
])
model.compile(loss='mae', optimizer='adam')
history = model.fit(
    X_train,
    y_train,
    epochs=50,
    batch_size=72,
    validation_data=(X_test, y_test),
)
# 省去训练log
# Visualize the training error.
def plot_learning_curves(history, label, epochs, min_value, max_value, title):
    """Plot train vs. validation curves of one metric from a Keras History.

    history   : History object returned by model.fit
    label     : metric key in history.history (e.g. 'loss'); the matching
                validation series is looked up under 'val_' + label
    epochs    : right limit of the x-axis
    min_value : lower limit of the y-axis
    max_value : upper limit of the y-axis
    title     : figure title
    """
    data = {}
    data['train'] = history.history[label]
    data['test'] = history.history['val_' + label]
    pd.DataFrame(data).plot(figsize=(8, 5))
    plt.grid(True)
    plt.axis([0, epochs, min_value, max_value])
    plt.title(title)
    plt.show()

# FIX: the model was trained for 50 epochs but the original call set the
# x-axis limit to 30, silently hiding the last 20 epochs of both curves.
plot_learning_curves(history, 'loss', 50, 0, 0.06, 'MAE')
# Evaluate predictions in original PM2.5 units.  No scaler was saved per
# column, but `mms` is the scaler fitted on the LAST column of the
# normalization loop — which is the label 'y' — so it is the correct one
# for inverse-transforming both predictions and true labels.
def _rmse_original_units(features, labels):
    # Predict, undo the [0, 1] scaling on both sides, then compute RMSE.
    preds = mms.inverse_transform(model.predict(features))
    truth = mms.inverse_transform(labels.values.reshape(-1, 1))
    return np.sqrt(mean_squared_error(truth, preds))

print("train rmse:", _rmse_original_units(X_train, y_train))  # train rmse: 30.686416544450598
print("test rmse:", _rmse_original_units(X_test, y_test))  # test rmse: 26.48025156818094
测试集误差居然比训练误差还要更小一些,不知道原博所说的RMSE3.836是怎么得到的