介绍:
时序算法模型的交叉验证:
需求:在使用机器学习算法建模时,为了保证能够得到可靠、稳定的模型,通常需要用交叉验证法对模型进行验证。常见的交叉验证形式有Holdout验证、K-fold验证、留一验证等。
与其他机器学习算法不同,时序算法的数据具有时间上的前后关系,而KFold方法会把未来的数据划入训练集去预测过去的数据,因此无法满足要求。本文将介绍一种利用sklearn库中的TimeSeriesSplit进行时序交叉验证的方法,以下以LSTM模型预测股票数据为例,给出采用交叉验证法的具体代码实现。
交叉验证法:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import LSTM,Dense,Dropout,Bidirectional,Activation
from keras.optimizers import Adam
# Load the complete data set. read_csv already returns a DataFrame, so no
# extra pd.DataFrame(...) wrapper is needed.
# NOTE(review): every column is passed to the scaler below, so stock2.csv is
# assumed to contain numeric columns only -- confirm against the file.
stock = pd.read_csv('stock2.csv')

# Look-back window: number of consecutive rows that make up one input sample.
# It was referenced but never defined in this script; adjust the value to the
# window length used when the model was designed.
time_stamp = 50

# Scale every column into [0, 1].
# NOTE(review): the scaler is fitted on the whole series, i.e. it also sees
# rows that later end up in the test folds -- confirm this is acceptable.
scaler = MinMaxScaler(feature_range=(0, 1))
scaled_data = scaler.fit_transform(stock)
print(scaled_data)

# Build the supervised samples over the full series (the folds are cut later):
#   X[k] = the time_stamp rows that precede row (time_stamp + k), all columns
#   Y[k] = column 3 of row (time_stamp + k)
# NOTE(review): column 3 is presumably the closing price -- confirm against
# the column order of stock2.csv.
X = np.array([scaled_data[i - time_stamp:i]
              for i in range(time_stamp, len(scaled_data))])
Y = scaled_data[time_stamp:, 3]
# Every fold needs its own freshly initialised model, so model construction
# lives in a factory function.
def build_model(epochs=50, batch_size=16, learning_rate=0.1):
    """Create and compile a new, untrained stacked-LSTM regression model.

    The network is Bidirectional(LSTM(100)) -> Dropout -> LSTM(80) ->
    Dropout -> Dense(50) -> Dense(1), compiled with mean-squared-error loss
    and the Adam optimizer.

    The input shape is intentionally not declared here: the original code
    read it from a global ``x_train`` that does not exist in this script.
    Keras infers the shape the first time the model is fitted or called, so
    the same factory works for every fold.

    Args:
        epochs: Not used when building the network; kept so existing calls
            such as ``build_model(epochs=5, batch_size=128)`` keep working.
            Pass the value to ``model.fit`` as well.
        batch_size: Not used when building the network; see ``epochs``.
        learning_rate: Step size for Adam. The default keeps the value the
            script used before (0.1); Keras' own default is 0.001.

    Returns:
        A compiled ``Sequential`` model with freshly initialised weights.
    """
    model = Sequential()
    # return_sequences=True makes this layer emit one output per time step,
    # which the following LSTM layer needs as its input sequence.
    model.add(Bidirectional(LSTM(units=100, return_sequences=True)))
    model.add(Dropout(0.5))
    # Default return_sequences=False: only the last time step is passed on.
    model.add(LSTM(units=80))
    model.add(Dropout(0.5))
    model.add(Dense(50))
    # Single linear output unit: the predicted (scaled) target value.
    model.add(Dense(1))
    # ``learning_rate`` replaces the deprecated ``lr`` keyword.
    model.compile(loss='mean_squared_error',
                  optimizer=Adam(learning_rate=learning_rate))
    return model
# 导入sklearn库中用于时序模型K折交叉验证的TimeSeriesSplit
from sklearn.model_selection import KFold
from sklearn.model_selection import TimeSeriesSplit
# Time-ordered splitter: each fold trains on the earlier samples and tests on
# the block of samples that follows them.
tscv = TimeSeriesSplit(gap=0, max_train_size=None, n_splits=5, test_size=None)

# Training settings, defined once so build_model() and model.fit() agree.
epochs = 5
batch_size = 128

# Position of the target inside the array the scaler was fitted on; it must
# match the column that was used to build Y.
target_col = 3

rmse_list = []  # test RMSE of every fold

# Each iteration yields the sample indices of one train/test split.
for kfold, (train_index, test_index) in enumerate(tscv.split(X)):
    print('train_index', train_index, 'test_index', test_index)

    # Slice out this fold's training and test samples.
    train_X, train_y = X[train_index], Y[train_index]
    test_X, test_y = X[test_index], Y[test_index]

    # A new model per fold, so no weights carry over between folds.
    model = build_model(epochs=epochs, batch_size=batch_size)
    history = model.fit(train_X, train_y, epochs=epochs, batch_size=batch_size,
                        validation_split=0.1, verbose=2)

    test_pred = model.predict(test_X)

    # Map predictions and targets back to original units. MinMaxScaler
    # computes scaled = raw * scale_ + min_ per column, so the target column
    # is inverted with its own coefficients. The fitted scaler is reused
    # as-is: refitting it here would discard the scaling the data was
    # actually transformed with.
    col_scale = scaler.scale_[target_col]
    col_min = scaler.min_[target_col]
    # Flatten both arrays so the difference is element-wise; (n, 1) minus
    # (1, n) would broadcast to an (n, n) matrix.
    test_pred = (np.ravel(test_pred) - col_min) / col_scale
    test_y = (np.ravel(test_y) - col_min) / col_scale

    rmse = np.sqrt(np.mean(np.power(test_y - test_pred, 2)))
    rmse_list.append(rmse)
    print('rmse of %d fold=%.4f' % (kfold, rmse))

# The overall score is the mean of the per-fold errors.
print('average rmse:', np.mean(rmse_list))
以下是sklearn官方给的一个例子:
>>> import numpy as np
>>> from sklearn.model_selection import TimeSeriesSplit
>>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4], [1, 2], [3, 4]])
>>> y = np.array([1, 2, 3, 4, 5, 6])
>>> tscv = TimeSeriesSplit()
>>> print(tscv)
TimeSeriesSplit(gap=0, max_train_size=None, n_splits=5, test_size=None)
>>> for train_index, test_index in tscv.split(X):
...     print("TRAIN:", train_index, "TEST:", test_index)
...     X_train, X_test = X[train_index], X[test_index]
...     y_train, y_test = y[train_index], y[test_index]
TRAIN: [0] TEST: [1]
TRAIN: [0 1] TEST: [2]
TRAIN: [0 1 2] TEST: [3]
TRAIN: [0 1 2 3] TEST: [4]
TRAIN: [0 1 2 3 4] TEST: [5]
>>> # Fix test_size to 2 with 12 samples
>>> X = np.random.randn(12, 2)
>>> y = np.random.randint(0, 2, 12)
>>> tscv = TimeSeriesSplit(n_splits=3, test_size=2)
>>> for train_index, test_index in tscv.split(X):
...     print("TRAIN:", train_index, "TEST:", test_index)
...     X_train, X_test = X[train_index], X[test_index]
...     y_train, y_test = y[train_index], y[test_index]
TRAIN: [0 1 2 3 4 5] TEST: [6 7]
TRAIN: [0 1 2 3 4 5 6 7] TEST: [8 9]
TRAIN: [0 1 2 3 4 5 6 7 8 9] TEST: [10 11]
>>> # Add in a 2 period gap
>>> tscv = TimeSeriesSplit(n_splits=3, test_size=2, gap=2)
>>> for train_index, test_index in tscv.split(X):
...     print("TRAIN:", train_index, "TEST:", test_index)
...     X_train, X_test = X[train_index], X[test_index]
...     y_train, y_test = y[train_index], y[test_index]
TRAIN: [0 1 2 3] TEST: [6 7]
TRAIN: [0 1 2 3 4 5] TEST: [8 9]
TRAIN: [0 1 2 3 4 5 6 7] TEST: [10 11]