该数据集包含14个不同的特征,例如气温,大气压力和湿度。
获取数据
from __future__ import absolute_import, division, print_function, unicode_literals
try:
# %tensorflow_version only exists in Colab.
%tensorflow_version 2.x
except Exception:
pass
import tensorflow as tf
tf.enable_eager_execution()
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import ssl
ssl._create_default_https_context = ssl._create_unverified_context # 实测本地jupyter必须加
mpl.rcParams['figure.figsize'] = (8, 6)
mpl.rcParams['axes.grid'] = False
# Fetch the zipped Jena climate CSV through the Keras download helper;
# extract=True asks the helper to unpack the archive as well.
zip_path = tf.keras.utils.get_file(
    fname='jena_climate_2009_2016.csv.zip',
    origin='https://storage.googleapis.com/tensorflow/tf-keras-datasets/jena_climate_2009_2016.csv.zip',
    extract=True,
)
# Dropping the ".zip" extension gives the path the CSV is read from
# (presumably the extracted file sits next to the archive - verify for the
# installed Keras version).
csv_path = os.path.splitext(zip_path)[0]
df = pd.read_csv(csv_path)
# Show the first rows of the table.
df.head()
Date Time | p (mbar) | T (degC) | Tpot (K) | Tdew (degC) | rh (%) | VPmax (mbar) | VPact (mbar) | VPdef (mbar) | sh (g/kg) | H2OC (mmol/mol) | rho (g/m**3) | wv (m/s) | max. wv (m/s) | wd (deg) | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 01.01.2009 00:10:00 | 996.52 | -8.02 | 265.40 | -8.90 | 93.3 | 3.33 | 3.11 | 0.22 | 1.94 | 3.12 | 1307.75 | 1.03 | 1.75 | 152.3 |
1 | 01.01.2009 00:20:00 | 996.57 | -8.41 | 265.01 | -9.28 | 93.4 | 3.23 | 3.02 | 0.21 | 1.89 | 3.03 | 1309.80 | 0.72 | 1.50 | 136.1 |
2 | 01.01.2009 00:30:00 | 996.53 | -8.51 | 264.91 | -9.31 | 93.9 | 3.21 | 3.01 | 0.20 | 1.88 | 3.02 | 1310.24 | 0.19 | 0.63 | 171.6 |
3 | 01.01.2009 00:40:00 | 996.51 | -8.31 | 265.12 | -9.07 | 94.2 | 3.26 | 3.07 | 0.19 | 1.92 | 3.08 | 1309.19 | 0.34 | 0.50 | 198.0 |
4 | 01.01.2009 00:50:00 | 996.51 | -8.27 | 265.15 | -9.04 | 94.1 | 3.27 | 3.08 | 0.19 | 1.92 | 3.09 | 1309.00 | 0.32 | 0.63 | 214.3 |
如上所示,每10分钟记录一次观察值。这意味着在一个小时内,您将有6个观测值。同样,一天将包含144(6x24)次观察。
def univariate_data(dataset, start_index, end_index, history_size, target_size):
    """Cut a 1-D series into (history window, label) pairs.

    For every position ``i`` in ``[start_index + history_size, end_index)``
    one sample is produced: the ``history_size`` observations immediately
    before ``i`` form the input window and ``dataset[i + target_size]`` is
    the label.

    Args:
        dataset: 1-D sequence of observations in time order (a NumPy array;
            any sequence supporting ``len``, slicing and integer indexing
            also works).
        start_index: Index of the first observation that may be part of a
            window.
        end_index: Exclusive upper bound for ``i``. ``None`` means
            ``len(dataset) - target_size``, i.e. the largest bound for which
            every label index is still inside the series.
        history_size: Number of past observations per window.
        target_size: Offset of the label relative to ``i``.

    Returns:
        Tuple ``(data, labels)`` of NumPy arrays. When at least one window
        is produced, ``data`` has shape ``(num_windows, history_size, 1)``
        and ``labels`` has shape ``(num_windows,)``.
    """
    first = start_index + history_size
    if end_index is None:
        end_index = len(dataset) - target_size

    data = []
    labels = []
    for i in range(first, end_index):
        # A plain slice selects the same elements as indexing with
        # range(i - history_size, i), without building an index array.
        window = dataset[i - history_size:i]
        # Reshape from (history_size,) to (history_size, 1).
        data.append(np.reshape(window, (history_size, 1)))
        labels.append(dataset[i + target_size])
    return np.array(data), np.array(labels)
该函数返回上述时间窗以供模型训练。参数 history_size 是过去信息窗口的大小;target_size 表示模型需要学会预测未来多远处的值,该位置上的观测值就是需要被预测的标签。
# Number of leading rows used for training; the rows after it are used for validation.
TRAIN_SPLIT = 300000
数据的前300,000行将是训练数据集,其余的将是验证数据集。
# Fix TensorFlow's global random seed so that repeated runs are reproducible.
# TensorFlow 2.x exposes this as tf.random.set_seed; the top-level
# tf.set_random_seed only exists in TensorFlow 1.x, so fall back to it there.
try:
    tf.random.set_seed(13)
except AttributeError:
    tf.set_random_seed(13)
设置种子以确保可重复性。
就是说每次取随机的结果都是一样的。
预测多元时间序列
# Columns of the raw table used as model inputs: pressure, temperature and air density.
features_considered = ['p (mbar)', 'T (degC)', 'rho (g/m**3)']
原始数据集包含十四个特征。
用于预测的特征为大气压、气温和空气密度这3个。
# Keep only the three selected feature columns and index the rows by their
# timestamp, so the table is ordered as a time series.
features = df[features_considered].set_index(df['Date Time'])
features.head()
数据展示
p (mbar) | T (degC) | rho (g/m**3) | |
---|---|---|---|
Date Time | |||
01.01.2009 00:10:00 | 996.52 | -8.02 | 1307.75 |
01.01.2009 00:20:00 | 996.57 | -8.41 | 1309.80 |
01.01.2009 00:30:00 | 996.53 | -8.51 | 1310.24 |
01.01.2009 00:40:00 | 996.51 | -8.31 | 1309.19 |
01.01.2009 00:50:00 | 996.51 | -8.27 | 1309.00 |
# Draw each selected feature over time in its own subplot.
features.plot(subplots=True)
三个特征的时序图
# Convert the feature table to a NumPy array: one row per time step, one column per feature.
dataset = features.values
[[ 996.52 -8.02 1307.75]
[ 996.57 -8.41 1309.8 ]
[ 996.53 -8.51 1310.24]
...
[ 999.82 -3.16 1288.39]
[ 999.81 -4.23 1293.56]
[ 999.82 -4.82 1296.38]]
# Standardize every feature column to zero mean and unit variance.
# The statistics are taken from the training rows only (the first TRAIN_SPLIT
# rows) so that no information from the validation rows leaks into the model
# inputs; the resulting values therefore differ slightly from those obtained
# with statistics over the whole dataset.
data_mean = dataset[:TRAIN_SPLIT].mean(axis=0)  # per-feature mean of the training rows
data_std = dataset[:TRAIN_SPLIT].std(axis=0)  # per-feature standard deviation of the training rows
dataset = (dataset - data_mean) / data_std
[[ 0.87422976 -2.0740129 2.29360559]
[ 0.88021172 -2.12031274 2.34488743]
[ 0.87542615 -2.1321845 2.35589427]
...
[ 1.26903882 -1.49704566 1.80930485]
[ 1.26784243 -1.62407343 1.93863516]
[ 1.26903882 -1.69411678 2.00917896]] # 归一化
使用训练数据的均值和标准差对数据集进行归一化(标准化)。
def multivariate_data(dataset, target, start_index, end_index, history_size,
                      target_size, step, single_step=False):
    """Cut a multi-feature series into (history window, label) pairs.

    For every position ``i`` in ``[start_index + history_size, end_index)``
    one sample is produced. Its input window consists of every ``step``-th
    row of ``dataset`` among the ``history_size`` rows immediately before
    ``i``.

    Args:
        dataset: Sequence of feature rows in time order (a 2-D NumPy array;
            any sequence supporting ``len`` and slicing also works).
        target: Sequence of the values to predict, aligned with ``dataset``.
        start_index: Index of the first row that may be part of a window.
        end_index: Exclusive upper bound for ``i``. ``None`` means
            ``len(dataset) - target_size``.
        history_size: Number of past rows spanned by each window (before
            sub-sampling).
        target_size: How far ahead of ``i`` the labels reach.
        step: Sampling stride inside the window.
        single_step: If true, the label is the single value
            ``target[i + target_size]``; otherwise it is the run
            ``target[i:i + target_size]``.

    Returns:
        Tuple ``(data, labels)`` of NumPy arrays holding one window and one
        label per sample.
    """
    first = start_index + history_size
    if end_index is None:
        end_index = len(dataset) - target_size

    data = []
    labels = []
    for i in range(first, end_index):
        # A strided slice selects the same rows as indexing with
        # range(i - history_size, i, step), without building an index array.
        data.append(dataset[i - history_size:i:step])
        if single_step:
            labels.append(target[i + target_size])
        else:
            labels.append(target[i:i + target_size])
    return np.array(data), np.array(labels)
单步模型
在一步设置中,模型将根据提供的一些历史记录来学习预测未来的单个点。
上面的 multivariate_data 函数执行与前面 univariate_data 相同的加窗任务,不同之处在于,它会根据给定的步长对过去的观察值进行采样。
past_history = 720 # use 5 days of history; 6 observations per hour, so 5*24*6 = 720
future_target = 72 # predict 12 hours ahead: 12*6 = 72
# Sampling stride inside each history window: keep one observation out of
# every 6, i.e. one per hour.
STEP = 6
# Build single-step samples. Column 1 of the normalized dataset is the
# temperature ('T (degC)'), which is the value to predict.
# Training windows come from the first TRAIN_SPLIT rows ...
x_train_single, y_train_single = multivariate_data(
    dataset, dataset[:, 1],
    start_index=0, end_index=TRAIN_SPLIT,
    history_size=past_history, target_size=future_target,
    step=STEP, single_step=True)
# ... and validation windows from TRAIN_SPLIT up to the end of the data.
x_val_single, y_val_single = multivariate_data(
    dataset, dataset[:, 1],
    start_index=TRAIN_SPLIT, end_index=None,
    history_size=past_history, target_size=future_target,
    step=STEP, single_step=True)
# Report the shape of one input window.
print('Single window of past history : {}'.format(x_train_single[0].shape))
Single window of past history : (120, 3)
每隔一个小时采样,过去5天的数据作为训练数据
5*24 = 120
我们看下第一个数据点的大小
# Training pipeline: cache the samples, shuffle them, group them into batches
# and repeat indefinitely. BUFFER_SIZE and BATCH_SIZE are not defined in this
# part of the file and are expected to be set elsewhere.
train_data_single = (
    tf.data.Dataset.from_tensor_slices((x_train_single, y_train_single))
    .cache()
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE)
    .repeat()
)
# Validation pipeline: batched and repeated, without shuffling.
val_data_single = (
    tf.data.Dataset.from_tensor_slices((x_val_single, y_val_single))
    .batch(BATCH_SIZE)
    .repeat()
)
# One LSTM layer with 32 units followed by a single-output dense layer.
# The LSTM input shape is taken from the last two dimensions of the training
# windows.
single_step_model = tf.keras.models.Sequential([
    tf.keras.layers.LSTM(32, input_shape=x_train_single.shape[-2:]),
    tf.keras.layers.Dense(1),
])
# Train with RMSprop on the mean absolute error.
single_step_model.compile(loss='mae', optimizer=tf.keras.optimizers.RMSprop())
让我们看看样本预测。
# Run the model on one validation batch and print the shape of its output.
for x, y in val_data_single.take(1):
    batch_prediction = single_step_model.predict(x)
    print(batch_prediction.shape)
(256,1)
# Train the model. Both datasets repeat indefinitely, so the length of an
# epoch and of each validation pass is set through the step counts.
# EPOCHS and EVALUATION_INTERVAL are not defined in this part of the file and
# are expected to be set elsewhere.
single_step_history = single_step_model.fit(train_data_single, epochs=EPOCHS,
steps_per_epoch=EVALUATION_INTERVAL,
validation_data=val_data_single,
validation_steps=50)
Epoch 1/10
200/200 [==============================] - 34s 169ms/step - loss: 0.3272 - val_loss: 0.2723
Epoch 2/10
200/200 [==============================] - 30s 151ms/step - loss: 0.2697 - val_loss: 0.2479
Epoch 3/10
200/200 [==============================] - 33s 163ms/step - loss: 0.2692 - val_loss: 0.2521
Epoch 4/10
200/200 [==============================] - 27s 136ms/step - loss: 0.2647 - val_loss: 0.2502
Epoch 5/10
200/200 [==============================] - 30s 148ms/step - loss: 0.2348 - val_loss: 0.2448
Epoch 6/10
200/200 [==============================] - 27s 133ms/step - loss: 0.2486 - val_loss: 0.2679
Epoch 7/10
200/200 [==============================] - 25s 125ms/step - loss: 0.2506 - val_loss: 0.2659
Epoch 8/10
200/200 [==============================] - 25s 127ms/step - loss: 0.2465 - val_loss: 0.2534
Epoch 9/10
200/200 [==============================] - 25s 127ms/step - loss: 0.2526 - val_loss: 0.2587
Epoch 10/10
200/200 [==============================] - 25s 123ms/step - loss: 0.2477 - val_loss: 0.2535
def plot_train_history(history, title):
    """Plot training and validation loss per epoch on a new figure.

    Args:
        history: Object whose ``history`` attribute maps ``'loss'`` and
            ``'val_loss'`` to per-epoch sequences (such as the value
            returned by ``model.fit``).
        title: Title shown above the plot.
    """
    metrics = history.history
    train_loss, val_loss = metrics['loss'], metrics['val_loss']
    epoch_axis = range(len(train_loss))

    plt.figure()
    # Training loss in blue, validation loss in red.
    curves = (
        (train_loss, 'b', 'Training loss'),
        (val_loss, 'r', 'Validation loss'),
    )
    for values, fmt, label in curves:
        plt.plot(epoch_axis, values, fmt, label=label)
    plt.title(title)
    plt.legend()
    plt.show()
# Plot the loss curves recorded while training the single-step model.
plot_train_history(single_step_history,
'Single Step Training and validation loss')
预测未来的一步
现在已经对模型进行了训练,让我们进行一些样本预测。模型的输入是过去5天内每小时采样一次的三个特征的历史记录(120个数据点);因为目标是预测温度,所以该图仅显示过去的温度。预测的是未来12小时处的温度(future_target = 72 个时间步,每步10分钟),因此历史记录和预测之间存在间隔。
# For three validation batches, plot the first sample of each batch: its past
# temperatures (feature column 1), the true future value and the prediction.
# show_plot is not defined in this part of the file; the argument 12 is
# presumably the position of the future point on the time axis - verify
# against its definition.
for x, y in val_data_single.take(3):
    past_temperature = x[0][:, 1].numpy()
    true_future = y[0].numpy()
    predicted_future = single_step_model.predict(x)[0]
    plot = show_plot([past_temperature, true_future, predicted_future], 12,
                     'Single Step Prediction')
    plot.show()