Deep Factors代码实现
Deep Factors,包括前面介绍的DeepAR、MQRNN都适合对批量时间序列进行建模预测,为了验证这个优势,本次随机生成了2条时间序列数据,当然序列数量可以更多,但尽量保证一批建模的序列有一定的相关性,比如rate型的数据就不合适和count型的数据一起建模。
数据处理
目录结构:
在MQRNN代码实现那一章节,输入模型的数据是转化成npy的格式,但在数据量很大的时候,读tfrecord形式的数据效率会更高,本次就将数据转化成tfrecord的形式分批入到模型。
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from multiprocessing import Pool
from multiprocessing import cpu_count
class Tfrecord_process():
def __init__(self, tf_train_dir="tf_train", tf_test_dir="tf_test"):
    """Configure window sizes and TFRecord output directories.

    Args:
        tf_train_dir: directory where training TFRecord files are written.
        tf_test_dir: directory where test TFRecord files are written.

    NOTE(review): `generator_fn` reads `self.tf_train_dir` and
    `self.tf_test_dir`, but no visible code ever assigned them, so calling
    `generator_fn` raised AttributeError.  They are now initialised here,
    as keyword parameters with defaults, which keeps the original
    zero-argument construction working.
    """
    # Stride (rows) between consecutive window end points.
    self.train_lag_step = 400
    self.test_lag_step = 30
    # History (encoder) length and forecast horizon of each window.
    self.time_step = 400
    self.prediction_len = 30
    # Columns per row: feature_num covariates + embedding id + target
    # (4 + 1 + 1 = 6, matching the 0:-2 / -2 / -1 slices downstream).
    self.time_series_dim = 6
    self.feature_num = 4
    self.tf_train_dir = tf_train_dir
    self.tf_test_dir = tf_test_dir
def generator_fn(self, tuple_list):
    """Split one series 70/30 into train/test windows and write TFRecords.

    Args:
        tuple_list: `(file_name, dataframe)` pair; the ".csv" suffix is
            stripped from the file name to form the TFRecord file name.
    """
    file_name, frame = tuple_list[0], tuple_list[1]
    series_name = file_name.replace(".csv", "")
    total_rows = frame.shape[0]
    n_train = int(total_rows * 0.7)
    n_test = int(total_rows * 0.3)
    # Training split: the leading 70% of the rows.
    train_windows = self.generator_npy_data(frame[:n_train], self.train_lag_step)
    self.generator_tfrecord(train_windows, series_name, self.tf_train_dir)
    # Test split starts `time_step` rows early so the first test window
    # still has a full history segment to condition on.
    test_frame = frame[(n_train - self.time_step):(n_train + n_test)]
    test_windows = self.generator_npy_data(test_frame, self.test_lag_step)
    self.generator_tfrecord(test_windows, series_name, self.tf_test_dir)
def generator_npy_data(self, df, lag_step):
    """Slice `df` into overlapping windows of history + horizon rows.

    Windows end at row offsets `data_len, data_len + lag_step, ...`; one
    extra window covering the final `data_len` rows is appended whenever
    the strided windows do not already end exactly at the last row, so
    trailing rows are never dropped.

    Args:
        df: frame-like of shape (rows, time_series_dim); assumes
            rows >= time_step + prediction_len — TODO confirm upstream.
        lag_step: stride (in rows) between consecutive window end points.

    Returns:
        np.ndarray of shape
        (num_windows, time_step + prediction_len, time_series_dim).
    """
    data_len = self.time_step + self.prediction_len
    # BUG FIX: the original compared len(step_list) against the *true*
    # division `(rows - data_len) / lag_step + 1`.  Under Python 3 that
    # equality can never hold (len(range(...)) is the ceiling of the
    # quotient, always strictly smaller), so the first branch was dead;
    # under Python 2's integer division it would skip the tail window and
    # silently drop the last rows.  Build the window end points explicitly
    # instead, allowing an end at exactly `df.shape[0]`.
    ends = list(range(data_len, df.shape[0] + 1, lag_step))
    if not ends or ends[-1] != df.shape[0]:
        # Strided windows stop short of the last row: add a tail window.
        ends.append(df.shape[0])
    data_np = np.zeros((len(ends), data_len, self.time_series_dim))
    for index, end in enumerate(ends):
        data_np[index, :, :] = np.array(df[end - data_len:end]).reshape(
            (data_len, self.time_series_dim))
    return data_np
def generator_tfrecord(self, data_np,ts_name,tf_dir):
writer = tf.python_io.TFRecordWriter(os.path.join(tf_dir, ts_name))
for k in range(data_np.shape[0]):
features = {
}
features['time_step'] = tf.train.Feature(int64_list=tf.train.Int64List(value=[self.time_step]))
features['prediction_len'] = tf.train.Feature(int64_list=tf.train.Int64List(value=[self.prediction_len]))
history_feature = data_np[k, :self.time_step, 0:-2].reshape(-1, self.feature_num).astype(np.int64)
features['feature_shape'] = tf.train.Feature(
int64_list=tf.train.Int64List(value=history_feature.shape))
features['feature'] = tf.train.Feature(
int64_list=tf.train.Int64List(value=history_feature.reshape(-1)))
future_feature = data_np[k, self.time_step:self.time_step + self.prediction_len, 0:-2].reshape(-1,self.feature_num).astype(np.int64)
features['future_feature_shape'] = tf.train.Feature(
int64_list=tf.train.Int64List(value=future_feature.shape))
features['future_feature'] = tf.train.Feature(
int64_list=tf.train.Int64List(value=future_feature.reshape(-1)))
embedding_id = data_np[k, :self.time_step, -2].reshape(-1).astype(np.int64)
features['embedding_id'] = tf.train.Feature(int64_list=tf.train.Int64List(value=embedding_id))
y = data_np[k, :self.time_step + self.prediction_len, -1].reshape(-1