Based on the open-source time series analysis library
https://github.com/thuml/Time-Series-Library — today we study its dataset-construction part, explaining the core code blocks line by line.
0. Source code
The source mainly consists of the following (only the custom dataset class is covered here):
1. Definition of the dataset class
import os
import numpy as np
import pandas as pd
import glob
import re
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler
from utils.timefeatures import time_features
from data_provider.m4 import M4Dataset, M4Meta
from data_provider.uea import subsample, interpolate_missing, Normalizer
from sktime.datasets import load_from_tsfile_to_dataframe
import warnings

warnings.filterwarnings('ignore')


class Dataset_Custom(Dataset):
    def __init__(self, root_path, flag='train', size=None,
                 features='S', data_path='ETTh1.csv',
                 target='OT', scale=True, timeenc=0, freq='h', seasonal_patterns=None):
        # size [seq_len, label_len, pred_len]
        # info
        if size == None:
            self.seq_len = 24 * 4 * 4
            self.label_len = 24 * 4
            self.pred_len = 24 * 4
        else:
            self.seq_len = size[0]
            self.label_len = size[1]
            self.pred_len = size[2]
        # init
        assert flag in ['train', 'test', 'val']
        type_map = {'train': 0, 'val': 1, 'test': 2}
        self.set_type = type_map[flag]

        self.features = features
        self.target = target
        self.scale = scale
        self.timeenc = timeenc
        self.freq = freq

        self.root_path = root_path
        self.data_path = data_path
        self.__read_data__()

    def __read_data__(self):
        self.scaler = StandardScaler()
        df_raw = pd.read_csv(os.path.join(self.root_path,
                                          self.data_path))
        '''
        df_raw.columns: ['date', ...(other features), target feature]
        '''
        cols = list(df_raw.columns)
        cols.remove(self.target)
        cols.remove('date')
        df_raw = df_raw[['date'] + cols + [self.target]]
        num_train = int(len(df_raw) * 0.7)
        num_test = int(len(df_raw) * 0.2)
        num_vali = len(df_raw) - num_train - num_test
        border1s = [0, num_train - self.seq_len, len(df_raw) - num_test - self.seq_len]
        border2s = [num_train, num_train + num_vali, len(df_raw)]
        border1 = border1s[self.set_type]
        border2 = border2s[self.set_type]

        if self.features == 'M' or self.features == 'MS':
            cols_data = df_raw.columns[1:]
            df_data = df_raw[cols_data]
        elif self.features == 'S':
            df_data = df_raw[[self.target]]

        if self.scale:
            train_data = df_data[border1s[0]:border2s[0]]
            self.scaler.fit(train_data.values)
            data = self.scaler.transform(df_data.values)
        else:
            data = df_data.values

        df_stamp = df_raw[['date']][border1:border2]
        df_stamp['date'] = pd.to_datetime(df_stamp.date)
        if self.timeenc == 0:
            df_stamp['month'] = df_stamp.date.apply(lambda row: row.month, 1)
            df_stamp['day'] = df_stamp.date.apply(lambda row: row.day, 1)
            df_stamp['weekday'] = df_stamp.date.apply(lambda row: row.weekday(), 1)
            df_stamp['hour'] = df_stamp.date.apply(lambda row: row.hour, 1)
            data_stamp = df_stamp.drop(['date'], 1).values
        elif self.timeenc == 1:
            data_stamp = time_features(pd.to_datetime(df_stamp['date'].values), freq=self.freq)
            data_stamp = data_stamp.transpose(1, 0)

        self.data_x = data[border1:border2]
        self.data_y = data[border1:border2]
        self.data_stamp = data_stamp

    def __getitem__(self, index):
        s_begin = index
        s_end = s_begin + self.seq_len
        r_begin = s_end - self.label_len
        r_end = r_begin + self.label_len + self.pred_len

        seq_x = self.data_x[s_begin:s_end]
        seq_y = self.data_y[r_begin:r_end]
        seq_x_mark = self.data_stamp[s_begin:s_end]
        seq_y_mark = self.data_stamp[r_begin:r_end]

        return seq_x, seq_y, seq_x_mark, seq_y_mark

    def __len__(self):
        return len(self.data_x) - self.seq_len - self.pred_len + 1

    def inverse_transform(self, data):
        return self.scaler.inverse_transform(data)
2. Building datasets on top of the dataset class
from data_provider.data_loader import Dataset_ETT_hour, Dataset_ETT_minute, Dataset_Custom, Dataset_M4, PSMSegLoader, \
    MSLSegLoader, SMAPSegLoader, SMDSegLoader, SWATSegLoader, UEAloader
from data_provider.uea import collate_fn
from torch.utils.data import DataLoader

data_dict = {
    'ETTh1': Dataset_ETT_hour,
    'ETTh2': Dataset_ETT_hour,
    'ETTm1': Dataset_ETT_minute,
    'ETTm2': Dataset_ETT_minute,
    'custom': Dataset_Custom,
    'm4': Dataset_M4,
    'PSM': PSMSegLoader,
    'MSL': MSLSegLoader,
    'SMAP': SMAPSegLoader,
    'SMD': SMDSegLoader,
    'SWAT': SWATSegLoader,
    'UEA': UEAloader
}


def data_provider(args, flag):
    Data = data_dict[args.data]
    timeenc = 0 if args.embed != 'timeF' else 1

    if flag == 'test':
        shuffle_flag = False
        drop_last = True
        if args.task_name == 'anomaly_detection' or args.task_name == 'classification':
            batch_size = args.batch_size
        else:
            batch_size = 1  # bsz=1 for evaluation
        freq = args.freq
    else:
        shuffle_flag = True
        drop_last = True
        batch_size = args.batch_size  # bsz for train and valid
        freq = args.freq

    if args.task_name == 'anomaly_detection':
        drop_last = False
        data_set = Data(
            root_path=args.root_path,
            win_size=args.seq_len,
            flag=flag,
        )
        print(flag, len(data_set))
        data_loader = DataLoader(
            data_set,
            batch_size=batch_size,
            shuffle=shuffle_flag,
            num_workers=args.num_workers,
            drop_last=drop_last)
        return data_set, data_loader
    elif args.task_name == 'classification':
        drop_last = False
        data_set = Data(
            root_path=args.root_path,
            flag=flag,
        )
        data_loader = DataLoader(
            data_set,
            batch_size=batch_size,
            shuffle=shuffle_flag,
            num_workers=args.num_workers,
            drop_last=drop_last,
            collate_fn=lambda x: collate_fn(x, max_len=args.seq_len)
        )
        return data_set, data_loader
    else:
        if args.data == 'm4':
            drop_last = False
        data_set = Data(
            root_path=args.root_path,
            data_path=args.data_path,
            flag=flag,
            size=[args.seq_len, args.label_len, args.pred_len],
            features=args.features,
            target=args.target,
            timeenc=timeenc,
            freq=freq,
            seasonal_patterns=args.seasonal_patterns
        )
        print(flag, len(data_set))
        data_loader = DataLoader(
            data_set,
            batch_size=batch_size,
            shuffle=shuffle_flag,
            num_workers=args.num_workers,
            drop_last=drop_last)
        return data_set, data_loader
1. Analyzing the dataset class: Dataset_Custom
1.1 Initialization parameters
class Dataset_Custom(Dataset):
    def __init__(self, root_path, flag='train', size=None,
                 features='S', data_path='ETTh1.csv',
                 target='OT', scale=True, timeenc=0, freq='h', seasonal_patterns=None):
        # size = [seq_len, label_len, pred_len]
        if size == None:  # fall back to default window sizes
            self.seq_len = 24 * 4 * 4
            self.label_len = 24 * 4
            self.pred_len = 24 * 4
        else:
            self.seq_len = size[0]
            self.label_len = size[1]
            self.pred_len = size[2]
        # init
        assert flag in ['train', 'test', 'val']
        type_map = {'train': 0, 'val': 1, 'test': 2}
        self.set_type = type_map[flag]  # 0 / 1 / 2 selects the split

        self.features = features  # M / MS / S
        self.target = target      # name of the target column
        self.scale = scale        # whether to standardize the data
        self.timeenc = timeenc    # time-encoding variant (0 or 1)
        self.freq = freq          # sampling frequency, e.g. 'h'

        self.root_path = root_path
        self.data_path = data_path
        self.__read_data__()      # build the dataset immediately
The constructor declares every parameter needed to build the dataset and stores them as instance attributes in the object's self namespace.
The final line then calls the internal method __read_data__(), which actually loads and prepares the data.
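As a quick illustration of how these parameters fit together, the class can be instantiated directly. This is a minimal sketch; the directory, file name, and window sizes below are hypothetical:
# A minimal sketch (hypothetical paths and file name), assuming a CSV with a
# 'date' column and a target column named 'OT', sampled hourly.
dataset = Dataset_Custom(
    root_path='./data',       # hypothetical data directory
    data_path='custom.csv',   # hypothetical file name
    flag='train',             # one of 'train' / 'val' / 'test'
    size=[96, 48, 24],        # seq_len=96, label_len=48, pred_len=24
    features='S',             # univariate: use only the target column
    target='OT',
    scale=True,               # fit the StandardScaler on the train split
    timeenc=0, freq='h')
print(len(dataset))           # number of sliding windows in this split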
1.2 The data-processing method
def __read_data__(self):
    self.scaler = StandardScaler()  # instantiate the standardization object
    df_raw = pd.read_csv(os.path.join(self.root_path,
                                      self.data_path))  # read the raw data
    '''
    df_raw.columns: ['date', ...(other features), target feature]
    '''
    cols = list(df_raw.columns)  # get all column names
    cols.remove(self.target)  # remove the target column name
    cols.remove('date')  # remove the date column name
    df_raw = df_raw[['date'] + cols + [self.target]]  # reorder: date first, target last
    num_train = int(len(df_raw) * 0.7)  # size of the training set: 70%
    num_test = int(len(df_raw) * 0.2)  # size of the test set: 20%
    num_vali = len(df_raw) - num_train - num_test  # the remainder is the validation set
    border1s = [0, num_train - self.seq_len, len(df_raw) - num_test - self.seq_len]  # start index of each split
    border2s = [num_train, num_train + num_vali, len(df_raw)]  # end index of each split
    border1 = border1s[self.set_type]  # start index of the current split
    border2 = border2s[self.set_type]  # end index of the current split
    # Example: for the training set, self.set_type = 0, so the range is 0 to num_train.
    # The val/test starts are shifted back by seq_len so that the first window
    # of those splits still has a full input sequence of history.

    # Select columns according to the task (M: multivariate -> multivariate,
    # MS: multivariate -> univariate, S: univariate -> univariate)
    if self.features == 'M' or self.features == 'MS':
        cols_data = df_raw.columns[1:]
        df_data = df_raw[cols_data]
        # For M or MS every feature is used, i.e. all columns except the first ('date')
    elif self.features == 'S':
        df_data = df_raw[[self.target]]
        # For univariate S only the target series self.target is needed
    # All selected data now lives in df_data

    # Standardize if requested: fit the scaler on the training split only,
    # then transform the whole series with the training statistics
    if self.scale:
        train_data = df_data[border1s[0]:border2s[0]]
        self.scaler.fit(train_data.values)
        data = self.scaler.transform(df_data.values)
    else:
        data = df_data.values
    # All values are now stored in the variable data

    # The code below builds the time (position) encoding: it turns the raw
    # 'date' column, which a model cannot compute with, into numeric features
    # that express relations between timestamps. Not explained in detail here;
    # with timeenc == 0 each date simply becomes 4 numeric fields (data_stamp).
    df_stamp = df_raw[['date']][border1:border2]
    df_stamp['date'] = pd.to_datetime(df_stamp.date)
    if self.timeenc == 0:
        df_stamp['month'] = df_stamp.date.apply(lambda row: row.month, 1)
        df_stamp['day'] = df_stamp.date.apply(lambda row: row.day, 1)
        df_stamp['weekday'] = df_stamp.date.apply(lambda row: row.weekday(), 1)
        df_stamp['hour'] = df_stamp.date.apply(lambda row: row.hour, 1)
        data_stamp = df_stamp.drop(['date'], 1).values
    elif self.timeenc == 1:
        data_stamp = time_features(pd.to_datetime(df_stamp['date'].values), freq=self.freq)
        data_stamp = data_stamp.transpose(1, 0)

    # Store the split
    self.data_x = data[border1:border2]  # source of the model inputs
    self.data_y = data[border1:border2]  # source of the model targets
    self.data_stamp = data_stamp  # time-feature marks for this split
    # Note: self.data_x / self.data_y are not yet the final model inputs and
    # expected outputs; they only store the split. The actual (input, output)
    # pairs are produced by the __getitem__ method.
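To make the split arithmetic concrete, here is a small worked example; the numbers are illustrative, not from the source:
# Illustrative numbers only: a series of 1000 rows with seq_len = 96.
n = 1000
seq_len = 96
num_train = int(n * 0.7)             # 700
num_test = int(n * 0.2)              # 200
num_vali = n - num_train - num_test  # 100

border1s = [0, num_train - seq_len, n - num_test - seq_len]  # [0, 604, 704]
border2s = [num_train, num_train + num_vali, n]              # [700, 800, 1000]
# train: rows 0..700, val: rows 604..800, test: rows 704..1000.
# The val/test ranges reach back seq_len rows into the previous split so that
# their first prediction target still has full input history.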
1.3 The indexing method
def __getitem__(self, index):
    s_begin = index  # the input of sample `index` starts at position index
    s_end = s_begin + self.seq_len  # the input ends at index + seq_len
    r_begin = s_end - self.label_len  # the target sequence starts label_len steps before the input ends
    # On the meaning of label_len: suppose the model reads 5 steps and predicts
    # 1, with input [1 2 3 4 5]. Without label_len the expected output is just
    # [6]. In practice the target sequence is usually made to overlap the end
    # of the input: with label_len = 2 the target becomes the label part [4 5]
    # plus the expected prediction [6], i.e. [4 5 6].
    r_end = r_begin + self.label_len + self.pred_len  # the target ends at s_end + pred_len
    seq_x = self.data_x[s_begin:s_end]  # slice out the model input
    seq_y = self.data_y[r_begin:r_end]  # slice out the reference output
    seq_x_mark = self.data_stamp[s_begin:s_end]  # time-feature marks for the input window
    seq_y_mark = self.data_stamp[r_begin:r_end]  # time-feature marks for the prediction window
    return seq_x, seq_y, seq_x_mark, seq_y_mark
__getitem__(index) is a special method of the class: if the instantiated object is dataset, it is invoked explicitly via the indexing syntax dataset[index].
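A quick numeric check of the slicing, with illustrative values rather than values from the source:
# Illustrative values: seq_len = 96, label_len = 48, pred_len = 24, index = 0.
s_begin = 0
s_end = 0 + 96       # 96  -> seq_x = data_x[0:96]
r_begin = 96 - 48    # 48  -> the target overlaps the last 48 input steps
r_end = 48 + 48 + 24 # 120 -> seq_y = data_y[48:120]
# seq_y thus contains 48 known "label" steps followed by the 24 steps to predict.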
1.4 The length method and the inverse transform
def __len__(self):
    # Number of samples in this split. If the instantiated object is dataset,
    # it is invoked explicitly as len(dataset).
    return len(self.data_x) - self.seq_len - self.pred_len + 1

def inverse_transform(self, data):
    # Undo the standardization (map scaled values back to the original units)
    return self.scaler.inverse_transform(data)
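The "+ 1" counts the last full window; a quick sanity check with illustrative numbers:
# Illustrative numbers: a train split of 700 rows, seq_len = 96, pred_len = 24.
# The last valid index must satisfy index + seq_len + pred_len <= 700,
# i.e. index <= 580, so indices 0..580 give 700 - 96 - 24 + 1 = 581 windows.
print(700 - 96 - 24 + 1)  # 581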
2. The data provider function
Because this library covers a large number of application scenarios, the full function definition is fairly sprawling; this post only walks through dataset construction for the common forecasting case, simplified as follows:
from data_provider.data_loader import Dataset_ETT_hour, Dataset_ETT_minute, Dataset_Custom
from torch.utils.data import DataLoader

data_dict = {
    'ETTh1': Dataset_ETT_hour,
    'ETTh2': Dataset_ETT_hour,
    'ETTm1': Dataset_ETT_minute,
    'ETTm2': Dataset_ETT_minute,
    'custom': Dataset_Custom,
}


def data_provider(args, flag):
    Data = data_dict[args.data]  # look up the dataset class (not yet instantiated)
    timeenc = 0 if args.embed != 'timeF' else 1  # choose the time position embedding method

    if flag == 'test':  # test case
        shuffle_flag = False  # do not shuffle
        drop_last = False  # keep the last (possibly incomplete) batch
        batch_size = args.batch_size  # batch size
        freq = args.freq  # frequency parameter of the time position embedding
    else:
        shuffle_flag = True
        drop_last = True
        batch_size = args.batch_size
        freq = args.freq

    # instantiate the dataset class
    data_set = Data(
        root_path=args.root_path,  # data directory
        data_path=args.data_path,  # file name
        flag=flag,  # train / test / val
        size=[args.seq_len, args.label_len, args.pred_len],  # window sizes
        features=args.features,  # M / MS / S, multivariate or univariate
        target=args.target,  # target series
        timeenc=timeenc,  # time embedding method
        freq=freq  # frequency
    )
    print(flag, len(data_set))  # print the size of this split

    # wrap it in PyTorch's built-in DataLoader
    data_loader = DataLoader(
        data_set,  # the dataset instance
        batch_size=batch_size,  # batch size
        shuffle=shuffle_flag,  # whether to shuffle
        num_workers=args.num_workers,  # number of worker processes
        drop_last=drop_last)  # whether to drop the last incomplete batch
    # Both are returned: data_set is the dataset instance, while data_loader
    # is built on top of data_set and adds the batching and shuffling policy.
    return data_set, data_loader
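For completeness, a minimal sketch of how the function might be driven; every value below (paths, file name, hyperparameters) is a hypothetical placeholder, since in the library they come from the command-line argument parser:
from types import SimpleNamespace

# All values here are hypothetical stand-ins for the real argparse args.
args = SimpleNamespace(
    data='custom', embed='timeF',
    root_path='./data', data_path='custom.csv',
    seq_len=96, label_len=48, pred_len=24,
    features='M', target='OT', freq='h',
    batch_size=32, num_workers=0)

train_set, train_loader = data_provider(args, flag='train')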
Practical usage:
# get the data
train_data, train_loader = data_provider(self.args, flag='train')
# training loop over the dataset (excerpt)
for epoch in range(self.args.train_epochs):
    iter_count = 0
    train_loss = []
    self.model.train()
    epoch_time = time.time()
    for i, (batch_x, batch_y, batch_x_mark, batch_y_mark) in enumerate(train_loader):
        pass
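Each iteration yields the four arrays returned by __getitem__, stacked by the DataLoader along a new batch dimension. Assuming features='M' with C feature columns, the shapes are:
# Shapes per batch (assuming features='M' with C feature columns):
# batch_x:      (batch_size, seq_len, C)               -- encoder input
# batch_y:      (batch_size, label_len + pred_len, C)  -- labels + prediction targets
# batch_x_mark: (batch_size, seq_len, n_time_features) -- time marks of the input window
# batch_y_mark: (batch_size, label_len + pred_len, n_time_features)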