1. 数据预处理及数据读取
1.1 数据格式
文件格式:CSV
特点:
第一列:date(标准时间,如 2002-01-01 00:00:00)
最后一列:OT(用于预测单变量)
其他列:数据的其他特征
如下表:(需要将第一列和最后一列分别命名为date和OT)
可用以下脚本转换时间格式:
import pandas as pd


def normalize_date_column(df, column='date', source_format='%Y/%m/%d %H:%M'):
    """Reformat *column* from *source_format* to 'YYYY-MM-DD HH:MM:SS' strings.

    Parses first so malformed rows fail loudly instead of being copied
    through silently. Returns the (mutated) DataFrame for chaining.
    """
    df[column] = pd.to_datetime(df[column], format=source_format)
    df[column] = df[column].dt.strftime('%Y-%m-%d %H:%M:%S')
    return df


if __name__ == '__main__':
    df = pd.read_csv("a.csv")
    # The time column must be named 'date' (the original snippet referenced
    # 'data', which contradicts the required CSV layout described above).
    df = normalize_date_column(df)
    print(df)
    # NOTE: to_csv returns None — do not rebind df to its result
    # (the original did, so the second print showed None).
    df.to_csv('b.csv', index=False)
    print(df)
1.2 数据读取
入口:exp/exp_main.py:class Exp_Main: _get_data(self, flag) # flag 分别为 'train'、'val'和 'test'。
data_set, data_loader = data_provider(self.args, flag) # data_set没用,我觉得应该修改为:只返回 data_loader 即可。
def data_provider(args, flag):
    """Build the dataset and DataLoader for one split.

    flag is 'train', 'val', 'test' or 'pred'; returns (data_set, data_loader).
    """
    dataset_cls = data_dict[args.data]  # e.g. 'custom' -> Dataset_Custom
    # 'timeF' embedding uses continuous time features; anything else uses
    # integer calendar encodings.
    timeenc = 1 if args.embed == 'timeF' else 0

    if flag == 'pred':
        # Prediction: one window at a time, in order, keep the tail batch.
        dataset_cls = Dataset_Pred
        shuffle_flag, drop_last, batch_size = False, False, 1
    elif flag == 'test':
        # Evaluation: keep order, drop the ragged final batch.
        shuffle_flag, drop_last, batch_size = False, True, args.batch_size
    else:
        # Training / validation: shuffle.
        shuffle_flag, drop_last, batch_size = True, True, args.batch_size
    freq = args.freq

    data_set = dataset_cls(
        root_path=args.root_path,
        data_path=args.data_path,
        flag=flag,
        size=[args.seq_len, args.label_len, args.pred_len],
        features=args.features,
        target=args.target,
        timeenc=timeenc,
        freq=freq,
    )
    print(flag, len(data_set))
    data_loader = DataLoader(
        data_set,
        batch_size=batch_size,
        shuffle=shuffle_flag,
        num_workers=args.num_workers,
        drop_last=drop_last,
    )
    return data_set, data_loader
自定义的 Dataset_Custom类, 继承父类:Dataset,重载两个私有成员函数__len__、__getitem__。
class Dataset_Custom(Dataset):
    """Sliding-window dataset over a CSV shaped ['date', ...features..., target].

    Splits the series chronologically into train/val/test (70%/10%/20%) and
    yields (seq_x, seq_y, seq_x_mark, seq_y_mark) windows of lengths
    seq_len / label_len + pred_len.
    """

    def __init__(self, root_path, flag='train', size=None,
                 features='S', data_path='ETTh1.csv',
                 target='OT', scale=True, timeenc=0, freq='h'):
        # size == [seq_len, label_len, pred_len]; defaults match the
        # ETT-hourly setup (16 days / 4 days / 4 days of hourly points).
        if size is None:
            self.seq_len = 24 * 4 * 4
            self.label_len = 24 * 4
            self.pred_len = 24 * 4
        else:
            self.seq_len, self.label_len, self.pred_len = size

        assert flag in ['train', 'test', 'val']
        type_map = {'train': 0, 'val': 1, 'test': 2}
        self.set_type = type_map[flag]

        self.features = features
        self.target = target
        self.scale = scale
        self.timeenc = timeenc
        self.freq = freq

        self.root_path = root_path
        self.data_path = data_path
        self.__read_data__()

    def __read_data__(self):
        # Standardize to zero mean / unit variance; fitted on train rows only.
        self.scaler = StandardScaler()
        df_raw = pd.read_csv(os.path.join(self.root_path, self.data_path))

        # Reorder columns to ['date', ...other features..., target] so the
        # target is always last, regardless of the CSV's column order.
        cols = list(df_raw.columns)
        cols.remove(self.target)
        cols.remove('date')
        df_raw = df_raw[['date'] + cols + [self.target]]

        # Chronological 70/10/20 split. Non-train splits start seq_len rows
        # early so their first window already has a full history. NOTE(review):
        # this means val/test windows overlap the preceding split's rows,
        # which can make val loss look optimistic relative to test loss.
        num_train = int(len(df_raw) * 0.7)
        num_test = int(len(df_raw) * 0.2)
        num_vali = len(df_raw) - num_train - num_test
        border1s = [0, num_train - self.seq_len, len(df_raw) - num_test - self.seq_len]
        border2s = [num_train, num_train + num_vali, len(df_raw)]
        border1 = border1s[self.set_type]
        border2 = border2s[self.set_type]

        if self.features == 'M' or self.features == 'MS':
            # Multivariate: every column except the date column.
            df_data = df_raw[df_raw.columns[1:]]
        elif self.features == 'S':
            # Univariate: target column only.
            df_data = df_raw[[self.target]]
        else:
            # Previously an unknown mode left df_data unbound (NameError later).
            raise ValueError(f"unsupported features mode: {self.features!r}")

        if self.scale:
            # Fit on the train slice only to avoid leaking val/test statistics.
            train_data = df_data[border1s[0]:border2s[0]]
            self.scaler.fit(train_data.values)
            data = self.scaler.transform(df_data.values)
        else:
            data = df_data.values

        # Timestamps of the current split only.
        df_stamp = df_raw[['date']][border1:border2]
        df_stamp['date'] = pd.to_datetime(df_stamp.date)
        if self.timeenc == 0:
            # Integer calendar features (used when args.embed != 'timeF').
            # Vectorized .dt accessors replace the old Series.apply(fn, 1)
            # calls, whose positional second argument hit the removed
            # convert_dtype parameter on modern pandas.
            df_stamp['month'] = df_stamp['date'].dt.month
            df_stamp['day'] = df_stamp['date'].dt.day
            df_stamp['weekday'] = df_stamp['date'].dt.weekday
            df_stamp['hour'] = df_stamp['date'].dt.hour
            # drop(columns=...): the positional axis argument was removed in
            # pandas 2.0, so drop(['date'], 1) raises TypeError there.
            data_stamp = df_stamp.drop(columns=['date']).values
        else:
            # timeenc == 1, i.e. args.embed == 'timeF': continuous features
            # decoded from the timestamp according to self.freq.
            data_stamp = time_features(pd.to_datetime(df_stamp['date'].values),
                                       freq=self.freq)
            data_stamp = data_stamp.transpose(1, 0)

        self.data_x = data[border1:border2]
        self.data_y = data[border1:border2]
        self.data_stamp = data_stamp

    def __getitem__(self, index):
        # Encoder input covers [index, index + seq_len); the decoder target is
        # the last label_len input steps followed by pred_len future steps.
        s_begin = index
        s_end = s_begin + self.seq_len
        r_begin = s_end - self.label_len
        r_end = r_begin + self.label_len + self.pred_len

        seq_x = self.data_x[s_begin:s_end]
        seq_y = self.data_y[r_begin:r_end]
        seq_x_mark = self.data_stamp[s_begin:s_end]
        seq_y_mark = self.data_stamp[r_begin:r_end]
        return seq_x, seq_y, seq_x_mark, seq_y_mark

    def __len__(self):
        # Number of start indices that still leave room for a complete
        # seq_len + pred_len window.
        return len(self.data_x) - self.seq_len - self.pred_len + 1

    def inverse_transform(self, data):
        """Map standardized values back to the original scale."""
        return self.scaler.inverse_transform(data)
def time_features_from_frequency_str(freq_str: str) -> List[TimeFeature]:
    """
    Return the time features appropriate for the given frequency string.

    Parameters
    ----------
    freq_str
        Frequency string of the form [multiple][granularity] such as "12H",
        "5min", "1D" etc.
    """
    # Ordered (offset type, feature classes) pairs; the first isinstance
    # match wins, so coarser granularities come before finer ones.
    feature_classes_by_offset = (
        (offsets.YearEnd, []),
        (offsets.QuarterEnd, [MonthOfYear]),
        (offsets.MonthEnd, [MonthOfYear]),
        (offsets.Week, [DayOfMonth, WeekOfYear]),
        (offsets.Day, [DayOfWeek, DayOfMonth, DayOfYear]),
        (offsets.BusinessDay, [DayOfWeek, DayOfMonth, DayOfYear]),
        # 'h' -> 4 features
        (offsets.Hour, [HourOfDay, DayOfWeek, DayOfMonth, DayOfYear]),
        # 't'/'min' -> 5 features
        (offsets.Minute,
         [MinuteOfHour, HourOfDay, DayOfWeek, DayOfMonth, DayOfYear]),
        (offsets.Second,
         [SecondOfMinute, MinuteOfHour, HourOfDay, DayOfWeek, DayOfMonth,
          DayOfYear]),
    )
    offset = to_offset(freq_str)
    for offset_type, feature_classes in feature_classes_by_offset:
        if isinstance(offset, offset_type):
            return [cls() for cls in feature_classes]

    supported_freq_msg = f"""
    Unsupported frequency {freq_str}
    The following frequencies are supported:
        Y - yearly
            alias: A
        M - monthly
        W - weekly
        D - daily
        B - business days
        H - hourly
        T - minutely
            alias: min
        S - secondly
    """
    raise RuntimeError(supported_freq_msg)
因此,Autoformer 把时间戳根据传入的freq来解析,比如freq='h',那一个时间就解析为4个值。
for i, (batch_x, batch_y, batch_x_mark, batch_y_mark) in enumerate(train_loader):
那么输入的数据shape为:
# freq = 'h'
batch_x: batch_size, seq_len, d_in
batch_x_mark: batch_size, seq_len, 4
batch_y: batch_size, label_len+pred_len, d_in
batch_y_mark: batch_size, label_len+pred_len, 4
为什么要编码时间戳作为输入?
因为真实世界场景中,时间戳通常可以获得,并且信息丰富。但是Transformer 模型中很少使用, Informer 模型首先将时间戳编码为位置编码,然后使用一个嵌入层进行编码。后面的 Autoformer 模型和 FEDformer 模型采用了相同的方案。