The Transformer is a neural-network architecture built on the self-attention mechanism (self-attention mechanism), originally designed for natural language processing (NLP) tasks such as machine translation, text summarization, and question answering. At its core the Transformer encodes and decodes input features; we will not dwell on the theory here, and instead walk through a complete example of applying the Transformer to time-series forecasting.
Our data are the monthly consumption figures of a certain spare part, and the goal is to forecast its consumption over the next six months. The time granularity is one month; the data are illustrated in the figure below.
As shown in the figure above, the blue portion is the training data and the red portion is the test data. The first step is to create the Transformer model.
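Before building the model, the series has to be wrapped in GluonTS format. A minimal sketch, assuming the monthly series lives in a pandas DataFrame df with hypothetical columns "month" and "usage" (names not from the original), holding out the final six months as the test horizon:

from gluonts.dataset.common import ListDataset

freq = "1M"             # monthly frequency
prediction_length = 6   # forecast horizon: six months

# A single series; "feat_static_cat" carries the series ID (just 0 here).
train_dataset = ListDataset(
    [
        {
            "start": df["month"].iloc[0],
            "target": df["usage"].values[:-prediction_length],
            "feat_static_cat": [0],
        }
    ],
    freq=freq,
)
test_dataset = ListDataset(
    [
        {
            "start": df["month"].iloc[0],
            "target": df["usage"].values,  # full series, incl. the test months
            "feat_static_cat": [0],
        }
    ],
    freq=freq,
)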
# Model creation
from transformers import TimeSeriesTransformerConfig, TimeSeriesTransformerForPrediction
from gluonts.time_feature import get_lags_for_frequency, time_features_from_frequency_str

lags_sequence = get_lags_for_frequency(freq)             # default lag indices for monthly data
time_features = time_features_from_frequency_str(freq)   # calendar features, e.g. month-of-year

config = TimeSeriesTransformerConfig(
    prediction_length=prediction_length,
    context_length=prediction_length * 2,        # look back twice the forecast horizon
    lags_sequence=lags_sequence,
    num_time_features=len(time_features) + 1,    # +1 for the "age" feature added below
    num_static_categorical_features=1,           # a single series-ID feature
    # num_static_real_features=1,
    cardinality=[len(train_dataset)],            # number of distinct time series
    embedding_dimension=[2],                     # embedding size for the series-ID feature
    encoder_layers=4,
    decoder_layers=4,
    d_model=22,
)
model = TimeSeriesTransformerForPrediction(config)
Next, we pull in the time-feature utilities and data transformations from GluonTS:
from gluonts.time_feature import (
    time_features_from_frequency_str,
    TimeFeature,
    get_lags_for_frequency,
)
from gluonts.dataset.field_names import FieldName
from gluonts.transform import (
    AddAgeFeature,
    AddObservedValuesIndicator,
    AddTimeFeatures,
    AsNumpyArray,
    Chain,
    ExpectedNumInstanceSampler,
    InstanceSplitter,
    RemoveFields,
    SelectFields,
    SetField,
    TestSplitSampler,
    Transformation,
    ValidationSplitSampler,
    VstackFeatures,
    RenameFields,
)
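The data loaders below rely on two helper functions, create_transformation and create_instance_splitter, which the code references but does not show. A condensed sketch of both, modeled on the official HuggingFace time-series tutorial (the exact transformation chain in the original run may differ):

from transformers import PretrainedConfig

def create_transformation(freq: str, config: PretrainedConfig) -> Transformation:
    # Chain of GluonTS transforms: convert arrays, mark observed values,
    # add calendar and "age" features, stack them, and rename fields to
    # the names the HuggingFace model expects.
    return Chain(
        [
            AsNumpyArray(
                field=FieldName.FEAT_STATIC_CAT, expected_ndim=1, dtype=int
            ),
            AsNumpyArray(field=FieldName.TARGET, expected_ndim=1),
            AddObservedValuesIndicator(
                target_field=FieldName.TARGET,
                output_field=FieldName.OBSERVED_VALUES,
            ),
            AddTimeFeatures(
                start_field=FieldName.START,
                target_field=FieldName.TARGET,
                output_field=FieldName.FEAT_TIME,
                time_features=time_features_from_frequency_str(freq),
                pred_length=config.prediction_length,
            ),
            AddAgeFeature(
                target_field=FieldName.TARGET,
                output_field=FieldName.FEAT_AGE,
                pred_length=config.prediction_length,
                log_scale=True,
            ),
            VstackFeatures(
                output_field=FieldName.FEAT_TIME,
                input_fields=[FieldName.FEAT_TIME, FieldName.FEAT_AGE],
            ),
            RenameFields(
                mapping={
                    FieldName.FEAT_STATIC_CAT: "static_categorical_features",
                    FieldName.FEAT_TIME: "time_features",
                    FieldName.TARGET: "values",
                    FieldName.OBSERVED_VALUES: "observed_mask",
                }
            ),
        ]
    )

def create_instance_splitter(config: PretrainedConfig, mode: str) -> Transformation:
    # Pick a window sampler per mode: random windows for training,
    # the last window of each series for validation/test.
    assert mode in ["train", "validation", "test"]
    instance_sampler = {
        "train": ExpectedNumInstanceSampler(
            num_instances=1.0, min_future=config.prediction_length
        ),
        "validation": ValidationSplitSampler(min_future=config.prediction_length),
        "test": TestSplitSampler(),
    }[mode]
    return InstanceSplitter(
        target_field="values",
        is_pad_field=FieldName.IS_PAD,
        start_field=FieldName.START,
        forecast_start_field=FieldName.FORECAST_START,
        instance_sampler=instance_sampler,
        past_length=config.context_length + max(config.lags_sequence),
        future_length=config.prediction_length,
        time_series_fields=["time_features", "observed_mask"],
    )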
Next, we build the data loaders for the training, test, and validation samples; the training and test loaders are shown below.
from gluonts.itertools import Cyclic, IterableSlice, PseudoShuffled
from gluonts.torch.util import IterableDataset
from torch.utils.data import DataLoader
from transformers import PretrainedConfig
from typing import Iterable, Optional

def create_train_dataloader(
    config: PretrainedConfig,
    freq,
    data,
    batch_size: int,
    num_batches_per_epoch: int,
    shuffle_buffer_length: Optional[int] = None,
    **kwargs,
) -> Iterable:
    PREDICTION_INPUT_NAMES = [
        "past_time_features",
        "past_values",
        "past_observed_mask",
        "future_time_features",
    ]
    if config.num_static_categorical_features > 0:
        PREDICTION_INPUT_NAMES.append("static_categorical_features")
    if config.num_static_real_features > 0:
        PREDICTION_INPUT_NAMES.append("static_real_features")

    TRAINING_INPUT_NAMES = PREDICTION_INPUT_NAMES + [
        "future_values",
        "future_observed_mask",
    ]

    transformation = create_transformation(freq, config)
    transformed_data = transformation.apply(data, is_train=True)

    # Randomly sample training windows and keep only the model's input fields.
    instance_splitter = create_instance_splitter(config, "train") + SelectFields(
        TRAINING_INPUT_NAMES
    )

    # Cycle over the data indefinitely; optionally pseudo-shuffle with a buffer.
    training_instances = instance_splitter.apply(
        Cyclic(transformed_data)
        if shuffle_buffer_length is None
        else PseudoShuffled(
            Cyclic(transformed_data),
            shuffle_buffer_length=shuffle_buffer_length,
        )
    )

    # Cap each epoch at num_batches_per_epoch batches from the infinite stream.
    return IterableSlice(
        iter(
            DataLoader(
                IterableDataset(training_instances),
                batch_size=batch_size,
                **kwargs,
            )
        ),
        num_batches_per_epoch,
    )
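With this helper we can create the training loader used in the training loop further down. The batch size and number of batches per epoch below are illustrative choices, not values from the original run:

train_dataloader = create_train_dataloader(
    config=config,
    freq=freq,
    data=train_dataset,
    batch_size=256,
    num_batches_per_epoch=100,
)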
# Test data loader
def create_test_dataloader(
    config: PretrainedConfig,
    freq,
    data,
    batch_size: int,
    **kwargs,
):
    PREDICTION_INPUT_NAMES = [
        "past_time_features",
        "past_values",
        "past_observed_mask",
        "future_time_features",
    ]
    if config.num_static_categorical_features > 0:
        PREDICTION_INPUT_NAMES.append("static_categorical_features")
    if config.num_static_real_features > 0:
        PREDICTION_INPUT_NAMES.append("static_real_features")

    transformation = create_transformation(freq, config)
    transformed_data = transformation.apply(data, is_train=False)

    # At test time, take the last context window of each series.
    instance_sampler = create_instance_splitter(config, "test") + SelectFields(
        PREDICTION_INPUT_NAMES
    )
    testing_instances = instance_sampler.apply(transformed_data, is_train=False)

    return DataLoader(
        IterableDataset(testing_instances), batch_size=batch_size, **kwargs
    )
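And the corresponding test loader (again, the batch size is an illustrative choice):

test_dataloader = create_test_dataloader(
    config=config,
    freq=freq,
    data=test_dataset,
    batch_size=64,
)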
Forward pass. During training the model receives both the past and the future values of each window, and it returns the loss directly (the negative log-likelihood of the predicted distribution), so no separate loss function is needed.
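To sanity-check one forward pass, pull a single batch from the training loader built above (assuming the train_dataloader from the earlier sketch):

batch = next(iter(train_dataloader))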
outputs = model(
    past_values=batch["past_values"],
    past_time_features=batch["past_time_features"],
    past_observed_mask=batch["past_observed_mask"],
    static_categorical_features=batch["static_categorical_features"]
    if config.num_static_categorical_features > 0
    else None,
    static_real_features=batch["static_real_features"]
    if config.num_static_real_features > 0
    else None,
    future_values=batch["future_values"],
    future_time_features=batch["future_time_features"],
    future_observed_mask=batch["future_observed_mask"],
    output_hidden_states=True,
)
Next, we train the model:
from accelerate import Accelerator
from torch.optim import AdamW

accelerator = Accelerator()
device = accelerator.device

model.to(device)
optimizer = AdamW(model.parameters(), lr=6e-4, betas=(0.9, 0.95), weight_decay=1e-1)

model, optimizer, train_dataloader = accelerator.prepare(
    model,
    optimizer,
    train_dataloader,
)

model.train()
for epoch in range(40):
    for idx, batch in enumerate(train_dataloader):
        optimizer.zero_grad()
        outputs = model(
            static_categorical_features=batch["static_categorical_features"].to(device)
            if config.num_static_categorical_features > 0
            else None,
            static_real_features=batch["static_real_features"].to(device)
            if config.num_static_real_features > 0
            else None,
            past_time_features=batch["past_time_features"].to(device),
            past_values=batch["past_values"].to(device),
            future_time_features=batch["future_time_features"].to(device),
            future_values=batch["future_values"].to(device),
            past_observed_mask=batch["past_observed_mask"].to(device),
            future_observed_mask=batch["future_observed_mask"].to(device),
        )
        # The model returns the negative log-likelihood as its training loss.
        loss = outputs.loss
        accelerator.backward(loss)
        optimizer.step()
        if idx % 100 == 0:
            print(loss.item())
The Transformer's prediction is not a single value but a set of sample paths over the forecast horizon; the final figure is then derived from these samples according to the specific business requirement. The figure below shows the predicted values for one of the samples in the results. The analysis suggests that, after training, the Transformer is able to reproduce the variation pattern of the series reasonably well.
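For completeness, here is how the sample paths can be generated and collapsed into a point forecast. model.generate returns num_parallel_samples paths per series, and taking the median across samples is one common choice; a sketch assuming the test_dataloader built above:

import numpy as np
import torch

model.eval()
forecasts = []
with torch.no_grad():
    for batch in test_dataloader:
        outputs = model.generate(
            static_categorical_features=batch["static_categorical_features"].to(device)
            if config.num_static_categorical_features > 0
            else None,
            static_real_features=batch["static_real_features"].to(device)
            if config.num_static_real_features > 0
            else None,
            past_time_features=batch["past_time_features"].to(device),
            past_values=batch["past_values"].to(device),
            future_time_features=batch["future_time_features"].to(device),
            past_observed_mask=batch["past_observed_mask"].to(device),
        )
        # Shape: (batch, num_parallel_samples, prediction_length)
        forecasts.append(outputs.sequences.cpu().numpy())

forecasts = np.vstack(forecasts)
point_forecast = np.median(forecasts, axis=1)  # one value per future month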