The goal of this example is to inject pre-trained sku embedding values into a Transformer model, in order to improve the Transformer's accuracy on the sales-forecasting task.
I. Training Data Formats
1. Sample record for embedding training:
133657,本田#第八代雅阁,1816,4
Field 1: sku_id
Field 2: car model # trim
Field 3: numeric id of the car model # trim combination
Field 4: category id of the sku_id
2. Sample record for sales-forecast training:
0053#031188,0_0_0_0_0_0_1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_1_0_0_0_0_0_0_0_0_0_0_0_1_0_0_0_0_0_1_0_0_0_0_0_0_0_0_1_0_0_0_0_0_0_0_0_0_0_0_1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0
Field 1: store code # sku_id
Field 2: weekly sales for each of the past N weeks
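For clarity, here is a minimal sketch of how one record of each format can be parsed (the variable names are illustrative, and the sales string is shortened for readability):
# Parse one embedding-training record: sku_id, car_model#trim, car_id, cat_id
record = '133657,本田#第八代雅阁,1816,4'
sku_id, car_model_trim, car_id, cat_id = record.split(',')
car_model, trim = car_model_trim.split('#')
print(sku_id, car_model, trim, int(car_id), int(cat_id))

# Parse one sales-forecast record: store_code#sku_id, weekly sales joined by '_'
record = '0053#031188,0_0_1_0'
series_name, sale_info = record.split(',')
weekly_sales = list(map(float, sale_info.split('_')))
print(series_name, weekly_sales)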
II. Embedding Model Stage
1. Input stage
(1) <sku_id, car_id> pairs; one sku can correspond to multiple car_ids.
(2) Build sku_car_matrix: the position for each matching car_id is set to 1, all other positions to 0.
(3) Build a mapping from sku_id to its row index (dim=0) in sku_car_matrix, so that a sku's embedding can later be fetched directly by that index, and persist the mapping.
(4) Wrap sku_car_matrix in a Dataset whose __getitem__() returns the pair <train_data[index], train_data[index]>, because the embedding is obtained through the round trip sku_car_matrix -> embedding -> sku_car_matrix.
(5) Load the Dataset into a DataLoader with shuffle=False; keeping the original row order lets each batch's embeddings be written back to the right rows by batch offset. A toy sketch of steps (1)-(4) follows.
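A toy sketch of the input stage with 3 skus and 4 car_ids (the values are made up; the real construction appears in embedding_model_train.py below):
import numpy as np

# step (1): <sku_id, car_id> pairs; sku 'A' fits two cars
pairs = [('A', 1), ('A', 3), ('B', 2), ('C', 4)]
# step (3): sku_id -> row index
sku2idx = {'A': 0, 'B': 1, 'C': 2}
# step (2): one-hot matrix, rows = skus, columns = car_ids
sku_car_matrix = np.zeros((3, 4), dtype='float32')
for sku, car_id in pairs:
    sku_car_matrix[sku2idx[sku], car_id - 1] = 1
# step (4): the autoencoder dataset returns (x, x) pairs
x = sku_car_matrix[sku2idx['A']]
print(x)  # [1. 0. 1. 0.]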
2. Model training stage
(1) An encoder-decoder (autoencoder) network architecture.
(2) Loss
MSELoss is computed between the decoder output and the label.
(3) During every batch the intermediate embedding is saved; concatenating the embeddings across all batches yields the full embedding matrix for the epoch. The embedding from the epoch with the lowest train_loss is taken as the best embedding and persisted.
(4) To sanity-check the embeddings, take a few skus and compare their pairwise Euclidean distances; similar skus should be closer (a sketch follows the code below).
(5) Implementation (embedding_model_train.py):
import os
import numpy as np
import pandas as pd
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
import logging
from tqdm import trange
import transformer_utils
logger = logging.getLogger('Transformer.Embedding')
class EmbeddingTrainDataset(Dataset):
    def __init__(self, matrix_data):
        self.train_data = matrix_data
        self.train_len = len(matrix_data)

    def __len__(self):
        return self.train_len

    def __getitem__(self, index):
        return self.train_data[index], self.train_data[index]
class AutoEncoder(nn.Module):
    def __init__(self, input_dim, embedding_dim):
        super(AutoEncoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, input_dim // 2),
            nn.Tanh(),
            nn.Linear(input_dim // 2, input_dim // 4),
            nn.Tanh(),
            nn.Linear(input_dim // 4, embedding_dim),
        )
        self.decoder = nn.Sequential(
            nn.Linear(embedding_dim, input_dim // 4),
            nn.Tanh(),
            nn.Linear(input_dim // 4, input_dim // 2),
            nn.Tanh(),
            nn.Linear(input_dim // 2, input_dim),
        )

    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return encoded, decoded
if __name__ == '__main__':
    embedding_dim = 100
    epochs = 10000
    lr = 0.001
    gamma = 0.95
    batch_size = 1000
    transformer_utils.set_logger(os.path.join(os.getcwd(), 'train.log'))
    data_frame = pd.read_csv(os.path.join(os.getcwd(), 'data', 'abs_sku_to_Car_classfication_onehot_detail.csv'),
                             header=None, names=['sku_code', 'car_model', 'car_id', 'cat_id'],
                             dtype={'sku_code': str, 'car_model': str, 'car_id': int, 'cat_id': int})
    sku_code_set = set(data_frame['sku_code'].drop_duplicates())
    sku2idx_dict = {}
    for i, sku_code in enumerate(sku_code_set):
        sku2idx_dict[sku_code] = i
    car_id_num = max(data_frame['car_id'])
    sku_code_num = len(sku_code_set)
    sku_code_car_matrix = np.zeros((sku_code_num, car_id_num), dtype='float32')
    np.save(os.path.join(os.getcwd(), 'data', 'sku2idx_dict'), sku2idx_dict)
    for i in trange(len(data_frame)):
        sku_code = data_frame.loc[i, 'sku_code']
        car_id = data_frame.loc[i, 'car_id']
        sku_code_idx = sku2idx_dict[sku_code]
        sku_code_car_matrix[sku_code_idx, car_id - 1] = 1
    train_set = EmbeddingTrainDataset(sku_code_car_matrix)
    # shuffle=False keeps batch order aligned with the rows of sku_code_car_matrix,
    # so embeddings can be written back below by batch offset
    train_loader = DataLoader(train_set, batch_size=batch_size, num_workers=0, shuffle=False)
    device = "cuda" if torch.cuda.is_available() else "cpu"
    autoencoder_model = AutoEncoder(car_id_num, embedding_dim).to(device)
    criterion = nn.MSELoss()
    optimizer = torch.optim.AdamW(autoencoder_model.parameters(), lr=lr)
    train_loss_summary = np.zeros(epochs)
    best_evaluate_loss = 100.0
    for epoch in trange(epochs):
        train_total_loss = 0
        sku_encoder_embedding = np.zeros((sku_code_num, embedding_dim), dtype='float32')
        train_loader_len = len(train_loader)
        for i, (x_input, x_label) in enumerate(train_loader):
            x_input = x_input.to(device)
            x_label = x_label.to(device)
            encoded, decoded = autoencoder_model(x_input)
            loss = criterion(decoded, x_label)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            train_total_loss += loss.item()
            sku_encoder_embedding[(i * batch_size) : (i * batch_size + x_input.shape[0])] = encoded.detach().to('cpu').numpy()
        train_avg_loss = train_total_loss / train_loader_len
        logger.info(f'epoch: {epoch + 1}, train_loss: {train_avg_loss}')
        if train_avg_loss < best_evaluate_loss:
            best_evaluate_loss = train_avg_loss
            np.save(os.path.join(os.getcwd(), 'data', 'sku2embedding'), sku_encoder_embedding)
            logger.info(f'best embedding at: {epoch + 1}')
        if epoch >= 10:  # skip the first few epochs so the large initial losses don't swamp the plotted curve
            train_loss_summary[epoch] = train_avg_loss
        if epoch % 10 == 1:
            # only one curve here, so the same array is passed for both series
            transformer_utils.plot_all_epoch(train_loss_summary, train_loss_summary, epoch, 'embedding_train_loss_summary.png')
    print('finish!')
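As suggested in (4) above, a minimal sketch of the Euclidean-distance sanity check, assuming sku2idx_dict.npy and sku2embedding.npy were produced by the script above (the sku codes below are placeholders):
import os
import numpy as np

sku2idx = np.load(os.path.join(os.getcwd(), 'data', 'sku2idx_dict.npy'), allow_pickle=True).item()
embeddings = np.load(os.path.join(os.getcwd(), 'data', 'sku2embedding.npy'))

def embedding_distance(sku_a, sku_b):
    a = embeddings[sku2idx[sku_a]]
    b = embeddings[sku2idx[sku_b]]
    return np.linalg.norm(a - b)

# skus fitting similar cars should be closer than unrelated ones
print(embedding_distance('133657', '133658'))  # placeholder sku codes
print(embedding_distance('133657', '999999'))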
III. Plugging the Embedding into the Transformer Forecast
1. Data-preprocessing code for the forecast (transformer_preprocess_data.py):
import os
import numpy as np
import pandas as pd
from tqdm import trange
# Convert raw records into a standard wide table: row 0 holds the series name
# 'store#sku', the remaining rows hold one week of sales each, one column per series
def normalize_data_format(data):
    data_sale_list_series = data['sale_info'].apply(lambda row: list(map(float, row.split("_"))))
    data_frame = pd.DataFrame(item for item in data_sale_list_series)
    data_frame = pd.concat((data['warehouse_sku'], data_frame), axis=1)
    data_frame = data_frame.transpose()
    return data_frame
# Clip abnormally large values at mean + 3 * std of each series' positive sales
def smooth_big_value(data_frame):
    columns_len = len(data_frame.columns)
    print(">>>>smooth_big_value")
    for i in trange(columns_len):
        values = data_frame.iloc[1:, i]
        value_mean = np.mean(values[values > 0])
        value_std = np.std(values[values > 0], ddof=1)
        value_std = value_std if value_std > 0 else 0  # also guards the NaN case (fewer than two positive values)
        values_new = np.round(np.where(values > value_mean + 3 * value_std, value_mean + 3 * value_std, values).astype(float))
        values_new = np.array(values_new, dtype=int).astype(str)  # np.int was removed in recent numpy
        data_frame.iloc[1:, i] = values_new
    return data_frame
# Build mappings between column index and series name ('store#sku', held in row 0)
def gen_col2series(data_frame):
    columns = data_frame.values[0, :]
    id2series_dict = {}
    series2id_dict = {}
    j = 0
    for i, column in enumerate(columns):
        id2series_dict[i] = column
        if series2id_dict.get(column) is None:
            series2id_dict[column] = j
            j += 1
    return id2series_dict, series2id_dict
# Max value of each series (cast to float: smooth_big_value stored the sales as strings,
# and a lexicographic string max would be wrong)
def gen_series2maxValue(data_frame):
    series_max_value = np.max(data_frame[1:].astype(float), axis=0)
    series2maxValue = series_max_value.to_dict()
    return series2maxValue
# Build sliding-window samples: each window is backcast_len weeks of input plus a
# one-step-shifted label, both normalized by the series max
def prep_data(data, series2maxValue):
    num_series = data.shape[1]
    time_len = data.shape[0]
    windows_per_series = np.full(num_series, time_len - backcast_len)
    total_windows = np.sum(windows_per_series)
    x_input = np.zeros((total_windows, backcast_len, 1 + 2), dtype='float32')  # sale_info + series_info + max_value
    label = np.zeros((total_windows, backcast_len), dtype='float32')
    print(">>>>prep_data")
    count = 0
    zero_count = 0
    for series_idx in trange(num_series):
        for i in range(windows_per_series[series_idx]):
            x_input_data = data[i : i + backcast_len, series_idx]
            x_input_series = series_idx
            label_data = data[i + 1 : i + backcast_len + 1, series_idx]
            if np.max(x_input_data) > 0:
                x_input[count, :, 0] = x_input_data
                x_input[count, :, 1] = x_input_series
                x_input[count, :, 2] = series2maxValue.get(series_idx)
                label[count] = label_data
                x_input[count, :, 0] = x_input[count, :, 0] / series2maxValue.get(series_idx)
                label[count] = label[count] / series2maxValue.get(series_idx)
                count += 1
            # keep a limited, randomly sampled number of all-zero windows
            elif np.max(label_data) == 0 and zero_count < 2000 and np.random.choice([0, 1], p=[0.6, 0.4]) > 0:
                x_input[count, :, 0] = x_input_data
                x_input[count, :, 1] = x_input_series
                x_input[count, :, 2] = 0
                label[count] = label_data
                zero_count += 1
                count += 1
    x_input = x_input[:count]
    label = label[:count]
    return x_input, label
# Shuffle and split into train and test sets
def split_train_test_data(x_input, label, train_ratio=0.8):
    x_len = x_input.shape[0]
    shuffle_idx = np.random.permutation(x_len)
    train_x_len = int(x_len * train_ratio)
    train_shuffle_idx = shuffle_idx[:train_x_len]
    test_shuffle_idx = shuffle_idx[train_x_len:]
    train_x_input = x_input[train_shuffle_idx]
    train_label = label[train_shuffle_idx]
    test_x_input = x_input[test_shuffle_idx]
    test_label = label[test_shuffle_idx]
    return train_x_input, train_label, test_x_input, test_label
if __name__ == '__main__':
    backcast_len = 12
    train_val_num = 110  # keep the name row plus the first 109 weeks
    data_frame = pd.read_csv(os.path.join(os.getcwd(), 'data', 'ads_hub_sale_num_detail_simple.csv'), header=None, names=['warehouse_sku', 'sale_info'])
    data_frame = normalize_data_format(data_frame)
    data_frame = data_frame[:train_val_num]
    data_frame = smooth_big_value(data_frame)
    id2series, series2id = gen_col2series(data_frame)
    series2maxValue = gen_series2maxValue(data_frame)
    x_input, label = prep_data(data_frame.values[1:].astype('float'), series2maxValue)
    train_x_input, train_label, test_x_input, test_label = split_train_test_data(x_input, label)
    np.save(os.path.join(os.getcwd(), 'data', 'train_data'), train_x_input)
    np.save(os.path.join(os.getcwd(), 'data', 'train_label'), train_label)
    np.save(os.path.join(os.getcwd(), 'data', 'test_data'), test_x_input)
    np.save(os.path.join(os.getcwd(), 'data', 'test_label'), test_label)
    np.save(os.path.join(os.getcwd(), 'data', 'series_max_value'), series2maxValue)
    np.save(os.path.join(os.getcwd(), 'data', 'series2id'), series2id)
    np.save(os.path.join(os.getcwd(), 'data', 'id2series'), id2series)
    print('finish!')
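To make prep_data's windowing concrete, a toy sketch: with backcast_len=12, a series of 15 weeks yields 3 windows, each normalized by the series max (toy numbers, same slicing as prep_data):
import numpy as np

backcast_len = 12
series = np.arange(1.0, 16.0)  # 15 weeks of toy sales
max_value = series.max()
for i in range(len(series) - backcast_len):  # 3 windows
    x_window = series[i : i + backcast_len] / max_value
    y_window = series[i + 1 : i + backcast_len + 1] / max_value  # label: shifted by one step
    print(i, x_window[:3], y_window[:3])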
2. DataLoader implementation (transformer_dataloader.py):
import logging
import os
import numpy as np
from torch.utils.data import Dataset
logger = logging.getLogger('Transformer.Data')
class TrainDataset(Dataset):
    def __init__(self, data_path):
        self.data = np.load(os.path.join(data_path, 'data', 'train_data.npy'))
        self.label = np.load(os.path.join(data_path, 'data', 'train_label.npy'))
        # the persisted dicts are pickled object arrays, so allow_pickle is required
        self.id2series_dict = np.load(os.path.join(data_path, 'data', 'id2series.npy'), allow_pickle=True).item()
        self.sku2idx_dict = np.load(os.path.join(data_path, 'data', 'sku2idx_dict.npy'), allow_pickle=True).item()
        self.sku2embedding = np.load(os.path.join(data_path, 'data', 'sku2embedding.npy'))
        self.sku_embedding_avg = self.sku2embedding.mean(axis=0)
        self.train_len = self.data.shape[0]
        logger.info(f'train_len:{self.train_len}')
        logger.info('building datasets from train_data.npy')

    def __len__(self):
        return self.train_len

    def __getitem__(self, index):
        series_idx = int(self.data[index, 0, -2])
        series = self.id2series_dict.get(series_idx)
        sku_code = series.split('#')[1]
        sku_idx = self.sku2idx_dict.get(sku_code)
        if sku_idx is None:
            # fall back to the average embedding for skus without a trained embedding
            sku_embedding = self.sku_embedding_avg
        else:
            sku_embedding = self.sku2embedding[sku_idx]
        return (self.data[index, :, :-2], series_idx, sku_embedding, self.label[index])
class TestDataset(Dataset):
    def __init__(self, data_path):
        self.data = np.load(os.path.join(data_path, 'data', 'test_data.npy'))
        self.label = np.load(os.path.join(data_path, 'data', 'test_label.npy'))
        self.id2series_dict = np.load(os.path.join(data_path, 'data', 'id2series.npy'), allow_pickle=True).item()
        self.sku2idx_dict = np.load(os.path.join(data_path, 'data', 'sku2idx_dict.npy'), allow_pickle=True).item()
        self.sku2embedding = np.load(os.path.join(data_path, 'data', 'sku2embedding.npy'))
        self.sku_embedding_avg = self.sku2embedding.mean(axis=0)
        self.test_len = self.data.shape[0]
        logger.info(f'test_len:{self.test_len}')
        logger.info('building datasets from test_data.npy')

    def __len__(self):
        return self.test_len

    def __getitem__(self, index):
        series_idx = int(self.data[index, 0, -2])
        series = self.id2series_dict.get(series_idx)
        sku_code = series.split('#')[1]
        sku_idx = self.sku2idx_dict.get(sku_code)
        if sku_idx is None:
            sku_embedding = self.sku_embedding_avg
        else:
            sku_embedding = self.sku2embedding[sku_idx]
        # the test set additionally returns the series max so losses can be de-normalized
        return (self.data[index, :, :-2], series_idx, sku_embedding, self.data[index, 0, -1], self.label[index])
(1) id2series maps a series index (a column of the sales matrix) to its 'store_code#sku_id' name.
(2) sku2idx maps a sku_id to its row index in the embedding matrix.
(3) sku2embedding maps that row index to the embedding vector.
(4) In the Dataset's __getitem__(), the sku_id is resolved first and the embedding is then looked up; skus without a trained embedding fall back to the average embedding (a sketch of this lookup chain follows).
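A minimal sketch of the lookup chain in __getitem__(), with toy values standing in for the persisted mappings:
import numpy as np

id2series = {7: '0053#031188'}  # series index -> 'store#sku'
sku2idx = {'031188': 3}         # sku_id -> embedding row index
sku2embedding = np.random.rand(10, 100).astype('float32')
embedding_avg = sku2embedding.mean(axis=0)  # fallback for unseen skus

series_idx = 7
sku_code = id2series[series_idx].split('#')[1]  # '031188'
sku_idx = sku2idx.get(sku_code)
sku_embedding = embedding_avg if sku_idx is None else sku2embedding[sku_idx]
print(sku_embedding.shape)  # (100,)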
3. How the embedding values are consumed
Implementation (transformer_train.py):
import os
import numpy as np
import torch
import torch.nn as nn
import math
import time
import transformer_utils
from transformer_dataloader import TrainDataset, TestDataset
from torch.utils.data import DataLoader
import logging
logger = logging.getLogger('Transformer.Train')
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super(PositionalEncoding, self).__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x, embedding):
        # positional encoding and sku embedding are both added via broadcasting
        return x + self.pe[:x.size(0), :] + embedding
class TransAm(nn.Module):
    def __init__(self, feature_size=100, num_layers=1, dropout=0.1):
        super(TransAm, self).__init__()
        self.model_type = 'Transformer'
        self.src_mask = None
        self.pos_encoder = PositionalEncoding(feature_size)
        self.encoder_layer = nn.TransformerEncoderLayer(d_model=feature_size, nhead=10, dropout=dropout)
        self.transformer_encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=num_layers)
        self.decoder = nn.Linear(feature_size, 1)
        self.init_weights()

    def init_weights(self):
        initrange = 0.1
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def _generate_square_subsequent_mask(self, sz):
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask

    def forward(self, src, pre_embedding):
        # regenerate the causal mask only when the sequence length changes
        if self.src_mask is None or self.src_mask.shape[0] != len(src):
            device = src.device
            mask = self._generate_square_subsequent_mask(len(src)).to(device)
            self.src_mask = mask
        src = self.pos_encoder(src, pre_embedding)
        output = self.transformer_encoder(src, self.src_mask)
        output = self.decoder(output)
        return output
def evaluate(model, test_loader):
    test_total_loss = 0
    test_total_with_max_loss = 0
    model.eval()
    test_loader_len = len(test_loader)
    with torch.no_grad():  # no gradients are needed during evaluation
        for i, (test_batch, idx, embedding_batch, max_value, labels) in enumerate(test_loader):
            test_batch = test_batch.permute(1, 0, 2).to(device)
            labels = labels.permute(1, 0).to(device)
            embedding_batch = torch.unsqueeze(embedding_batch, dim=0).to(device)
            test_output = model(test_batch, embedding_batch)
            test_output = torch.squeeze(test_output)
            test_output[test_output < 0] = 0  # sales cannot be negative
            test_labels = labels[-1]
            test_output = test_output[-1]
            test_loss = criterion(test_output, test_labels)
            test_total_loss += test_loss.item()
            # de-normalize with the series max to also report the loss on the original scale
            max_value = max_value.to(device)
            test_with_max_labels = test_labels * max_value
            test_with_max_output = test_output * max_value
            test_with_max_loss = criterion(test_with_max_output, test_with_max_labels)
            test_total_with_max_loss += test_with_max_loss.item()
    test_avg_loss = test_total_loss / test_loader_len
    test_with_max_avg_loss = test_total_with_max_loss / test_loader_len
    return test_avg_loss, test_with_max_avg_loss
if __name__ == '__main__':
    transformer_utils.set_logger(os.path.join(os.getcwd(), 'train.log'))
    json_path = os.path.join(os.getcwd(), 'params.json')
    params = transformer_utils.Params(json_path)
    lr = params.lr
    epochs = params.epochs
    feature_size = params.feature_size
    gamma = params.gamma
    device = torch.device(params.mode)
    input_window = params.input_window
    train_set = TrainDataset(os.getcwd())
    test_set = TestDataset(os.getcwd())
    train_loader = DataLoader(train_set, batch_size=params.train_batch_size, num_workers=0)
    test_loader = DataLoader(test_set, batch_size=params.test_batch_size, num_workers=0)
    transformer_model = TransAm(feature_size=feature_size).to(device)
    criterion = nn.MSELoss()
    optimizer = torch.optim.AdamW(transformer_model.parameters(), lr=lr)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=gamma)
    train_loss_summary = np.zeros(epochs)
    test_loss_summary = np.zeros(epochs)
    best_evaluate_loss = 100.0
    for epoch in range(1, epochs + 1):
        epoch_start_time = time.time()
        train_loader_len = len(train_loader)
        train_total_loss = 0
        transformer_model.train()
        for i, (train_batch, idx, embedding_batch, label_batch) in enumerate(train_loader):
            optimizer.zero_grad()
            train_batch = train_batch.permute(1, 0, 2).to(device)
            label_batch = label_batch.permute(1, 0).to(device)
            embedding_batch = torch.unsqueeze(embedding_batch, dim=0).to(device)
            output = transformer_model(train_batch, embedding_batch)
            output = torch.squeeze(output)
            loss = criterion(output, label_batch)
            loss.backward()
            optimizer.step()
            train_total_loss += loss.item()
        scheduler.step()  # decay the learning rate once per epoch
        train_avg_loss = train_total_loss / train_loader_len
        test_avg_loss, test_with_max_avg_loss = evaluate(transformer_model, test_loader)
        logger.info(f'epoch: {epoch}, train_loss: {train_avg_loss}, test_loss: {test_avg_loss}, test_max_loss: {test_with_max_avg_loss}')
        is_best = False
        if test_avg_loss < best_evaluate_loss:
            is_best = True
            best_evaluate_loss = test_avg_loss
        transformer_utils.save_checkpoint({'epoch': epoch,
                                           'state_dict': transformer_model.state_dict(),
                                           'optim_dict': optimizer.state_dict()},
                                          is_best,
                                          epoch=epoch)
        train_loss_summary[epoch - 1] = train_avg_loss  # epoch is 1-based, the arrays are 0-based
        test_loss_summary[epoch - 1] = test_avg_loss
        if epoch % 20 == 1:
            transformer_utils.plot_all_epoch(train_loss_summary, test_loss_summary, epoch, 'train_test_loss_summary.png')
    print('finish!')
(1) The embedding_batch read from train_loader has shape [1200, 100]: 1200 is the batch_size and 100 is the embedding_size.
(2) torch.unsqueeze(embedding_batch, dim=0) turns the shape into [1, 1200, 100], i.e. (sequence_length, batch_size, embedding_size), matching the input layout the transformer expects.
(3) In PositionalEncoding the final output is x + self.pe[:x.size(0), :] + embedding, where:
x has shape [12, 1200, 1]: 12 weeks of sales history with a feature size of 1 (just the sales value).
self.pe[:x.size(0), :] has shape [12, 1, 100]; the positional encoding is shared by every sample in the batch, so broadcasting expands the sum to shape [12, 1200, 100].
embedding has shape [1, 1200, 100]; a sku's embedding is an intrinsic attribute that does not change over time (seq_length), so broadcasting against the previous result keeps the final shape at [12, 1200, 100].
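The shape arithmetic in (3) can be verified directly with a standalone sketch using random tensors:
import torch

x = torch.rand(12, 1200, 1)     # (seq_len, batch, 1): the sales value only
pe = torch.rand(12, 1, 100)     # positional encoding, shared across the batch
emb = torch.rand(1, 1200, 100)  # sku embedding, constant over time steps
out = x + pe + emb              # broadcasting expands every operand
print(out.shape)                # torch.Size([12, 1200, 100])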
4. Utility code (transformer_utils.py):
import logging
import os
import torch
import json
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
logger = logging.getLogger('Transformer.Utils')
class Params:
    '''
    Class that loads hyperparameters from a json file.
    Example:
        params = Params(json_path)
        print(params.learning_rate)
    '''
    def __init__(self, json_path):
        with open(json_path) as f:
            params = json.load(f)
            self.__dict__.update(params)
def set_logger(log_path):
    '''Set the logger to log info in terminal and file `log_path`.

    In general, it is useful to have a logger so that every output to the terminal is saved
    in a permanent file. Here we save it to `model_dir/train.log`.
    Example:
        logging.info('Starting training...')
    Args:
        log_path: (string) where to log
    '''
    _logger = logging.getLogger('Transformer')
    _logger.setLevel(logging.INFO)
    fmt = logging.Formatter('[%(asctime)s] %(name)s: %(message)s', '%H:%M:%S')

    class TqdmHandler(logging.StreamHandler):
        def __init__(self, formatter):
            logging.StreamHandler.__init__(self)
            self.setFormatter(formatter)

        def emit(self, record):
            msg = self.format(record)
            tqdm.write(msg)

    file_handler = logging.FileHandler(log_path)
    file_handler.setFormatter(fmt)
    _logger.addHandler(file_handler)
    _logger.addHandler(TqdmHandler(fmt))
def save_checkpoint(state, is_best, epoch, save_checkpoint=False, ins_name=-1):
    '''Saves model and training parameters under 'transformer-training-checkpoint'. If is_best==True,
    also saves 'base_model/best.pth.tar'.
    Args:
        state: (dict) contains model's state_dict, may contain other keys such as epoch, optimizer state_dict
        is_best: (bool) True if it is the best model seen till now
        epoch: (int) current epoch, used in the checkpoint file name
        save_checkpoint: (bool) whether to save a per-epoch checkpoint in addition to the best one
        ins_name: (int) instance index
    '''
    if save_checkpoint:
        if ins_name == -1:
            filepath = os.path.join('transformer-training-checkpoint', f'epoch_{epoch}.pth.tar')
        else:
            filepath = os.path.join('transformer-training-checkpoint', f'epoch_{epoch}_ins_{ins_name}.pth.tar')
        if not os.path.exists('transformer-training-checkpoint'):
            logger.info('Checkpoint Directory does not exist! Making directory transformer-training-checkpoint')
            os.mkdir('transformer-training-checkpoint')
        torch.save(state, filepath)
        logger.info(f'Checkpoint saved to {filepath}')
    if is_best:
        torch.save(state, os.path.join(os.getcwd(), 'base_model', 'best.pth.tar'))
        logger.info('Best checkpoint saved to best.pth.tar')
def plot_all_epoch(train_loss_summary, test_loss_summary, num_samples, png_name):
    x = np.arange(start=1, stop=num_samples + 1)
    f = plt.figure()
    plt.plot(x, train_loss_summary[:num_samples], label='train_loss', linestyle='--')
    plt.plot(x, test_loss_summary[:num_samples], label='test_loss', linestyle='-')
    plt.legend()  # without this call the labels above are never rendered
    f.savefig(os.path.join('base_model', png_name))
    plt.close()
5. Configuration file (params.json):
{
"train_batch_size": 1200,
"test_batch_size":100,
"lr": 0.005,
"epochs": 1000,
"feature_size": 100,
"gamma": 0.95,
"input_window": 12,
"mode": "cuda"
}
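For reference, a short usage sketch of how the Params class above consumes this file; note that nhead=10 in TransAm requires feature_size to be divisible by 10, which holds for the value 100 configured here:
import transformer_utils

params = transformer_utils.Params('params.json')
print(params.lr, params.feature_size)  # 0.005 100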