NFM 与前面的模型在结构上大同小异。FM 归根结底还是一个二阶特征交叉的模型;NFM 在 Embedding 层之后添加了特征交叉池化层(Bi-Interaction Pooling),用于对 Embedding 向量两两计算元素积,并对所有交叉特征向量求和,得到池化层的输出向量(这里的描述可能不够清楚,可以参考下面论文给出的计算过程)。再把该向量输入上层的多层全连接神经网络,进行进一步的交叉。对比前面的 DeepFM 模型,NFM 将二阶特征交叉部分由并行结构改为了串联结构:NFM 中 DNN 在二阶交叉特征的基础上做进一步交叉,而 DeepFM 是"二阶交叉特征 + 一阶特征 DNN"的并行结构,两者本质上都是在改变特征交叉的方式。这说明特征交叉在推荐算法中确实很重要,它能让模型具备更强的非线性表达能力。
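其中 $x_i v_i$ 是 Embedding 层输出的向量(特征值 $x_i$ 与其对应嵌入向量 $v_i$ 的乘积),特征交叉池化层的计算公式为 $f_{BI}(V_x)=\sum_{i=1}^{n}\sum_{j=i+1}^{n} x_i v_i \odot x_j v_j=\frac{1}{2}\left[\left(\sum_{i=1}^{n} x_i v_i\right)^2-\sum_{i=1}^{n}\left(x_i v_i\right)^2\right]$。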
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import pandas as pd
class DNN(nn.Module):
    """Stack of fully connected layers, each followed by a ReLU activation.

    Args:
        hidden_units: Sequence of layer widths. Each consecutive pair
            ``(hidden_units[i], hidden_units[i + 1])`` becomes the
            ``(in_features, out_features)`` of one ``nn.Linear``, so the
            first entry is the input width and the last is the output width.
    """

    def __init__(self, hidden_units):
        super().__init__()
        linears = []
        # zip stops at the shorter sequence, pairing every width with its successor.
        for fan_in, fan_out in zip(hidden_units, hidden_units[1:]):
            linears.append(nn.Linear(fan_in, fan_out, bias=True))
        self.layers = nn.ModuleList(linears)

    def forward(self, x):
        """Run ``x`` through every linear layer, applying ReLU after each one."""
        hidden = x
        for linear in self.layers:
            hidden = torch.relu(linear(hidden))
        return hidden
class NFM(nn.Module):
    """Neural Factorization Machine: embeddings -> pairwise-interaction pooling -> DNN.

    The embeddings of all sparse features are combined by the pairwise
    element-product pooling ``0.5 * ((sum v)^2 - sum(v^2))``, concatenated
    with the dense features, passed through ``DNN`` and a final bias-free
    linear layer, and squashed with a sigmoid.

    Args:
        features_info: Three-item sequence
            ``(dense_features, sparse_features, sparse_features_nunique)``;
            the first two are lists of feature names, the third maps each
            sparse feature name to the size of its embedding table.
        hidden_units: Widths of the DNN layers that follow the pooled input.
            The input width is prepended internally; the caller's list is
            left untouched.
        embedding_dim: Size of every sparse-feature embedding vector.
    """

    def __init__(self, features_info, hidden_units, embedding_dim):
        super().__init__()
        # Unpack the feature description.
        self.dense_features, self.sparse_features, self.sparse_features_nunique = features_info
        # Number of dense and sparse features.
        self.__dense_features_nums = len(self.dense_features)
        self.__sparse_features_nums = len(self.sparse_features)
        # One embedding table per sparse feature.
        self.embeddings = nn.ModuleDict({
            "embed_" + key: nn.Embedding(num_embeds, embedding_dim)
            for key, num_embeds in self.sparse_features_nunique.items()
        })
        # The DNN input is the pooled embedding concatenated with the dense features.
        stack_dim = self.__dense_features_nums + embedding_dim
        # Build a new list instead of hidden_units.insert(0, ...): inserting
        # in place would modify the caller's list, corrupting it for reuse.
        dnn_units = [stack_dim, *hidden_units]
        self.dnn = DNN(dnn_units)
        self.dnn_last_linear = nn.Linear(dnn_units[-1], 1, bias=False)

    def forward(self, x):
        """Return the sigmoid output, one column per input row.

        Args:
            x: 2-D tensor whose first ``len(dense_features)`` columns are the
                dense features and whose remaining columns are the sparse
                features, in the order of ``sparse_features``. Sparse columns
                are cast to ``long`` and used as embedding indices.
                NOTE(review): assumes those indices lie in
                ``[0, sparse_features_nunique[name])`` - confirm against the
                data pipeline.
        """
        # Split the input into its dense and sparse column groups.
        n_dense = self.__dense_features_nums
        dense_inputs = x[:, :n_dense]
        sparse_inputs = x[:, n_dense:].long()

        # Stack per-feature embeddings along dim 1: (batch, n_sparse, embedding_dim).
        embedding_feas = torch.stack(
            [
                self.embeddings["embed_" + key](sparse_inputs[:, idx])
                for idx, key in enumerate(self.sparse_features)
            ],
            dim=1,
        )

        # Pairwise-interaction pooling: the sum over all pairs of element-wise
        # products equals 0.5 * ((sum of vectors)^2 - sum of squared vectors).
        square_of_sum = torch.sum(embedding_feas, dim=1).pow(2)
        sum_of_square = torch.sum(embedding_feas.pow(2), dim=1)
        pooled = 0.5 * (square_of_sum - sum_of_square)

        input_feas = torch.cat([pooled, dense_inputs], dim=-1)
        # torch.sigmoid replaces the deprecated F.sigmoid alias.
        return torch.sigmoid(self.dnn_last_linear(self.dnn(input_feas)))
def getCriteo(data_path='./criteo/train.csv'):
    """Load the Criteo CSV and prepare it for model training.

    Steps: read the CSV, drop the ``Id`` column, fill missing values
    (``'-1'`` for the categorical columns ``C1..C26``, ``0`` for the numeric
    columns ``I1..I13``), label-encode every categorical column, min-max
    scale the numeric columns, and split off the ``Label`` column.

    Args:
        data_path: Path of the comma-separated training file.

    Returns:
        A tuple ``(df_data, label, features_info)`` where ``df_data`` is the
        processed feature frame, ``label`` is the popped ``Label`` series and
        ``features_info`` is
        ``[dense_features, sparse_features, sparse_features_nunique]``.

    NOTE(review): the frame keeps the CSV's column order; NFM.forward slices
    the dense columns first, so this presumably relies on the file listing
    I1..I13 before C1..C26 - confirm against the data file.
    """
    frame = pd.read_csv(data_path, sep=',')
    frame.drop(columns=['Id'], inplace=True)

    dense_features = ['I%d' % n for n in range(1, 14)]
    sparse_features = ['C%d' % n for n in range(1, 27)]

    # Fill missing values.
    frame[sparse_features] = frame[sparse_features].fillna('-1')
    frame[dense_features] = frame[dense_features].fillna(0)

    # Label-encode each categorical column with its own encoder.
    for column in sparse_features:
        encoder = LabelEncoder()
        frame[column] = encoder.fit_transform(frame[column])

    # Min-max scale the numeric columns.
    scaler = MinMaxScaler()
    frame[dense_features] = scaler.fit_transform(frame[dense_features])

    label = frame.pop('Label')

    # Number of distinct values per categorical column.
    sparse_features_nunique = {
        column: frame[column].nunique() for column in sparse_features
    }

    features_info = [dense_features, sparse_features, sparse_features_nunique]
    return frame, label, features_info
class TrainTask:
    """Training / validation loop for a binary classifier optimised with BCE loss.

    The model is expected to output probabilities (``nn.BCELoss`` is applied
    directly to its output). Per-epoch mean batch losses are collected in
    ``train_loss`` and ``eval_loss``.

    Args:
        model: The ``nn.Module`` to train.
        lr: Learning rate for the Adam optimizer.
        use_cuda: Use the GPU when set and CUDA is available; otherwise CPU.
    """

    def __init__(self, model, lr=0.001, use_cuda=False):
        self.__device = torch.device("cuda" if use_cuda and torch.cuda.is_available() else "cpu")
        self.__model = model.to(self.__device)
        self.__loss_fn = nn.BCELoss().to(self.__device)
        self.__optimizer = torch.optim.Adam(self.__model.parameters(), lr=lr)

        # Mean loss per epoch.
        self.train_loss = []
        self.eval_loss = []

        # Reserved for per-epoch metrics; nothing in this class fills them.
        self.train_metric = []
        self.eval_metric = []

    def __train_one_batch(self, feas, labels):
        """Run one optimisation step; return ``(loss value, model outputs)``."""
        self.__optimizer.zero_grad()
        # 1. Forward pass.
        outputs = self.__model(feas)
        # 2. Loss. view(-1) keeps a 1-D tensor even for a single-sample batch;
        #    squeeze() would yield a 0-d tensor there, which BCELoss rejects
        #    because it no longer matches the (1,)-shaped labels.
        loss = self.__loss_fn(outputs.view(-1), labels)
        # 3. Backward pass and parameter update.
        loss.backward()
        self.__optimizer.step()
        return loss.item(), outputs

    def __train_one_epoch(self, train_dataloader, epoch_id):
        """Train for one epoch and record the mean batch loss."""
        self.__model.train()
        loss_sum = 0.0
        num_batches = 0
        for feas, labels in train_dataloader:
            feas, labels = feas.to(self.__device), labels.to(self.__device)
            loss, _ = self.__train_one_batch(feas, labels)
            loss_sum += loss
            num_batches += 1
        # max(..., 1) avoids dividing by zero when the loader is empty.
        mean_loss = loss_sum / max(num_batches, 1)
        self.train_loss.append(mean_loss)
        print("Training Epoch: %d, mean loss: %.5f" % (epoch_id, mean_loss))

    def train(self, train_dataset, eval_dataset, epochs, batch_size):
        """Train for ``epochs`` epochs, validating after each one.

        Args:
            train_dataset: Dataset yielding ``(features, label)`` pairs.
            eval_dataset: Dataset yielding ``(features, label)`` pairs.
            epochs: Number of passes over ``train_dataset``.
            batch_size: Batch size for both data loaders.
        """
        # Build the data loaders; validation needs no shuffling, and a fixed
        # order keeps the reported validation loss deterministic.
        train_data_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
        eval_data_loader = DataLoader(dataset=eval_dataset, batch_size=batch_size, shuffle=False)
        for epoch in range(epochs):
            print('-' * 20 + ' Epoch {} starts '.format(epoch) + '-' * 20)
            # Train one epoch.
            self.__train_one_epoch(train_data_loader, epoch_id=epoch)
            # Validate once.
            self.__eval(eval_data_loader, epoch_id=epoch)

    def __eval(self, eval_dataloader, epoch_id):
        """Run inference over the validation set and record the mean batch loss."""
        self.__model.eval()
        loss_sum = 0.0
        num_batches = 0
        with torch.no_grad():
            for feas, labels in eval_dataloader:
                feas, labels = feas.to(self.__device), labels.to(self.__device)
                # 1. Forward pass.
                outputs = self.__model(feas)
                # 2. Loss.
                loss = self.__loss_fn(outputs.view(-1), labels)
                loss_sum += loss.item()
                num_batches += 1
        mean_loss = loss_sum / max(num_batches, 1)
        self.eval_loss.append(mean_loss)
        print("Evaluate Epoch: %d, mean loss: %.5f" % (epoch_id, mean_loss))

    def __plot_metric(self, train_metrics, val_metrics, metric_name):
        """Plot a training curve and a validation curve against the epoch number."""
        epochs = range(1, len(train_metrics) + 1)
        plt.plot(epochs, train_metrics, 'bo--')
        plt.plot(epochs, val_metrics, 'ro-')
        plt.title('Training and validation ' + metric_name)
        plt.xlabel("Epochs")
        plt.ylabel(metric_name)
        plt.legend(["train_" + metric_name, 'val_' + metric_name])
        plt.show()

    def plot_loss_curve(self):
        """Plot the recorded training and validation loss curves."""
        self.__plot_metric(self.train_loss, self.eval_loss, "Loss")
if __name__ == "__main__":
    # Load and preprocess the data.
    df_data, label, features_info = getCriteo()

    # Hold out 20% of the rows for validation, with a fixed seed.
    x_train, x_val, y_train, y_val = train_test_split(
        df_data, label, test_size=0.2, random_state=2022
    )

    # Wrap features and labels as float32 tensors.
    train_dataset = TensorDataset(
        torch.tensor(x_train.values, dtype=torch.float32),
        torch.tensor(y_train.values, dtype=torch.float32),
    )
    val_dataset = TensorDataset(
        torch.tensor(x_val.values, dtype=torch.float32),
        torch.tensor(y_val.values, dtype=torch.float32),
    )

    # Build the model.
    model = NFM(features_info, hidden_units=[64, 32], embedding_dim=8)

    # Train for 20 epochs with batch size 16 on CPU, then plot the loss curves.
    task = TrainTask(model, use_cuda=False)
    task.train(train_dataset, val_dataset, epochs=20, batch_size=16)
    task.plot_loss_curve()