【1】Base Models
1、AFM
(1) This paper adds an attention layer on top of FM. FM models first-order and second-order feature interactions; its prediction is:
$\hat{y}_{FM}(x) = w_0 + \sum_{i=1}^{n} w_i x_i + \sum_{i=1}^{n}\sum_{j=i+1}^{n} \langle v_i, v_j \rangle x_i x_j$
Alternatively, the second-order term can be rewritten (reducing the cost to O(kn)) as:
$\sum_{i=1}^{n}\sum_{j=i+1}^{n} \langle v_i, v_j \rangle x_i x_j = \frac{1}{2}\sum_{f=1}^{k}\Big[\big(\sum_{i=1}^{n} v_{i,f} x_i\big)^{2} - \sum_{i=1}^{n} v_{i,f}^{2} x_i^{2}\Big]$
(2) Building on this, we take the second-order feature interaction part of FM:
$\sum_{i=1}^{n}\sum_{j=i+1}^{n} \langle v_i, v_j \rangle x_i x_j$
(3) The overall AFM architecture is shown in the figure of the original paper; the final prediction re-weights each pairwise element-wise product with a learned attention score:
$\hat{y}_{AFM}(x) = w_0 + \sum_{i=1}^{n} w_i x_i + p^{\top} \sum_{i=1}^{n}\sum_{j=i+1}^{n} a_{ij}\,(v_i \odot v_j)\, x_i x_j$, where $a_{ij} = \mathrm{softmax}\big(h^{\top}\mathrm{ReLU}(W\,(v_i \odot v_j)\,x_i x_j + b)\big)$.
(4) The code is as follows:
# the snippets below assume: import torch; from torch import nn; from itertools import combinations
def feature_interaction(feature_emb):
    # excerpted: the index buffers below are built once in __init__
    p, q = zip(*list(combinations(range(num_fields), 2)))  # all distinct field index pairs
    self.field_p = nn.Parameter(torch.LongTensor(p), requires_grad=False)
    self.field_q = nn.Parameter(torch.LongTensor(q), requires_grad=False)
    emb1 = torch.index_select(feature_emb, 1, self.field_p)
    emb2 = torch.index_select(feature_emb, 1, self.field_q)
    return emb1 * emb2  # element-wise products of all field pairs, [b,(f*f-f)/2,emb]

# attention layer
self.attention = nn.Sequential(nn.Linear(embedding_dim, attention_dim),
                               nn.ReLU(),
                               nn.Linear(attention_dim, 1, bias=False),
                               nn.Softmax(dim=1))

# forward pass
elementwise_product = self.product_layer(feature_emb)                      # [b,(f*f-f)/2,emb]
attention_weight = self.attention(elementwise_product)                     # [b,(f*f-f)/2,1]
attention_sum = torch.sum(attention_weight * elementwise_product, dim=1)   # [b,emb]
afm_out = self.weight_line(attention_sum)                                  # linear layer, [b,1]
y_pred = self.lr_layer(x) + afm_out                                        # add the first-order (LR) part
2、DCN
(1) Model architecture: see the figure in the original paper (a cross network branch in parallel with a DNN branch).
(2) The paper's main contribution is the cross network, i.e. the left branch of the architecture. Each cross layer computes:
$x_{l+1} = x_0\, x_l^{\top} w_l + b_l + x_l$
where $x_0$ is the layer-0 input and $x_l$ is the output of layer $l$.
(3) The main code is as follows:
class CrossInteractionLayer(nn.Module):
    def __init__(self, input_dim, hidden_dim=None, cross_type="weight_cross"):
        super(CrossInteractionLayer, self).__init__()
        if cross_type == "weight_cross":
            self.weight = nn.Linear(input_dim, 1, bias=False)
        elif cross_type == "attention_cross":
            if hidden_dim is None:
                hidden_dim = 4 * input_dim
            self.weight = nn.Sequential(nn.Linear(input_dim, hidden_dim),
                                        nn.ReLU(),
                                        nn.Linear(hidden_dim, 1, bias=False),
                                        nn.Softmax(dim=1))
        self.bias = nn.Parameter(torch.zeros(input_dim))

    def forward(self, X_0, X_i):  # [b,f*emb]
        interaction_out = self.weight(X_i) * X_0 + self.bias  # x_0 * (x_i^T w) + b, [b,f*emb]
        return interaction_out

# DCN
class CrossNet(nn.Module):
    """
    cross_type supports two options: [weight_cross, attention_cross]
    """
    def __init__(self, input_dim, num_layers, hidden_dim=None, cross_type="weight_cross"):
        super(CrossNet, self).__init__()
        self.num_layers = num_layers
        self.cross_net = nn.ModuleList(CrossInteractionLayer(input_dim, hidden_dim, cross_type)
                                       for _ in range(self.num_layers))

    def forward(self, X_0):  # [b,f*emb]
        X_i = X_0
        for i in range(self.num_layers):
            X_i = X_i + self.cross_net[i](X_0, X_i)  # residual connection, [b,f*emb]
        return X_i  # [b,f*emb]

# main
flat_feature_emb = feature_emb.flatten(start_dim=1)   # [b,f*emb]
cross_out = self.crossnet(flat_feature_emb)           # cross branch, [b,f*emb]
dnn_out = self.dnn(flat_feature_emb)                  # dnn branch, [b,dnn_emb]
final_out = torch.cat([cross_out, dnn_out], dim=-1)   # [b, f*emb+dnn_emb]
3、DeepFM
(1) The model is essentially a combination of FM and a deep network (FM + Deep); the code is as follows:
class LR_Layer(nn.Module):
    def __init__(self, feature_map, output_activation=None, use_bias=True):
        super(LR_Layer, self).__init__()
        self.bias = nn.Parameter(torch.zeros(1), requires_grad=True) if use_bias else None
        self.output_activation = output_activation
        # A trick for quick one-hot encoding in LR
        self.embedding_layer = EmbeddingLayer(feature_map, 1, use_pretrain=False)

    def forward(self, X):  # [b,f]
        embed_weights = self.embedding_layer(X)  # [b,f,1]
        output = embed_weights.sum(dim=1)  # [b,1]
        if self.bias is not None:
            output += self.bias
        if self.output_activation is not None:
            output = self.output_activation(output)
        return output

class FM_Layer(nn.Module):
    def __init__(self, feature_map, output_activation=None, use_bias=True):
        super(FM_Layer, self).__init__()
        self.inner_product_layer = InnerProductLayer(feature_map.num_fields,
                                                     output="product_sum_pooling")
        self.lr_layer = LR_Layer(feature_map, output_activation=None, use_bias=use_bias)
        self.output_activation = output_activation

    def forward(self, X, feature_emb):
        lr_out = self.lr_layer(X)                        # first-order (linear) part, [b,1]
        dot_sum = self.inner_product_layer(feature_emb)  # second-order interaction part, [b,1]
        output = dot_sum + lr_out                        # [b,1]
        if self.output_activation is not None:
            output = self.output_activation(output)
        return output

# main
y_pred = self.fm_layer(X, feature_emb)
y_pred += self.dnn(feature_emb.flatten(start_dim=1))
4、PNN(IPNN)
(1) IPNN is the inner-product variant of PNN: each pair of field embeddings is crossed by taking their inner product, producing one scalar per field pair, which is concatenated with the flattened embeddings before the DNN.
(2) The code is as follows:
def inner_product_layer(feature_emb):
    # excerpted: the mask below is built once in __init__
    self.interaction_units = int(num_fields * (num_fields - 1) / 2)  # number of distinct field pairs
    self.upper_triange_mask = nn.Parameter(
        torch.triu(torch.ones(num_fields, num_fields), 1).type(torch.bool),  # upper triangle above the diagonal set to True
        requires_grad=False)
    inner_product_matrix = torch.bmm(feature_emb, feature_emb.transpose(1, 2))
    # select the entries where self.upper_triange_mask is True, giving b*(f*f-f)/2 values
    flat_upper_triange = torch.masked_select(inner_product_matrix, self.upper_triange_mask)
    # reshape [b*(f*f-f)/2] into [b,(f*f-f)/2]
    return flat_upper_triange.view(-1, self.interaction_units)

# main
inner_product_vec = self.inner_product_layer(feature_emb)
dense_input = torch.cat([feature_emb.flatten(start_dim=1), inner_product_vec], dim=1)
y_pred = self.dnn(dense_input)
5、NFM
(1) Difference from FM: FM crosses embeddings pairwise with an inner product, which directly produces a single number for each pair. NFM instead takes the element-wise product of each pair, obtaining a vector of the same embedding dimension (Bi-Interaction pooling sums these pairwise vectors), and then feeds the result into a DNN.
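To make the difference concrete, here is a minimal sketch of NFM's Bi-Interaction pooling in the same PyTorch snippet style as above (the function name and the [b, f, emb] tensor layout are assumptions, not code from a specific repo). It uses the same sum-of-squares identity as FM's second-order term, but keeps the embedding dimension instead of summing it away:

import torch

def bi_interaction_pooling(feature_emb):  # assumed input layout: [b, f, emb]
    # sum of element-wise products over all field pairs, computed with the
    # O(f*emb) identity: 0.5 * ((sum_i v_i)^2 - sum_i v_i^2)
    sum_of_emb = feature_emb.sum(dim=1)                      # [b, emb]
    square_of_sum = sum_of_emb * sum_of_emb                  # [b, emb]
    sum_of_square = (feature_emb * feature_emb).sum(dim=1)   # [b, emb]
    return 0.5 * (square_of_sum - sum_of_square)             # [b, emb], then fed into the DNN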
6、Summary
【1】matmul (matrix multiplication): for an m×n matrix A and an n×k matrix B, the product is an m×k matrix C, written C = AB, where each element C[i][j] is the sum of the products of row i of A with column j of B. tf.matmul multiplies matrix a by matrix b to produce ab; this is ordinary matrix multiplication from linear algebra, and the @ operator can usually be used instead:
b = tf.constant([7, 8, 9, 10, 11, 12], shape=[3, 2])
a = tf.constant([1, 2, 3, 4, 5, 6], shape=[2, 3])
c = tf.matmul(a, b)
【2】Hadamard product: A and B have the same shape m×n (tf.multiply also allows broadcasting, e.g. a dimension of size 1 in one operand); for A = [x1, x2, x3] and B = [y1, y2, y3] the result is [x1*y1, x2*y2, x3*y3], i.e. elements in the same position are multiplied. tf.multiply multiplies the corresponding elements of the two tensors, an element-wise operation in which each element of x is combined with the matching element of y; the * operator can usually be used instead.
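For symmetry with the tf.matmul example above, a minimal illustration of tf.multiply (the values are arbitrary):

import tensorflow as tf

a = tf.constant([1, 2, 3, 4, 5, 6], shape=[2, 3])
b = tf.constant([7, 8, 9, 10, 11, 12], shape=[2, 3])
c = tf.multiply(a, b)  # element-wise, same as a * b -> [[7, 16, 27], [40, 55, 72]]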
【3】Inner product (the operation behind IPNN): the inner product (dot product) of two vectors multiplies corresponding positions and sums the results; for A = [x1, x2, x3] and B = [y1, y2, y3] it gives x1*y1 + x2*y2 + x3*y3, so the result of the inner product of two vectors is a scalar. It can be written as $\langle A, B \rangle = \sum_{i=1}^{n} x_i y_i$.
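A minimal illustration in the same TensorFlow style, computing the inner product as an element-wise product followed by a sum:

import tensorflow as tf

a = tf.constant([1.0, 2.0, 3.0])
b = tf.constant([4.0, 5.0, 6.0])
c = tf.reduce_sum(tf.multiply(a, b))  # 1*4 + 2*5 + 3*6 = 32.0, a scalar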
【4】Summary: matmul is ordinary matrix multiplication; the Hadamard product multiplies two same-shaped tensors element by element; the inner product is the element-wise product followed by a sum and yields a scalar.
【2】Gating Mechanisms
Gating mechanisms are widely used in recommender systems, for example in multi-task models such as MMOE and PLE. A gate works like a valve that controls how much information flows in or out. Gates are also used in CTR ranking models, mainly for three purposes:
(1) Learning feature importance: feature interactions matter a great deal in ranking models, and selecting useful features to cross improves interaction efficiency. A gate can dynamically learn each feature's importance, weakening unimportant features and strengthening important ones, which improves both interaction efficiency and model quality;
(2) Perceiving context: in NLP the same word means different things in different contexts (e.g. "apple" in "I like Apple phones" vs. "I like apples and bananas"), and perceiving the context yields a more precise representation. Likewise, a gate can take all features as input, dynamically generate a context-aware mask, and fuse the context into the features via a Hadamard product, improving feature representation and interaction;
(3) Learning different feature distributions: with the same input, a gating network can control what flows out and feed different inputs to different parallel sub-networks, so that each sub-network learns a different feature distribution.
1、FiBiNET
(1) The SENET layer learns a weight for each feature field; the weights are then multiplied with the field embeddings before the subsequent interaction computation. This acts as a gate that weakens unimportant features and strengthens important ones, improving model quality.
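A minimal PyTorch sketch of such a SENET-style gate, in the same snippet style as the code above (the class name, the mean-pooling squeeze, and the reduction_ratio default are assumptions based on the paper's squeeze/excitation/re-weight structure, not FiBiNET's official code):

import torch
from torch import nn

class SENETLayer(nn.Module):
    def __init__(self, num_fields, reduction_ratio=3):
        super(SENETLayer, self).__init__()
        reduced_dim = max(1, num_fields // reduction_ratio)
        # excitation: two FC layers over the per-field statistics
        self.excitation = nn.Sequential(nn.Linear(num_fields, reduced_dim, bias=False),
                                        nn.ReLU(),
                                        nn.Linear(reduced_dim, num_fields, bias=False),
                                        nn.ReLU())

    def forward(self, feature_emb):            # [b, f, emb]
        z = feature_emb.mean(dim=-1)           # squeeze: one statistic per field, [b, f]
        a = self.excitation(z)                 # excitation: per-field weights, [b, f]
        return feature_emb * a.unsqueeze(-1)   # re-weight: gate each field's embedding, [b, f, emb]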
2、GateNet
(1) GateNet introduces gates at the embedding layer and in the DNN hidden layers to select the important feature information that takes part in feature interactions. For the embedding-layer gate, each field's embedding is fed through one FC layer to produce that field's gate, in one of two ways (a minimal code sketch follows after item (2)). Vector-Wise: the embedding passes through an FC layer with a single output node, giving a scalar weight for the field, which is multiplied with the original embedding. Bit-Wise: the embedding passes through an FC layer whose output size equals the embedding dimension, giving a weight vector that is combined with the original embedding via a Hadamard product.
(2) The hidden-layer gate works like the Bit-Wise case: the hidden representation goes through one FC layer to produce a weight vector of the same dimension, which is applied to the original hidden vector via a Hadamard product.
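A minimal sketch of the embedding-layer gate described above (the class name, the per-field gate parameters, and the absence of an activation on the gate output are assumptions, not GateNet's official implementation):

import torch
from torch import nn

class EmbeddingGate(nn.Module):
    def __init__(self, num_fields, embedding_dim, gate_type="vector_wise"):
        super(EmbeddingGate, self).__init__()
        # one FC gate per field; output size 1 (Vector-Wise) or embedding_dim (Bit-Wise)
        out_dim = 1 if gate_type == "vector_wise" else embedding_dim
        self.gates = nn.ModuleList([nn.Linear(embedding_dim, out_dim, bias=False)
                                    for _ in range(num_fields)])

    def forward(self, feature_emb):            # [b, f, emb]
        gated = []
        for i, gate in enumerate(self.gates):
            field_emb = feature_emb[:, i, :]   # [b, emb]
            g = gate(field_emb)                # [b, 1] or [b, emb]
            gated.append(field_emb * g)        # scalar re-weighting or Hadamard product
        return torch.stack(gated, dim=1)       # [b, f, emb]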
3、EDCN
(1) EDCN has two core modules. Bridge module: the explicit (cross) and implicit (DNN) interactions are computed separately at each layer and then fused, so the two sub-networks collaborate and better capture the interaction signals between layers. Regulation module: different features suit different interaction functions, so a field-wise gating network splits the fused representation into separately re-weighted copies, giving the subsequent explicit and implicit branches inputs with different feature distributions (see the sketch below).
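A rough sketch of what such a field-wise regulation gate could look like (the class name, the per-field learnable logits, and the softmax normalization are assumptions; EDCN's actual module details may differ):

import torch
from torch import nn

class FieldWiseRegulation(nn.Module):
    def __init__(self, num_fields, embedding_dim):
        super(FieldWiseRegulation, self).__init__()
        # one learnable logit per field, shared across that field's embedding dimensions
        self.field_logits = nn.Parameter(torch.ones(num_fields))
        self.embedding_dim = embedding_dim

    def forward(self, fused_emb):  # [b, f*emb], the fused (bridged) representation
        field_weights = torch.softmax(self.field_logits, dim=0)              # [f]
        bit_weights = field_weights.repeat_interleave(self.embedding_dim)    # [f*emb]
        return fused_emb * bit_weights  # re-weighted copy of the input for one branch

Each branch (cross and DNN) would hold its own FieldWiseRegulation instance, so the same fused features are re-weighted differently before entering each sub-network.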
4、MaskNet
(1) Instance-Guided Mask: built from two FC layers; the first (aggregation) layer gathers contextual information from all input features, and the second (projection) layer maps the result back so that the mask has the same dimension as the input. The aggregation layer is wider than the projection layer, because the projection layer's size must match the feature embedding layer. The instance-guided mask can be viewed as a special bit-wise attention or gating structure: Vmask acts as a weight on every bit, strengthening important features and weakening the influence of noise.
(2) MaskBlock: when the embedding layer is the input, the Instance-Guided Mask is computed from all input features; the embedding vector is passed through LayerNorm, combined with the mask via a Hadamard product, and then fed through FC + LayerNorm + ReLU for feature interaction.
The first variant: Serial MaskNet
class SerialMaskNet(nn.Module):
    def __init__(self, input_dim, output_dim=None, output_activation=None, hidden_units=[],
                 hidden_activations="ReLU", reduction_ratio=1, dropout_rates=0, layer_norm=True):
        super(SerialMaskNet, self).__init__()
        if not isinstance(dropout_rates, list):
            dropout_rates = [dropout_rates] * len(hidden_units)
        if not isinstance(hidden_activations, list):
            hidden_activations = [hidden_activations] * len(hidden_units)
        self.hidden_units = [input_dim] + hidden_units
        self.mask_blocks = nn.ModuleList()
        for idx in range(len(self.hidden_units) - 1):
            self.mask_blocks.append(MaskBlock(input_dim,
                                              self.hidden_units[idx],
                                              self.hidden_units[idx + 1],
                                              hidden_activations[idx],
                                              reduction_ratio,
                                              dropout_rates[idx],
                                              layer_norm))
        fc_layers = []
        if output_dim is not None:
            fc_layers.append(nn.Linear(self.hidden_units[-1], output_dim))
        if output_activation is not None:
            fc_layers.append(get_activation(output_activation))
        self.fc = None
        if len(fc_layers) > 0:
            self.fc = nn.Sequential(*fc_layers)

    def forward(self, V_emb, V_hidden):
        v_out = V_hidden
        for idx in range(len(self.hidden_units) - 1):
            v_out = self.mask_blocks[idx](V_emb, v_out)  # [b,block_dim]
        if self.fc is not None:
            v_out = self.fc(v_out)  # [b,1]
        return v_out

class MaskBlock(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, hidden_activation="ReLU", reduction_ratio=1,
                 dropout_rate=0, layer_norm=True):
        super(MaskBlock, self).__init__()
        self.mask_layer = nn.Sequential(nn.Linear(input_dim, int(hidden_dim * reduction_ratio)),
                                        nn.ReLU(),
                                        nn.Linear(int(hidden_dim * reduction_ratio), hidden_dim))
        hidden_layers = [nn.Linear(hidden_dim, output_dim, bias=False)]
        if layer_norm:
            hidden_layers.append(nn.LayerNorm(output_dim))  # normalization
        hidden_layers.append(get_activation(hidden_activation))
        if dropout_rate > 0:
            hidden_layers.append(nn.Dropout(p=dropout_rate))
        self.hidden_layer = nn.Sequential(*hidden_layers)

    def forward(self, V_emb, V_hidden):
        V_mask = self.mask_layer(V_emb)  # mask generated from the full input, [b,f*emb]
        v_out = self.hidden_layer(V_mask * V_hidden)  # Instance-Guided Mask applied via Hadamard product, [b,block_dim]
        return v_out
The second variant: Parallel MaskNet
class ParallelMaskNet(nn.Module):
    def __init__(self, input_dim, output_dim=None, output_activation=None, num_blocks=1, block_dim=64,
                 hidden_units=[], hidden_activations="ReLU", reduction_ratio=1, dropout_rates=0,
                 layer_norm=True):
        super(ParallelMaskNet, self).__init__()
        self.num_blocks = num_blocks
        self.mask_blocks = nn.ModuleList([MaskBlock(input_dim,
                                                    input_dim,
                                                    block_dim,
                                                    hidden_activations,
                                                    reduction_ratio,
                                                    dropout_rates,
                                                    layer_norm) for _ in range(num_blocks)])
        self.dnn = MLP_Layer(input_dim=block_dim * num_blocks,
                             output_dim=output_dim,
                             hidden_units=hidden_units,
                             hidden_activations=hidden_activations,
                             output_activation=output_activation,
                             dropout_rates=dropout_rates)

    def forward(self, V_emb, V_hidden):  # [b,f*emb]
        block_out = []
        for i in range(self.num_blocks):
            block_out.append(self.mask_blocks[i](V_emb, V_hidden))  # num_blocks tensors of [b,block_dim]
        concat_out = torch.cat(block_out, dim=-1)  # [b,num_blocks*block_dim]
        v_out = self.dnn(concat_out)  # feed-forward layer, [b,1]
        return v_out
# MaskBlock is the same class as defined above under Serial MaskNet.