AFM
Key point: In NFM, after the feature embeddings are crossed pairwise, the second-order interaction vectors are sum-pooled directly, i.e. summed with equal weight. Intuitively, though, different cross features should not matter equally: unimportant interactions should be down-weighted and important ones up-weighted. This is exactly what the attention mechanism offers, so the AFM authors introduce it into the model, assign each cross feature an importance weight, and replace the plain sum pooling with an attention-weighted sum over the second-order interactions.
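To make the contrast concrete, below is a minimal toy sketch (random tensors, not the model's actual code) of equal-weight sum pooling versus attention-weighted pooling:

```python
import torch

# toy example: batch of 1, three pairwise interaction vectors, embedding size 4
bi_interaction = torch.randn(1, 3, 4)                 # (B, num_pairs, E)

# NFM-style sum pooling: every interaction contributes equally
nfm_pooled = bi_interaction.sum(dim=1)                # (B, E)

# AFM-style pooling: one attention score per interaction (random here,
# learned in the real model), softmax-normalized over the pair dimension
att_score = torch.softmax(torch.randn(1, 3, 1), dim=1)
afm_pooled = (att_score * bi_interaction).sum(dim=1)  # (B, E)
```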
Model
Prediction formula:
![AFM prediction formula](https://i-blog.csdnimg.cn/blog_migrate/4e026d4be6ca9e287457b6e161668f73.png#pic_center)
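For reference (in case the image does not load), the AFM prediction formula from the original paper is

$$\hat{y}_{AFM}(\mathbf{x}) = w_0 + \sum_{i=1}^{n} w_i x_i + \mathbf{p}^{T} \sum_{i=1}^{n} \sum_{j=i+1}^{n} a_{ij}\,(\mathbf{v}_i \odot \mathbf{v}_j)\, x_i x_j$$

where $\mathbf{v}_i$ is the embedding of feature $i$, $\odot$ is the element-wise product, $a_{ij}$ is the attention weight of the pair $(i, j)$, and $\mathbf{p}$ corresponds to `projection_p` in the code below.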
Architecture:
![AFM architecture](https://i-blog.csdnimg.cn/blog_migrate/5754df28b4b7e3624c0e7b6b00a2d417.png#pic_center)
import itertools

import torch
import torch.nn as nn
import torch.nn.functional as F

# In the original deepctr-torch code, BaseModel (deepctr_torch.models.basemodel) provides
# input_from_feature_columns / linear_model / add_regularization_weight / out, and FM comes
# from deepctr_torch.layers; BaseModel is stubbed out here to keep the focus on AFM.
class BaseModel:
    pass
class AFM(BaseModel):
"""Instantiates the Attentional Factorization Machine architecture.
:param linear_feature_columns: An iterable containing all the features used by linear part of the model.
:param dnn_feature_columns: An iterable containing all the features used by deep part of the model.
:param use_attention: bool, whether to use attention or not. If set to ``False``, it is the same as **standard Factorization Machine**
:param attention_factor: positive integer, units in attention net
:param l2_reg_linear: float. L2 regularizer strength applied to linear part
:param l2_reg_embedding: float. L2 regularizer strength applied to embedding vector
:param l2_reg_att: float. L2 regularizer strength applied to attention net
:param afm_dropout: float in [0,1), Fraction of the attention net output units to dropout.
:param init_std: float, to use as the initial standard deviation of the embedding vectors
:param seed: integer, to use as random seed.
:param task: str, ``"binary"`` for binary logloss or ``"regression"`` for regression loss
:param device: str, ``"cpu"`` or ``"cuda:0"``
:return: A PyTorch model instance.
"""
def __init__(self, linear_feature_columns, dnn_feature_columns, use_attention=True, attention_factor=8,
l2_reg_linear=1e-5, l2_reg_embedding=1e-5, l2_reg_att=1e-5, afm_dropout=0, init_std=0.0001, seed=1024,
task='binary', device='cpu'):
super(AFM, self).__init__(linear_feature_columns, dnn_feature_columns, l2_reg_linear=l2_reg_linear,
l2_reg_embedding=l2_reg_embedding, init_std=init_std, seed=seed, task=task,
device=device)
self.use_attention = use_attention
if use_attention:
self.fm = AFMLayer(self.embedding_size, attention_factor, l2_reg_att, afm_dropout,
seed, device)
# apply L2 regularization to the attention weights to reduce overfitting
self.add_regularization_weight(self.fm.attention_W, l2_reg_att)
else:
self.fm = FM()
self.to(device)
def forward(self, X):
sparse_embedding_list, _ = self.input_from_feature_columns(X, self.dnn_feature_columns,
self.embedding_dict, support_dense=True)
# first-order linear term
logit = self.linear_model(X)
# second-order attention-weighted interaction term
if len(sparse_embedding_list) > 0:
if self.use_attention:
logit += self.fm(sparse_embedding_list)
else:
logit += self.fm(torch.cat(sparse_embedding_list, dim=1))
# prediction
y_pred = self.out(logit)
return y_pred
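A minimal usage sketch, assuming the real deepctr-torch `BaseModel`, `FM` and `SparseFeat` are available (feature names and sizes below are made up for illustration):

```python
from deepctr_torch.inputs import SparseFeat

# hypothetical feature columns; all sparse features must share the same embedding_dim
feature_columns = [
    SparseFeat('user_id', vocabulary_size=1000, embedding_dim=8),
    SparseFeat('item_id', vocabulary_size=5000, embedding_dim=8),
]

model = AFM(linear_feature_columns=feature_columns,
            dnn_feature_columns=feature_columns,
            use_attention=True, attention_factor=8,
            task='binary', device='cpu')
```

With `use_attention=False` the same class falls back to a plain FM for the second-order term; training then follows the usual deepctr-torch `compile` / `fit` workflow.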
AFMLayer
![Attention network](https://i-blog.csdnimg.cn/blog_migrate/4f23971467fd084f81c3b05a34abb640.png#pic_center)
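In formula form, the attention network is a single-layer MLP applied to each pairwise interaction, followed by a softmax over all pairs; its parameters correspond to `attention_W`, `attention_b` and `projection_h` in the code below:

$$a'_{ij} = \mathbf{h}^{T}\,\mathrm{ReLU}\big(\mathbf{W}\,(\mathbf{v}_i \odot \mathbf{v}_j)\,x_i x_j + \mathbf{b}\big), \qquad a_{ij} = \frac{\exp(a'_{ij})}{\sum_{(i,j)} \exp(a'_{ij})}$$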
class AFMLayer(nn.Module):
"""Attentonal Factorization Machine models pairwise (order-2) feature
interactions without linear term and bias.
Input shape
- A list of 3D tensors with shape: ``(batch_size,1,embedding_size)``.
Output shape
- 2D tensor with shape: ``(batch_size, 1)``.
Arguments
- **in_features** : Positive integer, dimensionality of input features.
- **attention_factor** : Positive integer, dimensionality of the
attention network output space.
- **l2_reg_w** : float between 0 and 1. L2 regularizer strength
applied to attention network.
- **dropout_rate** : float in [0,1). Fraction of the attention net output units to dropout.
- **seed** : A Python integer to use as random seed.
References
- [Attentional Factorization Machines : Learning the Weight of Feature
Interactions via Attention Networks](https://arxiv.org/pdf/1708.04617.pdf)
"""
def __init__(self, in_features, attention_factor=4, l2_reg_w=0, dropout_rate=0, seed=1024, device='cpu'):
super(AFMLayer, self).__init__()
self.attention_factor = attention_factor
self.l2_reg_w = l2_reg_w
self.dropout_rate = dropout_rate
self.seed = seed
embedding_size = in_features
self.attention_W = nn.Parameter(torch.Tensor(
embedding_size, self.attention_factor))
self.attention_b = nn.Parameter(torch.Tensor(self.attention_factor))
self.projection_h = nn.Parameter(
torch.Tensor(self.attention_factor, 1))
self.projection_p = nn.Parameter(torch.Tensor(embedding_size, 1))
for tensor in [self.attention_W, self.projection_h, self.projection_p]:
nn.init.xavier_normal_(tensor, )
for tensor in [self.attention_b]:
nn.init.zeros_(tensor, )
self.dropout = nn.Dropout(dropout_rate)
self.to(device)
def forward(self, inputs):
embeds_vec_list = inputs
row = []
col = []
# enumerate all possible second-order feature crosses
for r, c in itertools.combinations(embeds_vec_list, 2):
row.append(r)
col.append(c)
# left operand x_i of each cross term
# (B, num_pairs, E)
p = torch.cat(row, dim=1)
# right operand x_j of each cross term
q = torch.cat(col, dim=1)
# element-wise interaction x_i * x_j
inner_product = p * q
# pairwise interaction terms fed to the attention net
# (B, num_pairs, E)
bi_interaction = inner_product
# single fully connected layer of the attention net
# (B, num_pairs, attention_factor)
attention_temp = F.relu(torch.tensordot(
bi_interaction, self.attention_W, dims=([-1], [0])) + self.attention_b)
# attention weight for each cross term
# (B, num_pairs, 1)
self.normalized_att_score = F.softmax(torch.tensordot(
attention_temp, self.projection_h, dims=([-1], [0])), dim=1)
# attention-weighted sum over all second-order interactions, (B, E)
attention_output = torch.sum(
self.normalized_att_score * bi_interaction, dim=1)
# dropout to reduce overfitting (only active in training mode)
attention_output = self.dropout(attention_output)
# (B, 1)
afm_out = torch.tensordot(
attention_output, self.projection_p, dims=([-1], [0]))
return afm_out
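A quick standalone check of the layer with made-up shapes: each field embedding is a ``(batch_size, 1, embedding_size)`` tensor and the output is a ``(batch_size, 1)`` logit.

```python
import torch

batch_size, num_fields, embedding_size = 32, 4, 8
embeds = [torch.randn(batch_size, 1, embedding_size) for _ in range(num_fields)]

layer = AFMLayer(in_features=embedding_size, attention_factor=4)
out = layer(embeds)
print(out.shape)                          # torch.Size([32, 1])
print(layer.normalized_att_score.shape)   # torch.Size([32, 6, 1]), C(4,2)=6 pairs
```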