# CTR预估算法之FM, FFM, DeepFM及实践

## Factorization Machines(FM)

FM的paper地址如下：https://www.csie.ntu.edu.tw/~b97053/paper/Rendle2010FM.pdf
FM主要目标是：解决数据稀疏的情况下，特征怎样组合的问题

1. 可以在非常稀疏的数据中进行合理的参数估计
2. FM模型的时间复杂度是线性的
3. FM是一个通用模型，它可以用于任何特征为实值的情况

#### 算法原理

$y = w_0 + \sum_{i=1}^n w_i x_i$

$y = w_0 + \sum_{i=1}^n w_i x_i + \sum_{i=1}^{n}\sum_{j=i+1}^n w_{ij} x_i x_j$

$y = w_0 + \sum_{i=1}^n w_i x_i + \sum_{i=1}^{n}\sum_{j=i+1}^n \langle V_i, V_j \rangle x_i x_j$

$$
\begin{aligned}
\sum_{i=1}^{n}\sum_{j=i+1}^n \langle V_i, V_j \rangle x_i x_j
&= \frac{1}{2}\sum_{i=1}^{n}\sum_{j=1}^n \langle V_i, V_j \rangle x_i x_j - \frac{1}{2}\sum_{i=1}^n \langle V_i, V_i \rangle x_i x_i \\
&= \frac{1}{2}\left(\sum_{i=1}^n\sum_{j=1}^{n}\sum_{f=1}^k v_{if} v_{jf} x_i x_j - \sum_{i=1}^{n}\sum_{f=1}^k v_{if} v_{if} x_i x_i\right) \\
&= \frac{1}{2}\sum_{f=1}^{k}\left(\left(\sum_{i=1}^n v_{if} x_i\right)\left(\sum_{j=1}^n v_{jf} x_j\right) - \sum_{i=1}^n v_{if}^2 x_i^2\right) \\
&= \frac{1}{2}\sum_{f=1}^{k}\left(\left(\sum_{i=1}^n v_{if} x_i\right)^2 - \sum_{i=1}^n v_{if}^2 x_i^2\right)
\end{aligned}
$$

#### 代码实现

class FM(object):
    """Factorization Machine with FTRL optimization.

    Implements y = w0 + sum_i w_i x_i + sum_{i<j} <V_i, V_j> x_i x_j,
    with the pairwise term computed via the O(kn) reformulation
    0.5 * sum_f ((sum_i v_if x_i)^2 - sum_i v_if^2 x_i^2).
    """

    def __init__(self, config):
        """
        :param config: configuration of hyperparameters (dict) with keys
            'k' (latent dimension), 'lr', 'batch_size', 'reg_l1', 'reg_l2'.
        """
        # number of latent factors
        self.k = config['k']
        self.lr = config['lr']
        self.batch_size = config['batch_size']
        # L1/L2 regularization strengths for the FTRL optimizer
        self.reg_l1 = config['reg_l1']
        self.reg_l2 = config['reg_l2']
        # num of features; NOTE(review): `feature_length` is a module-level
        # global defined elsewhere in the project — confirm it is in scope.
        self.p = feature_length

        # sparse input: one feature vector of length p per sample
        self.X = tf.sparse_placeholder('float32', [None, self.p])
        self.y = tf.placeholder('int64', [None, ])
        self.keep_prob = tf.placeholder('float32')

    def inference(self):
        """
        Forward propagation: builds logits (self.y_out), softmax
        probabilities, cross-entropy loss and accuracy.
        """
        with tf.variable_scope('linear_layer'):
            b = tf.get_variable('bias', shape=[2],
                                initializer=tf.zeros_initializer())
            w1 = tf.get_variable('w1', shape=[self.p, 2],
                                 initializer=tf.truncated_normal_initializer(mean=0, stddev=1e-2))
            # first-order term, shape [None, 2]
            self.linear_terms = tf.add(tf.sparse_tensor_dense_matmul(self.X, w1), b)

        with tf.variable_scope('interaction_layer'):
            v = tf.get_variable('v', shape=[self.p, self.k],
                                initializer=tf.truncated_normal_initializer(mean=0, stddev=0.01))
            # element-wise square of the sparse input; tf.pow does not accept
            # a SparseTensor, so square the stored values directly
            X_square = tf.SparseTensor(self.X.indices,
                                       tf.pow(self.X.values, 2),
                                       self.X.dense_shape)
            # second-order term via the O(kn) FM identity, shape [None, 1];
            # reduce_sum (not reduce_mean) matches the formula's sum over f=1..k
            self.interaction_terms = tf.multiply(0.5,
                                                 tf.reduce_sum(
                                                     tf.subtract(
                                                         tf.pow(tf.sparse_tensor_dense_matmul(self.X, v), 2),
                                                         tf.sparse_tensor_dense_matmul(X_square, tf.pow(v, 2))),
                                                     1, keep_dims=True))
        # logits, shape [None, 2]; the [None, 1] interaction term broadcasts
        # across both classes (self.y_out was used below but never defined)
        self.y_out = tf.add(self.linear_terms, self.interaction_terms)
        self.y_out_prob = tf.nn.softmax(self.y_out)

        cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=self.y, logits=self.y_out)
        self.loss = tf.reduce_mean(cross_entropy)
        tf.summary.scalar('loss', self.loss)

        # accuracy (was mistakenly referencing a global `model` instead of self)
        self.correct_prediction = tf.equal(tf.cast(tf.argmax(self.y_out, 1), tf.int64), self.y)
        self.accuracy = tf.reduce_mean(tf.cast(self.correct_prediction, tf.float32))
        tf.summary.scalar('accuracy', self.accuracy)

    def train(self):
        """Creates the FTRL training op with L1/L2 regularization."""
        self.global_step = tf.Variable(0, trainable=False)
        # define optimizer
        optimizer = tf.train.FtrlOptimizer(self.lr,
                                           l1_regularization_strength=self.reg_l1,
                                           l2_regularization_strength=self.reg_l2)
        # run any pending update ops (e.g. batch-norm statistics) before the step
        extra_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(extra_update_ops):
            self.train_op = optimizer.minimize(self.loss, global_step=self.global_step)

    def build_graph(self):
        """Build graph for model."""
        self.inference()
        self.train()


## Field-aware Factorization Machines(FFM)

FFM的论文地址：https://www.csie.ntu.edu.tw/~cjlin/papers/ffm.pdf
FFM（Field-aware Factorization Machine）最初的概念来自Yu-Chin Juan（阮毓钦，毕业于中国台湾大学，现在美国Criteo工作）与其比赛队员，提出了FM的升级版模型。通过引入field的概念，FFM把相同性质的特征归于同一个field。

#### 算法原理

$y = w_0 + \sum_{i=1}^n w_i x_i + \sum_{i=1}^n\sum_{j=i+1}^n \langle V_{i,f_j}, V_{j,f_i} \rangle x_i x_j$

$K_{FFM} \ll K_{FM}$

| Clicked | Publisher (P) | Advertiser (A) | Gender (G) |
| ------- | ------------- | -------------- | ---------- |
| Yes     | ESPN          | Nike           | Male       |

$\phi_{FM}(V, x) = \langle V_{ESPN}, V_{Nike} \rangle + \langle V_{ESPN}, V_{Male} \rangle + \langle V_{Nike}, V_{Male} \rangle$

$\phi_{FFM}(V, x) = \langle V_{ESPN,A}, V_{Nike,P} \rangle + \langle V_{ESPN,G}, V_{Male,P} \rangle + \langle V_{Nike,G}, V_{Male,A} \rangle$

在 $\langle V_{ESPN,A}, V_{Nike,P} \rangle$ 中，因为 $Nike$ 属于字段 $A$（Advertiser），所以 $ESPN$ 的这个特征必须使用对应字段 $A$ 的隐向量，以区分其他字段。在 $\langle V_{ESPN,G}, V_{Male,P} \rangle$ 中，因为与其交叉的特征 $Male$ 属于字段 $G$（Gender），所以使用了 $V_{ESPN,G}$ 这个latent vector。这样，每个特征都有 $f$ 个latent vector。

#### 代码实现

class FFM(object):
    """Field-aware Factorization Machine.

    Implements y = w0 + sum_i w_i x_i
                  + sum_{i<j} <V_{i,f_j}, V_{j,f_i}> x_i x_j
    where f_j is the field of feature j: each feature keeps one latent
    vector per field of the *other* feature in the pair.
    """

    def __init__(self, config):
        """
        :param config: configuration of hyperparameters (dict) with keys
            'k' (latent dimension), 'f' (number of fields), 'lr',
            'batch_size', 'reg_l1', 'reg_l2', and 'feature2field'
            (mapping from feature index to its field index).
        """
        # number of latent factors
        self.k = config['k']
        # num of fields
        self.f = config['f']
        # num of features; NOTE(review): `feature_length` is a module-level
        # global defined elsewhere in the project — confirm it is in scope.
        self.p = feature_length
        self.lr = config['lr']
        self.batch_size = config['batch_size']
        # L1/L2 regularization strengths for the FTRL optimizer
        self.reg_l1 = config['reg_l1']
        self.reg_l2 = config['reg_l2']
        # maps feature index -> field index
        self.feature2field = config['feature2field']

        self.X = tf.placeholder('float32', [self.batch_size, self.p])
        self.y = tf.placeholder('int64', [None, ])
        self.keep_prob = tf.placeholder('float32')

    def inference(self):
        """
        Forward propagation: builds logits (self.y_out), softmax
        probabilities, cross-entropy loss and accuracy.
        """
        with tf.variable_scope('linear_layer'):
            b = tf.get_variable('bias', shape=[2],
                                initializer=tf.zeros_initializer())
            w1 = tf.get_variable('w1', shape=[self.p, 2],
                                 initializer=tf.truncated_normal_initializer(mean=0, stddev=1e-2))
            # first-order term, shape [batch_size, 2]
            # (w1 and b were previously created but never used)
            self.linear_terms = tf.add(tf.matmul(self.X, w1), b)

        with tf.variable_scope('field_aware_interaction_layer'):
            v = tf.get_variable('v', shape=[self.p, self.f, self.k], dtype='float32',
                                initializer=tf.truncated_normal_initializer(mean=0, stddev=0.01))
            # per-sample scalar accumulator, ends up shape [batch_size]
            self.field_aware_interaction_terms = tf.constant(0, dtype='float32')
            # sum_{i<j} <V_{i,f_j}, V_{j,f_i}> x_i x_j: feature i uses the
            # latent vector specific to j's field and vice versa — the
            # original indexed both vectors with the feature's OWN field,
            # contradicting the FFM formula
            for i in range(self.p):
                for j in range(i + 1, self.p):
                    self.field_aware_interaction_terms += tf.multiply(
                        tf.reduce_sum(tf.multiply(v[i, self.feature2field[j]],
                                                  v[j, self.feature2field[i]])),
                        tf.multiply(self.X[:, i], self.X[:, j])
                    )
        # logits, shape [batch_size, 2]; expand the per-sample scalar so it
        # broadcasts across both classes (self.y_out was never defined before)
        self.y_out = tf.add(self.linear_terms,
                            tf.expand_dims(self.field_aware_interaction_terms, axis=1))
        self.y_out_prob = tf.nn.softmax(self.y_out)

        cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=self.y, logits=self.y_out)
        self.loss = tf.reduce_mean(cross_entropy)
        tf.summary.scalar('loss', self.loss)

        # accuracy (was mistakenly referencing a global `model` instead of self)
        self.correct_prediction = tf.equal(tf.cast(tf.argmax(self.y_out, 1), tf.int64), self.y)
        self.accuracy = tf.reduce_mean(tf.cast(self.correct_prediction, tf.float32))
        tf.summary.scalar('accuracy', self.accuracy)

    def train(self):
        """Creates the FTRL training op (the optimizer was previously
        referenced in minimize() but never defined)."""
        self.global_step = tf.Variable(0, trainable=False)
        # define optimizer — mirrors FM: FTRL with L1/L2 regularization
        optimizer = tf.train.FtrlOptimizer(self.lr,
                                           l1_regularization_strength=self.reg_l1,
                                           l2_regularization_strength=self.reg_l2)
        # run any pending update ops before the training step
        extra_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(extra_update_ops):
            self.train_op = optimizer.minimize(self.loss, global_step=self.global_step)

    def build_graph(self):
        """Build graph for model."""
        self.inference()
        self.train()


### Deep FM

DeepFM模型结合了广度和深度模型的优点，联合训练FM模型和DNN模型，来同时学习低阶特征组合和高阶特征组合。此外，DeepFM模型的Deep component和FM component从Embedding层共享数据输入，这样做的好处是Embedding层的隐式向量在（反向传播）训练时可以同时接受到Deep component和FM component的信息，从而使Embedding层的信息表达更加准确而最终提升推荐效果。DeepFM相对于现有的广度模型、深度模型以及Wide&Deep模型的优势在于：

• DeepFM模型同时对低阶特征组合和高阶特征组合建模，从而能够学习到各阶特征之间的组合关系
• DeepFM模型是一个端到端的模型，不需要任何的人工特征工程

#### 算法原理

DeepFM包含两部分，左边的FM部分和右边的DNN部分。这两部分共享相同的输入。对于给定的特征 $i$，$w_i$ 用于表示一阶特征的重要性，特征 $i$ 的隐向量（latent vector）$V_i$ 用于表示和其他特征的相互影响。在FM部分，$V_i$ 用于表征二阶特征，同时在神经网络部分用于构建高阶特征。对于当前模型，所有的参数共同参与训练。DeepFM的预测结果可以写为

$y = \mathrm{sigmoid}(y_{FM} + y_{DNN})$

$y \in (0,1)$ 是预测的CTR，$y_{FM}$ 是FM部分得到的结果，$y_{DNN}$ 是DNN部分的结果。

$y_{FM} = w_0 + \sum_{i=1}^n w_i x_i + \sum_{i=1}^{n}\sum_{j=i+1}^n \langle V_i, V_j \rangle x_i x_j$

embedding layer的结构如下图所示，

embedding layer有两个有趣的特性：

• 输入数据的每个字段的特征经过embedding之后，都为 $k$ 维（latent vector的维度），所以embedding后的特征维度是 字段数 $\times k$
• 在FM里得到的隐变量 $V$ 现在作为了嵌入层网络的权重，FM模型作为整个模型的一部分与其他深度学习模型一起参与整体的学习，实现端到端的训练。

$a^{(0)} = [e_1, e_2, ..., e_m]$

$e_i$ 是第 $i$ 个字段的embedding，$m$ 是字段的个数。$a^{(0)}$ 是输入神经网络的向量，然后通过如下方式前向传播：

$a^{(l+1)} = \sigma(W^{(l)} a^{(l)} + b^{(l)})$

• 从原始数据中同时学习到了低维与高维特征
• 不再需要特征工程。而Wide&Deep Model需要

#### 代码实现

class DeepFM(object):
"""
Deep FM with FTRL optimization
"""
def __init__(self, config):
    """
    :param config: configuration of hyperparameters (dict) with keys
        'k' (latent dimension), 'lr', 'batch_size', 'reg_l1', 'reg_l2'.
    """
    # number of latent factors
    self.k = config['k']
    self.lr = config['lr']
    self.batch_size = config['batch_size']
    # L1/L2 regularization strengths for the FTRL optimizer
    self.reg_l1 = config['reg_l1']
    self.reg_l2 = config['reg_l2']
    # num of features; NOTE(review): `feature_length` and `field_cnt` are
    # module-level globals defined elsewhere — confirm they are in scope.
    self.p = feature_length
    # num of fields
    self.field_cnt = field_cnt

    self.X = tf.placeholder('float32', [None, self.p])
    self.y = tf.placeholder('int64', [None, ])
    # index of the non-zero feature of each field per sample; use the
    # attribute assigned above rather than the bare global `field_cnt`
    self.feature_inds = tf.placeholder('int64', [None, self.field_cnt])
    self.keep_prob = tf.placeholder('float32')

def inference(self):
    """
    Forward propagation: builds the FM logits (self.y_fm), the DNN logits
    (self.y_dnn), their sum (self.y_out), softmax probabilities,
    cross-entropy loss and accuracy.
    """
    # shared embedding matrix: serves as the FM latent factors AND as the
    # DNN embedding-layer weights — the core idea of DeepFM
    v = tf.Variable(tf.truncated_normal(shape=[self.p, self.k], mean=0, stddev=0.01),
                    dtype='float32')

    # Factorization Machine component
    with tf.variable_scope('FM'):
        b = tf.get_variable('bias', shape=[2],
                            initializer=tf.zeros_initializer())
        w1 = tf.get_variable('w1', shape=[self.p, 2],
                             initializer=tf.truncated_normal_initializer(mean=0, stddev=1e-2))
        # first-order term, shape [None, 2]
        # (b and w1 were previously created but never used)
        self.linear_terms = tf.add(tf.matmul(self.X, w1), b)

        # second-order term via the O(kn) FM identity, shape [None, 1];
        # reduce_sum (not reduce_mean) matches the formula's sum over k
        self.interaction_terms = tf.multiply(0.5,
                                             tf.reduce_sum(
                                                 tf.subtract(
                                                     tf.pow(tf.matmul(self.X, v), 2),
                                                     tf.matmul(tf.pow(self.X, 2), tf.pow(v, 2))),
                                                 1, keep_dims=True))
        # FM output, shape [None, 2] (interaction term broadcasts)
        self.y_fm = tf.add(self.linear_terms, self.interaction_terms)

    # three-hidden-layer neural network, network shape of (200-200-200)
    with tf.variable_scope('DNN', reuse=False):
        # embedding layer: gather each field's latent vector from v and
        # flatten to [None, field_cnt * k]
        y_embedding_input = tf.reshape(tf.gather(v, self.feature_inds),
                                       [-1, self.field_cnt * self.k])
        # first hidden layer
        w1_dnn = tf.get_variable('w1_dnn', shape=[self.field_cnt * self.k, 200],
                                 initializer=tf.truncated_normal_initializer(mean=0, stddev=1e-2))
        b1_dnn = tf.get_variable('b1_dnn', shape=[200],
                                 initializer=tf.constant_initializer(0.001))
        y_hidden_l1 = tf.nn.relu(tf.matmul(y_embedding_input, w1_dnn) + b1_dnn)
        # second hidden layer
        w2 = tf.get_variable('w2', shape=[200, 200],
                             initializer=tf.truncated_normal_initializer(mean=0, stddev=1e-2))
        b2 = tf.get_variable('b2', shape=[200],
                             initializer=tf.constant_initializer(0.001))
        y_hidden_l2 = tf.nn.relu(tf.matmul(y_hidden_l1, w2) + b2)
        # third hidden layer (variables were misleadingly registered under
        # the names 'w1'/'b1'; renamed to 'w3'/'b3' for the checkpoint)
        w3 = tf.get_variable('w3', shape=[200, 200],
                             initializer=tf.truncated_normal_initializer(mean=0, stddev=1e-2))
        b3 = tf.get_variable('b3', shape=[200],
                             initializer=tf.constant_initializer(0.001))
        y_hidden_l3 = tf.nn.relu(tf.matmul(y_hidden_l2, w3) + b3)
        # output layer: linear, NOT ReLU — applying ReLU to logits clips
        # every negative logit to zero and cripples the softmax
        w_out = tf.get_variable('w_out', shape=[200, 2],
                                initializer=tf.truncated_normal_initializer(mean=0, stddev=1e-2))
        b_out = tf.get_variable('b_out', shape=[2],
                                initializer=tf.constant_initializer(0.001))
        self.y_dnn = tf.matmul(y_hidden_l3, w_out) + b_out

    # add FM output and DNN output (self.y_out was used below but never defined)
    self.y_out = tf.add(self.y_fm, self.y_dnn)
    self.y_out_prob = tf.nn.softmax(self.y_out)

    cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=self.y, logits=self.y_out)
    self.loss = tf.reduce_mean(cross_entropy)
    tf.summary.scalar('loss', self.loss)

    # accuracy (was mistakenly referencing a global `model` instead of self)
    self.correct_prediction = tf.equal(tf.cast(tf.argmax(self.y_out, 1), tf.int64), self.y)
    self.accuracy = tf.reduce_mean(tf.cast(self.correct_prediction, tf.float32))
    # add summary to accuracy
    tf.summary.scalar('accuracy', self.accuracy)

def train(self):
    """Create the FTRL training op; any collected UPDATE_OPS run first."""
    self.global_step = tf.Variable(0, trainable=False)
    # FTRL with L1/L2 regularization, as configured in __init__
    ftrl = tf.train.FtrlOptimizer(
        self.lr,
        l1_regularization_strength=self.reg_l1,
        l2_regularization_strength=self.reg_l2)
    pending_updates = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(pending_updates):
        self.train_op = ftrl.minimize(self.loss, global_step=self.global_step)

def build_graph(self):
"""build graph for model"""
self.inference()