在处理MF矩阵分解时使用过FunkSVD,最后在分解出P、Q矩阵的时候果然还是用到了一般套路:根据预测值y和实际值y的差别做梯度下降来寻找。所以能否直接沿用这个思路,把它变成包含多个特征的回归模型呢?
$$y=w_{0}+\sum_{i=1}^{n}w_{i}x_{i},\quad n\text{表示特征数量}$$
但是普通的线性模型,并没有考虑到特征与特征之间的相互关系(如图中的每一行)。所以加上一项:
$$y=w_{0}+\sum_{i=1}^{n}w_{i}x_{i}+\sum_{i=1}^{n-1}\sum_{j=i+1}^{n}w_{ij}x_{i}x_{j}$$
但是在数据矩阵很稀疏的情况下,即 $x_i,x_j$ 同时非0的情况非常少,$w_{ij}$ 实际上是无法仅仅通过训练得出的。于是需要引入一个辅助向量
$$V_{i}=(v_{i1},v_{i2},\cdots,v_{ik})^{T}$$
其中k为超参,可以将y改写成:
$$y=w_{0}+\sum_{i=1}^{n}w_{i}x_{i}+\sum_{i=1}^{n-1}\sum_{j=i+1}^{n}(v_{i}^{T}v_{j})x_{i}x_{j}$$
即引入V:
此时的交互矩阵,
也就是说我们相对对W进行了一种矩阵分解,那么在高稀疏上的表达上得到V相对来说是容易的。同样我们接着要求导,先化简一下后面的式子:
然后再求导和随机梯度下降SGD就行了。下面使用经典的MovieLens100k数据集,即由明尼苏达大学GroupLens研究组收集整理的943名用户对1682部电影的共100000条匿名评分(1-5分)。数据包括四列:用户id,电影id,评分和时间戳。
user item rating timestamp 0 1 1 5 874965758 1 1 2 3 876893171 2 1 3 4 878542960 3 1 4 3 876893119 4 1 5 3 889751712 5 1 6 5 887431973 6 1 7 4 875071561 7 1 8 1 875072484 8 1 9 5 878543541 9 1 10 3 875693118
FM的代码为:
from collections import defaultdict
from itertools import count

import numpy as np
import pandas as pd
from scipy.sparse import csr, csr_matrix
from sklearn.feature_extraction import DictVectorizer
import tensorflow as tf
from tqdm import tqdm
#from tqdm import tqdm_notebook as tqdm
######数据处理
#将原始文件输入转换成我们需要的稀疏矩阵(稀疏矩阵编码格式)
def vectorize_dic(dic, ix=None, p=None, n=0, g=0):
    """One-hot encode a dict of parallel feature lists into a sparse matrix.

    Parameters
    ----------
    dic : dict mapping field name -> sequence of n raw feature values,
        e.g. {'users': [...], 'items': [...]}; all sequences share length n.
    ix : dict or None
        Mapping "value+field" -> column index. Pass the dict returned while
        encoding the training set when encoding the test set so both share
        the same column space; None builds a fresh mapping.
    p : int or None
        Number of output columns. None means len(ix) after encoding. Entries
        whose column index falls outside [0, p) are silently dropped (these
        are features unseen during training).
    n : int
        Number of samples (rows).
    g : int
        Number of fields, i.e. len(dic).

    Returns
    -------
    (scipy.sparse.csr_matrix of shape (n, p), ix)
    """
    if ix is None:  # `is None`, not `== None` (identity check is the idiom)
        ix = {}
    nz = n * g  # exactly one nonzero per (sample, field)
    col_ix = np.empty(nz, dtype=int)
    field = 0
    for k, lis in dic.items():
        for t in range(len(lis)):
            key = str(lis[t]) + str(k)
            # Bug fix: the original stored an occurrence COUNT here
            # (ix[key] = ix.get(key, 0) + 1), so distinct features collided
            # on the same column. Assign each distinct feature the next
            # unused column index instead.
            if key not in ix:
                ix[key] = len(ix)
            # Nonzeros are laid out row-major: sample t occupies positions
            # [t*g, t*g+g), one slot per field.
            col_ix[field + t * g] = ix[key]
        field += 1
    row_ix = np.repeat(np.arange(0, n), g)  # row number of each nonzero
    data = np.ones(nz)
    if p is None:
        p = len(ix)
    keep = np.where(col_ix < p)  # drop columns outside the known feature space
    return csr_matrix((data[keep], (row_ix[keep], col_ix[keep])), shape=(n, p)), ix
#batch函数
def batcher(X_, y_=None, batch_size=-1):
    """Yield (X, y) mini-batches of at most `batch_size` rows.

    batch_size == -1 means a single batch holding every sample. The yielded
    y element is None whenever y_ is not supplied.
    """
    total = X_.shape[0]
    if batch_size == -1:
        batch_size = total
    if batch_size < 1:
        raise ValueError('Parameter batch_size={} is unsupported'.format(batch_size))
    start = 0
    while start < total:
        stop = min(start + batch_size, total)
        chunk_y = y_[start:stop] if y_ is not None else None
        yield (X_[start:stop], chunk_y)
        start += batch_size
# ----- Read the data -----
# MovieLens-100k files: tab-separated user / item / rating / timestamp columns.
cols = ['user','item','rating','timestamp']
train = pd.read_csv('data/ua.base',delimiter='\t',names = cols)
test = pd.read_csv('data/ua.test',delimiter='\t',names = cols)
print(train,test)
# One-hot encode user ids and item ids into a sparse design matrix.
# The feature index `ix` built on the training set is passed back in for the
# test set so both matrices share the same column space.
x_train,ix = vectorize_dic({'users':train['user'].values,
'items':train['item'].values},n=len(train.index),g=2)
x_test,ix = vectorize_dic({'users':test['user'].values,
'items':test['item'].values},ix,x_train.shape[1],n=len(test.index),g=2)
# Encoded (sparse) form
print(x_train)
y_train = train['rating'].values
y_test = test['rating'].values
# Densify so the batches can be fed through tf.placeholder
x_train = x_train.todense()
x_test = x_test.todense()
print(x_train)
print(x_train.shape)
print (x_test.shape)
# ----- Build the TensorFlow graph (TF 1.x API: placeholders + Session) -----
# FM parameters: global bias w0, linear weights w, factor matrix v (k x p).
n,p = x_train.shape
k = 10  # latent dimension of the factor vectors (hyper-parameter)
x = tf.placeholder('float',[None,p])
y = tf.placeholder('float',[None,1])
w0 = tf.Variable(tf.zeros([1]))
w = tf.Variable(tf.zeros([p]))
v = tf.Variable(tf.random_normal([k,p],mean=0,stddev=0.01))
#y_hat = tf.Variable(tf.zeros([n,1]))
# Linear term: w0 + sum_i w_i x_i (row-wise sum).
# NOTE(review): `keep_dims` is the pre-TF-1.5 spelling; newer TF expects
# `keepdims` -- confirm against the installed TF version.
linear_terms = tf.add(w0,tf.reduce_sum(tf.multiply(w,x),1,keep_dims=True))
# Pairwise interaction term in its O(kn) simplified form:
# 0.5 * sum_f [ (sum_i v_{f,i} x_i)^2 - sum_i v_{f,i}^2 x_i^2 ]
pair_interactions = 0.5 * tf.reduce_sum(tf.subtract( tf.pow( tf.matmul(x,tf.transpose(v)),2), tf.matmul(tf.pow(x,2),tf.transpose(tf.pow(v,2)))),axis = 1 , keep_dims=True)
# Full prediction y_hat = linear term + interaction term
y_hat = tf.add(linear_terms,pair_interactions)
# L2 regularization strengths for w and v
lambda_w = tf.constant(0.001,name='lambda_w')
lambda_v = tf.constant(0.001,name='lambda_v')
l2_norm = tf.reduce_sum(
tf.add(tf.multiply(lambda_w,tf.pow(w,2)),tf.multiply(lambda_v,tf.pow(v,2))))
# Loss = mean squared error + L2 penalty
error = tf.reduce_mean(tf.square(y-y_hat))
loss = tf.add(error,l2_norm)
train_op = tf.train.GradientDescentOptimizer(learning_rate=0.01).minimize(loss)  # plain SGD
# ----- Training -----
epochs = 1
batch_size = 5000
# Launch the graph
init = tf.global_variables_initializer()
with tf.Session() as sess:
    sess.run(init)
    for epoch in tqdm(range(epochs), unit='epoch'):  # progress bar over epochs
        perm = np.random.permutation(x_train.shape[0])  # reshuffle samples each epoch
        # iterate over batches
        for bX, bY in batcher(x_train[perm], y_train[perm], batch_size):
            _,t = sess.run([train_op,loss], feed_dict={x: bX.reshape(-1, p), y: bY.reshape(-1, 1)})
            print(t)
    # Evaluate: per-batch MSE on the test set, then overall RMSE.
    errors = []
    for bX, bY in batcher(x_test, y_test):
        errors.append(sess.run(error, feed_dict={x: bX.reshape(-1, p), y: bY.reshape(-1, 1)}))
    print(errors)
    RMSE = np.sqrt(np.array(errors).mean())
    print (RMSE)
FM模型同时考虑了单个特征和组合特征的建模,并且将组合特征的权值系数用向量的内积(V)来表示,大大降低了参数空间,适用于稀疏特征的建模,且效果要优于LR模型的单特征建模。
NFM
FM很好地解决了高维度高稀疏输入下特征组合的问题。通过隐向量内积来建模权重,对在训练集中没有出现过或出现次数很少的特征组合,也能有效地学习。缺点是它毕竟还是属于线性模型,表达能力受限,且只能对二阶组合特征进行建模。
所以NFM的想法很简单,把二阶项直接变成能用DNN拟合的f(x)即可。即公式变为:
$$y=w_{0}+\sum_{i=1}^{n}w_{i}x_{i}+f(x)$$
f(x)是用来建模特征之间交互关系的多层前馈神经网络模块,这样在二阶特征组合的隐向量空间中,既引入了非线性变换来提升模型的非线性表达能力,又能学习到高阶的组合特征。
def _init_graph(self):
    """Build the NFM computation graph.

    Assembles the input placeholders, the Bi-Interaction pooling layer
    (FM second-order term), the deep MLP layers, the final prediction,
    the loss (square or log loss, optionally L2-regularized), the chosen
    optimizer and a fresh Session, then counts the trainable parameters.
    """
    self.graph = tf.Graph()
    with self.graph.as_default():  # , tf.device('/cpu:0'):
        # Fix the graph-level seed for reproducibility.
        tf.set_random_seed(self.random_seed)
        # Inputs: feature ids and labels. None is the batch dimension; the
        # second dim of train_features holds the one-hot feature indices.
        self.train_features = tf.placeholder(tf.int32, shape=[None, None])  # None * features_M
        self.train_labels = tf.placeholder(tf.float32, shape=[None, 1])  # None * 1
        self.dropout_keep = tf.placeholder(tf.float32, shape=[None])
        self.train_phase = tf.placeholder(tf.bool)
        # Model parameters (embeddings, MLP weights, biases).
        self.weights = self._initialize_weights()
        # ----- Bi-Interaction layer: the FM second-order term -----
        # _________ sum_square part _____________
        # (\sum_i x_i v_i)^2 : embed the non-zero features (the embedding
        # plays the role of v), sum over features, then square element-wise.
        nonzero_embeddings = tf.nn.embedding_lookup(self.weights['feature_embeddings'], self.train_features)
        self.summed_features_emb = tf.reduce_sum(nonzero_embeddings, 1)  # None * K
        self.summed_features_emb_square = tf.square(self.summed_features_emb)  # None * K
        # _________ square_sum part _____________
        # \sum_i (x_i v_i)^2 : square each embedding first, then sum.
        self.squared_features_emb = tf.square(nonzero_embeddings)
        self.squared_sum_features_emb = tf.reduce_sum(self.squared_features_emb, 1)  # None * K
        # ________ FM __________
        # 0.5 * (sum-square - square-sum) completes the second-order term.
        # Fix: `tf.sub` was removed in TF 1.0; use `tf.subtract` as the rest
        # of this file already does.
        self.FM = 0.5 * tf.subtract(self.summed_features_emb_square, self.squared_sum_features_emb)  # None * K
        # Optional batch norm: counteracts the shifting input distribution of
        # hidden layers as parameters update, which slows convergence.
        if self.batch_norm:
            self.FM = self.batch_norm_layer(self.FM, train_phase=self.train_phase, scope_bn='bn_fm')
        self.FM = tf.nn.dropout(self.FM, self.dropout_keep[-1])  # dropout on the FM part
        # ________ Deep Layers __________
        # NFM's addition: push the Bi-Interaction output through an MLP so the
        # model can capture higher-order feature interactions.
        for i in range(0, len(self.layers)):
            self.FM = tf.add(tf.matmul(self.FM, self.weights['layer_%d' % i]), self.weights['bias_%d' % i])  # None * layer[i] * 1
            if self.batch_norm:
                self.FM = self.batch_norm_layer(self.FM, train_phase=self.train_phase, scope_bn='bn_%d' % i)  # None * layer[i] * 1
            self.FM = self.activation_function(self.FM)  # non-linearity
            self.FM = tf.nn.dropout(self.FM, self.dropout_keep[i])  # per-layer dropout
        self.FM = tf.matmul(self.FM, self.weights['prediction'])  # None * 1, project to a scalar interaction score
        # _________ out _________
        # Final prediction = deep interaction term + linear term + global bias.
        Bilinear = tf.reduce_sum(self.FM, 1, keep_dims=True)  # None * 1
        self.Feature_bias = tf.reduce_sum(tf.nn.embedding_lookup(self.weights['feature_bias'], self.train_features), 1)  # None * 1
        Bias = self.weights['bias'] * tf.ones_like(self.train_labels)  # None * 1
        self.out = tf.add_n([Bilinear, self.Feature_bias, Bias])  # None * 1
        # Loss: square loss for regression, log loss for classification,
        # each optionally L2-regularizing the feature embeddings.
        if self.loss_type == 'square_loss':
            if self.lamda_bilinear > 0:
                self.loss = tf.nn.l2_loss(tf.subtract(self.train_labels, self.out)) + tf.contrib.layers.l2_regularizer(self.lamda_bilinear)(self.weights['feature_embeddings'])  # regulizer
            else:
                self.loss = tf.nn.l2_loss(tf.subtract(self.train_labels, self.out))
        elif self.loss_type == 'log_loss':
            self.out = tf.sigmoid(self.out)
            # Bug fix: this branch read `self.lambda_bilinear`, an attribute
            # that is spelled `lamda_bilinear` everywhere else (including the
            # regularizer two lines below), so it raised AttributeError.
            if self.lamda_bilinear > 0:
                self.loss = tf.contrib.losses.log_loss(self.out, self.train_labels, weight=1.0, epsilon=1e-07, scope=None) + tf.contrib.layers.l2_regularizer(self.lamda_bilinear)(self.weights['feature_embeddings'])  # regulizer
            else:
                self.loss = tf.contrib.losses.log_loss(self.out, self.train_labels, weight=1.0, epsilon=1e-07, scope=None)
        # Optimizer selection.
        if self.optimizer_type == 'AdamOptimizer':
            self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate, beta1=0.9, beta2=0.999, epsilon=1e-8).minimize(self.loss)
        elif self.optimizer_type == 'AdagradOptimizer':
            self.optimizer = tf.train.AdagradOptimizer(learning_rate=self.learning_rate, initial_accumulator_value=1e-8).minimize(self.loss)
        elif self.optimizer_type == 'GradientDescentOptimizer':
            self.optimizer = tf.train.GradientDescentOptimizer(learning_rate=self.learning_rate).minimize(self.loss)
        elif self.optimizer_type == 'MomentumOptimizer':
            self.optimizer = tf.train.MomentumOptimizer(learning_rate=self.learning_rate, momentum=0.95).minimize(self.loss)
        # Initialize variables and open the session.
        self.saver = tf.train.Saver()  # for saving/restoring model parameters
        init = tf.global_variables_initializer()  # runs the assign op of every global Variable
        self.sess = tf.Session()
        self.sess.run(init)
        # Count parameters, mainly to show NFM uses fewer parameters than
        # comparable parallel deep models.
        total_parameters = 0
        for variable in self.weights.values():
            shape = variable.get_shape()  # static shape of each variable
            variable_parameters = 1
            for dim in shape:
                variable_parameters *= dim.value
            total_parameters += variable_parameters
        if self.verbose > 0:
            # Fix: was a Python-2 print statement; the rest of this file uses
            # the print() function.
            print("#params: %d" % total_parameters)
完整的源码逐行阅读笔记在:https://github.com/nakaizura/Source-Code-Notebook/tree/master/NFM