一、DeepFM文章地址
论文
https://arxiv.org/pdf/1703.04247.pdf
二、代码
git地址
https://github.com/ChenglongChen/tensorflow-DeepFM
本文分为两部分:
阅读笔记1,数据处理,初始化
阅读笔记2,fit、训练、预测、评估
(代码中添加了注释)
三、阅读
3.1、前言
deepFM 将FM算法作为一个训练参数放到网络中,直接使用原始的特征输入,只需要告诉网络你的特征哪个是numerical,哪个是categories特征。
deepFM在Wide&Deep的基础上进行改进,是end-to-end的,无需特征工程
优点如下:
不需要预训练FM得到隐向量
不需要特征工程
同时学习低阶和高阶的组合特征
FM模块和Deep模块共享Feature Embedding部分,可以更快的训练,以及更精确的训练学习
3.2、代码中的数据
kaggle数据
(https://www.kaggle.com/c/porto-seguro-safe-driver-prediction)
(上面的地址不好下载数据,我这儿有o( ̄︶ ̄)o
要数据先点赞哦!
要数据请发邮件到691847973@qq.com,不定时回复)
数据的EDA分析
EDA分析地址(https://blog.csdn.net/qq_37195507/article/details/78553581)
3.3、代码阅读
3.3.1、加载数据并处理
main.py
_load_data()到相应的变量中,并将category的变量保存
dfTrain, dfTest, X_train, y_train, X_test, ids_test, cat_features_indices = _load_data()
def _load_data():
    """Load the train/test CSVs, add engineered columns, and split out arrays.

    Returns:
        dfTrain, dfTest: the raw DataFrames (with engineered columns added)
        X_train, y_train: training feature matrix and labels
        X_test, ids_test: test feature matrix and row ids
        cat_features_indices: positions (within the kept columns) of the
            categorical features
    """
    dfTrain = pd.read_csv(config.TRAIN_FILE)
    dfTest = pd.read_csv(config.TEST_FILE)

    def _add_engineered(df):
        # Per-row count of -1 sentinels (missing-value indicator) plus one
        # hand-crafted interaction feature.
        feature_cols = [c for c in df.columns if c not in ["id", "target"]]
        df["missing_feat"] = np.sum((df[feature_cols] == -1).values, axis=1)
        df["ps_car_13_x_ps_reg_03"] = df["ps_car_13"] * df["ps_reg_03"]
        return df

    dfTrain = _add_engineered(dfTrain)
    dfTest = _add_engineered(dfTest)

    # Keep every column except id/target and anything explicitly ignored.
    cols = [c for c in dfTrain.columns
            if c not in ["id", "target"] and c not in config.IGNORE_COLS]

    X_train = dfTrain[cols].values
    y_train = dfTrain["target"].values
    X_test = dfTest[cols].values
    ids_test = dfTest["id"].values
    cat_features_indices = [i for i, c in enumerate(cols)
                            if c in config.CATEGORICAL_COLS]

    return dfTrain, dfTest, X_train, y_train, X_test, ids_test, cat_features_indices
3.3.2、分层采样,交叉验证
main.py 中配置:NUM_SPLITS = 3,RANDOM_SEED = 2017
folds = list(StratifiedKFold(n_splits=config.NUM_SPLITS, shuffle=True,
random_state=config.RANDOM_SEED).split(X_train, y_train))
print("\nfolds: "+str(len(folds)))
3.3.3、参数
# params
dfm_params = {
"use_fm": True,
"use_deep": True,
"embedding_size": 8,
"dropout_fm": [1.0, 1.0],
"deep_layers": [32, 32],
"dropout_deep": [0.5, 0.5, 0.5],
"deep_layers_activation": tf.nn.relu,
"epoch": 30,
"batch_size": 1024,
"learning_rate": 0.001,
"optimizer_type": "adam",
"batch_norm": 1,
"batch_norm_decay": 0.995,
"l2_reg": 0.01,
"verbose": True,
"eval_metric": gini_norm,
"random_seed": config.RANDOM_SEED
}
3.3.4、模型代码
main.py _run_base_model_dfm()
1、创建一个特征处理的字典
#创建一个特征处理的字典
fd = FeatureDictionary(dfTrain=dfTrain, dfTest=dfTest,
numeric_cols=config.NUMERIC_COLS,
ignore_cols=config.IGNORE_COLS)
具体代码在 DataReader.py中
若特征为numerical,key为特征名,value为索引 (索引号+1,一个取值)
若特征为category, key 为特征名,value为字典[key为特征的取值,value为索引](索引号+特征的取值数, 多个取值)
class FeatureDictionary(object):
    """Maps each raw feature to the index range it occupies in the model input.

    A numeric feature takes a single index; a categorical feature takes one
    index per distinct value (one-hot style).  After construction:
      * feat_dict[col] -> int                  for a numeric column
      * feat_dict[col] -> {value: int, ...}    for a categorical column
      * feat_dim       -> total number of indices (the model's feature_size)
    """

    def __init__(self, trainfile=None, testfile=None,
                 dfTrain=None, dfTest=None, numeric_cols=[], ignore_cols=[]):
        # Exactly one source must be supplied for each of train and test:
        # either a CSV path or an already-loaded DataFrame.
        assert not ((trainfile is None) and (dfTrain is None)), "trainfile or dfTrain at least one is set"
        assert not ((trainfile is not None) and (dfTrain is not None)), "only one can be set"
        assert not ((testfile is None) and (dfTest is None)), "testfile or dfTest at least one is set"
        assert not ((testfile is not None) and (dfTest is not None)), "only one can be set"
        self.trainfile = trainfile
        self.testfile = testfile
        self.dfTrain = dfTrain
        self.dfTest = dfTest
        self.numeric_cols = numeric_cols
        self.ignore_cols = ignore_cols
        # Immediately compute the index layout: numeric features occupy one
        # slot each, categorical features occupy one slot per distinct value.
        self.gen_feat_dict()

    def gen_feat_dict(self):
        """Build feat_dict / feat_dim from the union of train and test data."""
        dfTrain = self.dfTrain if self.dfTrain is not None else pd.read_csv(self.trainfile)
        dfTest = self.dfTest if self.dfTest is not None else pd.read_csv(self.testfile)
        # Use both splits so every categorical value seen anywhere gets an index.
        df = pd.concat([dfTrain, dfTest])

        self.feat_dict = {}
        idx = 0  # next free index in the flattened feature space
        for col in df.columns:
            if col in self.ignore_cols:
                continue
            if col in self.numeric_cols:
                # Numeric feature: a single index.
                self.feat_dict[col] = idx
                idx += 1
            else:
                # Categorical feature: map every observed value to its own index.
                values = df[col].unique()
                self.feat_dict[col] = dict(zip(values, range(idx, len(values) + idx)))
                idx += len(values)
        self.feat_dim = idx
self.feat_dim 特征维度feat_size #259
self.field_size 特征个数field_size #39
2.解析数据
#Xi_train存放的是特征对应的索引 Xv_train 存放的是特征的具体的值
data_parser = DataParser(feat_dict=fd)
Xi_train, Xv_train, y_train = data_parser.parse(df=dfTrain, has_label=True)
Xi_test, Xv_test, ids_test = data_parser.parse(df=dfTest)
具体代码在 DataReader.py中
class DataParser(object):
    """Turns a raw DataFrame into the (Xi, Xv) index/value lists for DeepFM.

    For each sample, Xi holds the global feature indices (as assigned by the
    FeatureDictionary) and Xv holds the matching values: the raw value for a
    numeric feature, 1.0 for a categorical one (implicit one-hot encoding).
    """

    def __init__(self, feat_dict):
        self.feat_dict = feat_dict

    def parse(self, infile=None, df=None, has_label=False):
        """Parse a DataFrame (or CSV file) into Xi, Xv and y (or ids)."""
        assert not ((infile is None) and (df is None)), "infile or df at least one is set"
        assert not ((infile is not None) and (df is not None)), "only one can be set"

        # Work on a copy so the caller's DataFrame is left untouched
        # (pandas .copy() is a deep copy by default).
        dfi = pd.read_csv(infile) if df is None else df.copy()

        if has_label:
            y = dfi["target"].values.tolist()
            # inplace=True mutates dfi directly instead of returning a new frame.
            dfi.drop(["id", "target"], axis=1, inplace=True)
        else:
            ids = dfi["id"].values.tolist()
            dfi.drop(["id"], axis=1, inplace=True)

        # dfi will end up holding feature *indices*, dfv the feature *values*.
        dfv = dfi.copy()
        for col in dfi.columns:
            if col in self.feat_dict.ignore_cols:
                dfi.drop(col, axis=1, inplace=True)
                dfv.drop(col, axis=1, inplace=True)
                continue
            if col in self.feat_dict.numeric_cols:
                # Numeric feature: fixed global index, raw value stays in dfv.
                dfi[col] = self.feat_dict.feat_dict[col]
            else:
                # Categorical feature: look up the index of this value; the
                # value itself becomes 1.0 (one-hot).
                dfi[col] = dfi[col].map(self.feat_dict.feat_dict[col])
                dfv[col] = 1.

        # Each is a list of per-sample lists.
        Xi = dfi.values.tolist()
        Xv = dfv.values.tolist()
        return (Xi, Xv, y) if has_label else (Xi, Xv, ids)
#输入到网络中的特征的长度是 numerical 特征的个数 +categories 特征 one-hot 编码的长度。
#最终,Xi 和 Xv 是一个二维的 list,里面的每一个 list 是一行数据,Xi 存放的是特征所在的索引,Xv 存放的是具体的特征值。
3.准备batch
#准备batch
for i, (train_idx, valid_idx) in enumerate(folds):
#取交叉验证的数据,Xi_train_存放的是特征对应的索引 Xv_train_ 存放的是特征的具体的值 y_train_是label
Xi_train_, Xv_train_, y_train_ = _get(Xi_train, train_idx), _get(Xv_train, train_idx), _get(y_train, train_idx)
Xi_valid_, Xv_valid_, y_valid_ = _get(Xi_train, valid_idx), _get(Xv_train, valid_idx), _get(y_train, valid_idx)
4.训练
#训练的主函数
dfm = DeepFM(**dfm_params)
dfm.fit(Xi_train_, Xv_train_, y_train_, Xi_valid_, Xv_valid_, y_valid_)
#验证集上y_pred
y_train_meta[valid_idx,0] = dfm.predict(Xi_valid_, Xv_valid_)
#测试集上y_pred
y_test_meta[:,0] += dfm.predict(Xi_test, Xv_test)
#gini_norm评估函数 gini_norm(真实值, 预测值)
gini_results_cv[i] = gini_norm(y_valid_, y_train_meta[valid_idx])
gini_results_epoch_train[i] = dfm.train_result #训练集auc
gini_results_epoch_valid[i] = dfm.valid_result #测试集auc
下面的代码均在 DeepFM.py 中
dfm = DeepFM(**dfm_params)
(1) 初始化参数、图
最重要的是初始化图 self._init_graph()
def __init__(self, feature_size, field_size,
             embedding_size=8, dropout_fm=[1.0, 1.0],
             deep_layers=[32, 32], dropout_deep=[0.5, 0.5, 0.5],
             deep_layers_activation=tf.nn.relu,
             epoch=10, batch_size=256,
             learning_rate=0.001, optimizer_type="adam",
             batch_norm=0, batch_norm_decay=0.995,
             verbose=False, random_seed=2016,
             use_fm=True, use_deep=True,
             loss_type="logloss", eval_metric=roc_auc_score,
             l2_reg=0.0, greater_is_better=True):
    """Store the hyper-parameters and build the TensorFlow graph.

    Args:
        feature_size: M, size of the feature dictionary (e.g. 259).
        field_size: F, number of feature fields (e.g. 39).
        embedding_size: K, dimension of each latent (embedding) vector.
        dropout_fm: keep probabilities for the FM first/second-order parts.
        dropout_deep: keep probabilities for the deep input and each layer.
        deep_layers: units per hidden layer of the deep component.
        deep_layers_activation: activation used in the deep component.
        epoch / batch_size / learning_rate: training-schedule settings.
        optimizer_type: one of "adam", "adagrad", "gd", "momentum".
        batch_norm / batch_norm_decay: batch-normalisation switch and decay.
        verbose: whether to print progress during training.
        random_seed: seed for graph-level reproducibility.
        use_fm / use_deep: which components to enable (DeepFM / FM / DNN).
        loss_type: "logloss" (classification) or "mse" (regression).
        eval_metric: metric function (y_true, y_pred) used for evaluation.
        l2_reg: L2 regularisation strength applied to the dense weights.
        greater_is_better: whether a larger eval_metric value is better.
    """
    assert (use_fm or use_deep)
    assert loss_type in ["logloss", "mse"], \
        "loss_type can be either 'logloss' for classification task or 'mse' for regression task"

    # --- model structure ---
    self.feature_size = feature_size      # M: size of the feature dictionary
    self.field_size = field_size          # F: number of feature fields
    self.embedding_size = embedding_size  # K: latent vector dimension
    self.use_fm = use_fm
    self.use_deep = use_deep
    self.deep_layers = deep_layers
    self.deep_layers_activation = deep_layers_activation

    # --- regularisation ---
    self.dropout_fm = dropout_fm
    self.dropout_deep = dropout_deep
    self.l2_reg = l2_reg
    self.batch_norm = batch_norm
    self.batch_norm_decay = batch_norm_decay

    # --- training schedule ---
    self.epoch = epoch
    self.batch_size = batch_size
    self.learning_rate = learning_rate
    self.optimizer_type = optimizer_type
    self.random_seed = random_seed
    self.loss_type = loss_type

    # --- evaluation / bookkeeping ---
    self.eval_metric = eval_metric
    self.greater_is_better = greater_is_better
    self.verbose = verbose
    self.train_result, self.valid_result = [], []

    # Build the whole computation graph (placeholders, weights, FM and deep
    # components, loss and optimizer) — the most important step.
    self._init_graph()
初始化图具体如下:
① 定义了 6 个 placeholder ,每个大小的 None 代表的是 batch_size 的大小
#先定义了 6 个 placeholder ,每个大小的 None 代表的是 batch_size 的大小
#placeholder()函数用于定义过程,在执行的时候再赋具体的值
self.feat_index = tf.placeholder(tf.int32, shape=[None, None],
name="feat_index") # None * F = batch_size * field_size
self.feat_value = tf.placeholder(tf.float32, shape=[None, None],
name="feat_value") # None * F = batch_size * field_size
self.label = tf.placeholder(tf.float32, shape=[None, 1], name="label") # None * 1
self.dropout_keep_fm = tf.placeholder(tf.float32, shape=[None], name="dropout_keep_fm")
self.dropout_keep_deep = tf.placeholder(tf.float32, shape=[None], name="dropout_keep_deep")
self.train_phase = tf.placeholder(tf.bool, name="train_phase")
②权重的初始化
调用权重的初始化方法,将所有的权重放到一个字典中
self.weights = self._initialize_weights()
i. feature_embeddings、feature_bias初始化
feature_embeddings 本质上就是 FM 中的 latent vector (隐向量),对于每一个特征都建立一个隐特征向量;
feature_bias 代表了 FM 中的 w 的权重
ii. 搭建深度图,并初始化权重
输入到深度网络的大小为:特征域field_size个数 * 每个隐特征向量的长度embedding_size
根据每层的配置文件,产生相应的权重
iii. 合并 concat projection layer
最终合并映射层,对于输出层,根据不同的配置,生成不同的输出的大小。
DeepFM:
input_size = self.field_size + self.embedding_size + self.deep_layers[-1]
#特征域个数39 + 隐向量维度(默认8) + deep最后一层的神经元个数32(设置的)
FM: input_size = self.field_size + self.embedding_size
DNN: input_size = self.deep_layers[-1]
#初始化weights
def _initialize_weights(self):
weights = dict()
# embeddings
#random_normal从正态分布中输出随机值 random_normal(shape,mean=0.0,stddev=0.01)
#shape为feature_size行,embedding_size列(隐向量长度K)
#feature_embeddings 本质上就是 FM 中的 latent vector (隐向量)
#对于每一个特征都建立一个隐特征向量
weights["feature_embeddings"] = tf.Variable(
tf.random_normal([self.feature_size, self.embedding_size], 0.0, 0.01),
name="feature_embeddings") # feature_size * K
#feature_bias 代表了 FM 中的 w 的权重
weights["feature_bias"] = tf.Variable(
tf.random_uniform([self.feature_size, 1], 0.0, 1.0), name="feature_bias") # feature_size * 1
#搭建深度图
# deep layers
num_layer = len(self.deep_layers) #层数"deep_layers": [32, 32], 2层网络
#输入到深度网络的大小为:特征域个数 * 每个隐特征向量的长度
input_size = self.field_size * self.embedding_size
#todo ======================== 第一层的网络结构 =============================
glorot = np.sqrt(2.0 / (input_size + self.deep_layers[0]))
weights["layer_0"] = tf.Variable( #Variable变量
np.random.normal(loc=0, scale=glorot, size=(input_size, self.deep_layers[0])), dtype=np.float32)
weights["bias_0"] = tf.Variable(np.random.normal(loc=0, scale=glorot, size=(1, self.deep_layers[0])),dtype=np.float32) # 1 * layers[0]
#根据每层的配置文件,产生相应的权重
for i in range(1, num_layer):
glorot = np.sqrt(2.0 / (self.deep_layers[i-1] + self.deep_layers[i]))
weights["layer_%d" % i] = tf.Variable(
np.random.normal(loc=0, scale=glorot, size=(self.deep_layers[i-1], self.deep_layers[i])),
dtype=np.float32) # layers[i-1] * layers[i]
weights["bias_%d" % i] = tf.Variable(
np.random.normal(loc=0, scale=glorot, size=(1, self.deep_layers[i])),
dtype=np.float32) # 1 * layer[i]
# final concat projection layer 最终合并映射层
if self.use_fm and self.use_deep: #deepFM
input_size = self.field_size + self.embedding_size + self.deep_layers[-1]
elif self.use_fm:#fm
input_size = self.field_size + self.embedding_size
elif self.use_deep:#dnn
input_size = self.deep_layers[-1]
glorot = np.sqrt(2.0 / (input_size + 1))
weights["concat_projection"] = tf.Variable(
np.random.normal(loc=0, scale=glorot, size=(input_size, 1)),
dtype=np.float32) # layers[i-1]*layers[i]
weights["concat_bias"] = tf.Variable(tf.constant(0.01), dtype=np.float32)
#tf.constant创建常量的函数
return weights
③dense-embedding 计算
注意理解 self.embeddings,是后面共享的隐向量
(在计算二阶项 和 deep 用到,一阶项没有用到)
最终的self.embeddings为 vif * xi
功能:
根据每次输入的特征的索引,从隐特征向量中取出其对应的隐向量。
具体:
将每一个特征对应的具体的值,和自己对应的隐向量相乘。如果是 numerical 的,就直接用对应的 value 乘以隐向量。如果是 categories 的特征,其对应的特征值是 1,相乘完还是原来的隐向量。
最后,self.embeddings 存放的就是输入的样本的特征值和隐向量的乘积。大小为 batch_size field_size embedding_size
具体为 batch_size个,field_size行embedding_size列二维向量(field_size=特征的个数,embedding_size隐向量的维度)
##feature_embeddings 本质上就是 FM 中的 latent vector (隐向量)
#对于每一个特征都建立一个隐特征向量 shape为feature_size行,embedding_size列(隐向量长度K)
#feat_index = None * F = batch_size * field_size
#self.embeddings 大小为 batch_size * field_size * embedding_size
#解释为 :batch_size个,field_size行embedding_size列二维向量(field_size=特征的个数,embedding_size隐向量的维度)
self.embeddings = tf.nn.embedding_lookup(self.weights["feature_embeddings"],self.feat_index) # None * F * K
#feat_value是Xv_train
feat_value = tf.reshape(self.feat_value, shape=[-1, self.field_size, 1])
#self.embeddings 存放的就是输入的样本的特征值和隐向量的乘积 v*x
self.embeddings = tf.multiply(self.embeddings, feat_value)
#两个矩阵中对应元素各自相乘 这个就是就是每个值和自己的隐向量的乘积
④ FM 部分
i. 计算一阶项
从 self.weights[“feature_bias”] 取出对应的 w ,得到一阶项,大小为 batch_size*field_size。
# ---------- first order term ---------- 计算一阶项
#从 self.weights[“feature_bias”] 取出对应的 w ,得到一阶项,大小为 batch_size*field_size。
self.y_first_order = tf.nn.embedding_lookup(self.weights["feature_bias"], self.feat_index) # None * F * 1
#reduce_sum() 用于计算张量tensor沿着某一维度的和,可以在求和后降维。
self.y_first_order = tf.reduce_sum(tf.multiply(self.y_first_order, feat_value), 2) # None * F
#dropout是tensorflow中防止训练过拟合的一种措施
#需要传入一个参数keep_prob,训练过程中,每次迭代将只使用keep_prob*cell_size的神经元进行训练,其余神经元不进行迭代
#在不同的训练过程中随机扔掉一部分神经元。也就是让某个神经元的激活值以一定的概率p,让其停止工作,这次训练过程中不更新权值,也不参加神经网络的计算。但是它的权重得保留下来(只是暂时不更新而已),因为下次样本输入时它可能又得工作了
self.y_first_order = tf.nn.dropout(self.y_first_order, self.dropout_keep_fm[0]) # None * F
ii.计算二阶项
输入的是 self.embedding
利用化简后的式子,FM的化简如下:
# ---------- second order term ---------------二阶项的计算,利用化简后的式子
#self.embeddings 存放的就是输入的样本的特征值和隐向量的乘积 v*x
# sum_square part 和的平方
self.summed_features_emb = tf.reduce_sum(self.embeddings, 1) # None * K
self.summed_features_emb_square = tf.square(self.summed_features_emb) # None * K
# square_sum part 平方的和
self.squared_features_emb = tf.square(self.embeddings)
self.squared_sum_features_emb = tf.reduce_sum(self.squared_features_emb, 1) # None * K
# second order; subtract减法
self.y_second_order = 0.5 * tf.subtract(self.summed_features_emb_square, self.squared_sum_features_emb) # None * K
self.y_second_order = tf.nn.dropout(self.y_second_order, self.dropout_keep_fm[1]) # None * K
⑤ 计算deep项
输入也是 self.embedding ,权值共享指的就是这里。
将 self.embeddings(大小为 batch_size * self.field_size * self.embedding_size)reshape 成 batch_size * (self.field_size * self.embedding_size) 的大小,然后输入到网络里面进行计算
# ---------- Deep component ----------
#将 self.embeddings(大小为 batch_size * self.field_size * self.embedding_size)reshape 成 batch_size * (self.field_size * self.embedding_size) 的大小
self.y_deep = tf.reshape(self.embeddings, shape=[-1, self.field_size * self.embedding_size]) # None * (F*K)
self.y_deep = tf.nn.dropout(self.y_deep, self.dropout_keep_deep[0])
#然后输入到网络里面进行计算
for i in range(0, len(self.deep_layers)): #deep_layers=[32, 32]
self.y_deep = tf.add(tf.matmul(self.y_deep, self.weights["layer_%d" %i]), self.weights["bias_%d"%i]) # None * layer[i] * 1
#batchnorm是深度网络中经常用到的加速神经网络训练,加速收敛速度以及提供网络稳定性的算法
if self.batch_norm: #True/False
self.y_deep = self.batch_norm_layer(self.y_deep, train_phase=self.train_phase, scope_bn="bn_%d" %i) # None * layer[i] * 1
#tf.nn.relu激活函数
self.y_deep = self.deep_layers_activation(self.y_deep)
self.y_deep = tf.nn.dropout(self.y_deep, self.dropout_keep_deep[1+i]) # dropout at each Deep layer
⑦最后将所有项 concat 起来,投影到一个值
DeepFM:
投影大小为 filed_size+embedding_size+deep
FM:
投影的大小为 filed_size+embedding_size 的大小
DNN:
投影大小为 deep
利用最后的全连接层,将特征映射到一个 scalar 。
# ---------- DeepFM ----------
#最后将所有项 concat 起来,投影到一个值
if self.use_fm and self.use_deep:#投影大小为 filed_size+embedding_size+deep部分
concat_input = tf.concat([self.y_first_order, self.y_second_order, self.y_deep], axis=1)
elif self.use_fm:#投影的大小为 filed_size+embedding_size 的大小
concat_input = tf.concat([self.y_first_order, self.y_second_order], axis=1)
elif self.use_deep:
concat_input = self.y_deep
self.out = tf.add(tf.matmul(concat_input, self.weights["concat_projection"]), self.weights["concat_bias"])
⑧定义损失函数、optimizer优化器、初始化sess
定义损失函数
# loss定义损失函数
if self.loss_type == "logloss":#labels和predictions的log损失(即交叉熵)
self.out = tf.nn.sigmoid(self.out)
self.loss = tf.losses.log_loss(self.label, self.out)
elif self.loss_type == "mse": #均方误差
self.loss = tf.nn.l2_loss(tf.subtract(self.label, self.out))
# l2 regularization on weights正则化项目
if self.l2_reg > 0:
self.loss += tf.contrib.layers.l2_regularizer(
self.l2_reg)(self.weights["concat_projection"])
if self.use_deep:
for i in range(len(self.deep_layers)):
self.loss += tf.contrib.layers.l2_regularizer(
self.l2_reg)(self.weights["layer_%d"%i])
optimizer优化器
# optimizer优化器
#所谓的优化器,就是tensorflow中梯度下降的策略,用于更新神经网络中数以百万的参数。
if self.optimizer_type == "adam":
self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate, beta1=0.9, beta2=0.999,
epsilon=1e-8).minimize(self.loss)
elif self.optimizer_type == "adagrad":
self.optimizer = tf.train.AdagradOptimizer(learning_rate=self.learning_rate,
initial_accumulator_value=1e-8).minimize(self.loss)
elif self.optimizer_type == "gd":
self.optimizer = tf.train.GradientDescentOptimizer(learning_rate=self.learning_rate).minimize(self.loss)
elif self.optimizer_type == "momentum":
self.optimizer = tf.train.MomentumOptimizer(learning_rate=self.learning_rate, momentum=0.95).minimize(
self.loss)
#elif self.optimizer_type == "yellowfin":
# self.optimizer = YFOptimizer(learning_rate=self.learning_rate, momentum=0.0).minimize(
# self.loss)
init初始化sess
# init初始化sess
self.saver = tf.train.Saver()
init = tf.global_variables_initializer()
self.sess = self._init_session()
self.sess.run(init)
def _init_session(self):
    """Create the TF session, forcing CPU-only execution.

    Returns:
        A tf.Session configured so that no GPU devices are visible.
    """
    # NOTE: ConfigProto.device_count keys are case-sensitive ("CPU"/"GPU").
    # The original lowercase "gpu" key was silently ignored, so the GPU was
    # NOT actually disabled; "GPU": 0 really hides all GPU devices.
    config = tf.ConfigProto(device_count={"GPU": 0})
    # allow_growth avoids grabbing all GPU memory up front if a GPU is used.
    config.gpu_options.allow_growth = True
    return tf.Session(config=config)
后面的内容请看
deepFM代码TensorFlow版阅读笔记2
2 介绍 fit,训练,预测,以及评估,以及Normalization 的 Gini值计算
若有问题,请指正,谢谢