简介
2016年,微软提出Deep Crossing模型,旨在解决特征工程中特征组合的难题,降低人工特征组合的时间开销,通过模型自动学习特征的组合方式,也能达到不错的效果,且在各种任务中表现出较好的稳定性。与FNN、PNN不同的是,Deep Crossing并没有采用显式交叉特征的方式,而是利用残差网络结构挖掘特征间的关系。
相信当看到这个图片时,你已经翻阅了大量的相关资料:
Deep Crossing 主要分为5层或者说四层:
从下而上看:特征抽取层、Embedding Layer、Stacking Layer、Multiple Residual Units Layer、Scoring Layer
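损失函数(对数损失):$logloss = -\frac{1}{N}\sum_{i=1}^{N}\left(y_i\log(p_i) + (1-y_i)\log(1-p_i)\right)$,其中 $N$ 为样本数,$y_i$ 为第 $i$ 个样本的标签,$p_i$ 为模型对该样本的预测概率。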
一、特征抽取层:
对基础数据的特征处理是整个推荐系统的基础,也是最重要的一环,关键在于如何选取特征。
推荐系统常用 : 行为特征(阅读时长、关注、点赞、喜欢、评论等)、User 基本特征、item 基本特征 、统计特征(PV,UV,点击率等)
特征处理:类型特征OneHot、数字特征是否需要归一化,连续的特征是否需要cut 等
二、Embedding
https://zhuanlan.zhihu.com/p/83814532 embedding 介绍见这个blog 写的不错
Embedding的主要目的是将高维稀疏特征转化为低维稠密特征,其公式化定义为:
$$X_j^O = \max(\mathbf{0}, W_j X_j^I + b_j)$$
其中 $X_j^I$ 代表输入的第 j 个特征Field,并且已经过one-hot编码表示,$W_j$ 和 $b_j$ 分别表示对应的模型参数,DeepCrossing加上了偏置项 $b_j$。公式中的 max 操作等价于使用 relu 激活函数。尽管可以通过分Field的操作,减少Embedding层的参数量,但是由于某些高基数特征的存在,如paper中提到的CampaignID,其对应的 $W_j$ 仍然十分庞大。为此作者提出,针对这些高基数特征构造衍生特征,具体操作如下。根据CampaignID的历史点击率从高到低选择Top1000个,编号从0到999,将剩余的ID统一编号为1000。同时构建其衍生特征,将所有ID对应的历史点击率组合成1001维的稠密向量,各个元素分别为对应ID的历史CTR,最后一个元素为剩余ID的平均CTR。通过降维引入衍生特征的方式,可以有效的减少高基数特征带来的参数量剧增问题。
三、Stacking Layer
经过Embedding之后,直接对所有的 $X_j^O$ 进行拼接(Stacking):$X^O = [X_0^O, X_1^O, \cdots, X_K^O]$。作者将特征embedding为256维,但是对于本身维度低于256的特征Field,无需进行Embedding,直接送入Stacking层,如上图中的 Feature#2 所示。
四、Multiple Residual Units Layer (残差神经网络层)
上图描述的是残差单元,残差神经网络层是由残差单元组成
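① 输入向量 $X^I$ 经过两次以 ReLU 为激活函数的 Full Connected 操作,生成输出向量 $\mathcal{F}(X^I, \{W_0, W_1\}, \{b_0, b_1\})$
② 输出向量和原输入向量做元素叠加,生成新的输出向量 $X^O = \mathcal{F}(X^I, \{W_0, W_1\}, \{b_0, b_1\}) + X^I$,
也就是说,经过①部分处理得到的 $\mathcal{F}$ 其实是在拟合 $X^O - X^I$ 这一残差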
五、 Scoring Layer
是为了拟合优化目标存在的,通常使用逻辑回归解决CTR 中的二分类问题,如果是多分类就通过softmax 处理
python DeepCrossing
主要函数
class DeepCrossing(object):
    """Deep Crossing model for binary click prediction (TensorFlow 1.x graph API).

    The graph embeds every one-hot encoded feature field, concatenates the
    embeddings, passes them through a stack of residual units and finishes
    with a single sigmoid output.

    Args:
        vec_dim: embedding width used for every field.
        field_lens: sequence with the one-hot width of each field.
        lr: learning rate handed to the Adam optimizer.
        residual_unit_num: number of stacked residual units.
        residual_w_dim: hidden width inside each residual unit.
        dropout_rate: drop rate applied after every residual unit.
        lamda: L2 regularization strength.

    Raises:
        TypeError: if any argument is left as ``None``. Every argument is
            required; the ``None`` defaults only exist to keep the keyword
            style call signature.
    """

    def __init__(self, vec_dim=None, field_lens=None, lr=None, residual_unit_num=None, residual_w_dim=None, dropout_rate=None, lamda=None):
        required = {
            'vec_dim': vec_dim,
            'field_lens': field_lens,
            'lr': lr,
            'residual_unit_num': residual_unit_num,
            'residual_w_dim': residual_w_dim,
            'dropout_rate': dropout_rate,
            'lamda': lamda,
        }
        missing = [name for name, value in required.items() if value is None]
        if missing:
            # Fail early with a readable message instead of an opaque
            # len(None) / float(None) error further down.
            raise TypeError('DeepCrossing requires values for: %s' % ', '.join(missing))

        self.vec_dim = vec_dim
        self.field_lens = field_lens
        self.field_num = len(field_lens)
        self.lr = lr
        self.residual_unit_num = residual_unit_num
        self.residual_w_dim = residual_w_dim
        self.dropout_rate = dropout_rate
        self.lamda = float(lamda)

        self.l2_reg = tf.contrib.layers.l2_regularizer(self.lamda)

        self._build_graph()

    def _build_graph(self):
        """Create the placeholders, then the model, loss and train op."""
        self.add_input()
        self.inference()

    def add_input(self):
        """Create one input placeholder per field plus label and mode flags."""
        self.x = [tf.placeholder(tf.float32, name='input_x_%d' % i) for i in range(self.field_num)]
        self.y = tf.placeholder(tf.float32, shape=[None], name='input_y')
        self.is_train = tf.placeholder(tf.bool)

    def _residual_unit(self, input, i):
        """Apply residual unit number ``i``: relu(input + FC(relu(FC(input)))).

        ``input`` shadows the builtin of the same name; the name is kept
        because it is part of the method signature.
        """
        stacked_dim = self.field_num * self.vec_dim
        hidden_dim = self.residual_w_dim

        w0 = tf.get_variable(name='residual_w0_%d' % i, shape=[stacked_dim, hidden_dim], dtype=tf.float32, regularizer=self.l2_reg)
        b0 = tf.get_variable(name='residual_b0_%d' % i, shape=[hidden_dim], dtype=tf.float32)
        hidden = tf.nn.relu(tf.matmul(input, w0) + b0)

        # The second projection maps back to the input width so that the
        # skip connection can be added element-wise.
        w1 = tf.get_variable(name='residual_w1_%d' % i, shape=[hidden_dim, stacked_dim], dtype=tf.float32, regularizer=self.l2_reg)
        b1 = tf.get_variable(name='residual_b1_%d' % i, shape=[stacked_dim], dtype=tf.float32)
        residual = tf.matmul(hidden, w1) + b1

        return tf.nn.relu(residual + input)

    def inference(self):
        """Build embeddings, residual stack, output layer, loss and train op."""
        with tf.variable_scope('emb_part'):
            emb = [tf.get_variable(name='emb_%d' % i, shape=[self.field_lens[i], self.vec_dim], dtype=tf.float32, regularizer=self.l2_reg) for i in range(self.field_num)]
            emb_layer = tf.concat([tf.matmul(self.x[i], emb[i]) for i in range(self.field_num)], axis=1)  # (batch, F*K)

        x = emb_layer
        with tf.variable_scope('residual_part'):
            for i in range(self.residual_unit_num):
                x = self._residual_unit(x, i)
                x = tf.layers.dropout(x, rate=self.dropout_rate, training=self.is_train)
            w = tf.get_variable(name='w', shape=[self.field_num * self.vec_dim, 1], dtype=tf.float32, regularizer=self.l2_reg)
            b = tf.get_variable(name='b', shape=[1], dtype=tf.float32)

            self.y_logits = tf.matmul(x, w) + b  # (batch, 1)

        self.y_hat = tf.nn.sigmoid(self.y_logits)
        self.pred_label = tf.cast(self.y_hat > 0.5, tf.int32)

        # self.y is (batch,) while self.y_hat is (batch, 1). Multiplying them
        # directly broadcasts to (batch, batch) and pairs every label with
        # every prediction, so align the label shape first.
        labels = tf.reshape(self.y, [-1, 1])
        self.loss = -tf.reduce_mean(labels * tf.log(self.y_hat + 1e-8) + (1 - labels) * tf.log(1 - self.y_hat + 1e-8))

        reg_variables = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
        if len(reg_variables) > 0:
            self.loss += tf.add_n(reg_variables)

        self.train_op = tf.train.AdamOptimizer(self.lr).minimize(self.loss)
python 代码实现 - Deep & Cross Network
https://download.csdn.net/download/zhuhongming123/12497979
Model 部分
import numpy as np
import tensorflow as tf
from time import time
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import roc_auc_score
class DCN(BaseEstimator, TransformerMixin):
    """Deep & Cross Network built on the TensorFlow 1.x graph API.

    Categorical fields are embedded, concatenated with the numeric features
    into ``x0`` and fed to two towers (an MLP and a cross network) whose
    outputs are concatenated and projected to a single output.

    Args:
        cate_feature_size: number of rows of the shared embedding table.
        field_size: number of categorical fields per sample.
        numeric_feature_size: number of numeric features per sample.
        embedding_size: embedding width per categorical field.
        deep_layers: hidden sizes of the MLP tower; defaults to ``[32, 32]``.
        dropout_deep: keep probabilities, one for the MLP input and one per
            hidden layer; defaults to ``[0.5, 0.5, 0.5]``.
        deep_layers_activation: activation applied after each hidden layer.
        epoch: number of passes over the training data in ``fit``.
        batch_size: mini-batch size.
        learning_rate: optimizer learning rate.
        optimizer_type: one of "adam", "adagrad", "gd", "momentum".
        batch_norm, batch_norm_decay: stored only; the graph built here
            does not reference them.
        verbose: print the parameter count when truthy.
        random_seed: seed passed to ``tf.set_random_seed``.
        loss_type: "logloss" or "mse".
        eval_metric, greater_is_better: stored only; not used by this class.
        l2_reg: L2 strength for projection, deep and cross weights.
        cross_layer_num: number of cross layers.
    """

    def __init__(self, cate_feature_size, field_size, numeric_feature_size,
                 embedding_size=8,
                 deep_layers=None,
                 dropout_deep=None,
                 deep_layers_activation=tf.nn.relu,
                 epoch=10,
                 batch_size=256,
                 learning_rate=0.001,
                 optimizer_type="adam",
                 batch_norm=0,
                 batch_norm_decay=0.995,
                 verbose=False,
                 random_seed=2016,
                 loss_type="logloss",
                 eval_metric=roc_auc_score,
                 l2_reg=0.0,
                 greater_is_better=True,
                 cross_layer_num=3):
        assert loss_type in ["logloss", "mse"], \
            "loss_type can be either 'logloss' for classification task or 'mse' for regression task"

        # None sentinels replace the former mutable list defaults. A value
        # supplied by the caller is stored as-is (no copy).
        if deep_layers is None:
            deep_layers = [32, 32]
        if dropout_deep is None:
            dropout_deep = [0.5, 0.5, 0.5]

        self.cate_feature_size = cate_feature_size
        self.numeric_feature_size = numeric_feature_size
        self.field_size = field_size
        self.embedding_size = embedding_size
        self.total_size = self.field_size * self.embedding_size + self.numeric_feature_size
        self.deep_layers = deep_layers
        self.cross_layer_num = cross_layer_num
        # BaseEstimator.get_params looks attributes up by constructor
        # parameter name, so expose `dropout_deep`; the historical misspelt
        # `dropout_dep` stays as an alias.
        self.dropout_deep = dropout_deep
        self.dropout_dep = dropout_deep
        self.deep_layers_activation = deep_layers_activation
        self.l2_reg = l2_reg

        self.epoch = epoch
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.optimizer_type = optimizer_type

        self.batch_norm = batch_norm
        self.batch_norm_decay = batch_norm_decay

        self.verbose = verbose
        self.random_seed = random_seed
        self.loss_type = loss_type
        self.eval_metric = eval_metric
        self.greater_is_better = greater_is_better
        self.train_result, self.valid_result = [], []

        self._init_graph()

    def _init_graph(self):
        """Build the graph, loss and optimizer, then start a session.

        Raises:
            ValueError: if ``optimizer_type`` is not a supported name.
        """
        self.graph = tf.Graph()
        with self.graph.as_default():
            tf.set_random_seed(self.random_seed)

            self.feat_index = tf.placeholder(tf.int32,
                                             shape=[None, None],
                                             name='feat_index')
            self.feat_value = tf.placeholder(tf.float32,
                                             shape=[None, None],
                                             name='feat_value')
            self.numeric_value = tf.placeholder(tf.float32, [None, None], name='num_value')
            self.label = tf.placeholder(tf.float32, shape=[None, 1], name='label')
            self.dropout_keep_deep = tf.placeholder(tf.float32, shape=[None], name='dropout_deep_deep')
            self.train_phase = tf.placeholder(tf.bool, name='train_phase')

            self.weights = self._initialize_weights()

            # model
            self.embeddings = tf.nn.embedding_lookup(self.weights['feature_embeddings'], self.feat_index)  # N * F * K
            feat_value = tf.reshape(self.feat_value, shape=[-1, self.field_size, 1])
            # scale every embedding by its feature value
            self.embeddings = tf.multiply(self.embeddings, feat_value)

            # build the x0 vector: numeric features followed by flattened embeddings
            self.x0 = tf.concat([self.numeric_value,
                                 tf.reshape(self.embeddings, shape=[-1, self.field_size * self.embedding_size])],
                                axis=1)

            # deep part
            self.y_deep = tf.nn.dropout(self.x0, self.dropout_keep_deep[0])
            for i in range(len(self.deep_layers)):
                self.y_deep = tf.add(tf.matmul(self.y_deep, self.weights["deep_layer_%d" % i]),
                                     self.weights["deep_bias_%d" % i])
                self.y_deep = self.deep_layers_activation(self.y_deep)
                self.y_deep = tf.nn.dropout(self.y_deep, self.dropout_keep_deep[i + 1])

            # cross part: x_{l+1} = x0 * (x_l^T w_l) + b_l + x_l
            self._x0 = tf.reshape(self.x0, (-1, self.total_size, 1))
            x_l = self._x0
            for layer_idx in range(self.cross_layer_num):
                # Contract x_l with w first. (x0 x_l^T) w equals x0 (x_l^T w),
                # and this order avoids building a (total_size, total_size)
                # outer product for every sample.
                xl_w = tf.tensordot(x_l, self.weights["cross_layer_%d" % layer_idx], axes=[[1], [0]])  # N * 1 * 1
                x_l = self._x0 * xl_w + self.weights["cross_bias_%d" % layer_idx] + x_l

            self.cross_network_out = tf.reshape(x_l, (-1, self.total_size))

            # concat part
            concat_input = tf.concat([self.cross_network_out, self.y_deep], axis=1)
            self.out = tf.add(tf.matmul(concat_input, self.weights['concat_projection']), self.weights['concat_bias'])

            # loss
            if self.loss_type == "logloss":
                self.out = tf.nn.sigmoid(self.out)
                self.loss = tf.losses.log_loss(self.label, self.out)
            elif self.loss_type == "mse":
                self.loss = tf.nn.l2_loss(tf.subtract(self.label, self.out))

            # l2 regularization on weights
            if self.l2_reg > 0:
                regularizer = tf.contrib.layers.l2_regularizer(self.l2_reg)
                self.loss += regularizer(self.weights["concat_projection"])
                for i in range(len(self.deep_layers)):
                    self.loss += regularizer(self.weights["deep_layer_%d" % i])
                for i in range(self.cross_layer_num):
                    self.loss += regularizer(self.weights["cross_layer_%d" % i])

            if self.optimizer_type == "adam":
                self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate, beta1=0.9, beta2=0.999,
                                                        epsilon=1e-8).minimize(self.loss)
            elif self.optimizer_type == "adagrad":
                self.optimizer = tf.train.AdagradOptimizer(learning_rate=self.learning_rate,
                                                           initial_accumulator_value=1e-8).minimize(self.loss)
            elif self.optimizer_type == "gd":
                self.optimizer = tf.train.GradientDescentOptimizer(learning_rate=self.learning_rate).minimize(self.loss)
            elif self.optimizer_type == "momentum":
                self.optimizer = tf.train.MomentumOptimizer(learning_rate=self.learning_rate, momentum=0.95).minimize(
                    self.loss)
            else:
                # Without this branch self.optimizer would stay undefined and
                # the first training step would fail with AttributeError.
                raise ValueError("unsupported optimizer_type: %r" % (self.optimizer_type,))

            # init
            self.saver = tf.train.Saver()
            init = tf.global_variables_initializer()
            self.sess = tf.Session()
            self.sess.run(init)

            # number of params
            total_parameters = 0
            for variable in self.weights.values():
                variable_parameters = 1
                for dim in variable.get_shape():
                    variable_parameters *= dim.value
                total_parameters += variable_parameters
            if self.verbose > 0:
                print("#params: %d" % total_parameters)

    def _initialize_weights(self):
        """Create all trainable variables and return them keyed by name."""
        weights = dict()

        # embeddings
        weights['feature_embeddings'] = tf.Variable(
            tf.random_normal([self.cate_feature_size, self.embedding_size], 0.0, 0.01),
            name='feature_embeddings')
        # NOTE: feature_bias is created but not referenced by _init_graph.
        weights['feature_bias'] = tf.Variable(tf.random_normal([self.cate_feature_size, 1], 0.0, 1.0),
                                              name='feature_bias')

        # deep layers: layer i maps layer_sizes[i] -> layer_sizes[i + 1]
        layer_sizes = [self.total_size] + list(self.deep_layers)
        for i in range(len(self.deep_layers)):
            fan_in, fan_out = layer_sizes[i], layer_sizes[i + 1]
            glorot = np.sqrt(2.0 / (fan_in + fan_out))
            weights["deep_layer_%d" % i] = tf.Variable(
                np.random.normal(loc=0, scale=glorot, size=(fan_in, fan_out)),
                dtype=np.float32)  # layers[i-1] * layers[i]
            weights["deep_bias_%d" % i] = tf.Variable(
                np.random.normal(loc=0, scale=glorot, size=(1, fan_out)),
                dtype=np.float32)  # 1 * layers[i]

        # cross layers
        # NOTE(review): the scale is the `glorot` value left over from the
        # last deep layer, not one derived from total_size; kept unchanged
        # so initialization matches the original behaviour.
        for i in range(self.cross_layer_num):
            weights["cross_layer_%d" % i] = tf.Variable(
                np.random.normal(loc=0, scale=glorot, size=(self.total_size, 1)),
                dtype=np.float32)  # total_size * 1
            weights["cross_bias_%d" % i] = tf.Variable(
                np.random.normal(loc=0, scale=glorot, size=(self.total_size, 1)),
                dtype=np.float32)  # total_size * 1

        # final concat projection layer
        input_size = self.total_size + self.deep_layers[-1]
        glorot = np.sqrt(2.0 / (input_size + 1))
        weights['concat_projection'] = tf.Variable(np.random.normal(loc=0, scale=glorot, size=(input_size, 1)),
                                                   dtype=np.float32)
        weights['concat_bias'] = tf.Variable(tf.constant(0.01), dtype=np.float32)

        return weights

    def get_batch(self, Xi, Xv, Xv2, y, batch_size, index):
        """Return slice number ``index`` of every input; labels become [[y], ...].

        The end of the slice is clamped to ``len(y)``, so the last batch may
        be shorter than ``batch_size``.
        """
        start = index * batch_size
        end = min(start + batch_size, len(y))
        return Xi[start:end], Xv[start:end], Xv2[start:end], [[y_] for y_ in y[start:end]]

    # shuffle four sequences simultaneously
    def shuffle_in_unison_scary(self, a, b, c, d):
        """Shuffle the four sequences in place with the same permutation.

        The NumPy global RNG state is rewound before every shuffle so that
        equal-length sequences receive identical permutations.
        """
        rng_state = np.random.get_state()
        for sequence in (a, b, c, d):
            np.random.set_state(rng_state)
            np.random.shuffle(sequence)

    def predict(self, Xi, Xv, Xv2, y):
        """Evaluate the loss on a labelled dataset with dropout disabled.

        Despite its name this method does not return predictions.

        :param Xi: list of list of feature indices of each sample in the dataset
        :param Xv: list of list of feature values of each sample in the dataset
        :param Xv2: list of list of numeric feature values of each sample
        :param y: labels shaped (n_samples, 1)
        :return: single-element list holding the loss value
        """
        feed_dict = {self.feat_index: Xi,
                     self.feat_value: Xv,
                     self.numeric_value: Xv2,
                     self.label: y,
                     self.dropout_keep_deep: [1.0] * len(self.dropout_dep),
                     # evaluation pass, not a training step
                     self.train_phase: False}
        loss = self.sess.run([self.loss], feed_dict=feed_dict)
        return loss

    def fit_on_batch(self, Xi, Xv, Xv2, y):
        """Run one optimizer step on a mini-batch and return its loss."""
        feed_dict = {self.feat_index: Xi,
                     self.feat_value: Xv,
                     self.numeric_value: Xv2,
                     self.label: y,
                     self.dropout_keep_deep: self.dropout_dep,
                     self.train_phase: True}
        loss, _ = self.sess.run([self.loss, self.optimizer], feed_dict=feed_dict)
        return loss

    def fit(self, cate_Xi_train, cate_Xv_train, numeric_Xv_train, y_train,
            cate_Xi_valid=None, cate_Xv_valid=None, numeric_Xv_valid=None, y_valid=None,
            early_stopping=False, refit=False):
        """Train for ``self.epoch`` epochs, printing the validation loss per epoch.

        The training sequences are shuffled in place at the start of every
        epoch.

        :param cate_Xi_train: [[ind1_1, ind1_2, ...], ...] where indi_j is the
            feature index of categorical field j of sample i
        :param cate_Xv_train: [[val1_1, val1_2, ...], ...] where vali_j is the
            feature value of categorical field j of sample i
        :param numeric_Xv_train: list of list of numeric feature values
        :param y_train: label of each sample in the training set
        :param cate_Xi_valid: feature indices of the validation set
        :param cate_Xv_valid: feature values of the validation set
        :param numeric_Xv_valid: numeric feature values of the validation set
        :param y_valid: label of each sample in the validation set
        :param early_stopping: accepted for API compatibility; currently unused
        :param refit: accepted for API compatibility; currently unused
        :return: None
        """
        print(len(cate_Xi_train))
        print(len(cate_Xv_train))
        print(len(numeric_Xv_train))
        print(len(y_train))
        has_valid = cate_Xv_valid is not None
        if has_valid:
            # reshape once instead of on every epoch
            y_valid = np.array(y_valid).reshape((-1, 1))

        # Ceil division: get_batch clamps the slice end, so the trailing
        # partial batch is trained too, and datasets smaller than one
        # batch still get an update.
        total_batch = (len(y_train) + self.batch_size - 1) // self.batch_size

        for epoch in range(self.epoch):
            self.shuffle_in_unison_scary(cate_Xi_train, cate_Xv_train, numeric_Xv_train, y_train)
            for i in range(total_batch):
                cate_Xi_batch, cate_Xv_batch, numeric_Xv_batch, y_batch = self.get_batch(
                    cate_Xi_train, cate_Xv_train, numeric_Xv_train, y_train, self.batch_size, i)

                self.fit_on_batch(cate_Xi_batch, cate_Xv_batch, numeric_Xv_batch, y_batch)

            if has_valid:
                loss = self.predict(cate_Xi_valid, cate_Xv_valid, numeric_Xv_valid, y_valid)
                print("epoch", epoch, "loss", loss)
数据处理部分
import numpy as np
import pandas as pd
class FeatureDictionary(object):
    """Assign a global integer index to every categorical feature value.

    Despite their names, ``trainfile`` and ``testfile`` are DataFrames; they
    are concatenated so that values seen in either frame get an index.

    Args:
        trainfile: training DataFrame.
        testfile: test DataFrame.
        numeric_cols: columns treated as numeric (not indexed).
        ignore_cols: columns skipped entirely.
        cate_cols: stored for reference; not consulted when building the
            dictionary.

    Attributes:
        feat_dict: ``{column: {value: index}}`` for every indexed column.
        feat_dim: total number of indexed values across all columns.
        feat_len: empty dict, kept for compatibility.
    """

    def __init__(self, trainfile=None, testfile=None,
                 numeric_cols=None,
                 ignore_cols=None,
                 cate_cols=None):
        self.trainfile = trainfile
        self.testfile = testfile
        # Fresh lists per instance: shared mutable defaults would leak edits
        # made on one instance into every later instance.
        self.cate_cols = [] if cate_cols is None else cate_cols
        self.numeric_cols = [] if numeric_cols is None else numeric_cols
        self.ignore_cols = [] if ignore_cols is None else ignore_cols
        self.gen_feat_dict()

    def gen_feat_dict(self):
        """Build ``feat_dict`` and ``feat_dim`` from the combined frames."""
        df = pd.concat([self.trainfile, self.testfile])
        self.feat_dict = {}
        self.feat_len = {}
        next_index = 0
        for col in df.columns:
            if col in self.ignore_cols or col in self.numeric_cols:
                continue
            values = df[col].unique()
            # indices are contiguous across columns: each column continues
            # where the previous one stopped
            self.feat_dict[col] = dict(zip(values, range(next_index, next_index + len(values))))
            next_index += len(values)
        self.feat_dim = next_index
class DataParser(object):
    """Convert a raw DataFrame into the index/value lists the model consumes.

    Args:
        feat_dict: object exposing ``numeric_cols``, ``ignore_cols`` and a
            ``feat_dict`` mapping ``{column: {value: index}}`` (as built by
            ``FeatureDictionary``).
    """

    def __init__(self, feat_dict):
        self.feat_dict = feat_dict

    def parse(self, infile=None, df=None, has_label=False):
        """Split a dataset into categorical indices, categorical values and numerics.

        Exactly one of ``infile`` (CSV path) or ``df`` must be given. A
        DataFrame passed as ``df`` is copied and left untouched.

        Args:
            infile: path of a CSV file to read.
            df: DataFrame to parse.
            has_label: when true the "target" column is returned as labels,
                otherwise the "id" column is returned.

        Returns:
            ``(cate_Xi, cate_Xv, numeric_Xv, y)`` when ``has_label`` is true,
            else ``(cate_Xi, cate_Xv, numeric_Xv, ids)``. ``cate_Xi`` holds
            the mapped feature indices, ``cate_Xv`` is 1.0 for every
            categorical entry and ``numeric_Xv`` holds the numeric columns.
        """
        assert not ((infile is None) and (df is None)), "infile or df at least one is set"
        assert not ((infile is not None) and (df is not None)), "only one can be set"
        dfi = df.copy() if infile is None else pd.read_csv(infile)

        if has_label:
            labels_or_ids = dfi["target"].values.tolist()
            dfi.drop(["id", "target"], axis=1, inplace=True)
        else:
            labels_or_ids = dfi["id"].values.tolist()
            dfi.drop(["id"], axis=1, inplace=True)

        numeric_Xv = dfi[self.feat_dict.numeric_cols].values.tolist()
        dfi.drop(self.feat_dict.numeric_cols, axis=1, inplace=True)

        # Work out the ignored columns up front and drop them in one call
        # rather than dropping columns from the frame while looping over it.
        ignored = [col for col in dfi.columns if col in self.feat_dict.ignore_cols]
        dfi.drop(ignored, axis=1, inplace=True)

        # dfi for feature index
        # dfv for feature value which can be either binary (1/0) or float (e.g., 10.24)
        dfv = dfi.copy()
        for col in dfi.columns:
            # NOTE(review): values missing from the mapping become NaN here;
            # the dictionary is presumably built from the same data.
            dfi[col] = dfi[col].map(self.feat_dict.feat_dict[col])
            dfv[col] = 1.

        # list of list of feature indices of each sample in the dataset
        cate_Xi = dfi.values.tolist()
        # list of list of feature values of each sample in the dataset
        cate_Xv = dfv.values.tolist()
        return cate_Xi, cate_Xv, numeric_Xv, labels_or_ids
main 执行函数
import tensorflow as tf
import pandas as pd
import numpy as np
import recommend.deep_cross_network.config as config
from sklearn.model_selection import StratifiedKFold
from recommend.deep_cross_network.DataLoader import FeatureDictionary, DataParser
from recommend.deep_cross_network.DCN import DCN
def load_data():
    """Load the train and test CSV files named in ``config`` and derive features.

    Two columns are added to both frames: ``missing_feat`` (how many feature
    values of a row equal -1) and ``ps_car_13_x_ps_reg_03`` (the product of
    the two named columns).

    :return: ``(dfTrain, dfTest, X_train, y_train, X_test, ids_test)`` where
        the ``X_*`` arrays hold every column except "id", "target" and those
        listed in ``config.IGNORE_COLS``.
    """
    dfTrain = pd.read_csv(config.TRAIN_FILE)
    dfTest = pd.read_csv(config.TEST_FILE)

    def preprocess(df):
        # add the derived columns in place and hand the same frame back
        feature_cols = [c for c in df.columns if c not in ["id", "target"]]
        df["missing_feat"] = np.sum((df[feature_cols] == -1).values, axis=1)
        df["ps_car_13_x_ps_reg_03"] = df["ps_car_13"] * df["ps_reg_03"]
        return df

    dfTrain = preprocess(dfTrain)
    dfTest = preprocess(dfTest)

    cols = [c for c in dfTrain.columns if c not in ["id", "target"]]
    cols = [c for c in cols if c not in config.IGNORE_COLS]

    X_train = dfTrain[cols].values
    y_train = dfTrain["target"].values
    X_test = dfTest[cols].values
    ids_test = dfTest["id"].values
    # compared with the DeepFM version, no feature dictionary is built here
    return dfTrain, dfTest, X_train, y_train, X_test, ids_test
def run_base_model_dcn(dfTrain, dfTest, folds, dcn_params):
    """Train one DCN model per cross-validation fold.

    Args:
        dfTrain: training DataFrame (with "id" and "target" columns).
        dfTest: test DataFrame (with an "id" column).
        folds: iterable of ``(train_idx, valid_idx)`` index pairs.
        dcn_params: keyword arguments for ``DCN``. The dict is not modified;
            the size parameters derived from the data are added to a copy.
    """
    fd = FeatureDictionary(dfTrain, dfTest, numeric_cols=config.NUMERIC_COLS,
                           ignore_cols=config.IGNORE_COLS,
                           cate_cols=config.CATEGORICAL_COLS)

    print(fd.feat_dim)
    print(fd.feat_dict)

    data_parser = DataParser(feat_dict=fd)
    cate_Xi_train, cate_Xv_train, numeric_Xv_train, y_train = data_parser.parse(df=dfTrain, has_label=True)
    # The test split is parsed as in the original flow, but nothing below
    # consumes it yet.
    cate_Xi_test, cate_Xv_test, numeric_Xv_test, ids_test = data_parser.parse(df=dfTest)

    # Work on a copy so the caller's dict is left as it was passed in.
    params = dict(dcn_params)
    params["cate_feature_size"] = fd.feat_dim
    params["field_size"] = len(cate_Xi_train[0])
    params['numeric_feature_size'] = len(config.NUMERIC_COLS)

    def _get(rows, indices):
        # pick the rows belonging to one side of a fold
        return [rows[i] for i in indices]

    for train_idx, valid_idx in folds:
        cate_Xi_train_ = _get(cate_Xi_train, train_idx)
        cate_Xv_train_ = _get(cate_Xv_train, train_idx)
        numeric_Xv_train_ = _get(numeric_Xv_train, train_idx)
        y_train_ = _get(y_train, train_idx)

        cate_Xi_valid_ = _get(cate_Xi_train, valid_idx)
        cate_Xv_valid_ = _get(cate_Xv_train, valid_idx)
        numeric_Xv_valid_ = _get(numeric_Xv_train, valid_idx)
        y_valid_ = _get(y_train, valid_idx)

        dcn = DCN(**params)
        try:
            dcn.fit(cate_Xi_train_, cate_Xv_train_, numeric_Xv_train_, y_train_, cate_Xi_valid_, cate_Xv_valid_,
                    numeric_Xv_valid_, y_valid_)
        finally:
            # every DCN opens its own tf.Session; release it before the
            # next fold builds a new graph and session
            dcn.sess.close()
# Script entry: load the data, build stratified folds and train one DCN per fold.
# WARNING: if re-enabled, the two commented-out lines below would overwrite the
# configured CSV files with truncated copies (first 10000 / 2000 rows).
# dfTrain = pd.read_csv(config.TRAIN_FILE,nrows=10000,index_col=None).to_csv(config.TRAIN_FILE,index=False)
# dfTest = pd.read_csv(config.TEST_FILE,nrows=2000,index_col=None).to_csv(config.TEST_FILE,index=False)
dfTrain, dfTest, X_train, y_train, X_test, ids_test = load_data()
print('load_data_over')
# folds: StratifiedKFold performs stratified cross-validation splitting, so each
# split keeps the same class proportions as the original dataset
folds = list(StratifiedKFold(n_splits=config.NUM_SPLITS, shuffle=True,
random_state=config.RANDOM_SEED).split(X_train, y_train))
print('process_data_over')
# Hyper-parameters passed to DCN as keyword arguments; the data-dependent size
# arguments are filled in by run_base_model_dcn.
dcn_params = {
"embedding_size": 8,
"deep_layers": [32, 32],
"dropout_deep": [0.5, 0.5, 0.5],
"deep_layers_activation": tf.nn.relu,
"epoch": 30,
"batch_size": 1024,
"learning_rate": 0.001,
"optimizer_type": "adam",
"batch_norm": 1,
"batch_norm_decay": 0.995,
"l2_reg": 0.01,
"verbose": True,
"random_seed": config.RANDOM_SEED,
"cross_layer_num": 3
}
print('start train')
run_base_model_dcn(dfTrain, dfTest, folds, dcn_params)
参考:
https://www.cnblogs.com/yinzm/p/11827905.html
https://zhuanlan.zhihu.com/p/83814532
https://www.cnblogs.com/gogoSandy/p/12892973.html