DeepCTR_Torch 代码解析(基于Wide&Deep网络)
import pandas as pd
import torch
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from deepctr_torch.inputs import SparseFeat, DenseFeat, get_feature_names
from deepctr_torch.models import *
0. Load datasets
# Load the Criteo sample dataset (200 rows; dense columns I1-I13, sparse columns C1-C26).
data = pd.read_csv('./criteo_sample.txt')
# Peek at one sparse column: raw hashed category strings (dtype object).
data["C1"]
0 05db9164
1 68fd1e64
2 05db9164
3 05db9164
4 05db9164
...
195 05db9164
196 be589b51
197 05db9164
198 05db9164
199 be589b51
Name: C1, Length: 200, dtype: object
# Column names: 26 categorical fields C1..C26 and 13 numeric fields I1..I13.
sparse_features = [f'C{i}' for i in range(1, 27)]
dense_features = [f'I{i}' for i in range(1, 14)]
# Missing categoricals become the sentinel string '-1'; missing numerics become 0.
data[sparse_features] = data[sparse_features].fillna('-1')
data[dense_features] = data[dense_features].fillna(0)
target = ['label']
1. Label Encoding for sparse features,and do simple Transformation for dense features
# Integer-encode every categorical column in place (a fresh encoder per column).
for col in sparse_features:
    encoder = LabelEncoder()
    data[col] = encoder.fit_transform(data[col])
# Rescale all numeric columns jointly into [0, 1].
scaler = MinMaxScaler(feature_range=(0, 1))
data[dense_features] = scaler.fit_transform(data[dense_features])
2. count #unique features for each sparse field,and record dense feature field name
# Describe each field for DeepCTR: SparseFeat records a categorical column's
# vocabulary size; DenseFeat marks a 1-dimensional numeric column.
sparse_columns = [SparseFeat(feat, data[feat].nunique()) for feat in sparse_features]
dense_columns = [DenseFeat(feat, 1) for feat in dense_features]
fixlen_feature_columns = sparse_columns + dense_columns
# The wide (linear) and deep (dnn) parts use the same feature set here.
dnn_feature_columns = fixlen_feature_columns
linear_feature_columns = fixlen_feature_columns
feature_names = get_feature_names(
    linear_feature_columns + dnn_feature_columns)
# Feature descriptors for the deep part.
dnn_feature_columns
# A list of namedtuple-like objects (SparseFeat / DenseFeat); sample output below.
[SparseFeat(name='C1', vocabulary_size=27, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='C1', group_name='default_group'),
SparseFeat(name='C2', vocabulary_size=92, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='C2', group_name='default_group'),
SparseFeat(name='C3', vocabulary_size=172, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='C3', group_name='default_group'),
SparseFeat(name='C4', vocabulary_size=157, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='C4', group_name='default_group'),
SparseFeat(name='C5', vocabulary_size=12, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='C5', group_name='default_group'),
SparseFeat(name='C6', vocabulary_size=7, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='C6', group_name='default_group'),
SparseFeat(name='C7', vocabulary_size=183, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='C7', group_name='default_group'),
SparseFeat(name='C8', vocabulary_size=19, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='C8', group_name='default_group'),
SparseFeat(name='C9', vocabulary_size=2, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='C9', group_name='default_group'),
SparseFeat(name='C10', vocabulary_size=142, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='C10', group_name='default_group'),
SparseFeat(name='C11', vocabulary_size=173, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='C11', group_name='default_group'),
SparseFeat(name='C12', vocabulary_size=170, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='C12', group_name='default_group'),
SparseFeat(name='C13', vocabulary_size=166, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='C13', group_name='default_group'),
SparseFeat(name='C14', vocabulary_size=14, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='C14', group_name='default_group'),
SparseFeat(name='C15', vocabulary_size=170, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='C15', group_name='default_group'),
SparseFeat(name='C16', vocabulary_size=168, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='C16', group_name='default_group'),
SparseFeat(name='C17', vocabulary_size=9, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='C17', group_name='default_group'),
SparseFeat(name='C18', vocabulary_size=127, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='C18', group_name='default_group'),
SparseFeat(name='C19', vocabulary_size=44, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='C19', group_name='default_group'),
SparseFeat(name='C20', vocabulary_size=4, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='C20', group_name='default_group'),
SparseFeat(name='C21', vocabulary_size=169, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='C21', group_name='default_group'),
SparseFeat(name='C22', vocabulary_size=6, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='C22', group_name='default_group'),
SparseFeat(name='C23', vocabulary_size=10, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='C23', group_name='default_group'),
SparseFeat(name='C24', vocabulary_size=125, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='C24', group_name='default_group'),
SparseFeat(name='C25', vocabulary_size=20, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='C25', group_name='default_group'),
SparseFeat(name='C26', vocabulary_size=90, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='C26', group_name='default_group'),
DenseFeat(name='I1', dimension=1, dtype='float32'),
DenseFeat(name='I2', dimension=1, dtype='float32'),
DenseFeat(name='I3', dimension=1, dtype='float32'),
DenseFeat(name='I4', dimension=1, dtype='float32'),
DenseFeat(name='I5', dimension=1, dtype='float32'),
DenseFeat(name='I6', dimension=1, dtype='float32'),
DenseFeat(name='I7', dimension=1, dtype='float32'),
DenseFeat(name='I8', dimension=1, dtype='float32'),
DenseFeat(name='I9', dimension=1, dtype='float32'),
DenseFeat(name='I10', dimension=1, dtype='float32'),
DenseFeat(name='I11', dimension=1, dtype='float32'),
DenseFeat(name='I12', dimension=1, dtype='float32'),
DenseFeat(name='I13', dimension=1, dtype='float32')]
3. generate input data for model
# Hold out 20% of the rows for evaluation.
train, test = train_test_split(data, test_size=0.2)
# The model consumes a dict {feature_name: column of values}.
train_model_input = {}
test_model_input = {}
for name in feature_names:
    train_model_input[name] = train[name]
    test_model_input[name] = test[name]
# Inspect the training input (notebook-style display).
train_model_input
# 部分输出结果显示
{'C1': 85 11
151 25
197 0
36 11
127 11
..
35 0
40 24
64 16
154 0
165 0
Name: C1, Length: 160, dtype: int64, 'C2': 85 3
151 50
197 5
36 13
127 13
..
35 30
40 25
64 32
154 6
165 59
Name: C2, Length: 160, dtype: int64,
......
4. Define Model,train,predict and evaluate
4.1 训练
# Train on GPU when available, otherwise fall back to CPU.
device = 'cpu'
use_cuda = True
if use_cuda and torch.cuda.is_available():
    print('cuda ready...')
    device = 'cuda:0'

# Wide&Deep: wide (linear) and deep (DNN) parts over the same feature columns.
model = WDL(linear_feature_columns=linear_feature_columns, dnn_feature_columns=dnn_feature_columns,
            task='binary',
            l2_reg_embedding=1e-5, device=device)

# Keras-style API: pick optimizer, loss, and the metrics to track.
model.compile("adagrad", "binary_crossentropy",
              metrics=["binary_crossentropy", "auc"], )

model.fit(train_model_input, train[target].values, batch_size=32, epochs=10, verbose=2, validation_split=0.0)

# Evaluate on the held-out split.
pred_ans = model.predict(test_model_input, 256)
print("")
print("test LogLoss", round(log_loss(test[target].values, pred_ans), 4))
print("test AUC", round(roc_auc_score(test[target].values, pred_ans), 4))
cuda ready...
cpu
Train on 160 samples, validate on 0 samples, 5 steps per epoch
Epoch 1/10
0s - loss: 0.5990 - binary_crossentropy: 0.5990 - auc: 0.6504
Epoch 2/10
0s - loss: 0.4579 - binary_crossentropy: 0.4579 - auc: 0.9717
Epoch 3/10
0s - loss: 0.2952 - binary_crossentropy: 0.2952 - auc: 0.9985
Epoch 4/10
0s - loss: 0.1802 - binary_crossentropy: 0.1802 - auc: 0.9990
Epoch 5/10
0s - loss: 0.1349 - binary_crossentropy: 0.1349 - auc: 1.0000
Epoch 6/10
0s - loss: 0.1112 - binary_crossentropy: 0.1112 - auc: 1.0000
Epoch 7/10
0s - loss: 0.0968 - binary_crossentropy: 0.0968 - auc: 1.0000
Epoch 8/10
0s - loss: 0.0839 - binary_crossentropy: 0.0839 - auc: 1.0000
Epoch 9/10
0s - loss: 0.0738 - binary_crossentropy: 0.0738 - auc: 1.0000
Epoch 10/10
0s - loss: 0.0663 - binary_crossentropy: 0.0663 - auc: 1.0000
test LogLoss 1.0295
test AUC 0.4265
4.2 Wide&Deep模型
4.2.1 模型架构
4.2.2 模型源码
# Same constructor call as in section 4.1, repeated before walking the source.
model = WDL(linear_feature_columns=linear_feature_columns, dnn_feature_columns=dnn_feature_columns,
            task='binary',
            l2_reg_embedding=1e-5, device=device)
# -*- coding:utf-8 -*-
"""
Author:
Weichen Shen,wcshen1994@163.com
Reference:
[1] Cheng H T, Koc L, Harmsen J, et al. Wide & deep learning for recommender systems[C]//Proceedings of the 1st Workshop on Deep Learning for Recommender Systems. ACM, 2016: 7-10.(https://arxiv.org/pdf/1606.07792.pdf)
"""
import torch.nn as nn
from .basemodel import BaseModel
from ..inputs import combined_dnn_input
from ..layers import DNN
class WDL(BaseModel):
    """Instantiates the Wide&Deep Learning architecture.

    :param linear_feature_columns: An iterable containing all the features used by linear part of the model.
    :param dnn_feature_columns: An iterable containing all the features used by deep part of the model.
    :param dnn_hidden_units: list,list of positive integer or empty list, the layer number and units in each layer of DNN
    :param l2_reg_linear: float. L2 regularizer strength applied to wide part
    :param l2_reg_embedding: float. L2 regularizer strength applied to embedding vector
    :param l2_reg_dnn: float. L2 regularizer strength applied to DNN
    :param init_std: float,to use as the initialize std of embedding vector
    :param seed: integer ,to use as random seed.
    :param dnn_dropout: float in [0,1), the probability we will drop out a given DNN coordinate.
    :param dnn_activation: Activation function to use in DNN
    :param task: str, ``"binary"`` for binary logloss or ``"regression"`` for regression loss
    :param device: str, ``"cpu"`` or ``"cuda:0"``
    :return: A PyTorch model instance.
    """

    def __init__(self,
                 linear_feature_columns, dnn_feature_columns, dnn_hidden_units=(256, 128),
                 l2_reg_linear=1e-5,
                 l2_reg_embedding=1e-5, l2_reg_dnn=0, init_std=0.0001, seed=1024, dnn_dropout=0, dnn_activation='relu',
                 dnn_use_bn=False,
                 task='binary', device='cpu'):
        # BaseModel builds the wide linear model, the embedding dict, and the output layer.
        super(WDL, self).__init__(linear_feature_columns, dnn_feature_columns, l2_reg_linear=l2_reg_linear,
                                  l2_reg_embedding=l2_reg_embedding, init_std=init_std, seed=seed, task=task,
                                  device=device)

        # The deep part is active only when there are DNN features AND hidden layers.
        self.use_dnn = len(dnn_feature_columns) > 0 and len(
            dnn_hidden_units) > 0
        if self.use_dnn:
            # Hidden layers of the deep part.
            self.dnn = DNN(self.compute_input_dim(dnn_feature_columns), dnn_hidden_units,
                           activation=dnn_activation, l2_reg=l2_reg_dnn, dropout_rate=dnn_dropout, use_bn=dnn_use_bn,
                           init_std=init_std, device=device)
            # Output layer: map the last hidden layer to a single logit.
            self.dnn_linear = nn.Linear(dnn_hidden_units[-1], 1, bias=False).to(device)
            # Register DNN weights (excluding BatchNorm params) for L2 regularization.
            self.add_regularization_weight(
                filter(lambda x: 'weight' in x[0] and 'bn' not in x[0], self.dnn.named_parameters()), l2_reg_dnn)
            self.add_regularization_weight(self.dnn_linear.weight, l2_reg_dnn)

        self.to(device)

    def forward(self, X):
        # Embed sparse features and slice out dense values (see create_embedding_matrix
        # in section 4.2.3 and input_from_feature_columns in 4.2.4).
        sparse_embedding_list, dense_value_list = self.input_from_feature_columns(X, self.dnn_feature_columns,
                                                                                 self.embedding_dict)
        # Wide part (memorization): linear model over the raw features.
        logit = self.linear_model(X)
        # Deep part (generalization).
        if self.use_dnn:
            # Concatenate sparse embeddings and dense values into one input vector.
            dnn_input = combined_dnn_input(sparse_embedding_list, dense_value_list)
            dnn_output = self.dnn(dnn_input)
            dnn_logit = self.dnn_linear(dnn_output)
            # Sum the wide and deep logits.
            logit += dnn_logit
        # Final activation: self.out is a PredictionLayer (sigmoid for the binary task).
        y_pred = self.out(logit)
        return y_pred
4.2.3 self.embedding_dict
def create_embedding_matrix(feature_columns, init_std=0.0001, linear=False, sparse=False, device='cpu'):
    """Create one embedding table per sparse / var-len sparse feature.

    Returns an nn.ModuleDict mapping ``embedding_name`` to an ``nn.Embedding``
    of shape (vocabulary_size, embedding_dim). The linear ("wide") part uses
    1-dim embeddings, i.e. one scalar weight per category.
    """
    sparse_cols = [fc for fc in feature_columns if isinstance(fc, SparseFeat)] if len(feature_columns) else []
    varlen_cols = [fc for fc in feature_columns if isinstance(fc, VarLenSparseFeat)] if len(feature_columns) else []

    # Build one table per feature; the wide part collapses each embedding to a scalar.
    tables = {}
    for fc in sparse_cols + varlen_cols:
        dim = 1 if linear else fc.embedding_dim
        tables[fc.embedding_name] = nn.Embedding(fc.vocabulary_size, dim, sparse=sparse)
    embedding_dict = nn.ModuleDict(tables)

    # Initialise every embedding weight from N(0, init_std).
    for emb in embedding_dict.values():
        nn.init.normal_(emb.weight, mean=0, std=init_std)

    return embedding_dict.to(device)
4.2.4 稀疏特征编码
def input_from_feature_columns(self, X, feature_columns, embedding_dict, support_dense=True):
    """Split the flat input matrix X into embedded sparse features and raw dense values.

    :param X: input tensor; feature columns are located via ``self.feature_index`` slices.
    :param feature_columns: iterable of SparseFeat / DenseFeat / VarLenSparseFeat descriptors.
    :param embedding_dict: nn.ModuleDict mapping embedding_name -> nn.Embedding.
    :param support_dense: if False, raise when dense features are present.
    :return: (sparse embedding tensors + pooled var-len embedding tensors, dense value tensors)
    """
    sparse_feature_columns = list(
        filter(lambda x: isinstance(x, SparseFeat), feature_columns)) if len(feature_columns) else []
    dense_feature_columns = list(
        filter(lambda x: isinstance(x, DenseFeat), feature_columns)) if len(feature_columns) else []
    varlen_sparse_feature_columns = list(
        filter(lambda x: isinstance(x, VarLenSparseFeat), feature_columns)) if feature_columns else []

    if not support_dense and len(dense_feature_columns) > 0:
        raise ValueError(
            "DenseFeat is not supported in dnn_feature_columns")

    # Embed each sparse feature: slice its columns out of X via self.feature_index,
    # cast to long, and feed through its embedding table.
    # NOTE(review): each result is presumably (batch, 1, embedding_dim) — confirm.
    sparse_embedding_list = [embedding_dict[feat.embedding_name](
        X[:, self.feature_index[feat.name][0]:self.feature_index[feat.name][1]].long()) for
        feat in sparse_feature_columns]

    # Var-len sparse features are embedded and pooled by get_varlen_pooling_list
    # (defined elsewhere; pooling semantics not shown here).
    varlen_sparse_embedding_list = get_varlen_pooling_list(self.embedding_dict, X, self.feature_index,
                                                           varlen_sparse_feature_columns, self.device)

    # Dense features pass through as raw value slices.
    dense_value_list = [X[:, self.feature_index[feat.name][0]:self.feature_index[feat.name][1]] for feat in
                        dense_feature_columns]

    return sparse_embedding_list + varlen_sparse_embedding_list, dense_value_list
4.2.5 DNN 层的构建
class DNN(nn.Module):
    """Multi-layer perceptron used as the deep component.

    Input shape
        - nD tensor ``(batch_size, ..., inputs_dim)``; most commonly 2-D
          ``(batch_size, inputs_dim)``.
    Output shape
        - nD tensor ``(batch_size, ..., hidden_units[-1])``.

    Arguments
        - **inputs_dim**: input feature dimension.
        - **hidden_units**: non-empty list of positive integers, one entry per layer.
        - **activation**: activation function name used after each layer.
        - **l2_reg**: float in [0, 1]. L2 regularizer strength for the kernel weights.
        - **dropout_rate**: float in [0, 1). Fraction of units to drop.
        - **use_bn**: bool. Apply BatchNormalization before the activation.
        - **seed**: Python integer used as random seed.
    """

    def __init__(self, inputs_dim, hidden_units, activation='relu', l2_reg=0, dropout_rate=0, use_bn=False,
                 init_std=0.0001, dice_dim=3, seed=1024, device='cpu'):
        super(DNN, self).__init__()
        self.dropout_rate = dropout_rate
        self.dropout = nn.Dropout(dropout_rate)
        self.seed = seed
        self.l2_reg = l2_reg
        self.use_bn = use_bn
        if len(hidden_units) == 0:
            raise ValueError("hidden_units is empty!!")

        # Layer widths: input dim followed by each hidden width.
        dims = [inputs_dim] + list(hidden_units)
        fan_pairs = list(zip(dims[:-1], dims[1:]))

        # Fully connected stack.
        self.linears = nn.ModuleList(
            [nn.Linear(fan_in, fan_out) for fan_in, fan_out in fan_pairs])
        # Optional BatchNorm, applied before the activation.
        if self.use_bn:
            self.bn = nn.ModuleList(
                [nn.BatchNorm1d(fan_out) for _, fan_out in fan_pairs])
        # One activation module per layer (Dice needs the layer width and dice_dim).
        self.activation_layers = nn.ModuleList(
            [activation_layer(activation, fan_out, dice_dim) for _, fan_out in fan_pairs])

        # Linear weights start from N(0, init_std); biases keep PyTorch defaults.
        for name, param in self.linears.named_parameters():
            if 'weight' in name:
                nn.init.normal_(param, mean=0, std=init_std)

        self.to(device)

    def forward(self, inputs):
        # Pipe the input through linear -> (bn) -> activation -> dropout per layer.
        out = inputs
        for layer_idx, linear in enumerate(self.linears):
            out = linear(out)
            if self.use_bn:
                out = self.bn[layer_idx](out)
            out = self.activation_layers[layer_idx](out)
            out = self.dropout(out)
        return out
4.2.6 预测结果
class PredictionLayer(nn.Module):
    """Final output transform: optional global bias plus task activation.

    Arguments
        - **task**: str, ``"binary"`` for binary logloss or ``"regression"`` for regression loss
        - **use_bias**: bool. Whether add bias term or not.

    Raises ValueError if ``task`` is not one of binary/multiclass/regression.
    """

    def __init__(self, task='binary', use_bias=True, **kwargs):
        if task not in ["binary", "multiclass", "regression"]:
            raise ValueError("task must be binary,multiclass or regression")
        super(PredictionLayer, self).__init__()
        self.use_bias = use_bias
        self.task = task
        if self.use_bias:
            # A single learnable global bias added to the incoming logit.
            self.bias = nn.Parameter(torch.zeros((1,)))

    def forward(self, X):
        output = X
        if self.use_bias:
            # Fixed: the original used ``output += self.bias``, which mutates the
            # caller's logit tensor in place; use an out-of-place add instead.
            output = output + self.bias
        if self.task == "binary":
            # Binary classification: squash the logit to a probability with sigmoid.
            output = torch.sigmoid(output)
        return output
4.2.7 模型可视化
WDL(
(embedding_dict): ModuleDict(
(C1): Embedding(27, 4)
(C10): Embedding(142, 4)
(C11): Embedding(173, 4)
(C12): Embedding(170, 4)
(C13): Embedding(166, 4)
(C14): Embedding(14, 4)
(C15): Embedding(170, 4)
(C16): Embedding(168, 4)
(C17): Embedding(9, 4)
(C18): Embedding(127, 4)
(C19): Embedding(44, 4)
(C2): Embedding(92, 4)
(C20): Embedding(4, 4)
(C21): Embedding(169, 4)
(C22): Embedding(6, 4)
(C23): Embedding(10, 4)
(C24): Embedding(125, 4)
(C25): Embedding(20, 4)
(C26): Embedding(90, 4)
(C3): Embedding(172, 4)
(C4): Embedding(157, 4)
(C5): Embedding(12, 4)
(C6): Embedding(7, 4)
(C7): Embedding(183, 4)
(C8): Embedding(19, 4)
(C9): Embedding(2, 4)
)
(linear_model): Linear(
(embedding_dict): ModuleDict(
(C1): Embedding(27, 1)
(C10): Embedding(142, 1)
(C11): Embedding(173, 1)
(C12): Embedding(170, 1)
(C13): Embedding(166, 1)
(C14): Embedding(14, 1)
(C15): Embedding(170, 1)
(C16): Embedding(168, 1)
(C17): Embedding(9, 1)
(C18): Embedding(127, 1)
(C19): Embedding(44, 1)
(C2): Embedding(92, 1)
(C20): Embedding(4, 1)
(C21): Embedding(169, 1)
(C22): Embedding(6, 1)
(C23): Embedding(10, 1)
(C24): Embedding(125, 1)
(C25): Embedding(20, 1)
(C26): Embedding(90, 1)
(C3): Embedding(172, 1)
(C4): Embedding(157, 1)
(C5): Embedding(12, 1)
(C6): Embedding(7, 1)
(C7): Embedding(183, 1)
(C8): Embedding(19, 1)
(C9): Embedding(2, 1)
)
)
(out): PredictionLayer()
(dnn): DNN(
(dropout): Dropout(p=0, inplace=False)
(linears): ModuleList(
(0): Linear(in_features=117, out_features=256, bias=True)
(1): Linear(in_features=256, out_features=128, bias=True)
)
(activation_layers): ModuleList(
(0): ReLU(inplace=True)
(1): ReLU(inplace=True)
)
)
(dnn_linear): Linear(in_features=128, out_features=1, bias=False)
)
4.3 compile接口
4.3.1 接口调用
# Configure optimizer, loss, and tracked metrics (Keras-style API).
model.compile("adagrad", "binary_crossentropy",
              metrics=["binary_crossentropy", "auc"], )
4.3.2 函数实现
def compile(self, optimizer, loss=None, metrics=None,):
    """
    :param optimizer: String (name of optimizer) or optimizer instance. See [optimizers](https://pytorch.org/docs/stable/optim.html).
    :param loss: String (name of objective function) or objective function. See [losses](https://pytorch.org/docs/stable/nn.functional.html#loss-functions).
    :param metrics: List of metrics to be evaluated by the model during training and testing. Typically you will use `metrics=['accuracy']`.
    """
    self.metrics_names = ["loss"]  # tracked metric names, e.g. ["loss", "auc", "logloss"]
    self.optim = self._get_optim(optimizer)  # resolve the optimizer
    self.loss_func = self._get_loss_func(loss)  # resolve the loss function
    # Must run after metrics_names is set: _get_metrics appends to it.
    self.metrics = self._get_metrics(metrics)  # resolve metric callables (a dict)
def _get_optim(self, optimizer):
    """Resolve ``optimizer`` (a name string or an optimizer instance) to a torch optimizer."""
    if not isinstance(optimizer, str):
        # Already an optimizer instance: pass it through untouched.
        return optimizer
    if optimizer == "sgd":
        return torch.optim.SGD(self.parameters(), lr=0.01)
    if optimizer == "adam":
        return torch.optim.Adam(self.parameters())  # default lr = 0.001
    if optimizer == "adagrad":
        return torch.optim.Adagrad(self.parameters())  # default lr = 0.01
    if optimizer == "rmsprop":
        return torch.optim.RMSprop(self.parameters())
    raise NotImplementedError
def _get_loss_func(self, loss):
    """Resolve ``loss`` (a name string or a callable) to a loss function."""
    if not isinstance(loss, str):
        # Custom callable passed straight through.
        return loss
    if loss == "binary_crossentropy":
        return F.binary_cross_entropy
    if loss == "mse":
        return F.mse_loss
    if loss == "mae":
        return F.l1_loss
    raise NotImplementedError
def _log_loss(self, y_true, y_pred, eps=1e-7, normalize=True, sample_weight=None, labels=None):
    """sklearn ``log_loss`` wrapper with an adjustable ``eps`` for better numerical accuracy."""
    return log_loss(y_true, y_pred, eps=eps, normalize=normalize,
                    sample_weight=sample_weight, labels=labels)
def _get_metrics(self, metrics, set_eps=False):
    """Map metric names to callables; record every requested name in ``self.metrics_names``."""
    metrics_ = {}
    if metrics:
        for metric in metrics:
            # logloss / binary_crossentropy: optionally use the eps-tweaked wrapper.
            if metric in ("binary_crossentropy", "logloss"):
                metrics_[metric] = self._log_loss if set_eps else log_loss
            if metric == "auc":
                metrics_[metric] = roc_auc_score
            if metric == "mse":
                metrics_[metric] = mean_squared_error
            if metric in ("accuracy", "acc"):
                # Threshold probabilities at 0.5 before computing accuracy.
                metrics_[metric] = lambda y_true, y_pred: accuracy_score(
                    y_true, np.where(y_pred > 0.5, 1, 0))
            # Name is recorded even for unrecognized metrics (original behavior).
            self.metrics_names.append(metric)
    return metrics_
4.4 fit接口
def fit(self, x=None, y=None, batch_size=None, epochs=1, verbose=1, initial_epoch=0, validation_split=0.,
        validation_data=None, shuffle=True, callbacks=None):
    """
    :param x: Numpy array of training data (if the model has a single input), or list of Numpy arrays (if the model has multiple inputs).If input layers in the model are named, you can also pass a
        dictionary mapping input names to Numpy arrays.
    :param y: Numpy array of target (label) data (if the model has a single output), or list of Numpy arrays (if the model has multiple outputs).
    :param batch_size: Integer or `None`. Number of samples per gradient update. If unspecified, `batch_size` will default to 256.
    :param epochs: Integer. Number of epochs to train the model. An epoch is an iteration over the entire `x` and `y` data provided. Note that in conjunction with `initial_epoch`, `epochs` is to be understood as "final epoch". The model is not trained for a number of iterations given by `epochs`, but merely until the epoch of index `epochs` is reached.
    :param verbose: Integer. 0, 1, or 2. Verbosity mode. 0 = silent, 1 = progress bar, 2 = one line per epoch.
    :param initial_epoch: Integer. Epoch at which to start training (useful for resuming a previous training run).
    :param validation_split: Float between 0 and 1. Fraction of the training data to be used as validation data. The model will set apart this fraction of the training data, will not train on it, and will evaluate the loss and any model metrics on this data at the end of each epoch. The validation data is selected from the last samples in the `x` and `y` data provided, before shuffling.
    :param validation_data: tuple `(x_val, y_val)` or tuple `(x_val, y_val, val_sample_weights)` on which to evaluate the loss and any model metrics at the end of each epoch. The model will not be trained on this data. `validation_data` will override `validation_split`.
    :param shuffle: Boolean. Whether to shuffle the order of the batches at the beginning of each epoch.
    :param callbacks: List of `deepctr_torch.callbacks.Callback` instances. List of callbacks to apply during training and validation (if ). See [callbacks](https://tensorflow.google.cn/api_docs/python/tf/keras/callbacks). Now available: `EarlyStopping` , `ModelCheckpoint`
    """
    # NOTE(review): the bare string below is the tutorial author's illustration of
    # the incoming data layout (a dict of pandas Series keyed by feature name);
    # it is a no-op expression and is kept verbatim.
    """
    输入前的数据类型
    x = {"f1":(索引+value)(Serise类型), "f2": data2}
    {'C1': 85     11
    151    25
    197     0
    36     11
    127    11
    ..
    35      0
    40     24
    64     16
    154     0
    165     0
    Name: C1, Length: 160, dtype: int64, 'C2': 85      3
    151    50
    197     5
    36     13
    127    13
    ..
    35     30
    40     25
    64     32
    154     6
    165    59
    Name: C2, Length: 160, dtype: int64, 'C3': 85     77
    """
    # A dict input is flattened to a list of per-feature Series,
    # ordered by self.feature_index.
    if isinstance(x, dict):
        x = [x[feature] for feature in self.feature_index]

    # --- validation data handling ---
    # Case 1: an explicit validation set was provided.
    do_validation = False
    if validation_data:
        do_validation = True
        if len(validation_data) == 2:
            val_x, val_y = validation_data
            val_sample_weight = None
        elif len(validation_data) == 3:
            val_x, val_y, val_sample_weight = validation_data  # pylint: disable=unpacking-non-sequence
        else:
            raise ValueError(
                'When passing a `validation_data` argument, '
                'it must contain either 2 items (x_val, y_val), '
                'or 3 items (x_val, y_val, val_sample_weights), '
                'or alternatively it could be a dataset or a '
                'dataset or a dataset iterator. '
                'However we received `validation_data=%s`' % validation_data)
        if isinstance(val_x, dict):
            val_x = [val_x[feature] for feature in self.feature_index]
    # Case 2: carve the validation set out of the tail of the training data.
    elif validation_split and 0. < validation_split < 1.:
        do_validation = True
        # x[0] holds a single feature's values (a pandas Series or array).
        if hasattr(x[0], 'shape'):
            split_at = int(x[0].shape[0] * (1. - validation_split))
        else:
            split_at = int(len(x[0]) * (1. - validation_split))
        x, val_x = (slice_arrays(x, 0, split_at),
                    slice_arrays(x, split_at))
        y, val_y = (slice_arrays(y, 0, split_at),
                    slice_arrays(y, split_at))
    else:
        val_x = []
        val_y = []

    # --- input preparation ---
    # x is a list of per-feature 1-D arrays; make each 2-D, e.g. (160,) -> (160, 1).
    for i in range(len(x)):
        if len(x[i].shape) == 1:
            x[i] = np.expand_dims(x[i], axis=1)

    train_tensor_data = Data.TensorDataset(
        torch.from_numpy(
            np.concatenate(x, axis=-1)),  # concat yields (num_samples, num_features), e.g. (160, 39)
        torch.from_numpy(y))
    if batch_size is None:
        batch_size = 256
    train_loader = DataLoader(
        dataset=train_tensor_data, shuffle=shuffle, batch_size=batch_size)

    print(self.device, end="\n")
    model = self.train()  # switch the module to training mode
    loss_func = self.loss_func
    optim = self.optim
    sample_num = len(train_tensor_data)
    steps_per_epoch = (sample_num - 1) // batch_size + 1

    callbacks = CallbackList(callbacks)
    callbacks.set_model(self)
    callbacks.on_train_begin()
    self.stop_training = False  # used for early stopping

    # --- training loop ---
    print("Train on {0} samples, validate on {1} samples, {2} steps per epoch".format(
        len(train_tensor_data), len(val_y), steps_per_epoch))
    for epoch in range(initial_epoch, epochs):
        callbacks.on_epoch_begin(epoch)
        epoch_logs = {}
        start_time = time.time()
        loss_epoch = 0
        total_loss_epoch = 0
        train_result = {}
        try:
            with tqdm(enumerate(train_loader), disable=verbose != 1) as t:
                for index, (x_train, y_train) in t:
                    x = x_train.to(self.device).float()
                    y = y_train.to(self.device).float()

                    y_pred = model(x).squeeze()

                    optim.zero_grad()
                    loss = loss_func(y_pred, y.squeeze(), reduction='sum')
                    reg_loss = self.get_regularization_loss()

                    total_loss = loss + reg_loss + self.aux_loss  # aux_loss is an auxiliary loss term

                    loss_epoch += loss.item()
                    total_loss_epoch += total_loss.item()
                    total_loss.backward(retain_graph=True)
                    optim.step()

                    # Per-batch metric evaluation (only when verbose).
                    if verbose > 0:
                        for name, metric_fun in self.metrics.items():
                            if name not in train_result:
                                train_result[name] = []
                            train_result[name].append(metric_fun(
                                y.cpu().data.numpy(), y_pred.cpu().data.numpy().astype("float64")))
        except KeyboardInterrupt:
            t.close()
            raise
        t.close()

        # Add epoch_logs: average the total loss over samples, metrics over steps.
        epoch_logs["loss"] = total_loss_epoch / sample_num
        for name, result in train_result.items():
            epoch_logs[name] = np.sum(result) / steps_per_epoch

        if do_validation:
            eval_result = self.evaluate(val_x, val_y, batch_size)
            for name, result in eval_result.items():
                epoch_logs["val_" + name] = result
        # verbose: print one summary line per epoch.
        if verbose > 0:
            epoch_time = int(time.time() - start_time)
            print('Epoch {0}/{1}'.format(epoch + 1, epochs))

            eval_str = "{0}s - loss: {1: .4f}".format(
                epoch_time, epoch_logs["loss"])

            for name in self.metrics:
                eval_str += " - " + name + \
                    ": {0: .4f}".format(epoch_logs[name])

            if do_validation:
                for name in self.metrics:
                    eval_str += " - " + "val_" + name + \
                        ": {0: .4f}".format(epoch_logs["val_" + name])
            print(eval_str)
        callbacks.on_epoch_end(epoch, epoch_logs)
        if self.stop_training:
            break

    callbacks.on_train_end()