对 GitHub 上的 wechat_big_data_baseline_pytorch 进行了修改,主要是整合了 512 维的 feed embedding 内容描述字段。可惜错过了提交截止时间,不过看上去 AUC 结果还不错。这是我第一次独自一人完成整个大数据竞赛,特此纪念。
不含 embedding 的数据预处理
# -*- coding: utf-8 -*-
#准备数据,跑一次就好
import numpy as np
import pandas as pd
from tqdm import tqdm
from google.colab import drive
drive.mount('/content/drive')
# Root directory (on the mounted Google Drive) where all data is stored.
ROOT_PATH = "/content/drive/MyDrive/Colab Notebooks"
# Directory holding the competition dataset.
DATASET_PATH = ROOT_PATH + '/'
# Training inputs.
USER_ACTION = DATASET_PATH + "user_action.csv"
FEED_INFO = DATASET_PATH + "feed_info.csv"
FEED_EMBEDDINGS = DATASET_PATH + "feed_embeddings.csv"
# Test set.
TEST_FILE = DATASET_PATH + "test_a.csv"
# Actions to predict in the preliminary round.
ACTION_LIST = ["read_comment", "like", "click_avatar", "forward"]
FEA_COLUMN_LIST = ["read_comment", "like", "click_avatar", "forward", "comment", "follow", "favorite"]
# Columns taken from feed_info.csv. In this cell the list is only used to
# index feed_info_df, so it must not name the derived "emorder*" columns:
# those are built later by process_ordered_embed (presumably they are absent
# from feed_info.csv -- confirm), and indexing with a missing label raises
# KeyError.
FEA_FEED_LIST = ['feedid', 'authorid', 'videoplayseconds', 'bgm_song_id', 'bgm_singer_id']
# Negative down-sampling rate per action: the preparation code keeps a
# 1/rate fraction of the negative rows.
ACTION_SAMPLE_RATE = {"read_comment": 5, "like": 5, "click_avatar": 5, "forward": 10, "comment": 10, "follow": 10,
                      "favorite": 10}
def process_embed(train, start, end):
    """Expand a slice of every feed embedding into numeric columns.

    Args:
        train: DataFrame with a ``feed_embedding`` column whose cells are
            whitespace-separated float strings.
        start: Index of the first embedding component to keep (inclusive).
        end: Index one past the last embedding component to keep.

    Returns:
        A new DataFrame holding every column of ``train`` except
        ``feed_embedding``, followed by ``embed{start}`` ..
        ``embed{end - 1}``. Rows whose embedding is missing or blank get
        zeros. The input frame is left unmodified.

    Raises:
        ValueError: If a non-empty embedding has fewer than ``end``
            components.
    """
    width = end - start
    feed_embed_array = np.zeros((train.shape[0], width))
    # Iterate positionally so the result does not depend on the index labels.
    for row, raw in enumerate(tqdm(train['feed_embedding'].to_numpy())):
        # NaN never compares equal to anything, so `raw != np.nan` is always
        # True; pd.isna is the reliable missing-value test.
        if pd.isna(raw):
            continue
        tokens = str(raw).split()
        if not tokens:
            continue
        # Only convert the requested slice instead of the whole vector.
        values = [float(tok) for tok in tokens[start:end]]
        if len(values) != width:
            raise ValueError(
                f"row {row}: embedding has {len(tokens)} components, "
                f"cannot take slice [{start}:{end}]")
        feed_embed_array[row] = values
    temp = pd.DataFrame(feed_embed_array,
                        columns=[f"embed{start + k}" for k in range(width)],
                        index=train.index)
    # drop() returns a copy, so the caller's frame keeps its raw column.
    return pd.concat((train.drop(columns='feed_embedding'), temp), axis=1)
def prepare_data0():
    """Write the test table and per-(action, embedding slice) training CSVs.

    Reads FEED_INFO, USER_ACTION, TEST_FILE and FEED_EMBEDDINGS. Writes
    ``test_data.csv`` and, for every 16-wide embedding slice and every action
    in ACTION_LIST, ``train_data_for_{action}_{subfeat}.csv`` under ROOT_PATH.
    Each training file is negative-down-sampled and then reduced to 1/30 of
    its rows.
    """
    feed_info_df = pd.read_csv(FEED_INFO)
    user_action_df = pd.read_csv(USER_ACTION)[["userid", "date_", "feedid"] + FEA_COLUMN_LIST]
    test = pd.read_csv(TEST_FILE)
    # FEA_FEED_LIST may also name derived columns (emorder*) that are not
    # stored in feed_info.csv; selecting a missing label raises KeyError, so
    # keep only the columns that are really present.
    feed_cols = [col for col in FEA_FEED_LIST if col in feed_info_df.columns]
    feed_feature_df = feed_info_df[feed_cols]
    # Add feed features.
    train0 = pd.merge(user_action_df, feed_feature_df, on='feedid', how='left')
    test = pd.merge(test, feed_feature_df, on='feedid', how='left')
    test["videoplayseconds"] = np.log(test["videoplayseconds"] + 1.0)
    test.to_csv(ROOT_PATH + '/test_data.csv', index=False)
    # Parse the embeddings CSV once instead of once per slice. process_embed
    # may drop the raw column from the frame it receives, so hand it a copy.
    raw_embeddings = pd.read_csv(FEED_EMBEDDINGS)
    for subfeat in range(0, 512, 16):
        print(f"subfeat for {subfeat}")
        feat = process_embed(raw_embeddings.copy(), subfeat, subfeat + 16)
        train = pd.merge(train0, feat, on='feedid', how='left')
        for action in tqdm(ACTION_LIST):
            print(f"prepare data for {action}")
            tmp = train.drop_duplicates(['userid', 'feedid', action], keep='last')
            df_neg = tmp[tmp[action] == 0]
            df_neg = df_neg.sample(frac=1.0 / ACTION_SAMPLE_RATE[action], random_state=42, replace=False)
            df_all = pd.concat([df_neg, tmp[tmp[action] == 1]])
            print(f"sample cnt:{df_all.shape[0]/30}")
            # Seeded like the negative sampling above so reruns produce the
            # same files.
            df_all = df_all.sample(n=int(df_all.shape[0] / 30), random_state=42)
            df_all["videoplayseconds"] = np.log(df_all["videoplayseconds"] + 1.0)
            df_all.to_csv(ROOT_PATH + f'/train_data_for_{action}_{subfeat}.csv', index=False)
def prepare_data():
    """Write the embedding-free test table and per-action training CSVs.

    Reads FEED_INFO, USER_ACTION and TEST_FILE. Writes ``test_data.csv`` and
    one negative-down-sampled ``train_data_for_{action}.csv`` per action in
    ACTION_LIST under ROOT_PATH.

    The previous body referenced ``train0``, ``temp`` and ``subfeat``, none
    of which is defined in this scope, so it raised NameError before writing
    anything. Those embedding leftovers are removed; embedding slices are
    handled by prepare_data0().
    """
    feed_info_df = pd.read_csv(FEED_INFO)
    user_action_df = pd.read_csv(USER_ACTION)[["userid", "date_", "feedid"] + FEA_COLUMN_LIST]
    test = pd.read_csv(TEST_FILE)
    # FEA_FEED_LIST may also name derived columns (emorder*) that are not
    # stored in feed_info.csv; keep only the columns that are really present.
    feed_cols = [col for col in FEA_FEED_LIST if col in feed_info_df.columns]
    feed_feature_df = feed_info_df[feed_cols]
    # Add feed features.
    train = pd.merge(user_action_df, feed_feature_df, on='feedid', how='left')
    test = pd.merge(test, feed_feature_df, on='feedid', how='left')
    test["videoplayseconds"] = np.log(test["videoplayseconds"] + 1.0)
    test.to_csv(ROOT_PATH + '/test_data.csv', index=False)
    for action in tqdm(ACTION_LIST):
        print(f"prepare data for {action}")
        tmp = train.drop_duplicates(['userid', 'feedid', action], keep='last')
        df_neg = tmp[tmp[action] == 0]
        df_neg = df_neg.sample(frac=1.0 / ACTION_SAMPLE_RATE[action], random_state=42, replace=False)
        df_all = pd.concat([df_neg, tmp[tmp[action] == 1]])
        df_all["videoplayseconds"] = np.log(df_all["videoplayseconds"] + 1.0)
        df_all.to_csv(ROOT_PATH + f'/train_data_for_{action}.csv', index=False)


# NOTE(review): the correlation step later in this file reads
# train_data_for_{action}_{subfeat}.csv, which only prepare_data0() writes;
# run prepare_data0() as well when those files are needed.
prepare_data()
简单的 embedding 降维处理(计算各维 embedding 与目标行为的相关系数),结果存盘备用
import numpy as np
import pandas as pd
import seaborn as sns
# Correlation of every embedding component with each target action:
# one row per action, one column per embed* component.
per_action_rows = []
for action in ACTION_LIST:
    slice_rows = []
    for subfeat in range(0, 512, 16):
        train = pd.read_csv(ROOT_PATH + f'/train_data_for_{action}_{subfeat}.csv')
        # Single-row frame: the action's correlation with this slice's
        # embed* columns.
        slice_rows.append(train.corr().loc[[action]].filter(regex='embed.*'))
    # All slices share the same one-label index, so a column-wise concat
    # lines them up side by side.
    res = pd.concat(slice_rows, axis=1)
    per_action_rows.append(res)
resall = pd.concat(per_action_rows)
print("output=========>\n", resall.shape)
sns.lineplot(data=resall.T)
resall.to_csv(ROOT_PATH + "/resall.csv")
提取 embedding 相关系数
#提取embded权重
from google.colab import drive
drive.mount('/content/drive')
import numpy as np
import pandas as pd
# Drive folder that holds the saved correlation table.
ROOT_PATH = "/content/drive/MyDrive/Colab Notebooks"
resall = pd.read_csv(ROOT_PATH + "/resall.csv", index_col=0)
# For every row label of resall, rank the columns by absolute correlation,
# strongest first.
embedorder = {
    v: resall.loc[v].abs().sort_values(ascending=False)
    for v in resall.index
}
包含高相关系数 embedding 维度的数据预处理
# -*- coding: utf-8 -*-
#准备数据,跑一次就好
import numpy as np
import pandas as pd
from tqdm import tqdm
from google.colab import drive
drive.mount('/content/drive')
# Root directory (on the mounted Google Drive) where all data is stored.
ROOT_PATH = "/content/drive/MyDrive/Colab Notebooks"
# Directory holding the competition dataset.
DATASET_PATH = ROOT_PATH + '/'
# Training inputs.
USER_ACTION = DATASET_PATH + "user_action.csv"
FEED_INFO = DATASET_PATH + "feed_info.csv"
FEED_EMBEDDINGS = DATASET_PATH + "feed_embeddings.csv"
# Test set.
TEST_FILE = DATASET_PATH + "test_a.csv"
# Actions to predict in the preliminary round.
ACTION_LIST = ["read_comment", "like", "click_avatar", "forward"]
FEA_COLUMN_LIST = ["read_comment", "like", "click_avatar", "forward", "comment", "follow", "favorite"]
# Columns taken from feed_info.csv. In this cell the list is only used to
# index feed_info_df, so it must not name the "emorder*" columns: those are
# produced by process_ordered_embed and merged in afterwards (presumably
# they are absent from feed_info.csv -- confirm), and indexing with a
# missing label raises KeyError.
FEA_FEED_LIST = ['feedid', 'authorid', 'videoplayseconds', 'bgm_song_id', 'bgm_singer_id']
# Negative down-sampling rate per action: the preparation code keeps a
# 1/rate fraction of the negative rows.
ACTION_SAMPLE_RATE = {"read_comment": 5, "like": 5, "click_avatar": 5, "forward": 10, "comment": 10, "follow": 10,
                      "favorite": 10}
def process_ordered_embed(action, end):
    """Build the top-ranked embedding components for one action.

    Reads FEED_EMBEDDINGS and, for every feed, keeps the components named by
    the first ``end`` labels of the module-level ``embedorder[action]``
    ranking, in ranking order.

    Args:
        action: Key into ``embedorder`` selecting the ranking to use.
        end: Number of top-ranked components to keep.

    Returns:
        DataFrame with the non-embedding columns of the CSV followed by
        ``emorder0`` .. ``emorder{end - 1}``. Rows whose embedding is missing
        or blank get zeros.
    """
    train = pd.read_csv(FEED_EMBEDDINGS)
    # Ranking labels have the form "embed<k>"; strip the prefix to get the
    # component position. This does not depend on the row, so resolve it once
    # instead of once per feed.
    prefix_len = len("embed")
    picked = [int(embedorder[action].index[k][prefix_len:]) for k in range(end)]
    feed_embed_array = np.zeros((train.shape[0], end))
    for row, raw in enumerate(tqdm(train['feed_embedding'].to_numpy())):
        # NaN never compares equal to anything, so `raw != np.nan` is always
        # True; pd.isna is the reliable missing-value test.
        if pd.isna(raw):
            continue
        tokens = str(raw).split()
        if not tokens:
            continue
        feed_embed_array[row] = [float(tokens[col]) for col in picked]
    temp = pd.DataFrame(feed_embed_array,
                        columns=[f"emorder{k}" for k in range(end)],
                        index=train.index)
    return pd.concat((train.drop(columns='feed_embedding'), temp), axis=1)
def prepare_data_with_embded():
    """Write the test table and per-action training CSVs with emorder columns.

    Reads FEED_INFO, USER_ACTION and TEST_FILE. Writes ``test_data.csv`` and,
    for each action in ACTION_LIST, a negative-down-sampled
    ``train_data_for_{action}.csv`` under ROOT_PATH that additionally carries
    the 8 ``emorder*`` columns returned by process_ordered_embed for that
    action.
    """
    feed_info_df = pd.read_csv(FEED_INFO)
    user_action_df = pd.read_csv(USER_ACTION)[["userid", "date_", "feedid"] + FEA_COLUMN_LIST]
    test = pd.read_csv(TEST_FILE)
    # FEA_FEED_LIST may name the emorder* columns, which are only created by
    # process_ordered_embed below; selecting a label that feed_info.csv lacks
    # raises KeyError, so keep only the columns that are really present.
    feed_cols = [col for col in FEA_FEED_LIST if col in feed_info_df.columns]
    feed_feature_df = feed_info_df[feed_cols]
    # Add feed features.
    train0 = pd.merge(user_action_df, feed_feature_df, on='feedid', how='left')
    test = pd.merge(test, feed_feature_df, on='feedid', how='left')
    test["videoplayseconds"] = np.log(test["videoplayseconds"] + 1.0)
    test.to_csv(ROOT_PATH + '/test_data.csv', index=False)
    for action in tqdm(ACTION_LIST):
        # The ranking differs per action, so the embedding columns are
        # rebuilt for each one.
        feat = process_ordered_embed(action, 8)
        train = pd.merge(train0, feat, on='feedid', how='left')
        print(f"prepare data for {action}")
        tmp = train.drop_duplicates(['userid', 'feedid', action], keep='last')
        df_neg = tmp[tmp[action] == 0]
        df_neg = df_neg.sample(frac=1.0 / ACTION_SAMPLE_RATE[action], random_state=42, replace=False)
        df_all = pd.concat([df_neg, tmp[tmp[action] == 1]])
        df_all["videoplayseconds"] = np.log(df_all["videoplayseconds"] + 1.0)
        df_all.to_csv(ROOT_PATH + f'/train_data_for_{action}.csv', index=False)


prepare_data_with_embded()
训练并生成测试集预测结果
# -*- coding: utf-8 -*-
#训练
import numpy as np
import pandas as pd
import torch
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from tqdm import tqdm
from deepctr_torch.inputs import SparseFeat, DenseFeat, get_feature_names
from deepctr_torch.models.deepfm import *
from deepctr_torch.models.basemodel import *
from google.colab import drive
drive.mount('/content/drive')
# Root directory (on the mounted Google Drive) where all data is stored.
ROOT_PATH = "/content/drive/MyDrive/Colab Notebooks"
# Directory holding the competition dataset.
DATASET_PATH = f"{ROOT_PATH}/"
# Training inputs.
USER_ACTION = f"{DATASET_PATH}user_action.csv"
FEED_INFO = f"{DATASET_PATH}feed_info.csv"
FEED_EMBEDDINGS = f"{DATASET_PATH}feed_embeddings.csv"
# Test set.
TEST_FILE = f"{DATASET_PATH}test_a.csv"
# Actions to predict in the preliminary round.
ACTION_LIST = ["read_comment", "like", "click_avatar", "forward"]
FEA_COLUMN_LIST = [
    "read_comment",
    "like",
    "click_avatar",
    "forward",
    "comment",
    "follow",
    "favorite",
]
# Feed-level feature columns: the feed_info fields followed by the eight
# ranked embedding columns emorder0..emorder7.
FEA_FEED_LIST = (
    ['feedid', 'authorid', 'videoplayseconds', 'bgm_song_id', 'bgm_singer_id']
    + ['emorder%d' % k for k in range(8)]
)
# Negative down-sampling rate per action.
ACTION_SAMPLE_RATE = dict(
    read_comment=5,
    like=5,
    click_avatar=5,
    forward=10,
    comment=10,
    follow=10,
    favorite=10,
)
class MyBaseModel(BaseModel):
    """BaseModel subclass overriding ``fit``, ``evaluate`` and ``predict``.

    Metric computation is best-effort in both ``fit`` and ``evaluate``: a
    metric function that raises is recorded as 0 instead of aborting the run.
    """

    def fit(self, x=None, y=None, batch_size=None, epochs=1, verbose=0, initial_epoch=0, validation_split=0.,
            validation_data=None, shuffle=True, callbacks=None):
        """Train the model and return its history callback.

        :param x: Dict keyed by feature name, or list of arrays, with the training inputs.
        :param y: Numpy array of targets.
        :param batch_size: Samples per step; defaults to 256 and is multiplied by the number of gpus when ``self.gpus`` is set.
        :param epochs: Index of the final epoch (exclusive upper bound of the epoch loop).
        :param verbose: ``1`` shows the progress bar; any value ``> 0`` computes and prints per-epoch metrics.
        :param initial_epoch: Epoch index to start from.
        :param validation_split: Fraction in (0, 1) of the data, taken from the end, held out for validation. Ignored when ``validation_data`` is given.
        :param validation_data: Tuple ``(x_val, y_val)`` or ``(x_val, y_val, val_sample_weights)``.
        :param shuffle: Whether the training DataLoader shuffles.
        :param callbacks: Optional list of callbacks; ``self.history`` is always appended.
        :return: ``self.history``.
        """
        if isinstance(x, dict):
            x = [x[feature] for feature in self.feature_index]

        do_validation = False
        if validation_data:
            do_validation = True
            if len(validation_data) == 2:
                val_x, val_y = validation_data
                val_sample_weight = None
            elif len(validation_data) == 3:
                val_x, val_y, val_sample_weight = validation_data  # pylint: disable=unpacking-non-sequence
            else:
                raise ValueError(
                    'When passing a `validation_data` argument, '
                    'it must contain either 2 items (x_val, y_val), '
                    'or 3 items (x_val, y_val, val_sample_weights), '
                    'or alternatively it could be a dataset or a '
                    'dataset or a dataset iterator. '
                    'However we received `validation_data=%s`' % validation_data)
            if isinstance(val_x, dict):
                val_x = [val_x[feature] for feature in self.feature_index]

        elif validation_split and 0. < validation_split < 1.:
            do_validation = True
            if hasattr(x[0], 'shape'):
                split_at = int(x[0].shape[0] * (1. - validation_split))
            else:
                split_at = int(len(x[0]) * (1. - validation_split))
            # Rows from split_at onward are held out for validation.
            x, val_x = (slice_arrays(x, 0, split_at),
                        slice_arrays(x, split_at))
            y, val_y = (slice_arrays(y, 0, split_at),
                        slice_arrays(y, split_at))

        else:
            val_x = []
            val_y = []

        # Give every feature a trailing axis so all of them can be
        # concatenated into one input matrix.
        for i in range(len(x)):
            if len(x[i].shape) == 1:
                x[i] = np.expand_dims(x[i], axis=1)

        train_tensor_data = Data.TensorDataset(
            torch.from_numpy(
                np.concatenate(x, axis=-1)),
            torch.from_numpy(y))
        if batch_size is None:
            batch_size = 256

        model = self.train()
        loss_func = self.loss_func
        optim = self.optim

        if self.gpus:
            print('parallel running on these gpus:', self.gpus)
            model = torch.nn.DataParallel(model, device_ids=self.gpus)
            batch_size *= len(self.gpus)  # input `batch_size` is batch_size per gpu
        else:
            print(self.device)

        train_loader = DataLoader(
            dataset=train_tensor_data, shuffle=shuffle, batch_size=batch_size)

        sample_num = len(train_tensor_data)
        steps_per_epoch = (sample_num - 1) // batch_size + 1

        # configure callbacks
        callbacks = (callbacks or []) + [self.history]  # add history callback
        callbacks = CallbackList(callbacks)
        callbacks.on_train_begin()
        callbacks.set_model(self)
        if not hasattr(callbacks, 'model'):
            callbacks.__setattr__('model', self)
        callbacks.model.stop_training = False

        # Train
        print("Train on {0} samples, validate on {1} samples, {2} steps per epoch".format(
            len(train_tensor_data), len(val_y), steps_per_epoch))
        for epoch in range(initial_epoch, epochs):
            callbacks.on_epoch_begin(epoch)
            epoch_logs = {}
            start_time = time.time()
            loss_epoch = 0
            total_loss_epoch = 0
            train_result = {}
            try:
                with tqdm(enumerate(train_loader), disable=verbose != 1) as t:
                    for _, (x_train, y_train) in t:
                        x = x_train.to(self.device).float()
                        y = y_train.to(self.device).float()

                        y_pred = model(x).squeeze()

                        optim.zero_grad()
                        loss = loss_func(y_pred, y.squeeze(), reduction='sum')
                        reg_loss = self.get_regularization_loss()

                        total_loss = loss + reg_loss + self.aux_loss

                        loss_epoch += loss.item()
                        total_loss_epoch += total_loss.item()
                        total_loss.backward()
                        optim.step()

                        if verbose > 0:
                            for name, metric_fun in self.metrics.items():
                                if name not in train_result:
                                    train_result[name] = []
                                # Best effort: a metric that raises on this
                                # batch is recorded as 0 rather than stopping
                                # the training run.
                                try:
                                    temp = metric_fun(
                                        y.cpu().data.numpy(), y_pred.cpu().data.numpy().astype("float64"))
                                except Exception:
                                    temp = 0
                                finally:
                                    train_result[name].append(temp)

            except KeyboardInterrupt:
                t.close()
                raise
            t.close()

            # Add epoch_logs
            epoch_logs["loss"] = total_loss_epoch / sample_num
            for name, result in train_result.items():
                epoch_logs[name] = np.sum(result) / steps_per_epoch

            if do_validation:
                eval_result = self.evaluate(val_x, val_y, batch_size)
                for name, result in eval_result.items():
                    epoch_logs["val_" + name] = result
            # verbose
            if verbose > 0:
                epoch_time = int(time.time() - start_time)
                print('Epoch {0}/{1}'.format(epoch + 1, epochs))

                eval_str = "{0}s - loss: {1: .4f}".format(
                    epoch_time, epoch_logs["loss"])

                for name in self.metrics:
                    eval_str += " - " + name + \
                                ": {0: .4f}".format(epoch_logs[name])

                if do_validation:
                    for name in self.metrics:
                        eval_str += " - " + "val_" + name + \
                                    ": {0: .4f}".format(epoch_logs["val_" + name])
                print(eval_str)
            callbacks.on_epoch_end(epoch, epoch_logs)
            if self.stop_training:
                break

        callbacks.on_train_end()

        return self.history

    def evaluate(self, x, y, batch_size=256):
        """
        :param x: Numpy array of test data (if the model has a single input), or list of Numpy arrays (if the model has multiple inputs).
        :param y: Numpy array of target (label) data (if the model has a single output), or list of Numpy arrays (if the model has multiple outputs).
        :param batch_size: Integer or `None`. Number of samples per evaluation step. If unspecified, `batch_size` will default to 256.
        :return: Dict contains metric names and metric values. A metric whose function raises is reported as 0.
        """
        pred_ans = self.predict(x, batch_size)
        eval_result = {}
        for name, metric_fun in self.metrics.items():
            # Store the guarded result. Recomputing the metric in a `finally`
            # clause, as before, threw the fallback away and re-raised the very
            # exception the `except` was meant to absorb.
            try:
                eval_result[name] = metric_fun(y, pred_ans)
            except Exception:
                eval_result[name] = 0
        return eval_result

    def predict(self, x, batch_size=256):
        """
        :param x: The input data, as a Numpy array (or list of Numpy arrays if the model has multiple inputs).
        :param batch_size: Integer. If unspecified, it will default to 256.
        :return: Numpy array(s) of predictions.
        """
        model = self.eval()
        if isinstance(x, dict):
            x = [x[feature] for feature in self.feature_index]
        # Give every feature a trailing axis so all of them can be
        # concatenated into one input matrix.
        for i in range(len(x)):
            if len(x[i].shape) == 1:
                x[i] = np.expand_dims(x[i], axis=1)

        tensor_data = Data.TensorDataset(
            torch.from_numpy(np.concatenate(x, axis=-1)))
        test_loader = DataLoader(
            dataset=tensor_data, shuffle=False, batch_size=batch_size)

        pred_ans = []
        with torch.no_grad():
            for _, x_test in enumerate(test_loader):
                x = x_test[0].to(self.device).float()

                y_pred = model(x).cpu().data.numpy()  # .squeeze()
                pred_ans.append(y_pred)

        return np.concatenate(pred_ans).astype("float64")
class MyDeepFM(MyBaseModel):
    """DeepFM network (linear + optional FM + optional DNN logits) on MyBaseModel."""

    def __init__(self,
                 linear_feature_columns, dnn_feature_columns, use_fm=True,
                 dnn_hidden_units=(256, 128),
                 l2_reg_linear=0.00001, l2_reg_embedding=0.00001, l2_reg_dnn=0, init_std=0.0001, seed=1024,
                 dnn_dropout=0,
                 dnn_activation='relu', dnn_use_bn=False, task='binary', device='cpu', gpus=None):
        """Create the FM and DNN parts and register the DNN weights for L2 regularization."""
        super().__init__(
            linear_feature_columns,
            dnn_feature_columns,
            l2_reg_linear=l2_reg_linear,
            l2_reg_embedding=l2_reg_embedding,
            init_std=init_std,
            seed=seed,
            task=task,
            device=device,
            gpus=gpus,
        )

        self.use_fm = use_fm
        # The deep part needs both input columns and at least one hidden layer.
        has_dnn_inputs = len(dnn_feature_columns) > 0
        has_hidden_layers = len(dnn_hidden_units) > 0
        self.use_dnn = has_dnn_inputs and has_hidden_layers

        if use_fm:
            self.fm = FM()

        if self.use_dnn:
            dnn_input_dim = self.compute_input_dim(dnn_feature_columns)
            self.dnn = DNN(
                dnn_input_dim,
                dnn_hidden_units,
                activation=dnn_activation,
                l2_reg=l2_reg_dnn,
                dropout_rate=dnn_dropout,
                use_bn=dnn_use_bn,
                init_std=init_std,
                device=device,
            )
            self.dnn_linear = nn.Linear(dnn_hidden_units[-1], 1, bias=False).to(device)

            # Regularize the DNN weight tensors, leaving out batch-norm
            # parameters.
            dnn_weights = [
                (param_name, param)
                for param_name, param in self.dnn.named_parameters()
                if 'weight' in param_name and 'bn' not in param_name
            ]
            self.add_regularization_weight(dnn_weights, l2=l2_reg_dnn)
            self.add_regularization_weight(self.dnn_linear.weight, l2=l2_reg_dnn)

        self.to(device)

    def forward(self, X):
        """Return the prediction for input matrix ``X`` from the summed logits."""
        sparse_embedding_list, dense_value_list = self.input_from_feature_columns(
            X, self.dnn_feature_columns, self.embedding_dict)

        logit = self.linear_model(X)

        # The FM term needs at least one sparse embedding.
        if self.use_fm and len(sparse_embedding_list) > 0:
            logit += self.fm(torch.cat(sparse_embedding_list, dim=1))

        if self.use_dnn:
            deep_in = combined_dnn_input(sparse_embedding_list, dense_value_list)
            logit += self.dnn_linear(self.dnn(deep_in))

        return self.out(logit)
def process_ordered_embed(action, end):
    """Build the top-ranked embedding components for one action.

    Reads FEED_EMBEDDINGS and, for every feed, keeps the components named by
    the first ``end`` labels of the module-level ``embedorder[action]``
    ranking, in ranking order.

    Args:
        action: Key into ``embedorder`` selecting the ranking to use.
        end: Number of top-ranked components to keep.

    Returns:
        DataFrame with the non-embedding columns of the CSV followed by
        ``emorder0`` .. ``emorder{end - 1}``. Rows whose embedding is missing
        or blank get zeros.
    """
    train = pd.read_csv(FEED_EMBEDDINGS)
    # Ranking labels have the form "embed<k>"; strip the prefix to get the
    # component position. This does not depend on the row, so resolve it once
    # instead of once per feed.
    prefix_len = len("embed")
    picked = [int(embedorder[action].index[k][prefix_len:]) for k in range(end)]
    feed_embed_array = np.zeros((train.shape[0], end))
    for row, raw in enumerate(tqdm(train['feed_embedding'].to_numpy())):
        # NaN never compares equal to anything, so `raw != np.nan` is always
        # True; pd.isna is the reliable missing-value test.
        if pd.isna(raw):
            continue
        tokens = str(raw).split()
        if not tokens:
            continue
        feed_embed_array[row] = [float(tokens[col]) for col in picked]
    temp = pd.DataFrame(feed_embed_array,
                        columns=[f"emorder{k}" for k in range(end)],
                        index=train.index)
    return pd.concat((train.drop(columns='feed_embedding'), temp), axis=1)
# Submission skeleton: the (userid, feedid) pairs of the prepared test table.
submit = pd.read_csv(ROOT_PATH + '/test_data.csv')[['userid', 'feedid']]
for action in ACTION_LIST:
    target = [action]
    USE_FEAT = ['userid', 'feedid', action] + FEA_FEED_LIST[1:]

    # Shuffled training rows for this action.
    train = pd.read_csv(ROOT_PATH + f'/train_data_for_{action}.csv')[USE_FEAT]
    train = train.sample(frac=1, random_state=42).reset_index(drop=True)
    print("posi prop:")
    print((train[action] == 1).sum() / train.shape[0])

    # Test rows: every feature except the label and the emorder* columns,
    # plus a placeholder label so the column layout matches the training frame.
    test_cols = [col for col in USE_FEAT if col != action and not col.startswith('emorder')]
    test = pd.read_csv(ROOT_PATH + '/test_data.csv')[test_cols]
    test[action] = 0
    test = test[USE_FEAT[:-8]]
    # Attach the eight ranked embedding columns for this action.
    embded = process_ordered_embed(action, 8)
    test = test.merge(embded, on='feedid', how='left')

    data = pd.concat((train, test)).reset_index(drop=True)
    dense_features = ['videoplayseconds'] + ['emorder%d' % k for k in range(8)]
    sparse_features = [col for col in USE_FEAT if col not in dense_features and col not in target]

    data[sparse_features] = data[sparse_features].fillna(0)
    data[dense_features] = data[dense_features].fillna(0)

    # 1. Label-encode the sparse features and min-max scale the dense ones.
    for feat in sparse_features:
        data[feat] = LabelEncoder().fit_transform(data[feat])
    mms = MinMaxScaler(feature_range=(0, 1))
    data[dense_features] = mms.fit_transform(data[dense_features])

    # 2. Describe each field: vocabulary size for sparse ones, width 1 for
    #    dense ones.
    sparse_columns = [SparseFeat(feat, data[feat].nunique()) for feat in sparse_features]
    dense_columns = [DenseFeat(feat, 1) for feat in dense_features]
    fixlen_feature_columns = sparse_columns + dense_columns
    dnn_feature_columns = fixlen_feature_columns
    linear_feature_columns = fixlen_feature_columns
    feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

    # 3. Split the combined frame back into train and test model inputs.
    n_train = train.shape[0]
    train = data.iloc[:n_train].reset_index(drop=True)
    test = data.iloc[n_train:].reset_index(drop=True)
    train_model_input = {name: train[name] for name in feature_names}
    test_model_input = {name: test[name] for name in feature_names}

    # 4. Define the model, train it and predict.
    use_cuda = True
    device = 'cpu'
    if use_cuda and torch.cuda.is_available():
        print('cuda ready...')
        device = 'cuda:0'

    model = MyDeepFM(
        linear_feature_columns=linear_feature_columns,
        dnn_feature_columns=dnn_feature_columns,
        task='binary',
        l2_reg_embedding=1e-1,
        device=device,
    )
    model.compile("adagrad", "binary_crossentropy", metrics=["binary_crossentropy", "auc"])

    history = model.fit(
        train_model_input,
        train[target].values,
        batch_size=512,
        epochs=5,
        verbose=1,
        validation_split=0.2,
    )
    pred_ans = model.predict(test_model_input, 128)
    submit[action] = pred_ans
    torch.cuda.empty_cache()

# Save the submission file.
submit.to_csv(ROOT_PATH + "/submit_B_stage.csv", index=False)
运行输出:
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
posi prop:
0.15565121499983292
100%|██████████| 106444/106444 [00:14<00:00, 7261.22it/s]
cuda ready...
0it [00:00, ?it/s]cuda:0
Train on 1316708 samples, validate on 329177 samples, 2572 steps per epoch
2572it [00:24, 105.72it/s]
1it [00:00, 9.22it/s]Epoch 1/5
26s - loss: 0.2517 - binary_crossentropy: 0.2360 - auc: 0.9215 - val_binary_crossentropy: 0.2162 - val_auc: 0.9391
2572it [00:23, 107.59it/s]
1it [00:00, 9.56it/s]Epoch 2/5
26s - loss: 0.2214 - binary_crossentropy: 0.2075 - auc: 0.9456 - val_binary_crossentropy: 0.2142 - val_auc: 0.9408
2572it [00:23, 107.79it/s]
2it [00:00, 18.41it/s]Epoch 3/5
26s - loss: 0.2141 - binary_crossentropy: 0.2019 - auc: 0.9491 - val_binary_crossentropy: 0.2146 - val_auc: 0.9407
2572it [00:24, 106.96it/s]
2it [00:00, 18.45it/s]Epoch 4/5
26s - loss: 0.2096 - binary_crossentropy: 0.1983 - auc: 0.9513 - val_binary_crossentropy: 0.2156 - val_auc: 0.9404
2572it [00:24, 106.54it/s]
Epoch 5/5
26s - loss: 0.2064 - binary_crossentropy: 0.1958 - auc: 0.9526 - val_binary_crossentropy: 0.2162 - val_auc: 0.9401
posi prop:
0.11860030306914048
100%|██████████| 106444/106444 [00:15<00:00, 6793.95it/s]
0it [00:00, ?it/s]cuda ready...
cuda:0
Train on 1273372 samples, validate on 318344 samples, 2488 steps per epoch
2488it [00:23, 106.72it/s]
2it [00:00, 18.72it/s]Epoch 1/5
25s - loss: 0.2801 - binary_crossentropy: 0.2647 - auc: 0.8423 - val_binary_crossentropy: 0.2481 - val_auc: 0.8681
2488it [00:23, 106.30it/s]
1it [00:00, 8.92it/s]Epoch 2/5
25s - loss: 0.2551 - binary_crossentropy: 0.2410 - auc: 0.8795 - val_binary_crossentropy: 0.2465 - val_auc: 0.8694
2488it [00:23, 106.01it/s]
2it [00:00, 18.83it/s]Epoch 3/5
25s - loss: 0.2484 - binary_crossentropy: 0.2360 - auc: 0.8866 - val_binary_crossentropy: 0.2466 - val_auc: 0.8691
2488it [00:23, 107.46it/s]
2it [00:00, 18.85it/s]Epoch 4/5
25s - loss: 0.2442 - binary_crossentropy: 0.2328 - auc: 0.8910 - val_binary_crossentropy: 0.2475 - val_auc: 0.8684
2488it [00:23, 107.84it/s]
Epoch 5/5
25s - loss: 0.2411 - binary_crossentropy: 0.2303 - auc: 0.8940 - val_binary_crossentropy: 0.2485 - val_auc: 0.8675
posi prop:
0.0371053151111731
100%|██████████| 106444/106444 [00:15<00:00, 6728.08it/s]
cuda ready...
2it [00:00, 19.33it/s]cuda:0
Train on 1187301 samples, validate on 296826 samples, 2319 steps per epoch
2319it [00:21, 106.98it/s]
2it [00:00, 18.47it/s]Epoch 1/5
23s - loss: 0.1372 - binary_crossentropy: 0.1269 - auc: 0.8246 - val_binary_crossentropy: 0.1166 - val_auc: 0.8670
2319it [00:21, 106.16it/s]
1it [00:00, 9.70it/s]Epoch 2/5
23s - loss: 0.1187 - binary_crossentropy: 0.1090 - auc: 0.8966 - val_binary_crossentropy: 0.1156 - val_auc: 0.8716
2319it [00:21, 106.86it/s]
2it [00:00, 19.60it/s]Epoch 3/5
23s - loss: 0.1126 - binary_crossentropy: 0.1041 - auc: 0.9117 - val_binary_crossentropy: 0.1163 - val_auc: 0.8710
2319it [00:21, 106.54it/s]
2it [00:00, 19.29it/s]Epoch 4/5
23s - loss: 0.1088 - binary_crossentropy: 0.1010 - auc: 0.9207 - val_binary_crossentropy: 0.1173 - val_auc: 0.8698
2319it [00:21, 106.67it/s]
Epoch 5/5
23s - loss: 0.1060 - binary_crossentropy: 0.0988 - auc: 0.9267 - val_binary_crossentropy: 0.1181 - val_auc: 0.8688
posi prop:
0.03752907526887493
100%|██████████| 106444/106444 [00:15<00:00, 7008.23it/s]
0it [00:00, ?it/s]cuda ready...
cuda:0
Train on 596039 samples, validate on 149010 samples, 1165 steps per epoch
1165it [00:10, 108.27it/s]
7it [00:00, 66.80it/s]Epoch 1/5
11s - loss: 0.1340 - binary_crossentropy: 0.1240 - auc: 0.8458 - val_binary_crossentropy: 0.1082 - val_auc: 0.8957
1165it [00:10, 108.65it/s]
6it [00:00, 57.95it/s]Epoch 2/5
11s - loss: 0.1073 - binary_crossentropy: 0.0962 - auc: 0.9330 - val_binary_crossentropy: 0.1070 - val_auc: 0.9014
1165it [00:10, 108.07it/s]
7it [00:00, 66.89it/s]Epoch 3/5
11s - loss: 0.0980 - binary_crossentropy: 0.0881 - auc: 0.9498 - val_binary_crossentropy: 0.1089 - val_auc: 0.9000
1165it [00:11, 105.74it/s]
7it [00:00, 67.40it/s]Epoch 4/5
12s - loss: 0.0919 - binary_crossentropy: 0.0827 - auc: 0.9573 - val_binary_crossentropy: 0.1122 - val_auc: 0.8977
1165it [00:10, 106.49it/s]
Epoch 5/5
11s - loss: 0.0878 - binary_crossentropy: 0.0793 - auc: 0.9637 - val_binary_crossentropy: 0.1151 - val_auc: 0.8962