import os
import pandas as pd
import numpy as np
from tqdm import tqdm
from tqdm.autonotebook import *
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
import time
import datetime
from scipy.sparse import hstack
from sklearn.model_selection import StratifiedKFold
import re
from keras.layers import *
from keras.models import *
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing import text, sequence
from keras.callbacks import *
from keras.layers.advanced_activations import LeakyReLU, PReLU
import keras.backend as K
from keras.optimizers import *
from keras.utils import to_categorical
import tensorflow as tf
import random as rn
import gc
import logging
import gensim
import warnings
warnings.filterwarnings('ignore')
action_data = pd.read_csv('./data/my_data.csv')
action_data.head()
   user_log_acct  item_sku_id          action_time  action_type  brand_code  shop_id  item_third_cate_cd  vender_id  shop_score  age  sex  user_level  province   city  county
0         937922       357022  2020-02-04 08:28:15            1      1791.0   8703.0                10.0     5227.0   -1.000000  5.0  1.0           5      11.0  348.0  1782.0
1         937922           73  2020-02-04 08:27:07            1      1791.0   8703.0                10.0     5227.0   -1.000000  5.0  1.0           5      11.0  348.0  1782.0
2         937922        29583  2020-02-04 08:26:31            1      1791.0   2738.0                10.0     3436.0    9.206167  5.0  1.0           5      11.0  348.0  1782.0
3         937922       108763  2020-02-04 08:26:10            1      1791.0   2738.0                10.0     3436.0    9.206167  5.0  1.0           5      11.0  348.0  1782.0
4        1369473       331139  2020-02-03 21:55:49            1      9985.0   6367.0                73.0     3666.0    0.000000  5.0  1.0           5       1.0   41.0  2058.0
action_data.shape
(37214269, 15)
Data preprocessing
# action_time occasionally carries trailing characters; keep only the first
# 19 chars ("YYYY-MM-DD HH:MM:SS") before parsing.
action_data['dd_len'] = action_data['action_time'].apply(lambda x: len(str(x)))  # length check, left unused
action_data['action_time'] = action_data['action_time'].apply(lambda x: x[:19])
del action_data['dd_len']
action_data['action_time'] = pd.to_datetime(action_data['action_time'])
action_data = action_data.sort_values('action_time')
action_data['month'] = action_data['action_time'].dt.month
action_data['day'] = action_data['action_time'].dt.day
# Encode each date as a single sortable integer, e.g. April 11 -> 411.
action_data['month_day'] = action_data['month'].values * 100 + action_data['day'].values
Train/validation split
def _label_trans(x, dic_):
    # Look up the user-item key; pairs with no purchase in the label window get 0.
    return dic_.get(x, 0)

def get_label(df, label_st=(4, 11), label_en=(4, 15), candidate_st=(4, 6), candidate_en=(4, 10), fea_en=(4, 10)):
    lb_st = df.loc[(df['month'] == label_st[0]) & (df['day'] == label_st[1]), 'month_day'].values[0]
    lb_en = df.loc[(df['month'] == label_en[0]) & (df['day'] == label_en[1]), 'month_day'].values[0]
    cand_st = df.loc[(df['month'] == candidate_st[0]) & (df['day'] == candidate_st[1]), 'month_day'].values[0]
    cand_en = df.loc[(df['month'] == candidate_en[0]) & (df['day'] == candidate_en[1]), 'month_day'].values[0]
    fea_position = df.loc[(df['month'] == fea_en[0]) & (df['day'] == fea_en[1]), 'month_day'].values[0]
    # Purchases (action_type == 2) inside the label window define the positives.
    ind_label = (df['month_day'] >= lb_st) & (df['month_day'] <= lb_en) & (df['action_type'] == 2)
    ind_candidate = (df['month_day'] >= cand_st) & (df['month_day'] <= cand_en)
    ind_fea = (df['month_day'] <= fea_position)
    data_label = df.loc[ind_label].copy()
    data_fea = df.loc[ind_fea].copy()
    data_candidates = df.loc[ind_candidate].copy()
    # Candidate set: distinct user-item pairs seen in the candidate window.
    df_candidates = data_candidates[['user_log_acct', 'item_sku_id']].copy()
    df_candidates = df_candidates.drop_duplicates(subset=['user_log_acct', 'item_sku_id'])
    df_candidates = df_candidates.loc[df_candidates.item_sku_id.notnull()]
    label = data_label[['user_log_acct', 'item_sku_id', 'day']].copy()
    print('get label')
    df_candidates['label_cnt'] = 0
    df_candidates['label_days'] = 0
    df_candidates['user_item'] = df_candidates['user_log_acct'].astype(str) + '_' + df_candidates['item_sku_id'].astype(str)
    label['user_item'] = label['user_log_acct'].astype(str) + '_' + label['item_sku_id'].astype(str)
    dic_cnt = label['user_item'].value_counts().to_dict()
    dic_days = label.groupby('user_item')['day'].nunique().to_dict()
    df_candidates['label_cnt'] = df_candidates['user_item'].apply(lambda x: _label_trans(x, dic_cnt)).values
    df_candidates['label_days'] = df_candidates['user_item'].apply(lambda x: _label_trans(x, dic_days)).values
    return df_candidates, data_fea
%%time
df_valid_label, data_valid_fea = get_label(action_data, label_st=(4, 11), label_en=(4, 15), candidate_st=(4, 6), candidate_en=(4, 10), fea_en=(4, 10))
get label
CPU times: user 5.74 s, sys: 844 ms, total: 6.58 s
Wall time: 6.58 s
%%time
df_train_label1, data_train_fea1 = get_label(action_data, label_st=(4, 6), label_en=(4, 10), candidate_st=(4, 1), candidate_en=(4, 5), fea_en=(4, 5))
get label
CPU times: user 4.88 s, sys: 681 ms, total: 5.57 s
Wall time: 5.56 s
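The two calls carve the same sliding-window scheme out of one log: the validation set labels purchases made on 4/11-4/15 against candidate user-item pairs seen on 4/6-4/10 (features built from everything up to 4/10), while the training set shifts the whole window back five days, labeling 4/6-4/10 against candidates from 4/1-4/5 (features up to 4/5).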
df_train_label1.head()
          user_log_acct  item_sku_id  label_cnt  label_days       user_item
34296301        1144603       153700          0           0  1144603_153700
1415203         1129253       327893          0           0  1129253_327893
3960663          736788       201003          0           0   736788_201003
5158969          109461       256490          0           0   109461_256490
7377193          470525       142823          0           0   470525_142823
Feature construction
Raw features
my_user = action_data[['user_log_acct', 'age', 'sex', 'user_level', 'province', 'city', 'county']].drop_duplicates(['user_log_acct'], keep='first')
my_item = action_data[['item_sku_id', 'brand_code', 'shop_id', 'item_third_cate_cd', 'vender_id', 'shop_score']].drop_duplicates(['item_sku_id'], keep='first')
User features
def gen_action_freq_feats(df, start_date):
    key = ['user_log_acct']
    action = df[key + ['action_type', 'action_time']].copy()
    feats = pd.DataFrame(action[key].drop_duplicates())
    for w in tqdm([1, 3, 5, 7, 15, 30]):
        bef_start_date = start_date - datetime.timedelta(days=w)
        action_cl = action[action['action_time'] >= bef_start_date].copy()
        # One-hot the action type, then sum per user to get per-type counts in the window.
        dummies = pd.get_dummies(action_cl['action_type'], prefix='_'.join(key) + '_last{}_days_action'.format(w))
        action_cl = pd.concat([action_cl, dummies], axis=1)
        action_cl = action_cl.groupby(key, as_index=False).sum()
        # Ratios of action type 2 to each other type, smoothed by +1.
        action_cl['_'.join(key) + '_last{}_days_action_1_rt'.format(w)] = action_cl['_'.join(key) + '_last{}_days_action_2'.format(w)] / (1 + action_cl['_'.join(key) + '_last{}_days_action_1'.format(w)])
        action_cl['_'.join(key) + '_last{}_days_action_3_rt'.format(w)] = action_cl['_'.join(key) + '_last{}_days_action_2'.format(w)] / (1 + action_cl['_'.join(key) + '_last{}_days_action_3'.format(w)])
        action_cl['_'.join(key) + '_last{}_days_action_4_rt'.format(w)] = action_cl['_'.join(key) + '_last{}_days_action_2'.format(w)] / (1 + action_cl['_'.join(key) + '_last{}_days_action_4'.format(w)])
        del action_cl['action_type']
        feats = feats.merge(action_cl, on=key, how='left')
    return feats
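Every ratio uses the type-2 count as its numerator, consistent with get_label above, which treats action_type == 2 as the purchase/label action; the +1 in each denominator simply guards against division by zero.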
u_fea_train1 = gen_action_freq_feats(data_train_fea1, datetime.datetime(2020, 4, 5))
u_fea_val1 = gen_action_freq_feats(data_valid_fea, datetime.datetime(2020, 4, 10))
Merging the feature sets
u_fea_cols1 = [col for col in u_fea_train1.columns if col not in ['user_log_acct']]
u_fea_cols2 = [col for col in my_user.columns if col not in ['user_log_acct']]
i_fea_cols = [col for col in my_item.columns if col not in ['item_sku_id']]
train_cols = ['user_log_acct', 'item_sku_id'] + u_fea_cols1 + u_fea_cols2 + i_fea_cols
Training & validation sets
df_train = df_train_label1.merge(u_fea_train1, on='user_log_acct', how='left')
df_train = df_train.merge(my_user, on='user_log_acct', how='left')
df_train = df_train.merge(my_item, on='item_sku_id', how='left')
# Binary target: did the candidate pair get purchased in the label window?
df_train['label'] = (df_train['label_cnt'] > 0).astype(int)

df_val = df_valid_label.merge(u_fea_val1, on='user_log_acct', how='left')
df_val = df_val.merge(my_user, on='user_log_acct', how='left')
df_val = df_val.merge(my_item, on='item_sku_id', how='left')
df_val['label'] = (df_val['label_cnt'] > 0).astype(int)
def set_tokenizer(docs, split_char=' ', max_len=100):
    '''
    Input
        docs: list of texts
        split_char: character to split on
        max_len: maximum sequence length to keep
    Output
        X: the serialized, padded sequences
        word_index: mapping from token to integer index
    '''
    tokenizer = Tokenizer(lower=False, char_level=False, split=split_char)
    tokenizer.fit_on_texts(docs)
    X = tokenizer.texts_to_sequences(docs)
    X = pad_sequences(X, maxlen=max_len, value=0)
    word_index = tokenizer.word_index
    return X, word_index
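As a minimal sketch of what the helper returns (a hypothetical two-document input, not from the notebook):

X, widx = set_tokenizer(['1,2,3', '2,4'], split_char=',', max_len=5)
# X is a (2, 5) int array, pre-padded with zeros on the left; widx maps each
# token string to a 1-based index ordered by frequency, e.g. {'2': 1, '1': 2, '3': 3, '4': 4}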
valid_item_seq = data_valid_fea.groupby(['user_log_acct'])['item_sku_id'].agg(list).reset_index()
valid_item_seq.columns = ['user_log_acct', 'item_seq']
df_val = df_val.merge(valid_item_seq, on='user_log_acct', how='left')

train_item_seq = data_train_fea1.groupby(['user_log_acct'])['item_sku_id'].agg(list).reset_index()
train_item_seq.columns = ['user_log_acct', 'item_seq']
df_train = df_train.merge(train_item_seq, on='user_log_acct', how='left')

df_data = pd.concat([df_train[['item_seq']], df_val[['item_seq']]], axis=0, ignore_index=True)
df_data['item_seq'] = df_data['item_seq'].apply(lambda x: ','.join([str(i) for i in x]))
text_1_list = list(df_data['item_seq'])
print('start tokenizing')
x1, index_1 = set_tokenizer(text_1_list, split_char=',', max_len=20)
print('tokenizing done')
gc.collect()
start tokenizing
tokenizing done
0
sparse_col = ['age', 'sex', 'user_level', 'province', 'city', 'county', 'brand_code', 'shop_id', 'item_third_cate_cd', 'vender_id']
sparse_col += ['item_sku_id']
rest_col = ['user_log_acct', 'label_cnt', 'label_days', 'user_item', 'item_seq', 'label']
dense_cols = []
for i in df_train.columns:
    if df_train[i].dtype in ['float64', 'int64'] and i not in sparse_col and i not in rest_col:
        dense_cols.append(i)
from sklearn.preprocessing import StandardScaler

ss = StandardScaler()
df_data = pd.concat([df_train[dense_cols], df_val[dense_cols]], axis=0, ignore_index=True)
df_data = df_data.fillna(0)
ss.fit(df_data)
dense_feature = ss.transform(df_data)
dense_feature_input = dense_feature.shape[1]

train_input_1 = x1[:df_train.shape[0]]
test_input_1 = x1[df_train.shape[0]:]
train_input_2 = dense_feature[:df_train.shape[0]]
test_input_2 = dense_feature[df_train.shape[0]:]
train_label = df_train['label']
test_label = df_val['label']
def process_sparse_feats(data, cols):
    # Fill missing values, then label-encode each sparse column to consecutive ints.
    d = data.copy()
    for f in cols:
        d[f] = d[f].fillna('-1').astype(str)
        label_encoder = LabelEncoder()
        d[f] = label_encoder.fit_transform(d[f])
    return d

sparse_feature = pd.concat([df_train[sparse_col], df_val[sparse_col]], axis=0, ignore_index=True)
sparse_feature = process_sparse_feats(sparse_feature, sparse_col[:-1])
# item_sku_id is mapped through the tokenizer's word_index so it shares the
# item-sequence vocabulary instead of getting its own encoding.
sparse_feature['item_sku_id'] = sparse_feature['item_sku_id'].astype(str).map(index_1)
train_input_3 = sparse_feature[:df_train.shape[0]].values
test_input_3 = sparse_feature[df_train.shape[0]:].values
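Both the StandardScaler above and these LabelEncoders are fit on the concatenated train + validation frames, so the two splits share one scaling and one id space. This is only a feature-space convenience (no label information is involved), though it does assume the validation categories are available at encoding time.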
Data format conversion
train_sparse_x1 = [np.array([np.array(i) for i in train_input_1])]
train_dense_x = [i for i in train_input_2.T]
train_sparse_x2 = [i for i in train_input_3.T]
train_label = [train_label.values]

test_sparse_x1 = [np.array([np.array(i) for i in test_input_1])]
test_dense_x = [i for i in test_input_2.T]
test_sparse_x2 = [i for i in test_input_3.T]
test_label = [test_label.values]

train_sparse_x = train_sparse_x2 + train_sparse_x1
test_sparse_x = test_sparse_x2 + test_sparse_x1

# Vocabulary sizes for the embedding layers.
numss = {}
for col in sparse_col:
    numss[col] = sparse_feature[col].max() + 30
numss['item_seq'] = numss['item_sku_id']
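The max() + 30 sets each embedding table's vocabulary a little above the largest encoded id; presumably the slack is headroom for ids not seen when the encoders were fit, since any index at or beyond the table size would fail at lookup time.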
DeepFM
def deepfm_model(sparse_columns, dense_columns):
    # --- first- and second-order parts for the sparse features ---
    sparse_input = []
    lr_embedding = []
    fm_embedding = []
    for col in sparse_columns:
        _input = Input(shape=(1,))
        sparse_input.append(_input)
        nums = numss[col]
        # 1-dim embedding acts as the per-feature linear (LR) weight.
        embed = Flatten()(Embedding(nums, 1, embeddings_regularizer=tf.keras.regularizers.l2(0.5))(_input))
        lr_embedding.append(embed)
        # 10-dim embedding feeds the FM pairwise term and the deep part.
        embed = Embedding(nums, 10, input_length=1, embeddings_regularizer=tf.keras.regularizers.l2(0.5))(_input)
        reshape = Reshape((10,))(embed)
        fm_embedding.append(reshape)

    # FM second-order term via the identity 0.5 * ((sum v)^2 - sum v^2).
    fm_square = Lambda(lambda x: K.square(x))(Add()(fm_embedding))
    square_fm = Add()([Lambda(lambda x: K.square(x))(embed) for embed in fm_embedding])
    snd_order_sparse_layer = subtract([fm_square, square_fm])
    snd_order_sparse_layer = Lambda(lambda x: x * 0.5)(snd_order_sparse_layer)

    # --- linear part for the dense features ---
    dense_input = []
    for col in dense_columns:
        _input = Input(shape=(1,))
        dense_input.append(_input)
    concat_dense_input = concatenate(dense_input)
    fst_order_dense_layer = Dense(4, activation='relu')(concat_dense_input)
    fst_order_sparse_layer = concatenate(lr_embedding)
    linear_part = concatenate([fst_order_dense_layer, fst_order_sparse_layer])

    # --- deep part over the concatenated FM embeddings ---
    concat_fm_embedding = concatenate(fm_embedding, axis=-1)
    fc_layer = Dropout(0.2)(Activation(activation='relu')(BatchNormalization()(Dense(128)(concat_fm_embedding))))
    fc_layer = Dropout(0.2)(Activation(activation='relu')(BatchNormalization()(Dense(64)(fc_layer))))
    fc_layer = Dropout(0.2)(Activation(activation='relu')(BatchNormalization()(Dense(32)(fc_layer))))

    output_layer = concatenate([linear_part, snd_order_sparse_layer, fc_layer])
    output_layer = Dense(1, activation='sigmoid')(output_layer)
    model = Model(inputs=sparse_input + dense_input, outputs=output_layer)
    return model
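The Lambda/Add/subtract block implements the standard FM trick: the sum of all pairwise embedding products equals half the difference between the square of the sum and the sum of the squares. A quick standalone check of that identity (a sketch, not part of the notebook):

import numpy as np

v = np.random.randn(5, 10)  # 5 fields with 10-dim embeddings, as in the model
lhs = 0.5 * (v.sum(axis=0) ** 2 - (v ** 2).sum(axis=0))
rhs = sum(v[i] * v[j] for i in range(5) for j in range(i + 1, 5))
assert np.allclose(lhs, rhs)  # pairwise-interaction vector matches elementwise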
model = deepfm_model(sparse_col, dense_cols)
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['binary_crossentropy', tf.keras.metrics.AUC(name='auc')])
from keras.callbacks import *

file_path = 'deepfm_model.h5'
earlystopping = EarlyStopping(monitor='val_auc', patience=5, mode='max')
plateau = ReduceLROnPlateau(monitor='val_auc', verbose=1, mode='max', factor=0.5, patience=3)
checkpoint = ModelCheckpoint(file_path, monitor='val_auc', save_weights_only=True, verbose=1, save_best_only=True, mode='max')
callbacks_list = [earlystopping, checkpoint, plateau]
hist = model.fit(train_sparse_x2 + train_dense_x,
                 train_label,
                 batch_size=4096,
                 epochs=10,
                 validation_data=(test_sparse_x2 + test_dense_x, test_label),
                 callbacks=callbacks_list,
                 shuffle=False)
Epoch 1/10
401/401 [==============================] - 17s 37ms/step - loss: 227.0156 - binary_crossentropy: 0.2611 - auc: 0.4573 - val_loss: 0.0669 - val_binary_crossentropy: 0.0526 - val_auc: 0.4837
Epoch 00001: val_auc improved from -inf to 0.48368, saving model to deepfm_model.h5
Epoch 2/10
401/401 [==============================] - 14s 35ms/step - loss: 0.0443 - binary_crossentropy: 0.0383 - auc: 0.5046 - val_loss: 0.0568 - val_binary_crossentropy: 0.0368 - val_auc: 0.7002
Epoch 00002: val_auc improved from 0.48368 to 0.70018, saving model to deepfm_model.h5
Epoch 3/10
401/401 [==============================] - 14s 35ms/step - loss: 0.0413 - binary_crossentropy: 0.0333 - auc: 0.5932 - val_loss: 0.0516 - val_binary_crossentropy: 0.0349 - val_auc: 0.7352
Epoch 00003: val_auc improved from 0.70018 to 0.73523, saving model to deepfm_model.h5
Epoch 4/10
401/401 [==============================] - 14s 36ms/step - loss: 0.0389 - binary_crossentropy: 0.0309 - auc: 0.6606 - val_loss: 0.0467 - val_binary_crossentropy: 0.0326 - val_auc: 0.7665
Epoch 00004: val_auc improved from 0.73523 to 0.76648, saving model to deepfm_model.h5
Epoch 5/10
401/401 [==============================] - 14s 35ms/step - loss: 0.0381 - binary_crossentropy: 0.0296 - auc: 0.6905 - val_loss: 0.0548 - val_binary_crossentropy: 0.0326 - val_auc: 0.7862
Epoch 00005: val_auc improved from 0.76648 to 0.78620, saving model to deepfm_model.h5
Epoch 6/10
401/401 [==============================] - 14s 35ms/step - loss: 0.0388 - binary_crossentropy: 0.0288 - auc: 0.7164 - val_loss: 0.0476 - val_binary_crossentropy: 0.0316 - val_auc: 0.7986
Epoch 00006: val_auc improved from 0.78620 to 0.79863, saving model to deepfm_model.h5
Epoch 7/10
401/401 [==============================] - 14s 36ms/step - loss: 0.0384 - binary_crossentropy: 0.0283 - auc: 0.7240 - val_loss: 0.0492 - val_binary_crossentropy: 0.0320 - val_auc: 0.8047
Epoch 00007: val_auc improved from 0.79863 to 0.80470, saving model to deepfm_model.h5
Epoch 8/10
401/401 [==============================] - 14s 36ms/step - loss: 0.0389 - binary_crossentropy: 0.0281 - auc: 0.7281 - val_loss: 0.0538 - val_binary_crossentropy: 0.0318 - val_auc: 0.8079
Epoch 00008: val_auc improved from 0.80470 to 0.80785, saving model to deepfm_model.h5
Epoch 9/10
401/401 [==============================] - 14s 35ms/step - loss: 0.0384 - binary_crossentropy: 0.0278 - auc: 0.7376 - val_loss: 0.0527 - val_binary_crossentropy: 0.0318 - val_auc: 0.8137
Epoch 00009: val_auc improved from 0.80785 to 0.81366, saving model to deepfm_model.h5
Epoch 10/10
401/401 [==============================] - 14s 35ms/step - loss: 0.0387 - binary_crossentropy: 0.0277 - auc: 0.7400 - val_loss: 0.0528 - val_binary_crossentropy: 0.0311 - val_auc: 0.8130
Epoch 00010: val_auc did not improve from 0.81366
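Since ModelCheckpoint saved only the best epoch by val_auc (0.81366, at epoch 9), the strongest weights can be restored before any downstream evaluation, using the standard Keras API:

model.load_weights(file_path)  # reload the best checkpoint from deepfm_model.h5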