import os
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn. decomposition import LatentDirichletAllocation
from sklearn. metrics import accuracy_score
import time
import datetime
from scipy. sparse import hstack
from sklearn. model_selection import StratifiedKFold
import re
from keras. layers import *
from tensorflow. keras. models import *
from tensorflow. keras. preprocessing. text import Tokenizer, text_to_word_sequence
from tensorflow. keras. preprocessing. sequence import pad_sequences
from tensorflow. keras. preprocessing import text, sequence
from tensorflow. keras. callbacks import *
from keras. layers. advanced_activations import LeakyReLU, PReLU
import tensorflow. keras. backend as K
from keras. optimizers import *
from tensorflow. keras. utils import to_categorical
from tensorflow. keras. backend import cast
import tensorflow as tf
import random as rn
import gc
import logging
import gensim
np. random. seed( 1024 )
rn. seed( 1024 )
import warnings
warnings. filterwarnings( 'ignore' )
/home/frank/miniconda3/envs/reco2/lib/python3.7/site-packages/gensim/similarities/__init__.py:15: UserWarning: The gensim.similarities.levenshtein submodule is disabled, because the optional Levenshtein package <https://pypi.org/project/python-Levenshtein/> is unavailable. Install Levenhstein (e.g. `pip install python-Levenshtein`) to suppress this warning.
warnings.warn(msg)
# Load the raw action log from CSV; all later splits and features are
# derived from this frame.
action_data = pd. read_csv( './data/my_data.csv' )
# Notebook display: preview the first rows.
action_data. head( )
user_log_acct item_sku_id action_time action_type brand_code shop_id item_third_cate_cd vender_id shop_score age sex user_level province city county 0 937922 357022 2020-02-04 08:28:15 1 1791.0 8703.0 10.0 5227.0 -1.000000 5.0 1.0 5 11.0 348.0 1782.0 1 937922 73 2020-02-04 08:27:07 1 1791.0 8703.0 10.0 5227.0 -1.000000 5.0 1.0 5 11.0 348.0 1782.0 2 937922 29583 2020-02-04 08:26:31 1 1791.0 2738.0 10.0 3436.0 9.206167 5.0 1.0 5 11.0 348.0 1782.0 3 937922 108763 2020-02-04 08:26:10 1 1791.0 2738.0 10.0 3436.0 9.206167 5.0 1.0 5 11.0 348.0 1782.0 4 1369473 331139 2020-02-03 21:55:49 1 9985.0 6367.0 73.0 3666.0 0.000000 5.0 1.0 5 1.0 41.0 2058.0
# Notebook display: (row count, column count) of the loaded log.
action_data. shape
(37214269, 15)
数据预处理
# Normalise the timestamp column and derive the calendar keys used for the
# train/validation windowing below.
#
# Only the first 19 characters of each timestamp string are kept before
# parsing (presumably "YYYY-MM-DD HH:MM:SS"; some rows may carry a longer
# suffix -- TODO confirm against the raw file).
# The former temporary `dd_len` column was computed and deleted without ever
# being read, so it is no longer created.
action_data['action_time'] = action_data['action_time'].apply(lambda x: x[:19])
action_data['action_time'] = pd.to_datetime(action_data['action_time'])

# Chronological order matters later: per-user item sequences are collected
# in row order.
action_data = action_data.sort_values('action_time')

action_data['month'] = action_data['action_time'].dt.month
action_data['day'] = action_data['action_time'].dt.day
# Sortable integer date key, e.g. April 11th -> 411.
action_data['month_day'] = action_data['month'] * 100 + action_data['day']
训练集切分
def _label_trans ( x, dic_) :
try :
return dic_[ x]
except :
return 0
def get_label(df, label_st=(4, 11), label_en=(4, 15), candidate_st=(4, 6),
              candidate_en=(4, 10), fea_en=(4, 10)):
    """Split an action log into labelled candidate pairs and a feature window.

    Every boundary is a ``(month, day)`` tuple and is inclusive. Boundaries
    are converted to the ``month * 100 + day`` key and compared against
    ``df['month_day']``; this assumes ``month_day`` was built with that same
    formula (as the preprocessing step of this file does). Computing the key
    directly avoids an ``IndexError`` when a boundary day has no rows.

    Args:
        df: Action log with the columns ``user_log_acct``, ``item_sku_id``,
            ``action_type``, ``day`` and ``month_day``.
        label_st: First day of the label window.
        label_en: Last day of the label window. Rows inside the label window
            with ``action_type == 2`` are the events that get counted.
        candidate_st: First day of the candidate window.
        candidate_en: Last day of the candidate window. Each distinct
            (user, item) pair seen in this window becomes one sample.
        fea_en: Last day of the feature window; every row up to and including
            that day is returned for feature building.

    Returns:
        A tuple ``(df_candidates, data_fea)``.
        ``df_candidates`` has the columns ``user_log_acct``, ``item_sku_id``,
        ``label_cnt`` (number of label events for the pair), ``label_days``
        (number of distinct ``day`` values with a label event) and
        ``user_item`` (``"<user>_<item>"`` string key).
        ``data_fea`` is a copy of the rows of ``df`` in the feature window.
    """
    def _to_key(month_and_day):
        month, day = month_and_day
        return month * 100 + day

    lb_st, lb_en = _to_key(label_st), _to_key(label_en)
    cand_st, cand_en = _to_key(candidate_st), _to_key(candidate_en)
    fea_position = _to_key(fea_en)

    month_day = df['month_day']
    ind_label = ((month_day >= lb_st) & (month_day <= lb_en)
                 & (df['action_type'] == 2))
    ind_candidate = (month_day >= cand_st) & (month_day <= cand_en)
    ind_fea = month_day <= fea_position

    data_fea = df.loc[ind_fea].copy()

    # One row per (user, item) pair seen in the candidate window.
    pair_cols = ['user_log_acct', 'item_sku_id']
    df_candidates = df.loc[ind_candidate, pair_cols].drop_duplicates(subset=pair_cols)
    df_candidates = df_candidates.loc[df_candidates['item_sku_id'].notnull()].copy()

    label = df.loc[ind_label, ['user_log_acct', 'item_sku_id', 'day']].copy()
    print('get label')

    # String key shared by both frames so label statistics can be looked up.
    label['user_item'] = (label['user_log_acct'].astype(str) + '_'
                          + label['item_sku_id'].astype(str))
    cand_key = (df_candidates['user_log_acct'].astype(str) + '_'
                + df_candidates['item_sku_id'].astype(str))

    dic_cnt = label['user_item'].value_counts().to_dict()
    dic_days = label.groupby('user_item')['day'].nunique().to_dict()

    # Pairs without any label event map to NaN and are turned into 0.
    df_candidates['label_cnt'] = cand_key.map(dic_cnt).fillna(0).astype(int)
    df_candidates['label_days'] = cand_key.map(dic_days).fillna(0).astype(int)
    df_candidates['user_item'] = cand_key
    return df_candidates, data_fea
% % time
# Validation split: candidates from 4/06-4/10, labels from 4/11-4/15,
# features from everything up to and including 4/10.
df_valid_label, data_valid_fea = get_label( action_data, label_st = ( 4 , 11 ) , label_en = ( 4 , 15 ) , candidate_st = ( 4 , 6 ) , candidate_en = ( 4 , 10 ) , fea_en = ( 4 , 10 ) )
get label
CPU times: user 5.48 s, sys: 924 ms, total: 6.4 s
Wall time: 6.4 s
% % time
# Training split: the same layout shifted five days earlier -- candidates
# from 4/01-4/05, labels from 4/06-4/10, features up to and including 4/05.
df_train_label1, data_train_fea1 = get_label( action_data, label_st = ( 4 , 6 ) , label_en = ( 4 , 10 ) , candidate_st = ( 4 , 1 ) , candidate_en = ( 4 , 5 ) , fea_en = ( 4 , 5 ) )
get label
CPU times: user 4.67 s, sys: 744 ms, total: 5.41 s
Wall time: 5.41 s
# Notebook display: preview the labelled training candidates.
df_train_label1. head( )
user_log_acct item_sku_id label_cnt label_days user_item 34296301 1144603 153700 0 0 1144603_153700 1415203 1129253 327893 0 0 1129253_327893 3960663 736788 201003 0 0 736788_201003 5158969 109461 256490 0 0 109461_256490 7377193 470525 142823 0 0 470525_142823
特征构建
原始特征
# Static profile tables: one row per user and one row per item, each taken
# from the first row in which that id appears in action_data.
my_user = action_data.loc[
    :,
    ['user_log_acct', 'age', 'sex', 'user_level', 'province', 'city', 'county'],
].drop_duplicates(subset=['user_log_acct'], keep='first')

my_item = action_data.loc[
    :,
    ['item_sku_id', 'brand_code', 'shop_id', 'item_third_cate_cd', 'vender_id', 'shop_score'],
].drop_duplicates(subset=['item_sku_id'], keep='first')
user特征
def gen_action_freq_feats(df, start_date):
    """Build per-user action-count features over several look-back windows.

    For each window ``w`` in (1, 3, 5, 7, 15, 30) days, the rows with
    ``action_time >= start_date - w days`` are kept, ``action_type`` is
    one-hot encoded and the indicators are summed per user. Three ratio
    columns are then added: the type-2 count divided by
    ``1 + count`` of types 1, 3 and 4 respectively.

    Args:
        df: Frame with the columns ``user_log_acct``, ``action_type`` and
            ``action_time`` (datetime-like, comparable with ``start_date``).
        start_date: ``datetime.datetime`` the look-back windows are measured
            from.

    Returns:
        DataFrame with one row per distinct ``user_log_acct`` in ``df`` and,
        for every window, the columns
        ``user_log_acct_last{w}_days_action_{type}`` and
        ``user_log_acct_last{w}_days_action_{1,3,4}_rt``. Users with no
        action inside a window get NaN for that window's columns.
    """
    key = ['user_log_acct']
    action = df[key + ['action_type', 'action_time']].copy()
    feats = pd.DataFrame(action[key].drop_duplicates())

    for w in tqdm([1, 3, 5, 7, 15, 30]):
        prefix = '_'.join(key) + '_last{}_days_action'.format(w)
        window_start = start_date - datetime.timedelta(days=w)

        # Keep only the key and the action type: the datetime column must not
        # reach groupby().sum(), which newer pandas refuses to sum.
        recent = action.loc[action['action_time'] >= window_start,
                            key + ['action_type']]

        dummies = pd.get_dummies(recent['action_type'], prefix=prefix)
        # A short window may contain no action of some type; make sure the
        # columns the ratios read always exist instead of raising KeyError.
        for action_type in (1, 2, 3, 4):
            col = '{}_{}'.format(prefix, action_type)
            if col not in dummies.columns:
                dummies[col] = 0

        counts = pd.concat([recent[key], dummies], axis=1)
        counts = counts.groupby(key, as_index=False).sum()

        type2_cnt = counts[prefix + '_2']
        for other in (1, 3, 4):
            # "+ 1" in the denominator avoids division by zero.
            counts['{}_{}_rt'.format(prefix, other)] = (
                type2_cnt / (1 + counts['{}_{}'.format(prefix, other)])
            )

        feats = feats.merge(counts, on=key, how='left')
    return feats
# User action-frequency features for each split. The reference dates are the
# same days used as fea_en for the splits (4/05 train, 4/10 validation).
# NOTE(review): the datetimes carry no time component, so each look-back
# window is measured from 00:00 of the reference day -- confirm this is the
# intended window alignment.
u_fea_train1 = gen_action_freq_feats( data_train_fea1, datetime. datetime( 2020 , 4 , 5 ) )
u_fea_val1 = gen_action_freq_feats( data_valid_fea, datetime. datetime( 2020 , 4 , 10 ) )
100%|██████████| 6/6 [00:05<00:00, 1.10it/s]
100%|██████████| 6/6 [00:05<00:00, 1.03it/s]
合并特征集
# Feature-name groups: every column of each source table except its join key.
u_fea_cols1 = [name for name in u_fea_train1.columns if name != 'user_log_acct']
u_fea_cols2 = [name for name in my_user.columns if name != 'user_log_acct']
i_fea_cols = [name for name in my_item.columns if name != 'item_sku_id']

# Full ordered column list: the two ids followed by all feature groups.
train_cols = ['user_log_acct', 'item_sku_id', *u_fea_cols1, *u_fea_cols2, *i_fea_cols]
训练集&验证集
# Training frame: labelled candidates enriched with user action features,
# the user profile and the item profile (all left joins, so every candidate
# row is kept).
df_train = (
    df_train_label1
    .merge(u_fea_train1, on='user_log_acct', how='left')
    .merge(my_user, on='user_log_acct', how='left')
    .merge(my_item, on='item_sku_id', how='left')
)
# Binary target: 1 when the pair has at least one label event, else 0.
df_train['label'] = (df_train['label_cnt'] > 0).astype(int)

# Validation frame: same joins on the validation candidates.
df_val = (
    df_valid_label
    .merge(u_fea_val1, on='user_log_acct', how='left')
    .merge(my_user, on='user_log_acct', how='left')
    .merge(my_item, on='item_sku_id', how='left')
)
df_val['label'] = (df_val['label_cnt'] > 0).astype(int)
序列化
def set_tokenizer(docs, split_char=' ', max_len=100):
    """Fit a word-level tokenizer on ``docs`` and return padded id sequences.

    Args:
        docs: List of texts to tokenize.
        split_char: Separator the texts are split on.
        max_len: Length every sequence is padded or truncated to.

    Returns:
        A tuple ``(X, word_index)``: ``X`` is the padded integer sequence
        array (padding value 0) and ``word_index`` is the tokenizer's
        token-to-id mapping.
    """
    tok = Tokenizer(split=split_char, char_level=False, lower=False)
    tok.fit_on_texts(docs)
    encoded = tok.texts_to_sequences(docs)
    padded = pad_sequences(encoded, maxlen=max_len, value=0)
    return padded, tok.word_index
# Per-user item history (in row order of the feature window) attached to
# every candidate row of the matching split.
valid_item_seq = data_valid_fea.groupby(['user_log_acct'])['item_sku_id'].agg(list).reset_index()
valid_item_seq.columns = ['user_log_acct', 'item_seq']
df_val = df_val.merge(valid_item_seq, on='user_log_acct', how='left')

train_item_seq = data_train_fea1.groupby(['user_log_acct'])['item_sku_id'].agg(list).reset_index()
train_item_seq.columns = ['user_log_acct', 'item_seq']
df_train = df_train.merge(train_item_seq, on='user_log_acct', how='left')

# Train rows first, validation rows second: the tokenised matrix is later
# split back by df_train.shape[0].
df_data = pd.concat([df_train[['item_seq']], df_val[['item_seq']]], axis=0, ignore_index=True)

# Serialise each history as "id,id,id" with NO space after the comma.
# str(list)[1:-1] produced "id, id, id"; the tokenizer splits on ',' only, so
# every non-first item kept a leading space and the same item received two
# different token ids depending on its position. Rows without a history
# (NaN after the left join) become an empty text instead of the bogus
# token 'a' that 'nan'[1:-1] used to yield.
df_data['item_seq'] = df_data['item_seq'].apply(
    lambda seq: ','.join(str(item) for item in seq) if isinstance(seq, list) else ''
)
text_1_list = list(df_data['item_seq'])

print('开始序列化')
x1, index_1 = set_tokenizer(text_1_list, split_char=',', max_len=20)
print('序列化完成')
gc.collect()
开始序列化
序列化完成
0
# Columns treated as categorical ids, and bookkeeping columns; neither group
# is fed to the dense branch.
sparse_col = ['item_sku_id', 'age', 'sex', 'user_level', 'province', 'city', 'county',
              'brand_code', 'shop_id', 'item_third_cate_cd', 'vender_id']
rest_col = ['user_log_acct', 'label_cnt', 'label_days', 'user_item', 'item_seq', 'label']

# Dense features: every remaining float64/int64 column of the training frame.
dense_cols = []
for i in df_train.columns:
    if df_train[i].dtype in ['float64', 'int64'] and i not in sparse_col and i not in rest_col:
        dense_cols.append(i)

from sklearn.preprocessing import StandardScaler
ss = StandardScaler()

# Train rows first, validation rows second; missing values become 0.
df_data = pd.concat([df_train[dense_cols], df_val[dense_cols]], axis=0, ignore_index=True)
df_data = df_data.fillna(0)

# Fit the scaler on the TRAINING rows only, then apply it to all rows.
# Fitting on train + validation leaks validation statistics into the
# features and makes the validation score optimistic.
ss.fit(df_data.iloc[:df_train.shape[0]])
dense_feature = ss.transform(df_data)
dense_feature_input = dense_feature.shape[1]

# Split the stacked arrays back into their train / validation parts.
train_input_1 = x1[:df_train.shape[0]]
test_input_1 = x1[df_train.shape[0]:]
train_input_2 = dense_feature[:df_train.shape[0]]
test_input_2 = dense_feature[df_train.shape[0]:]
train_label = df_train['label']
test_label = df_val['label']
LSTM + Transformer
from transformer import Encoder, padding_mask
def model_3(emb1, dense_feature_input, n_layers=1,
            num_heads=4,
            middle_units=1024,
            max_seq_length=20,
            training=False):
    """Build and compile the sequence (Encoder + LSTM) + dense-feature model.

    The sequence branch embeds the id sequence, runs it through ``Encoder``
    and an LSTM, then max- and mean-pools over the time axis. The dense
    branch is a single 16-unit layer. Both are concatenated and passed through
    two Dense/BatchNorm/ReLU stages to a sigmoid output.

    Args:
        emb1: Array-like; only ``emb1.shape[0]`` is read and used as the
            ``input_dim`` (number of rows) of the embedding table.
            NOTE(review): the caller in this file passes the padded sequence
            matrix, so ``input_dim`` becomes the number of samples rather
            than the vocabulary size -- confirm it is at least
            ``max token id + 1``.
        dense_feature_input: Number of dense features per sample.
        n_layers: Passed to ``Encoder``.
        num_heads: Passed to ``Encoder``.
        middle_units: Passed to ``Encoder``.
        max_seq_length: Length of the input id sequences. It now also sets
            the ``Embedding`` input length and the sequence ``Input`` shape,
            which were previously hard-coded to 20 (the default), so the
            three can no longer disagree.
        training: Passed to ``Encoder``.

    Returns:
        A compiled Keras ``Model`` taking ``[sequence_ids, dense_features]``,
        with the adam optimizer, binary cross-entropy loss and the metrics
        ``binary_crossentropy`` and ``auc``.
    """
    K.clear_session()

    emb_layer_1 = Embedding(
        input_dim=emb1.shape[0],
        output_dim=20,
        input_length=max_seq_length,
        trainable=True,
    )

    # Sequence branch.
    seq1 = Input(shape=(max_seq_length,))
    x1 = emb_layer_1(seq1)
    sdrop = SpatialDropout1D(rate=0.2)
    x1 = sdrop(x1)

    padding_mask_list = padding_mask(seq1)
    d_model = x1.shape[-1]
    l = Encoder(n_layers, d_model, num_heads,
                middle_units, max_seq_length, training)([x1, padding_mask_list])

    x = Dropout(0.2)(LSTM(200, return_sequences=True)(l))
    semantic = TimeDistributed(Dense(100, activation="tanh"))(x)
    # Pool over the time axis in two ways and keep both summaries.
    merged_1 = Lambda(lambda x: K.max(x, axis=1), output_shape=(100,))(semantic)
    merged_1_avg = Lambda(lambda x: K.mean(x, axis=1), output_shape=(100,))(semantic)

    # Dense-feature branch.
    hin = Input(shape=(dense_feature_input,))
    htime = Dense(16, activation='relu')(hin)

    # Joint head.
    x = concatenate([merged_1, merged_1_avg, htime])
    x = Dropout(0.2)(Activation(activation="relu")(BatchNormalization()(Dense(128)(x))))
    x = Activation(activation="relu")(BatchNormalization()(Dense(64)(x)))
    pred = Dense(1, activation='sigmoid')(x)

    model = Model(inputs=[seq1, hin], outputs=pred)
    model.compile(optimizer="adam",
                  loss="binary_crossentropy",
                  metrics=["binary_crossentropy", tf.keras.metrics.AUC(name='auc')])
    return model
# Training callbacks, all driven by the validation AUC (metric name 'auc'):
# stop after 5 epochs without improvement, halve the learning rate after 3,
# and keep the weights of the best epoch on disk.
file_path = "nn_lstm.h5"
earlystopping = EarlyStopping(monitor="val_auc", patience=5, mode='max')
plateau = ReduceLROnPlateau(monitor="val_auc", verbose=1, mode='max', factor=0.5, patience=3)
checkpoint = ModelCheckpoint(
    file_path, monitor='val_auc', save_weights_only=True, verbose=1, save_best_only=True, mode='max')
callbacks_list = [earlystopping, checkpoint, plateau]

# NOTE(review): model_3 only reads x1.shape[0] and uses it as the embedding
# table size, i.e. the number of samples, not the vocabulary size -- confirm
# this is intended (len(index_1) + 1 rows would be sufficient).
model_lstm_transformer = model_3(x1, dense_feature_input)

# The names x1_tr / x2_tr / y_tr / x1_va / x2_va / y_va used here before are
# not defined anywhere in this file; the arrays built in the feature step are
# train_input_* / test_input_* and train_label / test_label.
hist = model_lstm_transformer.fit(
    [train_input_1, train_input_2],
    train_label.values, batch_size=4096, epochs=20,
    validation_data=([test_input_1, test_input_2], test_label.values),
    callbacks=callbacks_list, verbose=1, shuffle=True)

# EarlyStopping is created without restore_best_weights, so after fit() the
# model holds the LAST epoch's weights. Reload the best checkpoint before
# predicting.
model_lstm_transformer.load_weights(file_path)
test_pred = model_lstm_transformer.predict([test_input_1, test_input_2], batch_size=2048, verbose=1)
Epoch 1/20
401/401 [==============================] - 571s 1s/step - loss: 0.2253 - binary_crossentropy: 0.2253 - auc: 0.5852 - val_loss: 0.0349 - val_binary_crossentropy: 0.0349 - val_auc: 0.8060
Epoch 00001: val_auc improved from -inf to 0.80597, saving model to nn_lstm.h5
Epoch 2/20
401/401 [==============================] - 567s 1s/step - loss: 0.0304 - binary_crossentropy: 0.0304 - auc: 0.8699 - val_loss: 0.0339 - val_binary_crossentropy: 0.0339 - val_auc: 0.6791
Epoch 00002: val_auc did not improve from 0.80597
Epoch 3/20
401/401 [==============================] - 568s 1s/step - loss: 0.0240 - binary_crossentropy: 0.0240 - auc: 0.9364 - val_loss: 0.0312 - val_binary_crossentropy: 0.0312 - val_auc: 0.7775
Epoch 00003: val_auc did not improve from 0.80597
Epoch 4/20
401/401 [==============================] - 568s 1s/step - loss: 0.0203 - binary_crossentropy: 0.0203 - auc: 0.9601 - val_loss: 0.0345 - val_binary_crossentropy: 0.0345 - val_auc: 0.7099
Epoch 00004: val_auc did not improve from 0.80597
Epoch 00004: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 5/20
401/401 [==============================] - 569s 1s/step - loss: 0.0178 - binary_crossentropy: 0.0178 - auc: 0.9760 - val_loss: 0.0372 - val_binary_crossentropy: 0.0372 - val_auc: 0.6537
Epoch 00005: val_auc did not improve from 0.80597
Epoch 6/20
401/401 [==============================] - 572s 1s/step - loss: 0.0168 - binary_crossentropy: 0.0168 - auc: 0.9799 - val_loss: 0.0371 - val_binary_crossentropy: 0.0371 - val_auc: 0.6627
Epoch 00006: val_auc did not improve from 0.80597
864/864 [==============================] - 116s 133ms/step