import os
import time
import datetime
import re
import gc
import logging
import random as rn
import warnings

import pandas as pd
import numpy as np
from tqdm import tqdm
from scipy.sparse import hstack
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold

import tensorflow as tf
from tensorflow.keras.layers import *
from tensorflow.keras.layers import LeakyReLU, PReLU
from tensorflow.keras.models import *
from tensorflow.keras.preprocessing.text import Tokenizer, text_to_word_sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing import text, sequence
from tensorflow.keras.callbacks import *
from tensorflow.keras.optimizers import *
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.backend import cast
import tensorflow.keras.backend as K

import gensim

np.random.seed(1024)
rn.seed(1024)
warnings.filterwarnings('ignore')
action_data = pd.read_csv('./data/my_data.csv')
action_data.head()
   user_log_acct  item_sku_id          action_time  action_type  brand_code  shop_id  item_third_cate_cd  vender_id  shop_score  age  sex  user_level  province   city  county
0         937922       357022  2020-02-04 08:28:15            1      1791.0   8703.0                10.0     5227.0   -1.000000  5.0  1.0           5      11.0  348.0  1782.0
1         937922           73  2020-02-04 08:27:07            1      1791.0   8703.0                10.0     5227.0   -1.000000  5.0  1.0           5      11.0  348.0  1782.0
2         937922        29583  2020-02-04 08:26:31            1      1791.0   2738.0                10.0     3436.0    9.206167  5.0  1.0           5      11.0  348.0  1782.0
3         937922       108763  2020-02-04 08:26:10            1      1791.0   2738.0                10.0     3436.0    9.206167  5.0  1.0           5      11.0  348.0  1782.0
4        1369473       331139  2020-02-03 21:55:49            1      9985.0   6367.0                73.0     3666.0    0.000000  5.0  1.0           5       1.0   41.0  2058.0
action_data.shape
(37214269, 15)
Data preprocessing
action_data['dd_len'] = action_data['action_time'].apply(lambda x: len(str(x)))
# keep only the 'YYYY-MM-DD HH:MM:SS' part (first 19 characters) of the timestamp string
action_data['action_time'] = action_data['action_time'].apply(lambda x: x[:19])
del action_data['dd_len']
action_data['action_time'] = pd.to_datetime(action_data['action_time'])
action_data = action_data.sort_values('action_time')
action_data['month'] = action_data['action_time'].dt.month
action_data['day'] = action_data['action_time'].dt.day
action_data['month_day'] = action_data['month'].values * 100 + action_data['day'].values
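The month_day key packs month and day into one sortable integer (April 11 becomes 411), so the date windows used below reduce to simple integer range checks. A minimal illustration:
# month_day packs (month, day) into one integer: April 11 -> 4 * 100 + 11 = 411
month, day = 4, 11
month_day = month * 100 + day
print(month_day)                  # 411
print(411 <= month_day <= 415)    # True: falls inside the 2020-04-11 .. 2020-04-15 label window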
Train / validation split
def _label_trans(x, dic_):
    try:
        return dic_[x]
    except KeyError:
        return 0

def get_label(df, label_st=(4, 11), label_en=(4, 15), candidate_st=(4, 6), candidate_en=(4, 10), fea_en=(4, 10)):
    # label window: actions with action_type == 2 (assumed to be orders) between label_st and label_en
    # candidate window: (user, item) pairs seen between candidate_st and candidate_en
    # feature window: all actions up to fea_en
    lb_st = df.loc[(df['month'] == label_st[0]) & (df['day'] == label_st[1]), 'month_day'].values[0]
    lb_en = df.loc[(df['month'] == label_en[0]) & (df['day'] == label_en[1]), 'month_day'].values[0]
    cand_st = df.loc[(df['month'] == candidate_st[0]) & (df['day'] == candidate_st[1]), 'month_day'].values[0]
    cand_en = df.loc[(df['month'] == candidate_en[0]) & (df['day'] == candidate_en[1]), 'month_day'].values[0]
    fea_position = df.loc[(df['month'] == fea_en[0]) & (df['day'] == fea_en[1]), 'month_day'].values[0]
    ind_label = (df['month_day'] >= lb_st) & (df['month_day'] <= lb_en) & (df['action_type'] == 2)
    ind_candidate = (df['month_day'] >= cand_st) & (df['month_day'] <= cand_en)
    ind_fea = (df['month_day'] <= fea_position)
    data_label = df.loc[ind_label].copy()
    data_fea = df.loc[ind_fea].copy()
    data_candidates = df.loc[ind_candidate].copy()
    # candidate (user, item) pairs, deduplicated and with missing item ids dropped
    df_candidates = data_candidates[['user_log_acct', 'item_sku_id']].copy()
    df_candidates = df_candidates.drop_duplicates(subset=['user_log_acct', 'item_sku_id'])
    df_candidates = df_candidates.loc[(df_candidates.item_sku_id.isnull() == False)]
    label = data_label[['user_log_acct', 'item_sku_id', 'day']].copy()
    print('get label')
    # label_cnt: number of orders of the item by the user in the label window
    # label_days: number of distinct days with such orders
    df_candidates['label_cnt'] = 0
    df_candidates['label_days'] = 0
    df_candidates['user_item'] = df_candidates['user_log_acct'].astype(str) + '_' + df_candidates['item_sku_id'].astype(str)
    label['user_item'] = label['user_log_acct'].astype(str) + '_' + label['item_sku_id'].astype(str)
    dic_cnt = label['user_item'].value_counts().to_dict()
    dic_days = label.groupby('user_item')['day'].nunique().to_dict()
    df_candidates['label_cnt'] = df_candidates['user_item'].apply(lambda x: _label_trans(x, dic_cnt)).values
    df_candidates['label_days'] = df_candidates['user_item'].apply(lambda x: _label_trans(x, dic_days)).values
    return df_candidates, data_fea
%%time
df_valid_label, data_valid_fea = get_label(action_data, label_st=(4, 11), label_en=(4, 15), candidate_st=(4, 6), candidate_en=(4, 10), fea_en=(4, 10))
get label
CPU times: user 5.44 s, sys: 948 ms, total: 6.39 s
Wall time: 6.39 s
%%time
df_train_label1, data_train_fea1 = get_label(action_data, label_st=(4, 6), label_en=(4, 10), candidate_st=(4, 1), candidate_en=(4, 5), fea_en=(4, 5))
get label
CPU times: user 4.81 s, sys: 616 ms, total: 5.43 s
Wall time: 5.43 s
df_train_label1.head()
          user_log_acct  item_sku_id  label_cnt  label_days       user_item
34296301        1144603       153700          0           0  1144603_153700
1415203         1129253       327893          0           0  1129253_327893
3960663          736788       201003          0           0   736788_201003
5158969          109461       256490          0           0   109461_256490
7377193          470525       142823          0           0   470525_142823
Feature engineering
Raw features
my_user = action_data[['user_log_acct', 'age', 'sex', 'user_level', 'province', 'city', 'county']].drop_duplicates(['user_log_acct'], keep='first')
my_item = action_data[['item_sku_id', 'brand_code', 'shop_id', 'item_third_cate_cd', 'vender_id', 'shop_score']].drop_duplicates(['item_sku_id'], keep='first')
User features
def gen_action_freq_feats(df, start_date):
    key = ['user_log_acct']
    action = df[key + ['action_type', 'action_time']].copy()
    feats = pd.DataFrame(action[key].drop_duplicates())
    for w in tqdm([1, 3, 5, 7, 15, 30]):
        # keep only actions in the last w days before start_date
        bef_start_date = start_date - datetime.timedelta(days=w)
        action_cl = action[action['action_time'] >= bef_start_date].copy()
        # one-hot the action types and sum per user to get per-type counts in the window
        df_dummies = pd.get_dummies(action_cl['action_type'], prefix='_'.join(key) + '_last{}_days_action'.format(w))
        action_cl = pd.concat([action_cl, df_dummies], axis=1)
        action_cl = action_cl.groupby(key, as_index=False).sum()
        # ratios of action_type 2 (assumed to be orders) to the other action types; +1 avoids division by zero
        action_cl['_'.join(key) + '_last{}_days_action_1_rt'.format(w)] = action_cl['_'.join(key) + '_last{}_days_action_2'.format(w)] / (1 + action_cl['_'.join(key) + '_last{}_days_action_1'.format(w)])
        action_cl['_'.join(key) + '_last{}_days_action_3_rt'.format(w)] = action_cl['_'.join(key) + '_last{}_days_action_2'.format(w)] / (1 + action_cl['_'.join(key) + '_last{}_days_action_3'.format(w)])
        action_cl['_'.join(key) + '_last{}_days_action_4_rt'.format(w)] = action_cl['_'.join(key) + '_last{}_days_action_2'.format(w)] / (1 + action_cl['_'.join(key) + '_last{}_days_action_4'.format(w)])
        del action_cl['action_type']
        feats = feats.merge(action_cl, on=key, how='left')
    return feats
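The function above counts each action type per user over several look-back windows and builds ratio features relating action_type 2 (assumed here to be the order action) to the other types. A minimal sketch of the same get_dummies + groupby pattern on toy data:
# toy illustration of the windowed count + ratio pattern used in gen_action_freq_feats;
# assumes action_type 2 is the order action, as in the feature names above
toy = pd.DataFrame({'user_log_acct': [1, 1, 1, 2],
                    'action_type':   [1, 1, 2, 3]})
toy_cnt = pd.concat([toy[['user_log_acct']],
                     pd.get_dummies(toy['action_type'], prefix='a')], axis=1)
toy_cnt = toy_cnt.groupby('user_log_acct', as_index=False).sum()
toy_cnt['a_2_vs_a_1_rt'] = toy_cnt['a_2'] / (1 + toy_cnt['a_1'])   # orders vs. type-1 actions
print(toy_cnt)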
u_fea_train1 = gen_action_freq_feats(data_train_fea1, datetime.datetime(2020, 4, 5))
u_fea_val1 = gen_action_freq_feats(data_valid_fea, datetime.datetime(2020, 4, 10))
100%|██████████| 6/6 [00:05<00:00, 1.15it/s]
100%|██████████| 6/6 [00:05<00:00, 1.06it/s]
Merge feature sets
u_fea_cols1 = [col for col in u_fea_train1.columns if col not in ['user_log_acct']]
u_fea_cols2 = [col for col in my_user.columns if col not in ['user_log_acct']]
i_fea_cols = [col for col in my_item.columns if col not in ['item_sku_id']]
train_cols = ['user_log_acct', 'item_sku_id'] + u_fea_cols1 + u_fea_cols2 + i_fea_cols
Train & validation sets
df_train = df_train_label1.merge(u_fea_train1, on='user_log_acct', how='left')
df_train = df_train.merge(my_user, on='user_log_acct', how='left')
df_train = df_train.merge(my_item, on='item_sku_id', how='left')
# binary label: did the user order the item at least once in the label window
df_train['label'] = df_train['label_cnt'] > 0
df_train['label'] = df_train['label'].astype(int)
df_val = df_valid_label.merge(u_fea_val1, on='user_log_acct', how='left')
df_val = df_val.merge(my_user, on='user_log_acct', how='left')
df_val = df_val.merge(my_item, on='item_sku_id', how='left')
df_val['label'] = df_val['label_cnt'] > 0
df_val['label'] = df_val['label'].astype(int)
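Before modeling it is worth checking how imbalanced the binary label is; a quick sanity-check sketch (not part of the original pipeline):
# positive rate = share of candidate (user, item) pairs that were ordered in the label window
print(df_train.shape, df_train['label'].mean())
print(df_val.shape, df_val['label'].mean())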
Sequence tokenization
def set_tokenizer(docs, split_char=' ', max_len=100):
    '''
    Input
        docs: list of texts
        split_char: character to split on
        max_len: maximum sequence length to pad/truncate to
    Output
        X: the tokenized, padded sequences
        word_index: mapping from token to integer index
    '''
    tokenizer = Tokenizer(lower=False, char_level=False, split=split_char)
    tokenizer.fit_on_texts(docs)
    X = tokenizer.texts_to_sequences(docs)
    maxlen = max_len
    X = pad_sequences(X, maxlen=maxlen, value=0)
    word_index = tokenizer.word_index
    return X, word_index
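A tiny usage example of set_tokenizer (a sketch on toy comma-separated id strings like the ones built below):
# toy usage: comma-separated item-id strings, padded/truncated to length 5
toy_docs = ['1,2,3', '2,4']
toy_X, toy_index = set_tokenizer(toy_docs, split_char=',', max_len=5)
print(toy_X.shape)   # (2, 5), zero-padded on the left by default
print(toy_index)     # token -> integer index mapping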
valid_item_seq = data_valid_fea.groupby(['user_log_acct'])['item_sku_id'].agg(list).reset_index()
valid_item_seq.columns = ['user_log_acct', 'item_seq']
df_val = df_val.merge(valid_item_seq, on='user_log_acct', how='left')
train_item_seq = data_train_fea1.groupby(['user_log_acct'])['item_sku_id'].agg(list).reset_index()
train_item_seq.columns = ['user_log_acct', 'item_seq']
df_train = df_train.merge(train_item_seq, on='user_log_acct', how='left')
df_data = pd.concat([df_train[['item_seq']], df_val[['item_seq']]], axis=0, ignore_index=True)
df_data['item_seq'] = df_data['item_seq'].apply(lambda x: str(x)[1:-1])
text_1_list = list(df_data['item_seq'])
print('start tokenizing')
x1, index_1 = set_tokenizer(text_1_list, split_char=',', max_len=20)
print('tokenizing done')
gc.collect()
start tokenizing
tokenizing done
0
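Note that str(x)[1:-1] above simply strips the surrounding brackets from each user's item list, leaving a comma-separated id string, which is why the tokenizer splits on ','. For example:
# str(list)[1:-1] drops the brackets, leaving a comma-separated id string
toy_seq = [357022, 73, 29583]
print(str(toy_seq)[1:-1])   # 357022, 73, 29583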
sparse_col = ['item_sku_id', 'age', 'sex', 'user_level', 'province', 'city', 'county', 'brand_code', 'shop_id', 'item_third_cate_cd', 'vender_id']
rest_col = ['user_log_acct', 'label_cnt', 'label_days', 'user_item', 'item_seq', 'label']
dense_cols = []
for i in df_train.columns:
    if df_train[i].dtype in ['float64', 'int64'] and i not in sparse_col and i not in rest_col:
        dense_cols.append(i)
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
df_data = pd.concat([df_train[dense_cols], df_val[dense_cols]], axis=0, ignore_index=True)
df_data = df_data.fillna(0)
ss.fit(df_data)
dense_feature = ss.transform(df_data)
dense_feature_input = dense_feature.shape[1]
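Note that the scaler is fitted on the concatenation of train and validation rows, so validation statistics leak into the scaling. A stricter alternative (a sketch, not what this notebook does) fits on the training rows only:
# stricter alternative (sketch): fit the scaler on training rows only,
# then transform train and validation separately
ss_strict = StandardScaler()
train_dense_strict = ss_strict.fit_transform(df_train[dense_cols].fillna(0))
val_dense_strict = ss_strict.transform(df_val[dense_cols].fillna(0))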
train_input_1 = x1[:df_train.shape[0]]
test_input_1 = x1[df_train.shape[0]:]
train_input_2 = dense_feature[:df_train.shape[0]]
test_input_2 = dense_feature[df_train.shape[0]:]
train_label = df_train['label']
test_label = df_val['label']
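The tokenized item sequences (train_input_1) and scaled dense features (train_input_2) are now ready to feed a neural model. No network is trained in this section, but a minimal two-input Keras sketch (hypothetical layer sizes, not the notebook's final model) could look like:
# minimal two-input sketch: embed + average-pool the item-id sequence, concatenate with dense features
seq_in = Input(shape=(20,), name='item_seq_input')                  # max_len=20, as tokenized above
dense_in = Input(shape=(dense_feature_input,), name='dense_input')
emb = Embedding(input_dim=len(index_1) + 1, output_dim=32)(seq_in)
pooled = GlobalAveragePooling1D()(emb)
x = Concatenate()([pooled, dense_in])
x = Dense(64, activation='relu')(x)
out = Dense(1, activation='sigmoid')(x)
sketch_model = Model(inputs=[seq_in, dense_in], outputs=out)
sketch_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=[tf.keras.metrics.AUC()])
# sketch_model.fit([train_input_1, train_input_2], train_label,
#                  validation_data=([test_input_1, test_input_2], test_label),
#                  batch_size=4096, epochs=3)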
LightGBM
import lightgbm as lgb
eval_set = [(df_train[sparse_col + dense_cols], df_train['label']), (df_val[sparse_col + dense_cols], df_val['label'])]
lgb_model = lgb.LGBMClassifier(boosting_type="gbdt", num_leaves=2 ** 7 - 1, reg_alpha=0, reg_lambda=0.01,
                               max_depth=-1, n_estimators=2000, objective='binary', subsample=0.9,
                               colsample_bytree=0.85, subsample_freq=1, min_child_samples=25,
                               learning_rate=0.1, random_state=2021, metric="None", n_jobs=20)
lgb_model.fit(df_train[sparse_col + dense_cols], df_train['label'], eval_set=eval_set, eval_metric='auc', verbose=100, early_stopping_rounds=100)
Training until validation scores don't improve for 100 rounds
[100] valid_0's auc: 0.97877 valid_1's auc: 0.880513
Early stopping, best iteration is:
[16] valid_0's auc: 0.91334 valid_1's auc: 0.884251
LGBMClassifier(colsample_bytree=0.85, metric='None', min_child_samples=25,
n_estimators=2000, n_jobs=20, num_leaves=127, objective='binary',
random_state=2021, reg_alpha=0, reg_lambda=0.01, subsample=0.9,
subsample_freq=1)
eval_set = [(df_train[dense_cols], df_train['label']), (df_val[dense_cols], df_val['label'])]
lgb_model = lgb.LGBMClassifier(boosting_type="gbdt", num_leaves=2 ** 7 - 1, reg_alpha=0, reg_lambda=0.01,
                               max_depth=-1, n_estimators=2000, objective='binary', subsample=0.9,
                               colsample_bytree=0.85, subsample_freq=1, min_child_samples=25,
                               learning_rate=0.1, random_state=2021, metric="None", n_jobs=20)
lgb_model.fit(df_train[dense_cols], df_train['label'], eval_set=eval_set, eval_metric='auc', verbose=100, early_stopping_rounds=100)
Training until validation scores don't improve for 100 rounds
[100] valid_0's auc: 0.963161 valid_1's auc: 0.878001
Early stopping, best iteration is:
[9] valid_0's auc: 0.898261 valid_1's auc: 0.881422
LGBMClassifier(colsample_bytree=0.85, metric='None', min_child_samples=25,
n_estimators=2000, n_jobs=20, num_leaves=127, objective='binary',
random_state=2021, reg_alpha=0, reg_lambda=0.01, subsample=0.9,
subsample_freq=1)
eval_set = [(df_train[sparse_col], df_train['label']), (df_val[sparse_col], df_val['label'])]
lgb_model = lgb.LGBMClassifier(boosting_type="gbdt", num_leaves=2 ** 7 - 1, reg_alpha=0, reg_lambda=0.01,
                               max_depth=-1, n_estimators=2000, objective='binary', subsample=0.9,
                               colsample_bytree=0.85, subsample_freq=1, min_child_samples=25,
                               learning_rate=0.1, random_state=2021, metric="None", n_jobs=20)
lgb_model.fit(df_train[sparse_col], df_train['label'], eval_set=eval_set, eval_metric='auc', verbose=100, early_stopping_rounds=100)
Training until validation scores don't improve for 100 rounds
[100] valid_0's auc: 0.935879 valid_1's auc: 0.741442
Early stopping, best iteration is:
[24] valid_0's auc: 0.847825 valid_1's auc: 0.744424
LGBMClassifier(colsample_bytree=0.85, metric='None', min_child_samples=25,
n_estimators=2000, n_jobs=20, num_leaves=127, objective='binary',
random_state=2021, reg_alpha=0, reg_lambda=0.01, subsample=0.9,
subsample_freq=1)
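Comparing the three runs: sparse + dense features give the best validation AUC (about 0.884), the dense behavioral features alone come close (about 0.881), and the sparse id features alone lag well behind (about 0.744). To report the validation AUC explicitly for any fitted model, a small check like the following works (a sketch using sklearn's roc_auc_score on the last, sparse-only model):
# explicit validation AUC for the last fitted (sparse-only) model, at its early-stopped best iteration
from sklearn.metrics import roc_auc_score
val_pred = lgb_model.predict_proba(df_val[sparse_col], num_iteration=lgb_model.best_iteration_)[:, 1]
print(roc_auc_score(df_val['label'], val_pred))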