import numpy as np
import pandas as pd
import math
from sklearn. metrics import f1_score
idx = pd. IndexSlice
% matplotlib inline
# Load the raw Tianchi "fresh comp" offline dataset: the full per-user
# behavior log and the candidate item subset.  Paths assume the dataset
# folder sits next to this notebook.
actions = pd.read_csv("./fresh_comp_offline/tianchi_fresh_comp_train_user.csv")
items = pd.read_csv("./fresh_comp_offline/tianchi_fresh_comp_train_item.csv")
actions.head()
user_id item_id behavior_type user_geohash item_category time 0 10001082 285259775 1 97lk14c 4076 2014-12-08 18 1 10001082 4368907 1 NaN 5503 2014-12-12 12 2 10001082 4368907 1 NaN 5503 2014-12-12 12 3 10001082 53616768 1 NaN 9762 2014-12-02 15 4 10001082 151466952 1 NaN 5232 2014-12-12 11
def prepare_data(actions, items):
    """Densify the raw log: map user_id / item_id / item_category to compact
    0-based integer codes and split the timestamp into date and hour.

    Note: parses ``actions.time`` to datetime **in place** on the frame the
    caller passed; pass a copy if the raw frame must be preserved.

    Returns ``(actions, items, user_index, item_ids, category)`` where the
    last three are lookup frames mapping the original id (index) to its
    dense code (single column).
    """

    def _dense_index(series, new_name):
        # Dense codes in first-occurrence order: original id becomes the
        # index, the 0-based code becomes the only column.
        uniq = series.drop_duplicates().reset_index(drop=True).reset_index()
        uniq = uniq.set_index(series.name)
        uniq.columns = [new_name]
        return uniq

    actions.time = pd.to_datetime(actions.time)

    user_index = _dense_index(actions.user_id, "user")
    actions = pd.merge(actions, user_index, left_on="user_id", right_index=True, how="left")

    item_ids = _dense_index(actions.item_id, "item")
    actions = pd.merge(actions, item_ids, left_on="item_id", right_index=True, how="left")
    # Items not present in the action log get NaN codes here (how='left').
    items = pd.merge(items, item_ids, left_on="item_id", right_index=True, how="left")

    category = _dense_index(actions.item_category, "category")
    actions = pd.merge(actions, category, left_on="item_category", right_index=True, how="left")

    # Keep only the dense codes; drop the original wide identifiers.
    actions = actions.drop(["user_id", "item_id", "item_category"], axis=1)
    items = items.drop(["item_id", "item_category"], axis=1)
    actions = actions.loc[:, ["user", "item", "behavior_type", "category", "time", "user_geohash"]]
    actions["date"] = actions.time.dt.date
    actions["hour"] = actions.time.dt.hour
    return actions, items, user_index, item_ids, category
# Re-encode the ids; keep the user/item lookup tables so predictions can be
# decoded back to the original ids later.  The category table is discarded.
actions, items, user_index, item_ids, _ = prepare_data(actions, items)
actions.head()
user item behavior_type category time user_geohash date hour 0 0 0 1 0 2014-12-08 18:00:00 97lk14c 2014-12-08 18 1 0 1 1 1 2014-12-12 12:00:00 NaN 2014-12-12 12 2 0 1 1 1 2014-12-12 12:00:00 NaN 2014-12-12 12 3 0 2 1 2 2014-12-02 15:00:00 NaN 2014-12-02 15 4 0 3 1 3 2014-12-12 11:00:00 NaN 2014-12-12 11
items. head( )
item_geohash item 0 NaN 1185692 1 NaN 2222915 2 NaN 2623414 3 NaN 1772057 4 NaN 2634707
# Geohash coverage analysis: how many distinct geohashes appear on items vs.
# on user actions, and how much the two sets overlap.
# NOTE(review): `geo` (the union) is built here but never used in this
# chunk; kept in case a later cell consumes it.
geo = pd.concat([items.item_geohash, actions.user_geohash]).drop_duplicates()
item_geo = items.item_geohash.drop_duplicates().dropna()
print("商品的geo去重后总数的统计", item_geo.count())
action_geo = actions.user_geohash.drop_duplicates().dropna()
print("用户行为的geo去重后总数的统计", action_geo.count())
# Overlap ratios, measured against each side separately.
print("商品与用户行为的geo去重后总数的统计:\n",
      "交集 / 用户行为geo:",
      len(action_geo[action_geo.isin(item_geo)]) / len(action_geo),
      "\n交集 / 商品geo:",
      len(item_geo[item_geo.isin(action_geo)]) / len(item_geo)
      )
del item_geo
del action_geo
商品的geo去重后总数的统计 57358
用户行为的geo去重后总数的统计 1018981
商品与用户行为的geo去重后总数的统计:
交集 / 用户行为geo: 0.025223237724746585
交集 / 商品geo: 0.44809791136371563
# Distribution of distinct locations per user: count unique
# (user, geohash) pairs, then summarize the per-user counts.
geo_rows = actions[["user", "user_geohash"]].dropna()
print("用户行为带有geohash的数量", len(geo_rows))
geo_rows = geo_rows.drop_duplicates()
print("用户行为带有geohash的数量(去重后)", len(geo_rows))
geo_rows = geo_rows.assign(c=1)[["user", "c"]].groupby("user").sum()
print(geo_rows.describe())
del geo_rows
用户行为带有geohash的数量 7380017
用户行为带有geohash的数量(去重后) 1257674
c
count 16240.000000
mean 77.442980
std 53.782759
min 1.000000
25% 42.000000
50% 68.000000
75% 103.000000
max 709.000000
# Of the behaviors that carry a geohash, what fraction happened at a
# location that also appears in the item geo table?
with_geo = actions.dropna(subset=["user_geohash"])
print("购买的时候, 有geo信息的行为数量", len(with_geo), "占全部行为的",
      len(with_geo[with_geo.user_geohash.isin(items.item_geohash)]) / len(with_geo))
del with_geo
购买的时候, 有geo信息的行为数量 7380017 占全部行为的 0.03044234179948366
# NOTE: plain assignment — saved_actions aliases the SAME DataFrame object;
# it is not a defensive copy, so later in-place edits affect both names.
saved_actions = actions
print(len(actions))
actions.head()
23291027
user item behavior_type category time user_geohash date hour 0 0 0 1 0 2014-12-08 18:00:00 97lk14c 2014-12-08 18 1 0 1 1 1 2014-12-12 12:00:00 NaN 2014-12-12 12 2 0 1 1 1 2014-12-12 12:00:00 NaN 2014-12-12 12 3 0 2 1 2 2014-12-02 15:00:00 NaN 2014-12-02 15 4 0 3 1 3 2014-12-12 11:00:00 NaN 2014-12-12 11
print ( "共计: {}条交易记录" . format ( actions. user. max ( ) ) )
共计: 19999条交易记录
# Per-user raw count of actions for each behavior_type (1..4; 4 = purchase,
# per the label construction below — the meaning of 1-3 is presumably
# click/collect/cart, TODO confirm against the competition docs).
# np.int was removed in NumPy 1.24; use the explicit fixed-width alias.
user = actions.groupby(['user', 'behavior_type'])[['item']].count().unstack().fillna(0).astype(np.int64)
user.rename(columns={'item': 'c'}, level=0, inplace=True)
user.head()
c behavior_type 1 2 3 4 user 0 207 0 0 4 1 456 26 1 5 2 446 1 6 8 3 800 31 1 4 4 282 0 2 0
# Same shape of counts, but on de-duplicated (user, behavior_type, item)
# rows: the number of *distinct* items per user per behavior type.
# np.int was removed in NumPy 1.24; use np.int64.
c = actions.drop_duplicates(['user', 'behavior_type', 'item']) \
    .groupby(['user', 'behavior_type'])[['item']].count().unstack().fillna(0).astype(np.int64)
user = user.merge(c, left_index=True, right_index=True, how='left')
user.head()
c item_x item_y behavior_type 1 2 3 4 1 2 3 4 1 2 3 4 user 0 207 0 0 4 89 0 0 4 89 0 0 4 1 456 26 1 5 148 25 1 5 148 25 1 5 2 446 1 6 8 201 1 5 8 201 1 5 8 3 800 31 1 4 321 30 1 4 321 30 1 4 4 282 0 2 0 151 0 2 0 151 0 2 0
# Number of *distinct* categories per user per behavior type.
# np.int was removed in NumPy 1.24; use np.int64.
c = actions.drop_duplicates(['user', 'behavior_type', 'category']) \
    .groupby(['user', 'behavior_type'])[['category']].count().unstack().fillna(0).astype(np.int64)
user = user.merge(c, left_index=True, right_index=True, how='left')
user.head()
c item_x item_y category behavior_type 1 2 3 4 1 2 3 4 1 2 3 4 1 2 3 4 user 0 207 0 0 4 89 0 0 4 89 0 0 4 29 0 0 4 1 456 26 1 5 148 25 1 5 148 25 1 5 45 12 1 5 2 446 1 6 8 201 1 5 8 201 1 5 8 75 1 5 7 3 800 31 1 4 321 30 1 4 321 30 1 4 58 16 1 4 4 282 0 2 0 151 0 2 0 151 0 2 0 53 0 2 0
# Flatten the merged MultiIndex columns to plain names u0..u15.
user.columns = [f"u{i}" for i in range(16)]
user.head()
u0 u1 u2 u3 u4 u5 u6 u7 u8 u9 u10 u11 u12 u13 u14 u15 user 0 207 0 0 4 89 0 0 4 89 0 0 4 29 0 0 4 1 456 26 1 5 148 25 1 5 148 25 1 5 45 12 1 5 2 446 1 6 8 201 1 5 8 201 1 5 8 75 1 5 7 3 800 31 1 4 321 30 1 4 321 30 1 4 58 16 1 4 4 282 0 2 0 151 0 2 0 151 0 2 0 53 0 2 0
# Soft normalization: scale each feature column by (mean + 3*std), so
# typical values land in [0, 1] while outliers may exceed 1.
user = user.div(user.mean() + user.std() * 3)
user.head()
u0 u1 u2 u3 u4 u5 u6 u7 u8 u9 u10 u11 u12 u13 u14 u15 user 0 0.039515 0.000000 0.000000 0.066647 0.042137 0.000000 0.000000 0.082082 0.042137 0.000000 0.000000 0.082082 0.096836 0.000000 0.000000 0.115024 1 0.087048 0.108395 0.004484 0.083308 0.070070 0.111353 0.005754 0.102603 0.070070 0.111353 0.005754 0.102603 0.150263 0.186512 0.015255 0.143780 2 0.085139 0.004169 0.026905 0.133293 0.095163 0.004454 0.028768 0.164165 0.095163 0.004454 0.028768 0.164165 0.250439 0.015543 0.076277 0.201293 3 0.152716 0.129241 0.004484 0.066647 0.151976 0.133623 0.005754 0.082082 0.151976 0.133623 0.005754 0.082082 0.193673 0.248682 0.015255 0.115024 4 0.053833 0.000000 0.008968 0.000000 0.071490 0.000000 0.011507 0.000000 0.071490 0.000000 0.011507 0.000000 0.176977 0.000000 0.030511 0.000000
# Per-item raw count of actions for each behavior_type.
# np.int was removed in NumPy 1.24; use np.int64.
good = actions.groupby(['item', 'behavior_type'])[['user']].count().unstack().fillna(0).astype(np.int64)
good.rename(columns={'user': 'c'}, level=0, inplace=True)
good.head()
c behavior_type 1 2 3 4 item 0 78 1 1 0 1 4 0 0 0 2 87 0 3 2 3 3 0 0 0 4 7 0 0 0
# Number of *distinct* users per item per behavior type.
# np.int was removed in NumPy 1.24; use np.int64.
c = actions.drop_duplicates(['user', 'behavior_type', 'item']) \
    .groupby(['item', 'behavior_type'])[['user']].count().unstack().fillna(0).astype(np.int64)
good = good.merge(c, left_index=True, right_index=True, how='left')
good.head()
c user behavior_type 1 2 3 4 1 2 3 4 item 0 78 1 1 0 36 1 1 0 1 4 0 0 0 2 0 0 0 2 87 0 3 2 22 0 3 2 3 3 0 0 0 1 0 0 0 4 7 0 0 0 3 0 0 0
# Flatten the merged MultiIndex columns to plain names g0..g7.
good.columns = [f"g{i}" for i in range(8)]
good.head()
g0 g1 g2 g3 g4 g5 g6 g7 item 0 78 1 1 0 36 1 1 0 1 4 0 0 0 2 0 0 0 2 87 0 3 2 22 0 3 2 3 3 0 0 0 1 0 0 0 4 7 0 0 0 3 0 0 0
# Soft normalization of the item features by (mean + 3*std) per column.
good = good.div(good.mean() + good.std() * 3)
good.head()
g0 g1 g2 g3 g4 g5 g6 g7 item 0 1.876140 0.762983 0.533016 0.000000 2.389785 0.825291 0.688164 0.000000 1 0.096212 0.000000 0.000000 0.000000 0.132766 0.000000 0.000000 0.000000 2 2.092618 0.000000 1.599047 1.832312 1.460424 0.000000 2.064493 2.381005 3 0.072159 0.000000 0.000000 0.000000 0.066383 0.000000 0.000000 0.000000 4 0.168372 0.000000 0.000000 0.000000 0.199149 0.000000 0.000000 0.000000
# Per-category raw count of actions for each behavior_type.
# np.int was removed in NumPy 1.24; use np.int64.
cat = actions.groupby(['category', 'behavior_type'])[['user']].count().unstack().fillna(0).astype(np.int64)
cat.rename(columns={'user': 'c'}, level=0, inplace=True)
cat.head()
c behavior_type 1 2 3 4 category 0 8168 130 125 44 1 134719 3399 3470 690 2 7419 118 128 74 3 271839 5117 8852 3049 4 250652 5567 7268 1678
# Number of *distinct* users per category per behavior type.
# np.int was removed in NumPy 1.24; use np.int64.
c = actions.drop_duplicates(['user', 'behavior_type', 'category']) \
    .groupby(['category', 'behavior_type'])[['user']].count().unstack().fillna(0).astype(np.int64)
cat = cat.merge(c, left_index=True, right_index=True, how='left')
cat.head()
c user behavior_type 1 2 3 4 1 2 3 4 category 0 8168 130 125 44 923 76 63 42 1 134719 3399 3470 690 6445 1190 1259 506 2 7419 118 128 74 556 51 70 65 3 271839 5117 8852 3049 9766 1748 2877 1873 4 250652 5567 7268 1678 8869 1716 2325 1184
# Number of *distinct* items per category per behavior type.
# np.int was removed in NumPy 1.24; use np.int64.
c = actions.drop_duplicates(['item', 'behavior_type', 'category']) \
    .groupby(['category', 'behavior_type'])[['item']].count().unstack().fillna(0).astype(np.int64)
cat = cat.merge(c, left_index=True, right_index=True, how='left')
cat.head()
c user item behavior_type 1 2 3 4 1 2 3 4 1 2 3 4 category 0 8168 130 125 44 923 76 63 42 1354 99 92 35 1 134719 3399 3470 690 6445 1190 1259 506 34516 2803 2389 571 2 7419 118 128 74 556 51 70 65 1386 93 95 59 3 271839 5117 8852 3049 9766 1748 2877 1873 42985 3693 4811 1821 4 250652 5567 7268 1678 8869 1716 2325 1184 55323 4512 4763 1319
# Flatten the merged MultiIndex columns to plain names c0..c11.
cat.columns = [f"c{i}" for i in range(12)]
cat.head()
c0 c1 c2 c3 c4 c5 c6 c7 c8 c9 c10 c11 category 0 8168 130 125 44 923 76 63 42 1354 99 92 35 1 134719 3399 3470 690 6445 1190 1259 506 34516 2803 2389 571 2 7419 118 128 74 556 51 70 65 1386 93 95 59 3 271839 5117 8852 3049 9766 1748 2877 1873 42985 3693 4811 1821 4 250652 5567 7268 1678 8869 1716 2325 1184 55323 4512 4763 1319
# Soft normalization of the category features by (mean + 3*std) per column.
cat = cat.div(cat.mean() + cat.std() * 3)
cat.head()
c0 c1 c2 c3 c4 c5 c6 c7 c8 c9 c10 c11 category 0 0.133160 0.094975 0.084855 0.108459 0.437772 0.227932 0.144700 0.166593 0.111956 0.093267 0.093912 0.122399 1 2.196275 2.483237 2.355563 1.700830 3.056815 3.568934 2.891698 2.007054 2.853974 2.640682 2.438651 1.996850 2 0.120949 0.086208 0.086891 0.182408 0.263707 0.152954 0.160778 0.257823 0.114602 0.087614 0.096974 0.206329 3 4.431693 3.738372 6.009060 7.515698 4.631941 5.242434 6.607956 7.429271 3.554238 3.479143 4.910988 6.368237 4 4.086289 4.067132 4.933783 4.136222 4.206500 5.146463 5.340110 4.696347 4.574413 4.250716 4.861990 4.612688
del c

# Helpers to reload the feature tables that a previous run cached to CSV.
# NOTE(review): `read_csv` reads the *user* table — the name is misleading
# (and shadows nothing only because pandas is accessed via `pd.`); kept
# as-is since later cells may call it by this name.
def read_csv():
    """Read the cached per-user feature table (index = dense user code)."""
    return pd.read_csv("user.csv", index_col=0)
def read_good():
    """Read the cached per-item feature table (index = dense item code)."""
    return pd.read_csv("good.csv", index_col=0)
def read_cat():
    """Read the cached per-category feature table (index = dense category code)."""
    return pd.read_csv('cat.csv', index_col=0)
def read_label():
    """Read the cached purchase-label table."""
    return pd.read_csv("label.csv", index_col=0)
# Build labels: each (date, user, category, item) that was purchased gets
# buy=1.  The date is shifted back one day so the label row lines up with
# the previous day's features (predict next-day purchase).
label = actions[actions.behavior_type == 4].copy()
label.date = (pd.to_datetime(label.date) - np.timedelta64(1, 'D'))
print(label.date.dtypes)
label['buy'] = 1
# BUG FIX: DataFrame.drop_duplicates() ignores the index, so deduplicating
# *after* set_index(['date','user']) collapsed rows on (item, category, buy)
# alone — silently dropping labels for the same item bought by other users
# or on other days.  Deduplicate on the full key before indexing instead.
label = label.loc[:, ['date', 'user', 'item', 'category', 'buy']].drop_duplicates()
label = label.set_index(['date', 'user']).loc[:, ['item', 'category', 'buy']]
label.set_index(['category', 'item', ], append=True, inplace=True)
label.head()
datetime64[ns]
buy date user category item 2014-12-01 0 2 2 1 2014-12-13 0 9 13 1 2014-12-01 0 8 11 1 27 59 1 2014-12-12 1 30 90 1
# Daily per-(user, category, item) behavior counts, soft-normalized and
# pivoted into four columns d_t1..d_t4 (one per behavior_type).
d_action = actions.copy()
d_action['d'] = 1
d_action.date = pd.to_datetime(d_action.date)
d_action = d_action.groupby(['date', 'user', 'category', 'item', 'behavior_type']).sum()[['d']]
# NOTE(review): the normalization runs before the pivot, so one global
# (mean + 3*std) over the single 'd' column is shared by all four behavior
# types — confirm this is intended rather than per-type scaling.
d_action = d_action / (d_action.mean() + d_action.std() * 3)
d_action = d_action.unstack().fillna(0).astype(np.float32)
d_action.columns = d_action.columns.droplevel(0)
d_action.columns = ['d_t{}'.format(i) for i in range(1, 5, 1)]
d_action.head()
d_t1 d_t2 d_t3 d_t4 date user category item 2014-11-18 1 46 129 0.342359 0.0 0.0 0.0 49 163 0.342359 0.0 0.0 0.0 58 176 0.342359 0.0 0.0 0.0 3 5 478 0.342359 0.0 0.0 0.0 139 445 0.513539 0.0 0.0 0.0
# Late-evening features: behavior counts restricted to hours 21-23, keyed by
# (date, user, category, item, hour, behavior_type).
x_action = actions.copy()
x_action['c'] = 1
# Round-trips date -> datetime -> date; redundant if 'date' already holds
# date objects, but harmless.
x_action.date = pd.to_datetime(x_action.date).dt.date
x_action = x_action.loc[x_action.hour.isin([23, 22, 21])]
x_action.date = pd.to_datetime(x_action.date)
x_action = x_action.groupby(['date', 'user', 'category', 'item', 'hour', 'behavior_type']).sum()
x_action = x_action.unstack()
# Soft-normalize each unstacked behavior_type column by (mean + 3*std),
# then re-stack to the long layout.
x_action = x_action / (x_action.mean() + x_action.std() * 3)
x_action = x_action.stack().astype(np.float32)
x_action.head()
c date user category item hour behavior_type 2014-11-18 3 139 445 21 1 0.559228 487 21 1 0.186409 643 21 1 0.186409 5 20 1043 22 1 0.372819 1070 22 1 0.372819
# Pivot to one column per (hour, behavior_type) and flatten the names to
# h{hour}_{behavior_type}.
x_action = x_action.unstack(['hour', 'behavior_type'], fill_value=0).sort_index(axis=1)
x_action.columns = x_action.columns.droplevel(0)
# BUG FIX (latent): the old code first relabelled the 12 columns with
# from_product([behavior_type, hour]) — the *transposed* order of the actual
# sorted (hour, behavior_type) layout — and only then overwrote them with
# the flat names.  The wrong intermediate labels never reached the final
# frame, but they were misleading; build the correct flat names directly.
# After sort_index the columns run hour-major: (21,1)..(21,4),(22,1)..(23,4).
x_action = x_action.fillna(0)
x_action.columns = ["h{}_{}".format(h, t) for h in range(21, 24, 1) for t in [1, 2, 3, 4]]
x_action.head()
# Combine the daily totals with the late-evening hourly features; rows with
# no evening activity get zero-filled hourly columns.
x_action = d_action.join(x_action, how='left').fillna(0)
x_action.head()
# Attach the purchase label; rows that were never bought become buy=0.
x_action = x_action.merge(label, left_index=True, right_index=True, how='left')
x_action = x_action.fillna(0)
x_action['buy'] = x_action['buy'].astype(np.int8)
x_action.head()
# Final design matrix: prepend the static per-user (u*), per-item (g*) and
# per-category (c*) features onto every (date, user, category, item) row.
# how='right' keeps every behavior row even if a lookup side were missing.
x_action.reset_index(inplace=True)
x_action = user.merge(x_action, right_on='user', left_index=True, how='right')
x_action = good.merge(x_action, right_on='item', left_index=True, how='right')
x_action = cat.merge(x_action, right_on='category', left_index=True, how='right')
x_action.set_index(['date', 'user', 'category', 'item'], inplace=True)
x_action.head()
c0 c1 c2 c3 c4 c5 c6 c7 c8 c9 ... h21_4 h22_1 h22_2 h22_3 h22_4 h23_1 h23_2 h23_3 h23_4 buy date user category item 2014-11-18 1 46 129 0.218716 0.149038 0.156811 0.300727 1.252132 0.437869 0.349117 0.424416 0.116091 0.110225 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0 49 163 2.285206 2.003978 2.334519 3.026985 3.575691 3.059086 3.169614 3.962542 2.383990 2.098980 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0 58 176 0.149577 0.097898 0.171746 0.340166 0.492316 0.209937 0.264134 0.456149 0.072846 0.092325 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0 3 5 478 9.422435 9.349217 9.173796 5.410612 4.985289 7.713696 7.315377 5.906132 9.139068 9.315400 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0 139 445 2.903059 2.843413 1.963195 1.099377 2.688290 2.945120 1.952298 0.995594 2.503222 2.730181 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0
5 rows × 53 columns