用户新增预测挑战赛Baseline注释版
背景:数据由约62万条训练集、20万条测试集数据组成,共包含13个字段。其中uuid为样本唯一标识,eid为访问行为ID,udmap为行为属性,其中的key1到key9表示不同的行为属性,如项目名、项目id等相关字段,common_ts为应用访问记录发生时间(毫秒时间戳),其余字段x1至x8为用户相关的属性,为匿名处理字段。target字段为预测目标,即是否为新增用户。
from IPython. core. interactiveshell import InteractiveShell
# Ask IPython to display the value of every expression statement in a cell,
# not only the last one (this is why several calls below each show output).
InteractiveShell. ast_node_interactivity = 'all'
1.导入需要的packages
import pandas as pd
import numpy as np
import matplotlib. pyplot as plt
from sklearn. tree import DecisionTreeClassifier
2.csv数据文件读取函数
def ReadData(path):
    """Load the training and test CSV files from a directory.

    Args:
        path: Directory that contains ``train.csv`` and ``test.csv``. May be
            a string with or without a trailing separator, or any
            ``os.PathLike`` object.

    Returns:
        A ``(train_data, test_data)`` tuple of pandas DataFrames.
    """
    import os

    # os.path.join inserts the separator only when it is missing, so both
    # 'data/' and 'data' resolve to 'data/train.csv'. Plain string
    # concatenation would turn 'data' into 'datatrain.csv'.
    train_data = pd.read_csv(os.path.join(path, 'train.csv'))
    test_data = pd.read_csv(os.path.join(path, 'test.csv'))
    return train_data, test_data
2.1针对数据的初步观察
# Load both splits from the competition data directory.
train_data, test_data = ReadData( '用户新增预测挑战赛公开数据/' )
# Inspect the training frame: first rows, per-column dtype / non-null counts,
# and summary statistics of the numeric columns.
train_data. head( )
train_data. info( )
train_data. describe( )
uuid eid udmap common_ts x1 x2 x3 x4 x5 x6 x7 x8 target 0 0 26 {"key3":"67804","key2":"650"} 1689673468244 4 0 41 107 206 1 0 1 0 1 1 26 {"key3":"67804","key2":"484"} 1689082941469 4 0 41 24 283 4 8 1 0 2 2 8 unknown 1689407393040 4 0 41 71 288 4 7 1 0 3 3 11 unknown 1689467815688 1 3 41 17 366 1 6 1 0 4 4 26 {"key3":"67804","key2":"650"} 1689491751442 0 3 41 92 383 4 8 1 0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 620356 entries, 0 to 620355
Data columns (total 13 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 uuid 620356 non-null int64
1 eid 620356 non-null int64
2 udmap 620356 non-null object
3 common_ts 620356 non-null int64
4 x1 620356 non-null int64
5 x2 620356 non-null int64
6 x3 620356 non-null int64
7 x4 620356 non-null int64
8 x5 620356 non-null int64
9 x6 620356 non-null int64
10 x7 620356 non-null int64
11 x8 620356 non-null int64
12 target 620356 non-null int64
dtypes: int64(12), object(1)
memory usage: 61.5+ MB
uuid eid common_ts x1 x2 x3 x4 x5 x6 x7 x8 target count 620356.000000 620356.000000 6.203560e+05 620356.000000 620356.000000 620356.000000 620356.000000 620356.000000 620356.000000 620356.000000 620356.000000 620356.000000 mean 310177.500000 22.148287 1.689317e+12 2.675723 1.106350 40.974499 82.860080 224.909096 2.901681 5.863720 0.855459 0.140566 std 179081.496134 12.139122 2.746865e+08 1.719279 1.174157 1.373016 44.109037 114.305062 1.444797 2.575854 0.351638 0.347574 min 0.000000 0.000000 1.688382e+12 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 25% 155088.750000 11.000000 1.689088e+12 1.000000 0.000000 41.000000 51.000000 133.000000 1.000000 6.000000 1.000000 0.000000 50% 310177.500000 26.000000 1.689377e+12 4.000000 1.000000 41.000000 86.000000 241.000000 4.000000 7.000000 1.000000 0.000000 75% 465266.250000 34.000000 1.689563e+12 4.000000 2.000000 41.000000 107.000000 313.000000 4.000000 7.000000 1.000000 0.000000 max 620355.000000 42.000000 1.689696e+12 4.000000 3.000000 74.000000 151.000000 413.000000 4.000000 9.000000 1.000000 1.000000
3.将udmap列展开为key1至key9共9个数值特征(函数名沿用"one-hot",但实际写入的是各key对应的取值,缺失的key记为0)
def udmap_onethot(d):
    """Expand one ``udmap`` cell into a fixed-length vector of key values.

    Despite the name this is not a one-hot encoding: slot ``i - 1`` of the
    result holds the numeric value stored under ``'key<i>'`` (for i in 1..9),
    and 0 when that key is absent.

    Args:
        d: Either the literal string ``'unknown'`` or a JSON object string
            such as ``'{"key3":"67804","key2":"650"}'``.

    Returns:
        A float ``numpy.ndarray`` of length 9.

    Raises:
        ValueError: If ``d`` is neither ``'unknown'`` nor valid JSON
            (``json.JSONDecodeError`` is a ``ValueError`` subclass), or if a
            key's value cannot be converted to a float.
    """
    import json

    v = np.zeros(9)
    if d == 'unknown':
        return v

    # SECURITY: this used to call eval() on text read from a CSV file, which
    # executes arbitrary Python embedded in the data. The column holds JSON
    # objects, so parse it as data instead.
    parsed = json.loads(d)
    for i in range(1, 10):
        key = 'key' + str(i)
        if key in parsed:
            # Values arrive as numeric strings; numpy converts them to float
            # on assignment into the float array.
            v[i - 1] = parsed[key]
    return v
# Sanity check: convert the udmap value of the training row labelled 1.
udmap_onethot( train_data[ 'udmap' ] [ 1 ] )
array([ 0., 484., 67804., 0., 0., 0., 0., 0.,
0.])
# Column labels key1..key9 shared by both splits.
udmap_columns = ['key' + str(i) for i in range(1, 10)]

# Convert every udmap string to its 9-value vector and stack the vectors
# into one frame per split.
train_udmap_df = pd.DataFrame(
    np.vstack(train_data['udmap'].apply(udmap_onethot)),
    columns=udmap_columns,
)
test_udmap_df = pd.DataFrame(
    np.vstack(test_data['udmap'].apply(udmap_onethot)),
    columns=udmap_columns,
)

# Append the new columns next to the existing ones (axis=1).
train_data = pd.concat([train_data, train_udmap_df], axis=1)
test_data = pd.concat([test_data, test_udmap_df], axis=1)
4. 检测udmap是否为空
# Flag rows whose udmap is the placeholder 'unknown'.
# NOTE: despite the column name, the flag is 1 when udmap IS 'unknown' and 0
# when it carries a real value.
for frame in (train_data, test_data):
    frame['udmap_isknown'] = (frame['udmap'] == 'unknown').astype(int)

train_data.head()
uuid eid udmap common_ts x1 x2 x3 x4 x5 x6 ... key1 key2 key3 key4 key5 key6 key7 key8 key9 udmap_isknown 0 0 26 {"key3":"67804","key2":"650"} 1689673468244 4 0 41 107 206 1 ... 0.0 650.0 67804.0 0.0 0.0 0.0 0.0 0.0 0.0 0 1 1 26 {"key3":"67804","key2":"484"} 1689082941469 4 0 41 24 283 4 ... 0.0 484.0 67804.0 0.0 0.0 0.0 0.0 0.0 0.0 0 2 2 8 unknown 1689407393040 4 0 41 71 288 4 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1 3 3 11 unknown 1689467815688 1 3 41 17 366 1 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1 4 4 26 {"key3":"67804","key2":"650"} 1689491751442 0 3 41 92 383 4 ... 0.0 650.0 67804.0 0.0 0.0 0.0 0.0 0.0 0.0 0
5 rows × 23 columns
5.提取eid的频次特征 eid为访问行为ID
# Frequency encoding: how many training rows share each eid.
# The counts are taken from the training set only and computed once, then
# reused for both splits.
eid_counts = train_data['eid'].value_counts()
train_data['eid_freq'] = train_data['eid'].map(eid_counts)
# An eid that never occurs in the training set has no entry in eid_counts and
# would map to NaN; it was seen zero times there, so record 0 instead of
# passing a missing value to the model.
test_data['eid_freq'] = test_data['eid'].map(eid_counts).fillna(0)
6.提取eid的标签特征
# Target encoding: mean of `target` per eid, taken from the training set
# only and computed once for both splits.
# NOTE(review): each training row's own label contributes to its eid mean,
# so this feature leaks the label on the training split; consider an
# out-of-fold scheme.
eid_target_rate = train_data.groupby('eid')['target'].mean()
train_data['eid_mean'] = train_data['eid'].map(eid_target_rate)
# An eid absent from the training set would map to NaN; fall back to the
# overall training target rate so the test features contain no missing values.
test_data['eid_mean'] = test_data['eid'].map(eid_target_rate).fillna(
    train_data['target'].mean()
)
7.提取时间戳
# Replace the millisecond epoch timestamp with a datetime column and derive
# hour-of-day and day-of-month features, identically for both splits.
# NOTE(review): no timezone is applied, so hour/day are in the timestamp's
# native (presumably UTC) frame — confirm this matches the users' locale.
for frame in (train_data, test_data):
    ts = pd.to_datetime(frame['common_ts'], unit='ms')
    frame['common_ts'] = ts
    frame['common_ts_hour'] = ts.dt.hour
    frame['common_ts_day'] = ts.dt.day
8. 利用决策树模型进行训练
# Columns kept out of the feature matrix: the raw udmap string, the
# timestamp column, the row identifier, and the label itself.
non_feature_cols = ['udmap', 'common_ts', 'uuid', 'target']
train_features = train_data.drop(columns=non_feature_cols)
train_labels = train_data['target']

# Fit a decision tree with default hyper-parameters on all training rows.
clf = DecisionTreeClassifier()
clf.fit(train_features, train_labels)
DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
max_depth=None, max_features=None, max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, presort='deprecated',
random_state=None, splitter='best')
9.对测试集进行预测
# Build the test feature matrix from the same columns used for training
# (the test split has no 'target' column to drop).
test_features = test_data.drop(columns=['udmap', 'common_ts', 'uuid'])
predicted_target = clf.predict(test_features)

# Submission frame: one predicted label per test uuid.
result_df = pd.DataFrame({
    'uuid': test_data['uuid'],
    'target': predicted_target,
})
10.保存结果
# Write the predictions to submit.csv in the working directory.
# NOTE(review): index= None is presumably meant to suppress the index column;
# index=False is the documented spelling — confirm and consider switching.
result_df. to_csv( 'submit.csv' , index= None )