I've been collecting experience from competitions bit by bit and learning methods from other people's code. This post is a summary of the statistics/feature code I read from the o2o coupon competition, kept so I can look snippets up quickly in future competitions. I recently signed up for the IJCAI-2018 competition on advertising algorithms, so I'm starting to gather experience here; it counts as my first real competition, and I hope to get a good result.
import pandas as pd
import numpy as np

# offline train/test sets and the online train set of the o2o data (the CSVs have no header row)
off_train = pd.read_csv('data/ccf_offline_stage1_train.csv', header=None)
off_train.columns = ['user_id', 'merchant_id', 'coupon_id', 'discount_rate', 'distance', 'date_received', 'date']
off_test = pd.read_csv('data/ccf_offline_stage1_test_revised.csv', header=None)
off_test.columns = ['user_id', 'merchant_id', 'coupon_id', 'discount_rate', 'distance', 'date_received']
on_train = pd.read_csv('data/ccf_online_stage1_train.csv', header=None)
on_train.columns = ['user_id', 'merchant_id', 'action', 'coupon_id', 'discount_rate', 'date_received', 'date']
dataset3 = off_test
feature3 = off_train[((off_train.date>='20160315')&(off_train.date<='20160630'))|((off_train.date=='null')&(off_train.date_received>='20160315')&(off_train.date_received<='20160630'))]
dataset2 = off_train[(off_train.date_received>='20160515')&(off_train.date_received<='20160615')]
feature2 = off_train[(off_train.date>='20160201')&(off_train.date<='20160514')|((off_train.date=='null')&(off_train.date_received>='20160201')&(off_train.date_received<='20160514'))]
dataset1 = off_train[(off_train.date_received>='20160414')&(off_train.date_received<='20160514')]
feature1 = off_train[(off_train.date>='20160101')&(off_train.date<='20160413')|((off_train.date=='null')&(off_train.date_received>='20160101')&(off_train.date_received<='20160413'))]
The dataset/feature pairs above split the data into sliding windows: each dataset slice is selected by date_received and serves as the label period, while the matching feature slice is the preceding period (records consumed in that period, plus coupons received but never used in it). dataset3 is simply the test set.
t = dataset3[['user_id']]
t['this_month_user_receive_all_coupon_count'] = 1
t = t.groupby('user_id').agg('sum').reset_index()
This groups by user_id and sums within each group, i.e. counts how many coupons each user received this month. Note that the double brackets in dataset3[['user_id']] return a DataFrame; with single brackets you would only get a Series.
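A minimal illustration of the bracket difference (toy data, just for demonstration):
demo = pd.DataFrame({'user_id': [1, 1, 2]})
print(type(demo['user_id']))    # <class 'pandas.core.series.Series'>
print(type(demo[['user_id']]))  # <class 'pandas.core.frame.DataFrame'>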
dataset3 = dataset3.groupby('user_id').apply(lambda df: np.mean(df['date_received'])).reset_index()
dataset3.columns = ['user_id', 'avg_cvr']
print(dataset3.head(4))
This pulls out two columns and names them. The lambda passed to apply receives the sub-DataFrame of each group.
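For a single aggregated column like this, the groupby/apply above can usually be written more directly; a sketch, assuming date_received is numeric and using the original, un-aggregated dataset3:
avg_df = dataset3.groupby('user_id')['date_received'].mean().reset_index(name='avg_cvr')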
Viewing xgboost feature importances:
# feature importances of a trained xgboost Booster (here called gbm)
import operator
features = [x for x in df.columns]
importance = gbm.get_fscore()
importance = sorted(importance.items(), key=operator.itemgetter(1))
df = pd.DataFrame(importance, columns=['feature', 'fscore'])
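As an optional follow-up (not part of the original snippet), the sorted importances can be eyeballed with pandas' built-in plotting:
import matplotlib.pyplot as plt
df.plot(kind='barh', x='feature', y='fscore', legend=False, figsize=(8, 10))
plt.show()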
Viewing LightGBM feature importances. The classifier is created with
clf = lgb.LGBMClassifier(num_leaves=63, max_depth=7, n_estimators=80, n_jobs=20)
and then the feature importances are read from the fitted model:
# read feature importances from the fitted LightGBM model
df = pd.DataFrame(columns=['feature', 'important'])
df['feature'] = features
df['important'] = clf.feature_importances_
df = df.sort_values(axis=0, ascending=True, by='important').reset_index()
print(df)
Extracting the maximum and minimum of a time attribute as features:
t2 = dataset3[['user_id','coupon_id','date_received']]
t2.date_received = t2.date_received.astype('str')
t2 = t2.groupby(['user_id','coupon_id'])['date_received'].agg(lambda x:':'.join(x)).reset_index()
t2['receive_number'] = t2.date_received.apply(lambda s:len(s.split(':')))
t2 = t2[t2.receive_number>1]
t2['max_date_received'] = t2.date_received.apply(lambda s:max([int(d) for d in s.split(':')]))
t2['min_date_received'] = t2.date_received.apply(lambda s:min([int(d) for d in s.split(':')]))
t2 = t2[['user_id','coupon_id','max_date_received','min_date_received']]
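In the o2o baseline these max/min dates are usually merged back onto the per-record table to flag whether a record is the first or last time the user received that coupon. A rough sketch of that step (t3, is_last_received and is_first_received are names made up for illustration):
t3 = dataset3[['user_id', 'coupon_id', 'date_received']].copy()
t3.date_received = t3.date_received.astype('int')
t3 = pd.merge(t3, t2, on=['user_id', 'coupon_id'], how='left')
# 1 if this record is the latest / earliest receipt of this coupon by this user
t3['is_last_received'] = (t3.date_received == t3.max_date_received).astype(int)
t3['is_first_received'] = (t3.date_received == t3.min_date_received).astype(int)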
Counting occurrences of each category value in the sample:
def count_cat_prep(df, column, newcolumn):
    # newcolumn records how many times this category value has already appeared in earlier rows
    count_dict = {}
    df[newcolumn] = 0
    data = df[[column, newcolumn]].values
    for cat_list in data:
        if cat_list[0] not in count_dict:
            count_dict[cat_list[0]] = 0
            cat_list[1] = 0
        else:
            count_dict[cat_list[0]] += 1
            cat_list[1] = count_dict[cat_list[0]]
    df[[column, newcolumn]] = data
Called as:
for column in ['user_id', 'item_id', 'item_brand_id', 'shop_id', 'user_item_id', 'user_shop_id', 'user_brand_id', 'user_category_id']:
    count_cat_prep(train_data, column, column + '_click_count_prep')
for column in ['user_id', 'item_id', 'item_brand_id', 'shop_id', 'user_item_id', 'user_shop_id', 'user_brand_id', 'user_category_id']:
    # total count per value: value_counts() is joined back as column + '_count'
    train_data = train_data.join(train_data[column].value_counts(), on=column, rsuffix='_count')
Code for computing the time gap between clicks:
def nexttime_delta(column):
    # assumes `data` is sorted by context_timestamp in descending order,
    # so the timestamp remembered in the dict belongs to the *next* click
    data[column + '_nexttime_delta'] = 0
    train_data = data[['context_timestamp', column, column + '_nexttime_delta']].values
    nexttime_dict = {}
    for df_list in train_data:
        if df_list[1] not in nexttime_dict:
            df_list[2] = -1
            nexttime_dict[df_list[1]] = df_list[0]
        else:
            df_list[2] = nexttime_dict[df_list[1]] - df_list[0]
            nexttime_dict[df_list[1]] = df_list[0]
    data[['context_timestamp', column, column + '_nexttime_delta']] = train_data
    return data
This code computes the time difference between the next click and the current one.
def lasttime_delta(column):
    # assumes `train_data` is sorted by context_timestamp in ascending order,
    # so the timestamp remembered in the dict belongs to the previous click
    train_data[column + '_lasttime_delta'] = 0
    data = train_data[['context_timestamp', column, column + '_lasttime_delta']].values
    lasttime_dict = {}
    for df_list in data:
        if df_list[1] not in lasttime_dict:
            df_list[2] = -1
            lasttime_dict[df_list[1]] = df_list[0]
        else:
            df_list[2] = df_list[0] - lasttime_dict[df_list[1]]
            lasttime_dict[df_list[1]] = df_list[0]
    train_data[['context_timestamp', column, column + '_lasttime_delta']] = data
    return train_data
And this one computes the time difference between the current click and the previous one.
Calling code:
for column in ['user_id', 'item_id', 'item_brand_id', 'shop_id']:
    data = nexttime_delta(column)
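lasttime_delta is used the same way. A sketch of how the two are typically combined, under the sort-order assumptions noted in the function comments (the explicit sort_values calls are my addition, not part of the original snippet):
data = data.sort_values('context_timestamp', ascending=False)  # newest first, for the next-click delta
for column in ['user_id', 'item_id', 'item_brand_id', 'shop_id']:
    data = nexttime_delta(column)

train_data = data.sort_values('context_timestamp')  # oldest first, for the previous-click delta
for column in ['user_id', 'item_id', 'item_brand_id', 'shop_id']:
    train_data = lasttime_delta(column)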
Renaming derived feature columns:
df_fe = df[['item_id', 'user_age_level']]
df_fe = df_fe.groupby(['item_id']).agg('mean').reset_index()
df_fe.rename(columns={'user_age_level':'user_age_mean'}, inplace=True)
You can also rename by assigning a new columns list:
item_id_trade.columns = ['user_age_level', 'item_id_trade_ratio']
Computing the first and last click time of a feature; implementation:
# first and last click per (column, day), plus the span between them
def first_last_hit(data, columns):
    df = data[['context_timestamp', columns, 'day']]
    df_first_last = df.groupby([columns, 'day']).agg({'context_timestamp': ['max', 'min']}).reset_index()
    df_first_last.columns = [columns, 'day', columns + '_last_time', columns + '_first_time']
    df_first_last[columns + '_time_diff'] = df_first_last[columns + '_last_time'] - df_first_last[columns + '_first_time']
    data = pd.merge(data, df_first_last, 'left', on=[columns, 'day'])
    return data
This is implemented with the agg function.
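A typical call pattern (the column list is illustrative, reusing the ids from earlier snippets):
for col in ['user_id', 'item_id', 'shop_id']:
    data = first_last_hit(data, col)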
Unlike the first/last computation above, the timestamps are not guaranteed to appear in time order, so here the gap to the first/last record within the same day is computed by deduplicating and merging back.
# time difference to the day's first / last record for the same user
subset = ['user_id', 'day']
temp = data[['context_timestamp', 'user_id', 'day']]
temp = temp.drop_duplicates(subset=subset, keep='first')
temp.rename(columns={'context_timestamp': 'u_day_diffTime_first'}, inplace=True)
data = pd.merge(data, temp, how='left', on=subset)
data['u_day_diffTime_first'] = data['context_timestamp'] - data['u_day_diffTime_first']
del temp
gc.collect()
temp = data[['context_timestamp','user_id', 'day']]
temp = temp.drop_duplicates(subset=subset, keep='last')
temp.rename(columns={'context_timestamp': 'u_day_diffTime_last'}, inplace=True)
data = pd.merge(data, temp, how='left', on=subset)
data['u_day_diffTime_last'] = data['u_day_diffTime_last'] - data['context_timestamp']
del temp
gc.collect()
# a user with only one record that day gets -1 for both gap features
data.loc[~data.duplicated(subset=subset, keep=False), ['u_day_diffTime_first', 'u_day_diffTime_last']] = -1
When doing feature engineering you often run into anonymized raw features. If there are a lot of them they need to be filtered, and because their real meanings are unknown you cannot select them by hand, so EDA is required. For data with a time dimension you can compare a feature's distribution on the training and test sets: if the distributions are consistent the feature is usable; if they differ a lot, the feature should be dropped. The ATEC preliminary round worked like this, plotting the mean of each feature per day. The code:
from dateutil.parser import parse
import matplotlib.pyplot as plt

# days elapsed since the reference date 20170905, used as the x axis
data['ndays'] = data['date'].apply(lambda x: (parse(str(x)) - parse(str(20170905))).days)
p = 'f99'
a = pd.DataFrame(data.groupby('ndays')[p].mean()).reset_index()
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
plt.plot(a['ndays'], a[p])
# vertical reference lines
plt.axvline(61, color='r')
plt.axvline(122, color='r')
plt.axvline(153, color='r')
plt.xlabel('ndays')
plt.ylabel('mean_of_' + p)
plt.title('distribution of ' + p)
plt.show()
Computing nunique features (for each value of col, how many distinct values of main_column it co-occurs with):
comb = pd.concat([train_df[[col]+[main_column]],test_df[[col]+[main_column]]],axis=0)
mp = comb.groupby(col)[main_column].agg(['nunique'])['nunique'].to_dict()
train_df[col+'_'+main_column+'_ct'] = train_df[col].map(mp).astype('float32')
test_df[col+'_'+main_column+'_ct'] = test_df[col].map(mp).astype('float32')
print(col+'_'+main_column+'_ct, ',end='')
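col and main_column are assumed to be defined outside this snippet; in practice it usually sits inside a loop, roughly like this (the choice of main_column and the column list are placeholders):
main_column = 'user_id'
for col in ['item_id', 'shop_id', 'item_brand_id']:
    comb = pd.concat([train_df[[col] + [main_column]], test_df[[col] + [main_column]]], axis=0)
    mp = comb.groupby(col)[main_column].agg(['nunique'])['nunique'].to_dict()
    train_df[col + '_' + main_column + '_ct'] = train_df[col].map(mp).astype('float32')
    test_df[col + '_' + main_column + '_ct'] = test_df[col].map(mp).astype('float32')
    print(col + '_' + main_column + '_ct, ', end='')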
Counting which occurrence of a category value each row is. This could be coded with a dictionary, but pandas offers a simpler way: the cumcount() function.
train_data['cash'] = train_data.groupby(['f111']).cumcount()
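A tiny example of what cumcount() returns (toy data, just for illustration):
demo = pd.DataFrame({'f111': ['a', 'b', 'a', 'a', 'b']})
print(demo.groupby('f111').cumcount().tolist())  # [0, 0, 1, 2, 1]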
Getting the model's best iteration:
XGBoost: clf.get_booster().best_iteration (older xgboost versions exposed this as clf.booster().best_iteration)
LightGBM: clf.best_iteration_
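These attributes are only populated when training used early stopping. A minimal sketch with the LightGBM sklearn API, assuming a version from around the time of this post where early_stopping_rounds is still a fit() argument, and assuming X, y are the feature matrix and labels:
import lightgbm as lgb
from sklearn.model_selection import train_test_split

X_tr, X_val, y_tr, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
clf = lgb.LGBMClassifier(num_leaves=63, max_depth=7, n_estimators=2000, n_jobs=20)
clf.fit(X_tr, y_tr,
        eval_set=[(X_val, y_val)],
        eval_metric='logloss',
        early_stopping_rounds=50)  # stop when the validation metric stops improving
print(clf.best_iteration_)  # number of boosting rounds actually kept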