2021 WeChat Big Data Challenge: A Participation Summary
Abstract
- Task: The competition provides anonymized, sampled data for a set of users who visited the "Hot Recommendations" page of WeChat Channels (视频号). Given these users' behaviour inside Channels over the previous n days, the goal is to predict, on the test set, the probability that each user performs various interactions (liking, clicking the author's avatar, favouriting, forwarding, etc.) with different videos. Submissions are scored by the weighted uAUC over the individual behaviour predictions (a hedged sketch of this metric follows this list).
- Data processing: A user may watch the same video several times yet register an interaction in only one of those records, so the training set is collapsed to one row per userid-feedid pair; if the user ever interacted with that feed, the corresponding interaction label is set to 1. This guarantees the uniqueness of each pair, at the cost of discarding the temporal ordering.
- Feature engineering: sparse one-hot matrices for the 5 ids (userid, feedid, authorid, bgm_song_id and bgm_singer_id); sparse count matrices for the keyword and tag lists; the 512-dim feed_embedding reduced to 32 dims with PCA.
- Feature selection: only the sparse matrices are filtered, using feature importance.
- Model selection: because the sparse matrices are high-dimensional and slow to train on, two LightGBM models were trained separately and then blended. Model A: sparse matrices + 32-dim feed_embedding; Model B: 5-fold CV with label-encoded versions of the 5 ids + 32-dim feed_embedding. The final prediction is the simple arithmetic mean of the two models' outputs.
- Scores: Preliminary round, leaderboard A: model A scored 0.651285 online, model B 0.644215, and the blend 0.654282. Leaderboard B allows only two submissions, so the individual models were not evaluated there; only the blended result was submitted, scoring 0.65216.
- Final ranking: 381st on leaderboard A and 291st on leaderboard B of the preliminary round; did not advance to the second round (top 100 on leaderboard B).
- Takeaway: a very worthwhile competition. I feel good about the attempt and will come back next time.
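For reference, here is a minimal, hedged sketch of the weighted uAUC metric as I understand it: AUC is computed per user, averaged over users that have both positive and negative samples, then combined across the four behaviours with the 4:3:2:1 weights (the same weights appear in the offline-score code further below). The function names are my own, not competition code.
import numpy as np
from sklearn.metrics import roc_auc_score

def uauc(df, label_col, pred_col, user_col='userid'):
    # Per-user AUC, averaged over users that have both classes.
    aucs = []
    for _, g in df.groupby(user_col):
        if g[label_col].nunique() == 2:
            aucs.append(roc_auc_score(g[label_col], g[pred_col]))
    return float(np.mean(aucs))

def weighted_uauc(df, pred_cols, weights={'read_comment': 4, 'like': 3, 'click_avatar': 2, 'forward': 1}):
    # pred_cols maps each behaviour to its prediction column in df.
    total = sum(weights.values())
    return sum(w * uauc(df, lab, pred_cols[lab]) for lab, w in weights.items()) / total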
Task
Data processing
- Deduplicate user-feed pairs (uniqueness handling)
- Extract keyword and tag fields
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
feed_info=pd.read_csv('wechat_algo_data1/feed_info.csv')
user_action=pd.read_csv('wechat_algo_data1/user_action.csv')
# Collapse repeated views of the same feed by the same user into one row;
# an interaction label becomes 1 if it occurred in any of those views.
func={
    'date_':'count','device':'nunique','play':'sum','stay':'sum',
    'read_comment':'max','like':'max','click_avatar':'max','forward':'max',
    'comment':'max','follow':'max','favorite':'max'
}
user_action=user_action.groupby(['userid','feedid']).agg(func).reset_index()
# Turn ';'-separated keyword/tag lists into space-separated strings so CountVectorizer
# can tokenize them; machine_tag_list keeps only the tag id and drops its probability.
feed_info['manual_keyword_list']=feed_info['manual_keyword_list'].fillna('').apply(lambda x:' '.join(x.split(';')))
feed_info['machine_keyword_list']=feed_info['machine_keyword_list'].fillna('').apply(lambda x:' '.join(x.split(';')))
feed_info['manual_tag_list']=feed_info['manual_tag_list'].fillna('').apply(lambda x:' '.join(x.split(';')))
feed_info['machine_tag_list']=feed_info['machine_tag_list'].fillna('').apply(lambda x:' '.join([i.split(' ')[0] for i in x.split(';')]))
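As a quick illustration (the values below are made up), the preprocessing above turns the ';'-separated id lists into space-separated token strings that CountVectorizer can consume directly, and for machine_tag_list keeps only the tag id while dropping its probability:
raw_keywords = '123;456;789'           # manual_keyword_list style (made-up ids)
raw_machine_tags = '11 0.83;27 0.41'   # machine_tag_list style: "tag prob;tag prob"
print(' '.join(raw_keywords.split(';')))                               # -> 123 456 789
print(' '.join(t.split(' ')[0] for t in raw_machine_tags.split(';')))  # -> 11 27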
Feature engineering
- Sparse one-hot matrices for the 5 ids
- Sparse count matrices for keywords and tags
- The 512-dim feed_embedding reduced to 32 dims with PCA
import numpy as np
import time
import gc
from scipy import sparse
from sklearn.decomposition import PCA
from sklearn.preprocessing import OneHotEncoder,LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
def creat_npz(data):
    # One-hot encode the id columns and save them as a single sparse matrix,
    # then build CountVectorizer matrices for the keyword/tag columns.
    df_feature=pd.DataFrame()
    for col in one_hot_cols+['userid','feedid']:
        s = time.time()
        LE=LabelEncoder()
        if col == 'userid':
            # userid values only exist in user_action
            try:
                LE.fit(user_action[col].apply(int))
            except:
                LE.fit(user_action[col])
            user_action[col]=LE.transform(user_action[col])
            data[col]=LE.transform(data[col])
            OHE=OneHotEncoder()
            OHE.fit(user_action[col].values.reshape(-1, 1))
            arr=OHE.transform(data[col].values.reshape(-1, 1))
            df_feature = sparse.hstack((df_feature,arr))
            print(col,int(time.time()-s),'s')
        else:
            # the remaining ids are taken from feed_info
            try:
                LE.fit(feed_info[col].apply(int))
            except:
                LE.fit(feed_info[col])
            feed_info[col]=LE.transform(feed_info[col])
            data[col]=LE.transform(data[col])
            OHE=OneHotEncoder()
            OHE.fit(feed_info[col].values.reshape(-1, 1))
            arr=OHE.transform(data[col].values.reshape(-1, 1))
            df_feature = sparse.hstack((df_feature,arr))
            print(col,int(time.time()-s),'s')
    sparse.save_npz("data_process/one_hot_cols.npz",df_feature)
    df_feature=pd.DataFrame()
    for col in vec_cols2:
        s = time.time()
        print(col,'start...')
        CV=CountVectorizer()
        CV.fit(feed_info[col])
        arr = CV.transform(data[col])
        print(col,'hstack...')
        df_feature = sparse.hstack((df_feature,arr))
        arr=[]
        print(col,int(time.time()-s),'s')
    sparse.save_npz("data_process/vec_cols2.npz",df_feature)
def reduce_mem(df):
    starttime = time.time()
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if pd.isnull(c_min) or pd.isnull(c_max):
                continue
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    print('-- Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction),time spend:{:2.2f} min'.format(end_mem,100*(start_mem-end_mem)/start_mem,(time.time()-starttime)/60))
    return df
def process(x):
    num_list=x.split(' ')[:-1]
    res={}
    for i,num in enumerate(num_list):
        res[i]=float(num)
    return pd.Series(res)
vec_cols2=['manual_keyword_list','machine_keyword_list','manual_tag_list','machine_tag_list']
one_hot_cols=['authorid','bgm_song_id','bgm_singer_id']
test_a=pd.read_csv('wechat_algo_data1/test_a.csv')
data=pd.concat([user_action[['userid','feedid']],test_a[['userid','feedid']]],ignore_index=True)
data=pd.merge(data,feed_info[['feedid']+one_hot_cols+vec_cols2],on='feedid',how='left')
creat_npz(data)
feed_embeddings=pd.read_csv('wechat_algo_data1/feed_embeddings.csv')
feed_embeddings_512=feed_embeddings.feed_embedding.apply(process)
pca = PCA(n_components=32,random_state=2021)
feed_embeddings_32 = pd.DataFrame(pca.fit_transform(feed_embeddings_512))
del feed_embeddings['feed_embedding']
feed_embeddings=pd.concat([feed_embeddings,feed_embeddings_32],axis=1)
feed_embeddings_pca=pd.concat([user_action[['userid','feedid']],test_a[['userid','feedid']]],ignore_index=True)
feed_embeddings_pca=pd.merge(feed_embeddings_pca,feed_embeddings,on='feedid',how='left')
del feed_embeddings_pca['userid']
del feed_embeddings_pca['feedid']
del feed_embeddings_512,feed_embeddings_32,feed_embeddings
gc.collect()
feed_embeddings_pca=reduce_mem(feed_embeddings_pca)
feed_embeddings_pca.to_hdf('data_process/feed_embeddings_pca.h5',key='pca',mode='w')
Feature selection
- Filter the sparse matrices using feature importance
- Save the selected features
from lightgbm import LGBMClassifier
for npz_file in ['one_hot_cols','vec_cols2']:
    start_time=time.time()
    for label in ['read_comment','like','click_avatar','forward']:
        s=time.time()
        feature = pd.DataFrame()
        feature = sparse.hstack((feature,sparse.load_npz(f'data_process/{npz_file}.npz'))).tocsc()
        print(npz_file,label,'feature shape',feature.shape)
        data=pd.concat([user_action[['userid','feedid',label]],test_a[['userid','feedid']]],ignore_index=True).fillna(-1)
        train_index=data[data[label]!=-1].index.to_list()
        test_index=data[data[label]==-1].index.to_list()
        y=data.loc[train_index,label]
        # Hold out 10% of each class as the validation set for early stopping.
        ind_evals=[]
        ind_evals.extend(y[y==0].sample(frac=0.1,random_state=2021).index.to_list())
        ind_evals.extend(y[y==1].sample(frac=0.1,random_state=2021).index.to_list())
        ind_train=y.drop(index=ind_evals).index.to_list()
        train_x=feature[ind_train,:]
        train_y=y[ind_train]
        evals_x=feature[ind_evals,:]
        evals_y=y[ind_evals]
        print(npz_file,label,'data is ready')
        del feature,data
        gc.collect()
        print('Freed part of the memory!')
        clf = LGBMClassifier(boosting_type='gbdt',
                             num_leaves=31, max_depth=-1,
                             learning_rate=0.1, n_estimators=10000,
                             subsample_for_bin=200000, objective=None,
                             class_weight=None, min_split_gain=0.0,
                             min_child_weight=0.001,
                             min_child_samples=20, subsample=1.0, subsample_freq=1,
                             colsample_bytree=1.0,
                             reg_alpha=0.0, reg_lambda=0.0, random_state=2021,
                             n_jobs=-1, silent=True)
        print(npz_file,label,'fitting...')
        clf.fit(train_x,train_y,eval_set=[(train_x, train_y),(evals_x, evals_y)],
                eval_names=['train','valid'],
                eval_metric='auc',early_stopping_rounds=50)
        # Keep only the encoded columns whose importance is non-zero.
        se = pd.Series(clf.feature_importances_)
        se = se[se>0]
        col =list(se.sort_values(ascending=False).index)
        filename=f'data_process/feature_importances_{npz_file}_{label}.csv'
        pd.Series(col).to_csv(filename,index=False)
        print(npz_file,label,'encoded features with non-zero importance:',len(se))
        n = clf.best_iteration_
        print(npz_file,label,'n',n)
        baseloss = clf.best_score_['valid']['auc']
        print(npz_file,label,'baseloss',baseloss)
        del clf,train_x,evals_x,train_y,evals_y,y
        gc.collect()
        print(npz_file,label,'elapsed',int(time.time()-s),'s')
    print(npz_file,'total elapsed',int(time.time()-start_time),'s')
for label in ['read_comment','like','click_avatar','forward']:
    for npz_file in ['one_hot_cols','vec_cols2']:
        col=pd.read_csv(f'data_process/feature_importances_{npz_file}_{label}.csv')['0'].values.tolist()
        feature = pd.DataFrame()
        feature = sparse.hstack((feature,sparse.load_npz(f'data_process/{npz_file}.npz').tocsr()[:,col]))
        print(label,npz_file,feature.shape)
        sparse.save_npz(f'data_process/{npz_file}_{label}.npz',feature)
Model selection
LightGBM, a workhorse model in tabular competitions, was used for both models.
- Model A: sparse matrices + 32-dim feed_embedding
import os
import pickle
data_process_path='data_process/'
wechat_algo_data1_path='wechat_algo_data1/'
feed_info=pd.read_csv(wechat_algo_data1_path+'feed_info.csv')
user_action=pd.read_csv(wechat_algo_data1_path+'user_action.csv')
test_a=pd.read_csv(wechat_algo_data1_path+'test_a.csv')
submit=test_a[['userid','feedid']].copy()
# Same user-feed collapsing as in the data-processing step:
# an interaction label becomes 1 if it occurred in any of the user's views.
func={
    'date_':'count','device':'nunique','play':'sum','stay':'sum',
    'read_comment':'max','like':'max','click_avatar':'max','forward':'max',
    'comment':'max','follow':'max','favorite':'max'
}
user_action=user_action.groupby(['userid','feedid']).agg(func).reset_index()
ys=['read_comment','like','click_avatar','forward']
vec_cols2=['manual_keyword_list','machine_keyword_list','manual_tag_list','machine_tag_list']
one_hot_cols=['authorid','bgm_song_id','bgm_singer_id']
start_time=time.time()
for label in ys:
    s=time.time()
    # Assemble the feature matrix for this label: selected one-hot id columns,
    # selected keyword/tag columns, and the 32-dim PCA feed_embedding.
    feature = pd.DataFrame()
    feature1 = pd.DataFrame()
    feature1 = sparse.hstack((feature1,sparse.load_npz(f'{data_process_path}one_hot_cols_{label}.npz')))
    feature=sparse.hstack((feature,feature1))
    del feature1
    gc.collect()
    print('1. Freed part of the memory!')
    feature2 = pd.DataFrame()
    feature2 = sparse.hstack((feature2,sparse.load_npz(f'{data_process_path}vec_cols2_{label}.npz')))
    feature=sparse.hstack((feature,feature2))
    del feature2
    gc.collect()
    print('2. Freed part of the memory!')
    feed_embeddings_pca=pd.read_hdf(f'{data_process_path}feed_embeddings_pca.h5',key='pca')
    feature=sparse.hstack((feature,feed_embeddings_pca)).tocsc()
    del feed_embeddings_pca
    gc.collect()
    print('3. Freed part of the memory!')
    print(label,'feature shape',feature.shape)
    data=pd.concat([user_action[['userid','feedid',label]],test_a[['userid','feedid']]],ignore_index=True).fillna(-1)
    train_index=data[data[label]!=-1].index.to_list()
    test_index=data[data[label]==-1].index.to_list()
    y=data.loc[train_index,label]
    # Hold out 10% of each class as the validation set for early stopping.
    ind_evals=[]
    ind_evals.extend(y[y==0].sample(frac=0.1,random_state=2021).index.to_list())
    ind_evals.extend(y[y==1].sample(frac=0.1,random_state=2021).index.to_list())
    ind_train=y.drop(index=ind_evals).index.to_list()
    train_x=feature[ind_train,:]
    train_y=y[ind_train]
    evals_x=feature[ind_evals,:]
    evals_y=y[ind_evals]
    test_x=feature[test_index,:]
    print(label,'data is ready')
    del feature,data
    gc.collect()
    print('4. Freed part of the memory!')
    clf = LGBMClassifier(boosting_type='gbdt', num_leaves=40, max_depth=-1, learning_rate=0.1,
                         n_estimators=10000, subsample_for_bin=200000, objective=None,
                         class_weight=None, min_split_gain=0.0, min_child_weight=0.001,
                         min_child_samples=20, subsample=0.7, subsample_freq=1,
                         colsample_bytree=0.7,
                         reg_alpha=6, reg_lambda=3,
                         random_state=2021, n_jobs=-1, silent=True)
    # Reuse a cached model if it has already been trained for this label.
    model_file=f"{label}.pickle.dat"
    if os.path.exists(model_file):
        clf=pickle.load(open(model_file, "rb"))
    else:
        print(label,'fitting...')
        clf.fit(train_x,train_y,eval_set=[(train_x, train_y),(evals_x, evals_y)],
                eval_names=['train','valid'],
                eval_metric='auc',early_stopping_rounds=50)
        print(label,'dumping...')
        pickle.dump(clf, open(model_file, "wb"))
    print(label,'predicting test...')
    test_pred=clf.predict_proba(test_x,num_iteration=clf.best_iteration_)[:,1]
    submit[label]=test_pred
    del clf,train_x,test_x,train_y,evals_x,evals_y
    gc.collect()
    print('5. Freed part of the memory!')
    print(label,'elapsed',int(time.time()-s),'s')
submit_file='submit_a_model1.csv'
submit.to_csv(submit_file,index=False)
- Model B: 5-fold CV, label-encoded versions of the 5 ids + 32-dim feed_embedding
from sklearn.model_selection import StratifiedKFold
data_process_path='data_process/'
wechat_algo_data1_path='wechat_algo_data1/'
feed_info=pd.read_csv(wechat_algo_data1_path+'feed_info.csv')
user_action=pd.read_csv(wechat_algo_data1_path+'user_action.csv')
test_a=pd.read_csv(wechat_algo_data1_path+'test_a.csv')
submit=test_a[['userid','feedid']].copy()
# Same user-feed collapsing as for model A.
func={
    'date_':'count','device':'nunique','play':'sum','stay':'sum',
    'read_comment':'max','like':'max','click_avatar':'max','forward':'max',
    'comment':'max','follow':'max','favorite':'max'
}
user_action=user_action.groupby(['userid','feedid']).agg(func).reset_index()
ys=['read_comment','like','click_avatar','forward']
vec_cols2=['manual_keyword_list','machine_keyword_list','manual_tag_list','machine_tag_list']
one_hot_cols=['authorid','bgm_song_id','bgm_singer_id']
def label_encode(data):
    # Label-encode the five id columns in place.
    for col in one_hot_cols+['userid','feedid']:
        LE=LabelEncoder()
        if col == 'userid':
            try:
                LE.fit(user_action[col].apply(int))
            except:
                LE.fit(user_action[col])
            data[col]=LE.transform(data[col])
        else:
            try:
                LE.fit(feed_info[col].apply(int))
            except:
                LE.fit(feed_info[col])
            data[col]=LE.transform(data[col])

def get_feature_data(label):
    # Dense feature frame: label-encoded ids + 32-dim PCA feed_embedding,
    # plus a frame holding the target (test rows are marked with -1).
    feature=pd.concat([user_action[['userid','feedid']],test_a[['userid','feedid']]],ignore_index=True)
    feature=pd.merge(feature,feed_info[['feedid']+one_hot_cols],on='feedid',how='left')
    label_encode(feature)
    feed_embeddings_pca=pd.read_hdf(f'{data_process_path}feed_embeddings_pca.h5',key='pca')
    feature=pd.concat([feature,feed_embeddings_pca],axis=1)
    del feed_embeddings_pca
    gc.collect()
    data=pd.concat([user_action[['userid','feedid',label]],test_a[['userid','feedid']]],ignore_index=True).fillna(-1)
    return feature,data
start_time=time.time()
sk = StratifiedKFold(n_splits=5, shuffle=True, random_state=2021)
evals_score=pd.DataFrame(columns=ys)
for label in ys:
    sj=time.time()
    feature,data=get_feature_data(label)
    print('feature shape',feature.shape)
    train_index=data[data[label]!=-1].index.to_list()
    test_index=data[data[label]==-1].index.to_list()
    y=data.loc[train_index,label]
    train_feature=feature.loc[train_index,:]
    test_feature=feature.loc[test_index,:]
    del feature,data
    gc.collect()
    print(label,'data is ready')
    for k_fold,(k_train_index,k_evals_index) in enumerate(sk.split(train_feature,y)):
        print('k_fold',k_fold,'begin')
        s=time.time()
        train_x=train_feature.loc[k_train_index,:]
        train_y=y[k_train_index]
        evals_x=train_feature.loc[k_evals_index,:]
        evals_y=y[k_evals_index]
        # The first five columns are the label-encoded ids, treated as categorical.
        clf = LGBMClassifier(boosting_type='gbdt', num_leaves=40, max_depth=-1, learning_rate=0.1,
                             n_estimators=10000, subsample_for_bin=200000, objective=None,
                             class_weight=None, min_split_gain=0.0, min_child_weight=0.001,
                             min_child_samples=20, subsample=0.7, subsample_freq=1,
                             colsample_bytree=0.7,categorical_feature=[0,1,2,3,4],
                             reg_alpha=6, reg_lambda=3,
                             random_state=2021, n_jobs=-1, silent=True)
        model_file=f"{label}.k_fold{k_fold}.pickle.dat"
        if os.path.exists(model_file):
            clf=pickle.load(open(model_file, "rb"))
        else:
            print('k_fold',k_fold,label,'fitting...')
            clf.fit(train_x,train_y,eval_set=[(train_x, train_y),(evals_x, evals_y)],
                    eval_names=['train','valid'],
                    eval_metric='auc',early_stopping_rounds=50)
            print('k_fold',k_fold,label,'dumping...')
            pickle.dump(clf, open(model_file, "wb"))
        score=clf.best_score_['valid']['auc']
        evals_score.loc[k_fold,label]=score
        print('k_fold',k_fold,label,'predicting test...')
        test_pred=clf.predict_proba(test_feature,num_iteration=clf.best_iteration_)[:,1]
        submit[label+f'_k_fold{k_fold}']=test_pred
        del clf,train_x,evals_x,train_y,evals_y
        gc.collect()
        print('Freed part of the memory!')
        print('k_fold',k_fold,label,'elapsed',int(time.time()-s),'s')
    del train_feature,test_feature
    gc.collect()
    print(label,'total elapsed',int(time.time()-sj),'s')
print('total elapsed',int(time.time()-start_time),'s')
evals_score.to_csv('evals_score_model2.csv',index=False)
# Rough offline score: weighted (4/3/2/1) average of the per-label mean fold AUCs.
score=[]
for col in evals_score.columns.to_list():
    score.append(evals_score[col].mean())
print('offline score',sum((np.array(score)*np.array([4,3,2,1]))/10))
# Average the five fold predictions per label and write the submission file.
for label in ys:
    cols=[i for i in submit.columns.to_list() if label+'_k_fold' in i]
    submit[label]=submit[cols].mean(axis=1)
submit[[i for i in submit.columns.to_list() if 'k_fold' not in i]].to_csv('submit_a_model2.csv',index=False)
- Model fusion
# Blend: simple arithmetic mean of the two models' predicted probabilities.
submit_a_model1=pd.read_csv('submit_a_model1.csv')
submit_a_model2=pd.read_csv('submit_a_model2.csv')
submit_a=submit_a_model1[['userid','feedid']].copy()
for col in submit_a_model1.columns.to_list()[2:]:
    submit_a[col]=(submit_a_model1[col]+submit_a_model2[col])/2
submit_a.to_csv('submit_a.csv',index=False)
Scores
Final result on leaderboard B:
Reference links
Recommended reading
Solution ideas for the WeChat Channels recommendation problem (微信视频号推荐算法解题思路)
A WeChat Big Data Challenge trick: reaching 0.706+ with just the 3 IDs (微信大数据竞赛Trick–如何3ID上0.706+)
Team OTTO's tree-model solution (OTTO队伍的树模型方案)
That is all for this post. If you would like the full source code, follow the WeChat official account 《Python王者之路》 and reply with the keyword 20210811 to receive it.
Closing thoughts
Although I did not make it into the second round, there were a few reasons: I had little time to spend on the competition; my computer's hardware is limited, so the code needed constant memory optimization and a single model run to validate an idea took a very long time; and the main reason, frankly, is that I am still not good enough and need to keep improving.
Still, I learned a great deal along the way: memory management, working with sparse matrices for the first time, selecting features by feature importance, tricks for blending multiple models to gain score...
Because the whole competition was done with LightGBM alone and I never tried a neural-network model, I went back and trained a DeepFM model after submissions for the preliminary round had closed. Unfortunately the idea could no longer be validated online, though it might well have added a little more score; a rough sketch of that kind of model is given below.
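This is only a minimal, hedged sketch of what such a DeepFM could look like, written with the third-party deepctr-torch library and random stand-in data; the column names (emb_0 ... emb_31), embedding sizes, and training settings are illustrative assumptions, not the script I actually trained.
import numpy as np
import pandas as pd
import torch
from deepctr_torch.inputs import SparseFeat, DenseFeat, get_feature_names
from deepctr_torch.models import DeepFM

# Tiny random stand-in for the real training frame: five label-encoded ids,
# 32 PCA embedding dims (named emb_0 ... emb_31 here) and one target column.
rng = np.random.default_rng(2021)
n = 1000
sparse_features = ['userid', 'feedid', 'authorid', 'bgm_song_id', 'bgm_singer_id']
dense_features = [f'emb_{i}' for i in range(32)]
df = pd.DataFrame({col: rng.integers(0, 100, n) for col in sparse_features})
for col in dense_features:
    df[col] = rng.normal(size=n)
df['read_comment'] = rng.integers(0, 2, n)

feature_columns = [SparseFeat(col, vocabulary_size=int(df[col].max()) + 1, embedding_dim=16)
                   for col in sparse_features]
feature_columns += [DenseFeat(col, 1) for col in dense_features]
feature_names = get_feature_names(feature_columns)
model_input = {name: df[name].values for name in feature_names}

device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
model = DeepFM(feature_columns, feature_columns, task='binary', device=device)
model.compile('adagrad', 'binary_crossentropy', metrics=['auc'])
model.fit(model_input, df['read_comment'].values, batch_size=256, epochs=1, verbose=2, validation_split=0.1)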
Either way, writing the whole process down makes it a record of growth; next time will be better!!