# 二、数据概况

https://tianchi.aliyun.com/competition/entrance/531842/learn

# 三、评价方式理解

$$score(user) = \sum_{k=1}^{5} \frac{s(user, k)}{k}$$

# 五、Baseline

## 1、导包

# import packages
import time, math, os
from tqdm import tqdm
import gc
import pickle
import random
from datetime import datetime
from operator import itemgetter
import numpy as np
import pandas as pd
import warnings
import collections
from collections import defaultdict
warnings.filterwarnings('ignore')

data_path = './data_raw/'
save_path = './tmp_results/'


## 2、读取采样或全局数据

def get_all_click_sample(data_path,sample_nums =10000):
    """Load the training click log and keep only a random sample of users.

    Args:
        data_path: Directory prefix (including the trailing separator) that
            holds the click-log CSV.
        sample_nums: Number of distinct users to sample. Capped at the number
            of users present in the log.

    Returns:
        A DataFrame with every click of the sampled users, de-duplicated on
        (user_id, click_article_id, click_timestamp).
    """
    # NOTE(review): the load step was absent from this function, so
    # `all_click` was read before assignment. The file name below is assumed
    # to be the training click log of the data set -- confirm.
    all_click = pd.read_csv(data_path + 'train_click_log.csv')
    all_users_id = all_click.user_id.unique()

    # Sampling without replacement raises when more users are requested than
    # exist, so cap the sample size.
    sample_size = min(sample_nums, len(all_users_id))
    sample_user_ids = np.random.choice(all_users_id, size=sample_size, replace=False)
    all_click = all_click[all_click['user_id'].isin(sample_user_ids)]

    all_click = all_click.drop_duplicates(['user_id', 'click_article_id', 'click_timestamp'])
    return all_click
def get_all_click_df(data_path = './data_raw/',offline = True):
    """Load the full click log used to build the recommender.

    Args:
        data_path: Directory prefix (including the trailing separator) that
            holds the click-log CSVs.
        offline: When True only the training clicks are used; when False the
            test clicks are stacked underneath the training clicks.

    Returns:
        A DataFrame de-duplicated on
        (user_id, click_article_id, click_timestamp).
    """
    # NOTE(review): both branch bodies were empty in the source; the file
    # names below are assumed from the data set layout -- confirm.
    trn_click = pd.read_csv(data_path + 'train_click_log.csv')
    if offline:
        all_click = trn_click
    else:
        tst_click = pd.read_csv(data_path + 'testA_click_log.csv')
        # DataFrame.append was removed in pandas 2.0; concat keeps the same
        # row order and the same (non-reset) index.
        all_click = pd.concat([trn_click, tst_click])

    all_click = all_click.drop_duplicates(['user_id', 'click_article_id', 'click_timestamp'])
    return all_click



## 3、获取 用户-文章-点击时间列表

def get_user_item_time(click_df):
    """Build each user's click history ordered by click time.

    Args:
        click_df: DataFrame with the columns `user_id`, `click_article_id`
            and `click_timestamp`.

    Returns:
        A dict mapping user_id to a list of
        (click_article_id, click_timestamp) tuples in ascending timestamp
        order. An empty frame yields an empty dict.
    """
    # A stable sort keeps the incoming row order for clicks that share a
    # timestamp, which makes the result deterministic.
    click_df = click_df.sort_values('click_timestamp', kind='mergesort')

    # Iterating the groups directly avoids the tuple-style column selection
    # on a groupby (rejected by pandas 2.x) and the reset_index/rename
    # round trip on an unnamed column.
    user_item_time_dict = {}
    for user_id, user_clicks in click_df.groupby('user_id'):
        user_item_time_dict[user_id] = list(
            zip(user_clicks['click_article_id'], user_clicks['click_timestamp'])
        )

    return user_item_time_dict
# Demo call: the returned dict is not stored, so in a plain script this line
# has no visible effect.
# NOTE(review): `all_click_sample` is not defined in the visible source --
# confirm it is created before this line runs.
get_user_item_time(all_click_sample)


## 4、获取点击最多的topk文章

def get_item_topk_click(click_df,k):
    """Return the ids of the k most-clicked articles.

    Args:
        click_df: DataFrame with a `click_article_id` column.
        k: Maximum number of article ids to return.

    Returns:
        A pandas Index of article ids ordered from most to least clicked,
        holding at most k entries.
    """
    topk_click = click_df['click_article_id'].value_counts().index[:k]
    # Without this return the function produced None, which callers cannot
    # iterate.
    return topk_click


## 5、itemcf的物品相似度计算

$$\frac{1}{\log(1 + N(i))}$$

def itemcf_sim(df):
    """Compute the item-to-item co-click similarity and pickle it.

    For every user, each ordered pair of distinct items in the click history
    adds 1 / log(1 + history length) to the pair's weight. Each weight is then
    divided by sqrt(item_cnt[i] * item_cnt[j]), where item_cnt counts the
    history entries of an item across all users.

    Args:
        df: Click DataFrame accepted by `get_user_item_time`.

    Returns:
        A nested dict {item_i: {item_j: similarity}}. The same dict is written
        to `save_path + 'itemcf_i2i_sim.pkl'`.
    """
    user_item_time_dict = get_user_item_time(df)

    i2i_sim = {}
    item_cnt = defaultdict(int)

    for item_time_list in tqdm(user_item_time_dict.values()):
        if not item_time_list:
            # Nothing to pair up, and log(1) == 0 would divide by zero below.
            continue

        # The weight depends only on the user's history length, so compute it
        # once per user instead of once per item pair.
        user_weight = 1 / math.log(len(item_time_list) + 1)

        for i, i_click_time in item_time_list:
            item_cnt[i] += 1
            i2i_sim.setdefault(i, {})

            for j, j_click_time in item_time_list:
                if i == j:
                    continue
                i2i_sim[i].setdefault(j, 0)
                i2i_sim[i][j] += user_weight

    # Normalise in place. Only existing keys are overwritten, so mutating the
    # inner dicts while iterating them is safe.
    for i, related_items in i2i_sim.items():
        for j, wij in related_items.items():
            related_items[j] = wij / math.sqrt(item_cnt[i] * item_cnt[j])

    # Use a context manager so the file handle is flushed and closed.
    with open(save_path + 'itemcf_i2i_sim.pkl', 'wb') as sim_file:
        pickle.dump(i2i_sim, sim_file)

    return i2i_sim


## 6、itemcf的文章推荐

def item_based_recommend(user_id,user_item_time_dict,i2i_sim,sim_item_topk,recall_item_num,item_topk_click):
    """Recall articles for one user from the item-to-item similarity table.

    Args:
        user_id: Key into `user_item_time_dict`.
        user_item_time_dict: {user_id: [(item, click_time), ...]}.
        i2i_sim: {item_i: {item_j: similarity}}.
        sim_item_topk: Number of most-similar neighbours taken per history
            item.
        recall_item_num: Number of articles to return.
        item_topk_click: Popular article ids, most clicked first, used to pad
            the result when too few neighbours are found.

    Returns:
        A list of (item, score) tuples sorted by descending score, at most
        `recall_item_num` long. Padded items get the scores -100, -101, ...
        by their position in `item_topk_click`, so they rank below
        similarity-based items as long as similarities are non-negative.

    Raises:
        KeyError: If `user_id` is not in `user_item_time_dict`.
    """
    user_hist_items = user_item_time_dict[user_id]
    # The history holds (item, time) tuples, so membership must be tested
    # against the bare item ids.
    clicked_items = {item for item, _ in user_hist_items}
    item_rank = {}

    for i, _click_time in user_hist_items:
        # Items without a similarity row contribute no neighbours.
        neighbours = sorted(i2i_sim.get(i, {}).items(), key=itemgetter(1), reverse=True)
        for j, wij in neighbours[:sim_item_topk]:
            if j in clicked_items:
                continue
            item_rank.setdefault(j, 0)
            item_rank[j] += wij

    if len(item_rank) < recall_item_num:
        for i, item in enumerate(item_topk_click):
            # Test against the keys so that a filler never overwrites a real
            # similarity score.
            if item in item_rank:
                continue
            item_rank[item] = -i - 100
            if len(item_rank) >= recall_item_num:
                break

    item_rank = sorted(item_rank.items(), key=itemgetter(1), reverse=True)[:recall_item_num]

    return item_rank



## 7、召回字典转换成df

# Flatten {user: [(item, score), ...]} into one [user, item, score] row per
# recalled article, then wrap the rows in a DataFrame.
user_item_score_list = []
for user, items in tqdm(user_recall_items_dict.items()):
    user_item_score_list.extend([user, item, score] for item, score in items)

recall_df = pd.DataFrame(
    user_item_score_list,
    columns=['user_id', 'click_article_id', 'pred_score'],
)


## 8、生成提交文件

# 生成提交文件
def submit(recall_df, topk=5, model_name=None):
    """Write the top-k recalled articles per user as a submission CSV.

    Args:
        recall_df: DataFrame with the columns `user_id`, `click_article_id`
            and `pred_score`. The caller's frame is not modified.
        topk: Number of articles kept per user; every user must have at
            least this many rows.
        model_name: Optional prefix for the output file name. When None the
            file is named by date only.

    Raises:
        AssertionError: If any user has fewer than `topk` recalled articles.

    The file is written to
    `save_path + [model_name + '_'] + <MM-DD> + '.csv'` with the columns
    `user_id`, `article_1` ... `article_<topk>`.
    """
    recall_df = recall_df.sort_values(by=['user_id', 'pred_score'])
    recall_df['rank'] = recall_df.groupby(['user_id'])['pred_score'].rank(ascending=False, method='first')

    # Check that every user has at least `topk` articles.
    max_rank_per_user = recall_df.groupby('user_id')['rank'].max()
    assert max_rank_per_user.min() >= topk, 'every user needs at least topk recalled articles'

    del recall_df['pred_score']
    # Named submit_df so the local does not shadow this function.
    submit_df = recall_df[recall_df['rank'] <= topk].set_index(['user_id', 'rank']).unstack(-1).reset_index()

    # rank() produces floats, so the rank labels arrive as 1.0, 2.0, ...;
    # turn them into ints and leave the '' label of user_id untouched.
    submit_df.columns = [
        int(col) if isinstance(col, (int, float)) else col
        for col in submit_df.columns.droplevel(0)
    ]
    # Name the columns per the submission format, for any topk.
    column_names = {'': 'user_id'}
    for position in range(1, topk + 1):
        column_names[position] = 'article_' + str(position)
    submit_df = submit_df.rename(columns=column_names)

    # model_name defaults to None, which cannot be concatenated to a str.
    prefix = model_name + '_' if model_name else ''
    save_name = save_path + prefix + datetime.today().strftime('%m-%d') + '.csv'
    # NOTE(review): the source built save_name but never wrote the file;
    # writing it here is assumed to be the intent -- confirm.
    submit_df.to_csv(save_name, index=False, header=True)


# 六、总结

• 点赞
• 评论 3
• 分享
x

海报分享

扫一扫，分享海报

• 收藏
• 打赏

打赏

热爱数学的小菜鸡

你的鼓励将是我创作的最大动力

C币 余额
2C币 4C币 6C币 10C币 20C币 50C币
• 举报
• 一键三连

点赞Mark关注该博主, 随时了解TA的最新博文

07-13 2447

08-09 5519
02-09 6481
09-28
06-06