基于物品的协同过滤(ItemCF)代码

算法介绍链接:https://www.jianshu.com/p/c8a905e0164b


# import packages
import time, math, os
#from tqdm import tqdm
import gc
import pickle
import random
from datetime import datetime
from operator import itemgetter
import numpy as np
import pandas as pd
import warnings
from collections import defaultdict
import collections
warnings.filterwarnings('ignore')


data_path='...'           # directory containing the test data (placeholder; must end with a path separator because it is concatenated directly)
save_path='...'       # directory where the similarity matrix and click-history pickles are stored (same concatenation rule)
# Load the input data; later code reads its user_id, click_article_id and click_timestamp columns
data=pd.read_csv(data_path + '新闻推荐测试数据.csv') 


# Most-clicked articles, used to pad the collaborative-filtering recall when it comes up short
def get_item_topk_click(click_df, k):
    """Return the ids of the ``k`` most frequently clicked articles.

    :param click_df: DataFrame with a ``click_article_id`` column
    :param k: number of article ids to keep
    :return: pandas Index of article ids, most-clicked first
    """
    # value_counts() already orders by descending frequency.
    click_counts = click_df['click_article_id'].value_counts()
    return click_counts.head(k).index
# item_topk_click=get_item_topk_click(data_before,50)
# item_topk_click

## Per-user click history ordered by click time: {user1: {item1: time1, item2: time2, ...}, ...}
def get_user_item_time(click_df):
    """Build each user's clicked-article -> click-timestamp mapping.

    :param click_df: DataFrame with ``user_id``, ``click_article_id`` and
        ``click_timestamp`` columns
    :return: dict ``{user_id: {article_id: timestamp, ...}, ...}``; inner dicts
        are in ascending click-time order, and if a user clicked the same
        article several times the latest timestamp is kept
    """
    click_df = click_df.sort_values('click_timestamp')

    def make_item_time_pair(df):
        return dict(zip(df['click_article_id'], df['click_timestamp']))

    # Select the columns with a *list*: tuple-style selection
    # (groupby(...)['a', 'b']) was removed in pandas 2.0 and raises there.
    item_time_series = (
        click_df.groupby('user_id')[['click_article_id', 'click_timestamp']]
        .apply(make_item_time_pair)
    )

    # Pair each group key with its dict directly; this also yields {} for an
    # empty input instead of failing on a missing column.
    return dict(zip(item_time_series.index, item_time_series))

user_item_time_dict = get_user_item_time(data)
# Persist the per-user click history; the context manager guarantees the file
# is flushed and closed before it is read back later in the script.
with open(save_path + 'user_item_time_dict.pkl', 'wb') as pkl_file:
    pickle.dump(user_item_time_dict, pkl_file)
                
# Build the item-item similarity matrix {item_i: {item_j: sim_ij, ...}, ...}
def itemcf_sim(user_item_time_dict):
    """Compute item-to-item similarities from user click histories.

    Co-occurrence of two items in one user's history contributes
    ``1 / log(len(history) + 1)``, so very active users count less. The
    summed co-occurrence is normalised by ``sqrt(N[i] * N[j])`` where ``N`` is
    the number of histories containing the item with a positive value.

    The result is also pickled to ``save_path + 'itemcf_i2i_sim.pkl'``
    (``save_path`` is a module-level global).

    :param user_item_time_dict: dict ``{user_id: {item_id: value, ...}, ...}``;
        only entries whose value is > 0 are counted
    :return: dict ``{item_i: {item_j: similarity, ...}, ...}``; every item that
        shares a history with ``item_i`` (including ``item_i`` itself, with
        similarity 0.0) has an entry
    """
    item_cnt = {}   # number of histories in which each item appears with a positive value
    co_cnt = {}     # activity-weighted co-occurrence between item pairs
    for items in user_item_time_dict.values():
        if not items:
            # log(0 + 1) == 0 would divide by zero below, and an empty
            # history contributes nothing anyway.
            continue
        # Penalise active users: the weight is constant for one history.
        user_weight = 1 / math.log(len(items) + 1)
        for i, value_i in items.items():
            item_cnt.setdefault(i, 0)
            if value_i > 0.0:
                item_cnt[i] += 1
            row = co_cnt.setdefault(i, {})
            for j, value_j in items.items():
                row.setdefault(j, 0.0)
                if i != j and value_i > 0.0 and value_j > 0.0:
                    row[j] += user_weight

    # Normalise the co-occurrence counts into similarities.
    i2i_sim = {}
    for i, related_items in co_cnt.items():
        sim_row = i2i_sim.setdefault(i, {})
        for j, cij in related_items.items():
            denom = math.sqrt(item_cnt[i] * item_cnt[j])
            # An item that never had a positive value has a zero count; its
            # co-occurrence is necessarily 0, so report 0.0 instead of
            # raising ZeroDivisionError.
            sim_row[j] = cij / denom if denom else 0.0

    # Save the similarity matrix; the context manager closes the file handle.
    with open(save_path + 'itemcf_i2i_sim.pkl', 'wb') as pkl_file:
        pickle.dump(i2i_sim, pkl_file)

    return i2i_sim

i2i_sim=itemcf_sim(user_item_time_dict) # build the item-item similarity matrix i2i_sim (itemcf_sim also pickles it)

# Item-to-item (i2i) recall based on item similarity
def item_based_recommend(user_id, user_item_time_dict, i2i_sim, sim_item_topk, recall_item_num, item_topk_click):
    """
        Recall articles for one user with item-based collaborative filtering.

        :param user_id: user id
        :param user_item_time_dict: dict, per-user click history
            {user1: {item1: time1, item2: time2, ...}, ...}
        :param i2i_sim: dict, item similarity matrix {item_i: {item_j: sim, ...}, ...}
        :param sim_item_topk: int, number of most-similar articles taken per history article
        :param recall_item_num: int, number of articles to return
        :param item_topk_click: iterable of the most-clicked article ids, used to pad the result
        :return: list of (item, score) tuples sorted by descending score,
            at most recall_item_num long

        A user missing from user_item_time_dict (cold start) and history
        items missing from i2i_sim are tolerated: they contribute no
        candidates, and the result is padded from item_topk_click.
    """
    # Articles the user already interacted with; empty for unknown users.
    user_hist_items = user_item_time_dict.get(user_id, {})

    item_rank = {}
    for i, score in user_hist_items.items():
        neighbours = sorted(i2i_sim.get(i, {}).items(), key=itemgetter(1), reverse=True)
        for j, wij in neighbours[:sim_item_topk]:
            if j in user_hist_items:
                continue

            # NOTE(review): `score` is the value stored in the user's history
            # (a click timestamp when built by get_user_item_time), so it
            # acts as the weight here -- confirm this is intended.
            # wij == i2i_sim[j][i] for the symmetric matrix itemcf_sim
            # builds, and using it avoids a second lookup that could fail.
            item_rank[j] = item_rank.get(j, 0) + wij * score

    # Not enough candidates: pad with popular articles.
    if len(item_rank) < recall_item_num:
        for i, item in enumerate(item_topk_click):
            if item in item_rank:  # padding must not overwrite a real candidate
                continue
            item_rank[item] = - i - 100  # any negative score; keeps popularity order
            if len(item_rank) == recall_item_num:
                break

    item_rank = sorted(item_rank.items(), key=itemgetter(1), reverse=True)[:recall_item_num]

    return item_rank


# Load the per-user click history written earlier in this script.
# NOTE: pickle must only be used on trusted files such as these.
with open(save_path + 'user_item_time_dict.pkl', 'rb') as pkl_file:
    user_item_time_dict = pickle.load(pkl_file)
# Load the item similarity matrix.
with open(save_path + 'itemcf_i2i_sim.pkl', 'rb') as pkl_file:
    i2i_sim = pickle.load(pkl_file)
# Number of similar articles considered per history article
sim_item_topk = 10
# Number of articles recalled per user
recall_item_num = 10
# Popular articles used for padding
item_topk_click = get_item_topk_click(data, k=20)
# Recommendations per user: {user_id: [(item, score), ...]}
user_recall_items_dict = {}
for user in data['user_id'].unique():
    user_recall_items_dict[user] = item_based_recommend(
        user, user_item_time_dict, i2i_sim, sim_item_topk, recall_item_num, item_topk_click
    )


# Flatten user_recall_items_dict into one [user, item, score] row per recommendation
user_item_score_list = [
    [uid, article_id, pred]
    for uid, recalled in user_recall_items_dict.items()
    for article_id, pred in recalled
]

recall_df = pd.DataFrame(user_item_score_list, columns=['user_id', 'click_article_id', 'pred_score'])



# Write the submission file with the top-k recommended articles per user
def submit(recall_df, topk=5, model_name=None):
    """Rank each user's recalled articles and write the top ``topk`` to CSV.

    The file is written to
    ``save_path + model_name + '_' + <MM-DD> + '.csv'`` (``save_path`` is a
    module-level global) with columns ``user_id, article_1 .. article_<topk>``.

    :param recall_df: DataFrame with ``user_id``, ``click_article_id`` and
        ``pred_score`` columns; it is not modified
    :param topk: number of articles to keep per user
    :param model_name: file-name prefix; ``None`` falls back to ``'submit'``
        (previously ``None`` crashed on string concatenation)
    :raises AssertionError: if any user has fewer than ``topk`` articles
    """
    if model_name is None:
        model_name = 'submit'

    # sort_values returns a new frame, so the caller's data stays untouched.
    ranked = recall_df.sort_values(by=['user_id', 'pred_score'])
    ranked['rank'] = ranked.groupby(['user_id'])['pred_score'].rank(ascending=False, method='first')

    # Every user must have at least topk articles. Raised explicitly so the
    # check survives `python -O`; `not (x >= topk)` also rejects NaN, which
    # is what an empty input produces.
    fewest = ranked.groupby('user_id')['rank'].max().min()
    if not fewest >= topk:
        raise AssertionError(
            'every user needs at least %s recalled articles, got a minimum of %s' % (topk, fewest)
        )

    # Pivot to one row per user and one column per rank.
    top = ranked[ranked['rank'] <= topk]
    wide = top.set_index(['user_id', 'rank'])['click_article_id'].unstack(-1)
    # Name the columns for whatever topk was requested, not just 5.
    wide.columns = ['article_%d' % int(rank) for rank in wide.columns]
    submit_df = wide.reset_index()

    save_name = save_path + model_name + '_' + datetime.today().strftime('%m-%d') + '.csv'
    submit_df.to_csv(save_name, index=False, header=True)

# Write the submission file, keeping the top 5 recalled articles per user
submit(recall_df, topk=5, model_name='协同过滤推荐列表')

  • 0
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 1
    评论
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值