推荐系统实践--UserCF

原代码参考:https://github.com/Lockvictor/MovieLens-RecSys/blob/8d4575c274f7bf87ddb80fe2b448801901070618/usercf.py

下面的代码模仿上述实现编写,代码结构完全参照《推荐系统实践》一书中的源码组织:

#coding=UTF-8
'''Created on 2018/8'''
##############  MovieLens 数据集 H:\movies\ml-1m  ##############
import sys
import random
import math
import os
from operator import itemgetter
from collections import defaultdict

random.seed(0) # fixed seed so that later random draws are the same on every run

class UserCF(object):
    """User-based collaborative filtering (UserCF) recommender.

    Intended call order: ``SplitData`` -> ``UserSimilarity`` -> ``evaluate``
    (or ``Recommend`` for a single user).  Ratings are kept as nested dicts
    of the form ``{user: {movie: rating}}``.
    """

    def __init__(self):
        """Create an empty model with the default hyper-parameters."""
        self.Train_set = {}  # training set: user -> {movie: rating}
        self.Test_set = {}   # test set: user -> {movie: rating}

        self.M_all_train_test = 6  # number of folds the ratings are split into
        self.k_split_data = 1  # fold used as the test set, in [0, M-1]
        self.k_sim_user = 20  # number of most similar users consulted
        self.n_rec_movie = 10  # number of movies recommended per user

        self.w_sim_matrix = {}   # user -> {other user: similarity}
        self.popular_movie = {}   # movie -> number of training users who rated it
        self.rec_movie = {}   # movie -> score from the most recent Recommend() call
        self.count_movie = 0   # number of distinct movies in the training set

        self._log('similar user NO. = %d' % self.k_sim_user)
        self._log('recommended movie NO. = %d' % self.n_rec_movie)

    @staticmethod
    def _log(message):
        """Write one diagnostic line to stderr.

        ``print(message, sys.stderr)`` would print the stream object itself
        to stdout, so the stream is written to explicitly instead.
        """
        sys.stderr.write(message + '\n')

    @staticmethod
    def _ratio(numerator, denominator):
        """Return numerator / denominator as a float, or 0.0 if the denominator is 0."""
        if not denominator:
            return 0.0
        return numerator / (1.0 * denominator)

    def loaddata(self, filename):
        """Yield the lines of ``filename`` with trailing tab/newline characters removed.

        Progress is reported on stderr every 100000 lines.  The file is
        opened in a ``with`` block so it is closed even when the generator
        is abandoned early or the consumer raises.
        """
        with open(filename, 'r') as fp:
            for i, line in enumerate(fp):
                yield line.strip('\t\n')
                if i % 100000 == 0:
                    self._log('%s load %s' % (filename, i))
        self._log('%s load END ' % filename)

    def SplitData(self, filename):
        """Read ``user::movie::rating::timestamp`` lines and split them into train/test.

        Each rating draws a fold number in ``[0, M-1]``; ratings whose fold
        equals ``k_split_data`` go to ``Test_set``, all others to
        ``Train_set``.  Blank lines are skipped.
        """
        M = self.M_all_train_test
        k = self.k_split_data
        testset_len = 0
        trainset_len = 0
        for line in self.loaddata(filename):
            if not line:
                continue
            user, movie, rating, _ = line.split('::')
            # randint is inclusive on both ends, so the upper bound must be
            # M - 1 to obtain exactly M folds.
            if random.randint(0, M - 1) == k:
                self.Test_set.setdefault(user, {})[movie] = float(rating)
                testset_len += 1
            else:
                self.Train_set.setdefault(user, {})[movie] = float(rating)
                trainset_len += 1

        self._log('split data: train set = %s, test set = %s' % (trainset_len, testset_len))

    def UserSimilarity(self):
        """Build ``w_sim_matrix``, ``popular_movie`` and ``count_movie`` from ``Train_set``.

        Every movie shared by two users adds ``1 / log(1 + n)`` to their
        co-occurrence weight, where ``n`` is the number of training users who
        rated that movie.  The weight is then divided by
        ``sqrt(|movies(u)| * |movies(v)|)``.  All three results are rebuilt
        from scratch, so calling the method again does not accumulate onto
        earlier values.
        """
        item_users = {}
        popular_movie = {}
        for user, items in self.Train_set.items():  # build the inverted index
            for movie in items:
                item_users.setdefault(movie, set()).add(user)
                # popularity = number of training users who rated the movie
                popular_movie[movie] = popular_movie.get(movie, 0) + 1
        self.popular_movie = popular_movie

        self.count_movie = len(item_users)
        self._log('count movie  =  %d' % self.count_movie)

        sim = {}
        for movie, users in item_users.items():  # accumulate co-occurrence weights
            weight = 1.0 / math.log(1 + len(users))  # loop-invariant per movie
            for u in users:
                row = sim.setdefault(u, defaultdict(int))
                for v in users:
                    if u == v:
                        continue
                    row[v] += weight

        for u, row in sim.items():  # normalise by the users' activity
            for v, count in row.items():
                # only existing keys are reassigned, which is safe while iterating
                row[v] = count / math.sqrt(len(self.Train_set[u]) * len(self.Train_set[v]))
        self.w_sim_matrix = sim

        self._log('calculate user similarity matrix END ')

    def Recommend(self, user):
        """Return up to ``n_rec_movie`` ``(movie, score)`` pairs for ``user``, best first.

        Scores are summed over the ``k_sim_user`` most similar users as
        ``similarity * rating``; movies the user rated in the training set
        are excluded.  A user unknown to the model gets an empty list.

        Scores are collected in a fresh dict on every call, so results for
        one user never leak into another user's ranking.  ``self.rec_movie``
        is rebound to that dict and holds the scores of the latest call.
        """
        watched_movies = self.Train_set.get(user, {})
        neighbours = sorted(self.w_sim_matrix.get(user, {}).items(),
                            key=itemgetter(1), reverse=True)[:self.k_sim_user]
        scores = {}
        for v, wuv in neighbours:
            for movie, rat in self.Train_set[v].items():
                if movie in watched_movies:
                    continue
                scores[movie] = scores.get(movie, 0) + wuv * rat
        self.rec_movie = scores
        return sorted(scores.items(), key=itemgetter(1), reverse=True)[:self.n_rec_movie]

    def evaluate(self):
        """Print recall, precision, coverage and popularity over all training users.

        Intermediate counters and progress go to stderr; the final metrics
        line goes to stdout.  A metric whose denominator is zero is reported
        as 0.0 instead of raising ZeroDivisionError.
        """
        num = 0  # recommended movies that also appear in the user's test set
        recall_den = 0  # recall denominator: total number of test ratings
        precision_den = 0  # precision denominator: n_rec_movie per user
        recommend_movies = set()  # distinct movies recommended to anyone
        popularity_num = 0  # sum of log(1 + popularity) over recommendations

        for i, user in enumerate(self.Train_set):
            if i % 1000 == 0:
                self._log('i = %d' % i)

            test = self.Test_set.get(user, {})
            rank = self.Recommend(user)
            recall_den += len(test)
            precision_den += self.n_rec_movie

            # rank holds (movie, score) pairs; the movie id is used whole
            # (indexing it would take only the first character of the id).
            for movie, _ in rank:
                recommend_movies.add(movie)
                popularity_num += math.log(1 + self.popular_movie[movie])
                if movie in test:
                    num += 1

        self._log('num = %d, recall den = %d, precision den = %d' % (num, recall_den, precision_den))
        self._log('recommend movies = %d' % len(recommend_movies))
        self._log('count movie = %d' % self.count_movie)

        recall = self._ratio(num, recall_den)
        precision = self._ratio(num, precision_den)
        coverage = self._ratio(len(recommend_movies), self.count_movie)
        popularity = self._ratio(popularity_num, precision_den)

        print('recall = %.6f, precision = %.6f, coverage = %.6f, popularity = %.6f'
              % (recall, precision, coverage, popularity))

if __name__ == '__main__':
    # Run the whole pipeline on the MovieLens 1M ratings file:
    # split the ratings, build the user similarity matrix, print the metrics.
    recommender = UserCF()
    recommender.SplitData(os.path.join('ml-1m', 'ratings.dat'))
    recommender.UserSimilarity()
    recommender.evaluate()

 

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值