极其简单的协同过滤算法

        几乎完全按照王喆的《深度学习推荐系统》复现。相似度采用皮尔逊相关系数,即用物品平均分对各条评分做中心化修正;如果某部电影的评分全部相同(方差为零),则对该电影改用原始评分,部分退化成余弦相似度。但最终效果较差,MAE 高达惊人的 0.9,运算速度也极其缓慢,当然这也可能与所用语言有关。由于完全是手写实现,可以很明显地感受到 Python 的笨拙,不如 C++ 那么轻便。

        

import random
import math
from operator import itemgetter

class ItemBasedCF():
    """Item-based collaborative filtering over (user, movie, rating) triples.

    Typical use: ``get_dataset`` -> ``calcu_movie_sim`` -> ``evaluate``.

    Item similarity is the Pearson correlation of the two movies' rating
    vectors, each centred on that movie's own mean rating.  A movie whose
    ratings are all identical has zero variance, so its raw ratings are used
    instead; the measure then partially degrades to cosine similarity.
    """

    # Initialise parameters.
    def __init__(self):
        # Consider the 20 most similar movies; recommend 10 movies per user.
        self.n_sim_movie = 20
        self.n_rec_movie = 10

        # Train / test split.
        # trainSet is keyed movie -> {user: rating};
        # testSet is keyed user -> {movie: rating}.
        self.trainSet = {}
        self.testSet = {}

        # user_movie_matrix: user -> {movie: rating} (training ratings only).
        # movie_sim_matrix: movie -> {movie: similarity}.
        # movie_popular / movie_count are initialised here but not used by
        # any method of this class.
        self.user_movie_matrix = {}
        self.movie_sim_matrix = {}
        self.movie_popular = {}
        self.movie_count = 0

        print("相似电影个数是 %d 个" %self.n_sim_movie)
        print("推荐电影个数是 %d 个" % self.n_rec_movie)

    def get_dataset(self, filename, pivot=0.7):
        """Read ratings from *filename* and split them randomly.

        Each data line must hold four tab-separated fields:
        ``user  movie  rating  timestamp``.  A rating goes to the training
        set with probability *pivot*, otherwise to the test set.

        The former hard-coded debug sample (user '405', movie '288') and the
        unconditional print of it were removed: the print raised ``KeyError``
        for any data set that did not contain that exact pair.

        Args:
            filename: path of the ratings file.
            pivot: probability that a rating is placed in the training set.

        Raises:
            ValueError: if a non-empty line does not have exactly four
                tab-separated fields, or its rating is not a number.
        """
        train_len = 0
        test_len = 0
        for line in self.load_file(filename):
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            user, movie, rating, _timestamp = line.split('\t')
            score = float(rating)
            if random.random() < pivot:
                self.user_movie_matrix.setdefault(user, {})[movie] = score
                self.trainSet.setdefault(movie, {})[user] = score
                train_len += 1
            else:
                self.testSet.setdefault(user, {})[movie] = score
                test_len += 1
        print("成功分隔训练集和测试集")
        print("训练集长度:%s" % train_len)
        print("测试集长度:%s" % test_len)

    # Read the file and yield it line by line.
    def load_file(self, filename):
        """Yield every line of *filename* except the first, without CR/LF.

        NOTE(review): the first line is always skipped, presumably as a
        header.  If the input has no header row, the first rating is
        silently dropped -- confirm against the actual data file.
        """
        with open(filename, 'r') as f:
            for i, line in enumerate(f):
                if i == 0:
                    continue
                yield line.strip('\r\n')
        print("%s数据读取成功!" % filename)

    @staticmethod
    def _centered_profile(ratings):
        """Return ``(vector, norm)`` for one movie's ``{user: rating}`` dict.

        The vector holds each rating minus the movie's mean rating.  When
        all ratings are equal (zero variance) the raw ratings are kept, so
        the vector is not all zeros.  ``norm`` is the Euclidean length of
        the vector taken over all of the movie's raters.
        """
        if not ratings:
            return {}, 0.0
        values = list(ratings.values())
        if min(values) == max(values):
            offset = 0.0
        else:
            offset = sum(values) / len(values)
        vector = {user: score - offset for user, score in ratings.items()}
        norm = math.sqrt(sum(dev * dev for dev in vector.values()))
        return vector, norm

    @staticmethod
    def _similarity(vec1, norm1, vec2, norm2):
        """Return dot(vec1, vec2) over shared users divided by norm1 * norm2."""
        denom = norm1 * norm2
        if denom == 0:
            # Only reachable when a vector is empty or all ratings are 0.
            return 0.0
        if len(vec1) > len(vec2):
            # Walk the shorter vector and probe the longer one.
            vec1, vec2 = vec2, vec1
        dot = sum(dev * vec2[user] for user, dev in vec1.items() if user in vec2)
        return dot / denom

    # Compute the similarity matrix.
    def calcu_movie_sim(self):
        """Fill ``movie_sim_matrix`` with the similarity of every movie pair.

        The matrix is symmetric and includes each movie's similarity to
        itself.  Every term of the formula is accumulated exactly once; the
        previous implementation added the numerator and both norms a second
        time (the second pass even used uncentred squares for one norm), so
        its result was not a Pearson correlation.

        Per-movie means, centred vectors and norms are computed once up
        front rather than once per pair, and each unordered pair is
        evaluated only once.
        """
        print("计算相似度矩阵")
        profiles = {
            movie: self._centered_profile(ratings)
            for movie, ratings in self.trainSet.items()
        }
        movies = list(profiles)
        sim = self.movie_sim_matrix
        for movie in movies:
            sim.setdefault(movie, {})
        for idx, movie in enumerate(movies):
            vec1, norm1 = profiles[movie]
            # movies[idx:] starts at the movie itself, so the diagonal is set.
            for movie1 in movies[idx:]:
                vec2, norm2 = profiles[movie1]
                score = self._similarity(vec1, norm1, vec2, norm2)
                sim[movie][movie1] = score
                sim[movie1][movie] = score
        print("成功计算相似度矩阵")

    # Recommend movies to one user.
    def recommend(self, user):
        """Return up to ``n_rec_movie`` ``(movie, predicted_rating)`` pairs.

        For every movie the user rated in the training data, the
        ``n_sim_movie`` most similar *other* movies with positive similarity
        are taken as neighbours.  A candidate's prediction is the
        similarity-weighted mean of the user's ratings of the watched movies
        that list it as a neighbour.

        Only positive similarities are used: with signed weights the
        normalising sum can cancel to (almost) zero and produce unbounded
        predictions.  With positive weights the prediction always lies
        between the user's lowest and highest contributing rating.  The
        movie itself is excluded so it does not occupy a neighbour slot.

        Args:
            user: user id as stored in ``user_movie_matrix``.

        Returns:
            List of ``(movie, score)`` tuples sorted by score, best first.

        Raises:
            KeyError: if *user* has no training ratings, or a watched movie
                is missing from ``movie_sim_matrix``.
        """
        n = self.n_rec_movie
        k = self.n_sim_movie
        watched = self.user_movie_matrix[user]
        weighted_total = {}
        weight_total = {}

        for movie, rating in watched.items():
            neighbours = [
                (other, w)
                for other, w in self.movie_sim_matrix[movie].items()
                if other != movie and w > 0
            ]
            neighbours.sort(key=itemgetter(1), reverse=True)
            for related, w in neighbours[:k]:
                if related in watched:
                    continue
                weight = float(w)
                weighted_total[related] = (
                    weighted_total.get(related, 0.0) + weight * float(rating)
                )
                weight_total[related] = weight_total.get(related, 0.0) + weight

        # Every stored weight is > 0, so the division is always defined.
        rank = {
            related: total / weight_total[related]
            for related, total in weighted_total.items()
        }
        print("预测")
        print(rank)
        return sorted(rank.items(), key=itemgetter(1), reverse=True)[:n]

    # Build recommendation lists and evaluate precision, recall and MAE.
    def evaluate(self):
        """Recommend for every training user and print the metrics.

        A hit is a recommended movie that appears in the user's test
        ratings; MAE is averaged over the hits only.

        Returns:
            ``(precision, recall, mae)``.  Precision and recall are 0.0 when
            their denominator is zero; MAE is ``nan`` when there were no
            hits.  Previously each of these cases raised
            ``ZeroDivisionError`` and nothing was returned.
        """
        print("开始评估")
        n = self.n_rec_movie
        hit = 0
        rec_count = 0
        test_count = 0
        abs_error = 0.0

        for user in self.user_movie_matrix:
            test_movies = self.testSet.get(user, {})
            rec_movies = self.recommend(user)
            print("用户id:%s" %(user))
            for movie, w in rec_movies:
                if movie in test_movies:
                    actual = float(test_movies[movie])
                    hit += 1
                    print("电影id:%s,电影预测评分%.3f电影实际评分%.3f" %(movie,float(w),actual))
                    abs_error += abs(w - actual)
            rec_count += n
            test_count += len(test_movies)

        mae = abs_error / hit if hit else float("nan")
        precision = hit / rec_count if rec_count else 0.0
        recall = hit / test_count if test_count else 0.0
        print("推荐准确度是%.3f,召回率是%.3f,MAE是%.3f" %(precision,recall,mae))
        return precision, recall, mae


if __name__ == '__main__':
    # Script entry point: load the ratings file, split it, build the
    # item-item similarity matrix, then print the evaluation metrics.
    ratings_path = r'ml-100k/u.data'
    recommender = ItemBasedCF()
    recommender.get_dataset(ratings_path)
    recommender.calcu_movie_sim()
    recommender.evaluate()

  • 0
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值