《一种基于标签的电影组推荐方法》实现

本文实现了论文《一种基于标签的电影组推荐方法》(熊聪聪、林颖、史艳翠、冯阔,天津科技大学人工智能学院,天津 300457)中的部分算法。

基于标签的用户相似度

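(原文此处为两张公式图片,未能保留;以下公式根据下文代码还原,仅供对照。)

用户 $u$ 对标签 $t_j$ 的偏好(对应 `user_label`):

$$p_{u,t_j} = \frac{f_{u,t_j}}{\sum_{k} f_{u,t_k}} \cdot \log\frac{M_t}{M_{t_j}}$$

标签 $t_j$ 的重要性(权重,对应 `importanceOfLabel`):

$$w_{t_j} = \log\left(\frac{T}{T_{t_j}} \cdot \frac{N}{N_{t_j}}\right)$$

加权后的标签偏好(对应 `user_label_w`):$\hat{p}_{u,t_j} = p_{u,t_j} \cdot w_{t_j}$

基于标签的用户相似度(在两个用户共同的标签集合 $L$ 上计算皮尔逊相关系数,对应 `calc_user_sim_of_label`):

$$sim_{label}(u_i,u_j) = \frac{\sum_{t \in L}(\hat{p}_{u_i,t}-\bar{p}_{u_i})(\hat{p}_{u_j,t}-\bar{p}_{u_j})}{\sqrt{\sum_{t \in L}(\hat{p}_{u_i,t}-\bar{p}_{u_i})^2 \cdot \sum_{t \in L}(\hat{p}_{u_j,t}-\bar{p}_{u_j})^2}}$$

总体相似度(代码中 $\alpha = 0.3$):$sim(u_i,u_j) = (1-\alpha)\, sim_{CF}(u_i,u_j) + \alpha\, sim_{label}(u_i,u_j)$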
代码实现:

# -*- coding: utf-8 -*-

# 基于用户的协同过滤推荐算法实现
import random

import math
from operator import itemgetter
import pickle


class UserBasedCF():
    # 初始化相关参数
    def __init__(self):
        # 找到与目标用户兴趣相似的20个用户,为其推荐10部电影
        self.n_sim_user = 20
        self.n_rec_movie = 10

        # 将数据集划分为训练集和测试集
        self.trainSet = {}
        self.testSet = {}

        # 用户相似度矩阵
        self.user_sim_matrix = {}
        self.movie_count = 0

        print('Similar user number = %d' % self.n_sim_user)
        print('Recommneded movie number = %d' % self.n_rec_movie)


    # 读文件得到“用户-电影”数据
    def get_dataset(self, filename, pivot=0.75):
        trainSet_len = 0
        testSet_len = 0
        for line in self.load_file(filename):
            user, movie, rating, timestamp = line.split('\t')
            if random.random() < pivot:
                self.trainSet.setdefault(user, {})
                self.trainSet[user][movie] = rating
                trainSet_len += 1
            else:
                self.testSet.setdefault(user, {})
                self.testSet[user][movie] = rating
                testSet_len += 1
        print('Split trainingSet and testSet success!')
        print('TrainSet = %s' % trainSet_len)
        print('TestSet = %s' % testSet_len)


    # 读文件,返回文件的每一行
    def load_file(self, filename):
        with open(filename, 'r') as f:
            for i, line in enumerate(f):
                if i == 0:  # 去掉文件第一行的title
                    continue
                yield line.strip('\r\n')
        print('Load %s success!' % filename)

    def user_label(self, user, label):#计算用户对某一标签的偏好
        #label 标签下标  0-19
        #user  用户id
        f_utk = 0 #这里计算该用户所有标签数的总和
        for num in self.f_ut[user]:
            f_utk += num
        p_ut = (self.f_ut[user][label]/f_utk)*math.log(self.m_t/self.m_tj[label])
        return p_ut
    def importanceOfLabel(self, label): #标签的重要性(权重)
        t = self.t #表示所有电影的标签出现次数
        t_j = self.m_tj #表示标签tj在总标签中出现次数
        n = self.user_n #n表示用户数
        # n_tj  #n_tj表示看过标签tj的用户数

        w_tj = math.log((t/t_j[label])*(n/self.n_tj[label]))
        return w_tj
    def user_label_w(self, p_ut, label):
        return p_ut*self.importanceOfLabel(label)

    def calc_user_sim_of_label(self, user1, user2, label_list):
        #label_list  user1和user2均观看过的标签
        p_ui = {} #标签偏好权重均值 
        p_uj = {} #标签偏好权重均值 
        p_ui_avg = 0
        p_uj_avg = 0
        for label in label_list:
            p_ui[label] = self.user_label_w(self.user_label(user1, label), label)
            p_uj[label] = self.user_label_w(self.user_label(user2, label), label)
            p_ui_avg += p_ui[label]
            p_uj_avg += p_uj[label]
        p_ui_avg /= len(label_list)
        p_uj_avg /= len(label_list)

        result_fenzi = 0
        for label in label_list:
            result_fenzi += ((p_ui[label]-p_ui_avg)*(p_uj[label]-p_uj_avg))

        user1_fenmu = 0
        user2_fenmu = 0
        for label in label_list:
            user1_fenmu += (p_ui[label]-p_ui_avg)**2
            user2_fenmu += (p_uj[label]-p_uj_avg)**2

        return (result_fenzi)/(math.sqrt(user1_fenmu * user2_fenmu))

    # 计算用户之间的相似度
    def calc_user_sim(self):
        # 构建“电影-用户”倒排索引
        # key = movieID, value = list of userIDs who have seen this movie
        # self.movie_label  mt
        self.f_ut = {}#用户u观看的电影的标签t的个数
        self.m_tj = [0 for n in range(19)] #用包含标签tj的电影数
        self.m_t = 0 #有标签的电影数,这里的数据集每个电影都有标签
        self.t = 0
        self.user_n = 0 #用户数
        self.n_tj = [0 for n in range(19)] #统计观看的label对应的用户数
        print('Building movie-user table ...')
        movie_user = {}
        for user, movies in self.trainSet.items():
            self.user_n += 1
            is_collect = [False for n in range(19)]
            if user not in self.f_ut:
                self.f_ut[user] = [0 for n in range(19)]
            for movie in movies:
                indexs = 0
                for num in self.movie_label[int(movie)]:#电影总共有19个标签
                    if num!=0:
                        self.f_ut[user][indexs] += num
                        self.m_tj[indexs] += 1
                        self.t += 1
                        if is_collect[indexs]==False:
                            self.n_tj[indexs] += 1
                            is_collect[indexs] = True
                    indexs += 1
                if movie not in movie_user:
                    movie_user[movie] = set()
                movie_user[movie].add(user)
        print('Build movie-user table success!')

        self.m_t = self.movie_count = len(movie_user)
        print('Total movie number = %d' % self.movie_count)

        print('Build user co-rated movies matrix ...')
        for movie, users in movie_user.items():
            for u in users:
                for v in users:
                    if u == v:
                        continue
                    if u not in self.user_sim_matrix:
                        self.user_sim_matrix.setdefault(u, {})
                    if v not in self.user_sim_matrix[u]:
                        self.user_sim_matrix[u].setdefault(v, 0)
                    self.user_sim_matrix[u][v] += 1
        print('Build user co-rated movies matrix success!')

        # 计算相似性
        print('Calculating user similarity matrix ...')
        alpha = 0.3
        for u, related_users in self.user_sim_matrix.items():
            for v, count in related_users.items():
                self.user_sim_matrix[u][v] = count / math.sqrt(len(self.trainSet[u]) * len(self.trainSet[v]))
                label_list = list()
                for index in range(19):
                    if self.f_ut[u][index]!=0 and self.f_ut[v][index]!=0:
                        label_list.append(index)
                # self.user_sim_matrix[u][v] += self.calc_user_sim_of_label(u, v, label_list)
                #计算总体偏好相似度
                self.user_sim_matrix[u][v] = (1-alpha)*self.user_sim_matrix[u][v] + alpha*self.calc_user_sim_of_label(u, v, label_list)
        print('Calculate user similarity matrix success!')


    # 针对目标用户U,找到其最相似的K个用户,产生N个推荐
    def recommend(self, user):
        K = self.n_sim_user
        N = self.n_rec_movie
        rank = {}
        watched_movies = self.trainSet[user]

        # v=similar user, wuv=similar factor
        for v, wuv in sorted(self.user_sim_matrix[user].items(), key=itemgetter(1), reverse=True)[0:K]:
            for movie in self.trainSet[v]:
                if movie in watched_movies:
                    continue
                rank.setdefault(movie, 0)
                rank[movie] += wuv
        return sorted(rank.items(), key=itemgetter(1), reverse=True)[0:N]


    # 产生推荐并通过准确率、召回率和覆盖率进行评估
    def evaluate(self):
        print("Evaluation start ...")
        N = self.n_rec_movie
        # 准确率和召回率
        hit = 0
        rec_count = 0
        test_count = 0
        # 覆盖率
        all_rec_movies = set()

        for i, user, in enumerate(self.trainSet):
            test_movies = self.testSet.get(user, {})
            rec_movies = self.recommend(user)
            print("rec_movies = ",rec_movies)
            for movie, w in rec_movies:
                if movie in test_movies:
                    hit += 1
                all_rec_movies.add(movie)
            rec_count += N
            test_count += len(test_movies)

        precision = hit / (1.0 * rec_count)
        recall = hit / (1.0 * test_count)
        coverage = len(all_rec_movies) / (1.0 * self.movie_count)
        # print('precisioin=%.4f\trecall=%.4f\tcoverage=%.4f' % (precision, recall, coverage))
        print('precisioin=%.2f%%\t recall=%.2f%%\t coverage=%.2f%%' % (precision*100, recall*100, coverage*100))
        return precision, recall, coverage
    def get_label(self):
        fs=open('movie_label.pkl','rb')
        self.movie_label=pickle.load(fs)

if __name__ == '__main__':
    # Repeat the random split / train / evaluate cycle and report the mean
    # precision, recall and coverage over all runs.
    rating_file = '../ItemCF/ml-100k/u.data'
    n_runs = 10
    totals = [0, 0, 0]  # running sums of precision, recall, coverage
    for _ in range(n_runs):
        userCF = UserBasedCF()
        userCF.get_dataset(rating_file)
        userCF.get_label()
        userCF.calc_user_sim()
        scores = userCF.evaluate()
        totals = [acc + score for acc, score in zip(totals, scores)]
    print('precisioin=%.2f%%\t recall=%.2f%%\t coverage=%.2f%%' % tuple((total / n_runs) * 100 for total in totals))

结果:
在这里插入图片描述
使用数据集:https://share.weiyun.com/2UP7O67E

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 1
    评论
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值