基于物品的协同过滤算法 python实现

基于物品的协同过滤算法

参考文章:https://blog.csdn.net/likeyou1314918273/article/details/89607596
参考代码和数据: https://blog.csdn.net/qq_25948717/article/details/81839463
大部分代码有修改。

数据的读取和拼接用pandas实现。
用户、电影、评分等基础数据目前使用字典数据结构存储,后续可以优化为用pandas数据框实现。
代码中拼接未评分物品的相似度矩阵用pandas数据框实现。
代码中用户对未评分物品的喜好程度的数据用pandas数据框实现。

python实现

import pandas as pd
import math as mh

# Load the MovieLens movies and ratings tables and join them on movieId.
movies_data = pd.read_csv("D:/Python/Python36/ML/datasets/ml-latest-small/movies.csv")
ratings_data = pd.read_csv("D:/Python/Python36/ML/datasets/ml-latest-small//ratings.csv")
merged = pd.merge(movies_data, ratings_data, on='movieId')

# Persist the joined table (sorted by user, no header row) for inspection.
csv_file = "D:/Python/Python36/ML/datasets/ml-latest-small/data.csv"
sorted_ratings = merged[['userId', 'movieId', 'rating', 'title', 'genres']].sort_values('userId')
sorted_ratings.to_csv(csv_file, index=False, header=None)

# Per-user ratings are kept in a plain dict: {userId: {title: rating}}.
# The dict is built straight from the DataFrame instead of re-reading the CSV
# and splitting each line on ',': pandas quotes titles that contain commas
# (e.g. "American President, The (1995)"), so a naive split truncates those
# titles and shifts the remaining fields.
# Keys and ratings are stored as strings, exactly as they appear in the CSV,
# so lookups such as data['1'] and float(data[user][title]) keep working.
data = {}
for user_id, title, rating in zip(sorted_ratings['userId'],
                                  sorted_ratings['title'],
                                  sorted_ratings['rating']):
    data.setdefault(str(user_id), {})[title] = str(rating)

# print(data)

"""计算任何两个电影之间的相似度,由于每位用户评论的电影不完全一样,所以首先要找到两位用户共同评论过的电影
       然后计算两者之间的距离,最后算出两者之间的相似度
"""


# Movie-to-movie similarity derived from the Euclidean distance.
def euclidean(movie1, movie2):
    """Return the similarity of two movies as 1 / (1 + Euclidean distance).

    The distance is taken over every user in the module-level ``data`` dict;
    a user who has not rated a movie contributes a rating of 0.0 for it.

    :param movie1: title of the first movie
    :param movie2: title of the second movie
    :return: float in (0, 1]; 1.0 means the two rating vectors are identical
    """
    squared_gap_total = 0.0
    for user_ratings in data.values():
        first = float(user_ratings.get(movie1, 0.0))
        second = float(user_ratings.get(movie2, 0.0))
        squared_gap_total += mh.pow(first - second, 2)
    root = mh.sqrt(squared_gap_total)
    return 1. / (1.0 + root)


# distance = euclidean('Absolute Power (1997)', 'Buffy the Vampire Slayer (1992)')


# Pearson correlation coefficient between two users.
def pearson(userId1, userId2):
    """Return the Pearson correlation of two users' ratings.

    Only movies rated by both users (looked up in the module-level ``data``
    dict) take part in the computation.

    :param userId1: key of the first user in ``data``
    :param userId2: key of the second user in ``data``
    :return: 0 when the users share no movie or when either user's common
        ratings have no variance; otherwise a float clamped to [-1.0, 1.0]
    :raises KeyError: if either user id is not a key of ``data``
    """
    movies_user1 = data[userId1]
    movies_user2 = data[userId2]
    # Keep only the movies both users have rated.
    common = [movie for movie in movies_user1 if movie in movies_user2]
    n = len(common)
    if n == 0:
        return 0

    # Convert once instead of calling float() in every sum below.
    ratings1 = [float(movies_user1[movie]) for movie in common]
    ratings2 = [float(movies_user2[movie]) for movie in common]

    # Sum of pairwise products, plain sums and sums of squares.
    sum12 = sum(r1 * r2 for r1, r2 in zip(ratings1, ratings2))
    sum1 = sum(ratings1)
    sum2 = sum(ratings2)
    square_sum1 = sum(r * r for r in ratings1)
    square_sum2 = sum(r * r for r in ratings2)

    num = n * sum12 - sum1 * sum2
    var1 = n * square_sum1 - sum1 * sum1
    var2 = n * square_sum2 - sum2 * sum2
    # Rounding can push a zero variance slightly below 0, which would make
    # mh.sqrt raise "math domain error"; treat that as "no variance".
    if var1 <= 0 or var2 <= 0:
        return 0
    den = mh.sqrt(var1) * mh.sqrt(var2)
    if den == 0:
        return 0
    # Rounding can also move the ratio marginally outside [-1, 1].
    return max(-1.0, min(1.0, num / den))


# Example run: Pearson correlation between users '1' and '8'.
p1 = pearson(userId1='1', userId2='8')
print("p1 = ", p1)


# Pick the user's rated movies that are most similar to their "top" movie.
def similar_movies_based_topmovie(userId, k):
    """Return up to ``k`` of the user's rated movies closest to their top movie.

    Steps:
      1. Compute the ``euclidean`` similarity of every pair of movies the user
         has rated and keep the ``k`` most similar pairs.
      2. The "top movie" is the one that occurs most often in those pairs
         (ties are broken in favour of the greater title).
      3. Rank the user's other rated movies by similarity to the top movie and
         keep the first ``k``.

    :param userId: key of the user in the module-level ``data`` dict
    :param k: number of pairs / movies to keep; expected to be positive
    :return: list of ``(title, rating)`` tuples, where ``rating`` is the value
        stored in ``data[userId]``; an empty list when no pair survives step 1
        (the user rated fewer than two movies, or ``k`` is 0)
    :raises KeyError: if ``userId`` is not a key of ``data``
    """
    user_ratings = data[userId]
    movies = list(user_ratings)

    # Similarity of every unordered pair of movies rated by the user.
    movies_simi = []
    for i, first in enumerate(movies):
        for second in movies[i + 1:]:
            movies_simi.append((first, second, euclidean(first, second)))
    movies_simi.sort(key=lambda val: val[2], reverse=True)
    movies_simi = movies_simi[:k]

    # Without at least one pair there is no top movie to anchor on; the
    # previous code crashed with IndexError here.
    if not movies_simi:
        return []

    # Count how often each movie takes part in the most similar pairs.
    movie_count = {}
    for first, second, _ in movies_simi:
        for movie in (first, second):
            movie_count[movie] = movie_count.get(movie, 0) + 1
    # Highest count wins; equal counts fall back to the greater title.
    top_movie = max(movie_count.items(), key=lambda kv: (kv[1], kv[0]))[0]

    # Rank the remaining rated movies by similarity to the top movie.
    similar_movies = [(movie, euclidean(top_movie, movie))
                      for movie in movies if movie != top_movie]
    similar_movies.sort(key=lambda val: val[1], reverse=True)

    # Swap each similarity for the user's own rating of that movie.
    return [(movie, user_ratings.get(movie)) for movie, _ in similar_movies[:k]]


# Build the matrix of "similar rated movies" x "movies the user has not rated".
def unrated_movies_matrix(userId, k):
    """Return a DataFrame of similarities between rated and unrated movies.

    Each row describes one movie returned by
    ``similar_movies_based_topmovie(userId, k)``: column ``movie`` holds its
    title, column ``rating`` the user's rating as a float, and every further
    column (one per movie the user has not rated, in sorted title order) the
    ``euclidean`` similarity between the row's movie and that unrated movie.

    The frame is also written to ``unweighted_score_matrix_movies.csv``.

    :param userId: key of the user in the module-level ``data`` dict
    :param k: forwarded to ``similar_movies_based_topmovie``
    :return: the similarity DataFrame (no rows if there are no similar movies)
    :raises KeyError: if ``userId`` is not a key of ``data``
    """
    rated = data[userId]

    # Every movie rated by anyone but not by this user. Sorting makes the
    # column order reproducible instead of depending on set iteration order.
    unrated = set()
    for ratings in data.values():
        for movie in ratings:
            if movie not in rated:
                unrated.add(movie)
    unratedMovies = sorted(unrated)

    cols = ['movie', 'rating']
    cols.extend(unratedMovies)

    # Collect plain rows and build the frame once: DataFrame.append was removed
    # in pandas 2.0 and re-copied the whole frame on every call before that.
    rows = []
    for movie, score in similar_movies_based_topmovie(userId, k):
        row = [movie, float(score)]
        row.extend(euclidean(movie, unratedMovie) for unratedMovie in unratedMovies)
        rows.append(row)
    unrated_matrix = pd.DataFrame(rows, columns=cols)

    csv1_file = "D:/Python/Python36/ML/datasets/ml-latest-small/unweighted_score_matrix_movies.csv"
    unrated_matrix.to_csv(csv1_file)
    return unrated_matrix


# Example run of both intermediate steps for user '1' with k = 3.
a = similar_movies_based_topmovie(userId='1', k=3)
b = unrated_movies_matrix(userId='1', k=3)


# Estimate how much a user would like each movie they have not rated.
def unrated_interest_of_user(userId, k, topN):
    """Return the ``topN`` unrated movies with the highest predicted interest.

    The interest in an unrated movie is the rating-weighted average of its
    similarities to the rows of ``unrated_movies_matrix(userId, k)``:
    ``sum(rating * similarity) / sum(rating)``.

    The matrix, with one extra summary row (``'movies'``, the rating total and
    the weighted scores), is written to ``weighted_score_matrix_movies.csv``.

    :param userId: key of the user in the module-level ``data`` dict
    :param k: forwarded to ``unrated_movies_matrix``
    :param topN: number of recommendations to return
    :return: list of ``(title, score)`` tuples sorted by descending score;
        an empty list when the ratings in the matrix sum to 0 (e.g. no rows)
    """
    unrated_matrix = unrated_movies_matrix(userId, k)
    cols = list(unrated_matrix.columns)
    movies_name = cols[2:]
    ratings = unrated_matrix['rating']
    scores_sum = ratings.sum()
    # Nothing to weight by: avoid dividing by zero.
    if scores_sum == 0:
        return []

    # Weight every similarity column by the row's rating in one vectorized
    # pass, then normalise by the rating total.
    weighted = unrated_matrix[movies_name].mul(ratings, axis=0).sum() / scores_sum
    simis = weighted.tolist()
    rated_simi = list(zip(movies_name, simis))

    # Append the summary row. pd.concat replaces DataFrame.append, which was
    # removed in pandas 2.0.
    df_weighted_score = pd.DataFrame([['movies', scores_sum] + simis], columns=cols)
    rated_matrix = pd.concat([unrated_matrix, df_weighted_score], ignore_index=True)
    csv1_file = "D:/Python/Python36/ML/datasets/ml-latest-small/weighted_score_matrix_movies.csv"
    rated_matrix.to_csv(csv1_file)

    rated_simi.sort(key=lambda x: x[1], reverse=True)
    return rated_simi[:topN]


# Recall the top-N movies: here the 5 best candidates for user '1' with k = 3.
unrated_movies_interest = unrated_interest_of_user(userId='1', k=3, topN=5)
print("unrated_movies_interest=", unrated_movies_interest)



  • 3
    点赞
  • 17
    收藏
    觉得还不错? 一键收藏
  • 2
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值