生成电影推荐
代码如下:
# -*- coding:utf-8 -*-
'''
生成电影推荐系统
'''
import json
import numpy as np
# 定义一个计算两个用户之间欧几里得距离的函数 user1和user2
def euclidean_score(dataset, user1, user2):
    """Return a Euclidean-distance based similarity between two users.

    Only the items present in both users' rating dictionaries contribute.
    The distance ``d`` over those items is mapped to ``1 / (1 + d)``, so
    identical ratings give 1.0 and the value shrinks towards 0 as the
    ratings diverge.

    Args:
        dataset: mapping of user name -> mapping of item name -> rating.
        user1: name of the first user.
        user2: name of the second user.

    Returns:
        0 when the two users have no rated item in common, otherwise a
        float in the interval (0, 1].

    Raises:
        TypeError: if either user is not a key of ``dataset``.
    """
    if user1 not in dataset:
        raise TypeError('User ' + user1 + ' not present in the dataset')
    if user2 not in dataset:
        raise TypeError('User ' + user2 + ' not present in the dataset')

    ratings1 = dataset[user1]
    ratings2 = dataset[user2]

    # A single pass over the shared items replaces the two separate loops.
    squared_differences = [np.square(ratings1[item] - ratings2[item])
                           for item in ratings1 if item in ratings2]

    # No item rated by both users: there is nothing to compare.
    if not squared_differences:
        return 0

    # 1.0 keeps the division floating-point on Python 2 as well.
    return 1.0 / (1 + np.sqrt(np.sum(squared_differences)))
# 皮尔逊系数函数
def pearson_score(dataset, user1, user2):
    """Return the Pearson correlation of two users' shared ratings.

    Only the items present in both users' rating dictionaries are used.

    Args:
        dataset: mapping of user name -> mapping of item name -> rating.
        user1: name of the first user.
        user2: name of the second user.

    Returns:
        0 when the users share no rated item or when either user's shared
        ratings have no variance (the coefficient is undefined there),
        otherwise the correlation coefficient as a float.

    Raises:
        TypeError: if either user is not a key of ``dataset``.
    """
    if user1 not in dataset:
        raise TypeError('User ' + user1 + ' not present in the dataset')
    if user2 not in dataset:
        raise TypeError('User ' + user2 + ' not present in the dataset')

    ratings1 = dataset[user1]
    ratings2 = dataset[user2]

    # Items rated by both users.
    common_items = [item for item in ratings1 if item in ratings2]
    num_ratings = len(common_items)

    # Nothing in common means no measurable similarity.
    if num_ratings == 0:
        return 0

    # Float arrays keep every division below a true division, even when the
    # ratings are integers and the interpreter is Python 2.
    x = np.array([ratings1[item] for item in common_items], dtype=float)
    y = np.array([ratings2[item] for item in common_items], dtype=float)

    # Sums of the shared ratings.
    x_sum = np.sum(x)
    y_sum = np.sum(y)

    # Co-deviation and squared deviations (the pieces of the Pearson formula).
    sxy = np.sum(x * y) - x_sum * y_sum / num_ratings
    sxx = np.sum(np.square(x)) - np.square(x_sum) / num_ratings
    syy = np.sum(np.square(y)) - np.square(y_sum) / num_ratings

    # Zero variance makes the denominator zero; floating-point rounding can
    # even push it slightly below zero, which would turn the sqrt into NaN.
    if sxx <= 0 or syy <= 0:
        return 0

    return sxy / np.sqrt(sxx * syy)
# 寻找特定数量的与输入用户相似的用户
# dataset:代表数据库 user: 输入用户 num_users:相似的用户个数
def find_similar_users(dataset, user, num_users):
    """Return the users whose ratings correlate best with ``user``.

    Args:
        dataset: mapping of user name -> mapping of item name -> rating.
        user: name of the reference user.
        num_users: maximum number of similar users to return.

    Returns:
        A 2-D NumPy array with one ``[name, score]`` row per user, ordered
        by decreasing Pearson score. Because names and scores share one
        array, NumPy stores both columns as strings; convert the score with
        ``float(row[1])`` before doing arithmetic on it. The array has shape
        ``(0, 2)`` when there is nobody to compare against.

    Raises:
        TypeError: if ``user`` is not a key of ``dataset``.
    """
    if user not in dataset:
        raise TypeError('User ' + user + ' not present in the dataset')

    # Keep the scores numeric while ranking. Sorting the string column of a
    # mixed name/score array would order values lexicographically, which is
    # wrong for negative numbers and exponent notation.
    scored = [(other, pearson_score(dataset, user, other))
              for other in dataset if other != user]
    scored.sort(key=lambda pair: pair[1], reverse=True)

    top = scored[0:num_users]
    if not top:
        # Keep the two-column shape so callers can still index columns.
        return np.empty((0, 2), dtype=str)

    return np.array([[name, score] for name, score in top])
# 定义一个为用户生成电影推荐的函数 首先检查用户是否在数据库中
def generate_recommendations(dataset, user):
    """Recommend items that ``user`` has not rated yet.

    Every other user with a positive Pearson score contributes to each
    candidate item: the predicted rating is the similarity-weighted average
    of the ratings those users gave it.

    Args:
        dataset: mapping of user name -> mapping of item name -> rating.
        user: name of the user to generate recommendations for.

    Returns:
        A list of item names ordered by decreasing predicted rating, or
        ``['No recommendations possible']`` when no candidate item exists.

    Raises:
        TypeError: if ``user`` is not a key of ``dataset``.
    """
    if user not in dataset:
        raise TypeError('User ' + user + ' not present in the dataset')

    own_ratings = dataset[user]
    total_scores = {}
    similarity_sums = {}

    for other in dataset:
        if other == user:
            continue

        similarity_score = pearson_score(dataset, user, other)
        # Uncorrelated or negatively correlated users carry no useful signal.
        if similarity_score <= 0:
            continue

        for item, rating in dataset[other].items():
            # Skip items the user already rated (a rating of 0 counts as
            # "not rated").
            if item in own_ratings and own_ratings[item] != 0:
                continue
            # Accumulate across all similar users. Overwriting the entry
            # would leave only the last user's rating, and dividing by the
            # similarity sum would then normalise nothing.
            total_scores[item] = (total_scores.get(item, 0)
                                  + rating * similarity_score)
            similarity_sums[item] = (similarity_sums.get(item, 0)
                                     + similarity_score)

    # Either the user rated everything or nobody similar rated anything new.
    if not total_scores:
        return ['No recommendations possible']

    # Rank on the numeric weighted average, not on a stringified copy.
    def predicted_rating(item):
        return total_scores[item] / similarity_sums[item]

    return sorted(total_scores, key=predicted_rating, reverse=True)
if __name__ == '__main__':
    # Load the ratings: a JSON object mapping user name -> {movie: rating}.
    data_file = 'movie_ratings.json'
    with open(data_file, 'r') as f:
        data = json.loads(f.read())

    # print(...) with a single argument behaves the same on Python 2 and 3,
    # whereas the bare print statement is a syntax error on Python 3.

    # Generate recommendations for Michael Henry.
    user = 'Michael Henry'
    print("\nRecommendations for " + user + ":")
    movies = generate_recommendations(data, user)
    for i, movie in enumerate(movies):
        print(str(i + 1) + '. ' + movie)

    # In the sample data John Carson has rated every movie, so nothing can
    # be recommended to him.
    user = 'John Carson'
    print("\nRecommendation for " + user + ":")
    movies = generate_recommendations(data, user)
    for i, movie in enumerate(movies):
        print(str(i + 1) + '. ' + movie)
# -*- coding:utf-8 -*-
'''
生成电影推荐系统
'''
import json
import numpy as np
# 定义一个计算两个用户之间欧几里得距离的函数 user1和user2
def euclidean_score(dataset, user1, user2):
    """Return a Euclidean-distance based similarity between two users.

    Only the items present in both users' rating dictionaries contribute.
    The distance ``d`` over those items is mapped to ``1 / (1 + d)``, so
    identical ratings give 1.0 and the value shrinks towards 0 as the
    ratings diverge.

    Args:
        dataset: mapping of user name -> mapping of item name -> rating.
        user1: name of the first user.
        user2: name of the second user.

    Returns:
        0 when the two users have no rated item in common, otherwise a
        float in the interval (0, 1].

    Raises:
        TypeError: if either user is not a key of ``dataset``.
    """
    if user1 not in dataset:
        raise TypeError('User ' + user1 + ' not present in the dataset')
    if user2 not in dataset:
        raise TypeError('User ' + user2 + ' not present in the dataset')

    ratings1 = dataset[user1]
    ratings2 = dataset[user2]

    # A single pass over the shared items replaces the two separate loops.
    squared_differences = [np.square(ratings1[item] - ratings2[item])
                           for item in ratings1 if item in ratings2]

    # No item rated by both users: there is nothing to compare.
    if not squared_differences:
        return 0

    # 1.0 keeps the division floating-point on Python 2 as well.
    return 1.0 / (1 + np.sqrt(np.sum(squared_differences)))
# 皮尔逊系数函数
def pearson_score(dataset, user1, user2):
    """Return the Pearson correlation of two users' shared ratings.

    Only the items present in both users' rating dictionaries are used.

    Args:
        dataset: mapping of user name -> mapping of item name -> rating.
        user1: name of the first user.
        user2: name of the second user.

    Returns:
        0 when the users share no rated item or when either user's shared
        ratings have no variance (the coefficient is undefined there),
        otherwise the correlation coefficient as a float.

    Raises:
        TypeError: if either user is not a key of ``dataset``.
    """
    if user1 not in dataset:
        raise TypeError('User ' + user1 + ' not present in the dataset')
    if user2 not in dataset:
        raise TypeError('User ' + user2 + ' not present in the dataset')

    ratings1 = dataset[user1]
    ratings2 = dataset[user2]

    # Items rated by both users.
    common_items = [item for item in ratings1 if item in ratings2]
    num_ratings = len(common_items)

    # Nothing in common means no measurable similarity.
    if num_ratings == 0:
        return 0

    # Float arrays keep every division below a true division, even when the
    # ratings are integers and the interpreter is Python 2.
    x = np.array([ratings1[item] for item in common_items], dtype=float)
    y = np.array([ratings2[item] for item in common_items], dtype=float)

    # Sums of the shared ratings.
    x_sum = np.sum(x)
    y_sum = np.sum(y)

    # Co-deviation and squared deviations (the pieces of the Pearson formula).
    sxy = np.sum(x * y) - x_sum * y_sum / num_ratings
    sxx = np.sum(np.square(x)) - np.square(x_sum) / num_ratings
    syy = np.sum(np.square(y)) - np.square(y_sum) / num_ratings

    # Zero variance makes the denominator zero; floating-point rounding can
    # even push it slightly below zero, which would turn the sqrt into NaN.
    if sxx <= 0 or syy <= 0:
        return 0

    return sxy / np.sqrt(sxx * syy)
# 寻找特定数量的与输入用户相似的用户
# dataset:代表数据库 user: 输入用户 num_users:相似的用户个数
def find_similar_users(dataset, user, num_users):
    """Return the users whose ratings correlate best with ``user``.

    Args:
        dataset: mapping of user name -> mapping of item name -> rating.
        user: name of the reference user.
        num_users: maximum number of similar users to return.

    Returns:
        A 2-D NumPy array with one ``[name, score]`` row per user, ordered
        by decreasing Pearson score. Because names and scores share one
        array, NumPy stores both columns as strings; convert the score with
        ``float(row[1])`` before doing arithmetic on it. The array has shape
        ``(0, 2)`` when there is nobody to compare against.

    Raises:
        TypeError: if ``user`` is not a key of ``dataset``.
    """
    if user not in dataset:
        raise TypeError('User ' + user + ' not present in the dataset')

    # Keep the scores numeric while ranking. Sorting the string column of a
    # mixed name/score array would order values lexicographically, which is
    # wrong for negative numbers and exponent notation.
    scored = [(other, pearson_score(dataset, user, other))
              for other in dataset if other != user]
    scored.sort(key=lambda pair: pair[1], reverse=True)

    top = scored[0:num_users]
    if not top:
        # Keep the two-column shape so callers can still index columns.
        return np.empty((0, 2), dtype=str)

    return np.array([[name, score] for name, score in top])
# 定义一个为用户生成电影推荐的函数 首先检查用户是否在数据库中
def generate_recommendations(dataset, user):
    """Recommend items that ``user`` has not rated yet.

    Every other user with a positive Pearson score contributes to each
    candidate item: the predicted rating is the similarity-weighted average
    of the ratings those users gave it.

    Args:
        dataset: mapping of user name -> mapping of item name -> rating.
        user: name of the user to generate recommendations for.

    Returns:
        A list of item names ordered by decreasing predicted rating, or
        ``['No recommendations possible']`` when no candidate item exists.

    Raises:
        TypeError: if ``user`` is not a key of ``dataset``.
    """
    if user not in dataset:
        raise TypeError('User ' + user + ' not present in the dataset')

    own_ratings = dataset[user]
    total_scores = {}
    similarity_sums = {}

    # Compare the user with every other user in the data set.
    for other in dataset:
        if other == user:
            continue

        similarity_score = pearson_score(dataset, user, other)
        # Uncorrelated or negatively correlated users carry no useful signal.
        if similarity_score <= 0:
            continue

        # Consider only items the user has not rated yet (a rating of 0
        # counts as "not rated").
        for item, rating in dataset[other].items():
            if item in own_ratings and own_ratings[item] != 0:
                continue
            # Accumulate across all similar users. Overwriting the entry
            # would leave only the last user's rating, and dividing by the
            # similarity sum would then normalise nothing.
            total_scores[item] = (total_scores.get(item, 0)
                                  + rating * similarity_score)
            similarity_sums[item] = (similarity_sums.get(item, 0)
                                     + similarity_score)

    # Either the user rated everything or nobody similar rated anything new.
    if not total_scores:
        return ['No recommendations possible']

    # Rank on the numeric weighted average, not on a stringified copy.
    def predicted_rating(item):
        return total_scores[item] / similarity_sums[item]

    return sorted(total_scores, key=predicted_rating, reverse=True)
if __name__ == '__main__':
    # Load the ratings: a JSON object mapping user name -> {movie: rating}.
    data_file = 'movie_ratings.json'
    with open(data_file, 'r') as f:
        data = json.loads(f.read())

    # print(...) with a single argument behaves the same on Python 2 and 3,
    # whereas the bare print statement is a syntax error on Python 3.

    # Generate recommendations for Michael Henry.
    user = 'Michael Henry'
    print("\nRecommendations for " + user + ":")
    movies = generate_recommendations(data, user)
    for i, movie in enumerate(movies):
        print(str(i + 1) + '. ' + movie)

    # In the sample data John Carson has rated every movie, so nothing can
    # be recommended to him.
    user = 'John Carson'
    print("\nRecommendation for " + user + ":")
    movies = generate_recommendations(data, user)
    for i, movie in enumerate(movies):
        print(str(i + 1) + '. ' + movie)
输出结果如下:
Recommendations for Michael Henry:
1. Jerry Maguire
2. Anger Management
3. Inception
Recommendation for John Carson:
1. No recommendations possible
使用json数据如下:
{
"John Carson":
{
"Inception": 2.5,
"Pulp Fiction": 3.5,
"Anger Management": 3.0,
"Fracture": 3.5,
"Serendipity": 2.5,
"Jerry Maguire": 3.0
},
"Michelle Peterson":
{
"Inception": 3.0,
"Pulp Fiction": 3.5,
"Anger Management": 1.5,
"Fracture": 5.0,
"Jerry Maguire": 3.0,
"Serendipity": 3.5
},
"William Reynolds":
{
"Inception": 2.5,
"Pulp Fiction": 3.0,
"Fracture": 3.5,
"Jerry Maguire": 4.0
},
"Jillian Hobart":
{
"Pulp Fiction": 3.5,
"Anger Management": 3.0,
"Jerry Maguire": 4.5,
"Fracture": 4.0,
"Serendipity": 2.5
},
"Melissa Jones":
{
"Inception": 3.0,
"Pulp Fiction": 4.0,
"Anger Management": 2.0,
"Fracture": 3.0,
"Jerry Maguire": 3.0,
"Serendipity": 2.0
},
"Alex Roberts":
{
"Inception": 3.0,
"Pulp Fiction": 4.0,
"Jerry Maguire": 3.0,
"Fracture": 5.0,
"Serendipity": 3.5
},
"Michael Henry":
{
"Pulp Fiction": 4.5,
"Serendipity": 1.0,
"Fracture": 4.0
}
}