数据来源:MovieLens
数据源说明:http://grouplens.org/datasets/movielens/
注意:下面引用的说明针对的是 u.data / u.item 文件,而本文代码实际读取的是以"::"分隔字段的 ratings.dat 和 movies.dat 文件。
算法简要说明:根据用户对电影的评分计算用户之间的相似度(此处采用Pearson相似度),然后选取与当前用户相似度最高的前N个用户,预测并输出当前用户尚未看过的电影。
·计算用户之间的相似度–Pearson相似度,计算原则是根据对电影的评分情况。
·预测算法:公式如图,代码见getRecommendations()函数。
'''
u.data -- The full u data set, 100000 ratings by 943 users on 1682 items.
Each user has rated at least 20 movies. Users and items are
numbered consecutively from 1. The data is randomly
ordered. This is a tab separated list of
user id | item id | rating | timestamp.
The time stamps are unix seconds since 1/1/1970 UTC
u.item -- Information about the items (movies); this is a tab separated
list of
movie id | movie title | release date | video release date |
IMDb URL | unknown | Action | Adventure | Animation |
Children's | Comedy | Crime | Documentary | Drama | Fantasy |
Film-Noir | Horror | Musical | Mystery | Romance | Sci-Fi |
Thriller | War | Western |
The last 19 fields are the genres, a 1 indicates the movie
is of that genre, a 0 indicates it is not; movies can be in
several genres at once.
The movie ids are the ones used in the u.data data set.
'''
# *_*coding:utf-8 *_*
from operator import itemgetter, attrgetter
from math import sqrt
def load_data(filename_user_movie='ratings.dat', filename_movieInfo='movies.dat'):
    """Load the ratings table and the movie titles from two '::'-separated files.

    Args:
        filename_user_movie: Path of the ratings file. Every non-blank line
            must hold ``userId::itemId::rating::timestamp``.
        filename_movieInfo: Path of the movie file. Only the first two
            '::'-separated fields of each line (movie id, title) are used.

    Returns:
        A ``(user_movie, movies)`` tuple. ``user_movie`` maps
        user id -> {item id -> rating as float}; ``movies`` maps
        movie id -> title. All ids are kept as the strings read from disk.

    Raises:
        IOError / OSError: If one of the files cannot be opened.
        ValueError: If a non-blank line has the wrong number of fields or a
            rating that cannot be converted to float.
    """
    # NOTE(review): files are opened with the platform default encoding;
    # confirm this matches the actual data files.
    user_movie = {}
    # `with` guarantees the handle is closed even if a malformed line raises.
    with open(filename_user_movie) as ratings_file:
        for line in ratings_file:
            line = line.strip()
            if not line:
                continue  # tolerate blank lines such as a trailing newline
            userId, itemId, rating, _timestamp = line.split('::')
            user_movie.setdefault(userId, {})[itemId] = float(rating)

    movies = {}
    with open(filename_movieInfo) as movies_file:
        for line in movies_file:
            line = line.strip()
            if not line:
                continue
            movieId, movieTitle = line.split('::')[0:2]
            movies[movieId] = movieTitle

    return user_movie, movies
def average_rating(user, ratings=None):
    """Return the mean of all ratings given by ``user``.

    Args:
        user: Id of the user whose ratings are averaged.
        ratings: Optional mapping user id -> {movie id -> rating}. When
            omitted, the module-level ``user_movie`` mapping is used, which
            is what the one-argument call form has always relied on.

    Returns:
        The arithmetic mean of the user's ratings as a float.

    Raises:
        KeyError: If ``user`` is not present in the mapping.
        ZeroDivisionError: If the user has no ratings at all.
        NameError: If ``ratings`` is omitted and no module-level
            ``user_movie`` has been defined.
    """
    if ratings is None:
        # Backward-compatible fallback: read the table published at module
        # level by the script entry point.
        ratings = user_movie
    user_ratings = ratings[user]
    # `* 1.0` keeps the division a float division on Python 2 as well.
    return sum(user_ratings.values()) * 1.0 / len(user_ratings)
def calUserSim(user_movie):
    """Compute a Pearson-style similarity for every pair of users sharing a movie.

    For two users i and j the similarity is

        sum((r_im - mean_i) * (r_jm - mean_j))
        / (sqrt(sum((r_im - mean_i)^2)) * sqrt(sum((r_jm - mean_j)^2)))

    where the sums run over the movies m rated by both users, and each mean
    is taken over ALL ratings of that user (not only the co-rated movies).

    Args:
        user_movie: Mapping user id -> {movie id -> rating}.

    Returns:
        Nested mapping ``userSim[user_i][user_j] -> similarity``. Only users
        that share at least one movie with another user appear as keys. A
        pair whose denominator is zero (one of the users rated every shared
        movie exactly at their own mean) gets similarity 0.0.
    """
    # Mean rating of every user, computed once from the argument instead of
    # being recomputed for every pair of users.
    mean_rating = {}
    for user, ratings in user_movie.items():
        if ratings:
            mean_rating[user] = sum(ratings.values()) * 1.0 / len(ratings)

    # Inverted index: movie id -> list of users who rated that movie.
    movie_user = {}
    for user, ratings in user_movie.items():
        for movie in ratings:
            movie_user.setdefault(movie, []).append(user)

    # co_rated[user_i][user_j] -> list of movies rated by both users.
    co_rated = {}
    for movie, users in movie_user.items():
        for user_i in users:
            shared_with = co_rated.setdefault(user_i, {})
            for user_j in users:
                if user_i != user_j:
                    shared_with.setdefault(user_j, []).append(movie)

    userSim = {}
    for user_i, shared_with in co_rated.items():
        ratings_i = user_movie[user_i]
        mean_i = mean_rating[user_i]
        for user_j, shared_movies in shared_with.items():
            ratings_j = user_movie[user_j]
            mean_j = mean_rating[user_j]
            covariance = 0.0
            sq_dev_i = 0.0
            sq_dev_j = 0.0
            for movie in shared_movies:
                dev_i = ratings_i[movie] - mean_i
                dev_j = ratings_j[movie] - mean_j
                covariance += dev_i * dev_j
                sq_dev_i += dev_i * dev_i
                sq_dev_j += dev_j * dev_j
            denominator = sqrt(sq_dev_i) * sqrt(sq_dev_j)
            # A zero norm means every deviation of that user is zero, so the
            # numerator is zero too: report "no correlation" explicitly
            # rather than dividing by a made-up epsilon.
            similarity = covariance / denominator if denominator else 0.0
            userSim.setdefault(user_i, {})[user_j] = similarity
    return userSim
#预测推荐内容的算法
def getRecommendations(user, user_movie, movies, userSim, N, topK=10):
    """Predict ratings for movies ``user`` has not rated and return the best ones.

    The prediction for an unseen movie m is

        mean(user) + sum_j(sim(user, j) * (r_jm - mean(j))) / sum_j(sim(user, j))

    where j runs over the ``N`` users most similar to ``user``.

    Args:
        user: Id of the user to recommend for.
        user_movie: Mapping user id -> {movie id -> rating}.
        movies: Mapping movie id -> title. Accepted for interface
            compatibility; it is not used by the computation.
        userSim: Mapping user id -> {other user id -> similarity}.
        N: Number of most-similar users to use as neighbours.
        topK: Maximum number of recommendations to return (default 10).

    Returns:
        A list of ``(movie id, predicted rating)`` tuples sorted by predicted
        rating in descending order, at most ``topK`` long. The list is empty
        when ``user`` has no entry in ``userSim``.

    Raises:
        KeyError: If ``user`` (or one of its neighbours) is missing from
            ``user_movie``.
        ZeroDivisionError: If ``user`` has an empty rating dict.
    """
    def mean_of(ratings):
        # Means come from the `user_movie` argument, not module-level state.
        return sum(ratings.values()) * 1.0 / len(ratings)

    # Keep the dict itself for membership tests: O(1) per lookup.
    seen = user_movie[user]
    average_u_rate = mean_of(seen)

    neighbours = sorted(userSim.get(user, {}).items(),
                        key=itemgetter(1), reverse=True)[:N]

    pred = {}
    sumUserSim = 0.0
    for user_j, similarity in neighbours:
        ratings_j = user_movie[user_j]
        average_user_j_rate = mean_of(ratings_j)
        for item, rating in ratings_j.items():
            if item in seen:
                continue  # only recommend movies the user has not rated
            pred[item] = pred.get(item, 0.0) + similarity * (rating - average_user_j_rate)
        # Accumulated once per neighbour: this is the normaliser of the
        # weighted deviations above.
        sumUserSim += similarity

    # NOTE(review): the normaliser is the signed sum of similarities; many
    # formulations use absolute values - confirm before changing.
    for item in pred:
        # With a zero normaliser the neighbourhood carries no usable
        # evidence, so fall back to the user's own mean instead of dividing
        # by zero.
        deviation = pred[item] / sumUserSim if sumUserSim else 0.0
        pred[item] = average_u_rate + deviation

    return sorted(pred.items(), key=itemgetter(1), reverse=True)[:topK]
if __name__ == '__main__':
    # Script entry point: load the data, build the user-user similarity
    # table and print the recommendations for user '182' using the 20 most
    # similar users.
    # NOTE: `user_movie` has to stay a module-level name here because
    # average_rating() reads it as a global.
    user_movie, movies = load_data()
    userSim = calUserSim(user_movie)
    pred = getRecommendations('182', user_movie, movies, userSim, 20)
    for i, rating in pred:
        # A single parenthesised argument is valid both as a Python 2 print
        # statement and as a Python 3 print() call. Fall back to the raw id
        # when a rated movie has no entry in the title table.
        print('film:%s,rating:%s' % (movies.get(i, i), rating))
函数setdefault()用法见https://www.cnblogs.com/elleblog/p/7533413.html
函数itemgetter()用法见https://blog.csdn.net/qq_40952927/article/details/80421793