## Download URL for the MovieLens 1M (ml-1m) dataset:
## http://files.grouplens.org/datasets/movielens/
## User-based collaborative filtering
import numpy as np
import pandas as pd
# Ratings file; each record's fields are separated by "::".
file = "ml-1m/ratings.dat"
header = ["uid","iid","rating"]
dtype = {"uid" :np.int32,"iid":np.int32,"rating":np.float32}
# Only the first three fields are loaded. "::" is a multi-character separator,
# which pandas can only parse with the python engine; requesting that engine
# explicitly avoids the ParserWarning raised when pandas has to fall back to it.
data = pd.read_csv(file,sep="::",usecols=range(3),names=header,dtype=dtype,engine="python")
# Pivot into a user x item table of ratings (NaN where a user has not rated an item).
rating_matrix = data.pivot_table(index=["uid"],columns=["iid"],values="rating")
# DataFrame.corr() computes the pairwise Pearson correlation between columns,
# so the table is transposed first to obtain user-to-user similarity.
similar = rating_matrix.T.corr()
def predict(uid,iid,rating_matrix,similar):
    """Predict the rating user ``uid`` would give item ``iid`` (user-based CF).

    The prediction is the similarity-weighted mean of the ratings given to
    ``iid`` by the users that are positively correlated with ``uid``::

        y = sum(sim * rating) / sum(sim)

    :param uid: id of the target user (a label of ``similar``)
    :param iid: id of the target item (a column label of ``rating_matrix``)
    :param rating_matrix: user x item rating table, NaN where no rating exists
    :param similar: user x user similarity table
    :return: the predicted rating
    :raises ZeroDivisionError: if no positively correlated user has rated ``iid``
    """
    # 1. Keep only the *other* users whose similarity to uid is known and positive.
    similar_users = similar[uid].drop([uid]).dropna()
    similar_users = similar_users[similar_users > 0]
    # 2. Restrict those neighbours to the users who actually rated iid.
    #    Index.intersection is used because the `&` operator on Index objects
    #    no longer performs a set intersection in pandas 2.x.
    raters = rating_matrix[iid].dropna().index
    weights = similar_users.loc[similar_users.index.intersection(raters)]
    if weights.empty:
        # Same exception type the 0/0 division produced before, but explicit.
        raise ZeroDivisionError(
            "no positively correlated user has rated item %s" % iid
        )
    # 3. Similarity-weighted mean of the neighbours' ratings for iid.
    neighbour_ratings = rating_matrix.loc[weights.index, iid]
    return (weights * neighbour_ratings).sum() / weights.sum()
def predict_all(uid,rating_matrix,similar):
    """
    Predict the rating of user ``uid`` for every item in ``rating_matrix``.

    Items whose prediction fails (for example because no similar user rated
    them) are reported with ``print`` and skipped.

    :param uid: user id
    :param rating_matrix: user-item rating matrix
    :param similar: pairwise similarity between users
    :return: generator yielding ``(uid, iid, rating)`` tuples one by one
    """
    # Every column of the rating matrix is an item to predict.
    for iid in rating_matrix.columns:
        try:
            # `predict` is resolved at call time from module scope.
            rating = predict(uid,iid,rating_matrix,similar)
        except Exception as e:
            # Best effort: report the failure and move on to the next item.
            print(e)
        else:
            yield uid,iid,rating
# Predict ratings of user 1 for every item and print the collected
# (uid, iid, rating) tuples; list() drains the generator.
result = predict_all(1,rating_matrix,similar)
print(list(result))
## Item-based collaborative filtering
import numpy as np
import pandas as pd
# Ratings file; each record's fields are separated by "::".
file = "ml-1m/ratings.dat"
# Only the first three fields are loaded. "::" is a multi-character separator,
# which pandas can only parse with the python engine; requesting that engine
# explicitly avoids the ParserWarning raised when pandas has to fall back to it.
data = pd.read_csv(file,sep="::",names=["uid","iid","rating"],usecols=range(3),engine="python")
# Pivot into a user x item table of ratings (NaN where a user has not rated an item).
rate_matrix = pd.pivot_table(data,index=["uid"],columns=["iid"],values="rating")
# Item-to-item similarity: pairwise correlation between the item columns.
similar = rate_matrix.corr()
def predict(uid,iid):
#找出相似items
similar_items = similar[iid].drop([iid]).dropna()
#找出正相关items
similar_items = similar_items.where(similar_items>0).dropna()
#用户评分过的物品与相似物品交集
ids = set(rate_matrix.loc[uid].dropna().index&similar_items.index)
finally_similar_item = similar_items[list(ids)]
#计算uid对iid的预测评分
sum_up,sum_down = 0,0
for sim_iid,simi in finally_similar_item.iteritems():
#近邻物品的评分数据
sim_item_rated_movies = rate_matrix[sim_iid].dropna()
#uid对相似物品的评分
sim_item_rated_user = sim_item_rated_movies[uid]
#计算分子
sum_up += simi*sim_item_rated_user
#计算分母
sum_down += simi
y = sum_up/sum_down
return y
# Predict the rating of user 1 for item 1 and print it.
result = predict(1,1)
print(result)