# A pure practice project: movie timestamps, changes in users' tastes over time and similar factors are all ignored.
import pandas as pd
import numpy as np
import copy
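# This is a hand-written implementation of the Pearson correlation coefficient, written while learning; it has many pitfalls, so preferably do not use it.
#from scipy.stats import pearsonr
# pearsonr(vector1,vector2)  -- using SciPy's pearsonr for the correlation coefficient is recommended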
def get_pearsonr(b, o):
    """Return the Pearson correlation coefficient of two numeric vectors.

    Positions where either vector holds NaN are ignored, so the coefficient
    is computed over the pairwise-complete observations only.

    Args:
        b: One-dimensional sequence of numbers (the base vector).
        o: One-dimensional sequence of the same length as ``b``; NaN marks
            a missing value.

    Returns:
        The coefficient as a float in [-1, 1], or 0 when it is undefined:
        fewer than two positions are present in both vectors, or one of the
        vectors is constant over those positions.
    """
    b = np.asarray(b, dtype=float)
    o = np.asarray(o, dtype=float)

    # Keep only the positions that are present in both vectors; summing
    # over NaN would otherwise turn every intermediate result into NaN.
    valid = ~(np.isnan(b) | np.isnan(o))
    if int(valid.sum()) < 2:
        return 0
    b_v = b[valid]
    o_v = o[valid]

    # A constant vector has zero variance, which makes the coefficient
    # undefined; compare min and max so the check is exact.
    if b_v.max() == b_v.min() or o_v.max() == o_v.min():
        return 0

    # Centre first instead of using sum-of-squares minus squared-sum: the
    # latter can go slightly negative through rounding and yield NaN.
    b_c = b_v - b_v.mean()
    o_c = o_v - o_v.mean()
    denominator = np.sqrt(np.sum(np.square(b_c)) * np.sum(np.square(o_c)))
    if not denominator:
        return 0
    coefficient = np.sum(b_c * o_c) / denominator
    # Rounding may push the value a hair outside the valid range.
    return float(np.clip(coefficient, -1.0, 1.0))
def d_pearsonr(data_f):
    """Rank the rows of ``data_f`` by their correlation with its first row.

    The first row is the base vector. Every other row is compared with it
    through ``get_pearsonr``; the base row itself is left out of the result.

    Args:
        data_f: DataFrame with one vector per row; row 0 is the base.

    Returns:
        A Series indexed by the labels of the compared rows (all labels of
        ``data_f`` except the first), holding the coefficients sorted from
        highest to lowest, with NaN values last. The Series is empty when
        ``data_f`` has no row besides the base, so callers can always use
        ``.index`` on the result.
    """
    n_arr = data_f.values
    if len(n_arr) < 2:
        return pd.Series([], index=data_f.index[:0], dtype=float)

    base_vector = n_arr[0, :]
    # Skip the base row by position: every row taken from a NumPy array is a
    # fresh view object, so an identity test against base_vector never matches.
    d_values = [get_pearsonr(base_vector, o_vector) for o_vector in n_arr[1:]]
    return pd.Series(d_values, index=data_f.index[1:]).sort_values(ascending=False)
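# The data comes from https://grouplens.org/datasets/movielens/
# The algorithm has two variants: recommending by user-to-user correlation, and recommending by the movies themselves.
# This is the simple user-correlation variant: a userId below 1000 is generated at random, which may belong to a user who already has data or to one who has none.
# A user with data gets recommendations based on the correlation between other users' ratings and that user's ratings.
# A user without data is recommended popular movies with relatively high ratings.
# It is not difficult at all, so it suits beginners; time is not taken into account, so it is not accurate and is only good for getting to know what a recommendation algorithm is.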
def start_studio(data_f):
    """Return the first three index labels of the Series built by ``d_pearsonr``.

    Args:
        data_f: DataFrame handed unchanged to ``d_pearsonr``.

    Returns:
        The leading slice (at most three labels) of that Series' index.
    """
    ranked = d_pearsonr(data_f)
    return ranked.index[:3]
def newUser_reeccomendation(ratings):
    """Pick movies for a user who has no ratings of their own.

    A movie qualifies when it is both among the 100 movies with the most
    rating rows and among the 100 movies with the largest rating sum.

    Args:
        ratings: DataFrame with at least ``movieId`` and ``rating`` columns.
            It is not modified.

    Returns:
        The result of ``fetch_movieInfo`` for up to 20 qualifying movie ids,
        ordered from the largest rating sum downwards.
    """
    per_movie = ratings.groupby('movieId')['rating']
    most_rated = per_movie.size().sort_values(ascending=False).iloc[0:100].index
    best_rated = per_movie.sum().sort_values(ascending=False).iloc[0:100].index

    # Walk the rating-sum ranking and keep the ids that are also popular, so
    # the cut to 20 keeps the best entries instead of an arbitrary set order.
    popular = set(most_rated)
    picks = [movie_id for movie_id in best_rated if movie_id in popular]
    return fetch_movieInfo(picks[0:20])
def fetch_movieInfo(movieIds):
    """Build display entries for the first 20 movie ids.

    Args:
        movieIds: Sliceable sequence of movie ids.

    Returns:
        -1 when fewer than 20 ids are given. Otherwise a list with one
        pandas Series per id (first 20 only); each Series holds, for the
        rows of ``movies.csv`` whose ``movieId`` matches, the labelled title
        followed by the labelled ``genres`` value.
    """
    if len(movieIds) < 20:
        return -1

    catalog = pd.read_csv('./ml-latest-small/movies.csv', delimiter=',')

    def describe(movie_id):
        # Select the matching rows once and reuse them for both fields.
        row = catalog[catalog['movieId'] == movie_id]
        title_part = '电影名称:' + row['title']
        # NOTE(review): this label is attached to the ``genres`` column.
        genre_part = '演员' + row['genres']
        return title_part + genre_part

    return [describe(movie_id) for movie_id in movieIds[0:20]]
def push(userId, *, min_common=26):
    """Recommend movies for ``userId`` from the ratings of other users.

    Reads ``./ml-latest-small/ratings.csv``. A user without any rating is
    handed to ``newUser_reeccomendation``. Otherwise a user-by-movie rating
    matrix is built over the movies the target user has rated, restricted to
    users sharing enough of those movies, and passed to ``start_studio`` to
    choose the users whose ratings feed the recommendation.

    Args:
        userId: Id of the user to recommend for.
        min_common: Another user is considered only when they rated strictly
            more than this many of the target user's movies.

    Returns:
        Whatever ``newUser_reeccomendation`` or ``fetch_movieInfo`` returns
        for the selected movie ids.
    """
    ratings = pd.read_csv('./ml-latest-small/ratings.csv', delimiter=',')
    own_movies = set(ratings.loc[ratings['userId'] == userId, 'movieId'])
    if not own_movies:
        return newUser_reeccomendation(ratings)

    # Only ratings of movies the target user has rated matter from here on.
    shared = ratings[ratings['movieId'].isin(own_movies)]
    overlap = shared.groupby('userId')['movieId'].nunique()
    neighbours = [
        other for other in overlap[overlap > min_common].index if other != userId
    ]

    # The target user must be the first row: d_pearsonr uses row 0 as the
    # base vector. Pivoting keeps every rating under its own movieId column,
    # so all rows are aligned; movies a user did not rate become NaN.
    row_order = [userId] + neighbours
    matrix = (
        shared[shared['userId'].isin(row_order)]
        .drop_duplicates(subset=['userId', 'movieId'])
        .pivot(index='userId', columns='movieId', values='rating')
        .reindex(index=row_order, columns=sorted(own_movies))
    )
    s_userIds = start_studio(matrix)

    # Candidate movies: rated by the selected users and not yet rated by the
    # target user, highest ratings first, each movie listed once.
    pool = ratings[
        ratings['userId'].isin(s_userIds) & ~ratings['movieId'].isin(own_movies)
    ]
    ranked = pool.sort_values(by='rating', ascending=False, kind='mergesort')
    real_movie = list(ranked['movieId'].drop_duplicates())
    return fetch_movieInfo(real_movie)
# Demo run: compute the recommendations for user 345 and print them.
recommendations = push(345)
print(recommendations)