推荐引擎
欧氏距离分数
通过欧氏距离判断相关性,判断相似程度
$$欧氏距离分数 = \frac{1}{1 + 欧氏距离}$$
欧氏距离的取值范围:0 -> +∞
欧氏距离分数的取值范围:1 -> 0(距离越大,分数越接近 0)
0(不相似) <-> 1(相似)
用户两两之间的分数构成相似度矩阵(行、列均为用户,x 为对应两个用户之间的分数):
      a   b   c  ...
  a   x   x   x  ...
  b   x   x   x  ...
  c   x   x   x  ...
 ...
代码:
import json
import numpy as np
def common_movies(ratings, user1, user2):
    """Return the movies rated by both users, in ``user1``'s rating order.

    ``ratings`` maps each user to a dict of ``{movie: rating}``.
    """
    return [movie for movie in ratings[user1] if movie in ratings[user2]]


def euclidean_score(ratings, user1, user2):
    """Return the Euclidean-distance similarity score of two users.

    The score is ``1 / (1 + d)``, where ``d`` is the Euclidean distance
    between the two users' ratings of the movies both of them rated.
    Identical ratings give 1; the score approaches 0 as the distance grows.
    Users with no movie in common get a score of 0.
    """
    movies = common_movies(ratings, user1, user2)
    if not movies:
        return 0.0
    x = np.array([ratings[user1][movie] for movie in movies])
    y = np.array([ratings[user2][movie] for movie in movies])
    return 1 / (1 + np.sqrt(((x - y) ** 2).sum()))


def score_matrix(ratings, users):
    """Return the users-by-users matrix of Euclidean similarity scores.

    Row ``i`` / column ``j`` holds the score between ``users[i]`` and
    ``users[j]``.
    """
    return np.array([
        [euclidean_score(ratings, user1, user2) for user2 in users]
        for user1 in users])


if __name__ == '__main__':
    # JSON text is UTF-8; do not depend on the platform's locale encoding.
    with open(r'C:\Users\Cs\Desktop\机器学习\ML\data\ratings.json', 'r',
              encoding='utf-8') as f:
        ratings = json.load(f)
    users = list(ratings.keys())
    scmat = score_matrix(ratings, users)
    users = np.array(users)
    # Print the matrix, one row per user, two decimals per score.
    for scrow in scmat:
        print(' '.join('{:>5.2f}'.format(score)
                       for score in scrow))
皮氏距离分数
通过皮氏距离分数(皮尔逊相关系数)判断两个用户对电影喜好的相似程度。
相关性矩阵
$$\begin{pmatrix} 1 & 相关性系数 \\ 相关性系数 & 1 \end{pmatrix}$$
$$相关性系数 = \frac{协方差}{标准差之积}$$
-1 <------- 0 -------> 1
反相关 不相关 正相关
3-5
0-2
代码:
import json
import numpy as np
def common_movies(ratings, user1, user2):
    """Return the movies rated by both users, in ``user1``'s rating order.

    ``ratings`` maps each user to a dict of ``{movie: rating}``.
    """
    return [movie for movie in ratings[user1] if movie in ratings[user2]]


def pearson_score(ratings, user1, user2):
    """Return the Pearson correlation of two users' common-movie ratings.

    The result lies in [-1, 1]: -1 is inverse correlation, 0 no
    correlation, 1 positive correlation.

    The correlation is undefined when the users share fewer than two
    movies or when either user gave the same rating to every shared movie
    (zero variance); ``np.corrcoef`` would produce NaN plus a
    RuntimeWarning there. Such pairs are scored 0.0, the same value used
    for users with no movie in common.
    """
    movies = common_movies(ratings, user1, user2)
    if len(movies) < 2:
        return 0.0
    x = np.array([ratings[user1][movie] for movie in movies])
    y = np.array([ratings[user2][movie] for movie in movies])
    # max == min is an exact test for a constant vector.
    if x.max() == x.min() or y.max() == y.min():
        return 0.0
    # corrcoef returns the 2x2 correlation matrix; [0, 1] is r(x, y).
    return np.corrcoef(x, y)[0, 1]


def score_matrix(ratings, users):
    """Return the users-by-users matrix of Pearson scores.

    Row ``i`` / column ``j`` holds the score between ``users[i]`` and
    ``users[j]``.
    """
    return np.array([
        [pearson_score(ratings, user1, user2) for user2 in users]
        for user1 in users])


if __name__ == '__main__':
    # JSON text is UTF-8; do not depend on the platform's locale encoding.
    with open(r'C:\Users\Cs\Desktop\机器学习\ML\data\ratings.json', 'r',
              encoding='utf-8') as f:
        ratings = json.load(f)
    users = list(ratings.keys())
    scmat = score_matrix(ratings, users)
    users = np.array(users)
    # Print the matrix, one row per user, two decimals per score.
    for scrow in scmat:
        print(' '.join('{:>5.2f}'.format(score)
                       for score in scrow))
按照相似度从高到低排列每个用户的相似用户
代码:
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import json
import numpy as np
def common_movies(ratings, user1, user2):
    """Return the movies rated by both users, in ``user1``'s rating order.

    ``ratings`` maps each user to a dict of ``{movie: rating}``.
    """
    return [movie for movie in ratings[user1] if movie in ratings[user2]]


def pearson_score(ratings, user1, user2):
    """Return the Pearson correlation of two users' common-movie ratings.

    The correlation is undefined when the users share fewer than two
    movies or when either user gave the same rating to every shared movie;
    ``np.corrcoef`` would produce NaN there, and NaN sorts last in
    ``argsort``, so it would end up ranked as the *most* similar user once
    the order is reversed. Such pairs are scored 0.0 instead, the same
    value used for users with no movie in common.

    NOTE: the score does not take into account how many movies the two
    users share; the original author suggests weighting it by the fraction
    of common movies among all their movies.
    """
    movies = common_movies(ratings, user1, user2)
    if len(movies) < 2:
        return 0.0
    x = np.array([ratings[user1][movie] for movie in movies])
    y = np.array([ratings[user2][movie] for movie in movies])
    # max == min is an exact test for a constant vector.
    if x.max() == x.min() or y.max() == y.min():
        return 0.0
    # corrcoef returns the 2x2 correlation matrix; [0, 1] is r(x, y).
    return np.corrcoef(x, y)[0, 1]


def score_matrix(ratings, users):
    """Return the users-by-users matrix of Pearson scores.

    Every user is matched against every user; row ``i`` is the score list
    of ``users[i]``, indexed like ``users``.
    """
    return np.array([
        [pearson_score(ratings, user1, user2) for user2 in users]
        for user1 in users])


def rank_similar_users(users, scmat, i):
    """Return the other users ordered from most to least similar to user i.

    Returns a ``(similar_users, similar_scores)`` pair of arrays; the user
    at index ``i`` is left out of both.
    """
    users = np.asarray(users)
    scmat = np.asarray(scmat)
    order = scmat[i].argsort()[::-1]
    order = order[order != i]
    return users[order], scmat[i, order]


if __name__ == '__main__':
    # JSON text is UTF-8; do not depend on the platform's locale encoding.
    with open(r'C:\Users\Cs\Desktop\机器学习\ML\data\ratings.json', 'r',
              encoding='utf-8') as f:
        ratings = json.load(f)
    users = list(ratings.keys())
    scmat = score_matrix(ratings, users)
    users = np.array(users)
    # For each user, in index order, print the ranked users and scores.
    for i, user in enumerate(users):
        similar_users, similar_scores = rank_similar_users(users, scmat, i)
        print(user, similar_users, similar_scores,
              sep='\n')
4.生成推荐清单
推荐度的计算:
- 只考虑皮氏距离分数>0的相似用户
- 依据这些用户对电影的打分高低
- 以相似度作为权重做加权平均:
$$推荐度 = \frac{\sum(相似用户的打分 \times 相似度)}{\sum 相似度}$$
代码:
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import json
import numpy as np
def common_movies(ratings, user1, user2):
    """Return the movies rated by both users, in ``user1``'s rating order.

    ``ratings`` maps each user to a dict of ``{movie: rating}``.
    """
    return [movie for movie in ratings[user1] if movie in ratings[user2]]


def pearson_score(ratings, user1, user2):
    """Return the Pearson correlation of two users' common-movie ratings.

    The correlation is undefined when the users share fewer than two
    movies or when either user gave the same rating to every shared movie;
    ``np.corrcoef`` would produce NaN plus a RuntimeWarning there. Such
    pairs are scored 0.0, the same value used for users with no movie in
    common, so they are never used as a source of recommendations.
    """
    movies = common_movies(ratings, user1, user2)
    if len(movies) < 2:
        return 0.0
    x = np.array([ratings[user1][movie] for movie in movies])
    y = np.array([ratings[user2][movie] for movie in movies])
    # max == min is an exact test for a constant vector.
    if x.max() == x.min() or y.max() == y.min():
        return 0.0
    # corrcoef returns the 2x2 correlation matrix; [0, 1] is r(x, y).
    return np.corrcoef(x, y)[0, 1]


def score_matrix(ratings, users):
    """Return the users-by-users matrix of Pearson scores.

    Row ``i`` / column ``j`` holds the score between ``users[i]`` and
    ``users[j]``.
    """
    return np.array([
        [pearson_score(ratings, user1, user2) for user2 in users]
        for user1 in users])


def rank_similar_users(users, scmat, i):
    """Return the other users ordered from most to least similar to user i.

    Returns a ``(similar_users, similar_scores)`` pair of arrays; the user
    at index ``i`` is left out of both.
    """
    users = np.asarray(users)
    scmat = np.asarray(scmat)
    order = scmat[i].argsort()[::-1]
    order = order[order != i]
    return users[order], scmat[i, order]


def recommend(ratings, user, similar_users, similar_scores):
    """Return the movies recommended to ``user``, best first.

    Only similar users with a score greater than 0 contribute. For every
    movie ``user`` has not rated, the rank is the similarity-weighted
    average of those users' ratings:
    ``sum(rating * similarity) / sum(similarity)``.
    The result is an array of movie names sorted by descending rank; it is
    empty when there is nothing to recommend.
    """
    seen = ratings[user]
    score_sums, weight_sums = {}, {}
    for similar_user, similar_score in zip(similar_users, similar_scores):
        if not similar_score > 0:
            continue
        for movie, score in ratings[similar_user].items():
            if movie in seen:
                continue
            score_sums[movie] = (
                score_sums.get(movie, 0) + score * similar_score)
            weight_sums[movie] = weight_sums.get(movie, 0) + similar_score
    # Weights are strictly positive here, so the division is safe.
    movie_ranks = {movie: score_sum / weight_sums[movie]
                   for movie, score_sum in score_sums.items()}
    order = np.array(list(movie_ranks.values())).argsort()[::-1]
    return np.array(list(movie_ranks.keys()))[order]


if __name__ == '__main__':
    # JSON text is UTF-8; do not depend on the platform's locale encoding.
    with open('../../data/ratings.json', 'r', encoding='utf-8') as f:
        ratings = json.load(f)
    users = list(ratings.keys())
    scmat = score_matrix(ratings, users)
    users = np.array(users)
    for i, user in enumerate(users):
        similar_users, similar_scores = rank_similar_users(users, scmat, i)
        recomms = recommend(ratings, user, similar_users, similar_scores)
        print(user, recomms, sep='\n')