KNN算法
K - 若干个
N - Nearest,最近
N - Neighbors, 邻居
1.分类
对于一个未知类别的样本,在其周围寻找距离最近的K个已知样本,根据与距离成反比的加权投票,决定未知样本的类别。
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import numpy as np
import sklearn.neighbors as sn
import matplotlib.pyplot as mp
# Load the training set: every line of the text file holds comma-separated
# numbers; the last one is the class label, the ones before it are features.
features, labels = [], []
with open('../../data/knn.txt', 'r') as f:
    for line in f:
        values = [float(field) for field in line.split(',')]
        features.append(values[:-1])
        labels.append(values[-1])
train_x = np.array(features)
train_y = np.array(labels, dtype=int)

# Create and train the KNN classifier (10 neighbours, distance-weighted).
model = sn.KNeighborsClassifier(n_neighbors=10, weights='distance')
model.fit(train_x, train_y)

# Sampling grid: the range of each of the two features padded by 1 on both
# sides, sampled every 0.005 horizontally and vertically.
step = 0.005
left, right = train_x[:, 0].min() - 1, train_x[:, 0].max() + 1
bottom, top = train_x[:, 1].min() - 1, train_x[:, 1].max() + 1
grid_h, grid_v = np.meshgrid(np.arange(left, right, step),
                             np.arange(bottom, top, step))

# Turn the grid into a two-column sample array (horizontal coordinate,
# vertical coordinate), classify every grid point with the KNN model, and
# fold the flat predictions back into the shape of the grid.
flat_x = np.column_stack((grid_h.ravel(), grid_v.ravel()))
flat_y = model.predict(flat_x)
grid_y = flat_y.reshape(grid_h.shape)

# Three query points: predict their classes and look up the indices of the
# training samples the model reports as their nearest neighbours.
test_x = np.array([[2.2, 6.2],
                   [3.6, 1.8],
                   [4.5, 3.6]])
pred_test_y = model.predict(test_x)
nn_distance, nn_indices = model.kneighbors(test_x)

# Draw the classified grid, the training samples and the query points.
mp.figure('KNN Classification', facecolor='lightgray')
mp.title('KNN Classification', fontsize=20)
mp.xlabel('x', fontsize=14)
mp.ylabel('y', fontsize=14)
mp.tick_params(labelsize=10)
mp.pcolormesh(grid_h, grid_v, grid_y, cmap='gray')
mp.scatter(train_x[:, 0], train_x[:, 1], c=train_y, cmap='brg', s=60)
mp.scatter(test_x[:, 0], test_x[:, 1], c=pred_test_y, cmap='brg', s=60,
           marker='D')
# Outline the neighbours of each query point in its own colour
# (first query red, second blue, third green).
for neighbours, colour in zip(nn_indices, ('r', 'b', 'g')):
    mp.scatter(train_x[neighbours, 0], train_x[neighbours, 1], marker='D',
               edgecolor=colour, facecolor='none', s=180)
mp.show()
对于类别比例相差悬殊的样本不适合用KNN法预测分类。
2.回归
对于一个未知输出的样本,在其周围寻找距离最近的K个已知样本,根据与距离成反比的加权平均,决定未知样本的输出。
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import numpy as np
import sklearn.neighbors as sn
import matplotlib.pyplot as mp
# Training data: 100 single-feature inputs scaled from np.random.rand into
# the interval around [-5, 5], with sinc(x) outputs perturbed by noise of
# amplitude 0.1 (0.2 * (0.5 - rand)).
train_x = 10 * np.random.rand(100, 1) - 5
noise = 0.2 * (0.5 - np.random.rand(train_x.size))
train_y = np.sinc(train_x).ravel() + noise

# Create and train the KNN regressor (10 neighbours, distance-weighted).
model = sn.KNeighborsRegressor(n_neighbors=10, weights='distance')
model.fit(train_x, train_y)

# Dense test inputs across the same interval, their noise-free sinc values,
# and the model's predictions for them.
test_x = np.linspace(-5, 5, 10000)[:, np.newaxis]
test_y = np.sinc(test_x).ravel()
pred_test_y = model.predict(test_x)

# Plot training samples, the true curve and the predicted curve.
mp.figure('KNN Regression', facecolor='lightgray')
mp.title('KNN Regression', fontsize=20)
mp.xlabel('x', fontsize=14)
mp.ylabel('y', fontsize=14)
mp.tick_params(labelsize=10)
mp.grid(linestyle=':')
mp.scatter(train_x, train_y, c='dodgerblue', s=60, label='Training')
mp.plot(test_x, test_y, '--', c='limegreen', linewidth=1, label='Testing')
mp.plot(test_x, pred_test_y, c='orangered', label='Predicted')
mp.legend()
mp.show()
对于预测输入远远偏离训练集的样本不适合用KNN法回归预测。
----------------------------------------------------------------
回归问题:线性回归、岭回归、多项式回归、决策树、支持向量机、KNN
分类问题:逻辑分类、朴素贝叶斯、决策树、支持向量机、KNN
聚类问题:K均值、均值漂移、凝聚层次、DBSCAN
----------------------------------------------------------------
推荐引擎
1.欧氏距离得分
两个样本:
A: [a1, a2, ..., an]
B: [b1, b2, ..., bn]
欧氏距离: $e = \sqrt{(a_1-b_1)^2 + (a_2-b_2)^2 + \cdots + (a_n-b_n)^2}$
取值范围: $0 \le e < \infty$,e 越接近 0 越相似,e 越大越不相似。
欧氏距离得分: $es = \dfrac{1}{1 + e}$
取值范围: $0 < es \le 1$,es 越接近 1 越相似,越接近 0 越不相似。
衡量两个样本相似度的指标。
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import json
import numpy as np
def common_movies(ratings, user1, user2):
    """Return the set of movies that both *user1* and *user2* have rated.

    *ratings* maps a user name to a mapping of movie name -> rating.
    """
    return {movie for movie in ratings[user1] if movie in ratings[user2]}


def euclidean_score(ratings, user1, user2):
    """Return the Euclidean-distance score 1 / (1 + e) of two users.

    *e* is the Euclidean distance between the two users' ratings of the
    movies they have both rated.  Identical ratings give 1; the score
    approaches 0 as the ratings diverge.  Users with no movie in common
    score 0.0.
    """
    movies = common_movies(ratings, user1, user2)
    if not movies:
        return 0.0
    x = np.array([ratings[user1][movie] for movie in movies])
    y = np.array([ratings[user2][movie] for movie in movies])
    return 1 / (1 + np.sqrt(((x - y) ** 2).sum()))


def score_matrix(ratings, users):
    """Return the square matrix of Euclidean scores between all *users*.

    Row i, column j holds the score of users[i] against users[j].
    """
    return np.array([[euclidean_score(ratings, user1, user2)
                      for user2 in users]
                     for user1 in users])


# Guarded so the helpers above can be imported without touching the data
# file; running the file as a script prints the same output as before.
if __name__ == '__main__':
    with open('../../data/ratings.json') as f:
        ratings = json.load(f)
    users = list(ratings.keys())
    scmat = score_matrix(ratings, users)
    users = np.array(users)
    print(users)
    for scrow in scmat:
        print(' '.join('{:>5.2f}'.format(score) for score in scrow))
      A     B     C
A     1     0.6   0.7
B     0.6   1     0.4
C     0.7   0.4   1
2.皮氏距离得分
相关性系数(皮尔逊相关系数):
$r = \dfrac{\mathrm{ave}(\mathrm{dev}(a)\,\mathrm{dev}(b))}{\mathrm{std}(a)\,\mathrm{std}(b)}$,取值范围 $-1 \le r \le 1$
r 接近 -1: 反相关(相反); r 接近 0: 不相关(不相似); r 接近 1: 正相关(很相似)
衡量两个样本相似度的指标。
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import json
import numpy as np
def common_movies(ratings, user1, user2):
    """Return the set of movies that both *user1* and *user2* have rated.

    *ratings* maps a user name to a mapping of movie name -> rating.
    """
    return {movie for movie in ratings[user1] if movie in ratings[user2]}


def pearson_score(ratings, user1, user2):
    """Return the Pearson correlation of two users' shared ratings.

    The result lies in [-1, 1].  It is 0.0 when the correlation is
    undefined: the users share no movie, or either user's ratings of the
    shared movies are all equal (which includes sharing a single movie).
    np.corrcoef would return NaN and emit a RuntimeWarning in that case.
    """
    movies = common_movies(ratings, user1, user2)
    if not movies:
        return 0.0
    x = np.array([ratings[user1][movie] for movie in movies])
    y = np.array([ratings[user2][movie] for movie in movies])
    # A constant vector has zero standard deviation -> division by zero.
    if x.min() == x.max() or y.min() == y.max():
        return 0.0
    return np.corrcoef(x, y)[0, 1]


def score_matrix(ratings, users):
    """Return the square matrix of Pearson scores between all *users*.

    Row i, column j holds the score of users[i] against users[j].
    """
    return np.array([[pearson_score(ratings, user1, user2)
                      for user2 in users]
                     for user1 in users])


# Guarded so the helpers above can be imported without touching the data
# file; running the file as a script still prints the score matrix.
if __name__ == '__main__':
    with open('../../data/ratings.json') as f:
        ratings = json.load(f)
    users = list(ratings.keys())
    scmat = score_matrix(ratings, users)
    users = np.array(users)
    print(users)
    for scrow in scmat:
        print(' '.join('{:>5.2f}'.format(score) for score in scrow))
3.按照皮氏距离得分从高到低的顺序,为每个用户生成除其自身以外的相似用户清单
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import json
import numpy as np
def common_movies(ratings, user1, user2):
    """Return the set of movies that both *user1* and *user2* have rated.

    *ratings* maps a user name to a mapping of movie name -> rating.
    """
    return {movie for movie in ratings[user1] if movie in ratings[user2]}


def pearson_score(ratings, user1, user2):
    """Return the Pearson correlation of two users' shared ratings.

    The result lies in [-1, 1].  It is 0.0 when the correlation is
    undefined: the users share no movie, or either user's ratings of the
    shared movies are all equal (which includes sharing a single movie).
    np.corrcoef would return NaN there, and NaN is placed last by argsort,
    so a descending ranking would list such users as the most similar.
    """
    movies = common_movies(ratings, user1, user2)
    if not movies:
        return 0.0
    x = np.array([ratings[user1][movie] for movie in movies])
    y = np.array([ratings[user2][movie] for movie in movies])
    # A constant vector has zero standard deviation -> division by zero.
    if x.min() == x.max() or y.min() == y.max():
        return 0.0
    return np.corrcoef(x, y)[0, 1]


def score_matrix(ratings, users):
    """Return the square matrix of Pearson scores between all *users*.

    Row i, column j holds the score of users[i] against users[j].
    """
    return np.array([[pearson_score(ratings, user1, user2)
                      for user2 in users]
                     for user1 in users])


def rank_similar_users(users, scmat, i):
    """Return (names, scores) of every user except users[i], best first.

    *scmat* is the score matrix for *users*; the other users are ordered
    by their score against users[i], from highest to lowest.
    """
    names = np.asarray(users)
    row = np.asarray(scmat)[i]
    order = row.argsort()[::-1]
    order = order[order != i]  # a user is not their own neighbour
    return names[order], row[order]


# Guarded so the helpers above can be imported without touching the data
# file; running the file as a script still prints each user's ranking.
if __name__ == '__main__':
    with open('../../data/ratings.json') as f:
        ratings = json.load(f)
    users = list(ratings.keys())
    scmat = score_matrix(ratings, users)
    users = np.array(users)
    for i, user in enumerate(users):
        similar_users, similar_scores = rank_similar_users(users, scmat, i)
        print(user)
        for similar_user, similar_score in zip(
                similar_users, similar_scores):
            print(' ', similar_user, '->', similar_score)
4.推荐列表
相似用户评价过而被推荐用户不曾评价。
按推荐度的降序排列。
如何计算推荐度?用相似用户对该电影的评分,以各相似用户与被推荐用户的相似度为权重做加权平均(只考虑相似度为正的用户)。
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import json
import numpy as np
def common_movies(ratings, user1, user2):
    """Return the set of movies that both *user1* and *user2* have rated.

    *ratings* maps a user name to a mapping of movie name -> rating.
    """
    return {movie for movie in ratings[user1] if movie in ratings[user2]}


def pearson_score(ratings, user1, user2):
    """Return the Pearson correlation of two users' shared ratings.

    The result lies in [-1, 1].  It is 0.0 when the correlation is
    undefined: the users share no movie, or either user's ratings of the
    shared movies are all equal (which includes sharing a single movie).
    np.corrcoef would return NaN and emit a RuntimeWarning in that case.
    """
    movies = common_movies(ratings, user1, user2)
    if not movies:
        return 0.0
    x = np.array([ratings[user1][movie] for movie in movies])
    y = np.array([ratings[user2][movie] for movie in movies])
    # A constant vector has zero standard deviation -> division by zero.
    if x.min() == x.max() or y.min() == y.max():
        return 0.0
    return np.corrcoef(x, y)[0, 1]


def score_matrix(ratings, users):
    """Return the square matrix of Pearson scores between all *users*.

    Row i, column j holds the score of users[i] against users[j].
    """
    return np.array([[pearson_score(ratings, user1, user2)
                      for user2 in users]
                     for user1 in users])


def recommend(ratings, users, scmat, i):
    """Return the movies recommended to users[i], best first.

    Candidates are movies that users[i] has not rated but at least one
    positively correlated user has.  A candidate's rank is the average of
    those users' ratings of it, weighted by their score against users[i].
    """
    names = np.asarray(users)
    row = np.asarray(scmat)[i]
    user = names[i]
    order = row.argsort()[::-1]
    order = order[order != i]  # a user is not their own neighbour

    score_sums, weight_sums = {}, {}
    for similar_user, similarity in zip(names[order], row[order]):
        # Only positively correlated users contribute to the average.
        if not similarity > 0:
            continue
        for movie, score in ratings[similar_user].items():
            if movie in ratings[user]:
                continue
            score_sums[movie] = score_sums.get(movie, 0) + score * similarity
            weight_sums[movie] = weight_sums.get(movie, 0) + similarity

    movies = list(score_sums)
    ranks = np.array([score_sums[movie] / weight_sums[movie]
                      for movie in movies])
    return [movies[k] for k in ranks.argsort()[::-1]]


# Guarded so the helpers above can be imported without touching the data
# file; running the file as a script still prints each user's list.
if __name__ == '__main__':
    with open('../../data/ratings.json') as f:
        ratings = json.load(f)
    users = list(ratings.keys())
    scmat = score_matrix(ratings, users)
    users = np.array(users)
    for i, user in enumerate(users):
        recomms = recommend(ratings, users, scmat, i)
        print(user)
        for movie in recomms:
            print(" ", movie)
想要看更多的课程请微信关注SkrEric的编程课堂