kNN Algorithm Implementation

kNN Algorithm

import numpy as np
import matplotlib.pyplot as plt
raw_data_X = [[3.4, 2.3],
              [3.1, 1.8],
              [1.3, 3.4],
              [3.6, 4.7],
              [2.3, 2.9],
              [7.4, 4.7],
              [5.7, 3.5],
              [9.2, 2.5],
              [7.8, 3.4],
              [7.9, 0.8],
             ]
raw_data_y = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
X_train = np.array(raw_data_X)
y_train = np.array(raw_data_y)
X_train
array([[3.4, 2.3],
       [3.1, 1.8],
       [1.3, 3.4],
       [3.6, 4.7],
       [2.3, 2.9],
       [7.4, 4.7],
       [5.7, 3.5],
       [9.2, 2.5],
       [7.8, 3.4],
       [7.9, 0.8]])
y_train
array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1])
# the point to be predicted
x = np.array([8.1, 3.4])
plt.scatter(X_train[y_train == 0, 0], X_train[y_train == 0, 1], color='g')
plt.scatter(X_train[y_train == 1, 0], X_train[y_train == 1, 1], color='r')
plt.scatter(x[0], x[1], color='b')
<matplotlib.collections.PathCollection at 0x20ba3c947b8>

[Figure: scatter plot of the two classes (green and red) and the point to be predicted (blue)]

1. The kNN Implementation Process

  • Euclidean distance
    $Dist = \sqrt{\sum_{i=1}^{n} \left(X_i^{(a)} - X_i^{(b)}\right)^2}$
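    For example, plugging in the query point (8.1, 3.4) and the training sample (7.8, 3.4) gives $\sqrt{(8.1 - 7.8)^2 + (3.4 - 3.4)^2} = 0.3$, which matches the smallest value in the distances list computed below.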
from math import sqrt

distances = []
for x_train in X_train:
    # compute the Euclidean distance from this training sample to x
    d = sqrt(np.sum((x_train - x) ** 2))
    distances.append(d)

# equivalently, as a list comprehension:
# distances = [sqrt(np.sum((x_train - x) ** 2)) for x_train in X_train]
distances
[4.827007354458868,
 5.2497618993626745,
 6.8,
 4.684015371452148,
 5.821511831131154,
 1.4764823060233399,
 2.4020824298928622,
 1.4212670403551892,
 0.2999999999999998,
 2.607680962081059]
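As an aside, the same distances can be computed in a single vectorized step with NumPy broadcasting; this is just a minimal sketch and not part of the original notebook:

# vectorized alternative: subtract x from every row of X_train via broadcasting,
# then take the row-wise Euclidean norm of each difference
distances_vec = np.linalg.norm(X_train - x, axis=1)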
nearest = np.argsort(distances)
nearest
array([8, 7, 5, 6, 9, 3, 0, 1, 4, 2], dtype=int64)
# find the k nearest points
k = 6
topK_y = [y_train[i] for i in nearest[:k]]
topK_y
[1, 1, 1, 1, 1, 0]
from collections import Counter
# count the class labels among the k nearest points
Counter(topK_y)
Counter({1: 5, 0: 1})
votes = Counter(topK_y)
# return the most common elements; here we ask for the top 1
votes.most_common(1)
[(1, 5)]
# the final prediction
predict_y = votes.most_common(1)[0][0]
predict_y
1
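Putting the steps above together, the whole manual procedure fits into one small function. This is a minimal sketch (the name kNN_classify and its signature are my own, not from the original notebook):

def kNN_classify(k, X_train, y_train, x):
    # distance from x to every training sample
    distances = [sqrt(np.sum((x_train - x) ** 2)) for x_train in X_train]
    # indices of the training samples sorted by distance
    nearest = np.argsort(distances)
    # labels of the k nearest samples
    topK_y = [y_train[i] for i in nearest[:k]]
    # majority vote among those labels
    votes = Counter(topK_y)
    return votes.most_common(1)[0][0]

kNN_classify(6, X_train, y_train, x)
# should reproduce the prediction above: 1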

2. Using kNN from scikit-learn

from sklearn.neighbors import KNeighborsClassifier
KNN_classifier = KNeighborsClassifier(n_neighbors=6)
KNN_classifier.fit(X_train, y_train)
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=6, p=2,
           weights='uniform')
# in sklearn, predict expects a 2D array of samples
X_predict = x.reshape(1, -1)
X_predict
array([[8.1, 3.4]])
KNN_classifier.predict(X_predict)
array([1])
y_predict = KNN_classifier.predict(X_predict)
y_predict[0]
1
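As a quick sanity check (my addition, not from the original notebook), KNeighborsClassifier also exposes a score method that returns the mean accuracy; on this tiny, well-separated toy set it simply re-classifies the training points:

# mean accuracy on the training data itself -- only a sanity check here
KNN_classifier.score(X_train, y_train)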

3. Reorganizing Our kNN Code

from kNN.kNN import KNNClassifier
knn_clf = KNNClassifier(k=6)
knn_clf.fit(X_train, y_train)
KNN(k=6)
y_predict = knn_clf.predict(X_predict)
y_predict
array([1])

KNNClassifier.py

import numpy as np
from math import sqrt
from collections import Counter
from metrics import accuracy_score

class KNNClassifier:
    def __init__(self, k):
        assert k >= 1, "k must be valid"
        self.k = k
        self._X_train = None
        self._y_train = None

    def fit(self, X_train, y_train):
        """根据训练数据集X_train和y_train训练kNN分类器"""
        assert X_train.shape[0] == y_train.shape[0], \
            "the size of X_train must be equal to the size of y_train"
        assert self.k <= X_train.shape[0], \
            "the size of X_train must be at least k."

        self._X_train = X_train
        self._y_train = y_train
        return self

    def predict(self, X_predict):
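        """Given the data set X_predict to be predicted, return the vector of predicted labels"""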
        assert self._X_train is not None and self._y_train is not None, \
            "must fit before predict!"
        assert X_predict.shape[1] == self._X_train.shape[1], \
            "the feature number of X_predict must be equal to X_train"

        y_predict = [self._predict(x_predict) for x_predict in X_predict]
        return np.array(y_predict)

    def _predict(self, x_predict):
        """给定单个待预测数据x,返回x的预测结果值"""
        assert x_predict.shape[0] == self._X_train.shape[1], \
            "the feature number of x must be equal to X_train"

        dist = [sqrt(np.sum((x_train - x_predict) ** 2)) for x_train in self._X_train]
        nearest = np.argsort(dist)
        top_K = [self._y_train[i] for i in nearest[:self.k]]
        votes = Counter(top_K)
        return votes.most_common(1)[0][0]

    def score(self, X_test, y_test):
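        """Return the accuracy of the classifier on the test set X_test, y_test"""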
        y_predict = self.predict(X_test)
        return accuracy_score(y_test, y_predict)

    def __repr__(self):
        return "KNN(k=%d)" % self.k

metrics.py

import numpy as np

def accuracy_score(y_true, y_predict):
    """计算 y_true 和 y_predict 之间的准确率"""
    assert y_true.shape[0] == y_predict.shape[0], \
        "the size of y_true must be equal to the size of y_predict"

    return sum(y_true == y_predict) / len(y_true)
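For completeness, a tiny usage sketch of accuracy_score itself (the arrays below are illustrative only, not data from the notebook):

y_true = np.array([0, 1, 1, 0])
y_pred = np.array([0, 1, 0, 0])
accuracy_score(y_true, y_pred)   # 0.75 -- three of the four labels match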

Project source code
