python代码仅用numpy模块实现knn算法_knn算法#encoding=utf8import numpy as npclass knnclas-CSDN博客

本文链接：https://blog.csdn.net/weixin_43141320/article/details/105315522

首先说说我实现knn的想法，首先对于投影在一个二维平面的所有点集来说（便于理解所以这里举例子就用二维空间来说，knn完全可以实现n维的预测），假设有A，B，C，D四类，然后给定一个sample坐标值，现在预测它是什么类别的。
给定k值
首先计算这个sample和二维平面其他所有点的欧氏距离，将距离从小到大排序，取前k个距离，分别统计这k个距离里中各类标签出现的次数，然后将sample归类为出现次数最多的那一类标签。但是有可能出现特殊的情况，比如当k=10的时候，A类有12个，B类也有12个，有两个或多个出现次数最多的标签类别，那么就要确定sample到底是分为A类还是B类，我采取的方式是，从已经排序好的距离列表中按顺序确定，如果第一次遇到一个距离所对应的点的标签是A，就将sample分类为A类，同理B类。聪明的你可能还想到一种情况，比如：
[C, D, A, C, B…]
这是和sample距离的从小到大排好序之后的列表，这里的ACB的距离相等，其中C类出现的次数少于A和B，首先排除C类，而你认为这样不能简单地将sample分为A类，那么我们要不要继续判断下一个A和B呢？其实分一下是可以的，但是不分的话影响不大，因为现实情况下这样的情况是很少出现的，即便出现了，我们将sample分为A类也是可以接受的，毕竟人家在K取值为10 的情况下出现的次数和B一样是最多的。而我在代码中也是这么做的。
说这么多，结合代码看是最容易理解的：

import numpy as np


class KNNClassifier(object):
    def __init__(self, k=3):
        self.k = k

    def calDistance(self, sample, train_x):
        # 计算欧式距离：
        # (28, 28, 1)-(10, 28, 28, 1)是可以广播实现的
        squarDistance = (sample - train_x) ** 2
        distanceList = []
        for i in range(train_x.shape[0]):
            distanceList.append(np.sum(squarDistance[i]))
        # 这里不采用求算术平方根的方法，而是采用减去最大值的方法以减少计算量
        distanceList -= np.max(distanceList)
        sortedDistanceIndex = np.argsort(distanceList)
        return distanceList, sortedDistanceIndex

    def classify(self, sample, train_x, train_y, progressBar=True):
        sample = np.array(sample)
        # 将(4, )变为(1, 4)
        if len(sample.shape) == 1:
            sample = sample.reshape((1, sample.shape[0]))
        train_x = np.array(train_x)
        if len(train_x.shape) == 1:
            train_x = train_x.reshape((1, train_x.shape[0]))
        train_y = np.array(train_y)
        if self.k <= 0 or self.k > train_x.shape[0]:
            print("输入的k值不合法！已将k值设为默认值！")
            self.k = 3
        # if isinstance(sample, np.ndarray) and isinstance(train_x, np.ndarray) and isinstance(train_y, np.ndarray):
        #     pass
        # else:
        #     try:
        #         sample = np.array(sample)
        #         train_x = np.array(train_x)
        #         train_y = np.array(train_y)
        #     except:
        #         raise TypeError("numpy.ndarray required for train_x and sample and train_y")
        result = []
        # print("sample.shape[0]:", sample.shape[0])
        for j in range(sample.shape[0]):
            distanceList, sortedDistanceIndex = self.calDistance(sample[j], train_x)
            # 计数在K个近邻中每一种类别出现的次数
            classCount = {}
            # print("sortedDistanceIndex: ", sortedDistanceIndex)
            kLabels = train_y[sortedDistanceIndex[: self.k]]
            # print("kLabels: ", kLabels)
            # for i in range(self.k):
            #     if kLabels[i] not in classCount:
            #         classCount[kLabels[i]] = 1
            #     else:
            #         classCount[kLabels[i]] += 1
            for i in range(kLabels.shape[0]):
                classCount[kLabels[i]] = classCount.get(kLabels[i], 0) + 1
            # 将次数制作成列表
            valueList = list(classCount.values())
            # 如果列表中最大值的个数为1，那么就认为这个sample的类别为这个最大数值个数的类别
            if valueList.count(np.max(valueList)) == 1:
                result.append(kLabels[valueList.index(np.max(valueList))])
            # 如果列表中有多个最大值，那么需要区分这多个类中哪一类距离样本点最近
            else:
                keyList = list(classCount.keys())
                maxValue = np.max(valueList)
                # 储存最大值所对应的标签
                maxNumberLabels = []
                i = 0
                for value in valueList:
                    if value == maxValue:
                        maxNumberLabels.append(keyList[i])
                    i += 1
                # 接下来确定哪一个标签作为预测标签最合适

                for index in range(len(kLabels)):
                    if kLabels[index] in maxNumberLabels:
                        result.append(kLabels[index])
                        break
            if progressBar:
                complete = (j + 1) / sample.shape[0]
                not_complete = 1 - complete
                complete = int(50 * complete)
                not_complete = int(50 * not_complete)
                print('[' + ">" * complete, end='')
                print('-' * not_complete + ']' + '[%' + str((j + 1) * 100 / sample.shape[0]) + ']')

        return result


if __name__ == "__main__":
    train_x = [[4, 2.5], [5, 3.5], [4, 0], [4, 5]]
    train_y = ['A', 'A', 'B', 'B']
    clf = KNNClassifier(k=4)
    inputx = [2.5, 2.5]
    result = clf.classify(inputx, train_x, train_y)
    print(result)