ML：K 近邻算法及实现

最新推荐文章于 2024-08-26 22:44:43 发布

Raymone_

最新推荐文章于 2024-08-26 22:44:43 发布

阅读量282

点赞数

分类专栏：机器学习文章标签： KNN K近邻机器学习

本文链接：https://blog.csdn.net/u012470887/article/details/102492005

版权

机器学习专栏收录该内容

6 篇文章 1 订阅

订阅专栏

K 近邻算法及实现

1. 算法概述
2. KNN Python 实现
3. 实战：使用 k-近邻算法改进约会网站的配对效果
4. 实战：手写识别系统

1. 算法概述

K 近邻算法采用测量不同特征值之间的距离方法进行分类
优点：精度高、对异常值不敏感、无数据输入假定
缺点：计算复杂度高、空间复杂度高
适用数据范围：数值型和标称型
工作原理：已知训练样本集，它包括特征值和标签（X 和 y）。对于测试集中的每一条样本，计算它与训练集中每条样本的距离，取距离最小的 k 条训练数据，并以这 k 条数据中出现次数最多的 y 作为该测试样本的预测输出。

2. KNN Python 实现

np.tile 实现广播
np.argsort() 返回排序后的索引值
collections.Counter().most_common() 选出出现次数最多的类别
欧式距离： $d=\sqrt{(A_1-B_1)^2+(A_2-B_2)^2+...+(A_N-B_N)^2}$

'''k 近邻算法'''
import pandas as pd
import numpy as np
from collections import Counter
def classify0(x, group, labels, k):
    """Classify each row of ``x`` by majority vote among its k nearest neighbours.

    Distances are Euclidean, computed between one test row and every row of
    ``group``.

    Args:
        x: Test feature matrix (2-D array-like), one sample per row.
        group: Training feature matrix (2-D array-like), one sample per row.
        labels: Training labels; ``labels[j]`` belongs to row ``j`` of ``group``.
        k: Number of nearest neighbours that take part in the vote (>= 1).

    Returns:
        list: The predicted label for every row of ``x``, in row order.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError('k must be at least 1')
    x = np.asarray(x)
    group = np.asarray(group)
    result = []
    for sample in x:    # predict one test row at a time
        # Broadcasting subtracts the single test row from every training row,
        # so no explicit np.tile copy is needed.
        diff_mat = group - sample
        distances = (diff_mat ** 2).sum(axis=1) ** 0.5    # Euclidean distance
        nearest_idx = distances.argsort()[:k]    # indices of the k closest rows
        votes = Counter(labels[j] for j in nearest_idx)
        # most_common(1) returns a list of (label, count) tuples, e.g. [('a', 7)]
        result.append(votes.most_common(1)[0][0])
    return result

3. 实战：使用 k-近邻算法改进约会网站的配对效果

数据形式：共 3 列特征（每年飞行距离、玩游戏时间比、每周冰淇淋消费数），最后另有 1 列类别标签，共 1000 行
读取数据，将文本转换为 Numpy：

'''读取数据，将文本转换为 Numpy'''
import pandas as pd
import numpy as np
def file2mat(filename):
    """Load a tab-separated, header-less text file as features and labels.

    Args:
        filename: Path of the text file to read.

    Returns:
        tuple: ``(group, labels)`` where ``group`` holds every column except
        the last one and ``labels`` holds the last column.
    """
    # The text file uses '\t' as its field separator and has no header row
    table = pd.read_csv(filename, sep='\t', header=None).values
    features = table[:, :-1]
    targets = table[:, -1]
    return features, targets

# Load the dating data set once at module level: `group` receives every column
# except the last, `labels` receives the last column (see file2mat).
filename = 'Ch02/datingTestSet2.txt'
group, labels = file2mat(filename)

分析数据，使用 Matplotlib 创建散点图：

# Scatter plot of columns 1 and 2 of `group`, with markers styled by class label.
import matplotlib.pyplot as plt
plt.rcParams['figure.constrained_layout.use'] = True    # let Matplotlib adjust the layout automatically
plt.rcParams['font.sans-serif'] = ['SimHei']    # font used so the Chinese axis labels render
# IPython/Jupyter magic: this line is only valid inside a notebook, not in a plain .py script
%matplotlib notebook
fig = plt.figure()
ax = fig.add_subplot(111)
ax.scatter(group[:,1], group[:, 2], 15.0*np.array(labels), 15.0*np.array(labels))    # marker size and colour both encode the class label
ax.set_xlabel('玩视频游戏所耗时间百分比')
# Joining the characters with '\n' puts one character per line, so the
# horizontally-rotated y label reads top to bottom.
ax.set_ylabel('\n'.join('每周消费的冰淇淋公升数'), rotation='horizontal', 
              verticalalignment='center', horizontalalignment='right')
plt.show()

在这里插入图片描述
4. 准备数据，归一化数值，使用 Min-Max 归一化

import numpy as np
def autoNorm(dataSet):
    """Min-max normalise every column of ``dataSet``.

    Each value is rescaled as ``(value - column_min) / (column_max - column_min)``.
    A constant column (range 0) is mapped to all zeros instead of NaN/inf.

    Args:
        dataSet: 2-D numeric NumPy array, one sample per row.

    Returns:
        tuple: ``(normDataSet, ranges, minVals)`` where ``normDataSet`` is a
        rescaled copy of ``dataSet``, ``ranges`` is the per-column
        ``max - min`` (0 for a constant column) and ``minVals`` is the
        per-column minimum.
    """
    minVals = dataSet.min(0)    # per-column minimum
    maxVals = dataSet.max(0)    # per-column maximum
    ranges = maxVals - minVals    # per-column range
    # Dividing by a zero range would yield NaN/inf and poison every distance
    # computed later; use 1 as the divisor for constant columns instead.
    divisor = np.where(ranges == 0, 1, ranges)
    # Broadcasting applies the per-column vectors to every row (no np.tile needed)
    normDataSet = (dataSet - minVals) / divisor
    return normDataSet, ranges, minVals

使用测试集对分类器进行测试，使用 4-近邻，分类错误率为 4%

'''分类器针对约会网站的测试代码'''
def datingClassTest(hoRatio=0.1, filename='Ch02/datingTestSet2.txt', k=4):
    """Hold-out test of the k-NN classifier on the dating data set.

    The data is loaded with ``file2mat``, normalised with ``autoNorm``, and the
    leading ``hoRatio`` fraction of rows is classified with ``classify0``
    against the remaining rows. The resulting error rate is printed.

    Args:
        hoRatio: Fraction of rows (taken from the start of the file) used as
            the test set.
        filename: Path of the tab-separated data file.
        k: Number of neighbours passed to ``classify0``.

    Returns:
        None. The error rate is written to standard output.
    """
    datingDataMat, datingLabels = file2mat(filename)    # load the data
    normMat, _, _ = autoNorm(datingDataMat)    # only the normalised matrix is needed here
    m = int(normMat.shape[0] * hoRatio)    # number of leading rows held out for testing
    testMat, trainMat = normMat[:m, :], normMat[m:, :]
    testLab, trainLab = datingLabels[:m], datingLabels[m:]
    classifierResults = classify0(testMat, trainMat, trainLab, k)
    # Element-wise comparison counts the misclassified test rows in one step
    errorCount = np.sum(np.asarray(classifierResults) != np.asarray(testLab))
    print('The total error rate is: %f' % (errorCount / float(len(testLab))))

[IN] datingClassTest()
[OUT] The total error rate is: 0.040000

约会网站预测函数

def classifyPerson(game, fly, ice):
    """Predict how much the user will like one or several people.

    Each argument may be a single number (one person) or a sequence of numbers
    (several people); all three must describe the same number of people.

    Args:
        game: Share of time spent playing video games.
        fly: Frequent-flyer miles earned per year.
        ice: Ice cream consumed per week.

    Returns:
        None. The predicted category for every person is printed.
    """
    resultList = ['not at all', 'in small doses', 'in large doses']
    datingDataMat, datingLabels = file2mat('Ch02/datingTestSet2.txt')    # load the known data
    normMat, ranges, minVals = autoNorm(datingDataMat)    # normalise the known data
    # Build one ROW per person with the columns ordered (fly, game, ice).
    # np.atleast_1d lets scalars (including NumPy scalars) and sequences share
    # one code path; column_stack yields shape (n_people, 3) rather than the
    # transposed (3, n_people) that stacking the three sequences directly gives.
    inArr = np.column_stack(
        [np.atleast_1d(fly), np.atleast_1d(game), np.atleast_1d(ice)]
    ).astype(float)
    # Apply the same min-max scaling that was applied to the training data
    classifierResults = classify0((inArr - minVals) / ranges, normMat, datingLabels, 4)
    print('You will probably like this person:', [resultList[int(i)-1] for i in classifierResults])

[IN] classifyPerson(10, 10000, 0.5)
[OUT] You will probably like this person: ['in small doses']

[IN] game = [10, 9, 10]
[IN] fly = [10000, 8000, 9000]
[IN] ice = [0.4, 0.5, 0.6]
[IN] classifyPerson(game, fly, ice)
[OUT] You will probably like this person: ['in large doses', 'in small doses', 'in small doses']

4. 实战：手写识别系统

说明：识别手写数字，分为两个文件夹，训练集和测试集，每个文件为 txt 文件，包含 32 行，每行 32 个数字，对应 32*32 图像，数字间没有分隔符。
将 32×32 的图像转换为 1×1024 的向量

'''将 32*32 的图像转换为 1*1024 的向量'''
import pandas as pd
import numpy as np

def img2vector(filename):
    """Convert a text image of digit characters into a 1 x 1024 integer vector.

    The file is read line by line because the digits have no separator; every
    non-whitespace character is converted to an int, in reading order.

    Args:
        filename: Path of the text file holding the image (1024 digit
            characters in total, e.g. 32 lines of 32 digits).

    Returns:
        numpy.ndarray: Integer array of shape ``(1, 1024)``.

    Raises:
        ValueError: If a character is not a digit or the file does not contain
            exactly 1024 digits.
    """
    # Plain file reading is used instead of pd.read_csv(sep='\n'): recent
    # pandas versions reject a line terminator as separator, and numeric type
    # inference could drop the leading zeros of a row.
    with open(filename, encoding='utf-8') as fh:
        digits = [int(ch) for line in fh for ch in line.strip()]
    return np.array(digits, dtype=int).reshape(1, 1024)

手写数字识别系统的测试代码，使用 3-近邻，错误率为 1.16%。

'''手写数字识别系统的测试代码'''
import os
def _loadDigitDir(dirName):
    """Read every ``.txt`` digit file in ``dirName`` into a matrix and label list."""
    # Skip anything that is not a .txt file (e.g. hidden files), which would
    # otherwise break the integer label parse below.
    fileList = [name for name in os.listdir(dirName) if name.endswith('.txt')]
    featureMat = np.zeros((len(fileList), 1024))
    digitLabels = []
    for row, fileName in enumerate(fileList):
        # The class label is the integer before the first '_' of the file name
        digitLabels.append(int(fileName.split('.')[0].split('_')[0]))
        featureMat[row, :] = img2vector(os.path.join(dirName, fileName))
    return featureMat, digitLabels


def handwritingClassTest():
    """Evaluate the 3-NN classifier on the handwritten-digit data set.

    Training images are read from ``Ch02/trainingDigits/`` and test images from
    ``Ch02/testDigits/``; every test image is classified with ``classify0``
    and the resulting error rate is printed.

    Returns:
        None. The error rate is written to standard output.
    """
    trainingMat, hwLabels = _loadDigitDir('Ch02/trainingDigits/')    # training set
    testMat, classNum = _loadDigitDir('Ch02/testDigits/')    # test set
    classifierResults = classify0(testMat, trainingMat, hwLabels, 3)
    # Element-wise comparison counts the misclassified test images in one step
    errorCount = np.sum(np.asarray(classifierResults) != np.asarray(classNum))
    print('The total error rate is: %f' % (errorCount / float(len(classNum))))
    
[IN] handwritingClassTest()
[OUT] The total error rate is: 0.011628