python实现K-近邻算法

python实现K-近邻算法

摘要:本文首先浅谈了自己对K-近邻算法的理解,进而通过Python一步步实现K-近邻算法,并通过Matplotlib对数据可视化,最后,选取相应的测试数据集对算法进行测试。

关键词:机器学习, k-近邻算法 , python, matplotlib

1、简介
k-近邻算法(K Nearest Neighbor,KNN),简单来说就是采用测量不同特征值之间的距离方法进行分类。即给定一个训练数据集,并且样本数据集中的每个数据都存在标签,对新的输入实例,在训练数据集中找到与该实例最邻近的K个实例(也就是上面所说的K个邻居), 这K个实例的多数属于某个类,就把该输入实例分类到这个类中。

在进行最近点计算时,采用欧式距离公式,计算两个向量点 $x_A$ 与 $x_B$ 之间的距离:
$$d(x_A, x_B) = \sqrt{\sum_{i=1}^{n} \left(x_{A,i} - x_{B,i}\right)^2}$$
2、数据准备
数据以txt文件格式存储,包括三类特征:
1. 每年运动的总里程数
2. 每周打游戏所占时间百分比
3. 每天喝饮料的升数
这里写图片描述
从文本读取数据代码:

def file2matrix(filename):
    """Load a whitespace-separated data file into a feature matrix and labels.

    Every non-blank line must hold at least three numeric feature columns,
    with an integer class label in its last column. Blank lines are skipped.

    :param filename: path of the text file to read.
    :return: ``(returnMat, classLabelVector)`` - an ``m x 3`` float array
        holding the first three columns of every data line, and a list with
        the integer label of every data line, both in file order.
    """
    rows = []
    classLabelVector = []
    # The context manager closes the file even when a line fails to parse.
    with open(filename) as fr:
        for line in fr:
            listFromLine = line.strip().split()
            if not listFromLine:
                # A blank line (e.g. a trailing newline) carries no sample.
                continue
            rows.append([float(value) for value in listFromLine[0:3]])
            classLabelVector.append(int(listFromLine[-1]))
    # Keep the (m, 3) shape even when the file contains no samples at all.
    returnMat = np.zeros([len(rows), 3])
    if rows:
        returnMat[:] = rows
    return returnMat, classLabelVector

使用matplotlib工具包可视化上述数据:

def test2():
    """Draw the normalised dating data as one 3-D and two 2-D scatter plots.

    Reads ``./datingTestSet2.txt`` through ``file2matrix``, rescales it with
    ``autoNorm`` and prints the scaled matrix. Each class label (1, 2, 3) is
    drawn with its own colour and marker size. On every figure column 0 is
    titled as yearly mileage, column 1 as share of time spent gaming and
    column 2 as litres drunk per day.

    :return: None
    """
    datingDataMat, datingLabels = file2matrix('./datingTestSet2.txt')
    datingDataMat, _, _ = autoNorm(datingDataMat)
    print(datingDataMat)
    # An array (rather than a list) is needed for the element-wise comparisons below.
    datingLabels = np.array(datingLabels)

    legendNames = ['不受欢迎', '一般受欢迎', '非常受欢迎']
    # (class label, colour, marker size) for the three classes.
    classStyles = [(1, 'r', 10), (2, 'g', 20), (3, 'b', 30)]

    def scatterByClass(ax, columns):
        """Scatter the given feature columns once per class; return the handles."""
        handles = []
        for classLabel, colour, size in classStyles:
            rows = datingLabels == classLabel
            coords = [datingDataMat[rows, col] for col in columns]
            handles.append(ax.scatter(*coords, marker='o', c=colour, s=size))
        return handles

    # Figure 1: all three features. The axes are requested from the figure so
    # that they are attached to it; a bare Axes3D(fig) is not added to the
    # figure automatically by recent Matplotlib releases.
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    handles = scatterByClass(ax, (0, 1, 2))
    # Axis titles follow the plotted columns: x = column 0, y = column 1, z = column 2.
    ax.set_xlabel(r'每年运动的总里程数')
    ax.set_ylabel(r'每周打游戏所占时间百分比')
    ax.set_zlabel(r'每天喝饮料的升数')
    ax.set_xlim(-0.1, 1.1)
    ax.set_ylim(-0.1, 1.1)
    ax.legend(handles=handles, labels=legendNames)

    # Figure 2: column 1 against column 2.
    fig = plt.figure(2)
    ax = fig.add_subplot(111)
    handles = scatterByClass(ax, (1, 2))
    ax.set_xlabel(r'玩游戏所耗的时间百分比')
    ax.set_ylabel(r'每天喝饮料的升数')
    ax.set_ylim(0, 1)
    ax.legend(handles=handles, labels=legendNames)

    # Figure 3: column 0 against column 1.
    fig = plt.figure(3)
    ax = fig.add_subplot(111)
    handles = scatterByClass(ax, (0, 1))
    ax.legend(handles=handles, labels=legendNames, loc='best')
    ax.set_xlabel(r'每年运动的总里程数')
    ax.set_ylabel(r'玩游戏所耗的时间百分比')
    ax.set_xlim(0, 1)
    plt.show()

这里写图片描述
二维散点图
这里写图片描述

归一化处理
由于采用欧式距离作为计算最近点的方式,取值范围较大的属性会在距离计算中占据主导地位,从而影响分类结果。
采用数值归一化方法,把每个属性线性缩放到 [0, 1] 区间,可以避免这种问题:
$$newValue = \frac{oldValue - min}{max - min}$$

def autoNorm(dataSet): # np.array [m x n]
    """Rescale every column of ``dataSet`` linearly onto [0, 1] (min-max).

    Each value becomes ``(value - column_min) / (column_max - column_min)``.
    A column whose values are all equal has a range of zero; it is mapped to
    0 instead of producing NaN through a 0/0 division.

    :param dataSet: 2-D array-like with one sample per row [m x n].
    :return: ``(normDataSet, ranges, minVals)`` - the rescaled [m x n] array,
        the per-column ``max - min`` [n] (zero for constant columns, so
        callers dividing by it must guard those entries) and the per-column
        minima [n].
    """
    dataSet = np.asarray(dataSet)
    minVals = dataSet.min(0)
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    # Divide constant columns by 1 so they normalise to 0 rather than NaN.
    safeRanges = np.where(ranges == 0, 1.0, ranges)
    # Broadcasting applies the per-column vectors to every row.
    normDataSet = (dataSet - minVals) / safeRanges
    return normDataSet, ranges, minVals

分类算法:
参数:
inX–输入(待分类数据)
dataSet–已获取的数据集
labels–对应的分类标签
k–近邻值(一般小于20)

def classify0(inX,dataSet,labels,k):
    """Classify one sample by majority vote among its k nearest neighbours.

    Distances are Euclidean and are computed against every row of
    ``dataSet``.

    :param inX: feature vector to classify; anything that broadcasts against
        the rows of ``dataSet`` (a flat sequence or a 1 x n array).
    :param dataSet: 2-D array of training samples, one sample per row.
    :param labels: label sequence; ``labels[i]`` belongs to ``dataSet[i]``.
    :param k: number of neighbours that vote. Values larger than the number
        of training samples are clamped to that number.
    :return: the most frequent label among the voting neighbours; on a tie
        the label met first when walking from the nearest neighbour outwards
        wins.
    :raises ValueError: if ``k`` is smaller than 1 or ``dataSet`` has no rows.
    """
    dataSetSize = dataSet.shape[0]
    if dataSetSize == 0:
        raise ValueError("dataSet must contain at least one sample")
    if k < 1:
        raise ValueError("k must be at least 1")
    # Never ask for more neighbours than there are training samples.
    k = min(k, dataSetSize)

    # Broadcasting subtracts inX from every training row.
    diffMat = np.asarray(inX) - dataSet
    distances = (diffMat ** 2).sum(axis=1) ** 0.5
    sortedDistIndicies = distances.argsort()

    classCount = {}   # label -> number of votes, filled nearest-first
    for index in sortedDistIndicies[:k]:
        voteIlabel = labels[index]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1
    # max() returns the first label holding the highest count, i.e. the one
    # inserted earliest, which keeps ties in favour of the nearer neighbour.
    return max(classCount, key=classCount.get)

测试算法:
100组作为测试输入
900组作为数据集

def datingClassTest():
    """
    Hold-out check of the classifier on the dating data set.

    The first 10% of the normalised rows are classified one by one (k = 4)
    against the remaining rows; every prediction is printed next to the true
    label, followed by the overall error rate.
    :return:
    """
    holdOutFraction = 0.1
    features, answers = file2matrix('datingTestSet2.txt')
    scaled, _, _ = autoNorm(features)
    total = scaled.shape[0]
    testCount = int(total * holdOutFraction)
    print(testCount)

    # Rows after the hold-out prefix act as the training set.
    trainRows = scaled[testCount:total, :]
    trainAnswers = answers[testCount:total]

    mistakes = 0
    for sample, truth in zip(scaled[:testCount], answers[:testCount]):
        predicted = classify0(inX=sample,
                              dataSet=trainRows,
                              labels=trainAnswers,
                              k=4)
        print("the classifier came back with :%d,the real answer is %d"
              % (predicted, truth))
        mistakes += 1 if predicted != truth else 0
    print("the total error rate is %f" % (mistakes / float(testCount)))

验证结果:
这里写图片描述
这里写图片描述

小结

  1. KNN 是一种有监督的分类算法(不是聚类算法):找出与待分类样本距离最近的 K 个训练样本,并以其中占多数的类别作为最终分类结果
  2. 数据归一化处理,避免某个属性的差异导致较大的误差
  3. matplotlib工具包的应用

参考文献
[1].PeterHarrington, 哈林顿, 李锐. 机器学习实战[M]. 人民邮电出版社, 2013.

完整代码:

import numpy as np
import operator
import os
import matplotlib.pyplot as plt
from pylab import mpl
from mpl_toolkits.mplot3d import Axes3D
# Use the SimHei font for sans-serif text - presumably so that the Chinese
# plot titles used in this file render; TODO confirm the font is installed.
mpl.rcParams['font.sans-serif'] = ['SimHei']
mpl.rcParams['axes.unicode_minus'] = False # avoids the minus sign '-' showing up as a box in saved figures
def createDataSet():
    """Return a four-point toy training set and its labels.

    :return: ``(group, labels)`` - a 4 x 2 float array and the matching list
        of labels ('A' for the first two points, 'B' for the last two).
    """
    labels = list('AABB')
    group = np.array([
        [1.0, 1.1],
        [1.0, 1.0],
        [0.0, 0.0],
        [0.0, 0.1],
    ])
    return group, labels

def classify0(inX,dataSet,labels,k):
    """Classify one sample by majority vote among its k nearest neighbours.

    Distances are Euclidean and are computed against every row of
    ``dataSet``.

    :param inX: feature vector to classify; anything that broadcasts against
        the rows of ``dataSet`` (a flat sequence or a 1 x n array).
    :param dataSet: 2-D array of training samples, one sample per row.
    :param labels: label sequence; ``labels[i]`` belongs to ``dataSet[i]``.
    :param k: number of neighbours that vote. Values larger than the number
        of training samples are clamped to that number.
    :return: the most frequent label among the voting neighbours; on a tie
        the label met first when walking from the nearest neighbour outwards
        wins.
    :raises ValueError: if ``k`` is smaller than 1 or ``dataSet`` has no rows.
    """
    dataSetSize = dataSet.shape[0]
    if dataSetSize == 0:
        raise ValueError("dataSet must contain at least one sample")
    if k < 1:
        raise ValueError("k must be at least 1")
    # Never ask for more neighbours than there are training samples.
    k = min(k, dataSetSize)

    # Broadcasting subtracts inX from every training row.
    diffMat = np.asarray(inX) - dataSet
    distances = (diffMat ** 2).sum(axis=1) ** 0.5
    sortedDistIndicies = distances.argsort()

    classCount = {}   # label -> number of votes, filled nearest-first
    for index in sortedDistIndicies[:k]:
        voteIlabel = labels[index]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1
    # max() returns the first label holding the highest count, i.e. the one
    # inserted earliest, which keeps ties in favour of the nearer neighbour.
    return max(classCount, key=classCount.get)



def file2matrix(filename):
    """Load a whitespace-separated data file into a feature matrix and labels.

    Every non-blank line must hold at least three numeric feature columns,
    with an integer class label in its last column. Blank lines are skipped.

    :param filename: path of the text file to read.
    :return: ``(returnMat, classLabelVector)`` - an ``m x 3`` float array
        holding the first three columns of every data line, and a list with
        the integer label of every data line, both in file order.
    """
    rows = []
    classLabelVector = []
    # The context manager closes the file even when a line fails to parse.
    with open(filename) as fr:
        for line in fr:
            listFromLine = line.strip().split()
            if not listFromLine:
                # A blank line (e.g. a trailing newline) carries no sample.
                continue
            rows.append([float(value) for value in listFromLine[0:3]])
            classLabelVector.append(int(listFromLine[-1]))
    # Keep the (m, 3) shape even when the file contains no samples at all.
    returnMat = np.zeros([len(rows), 3])
    if rows:
        returnMat[:] = rows
    return returnMat, classLabelVector

def autoNorm(dataSet): # np.array [m x n]
    """Rescale every column of ``dataSet`` linearly onto [0, 1] (min-max).

    Each value becomes ``(value - column_min) / (column_max - column_min)``.
    A column whose values are all equal has a range of zero; it is mapped to
    0 instead of producing NaN through a 0/0 division.

    :param dataSet: 2-D array-like with one sample per row [m x n].
    :return: ``(normDataSet, ranges, minVals)`` - the rescaled [m x n] array,
        the per-column ``max - min`` [n] (zero for constant columns, so
        callers dividing by it must guard those entries) and the per-column
        minima [n].
    """
    dataSet = np.asarray(dataSet)
    minVals = dataSet.min(0)
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    # Divide constant columns by 1 so they normalise to 0 rather than NaN.
    safeRanges = np.where(ranges == 0, 1.0, ranges)
    # Broadcasting applies the per-column vectors to every row.
    normDataSet = (dataSet - minVals) / safeRanges
    return normDataSet, ranges, minVals

def datingClassTest():
    """
    Hold-out check of the classifier on the dating data set.

    The first 10% of the normalised rows are classified one by one (k = 4)
    against the remaining rows; every prediction is printed next to the true
    label, followed by the overall error rate.
    :return:
    """
    holdOutFraction = 0.1
    features, answers = file2matrix('datingTestSet2.txt')
    scaled, _, _ = autoNorm(features)
    total = scaled.shape[0]
    testCount = int(total * holdOutFraction)
    print(testCount)

    # Rows after the hold-out prefix act as the training set.
    trainRows = scaled[testCount:total, :]
    trainAnswers = answers[testCount:total]

    mistakes = 0
    for sample, truth in zip(scaled[:testCount], answers[:testCount]):
        predicted = classify0(inX=sample,
                              dataSet=trainRows,
                              labels=trainAnswers,
                              k=4)
        print("the classifier came back with :%d,the real answer is %d"
              % (predicted, truth))
        mistakes += 1 if predicted != truth else 0
    print("the total error rate is %f" % (mistakes / float(testCount)))

def classifyPerson():
    """Ask for one person's three feature values and print the predicted class.

    The answers are normalised with the minima and ranges of
    ``datingTestSet2.txt`` and classified with k = 3 against that data set.
    """
    verdicts = ['不受欢迎', '一般受欢迎', '非常受欢迎']

    gameShare = float(input('每周打游戏所占时间百分比(0-100):'))
    yearlyMiles = float(input("每年运动的总里程数(0-100000):"))
    dailyLitres = float(input('每天喝饮料的升数(0-10):'))

    features, answers = file2matrix('datingTestSet2.txt')
    scaled, ranges, minVals = autoNorm(features)

    # The query vector is built as mileage, gaming share, litres.
    query = np.array([yearlyMiles, gameShare, dailyLitres])
    scaledQuery = (query - minVals) / ranges
    label = classify0(inX=scaledQuery, dataSet=scaled, labels=answers, k=3)
    # Class labels start at 1, the list of verdicts at 0.
    print("你是一个这样的人:", verdicts[label - 1])

def img2vector(filename):
    """Read a 32 x 32 grid of digit characters into a 1 x 1024 row vector.

    The first 32 characters of each of the first 32 lines are converted with
    ``int`` and stored row after row.

    :param filename: path of the text file to read.
    :return: float array of shape (1, 1024).
    :raises IndexError: if one of the first 32 lines has fewer than 32 characters.
    :raises ValueError: if a character cannot be converted with ``int``.
    """
    returnVector = np.zeros([1,1024])  # row vector
    # The context manager closes the file even when parsing fails.
    with open(filename) as fr:
        for i in range(32):
            lineStr = fr.readline()
            returnVector[0, 32*i:32*(i + 1)] = [int(lineStr[j]) for j in range(32)]
    return returnVector


def handwritingClassTest():
    """Evaluate the k-NN classifier (k = 5) on the handwritten-digit text files.

    Every file in ``digits/trainingDigits`` becomes one training row and every
    file in ``digits/testDigits`` is classified against them. File names are
    expected to look like ``<label>_<sample>.<ext>``: the integer before the
    underscore is used as the true label. Each prediction is printed,
    followed by the number of errors, the number of test files and the error
    rate.

    :return: None
    """
    # os.path.join keeps the paths valid on every platform, not only Windows.
    trainingDir = os.path.join('digits', 'trainingDigits')
    testDir = os.path.join('digits', 'testDigits')

    hwLabels = []
    trainingFileList = os.listdir(trainingDir)
    m = len(trainingFileList)  # number of training samples
    trainingMat = np.zeros([m,1024])
    for i, fileNameStr in enumerate(trainingFileList):
        fileStr = fileNameStr.split('.')[0]  # e.g. "0_0"
        hwLabels.append(int(fileStr.split('_')[0]))
        trainingMat[i,:] = img2vector(os.path.join(trainingDir, fileNameStr))

    testFileList = os.listdir(testDir)
    mTest = len(testFileList)
    errorCount = 0
    for fileNameStr in testFileList:
        fileStr = fileNameStr.split('.')[0]
        classNumStr = int(fileStr.split('_')[0])
        vectorUnderTest = img2vector(os.path.join(testDir, fileNameStr))
        classifierResult = classify0(vectorUnderTest,trainingMat,hwLabels,5)
        print('分类结果为:%d,真实结果为:%d'%(classifierResult,classNumStr))
        if classifierResult != classNumStr:
            errorCount+=1
    # An empty test directory would otherwise end in a division by zero.
    errorRate = errorCount/float(mTest) if mTest else 0.0
    print("\n错误的总个数:%d"%errorCount)
    print('\n总个数:%d'%mTest)
    print('\n误差率:%f'%errorRate)

def test1():
    """Try the k-NN classifier (k = 3) on the toy data set and plot its points."""
    group, labels = createDataSet()
    plt.figure()
    plt.scatter(group[:, 0], group[:, 1])

    queries = [[0, 1], [0, 0], [2, 2]]
    ans = [classify0(query, group, labels, 3) for query in queries]
    print(ans)
    plt.show()

def test2():
    """Draw the normalised dating data as one 3-D and two 2-D scatter plots.

    Reads ``./datingTestSet2.txt`` through ``file2matrix``, rescales it with
    ``autoNorm`` and prints the scaled matrix. Each class label (1, 2, 3) is
    drawn with its own colour and marker size. On every figure column 0 is
    titled as yearly mileage, column 1 as share of time spent gaming and
    column 2 as litres drunk per day.

    :return: None
    """
    datingDataMat, datingLabels = file2matrix('./datingTestSet2.txt')
    datingDataMat, _, _ = autoNorm(datingDataMat)
    print(datingDataMat)
    # An array (rather than a list) is needed for the element-wise comparisons below.
    datingLabels = np.array(datingLabels)

    legendNames = ['不受欢迎', '一般受欢迎', '非常受欢迎']
    # (class label, colour, marker size) for the three classes.
    classStyles = [(1, 'r', 10), (2, 'g', 20), (3, 'b', 30)]

    def scatterByClass(ax, columns):
        """Scatter the given feature columns once per class; return the handles."""
        handles = []
        for classLabel, colour, size in classStyles:
            rows = datingLabels == classLabel
            coords = [datingDataMat[rows, col] for col in columns]
            handles.append(ax.scatter(*coords, marker='o', c=colour, s=size))
        return handles

    # Figure 1: all three features. The axes are requested from the figure so
    # that they are attached to it; a bare Axes3D(fig) is not added to the
    # figure automatically by recent Matplotlib releases.
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    handles = scatterByClass(ax, (0, 1, 2))
    # Axis titles follow the plotted columns: x = column 0, y = column 1, z = column 2.
    ax.set_xlabel(r'每年运动的总里程数')
    ax.set_ylabel(r'每周打游戏所占时间百分比')
    ax.set_zlabel(r'每天喝饮料的升数')
    ax.set_xlim(-0.1, 1.1)
    ax.set_ylim(-0.1, 1.1)
    ax.legend(handles=handles, labels=legendNames)

    # Figure 2: column 1 against column 2.
    fig = plt.figure(2)
    ax = fig.add_subplot(111)
    handles = scatterByClass(ax, (1, 2))
    ax.set_xlabel(r'玩游戏所耗的时间百分比')
    ax.set_ylabel(r'每天喝饮料的升数')
    ax.set_ylim(0, 1)
    ax.legend(handles=handles, labels=legendNames)

    # Figure 3: column 0 against column 1.
    fig = plt.figure(3)
    ax = fig.add_subplot(111)
    handles = scatterByClass(ax, (0, 1))
    ax.legend(handles=handles, labels=legendNames, loc='best')
    ax.set_xlabel(r'每年运动的总里程数')
    ax.set_ylabel(r'玩游戏所耗的时间百分比')
    ax.set_xlim(0, 1)
    plt.show()

def test3():
    """Print the normalised dating matrix with its per-column ranges and minima."""
    rawFeatures, _ = file2matrix('./datingTestSet2.txt')
    normalised = autoNorm(dataSet=rawFeatures)
    scaled, ranges, minVals = normalised
    print(scaled, ranges, minVals)

def test4():
    """
    Load one digit file as a row vector and print its first 31 entries.
    :return:
    """
    digitVector = img2vector('digits/testDigits/0_0.txt')
    flatRow = digitVector[0]
    print(flatRow[0:31])
def test5():
    """Run the handwritten-digit evaluation by calling ``handwritingClassTest``."""
    handwritingClassTest()

if __name__ == "__main__":
    # Script entry point: only one demo call is enabled; uncomment another to run it.
    #test1()  # k-NN on the toy data set
    #test2()  # scatter plots of the dating data
    #test3()  # normalisation check
    #datingClassTest() # hold-out error rate on the dating data
    #classifyPerson()
    #test4()
    test5()
  • 1
    点赞
  • 8
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值