机器学习实战第二章-k近邻算法

最新推荐文章于 2023-10-09 21:18:54 发布

名为不二的兔子

最新推荐文章于 2023-10-09 21:18:54 发布

阅读量296

点赞数

分类专栏：机器学习

本文链接：https://blog.csdn.net/kt513226724/article/details/80169497

版权

机器学习专栏收录该内容

6 篇文章 0 订阅

订阅专栏

K-近邻算法

1.k近邻算法简要概述

准备：导入数据

import numpy as np
import operator

def createDataSet():
    """Build the four-point toy data set used to demonstrate kNN.

    Returns:
        tuple: ``(group, labels)`` where ``group`` is a 4x2 float array with
        one sample per row and ``labels`` is the list of class names, one
        per row of ``group``.
    """
    samples = [
        [1.0, 1.1],
        [1.0, 1.0],
        [0, 0],
        [0, 0.1],
    ]
    classNames = ['A', 'A', 'B', 'B']
    return np.array(samples), classNames

group,labels=createDataSet()

group

array([[1. , 1.1],
       [1. , 1. ],
       [0. , 0. ],
       [0. , 0.1]])

labels

['A', 'A', 'B', 'B']

import matplotlib.pyplot as plt
import matplotlib.lines as mlines
import seaborn as sns

# Rows 0-1 of `group` carry label 'A' (drawn in green); rows 2-3 carry label 'B' (drawn in red).
plt.scatter(group[0:2,0],group[0:2,1],color='g',label='A')
plt.scatter(group[2:4,0],group[2:4,1],color='r',label='B')
plt.legend()

这里写图片描述

实施kNN算法

def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its k nearest training samples.

    Args:
        inX: Query point with one value per feature. A flat sequence/array or
            a ``(1, n_features)`` row vector are both accepted.
        dataSet: Training samples, one row per sample.
        labels: Class label of each row of ``dataSet`` (same order).
        k: Number of nearest neighbours that take part in the vote.

    Returns:
        The label that collects the most votes. On a tie, the label that is
        reached first when walking the neighbours from nearest to farthest
        wins.

    Raises:
        ValueError: If ``k`` is smaller than 1 or larger than the number of
            training samples.
    """
    dataSet = np.asarray(dataSet)
    dataSetSize = dataSet.shape[0]
    if not 1 <= k <= dataSetSize:
        raise ValueError(
            'k must be between 1 and the number of samples (%d), got %r'
            % (dataSetSize, k)
        )

    # Euclidean distance from the query to every sample; broadcasting
    # stretches the query across all rows, so no explicit np.tile is needed.
    diffMat = np.asarray(inX) - dataSet
    distances = (diffMat ** 2).sum(axis=1) ** 0.5
    sortedDistIndicies = np.argsort(distances)

    # Count the labels of the k closest samples.
    classCount = {}
    for i in range(k):
        voteIlabel = labels[sortedDistIndicies[i]]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1

    # max() returns the first label with the highest count in insertion
    # order, which is the same winner a stable descending sort would yield.
    return max(classCount, key=classCount.get)

#预测[0,0]所在数据分类
classify0([0,0],group,labels,3)

'B'

2.示例：使用k近邻算法改进约会网站的配对效果

处理数据集中的数据

def file2matrix(filename):
    """Load a tab-separated data file into a feature matrix and a label list.

    Every non-blank line must hold three numeric feature values followed by
    an integer class label in the last field.

    Args:
        filename: Path of the text file to read.

    Returns:
        tuple: ``(returnMat, classLabelVector)`` where ``returnMat`` is an
        ``(n_lines, 3)`` float array and ``classLabelVector`` is the list of
        integer labels, in file order.

    Raises:
        ValueError: If a line has fewer than three feature fields or a field
            cannot be converted to a number.
    """
    featureRows = []
    classLabelVector = []
    # The context manager guarantees the handle is closed even on a parse error.
    with open(filename) as fr:
        for lineNumber, line in enumerate(fr, start=1):
            line = line.strip()  # drop surrounding whitespace / newline
            if not line:
                # Blank lines (e.g. a trailing newline at EOF) carry no sample.
                continue
            listFromLine = line.split('\t')
            features = listFromLine[0:3]
            if len(features) != 3:
                raise ValueError(
                    'line %d: expected 3 feature fields, found %d'
                    % (lineNumber, len(features))
                )
            featureRows.append([float(value) for value in features])
            classLabelVector.append(int(listFromLine[-1]))
    # reshape keeps the (0, 3) shape for an empty file.
    returnMat = np.array(featureRows, dtype=float).reshape(-1, 3)
    return returnMat, classLabelVector

# Absolute location of the dating data file on the author's machine; change it to match your environment.
path='H:/机器学习课程资料/machinelearninginaction/Ch02/datingTestSet2.txt'
# dataMat holds the three feature columns, dataLabels the integer class of each row.
dataMat,dataLabels=file2matrix(path)

dataMat

array([[4.0920000e+04, 8.3269760e+00, 9.5395200e-01],
       [1.4488000e+04, 7.1534690e+00, 1.6739040e+00],
       [2.6052000e+04, 1.4418710e+00, 8.0512400e-01],
       ...,
       [2.6575000e+04, 1.0650102e+01, 8.6662700e-01],
       [4.8111000e+04, 9.1345280e+00, 7.2804500e-01],
       [4.3757000e+04, 7.8826010e+00, 1.3324460e+00]])

dataLabels[0:20]

[3, 2, 1, 1, 1, 1, 3, 3, 1, 3, 1, 1, 2, 1, 1, 1, 1, 1, 2, 3]

分析数据，绘制散点图

# Scatter feature column 1 against feature column 2 with no class information.
fig=plt.figure()
ax=fig.add_subplot(111)
ax.scatter(dataMat[:,1],dataMat[:,2])
ax.set_xlabel('玩视频游戏所耗时间百分比')
ax.set_ylabel('每周所消费的冰淇淋公升数')

Text(0,0.5,'每周所消费的冰淇淋公升数')

这里写图片描述

上图未带标签很难判断

# Same scatter as before, but 15.0 * label is passed as the third and fourth
# positional arguments of scatter (marker size and colour), so each class
# gets its own size and colour.
fig=plt.figure()
ax=fig.add_subplot(111)
ax.scatter(dataMat[:,1],dataMat[:,2],15.0*np.array(dataLabels),15.0*np.array(dataLabels))
ax.set_xlabel('玩视频游戏所耗时间百分比')
ax.set_ylabel('每周所消费的冰淇淋公升数')

Text(0,0.5,'每周所消费的冰淇淋公升数')

这里写图片描述

def showdatas(dataMat, dataLabels):
    """Draw pairwise scatter plots of the three dating features, coloured by class.

    Three cells of a 2x2 figure are used: column 0 vs 1, column 0 vs 2 and
    column 1 vs 2. The bottom-right cell is left empty. The figure is shown
    with ``plt.show()``.

    Args:
        dataMat: 2-D array whose columns 0, 1 and 2 are plotted (flight
            miles, game-time percentage and ice-cream litres according to
            the axis labels used below).
        dataLabels: Class label (1, 2 or 3) of each row of ``dataMat``.

    Raises:
        ValueError: If a label other than 1, 2 or 3 is encountered.
    """
    # nrows=2, ncols=2 splits the canvas into four axes; axs[0][0] is the top-left one.
    fig, axs = plt.subplots(nrows=2, ncols=2, sharex=False, sharey=False, figsize=(13, 8))

    # One colour per class; fail early instead of letting scatter() complain
    # about a colour list that is shorter than the data.
    colorOfLabel = {1: 'black', 2: 'orange', 3: 'red'}
    LabelsColors = []
    for label in dataLabels:
        if label not in colorOfLabel:
            raise ValueError('unexpected class label: %r' % (label,))
        LabelsColors.append(colorOfLabel[label])

    # Proxy artists for the legend (scatter colours are per point, so the
    # legend entries are built by hand).
    legendHandles = [
        mlines.Line2D([], [], color='black', marker='.', markersize=6, label='didntLike'),
        mlines.Line2D([], [], color='orange', marker='.', markersize=6, label='smallDoses'),
        mlines.Line2D([], [], color='red', marker='.', markersize=6, label='largeDoses'),
    ]

    # (axes, x column, y column, title, x label, y label) for each panel.
    panels = (
        (axs[0][0], 0, 1,
         '每年获得飞行常客里程数与玩视频游戏所消耗占比',
         '每年获得的飞行常客里程数',
         '玩视频游戏所消耗时间占比'),
        (axs[0][1], 0, 2,
         '每年获得飞行常客里程数与每周消费的冰激淋公升数',
         '每年获得的飞行常客里程数',
         '每周消费的冰激淋公升数'),
        (axs[1][0], 1, 2,
         '玩视频游戏所消耗时间占比与每周消费的冰激淋公升数',
         '玩视频游戏所消耗时间占比',
         '每周消费的冰激淋公升数'),
    )
    for ax, xCol, yCol, title, xlabel, ylabel in panels:
        ax.scatter(x=dataMat[:, xCol], y=dataMat[:, yCol], color=LabelsColors, s=15, alpha=.5)
        plt.setp(ax.set_title(title), size=9, weight='bold', color='red')
        plt.setp(ax.set_xlabel(xlabel), size=7, weight='bold', color='black')
        plt.setp(ax.set_ylabel(ylabel), size=7, weight='bold', color='black')
        ax.legend(handles=legendHandles)

    plt.show()

showdatas(dataMat,dataLabels)

这里写图片描述

准备数据：数据归一化

newValue = (oldValue - min) / (max - min)，其中 min 和 max 分别是该特征（列）在数据集中的最小值和最大值。

def autoNorm(dataSet):
    """Min-max scale every column of ``dataSet`` into the range [0, 1].

    Each value is mapped to ``(value - column_min) / column_range``.

    Args:
        dataSet: 2-D numeric array (or nested sequence), one sample per row.

    Returns:
        tuple: ``(normDataSet, ranges, minVals)``.
        ``normDataSet`` is the scaled copy of the input, ``minVals`` the
        per-column minimum and ``ranges`` the per-column divisor that was
        applied, so ``(x - minVals) / ranges`` scales a new sample the same
        way. For a constant column (max == min) the divisor is 1 instead of
        0, which maps the whole column to 0 rather than NaN.
    """
    dataSet = np.asarray(dataSet, dtype=float)
    minVals = dataSet.min(0)  # axis 0 -> one minimum per column
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    # Avoid 0/0 for columns where every sample has the same value.
    ranges = np.where(ranges == 0, 1.0, ranges)
    # Broadcasting applies the per-column vectors to every row.
    normDataSet = (dataSet - minVals) / ranges
    return normDataSet, ranges, minVals

normMat,ranges,minVals=autoNorm(dataMat)

normMat

array([[0.44832535, 0.39805139, 0.56233353],
       [0.15873259, 0.34195467, 0.98724416],
       [0.28542943, 0.06892523, 0.47449629],
       ...,
       [0.29115949, 0.50910294, 0.51079493],
       [0.52711097, 0.43665451, 0.4290048 ],
       [0.47940793, 0.3768091 , 0.78571804]])

ranges

array([9.1273000e+04, 2.0919349e+01, 1.6943610e+00])

minVals

array([0.      , 0.      , 0.001156])

测试算法：验证分类器

def datingClassTest(hoRatio=0.10, k=7, filename=None):
    """Estimate the kNN error rate on the dating data with a hold-out split.

    The data are loaded with ``file2matrix`` and scaled with ``autoNorm``.
    The first ``int(m * hoRatio)`` rows are classified with ``classify0``
    against the remaining rows. Every prediction and the final error rate
    are printed.

    Args:
        hoRatio: Fraction of the rows used as test samples.
        k: Number of neighbours passed to ``classify0``.
        filename: Data file to load. When ``None`` the module-level ``path``
            is used; note that later code in this file rebinds ``path`` to a
            digits directory, so pass the file explicitly after that point.

    Raises:
        ValueError: If the hold-out set would be empty.
    """
    if filename is None:
        filename = path
    dataMat, dataLabels = file2matrix(filename)
    normMat, _, _ = autoNorm(dataMat)
    m = normMat.shape[0]  # number of samples
    numTestVecs = int(m * hoRatio)  # size of the hold-out set
    if numTestVecs < 1:
        raise ValueError(
            'hold-out set is empty: %d samples with hoRatio=%r' % (m, hoRatio)
        )

    # Rows [0, numTestVecs) are the test set, rows [numTestVecs, m) the training set.
    trainMat = normMat[numTestVecs:m, :]
    trainLabels = dataLabels[numTestVecs:m]
    errorCount = 0.0
    for i in range(numTestVecs):
        classifierResult = classify0(normMat[i, :], trainMat, trainLabels, k)
        print('分类结果：%d\t真实类别：%d' % (classifierResult, dataLabels[i]))
        if classifierResult != dataLabels[i]:
            errorCount += 1.0
    print('错误率：%f%%' % (errorCount / float(numTestVecs) * 100))

datingClassTest()

分类结果：3  真实类别：3
分类结果：2  真实类别：2
分类结果：1  真实类别：1
分类结果：1  真实类别：1
分类结果：1  真实类别：1
分类结果：1  真实类别：1

……

分类结果：2  真实类别：2
分类结果：1  真实类别：1
分类结果：3  真实类别：3
分类结果：3  真实类别：3
分类结果：2  真实类别：2
分类结果：2  真实类别：1
分类结果：1  真实类别：1
错误率：4.000000%

使用算法：构建完整可用系统

def classifyPerson():
    """Ask for one person's three features and print the predicted class.

    The three values are read with ``input()``. The dating data are loaded
    from the module-level ``path`` and scaled with ``autoNorm``. The query
    is scaled with the same minimum and range, classified with ``classify0``
    (k=7), and the matching entry of ``resultList`` is printed.
    """
    resultList = ['讨厌', '有些喜欢', '非常喜欢']
    # The three features typed in by the user.
    percentTats = float(input('玩视频游戏所耗时间百分比：'))
    ffMiles = float(input('每年获得的飞行常客里程数：'))
    iceCream = float(input('每周消费的冰淇淋公升数：'))
    # Data file to train on.
    filename = path
    dataMat, dataLabels = file2matrix(filename)
    normMat, ranges, minVals = autoNorm(dataMat)
    # The query must follow the column order of the data file
    # (flight miles, game-time percentage, ice-cream litres); otherwise each
    # value is scaled with, and compared against, the wrong feature.
    inArr = np.array([ffMiles, percentTats, iceCream])
    # Scale the query exactly like the training data.
    norminArr = (inArr - minVals) / ranges
    classifierResult = classify0(norminArr, normMat, dataLabels, 7)
    # Labels are 1-based, resultList is 0-based.
    print('你可能%s这个人' % (resultList[classifierResult - 1]))

classifyPerson()

玩视频游戏所耗时间百分比：10
每年获得的飞行常客里程数：10000
每周消费的冰淇淋公升数：0.5
你可能讨厌这个人

3.示例：手写识别系统

'''
将图像格式化处理为一个向量，即把一个32*32的二进制图像矩阵转换为1*1024的向量
'''
def img2vector(filename):
    """Read a 32x32 text image of digits into a 1x1024 row vector.

    The first 32 characters of each of the first 32 lines are converted with
    ``int`` and stored row after row.

    Args:
        filename: Path of the text file holding the image.

    Returns:
        numpy.ndarray: Float array of shape ``(1, 1024)``.

    Raises:
        ValueError: If a line has fewer than 32 characters or contains a
            character that is not a digit.
    """
    returnVect = np.zeros((1, 1024))
    # The context manager closes the file even if parsing fails.
    with open(filename) as fr:
        for i in range(32):
            lineStr = fr.readline()
            if len(lineStr) < 32:
                raise ValueError(
                    '%s: line %d has %d characters, expected at least 32'
                    % (filename, i + 1, len(lineStr))
                )
            # Row i of the image occupies positions [32*i, 32*i + 32).
            returnVect[0, 32 * i:32 * (i + 1)] = [int(ch) for ch in lineStr[:32]]
    return returnVect

# Directories holding the training and test digit files (author's machine; adjust as needed).
# NOTE: this rebinds `path`, which datingClassTest() and classifyPerson() also read.
path='H:/机器学习课程资料/machinelearninginaction/Ch02/trainingDigits'
path2='H:/机器学习课程资料/machinelearninginaction/Ch02/testDigits'
from os import listdir
def handwritingClassTest():
    """Evaluate ``classify0`` (k=3) on the handwritten-digit text files.

    Every file in ``path`` becomes one training row and every file in
    ``path2`` is classified against those rows. The true digit is taken from
    the part of the file name before the underscore. Each prediction, the
    number of errors and the error rate are printed.
    """
    trainNames = listdir(path)
    trainCount = len(trainNames)
    hwLabels = []
    trainingMat = np.zeros((trainCount, 1024))
    for row, name in enumerate(trainNames):
        # File names look like "0_1.txt": strip the extension, keep the digit.
        stem = name.split('.')[0]
        hwLabels.append(int(stem.split('_')[0]))
        trainingMat[row, :] = img2vector(path + '/%s' % name)

    testNames = listdir(path2)
    mTest = len(testNames)
    errorCount = 0.0
    for name in testNames:
        stem = name.split('.')[0]
        expected = int(stem.split('_')[0])
        sample = img2vector(path2 + '/%s' % name)
        predicted = classify0(sample, trainingMat, hwLabels, 3)
        print('the classifier came back with:%d,the real answer is: %d' % (predicted, expected))
        if predicted != expected:
            errorCount += 1.0

    print('\nthe total number of errors is:%d' % errorCount)
    print('\nthe total error rate is:%f' % (errorCount / float(mTest)))

handwritingClassTest()

the classifier came back with:0,the real answer is: 0
the classifier came back with:0,the real answer is: 0
the classifier came back with:0,the real answer is: 0
the classifier came back with:0,the real answer is: 0

……

the classifier came back with:9,the real answer is: 9
the classifier came back with:9,the real answer is: 9
the classifier came back with:9,the real answer is: 9
the classifier came back with:9,the real answer is: 9
the classifier came back with:9,the real answer is: 9
the classifier came back with:9,the real answer is: 9
the classifier came back with:9,the real answer is: 9
the classifier came back with:9,the real answer is: 9

the total number of errors is:10

the total error rate is:0.010571

错误率是1.1%，但这种方法执行效率并不高，可以使用其他方法来对比

sklearn中的k近邻

官方文档

KNeighborsClassifier参数说明：

n_neighbors：默认为5，就是k-NN的k的值，选取最近的k个点。
weights：默认是uniform，参数可以是uniform、distance，也可以是用户自己定义的函数。uniform是均等的权重，就是说所有的邻近点的权重都是相等的。distance是不均等的权重，距离近的点比距离远的点的影响大。用户自定义的函数，接收距离的数组，返回一组维数相同的权重。
algorithm：快速k近邻搜索算法，默认参数为auto，可以理解为算法自己决定合适的搜索算法。除此之外，用户也可以自己指定搜索算法ball_tree、kd_tree、brute方法进行搜索，brute是蛮力搜索，也就是线性扫描，当训练集很大时，计算非常耗时。kd_tree，构造kd树存储数据以便对其进行快速检索的树形数据结构，kd树也就是数据结构中的二叉树。以中值切分构造的树，每个结点是一个超矩形，在维数小于20时效率高。ball tree是为了克服kd树高维失效而发明的，其构造过程是以质心C和半径r分割样本空间，每个节点是一个超球体。
leaf_size：默认是30，这个是构造的kd树和ball树的大小。这个值的设置会影响树构建的速度和搜索速度，同样也影响着存储树所需的内存大小。需要根据问题的性质选择最优的大小。
metric：用于距离度量，默认度量是minkowski，也就是p=2的欧氏距离(欧几里德度量)。
p：距离度量公式。在上一小节，我们使用欧氏距离公式进行距离度量。除此之外，还有其他的度量方法，例如曼哈顿距离。这个参数默认为2，也就是默认使用欧式距离公式进行距离度量。也可以设置为1，使用曼哈顿距离公式进行距离度量。
metric_params：距离公式的其他关键参数，这个可以不管，使用默认的None即可。
n_jobs：并行处理设置。默认为1，临近点搜索并行工作数。如果为-1，那么CPU的所有cores都用于并行工作。

from sklearn.neighbors import KNeighborsClassifier as kNN

path='H:/机器学习课程资料/machinelearninginaction/Ch02/trainingDigits'
path2='H:/机器学习课程资料/machinelearninginaction/Ch02/testDigits'
from os import listdir
def handwritingClassTest2():
    """Evaluate scikit-learn's k-nearest-neighbours classifier on the digit files.

    Every file in ``path`` becomes one training row and every file in
    ``path2`` is classified with a ``kNN`` model (``n_neighbors=3``,
    ``algorithm='auto'``) fitted on those rows. The true digit is taken from
    the part of the file name before the underscore. Each prediction, the
    number of errors and the error rate are printed.
    """
    hwLabels = []
    trainingFileList = listdir(path)
    m = len(trainingFileList)
    trainingMat = np.zeros((m, 1024))
    for i in range(m):
        fileNameStr = trainingFileList[i]  # e.g. "0_1.txt"
        fileStr = fileNameStr.split('.')[0]  # "0_1"
        classNumStr = int(fileStr.split('_')[0])  # 0
        hwLabels.append(classNumStr)
        trainingMat[i, :] = img2vector(path + '/%s' % fileNameStr)
    # Build and fit the classifier.
    neigh = kNN(n_neighbors=3, algorithm='auto')
    neigh.fit(trainingMat, hwLabels)
    testFileList = listdir(path2)
    errorCount = 0.0
    mTest = len(testFileList)
    for i in range(mTest):
        fileNameStr = testFileList[i]
        fileStr = fileNameStr.split('.')[0]
        classNumStr = int(fileStr.split('_')[0])
        vectorUnderTest = img2vector(path2 + '/%s' % fileNameStr)
        # predict() returns one label per query row; take the single element
        # so the %d formatting and the comparison below work on a scalar
        # rather than on a length-1 array.
        classifierResult = neigh.predict(vectorUnderTest)[0]
        print('the classifier came back with:%d,the real answer is: %d' % (classifierResult, classNumStr))
        if classifierResult != classNumStr:
            errorCount += 1.0
    print('\nthe total number of errors is:%d' % errorCount)
    print('\nthe total error rate is:%f' % (errorCount / float(mTest)))

handwritingClassTest2()

the classifier came back with:0,the real answer is: 0
the classifier came back with:0,the real answer is: 0
the classifier came back with:0,the real answer is: 0
the classifier came back with:0,the real answer is: 0
the classifier came back with:0,the real answer is: 0

……

the classifier came back with:9,the real answer is: 9
the classifier came back with:9,the real answer is: 9
the classifier came back with:9,the real answer is: 9

 the total number of errors is:12

 the total error rate is:0.012685