最近开始学习机器学习实战,第一个就是KNN,由于K-近邻算法比较简单,这里不再介绍理论知识,直接看代码实现:
KNN的简单实现
需要用到的一些语法:
tile()
sum(axis=1)
argsort,sort 和 sorted,operator.itemgetter函数
get(),items(),iteritems()方法
# coding=utf-8
from numpy import *
import operator # 运算符模块,执行排序操作时将用到
import matplotlib.pyplot as plt
# Build the tiny demo training set and its matching class labels.
def createDataset():
    """Return (group, labels): four 2-D sample points and their classes."""
    # Note the double brackets: this is a 2-D array, one row per sample.
    points = [[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]]
    group = array(points)
    labels = ['A', 'A', 'B', 'B']
    return (group, labels)
# Simple k-nearest-neighbour classifier.
def classify0(inX, dataSet, labels, k):
    """Classify the input vector inX against dataSet using k-NN.

    inX     : input feature vector (list or 1-D array).
    dataSet : 2-D array of training samples, one row per sample.
    labels  : class labels, one per training row.
    k       : number of nearest neighbours that vote.
    Returns the majority label among the k nearest neighbours.
    """
    # shape[0] is the number of rows (training samples); shape[1] would be columns.
    dataSetSize = dataSet.shape[0]
    # tile() repeats inX to dataSet's shape so we can subtract element-wise.
    diffMat = tile(inX, (dataSetSize, 1)) - dataSet
    # Square each per-feature difference.
    sqDiffMat = diffMat ** 2
    # axis=1 sums across each row -> squared Euclidean distance per sample.
    sqDistances = sqDiffMat.sum(axis=1)
    # Square root gives the Euclidean distance to every training vector.
    distances = sqDistances ** 0.5
    # Indices sorted by ascending distance (nearest first).
    sortedDistIndicies = distances.argsort()
    classCount = {}
    for i in range(k):
        # Label of the i-th nearest neighbour.
        voteIlabel = labels[sortedDistIndicies[i]]
        # Tally the vote; get() defaults a missing label to 0.
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1
    print('classCount:', classCount)
    # Sort (label, votes) pairs by vote count, descending.
    # BUG FIX: dict.iteritems() exists only in Python 2; items() works in
    # both Python 2 and Python 3.
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    print(sortedClassCount)
    # Return the label with the most votes.
    return sortedClassCount[0][0]
# NOTE: the file read is datingTestSet2.txt, not datingTestSet.txt.
# BUG FIX: use a raw string — in Python 3, '\U' inside a plain string
# literal starts a unicode escape and is a SyntaxError.
file_raw = r'C:\Users\LiLong\Desktop\datingTestSet2.txt'
if __name__ == "__main__":
    # Load the demo training data and show it.
    group, labels = createDataset()
    print('training data set:', group)
    print('labels of training data set:', labels)
    # Run the simple classifier on point (0, 0) with k = 3.
    tt = classify0([0, 0], group, labels, 3)
    print('Classification results:', tt)
运行结果:
('training data set:', array([[ 1. , 1.1],
[ 1. , 1. ],
[ 0. , 0. ],
[ 0. , 0.1]]))
('labels of training data set:', ['A', 'A', 'B', 'B'])
('classCount:', {'A': 1, 'B': 2})
[('B', 2), ('A', 1)]
('Classification results:', 'B')
至此一个最简单的KNN分类就实现了
KNN算法改进约会网站的配对效果
数据的处理
会用到的语法:
matplotlib
min(iterable, *[, key, default])
# coding=utf-8
from numpy import *
import operator # 运算符模块,执行排序操作时将用到
import matplotlib.pyplot as plt
# Parse the dating-data text file into a feature matrix and a label vector.
def file2matrix(filename):
    """Read training data from a tab-separated file.

    Each line holds three numeric features followed by an integer label.
    Returns (returnMat, classLabelVector): an N x 3 float matrix and a
    list of N integer class labels.
    """
    # BUG FIX: the original called fr.close() AFTER the return statement,
    # so it never ran and the file handle leaked; `with` closes it reliably.
    with open(filename, 'r') as fr:
        arrayOfLines = fr.readlines()
    numberOfLines = len(arrayOfLines)  # number of samples
    # N x 3 matrix of zeros, filled in row by row below.
    returnMat = zeros((numberOfLines, 3))
    print('row:%s and column:%s' % (returnMat.shape[0], returnMat.shape[1]))
    classLabelVector = []  # one integer label per sample
    for index, line in enumerate(arrayOfLines):
        # strip() removes surrounding whitespace, including the trailing '\n'.
        line = line.strip()
        # Split on tabs: fields 0-2 are features, the last field is the label.
        listFromLine = line.split('\t')
        returnMat[index, :] = listFromLine[0:3]
        classLabelVector.append(int(listFromLine[-1]))
    return (returnMat, classLabelVector)
# Scale every feature column into the range [0, 1].
def autoNorm(dataSet):
    """Min-max normalise dataSet column-wise.

    Returns (normDataSet, ranges, minVals):
    normDataSet - dataSet rescaled so each column spans [0, 1];
    ranges      - per-column (max - min), kept to normalise new samples;
    minVals     - per-column minimum, likewise kept for later use.
    """
    minVals = dataSet.min(0)  # axis 0 -> minimum of each column
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    m = dataSet.shape[0]  # number of rows
    # tile() repeats the per-column min/range to the full matrix shape so
    # (x - min) / range is applied element-wise.
    # (The original pre-allocated a zeros() array here that was immediately
    # overwritten — dead code, removed.)
    normDataSet = dataSet - tile(minVals, (m, 1))
    normDataSet = normDataSet / tile(ranges, (m, 1))
    return normDataSet, ranges, minVals
# Hold out 10% of the dating data and measure the k-NN error rate.
def datingClassTest():
    """Evaluate classify0 on the dating data set; print the error rate."""
    hoRatio = 0.10  # fraction of samples held out for testing
    # BUG FIX: raw string — '\U' in a plain Python 3 string literal is a
    # SyntaxError (treated as the start of a unicode escape).
    datingDataMat, datingLabels = file2matrix(r'C:\Users\LiLong\Desktop\datingTestSet2.txt')
    normMat, ranges, minVals = autoNorm(datingDataMat)
    m = normMat.shape[0]
    # Number of test vectors: the first 10% of rows.
    numTestVecs = int(m * hoRatio)
    print('the test number:', numTestVecs)
    errorCount = 0.0
    for i in range(numTestVecs):
        # Rows [0, numTestVecs) are the test set; rows [numTestVecs, m)
        # and the matching labels form the training set.
        classifierResult = classify0(normMat[i, :], normMat[numTestVecs:m, :],
                                     datingLabels[numTestVecs:m], 3)
        print("the classifier came back with: %d, the real answer is: %d"
              % (classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1.0
    # BUG FIX: the last two lines were Python 2 print statements; converted
    # to print() calls.
    print("the total error rate is: %f" % (errorCount / float(numTestVecs)))
    print(errorCount)
# NOTE: the file read is datingTestSet2.txt, not datingTestSet.txt.
# BUG FIX: use a raw string — in Python 3, '\U' inside a plain string
# literal starts a unicode escape and is a SyntaxError.
file_raw = r'C:\Users\LiLong\Desktop\datingTestSet2.txt'
if __name__ == "__main__":
    # Parse the raw file into a feature matrix and a label vector.
    datingDataMat, datingLables = file2matrix(file_raw)
    # BUG FIX: Python 2 print statements converted to print() calls.
    print(datingDataMat)
    print(datingLables)
    # Scatter plot of feature columns 1 and 2.
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(datingDataMat[:, 1], datingDataMat[:, 2])
    # BUG FIX: the original line was garbled — the '\' continuation and the
    # keyword arguments ended up on one line ('...\ c=...'), which is a
    # SyntaxError. c gives per-point colour values, s per-point marker size,
    # both scaled by the class label so classes are visually separated.
    ax.scatter(datingDataMat[:, 1], datingDataMat[:, 2],
               c=15.0 * array(datingLables), s=15.0 * array(datingLables))
    plt.show()
    # Normalise all features into [0, 1].
    normMat, ranges, minVals = autoNorm(datingDataMat)
    print(normMat)
其中file2matrix得到的是数组矩阵,也即是可以处理的数据格式,如下:
[[ 4.09200000e+04 8.32697600e+00 9.53952000e-01]
[ 1.44880000e+04 7.15346900e+00 1.67390400e+00]
[ 2.60520000e+04 1.44187100e+00 8.05124000e-01]
...,
[ 2.65750000e+04 1.06501020e+01 8.66627000e-01]
[ 4.81110000e+04 9.13452800e+00 7.28045000e-01]
[ 4.37570000e+04 7.88260100e+00 1.33244600e+00]]
[3, 2, 1, 1, 1, 1, 3, 3, 1, 3, 1, 1, 2, 1, 1, 1, 1, 1, 2, 3, 2, 1, 2, 3, 2, 3, 2, 3, 2, 1, 3, 1, 3, 1, 2, 1, 1, 2, 3, 3, 1, 2, 3, 3, 3, 1, 1, 1, 1, 2, 2, 1, 3, 2, 2, 2, 2, 3, 1, 2, 1, 2, 2, 2, 2, 2, 3, 2, 3, 1, 2, 3, 2, 2, 1, 3, 1, 1, 3, 3, 1, 2, 3, 1, 3, 1, 2, 2, 1, 1, 3, 3, 1, 2, 1, 3, 3, 2, 1, 1, 3, 1, 2, 3, 3, 2, 3, 3, 1, 2, 3, 2, 1, 3, 1, 2, 1, 1, 2, 3, 2, 3, 2, 3, 2, 1, 3, 3, 3, 1, 3, 2, 2, 3, 1, 3, 3, 3, 1, 3, 1, 1, 3, 3, 2, 3, 3, 1, 2, 3, 2, 2, 3, 3, 3, 1, 2, 2, 1, 1, 3, 2, 3, 3, 1, 2, 1, 3, 1, 2, 3, 2, 3, 1, 1, 1, 3, 2, 3, 1, 3, 2, 1, 3, 2, 2, 3, 2, 3, 2, 1, 1, 3, 1, 3, 2, 2, 2, 3, 2, 2, 1, 2, 2, 3, 1, 3, 3, 2, 1, 1, 1, 2, 1, 3, 3, 3, 3, 2, 1, 1, 1, 2, 3, 2, 1, 3, 1, 3, 2, 2, 3, 1, 3, 1, 1, 2, 1, 2, 2, 1, 3, 1, 3, 2, 3, 1, 2, 3, 1, 1, 1, 1, 2, 3, 2, 2, 3, 1, 2, 1, 1, 1, 3, 3, 2, 1, 1, 1, 2, 2, 3, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 2, 2, 3, 2, 3, 3, 3, 3, 1, 2, 3, 1, 1, 1, 3, 1, 3, 2, 2, 1, 3, 1, 3, 2, 2, 1, 2, 2, 3, 1, 3, 2, 1, 1, 3, 3, 2, 3, 3, 2, 3, 1, 3, 1, 3, 3, 1, 3, 2, 1, 3, 1, 3, 2, 1, 2, 2, 1, 3, 1, 1, 3, 3, 2, 2, 3, 1, 2, 3, 3, 2, 2, 1, 1, 1, 1, 3, 2, 1, 1, 3, 2, 1, 1, 3, 3, 3, 2, 3, 2, 1, 1, 1, 1, 1, 3, 2, 2, 1, 2, 1, 3, 2, 1, 3, 2, 1, 3, 1, 1, 3, 3, 3, 3, 2, 1, 1, 2, 1, 3, 3, 2, 1, 2, 3, 2, 1, 2, 2, 2, 1, 1, 3, 1, 1, 2, 3, 1, 1, 2, 3, 1, 3, 1, 1, 2, 2, 1, 2, 2, 2, 3, 1, 1, 1, 3, 1, 3, 1, 3, 3, 1, 1, 1, 3, 2, 3, 3, 2, 2, 1, 1, 1, 2, 1, 2, 2, 3, 3, 3, 1, 1, 3, 3, 2, 3, 3, 2, 3, 3, 3, 2, 3, 3, 1, 2, 3, 2, 1, 1, 1, 1, 3, 3, 3, 3, 2, 1, 1, 1, 1, 3, 1, 1, 2, 1, 1, 2, 3, 2, 1, 2, 2, 2, 3, 2, 1, 3, 2, 3, 2, 3, 2, 1, 1, 2, 3, 1, 3, 3, 3, 1, 2, 1, 2, 2, 1, 2, 2, 2, 2, 2, 3, 2, 1, 3, 3, 2, 2, 2, 3, 1, 2, 1, 1, 3, 2, 3, 2, 3, 2, 3, 3, 2, 2, 1, 3, 1, 2, 1, 3, 1, 1, 1, 3, 1, 1, 3, 3, 2, 2, 1, 3, 1, 1, 3, 2, 3, 1, 1, 3, 1, 3, 3, 1, 2, 3, 1, 3, 1, 1, 2, 1, 3, 1, 1, 1, 1, 2, 1, 3, 1, 2, 1, 3, 1, 3, 1, 1, 2, 2, 2, 3, 2, 2, 1, 2, 3, 3, 2, 3, 3, 3, 2, 3, 3, 1, 3, 2, 3, 2, 1, 2, 1, 1, 1, 2, 3, 2, 2, 1, 2, 2, 1, 3, 1, 3, 3, 3, 2, 2, 3, 3, 1, 2, 2, 2, 3, 1, 2, 1, 3, 1, 2, 
3, 1, 1, 1, 2, 2, 3, 1, 3, 1, 1, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 2, 2, 2, 3, 1, 3, 1, 2, 3, 2, 2, 3, 1, 2, 3, 2, 3, 1, 2, 2, 3, 1, 1, 1, 2, 2, 1, 1, 2, 1, 2, 1, 2, 3, 2, 1, 3, 3, 3, 1, 1, 3, 1, 2, 3, 3, 2, 2, 2, 1, 2, 3, 2, 2, 3, 2, 2, 2, 3, 3, 2, 1, 3, 2, 1, 3, 3, 1, 2, 3, 2, 1, 3, 3, 3, 1, 2, 2, 2, 3, 2, 3, 3, 1, 2, 1, 1, 2, 1, 3, 1, 2, 2, 1, 3, 2, 1, 3, 3, 2, 2, 2, 1, 2, 2, 1, 3, 1, 3, 1, 3, 3, 1, 1, 2, 3, 2, 2, 3, 1, 1, 1, 1, 3, 2, 2, 1, 3, 1, 2, 3, 1, 3, 1, 3, 1, 1, 3, 2, 3, 1, 1, 3, 3, 3, 3, 1, 3, 2, 2, 1, 1, 3, 3, 2, 2, 2, 1, 2, 1, 2, 1, 3, 2, 1, 2, 2, 3, 1, 2, 2, 2, 3, 2, 1, 2, 1, 2, 3, 3, 2, 3, 1, 1, 3, 3, 1, 2, 2, 2, 2, 2, 2, 1, 3, 3, 3, 3, 3, 1, 1, 3, 2, 1, 2, 1, 2, 2, 3, 2, 2, 2, 3, 1, 2, 1, 2, 2, 1, 1, 2, 3, 3, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 1, 3, 3, 2, 3, 2, 3, 3, 2, 2, 1, 1, 1, 3, 3, 1, 1, 1, 3, 3, 2, 1, 2, 1, 1, 2, 2, 1, 1, 1, 3, 1, 1, 2, 3, 2, 2, 1, 3, 1, 2, 3, 1, 2, 2, 2, 2, 3, 2, 3, 3, 1, 2, 1, 2, 3, 1, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 1, 3, 3, 3]
下图是数据的散点图:
归一化后的数据:
[[ 0.44832535 0.39805139 0.56233353]
[ 0.15873259 0.34195467 0.98724416]
[ 0.28542943 0.06892523 0.47449629]
...,
[ 0.29115949 0.50910294 0.51079493]
[ 0.52711097 0.43665451 0.4290048 ]
[ 0.47940793 0.3768091 0.78571804]]
测试算法
# coding=utf-8
from numpy import *
import operator # 运算符模块,执行排序操作时将用到
import matplotlib.pyplot as plt
# Build the small demo training set together with its class labels.
def createDataset():
    """Return (group, labels): a 4 x 2 sample array and its classes."""
    # Two pairs of nearby points: class 'A' near (1, 1), class 'B' near (0, 0).
    group = array([
        [1.0, 1.1],
        [1.0, 1.0],
        [0, 0],
        [0, 0.1],
    ])
    return (group, ['A', 'A', 'B', 'B'])
# Simple k-nearest-neighbour classifier.
def classify0(inX, dataSet, labels, k):
    """Classify the input vector inX against dataSet using k-NN.

    inX     : input feature vector (list or 1-D array).
    dataSet : 2-D array of training samples, one row per sample.
    labels  : class labels, one per training row.
    k       : number of nearest neighbours that vote.
    Returns the majority label among the k nearest neighbours.
    """
    # shape[0] is the number of rows (training samples).
    dataSetSize = dataSet.shape[0]
    # tile() repeats inX to dataSet's shape so we can subtract element-wise.
    diffMat = tile(inX, (dataSetSize, 1)) - dataSet
    # Square each per-feature difference.
    sqDiffMat = diffMat ** 2
    # axis=1 sums across each row -> squared Euclidean distance per sample.
    sqDistances = sqDiffMat.sum(axis=1)
    # Square root gives the Euclidean distance to every training vector.
    distances = sqDistances ** 0.5
    # Indices sorted by ascending distance (nearest first).
    sortedDistIndicies = distances.argsort()
    classCount = {}  # votes per label
    for i in range(k):
        # Label of the i-th nearest neighbour.
        voteIlabel = labels[sortedDistIndicies[i]]
        # Tally the vote; get() defaults a missing label to 0.
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1
    # Sort (label, votes) pairs by vote count, descending.
    # BUG FIX: dict.iteritems() exists only in Python 2; items() works in
    # both Python 2 and Python 3.
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    # Return the label with the most votes.
    return sortedClassCount[0][0]
# Parse the dating-data text file into a feature matrix and a label vector.
def file2matrix(filename):
    """Read training data from a tab-separated file.

    Each line holds three numeric features followed by an integer label.
    Returns (returnMat, classLabelVector): an N x 3 float matrix and a
    list of N integer class labels.
    """
    # BUG FIX: the original called fr.close() AFTER the return statement,
    # so it never ran and the file handle leaked; `with` closes it reliably.
    with open(filename, 'r') as fr:
        arrayOfLines = fr.readlines()
    numberOfLines = len(arrayOfLines)  # number of samples
    # N x 3 matrix of zeros, filled in row by row below.
    returnMat = zeros((numberOfLines, 3))
    print('row:%s and column:%s' % (returnMat.shape[0], returnMat.shape[1]))
    classLabelVector = []  # one integer label per sample
    for index, line in enumerate(arrayOfLines):
        # strip() removes surrounding whitespace, including the trailing '\n'.
        line = line.strip()
        # Split on tabs: fields 0-2 are features, the last field is the label.
        listFromLine = line.split('\t')
        returnMat[index, :] = listFromLine[0:3]
        classLabelVector.append(int(listFromLine[-1]))
    return (returnMat, classLabelVector)
# Scale every feature column into the range [0, 1].
def autoNorm(dataSet):
    """Min-max normalise dataSet column-wise.

    Returns (normDataSet, ranges, minVals):
    normDataSet - dataSet rescaled so each column spans [0, 1];
    ranges      - per-column (max - min), kept to normalise new samples;
    minVals     - per-column minimum, likewise kept for later use.
    """
    minVals = dataSet.min(0)  # axis 0 -> minimum of each column
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    m = dataSet.shape[0]  # number of rows
    # tile() repeats the per-column min/range to the full matrix shape so
    # (x - min) / range is applied element-wise.
    # (The original pre-allocated a zeros() array here that was immediately
    # overwritten — dead code, removed.)
    normDataSet = dataSet - tile(minVals, (m, 1))
    normDataSet = normDataSet / tile(ranges, (m, 1))
    return normDataSet, ranges, minVals
# Hold out 10% of the dating data and measure the k-NN error rate.
def datingClassTest():
    """Evaluate classify0 on the dating data set; print the error rate."""
    hoRatio = 0.10  # hold out 10% of the samples for testing
    # BUG FIX: raw string — '\U' in a plain Python 3 string literal is a
    # SyntaxError (treated as the start of a unicode escape).
    datingDataMat, datingLabels = file2matrix(r'C:\Users\LiLong\Desktop\datingTestSet2.txt')
    normMat, ranges, minVals = autoNorm(datingDataMat)
    m = normMat.shape[0]
    # Number of test vectors: the first 10% of rows.
    numTestVecs = int(m * hoRatio)
    print('the test number:', numTestVecs)
    errorCount = 0.0
    for i in range(numTestVecs):
        # Rows [0, numTestVecs) are the test set; rows [numTestVecs, m)
        # and the matching labels form the training set.
        classifierResult = classify0(normMat[i, :], normMat[numTestVecs:m, :],
                                     datingLabels[numTestVecs:m], 3)
        print("the classifier came back with: %d, the real answer is: %d"
              % (classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1.0
    # BUG FIX: the last two lines were Python 2 print statements; converted
    # to print() calls.
    print("the total error rate is: %f" % (errorCount / float(numTestVecs)))
    print(errorCount)
# NOTE: the file read is datingTestSet2.txt, not datingTestSet.txt.
#file_raw='C:\Users\LiLong\Desktop\datingTestSet2.txt'
if __name__ == "__main__":
    # Run the hold-out evaluation end to end.
    datingClassTest()
结果:
row:1000 and column:3
('the test number:', 100)
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
...,
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 3, the real answer is: 1
the total error rate is: 0.050000
5.0
结果显示错误率为 5%（100 个测试样本中有 5.0 个分类错误，最后一行打印的 5.0 即为错误个数 errorCount）