KNN简单实现

最近开始学习机器学习实战,第一个就是KNN,由于K-近邻算法比较简单,这里不再介绍理论知识,直接看代码实现:

KNN的简单实现

需要用到的一些语法:
tile()
sum(axis=1)
argsort,sort 和 sorted,operator.itemgetter函数
get(),items(),iteritems()方法

# coding=utf-8
from numpy import *
import operator # 运算符模块,执行排序操作时将用到
import matplotlib.pyplot as plt

# 建立训训练集和相应的标签
def createDataset():
    """Build the toy 4-sample, 2-feature training set and its class labels."""
    points = [[1.0, 1.1],
              [1.0, 1.0],
              [0.0, 0.0],
              [0.0, 0.1]]
    group = array(points)          # (4, 2) float matrix, one sample per row
    labels = ['A', 'A', 'B', 'B']  # class label for each row of `group`
    return (group, labels)

# 简单分类
def classify0(inX, dataSet, labels, k):
    """Classify inX by majority vote among its k nearest training samples.

    Args:
        inX: input vector to classify (same width as a dataSet row).
        dataSet: (m, n) training matrix, one sample per row.
        labels: sequence of m class labels, aligned with dataSet rows.
        k: number of nearest neighbors that vote.

    Returns:
        The label with the most votes among the k nearest samples.
    """
    # shape[0] is the number of training rows.
    dataSetSize = dataSet.shape[0]
    # tile() replicates inX to dataSet's shape so we can subtract row-wise.
    diffMat = tile(inX, (dataSetSize, 1)) - dataSet
    sqDiffMat = diffMat ** 2
    # axis=1 sums across each row: squared Euclidean distance per sample.
    sqDistances = sqDiffMat.sum(axis=1)
    distances = sqDistances ** 0.5
    # Indices of distances in ascending order (nearest sample first).
    sortedDistIndicies = distances.argsort()
    classCount = {}
    for i in range(k):
        # Tally one vote for the label of the i-th nearest sample.
        voteIlabel = labels[sortedDistIndicies[i]]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1
    print('classCount:', classCount)

    # BUG FIX: dict.iteritems() exists only on Python 2; items() works on
    # both.  Sort (label, votes) pairs by vote count, descending.
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    print(sortedClassCount)
    # Winning label is the first pair's key.
    return sortedClassCount[0][0]


# NOTE: the data file read is datingTestSet2.txt, not datingTestSet.txt.
# BUG FIX: raw string — in a plain literal '\U' is an invalid escape and a
# SyntaxError on Python 3; the raw form is byte-identical on Python 2.
file_raw = r'C:\Users\LiLong\Desktop\datingTestSet2.txt'
if __name__ == "__main__":
    # Build the toy data set.
    group, labels = createDataset()
    print('training data set:', group)
    print('labels of training data set:', labels)
    # Run the simple classifier on one sample point with k=3.
    tt = classify0([0, 0], group, labels, 3)
    print('Classification results:', tt)

运行结果:

('training data set:', array([[ 1. ,  1.1],
       [ 1. ,  1. ],
       [ 0. ,  0. ],
       [ 0. ,  0.1]]))
('labels of training data set:', ['A', 'A', 'B', 'B'])
('classCount:', {'A': 1, 'B': 2})
[('B', 2), ('A', 1)]
('Classification results:', 'B')

至此一个最简单的KNN分类就实现了

KNN算法改进约会网站的配对效果

数据的处理

会用到的语法:
matplotlib
min(iterable, *[, key, default])

# coding=utf-8
from numpy import *
import operator # 运算符模块,执行排序操作时将用到
import matplotlib.pyplot as plt

# 数据预处理   
def file2matrix(filename):
    """Parse a tab-separated dating-data file into features and labels.

    Each line holds three numeric features followed by an integer class
    label, separated by tabs.

    Args:
        filename: path to the data file.

    Returns:
        (returnMat, classLabelVector): an (n, 3) float matrix of features
        and a list of n int labels, in file order.
    """
    # BUG FIX: the original called fr.close() AFTER `return`, so the file
    # was never closed; `with` releases the handle deterministically.
    with open(filename, 'r') as fr:
        arrayOfLines = fr.readlines()
    numberOfLines = len(arrayOfLines)  # number of samples
    # Pre-size the feature matrix: one row per line, 3 feature columns.
    returnMat = zeros((numberOfLines, 3))
    print('row:%s and column:%s' % (returnMat.shape[0], returnMat.shape[1]))
    classLabelVector = []  # one int label per sample
    for index, line in enumerate(arrayOfLines):
        # strip() drops the trailing newline (and surrounding whitespace),
        # then the line is split on tabs into its fields.
        listFromLine = line.strip().split('\t')
        # First three fields are the features...
        returnMat[index, :] = listFromLine[0:3]
        # ...and the last field is the integer class label.
        classLabelVector.append(int(listFromLine[-1]))
    return (returnMat, classLabelVector)

# 归一化数据
def autoNorm(dataSet):
    # 每列的最小值minvals
    minVals=dataSet.min(0) # 0表示返回每列的最小值
    maxVals=dataSet.max(0)
    ranges=maxVals-minVals
    # 得到dataset相同行列数的0数组
    normDataSet=zeros(shape(dataSet))
    m = dataSet.shape[0] #数组的行数
    # tile复制形如[A,B,C](ABC分别代表每列的最小值)m行
    normDataSet = dataSet - tile(minVals, (m,1)) 
    # 归一化公式,注意是具体特征值相除
    normDataSet = normDataSet/tile(ranges, (m,1))   #element wise divide
    return normDataSet, ranges, minVals

# 分类测试
def datingClassTest():
    hoRatio = 0.10      
    datingDataMat,datingLabels = file2matrix('C:\Users\LiLong\Desktop\datingTestSet2.txt')
    normMat, ranges, minVals = autoNorm(datingDataMat)
    m = normMat.shape[0]
    # 测试数据的数量
    numTestVecs = int(m*hoRatio)
    print('the test number:',numTestVecs)
    errorCount = 0.0

    for i in range(numTestVecs):
        #normMat[i,:]表示输入的测试集是前100行的数据,normMat[numTestVecs:m,:]表示训练集
        #是100-1000的,datingLabels[numTestVecs:m]表示和训练集是对应的
        classifierResult = classify0(normMat[i,:],normMat[numTestVecs:m,:],\
                                     datingLabels[numTestVecs:m],3)
        print ("the classifier came back with: %d, the real answer is: %d"\
                % (classifierResult, datingLabels[i]))
        if (classifierResult != datingLabels[i]):  errorCount += 1.0
    print "the total error rate is: %f" % (errorCount/float(numTestVecs))
    print  errorCount

# NOTE: the data file read is datingTestSet2.txt, not datingTestSet.txt.
# BUG FIX: raw string — '\U' in a plain literal is a SyntaxError on Python 3.
file_raw = r'C:\Users\LiLong\Desktop\datingTestSet2.txt'
if __name__ == "__main__":
    # Load and format the raw data.
    datingDataMat, datingLables = file2matrix(file_raw)
    print(datingDataMat)
    print(datingLables)
    # Scatter plot of feature columns 1 and 2.
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(datingDataMat[:, 1], datingDataMat[:, 2])
    # BUG FIX: the original broke this call with a backslash followed by
    # more text on the same line — a SyntaxError.  c scales the marker
    # color and s the marker size by the class label.
    ax.scatter(datingDataMat[:, 1], datingDataMat[:, 2],
               c=15.0 * array(datingLables), s=15.0 * array(datingLables))
    plt.show()
    # Normalize features to [0, 1] per column.
    normMat, ranges, minVals = autoNorm(datingDataMat)
    print(normMat)

其中file2matrix得到的是数组矩阵,也即是可以处理的数据格式,如下:

[[  4.09200000e+04   8.32697600e+00   9.53952000e-01]
 [  1.44880000e+04   7.15346900e+00   1.67390400e+00]
 [  2.60520000e+04   1.44187100e+00   8.05124000e-01]
 ..., 
 [  2.65750000e+04   1.06501020e+01   8.66627000e-01]
 [  4.81110000e+04   9.13452800e+00   7.28045000e-01]
 [  4.37570000e+04   7.88260100e+00   1.33244600e+00]]

[3, 2, 1, 1, 1, 1, 3, 3, 1, 3, 1, 1, 2, 1, 1, 1, 1, 1, 2, 3, 2, 1, 2, 3, 2, 3, 2, 3, 2, 1, 3, 1, 3, 1, 2, 1, 1, 2, 3, 3, 1, 2, 3, 3, 3, 1, 1, 1, 1, 2, 2, 1, 3, 2, 2, 2, 2, 3, 1, 2, 1, 2, 2, 2, 2, 2, 3, 2, 3, 1, 2, 3, 2, 2, 1, 3, 1, 1, 3, 3, 1, 2, 3, 1, 3, 1, 2, 2, 1, 1, 3, 3, 1, 2, 1, 3, 3, 2, 1, 1, 3, 1, 2, 3, 3, 2, 3, 3, 1, 2, 3, 2, 1, 3, 1, 2, 1, 1, 2, 3, 2, 3, 2, 3, 2, 1, 3, 3, 3, 1, 3, 2, 2, 3, 1, 3, 3, 3, 1, 3, 1, 1, 3, 3, 2, 3, 3, 1, 2, 3, 2, 2, 3, 3, 3, 1, 2, 2, 1, 1, 3, 2, 3, 3, 1, 2, 1, 3, 1, 2, 3, 2, 3, 1, 1, 1, 3, 2, 3, 1, 3, 2, 1, 3, 2, 2, 3, 2, 3, 2, 1, 1, 3, 1, 3, 2, 2, 2, 3, 2, 2, 1, 2, 2, 3, 1, 3, 3, 2, 1, 1, 1, 2, 1, 3, 3, 3, 3, 2, 1, 1, 1, 2, 3, 2, 1, 3, 1, 3, 2, 2, 3, 1, 3, 1, 1, 2, 1, 2, 2, 1, 3, 1, 3, 2, 3, 1, 2, 3, 1, 1, 1, 1, 2, 3, 2, 2, 3, 1, 2, 1, 1, 1, 3, 3, 2, 1, 1, 1, 2, 2, 3, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 2, 2, 3, 2, 3, 3, 3, 3, 1, 2, 3, 1, 1, 1, 3, 1, 3, 2, 2, 1, 3, 1, 3, 2, 2, 1, 2, 2, 3, 1, 3, 2, 1, 1, 3, 3, 2, 3, 3, 2, 3, 1, 3, 1, 3, 3, 1, 3, 2, 1, 3, 1, 3, 2, 1, 2, 2, 1, 3, 1, 1, 3, 3, 2, 2, 3, 1, 2, 3, 3, 2, 2, 1, 1, 1, 1, 3, 2, 1, 1, 3, 2, 1, 1, 3, 3, 3, 2, 3, 2, 1, 1, 1, 1, 1, 3, 2, 2, 1, 2, 1, 3, 2, 1, 3, 2, 1, 3, 1, 1, 3, 3, 3, 3, 2, 1, 1, 2, 1, 3, 3, 2, 1, 2, 3, 2, 1, 2, 2, 2, 1, 1, 3, 1, 1, 2, 3, 1, 1, 2, 3, 1, 3, 1, 1, 2, 2, 1, 2, 2, 2, 3, 1, 1, 1, 3, 1, 3, 1, 3, 3, 1, 1, 1, 3, 2, 3, 3, 2, 2, 1, 1, 1, 2, 1, 2, 2, 3, 3, 3, 1, 1, 3, 3, 2, 3, 3, 2, 3, 3, 3, 2, 3, 3, 1, 2, 3, 2, 1, 1, 1, 1, 3, 3, 3, 3, 2, 1, 1, 1, 1, 3, 1, 1, 2, 1, 1, 2, 3, 2, 1, 2, 2, 2, 3, 2, 1, 3, 2, 3, 2, 3, 2, 1, 1, 2, 3, 1, 3, 3, 3, 1, 2, 1, 2, 2, 1, 2, 2, 2, 2, 2, 3, 2, 1, 3, 3, 2, 2, 2, 3, 1, 2, 1, 1, 3, 2, 3, 2, 3, 2, 3, 3, 2, 2, 1, 3, 1, 2, 1, 3, 1, 1, 1, 3, 1, 1, 3, 3, 2, 2, 1, 3, 1, 1, 3, 2, 3, 1, 1, 3, 1, 3, 3, 1, 2, 3, 1, 3, 1, 1, 2, 1, 3, 1, 1, 1, 1, 2, 1, 3, 1, 2, 1, 3, 1, 3, 1, 1, 2, 2, 2, 3, 2, 2, 1, 2, 3, 3, 2, 3, 3, 3, 2, 3, 3, 1, 3, 2, 3, 2, 1, 2, 1, 1, 1, 2, 3, 2, 2, 1, 2, 2, 1, 3, 1, 3, 3, 3, 2, 2, 3, 3, 1, 2, 2, 2, 3, 1, 2, 1, 3, 1, 2, 
3, 1, 1, 1, 2, 2, 3, 1, 3, 1, 1, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 2, 2, 2, 3, 1, 3, 1, 2, 3, 2, 2, 3, 1, 2, 3, 2, 3, 1, 2, 2, 3, 1, 1, 1, 2, 2, 1, 1, 2, 1, 2, 1, 2, 3, 2, 1, 3, 3, 3, 1, 1, 3, 1, 2, 3, 3, 2, 2, 2, 1, 2, 3, 2, 2, 3, 2, 2, 2, 3, 3, 2, 1, 3, 2, 1, 3, 3, 1, 2, 3, 2, 1, 3, 3, 3, 1, 2, 2, 2, 3, 2, 3, 3, 1, 2, 1, 1, 2, 1, 3, 1, 2, 2, 1, 3, 2, 1, 3, 3, 2, 2, 2, 1, 2, 2, 1, 3, 1, 3, 1, 3, 3, 1, 1, 2, 3, 2, 2, 3, 1, 1, 1, 1, 3, 2, 2, 1, 3, 1, 2, 3, 1, 3, 1, 3, 1, 1, 3, 2, 3, 1, 1, 3, 3, 3, 3, 1, 3, 2, 2, 1, 1, 3, 3, 2, 2, 2, 1, 2, 1, 2, 1, 3, 2, 1, 2, 2, 3, 1, 2, 2, 2, 3, 2, 1, 2, 1, 2, 3, 3, 2, 3, 1, 1, 3, 3, 1, 2, 2, 2, 2, 2, 2, 1, 3, 3, 3, 3, 3, 1, 1, 3, 2, 1, 2, 1, 2, 2, 3, 2, 2, 2, 3, 1, 2, 1, 2, 2, 1, 1, 2, 3, 3, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 1, 3, 3, 2, 3, 2, 3, 3, 2, 2, 1, 1, 1, 3, 3, 1, 1, 1, 3, 3, 2, 1, 2, 1, 1, 2, 2, 1, 1, 1, 3, 1, 1, 2, 3, 2, 2, 1, 3, 1, 2, 3, 1, 2, 2, 2, 2, 3, 2, 3, 3, 1, 2, 1, 2, 3, 1, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 1, 3, 3, 3]

下图是数据的散点图:
这里写图片描述

归一化后的数据:

[[ 0.44832535  0.39805139  0.56233353]
 [ 0.15873259  0.34195467  0.98724416]
 [ 0.28542943  0.06892523  0.47449629]
 ..., 
 [ 0.29115949  0.50910294  0.51079493]
 [ 0.52711097  0.43665451  0.4290048 ]
 [ 0.47940793  0.3768091   0.78571804]]

测试算法

# coding=utf-8
from numpy import *
import operator # 运算符模块,执行排序操作时将用到
import matplotlib.pyplot as plt

# 建立训训练集和相应的标签
def createDataset():
    """Return a tiny 2-feature training matrix and its class labels."""
    # One sample per row; rows 0-1 belong to class A, rows 2-3 to class B.
    samples = array([[1.0, 1.1],
                     [1.0, 1.0],
                     [0.0, 0.0],
                     [0.0, 0.1]])
    tags = ['A', 'A', 'B', 'B']
    return (samples, tags)

# 简单分类
def classify0(inX, dataSet, labels, k):
    """Classify inX by majority vote among its k nearest training samples.

    Args:
        inX: input vector to classify (same width as a dataSet row).
        dataSet: (m, n) training matrix, one sample per row.
        labels: sequence of m class labels, aligned with dataSet rows.
        k: number of nearest neighbors that vote.

    Returns:
        The label with the most votes among the k nearest samples.
    """
    # shape[0] is the number of training rows.
    dataSetSize = dataSet.shape[0]
    # tile() replicates inX to dataSet's shape so we can subtract row-wise.
    diffMat = tile(inX, (dataSetSize, 1)) - dataSet
    sqDiffMat = diffMat ** 2
    # axis=1 sums each row: squared Euclidean distance to every sample.
    sqDistances = sqDiffMat.sum(axis=1)
    distances = sqDistances ** 0.5
    # Indices of distances in ascending order (nearest sample first).
    sortedDistIndicies = distances.argsort()
    classCount = {}  # label -> vote count
    for i in range(k):
        # Tally one vote for the label of the i-th nearest sample.
        voteIlabel = labels[sortedDistIndicies[i]]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1

    # BUG FIX: dict.iteritems() exists only on Python 2; items() works on
    # both.  Sort (label, votes) pairs by vote count, descending.
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    # Winning label is the first pair's key.
    return sortedClassCount[0][0]

# 数据预处理   
def file2matrix(filename):
    '''
     从文件中读入训练数据,并存储为矩阵
    '''
    fr=open(filename,'r')
    # 源代码有错误
    arrayOfLines=fr.readlines()  # 只能读一次
    numberOfLines = len(arrayOfLines) # 得到样本的行数
    returnMat = zeros((numberOfLines,3))  # 得到一个二维矩阵,行数是样本的行数,每行3列
    print('row:%s and column:%s' %(returnMat.shape[0],returnMat.shape[1]))
    classLabelVector = []  # 得到一个一维的数组,存放样本标签
    index = 0
    for line in arrayOfLines:
        #strip() 方法用于移除字符串头尾指定的字符(默认为所有的空字符,包括空格、换行(\n)、制表符(\t)等)
        line = line.strip()  # 把回车符号给去掉
        #对于每一行,按照制表符切割字符串,得到的结果构成一个数组,
        listFromLine = line.split('\t')
        #print(listFromLine[0:4])
        # 把分割好的数据放至数据集,是一个1000*3的数组
        returnMat[index,:] = listFromLine[0:3]       
        classLabelVector.append(int(listFromLine[-1]))
        index += 1
    return ( returnMat,classLabelVector)    
    fr.close()

# 归一化数据
def autoNorm(dataSet):
    # 每列的最小值minvals
    minVals=dataSet.min(0) # 0表示返回每列的最小值
    maxVals=dataSet.max(0)
    ranges=maxVals-minVals
    # 得到dataset相同行列数的0数组
    normDataSet=zeros(shape(dataSet))
    m = dataSet.shape[0] #数组的行数
    # tile复制形如[A,B,C](ABC分别代表每列的最小值)m行
    normDataSet = dataSet - tile(minVals, (m,1)) 
    # 归一化公式,注意是具体特征值相除
    normDataSet = normDataSet/tile(ranges, (m,1))   #element wise divide
    return normDataSet, ranges, minVals

# 分类测试
def datingClassTest():
    hoRatio = 0.10      #hold out 10%
    datingDataMat,datingLabels = file2matrix('C:\Users\LiLong\Desktop\datingTestSet2.txt')
    normMat, ranges, minVals = autoNorm(datingDataMat)
    m = normMat.shape[0]
    # 测试数据的数量
    numTestVecs = int(m*hoRatio)
    print('the test number:',numTestVecs)
    errorCount = 0.0

    for i in range(numTestVecs):
        #normMat[i,:]表示输入的测试集是前100行的数据,normMat[numTestVecs:m,:]表示训练集
        #是100-1000的,datingLabels[numTestVecs:m]表示和训练集是对应的
        classifierResult = classify0(normMat[i,:],normMat[numTestVecs:m,:],\
                                     datingLabels[numTestVecs:m],3)
        print ("the classifier came back with: %d, the real answer is: %d"\
                % (classifierResult, datingLabels[i]))
        if (classifierResult != datingLabels[i]):  errorCount += 1.0
    print "the total error rate is: %f" % (errorCount/float(numTestVecs))
    print  errorCount


# NOTE: the data file read is datingTestSet2.txt, not datingTestSet.txt.
#file_raw='C:\Users\LiLong\Desktop\datingTestSet2.txt'   
if __name__== "__main__": 
    datingClassTest()

结果:

row:1000 and column:3
('the test number:', 100)
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
..., 
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 3, the real answer is: 1
the total error rate is: 0.050000
5.0

结果显示错误率为 5%:100 个测试样本中有 5 个被分类错误(errorCount = 5.0)

  • 2
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值