# kNN coding, with some corrections noted in the comments

from numpy import *
from os import listdir  #used in the handwritting example
import operator
def createDataSet():
    """Build the tiny four-point sample set used to try out the classifier.

    Returns:
        A tuple ``(group, labels)``: ``group`` is a 4x2 array of points and
        ``labels`` is the list of class labels, one per row of ``group``.
    """
    labels = ['A', 'A', 'B', 'B']
    points = [[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]]
    return array(points), labels




def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its ``k`` nearest training rows.

    Distance is Euclidean: the square root of the summed squared differences
    between ``inX`` and each row of ``dataSet``.

    Args:
        inX: the vector to classify (same number of features as a row of
            ``dataSet``).
        dataSet: array of training vectors, one per row (must have ``.shape``).
        labels: sequence of class labels, indexed like the rows of ``dataSet``.
        k: number of nearest neighbours that vote.

    Returns:
        The label with the most votes among the ``k`` nearest rows. On a tie
        the winner depends on the iteration order of the vote dictionary.
    """
    dataSetSize = dataSet.shape[0]
    # Repeat inX once per training row so the subtraction is row-by-row.
    diffMat = tile(inX, (dataSetSize, 1)) - dataSet
    sqDistances = (diffMat ** 2).sum(axis=1)
    distances = sqDistances ** 0.5
    # Indices of the training rows ordered from nearest to farthest.
    sortedDistIndicies = distances.argsort()
    classCount = {}
    for i in range(k):
        voteIlabel = labels[sortedDistIndicies[i]]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1
    # .items() (rather than the Python-2-only .iteritems()) works on both
    # Python 2 and Python 3.
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]


def file2matrix(filename):
    """Load a tab-separated data file into a feature matrix and a label list.

    Each line is stripped and split on tabs. The first three fields become
    one row of the matrix (converted to float); the last field is kept as the
    class label.

    Args:
        filename: path of the text file to read.

    Returns:
        A tuple ``(returnMat, classLabelVector)``: an N x 3 float array and a
        list of N label strings, where N is the number of lines in the file.
    """
    # Read the file once and close it promptly; the line count and the
    # parsed rows both come from the same list.
    with open(filename) as fr:
        lines = fr.readlines()
    returnMat = zeros((len(lines), 3))
    classLabelVector = []
    for index, line in enumerate(lines):
        listFromLine = line.strip().split('\t')
        returnMat[index, :] = [float(value) for value in listFromLine[0:3]]
        # The label is deliberately kept as a string (no int() conversion);
        # callers convert it where they need a number.
        classLabelVector.append(listFromLine[-1])
    return returnMat, classLabelVector


def autoNorm(dataSet):
    """Rescale every column of ``dataSet`` with ``(value - min) / (max - min)``.

    Args:
        dataSet: array of samples, one per row and one feature per column.

    Returns:
        A tuple ``(normDataSet, ranges, minVals)``: the rescaled data, the
        per-column ``max - min`` and the per-column minimum. ``ranges`` is
        returned as computed, so it is 0 for a column whose values are all
        equal.
    """
    minVals = dataSet.min(0)
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    # A constant column has range 0. Divide it by 1 instead so its offsets
    # (all zero) stay 0 rather than turning into NaN from 0/0.
    safeRanges = where(ranges == 0, 1, ranges)
    # Broadcasting applies the per-column min and range to every row.
    normDataSet = (dataSet - minVals) / safeRanges
    return normDataSet, ranges, minVals




def datingClassTest():
    """Hold out the first 10% of the dating data and report the kNN error rate.

    Loads 'datingTestSet2.txt', normalises it, classifies each held-out row
    against the remaining rows with k=3, prints every misclassification and
    finally prints the overall error rate. Returns nothing.
    """
    hoRatio = 0.10
    # datingTestSet2.txt is used because its class column is numeric; per the
    # original author's note the labels in datingTestSet are nominal, which
    # the int() conversions below could not handle.
    datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
    normMat, ranges, minVals = autoNorm(datingDataMat)
    m = normMat.shape[0]
    numTestVecs = int(m * hoRatio)
    errorCount = 0.0
    for i in range(numTestVecs):
        # Rows [0, numTestVecs) are the test set; the rest are training data.
        classifierResult = classify0(normMat[i, :], normMat[numTestVecs:m, :],
                                     datingLabels[numTestVecs:m], 3)
        if classifierResult != datingLabels[i]:
            errorCount += 1.0
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print("the classifier came back with: %d, the real answer is: %d" % (int(classifierResult), int(datingLabels[i])))
    print("the total error rate is: %f" % (errorCount / float(numTestVecs)))


# In the Python shell, the command
#   ax.scatter(datingDataMat[:,1], datingDataMat[:,2], 15.0*array(datingLabels), 15.0*array(datingLabels))
# needs to be corrected to
#   ax.scatter(datingDataMat[:,1], datingDataMat[:,2], 15.0*array(map(int,datingLabels)), 15.0*array(map(int,datingLabels)))
# because file2matrix keeps the labels as strings, so they must be converted to int first.


def classifyPerson():
    """Ask for three feature values on stdin and print the predicted class.

    The answers are normalised with the minimum and range of
    'datingTestSet2.txt', classified against that data with k=3, and the
    matching description from ``resultList`` is printed. Returns nothing.
    """
    resultList = ['not at all', 'in small doses', 'in large doses']
    # raw_input exists only on Python 2; Python 3 renamed it to input.
    try:
        readLine = raw_input
    except NameError:
        readLine = input
    percentTats = float(readLine("percentage of time spent playing video games?"))
    ffMiles = float(readLine("frequent flier miles earned per year?"))
    iceCream = float(readLine("liters of ice cream consumed per year?"))
    datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
    normMat, ranges, minVals = autoNorm(datingDataMat)
    inArr = array([ffMiles, percentTats, iceCream])
    classifierResult = classify0((inArr - minVals) / ranges, normMat, datingLabels, 3)
    # The label comes back as a string, so it must be converted to int before
    # it can index resultList (labels are 1-based, hence the -1).
    # "%s %s" reproduces the output of the original two-item print statement
    # (items separated by one space) on both Python 2 and 3.
    print("%s %s" % ("you will probably like this person: ", resultList[int(classifierResult) - 1]))




def img2vector(filename):
    """Read a 32x32 text image of digit characters into a 1x1024 vector.

    The first 32 characters of each of the first 32 lines are converted with
    ``int()`` and stored row after row.

    Args:
        filename: path of the text image file.

    Returns:
        A 1 x 1024 array; element ``32*i + j`` holds character ``j`` of
        line ``i``.
    """
    returnVect = zeros((1, 1024))
    # The with-block closes the file even if a line is short or malformed.
    with open(filename) as fr:
        for i in range(32):
            lineStr = fr.readline()
            for j in range(32):
                returnVect[0, 32 * i + j] = int(lineStr[j])
    return returnVect


def handwritingClassTest():
    """Run kNN (k=3) over the digit images and print the error statistics.

    Every file in 'trainingDigits' becomes one training row; its label is the
    integer before the first '_' in the file name. Every file in 'testDigits'
    is then classified against that training set. Each prediction is printed,
    followed by the total number of errors and the error rate. Returns
    nothing.
    """
    hwLabels = []
    # Skip hidden entries such as ".DS_Store" (added to folders by OS X),
    # which are not digit images. They are filtered by name rather than by
    # dropping element 0, because listdir() returns entries in arbitrary
    # order and the hidden file may be absent or not first.
    trainingFileList = [name for name in listdir('trainingDigits')
                        if not name.startswith('.')]
    m = len(trainingFileList)
    trainingMat = zeros((m, 1024))
    for i in range(m):
        fileNameStr = trainingFileList[i]
        fileStr = fileNameStr.split('.')[0]
        classNumStr = int(fileStr.split('_')[0])
        hwLabels.append(classNumStr)
        trainingMat[i, :] = img2vector('trainingDigits/%s' % fileNameStr)
    # Same hidden-file filtering as for the training directory.
    testFileList = [name for name in listdir('testDigits')
                    if not name.startswith('.')]
    errorCount = 0.0
    mTest = len(testFileList)
    for i in range(mTest):
        fileNameStr = testFileList[i]
        fileStr = fileNameStr.split('.')[0]
        classNumStr = int(fileStr.split('_')[0])
        # '%' substitutes the file name into the path string.
        vectorUnderTest = img2vector('testDigits/%s' % fileNameStr)
        classifierResult = classify0(vectorUnderTest, trainingMat, hwLabels, 3)
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("the classifier came back with: %d, the real answer is: %d" % (classifierResult, classNumStr))
        if classifierResult != classNumStr:
            errorCount += 1.0
    print("\nthe total number of errors is: %d" % errorCount)
    print("\nthe total error rate is: %f" % (errorCount / float(mTest)))
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值