机器学习实战之最近邻法分类算法分享交流

整理并分析了书中代码里的一些错误和不适用之处(主要由Python版本更新引起)。目前只包含基础部分的代码,后面例子的代码待调试分析后再上传,供大家学习交流。

#-*-coding:utf-8-*-
from numpy import *
import operator
import matplotlib.pyplot as plt
def classify0(inX,dataSet,labels,k):
    '''
    Classify inX by majority vote among its k nearest training examples.

    The procedure is:
    1. compute the Euclidean distance between inX and every row of dataSet
    2. sort the distances in increasing order
    3. take the k rows with the smallest distance to inX
    4. return the label that occurs most often among those rows
    When several labels share the highest count, the one met first while
    walking the neighbours from nearest to farthest wins.

    :param inX: the input vector to classify
    :param dataSet: NumPy array of training examples, one example per row
    :param labels: sequence of labels; labels[i] belongs to row i of dataSet
    :param k: the number of nearest neighbours used in the voting; a value
              larger than the number of rows is clamped to the number of rows
    :return: the majority label among the k nearest neighbours
    :raises ValueError: if dataSet has no rows or k is smaller than 1
    '''
    dataSetSize=dataSet.shape[0]  # number of training examples (rows)
    if dataSetSize==0:
        raise ValueError("dataSet must contain at least one training example")
    if k<1:
        raise ValueError("k must be a positive integer, got %r" %(k,))
    # Asking for more neighbours than exist used to fail with IndexError;
    # vote among all available examples instead.  (A plain `if` is used
    # because `from numpy import *` may shadow the builtin min().)
    if k>dataSetSize:
        k=dataSetSize
    # Euclidean distance.  Broadcasting subtracts every row from inX without
    # first building a tiled copy of inX.
    diffMat=inX-dataSet
    sqDistance=(diffMat**2).sum(axis=1)  # axis=1: add up the columns of each row
    distance=sqDistance**0.5
    sortedDistIndicies=distance.argsort()  # row indices, nearest first
    classCount={}
    for neighbourIndex in sortedDistIndicies[:k]:
        voteIlabel=labels[neighbourIndex]
        classCount[voteIlabel]=classCount.get(voteIlabel,0)+1
    # sorted() is stable, so among equal counts the label inserted first
    # (i.e. belonging to the nearer neighbour) stays in front.
    sortedClassCount=sorted(classCount.items(),key=operator.itemgetter(1),reverse=True)
    return sortedClassCount[0][0]


def file2martix(filename):
    '''
    Parse a tab-separated data file into a feature matrix and a label list.

    Every non-blank line must hold at least three numeric fields followed by
    a class label in the last field, e.g. the dating data set with:
    one.Number of frequent flyer miles earned per year.
    two.Percentage of time spent playing video games.
    three.Liters of ice cream consumed per week.

    :param filename: path of the text file to read (e.g. datingTestSet.txt)
    :return: (returnMat, classLabelVector) where returnMat is an N*3 float
             array built from the first three fields of each line and
             classLabelVector is the list of last fields, kept as strings
    '''
    rows=[]
    # Read the file once and close it deterministically.
    with open(filename) as fr:
        for line in fr:
            line=line.strip()  # drop surrounding whitespace and the newline
            if line:  # a blank (e.g. trailing) line carries no record
                rows.append(line.split('\t'))
    returnMat=zeros((len(rows),3))
    classLabelVector=[]
    for index,listFromLine in enumerate(rows):
        # the first three fields are the numeric features of this record
        returnMat[index,:]=[float(value) for value in listFromLine[0:3]]
        # the last field is the class label; it is stored unchanged as text
        classLabelVector.append(listFromLine[-1])
    return returnMat,classLabelVector

def plotpicture(dataingMata,classingLabel):
    '''
    Show a scatter plot of column 1 against column 2 of the data matrix.

    :param dataingMata: 2-D array; column 1 is drawn on the x axis and
                        column 2 on the y axis
    :param classingLabel: accepted for interface compatibility; it is not
                          used by the plot
    :return: None, the figure is displayed with plt.show()
    '''
    fig=plt.figure()
    ax=fig.add_subplot(111)
    ax.scatter(dataingMata[:,1],dataingMata[:,2])
    ax.axis([-2,25,-0.2,2.0])  # fixed view window: [xmin, xmax, ymin, ymax]
    # Label the axes through the Axes object that was just drawn on.
    ax.set_xlabel('Percentage of Time Spent Playing Video Games')
    ax.set_ylabel('Liters of Ice Cream Consumed Per Week')
    plt.show()


def autoNorm(dataSet):
    '''
    Rescale every column of dataSet linearly so its values lie in [0, 1].

    newValue = (oldValue - min) / (max - min), computed per column.

    :param dataSet: 2-D NumPy array, one example per row, one feature per column
    :return: (normDataSet, ranges, minVals) where normDataSet is the rescaled
             copy of dataSet, ranges is max - min of each column and minVals
             is the minimum of each column.  A column whose values are all
             equal has a range of 0; it is reported as 1 so that dividing by
             ranges stays well defined (such a column normalises to 0).
    '''
    minVals=dataSet.min(0)  # the 0 selects the minimum of each column
    maxVals=dataSet.max(0)  # likewise the maximum of each column
    ranges=maxVals-minVals
    # Guard against division by zero for constant columns.
    ranges=where(ranges==0,1,ranges)
    # Broadcasting applies the per-column vectors to every row, so no tiled
    # copies are needed; / is element-wise division.
    normDataSet=(dataSet-minVals)/ranges
    return normDataSet,ranges,minVals

def datingClassTest(normMat,datingLabels,hoRatio=0.10,k=3):
    '''
    Classifier testing code for the dating site data.

    The first int(m*hoRatio) rows of normMat are used as test vectors and
    the remaining rows as training data.  Every prediction is printed next
    to the real label, followed by the overall error rate.

    :param normMat: 2-D array of (normalised) feature vectors, one per row
    :param datingLabels: sequence of labels; datingLabels[i] belongs to row i
    :param hoRatio: fraction of the rows held out for testing
    :param k: number of nearest neighbours passed to classify0
    :return: None
    '''
    m=normMat.shape[0]
    numTestVecs=int(m*hoRatio)
    if numTestVecs==0:
        # Too few rows for the requested ratio: without this guard the error
        # rate below would divide by zero.
        print("not enough samples for a hold-out test: %d rows with hoRatio=%s" %(m,hoRatio))
        return
    errorCount=0
    for i in range(numTestVecs):
        # rows [numTestVecs, m) form the training set for every test vector
        classifierResult=classify0(normMat[i,:],normMat[numTestVecs:m,:],datingLabels[numTestVecs:m],k)
        print("the classifier came back with: %s, the real answer is: %s"\
              %(classifierResult,datingLabels[i]))
        if classifierResult!=datingLabels[i]:
            errorCount+=1
    print("the total error rate is: %f" %(errorCount/float(numTestVecs)))


def classifyPerson(datingLabels,normMat,minVals,ranges):
    '''
    Dating site predictor: ask for one person's features and print a prediction.

    The three values typed by the user are normalised with minVals and
    ranges and classified by classify0 with k=3 against normMat.

    :param datingLabels: sequence of labels; datingLabels[i] belongs to row i of normMat
    :param normMat: 2-D array of normalised training vectors
    :param minVals: per-feature minimum used for normalisation
    :param ranges: per-feature range used for normalisation
    :return: None, the prediction is printed
    '''
    resultList=['not at all','in small doses','in large doses']
    percenTats=float(input("percentage of time spent playing video games?"))
    ffMiles=float(input("frequent flier miles earned per year?"))
    iceCream=float(input("liters of ice cream consumed per week?"))
    # feature order: miles, game time, ice cream
    inArr=array([ffMiles,percenTats,iceCream])
    classifierResult=classify0((inArr-minVals)/ranges,normMat,datingLabels,3)
    # A numeric label 1..3 is translated into its description from
    # resultList; any other label (e.g. a word) is printed as it is.
    try:
        resultIndex=int(classifierResult)
    except (TypeError,ValueError):
        resultIndex=0
    if 1<=resultIndex<=len(resultList):
        classifierResult=resultList[resultIndex-1]
    # Use % formatting: passing the value as a second print() argument left
    # the literal "%s" in the output.
    print("You will probably like this person with:%s" %(classifierResult,))

#dataMata,classLabelMat=file2martix('datingTestSet.txt')
#normDataSet,ranges,minVals=autoNorm(dataMata)
#datingClassTest(normDataSet,classLabelMat)
#classifyPerson(classLabelMat,normDataSet,minVals,ranges)
#以上测试过,没有问题.

例子:handwriting recognition 的代码,测试过,没有问题。

#-*- coding:utf-8 -*-
from numpy import *
from os import listdir
import operator

def classify0(inX,dataSet,labels,k):
    '''
    Classify inX by majority vote among its k nearest training examples.

    The procedure is:
    1. compute the Euclidean distance between inX and every row of dataSet
    2. sort the distances in increasing order
    3. take the k rows with the smallest distance to inX
    4. return the label that occurs most often among those rows
    When several labels share the highest count, the one met first while
    walking the neighbours from nearest to farthest wins.

    :param inX: the input vector to classify
    :param dataSet: NumPy array of training examples, one example per row
    :param labels: sequence of labels; labels[i] belongs to row i of dataSet
    :param k: the number of nearest neighbours used in the voting; a value
              larger than the number of rows is clamped to the number of rows
    :return: the majority label among the k nearest neighbours
    :raises ValueError: if dataSet has no rows or k is smaller than 1
    '''
    dataSetSize=dataSet.shape[0]  # number of training examples (rows)
    if dataSetSize==0:
        raise ValueError("dataSet must contain at least one training example")
    if k<1:
        raise ValueError("k must be a positive integer, got %r" %(k,))
    # Asking for more neighbours than exist used to fail with IndexError;
    # vote among all available examples instead.  (A plain `if` is used
    # because `from numpy import *` may shadow the builtin min().)
    if k>dataSetSize:
        k=dataSetSize
    # Euclidean distance.  Broadcasting subtracts every row from inX without
    # first building a tiled copy of inX.
    diffMat=inX-dataSet
    sqDistance=(diffMat**2).sum(axis=1)  # axis=1: add up the columns of each row
    distance=sqDistance**0.5
    sortedDistIndicies=distance.argsort()  # row indices, nearest first (ascending order)
    classCount={}
    for neighbourIndex in sortedDistIndicies[:k]:
        voteIlabel=labels[neighbourIndex]
        classCount[voteIlabel]=classCount.get(voteIlabel,0)+1
    # operator.itemgetter(1) builds a function that picks the count out of
    # each (label, count) pair.  sorted() is stable, so among equal counts
    # the label inserted first (the nearer neighbour's) stays in front.
    sortedClassCount=sorted(classCount.items(),key=operator.itemgetter(1),reverse=True)
    return sortedClassCount[0][0]

def img2vevtor(filename):
    '''
    Read a 32x32 text image of digit characters into a 1x1024 row vector.

    The first 32 characters of each of the first 32 lines are converted
    with int() and stored row after row.

    :param filename: path of the text image
    :return: NumPy array of shape (1, 1024)
    '''
    returnVect=zeros((1,1024))
    # The context manager closes the file even if a line is malformed.
    with open(filename) as fr:
        for i in range(32):
            lineStr=fr.readline()
            for j in range(32):
                # pixel (row i, column j) lands at flat position 32*i+j
                returnVect[0,32*i+j]=int(lineStr[j])
    return returnVect

def _classFromFileName(fileNameStr):
    '''Return the integer before the first "_" of a name like "3_12.txt", or None if it is not an integer.'''
    classStr=fileNameStr.split('.')[0].split('_')[0]
    try:
        return int(classStr)
    except ValueError:
        return None


def handwritigClassTest(trainingDir='trainingDigits',testDir='testDigits',k=3):
    '''
    Handwriting recognition test.

    Every file in trainingDir is loaded with img2vevtor and labelled with
    the integer encoded at the start of its file name.  Every file in
    testDir is then classified with classify0; each prediction is printed
    next to the real answer, followed by the error count and error rate.
    Directory entries whose name does not start with an integer (for
    example hidden files) are skipped.

    :param trainingDir: directory holding the training images
    :param testDir: directory holding the test images
    :param k: number of nearest neighbours passed to classify0
    :return: None
    '''
    hwLables=[]
    trainingFileList=[name for name in listdir(trainingDir)
                      if _classFromFileName(name) is not None]
    m=len(trainingFileList)
    trainingMat=zeros((m,1024))  # one flattened 32x32 image per row
    for i,fileNameStr in enumerate(trainingFileList):
        hwLables.append(_classFromFileName(fileNameStr))
        trainingMat[i,:]=img2vevtor('%s/%s' %(trainingDir,fileNameStr))
    testFileList=[name for name in listdir(testDir)
                  if _classFromFileName(name) is not None]
    mTest=len(testFileList)
    if mTest==0:
        # Nothing to classify: without this guard the error rate below
        # would divide by zero.
        print("no test files found in %s" %testDir)
        return
    errcount=0
    for fileNameStr in testFileList:
        classNumStr=_classFromFileName(fileNameStr)
        vectorUnderTest=img2vevtor('%s/%s' %(testDir,fileNameStr))
        classifierResult=classify0(vectorUnderTest,trainingMat,hwLables,k)
        print("the classifier came back with: %d,the real answer is %d"\
              %(classifierResult,classNumStr))
        if classifierResult!=classNumStr:
            errcount+=1
    print("\n the total number of errors is: %d" %errcount)
    print("\n the total error rate is: %f" %(errcount/float(mTest)))

# Run the handwriting test only when this file is executed as a script,
# not when it is imported.
if __name__=='__main__':
    handwritigClassTest()

 

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值