机器学习实战 k-近邻算法

约会网站:

#!/usr/bin/env python
# -*- coding: utf-8 -*-
from numpy import *
import operator
import matplotlib.pyplot as plt

def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its k nearest training rows.

    Distances are Euclidean and are computed between ``inX`` and every row
    of ``dataSet``.

    Args:
        inX: Feature vector to classify; must broadcast against one row of
            ``dataSet``.
        dataSet: 2-D array-like of training samples, one sample per row.
        labels: Sequence of class labels; ``labels[i]`` belongs to row ``i``
            of ``dataSet``.
        k: Number of nearest neighbours that take part in the vote.  Values
            larger than the number of training rows are clamped to it.

    Returns:
        The label that received the most votes among the k nearest rows.

    Raises:
        ValueError: If ``dataSet`` has no rows or ``k`` is smaller than 1.
    """
    dataSet = asarray(dataSet)
    dataSetSize = dataSet.shape[0]
    if dataSetSize == 0:
        raise ValueError("dataSet must contain at least one training sample")
    if k < 1:
        raise ValueError("k must be at least 1")
    if k > dataSetSize:
        # Cannot consult more neighbours than there are training rows.
        k = dataSetSize

    # Broadcasting subtracts inX from every row; no explicit tile() needed.
    diffMat = dataSet - asarray(inX)
    distances = (diffMat ** 2).sum(axis=1) ** 0.5
    sortedDistIndicies = distances.argsort()  # indices, nearest first

    classCount = {}
    for neighbourIndex in sortedDistIndicies[:k]:
        voteIlabel = labels[neighbourIndex]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1

    # items() exists on both Python 2 and 3 (iteritems() is Python 2 only).
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]

def file2matrix(filename):
    """Load a tab-separated data file into a feature matrix and label list.

    Each non-blank line must hold at least three numeric feature columns
    followed by an integer class label in the last column.

    Args:
        filename: Path of the text file to read.

    Returns:
        A tuple ``(returnMat, classLabelVector)`` where ``returnMat`` is an
        ``(n, 3)`` float array built from the first three columns and
        ``classLabelVector`` is a list of ``n`` integer labels taken from
        the last column.
    """
    # Read once and close the handle promptly; blank lines are skipped so
    # that a trailing newline does not break parsing.
    with open(filename) as fr:
        rows = [line.strip().split('\t') for line in fr if line.strip()]

    returnMat = zeros((len(rows), 3))
    classLabelVector = []
    for index, listFromLine in enumerate(rows):
        returnMat[index, :] = [float(value) for value in listFromLine[0:3]]
        classLabelVector.append(int(listFromLine[-1]))
    return returnMat, classLabelVector

def autoNorm(dataSet):
    """Rescale every feature column of ``dataSet`` to the range [0, 1].

    Applies ``newValue = (oldValue - min) / (max - min)`` column by column.
    A column whose values are all equal (range 0) is mapped to 0.0 instead
    of producing NaN from a division by zero.

    Args:
        dataSet: 2-D NumPy array, one sample per row.

    Returns:
        A tuple ``(normDataSet, ranges, minVals)``: the normalised array,
        the per-column ``max - min`` and the per-column minimum.  ``ranges``
        is returned unmodified, so it still contains 0 for constant columns.
    """
    minVals = dataSet.min(0)  # per-column minimum
    maxVals = dataSet.max(0)  # per-column maximum
    ranges = maxVals - minVals
    # Divide by 1.0 where the range is zero; the numerator is 0 there anyway.
    # The float literal also forces true division for integer input.
    safeRanges = where(ranges == 0, 1.0, ranges)
    # Broadcasting applies the per-column vectors to every row.
    normDataSet = (dataSet - minVals) / safeRanges
    return normDataSet, ranges, minVals

def datingClassTest(hoRatio=0.10, filename='datingTestSet2.txt', k=3):
    """Estimate the kNN error rate on the dating data with a hold-out split.

    The first ``hoRatio`` fraction of the (normalised) rows is used as the
    test set; the remaining rows are the training set.  One line is printed
    per test sample, followed by the error rate and the raw error count.

    Args:
        hoRatio: Fraction of rows held out for testing (default 10%).
        filename: Tab-separated data file passed to ``file2matrix``.
        k: Number of neighbours passed to ``classify0``.

    Raises:
        ValueError: If the hold-out set would be empty, which would make
            the error rate undefined.
    """
    datingDataMat, datingLabels = file2matrix(filename)
    normMat, ranges, minVals = autoNorm(datingDataMat)
    m = normMat.shape[0]
    numTestVecs = int(m * hoRatio)
    if numTestVecs == 0:
        raise ValueError(
            "hold-out set is empty: %d rows with hoRatio=%s" % (m, hoRatio))

    trainingMat = normMat[numTestVecs:m, :]
    trainingLabels = datingLabels[numTestVecs:m]
    errorCount = 0.0
    for i in range(numTestVecs):
        classifierResult = classify0(normMat[i, :], trainingMat,
                                     trainingLabels, k)
        # print(...) with one argument behaves the same on Python 2 and 3.
        print("the classifier came back with: %d, the real answer is: %d"
              % (classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1.0
    print("the total error rate is: %f" % (errorCount / float(numTestVecs)))
    print(errorCount)

def createPlot():
    """Show a scatter plot of the dating data, styled by class label.

    Column 1 of the data goes on the x axis and column 2 on the y axis;
    both marker size and marker colour are 15 times the class label.
    """
    figure = plt.figure()
    # 111 = a 1x1 grid, first cell.  (For example 349 would split the canvas
    # into 3 rows and 4 columns and draw in the 9th cell, counting from left
    # to right, top to bottom.)
    axes = figure.add_subplot(111)
    datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
    xValues = datingDataMat[:, 1]
    yValues = datingDataMat[:, 2]
    markerSizes = 15.0 * array(datingLabels)
    markerColours = 15.0 * array(datingLabels)
    axes.scatter(xValues, yValues, markerSizes, markerColours)
    axes.axis([-2, 25, -0.2, 2.0])
    plt.xlabel('Percentage of Time Spent Playing Video Games')
    plt.ylabel('Liters of Ice Cream Consumed Per Week')
    plt.show()

if __name__ == '__main__':
    # Script entry point: draw the scatter plot of the dating data first,
    # then run the hold-out evaluation of the classifier.
    createPlot()
    datingClassTest()


datingTestSet2.txt(示例数据,以制表符分隔:前三列为特征值,最后一列为类别标签):

40920	8.326976	0.953952	3
14488	7.153469	1.673904	2
26052	1.441871	0.805124	1
75136	13.147394	0.428964	1
35948	6.830792	1.213192	3


手写字识别:

#!/usr/bin/env python
# -*- coding: utf-8 -*-
from numpy import *
import operator
from os import listdir

def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its k nearest training rows.

    Distances are Euclidean and are computed between ``inX`` and every row
    of ``dataSet``.

    Args:
        inX: Feature vector to classify; must broadcast against one row of
            ``dataSet`` (a ``(1, n)`` row vector works as well as ``(n,)``).
        dataSet: 2-D array-like of training samples, one sample per row.
        labels: Sequence of class labels; ``labels[i]`` belongs to row ``i``
            of ``dataSet``.
        k: Number of nearest neighbours that take part in the vote.  Values
            larger than the number of training rows are clamped to it.

    Returns:
        The label that received the most votes among the k nearest rows.

    Raises:
        ValueError: If ``dataSet`` has no rows or ``k`` is smaller than 1.
    """
    dataSet = asarray(dataSet)
    dataSetSize = dataSet.shape[0]
    if dataSetSize == 0:
        raise ValueError("dataSet must contain at least one training sample")
    if k < 1:
        raise ValueError("k must be at least 1")
    if k > dataSetSize:
        # Cannot consult more neighbours than there are training rows.
        k = dataSetSize

    # Broadcasting subtracts inX from every row; no explicit tile() needed.
    diffMat = dataSet - asarray(inX)
    distances = (diffMat ** 2).sum(axis=1) ** 0.5
    sortedDistIndicies = distances.argsort()  # indices, nearest first

    classCount = {}
    for neighbourIndex in sortedDistIndicies[:k]:
        voteIlabel = labels[neighbourIndex]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1

    # items() exists on both Python 2 and 3 (iteritems() is Python 2 only).
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]

def img2vector(filename, size=32):
    """Flatten a text-encoded square digit image into a single row vector.

    The file is expected to contain ``size`` lines, each starting with
    ``size`` digit characters.  Pixel ``(i, j)`` is stored at column
    ``size * i + j`` of the result.

    Args:
        filename: Path of the text image file.
        size: Number of rows and of columns to read (default 32).

    Returns:
        A ``(1, size * size)`` float array holding the pixel values.

    Raises:
        ValueError: If a row has fewer than ``size`` characters (including
            a missing row), or contains a non-digit character.
    """
    returnVect = zeros((1, size * size))
    # The context manager closes the file even if parsing fails.
    with open(filename) as fr:
        for i in range(size):
            lineStr = fr.readline().rstrip('\r\n')
            if len(lineStr) < size:
                raise ValueError(
                    "%s: row %d has %d characters, expected %d"
                    % (filename, i, len(lineStr), size))
            returnVect[0, size * i:size * (i + 1)] = [
                int(ch) for ch in lineStr[:size]]
    return returnVect

def _digitFromFileName(fileNameStr):
    """Return the class digit encoded in a name such as ``6_43.txt`` (-> 6)."""
    fileStr = fileNameStr.split('.')[0]  # 6_43.txt -> 6_43
    return int(fileStr.split('_')[0])    # 6_43 -> 6


def handwritingClassTest(trainingDir='digits/trainingDigits',
                         testDir='digits/testDigits', k=3):
    """Evaluate the kNN classifier on the handwritten-digit text images.

    Every file in ``trainingDir`` is loaded with ``img2vector`` to build the
    training matrix; every file in ``testDir`` is then classified with
    ``classify0``.  The true label of a file is the number before the
    underscore in its name.  One line is printed per test file, followed by
    the total number of errors and the error rate.

    Args:
        trainingDir: Directory holding the training images.
        testDir: Directory holding the test images.
        k: Number of neighbours passed to ``classify0``.

    Raises:
        ValueError: If ``testDir`` contains no files, which would make the
            error rate undefined.
    """
    # Load the training set.
    hwLabels = []
    trainingFileList = listdir(trainingDir)
    m = len(trainingFileList)
    trainingMat = zeros((m, 1024))
    for i, fileNameStr in enumerate(trainingFileList):
        hwLabels.append(_digitFromFileName(fileNameStr))
        trainingMat[i, :] = img2vector('%s/%s' % (trainingDir, fileNameStr))

    # Iterate through the test set.
    testFileList = listdir(testDir)
    mTest = len(testFileList)
    if mTest == 0:
        raise ValueError("no test files found in %s" % testDir)

    errorCount = 0.0
    for fileNameStr in testFileList:
        classNumStr = _digitFromFileName(fileNameStr)
        vectorUnderTest = img2vector('%s/%s' % (testDir, fileNameStr))
        classifierResult = classify0(vectorUnderTest, trainingMat, hwLabels, k)
        # print(...) with one argument behaves the same on Python 2 and 3.
        print("the classifier came back with: %d, the real answer is: %d"
              % (classifierResult, classNumStr))
        if classifierResult != classNumStr:
            errorCount += 1.0
    print("\nthe total number of errors is: %d" % errorCount)
    print("\nthe total error rate is: %f" % (errorCount / float(mTest)))

if __name__ == '__main__':
    # Script entry point: run the handwritten-digit evaluation.
    handwritingClassTest()

digits/trainingDigits/0_0.txt(32×32 的 0/1 文本图像,文件名中下划线前的数字即类别标签):

00000000000001111000000000000000
00000000000011111110000000000000
00000000001111111111000000000000
00000001111111111111100000000000
00000001111111011111100000000000
00000011111110000011110000000000
00000011111110000000111000000000
00000011111110000000111100000000
00000011111110000000011100000000
00000011111110000000011100000000
00000011111100000000011110000000
00000011111100000000001110000000
00000011111100000000001110000000
00000001111110000000000111000000
00000001111110000000000111000000
00000001111110000000000111000000
00000001111110000000000111000000
00000011111110000000001111000000
00000011110110000000001111000000
00000011110000000000011110000000
00000001111000000000001111000000
00000001111000000000011111000000
00000001111000000000111110000000
00000001111000000001111100000000
00000000111000000111111000000000
00000000111100011111110000000000
00000000111111111111110000000000
00000000011111111111110000000000
00000000011111111111100000000000
00000000001111111110000000000000
00000000000111110000000000000000
00000000000011000000000000000000


  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值