# 《机器学习实战》kNN 算法:手写数字识别
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2018/4/5 16:58
# @Author : HJH
# @Site :
# @File : kNN.py
# @Software: PyCharm
from numpy import *
import operator
from os import listdir
import pandas as pd
def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its ``k`` nearest neighbours.

    Distances are Euclidean and are computed in floating point.

    Args:
        inX: The feature vector to classify; any array-like that broadcasts
            against the rows of ``dataSet`` (e.g. length ``d`` or ``(1, d)``).
        dataSet: 2-D array-like of training samples, one sample per row.
        labels: Sequence where ``labels[i]`` is the label of row ``i`` of
            ``dataSet``. Labels must be hashable.
        k: Number of nearest neighbours that take part in the vote.

    Returns:
        The label with the most votes among the ``k`` nearest samples. When
        several labels tie, the one whose first voter is closest to ``inX``
        wins.

    Raises:
        ValueError: If ``dataSet`` has no rows, or ``k`` is not between 1 and
            the number of rows.
    """
    # Convert to float up front: with unsigned/integer inputs the subtraction
    # and the squaring below would otherwise wrap around silently.
    samples = asarray(dataSet, dtype=float)
    query = asarray(inX, dtype=float)

    sampleCount = samples.shape[0]
    if sampleCount == 0:
        raise ValueError("dataSet must contain at least one sample")
    if not 1 <= k <= sampleCount:
        raise ValueError(
            "k must be between 1 and the number of samples (%d), got %r"
            % (sampleCount, k)
        )

    # Euclidean distance from the query to every sample; broadcasting
    # stretches the query across all rows without materialising copies.
    diffMat = query - samples
    distances = (diffMat ** 2).sum(axis=1) ** 0.5

    # Indices ordered from nearest to farthest. A stable sort keeps samples at
    # equal distance in their original row order, so results are reproducible.
    sortedDistIndicies = distances.argsort(kind='stable')

    # Tally the labels of the k nearest samples (nearest first).
    classCount = {}
    for index in sortedDistIndicies[:k]:
        voteIlabel = labels[index]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1

    # sorted() is stable, so among equally frequent labels the one that was
    # inserted first (i.e. seen closest to the query) stays in front.
    sortedClassCount = sorted(
        classCount.items(), key=operator.itemgetter(1), reverse=True
    )
    return sortedClassCount[0][0]
# Convert a handwritten-digit text image into a row vector.
def img2vector(filename, size=32):
    """Read a ``size`` x ``size`` text image into a ``(1, size*size)`` array.

    The first ``size`` lines of the file are read, and the first ``size``
    characters of each line are converted with ``int()``; any further
    characters or lines are ignored. Rows are stored one after another, so
    the pixel at row ``i``, column ``j`` ends up at index ``size*i + j``.

    Args:
        filename: Path of the text file to read.
        size: Number of rows and of columns to read. Defaults to 32.

    Returns:
        A float array of shape ``(1, size*size)`` holding the pixel values.

    Raises:
        ValueError: If the file has fewer than ``size`` lines, a line has
            fewer than ``size`` characters, or a character is not a digit.
    """
    returnVect = zeros((1, size * size))
    with open(filename, 'r') as fr:
        for i in range(size):
            # Drop the line terminator so it is never mistaken for a pixel.
            lineStr = fr.readline().rstrip('\r\n')
            if len(lineStr) < size:
                raise ValueError(
                    "%s: line %d has %d characters, expected at least %d"
                    % (filename, i + 1, len(lineStr), size)
                )
            # Fill one whole image row per assignment.
            returnVect[0, size * i:size * (i + 1)] = [
                int(ch) for ch in lineStr[:size]
            ]
    return returnVect
def _labelFromFileName(fileNameStr):
    """Return the integer label encoded at the start of a sample file name.

    The label is whatever precedes the first underscore of the part of the
    name before the first dot, e.g. ``'9_45.txt'`` yields ``9``.
    """
    return int(fileNameStr.split('.')[0].split('_')[0])


def handwritingClassTest(trainingDir='trainingDigits', testDir='testDigits', k=3):
    """Evaluate the kNN classifier on the handwritten-digit files.

    Every file in ``trainingDir`` is loaded with ``img2vector`` into one row
    of the training matrix. Each file in ``testDir`` is then classified with
    ``classify0`` and compared with the label taken from its file name. One
    line is printed per test file, followed by the total number of errors and
    the error rate.

    Args:
        trainingDir: Directory holding the training samples.
        testDir: Directory holding the test samples.
        k: Number of neighbours passed to ``classify0``.

    Returns:
        None. Results are reported on standard output only.
    """
    # listdir() returns entries in arbitrary order; sorting makes both the
    # training-matrix row order and the printed output reproducible.
    trainingFileList = sorted(listdir(trainingDir))
    hwLabels = [_labelFromFileName(name) for name in trainingFileList]
    trainingMat = zeros((len(trainingFileList), 1024))
    for i, fileNameStr in enumerate(trainingFileList):
        trainingMat[i, :] = img2vector('%s/%s' % (trainingDir, fileNameStr))

    testFileList = sorted(listdir(testDir))
    mTest = len(testFileList)
    errorCount = 0
    for fileNameStr in testFileList:
        classNumStr = _labelFromFileName(fileNameStr)
        vectorUnderTest = img2vector('%s/%s' % (testDir, fileNameStr))
        classifierResult = classify0(vectorUnderTest, trainingMat, hwLabels, k)
        print("the classifier came back with: %d, the real answer is: %d" % (classifierResult, classNumStr))
        if classifierResult != classNumStr:
            errorCount += 1

    print("\nthe total number of errors is: %d" % errorCount)
    # With no test files the rate is undefined; report NaN instead of
    # failing with a division by zero.
    errorRate = errorCount / float(mTest) if mTest else float('nan')
    print("\nthe total error rate is: %f" % errorRate)
# Run the digit-recognition evaluation only when executed as a script.
if __name__ == '__main__':
    handwritingClassTest()
# 手写数字数据集下载链接:https://pan.baidu.com/s/1R5e5BTbVu8jo2nzIuBO_Gw (提取密码:qfkb)