# Machine Learning in Action - kNN: text data and handwritten character recognition
from numpy import *
import operator
from os import listdir
# knn分类函数
def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its ``k`` nearest training rows.

    Parameters:
        inX: input vector to classify.
        dataSet: training data array with one sample per row.
        labels: sequence of labels, indexed in step with the rows of
            ``dataSet``.
        k: number of nearest neighbours that take part in the vote.

    Returns:
        The label with the highest vote count among the ``k`` nearest rows.
        When counts tie, the label that entered the vote first (the one of
        the nearer neighbour) is returned, because the sort is stable.
    """
    sampleCount = dataSet.shape[0]

    # Repeat inX once per training row so it can be subtracted row by row,
    # then collapse the squared differences into one Euclidean distance
    # per training row.
    offsets = tile(inX, (sampleCount, 1)) - dataSet
    distances = (offsets ** 2).sum(axis=1) ** 0.5

    # argsort returns row indices ordered from the smallest distance upwards.
    nearestFirst = distances.argsort()

    votes = {}
    for rank in range(k):
        neighbourLabel = labels[nearestFirst[rank]]
        if neighbourLabel in votes:
            votes[neighbourLabel] += 1
        else:
            votes[neighbourLabel] = 1

    # Order the (label, count) pairs by count, highest first.
    ranking = list(votes.items())
    ranking.sort(key=operator.itemgetter(1), reverse=True)
    winner, _ = ranking[0]
    return winner
# 创建数据集,包括数据和标签
def createDataSet():
    """Build the small four-point demo data set.

    Returns:
        A tuple ``(group, labels)``: a 4x2 array of points and the list of
        their class labels, in the same order.
    """
    samples = (
        ([1.0, 1.1], 'A'),
        ([1.0, 1.0], 'A'),
        ([0, 0], 'B'),
        ([0, 0.1], 'B'),
    )
    group = array([point for point, _ in samples])
    labels = [tag for _, tag in samples]
    return group, labels
# 将文件里的数据转换到列表
def file2matrix(filename):
    """Load a tab-separated data file into a feature matrix and a label list.

    Every non-blank line is stripped and split on tabs. Its first three
    fields become one row of the matrix, and its last field, converted with
    ``int``, becomes the label of that row.

    Parameters:
        filename: path of the text file to read.

    Returns:
        A tuple ``(returnMat, classLabelVector)``: an array with one row per
        data line and three columns, and the list of integer labels in the
        same order.

    Raises:
        ValueError: if a feature field is not a valid float or the last
            field is not a valid integer.
    """
    # Read the file once inside a context manager so the handle is always
    # closed. Blank lines are skipped rather than parsed.
    with open(filename) as fr:
        stripped = (line.strip() for line in fr)
        records = [text.split('\t') for text in stripped if text]

    returnMat = zeros((len(records), 3))
    classLabelVector = []
    for index, fields in enumerate(records):
        returnMat[index, :] = [float(value) for value in fields[0:3]]
        # A negative index picks the last column, whatever the field count.
        classLabelVector.append(int(fields[-1]))
    return returnMat, classLabelVector
# 数据归一化,任何特征值都转化到0-1或-1-1之间
def autoNorm(dataSet):
    """Min-max normalise each column of ``dataSet``.

    Each value is mapped to ``(value - column_min) / (column_max -
    column_min)``. A column whose values are all equal is mapped to 0
    instead of NaN.

    Parameters:
        dataSet: 2-D array with one sample per row and one feature per
            column.

    Returns:
        A tuple ``(normDataSet, ranges, minVals)``: the normalised array,
        the per-column ``max - min`` (0 for a constant column) and the
        per-column minimum.
    """
    minVals = dataSet.min(0)
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    # A constant column has range 0; divide by 1 there so it normalises to 0
    # rather than producing NaN from 0 / 0. The returned ``ranges`` keeps the
    # true values.
    safeRanges = where(ranges == 0, 1, ranges)
    # Broadcasting applies the per-column vectors to every row.
    normDataSet = (dataSet - minVals) / safeRanges
    return normDataSet, ranges, minVals
# 使用文件中的数据训练和测试
def datingClassTest():
    """Evaluate the kNN classifier on a hold-out slice of the dating data.

    Loads 'datingTestSet2.txt', normalises it with ``autoNorm`` and uses the
    first ``hoRatio`` share of the rows as test vectors, with the remaining
    rows as training data. Each test vector is classified with k=3. Prints
    every prediction next to the real label, then the error rate and the
    error count.
    """
    hoRatio = 0.05  # share of the rows held out for testing (5%)
    datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
    normMat, _, _ = autoNorm(datingDataMat)

    totalRows = normMat.shape[0]
    numTestVecs = int(totalRows * hoRatio)

    # Rows after the hold-out slice serve as training data for every test row.
    trainingRows = normMat[numTestVecs:totalRows, :]
    trainingLabels = datingLabels[numTestVecs:totalRows]

    errorCount = 0.0
    for i in range(numTestVecs):
        expected = datingLabels[i]
        classifierResult = classify0(normMat[i, :], trainingRows, trainingLabels, 3)
        print("the classifier came back with: %d, the real answer is: %d" % (classifierResult, expected))
        if classifierResult != expected:
            errorCount += 1.0

    errorRate = errorCount / float(numTestVecs)
    print("the total error rate is: %f" % errorRate)
    print(errorCount)
# 从文本中读取图片,格式化为向量
def img2vector(filename):
    """Read a 32x32 text image into a 1x1024 row vector.

    The first 32 characters of each of the first 32 lines are converted
    with ``int`` and stored row after row, so the character at line ``i``,
    column ``j`` lands at index ``32 * i + j``.

    Parameters:
        filename: path of the text file holding the image.

    Returns:
        An array of shape ``(1, 1024)`` with the digit values.

    Raises:
        IndexError: if one of the first 32 lines has fewer than 32 characters.
        ValueError: if a character is not a valid integer digit.
    """
    side = 32
    returnVect = zeros((1, side * side))
    # The context manager closes the file even if parsing fails.
    with open(filename) as fr:
        for i in range(side):
            lineStr = fr.readline()
            for j in range(side):
                returnVect[0, side * i + j] = int(lineStr[j])
    return returnVect
# knn-手写数字识别,1024个距离计算
def handwritingClassTest():
    """Evaluate the kNN classifier on the handwritten-digit text images.

    Every file in 'digits/trainingDigits' becomes one 1024-value training
    row. Its label is the integer before the first '_' of the file name
    (with the part after the first '.' dropped). Every file in
    'digits/testDigits' is then classified with k=3 against that training
    set. Prints each prediction next to the real label, then the number of
    errors and the error rate.
    """
    trainingFileList = listdir('digits/trainingDigits')
    m = len(trainingFileList)
    trainingMat = zeros((m, 1024))
    hwLabels = []
    for row, fileNameStr in enumerate(trainingFileList):
        # Drop the extension, then keep the text before the first underscore.
        hwLabels.append(int(fileNameStr.split('.')[0].split('_')[0]))
        trainingMat[row, :] = img2vector('digits/trainingDigits/%s' % fileNameStr)

    testFileList = listdir('digits/testDigits')
    mTest = len(testFileList)
    errorCount = 0.0
    for fileNameStr in testFileList:
        classNumStr = int(fileNameStr.split('.')[0].split('_')[0])
        vectorUnderTest = img2vector('digits/testDigits/%s' % fileNameStr)
        classifierResult = classify0(vectorUnderTest, trainingMat, hwLabels, 3)
        print("the classifier came back with: %d, the real answer is: %d" % (classifierResult, classNumStr))
        if classifierResult != classNumStr:
            errorCount += 1.0

    print("\nthe total number of errors is: %d" % errorCount)
    print("\nthe total error rate is: %f" % (errorCount / float(mTest)))
if __name__ == "__main__":
    # Run the digit-recognition evaluation only when this file is executed as
    # a script, so importing the module for its functions triggers no file
    # I/O.
    handwritingClassTest()