简介
最近邻算法有一个明显的缺陷:对噪声数据过于敏感。为了解决这一问题,我们可以把未知样本周边的多个最近样本都计算在内,扩大参与决策的样本量,以避免个别数据直接决定决策结果。为此,我们引入K-最近邻算法(KNN)。
KNN 算法的实现步骤(TOP-K)
- 随机选择一个训练样本,计算未知样本与该训练样本之间的距离(dist)。
- 得到目前K个最邻近样本中的最大距离(maxdist)。
- 如果当前最近邻样本数小于K个,直接将该训练样本加入最近邻样本集合;否则,如果距离dist小于最大距离maxdist,就将该训练样本加入K-最近邻样本集合,同时删除最大距离maxdist对应的训练样本。
- 重复前三步,直到未知样本和所有训练样本的距离都算完。
- 统计K个最近邻样本中每个类别出现的次数。
- 选择出现频率最大的类别作为未知样本的类别。
KNN 算法举例
KNN 算法
距离dist = 0; 最大距离maxdist = 0; K = 3;
训练样本集合S = {S1(3, 104), S2(2, 100), S3(1, 81), S4(101, 10), S5(99, 5), S6(98, 2)}
未知样本S'(18, 90)
K最近邻样本集合K-sample = {}
训练样本选择顺序order = {S2, S5, S4, S1, S6, S3}
1) S2
dist = |S'S2| = 18.87
K-sample = {S2}
2) S5
dist = |S'S5| = 117.4
K-sample = {S2, S5}
3) S4
dist = |S'S4| = 115.28
K-sample = {S2, S5, S4}
4) S1
dist = |S'S1| = 20.52
maxdist = 117.4
K-sample = {S2, S4, S1}
5) S6
dist = |S'S6| = 118.93
maxdist = 115.28
K-sample = {S2, S4, S1}
6) S3
dist = |S'S3| = 19.24
maxdist = 115.28
K-sample = {S2, S1, S3}
Coding: TOP-K
#coding:utf-8
import numpy as np
from random import shuffle
'''
Created on Aug 25, 2020
@author: jhckn
code: KNN(top-k)
'''
# 创建训练数据集
def createTrainingDataSet():
    """Build the six-sample toy data set used by the top-k KNN demo.

    Returns:
        A ``(featureSet, classSet)`` tuple of dicts keyed by sample id 1-6.
        ``featureSet`` maps each id to its two-element feature list and
        ``classSet`` maps the same id to its class label
        (``'Romance'`` or ``'Action'``).
    """
    # One record per training sample: (sample id, features, class label).
    records = [
        (1, [3, 104], 'Romance'),
        (2, [2, 100], 'Romance'),
        (3, [1, 81], 'Romance'),
        (4, [101, 10], 'Action'),
        (5, [99, 5], 'Action'),
        (6, [98, 2], 'Action'),
    ]
    featureSet = {sample_id: features for sample_id, features, _ in records}
    classSet = {sample_id: label for sample_id, _, label in records}
    return featureSet, classSet
# unknownSample表示待测样本,randomOrder表示样本序列顺序
# 得到判定测试样本类别的K个训练样本
def knn(unknownSample, randomOrder, featureSet, classSet, k):
    """Select the k training samples nearest to ``unknownSample`` (top-k scan).

    Samples are visited in the order given by ``randomOrder``. The first k
    samples are accepted unconditionally; every later sample replaces the
    currently farthest kept sample when it is strictly closer.

    Distances are tracked in a dict keyed by sample id, so ids may be any
    hashable values; they no longer have to be the contiguous integers 1..N.

    Args:
        unknownSample: feature vector of the sample to classify.
        randomOrder: sequence of sample ids giving the visiting order.
        featureSet: dict mapping sample id -> feature vector.
        classSet: dict mapping sample id -> class label.
        k: number of neighbours to keep.

    Returns:
        Dict mapping the id of each kept neighbour to its class label. It
        holds at most k entries, and is empty when ``k <= 0``.
    """
    k_sample = {}
    if k <= 0:
        return k_sample

    target = np.array(unknownSample)
    kept_dist = {}  # sample id -> distance, for the neighbours currently kept
    for sample_id in randomOrder:
        if sample_id in kept_dist:
            # A repeated id is already a neighbour; do not let it evict others.
            continue
        # Compute the Euclidean distance once and reuse it below.
        dist = np.linalg.norm(target - np.array(featureSet.get(sample_id)))
        if len(kept_dist) < k:
            kept_dist[sample_id] = dist
            k_sample[sample_id] = classSet.get(sample_id)
            continue
        farthest = max(kept_dist, key=kept_dist.get)
        # Strict comparison: on a tie the sample that was kept first stays.
        if dist < kept_dist[farthest]:
            del kept_dist[farthest]
            del k_sample[farthest]
            kept_dist[sample_id] = dist
            k_sample[sample_id] = classSet.get(sample_id)
    return k_sample
# 统计k_sample中的类别信息,判断测试样本的类别
def judgeClass(k_sample):
    """Print the majority class among the selected neighbours.

    Args:
        k_sample: dict mapping sample id -> class label.

    Prints ``'Romance!'`` or ``'Action!'`` depending on which label has more
    votes. Prints nothing when the two counts are equal. Returns ``None``.
    """
    votes = list(k_sample.values())
    romance_votes = votes.count('Romance')
    action_votes = votes.count('Action')
    if romance_votes > action_votes:
        print('Romance!')
    elif action_votes > romance_votes:
        print('Action!')
if __name__ == '__main__':
    # Demo run: classify the unknown sample (18, 90) with k = 3, visiting the
    # six training samples in a random order.
    featureSet, classSet = createTrainingDataSet()
    x = [18, 90]
    s = list(range(1, 7))
    shuffle(s)
    k_sample = knn(x, s, featureSet, classSet, 3)
    print(k_sample)
    judgeClass(k_sample)
字符文本识别(TXT格式)KNN分类
# -*- coding:utf-8 -*-
from os import listdir
from numpy import *
import numpy as np
import operator
import datetime
'''
Created on Aug 25, 2020
@author: jhckn
code: KNN(top-k)
'''
def KNN(test_data, train_data, train_label, k):
    """Classify ``test_data`` by majority vote among its k nearest training rows.

    Distances are Euclidean, computed between ``test_data`` and every row of
    ``train_data``.

    Args:
        test_data: feature vector of the sample to classify; its length must
            match the number of columns of ``train_data``.
        train_data: 2-D array-like with one training sample per row.
        train_label: sequence of labels; ``train_label[i]`` belongs to row i.
        k: number of neighbours that vote. Values larger than the number of
            training rows are clamped to that number.

    Returns:
        The label with the most votes. On a tie, the label whose first voter
        is nearest to ``test_data`` wins.

    Raises:
        ValueError: if ``k`` is smaller than 1 or the training set is empty.
    """
    train_matrix = np.asarray(train_data)
    sample_count = train_matrix.shape[0]
    if sample_count == 0:
        raise ValueError("train_data must contain at least one sample")
    if k < 1:
        raise ValueError("k must be at least 1")
    # Never ask for more neighbours than there are training samples.
    neighbour_count = min(k, sample_count)

    # Broadcasting subtracts the test vector from every training row.
    offsets = train_matrix - np.asarray(test_data)
    all_distances = np.sqrt(np.sum(np.square(offsets), axis=1))
    nearest_first = all_distances.argsort()[:neighbour_count]

    # Count votes in nearest-first order, so dict order doubles as tie-break.
    classCount = {}
    for row in nearest_first:
        voteIlabel = train_label[row]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1
    # max() returns the first label that reaches the highest count.
    return max(classCount, key=classCount.get)
#文本向量化 32x32 -> 1x1024
def img2vector(filename):
    """Flatten a 32x32 text digit image into a flat list of 1024 ints.

    Reads the first 32 lines of ``filename`` and converts the first 32
    characters of each line with ``int``.

    Args:
        filename: path of the text file to read.

    Returns:
        A list of 1024 ints in row-major order.

    Raises:
        IndexError: if a line has fewer than 32 characters, or the file has
            fewer than 32 lines.
        ValueError: if a character is not a digit.
    """
    returnVect = []
    # The context manager closes the file even when parsing fails.
    with open(filename) as fr:
        for _ in range(32):
            lineStr = fr.readline()
            for j in range(32):
                returnVect.append(int(lineStr[j]))
    return returnVect
#从文件名中解析分类数字
def classnumCut(fileName):
    """Extract the class digit encoded at the start of a sample file name.

    The expected name format is ``<class>_<index>.txt``, for example
    ``0_3.txt`` yields ``0``.

    Args:
        fileName: bare file name (no directory part).

    Returns:
        The text before the first ``'_'`` of the part before the first
        ``'.'``, converted to ``int``.
    """
    stem = fileName.partition('.')[0]
    label_text = stem.partition('_')[0]
    return int(label_text)
#构建训练集数据向量,及对应分类标签向量
def trainingDataSet():
    """Load every sample file found in the ``trainingDigits`` directory.

    Returns:
        A ``(train_label, train_data)`` tuple. ``train_label`` is a list with
        the class parsed from each file name. ``train_data`` is an
        ``(m, 1024)`` array whose row i holds the vectorised content of
        file i, in ``listdir`` order.
    """
    file_names = listdir('trainingDigits')
    train_data = zeros((len(file_names), 1024))
    train_label = []
    for row, file_name in enumerate(file_names):
        # The label comes from the file name, the features from its content.
        train_label.append(classnumCut(file_name))
        train_data[row, :] = img2vector('trainingDigits/%s' % file_name)
    return train_label, train_data
#测试函数
def main():
    """Run the KNN digit classifier over ``testDigits`` and print a report.

    Prompts for k, loads the training set, classifies every file in
    ``testDigits``, then prints each prediction, the number of test files,
    the number of errors, the error rate and the elapsed time.
    """
    # NOTE: the timer starts before the prompt, so the reported elapsed time
    # includes the time spent waiting for the user to enter k.
    t1 = datetime.datetime.now()
    Nearest_Neighbor_number = int(input('选取最邻近的K个值,K='))
    train_label, train_data = trainingDataSet()
    testFileList = listdir('testDigits')
    error_sum = 0
    test_number = len(testFileList)
    for i, fileNameStr in enumerate(testFileList):
        # The true class is encoded in the test file name.
        classNumStr = classnumCut(fileNameStr)
        test_data = img2vector('testDigits/%s' % fileNameStr)
        classifierResult = KNN(test_data, train_data, train_label, Nearest_Neighbor_number)
        print("第", i+1, "组:", "预测值:", classifierResult, "真实值:", classNumStr)
        if classifierResult != classNumStr:
            error_sum += 1.0
    print("\n测试集总数为:", test_number)
    print("测试出错总数:", error_sum)
    if test_number:
        print("\n错误率:", error_sum/float(test_number)*100, '%')
    else:
        # With no test files the error rate is undefined; report that
        # instead of dividing by zero.
        print("\n错误率:", 'N/A')
    t2 = datetime.datetime.now()
    print('耗 时 = ', t2 - t1)


# Run the benchmark only when the file is executed as a script.
if __name__ == "__main__":
    main()
字符文本识别(png格式)KNN分类
# encoding=utf-8
# 图像转文本(png-txt)
from PIL import Image
import numpy as np
from os import listdir
def img2txt(img_path, txt_name):
    """Convert an image file into a 32x32 grid of integers saved as text.

    The image is converted to Pillow mode ``'1'``, resized to 32x32 and
    written with ``np.savetxt`` using the ``%d`` format and no delimiter, so
    each output line holds the 32 values of one row back to back.
    Presumably every value is 0 or 1 because mode ``'1'`` is bilevel;
    verify against the Pillow documentation.

    Args:
        img_path: path of the source image.
        txt_name: path of the text file to write.
    """
    source_image = Image.open(img_path)
    bilevel_image = source_image.convert('1')
    small_image = bilevel_image.resize((32, 32))
    pixels = np.asarray(small_image)
    np.savetxt(txt_name, pixels, fmt='%d', delimiter='')
# Convert every image in training_img into a text grid in trainingDigits,
# keeping the part of the file name before the first '.' and adding '.txt'.
trainingFileList = listdir('training_img')
print(len(trainingFileList))
for i, fileNameStr in enumerate(trainingFileList):
    fileStr = fileNameStr.split('.')[0]
    source_path = 'training_img/%s' % fileNameStr
    target_path = 'trainingDigits/%s' % fileStr + '.txt'
    img2txt(source_path, target_path)
# -*- coding:utf-8 -*-
from os import listdir
from numpy import *
import numpy as np
import operator
import datetime
'''
Created on Aug 25, 2020
@author: jhckn
code: KNN(top-k)
'''
def KNN(test_data, train_data, train_label, k):
    """Classify ``test_data`` by majority vote among its k nearest training rows.

    Distances are Euclidean, computed between ``test_data`` and every row of
    ``train_data``.

    Args:
        test_data: feature vector of the sample to classify; its length must
            match the number of columns of ``train_data``.
        train_data: 2-D array-like with one training sample per row.
        train_label: sequence of labels; ``train_label[i]`` belongs to row i.
        k: number of neighbours that vote. Values larger than the number of
            training rows are clamped to that number.

    Returns:
        The label with the most votes. On a tie, the label whose first voter
        is nearest to ``test_data`` wins.

    Raises:
        ValueError: if ``k`` is smaller than 1 or the training set is empty.
    """
    train_matrix = np.asarray(train_data)
    sample_count = train_matrix.shape[0]
    if sample_count == 0:
        raise ValueError("train_data must contain at least one sample")
    if k < 1:
        raise ValueError("k must be at least 1")
    # Never ask for more neighbours than there are training samples.
    neighbour_count = min(k, sample_count)

    # Broadcasting subtracts the test vector from every training row.
    offsets = train_matrix - np.asarray(test_data)
    all_distances = np.sqrt(np.sum(np.square(offsets), axis=1))
    nearest_first = all_distances.argsort()[:neighbour_count]

    # Count votes in nearest-first order, so dict order doubles as tie-break.
    classCount = {}
    for row in nearest_first:
        voteIlabel = train_label[row]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1
    # max() returns the first label that reaches the highest count.
    return max(classCount, key=classCount.get)
#文本向量化 32x32 -> 1x1024
def img2vector(filename):
    """Flatten a 32x32 text digit image into a flat list of 1024 ints.

    Reads the first 32 lines of ``filename`` and converts the first 32
    characters of each line with ``int``.

    Args:
        filename: path of the text file to read.

    Returns:
        A list of 1024 ints in row-major order.

    Raises:
        IndexError: if a line has fewer than 32 characters, or the file has
            fewer than 32 lines.
        ValueError: if a character is not a digit.
    """
    returnVect = []
    # The context manager closes the file even when parsing fails.
    with open(filename) as fr:
        for _ in range(32):
            lineStr = fr.readline()
            for j in range(32):
                returnVect.append(int(lineStr[j]))
    return returnVect
#从文件名中解析分类数字
def classnumCut(fileName):
    """Extract the class digit encoded at the start of a sample file name.

    The expected name format is ``<class>_<index>.txt``, for example
    ``0_3.txt`` yields ``0``.

    Args:
        fileName: bare file name (no directory part).

    Returns:
        The text before the first ``'_'`` of the part before the first
        ``'.'``, converted to ``int``.
    """
    stem = fileName.partition('.')[0]
    label_text = stem.partition('_')[0]
    return int(label_text)
#构建训练集数据向量,及对应分类标签向量
def trainingDataSet():
    """Load every sample file found in the ``trainingDigits`` directory.

    Returns:
        A ``(train_label, train_data)`` tuple. ``train_label`` is a list with
        the class parsed from each file name. ``train_data`` is an
        ``(m, 1024)`` array whose row i holds the vectorised content of
        file i, in ``listdir`` order.
    """
    file_names = listdir('trainingDigits')
    train_data = zeros((len(file_names), 1024))
    train_label = []
    for row, file_name in enumerate(file_names):
        # The label comes from the file name, the features from its content.
        train_label.append(classnumCut(file_name))
        train_data[row, :] = img2vector('trainingDigits/%s' % file_name)
    return train_label, train_data
#测试函数
def main():
    """Run the KNN digit classifier over ``testDigits`` and print a report.

    Prompts for k, loads the training set, classifies every file in
    ``testDigits``, then prints each prediction, the number of test files,
    the number of errors, the error rate and the elapsed time.
    """
    # NOTE: the timer starts before the prompt, so the reported elapsed time
    # includes the time spent waiting for the user to enter k.
    t1 = datetime.datetime.now()
    Nearest_Neighbor_number = int(input('选取最邻近的K个值,K='))
    train_label, train_data = trainingDataSet()
    testFileList = listdir('testDigits')
    error_sum = 0
    test_number = len(testFileList)
    for i, fileNameStr in enumerate(testFileList):
        # The true class is encoded in the test file name.
        classNumStr = classnumCut(fileNameStr)
        test_data = img2vector('testDigits/%s' % fileNameStr)
        classifierResult = KNN(test_data, train_data, train_label, Nearest_Neighbor_number)
        print("第", i+1, "组:", "预测值:", classifierResult, "真实值:", classNumStr)
        if classifierResult != classNumStr:
            error_sum += 1.0
    print("\n测试集总数为:", test_number)
    print("测试出错总数:", error_sum)
    if test_number:
        print("\n错误率:", error_sum/float(test_number)*100, '%')
    else:
        # With no test files the error rate is undefined; report that
        # instead of dividing by zero.
        print("\n错误率:", 'N/A')
    t2 = datetime.datetime.now()
    print('耗 时 = ', t2 - t1)


# Run the benchmark only when the file is executed as a script.
if __name__ == "__main__":
    main()