算法思路:
已知一批带有类别标签的数据点,当传入一个未知数据点时,先计算它与每个已知数据点之间的距离,再将距离由小到大排序,取距离最小的前K个已知数据点,统计这K个点中出现次数最多的类别,未知数据点就归属于该类别。
算法代码:
import numpy as np
from sklearn import datasets
import matplotlib.pyplot as plt
from collections import Counter
from PIL import Image
# Index of the sample that will be classified.
img_num = 99

# Load the digits dataset.
digits = datasets.load_digits()

# Image form of the selected sample, taken from ``digits.images``.
test_img = digits.images[img_num]

# Optional preview of the selected image (disabled):
# plt.figure('image')
# plt.imshow(test_img)
# plt.show()
#拓展测试样本维度
def test_img_function(img):
vectors = np.zeros((1797,64))
for i in range(1797):
vectors[i] = img
return vectors
# Count how many times each class label occurs.
def count(label, n_classes=10):
    """Return the number of occurrences of each class label.

    Every element of ``label`` is counted, whatever the length of the
    sequence; an empty sequence yields all zeros.

    Args:
        label: Iterable of integer class labels, each expected to lie in
            ``[0, n_classes)``.
        n_classes: Number of distinct classes, i.e. the length of the
            returned array. Defaults to 10.

    Returns:
        A float array of length ``n_classes`` where entry ``c`` is the
        number of times label ``c`` appears in ``label``.

    Raises:
        IndexError: If a label is ``>= n_classes``.
    """
    label_count = np.zeros(n_classes)
    for target in label:
        label_count[target] += 1
    return label_count
# ---- k-nearest-neighbour classification of sample ``img_num`` ----
K = 100  # number of nearest neighbours that take part in the vote

# Replicate the flattened query sample so it lines up with digits.data.
test_img = test_img_function(digits.data[img_num])

# Squared Euclidean distance from the query to every sample. The square
# root is skipped because it does not change the ordering.
distance = ((test_img - digits.data) ** 2).sum(axis=1)

# Sample indices ordered from nearest to farthest.
distance_sort = distance.argsort()
# The query is itself a row of digits.data (distance 0). Drop it so that it
# cannot vote for its own label.
distance_sort = distance_sort[distance_sort != img_num]
num = distance_sort[:K]  # indices of the K nearest neighbours

# Labels of the K nearest neighbours.
label = list(digits.target[num])
labels = label[:K]

# Tally the votes per class and pick the class with the most votes.
label_count = count(labels)
label_count_sort = label_count.argsort()  # ascending by number of votes
test_label = label_count_sort[-1]

# NOTE: the output strings (including the "predeictd" spelling) are kept
# unchanged so the script's printed output format stays the same.
print("The predeictd label is :" + str(test_label))
print("The real label is: "+ str (digits.target[img_num]))