一、实验目的
通过实验,掌握kNN分类算法的原理与使用方法
二、实验任务与要求
计算k(1,3,5,8)取不同值时的kNN的分类结果,并总结差异
三、code
1.code
import matplotlib.pyplot as plt
import heapq
import math
import copy
def pic_show():
    """Scatter-plot the two labelled classes and both test points.

    Returns:
        (black, red, test1, test2) -- each a list of (x, y) tuples;
        ``black``/``red`` are the training classes, ``test1``/``test2``
        the single-point test cases.
    """
    black = [(0.4, 4), (0.6, 6), (1, 5.5), (1, 7), (1.3, 3),
             (1.6, 4), (1.9, 6), (2, 2), (2, 6), (2, 7),
             (2.5, 1), (2.5, 3), (2.5, 4), (2.5, 5), (2.6, 6)]
    red = [(4.4, 3), (5.5, 4), (5.8, 3.8), (6, 2), (6, 5), (6.5, 1),
           (7, 5), (7.5, 2), (7.5, 4), (7.5, 6), (8, 3), (8, 7), (9, 3)]
    test1 = [(4, 5.5)]
    test2 = [(4.5, 2)]

    plt.figure(figsize=(8, 4))  # figsize: canvas size in inches
    bx, by = zip(*black)
    rx, ry = zip(*red)
    plt.scatter(bx, by, c='black')
    plt.scatter(rx, ry, c='red')
    plt.scatter(*zip(*test1), c='green', label='test1')
    plt.scatter(*zip(*test2), c='blue', label='test2')
    plt.legend()
    plt.show()
    return black, red, test1, test2
def distance(xy, test):
    """Euclidean distance between point ``xy`` and the point in ``test``.

    Args:
        xy: an (x, y) pair.
        test: a one-element list holding an (x, y) pair.

    Returns:
        The distance as a float.
    """
    dx = xy[0] - test[0][0]
    dy = xy[1] - test[0][1]
    return math.sqrt(dx * dx + dy * dy)
def kNN(nums, find_nums):
    """Return the indices and values of the ``find_nums`` smallest entries.

    Args:
        nums: list of distance values.
        find_nums: k, the number of nearest neighbours wanted.

    Returns:
        (min_num_index, min_number) -- two parallel lists: the values in
        ascending order and their positions in ``nums``. Equal values keep
        their original index order, and every index appears at most once.
    """
    # heapq.nsmallest over (index, value) pairs is stable (equivalent to
    # sorted(...)[:k]), so duplicate values get distinct indices in index
    # order. This replaces the original's two code paths: the unique-value
    # branch built on list.index, and a fragile duplicate-value branch that
    # deep-copied the list and overwrote found minima with a max+1 sentinel.
    pairs = heapq.nsmallest(find_nums, enumerate(nums), key=lambda p: p[1])
    min_num_index = [i for i, _ in pairs]
    min_number = [v for _, v in pairs]
    return min_num_index, min_number
# ---- driver: classify a chosen test point against the two classes ----
all_d = []
black, red, test1, test2 = pic_show()
black_l = len(black)
red_l = len(red)
# print(black, red, black_l, red_l)
samples = black + red  # renamed from ``all`` to avoid shadowing the builtin
print("全部数组为:", samples)
test = test1
# String comparison instead of eval(): eval on raw user input can execute
# arbitrary code, and it crashed on empty input even though the prompt
# promises test1 as the default.
choice = input("请输入你要选择的测试用例:1 or 2 :(默认为test1)").strip()
if choice == "2":
    test = test2
for xy in samples:
    all_d.append(distance(xy, test))
print("元组距离为:", all_d)
k = int(input("请输入k值:"))  # int(), not eval(), for the same safety reason
min_num_index, min_number = kNN(all_d, k)
print('索引:', min_num_index)
print('元素值:', min_number)
black_num = 0
red_num = 0
for i in min_num_index:
    # the first black_l entries of ``samples`` belong to the black class
    if i < black_l:
        black_num += 1
    else:
        red_num += 1
P_black = black_num / k
P_red = red_num / k
print("该文档属于black的概率为{}".format(P_black))
print("该文档属于red的概率为{}".format(P_red))
2.测试样例
这里测试文章取test1
k=1,3,5,8
3.实验结果
4.总结差异
当取不同k值时,所选取的不同类别文章个数不同,进而使得测试文章的所属类别的概率发生变化,将每篇测试文档分到训练集中离它最近的k篇文档所属类别中最多的那个类别。
需要注意的是:当训练集非常大的时候,kNN分类的精度很高;如果训练集很小,kNN可能效果很差。