听了好几年的K近邻算法 今天终于接触到了
原理很简单 将样本映射为多维空间中的点
无标签新样本 由空间中与其最近的K个点中数量最多的标签来定义
以下为暴力实现 高效算法留坑
from sklearn import datasets
def cmp(elem):
    """Return the first item of *elem*, for use as a ``key=`` function.

    Despite its name this is a sort key, not a two-argument comparator:
    the script passes it to ``list.sort(key=cmp)`` so that
    ``[distance, label]`` pairs are ordered by their distance.
    """
    return elem[0]
# Load the iris dataset: `data` is the 2-D feature matrix (one row per
# sample) and `target` holds the class label of each row.
iris = datasets.load_iris()
data = iris.data
target = iris.target
num_data, num_feature = data.shape
num_target = len(iris.target_names)  # number of distinct class labels

# Deterministic hold-out split: every 5th sample (index 0, 5, 10, ...) goes
# to the test set and the rest to the training set. Per the original note
# this yields 120 training samples and 30 test samples.
data_train, target_train = [], []
data_test, target_test = [], []
for i, (row, label) in enumerate(zip(data, target)):
    if i % 5 != 0:
        data_train.append(row)
        target_train.append(label)
    else:
        data_test.append(row)
        target_test.append(label)
num_train, num_test = len(data_train), len(data_test)
K = 10  # only the 10 nearest neighbours vote on each prediction
# P = 2: Euclidean distance is used below. The Manhattan alternative (P = 1)
# would accumulate abs(diff) per feature and skip the square root.
count = 0  # number of correctly classified test samples

# Never ask for more neighbours than there are training samples; indexing
# the sorted neighbour list past its end would raise IndexError.
num_neighbors = min(K, num_train)

for i in range(num_test):
    query = data_test[i]

    # Brute force: measure the distance from the query to every training
    # sample and remember it together with that sample's label.
    neighbors = []
    for j in range(num_train):
        val = 0.0
        for a, b in zip(query, data_train[j]):
            diff = a - b
            val += diff * diff
        neighbors.append([val ** 0.5, target_train[j]])

    # The sort is stable, so equally distant samples keep training order.
    neighbors.sort(key=cmp)

    # Majority vote among the nearest neighbours.
    vote = [0] * num_target
    for _, label in neighbors[:num_neighbors]:
        vote[int(label)] += 1

    # max() returns the first maximum, so a tie goes to the lowest class
    # index; -1 is the "no class" answer when there are no classes at all.
    ans = max(range(num_target), key=vote.__getitem__, default=-1)

    if ans == target_test[i]:
        count += 1

# Report "<correct predictions> <number of test samples>".
print(count, num_test)