以《机器学习实战》中的一段代码为例：
import numpy as np
import operator
"""创建数据集"""
def createDataSet():
    """Build the small toy training set used by the kNN example.

    Returns:
        A ``(group, labels)`` pair. ``group`` is a 4x2 NumPy array holding
        one two-feature sample per row, and ``labels`` is a list of four
        class names where ``labels[i]`` is the class of ``group[i]``.
    """
    # Two samples near (1, 1) belong to class 'A'; two near (0, 0) to 'B'.
    group = np.array([[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]])
    labels = ['A', 'A', 'B', 'B']
    return group, labels
"""k-近邻算法"""
def classify0(X, dataset, labels, k):
    """Classify ``X`` by majority vote among its ``k`` nearest samples.

    Nearness is measured with the Euclidean distance between ``X`` and
    every row of ``dataset``.

    Args:
        X: Feature vector to classify (array-like with one value per
            feature).
        dataset: Training features, one sample per row (2-D array-like).
        labels: Sequence of class labels; ``labels[i]`` is the class of
            row ``i`` of ``dataset``.
        k: Number of nearest neighbours that take part in the vote. Must
            satisfy ``1 <= k <= number of samples``.

    Returns:
        The label that occurs most often among the ``k`` nearest samples.
        When several labels share the top count, the one whose first vote
        came from the nearer neighbour is returned.

    Raises:
        ValueError: If ``dataset`` is not 2-D, if ``labels`` does not have
            one entry per sample, or if ``k`` is outside the valid range.
    """
    # Accept plain nested lists as well as ndarrays.
    samples = np.asarray(dataset, dtype=float)
    query = np.asarray(X, dtype=float)

    if samples.ndim != 2:
        raise ValueError("dataset must be 2-D with one sample per row")
    sample_count = samples.shape[0]
    if len(labels) != sample_count:
        raise ValueError(
            "labels must contain exactly one entry per dataset row "
            "(got %d labels for %d rows)" % (len(labels), sample_count)
        )
    if not 1 <= k <= sample_count:
        raise ValueError(
            "k must be between 1 and the number of samples (%d), got %r"
            % (sample_count, k)
        )

    # Broadcasting subtracts the query from every row, so no tiled copy of
    # X is needed. The square root is skipped because it does not change
    # the ordering of the distances.
    squared_distances = np.sum((samples - query) ** 2, axis=1)

    # A stable sort keeps equally distant samples in dataset order, which
    # makes the result deterministic.
    nearest = np.argsort(squared_distances, kind="stable")[:k]

    # Count how often each label occurs among the k nearest samples.
    votes = {}
    for index in nearest:
        label = labels[index]
        votes[label] = votes.get(label, 0) + 1

    # max() returns the first label with the highest count in insertion
    # order, i.e. ties go to the label that was seen first.
    return max(votes, key=votes.get)
# Build the toy training set, then show the feature matrix followed by the
# label list (each on its own output line).
group, labels = createDataSet()
print(group, labels, sep="\n")
out:
[[1. 1.1]
[1. 1. ]
[0. 0. ]
[0. 0.1]]
['A', 'A', 'B', 'B']
# Classify the point (0, 0) using its 3 nearest training samples; the
# recorded output for this call is 'B'.
classify0([0, 0], group, labels, k=3)
out:
'B'