# -*- coding: utf-8 -*-
"""
Created on Thu Nov 14 19:29:08 2019
@author: HTING
"""
# Import the scientific computing package (NumPy)
import numpy as np
# Import the operator module
import operator
# =============================================================================
# Import the os module
import os
# =============================================================================
# Create the data set and labels
def createDataSet():
    """Build the small toy training set used by the k-NN examples.

    Returns:
        tuple: ``(group, labels)`` where ``group`` is a 4x2 NumPy array of
        sample points and ``labels`` is a 4-tuple holding the class label
        (``'A'`` or ``'B'``) of the corresponding row of ``group``.
    """
    group = np.array([
        [1.0, 1.1],
        [1.0, 1.0],
        [0, 0],
        [0, 0.1],
    ])
    # Plain ASCII quotes: typographic quotes are not valid string delimiters.
    labels = ('A', 'A', 'B', 'B')
    return group, labels
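'''
Parameters:
    inX - data to classify (test sample)
    dataSet - data used for training (training set)
    labels - labels of the training data set
    k - number of nearest points (smallest distances) to select
return:
    sortedClassCount[0][0] - predicted class of the input data
'''
# k-nearest neighbors algorithm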
def classify0(inX, k, dataSet=None, labels=None):
    """Classify ``inX`` by majority vote among its k nearest training samples.

    Parameters:
        inX - the point to classify (sequence or array of feature values)
        k - number of nearest neighbours that take part in the vote
        dataSet - optional training samples, one row per sample; when both
            dataSet and labels are omitted the result of createDataSet()
            is used, which is the original behaviour
        labels - optional class labels, one per row of dataSet
    Returns:
        The label that occurs most often among the k nearest samples.
    Raises:
        ValueError - if only one of dataSet / labels is supplied
        IndexError - if k is less than 1 (nobody votes) or greater than the
            number of training samples
    """
    if dataSet is None and labels is None:
        dataSet, labels = createDataSet()
    elif dataSet is None or labels is None:
        raise ValueError("dataSet and labels must be supplied together")

    # Euclidean distance from inX to every training row.  Broadcasting
    # subtracts inX from each row, so no explicit np.tile copy is needed.
    diffMat = np.asarray(dataSet) - np.asarray(inX)
    sqDistances = (diffMat ** 2).sum(axis=1)
    distances = sqDistances ** 0.5

    # Row indices ordered from the nearest sample to the farthest one.
    sortedDistIndicies = distances.argsort()

    # Tally the labels of the k nearest samples.
    classCount = {}
    for i in range(k):
        voteIlabel = labels[sortedDistIndicies[i]]
        # dict.get supplies 0 the first time a label is seen.
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1

    # Order (label, votes) pairs by vote count, highest first.
    sortedClassCount = sorted(
        classCount.items(), key=operator.itemgetter(1), reverse=True
    )
    return sortedClassCount[0][0]
# A second way to implement classify0()
def classify1(inX, dataSet, Labels, k):
    """Classify ``inX`` with k-NN using broadcasting and collections.Counter.

    Parameters:
        inX - the point to classify (sequence or array of feature values)
        dataSet - training samples, one row per sample
        Labels - class labels, one per row of dataSet
        k - number of nearest neighbours that take part in the vote; the
            slice ``[0:k]`` is truncated when k exceeds the number of rows,
            in which case every sample votes
    Returns:
        The most common label among the selected neighbours.
    Raises:
        IndexError - if no neighbour is selected (e.g. k == 0)
    """
    # Kept local: the visible module header does not import collections.
    import collections

    # np.asarray lets plain lists/tuples be used as well as arrays.
    samples = np.asarray(dataSet)
    query = np.asarray(inX)

    # Euclidean distance to every row; broadcasting expands query per row.
    dist = np.sum((query - samples) ** 2, axis=1) ** 0.5

    # Labels of the k nearest samples.
    k_labels = [Labels[index] for index in dist.argsort()[0:k]]

    # The most frequent label is the predicted class.
    label = collections.Counter(k_labels).most_common(1)[0][0]
    return label