主要参考《机器学习实战》
kNN
存在一个样本数据集合(训练集),且训练集中每个数据都存在标签。输入无标签的新数据后,将新数据的每个特征与训练集样本对应的特征进行比较,然后算法提取样本集中特征最相似数据(最近邻)的分类标签。
一般而言,只选择样本数据集中前k个最相似的数据,通常k<=20。最后,选择k个最相似数据中出现次数最多的分类,作为新数据的分类。
kNN实现一般流程
(1)收集数据;
(2)准备数据:距离计算所需要的数值,最好是结构化的数据格式;
(3)分析数据;
(4)训练算法:此步骤不适用于kNN;
(5)测试算法:计算错误率;
(6)使用算法:首先输入样本数据和结构化的输出结果,然后运行kNN判断输入数据属于哪个分类,最后由应用程序对计算出的分类结果执行后续处理。
代码
# -*- coding: utf-8 -*-
from numpy import *
import operator
import matplotlib
import matplotlib.pyplot as plt
import time
def CreateDataset():
    """Build the four-point, two-class toy data set used to try out kNN.

    Returns:
        A ``(group, labels)`` tuple. ``group`` is a 4x2 array holding one
        sample per row; ``labels`` is a list with the class label of each
        row, in the same order.
    """
    # Keep every point next to its label so the pairing is obvious.
    labelled_points = (
        ('A', [1.0, 1.1]),
        ('A', [1.0, 1.0]),
        ('B', [0.0, 0.0]),
        ('B', [0.0, 0.1]),
    )
    labels = [tag for tag, _ in labelled_points]
    group = array([point for _, point in labelled_points])
    return group, labels
"""
对未知属性的数据集中的每个点依次执行以下操作:
(1)计算已知类别数据集中的点与当前点之间的距离;
(2)按照距离递增次序排列;
(3)选取与当前点距离最小的k个点;
(4)确定前k个点所在类别的出现频率;
(5)返回前k个点中出现频率最高的类别作为当前点的预测分类;
"""
def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its k nearest training samples.

    Distances are Euclidean. The k samples closest to ``inX`` each cast one
    vote for their own label and the label with the most votes wins.

    Args:
        inX: Feature vector to classify (sequence or 1-D array).
        dataSet: Training samples, one sample per row (2-D array or nested
            sequence) with the same number of features as ``inX``.
        labels: Class labels; ``labels[i]`` is the label of row ``i`` of
            ``dataSet``.
        k: Number of nearest neighbours that take part in the vote.

    Returns:
        The winning label. When several labels collect the same number of
        votes, the one met first while walking the neighbours from nearest
        to farthest is returned.

    Raises:
        ValueError: If ``k`` is smaller than 1 or larger than the number of
            training samples.
    """
    samples = asarray(dataSet, dtype=float)
    dataSetSize = samples.shape[0]  # number of training samples
    if not 1 <= k <= dataSetSize:
        raise ValueError(
            "k must be between 1 and the number of training samples "
            "(%d), got %r" % (dataSetSize, k))

    # Broadcasting subtracts inX from every row; no need to tile() a copy.
    diffMat = samples - asarray(inX, dtype=float)
    # axis=1 adds the squared differences within each row, i.e. per sample.
    sqDistances = (diffMat ** 2).sum(axis=1)
    distances = sqDistances ** 0.5

    # argsort() yields sample indices ordered from nearest to farthest.
    sortedDisIndicies = distances.argsort()

    classCount = {}
    for neighbourIndex in sortedDisIndicies[:k]:
        voteIlabel = labels[neighbourIndex]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1

    # sorted() is stable, so equal counts keep their nearest-first order.
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]
def file2matrix(filename, numFeatures=3):
    """Load a tab-separated sample file into a feature matrix and label list.

    Every non-blank line holds one sample: its first ``numFeatures`` fields
    are numeric features and its last field is an integer class label.
    Blank lines (for example a trailing empty line) are skipped.

    Args:
        filename: Path of the text file to read.
        numFeatures: Number of leading fields to use as features.

    Returns:
        A ``(returnMat, classLabelVector)`` tuple. ``returnMat`` is a float
        array with one row per sample and ``numFeatures`` columns;
        ``classLabelVector`` is the list of integer labels in file order.

    Raises:
        ValueError: If a line has fewer than ``numFeatures`` fields or a
            field cannot be converted to a number.
    """
    featureRows = []
    classLabelVector = []
    with open(filename) as fr:
        # Single pass over the file; rows are collected first because the
        # number of usable (non-blank) lines is not known in advance.
        for lineNumber, line in enumerate(fr, start=1):
            line = line.strip()
            if not line:
                continue
            listFromLine = line.split('\t')
            features = listFromLine[0:numFeatures]
            if len(features) != numFeatures:
                raise ValueError(
                    "line %d of %s has %d field(s), expected at least %d"
                    % (lineNumber, filename, len(features), numFeatures))
            featureRows.append([float(value) for value in features])
            classLabelVector.append(int(listFromLine[-1]))

    # zeros() keeps the (0, numFeatures) shape when the file has no samples.
    returnMat = zeros((len(featureRows), numFeatures))
    if featureRows:
        returnMat[:] = featureRows
    return returnMat, classLabelVector
def autoNorm(dataSet):
    """Rescale every feature (column) of ``dataSet`` to the range [0, 1].

    Each value is mapped to ``(value - column_min) / (column_max - column_min)``.
    A column whose values are all equal has a range of zero; its values are
    mapped to 0 instead of being divided by zero.

    Args:
        dataSet: Samples, one per row (array or nested sequence of numbers).

    Returns:
        A ``(normDataSet, ranges, minVals)`` tuple: the rescaled data, the
        per-column ``max - min`` and the per-column minimum. ``ranges`` is
        returned as computed, so it is 0 for constant columns.
    """
    dataSet = asarray(dataSet)
    minVals = dataSet.min(0)
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    # Divide constant columns by 1 so they become 0 rather than NaN.
    safeRanges = where(ranges == 0, 1, ranges)
    # Broadcasting applies the per-column vectors to every row.
    normDataSet = (dataSet - minVals) / safeRanges
    return normDataSet, ranges, minVals
def datingClassTest(hoRatio=0.10, filename='datingTestSet2.txt', k=5):
    """Measure the kNN error rate on a hold-out slice of the dating data.

    The file is loaded with ``file2matrix`` and normalised with ``autoNorm``.
    The first ``int(m * hoRatio)`` samples are then classified with
    ``classify0`` against the remaining samples. Each prediction and the
    final error rate are printed.

    Args:
        hoRatio: Fraction of the samples, taken from the start of the file,
            that is held out for testing.
        filename: Path of the tab-separated data file.
        k: Number of neighbours passed to ``classify0``.

    Returns:
        The fraction of held-out samples that were misclassified.

    Raises:
        ValueError: If ``hoRatio`` leaves either the test set or the
            training set empty.
    """
    datingDataMat, datingLabels = file2matrix(filename)
    normMat, _, _ = autoNorm(datingDataMat)
    m = normMat.shape[0]
    numTestVecs = int(m * hoRatio)
    if not 0 < numTestVecs < m:
        raise ValueError(
            "hoRatio=%r gives %d test sample(s) out of %d; both the test "
            "set and the training set must be non-empty"
            % (hoRatio, numTestVecs, m))

    # The training part is the same for every test vector; slice it once.
    trainingMat = normMat[numTestVecs:m, :]
    trainingLabels = datingLabels[numTestVecs:m]

    errorCount = 0
    for i in range(numTestVecs):
        classifierResult = classify0(normMat[i, :], trainingMat,
                                     trainingLabels, k)
        print("the classifier came back with: %d, the real answer is: %d"
              % (classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1

    errorRate = errorCount / float(numTestVecs)
    print("the total error rate is: %f" % errorRate)
    return errorRate
if __name__ == "__main__":
    # Script entry point: run the hold-out evaluation on the dating data.
    #
    # Earlier experiments, kept for reference (uncomment to run them):
    #
    # 1) Classify a point against the toy data set.
    #     group, labels = CreateDataset()
    #     pro = classify0([0, 0], group, labels, 3)
    #     print(pro)
    #
    # 2) Load the dating data and inspect what file2matrix returns.
    #     datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
    #     print("returnMat:")
    #     print(datingDataMat)
    #     print("classLabelVector:")
    #     print(datingLabels)
    #
    # 3) Scatter-plot the first two features; marker size and colour are
    #    both scaled by the class label.
    #     fig = plt.figure()
    #     ax = fig.add_subplot(111)
    #     ax.scatter(datingDataMat[:, 0], datingDataMat[:, 1],
    #                15.0 * array(datingLabels), 15.0 * array(datingLabels))
    #     ax.set_xlabel("Frequent flier miles")
    #     ax.set_ylabel("Share of time spent playing video games")
    #     plt.show()
    #
    # 4) Normalise the features and print the result.
    #     normMat, ranges, minVals = autoNorm(datingDataMat)
    #     print(normMat)
    datingClassTest()