KNN算法:又称k-近邻算法,是数据分类最为简单有效的算法之一,但是每次使用KNN算法时需要所有的训练样本与每个测试样本进行计算,若训练集较大则占用较多的内存,导致训练和测试的效率不高。
注意:KNN样本集中的每一个样本可能由多个特征数据组成,这些特征的数量级可能各不相同,这对最终结果影响很大,因此需要将数据归一化:newValue = (oldValue - min) / range,其中每个特征的range = max - min。采用距离作为KNN的度量标准时,可以适当地设置各个特征在计算距离时的权重。此外,某些特殊情况下可以使用余弦相似度作为KNN的度量标准。
KNN算法实现:
分类函数的输入参数:1、单个测试样本数据。2、处理后的训练集数据。3、训练集对应的labels。4、k值(取与测试样本距离最近的k个训练样本,统计其中出现次数最多的label,即为分类结果)。
import numpy as np
import operator
import os
from mpl_toolkits import mplot3d
import matplotlib
import matplotlib.pyplot as plt
def classify0(inX, dataSet, labels, k):
    """Classify one sample by majority vote among its k nearest neighbours.

    Distances are Euclidean and are computed between ``inX`` and every row
    of ``dataSet``.

    Args:
        inX: Feature vector to classify; a sequence of n values or an
            array that broadcasts against the rows of ``dataSet``
            (for example shape ``(n,)`` or ``(1, n)``).
        dataSet: Training samples as an ``(m, n)`` array, one sample per row.
        labels: Sequence of m labels; ``labels[i]`` belongs to ``dataSet[i]``.
        k: Number of nearest neighbours that vote. Values larger than m are
            clamped to m so that every training sample votes once.

    Returns:
        The label that received the most votes. On a tie, the label whose
        first vote came from the nearer neighbour wins.

    Raises:
        ValueError: If ``dataSet`` has no rows or ``k`` is smaller than 1.
    """
    dataSet = np.asarray(dataSet)
    sampleCount = dataSet.shape[0]
    if sampleCount == 0:
        raise ValueError("dataSet must contain at least one training sample")
    if k < 1:
        raise ValueError("k must be a positive integer")

    # Broadcasting subtracts every training row from inX without first
    # materialising a tiled (m, n) copy of inX.
    diffMat = np.asarray(inX) - dataSet
    distances = (diffMat ** 2).sum(axis=1) ** 0.5

    # Indices of the training rows, closest first, limited to the voters.
    nearest = distances.argsort()[:min(k, sampleCount)]

    classCount = {}
    for idx in nearest:
        voteLabel = labels[idx]
        classCount[voteLabel] = classCount.get(voteLabel, 0) + 1

    # max() returns the first maximal item, and the dict keeps insertion
    # order, so ties go to the label seen first among the nearest rows.
    return max(classCount.items(), key=operator.itemgetter(1))[0]
def autoNorm(dataSet):
    """Rescale every feature column of ``dataSet`` to the range [0, 1].

    Each value is mapped to ``(value - column_min) / (column_max - column_min)``.

    Args:
        dataSet: Array of samples, one sample per row and one feature per
            column.

    Returns:
        A tuple ``(normDataSet, ranges, minVals)``:
        ``normDataSet`` is the rescaled data with the same shape as the input,
        ``ranges`` is ``max - min`` of every column, and ``minVals`` is the
        minimum of every column. A column whose values are all equal has a
        range of 0; its normalised values are 0.0 rather than NaN, while the
        returned ``ranges`` still reports the true value 0.
    """
    dataSet = np.asarray(dataSet)
    minVals = dataSet.min(0)
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals

    # A constant column would otherwise yield 0 / 0 = NaN; dividing its
    # all-zero numerator by 1 gives 0 instead.
    divisors = np.where(ranges == 0, 1, ranges)

    # Broadcasting applies the per-column min and divisor to every row.
    normDataSet = (dataSet - minVals) / divisors
    return normDataSet, ranges, minVals
def file2matrix(filename, n=3):
    """Load a tab-separated sample file into a feature matrix and label list.

    Every non-blank line must hold at least ``n`` numeric feature values
    followed by an integer class label in the last field.

    Args:
        filename: Path of the text file to read.
        n: Number of leading fields on each line that are features.

    Returns:
        A tuple ``(returnMat, classLabelVector)``: ``returnMat`` is a float
        array of shape ``(number_of_samples, n)`` and ``classLabelVector`` is
        the list of integer labels in file order.

    Raises:
        ValueError: If a field cannot be parsed as a number, or a line does
            not provide values that fit into ``n`` columns.
    """
    # The context manager closes the file even when parsing fails below.
    with open(filename) as fr:
        stripped = (line.strip() for line in fr)
        # Blank lines (for example a trailing newline) carry no sample.
        rows = [line.split('\t') for line in stripped if line]

    returnMat = np.zeros((len(rows), n))
    classLabelVector = []
    for index, fields in enumerate(rows):
        # Honour n instead of a fixed slice of three feature columns.
        returnMat[index, :] = [float(value) for value in fields[0:n]]
        classLabelVector.append(int(fields[-1]))
    return returnMat, classLabelVector
def datingClassTest(
        filename=r"C:\Users\tycho-wjr\Desktop\数据结构和算法相关\机器学习\机器学习实战中文\MLiA_SourceCode\machinelearninginaction\Ch02\datingTestSet2.txt",
        hoRatio=0.10,
        k=3):
    """Measure the kNN error rate on the dating data with a hold-out split.

    The first ``hoRatio`` fraction of the normalised samples is used as the
    test set; the remaining samples form the training set. One line is
    printed per test sample, followed by the overall error rate.

    Args:
        filename: Path of the tab-separated dating data file. Defaults to the
            location that was previously hard-coded in this function.
        hoRatio: Fraction of the samples held out for testing.
        k: Number of neighbours passed to ``classify0``.
    """
    datingDataMat, datingLabels = file2matrix(filename)
    normMat, ranges, minVals = autoNorm(datingDataMat)
    m = normMat.shape[0]
    numTestVecs = int(m * hoRatio)

    # The training part is the same for every test sample, so slice it once.
    trainMat = normMat[numTestVecs:m, :]
    trainLabels = datingLabels[numTestVecs:m]

    errorCount = 0.0
    for i in range(numTestVecs):
        classifierResult = classify0(normMat[i, :], trainMat, trainLabels, k)
        print("the classifier came back with: %d,the real answer is :%d" % (classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1.0

    # With no held-out samples there is nothing to divide by; report 0.
    errorRate = errorCount / float(numTestVecs) if numTestVecs else 0.0
    print("the total error rate is:%f" % errorRate)
def classifyperson(
        filename=r"C:\Users\tycho-wjr\Desktop\数据结构和算法相关\机器学习\机器学习实战中文\MLiA_SourceCode\machinelearninginaction\Ch02\datingTestSet2.txt"):
    """Ask for one person's three features and print the predicted liking.

    The answers are normalised with the minimum and range of the training
    data and classified against the normalised training matrix with k = 3.

    Args:
        filename: Path of the tab-separated dating data file. Defaults to the
            location that was previously hard-coded in this function.
    """
    resultList = ["not at all", "in small doses", "in large doses"]
    percentTats = float(input("percentage if time spent playing video games?"))
    ffMiles = float(input("frequent flier miles earned per year?"))
    iceCream = float(input("liters of icecream consumed per year?"))
    datingDataMat, datingLabels = file2matrix(filename)
    normMat, ranges, minVals = autoNorm(datingDataMat)

    # The query must list the features in the column order of the data file.
    # NOTE(review): the order (flier miles, game percentage, ice cream) is
    # taken from the axis labels of the plotting snippet at the end of this
    # file -- confirm against the data file.
    inArr = np.array([ffMiles, percentTats, iceCream])

    # The query is normalised, so it has to be compared with the normalised
    # training matrix; comparing it with the raw data mixes two scales.
    classifiResult = classify0((inArr - minVals) / ranges, normMat, datingLabels, 3)

    # Labels in the data file are 1-based, resultList is 0-based.
    print("you will probably like this person:", resultList[classifiResult - 1])
def img2vector(filename):
    """Read a 32x32 text image of digits into a single row vector.

    The first 32 characters of each of the first 32 lines are converted
    with ``int`` and stored row after row.

    Args:
        filename: Path of the text file holding the image.

    Returns:
        A float array of shape ``(1, 1024)``; the character at line ``i``,
        column ``j`` is stored at position ``[0, i * 32 + j]``.

    Raises:
        IndexError: If one of the first 32 lines has fewer than 32 characters.
        ValueError: If a character cannot be converted to an integer.
    """
    returnVect = np.zeros((1, 1024))
    # The context manager closes the file even if a conversion fails.
    with open(filename) as fr:
        for row in range(32):
            line = fr.readline()
            for col in range(32):
                returnVect[0, row * 32 + col] = int(line[col])
    return returnVect
def handwritingClassTest(filepath1, filepath2):
    """Evaluate the kNN classifier on directories of 32x32 digit text images.

    Every file name must start with its integer label followed by an
    underscore; the label is parsed from the part before the first ``_``.
    One line is printed per test image, followed by the totals and the
    overall error rate.

    Args:
        filepath1: Directory containing the training images.
        filepath2: Directory containing the test images.
    """
    train_files = os.listdir(filepath1)
    train_labels = []
    train_imgs = np.zeros((len(train_files), 1024))
    for i, name in enumerate(train_files):
        train_imgs[i] = img2vector(os.path.join(filepath1, name))
        train_labels.append(int(name.split("_")[0]))

    test_files = os.listdir(filepath2)
    test_labels = []
    error = 0
    for j, name in enumerate(test_files):
        test_img = img2vector(os.path.join(filepath2, name))
        test_labels.append(int(name.split("_")[0]))
        test_result = classify0(test_img, train_imgs, train_labels, 3)
        print("image %d is probably %d ,its label is %d" % (j, test_result, test_labels[j]))
        if test_labels[j] != test_result:
            error += 1

    print("total images is %d ,total error images is %d" % (len(test_files), error))
    # An empty test directory has no error rate to compute; report 0
    # instead of dividing by zero.
    error_rate = error / float(len(test_files)) if test_files else 0.0
    print("total error rate is %f" % error_rate)
# Directories of the handwritten-digit test and training images
# (absolute paths specific to the original author's machine).
testing_path=r"C:\Users\tycho-wjr\Desktop\数据结构和算法相关\机器学习\机器学习实战中文\MLiA_SourceCode\machinelearninginaction\Ch02\digits\testDigits"
training_path=r"C:\Users\tycho-wjr\Desktop\数据结构和算法相关\机器学习\机器学习实战中文\MLiA_SourceCode\machinelearninginaction\Ch02\digits\trainingDigits"
# Runs at module level: train on training_path and evaluate on testing_path.
handwritingClassTest(training_path,testing_path)
# Alternative entry points for the dating data set, currently disabled:
#datingClassTest()
#classifyperson()
# The bare string literal below holds disabled 3-D scatter-plot code; it is
# evaluated as a string expression and never executed as code.
# NOTE(review): it refers to datingDataMat and datingDataLabel, which are not
# defined at module level in the visible code -- confirm before re-enabling.
"""
fig = plt.figure()
ax = mplot3d.Axes3D(fig)
#plt.xlabel("x air caption")
#plt.ylabel("y game caption")
#plt.zlabel("z ice-cream caption")
ax.scatter(xs=datingDataMat[:, 0], ys=datingDataMat[:, 1], zs=datingDataMat[:, 2], s=15 * np.array(datingDataLabel),
c=15 * np.array(datingDataLabel))
plt.show()
"""