算法步骤:
1.计算输入数据与已知标签样本集的距离
2.将步骤1计算的距离值进行排序
3.选取排序后距离最小的前K个样本(K近邻算法中K的由来)
4.计算这K个取值中不同标签出现的频率,将频率最大的标签作为本次预测的结果
这里选取的距离计算公式为欧式距离,欧式距离也是最常见的距离计算公式:$d(x, y) = \sqrt{\sum_{i=1}^{n} (x_i - y_i)^2}$
数据集 datingTestSet2.txt(每行为3个特征值和1个类别标签;下文代码按制表符 \t 分隔各字段进行读取)
40920 8.326976 0.953952 3
14488 7.153469 1.673904 2
26052 1.441871 0.805124 1
75136 13.147394 0.428964 1
38344 1.669788 0.134296 1
72993 10.141740 1.032955 1
35948 6.830792 1.213192 3
42666 13.276369 0.543880 3
67497 8.631577 0.749278 1
35483 12.273169 1.508053 3
50242 3.723498 0.831917 1
63275 8.385879 1.669485 1
5569 4.875435 0.728658 2
51052 4.680098 0.625224 1
77372 15.299570 0.331351 1
43673 1.889461 0.191283 1
61364 7.516754 1.269164 1
69673 14.239195 0.261333 1
15669 0.000000 1.250185 2
28488 10.528555 1.304844 3
6487 3.540265 0.822483 2
37708 2.991551 0.833920 1
22620 5.297865 0.638306 2
28782 6.593803 0.187108 3
19739 2.816760 1.686209 2
36788 12.458258 0.649617 3
sklearn直接调库实现
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
def file2matrix(filename):
    """Load a tab-separated data file into a feature matrix and a label list.

    Every non-blank line must contain three numeric feature fields followed
    by a class label, separated by tab characters. The label is either a
    string of digits or one of the names 'largeDoses', 'smallDoses',
    'didntLike', which are mapped to 3, 2 and 1 respectively.

    Args:
        filename: Path of the text file to read.

    Returns:
        A tuple ``(returnMat, classLabelVector)`` where ``returnMat`` is a
        float array with one row of three features per data line and
        ``classLabelVector`` is the list of integer labels in file order.

    Raises:
        ValueError: If a feature field is not a number, or a label is neither
            a digit string nor one of the known label names.
    """
    love_dictionary = {'largeDoses': 3, 'smallDoses': 2, 'didntLike': 1}
    # The context manager closes the file even when parsing fails below.
    with open(filename) as fr:
        # Blank lines (e.g. a trailing newline at the end of the file) are
        # skipped so they neither crash the parser nor create empty rows.
        rows = [line.strip().split('\t') for line in fr if line.strip()]

    returnMat = np.zeros((len(rows), 3))  # one row of three features per line
    classLabelVector = []
    for index, listFromLine in enumerate(rows):
        returnMat[index, :] = [float(value) for value in listFromLine[0:3]]
        label = listFromLine[-1]
        if label.isdigit():
            classLabelVector.append(int(label))
        elif label in love_dictionary:
            classLabelVector.append(love_dictionary[label])
        else:
            # Fail loudly instead of silently storing None as a label.
            raise ValueError(
                'unknown class label %r on data line %d' % (label, index + 1)
            )
    return returnMat, classLabelVector
# Script entry point: evaluate scikit-learn's k-nearest-neighbours classifier
# on the dating data set with a simple hold-out split.
if __name__ == '__main__':
    # The first 50% of the rows are treated as unlabeled test data and the
    # remaining 50% as the labelled sample set.
    hoRatio = 0.50
    datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')  # load data set from file

    # Min-max normalisation: fit() derives the per-column min and max,
    # transform() applies the scaling.
    # NOTE(review): the scaler is fitted on all rows, test rows included.
    scaler = MinMaxScaler()
    scaler = scaler.fit(datingDataMat)
    normMat = scaler.transform(datingDataMat)

    m = normMat.shape[0]
    numTestVecs = int(m * hoRatio)

    model = KNeighborsClassifier(n_neighbors=3)
    model.fit(normMat[numTestVecs:m, :], datingLabels[numTestVecs:m])
    y = model.predict(normMat[0:numTestVecs, :])

    # Accuracy: a non-zero difference between predicted and true label is an error.
    result = y - datingLabels[0:numTestVecs]
    error = int(np.count_nonzero(result))
    print(f'准确率:{((1-error/len(result))*100)}%')
import numpy as np
from os import listdir
def classify0(inX, dataSet, labels, k):
    """Classify one sample by majority vote among its k nearest neighbours.

    Distances are Euclidean. When several labels receive the same number of
    votes, the label whose first vote came from the closest neighbour wins.

    Args:
        inX: Feature vector of the sample to classify.
        dataSet: 2-D array with one labelled sample per row.
        labels: Sequence of labels, ``labels[i]`` belonging to ``dataSet[i]``.
        k: Number of nearest neighbours that take part in the vote.

    Returns:
        The label with the most votes among the k nearest neighbours.

    Raises:
        ValueError: If ``k`` is smaller than 1 or larger than the number of
            rows in ``dataSet``.
    """
    dataSetSize = dataSet.shape[0]
    if not 1 <= k <= dataSetSize:
        raise ValueError(
            'k must be between 1 and the number of samples (%d), got %r'
            % (dataSetSize, k)
        )

    # Broadcasting subtracts inX from every row; no explicit tiling needed.
    diffMat = np.asarray(inX) - dataSet
    distances = (diffMat ** 2).sum(axis=1) ** 0.5
    sortedDistIndicies = distances.argsort()

    # Tally votes in order of increasing distance.
    classCount = {}
    for neighbourIndex in sortedDistIndicies[:k]:
        voteIlabel = labels[neighbourIndex]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1

    # max() returns the first label with the highest count in insertion order,
    # the same winner a stable descending sort by count would put first, and
    # it avoids the `operator` module, which this file never imports.
    return max(classCount, key=classCount.get)
def file2matrix(filename):
    """Load a tab-separated data file into a feature matrix and a label list.

    Every non-blank line must contain three numeric feature fields followed
    by a class label, separated by tab characters. The label is either a
    string of digits or one of the names 'largeDoses', 'smallDoses',
    'didntLike', which are mapped to 3, 2 and 1 respectively.

    Args:
        filename: Path of the text file to read.

    Returns:
        A tuple ``(returnMat, classLabelVector)`` where ``returnMat`` is a
        float array with one row of three features per data line and
        ``classLabelVector`` is the list of integer labels in file order.

    Raises:
        ValueError: If a feature field is not a number, or a label is neither
            a digit string nor one of the known label names.
    """
    love_dictionary = {'largeDoses': 3, 'smallDoses': 2, 'didntLike': 1}
    # The context manager closes the file even when parsing fails below.
    with open(filename) as fr:
        # Blank lines (e.g. a trailing newline at the end of the file) are
        # skipped so they neither crash the parser nor create empty rows.
        rows = [line.strip().split('\t') for line in fr if line.strip()]

    returnMat = np.zeros((len(rows), 3))  # one row of three features per line
    classLabelVector = []
    for index, listFromLine in enumerate(rows):
        returnMat[index, :] = [float(value) for value in listFromLine[0:3]]
        label = listFromLine[-1]
        if label.isdigit():
            classLabelVector.append(int(label))
        elif label in love_dictionary:
            classLabelVector.append(love_dictionary[label])
        else:
            # Fail loudly instead of silently storing None as a label.
            raise ValueError(
                'unknown class label %r on data line %d' % (label, index + 1)
            )
    return returnMat, classLabelVector
def autoNorm(dataSet):
    """Min-max scale every column of ``dataSet`` to the range [0, 1].

    Each value is mapped to ``(value - column_min) / (column_max - column_min)``.
    A column whose values are all equal has a range of zero; such a column is
    mapped to all zeros instead of producing NaN from a 0/0 division.

    Args:
        dataSet: NumPy array with one sample per row and one feature per
            column (assumed 2-D).

    Returns:
        A tuple ``(normDataSet, ranges, minVals)``: the scaled array, the
        per-column ``max - min`` (zero for constant columns) and the
        per-column minimum.
    """
    minVals = dataSet.min(0)
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    # Divide constant columns by 1 rather than 0; the returned `ranges`
    # still reports the true (zero) range.
    safeRanges = np.where(ranges == 0, 1, ranges)
    # Broadcasting applies the per-column offset and scale to every row.
    normDataSet = (dataSet - minVals) / safeRanges
    return normDataSet, ranges, minVals
def datingClassTest(hoRatio=0.50, filename='datingTestSet2.txt', k=3):
    """Evaluate the hand-written kNN classifier with a hold-out split.

    The data file is loaded and normalised, the first ``hoRatio`` share of
    the rows is classified against the remaining rows, and each prediction,
    the overall accuracy and the number of errors are printed.

    Args:
        hoRatio: Fraction of the rows (taken from the top of the file) used
            as test samples; the rest form the labelled sample set.
        filename: Path of the data file passed to ``file2matrix``.
        k: Number of neighbours passed to ``classify0``.

    Raises:
        ValueError: If the split leaves no test samples, since no accuracy
            can be computed then.
    """
    datingDataMat, datingLabels = file2matrix(filename)  # load data set from file
    normMat, ranges, minVals = autoNorm(datingDataMat)
    m = normMat.shape[0]
    numTestVecs = int(m * hoRatio)
    if numTestVecs == 0:
        # Avoid an opaque ZeroDivisionError in the accuracy computation below.
        raise ValueError(
            'hold-out split of %d rows with ratio %r leaves no test samples'
            % (m, hoRatio)
        )

    errorCount = 0.0
    for i in range(numTestVecs):
        # Rows [numTestVecs, m) are the labelled samples; row i is the query.
        classifierResult = classify0(
            normMat[i, :], normMat[numTestVecs:m, :], datingLabels[numTestVecs:m], k
        )
        print("返回的分类结果是: %d, 真实的分类标签是: %d" % (classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1.0
    print('准确率为{:.2f}%'.format((1-errorCount / float(numTestVecs))*100))
    print(errorCount)
# Script entry point: run the hold-out evaluation only when this file is
# executed directly, not when it is imported as a module.
if __name__ == '__main__':
    datingClassTest()