#前几篇的总结居然格式错误,找时间重新整理。
一.调包
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
# Train scikit-learn's k-nearest-neighbours classifier on the toy movie data.
knn = KNeighborsClassifier()
# Each row is one movie: [number of fight scenes, number of kiss scenes].
data = np.array([[3, 104], [2, 100], [1, 81], [101, 10], [99, 5], [98, 2]])
# Class of each row of `data`.
labels = np.array([1, 1, 1, 2, 2, 2])
knn.fit(data, labels)
# predict() expects a 2-D array of shape (n_samples, n_features); a bare
# 1-D sample such as [18, 90] is rejected, so wrap it in an outer list.
knn.predict([[18, 90]])
二.根据原理编写
import numpy as np
from numpy import *
import operator
def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its ``k`` nearest neighbours.

    Distances are Euclidean.

    Args:
        inX: Feature vector to classify (sequence of numbers).
        dataSet: 2-D array-like of training samples, one sample per row.
        labels: Sequence of labels; ``labels[i]`` belongs to row ``i`` of
            ``dataSet``.
        k: Number of nearest neighbours that take part in the vote.

    Returns:
        The label with the highest vote count. When several labels tie, the
        one that was encountered first while walking the neighbours from
        nearest to farthest is returned (the sort below is stable).

    Raises:
        IndexError: If ``k`` is 0 or exceeds the number of rows in ``dataSet``.
    """
    samples = np.asarray(dataSet)
    # Broadcasting subtracts inX from every row, so no explicit tiling of
    # inX into a (rows, features) matrix is required.
    diffMat = samples - np.asarray(inX)
    sqDistances = (diffMat ** 2).sum(axis=1)
    distances = sqDistances ** 0.5
    # Row indices ordered from the smallest distance to the largest.
    sortedDistIndicies = distances.argsort()

    # Count how often each label occurs among the k closest rows.
    classCount = {}
    for i in range(k):
        voteIlabel = labels[sortedDistIndicies[i]]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1

    # Order (label, count) pairs by count, most frequent first.
    sortedClassCount = sorted(
        classCount.items(), key=operator.itemgetter(1), reverse=True
    )
    return sortedClassCount[0][0]
调用
# `group` is not defined anywhere in this file; the sample matrix is named
# `data`. With the six sample movies and their labels this call returns 2.
classify0([2, 0], data, labels, 3)
对for循环的解释:
import numpy as np
from numpy import *
import operator
# Step-by-step walkthrough of what classify0 does, using k = 6 (all rows).
# Each row is one movie: [number of fight scenes, number of kiss scenes].
data = np.array([[3, 104], [2, 100], [1, 81], [101, 10], [99, 5], [98, 2]])
# Class of each row of `data`.
labels = np.array([1, 1, 1, 2, 2, 2])
inX = [8, 10]
dataSetSize = data.shape[0]
# Repeat inX once per row so it can be subtracted from the whole matrix.
diffMat = np.tile(inX, (dataSetSize, 1)) - data
sqDiffMat = diffMat ** 2
sqDistances = sqDiffMat.sum(axis=1)
distances = sqDistances ** 0.5
distances
# array([ 94.1328848 , 90.19977827, 71.34423593, 93.        ,
#         91.13725912, 90.35485598])
sortedDistIndicies = distances.argsort()
sortedDistIndicies  # row indices ordered from smallest to largest distance
# array([2, 1, 5, 4, 3, 0], dtype=int32)
# The vote counter must exist before the loop can call .get() on it.
classCount = {}
for i in range(6):
    voteIlabel = labels[sortedDistIndicies[i]]
    # Add one vote for this neighbour's label (starting from 0 if unseen).
    classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1
sortedClassCount = sorted(
    classCount.items(), key=operator.itemgetter(1), reverse=True
)
classCount
# {1: 3, 2: 3}
sortedClassCount
# [(1, 3), (2, 3)]
# Demonstrates the counting idiom used in the voting loop:
# dict.get(key, 0) yields 0 for an unseen key, so each assignment adds 1.
cc = {}
cc[1] = cc.get(1, 0) + 1
cc
# {1: 1}
cc[1] = cc.get(1, 0) + 1
cc
# {1: 2}
cc[1] = cc.get(1, 0) + 1
cc
# {1: 3}  <- the count accumulates