KNN算法的原理:KNN不存在显式的学习过程。对于一个测试样本,根据给定的距离度量和k值,找到距离该样本最近的k个训练样本,以这k个训练样本中得票最高的类别作为测试样本的类别。
三要素
距离的度量
np.linalg.norm(x-y,p) #p=1即曼哈顿距离,p=2即欧氏距离
k值的选择
k值过大,模型过于简单,容易欠拟合
k值过小,模型过拟合
通常k值选择较小的一个数,采用交叉验证法来选取最优的k值。
决策规则
通常采用少数服从多数的原则,即得票最多的类别作为测试样本的类别
python实现(线性查询比较)
import numpy as np
#naive
class Knn(object):
    """Brute-force k-nearest-neighbour classifier (linear scan).

    Parameters
    ----------
    k : int
        Number of neighbours to consult.
    p : int
        Order of the Minkowski distance (1 = Manhattan, 2 = Euclidean),
        passed straight to ``np.linalg.norm``.
    """

    def __init__(self, k=3, p=2):
        # number of neighbours used in the vote
        self.k = k
        # Minkowski order for the distance metric
        self.p = p

    def fit(self, X, y):
        """KNN has no explicit training phase; just memorise the data."""
        self.X = X
        self.y = y

    def predict(self, x_test):
        """Predict the label of a single sample by majority vote.

        Returns the most common label among the k nearest training
        samples.  (The original code used ``np.max`` on the neighbour
        labels, which returns the largest label *value* instead of the
        majority vote -- fixed here.)
        """
        # k nearest (index, sample) pairs, ordered by distance to x_test
        kNeighbers = sorted(
            enumerate(self.X),
            key=lambda xi: np.linalg.norm(xi[1] - x_test, self.p)
        )[:self.k]
        # the k nearest samples themselves
        self.neighber = [i[1] for i in kNeighbers]
        # their class labels
        self.neighberLabel = [self.y[i[0]] for i in kNeighbers]
        # majority vote: the most frequent label wins
        label = max(set(self.neighberLabel), key=self.neighberLabel.count)
        return label
准备数据及测试
from sklearn import datasets
from sklearn.model_selection import train_test_split
# Load the iris dataset and hold out 30% of the samples for testing.
X,y = datasets.load_iris(return_X_y=True)
x_train,x_test,y_train,y_test = train_test_split(X,y,test_size=0.3)
knn = Knn()
knn.fit(x_train,y_train)
# predicted class label for the first test sample
label = knn.predict(x_test[0])
# the k nearest training samples found by the last predict() call
knn.neighber
kd树
import heapq
class Node(object):
    """One kd-tree node: a stored sample, its label, and its subtrees.

    Attributes
    ----------
    data  : the sample vector stored at this node
    label : the class label of that sample
    sp    : index of the axis this node splits on
    left  : subtree with samples before the median on axis ``sp``
    right : subtree with samples after the median on axis ``sp``
    """

    def __init__(self, data, label, sp=0, left=None, right=None):
        # stash everything verbatim; Node is a passive record
        self.data, self.label = data, label
        self.sp = sp
        self.left, self.right = left, right
class KdTree(object):
    """Kd-tree supporting k-nearest-neighbour queries.

    Built from a sample matrix ``X`` (n_samples x n_features) and a label
    vector ``y``; the split axis cycles through the features level by level.
    """

    def __init__(self, X, y):
        # number of features: the splitting axis cycles modulo this
        self.dim = X.shape[1]
        self.root = self.createTree(X, y, 0)

    def createTree(self, dataset, y, sp):
        """Recursively build the subtree for ``dataset``, splitting on axis sp.

        The median sample along axis ``sp`` becomes this node's point;
        samples before the median go left, samples after it go right.
        """
        if len(dataset) == 0:
            return None
        # sort (original_index, sample) pairs along the current split axis
        datasetSort = sorted(enumerate(dataset), key=lambda d: d[1][sp])
        mid = len(datasetSort) // 2
        data = datasetSort[mid][1]
        label = y[datasetSort[mid][0]]
        nxt = (sp + 1) % self.dim
        left = self.createTree(
            np.array([x[1] for x in datasetSort[:mid]]),
            np.array([y[x[0]] for x in datasetSort[:mid]]),
            nxt)
        right = self.createTree(
            np.array([x[1] for x in datasetSort[mid + 1:]]),
            np.array([y[x[0]] for x in datasetSort[mid + 1:]]),
            nxt)
        return Node(data, label, sp, left, right)

    def nearest(self, x, near_k=3, p=2):
        """Return the ``near_k`` nearest ``(data, label)`` pairs to point x.

        Maintains a max-heap of negated distances so the current worst
        candidate is always at the heap top and cheap to evict.
        """
        # Heap entries are (-distance, tie_id, node).  The tie_id makes
        # entries totally orderable: without it, two equal distances would
        # make heapq compare Node objects and raise TypeError.
        heap = [(-np.inf, i, None) for i in range(near_k)]
        tie = [near_k]  # mutable counter shared with the closure below

        def visit(node):
            if node is None:
                return
            # signed offset of the query point from this node's split plane
            dis = x[node.sp] - node.data[node.sp]
            # descend the side of the plane that contains the query first
            visit(node.right if dis > 0 else node.left)
            cur_dis = np.linalg.norm(node.data - x, p)
            heapq.heappushpop(heap, (-cur_dis, tie[0], node))
            tie[0] += 1
            # explore the far side only if the candidate hypersphere
            # (radius = current worst kept distance) crosses the plane
            if -heap[0][0] > abs(dis):
                visit(node.left if dis > 0 else node.right)

        visit(self.root)
        # nlargest orders by -distance descending, i.e. nearest first.
        # Placeholder slots are dropped in case the tree holds fewer than
        # near_k points (original code crashed on None.data there, and also
        # left a debug print of the raw heap).
        self.knn = [(entry[2].data, entry[2].label)
                    for entry in heapq.nlargest(near_k, heap)
                    if entry[2] is not None]
        return self.knn
测试
# Build a kd-tree from the training split and query the neighbours of the
# first test sample; nearest() returns (data, label) pairs.
knn = KdTree(x_train,y_train)
knn.nearest(x_test[0])
sklearn调用KNN分类器
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Prepare the iris data and hold out 30% of the samples for evaluation.
X,y = datasets.load_iris(return_X_y=True)
x_train,x_test,y_train,y_test = train_test_split(X,y,test_size=0.3)

# 3-nearest-neighbour classifier with Euclidean distance (p=2).
kn = KNeighborsClassifier(n_neighbors=3,p=2)
kn.fit(x_train,y_train)      # fit the model on the training split
kn.predict(x_test)           # predicted labels for the test split
kn.score(x_test,y_test)      # mean accuracy on the test split
kn.predict_proba(x_test)     # per-class probability estimates
# (the original cell repeated the last three calls verbatim -- removed)