K-Nearest Neighbors (KNN) Classification
Let's first go over how the algorithm works. It is actually very simple: Euclidean distance plus a vote among the nearest neighbors (the class with the highest frequency, i.e. the highest empirical probability, wins).
First, the Euclidean distance.
Let the vector $x = [x_1, x_2, x_3, \dots, x_n]$. Then the L2 norm of $x$ is

$$\|x\|_2 = \sqrt{x x^T} = \sqrt{x_1^2 + x_2^2 + \cdots + x_n^2}$$

and the Euclidean distance between two samples $a$ and $b$ is simply $\|a - b\|_2$.
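As a quick sanity check, here is a minimal sketch with plain NumPy (the two sample vectors are made up for illustration) showing that the library call np.linalg.norm, used in the implementation below, computes exactly this quantity:

import numpy as np

a = np.array([1.0, 2.0, 3.0])
b = np.array([4.0, 6.0, 3.0])

# Euclidean (L2) distance written out from the formula above
manual = np.sqrt(((a - b) ** 2).sum())
# The same value via the call used in the KNN class below
lib = np.linalg.norm(a - b)
print(manual, lib)  # 5.0 5.0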
For example, let K = 20. Here K is a hyperparameter, and choosing it well matters a great deal: it has a direct impact on the model's classification accuracy (a small sweep over K is shown at the end of this section).
Below, the iris dataset is used as an example.
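Before building the classifier it can help to take a quick look at the data; this is just the standard scikit-learn loader, so the shapes and names printed below come straight from load_iris:

from sklearn.datasets import load_iris

iris = load_iris()
print(iris.data.shape)     # (150, 4): 150 samples, 4 numeric features
print(iris.target_names)   # ['setosa' 'versicolor' 'virginica']
print(iris.feature_names)  # sepal/petal length and width, in cm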
from sklearn.metrics import recall_score, precision_score, f1_score, confusion_matrix
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

class KNN():
    def __init__(self, k):
        self.k = k

    def distance(self, X, x):
        # Euclidean (L2) distance from every stored training sample to the query x
        return np.linalg.norm(X - x, axis=1)

    def fit(self, X, Y):
        # KNN is a lazy learner: "fitting" just stores the training data
        self.X = np.array(X) if not isinstance(X, np.ndarray) else X
        self.Y = np.array(Y) if not isinstance(Y, np.ndarray) else Y
        self.labels = set(Y)

    def pred(self, X, isWeighted=False):
        res = []
        for x in X:
            dis = self.distance(self.X, x)
            vote = np.argsort(dis)[:self.k]  # indices of the k nearest training samples
            if isWeighted:
                # Weighted vote: closer neighbors count more
                temp = 1 / (dis[vote] + 1e-3)  # inverse distance (1e-3 avoids division by zero)
                weights = temp / temp.sum()    # normalize the weights
                ct = {}
                for (lab, score) in zip(self.Y[vote], weights):
                    if lab in ct:
                        ct[lab] += score
                    else:
                        ct[lab] = score
            # Equal-weight vote
            else:
                ct = Counter(self.Y[vote])
            res.append(max(ct, key=ct.get))  # class with the largest (weighted) vote
        return res

    def getEvaluation(self, ytrue, ypred):
        # With average="micro" these three scores all reduce to overall accuracy
        print("Precision", precision_score(ytrue, ypred, average="micro"))
        print("Recall", recall_score(ytrue, ypred, average="micro"))
        print("f1-score", f1_score(ytrue, ypred, average="micro"))
        confusmatrix = confusion_matrix(ytrue, ypred)
        sns.heatmap(confusmatrix, annot=True, cmap="Greens")
        plt.show()
iris = load_iris()
X, Y = iris.data, iris.target
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.3, random_state=42)

knn = KNN(10)
knn.fit(Xtrain, Ytrain)
mypred = knn.pred(Xtest, True)  # True -> distance-weighted voting
knn.getEvaluation(Ytest, mypred)
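To back up the earlier remark that the choice of K is crucial, here is a small sketch that reuses the KNN class and the train/test split defined above and compares the micro-averaged F1 score for a few values of K, with and without distance weighting (the particular K values are an arbitrary choice for illustration):

# Sweep a few values of K and compare equal-weight vs. distance-weighted voting
for k in (1, 3, 5, 10, 20, 40):
    model = KNN(k)
    model.fit(Xtrain, Ytrain)
    f1_equal = f1_score(Ytest, model.pred(Xtest, isWeighted=False), average="micro")
    f1_weight = f1_score(Ytest, model.pred(Xtest, isWeighted=True), average="micro")
    print(f"K={k:2d}  equal-weight F1={f1_equal:.3f}  weighted F1={f1_weight:.3f}")

A very small K makes the prediction sensitive to noisy neighbors, while a very large K blurs the class boundaries, so the scores typically peak somewhere in between.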