# k-nearest neighbor (kNN) classification
# Task: classify points by majority vote of the K nearest training samples
# Load the fictitious data set; header = FALSE yields default column
# names V1..V5 (V1-V4 are features, V5 is the class label: 1 or -1).
# NOTE(review): rm(list = ls()) was removed — clearing the caller's
# workspace is a side effect a script should never have.
knnProp <- read.csv("kNN_fictious.csv", header = FALSE)
# Number of points classified per run (first part of each random split)
Prow <- 86
# Number of training samples per run (remainder of each random split)
Srow <- 258
# Number of random-resampling repetitions
forNum <- 30
#knn算法
# Classify each row of P by majority vote among its K nearest
# neighbours (Euclidean distance over features V1-V4) in S.
#
# P: data frame of points to classify (columns V1-V4 features, V5 true label)
# S: data frame of labelled training samples (same columns)
# K: number of neighbours to consider (use an odd K to avoid vote ties)
#
# Returns a matrix with one row per row of P:
#   columns 1..K     -- the K smallest distances, ascending
#   columns K+1..2K  -- labels of those K neighbours
#   column  2K+1     -- predicted label (1 if label-1 votes win, else -1)
#   column  2K+2     -- true label taken from P$V5
#   column  2K+3     -- 'T' if prediction matches truth, 'F' otherwise
# Storing 'T'/'F' coerces the whole matrix to character (original
# behaviour, relied on by Accuracy()).
knn <- function(P, S, K) {
  nP <- nrow(P)  # was hard-wired to the global Prow; derive it instead
  nS <- nrow(S)
  result <- array(0, dim = c(nP, 2 * K + 3))
  for (i in seq_len(nP)) {
    # Vectorized Euclidean distance from P[i, ] to every row of S
    # (replaces the original scalar inner loop over nS rows).
    d <- sqrt((P$V1[i] - S$V1)^2 +
              (P$V2[i] - S$V2)^2 +
              (P$V3[i] - S$V3)^2 +
              (P$V4[i] - S$V4)^2)
    nearest <- order(d)[seq_len(K)]
    result[i, 1:K] <- d[nearest]
    result[i, (K + 1):(2 * K)] <- S$V5[nearest]
    # Majority vote: strictly more label-1 neighbours => class 1, else -1
    votesA <- sum(S$V5[nearest] == 1)
    result[i, 2 * K + 1] <- if (votesA > K - votesA) 1 else -1
    result[i, 2 * K + 2] <- P$V5[i]
    # Comparison still works after the matrix turns character ("1" == "1")
    result[i, 2 * K + 3] <-
      if (result[i, 2 * K + 1] == result[i, 2 * K + 2]) 'T' else 'F'
  }
  return(result)
}
#求正确率
# Fraction of correctly classified rows in a knn() result matrix.
#
# distanceArray: matrix returned by knn()
# K: the K used to build it (locates the 'T'/'F' flag column at 2K+3)
#
# Returns the proportion of rows whose flag is 'T'.
Accuracy <- function(distanceArray, K) {
  # Was a scalar loop bounded by the global Prow — wrong for any matrix
  # with a different row count. mean() over the logical mask covers
  # exactly the rows actually present.
  flags <- distanceArray[, 2 * K + 3]
  accuracy <- mean(flags == 'T')
  return(accuracy)
}
#随机抽取样本循环30次,求30次正确率
# Repeat the experiment forNum times: randomly split `prop` into Prow
# points to classify and Srow training samples, run knn() with the
# given K, and average the accuracy across repetitions.
#
# prop: full data set (features V1-V4, label V5)
# K: number of neighbours passed to knn()
#
# Returns the mean classification accuracy over forNum random splits.
Sample <- function(prop, K) {
  # Bug fix: the original ignored `prop` and read the global knnProp,
  # and hard-coded the split sizes 86/344; derive them from Prow/Srow.
  total <- Prow + Srow
  accuracy <- numeric(forNum)  # preallocate instead of growing with c()
  for (i in seq_len(forNum)) {
    idx <- sample(seq_len(total), total)
    testSet <- prop[idx[seq_len(Prow)], ]
    trainSet <- prop[idx[(Prow + 1):total], ]
    accuracy[i] <- Accuracy(knn(testSet, trainSet, K), K)
  }
  rst <- mean(accuracy)
  return(rst)
}
# Mean accuracy over the resampling runs for several choices of K
rst1 <- Sample(knnProp, 3)
rst2 <- Sample(knnProp, 7)
rst3 <- Sample(knnProp, 11)
print(rst1)
print(rst2)
print(rst3)