# Part 1: k-nearest neighbours (一、k-近邻)
# Remove every object that ls() reports in the current environment so the
# script starts from an empty workspace.
# NOTE(review): if this file is source()d into a live session it also deletes
# the caller's objects -- confirm that is intended.
rm(list=ls())
# 1. Read the digit data files.
# Load a digits table from `file` and mark its first column as the class label.
#
# Args:
#   file: path (or connection) handed straight to read.table() with its
#         default settings.
#
# Returns:
#   The parsed data frame with the first column renamed to "zipcode" and
#   converted to a factor; all other columns are left exactly as read.
read_digits <- function(file) {
  digits <- read.table(file)
  names(digits)[1] <- "zipcode"
  digits[[1]] <- factor(digits[[1]])
  digits
}
# Load the training and test digit sets from the working directory.
# The paths use a forward slash: inside an R string "\t" is the TAB escape, so
# a backslash-separated ".\train.txt" would name ".<TAB>rain.txt" instead of
# the intended file (and likewise for the test file).
file <- "train.txt"
train <- read_digits("./train.txt")
test <- read_digits("./test.txt")
# Classify each row of `test` by majority vote among its k nearest rows of
# `train`.
#
# Args:
#   train: data frame (or matrix) whose first column is the class label and
#          whose remaining columns are the features used for the distance.
#   test:  same column layout as `train`; its first column is not used.
#   k:     number of nearest training rows that vote (default 1, must be >= 1).
#   d:     distance name forwarded to dist() as `method`
#          (default "euclidean").
#
# Returns:
#   A numeric vector with one predicted label per row of `test`
#   (numeric(0) when `test` has no rows). Labels are converted with
#   as.numeric(), so non-numeric labels come back as NA. Ties are resolved by
#   which.max(), i.e. the first label in table() order wins.
predict_knn <- function(train, test, k = 1, d = "euclidean") {
  stopifnot(is.numeric(k), length(k) == 1L, k >= 1, nrow(train) >= 1L)
  n_test <- nrow(test)
  n_train <- nrow(train)
  if (n_test == 0L) {
    return(numeric(0))
  }

  # drop = FALSE keeps a single feature column as a one-column matrix.
  train_x <- as.matrix(train[, -1, drop = FALSE])
  test_x <- as.matrix(test[, -1, drop = FALSE])

  # One dist() call over the stacked rows replaces nrow(test) * nrow(train)
  # separate calls. Trade-off: dist() also computes the test-test and
  # train-train pairs, which are discarded by the slice below, so peak memory
  # grows with (nrow(test) + nrow(train))^2.
  all_d <- as.matrix(dist(rbind(test_x, train_x), method = d))
  distances <- all_d[seq_len(n_test), n_test + seq_len(n_train), drop = FALSE]

  labels <- train[, 1]
  vapply(seq_len(n_test), function(i) {
    votes <- table(labels[head(order(distances[i, ]), k)])
    as.numeric(names(votes))[which.max(votes)]
  }, numeric(1))
}
# Part 2: 10-fold cross-validated k-nearest neighbours (二、10折K近邻)
# Pairwise distances between all training rows (label column excluded),
# expanded from dist objects into full square matrices.
# alldistance1: Euclidean distance (the dist() default, spelled out here).
alldistance1 <- as.matrix(dist(train[, -1], method = "euclidean"))
# alldistance2: Manhattan distance.
alldistance2 <- as.matrix(dist(train[, -1], method = "manhattan"))
# Class labels of the training rows, in row order.
zipcode <- train[, 1]
# Predict a label for every row of a distance matrix by k-nearest-neighbour
# majority vote.
#
# Args:
#   x:     matrix of distances; row i holds the distances from query i to each
#          candidate neighbour (one column per candidate).
#   k:     number of nearest candidates that vote (must be >= 1).
#   label: labels of the candidates, in the same order as the columns of `x`.
#
# Returns:
#   A numeric vector with one prediction per row of `x` (numeric(0) when `x`
#   has no rows). Labels are converted with as.integer(), so they must be
#   integer-like. Ties go to the first label in table() order.
make_pred <- function(x, k, label) {
  stopifnot(is.numeric(k), length(k) == 1L, k >= 1)
  # seq_len() keeps a zero-row `x` from being iterated as 1:0.
  vapply(seq_len(nrow(x)), function(i) {
    nearest <- head(order(x[i, ]), k)
    votes <- table(label[nearest])
    # vapply() promotes the integer result to double, matching the numeric
    # vector this function has always returned.
    as.integer(names(votes)[which.max(votes)])
  }, numeric(1))
}
# Estimate the k-NN misclassification rate on the training set with 10-fold
# cross-validation over a precomputed distance matrix.
#
# Reads these objects from the enclosing environment: `train` (only its row
# count), `zipcode` (labels), `alldistance1` / `alldistance2` (square distance
# matrices) and the function make_pred().
#
# Args:
#   k: number of neighbours, passed to make_pred().
#   d: distance selector: 1 uses `alldistance1`, 2 uses `alldistance2`;
#      anything else is an error.
#
# Returns:
#   An unnamed list of two elements: [[1]] the overall error rate,
#   [[2]] the table of predictions (rows) against true labels (columns).
cv_error_knn <- function(k, d) {
  if (d == 1) {
    alldistance <- alldistance1
  } else if (d == 2) {
    alldistance <- alldistance2
  } else {
    stop("d must be 1 or 2", call. = FALSE)
  }

  # Row r is assigned to fold (r %% 10) + 1.
  splits <- (seq_len(nrow(train)) %% 10) + 1
  fold_preds <- vector("list", 10)
  fold_truth <- vector("list", 10)
  for (i in 1:10) {
    tests <- which(splits == i)
    # With fewer than 10 rows some folds are empty; there is nothing to score.
    if (length(tests) == 0L) {
      next
    }
    trains <- which(splits != i)
    # drop = FALSE keeps a matrix even when the fold has a single test row.
    sub <- alldistance[tests, trains, drop = FALSE]
    fold_preds[[i]] <- make_pred(sub, k, label = zipcode[trains])
    fold_truth[[i]] <- as.numeric(as.character(zipcode[tests]))
  }
  # Predictions and true labels are concatenated in the same fold order.
  preds <- unlist(fold_preds)
  dd <- unlist(fold_truth)

  err_overall <- mean(preds != dd)
  cm <- table(preds, dd)
  list(err_overall, cm)
}