统计学习要素中有图7.14:
我一直在尝试做,到目前为止,我只能使用代码生成左上图“ Prediction Error”,该代码将生成图7.3所示的100个模拟(见图)。
图7.14仅需要图7.3中的线性回归图(右上方)。 代码如下所示(注意:case = 2用于kNN图,这是不必要的):
# generate dataset
genX
X = matrix(runif(n*p, 0, 1), ncol = p, nrow = n)
return(X)
}
# generate response
genY
n = nrow(X)
Y = numeric(n)
if (case == 1){ # for the left panel of fig. 7.3
Y = sapply(X[, 1], function(x) ifelse(x <= 0.5, 0, 1))
}
else {
Y = apply(X[, 1:10], 1, function(x) ifelse(sum(x) > 5, 1, 0))
}
return(Y)
}
## global parameters setting
ntest = 1000
percent = 0.75
B = 100 # the number of repetition
## case 2
seed = 1234
set.seed(seed)
X = genX()
Y = genY(X, case = 2)
X.test = genX(n = ntest)
Y.test = genY(X.test, case = 2)
## use leaps package to do best subset selection
library(leaps)
## predict test dataset by using the best subset model with size p
predict.regsub
which = summary(model)$which
coef.raw = coef(model, p)
# construct coef vector
if (length(coef.raw) == p+1)
{
coef.vec = numeric(1+ncol(X.test)) # include intercept
coef.vec[1] = coef.raw[1]
flag = 1
}
else
{
coef.vec = numeric(ncol(X.test))
flag = 0
}
j = flag + 1 # point to raw coef
for (i in c(1:ncol(X.test)) + flag){
if (which[p, i]){
coef.vec[i] = coef.raw[j]
j = j + 1
}
}
# for simplicity, consider intercept; and in fact, every regsubset models have intercept
pred = apply(cbind(1, X.test), 1, function(x) sum(x*coef.vec))
return(pred)
}
n = nrow(X)
## store all prediction
reg.pred.full = vector("list", 20)
for (i in 1:20){
reg.pred.full[[i]] = matrix(nrow = nrow(X.test),
ncol = B)
}
for (i in 1:B){
train.id = sample(n, n*percent)
X.train = X[train.id, ]
Y.train = Y[train.id]
reg.sub = regsubsets(X.train, Y.train, nvmax = 20)
for (j in 1:20){
reg.pred.full[[j]][, i] = predict.regsub(reg.sub, j, X.test)
}
}
## calculate bias2, variance, epe
reg.bias2.full = numeric(20)
reg.variance.full = numeric(20)
reg.epe.full = numeric(20)
cl.epe.full = numeric(20)
#apply
#sapply
for (i in 1:20){
bias2 = sapply(1:ntest, function(j) (mean(reg.pred.full[[i]][j, ]) - Y.test[j])^2)
variance = apply(reg.pred.full[[i]], 1, function(x) var(x))
# for regression
epe = sapply(1:ntest, function(j) mean((reg.pred.full[[i]][j, ] - Y.test[j])^2))
reg.variance.full[i] = mean(variance)
reg.bias2.full[i] = mean(bias2)
reg.epe.full[i] = mean(epe)
}
在这里,我开始对图7.14的左上平面进行编码。
#Prediction Error
error = matrix(0,nrow=20,ncol=100);
for (i in 1:20){
error[i,] = sapply(1:100, function(j) mean((reg.pred.full[[i]][,j] - Y.test)^2))
}
最后,这就是我遇到的问题。 我正在尝试使用10倍CV的右上方面板,但是在使用部分可用数据来拟合模型后,我不知道下一步该怎么做。 底部图也一样。
#10-fold-cross-validation
n_folds
folds_i
cv_err = matrix(0,nrow=n_folds,ncol=20);
for (k in 1:n_folds){
test_i
#what's next??
}
#Leave-one-out CV
我知道我必须填充cv_err才能绘制出如下图所示的Prediction error图,
#plots
plot(1:20,error[,1],"l",ylim=c(0,.4),xlab = "Subset size p", ylab = "Error", main = "Prediction Error");
colvec
for(i in 2:100)
{
lines(1:20,error[,i],"l",col=colvec[i]);
}
但是我不知道要如何在我的预测模型中使用10折。 此外,从10折起,如何进行留一法和右下角的面板。 有谁可以帮助我吗?

被折叠的 条评论
为什么被折叠?



