Simulation study
这是我研究生课程的一个作业小题,用R做simulation,教授给的代码逻辑很值得学习,特别是最后整合在一个图像里。我在他代码的基础上稍作改动,特别是在使用ksmooth这个function时要注意它的预测值所对应的x是升序的,这里我也给出了方法解决这个问题。
Originally written by Prof. Noah Simon from University of Washington, and extended by Lan Shui.
Conditional mean functions
library(ggplot2)
# Linear conditional mean: E[Y | X = x] = 2x.
cond_mean_lin <- function(x) {
  x * 2
}
# Smooth periodic conditional mean: E[Y | X = x] = sin(2*pi*x),
# one full period on [0, 1].
cond_mean_sin <- function(x) {
  sin(2 * pi * x)
}
# Rapidly oscillating conditional mean: E[Y | X = x] = sin(30x)
# (roughly 5 periods on [0, 1] -- the hardest target to fit).
cond_mean_sin2 <- function(x) {
  sin(30 * x)
}
# The three true conditional-mean functions the simulation will sweep over.
cond_mean_list <- list(
  cond_mean_lin,
  cond_mean_sin,
  cond_mean_sin2
)
Predictive Model Training Functions
# Factory for polynomial-regression training functions.
#
# Returns a closure that fits y ~ poly(x, degree) on a data frame `dat`
# (columns x, y) and returns the fitted values at the training x's.
#
# @param degree Polynomial degree passed to poly().
# @return function(dat) -> numeric vector of in-sample predictions.
train_mod_poly_generator <- function(degree){
  force(degree) # evaluate the promise now, so closures built in a loop do not all capture the loop variable's final value
  train_mod <- function(dat){
    fit <- lm(y ~ poly(x, degree), data = dat)
    # Predict explicitly at the training points. The original call passed
    # `newX = dat$x`, which predict.lm silently ignores via `...` -- it
    # only "worked" because predict() without newdata happens to return
    # the in-sample fitted values. Make that intent explicit.
    pred <- predict(fit, newdata = dat)
    return(pred)
  }
  return(train_mod)
}
# Models 1..5: polynomial regressions of degree 1 through 5.
# (force(degree) inside the generator makes closure creation loop-safe.)
train_mods <- lapply(seq_len(5), train_mod_poly_generator)
# Factory for Nadaraya-Watson (kernel regression) training functions.
#
# ksmooth() evaluates the fit at `x.points`, but its output is returned
# with the x's sorted in increasing order -- so the predictions must be
# mapped back to the original row order of `dat` before returning.
#
# @param kernel Kernel name for ksmooth(): "box" or "normal".
# @return function(dat) -> numeric predictions in the row order of dat.
train_mod_NW_generator <- function(kernel){
  force(kernel) # same defensive forcing as in train_mod_poly_generator
  train_mod <- function(dat){
    n <- nrow(dat)
    # NOTE(review): bandwidth n^(-1/3) shrinks with n; confirm this rate
    # (rather than a constant multiple of it) is intended.
    smoothed <- ksmooth(dat$x, dat$y, kernel,
                        bandwidth = n^(-1/3), x.points = dat$x)
    # order(order(x)) is the inverse permutation of the sort order, which
    # restores ksmooth's sorted predictions to the original data order.
    pred <- smoothed$y[order(order(dat$x))]
    return(pred)
  }
  return(train_mod)
}
# Invert a permutation: returns q such that, for any vector v,
# v[p][q] == v.
#
# match(k, p) gives the position of k inside p, which is exactly the
# k-th entry of the inverse permutation. seq_along() (rather than
# 1:length(p)) keeps this correct for a length-0 input, where 1:0
# would yield c(1, 0) and produce NAs.
#
# @param forward.permutation An integer permutation of 1..n.
# @return The inverse permutation as an integer vector.
inverse.permutation <- function(forward.permutation) {
  inv.perm <- match(seq_along(forward.permutation), forward.permutation)
  return(inv.perm)
}
# Models 6 and 7: Nadaraya-Watson kernel smoothers with ksmooth's
# box and Gaussian ("normal") kernels respectively.
train_mods[[6]] <- train_mod_NW_generator(kernel="box")
train_mods[[7]] <- train_mod_NW_generator(kernel="normal")
The Simulation Engine
# Simulate n observations: x ~ Uniform(0, 1) and
# y = cond_mean(x) + standard normal noise.
gen_dat <- function(n, cond_mean) {
  xs <- runif(n)          # draw x first, then noise: RNG order matters
  noise <- rnorm(n)
  data.frame(x = xs, y = cond_mean(xs) + noise)
}
# Mean squared error of `pred` against the true conditional mean
# evaluated at the points in x_grid.
evaluate_model <- function(pred, x_grid, cond_mean) {
  truth <- cond_mean(x_grid)
  mean((pred - truth)^2)
}
# One simulation replicate: draw a sample of size n, train the model,
# and score it against the true conditional mean at the training x's.
run_one_eval <- function(n, cond_mean, train_mod) {
  sim <- gen_dat(n, cond_mean)
  fitted_vals <- train_mod(sim)
  evaluate_model(fitted_vals, sim$x, cond_mean)
}
# Average MSE over `nrep` Monte Carlo replicates, for each sample size.
#
# NOTE(review): `nrep` is read from the global environment (set by the
# driver script); consider passing it as an explicit argument.
#
# @param ns Vector of sample sizes.
# @param cond_mean True conditional mean function.
# @param train_mod Training function: data.frame -> numeric predictions.
# @return Numeric vector of mean MSEs, one per entry of ns.
run_one_sim_group <- function(ns, cond_mean, train_mod){
  # vapply replaces the index loop: type-stable, preallocated, and
  # correct even for a zero-length ns (1:length(ns) would not be).
  vapply(
    ns,
    function(n) mean(replicate(nrep, run_one_eval(n, cond_mean, train_mod))),
    numeric(1)
  )
}
# Driver: run every (conditional mean, model) pair over all sample sizes,
# then plot mean MSE against n, one facet per true conditional mean.
set.seed(1)
nrep <- 1000                     # replicates per (n, model, mean) cell
ns <- c(30, 50, 100, 500, 1000)  # sample sizes to sweep

# Collect one data.frame per (mean, model) pair in a preallocated list
# and bind once at the end -- rbind-ing onto a growing data frame inside
# the loop copies the accumulated rows on every iteration.
results <- vector("list", length(cond_mean_list) * length(train_mods))
k <- 0
for (i in seq_along(cond_mean_list)) {
  for (j in seq_along(train_mods)) {
    MSEs <- run_one_sim_group(ns, cond_mean_list[[i]], train_mods[[j]])
    k <- k + 1
    results[[k]] <- data.frame(MSE = MSEs, cond_mean = i, train_mod = j, n = ns)
  }
}
output <- do.call(rbind, results)
output

# One colored line per model, one panel per true conditional mean.
ggplot(output, aes(x = n, y = MSE, color = as.factor(train_mod))) +
  geom_line() +
  facet_wrap(~cond_mean)
最后得到的图如下
由于时间关系没有对图例进行修改,1: linear regression, 2: polynomial regression df=2, 3: polynomial regression df=3, 4: polynomial regression df=4, 5: polynomial regression df=5, 6: Nadaraya-Watson estimation with box kernel, 7: Nadaraya-Watson estimation with normal kernel.
左起第一张图是对 f(x)=2x 进行拟合的MSE结果,第二张图是对 f(x)=sin(2πx) 进行拟合的MSE结果,第三张图是对 f(x)=sin(30x) 进行拟合的MSE结果。