下面在此分享一下一次课程作业的答题思路及个人答题结果。如有错误欢迎指正。
*本文题目非原创。
Q:对于law数据计算GPA和 LSAT的相关系数的偏差,并计算经过偏差调整后的相关系数以及相关系数的置信区间(使用非参数的bootstrap,置信区间需要使用percentile 和BCa的方法,R和Python编程选一种使用)。
一、数据查看
LSAT | GPA |
576 | 3.39 |
635 | 3.3 |
558 | 2.81 |
578 | 3.03 |
666 | 3.44 |
580 | 3.07 |
555 | 3 |
661 | 3.43 |
651 | 3.36 |
605 | 3.13 |
653 | 3.12 |
575 | 2.74 |
545 | 2.76 |
572 | 2.88 |
594 | 2.96 |
二、相关系数的偏差计算
我们运用非参数的bootstrap算法进行计算。
将不同B次下计算的结果填入下表并绘制曲线图如下:
B次 | 25 | 50 | 100 | 200 | 400 | 800 | 1600 | 3200 |
bias | 0.018980788 | 0.009771031 | -0.031498300 | -0.013222192 | -0.008180624 | -0.004801327 | -0.010277039 | -0.008612720 |
三、经偏差调整后的相关系数计算
根据θ=θ-bias可算出经过偏差调整后的相关系数为:
B次 | 25 | 50 | 100 | 200 | 400 | 800 | 1600 | 3200 |
r | 0.7573937 | 0.7666035 | 0.8078728 | 0.7895967 | 0.7845551 | 0.7811758 | 0.7866515 | 0.7849872 |
四、相关系数的置信区间计算
B | 置信区间图 |
25 | |
50 | |
100 | |
200 | |
400 | |
800 | |
1600 | |
3200 |
五、附录
set.seed(11)
# Nonparametric bootstrap for the GPA_LSAT data
rm(list = ls())
# install.packages('bootstrap')
library(bootstrap)
library(plotly)
data(law82)
#names(law)
# plot the scatter
p=plot_ly(law82,x=~LSAT,y=~GPA,type="scatter",mode="markers");
p
# ==============================================================================
# bootstrap to calculate the bias of correlation coefficients
data(law)
data_treat=law
n=dim(data_treat)[1]
B=c(25,50,100,200,400,800,1600,3200)
corr_sample = cor(data_treat$LSAT,data_treat$GPA) # Sample correlation
corr_bias_boot=NULL # bootstrap bias vector
corr_correct=NULL
for(i in 1:length(B))
{
corr_boot=NULL # bootstrap samples vector for specific B
for(b in 1:B[i])
{
boot_ID=sample(1:n,n,replace=T) # nonparametric bootstrap
data_boot=data_treat[boot_ID,]
corr_boot=c(corr_boot,cor(data_boot$LSAT,data_boot$GPA))
}
corr_bias_boot=c(corr_bias_boot,(sum(corr_boot))/B[i]-corr_sample)
corr_correct = c(corr_correct, corr_sample-(sum(corr_boot))/B[i]+ corr_sample)
}
# Show the scatter plot
corr_correct
boot_result=data.frame(B=B,corr_correct=corr_correct)
p=plot_ly(boot_result,x=~B,y=~corr_correct,type="scatter",mode="markers");
p%>%add_lines()
# Show the scatter plot
corr_bias_boot
boot_result=data.frame(B=B,corr_bias_boot=corr_bias_boot)
p=plot_ly(boot_result,x=~B,y=~corr_bias_boot,type="scatter",mode="markers");
p%>%add_lines()
# Show the histogram
p=plot_ly(x=~corr_boot,type="histogram")%>%
layout(title = "histogram of bootstrap samples",
yaxis = list(title = "freq"))
p
# ==============================================================================
# improved estimate of corr
B=3200 # B=c(25,50,100,200,400,800,1600,3200)
n=15
data_treat=law
alpha = 0.1
corr_boot=NULL # bootstrap samples vector for specific B
corr_correct=NULL
for(b in 1:B)
{
boot_ID=sample(1:n,n,replace=T) # nonparametric bootstrap
data_boot=data_treat[boot_ID,]
corr_boot=c(corr_boot,cor(data_boot$LSAT,data_boot$GPA))
}
# ==============================================================================
# bootstrap-percentile CI
CI_boot_per_l = sort(corr_boot)[B*alpha/2]
CI_boot_per_r = sort(corr_boot)[B*(1-alpha/2)]
# BCa
corr_sample = cor(law$LSAT,law$GPA) # Sample correlation
z_0 = qnorm(length(corr_boot[corr_boot < corr_sample])/B)
theta_i = NULL
for(i in 1:n)
{
x_i = data_treat[-i]
theta_i = c(theta_i,var(x_i))
}
theta_hat = mean(theta_i)
hat_a = (sum((theta_hat-theta_i)^3))/(6*(sum((theta_hat-theta_i)^2))^(3/2))
alpha_1 = pnorm(z_0+((z_0+qnorm(alpha/2))/(1-hat_a*(z_0+qnorm(alpha/2)))))
alpha_2 = pnorm(z_0+((z_0+qnorm(1-alpha/2))/(1-hat_a*(z_0+qnorm(1-alpha/2)))))
# BCa CI
CI_boot_BCa_l = sort(corr_boot)[B*alpha_1]
CI_boot_BCa_r = sort(corr_boot)[B*alpha_2]
# plot
hist(corr_boot, main = "Histogram of the bootstrap samples and CI")
abline(v = c(CI_boot_per_l,CI_boot_per_r), lty = 1, col = 'red')
abline(v = c(CI_boot_BCa_l,CI_boot_BCa_r), lty = 1, col = 'blue')
legend("topright", c("boot-per","boot_BCa"), lty = c(1,1), col = c("red","blue"))