R语言|计算统计作业练习1

最新推荐文章于 2024-10-08 11:11:32 发布

Bayes1b

最新推荐文章于 2024-10-08 11:11:32 发布

阅读量413

点赞数 7

分类专栏： R语言文章标签： r语言开发语言

本文链接：https://blog.csdn.net/qq_56593533/article/details/134843885

版权

R语言专栏收录该内容

5 篇文章 0 订阅

订阅专栏

下面在此分享一下一次课程作业的答题思路及个人答题结果。如有错误欢迎指正。

*本文题目非原创。

Q：对于law数据计算GPA和 LSAT的相关系数的偏差，并计算经过偏差调整后的相关系数以及相关系数的置信区间(使用非参数的bootstrap，置信区间需要使用percentile 和BCa的方法,R和Python编程选一种使用)。

一、数据查看

LSAT	GPA
576	3.39
635	3.3
558	2.81
578	3.03
666	3.44
580	3.07
555	3
661	3.43
651	3.36
605	3.13
653	3.12
575	2.74
545	2.76
572	2.88
594	2.96

二、相关系数的偏差计算

我们运用非参数的bootstrap算法进行计算。

将不同B次下计算的结果填入下表并绘制曲线图如下：

B次	25	50	100	200	400	800	1600	3200
bias	0.018980788	0.009771031	-0.031498300	-0.013222192	-0.008180624	-0.004801327	-0.010277039	-0.008612720

三、经偏差调整后的相关系数计算

根据θ=θ-bias可算出经过偏差调整后的相关系数为：

B次	25	50	100	200	400	800	1600	3200
r	0.7573937	0.7666035	0.8078728	0.7895967	0.7845551	0.7811758	0.7866515	0.7849872

四、相关系数的置信区间计算

B	置信区间图
25
50
100
200
400
800
1600
3200

五、附录

set.seed(11)
# Nonparametric bootstrap for the GPA_LSAT data

rm(list = ls())
# install.packages('bootstrap')
library(bootstrap)
library(plotly)
data(law82) 
#names(law)
# plot the scatter 
p=plot_ly(law82,x=~LSAT,y=~GPA,type="scatter",mode="markers");
p

# ==============================================================================
# bootstrap to calculate the bias of correlation coefficients
data(law)
data_treat=law
n=dim(data_treat)[1]
B=c(25,50,100,200,400,800,1600,3200)
corr_sample = cor(data_treat$LSAT,data_treat$GPA) # Sample correlation


corr_bias_boot=NULL    # bootstrap bias vector
corr_correct=NULL
for(i in 1:length(B))
{
  corr_boot=NULL     # bootstrap samples vector for specific B
  for(b in 1:B[i])
  {
    boot_ID=sample(1:n,n,replace=T) # nonparametric bootstrap
    data_boot=data_treat[boot_ID,]  
    corr_boot=c(corr_boot,cor(data_boot$LSAT,data_boot$GPA))
  }
  corr_bias_boot=c(corr_bias_boot,(sum(corr_boot))/B[i]-corr_sample)
  corr_correct = c(corr_correct, corr_sample-(sum(corr_boot))/B[i]+ corr_sample)
}

# Show the scatter plot
corr_correct
boot_result=data.frame(B=B,corr_correct=corr_correct) 
p=plot_ly(boot_result,x=~B,y=~corr_correct,type="scatter",mode="markers");
p%>%add_lines() 


# Show the scatter plot
corr_bias_boot
boot_result=data.frame(B=B,corr_bias_boot=corr_bias_boot) 
p=plot_ly(boot_result,x=~B,y=~corr_bias_boot,type="scatter",mode="markers");
p%>%add_lines() 

# Show the histogram
p=plot_ly(x=~corr_boot,type="histogram")%>%
  layout(title = "histogram of bootstrap samples",  
         yaxis = list(title = "freq"))
p

# ==============================================================================
# improved estimate of corr

B=3200 # B=c(25,50,100,200,400,800,1600,3200)
n=15
data_treat=law
alpha = 0.1
corr_boot=NULL     # bootstrap samples vector for specific B
corr_correct=NULL
for(b in 1:B)
{
    boot_ID=sample(1:n,n,replace=T) # nonparametric bootstrap
    data_boot=data_treat[boot_ID,]  
    corr_boot=c(corr_boot,cor(data_boot$LSAT,data_boot$GPA))
}

# ==============================================================================
# bootstrap-percentile CI         
CI_boot_per_l = sort(corr_boot)[B*alpha/2]
CI_boot_per_r = sort(corr_boot)[B*(1-alpha/2)]


# BCa
corr_sample = cor(law$LSAT,law$GPA) # Sample correlation

z_0 = qnorm(length(corr_boot[corr_boot < corr_sample])/B)
theta_i = NULL
for(i in 1:n)
{
  x_i = data_treat[-i]
  theta_i = c(theta_i,var(x_i))
}
theta_hat = mean(theta_i)
hat_a = (sum((theta_hat-theta_i)^3))/(6*(sum((theta_hat-theta_i)^2))^(3/2))

alpha_1 = pnorm(z_0+((z_0+qnorm(alpha/2))/(1-hat_a*(z_0+qnorm(alpha/2)))))
alpha_2 = pnorm(z_0+((z_0+qnorm(1-alpha/2))/(1-hat_a*(z_0+qnorm(1-alpha/2)))))

# BCa CI         
CI_boot_BCa_l = sort(corr_boot)[B*alpha_1]
CI_boot_BCa_r = sort(corr_boot)[B*alpha_2]

# plot      
hist(corr_boot, main = "Histogram of the bootstrap samples and CI")
abline(v = c(CI_boot_per_l,CI_boot_per_r), lty = 1, col = 'red')
abline(v = c(CI_boot_BCa_l,CI_boot_BCa_r), lty = 1, col = 'blue')
legend("topright", c("boot-per","boot_BCa"), lty = c(1,1), col = c("red","blue"))