R的基本数据分析(描述性统计)

1.数值型单变量

# Univariate descriptive statistics for the politics score column `poli`.
# The column is referenced explicitly as ReportCard$poli instead of going
# through attach()/detach(): with attach() a global variable that happens to be
# named `poli` would silently shadow the column, and an error raised before
# detach() would leave ReportCard on the search path.
# Wrapping an assignment in parentheses prints the assigned value.
(Av.Poli <- mean(ReportCard$poli))  # yields NA when the column has missing values
(Av.Poli <- mean(ReportCard$poli, na.rm = TRUE))  # mean of the observed values
(Sd.Poli <- sd(ReportCard$poli, na.rm = TRUE))
(N <- sum(!is.na(ReportCard$poli)))  # number of non-missing values
# Skewness and excess kurtosis: average third / fourth power of the deviations
# from the mean, scaled by the matching power of Sd.Poli; na.rm = TRUE drops the
# missing values from the sum, so only the N observed values contribute.
(Skew.Poli <- sum((ReportCard$poli - Av.Poli)^3 / Sd.Poli^3, na.rm = TRUE) / N)
(Kurt.Poli <- sum((ReportCard$poli - Av.Poli)^4 / Sd.Poli^4, na.rm = TRUE) / N - 3)
summary(ReportCard$poli)


> (Skew.Poli<-sum((poli[!is.na(poli)]-Av.Poli)^3/Sd.Poli^3)/N)
[1] -1.306537
> (Kurt.Poli<-sum((poli[!is.na(poli)]-Av.Poli)^4/Sd.Poli^4)/N-3)
[1] 2.053688
>  summary(poli)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
  40.00   74.50   82.50   79.64   87.00   96.00       2 

  

同时计算多个变量的描述性统计量

sapply()

colMeans()

colSums()

tapply()

> (Av.Course<-sapply(ReportCard[,3:10],FUN=mean,na.rm=TRUE))
    poli      chi     math     fore      phy      che      geo 
79.63793 83.27966 61.16949 49.91667 75.20000 54.08333 65.24167 
     his 
78.68333 
> (Sd.Course<-sapply(ReportCard[,3:10],FUN=sd,na.rm=TRUE))
     poli       chi      math      fore       phy       che 
10.575872  8.127365 15.076417 14.018501 12.351902 12.315474 
      geo       his 
15.394389 12.73523

 

#########方便地计算每门课程的描述统计量(用户自定义函数的定义和调用)
# Descriptive statistics of one vector, ignoring missing values.
#
# Arguments:
#   x    vector to summarise; NA entries are excluded from every statistic.
#   ...  accepted so callers may pass extra arguments (e.g. na.rm = TRUE);
#        they are not used by this function.
#
# Returns a list with four elements:
#   avg   mean of the non-missing values
#   sd    standard deviation of the non-missing values, as computed by sd()
#   skew  average cubed deviation from avg, scaled by sd^3
#   kurt  average fourth-power deviation from avg, scaled by sd^4, minus 3
Des.Fun <- function(x, ...) {
  # Every moment below is taken over the observed values only.
  observed <- x[!is.na(x)]
  n.obs <- length(observed)
  centre <- mean(x, na.rm = TRUE)
  spread <- sd(x, na.rm = TRUE)
  deviation <- observed - centre
  list(
    avg = centre,
    sd = spread,
    skew = sum(deviation^3 / spread^3) / n.obs,
    kurt = sum(deviation^4 / spread^4) / n.obs - 3
  )
}
# Apply Des.Fun to columns 3 through 10 of ReportCard, one column at a time.
# Des.Fun accepts `...` without using it, so na.rm = TRUE has no effect on the
# result here.
DesRep <- sapply(
  ReportCard[, 3:10],
  FUN = Des.Fun,
  na.rm = TRUE
)

> DesRep
     poli      chi        math        fore        phy       
avg  79.63793  83.27966   61.16949    49.91667    75.2      
sd   10.57587  8.127365   15.07642    14.0185     12.3519   
skew -1.306537 -0.3117055 -0.07276131 -0.02902908 -0.3264613
kurt 2.053688  -0.6558917 -1.004213   -0.1879948  -0.7166125
     che        geo        his       
avg  54.08333   65.24167   78.68333  
sd   12.31547   15.39439   12.73523  
skew -0.1116243 -0.8644448 -0.4610922
kurt -0.2767491 0.07934935 -0.6362816

  

> ##分性别计算各门课程的描述统计量
> MaleCard<-subset(ReportCard,ReportCard$sex=="M")
> (Des.Male<-sapply(MaleCard[3:10],FUN=Des.Fun,na.rm=TRUE))
     poli       chi        math       fore       phy       
avg  78.86667   83.5       60.03333   51.2       72.73333  
sd   10.41793   6.874491   14.70276   13.12381   12.18516  
skew -0.6431142 -0.1395905 -0.2319195 -0.2290347 -0.3062211
kurt -0.639428  -0.8433017 -0.7633507 -0.1874488 -0.9496199
     che        geo        his       
avg  52.36667   62.53333   78.93333  
sd   11.39111   14.46275   12.67888  
skew 0.05773828 -0.7055209 -0.4779689
kurt -0.8837325 -0.5034247 -0.6771532

 当类别较多时

tapply()

例如,分性别计算政治课成绩的描述性统计

 

> (Des.Gender<-tapply(ReportCard$poli,INDEX=ReportCard$sex,FUN=Des.Fun,na.rm=TRUE))
$F
$F$avg
[1] 80.46429

$F$sd
[1] 10.87124

$F$skew
[1] -1.902856

$F$kurt
[1] 4.587466


$M
$M$avg
[1] 78.86667

$M$sd
[1] 10.41793

$M$skew
[1] -0.6431142

$M$kurt
[1] -0.639428

  

 

2.相关关系

> Tmp<-ReportCard[complete.cases(ReportCard),]
> (CorMatrix<-cor(Tmp[,c(5,7,8)],use="everything",method="pearson"))
          math       phy       che
math 1.0000000 0.7535317 0.7171637
phy  0.7535317 1.0000000 0.6207730
che  0.7171637 0.6207730 1.0000000
> (CovMatrix<-cov(Tmp[,c(5,7,8)],use="complete.obs",method="pearson"))
         math       phy       che
math 231.2021 139.30399 125.40956
phy  139.3040 147.81972  86.79915
che  125.4096  86.79915 132.26134

  相关系数的检验

> cor.test(Tmp[,5],Tmp[,7],alternative="two.side",method="pearson")

	Pearson's product-moment correlation

data:  Tmp[, 5] and Tmp[, 7]
t = 8.5775, df = 56, p-value = 8.753e-12
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 0.6149204 0.8469769
sample estimates:
      cor 
0.7535317 

  计算偏相关系数:控制其他数值型变量的条件下计算两数值变量的相关关系

 

> library("corpcor")
> cor2pcor(CorMatrix)
          [,1]      [,2]      [,3]
[1,] 1.0000000 0.5643387 0.4838756
[2,] 0.5643387 1.0000000 0.1754160
[3,] 0.4838756 0.1754160 1.0000000

  

3.分类型变量的相关性分析

3.1.相关性描述

对学生成绩,编制性别和平均成绩的列联表

> (CrossTable<-table(ReportCard[,c(2,12)]))
   avScore
sex  C  D  E
  F 13 10  3
  M 11 12  5
> (CrossTable<-xtabs(~sex+avScore,data=ReportCard))
   avScore
sex  C  D  E
  F 13 10  3
  M 11 12  5

  编制百分比的列联表

> margin.table(CrossTable,1)#计算频数
sex
 F  M 
26 28 
> margin.table(CrossTable,2)
avScore
 C  D  E 
24 22  8 
> prop.table(CrossTable)*100
   avScore
sex         C         D         E
  F 24.074074 18.518519  5.555556
  M 20.370370 22.222222  9.259259

> addmargins(prop.table(CrossTable)*100)

     avScore
sex            C          D          E        Sum
  F    24.074074  18.518519   5.555556  48.148148
  M    20.370370  22.222222   9.259259  51.851852
  Sum  44.444444  40.740741  14.814815 100.000000

 


  

3.2.相关性检验

1.卡方检验

# Pearson chi-squared test on the cross-tabulation of columns 2 and 12.
# Keep only the rows of ReportCard that have no missing value in any column.
Tmp <- ReportCard[complete.cases(ReportCard), ]
# Contingency table of the two selected columns; the parentheses print it.
(CrossTable <- table(Tmp[, c(2, 12)]))
# correct = FALSE asks chisq.test() not to apply a continuity correction.
(ResChisq <- chisq.test(CrossTable, correct = FALSE))
# Expected cell counts stored in the test result.
ResChisq[["expected"]]

> (ResChisq<-chisq.test(CrossTable,correct=FALSE))

	Pearson's Chi-squared test

data:  CrossTable
X-squared = 0.77547, df = 2, p-value = 0.6786

> ResChisq$expected
   avScore
sex        C        D        E
  F 11.55556 10.59259 3.851852
  M 12.44444 11.40741 4.148148

  p值为0.6786,大于0.05的显著性水平,因此不能拒绝性别和成绩独立的原假设

2.相关性度量

若卡方检验拒绝原假设,即两者相关,那么相关度有多高

a.phi系数

b.列联系数

#############计算基于卡方的相关系数
# Load vcd and print its association statistics for the contingency table
# CrossTable built earlier in the script.
library(vcd)
assocstats(CrossTable)

  

 

转载于:https://www.cnblogs.com/coderevelyn/p/7552792.html

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值