《R语言与统计分析》-探索性数据分析

最新推荐文章于 2024-03-09 18:06:58 发布

parcaf

最新推荐文章于 2024-03-09 18:06:58 发布

阅读量567

点赞数 1

文章标签： r语言、数据分析、开发语言

本文链接：https://blog.csdn.net/parcaf/article/details/127450798

版权

这篇博客探讨了多种统计图形的绘制和使用，包括直方图、核密度估计、箱线图、QQ图等，用于数据分布的可视化和正态性检验。此外，还展示了如何通过R语言实现这些图形，并通过实例演示了正态分布的概率函数和多组数据的描述性统计分析。

摘要由CSDN通过智能技术生成

#### Graphical check of asymptotic normality (central limit theorem) ####
#' Plot standardized sums of random draws against the standard normal density.
#'
#' For every sample size `i` in `n`, draws `N` samples of size `i` with `r`,
#' standardizes each sample sum as (sum - i * m) / (sqrt(i) * s), and draws a
#' density-scaled histogram with a kernel density estimate (red) and the
#' N(0, 1) density (blue, dotted) on top. One plot is produced per element
#' of `n`.
#'
#' @param r Random generator, called as `r(count, distpar[1], distpar[2])`
#'   when `distpar` has exactly two elements, otherwise as `r(count, distpar)`.
#' @param distpar Distribution parameters forwarded to `r`.
#' @param m Value subtracted (times `i`) from each sample sum; the mean of a
#'   single draw for the standardization to be correct.
#' @param s Value the centred sum is divided by (times `sqrt(i)`); the
#'   standard deviation of a single draw.
#' @param n Sample sizes; one plot per element.
#' @param N Number of replicated samples per sample size.
#' @return `NULL`, invisibly; the function is called for its plots.
limite.central <- function(r = runif, distpar = c(0, 1), m = 0.5,
                           s = 1 / sqrt(12),
                           n = c(1, 3, 10, 30), N = 1000) {
  for (i in n) {
    draws <- if (length(distpar) == 2) {
      r(i * N, distpar[1], distpar[2])
    } else {
      r(i * N, distpar)
    }
    # One row per replicate, one column per draw within the replicate.
    x <- matrix(draws, ncol = i)
    x <- (rowSums(x) - i * m) / (sqrt(i) * s)
    dens <- density(x)
    hist(x, col = "light blue", probability = TRUE, main = paste("n=", i),
         ylim = c(0, max(.4, dens$y)))
    lines(dens, col = "red", lwd = 3)
    curve(dnorm(x), col = "blue", lwd = 3, lty = 3, add = TRUE)
    # Show at most 100 rug ticks so the rug stays readable for large N.
    if (N > 100) {
      rug(sample(x, 100))
    } else {
      rug(x)
    }
  }
  invisible(NULL)
}

# Demo: standardized sums of Binomial(10, 0.1) draws on a 2 x 2 panel grid.
# limite.central() divides the centred sum by sqrt(i) * s, so `s` must be the
# standard deviation of one draw: sqrt(size * p * (1 - p)) = sqrt(0.9).
# Passing the variance 0.9 instead would under-scale the standardized sums.
op <- par(mfrow = c(2, 2))
limite.central(rbinom, distpar = c(10, 0.1), m = 1, s = sqrt(0.9), N = 100)
par(op)

#### Normal distribution density curves ####
# Draw the N(0, sd) density for sd = 1, 2 and 0.5 on a single panel.
par(mfrow = c(1, 1))
curve(dnorm(x, 0, 1), xlim = c(-5, 5), ylim = c(0, .8),
      col = "red", lwd = 2, lty = 3)
curve(dnorm(x, 0, 2), add = TRUE, col = "blue", lwd = 2, lty = 2)
curve(dnorm(x, 0, .5), add = TRUE, lwd = 2, lty = 1)
title(main = "Gaussian distributions")
# Anchor the legend's right edge (xjust = 1) at the right/top limits of the
# plotting region, taken from par("usr").
legend(par("usr")[2], par("usr")[4], xjust = 1,
       c("Sigma=1", "Sigma=2", "Sigma=0.5"),
       lwd = c(2, 2, 2), lty = c(3, 2, 1),
       col = c("red", "blue", par("fg")))

#### Histogram ####
# Usage template for hist(): the argument list with its default values, kept
# for reference. It is not a runnable call as written -- `x`, `xname`,
# `breaks`, `freq` and `...` are not defined at this point in the script.
# Replace them with real values before evaluating.
hist(x, breaks = "Sturges",
     freq = NULL, probability = !freq,
     include.lowest = TRUE, right = TRUE, fuzz = 1e-7,
     density = NULL, angle = 45, col = "lightgray", border = NULL,
     main = paste("Histogram of" , xname),
     xlim = range(breaks), ylim = NULL,
     xlab = xname, ylab,
     axes = TRUE, plot = TRUE, labels = FALSE,
     nclass = NULL, warn.unused = TRUE, ...)

#### Kernel density estimation ####
# Usage template for density(): the argument list with its default values,
# kept for reference. Not a runnable call as written -- `width`, `from`, `to`
# and `...` are placeholders, and `kernel` lists the available choices rather
# than a single selected kernel.
density(x, bw = "nrd0", adjust = 1,
        kernel = c("gaussian", "epanechnikov", "rectangular",
                   "triangular", "biweight",
                   "cosine", "optcosine"),
        weights = NULL, window = kernel, width,
        give.Rkern = FALSE, subdensity = FALSE,
        n = 512, from, to, cut = 3, na.rm = FALSE, ...)

#### Stem-and-leaf plot ####
# Usage template for stem() with its default values; `x` must be bound to a
# numeric vector before this line can be evaluated.
stem(x, scale = 1, width = 80, atom = 1e-08)

#### Box plot ####
# Usage templates for boxplot() (formula form first, then the vector form),
# kept for reference. Neither is a runnable call as written -- `formula`,
# `subset`, `mklab`, `names` and `...` are placeholders.
boxplot(formula, data = NULL, ..., subset, na.action = NULL,
        xlab = mklab(y_var = horizontal),
        ylab = mklab(y_var =!horizontal),
        add = FALSE, ann = !add, horizontal = FALSE,
        drop = FALSE, sep = ".", lex.order = FALSE)
# `formula` gives the plotting rule: y ~ grp draws y split by the groups in grp
boxplot(x, ..., range = 1.5, width = NULL, varwidth = FALSE,
        notch = FALSE, outline = TRUE, names, plot = TRUE,
        border = par("fg"), col = "lightgray", log = "",
        pars = list(boxwex = 0.8, staplewex = 0.5, outwex = 0.5),
        ann = !add, horizontal = FALSE, add = FALSE, at = NULL)

#### Normality checks ####
# The braces only group the snippets of this section. The qqnorm/qqline/qqplot
# calls are usage templates (`ylim` and `...` are placeholders), and the bare
# hist()/curve()/lines() lines only name the functions to use, so the block
# cannot be evaluated as a whole. Only the empirical-distribution example at
# the end is complete.
{
  #### Q-Q plots ####
  # Usage templates: normal Q-Q plot, its reference line, and a two-sample
  # Q-Q plot.
  qqnorm(y, ylim, main = "Normal Q-Q Plot",
         xlab = "Theoretical Quantiles", ylab = "Sample Quantiles",
         plot.it = TRUE, datax = FALSE, ...)
  
  qqline(y, datax = FALSE, distribution = qnorm,
         probs = c(0.25, 0.75), qtype = 7, ...)
  
  qqplot(x, y, plot.it = TRUE,
         xlab = deparse1(substitute(x)),
         ylab = deparse1(substitute(y)), ...)
  
  #### Comparison with the normal density ####
  # Function names only (no arguments supplied): histogram, then overlaid
  # curves/lines.
  hist()
  curve()
  lines()
  
  #### Using the empirical distribution function ####
  # Simulate 100 standard-normal draws, plot the step function that rises by
  # 1/n at each sorted observation, and overlay the normal CDF with the sample
  # mean and standard deviation.
  data<-rnorm(100)
  x<-sort(data)
  n<-length(x)
  # Empirical CDF height at the i-th sorted observation is i/n.
  y<-(1:n)/n
  m<-mean(x)
  s<-sd(x)
  plot(x,y,type='s',main=".")
  curve(pnorm(x,m,s),col='red',lwd=2,add=T)
}

#### Descriptive statistics for several variables ####
# Scatter plot of the cars data with marginal box plots.
library(DAAG)
data("cars")
# Save only the settable graphical parameters. A bare par() also returns
# read-only ones (cin, cra, csi, ...), and par(op) would then warn on restore.
op <- par(no.readonly = TRUE)
# Cell numbers give the drawing order (0 = empty cell):
#   [2: dist box plot] [1: scatter plot  ]
#   [0: empty        ] [3: speed box plot]
# Column widths are in ratio 1:6, row heights in ratio 4:1.
layout(matrix(c(2, 1, 0, 3), 2, 2, byrow = TRUE), c(1, 6), c(4, 1))
par(mar = c(1, 1, 5, 2)) # margins: bottom, left, top, right
plot(cars$dist ~ cars$speed, xlab = "", ylab = "", las = 1)
rug(side = 1, jitter(cars$speed, 5))
rug(side = 2, jitter(cars$dist, 20)) # jitter() adds a small random perturbation
title(main = "cars data")
par(mar = c(1, 2, 5, 1))
boxplot(cars$dist, axes = FALSE)
title(ylab = "Stopping distance(ft)", line = 0)
par(mar = c(5, 1, 1, 2))
boxplot(cars$speed, horizontal = TRUE, axes = FALSE)
title(xlab = "Speed(mph)", line = 1)
par(op)

#### Contour plot ####
library(MASS)
# Two-dimensional kernel density estimate of (x, y); the estimator in MASS is
# kde2d(). It returns a list with components x, y and z that contour() accepts
# directly.
z <- kde2d(x, y)
# Draw the contour lines of the estimated density, without level labels.
contour(z, col = "red", drawlabels = FALSE, main = "")

#### 3-D perspective plot ####
# persp() is the graphics function for perspective (surface) plots; `z` is
# expected to be the x/y/z list produced by the density estimate above.
persp(z, main = "")

#### Graphical summaries of grouped data ####
# Usage template for the lattice-style histogram() (its defaults refer to
# lattice.getOption()), kept for reference. Not a runnable call as written:
# most arguments are bare placeholders, `...` is not valid at top level, and
# library(lattice) is only loaded further down in this file.
histogram(x,
          data,
          allow.multiple, outer = TRUE,
          auto.key = FALSE,
          aspect = "fill",
          panel = lattice.getOption("panel.histogram"),
          prepanel, scales, strip, groups,
          xlab, xlim, ylab, ylim,
          type = c("percent", "count", "density"),
          nint = if (is.factor(x)) nlevels(x)
          else round(log2(length(x)) + 1),
          endpoints = extend.limits(range(as.numeric(x),
                                          finite = TRUE), prop = 0.04),
          breaks,
          equal.widths = TRUE,
          drop.unused.levels =
            lattice.getOption("drop.unused.levels"),
          ...,
          lattice.options = NULL,
          default.scales = list(),
          default.prepanel =
            lattice.getOption("prepanel.default.histogram"),
          subscripts,
          subset)

#### Strip chart (one-dimensional scatter plot) ####
# Usage template for stripchart() with its default values, kept for
# reference. Not a runnable call as written -- `group.names` and `...` are
# placeholders.
stripchart(x, method = "overplot", jitter = 0.1, offset = 1/3,
           vertical = FALSE, group.names, add = FALSE,
           at = NULL, xlim = NULL, ylim = NULL,
           ylab = NULL, xlab = NULL, dlab = "", glab = "",
           log = "", pch = 0, col = par("fg"), cex = par("cex"),
           axes = TRUE, frame.plot = axes, ...)

#### Density plot ####
library(lattice)
# Usage template for lattice's densityplot(), kept for reference. Not a
# runnable call as written: most arguments are bare placeholders and `...`
# is not valid at top level.
densityplot(x,
            data,
            allow.multiple = is.null(groups) || outer,
            outer = !is.null(groups),
            auto.key = FALSE,
            aspect = "fill",
            panel = lattice.getOption("panel.densityplot"),
            prepanel, scales, strip, groups, weights,
            xlab, xlim, ylab, ylim,
            bw, adjust, kernel, window, width, give.Rkern,
            n = 512, from, to, cut, na.rm,
            drop.unused.levels =
              lattice.getOption("drop.unused.levels"),
            ...,
            lattice.options = NULL,
            default.scales = list(),
            default.prepanel =
              lattice.getOption("prepanel.default.densityplot"),
            subscripts,
            subset)

#### Descriptive statistics for categorical data ####
# 4 x 4 table of counts, entered row by row. Judging by the labels, rows are
# hair colours and columns are eye colours.
Eye.Hair <- matrix(
  c( 68, 20, 15,  5,
    119, 84, 54, 29,
     26, 17, 14, 14,
      7, 94, 10, 16),
  nrow = 4, byrow = TRUE,
  dimnames = list(c("Black", "Brown", "Red", "Blond"),
                  c("Brown", "Blue", "Hazel", "Green"))
)
Eye.Hair

# Overall relative frequencies: every cell divided by the grand total.
round(Eye.Hair / sum(Eye.Hair), digits = 3)

#### Graphical summaries of a contingency table ####
data("HairEyeColor")
# Sum the counts over every dimension except the first two, which leaves a
# two-way table.
a <- as.table(apply(HairEyeColor, c(1, 2), sum))
# beside = TRUE: grouped (side-by-side) bars, one per "Hair" level.
barplot(a, legend.text = attr(a, "dimnames")$Hair, beside = TRUE)
# Default beside = FALSE: stacked bars.
barplot(a, legend.text = attr(a, "dimnames")$Hair)

# Dot chart of the manually entered Eye.Hair count matrix.
dotchart(Eye.Hair)