R 语言学习笔记三：操纵数据-重要函数的使用

最新推荐文章于 2023-10-11 16:37:36 发布
shawncheer
最新推荐文章于 2023-10-11 16:37:36 发布
阅读量1.8k
点赞数
分类专栏： R语言学习文章标签：函数
本文链接：https://blog.csdn.net/shawncheer/article/details/50688256
版权
R语言学习专栏收录该内容
3 篇文章 0 订阅
订阅专栏
#一句话循环-lapply
-可以循环处理列表中的每一个元素
#要善于运用str函数
> str(apply)
function (X, MARGIN, FUN, ...)  
  > str(lapply)
function (X, FUN, ...)  
  > x<-list(a=1:10,b=c(11,21,31,41,51))
  > x
  $a
  [1]  1  2  3  4  5  6  7  8  9 10

  $b
  [1] 11 21 31 41 51

  > #求平均
    > lapply(x,mean)
  $a
  [1] 5.5

  $b
  [1] 31

  > x<-1:4
  > lapply(x,runif)
  [[1]]
  [1] 0.2392829

  [[2]]
  [1] 0.9162620 0.7999029

  [[3]]
  [1] 0.59626812 0.03595273 0.65593715

  [[4]]
  [1] 0.8693877 0.3254179 0.3577321 0.7780447

  > lapply(x,runif,min=0,max=100)
  [[1]]
  [1] 18.78005

  [[2]]
  [1] 89.26312 17.01786

  [[3]]
  [1] 14.72381 86.13003 39.71868

  [[4]]
  [1] 48.93485 22.37576 35.54066 57.34798

  > x<-list(a=matrix(1:6,2,3),b=matrix(4:7,2,2))
  > #匿名函数
    > lapply(x,function(m),m[1,])
  Error: unexpected ',' in "lapply(x,function(m),"
  > lapply(x,function(m) m[1,])
  $a
  [1] 1 3 5

  $b
  [1] 4 6

  > #sapply
    > #可以对lapply的结果进行化简
    > x<-list(a=1:10,b=c(11,21,31,41,51))
  > lapply(x,mean)
  $a
  [1] 5.5

  $b
  [1] 31

  > sapply(x,mean)
  a    b 
  5.5 31.0 
  > class(sapply(x,mean))
  [1] "numeric"
  > 

  #apply-沿着数组的某一维度处理数据
  #效率和for/while一样
> x<-matrix(1:16,4,4)
> x
     [,1] [,2] [,3] [,4]
[1,]    1    5    9   13
[2,]    2    6   10   14
[3,]    3    7   11   15
[4,]    4    8   12   16

#求列的平均（MARGIN=2 表示按列）
> apply(x,2,mean)
[1]  2.5  6.5 10.5 14.5
#求列和
> apply(x,2,sum)
[1] 10 26 42 58

#类似地，按行计算（MARGIN=1 表示按行）
apply(x,1,mean)
apply(x,1,sum)

#对每行或每列求和、求平均的快捷函数
rowSums(x)
rowMeans(x)
colSums(x)
colMeans(x)

#rnorm(100)随机从正态分布总体抽100个数据
#quantile求百分位点对应的数据，probs是分位点值
> x<-matrix(rnorm(100),10,10)
> apply(x,1,quantile,probs=c(0.25,0.75))
         [,1]        [,2]       [,3]       [,4]
25% -0.328531 -0.78827897 -0.8925568 -0.5913973
75%  1.363568  0.08398574  0.6381586  0.1598917
          [,5]       [,6]       [,7]       [,8]
25% -0.2881346 -0.1927514 -0.6403461 -0.6259618
75%  1.0994751  0.8746238  0.2437979  0.6709361
          [,9]       [,10]
25% -0.8207727 -1.04342393
75%  0.1826279 -0.07395085
> 

> x<-array(rnorm(2*3*4),c(2,3,4))
> x
, , 1

            [,1]      [,2]       [,3]
[1,] -0.05156115 -1.302837 -1.2937694
[2,]  0.17554622 -1.155084  0.7215416

, , 2

          [,1]      [,2]       [,3]
[1,]  1.056157 0.4106916 0.43380783
[2,] -1.848322 0.5986957 0.03038948

, , 3

           [,1]       [,2]      [,3]
[1,] -0.7534977 -0.3102528 -1.089347
[2,] -0.8581792  2.0207069  1.754384

, , 4

            [,1]       [,2]        [,3]
[1,] -1.81074138  0.8243567 -0.36015041
[2,]  0.06445541 -0.7503270  0.04297629

#保留第1、2维度，沿第3维求平均，相当于压平后的平均厚度
> apply(x,c(1,2),mean)
           [,1]        [,2]       [,3]
[1,] -0.3899108 -0.09451043 -0.5773648
[2,] -0.6166249  0.17849788  0.6373230
> 
#mapply-lapply的多元版本
> list(rep(1,4),rep(2,3),rep(3,2),rep(4,1))
[[1]]
[1] 1 1 1 1

[[2]]
[1] 2 2 2

[[3]]
[1] 3 3

[[4]]
[1] 4

> mapply(rep,1:4,4:1)
[[1]]
[1] 1 1 1 1

[[2]]
[1] 2 2 2

[[3]]
[1] 3 3

[[4]]
[1] 4

> 
#写一个函数 
#' Draw random samples from a normal distribution.
#'
#' @param n Number of values to draw.
#' @param mean Mean of the distribution.
#' @param std Standard deviation of the distribution.
#' @return The result of `rnorm()` called with these three arguments.
s <- function(n, mean, std) {
  draws <- rnorm(n = n, mean = mean, sd = std)
  draws
}
#从均值为0，标准差为1的正态分布中抽取4个数
s(4,0,1)

[1]  0.9297812  1.7290848 -0.3481608  2.6755208
>
> mapply(s,1:5,5:1,2)
[[1]]
[1] 8.094877

[[2]]
[1] 5.501669 2.539699

[[3]]
[1] 4.873989 1.890569 2.402905

[[4]]
[1] -0.101477  1.147575  4.736491  3.020118

[[5]]
[1]  1.212230  0.134689 -3.053350  1.293467  8.119817

#tapply 对向量子集进行操作

#runif均匀分布
> x<-c(rnorm(5),runif(5),rnorm(5,1))
> x
[1]  1.35142935  0.26992948 -1.14563221  1.40220826
[5] -1.91542779  0.44899698  0.96529742  0.21443593
[9]  0.02963372  0.68414264  1.65109203  1.52528853
[13]  1.99955166  2.38402247  2.55755724
> f<-gl(3,5)
> f
[1] 1 1 1 1 1 2 2 2 2 2 3 3 3 3 3
Levels: 1 2 3
> tapply(x,f,mean)
1            2            3 
-0.007498582  0.468501338  2.023502386 
> tapply(x,f,mean,simplify = FALSE)
$`1`
[1] -0.007498582

$`2`
[1] 0.4685013

$`3`
[1] 2.023502


#split-根据因子或因子列表将向量或其他对象分组
#通常和lapply一起使用

> x<-c(rnorm(5),runif(5),rnorm(5,1))
> f<-gl(3,5)
> split(x,f)
$`1`
[1] 0.53150178 0.07597644 1.30590674 0.10663172
[5] 1.28516557

$`2`
[1] 0.7860998 0.4483129 0.9338888 0.3238153 0.3262904

$`3`
[1] 1.7431109 3.6205839 2.8010569 2.1896358 0.3421596

> lapply(split(x,f),mean)
$`1`
[1] 0.6610364

$`2`
[1] 0.5636815

$`3`
[1] 2.139309

> library(datasets)
> head(airquality)
Ozone Solar.R Wind Temp Month Day
1    41     190  7.4   67     5   1
2    36     118  8.0   72     5   2
3    12     149 12.6   74     5   3
4    18     313 11.5   62     5   4
5    NA      NA 14.3   56     5   5
6    28      NA 14.9   66     5   6
> s<-split(airquality,airquality$Month)
> s
$`5`
Ozone Solar.R Wind Temp Month Day
1     41     190  7.4   67     5   1
2     36     118  8.0   72     5   2
3     12     149 12.6   74     5   3
4     18     313 11.5   62     5   4
5     NA      NA 14.3   56     5   5
6     28      NA 14.9   66     5   6
7     23     299  8.6   65     5   7
8     19      99 13.8   59     5   8
(以上数据没完--风注)

> table(airquality$Month)

5  6  7  8  9 
31 30 31 31 30 
> lapply(s,function(x) colMeans(x[,c("Ozone","Wind","Timp")]))
Show Traceback

Rerun with Debug
Error in `[.data.frame`(x, , c("Ozone", "Wind", "Timp")) : 
  选择了未定义的列
> #上面的列名"Timp"拼写错误，应为"Temp"
> lapply(s,function(x) colMeans(x[,c("Ozone","Wind","Temp")]))
$`5`
Ozone     Wind     Temp 
NA 11.62258 65.54839 

$`6`
Ozone     Wind     Temp 
NA 10.26667 79.10000 

$`7`
Ozone      Wind      Temp 
NA  8.941935 83.903226 

$`8`
Ozone      Wind      Temp 
NA  8.793548 83.967742 

$`9`
Ozone  Wind  Temp 
NA 10.18 76.90 

> sapply(s,function(x) colMeans(x[,c("Ozone","Wind","Temp")]))
5        6         7         8     9
Ozone       NA       NA        NA        NA    NA
Wind  11.62258 10.26667  8.941935  8.793548 10.18
Temp  65.54839 79.10000 83.903226 83.967742 76.90
>  sapply(s,function(x) colMeans(x[,c("Ozone","Wind","Temp")],na.rm = TRUE))
5        6         7         8        9
Ozone 23.61538 29.44444 59.115385 59.961538 31.44828
Wind  11.62258 10.26667  8.941935  8.793548 10.18000
Temp  65.54839 79.10000 83.903226 83.967742 76.90000



#排序：
#-sort：对向量进行排序，返回排好序的内容
#-order：返回排好序内容的下标，可以指定多个排序标准
> x<-data.frame(v1=1:5,v2=c(10,7,9,6,8),v3=11:15,v4=c(1,1,2,2,1))
> x
  v1 v2 v3 v4
1  1 10 11  1
2  2  7 12  1
3  3  9 13  2
4  4  6 14  2
5  5  8 15  1
> sort(x$v2)
[1]  6  7  8  9 10
> sort(x$v2,decreasing = TRUE)
[1] 10  9  8  7  6

#order返回的是下标
> order(x$v2)
[1] 4 2 5 3 1
> x[order(x$v2),]
  v1 v2 v3 v4
4  4  6 14  2
2  2  7 12  1
5  5  8 15  1
3  3  9 13  2
1  1 10 11  1
> 
#如果遇到两个一样的就再按照第二个条件进行排序
> x[order(x$v4,x$v2),]
  v1 v2 v3 v4
2  2  7 12  1
5  5  8 15  1
1  1 10 11  1
4  4  6 14  2
3  3  9 13  2
>   
  > x[order(x$v4,x$v2,decreasing = TRUE),]
v1 v2 v3 v4
3  3  9 13  2
4  4  6 14  2
1  1 10 11  1
5  5  8 15  1
2  2  7 12  1
>   

#总结数据信息

#头六行
> head(airquality)
  Ozone Solar.R Wind Temp Month Day
1    41     190  7.4   67     5   1
2    36     118  8.0   72     5   2
3    12     149 12.6   74     5   3
4    18     313 11.5   62     5   4
5    NA      NA 14.3   56     5   5
6    28      NA 14.9   66     5   6
#最后六行
> tail(airquality)
    Ozone Solar.R Wind Temp Month Day
148    14      20 16.6   63     9  25
149    30     193  6.9   70     9  26
150    NA     145 13.2   77     9  27
151    14     191 14.3   75     9  28
152    18     131  8.0   76     9  29
153    20     223 11.5   68     9  30
#前10行
> head(airquality,10)
   Ozone Solar.R Wind Temp Month Day
1     41     190  7.4   67     5   1
2     36     118  8.0   72     5   2
3     12     149 12.6   74     5   3
4     18     313 11.5   62     5   4
5     NA      NA 14.3   56     5   5
6     28      NA 14.9   66     5   6
7     23     299  8.6   65     5   7
8     19      99 13.8   59     5   8
9      8      19 20.1   61     5   9
10    NA     194  8.6   69     5  10

#各列的汇总统计
> summary(airquality)
     Ozone           Solar.R           Wind             Temp      
 Min.   :  1.00   Min.   :  7.0   Min.   : 1.700   Min.   :56.00  
 1st Qu.: 18.00   1st Qu.:115.8   1st Qu.: 7.400   1st Qu.:72.00  
 Median : 31.50   Median :205.0   Median : 9.700   Median :79.00  
 Mean   : 42.13   Mean   :185.9   Mean   : 9.958   Mean   :77.88  
 3rd Qu.: 63.25   3rd Qu.:258.8   3rd Qu.:11.500   3rd Qu.:85.00  
 Max.   :168.00   Max.   :334.0   Max.   :20.700   Max.   :97.00  
 NA's   :37       NA's   :7                                       
     Month            Day      
 Min.   :5.000   Min.   : 1.0  
 1st Qu.:6.000   1st Qu.: 8.0  
 Median :7.000   Median :16.0  
 Mean   :6.993   Mean   :15.8  
 3rd Qu.:8.000   3rd Qu.:23.0  
 Max.   :9.000   Max.   :31.0  

> 
> str(airquality)
'data.frame':   153 obs. of  6 variables:
 $ Ozone  : int  41 36 12 18 NA 28 23 19 8 NA ...
 $ Solar.R: int  190 118 149 313 NA NA 299 99 19 194 ...
 $ Wind   : num  7.4 8 12.6 11.5 14.3 14.9 8.6 13.8 20.1 8.6 ...
 $ Temp   : int  67 72 74 62 56 66 65 59 61 69 ...
 $ Month  : int  5 5 5 5 5 5 5 5 5 5 ...
 $ Day    : int  1 2 3 4 5 6 7 8 9 10 ...
>
> table(airquality$Month)

 5  6  7  8  9 
31 30 31 31 30 
> 
> table(airquality$Ozone)

  1   4   6   7   8   9  10  11  12  13  14  16  18  19  20  21  22  23 
  1   1   1   3   1   3   1   3   2   4   4   4   4   1   4   4   1   6 
 24  27  28  29  30  31  32  34  35  36  37  39  40  41  44  45  46  47 
  2   1   3   1   2   1   3   1   2   2   2   2   1   1   3   2   1   1 
 48  49  50  52  59  61  63  64  65  66  71  73  76  77  78  79  80  82 
  1   1   1   1   2   1   1   2   1   1   1   2   1   1   2   1   1   1 
 84  85  89  91  96  97 108 110 115 118 122 135 168 
  1   2   1   1   1   2   1   1   1   1   1   1   1 
> 
> table(airquality$Ozone,useNA = "ifany")

   1    4    6    7    8    9   10   11   12   13   14   16   18   19 
   1    1    1    3    1    3    1    3    2    4    4    4    4    1 
  20   21   22   23   24   27   28   29   30   31   32   34   35   36 
   4    4    1    6    2    1    3    1    2    1    3    1    2    2 
  37   39   40   41   44   45   46   47   48   49   50   52   59   61 
   2    2    1    1    3    2    1    1    1    1    1    1    2    1 
  63   64   65   66   71   73   76   77   78   79   80   82   84   85 
   1    2    1    1    1    2    1    1    2    1    1    1    1    2 
  89   91   96   97  108  110  115  118  122  135  168 <NA> 
   1    1    1    2    1    1    1    1    1    1    1   37 
> 
> table(airquality$Month,airquality$Day)

    1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26
  5 1 1 1 1 1 1 1 1 1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
  6 1 1 1 1 1 1 1 1 1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
  7 1 1 1 1 1 1 1 1 1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
  8 1 1 1 1 1 1 1 1 1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
  9 1 1 1 1 1 1 1 1 1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1

    27 28 29 30 31
  5  1  1  1  1  1
  6  1  1  1  1  0
  7  1  1  1  1  1
  8  1  1  1  1  1
  9  1  1  1  1  0  

  #看看有没有缺失值
> any(is.na(airquality$Ozone))
[1] TRUE  

  > sum(is.na(airquality$Ozone))
  [1] 37
  > all(airquality$Month<12)
  [1] TRUE
  > 
    > Titanic<-as.data.frame(Titanic)
  > head(Titanic)
  Class    Sex   Age Survived Freq
  1   1st   Male Child       No    0
  2   2nd   Male Child       No    0
  3   3rd   Male Child       No   35
  4  Crew   Male Child       No    0
  5   1st Female Child       No    0
  6   2nd Female Child       No    0
  > dim(Titanic)
  [1] 32  5
  > summary(Titanic)
  Class       Sex        Age     Survived      Freq       
  1st :8   Male  :16   Child:16   No :16   Min.   :  0.00  
  2nd :8   Female:16   Adult:16   Yes:16   1st Qu.:  0.75  
  3rd :8                                   Median : 13.50  
  Crew:8                                   Mean   : 68.78  
                                           3rd Qu.: 77.00  
                                           Max.   :670.00  
  > #交叉表
    > xtabs(Freq ~ Class + Age,data = Titanic)
  Age
  Class  Child Adult
  1st      6   319
  2nd     24   261
  3rd     79   627
  Crew     0   885
  > x<-xtabs(Freq ~ Class + Age,data = Titanic)
  > ftable(x)
  Age Child Adult
  Class                
  1st           6   319
  2nd          24   261
  3rd          79   627
  Crew          0   885
  > object.size(airquality)
  5496 bytes
  > print(object.size(airquality),units = "Kb")
  5.4 Kb
  >