#一句话循环-lapply
-可以循环处理列表中的每一个元素
#要善于运用str函数
> str(apply)
function (X, MARGIN, FUN, ...)
> str(lapply)
function (X, FUN, ...)
> x<-list(a=1:10,b=c(11,21,31,41,51))
> x
$a
[1] 1 2 3 4 5 6 7 8 9 10
$b
[1] 11 21 31 41 51
> #求平均
> lapply(x,mean)
$a
[1] 5.5
$b
[1] 31
> x<-1:4
> lapply(x,runif)
[[1]]
[1] 0.2392829
[[2]]
[1] 0.9162620 0.7999029
[[3]]
[1] 0.59626812 0.03595273 0.65593715
[[4]]
[1] 0.8693877 0.3254179 0.3577321 0.7780447
> lapply(x,runif,min=0,max=100)
[[1]]
[1] 18.78005
[[2]]
[1] 89.26312 17.01786
[[3]]
[1] 14.72381 86.13003 39.71868
[[4]]
[1] 48.93485 22.37576 35.54066 57.34798
> x<-list(a=matrix(1:6,2,3),b=matrix(4:7,2,2))
> #匿名函数
> lapply(x,function(m),m[1,])
Error: unexpected ',' in "lapply(x,function(m),"
> lapply(x,function(m) m[1,])
$a
[1] 1 3 5
$b
[1] 4 6
> #sapply
> #可以对lapply的结果进行化简
> x<-list(a=1:10,b=c(11,21,31,41,51))
> lapply(x,mean)
$a
[1] 5.5
$b
[1] 31
> sapply(x,mean)
a b
5.5 31.0
> class(sapply(x,mean))
[1] "numeric"
>
#apply-沿着数组的某一维度处理数据
#效率和for/while一样
> x<-matrix(1:16,4,4)
> x
[,1] [,2] [,3] [,4]
[1,] 1 5 9 13
[2,] 2 6 10 14
[3,] 3 7 11 15
[4,] 4 8 12 16
#求列的平均(MARGIN=2表示按列)
> apply(x,2,mean)
[1] 2.5 6.5 10.5 14.5
#求列和
> apply(x,2,sum)
[1] 10 26 42 58
#类似地,求行的平均与行和(MARGIN=1表示按行)
apply(x,1,mean)
apply(x,1,sum)
#对每行或每列求和或求平均(以下函数比apply更简洁)
rowSums(x)
rowMeans(x)
colSums(x)
colMeans(x)
#rnorm(100)随机从正态分布总体抽100个数据
#quantile求百分位点对应的数据,probs是分位点值
> x<-matrix(rnorm(100),10,10)
> apply(x,1,quantile,probs=c(0.25,0.75))
[,1] [,2] [,3] [,4]
25% -0.328531 -0.78827897 -0.8925568 -0.5913973
75% 1.363568 0.08398574 0.6381586 0.1598917
[,5] [,6] [,7] [,8]
25% -0.2881346 -0.1927514 -0.6403461 -0.6259618
75% 1.0994751 0.8746238 0.2437979 0.6709361
[,9] [,10]
25% -0.8207727 -1.04342393
75% 0.1826279 -0.07395085
>
> x<-array(rnorm(2*3*4),c(2,3,4))
> x
, , 1
[,1] [,2] [,3]
[1,] -0.05156115 -1.302837 -1.2937694
[2,] 0.17554622 -1.155084 0.7215416
, , 2
[,1] [,2] [,3]
[1,] 1.056157 0.4106916 0.43380783
[2,] -1.848322 0.5986957 0.03038948
, , 3
[,1] [,2] [,3]
[1,] -0.7534977 -0.3102528 -1.089347
[2,] -0.8581792 2.0207069 1.754384
, , 4
[,1] [,2] [,3]
[1,] -1.81074138 0.8243567 -0.36015041
[2,] 0.06445541 -0.7503270 0.04297629
#保留第1、2维度,对第3维求平均,相当于压平后的平均厚度
> apply(x,c(1,2),mean)
[,1] [,2] [,3]
[1,] -0.3899108 -0.09451043 -0.5773648
[2,] -0.6166249 0.17849788 0.6373230
>
#mapply-lapply的多元版本
> list(rep(1,4),rep(2,3),rep(3,2),rep(4,1))
[[1]]
[1] 1 1 1 1
[[2]]
[1] 2 2 2
[[3]]
[1] 3 3
[[4]]
[1] 4
> mapply(rep,1:4,4:1)
[[1]]
[1] 1 1 1 1
[[2]]
[1] 2 2 2
[[3]]
[1] 3 3
[[4]]
[1] 4
>
#写一个函数
#' Draw random values from a normal distribution.
#'
#' Thin wrapper around `rnorm()` that forwards its three arguments
#' unchanged, so it can be passed to `mapply()` as a plain function.
#'
#' @param n Number of values to draw (forwarded as `rnorm()`'s `n`).
#' @param mean Mean of the distribution (forwarded as `mean`).
#' @param std Standard deviation of the distribution (forwarded as `sd`).
#' @return The numeric vector produced by `rnorm()`.
s <- function(n, mean, std) {
  # Spell out the argument names so the std -> sd mapping is explicit.
  draws <- rnorm(n = n, mean = mean, sd = std)
  draws
}
#从均值为0,标准差为1的正态分布中抽取4个数
s(4,0,1)
[1] 0.9297812 1.7290848 -0.3481608 2.6755208
>
> mapply(s,1:5,5:1,2)
[[1]]
[1] 8.094877
[[2]]
[1] 5.501669 2.539699
[[3]]
[1] 4.873989 1.890569 2.402905
[[4]]
[1] -0.101477 1.147575 4.736491 3.020118
[[5]]
[1] 1.212230 0.134689 -3.053350 1.293467 8.119817
#tapply 对向量子集进行操作
#runif均匀分布
> x<-c(rnorm(5),runif(5),rnorm(5,1))
> x
[1] 1.35142935 0.26992948 -1.14563221 1.40220826
[5] -1.91542779 0.44899698 0.96529742 0.21443593
[9] 0.02963372 0.68414264 1.65109203 1.52528853
[13] 1.99955166 2.38402247 2.55755724
> f<-gl(3,5)
> f
[1] 1 1 1 1 1 2 2 2 2 2 3 3 3 3 3
Levels: 1 2 3
> tapply(x,f,mean)
1 2 3
-0.007498582 0.468501338 2.023502386
> tapply(x,f,mean,simplify = FALSE)
$`1`
[1] -0.007498582
$`2`
[1] 0.4685013
$`3`
[1] 2.023502
#split-根据因子或因子列表将向量或其他对象分组
#通常和lapply一起使用
> x<-c(rnorm(5),runif(5),rnorm(5,1))
> f<-gl(3,5)
> split(x,f)
$`1`
[1] 0.53150178 0.07597644 1.30590674 0.10663172
[5] 1.28516557
$`2`
[1] 0.7860998 0.4483129 0.9338888 0.3238153 0.3262904
$`3`
[1] 1.7431109 3.6205839 2.8010569 2.1896358 0.3421596
> lapply(split(x,f),mean)
$`1`
[1] 0.6610364
$`2`
[1] 0.5636815
$`3`
[1] 2.139309
> library(datasets)
> head(airquality)
Ozone Solar.R Wind Temp Month Day
1 41 190 7.4 67 5 1
2 36 118 8.0 72 5 2
3 12 149 12.6 74 5 3
4 18 313 11.5 62 5 4
5 NA NA 14.3 56 5 5
6 28 NA 14.9 66 5 6
> s<-split(airquality,airquality$Month)
> s
$`5`
Ozone Solar.R Wind Temp Month Day
1 41 190 7.4 67 5 1
2 36 118 8.0 72 5 2
3 12 149 12.6 74 5 3
4 18 313 11.5 62 5 4
5 NA NA 14.3 56 5 5
6 28 NA 14.9 66 5 6
7 23 299 8.6 65 5 7
8 19 99 13.8 59 5 8
(以上数据没完--风注)
> table(airquality$Month)
5 6 7 8 9
31 30 31 31 30
> lapply(s,function(x) colMeans(x[,c("Ozone","Wind","Timp")]))
Show Traceback
Rerun with Debug
Error in `[.data.frame`(x, , c("Ozone", "Wind", "Timp")) :
选择了未定义的列
> lapply(s,function(x) colMeans(x[,c("Ozone","Wind","Temp")]))
$`5`
Ozone Wind Temp
NA 11.62258 65.54839
$`6`
Ozone Wind Temp
NA 10.26667 79.10000
$`7`
Ozone Wind Temp
NA 8.941935 83.903226
$`8`
Ozone Wind Temp
NA 8.793548 83.967742
$`9`
Ozone Wind Temp
NA 10.18 76.90
> sapply(s,function(x) colMeans(x[,c("Ozone","Wind","Temp")]))
5 6 7 8 9
Ozone NA NA NA NA NA
Wind 11.62258 10.26667 8.941935 8.793548 10.18
Temp 65.54839 79.10000 83.903226 83.967742 76.90
> sapply(s,function(x) colMeans(x[,c("Ozone","Wind","Temp")],na.rm = TRUE))
5 6 7 8 9
Ozone 23.61538 29.44444 59.115385 59.961538 31.44828
Wind 11.62258 10.26667 8.941935 8.793548 10.18000
Temp 65.54839 79.10000 83.903226 83.967742 76.90000
#排序:
#-sort:对向量进行排序,返回排好序的内容
#-order:返回排好序内容的下标,可以指定多个排序标准
> x<-data.frame(v1=1:5,v2=c(10,7,9,6,8),v3=11:15,v4=c(1,1,2,2,1))
> x
v1 v2 v3 v4
1 1 10 11 1
2 2 7 12 1
3 3 9 13 2
4 4 6 14 2
5 5 8 15 1
> sort(x$v2)
[1] 6 7 8 9 10
> sort(x$v2,decreasing = TRUE)
[1] 10 9 8 7 6
#order返回的是下标
> order(x$v2)
[1] 4 2 5 3 1
> x[order(x$v2),]
v1 v2 v3 v4
4 4 6 14 2
2 2 7 12 1
5 5 8 15 1
3 3 9 13 2
1 1 10 11 1
>
#如果遇到两个一样的就再按照第二个条件进行排序
> x[order(x$v4,x$v2),]
v1 v2 v3 v4
2 2 7 12 1
5 5 8 15 1
1 1 10 11 1
4 4 6 14 2
3 3 9 13 2
>
> x[order(x$v4,x$v2,decreasing = TRUE),]
v1 v2 v3 v4
3 3 9 13 2
4 4 6 14 2
1 1 10 11 1
5 5 8 15 1
2 2 7 12 1
>
#总结数据信息
#头六行
> head(airquality)
Ozone Solar.R Wind Temp Month Day
1 41 190 7.4 67 5 1
2 36 118 8.0 72 5 2
3 12 149 12.6 74 5 3
4 18 313 11.5 62 5 4
5 NA NA 14.3 56 5 5
6 28 NA 14.9 66 5 6
#最后六行
> tail(airquality)
Ozone Solar.R Wind Temp Month Day
148 14 20 16.6 63 9 25
149 30 193 6.9 70 9 26
150 NA 145 13.2 77 9 27
151 14 191 14.3 75 9 28
152 18 131 8.0 76 9 29
153 20 223 11.5 68 9 30
#前10行
> head(airquality,10)
Ozone Solar.R Wind Temp Month Day
1 41 190 7.4 67 5 1
2 36 118 8.0 72 5 2
3 12 149 12.6 74 5 3
4 18 313 11.5 62 5 4
5 NA NA 14.3 56 5 5
6 28 NA 14.9 66 5 6
7 23 299 8.6 65 5 7
8 19 99 13.8 59 5 8
9 8 19 20.1 61 5 9
10 NA 194 8.6 69 5 10
#summary:查看每一列的概要统计信息
> summary(airquality)
Ozone Solar.R Wind Temp
Min. : 1.00 Min. : 7.0 Min. : 1.700 Min. :56.00
1st Qu.: 18.00 1st Qu.:115.8 1st Qu.: 7.400 1st Qu.:72.00
Median : 31.50 Median :205.0 Median : 9.700 Median :79.00
Mean : 42.13 Mean :185.9 Mean : 9.958 Mean :77.88
3rd Qu.: 63.25 3rd Qu.:258.8 3rd Qu.:11.500 3rd Qu.:85.00
Max. :168.00 Max. :334.0 Max. :20.700 Max. :97.00
NA's :37 NA's :7
Month Day
Min. :5.000 Min. : 1.0
1st Qu.:6.000 1st Qu.: 8.0
Median :7.000 Median :16.0
Mean :6.993 Mean :15.8
3rd Qu.:8.000 3rd Qu.:23.0
Max. :9.000 Max. :31.0
>
> str(airquality)
'data.frame': 153 obs. of 6 variables:
$ Ozone : int 41 36 12 18 NA 28 23 19 8 NA ...
$ Solar.R: int 190 118 149 313 NA NA 299 99 19 194 ...
$ Wind : num 7.4 8 12.6 11.5 14.3 14.9 8.6 13.8 20.1 8.6 ...
$ Temp : int 67 72 74 62 56 66 65 59 61 69 ...
$ Month : int 5 5 5 5 5 5 5 5 5 5 ...
$ Day : int 1 2 3 4 5 6 7 8 9 10 ...
>
> table(airquality$Month)
5 6 7 8 9
31 30 31 31 30
>
> table(airquality$Ozone)
1 4 6 7 8 9 10 11 12 13 14 16 18 19 20 21 22 23
1 1 1 3 1 3 1 3 2 4 4 4 4 1 4 4 1 6
24 27 28 29 30 31 32 34 35 36 37 39 40 41 44 45 46 47
2 1 3 1 2 1 3 1 2 2 2 2 1 1 3 2 1 1
48 49 50 52 59 61 63 64 65 66 71 73 76 77 78 79 80 82
1 1 1 1 2 1 1 2 1 1 1 2 1 1 2 1 1 1
84 85 89 91 96 97 108 110 115 118 122 135 168
1 2 1 1 1 2 1 1 1 1 1 1 1
>
> table(airquality$Ozone,useNA = "ifany")
1 4 6 7 8 9 10 11 12 13 14 16 18 19
1 1 1 3 1 3 1 3 2 4 4 4 4 1
20 21 22 23 24 27 28 29 30 31 32 34 35 36
4 4 1 6 2 1 3 1 2 1 3 1 2 2
37 39 40 41 44 45 46 47 48 49 50 52 59 61
2 2 1 1 3 2 1 1 1 1 1 1 2 1
63 64 65 66 71 73 76 77 78 79 80 82 84 85
1 2 1 1 1 2 1 1 2 1 1 1 1 2
89 91 96 97 108 110 115 118 122 135 168 <NA>
1 1 1 2 1 1 1 1 1 1 1 37
>
> table(airquality$Month,airquality$Day)
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26
5 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
6 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
7 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
8 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
9 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
27 28 29 30 31
5 1 1 1 1 1
6 1 1 1 1 0
7 1 1 1 1 1
8 1 1 1 1 1
9 1 1 1 1 0
#看看有没有缺失值
> any(is.na(airquality$Ozone))
[1] TRUE
> sum(is.na(airquality$Ozone))
[1] 37
> all(airquality$Month<12)
[1] TRUE
>
> Titanic<-as.data.frame(Titanic)
> head(Titanic)
Class Sex Age Survived Freq
1 1st Male Child No 0
2 2nd Male Child No 0
3 3rd Male Child No 35
4 Crew Male Child No 0
5 1st Female Child No 0
6 2nd Female Child No 0
> dim(Titanic)
[1] 32 5
> summary(Titanic)
Class Sex Age Survived Freq
1st :8 Male :16 Child:16 No :16 Min. : 0.00
2nd :8 Female:16 Adult:16 Yes:16 1st Qu.: 0.75
3rd :8 Median : 13.50
Crew:8 Mean : 68.78
3rd Qu.: 77.00
Max. :670.00
> #交叉表
> xtabs(Freq ~ Class + Age,data = Titanic)
Age
Class Child Adult
1st 6 319
2nd 24 261
3rd 79 627
Crew 0 885
> x<-xtabs(Freq ~ Class + Age,data = Titanic)
> ftable(x)
Age Child Adult
Class
1st 6 319
2nd 24 261
3rd 79 627
Crew 0 885
> object.size(airquality)
5496 bytes
> print(object.size(airquality),units = "Kb")
5.4 Kb
>
R 语言学习笔记三 : 操纵数据-重要函数的使用
最新推荐文章于 2023-10-11 16:37:36 发布