1.构建子集(subsetting)
原始数据(raw dataset)->预处理后的数据(clean dataset)
基本方法:
- []:提取一个或多个类型相同的元素
- [[]]:从列表或者数据框中提取元素
- $:按名字从列表或数据框中提取元素。
(1)
//向量的子集
> x <- 1:10
> x[1]
[1] 1
> x[5]
[1] 5
> x[1:5]
[1] 1 2 3 4 5
> x[x>5]
[1] 6 7 8 9 10
> x>5
[1] FALSE FALSE FALSE FALSE FALSE TRUE TRUE TRUE TRUE TRUE
> x[x>5 & x<7]
[1] 6
> x[x<3 | x>7]
[1] 1 2 8 9 10
> y <- 1:4
> y
[1] 1 2 3 4
> names(y) <- c("a","b","c","d")
> y
a b c d
1 2 3 4
> y[2]
b
2
> y["b"]
b
2
>
//矩阵的子集
> x <- matrix(1:6, nrow = 2, ncol = 3)
> x
[,1] [,2] [,3]
[1,] 1 3 5
[2,] 2 4 6
> x[1,2] //拿一个元素
[1] 3
> x[1,] //拿一行元素
[1] 1 3 5
> x[,1] //拿一列元素
[1] 1 2
> x[2,c(1,3)]
[1] 2 6
> class(x[1,2])
[1] "integer"
> x[1,2, drop = FALSE] //drop = FALSE 使取出的元素仍保持矩阵类型(不会降维成向量)
[,1]
[1,] 3
//数据框的子集
> x <- data.frame(v1=1:5, v2=6:10, v3=11:15)
> x
v1 v2 v3
1 1 6 11
2 2 7 12
3 3 8 13
4 4 9 14
5 5 10 15
> x$v3[c(2,4)] <- NA
> x
v1 v2 v3
1 1 6 11
2 2 7 NA
3 3 8 13
4 4 9 NA
5 5 10 15
> x[,2]
[1] 6 7 8 9 10
> x[,"v2"]
[1] 6 7 8 9 10
> x[(x$v1<4 & x$v2>=8),]
v1 v2 v3
3 3 8 13
> x[(x$v1<4 | x$v2>=8),]
v1 v2 v3
1 1 6 11
2 2 7 NA
3 3 8 13
4 4 9 NA
5 5 10 15
> x[x$v1>2,]
v1 v2 v3
3 3 8 13
4 4 9 NA
5 5 10 15
> x[which(x$v1>2),]
v1 v2 v3
3 3 8 13
4 4 9 NA
5 5 10 15
> which(x$v1>2)
[1] 3 4 5
> x$v1>2
[1] FALSE FALSE TRUE TRUE TRUE
> subset(x,x$v1>2)
v1 v2 v3
3 3 8 13
4 4 9 NA
5 5 10 15
(2)列表的子集
- [[]] /$ / [[]][] / [[]][[]]
- 嵌套列表/不完全匹配(partial matching)
> x <- list(id = 1:4, height = 170, gender = "male")//创建一个列表
> x
$id
[1] 1 2 3 4
$height
[1] 170
$gender
[1] "male"
> x[1]//取列表中的第一个元素
$id
[1] 1 2 3 4
> x["id"]
$id
[1] 1 2 3 4
//只取第一个元素的内容
> x[[1]]
[1] 1 2 3 4
> x[["id"]]
[1] 1 2 3 4
> x$id
[1] 1 2 3 4
>
> x[c(1,3)]//取列表中的第一个和第三个元素
$id
[1] 1 2 3 4
$gender
[1] "male"
>
> y <- "id"
> x[["id"]]
[1] 1 2 3 4
> x[[y]]//通过y来指代
[1] 1 2 3 4
>
>
> x$id
[1] 1 2 3 4
> x$y//这种方法不适用:$后面只能写字面上的名字,不会把变量y替换成"id",列表中没有名为y的元素,所以返回NULL
NULL
>
> x <- list(a=list(1,2,3,4), b=c("Monday","Tuesday"))//嵌套列表
> x
$a
$a[[1]]
[1] 1
$a[[2]]
[1] 2
$a[[3]]
[1] 3
$a[[4]]
[1] 4
$b
[1] "Monday" "Tuesday"
> x[[1]]
[[1]]
[1] 1
[[2]]
[1] 2
[[3]]
[1] 3
[[4]]
[1] 4
> x[[1]][[2]]//第一个里面的第二个元素内容
[1] 2
> x[[1]][2]//第一个元素中的第二个元素(用[]提取,结果仍然是一个列表)
[[1]]
[1] 2
>
>
> x[[c(1,3)]]//第一个里面的第三个
[1] 3
> x[[c(2,2)]]
[1] "Tuesday"
>
>
> //不完全匹配
> l <- list(sddfg = 1:10)
> l
$sddfg
[1] 1 2 3 4 5 6 7 8 9 10
> l$sddfg
[1] 1 2 3 4 5 6 7 8 9 10
> l$a
NULL
> l$s
[1] 1 2 3 4 5 6 7 8 9 10
> l[["s", exact = FALSE]]
[1] 1 2 3 4 5 6 7 8 9 10
(3)如何处理缺失值(missing value)
> x <- c(1, NA, 2, NA, 3)
> is.na(x)
[1] FALSE TRUE FALSE TRUE FALSE
> x[!is.na(x)]
[1] 1 2 3
> x <- c(1, NA, 2, NA, 3)
> y <- c("a","b",NA,"c",NA)
> z <- complete.cases(x,y)//标记x和y在同一位置上都不是缺失值的元素
> z
[1] TRUE FALSE FALSE FALSE FALSE
>
> x[z]//分别查看x和y中对应位置都不是缺失值的值
[1] 1
> y[z]
[1] "a"
> library(datasets)//加载一个数据集
> head(airquality)//查看数据集的前6行
Ozone Solar.R Wind Temp Month Day
1 41 190 7.4 67 5 1
2 36 118 8.0 72 5 2
3 12 149 12.6 74 5 3
4 18 313 11.5 62 5 4
5 NA NA 14.3 56 5 5
6 28 NA 14.9 66 5 6
> g <- complete.cases(airquality)//标记不含缺失值的行(TRUE表示该行没有缺失值)
> g
[1] TRUE TRUE TRUE TRUE FALSE FALSE TRUE TRUE TRUE FALSE FALSE TRUE
[13] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[25] FALSE FALSE FALSE TRUE TRUE TRUE TRUE FALSE FALSE FALSE FALSE FALSE
[37] FALSE TRUE FALSE TRUE TRUE FALSE FALSE TRUE FALSE FALSE TRUE TRUE
[49] TRUE TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[61] FALSE TRUE TRUE TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE FALSE
[73] TRUE TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE FALSE
[85] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE
[97] FALSE FALSE TRUE TRUE TRUE FALSE FALSE TRUE TRUE TRUE FALSE TRUE
[109] TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE TRUE TRUE FALSE TRUE
[121] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[133] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[145] TRUE TRUE TRUE TRUE TRUE FALSE TRUE TRUE TRUE
> airquality[g,][1:10,]//选择数据集中不存在缺失值的行,列全要;查看1到10行,列全要。
Ozone Solar.R Wind Temp Month Day
1 41 190 7.4 67 5 1
2 36 118 8.0 72 5 2
3 12 149 12.6 74 5 3
4 18 313 11.5 62 5 4
7 23 299 8.6 65 5 7
8 19 99 13.8 59 5 8
9 8 19 20.1 61 5 9
12 16 256 9.7 69 5 12
13 11 290 9.2 66 5 13
14 14 274 10.9 68 5 14
(4)向量化操作
——可以作用于向量、矩阵等结构,使得代码简洁、易于阅读、效率高
向量、矩阵的加减乘除(+、-、*、/)都是逐元素进行的,即对应位置的元素分别做运算;如果想计算真正的矩阵乘法,需要使用运算符 %*%
2.函数
(1)循环
- R不仅有for/while循环语句,还有更强大的实现循环的“一句话”函数:
- lapply:
- lapply可以循环处理列表中的每一个元素
- lapply(参数):lapply(列表,函数/函数名,其他参数)
总是返回一个列表
- sapply:简化lapply的结果
- (1)结果列表元素长度均为1,返回向量
- (2)结果列表元素长度相同且大于1,返回矩阵
- (3)其他情况(无法简化),仍然返回列表
x <- list(a=1:10, b=c(11,21,31,41,51))
x
lapply(x,mean)
x <- 1:4
lapply(x, runif)
lapply(x, runif, min=0, max=100)
x <- list(a=matrix(1:6,2,3), b=matrix(4:7,2,2))
lapply(x, function(m) m[1,])
#sapply
x <- list(a=1:10, b=c(11,21,31,41,51))
x
lapply(x,mean)
sapply(x,mean)
//执行步骤:
> str(lapply)//查看一个函数的结构(参数列表)
function (X, FUN, ...)
> x <- list(a=1:10, b=c(11,21,31,41,51))
> x
$a
[1] 1 2 3 4 5 6 7 8 9 10
$b
[1] 11 21 31 41 51
> lapply(x,mean)
$a
[1] 5.5
$b
[1] 31
> x <- 1:4
> lapply(x, runif)//runif(n)生成n个0到1之间的均匀分布随机数,这里分别生成1、2、3、4个
[[1]]
[1] 0.8024411
[[2]]
[1] 0.3922546 0.6929949
[[3]]
[1] 0.64910476 0.06124001 0.45324513
[[4]]
[1] 0.01928596 0.86259091 0.67297106 0.98231294
> lapply(x, runif, min=0, max=100)
[[1]]
[1] 34.09794
[[2]]
[1] 20.99846 45.18515
[[3]]
[1] 19.148935 81.885369 5.879639
[[4]]
[1] 18.60201 53.44052 27.06450 15.64718
> x <- list(a=matrix(1:6,2,3), b=matrix(4:7,2,2))
> lapply(x, function(m) m[1,])//自定义函数,求矩阵的第一行
$a
[1] 1 3 5
$b
[1] 4 6
> #sapply//简化结果
> x <- list(a=1:10, b=c(11,21,31,41,51))
> x
$a
[1] 1 2 3 4 5 6 7 8 9 10
$b
[1] 11 21 31 41 51
> lapply(x,mean)
$a
[1] 5.5
$b
[1] 31
> sapply(x,mean)
a b
5.5 31.0
> class(sapply(x,mean))
[1] "numeric"
- apply:
- 沿着数组的某一维度处理数据
- (1)例如:将函数用于矩阵的行或者列
- (2)虽然与for/while循环的效率相似,但是只用一句话就可以完成
- apply(参数):apply(数组,维度,函数/函数名)
x <- matrix(1:16,4,4)
> x
[,1] [,2] [,3] [,4]
[1,] 1 5 9 13
[2,] 2 6 10 14
[3,] 3 7 11 15
[4,] 4 8 12 16
> apply(x,2,mean)//2代表列
[1] 2.5 6.5 10.5 14.5
> apply(x,2,sum)
[1] 10 26 42 58
> sum
function (..., na.rm = FALSE) .Primitive("sum")
>
>
> apply(x,1,mean)//1代表行
[1] 7 8 9 10
> apply(x,1,sum)
[1] 28 32 36 40
>
rowSums(x)
rowMeans(x)
colSums(x)
colMeans(x)//更简便的算行列和以及平均数的函数
> x <- matrix(rnorm(100),10,10)
> apply(x, 1, quantile, probs=c(0.25,0.75))//算分位数
[,1] [,2] [,3] [,4] [,5] [,6]
25% 0.06054315 -0.8796558 -0.6019438 -0.3698089 -0.5951642 -0.80187646
75% 0.74679590 0.3931769 0.2459020 0.6931527 0.3820894 0.08229792
[,7] [,8] [,9] [,10]
25% -0.5985797 -0.1538297 -0.8994844 -1.202065
75% 0.7508944 0.8694427 0.8951599 -0.278875
> x
[,1] [,2] [,3] [,4] [,5] [,6]
[1,] 0.7943604 0.03776428 -1.30135026 0.1523084 0.12887977 1.8300554
[2,] 0.5349808 -0.77551826 -0.03223475 2.8928050 -0.38175344 -0.3784332
[3,] -2.5381823 1.06850374 -0.26883696 0.3875790 -0.65437178 0.8733678
[4,] 0.7108435 0.64008029 -2.75786167 -0.2609761 0.74268723 -0.6941522
[5,] 0.3862663 -0.65398901 -1.00904319 -0.3064571 -0.41868968 -1.3247414
[6,] 0.3072321 -0.67854760 -1.52100130 -0.8338515 -0.04192732 0.1019094
[7,] -0.7110167 0.99272860 0.49472202 0.3522931 -0.89273593 -0.1111968
[8,] 0.8180394 0.24945472 0.79362529 -0.3022865 0.88657718 -0.2078542
[9,] -0.5296272 0.10399219 -1.13527860 1.5599123 -1.32145044 2.1397333
[10,] -0.9080016 0.90191879 -0.11167784 -0.6201164 -1.26570093 -2.0554333
[,7] [,8] [,9] [,10]
[1,] 2.1890150 0.2225240 -1.097342438 0.604102489
[2,] -1.0996727 -1.4860049 -0.914368314 0.662635948
[3,] -0.1958515 -1.6066377 -0.179128701 -0.444659977
[4,] 0.5083954 0.8595397 -0.008981572 -0.406086520
[5,] -0.0432980 1.9158199 0.369558390 1.902044999
[6,] -0.7059515 -1.2839743 1.956473298 0.023463385
[7,] -0.2612688 1.1744811 0.836285136 -0.985409256
[8,] 1.1911508 3.4116767 -0.689197813 0.008243826
[9,] 1.0691268 -0.8876152 -0.903440765 0.373259033
[10,] -0.6666487 -1.0111572 -2.403407215 -0.165127814
> x <- array(rnorm(2*3*4), c(2,3,4))//处理多维数据
> apply(x,c(1,2),mean)
[,1] [,2] [,3]
[1,] -0.3919501 -0.6118854 -0.079962927
[2,] -0.3715698 -0.0623516 0.007735163
> apply(x,c(1,3),mean)
[,1] [,2] [,3] [,4]
[1,] -0.8401368 0.2964799 -0.8814463 -0.01996122
[2,] 0.3391606 -0.7196667 0.2350666 -0.42280886
- mapply:
- lapply的多元版本
- mapply(参数):mapply(函数/函数名,数据,函数相关的参数)
> list(rep(1,4),rep(2,3),rep(3,2),rep(4,1))
[[1]]
[1] 1 1 1 1
[[2]]
[1] 2 2 2
[[3]]
[1] 3 3
[[4]]
[1] 4
> mapply(rep, 1:4, 4:1)
[[1]]
[1] 1 1 1 1
[[2]]
[1] 2 2 2
[[3]]
[1] 3 3
[[4]]
[1] 4
s <- function(n, mean, std){
rnorm(n, mean, std)
}
#从均值为mean、标准差为std的正态分布中抽取n个随机数
> s <- function(n, mean, std){
+ rnorm(n, mean, std)
+ }
> s(4,0,1)
[1] 0.4510066 1.0603416 -1.7950502 -0.1977936
> mapply(s, 1:5, 5:1, 2)
[[1]]
[1] 3.891429
[[2]]
[1] 3.411856 4.860403
[[3]]
[1] 4.648763 1.810501 1.254752
[[4]]
[1] 4.330916 4.414443 5.379381 1.924920
[[5]]
[1] 0.3902404 0.5769118 2.1168463 -0.2346868 1.2657041
- tapply:
- (1)对向量的子集进行操作
- (2)tapply(参数):tapply(向量,因子/因子列表,函数/函数名)
#5个标准正态分布(均值为0,标准差为1)随机数,5个均匀分布随机数,5个均值为1、标准差为1的正态分布随机数
x <- c(rnorm(5), runif(5), rnorm(5,1))
#因子,3个水平,每个水平下有五个元素
f <- gl(3,5)
tapply(x,f,mean)
tapply(x,f,mean, simplify=FALSE)
//实现步骤
> x <- c(rnorm(5), runif(5), rnorm(5,1))
> x
[1] 0.62053059 -0.56823581 -0.57007814 -1.15549512 1.30405929 0.58582563
[7] 0.32466797 0.61626307 0.11666598 0.29560003 1.80040297 1.15604482
[13] 0.48488965 0.78446984 0.06004225
> f <- gl(3,5)
> f
[1] 1 1 1 1 1 2 2 2 2 2 3 3 3 3 3
Levels: 1 2 3
> tapply(x,f,mean)
1 2 3
-0.07384384 0.38780454 0.85716990
> tapply(x,f,mean, simplify=FALSE)
$`1`
[1] -0.07384384
$`2`
[1] 0.3878045
$`3`
[1] 0.8571699
- split
- (1)根据因子或者因子列表将向量或其他对象分组
- (2)通常与lapply一起使用
- (3)split(参数):split(向量/列表/数据框,因子/因子列表)
x <- c(rnorm(5), runif(5), rnorm(5,1))
x
f <- gl(3,5)
split(x,f)
lapply(split(x,f),mean)
head(airquality)
#按月份查看:
s <- split(airquality, airquality$Month)
#查看有几个月,每个月包含几个记录
table(airquality$Month)
#用lapply计算每一个月臭氧、风速、温度的平均值
lapply(s, function(x) colMeans(x[,c("Ozone","Wind","Temp")]))
sapply(s, function(x) colMeans(x[,c("Ozone","Wind","Temp")]))
sapply(s, function(x) colMeans(x[,c("Ozone","Wind","Temp")],na.rm = TRUE))
//执行步骤:
> x <- c(rnorm(5), runif(5), rnorm(5,1))
> x
[1] 1.23662119 0.67641029 -0.70324640 -0.09581818 -0.48951465 0.07295452
[7] 0.53722799 0.94607521 0.68805128 0.97326122 -0.45511553 0.99704495
[13] 1.18909835 0.76038237 2.39615395
> f <- gl(3,5)
> split(x,f)
$`1`
[1] 1.23662119 0.67641029 -0.70324640 -0.09581818 -0.48951465
$`2`
[1] 0.07295452 0.53722799 0.94607521 0.68805128 0.97326122
$`3`
[1] -0.4551155 0.9970449 1.1890984 0.7603824 2.3961540
> lapply(split(x,f),mean)
$`1`
[1] 0.1248905
$`2`
[1] 0.643514
$`3`
[1] 0.9775128
> head(airquality)
Ozone Solar.R Wind Temp Month Day
1 41 190 7.4 67 5 1
2 36 118 8.0 72 5 2
3 12 149 12.6 74 5 3
4 18 313 11.5 62 5 4
5 NA NA 14.3 56 5 5
6 28 NA 14.9 66 5 6
> s <- split(airquality, airquality$Month)
> s
$`5`
Ozone Solar.R Wind Temp Month Day
1 41 190 7.4 67 5 1
2 36 118 8.0 72 5 2
3 12 149 12.6 74 5 3
4 18 313 11.5 62 5 4
5 NA NA 14.3 56 5 5
6 28 NA 14.9 66 5 6
7 23 299 8.6 65 5 7
8 19 99 13.8 59 5 8
9 8 19 20.1 61 5 9
10 NA 194 8.6 69 5 10
11 7 NA 6.9 74 5 11
12 16 256 9.7 69 5 12
13 11 290 9.2 66 5 13
14 14 274 10.9 68 5 14
15 18 65 13.2 58 5 15
16 14 334 11.5 64 5 16
17 34 307 12.0 66 5 17
18 6 78 18.4 57 5 18
19 30 322 11.5 68 5 19
20 11 44 9.7 62 5 20
21 1 8 9.7 59 5 21
22 11 320 16.6 73 5 22
23 4 25 9.7 61 5 23
24 32 92 12.0 61 5 24
25 NA 66 16.6 57 5 25
26 NA 266 14.9 58 5 26
27 NA NA 8.0 57 5 27
28 23 13 12.0 67 5 28
29 45 252 14.9 81 5 29
30 115 223 5.7 79 5 30
31 37 279 7.4 76 5 31
$`6`
Ozone Solar.R Wind Temp Month Day
32 NA 286 8.6 78 6 1
33 NA 287 9.7 74 6 2
34 NA 242 16.1 67 6 3
35 NA 186 9.2 84 6 4
36 NA 220 8.6 85 6 5
37 NA 264 14.3 79 6 6
38 29 127 9.7 82 6 7
39 NA 273 6.9 87 6 8
40 71 291 13.8 90 6 9
41 39 323 11.5 87 6 10
42 NA 259 10.9 93 6 11
43 NA 250 9.2 92 6 12
44 23 148 8.0 82 6 13
45 NA 332 13.8 80 6 14
46 NA 322 11.5 79 6 15
47 21 191 14.9 77 6 16
48 37 284 20.7 72 6 17
49 20 37 9.2 65 6 18
50 12 120 11.5 73 6 19
51 13 137 10.3 76 6 20
52 NA 150 6.3 77 6 21
53 NA 59 1.7 76 6 22
54 NA 91 4.6 76 6 23
55 NA 250 6.3 76 6 24
56 NA 135 8.0 75 6 25
57 NA 127 8.0 78 6 26
58 NA 47 10.3 73 6 27
59 NA 98 11.5 80 6 28
60 NA 31 14.9 77 6 29
61 NA 138 8.0 83 6 30
$`7`
Ozone Solar.R Wind Temp Month Day
62 135 269 4.1 84 7 1
63 49 248 9.2 85 7 2
64 32 236 9.2 81 7 3
65 NA 101 10.9 84 7 4
66 64 175 4.6 83 7 5
67 40 314 10.9 83 7 6
68 77 276 5.1 88 7 7
69 97 267 6.3 92 7 8
70 97 272 5.7 92 7 9
71 85 175 7.4 89 7 10
72 NA 139 8.6 82 7 11
73 10 264 14.3 73 7 12
74 27 175 14.9 81 7 13
75 NA 291 14.9 91 7 14
76 7 48 14.3 80 7 15
77 48 260 6.9 81 7 16
78 35 274 10.3 82 7 17
79 61 285 6.3 84 7 18
80 79 187 5.1 87 7 19
81 63 220 11.5 85 7 20
82 16 7 6.9 74 7 21
83 NA 258 9.7 81 7 22
84 NA 295 11.5 82 7 23
85 80 294 8.6 86 7 24
86 108 223 8.0 85 7 25
87 20 81 8.6 82 7 26
88 52 82 12.0 86 7 27
89 82 213 7.4 88 7 28
90 50 275 7.4 86 7 29
91 64 253 7.4 83 7 30
92 59 254 9.2 81 7 31
$`8`
Ozone Solar.R Wind Temp Month Day
93 39 83 6.9 81 8 1
94 9 24 13.8 81 8 2
95 16 77 7.4 82 8 3
96 78 NA 6.9 86 8 4
97 35 NA 7.4 85 8 5
98 66 NA 4.6 87 8 6
99 122 255 4.0 89 8 7
100 89 229 10.3 90 8 8
101 110 207 8.0 90 8 9
102 NA 222 8.6 92 8 10
103 NA 137 11.5 86 8 11
104 44 192 11.5 86 8 12
105 28 273 11.5 82 8 13
106 65 157 9.7 80 8 14
107 NA 64 11.5 79 8 15
108 22 71 10.3 77 8 16
109 59 51 6.3 79 8 17
110 23 115 7.4 76 8 18
111 31 244 10.9 78 8 19
112 44 190 10.3 78 8 20
113 21 259 15.5 77 8 21
114 9 36 14.3 72 8 22
115 NA 255 12.6 75 8 23
116 45 212 9.7 79 8 24
117 168 238 3.4 81 8 25
118 73 215 8.0 86 8 26
119 NA 153 5.7 88 8 27
120 76 203 9.7 97 8 28
121 118 225 2.3 94 8 29
122 84 237 6.3 96 8 30
123 85 188 6.3 94 8 31
$`9`
Ozone Solar.R Wind Temp Month Day
124 96 167 6.9 91 9 1
125 78 197 5.1 92 9 2
126 73 183 2.8 93 9 3
127 91 189 4.6 93 9 4
128 47 95 7.4 87 9 5
129 32 92 15.5 84 9 6
130 20 252 10.9 80 9 7
131 23 220 10.3 78 9 8
132 21 230 10.9 75 9 9
133 24 259 9.7 73 9 10
134 44 236 14.9 81 9 11
135 21 259 15.5 76 9 12
136 28 238 6.3 77 9 13
137 9 24 10.9 71 9 14
138 13 112 11.5 71 9 15
139 46 237 6.9 78 9 16
140 18 224 13.8 67 9 17
141 13 27 10.3 76 9 18
142 24 238 10.3 68 9 19
143 16 201 8.0 82 9 20
144 13 238 12.6 64 9 21
145 23 14 9.2 71 9 22
146 36 139 10.3 81 9 23
147 7 49 10.3 69 9 24
148 14 20 16.6 63 9 25
149 30 193 6.9 70 9 26
150 NA 145 13.2 77 9 27
151 14 191 14.3 75 9 28
152 18 131 8.0 76 9 29
153 20 223 11.5 68 9 30
> table(airquality$Month)
5 6 7 8 9
31 30 31 31 30
> lapply(s, function(x) colMeans(x[,c("Ozone","Wind","Temp")]))
$`5`
Ozone Wind Temp
NA 11.62258 65.54839
$`6`
Ozone Wind Temp
NA 10.26667 79.10000
$`7`
Ozone Wind Temp
NA 8.941935 83.903226
$`8`
Ozone Wind Temp
NA 8.793548 83.967742
$`9`
Ozone Wind Temp
NA 10.18 76.90
> sapply(s, function(x) colMeans(x[,c("Ozone","Wind","Temp")]))
5 6 7 8 9
Ozone NA NA NA NA NA
Wind 11.62258 10.26667 8.941935 8.793548 10.18
Temp 65.54839 79.10000 83.903226 83.967742 76.90
> sapply(s, function(x) colMeans(x[,c("Ozone","Wind","Temp")],na.rm = TRUE))//拿掉缺失值
5 6 7 8 9
Ozone 23.61538 29.44444 59.115385 59.961538 31.44828
Wind 11.62258 10.26667 8.941935 8.793548 10.18000
Temp 65.54839 79.10000 83.903226 83.967742 76.90000
(2)排序
- sort:
- 对向量进行排序;返回排好序的内容
- order
- 返回排好序的内容的下标;可以同时指定多个排序标准
x <- data.frame(v1=1:5, v2=c(10,7,9,6,8), v3=11:15, v4=c(1,1,2,2,1))
x
#对数据框中v2进行排序
sort(x$v2)
sort(x$v2, decreasing = TRUE)
order(x$v2)
x[order(x$v2),]
#先对v4进行排序,如果遇到两个一样的元素,再按v2进行排序
x[order(x$v4,x$v2),]
x[order(x$v4,x$v2,decreasing = TRUE),]
//执行步骤:
> x <- data.frame(v1=1:5, v2=c(10,7,9,6,8), v3=11:15, v4=c(1,1,2,2,1))
> x
v1 v2 v3 v4
1 1 10 11 1
2 2 7 12 1
3 3 9 13 2
4 4 6 14 2
5 5 8 15 1
> sort(x$v2)
[1] 6 7 8 9 10
> sort(x$v2, decreasing = TRUE)
[1] 10 9 8 7 6
> order(x$v2)
[1] 4 2 5 3 1
> x[order(x$v2),]
v1 v2 v3 v4
4 4 6 14 2
2 2 7 12 1
5 5 8 15 1
3 3 9 13 2
1 1 10 11 1
> x[order(x$v4,x$v2),]
v1 v2 v3 v4
2 2 7 12 1
5 5 8 15 1
1 1 10 11 1
4 4 6 14 2
3 3 9 13 2
> x[order(x$v4,x$v2,decreasing = TRUE),]
v1 v2 v3 v4
3 3 9 13 2
4 4 6 14 2
1 1 10 11 1
5 5 8 15 1
2 2 7 12 1
(3)总结数据信息
#查看前六行
head(airquality)
#看后六行
tail(airquality)
#自定义行数
head(airquality,10)
summary(airquality)
str(airquality)
table(airquality$Month)
table(airquality$Ozone, useNA = "ifany")
table(airquality$Month,airquality$Day)
#判断是否有缺失值
any(is.na(airquality$Ozone))
sum(is.na(airquality$Ozone))
#判断月份是不是都小于12
all(airquality$Month<12)
#新的一个数据
titanic <- as.data.frame(Titanic)
head(titanic)
dim(titanic)
summary(titanic)
#一个新的表
x <- xtabs(Freq ~ Class + Age, data=titanic)
ftable(x)
#了解我们的数据有多大
object.size(airquality)
print(object.size(airquality), units = "Kb")
//执行步骤
> #查看前六行
> head(airquality)
Ozone Solar.R Wind Temp Month Day
1 41 190 7.4 67 5 1
2 36 118 8.0 72 5 2
3 12 149 12.6 74 5 3
4 18 313 11.5 62 5 4
5 NA NA 14.3 56 5 5
6 28 NA 14.9 66 5 6
> #看后六行
> tail(airquality)
Ozone Solar.R Wind Temp Month Day
148 14 20 16.6 63 9 25
149 30 193 6.9 70 9 26
150 NA 145 13.2 77 9 27
151 14 191 14.3 75 9 28
152 18 131 8.0 76 9 29
153 20 223 11.5 68 9 30
> head(airquality,10)
Ozone Solar.R Wind Temp Month Day
1 41 190 7.4 67 5 1
2 36 118 8.0 72 5 2
3 12 149 12.6 74 5 3
4 18 313 11.5 62 5 4
5 NA NA 14.3 56 5 5
6 28 NA 14.9 66 5 6
7 23 299 8.6 65 5 7
8 19 99 13.8 59 5 8
9 8 19 20.1 61 5 9
10 NA 194 8.6 69 5 10
> summary(airquality)
Ozone Solar.R Wind Temp
Min. : 1.00 Min. : 7.0 Min. : 1.700 Min. :56.00
1st Qu.: 18.00 1st Qu.:115.8 1st Qu.: 7.400 1st Qu.:72.00
Median : 31.50 Median :205.0 Median : 9.700 Median :79.00
Mean : 42.13 Mean :185.9 Mean : 9.958 Mean :77.88
3rd Qu.: 63.25 3rd Qu.:258.8 3rd Qu.:11.500 3rd Qu.:85.00
Max. :168.00 Max. :334.0 Max. :20.700 Max. :97.00
NA's :37 NA's :7
Month Day
Min. :5.000 Min. : 1.0
1st Qu.:6.000 1st Qu.: 8.0
Median :7.000 Median :16.0
Mean :6.993 Mean :15.8
3rd Qu.:8.000 3rd Qu.:23.0
Max. :9.000 Max. :31.0
> str(airquality)
'data.frame': 153 obs. of 6 variables:
$ Ozone : int 41 36 12 18 NA 28 23 19 8 NA ...
$ Solar.R: int 190 118 149 313 NA NA 299 99 19 194 ...
$ Wind : num 7.4 8 12.6 11.5 14.3 14.9 8.6 13.8 20.1 8.6 ...
$ Temp : int 67 72 74 62 56 66 65 59 61 69 ...
$ Month : int 5 5 5 5 5 5 5 5 5 5 ...
$ Day : int 1 2 3 4 5 6 7 8 9 10 ...
> table(airquality$Month)
5 6 7 8 9
31 30 31 31 30
> table(airquality$Ozone)
1 4 6 7 8 9 10 11 12 13 14 16 18 19 20 21 22 23 24 27
1 1 1 3 1 3 1 3 2 4 4 4 4 1 4 4 1 6 2 1
28 29 30 31 32 34 35 36 37 39 40 41 44 45 46 47 48 49 50 52
3 1 2 1 3 1 2 2 2 2 1 1 3 2 1 1 1 1 1 1
59 61 63 64 65 66 71 73 76 77 78 79 80 82 84 85 89 91 96 97
2 1 1 2 1 1 1 2 1 1 2 1 1 1 1 2 1 1 1 2
108 110 115 118 122 135 168
1 1 1 1 1 1 1
> table(airquality$Ozone, useNA = "ifany")
1 4 6 7 8 9 10 11 12 13 14 16 18 19 20 21
1 1 1 3 1 3 1 3 2 4 4 4 4 1 4 4
22 23 24 27 28 29 30 31 32 34 35 36 37 39 40 41
1 6 2 1 3 1 2 1 3 1 2 2 2 2 1 1
44 45 46 47 48 49 50 52 59 61 63 64 65 66 71 73
3 2 1 1 1 1 1 1 2 1 1 2 1 1 1 2
76 77 78 79 80 82 84 85 89 91 96 97 108 110 115 118
1 1 2 1 1 1 1 2 1 1 1 2 1 1 1 1
122 135 168 <NA>
1 1 1 37
> table(airquality$Month,airquality$Day)
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28
5 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
6 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
7 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
8 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
9 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
29 30 31
5 1 1 1
6 1 1 0
7 1 1 1
8 1 1 1
9 1 1 0
> any(is,na(airquality$Ozone))//误把is.na写成了is,na,所以下面报错
Error: could not find function "na"
> any(is.na(airquality$Ozone))
[1] TRUE
> sum(is.na(airquality$Ozone))
[1] 37
> all(airquality$Month<12)
[1] TRUE
> #新的一个数据
> titanic <- as.data.frame(Titanic)
> head(titanic)
Class Sex Age Survived Freq
1 1st Male Child No 0
2 2nd Male Child No 0
3 3rd Male Child No 35
4 Crew Male Child No 0
5 1st Female Child No 0
6 2nd Female Child No 0
> dim(titanic)
[1] 32 5
> summary(titanic)
Class Sex Age Survived Freq
1st :8 Male :16 Child:16 No :16 Min. : 0.00
2nd :8 Female:16 Adult:16 Yes:16 1st Qu.: 0.75
3rd :8 Median : 13.50
Crew:8 Mean : 68.78
3rd Qu.: 77.00
Max. :670.00
>
>
> #一个新的表
> xtabs(Freq ~ Class + Age, data=titanic)
Age
Class Child Adult
1st 6 319
2nd 24 261
3rd 79 627
Crew 0 885
> x <- xtabs(Freq ~ Class + Age, data=titanic)
> ftable(x)
Age Child Adult
Class
1st 6 319
2nd 24 261
3rd 79 627
Crew 0 885
> object.size(airquality)
5496 bytes
> print(object.size(airquality), units = "Kb")
5.4 Kb