"循环"
# for 不够简洁,有些比较好的循环函数
> lapply
function (X, FUN, ...)
{
FUN <- match.fun(FUN)
if (!is.vector(X) || is.object(X))
X <- as.list(X)
.Internal(lapply(X, FUN))
}
<bytecode: 0x0000026716deb2e0>
<environment: namespace:base>
x会强制转换为列表,如果不能转化为列表则会报错
> x = list(a = 1:5, b = rnorm(10))
> lapply(x, mean)
$a
[1] 3
$b
[1] 0.1979356
"lapply的应用"
#感觉
> x = list(a = 1:4, b = rnorm(10),c = rnorm(20,1),d = rnorm(100,5))
> lapply(x,mean)
$a
[1] 2.5
$b
[1] -0.1493043
$c
[1] 1.055613
$d
[1] 4.963545
这个函数还是挺有意思的
> rnorm
function (n, mean = 0, sd = 1)
.Call(C_rnorm, n, mean, sd)
<bytecode: 0x0000026721476fc8>
<environment: namespace:stats>
#对列表中的每个数都进行后续的函数操作
# runif 用于生成服从均匀分布的随机数(默认范围是 0 到 1,可用 min 和 max 参数调整)
> x = 1:4
> lapply(x,runif)
[[1]]
[1] 0.6119571
[[2]]
[1] 0.33926336 0.07634248
[[3]]
[1] 0.5548308 0.7645762 0.5888868
[[4]]
[1] 0.17705475 0.39926515 0.74825495 0.06295764
#runif 函数的特点
> runif
function (n, min = 0, max = 1)
.Call(C_runif, n, min, max)
<bytecode: 0x0000026719b7f600>
<environment: namespace:stats>
#设置函数的参数
> x = 1:4
> lapply(x, runif, min = 0,max = 10)
[[1]]
[1] 9.09993
[[2]]
[1] 8.249654 8.300149
[[3]]
[1] 0.920899 3.428931 8.805718
[[4]]
[1] 4.548563 1.144547 4.978611 4.252297
#创建矩阵的特点
> x = matrix(1:4,2,2)
> x
[,1] [,2]
[1,] 1 3
[2,] 2 4
> x = list(a = matrix(1:4, 2, 2),b = matrix(1:6, 3, 2))
> x
$a
[,1] [,2]
[1,] 1 3
[2,] 2 4
$b
[,1] [,2]
[1,] 1 4
[2,] 2 5
[3,] 3 6
#提取第一列(自定义函数,想得到自己想要的结果需要自己写)
> lapply(x, function(elt) elt[,1])
$a
[1] 1 2
$b
[1] 1 2 3
列表和向量的区别
R语言中,向量和list都能存放多个元素。
向量中存储的是同一种类型的元素;list是向量的一般形式,其元素类型不必相同,
而且其元素往往是向量,甚至是列表本身。列表为函数返回多个结果提供了一种便捷的方式。
#lapply总是返回列表,而sapply会尽量把结果简化为向量或矩阵,无法简化时仍返回列表
> y = list(1,2,3)
> class(y)
[1] "list"
> y= 1
> class(y)
[1] "numeric"
> y<-1:10
> class(y)
[1] "integer"
> y <- c(1,3,4,9,10)
> class(y)
[1] "numeric"
> y = vector("numeric",1L)
> class(y) #还是不是很懂,等后面再看看吧
"sapply的应用 简化"
# sapply会对lapply返回的结果进行简化
> x = list(a = 1:4, b = rnorm(10),c = rnorm(20,1),d = rnorm(100,5))
> lapply(x, mean)
$a
[1] 2.5
$b
[1] 0.04221977
$c
[1] 0.5871852
$d
[1] 5.014724
> sapply(x, mean)
a b c d
2.50000000 0.04221977 0.58718524 5.01472399
> mean(x)#直接来求的话是不对的
[1] NA
Warning message:
In mean.default(x) : 参数不是数值也不是逻辑值:回覆NA
"apply的应用"
> x = matrix(rnorm(200),20,10)
> apply(x,2 , mean)
[1] -0.04709199 0.03602752 0.10755204 -0.15945315
[5] -0.10060440 0.03241379 0.17101090 0.03626799
[9] -0.03919470 0.01915320
> apply(x, 1, mean)
[1] 0.40597671 -0.04996821 0.07252043 -0.41796057
[5] -0.19648232 -0.38673315 0.09841727 0.05975844
[9] 0.27766894 0.17145129 0.02994334 0.40665879
[13] 0.07016164 0.57858407 -0.17893295 -0.97666838
[17] -0.04187527 0.36007597 0.09975086 -0.27018445
# MARGIN是apply的第二个参数,用来指定按行还是按列计算: 1表示行,2表示列
# 如果把mean改成sum,可以计算各行或各列的总和
# 计算百分位
> x = matrix(rnorm(200),20,10)
> apply(x, 1, quantile, probs = c(0.25, 0.75))
[,1] [,2] [,3] [,4] [,5]
25% -0.6382066 -0.7559317 0.08077449 -0.7794681 0.5607623
75% 0.4233704 -0.1102367 0.94947144 0.8516035 1.0274760
[,6] [,7] [,8] [,9] [,10]
25% -0.303428 -0.7693836 -0.3030852 -0.9621957 -0.2906915
75% 1.182858 0.2017545 0.6080206 0.9727036 0.4342429
[,11] [,12] [,13] [,14]
25% -0.85514882 -0.2550779 -0.9079677 0.002593077
75% 0.02238502 1.2077598 0.1596715 1.281124934
[,15] [,16] [,17] [,18] [,19]
25% 0.1383119 -0.1993797 -0.8122670 -0.2819779 -0.1013921
75% 0.8065995 0.5798212 0.5848504 1.0934210 0.9399187
[,20]
25% -0.3037867
75% 0.3248413
#probs这个参数是传给quantile的
#多维度的计算
> a = array(rnorm(2*2*10),c(2, 2, 10))
> apply(a, c(1, 2), mean)
[,1] [,2]
[1,] -0.3056770 0.510748034
[2,] 0.1224402 0.009973656
> rowMeans(a, dims = 2)
[,1] [,2]
[1,] -0.3056770 0.510748034
[2,] 0.1224402 0.009973656
"mapply"
# lapply和apply一次只能处理一个对象
# 需要同时处理多个列表(或向量)时要用到mapply
# > list(rep(1, 4), rep(2,3),rep(3,2),rep(4, 1))
# [[1]]
# [1] 1 1 1 1
#
# [[2]]
# [1] 2 2 2
#
# [[3]]
# [1] 3 3
#
# [[4]]
# [1] 4
# > mapply(rep, 1:4, 4:1)
# [[1]]
# [1] 1 1 1 1
#
# [[2]]
# [1] 2 2 2
#
# [[3]]
# [1] 3 3
#
# [[4]]
# [1] 4
#这两者的结果是一样的: mapply依次从1:4和4:1中各取一个元素,作为rep的两个参数去调用rep
#3:08
# Draw normally distributed random numbers.
#
# Thin wrapper around rnorm(); the three arguments are passed straight through.
#   n    - number of values to draw
#   mean - mean of the distribution
#   sd   - standard deviation of the distribution
# Returns the numeric vector produced by rnorm(n, mean, sd).
#
# The original body began with a unary `+` copied from the console
# continuation prompt; it had no effect on the result and is removed here.
noise <- function(n, mean, sd) {
  rnorm(n, mean, sd)
}
#
noise(5, 1, 2)
noise(1:5, 1:5, 2)
# > noise = function(n, mean, sd){
# + +rnorm(n, mean, sd)
# + }
# >
# >
# 5个均值为1 标准差为2的随机变量
# > noise(5, 1, 2)
# [1] 2.1349294 4.4768690 0.9392429 -0.2767693 -1.0053244
# 本意是想得到: 1个均值为1、2个均值为2、...、5个均值为5的随机数(标准差均为2)
# 但直接传入向量做不到: n是向量时rnorm按length(n)生成5个数,均值依次为1到5
# 所以下面要改用mapply
# > noise(1:5, 1:5, 2)
# [1] 2.007081 2.971506 -1.922784 5.191263 8.051044
mapply(noise, 1:5, 1:5, 2)
# [[1]]
# [1] 1.668036
#
# [[2]]
# [1] -2.577098 1.794833
#
# [[3]]
# [1] 1.243830 7.943393 3.943070
#
# [[4]]
# [1] 8.289387 3.801006 5.081261 1.359685
#
# [[5]]
# [1] 1.561076 6.683892 3.947925 5.159011 3.428528
list(noise(5, 1, 2),noise(4, 1, 2))
#list 会直接运行
# [[1]]
# [1] 1.001350 2.504705 1.640980 -2.314787 0.997709
#
# [[2]]
# [1] 1.086982 2.653428 -2.915361 2.892763
"tapply"
# tapply按因子把向量分组,再对每一组分别应用函数
x <- c(rnorm(10), runif(10), rnorm(10, 1))
f <- gl(3, 10)
f
tapply(x, f, mean)
# > x <- c(rnorm(10), runif(10), rnorm(10, 1))
# > f <- gl(3, 10)
# > f
# [1] 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3
# Levels: 1 2 3
# > tapply(x, f, mean)
# 1 2 3
# 0.03543971 0.46739283 1.25009909
#不简化就是一个list
tapply(x, f, mean, simplify = FALSE)
# $`1`
# [1] 0.03543971
#
# $`2`
# [1] 0.4673928
#
# $`3`
# [1] 1.250099
tapply(x, f, range)
#测算数值的范围
# $`1`
# [1] -1.221246 1.543615
#
# $`2`
# [1] 0.0639620 0.8429187
#
# $`3`
# [1] -0.1620603 2.5457265
"split"
x <- c(rnorm(10), runif(10), rnorm(10, 1))
f <- gl(3, 10)
split(x, f)
# $`1`
# [1] -1.6811460 -0.8094158 -0.8390141 -1.0325300 -0.7308444 -1.3002284
# [7] 0.5846582 -0.8420822 -0.2422032 -0.4242965
#
# $`2`
# [1] 0.3682323 0.1598136 0.5698454 0.1563768 0.8951418 0.9841967 0.8010664
# [8] 0.6594530 0.9396235 0.8305545
#
# $`3`
# [1] 1.7961916 -0.3702188 2.1567355 3.3805721 1.0052914 0.9722251
# [7] 1.7216104 -0.5571471 1.1695204 1.3274121
lapply(split(x,f), mean)
# $`1`
# [1] -0.7317103
#
# $`2`
# [1] 0.6364304
#
# $`3`
# [1] 1.260219
a=read.csv("hw1_data.csv")
head(a)
# Ozone Solar.R Wind Temp Month Day
# 1 41 190 7.4 67 5 1
# 2 36 118 8.0 72 5 2
# 3 12 149 12.6 74 5 3
# 4 18 313 11.5 62 5 4
# 5 NA NA 14.3 56 5 5
# 6 28 NA 14.9 66 5 6
s = split(a,a$Month)
lapply(s, function(x) colMeans(x[,c("Solar.R", "Ozone", "Wind")]))
# $`6`
# Solar.R Ozone Wind
# 190.16667 NA 10.26667
#
# $`7`
# Solar.R Ozone Wind
# 216.483871 NA 8.941935
#
# $`8`
# Solar.R Ozone Wind
# NA NA 8.793548
#
# $`9`
# Solar.R Ozone Wind
# 167.4333 NA 10.1800
sapply(s, function(x) colMeans(x[,c("Solar.R", "Ozone", "Wind")]))
# 5 6 7 8 9
# Solar.R NA 190.16667 216.483871 NA 167.4333
# Ozone NA NA NA NA NA
# Wind 11.62258 10.26667 8.941935 8.793548 10.1800
sapply(s, function(x) colMeans(x[,c("Solar.R", "Ozone", "Wind")],
na.rm = TRUE))
# 5 6 7 8 9
# Solar.R 181.29630 190.16667 216.483871 171.857143 167.43333
# Ozone 23.61538 29.44444 59.115385 59.961538 31.44828
# Wind 11.62258 10.26667 8.941935 8.793548 10.18000
# 当面临多个条件时的组合
# > x = rnorm(10)
# > f1 = gl(2,5)
# > f2 = gl(5,2)
# > f1
# [1] 1 1 1 1 1 2 2 2 2 2
# Levels: 1 2
# > f2
# [1] 1 1 2 2 3 3 4 4 5 5
# Levels: 1 2 3 4 5
# > interaction(f1,f2)
# [1] 1.1 1.1 1.2 1.2 1.3 2.3 2.4 2.4 2.5 2.5
# Levels: 1.1 2.1 1.2 2.2 1.3 2.3 1.4 2.4 1.5 2.5
# 不一定要显式调用interaction函数,用split(x,list(..))时会自动对列表中的因子调用interaction
# Build the example data in live code. Previously x, f1 and f2 for this example
# only appeared inside the commented-out console transcript, so running the
# script stopped here with "object 'f1' not found".
x <- rnorm(10)
f1 <- gl(2, 5)
f2 <- gl(5, 2)
# Passing a list of factors makes split() group by every combination of their
# levels, so there is one list element per f1.f2 pair.
str(split(x, list(f1, f2)))
# The grouping is not random: an observation lands in the element named after
# its own (f1, f2) levels; combinations that never occur are kept as empty
# vectors (num(0)).
# > str(split(x, list(f1, f2)))
# List of 10
# $ 1.1: num [1:2] -2.078 -0.909
# $ 2.1: num(0)
# $ 1.2: num [1:2] -0.764 2.827
# $ 2.2: num(0)
# $ 1.3: num -0.217
# $ 2.3: num -2.36
# $ 1.4: num(0)
# $ 2.4: num [1:2] -2.09 0.121
# $ 1.5: num(0)
# $ 2.5: num [1:2] 1.09 -1.1
# drop = TRUE removes the level combinations that contain no observations.
str(split(x, list(f1, f2), drop = TRUE))
# > str(split(x, list(f1, f2), drop = TRUE))
# List of 6
# $ 1.1: num [1:2] -2.078 -0.909
# $ 1.2: num [1:2] -0.764 2.827
# $ 1.3: num -0.217
# $ 2.3: num -2.36
# $ 2.4: num [1:2] -2.09 0.121
# $ 2.5: num [1:2] 1.09 -1.1
"debug"
# message、warning 和 error 是三种不同级别的提示(condition)
# warning: 警告,不致命,函数仍会执行完并返回结果
# error: 致命错误,函数会停止执行
# message: 普通的提示信息,不影响函数执行
# > log(-1)
# [1] NaN
# Warning message:
# In log(-1) : NaNs produced
# Print whether `x` is greater than zero and return `x` invisibly.
#
# `x > 0` is used directly as the `if` condition, so an NA input makes the
# condition NA and `if` stops with an error. The calls that follow this
# definition rely on that to demonstrate the failure, so it is left as is.
printmessage <- function(x) {
  verdict <- if (x > 0) {
    "x is greater than zero"
  } else {
    "x is less than or equal to zero"
  }
  print(verdict)
  invisible(x)
}
printmessage(1)
printmessage(NA)
# > printmessage(1)
# [1] "x is greater than zero"
# > printmessage(NA)
# Error in if (x > 0) print("x is greater than zero") else print("x is less than or equal to zero") :
# missing value where TRUE/FALSE needed
# 改进函数
# Variant of printmessage that checks for a missing value before comparing.
#
# Prints one of three messages depending on whether `x` is NA (is.na() is also
# TRUE for NaN), greater than zero, or otherwise, then returns `x` invisibly.
printmessage2 <- function(x) {
  verdict <- if (is.na(x)) {
    "x is missing value"
  } else if (x > 0) {
    "x is greater than zero"
  } else {
    "x is less than or equal to zero"
  }
  print(verdict)
  invisible(x)
}
x = log(-1)
printmessage2(x)
#
# > x = log(-1)
# Warning message:
# In log(-1) : NaNs produced
# >
# > printmessage2(x)
# [1] "x is missing value"
"debug tools in R"
# traceback: 打印出错时的函数调用栈,能看出错误发生在哪一层函数调用里
# debug: 给函数打上调试标记,之后调用该函数时可以一行一行地执行
# browser: 代码运行到browser()调用处会暂停并进入调试环境
# trace: 不修改函数源码,在函数的特定位置插入调试代码
# recover: 作为错误处理函数,出错时可以停下来浏览调用栈中各层函数的环境
#traceback
# > mean(b)
# Error in mean(b) : object 'b' not found
# > traceback()
# 1: mean(b)
# > lm(x - y)
# Error in formula.default(object, env = baseenv()) : 公式不对
# > traceback()
# 9: stop("invalid formula")
# 8: formula.default(object, env = baseenv())
# 7: formula(object, env = baseenv())
# 6: as.formula(formula)
# 5: model.frame.default(formula = x - y, drop.unused.levels = TRUE)
# 4: stats::model.frame(formula = x - y, drop.unused.levels = TRUE)
# 3: eval(mf, parent.frame())
# 2: eval(mf, parent.frame())
# 1: lm(x - y)
# Flag lm() for debugging so that the next call to it is stepped through line
# by line (see the debug entry in the tool list above).
# NOTE(review): no matching undebug(lm) is visible in this file - confirm
# whether the flag is meant to stay set.
debug(lm)
# Same call as in the traceback example above, where it stopped with an
# invalid-formula error; here it is run under the debugger instead.
lm(x - y)
2
R学习——霍普金斯大学week3
最新推荐文章于 2024-10-15 11:48:31 发布