R语言并行化基础与提高

• Bootstrapping
• 交叉验证(Cross-validation)
• (Multivariate Imputation by Chained Equations ,MICE)相关介绍:R语言中的缺失值处理
• 拟合多元回归方程

学习lapply是关键

> lapply(1:3, function(x) c(x, x^2, x^3))
[[1]]
[1] 1 1 1

[[2]]
[1] 2 4 8

[[3]]
[1] 3 9 27

> lapply(1:3/3, round, digits=3)
[[1]]
[1] 0.333

[[2]]
[1] 0.667

[[3]]
[1] 1

parallel包

library(parallel)

# Calculate the number of cores
no_cores <- detectCores() - 1

# Initiate cluster
cl <- makeCluster(no_cores)

parLapply(cl, 2:4,
function(exponent)
2^exponent)
[[1]]
[1] 4

[[2]]
[1] 8

[[3]]
[1] 16

stopCluster(cl)

变量作用域

cl<-makeCluster(no_cores)

base <- 2
clusterExport(cl, "base")
parLapply(cl,
2:4,
function(exponent)
base^exponent)

stopCluster(cl)

[[1]]
[1] 4

[[2]]
[1] 8

[[3]]
[1] 16

cl<-makeCluster(no_cores)
clusterExport(cl, "base")
base <- 4
# Run
parLapply(cl,
2:4,
function(exponent)
base^exponent)

# Finish
stopCluster(cl)
[[1]]
[1] 4

[[2]]
[1] 8

[[3]]
[1] 16

使用parSapply

> parSapply(cl, 2:4,
function(exponent)
base^exponent)
[1]  4  8 16

> parSapply(cl, as.character(2:4),
function(exponent){
x <- as.numeric(exponent)
c(base = base^x, self = x^x)
})
2  3   4
base 4  8  16
self 4 27 256

foreach包

library(foreach)
library(doParallel)

cl<-makeCluster(no_cores)
registerDoParallel(cl)

stopImplicitCluster()

foreach函数可以使用参数.combine控制你汇总结果的方法：

> foreach(exponent = 2:4,
.combine = c)  %dopar%
base^exponent
[1]  4  8 16
> foreach(exponent = 2:4,
.combine = rbind)  %dopar%
base^exponent
[,1]
result.1    4
result.2    8
result.3   16
foreach(exponent = 2:4,
.combine = list,
.multicombine = TRUE)  %dopar%
base^exponent
[[1]]
[1] 4

[[2]]
[1] 8

[[3]]
[1] 16

> foreach(exponent = 2:4,
.combine = list)  %dopar%
base^exponent
[[1]]
[[1]][[1]]
[1] 4

[[1]][[2]]
[1] 8

[[2]]
[1] 16

变量作用域

> base <- 2
> cl<-makeCluster(2)
> registerDoParallel(cl)
> foreach(exponent = 2:4,
.combine = c)  %dopar%
base^exponent
stopCluster(cl)
[1]  4  8 16

test <- function (exponent) {
foreach(exponent = 2:4,
.combine = c)  %dopar%
base^exponent
}
test()

Error in base^exponent : task 1 failed - "object 'base' not found" 

base <- 2
cl<-makeCluster(2)
registerDoParallel(cl)

base <- 4
test <- function (exponent) {
foreach(exponent = 2:4,
.combine = c,
.export = "base")  %dopar%
base^exponent
}
test()

stopCluster(cl)

[1]  4  8 16


使用Fork还是sock?

• FORK:”to divide in branches and go separate ways”
系统：Unix/Mac (not Windows)
环境: 所有
• PSOCK:并行socket集群
系统: All (including Windows)
环境: 空

内存控制

PSOCK:

library(pryr) # Used for memory analyses
cl<-makeCluster(no_cores)
clusterExport(cl, "a")
clusterEvalQ(cl, library(pryr))
[1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE

FORK :

cl<-makeCluster(no_cores, type="FORK")
[1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE

b <- 0
parSapply(cl, X = 1:10, function(x) {b <- b + 1; b})
# [1] 1 1 1 1 1 1 1 1 1 1
parSapply(cl, X = 1:10, function(x) {b <<- b + 1; b})
# [1] 1 2 3 4 5 1 2 3 4 5
b
# [1] 0

调试

tryCatch-list方法

foreach(x=list(1, 2, "a"))  %dopar%
{
tryCatch({
c(1/x, x, 2^x)
}, error = function(e) return(paste0("The variable '", x, "'",
" caused the error: '", e, "'")))
}
[[1]]
[1] 1 1 2

[[2]]
[1] 0.5 2.0 4.0

[[3]]
[1] "The variable 'a' caused the error: 'Error in 1/x: non-numeric argument to binary operator\n'"

out <- lapply(1:3, function(x) c(x, 2^x, x^x))
do.call(rbind, out)
[,1] [,2] [,3]
[1,]    1    2    1
[2,]    2    4    4
[3,]    3    8   27

创建一个文件输出

cl<-makeCluster(no_cores, outfile = "debug.txt")
registerDoParallel(cl)
foreach(x=list(1, 2, "a"))  %dopar%
{
print(x)
}
stopCluster(cl)
starting worker pid=7392 on localhost:11411 at 00:11:21.077
starting worker pid=7276 on localhost:11411 at 00:11:21.319
starting worker pid=7576 on localhost:11411 at 00:11:21.762
[1] 2]

[1] "a"

创建一个结点专用文件

cl<-makeCluster(no_cores, outfile = "debug.txt")
registerDoParallel(cl)
foreach(x=list(1, 2, "a"))  %dopar%
{
cat(dput(x), file = paste0("debug_file_", x, ".txt"))
}
stopCluster(cl)

partools包

partools这个包有一个dbs()函数或许值得一看（使用非windows系统值得一看），他允许你联合多个终端给每个进程进行debug。

Caching

cacheParallel <- function(){
vars <- 1:2
tmp <- clusterEvalQ(cl,
library(digest))

parSapply(cl, vars, function(var){
fn <- function(a) a^2
dg <- digest(list(fn, var))
cache_fn <-
sprintf("Cache_%s.Rdata",
dg)
if (file.exists(cache_fn)){
}else{
var <- fn(var);
Sys.sleep(5)
save(var, file = cache_fn)
}
return(var)
})
}

system.time(out <- cacheParallel())
# user system elapsed
# 0.003 0.001 5.079
out
# [1] 1 4
system.time(out <- cacheParallel())
# user system elapsed
# 0.001 0.004 0.046
out
# [1] 1 4

# To clean up the files just do:
file.remove(list.files(pattern = "Cache.+\.Rdata"))

载入平衡

任务载入

parLapply <- function (cl = NULL, X, fun, ...)
{
cl <- defaultCluster(cl)
do.call(c, clusterApply(cl, x = splitList(X, length(cl)),
fun = lapply, fun, ...), quote = TRUE)
}

# From the nnet example
parLapply(cl, c(10, 20, 30, 40, 50), function(neurons)
nnet(ir[samp,], targets[samp,],
size = neurons))

# From the nnet example
parLapply(cl, c(10, 50, 30, 40, 20), function(neurons)
nnet(ir[samp,], targets[samp,],
size = neurons))

内存载入

> rm(list=ls())
> library(pryr)
> library(magrittr)
> a <- matrix(1, ncol=10^4*2, nrow=10^4)
> object_size(a)
1.6 GB
> system.time(mean(a))
user  system elapsed
0.338   0.000   0.337
> system.time(mean(a + 1))
user  system elapsed
0.490   0.084   0.574
> library(parallel)
> cl <- makeCluster(4, type = "PSOCK")
> system.time(clusterExport(cl, "a"))
user  system elapsed
5.253   0.544   7.289
> system.time(parSapply(cl, 1:8,
function(x) mean(a + 1)))
user  system elapsed
0.008   0.008   3.365
> stopCluster(cl); gc();
> cl <- makeCluster(4, type = "FORK")
> system.time(parSapply(cl, 1:8,
function(x) mean(a + 1)))
user  system elapsed
0.009   0.008   3.123
> stopCluster(cl)

FORKs可以让你并行化从而不用崩溃：

> cl <- makeCluster(8, type = "PSOCK")
> system.time(clusterExport(cl, "a"))
user  system elapsed
10.576   1.263  15.877
> system.time(parSapply(cl, 1:8, function(x) mean(a + 1)))
Error in checkForRemoteErrors(val) :
8 nodes produced errors; first error: cannot allocate vector of size 1.5 Gb
Timing stopped at: 0.004 0 0.389
> stopCluster(cl)
> cl <- makeCluster(8, type = "FORK")
> system.time(parSapply(cl, 1:8, function(x) mean(a + 1)))
user  system elapsed
0.014   0.016   3.735
> stopCluster(cl)

> a <- matrix(1, ncol=10^4*2.1, nrow=10^4)
> cl <- makeCluster(8, type = "FORK")
> parSapply(cl, 1:8, function(x) {
+   b <- a + 1
+   mean(b)
+   })
Error in unserialize(node\$con) : error reading from connection

内存建议

• 尽量使用rm()避免无用的变量
• 尽量使用gc()释放内存。即使这在R中是自动执行的，但是当它没有及时执行，在一个并行计算的情况下，如果没有及时释放内存，那么它将不会将内存返回给操作系统，从而影响了其他worker的执行。
• 通常并行化在大规模运算下很有用，但是，考虑到R中的并行化存在内存的初始化成本，所以考虑到内存的情况下，显然小规模的并行化可能会更有用。
• 有时候在并行计算时，不断做缓存，当达到上限时，换回串行计算。
• 你也可以手动的控制每个核所使用的内存数量，一个简单的方法就是：memory.limit()/memory.size() = max cores

其他建议

• 一个常用的CPU核数检测函数：
max(1, detectCores() - 1)
• 永远不要使用set.seed()，使用clusterSetRNGStream()来代替设置种子，如果你想重现结果。
• 如果你有Nvidia 显卡，你可以尝试使用gputools 包进行GPU加速（警告：安装可能会很困难）
• 当使用mice并行化时记得使用ibind()`来合并项。

12-08

08-27
09-10 1万+
01-22 1590
07-25 3545
08-21 7505
04-20 8512
02-26 3061
09-06 3111
02-08 5424