近期使用Spark开发ML机器学习模型的时候,其中有一个部分需要交替搜索最优参数。
待搜索的参数空间有上万维,如果参数搜索串行执行,那么上千次的迭代计算大约需要10个小时,对于线上部署的模型是万万不可取的。
考虑到参数搜索部分每次计算量并不大,只是需要重复上万次,且有一些迭代的搜索计算互不依赖,可以并行计算。那么就考虑使用多线程的思想,并行执行一些计算任务。
因为参数的更新需要等这些并行的计算任务都完成后 取他们计算结果汇总后的最大值,所以需要的是同步执行,即需要block阻塞。
如果将多线程定义成异步执行的,那么其中一个线程的任务执行完成后,它不会等待其他线程的任务完成,而是直接进入接下来的任务。
参考文章如下:https://monix.io/docs/current/best-practices/blocking.html
https://github.com/alexandru/scala-best-practices/blob/master/sections/4-concurrency-parallelism.md
http://trickbooter.com/post/2016-12-17-spark-parallel-execution/ 【个人推荐,英文版】
关于Spark中的多任务并发处理(Concurrency) 【个人推荐,中文版】
scala多线程-异步 、 scala多线程异步 、 scala多线程
scala并发-Actor
多线程同步执行(需要block阻塞)、多线程异步执行(不需要block)的实践代码如下:
一、多线程同步执行(需要block阻塞)
代码示例一:
package com.algorithms.toby
import org.apache.spark.sql.SparkSession
import java.util.concurrent.Executors
import scala.concurrent.{Await, ExecutionContext, ExecutionContextExecutor, Future}
import scala.concurrent.duration._

/** Demo of synchronous (blocking) multi-threaded execution: independent
  * futures run in parallel on a fixed thread pool, and the main thread
  * blocks via Await until all of them have completed.
  */
object threadBlock {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("threadBlock").getOrCreate()
    // Fixed-size pool (2 workers) backing the implicit ExecutionContext.
    val pool = Executors.newFixedThreadPool(2)
    implicit val ec: ExecutionContextExecutor = ExecutionContext.fromExecutor(pool)

    // Asynchronously compute x + 1.
    def addOne(x: Int): Future[Int] = Future(x + 1)

    // BUG FIX: the original wrapped everything in an outer Future { ... }
    // that completed as soon as the inner for-comprehension was
    // *constructed*, not when the product was printed — so Await could
    // return (and the pool shut down) before the println ran. Returning
    // the composed future itself makes Await wait for the real result.
    def multiply(x: Int, y: Int): Future[Unit] = {
      // Start both additions before composing, so they run in parallel.
      val a = addOne(x)
      val b = addOne(y)
      for (r1 <- a; r2 <- b) yield println(r1 * r2)
    }

    val c = Future { println("this is c") }
    val d = Future { println("this is d") }

    // Submit the tasks and block until all three finish (at most 5s).
    Await.result(Future.sequence(Seq(c, d, multiply(1, 3))), 5.seconds)

    println("task finished")
    // Shut down the pool so the JVM can exit.
    pool.shutdown()
    println("task finished finally")
  }
}
代码示例二:
package com.algorithms.toby
import org.apache.spark.sql.{DataFrame, SparkSession}
import java.util.concurrent.Executors
import scala.concurrent.{Await, ExecutionContext, ExecutionContextExecutor, Future}
import scala.concurrent.duration._

/** Synchronous (blocking) multi-threaded parameter search: each epoch
  * launches pairs of search futures and blocks until both complete
  * before moving on, so parameter updates see all results.
  */
object threadBlock {

  // Millisecond timestamp helper. NOTE(review): the original called an
  // undefined getCurrentTimeStamp — presumably a project utility; this
  // stand-in keeps the block self-contained.
  private def getCurrentTimeStamp: Long = System.currentTimeMillis()

  /** One parallel search task: scans its slice of the parameter space.
    * BUG FIX: the original was called with 3 and 5 arguments at the two
    * call sites while being declared with 3, and created its Future with
    * no ExecutionContext in scope; the context is now an explicit
    * implicit parameter supplied by the caller.
    */
  def updateScale(threadName: String, featureDF: DataFrame, searchField: Array[Double])(implicit ec: ExecutionContext): Future[Unit] = Future {
    println(s"This is Thread $threadName")
    for (t <- searchField) {
      // Placeholder for the real per-parameter evaluation.
      println("this is the method")
    }
    // BUG FIX: this println sat outside any method in the original,
    // referencing threadName out of scope; it belongs here.
    println(s"finish thread $threadName")
  }

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("threadBlock").getOrCreate()

    // Thread-based synchronous (blocking) training loop.
    def trainThreadBlock(trainData: DataFrame, modelPath: String): (Long, Double) = {
      val timeStart = getCurrentTimeStamp
      val featureNum = 10
      var epoch = 1
      // Parameter slices searched by each worker thread.
      val paramsFieldMap = Map(
        1 -> Array[Double](0.1, 0.2),
        2 -> Array[Double](0.3, 0.4)
      )
      // Best score found so far (placeholder; real code would update it
      // from the workers' results).
      val bestScale = 0.0
      while (epoch > 0) {
        for (ind <- 0 until featureNum - 1) {
          // Fresh pool per iteration so each round's tasks are isolated.
          val pool = Executors.newFixedThreadPool(2)
          implicit val ec: ExecutionContextExecutor = ExecutionContext.fromExecutor(pool)
          // BUG FIX: use the trainData parameter (original referenced an
          // undefined featureDF) and consistent 3-argument calls.
          val t1 = updateScale("thread" + 1, trainData, paramsFieldMap(1))
          val t2 = updateScale("thread" + 2, trainData, paramsFieldMap(2))
          // BUG FIX: the original awaited undefined t3/t4. Block until
          // both futures complete — this is the synchronisation point.
          Await.result(Future.sequence(Seq(t1, t2)), Duration.Inf)
          pool.shutdown()
          println("task finished")
        }
        // BUG FIX: the decrement sat outside the while loop in the
        // original, making it an infinite loop.
        epoch -= 1
      }
      val timeEnd = getCurrentTimeStamp
      val timeUsed = (timeEnd - timeStart) / 60
      println(s"time used $timeUsed")
      // BUG FIX: the declared return type (Long, Double) was never
      // produced by the original body.
      (timeUsed, bestScale)
    }
  }
}
二、多线程异步执行(不需要block)
代码示例一:
// Create a fixed-size thread pool with 5 worker threads.
val threadPool:ExecutorService=Executors.newFixedThreadPool(5)
try {
// Submit 5 tasks; submit() returns immediately (fire-and-forget here),
// so this is asynchronous — nothing blocks waiting for the tasks.
for(task <- 1 to 5){
threadPool.submit(new updateTask("thread"+task,featureDF))
}
}finally {
// shutdown() stops accepting new tasks but lets queued ones finish.
threadPool.shutdown()
}
// Runnable task wrapper executed by the pool's worker threads.
// NOTE(review): featureDF must be defined in the enclosing scope when
// this snippet is used — it is not declared here; TODO confirm.
class updateTask(threadName:String,featureDF:DataFrame) extends Runnable {
override def run(){
println(s"This is Thread $threadName" )
// Placeholder for real work; this select() is only a demo call.
featureDF.select("....this is task...")
println(s"finish thread $threadName")
}
}
代码示例二:
package com.algorithms.toby
import org.apache.spark.sql.SparkSession
import java.util.concurrent.Executors
import scala.concurrent.ExecutionContext.Implicits.global
import scala.concurrent.duration._
import scala.concurrent.{Await, ExecutionContext, ExecutionContextExecutor, Future}
// Runnable wrapper: each instance carries its own thread name and the
// DataFrame it works on, and is run on a separately started Thread.
class updateTask(threadName:String,featureDF:DataFrame) extends Runnable {
override def run(){
println(s"This is Thread $threadName" )
// Placeholder body; a real task would transform featureDF here.
featureDF.select("....this is task...")
println(s"finish thread $threadName")
}
}
/** Asynchronous (non-blocking) multi-threaded execution: worker threads
  * are started and the caller does NOT wait for them to finish before
  * continuing.
  */
object threadNotBlock {

  // Millisecond timestamp helper. NOTE(review): the original called an
  // undefined getCurrentTimeStamp — presumably a project utility; this
  // stand-in keeps the block self-contained.
  private def getCurrentTimeStamp: Long = System.currentTimeMillis()

  def main(args: Array[String]): Unit = {
    // BUG FIX: appName was copy-pasted as "threadBlock" from the
    // blocking example.
    val spark = SparkSession.builder().appName("threadNotBlock").getOrCreate()

    // Thread-based asynchronous (non-blocking) training loop.
    def trainThreadNotBlock(trainData: DataFrame, modelPath: String): (Long, Double) = {
      val timeStart = getCurrentTimeStamp
      val featureNum = 10
      var epoch = 1
      while (epoch > 0) {
        println(s"this is epcho $epoch")
        for (ind <- 0 until featureNum - 1) {
          // BUG FIX: the original read "new up dateTask" (stray space)
          // and referenced an undefined featureDF; use the trainData
          // parameter. start() returns immediately — no join(), so the
          // loop does not wait for the tasks (asynchronous by design).
          val t1 = new Thread(new updateTask("thread" + 1, trainData))
          val t2 = new Thread(new updateTask("thread" + 2, trainData))
          t1.start()
          t2.start()
        }
        // BUG FIX: the decrement sat outside the while loop in the
        // original, making it an infinite loop.
        epoch -= 1
      }
      val timeEnd = getCurrentTimeStamp
      val timeUsed = (timeEnd - timeStart) / 60
      println(s"time used $timeUsed")
      // BUG FIX: the declared return type (Long, Double) was never
      // produced by the original body. The Double slot is a placeholder
      // score, mirroring the blocking example's bestScale.
      (timeUsed, 0.0)
    }
  }
}