工作中遇到这样一种情况:需要把 List 中的元素逐个遍历出来,作为参数传给后面的 Spark 数据处理逻辑。由于元素太多,一个一个地串行遍历速度太慢,于是考虑使用多线程,代码如下(已删除部分代码):
想了解更多线程池的内容,可以参考链接:https://blog.csdn.net/aa1215018028/article/details/82814192
package com.kong.test.UDF;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import com.kong.test.constant.Constants;
/**
 * Driver that collects the rows returned by {@code cali.getAllCell()} and
 * submits one {@link calibration} task per row to a fixed-size thread pool,
 * then blocks until every task has finished and prints each task's result.
 */
public class CallableAndFuture {
    /**
     * Runs the query, fans the resulting rows out to the thread pool and waits
     * for all tasks.
     *
     * @param args command-line arguments (not used by the visible code)
     * @throws InterruptedException if the main thread is interrupted while waiting on a task
     * @throws ExecutionException   if any submitted task threw an exception
     */
    public static void main(String[] args) throws InterruptedException, ExecutionException {
        SparkSession spark = SparkSession
                .builder()
                .appName("CalibrationTest")
                .master("local")
                .enableHiveSupport()
                .getOrCreate();
        spark.sparkContext().setLogLevel("ERROR");
        // Sets the "spark.scheduler.pool" local property to "production".
        spark.sparkContext().setLocalProperty("spark.scheduler.pool", "production");

        // NOTE(review): db, branchE, date, date4g and branchC are not declared in this
        // listing (part of the original code was removed) -- they must be defined first.
        CalibrationSQL cali = new CalibrationSQL(db,branchE,date,date4g,branchC);
        Dataset<Row> sqlDF1 = spark.sql(cali.getAllCell());
        List<Row> list = sqlDF1.collectAsList();

        int threadNum = 10;
        ExecutorService threadPool = Executors.newFixedThreadPool(threadNum);
        System.out.println("线程数目:"+threadNum);
        try {
            List<Future<Integer>> futures = new ArrayList<Future<Integer>>(list.size());
            for (Row row : list) {
                // The row is parsed from its string form: brackets are blanked out, the
                // result is trimmed and split on ','. A column value that itself contains
                // a comma or a bracket would be split/altered incorrectly by this.
                String[] line = row.toString().replace('[', ' ').replace(']', ' ').trim().split(",");
                String antenna_0 = line[0];
                String antenna0_googlegri = line[1];
                String antenna0_googlegci = line[2];
                futures.add(threadPool.submit(
                        new calibration(cali, antenna_0, antenna0_googlegri, antenna0_googlegci, spark)));
            }
            // Future.get() blocks until the corresponding task completes, so this loop
            // only finishes once every submitted task is done.
            for (Future<Integer> future : futures) {
                System.out.println(future.get());
            }
        } finally {
            // Always release the pool: if get() throws, the worker threads would
            // otherwise stay alive and keep the JVM from exiting.
            threadPool.shutdown();
            System.out.println("threadPool shutdown !");
        }
    }
}
/**
 * Task submitted to the thread pool for a single row: it carries the shared
 * {@code CalibrationSQL} and {@code SparkSession} plus the three string values
 * parsed from that row, and returns an {@code Integer} status from {@link #call()}.
 */
class calibration implements Callable<Integer> {
    // All state is assigned once in the constructor and never reassigned.
    private final CalibrationSQL cali;
    private final String antenna_0;
    private final String antenna0_googlegri;
    private final String antenna0_googlegci;
    private final SparkSession spark;

    /**
     * @param cali               SQL helper shared by all tasks
     * @param antenna_0          first value parsed from the row
     * @param antenna0_googlegri second value parsed from the row
     * @param antenna0_googlegci third value parsed from the row
     * @param spark              Spark session shared by all tasks
     */
    public calibration(CalibrationSQL cali,String antenna_0,String antenna0_googlegri,String antenna0_googlegci,SparkSession spark) {
        this.cali = cali;
        this.antenna_0 = antenna_0;
        this.antenna0_googlegri = antenna0_googlegri;
        this.antenna0_googlegci = antenna0_googlegci;
        this.spark = spark;
    }

    /**
     * Runs the per-row processing and returns 0.
     *
     * @return always 0 in the visible code
     * @throws Exception declared by {@link Callable#call()}
     */
    @Override
    public Integer call() throws Exception {
        // Placeholder left by the original listing: the actual processing logic
        // was removed and must be filled in here.
--处理逻辑--
        return 0;
    }
}
对每个任务返回的 Future 依次调用 get(),主线程会一直阻塞,直到多线程部分全部处理完成;这样可以保证后面的代码在所有任务结束之后才执行,不会受到影响。