对2亿个数排序,取中位数
新生代(-Xmn)设为50M时
/usr/local/spark/bin/spark-submit --master yarn --deploy-mode client --num-executors=2 --executor-cores=12 --executor-memory 6g --conf "spark.executor.extraJavaOptions=-Xmn50M -XX:SurvivorRatio=8 -verbose:gc -XX:+PrintGCDetails -XX:+PrintHeapAtGC" /home/vicky/mr-1.0-SNAPSHOT.jar 200
新生代设为100M时
新生代设为200M时
新生代设为1300M时
/**
 * Sorts the integers stored one-per-line at hdfs://master:9000/num and prints
 * the element at sorted position {@code count / 2} (the middle element; for an
 * even count this is one of the two central values, not their average).
 *
 * <p>The job runs on YARN in client mode; timing for the whole job is printed
 * at the end as {@code timeuse=<millis>}.
 *
 * @param reducePart number of partitions used by {@code sortBy}'s shuffle
 *                   (controls reducer parallelism for the sort)
 * @throws Exception if the Spark job fails
 */
public static void getMid(Integer reducePart) throws Exception {
    System.out.println(reducePart);
    long start = System.currentTimeMillis();
    SparkConf conf = new SparkConf().setAppName("HelloWorld")
            // .setMaster("local[12]")
            .setMaster("yarn");
    JavaSparkContext sc = new JavaSparkContext(conf);
    // try/finally guarantees the context is stopped even when the job throws,
    // releasing YARN containers (original version leaked the context).
    try {
        // sc.setLogLevel("ERROR");
        // Parse one integer per input line; 24 = minimum number of input partitions.
        JavaRDD<Integer> rdd = sc.textFile("hdfs://master:9000/num", 24)
                .map(new Function<String, Integer>() {
                    @Override
                    public Integer call(String v1) throws Exception {
                        return Integer.valueOf(v1);
                    }
                });
        // rdd.cache();
        // Effectively final, so it can be captured by the filter below.
        final long count = rdd.count();
        // Sort descending (ascending=false), pair each element with its rank,
        // and keep only the element at rank count/2. The filter matches exactly
        // one row, so first() is sufficient — no need to collect() a whole list.
        System.out.print(
                rdd.sortBy(new Function<Integer, Integer>() {
                    @Override
                    public Integer call(Integer v1) throws Exception {
                        return v1;
                    }
                }, false, reducePart).zipWithIndex().filter(new Function<Tuple2<Integer, Long>, Boolean>() {
                    @Override
                    public Boolean call(Tuple2<Integer, Long> v1) throws Exception {
                        return v1._2() == count / 2;
                    }
                }).first()
        );
        // System.out.print( "top10="+rdd.takeOrdered(10,new MyCompare()));
        System.out.println("timeuse=" + (System.currentTimeMillis() - start));
    } finally {
        sc.stop();
    }
}
共使用了三台虚拟机,其中两个工作节点的配置如下