1. spark-submit submits the job and starts the Driver
2. The Driver registers the Application with the Master and requests resources
3. The Master allocates resources (resource scheduling):
1. Executors are started scattered across the cluster, which helps process data locally.
2. If nothing is specified when the job is submitted, every Worker in the cluster starts one Executor for the application, and that Executor uses all of the node's cores and 1 GB of memory.
3. To start more than one Executor on a single Worker, specify --executor-cores.
4. Specifying --total-executor-cores at submit time requests that many cores in total for the application.
5. Starting an Executor depends not only on cores but also on memory: --executor-memory. A combined example follows this list.
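A sketch of a complete submit command combining these flags (the master URL, main class, and jar path are placeholders, not from the original notes):

spark-submit --master spark://node1:7077 \
  --executor-cores 2 \
  --total-executor-cores 6 \
  --executor-memory 2g \
  --class com.zmd.testSpark.SecondSort ./spark-test.jar

With these settings the Master grants the application at most 6 cores in total, handed out in units of 2 cores and 2 GB of memory per Executor.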
4. Task scheduling
- doCheckpoint: after the job triggered by an action finishes, Spark calls doCheckpoint() on the final RDD to write any pending checkpoints.
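A minimal Java sketch of where doCheckpoint fits, assuming a local checkpoint directory (the path and class name are illustrative):

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class CheckpointSketch {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local").setAppName("checkpoint");
        JavaSparkContext sc = new JavaSparkContext(conf);
        sc.setCheckpointDir("./checkpoint");    // directory where checkpoint files are written
        JavaRDD<String> lines = sc.textFile("./data/test");
        lines.checkpoint();   // only marks the RDD; nothing is written yet
        lines.count();        // the action runs a job, after which the checkpoint is written
        sc.stop();
    }
}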
5. Secondary sort and grouped top-N in Scala and Java
Secondary sort
- In Spark, any sort that involves two or more columns is called a secondary sort.
package com.zmd.testSpark

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

/**
 * Secondary sort implemented in Scala.
 * The composite key orders by the first field and breaks ties with the second.
 */
case class SecondSortkey(first: Int, second: Int) extends Ordered[SecondSortkey] {
  // Order by the first field; fall back to the second field on ties.
  def compare(that: SecondSortkey): Int = {
    if (this.first - that.first == 0)
      this.second - that.second
    else
      this.first - that.first
  }
}

object SecondSort {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
    conf.setMaster("local")
    conf.setAppName("test")
    val sc = new SparkContext(conf)
    val lines = sc.textFile("./data/test")
    // Build (key, line) pairs; the key holds sort columns 0 and 1.
    val pairRDD: RDD[(SecondSortkey, String)] = lines.map(s => {
      val splited = s.split(" ")
      (SecondSortkey(splited(0).toInt, splited(1).toInt), s)
    })
    // Sort descending on the composite key, drop the key, and print the lines.
    pairRDD.sortByKey(false).map(_._2).foreach(println)
  }
}
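For example, if ./data/test holds space-separated pairs such as "3 1", "3 5", and "2 4" (illustrative data), the output is the original lines ordered descending by the first column and, within equal first values, descending by the second.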
---------------------------------SecondarySortTest-----------------------------------
package com.zmd.testJava.day04;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;
import scala.Tuple2;

public class SecondarySortTest {
    /**
     * Secondary sort implemented in Java.
     * Uses SecondSortkeyjava as a composite, comparable key.
     */
    public static void main(String[] args) {
        SparkConf conf = new SparkConf();
        conf.setMaster("local");
        conf.setAppName("SecondarySortTest");
        JavaSparkContext sc = new JavaSparkContext(conf);
        JavaRDD<String> secondRdd = sc.textFile("./data/test");
        // Build (key, line) pairs; the key holds both sort columns.
        JavaPairRDD<SecondSortkeyjava, String> pairSecondRDD = secondRdd.mapToPair(new PairFunction<String, SecondSortkeyjava, String>() {
            private static final long serialVersionUID = 1L;

            @Override
            public Tuple2<SecondSortkeyjava, String> call(String line) throws Exception {
                String[] splited = line.split(" ");
                int first = Integer.valueOf(splited[0]);
                int second = Integer.valueOf(splited[1]);
                SecondSortkeyjava secondSortkeyjava = new SecondSortkeyjava(first, second);
                return new Tuple2<SecondSortkeyjava, String>(secondSortkeyjava, line);
            }
        });
        // Sort descending on the composite key and print the original lines.
        pairSecondRDD.sortByKey(false).foreach(new VoidFunction<Tuple2<SecondSortkeyjava, String>>() {
            private static final long serialVersionUID = 1L;

            @Override
            public void call(Tuple2<SecondSortkeyjava, String> tuple) throws Exception {
                System.out.println(tuple._2);
            }
        });
    }
}
-----------------------------------SecondSortkeyjava-------------------------------
package com.zmd.testJava.day04;

import java.io.Serializable;

// Composite sort key: Serializable so it can be shuffled across the cluster,
// Comparable so sortByKey knows how to order it.
public class SecondSortkeyjava implements Serializable, Comparable<SecondSortkeyjava> {
    private static final long serialVersionUID = 1L;
    private int first;
    private int second;

    public int getFirst() {
        return first;
    }

    public void setFirst(int first) {
        this.first = first;
    }

    public int getSecond() {
        return second;
    }

    public void setSecond(int second) {
        this.second = second;
    }

    public SecondSortkeyjava(int first, int second) {
        this.first = first;
        this.second = second;
    }

    @Override
    public int compareTo(SecondSortkeyjava that) {
        // Order by the first field; break ties with the second field.
        if (getFirst() - that.getFirst() == 0) {
            return getSecond() - that.getSecond();
        } else {
            return getFirst() - that.getFirst();
        }
    }
}
Grouped top-N:
package com.zmd.testJava.day04;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;
import scala.Tuple2;

import java.util.ArrayList;
import java.util.Iterator;

public class SparkDay04 {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf();
        conf.setMaster("local");
        conf.setAppName("test");
        JavaSparkContext sc = new JavaSparkContext(conf);
        JavaRDD<String> lines = sc.textFile("./data/scores.txt");
        // Parse "className<TAB>score" lines into (className, score) pairs.
        JavaPairRDD<String, Integer> pairRDD = lines.mapToPair(new PairFunction<String, String, Integer>() {
            @Override
            public Tuple2<String, Integer> call(String line) throws Exception {
                String[] splited = line.split("\t");
                return new Tuple2<String, Integer>(splited[0], Integer.valueOf(splited[1]));
            }
        });
        // Group scores by class, then pick the top 3 in each group.
        pairRDD.groupByKey().foreach(new VoidFunction<Tuple2<String, Iterable<Integer>>>() {
            @Override
            public void call(Tuple2<String, Iterable<Integer>> tuple) throws Exception {
                String className = tuple._1;
                Iterator<Integer> iter = tuple._2.iterator();
                // Naive alternative: collect the whole group into a list and sort it.
                // This materializes every score in memory, so the fixed-size
                // top-3 array below is preferred for large groups.
                // ArrayList<Integer> list = new ArrayList<>();
                // while (iter.hasNext()) {
                //     list.add(iter.next());
                // }
                // Collections.sort(list);
                // for (Integer i : list) {
                //     System.out.println("className = " + className + ", value = " + i);
                // }
                // Keep a descending top-3 array via a small insertion sort.
                Integer[] top3 = new Integer[3];
                while (iter.hasNext()) {
                    Integer currentOne = iter.next();
                    for (int i = 0; i < 3; i++) {
                        if (top3[i] == null) {
                            // Empty slot: take it.
                            top3[i] = currentOne;
                            break;
                        } else if (currentOne > top3[i]) {
                            // Shift smaller entries down one slot, then insert.
                            for (int j = 2; j > i; j--) {
                                top3[j] = top3[j - 1];
                            }
                            top3[i] = currentOne;
                            break;
                        }
                    }
                }
                for (Integer i : top3) {
                    System.out.println("className = " + className + ", value = " + i);
                }
            }
        });
    }
}
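Assuming ./data/scores.txt holds tab-separated "className score" lines, each class prints its three highest scores in descending order; if a group has fewer than three values, the remaining slots print as null.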