TopN:
package spark.core;

import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.Tuple2;

import scala.Tuple2;

/**
 * Global top-N: reads one number per line, sorts them descending by value,
 * and prints the three largest to stdout.
 */
public class TopN {

    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local").setAppName("job_name");
        JavaSparkContext sc = new JavaSparkContext(conf);

        // Each input line holds a single number as text.
        JavaRDD<String> numbers = sc.textFile("E:/operator/number.txt");

        // Key every line by its numeric value so sortByKey can order it;
        // the value side keeps the original text for output.
        JavaPairRDD<Integer, String> keyed = numbers.mapToPair(new PairFunction<String, Integer, String>() {
            private static final long serialVersionUID = 1L;

            @Override
            public Tuple2<Integer, String> call(String value) throws Exception {
                return new Tuple2<Integer, String>(Integer.valueOf(value), value);
            }
        });

        // false => descending order by the numeric key.
        JavaPairRDD<Integer, String> sorted = keyed.sortByKey(false);

        // Drop the key, keep the original text, and pull the first three rows to the driver.
        List<String> topThree = sorted.map(new Function<Tuple2<Integer, String>, String>() {
            private static final long serialVersionUID = 1L;

            @Override
            public String call(Tuple2<Integer, String> pair) throws Exception {
                return pair._2;
            }
        }).take(3);

        for (String line : topThree) {
            System.out.println(line);
        }
        sc.close();
    }
}
分组TopN(在worker端排序):
package spark.core;

import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;

import scala.Tuple2;

/**
 * Grouped top-N, sorted on the worker side: groups scores by person with
 * groupByKey, then — inside a single mapToPair, i.e. on the workers — sorts
 * each group's scores descending and keeps only the top {@code TOP_N}.
 */
public class GroupTopN {

    /** How many of the highest scores to keep per group. */
    private static final int TOP_N = 2;

    public static void main(String[] args) {
        SparkConf sparkConf = new SparkConf().setMaster("local").setAppName("job_name");
        JavaSparkContext sc = new JavaSparkContext(sparkConf);

        // Each line is "name score" separated by a single space.
        JavaRDD<String> textFile = sc.textFile("E:/operator/persion-score.txt");

        JavaPairRDD<String, Integer> mapToPair = textFile.mapToPair(new PairFunction<String, String, Integer>() {
            private static final long serialVersionUID = 1L;

            @Override
            public Tuple2<String, Integer> call(String line) throws Exception {
                String[] split = line.split(" ");
                return new Tuple2<String, Integer>(split[0], Integer.valueOf(split[1]));
            }
        });

        JavaPairRDD<String, Iterable<Integer>> groupByKey = mapToPair.groupByKey();

        JavaPairRDD<String, Iterable<Integer>> topPerGroup = groupByKey.mapToPair(
                new PairFunction<Tuple2<String, Iterable<Integer>>, String, Iterable<Integer>>() {
            private static final long serialVersionUID = 1L;

            @Override
            public Tuple2<String, Iterable<Integer>> call(
                    Tuple2<String, Iterable<Integer>> tuple) throws Exception {
                // Copy the grouped scores into a list so they can be sorted.
                List<Integer> scores = new ArrayList<Integer>();
                Iterator<Integer> it = tuple._2.iterator();
                while (it.hasNext()) {
                    scores.add(it.next());
                }
                // Sort descending. Integer.compare avoids the overflow bug of the
                // subtraction idiom -(o1 - o2), which misorders extreme values.
                Collections.sort(scores, new Comparator<Integer>() {
                    @Override
                    public int compare(Integer o1, Integer o2) {
                        return Integer.compare(o2, o1);
                    }
                });
                // Keep only the top N. Math.min guards groups smaller than N against
                // IndexOutOfBoundsException; the copy detaches the result from the
                // subList view (a view is not safe to ship through Spark's shuffle).
                List<Integer> top = new ArrayList<Integer>(
                        scores.subList(0, Math.min(TOP_N, scores.size())));
                return new Tuple2<String, Iterable<Integer>>(tuple._1, top);
            }
        });

        topPerGroup.foreach(new VoidFunction<Tuple2<String, Iterable<Integer>>>() {
            private static final long serialVersionUID = 1L;

            @Override
            public void call(Tuple2<String, Iterable<Integer>> tuple) throws Exception {
                System.out.println(tuple);
            }
        });

        sc.close();
    }
}
分组TopN②(在driver端调spark排序):
繁琐操作(一般不用)
package spark.core;

import java.util.Iterator;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;

import scala.Tuple2;

/**
 * Grouped top-N, variant 2: the driver collects every distinct key, then
 * launches a separate Spark sort job per key. Deliberately cumbersome —
 * kept for comparison with the worker-side version, not for real use.
 */
public class GroupTopN_dirver {

    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local").setAppName("job_name");
        JavaSparkContext sc = new JavaSparkContext(conf);

        // Each line is "name score" separated by a single space.
        JavaRDD<String> lines = sc.textFile("E:/operator/persion-score.txt");

        // Parse into (name, score) pairs.
        JavaPairRDD<String, Integer> pairs = lines.mapToPair(new PairFunction<String, String, Integer>() {
            private static final long serialVersionUID = 1L;

            @Override
            public Tuple2<String, Integer> call(String row) throws Exception {
                String[] parts = row.split(" ");
                return new Tuple2<String, Integer>(parts[0], Integer.valueOf(parts[1]));
            }
        });

        JavaPairRDD<String, Iterable<Integer>> grouped = pairs.groupByKey();

        // Pull every distinct key back to the driver; the loop below runs one
        // full filter+sort pipeline per key.
        final List<String> keys = grouped.map(new Function<Tuple2<String, Iterable<Integer>>, String>() {
            private static final long serialVersionUID = 1L;

            @Override
            public String call(Tuple2<String, Iterable<Integer>> entry) throws Exception {
                return entry._1;
            }
        }).collect();

        for (int i = 0; i < keys.size(); i++) {
            // Effectively-final copy of the loop index for the anonymous classes below.
            final int index = i;

            // Keep only the current key's group and flatten its scores into a plain RDD.
            JavaRDD<Integer> scores = grouped.filter(new Function<Tuple2<String, Iterable<Integer>>, Boolean>() {
                private static final long serialVersionUID = 1L;

                @Override
                public Boolean call(Tuple2<String, Iterable<Integer>> entry) throws Exception {
                    return entry._1.equals(keys.get(index));
                }
            }).flatMap(new FlatMapFunction<Tuple2<String, Iterable<Integer>>, Integer>() {
                private static final long serialVersionUID = 1L;

                @Override
                public Iterator<Integer> call(Tuple2<String, Iterable<Integer>> entry) throws Exception {
                    return entry._2.iterator();
                }
            });

            // Re-key as (score, name) so sortByKey can order by score.
            JavaPairRDD<Integer, String> scoreToName = scores.mapToPair(new PairFunction<Integer, Integer, String>() {
                private static final long serialVersionUID = 1L;

                @Override
                public Tuple2<Integer, String> call(Integer score) throws Exception {
                    return new Tuple2<Integer, String>(score, keys.get(index));
                }
            });

            // false => descending by score.
            JavaPairRDD<Integer, String> sorted = scoreToName.sortByKey(false);

            // Flip back to (name, score) and regroup so the printed shape
            // matches the worker-side variant.
            JavaPairRDD<String, Integer> nameToScore = sorted.mapToPair(
                    new PairFunction<Tuple2<Integer, String>, String, Integer>() {
                private static final long serialVersionUID = 1L;

                @Override
                public Tuple2<String, Integer> call(Tuple2<Integer, String> entry) throws Exception {
                    return new Tuple2<String, Integer>(entry._2, entry._1);
                }
            });

            JavaPairRDD<String, Iterable<Integer>> regrouped = nameToScore.groupByKey();

            regrouped.foreach(new VoidFunction<Tuple2<String, Iterable<Integer>>>() {
                private static final long serialVersionUID = 1L;

                @Override
                public void call(Tuple2<String, Iterable<Integer>> result) throws Exception {
                    System.out.println(result);
                }
            });
        }

        sc.close();
    }
}