package cn.weida.Spark.TopNNonUnique;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.SortedMap;
import java.util.TreeMap;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.broadcast.Broadcast;
import Util.SparkUtil;
import scala.Tuple2;
/**
 * Top N design pattern (N &gt; 0) for input whose keys are NOT unique.
 *
 * <p>Each input line is expected to look like {@code key,intValue}. Duplicate keys are
 * first merged with {@code reduceByKey()} (their values are summed), so every key is
 * unique afterwards. A local top/bottom N is then computed inside every partition, and
 * the local results are merged on the driver into the final top/bottom N.
 *
 * <p>Usage: {@code TopN N [top/bottom] <hdfs-file>}
 *
 * @author acm160920007
 *
 * 11:40:13 AM, August 9, 2018
 *
 */
public class TopNNonUnique {

    /** Usage line printed whenever the command-line arguments are rejected. */
    private static final String USAGE = "Usage:TopN N [top/bottom] <hdfs-file>";

    /**
     * Driver entry point.
     *
     * @param args {@code args[0]} = N (a positive integer), {@code args[1]} = "top" or
     *             "bottom", {@code args[2]} = input path
     * @throws Exception if creating the Spark context or running the job fails
     */
    public static void main(String[] args) throws Exception {
        if (args.length != 3) {
            System.out.println(USAGE);
            System.exit(1);
        }
        final int topN = parseTopN(args[0]);
        final String direction = args[1];
        if (topN <= 0 || !(direction.equals("top") || direction.equals("bottom"))) {
            System.out.println(USAGE);
            System.exit(1);
        }
        final String inputPath = args[2];
        System.out.println("inputPath : <hdfs-file>=" + inputPath);

        JavaSparkContext ctx = SparkUtil.createJavaSparkContext();
        try {
            // Shipped to the executors; must be read with value() inside the closures.
            final Broadcast<Integer> broadcastTopN = ctx.broadcast(topN);
            final Broadcast<String> broadcastDirection = ctx.broadcast(direction);

            // Input
            JavaRDD<String> lines = ctx.textFile(inputPath, 1);
            // Partitioning: coalesce() without a shuffle only caps the partition count (here at 9).
            JavaRDD<String> rdd = lines.coalesce(9);

            // (String) -> (String, Integer): split "key,value" into a pair.
            JavaPairRDD<String, Integer> pairs = rdd.mapToPair(new PairFunction<String, String, Integer>() {
                @Override
                public Tuple2<String, Integer> call(String s) {
                    String[] tokens = s.split(",");
                    return new Tuple2<String, Integer>(tokens[0], Integer.parseInt(tokens[1]));
                }
            });

            // Merge duplicate keys by summing their values.
            JavaPairRDD<String, Integer> uniqueKeys = pairs.reduceByKey(new Function2<Integer, Integer, Integer>() {
                @Override
                public Integer call(Integer arg0, Integer arg1) throws Exception {
                    return arg0 + arg1;
                }
            });

            // Build a local top/bottom N per partition. This must run on the reduced RDD
            // (uniqueKeys); running it on the raw pairs would rank un-merged partial values.
            // NOTE: the map is keyed by the summed value, so two keys with the same sum
            // overwrite each other and only the last one seen is kept.
            JavaRDD<SortedMap<Integer, String>> partitions = uniqueKeys.mapPartitions(
                    new FlatMapFunction<Iterator<Tuple2<String, Integer>>, SortedMap<Integer, String>>() {
                        @Override
                        public Iterator<SortedMap<Integer, String>> call(Iterator<Tuple2<String, Integer>> iter) {
                            final int limit = broadcastTopN.value();
                            final String dir = broadcastDirection.value();
                            // Equivalent of MapReduce setup()
                            SortedMap<Integer, String> localTopN = new TreeMap<Integer, String>();
                            // Equivalent of MapReduce map()
                            while (iter.hasNext()) {
                                Tuple2<String, Integer> tuple = iter.next();
                                localTopN.put(tuple._2, tuple._1);
                                trimToLimit(localTopN, limit, dir);
                            }
                            // Equivalent of MapReduce cleanup(): emit one map per partition.
                            return Collections.singletonList(localTopN).iterator();
                        }
                    });

            // Merge all local results on the driver into the final top/bottom N.
            SortedMap<Integer, String> finalTopN = new TreeMap<Integer, String>();
            List<SortedMap<Integer, String>> allTopN = partitions.collect();
            for (SortedMap<Integer, String> localTopN : allTopN) {
                for (Map.Entry<Integer, String> entry : localTopN.entrySet()) {
                    finalTopN.put(entry.getKey(), entry.getValue());
                    trimToLimit(finalTopN, topN, direction);
                }
            }

            // Printed in ascending order of value (TreeMap iteration order).
            for (Map.Entry<Integer, String> entry : finalTopN.entrySet()) {
                System.out.println(entry.getKey() + "--" + entry.getValue());
            }
        } finally {
            // Release the Spark context even when the job fails.
            ctx.close();
        }
        System.exit(0);
    }

    /**
     * Parses the N argument.
     *
     * @param raw the raw command-line token
     * @return the parsed integer, or -1 when {@code raw} is not a valid integer
     */
    private static int parseTopN(String raw) {
        try {
            return Integer.parseInt(raw.trim());
        } catch (NumberFormatException e) {
            return -1;
        }
    }

    /**
     * Drops one entry from {@code map} when it holds more than {@code limit} entries:
     * the smallest key for "top" (keeping the largest values), the largest key for
     * "bottom" (keeping the smallest values). Intended to be called after every single
     * insertion, so the map never exceeds {@code limit} by more than one entry.
     *
     * @param map       value-to-key map to trim in place
     * @param limit     maximum number of entries to keep
     * @param direction "top" or "bottom"
     */
    private static void trimToLimit(SortedMap<Integer, String> map, int limit, String direction) {
        if (map.size() <= limit) {
            return;
        }
        if ("top".equals(direction)) {
            map.remove(map.firstKey());
        } else if ("bottom".equals(direction)) {
            map.remove(map.lastKey());
        }
    }
}
使用自定义排序
package cn.weida.Spark.TopNUsingTakeOrderd;
import java.io.Serializable;
import java.util.Collections;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;
import java.util.SortedMap;
import java.util.TreeMap;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.broadcast.Broadcast;
import Util.SparkUtil;
import scala.Tuple2;
/**
 * Top N using {@code RDD.takeOrdered(int n, java.util.Comparator<T> comp)} with a
 * custom comparator class that implements {@link Comparator}.
 *
 * <p>Each input line is expected to look like {@code key,intValue}. Duplicate keys are
 * merged with {@code reduceByKey()} (their values are summed) before the N largest
 * ("top") or N smallest ("bottom") entries are taken.
 *
 * <p>Usage: {@code TopN N [top/bottom] <hdfs-file>}
 *
 * @author acm160920007
 *
 * 1:16:34 PM, August 9, 2018
 *
 */
public class TopNUsingTakeOrdered implements Serializable {

    /** Usage line printed whenever the command-line arguments are rejected. */
    private static final String USAGE = "Usage:TopN N [top/bottom] <hdfs-file>";

    /**
     * Orders {@code (key, value)} tuples by their integer value. Serializable because it
     * is handed to {@code takeOrdered()}.
     */
    static class MyTupleComparator implements Comparator<Tuple2<String, Integer>>, Serializable {

        /** Descending by value: {@code takeOrdered()} then yields the top N. */
        final static MyTupleComparator INSTANCE = new MyTupleComparator(true);

        /** Ascending by value: {@code takeOrdered()} then yields the bottom N. */
        final static MyTupleComparator BOTTOM_INSTANCE = new MyTupleComparator(false);

        /** {@code true} to sort larger values first. */
        private final boolean descending;

        /** Creates a descending (top N) comparator. */
        MyTupleComparator() {
            this(true);
        }

        /**
         * @param descending {@code true} for top N ordering, {@code false} for bottom N
         */
        MyTupleComparator(boolean descending) {
            this.descending = descending;
        }

        @Override
        public int compare(Tuple2<String, Integer> o1, Tuple2<String, Integer> o2) {
            // Swap the operands instead of negating the result to reverse the order.
            return descending ? o2._2.compareTo(o1._2) : o1._2.compareTo(o2._2);
        }
    }

    /**
     * Driver entry point.
     *
     * @param args {@code args[0]} = N (a positive integer), {@code args[1]} = "top" or
     *             "bottom", {@code args[2]} = input path
     * @throws Exception if creating the Spark context or running the job fails
     */
    public static void main(String[] args) throws Exception {
        if (args.length != 3) {
            System.out.println(USAGE);
            System.exit(1);
        }
        final int topN = parseTopN(args[0]);
        final String direction = args[1];
        if (topN <= 0 || !(direction.equals("top") || direction.equals("bottom"))) {
            System.out.println(USAGE);
            System.exit(1);
        }
        final String inputPath = args[2];
        System.out.println("inputPath : <hdfs-file>=" + inputPath);

        JavaSparkContext ctx = SparkUtil.createJavaSparkContext();
        try {
            // Input
            JavaRDD<String> lines = ctx.textFile(inputPath, 1);
            // Partitioning: coalesce() without a shuffle only caps the partition count (here at 9).
            JavaRDD<String> rdd = lines.coalesce(9);

            // (String) -> (String, Integer): split "key,value" into a pair.
            JavaPairRDD<String, Integer> pairs = rdd.mapToPair(new PairFunction<String, String, Integer>() {
                @Override
                public Tuple2<String, Integer> call(String s) {
                    String[] tokens = s.split(",");
                    return new Tuple2<String, Integer>(tokens[0], Integer.parseInt(tokens[1]));
                }
            });

            // Merge duplicate keys by summing their values.
            JavaPairRDD<String, Integer> uniqueKeys = pairs.reduceByKey(new Function2<Integer, Integer, Integer>() {
                @Override
                public Integer call(Integer arg0, Integer arg1) throws Exception {
                    return arg0 + arg1;
                }
            });

            // Honour the requested direction: descending order for "top", ascending for "bottom".
            MyTupleComparator ordering = direction.equals("top")
                    ? MyTupleComparator.INSTANCE
                    : MyTupleComparator.BOTTOM_INSTANCE;

            List<Tuple2<String, Integer>> topNResult = uniqueKeys.takeOrdered(topN, ordering);
            for (Tuple2<String, Integer> entry : topNResult) {
                System.out.println(entry._2 + "--" + entry._1);
            }
        } finally {
            // Release the Spark context even when the job fails.
            ctx.close();
        }
        System.exit(0);
    }

    /**
     * Parses the N argument.
     *
     * @param raw the raw command-line token
     * @return the parsed integer, or -1 when {@code raw} is not a valid integer
     */
    private static int parseTopN(String raw) {
        try {
            return Integer.parseInt(raw.trim());
        } catch (NumberFormatException e) {
            return -1;
        }
    }
}