Java Top N
static SortedMap<Integer,T> topN(List<Tuple2<T,Integer>>,L,int N){
if ((L==null)||(L.isEmpty())){
return null;}
SortedMap<Integer,T> topN=new TreeMap<Integer,T>();
for (Tuple2<T,Integer> element:L){
//element._1 类型为T
//element._2 频度
topN.put(element._2,element._1);
if(topN.size()>N){
top.N.remove(topN.firstKey());
}
returen topN;
}
MapReduce/Hadoop 实现(唯一键)
SQL 排序
-- Top 10 heaviest cats, heaviest first.
SELECT cat_id,
       cat_name,
       cat_weight
  FROM cats
 ORDER BY cat_weight DESC
 LIMIT 10;
MapReduce
每个映射器找出一个本地的topN 列表,把它传递给归约器,归约器从中找出最终的topN
映射器类结构
// Mapper skeleton: each mapper keeps a local top-10 (keyed by weight) in a
// TreeMap and emits it once from cleanup() after its input split is done.
public class TopN_Mapper{
// Local top-10 structure: key = weight, value = the full input record.
// TreeMap keeps keys sorted ascending, so firstKey() is the smallest
// weight currently retained (the eviction candidate).
private SortedMap<Double, Text> top10cats=new TreeMap<Double,Text>();
private int N=10;
// setup() runs once per mapper, before any map() call.
setup(Context context){
...}
map(key,value){
... process (key,value) pair
}
// cleanup() runs once per mapper, after the last map() call.
cleanup(Context context){
...}
}
定义setup函数
// setup() runs once per mapper; reads N from the job configuration.
// The "top.n" property is set by the job driver.
setup(Context context){
    Configuration conf = context.getConfiguration();
    // was: N = conf.get("top.n")  — conf.get() returns a String (cannot be
    // assigned to int) and the statement lacked a semicolon; getInt parses
    // the value and supplies a default of 10 when the property is unset.
    N = conf.getInt("top.n", 10);
}
map函数接受一个输入块,生成一个本地top10列表 使用不同的分隔符优化映射器和归约器的输入解析
// map(): processes one input record and maintains the mapper-local top-N.
// Input line format: "<cat_weight>,<cat_id>;<cat_name>" — the comma
// separates the weight from the rest, so tokens[0] is the weight.
map(key, value){ // was "map(){" — the body reads key/value, so declare them
    String[] tokens = value.split(",");
    // tokens[0] = cat_weight; tokens[1] = <cat_id>;<cat_name>
    Double weight = Double.parseDouble(tokens[0]);
    top10cats.put(weight, value);
    // keep only the top N: evict the smallest weight when over capacity
    if (top10cats.size() > N) {
        top10cats.remove(top10cats.firstKey());
    }
}
cleanup方法发出这个列表,使用一个键,保证所有映射器的输出都将由一个归约器处理
// cleanup() runs once per mapper after all map() calls; emits the local
// top-N under a single shared (null) key so that ONE reducer receives
// every mapper's local list.
cleanup(Context context) { // was "clean" — Hadoop only invokes cleanup()
    // values() iterates in ascending weight order
    for (Text catAttributes : top10cats.values()) { // was String — the map stores Text values
        context.write(NullWritable.get(), catAttributes);
    }
}
reduce 得到所有本地top10列表,创建最终的top10 列表
// reduce(): merges all mapper-local top-N lists into the final top-N.
// Every mapper emitted under the same null key, so this single invocation
// sees every local list in `values`.
reduce(key, values){ // was (key,value) — the body iterates `values`
    SortedMap<Double, Text> finaltop10 = new TreeMap<Double, Text>();
    // aggregate all local top-N lists
    for (Text catRecord : values) {
        // Text has no split(); convert to String first
        String[] tokens = catRecord.toString().split(",");
        Double weight = Double.parseDouble(tokens[0]);
        // was finaltop10.put(weight, value) — `value` is undefined here;
        // the record being examined is catRecord
        finaltop10.put(weight, catRecord);
        if (finaltop10.size() > N) {
            finaltop10.remove(finaltop10.firstKey());
        }
    }
    for (Text text : finaltop10.values()) {
        context.write(NullWritable.get(), text);
    }
}
Spark 实现(唯一键)
public class TopN{
public static void main(String[] args) throws Exception{
//确保有正确的输入参数 HDFS输入文件 args[0] /top10/input/top10data.txt
if(args.length<1){
System.err.printlin("Usage: Top10<hads-fule>");
System.exit();
}
String inputPath = args[0];
System.out.println("inputpath: <hdfs-file>="+inputPath);
//连接Spark master
JavaSparkContext ctx = new JavaSparkContext();
//从HDFS中读取输入文件并创建RDD
JavaRDD<String> lines = ctx.textFile(inputPath,1);
//从现有的RDD中创建新的RDD
JavaPairRDD<String, Integer> pairs =lines.mapToPair(new PairFunction<String,String,Integer>(){
public Tuple2<String,Integer> call (String s){
String [] tokens = s.split(",");
return new Tuple2<String, Integer>(tokens[0],Integer.parseInt(tokens[1]));}
})
// 为各个分区创建本地top10 列表
JavaRDD<SortedMap<Integer,String>> patitions =pari.mapPartitions(new FlatMapFunction<
Interator<Tuple2<String, Integer>>,
SortedMap<Integer,String>>(){
@Override
public Iterable<SortedMap<Integer,String>> call(Iterator<Tuple2<String,Integer>> iter){
SortedMap<Integer,String> top10 =new TreeMap<Integer,String>();
while(iter.hasNext()){
Tuple2<String,Integer> tuple = iter.next();
top10.put(tuple._2,tuple.1);
if (top10.size())>10{
top10.remove(top10.firstKey());
}
}
return Collections.singletonList(top10);
}});
//创建最终的top10列表
SortedMap<Integer,String> finaltop10 = partitions.reduce(
new Function2<
SortedMap<Integer,String>,
SortedMap<Integer,String>,
SortedMap<Integer,Sting>
>(){
@Override
public SortedMap<Integer,String> call(SortedMap<Integer,String>m1,
SortedMap<Integer,String>m2){
SortedMap<Integer,String> top10 = new TreeMap<Integer,Sting>();
for(Map.Entry<Integer,String> entry: m1.entrySet()){
top10.put(entry.getKey(),entry.getValue());
if (top10.size()>10){
top10.remove(top10.firstKey());
}
}
for(Map.Entry<Integer,String> entry: m2.entrySet()){
top10.put(entry.getKey(),entry.getValue());
if (top10.size()>10){
top10.remove(top10.firstKey());
}
}
return top10
}
});
// 发出最终top10 列表
System.out.println("=====top10 list ====");
for (Map.Entry<Integer,String> entry:finaltop10.entrySet()){
System.out.pintln(entry.getKey()+"---"+entry.getValue());
}
Spark 实现(非唯一键)
public class Top10NonUnique{
public static void main(String[] args) throws Exception{
//处理输入参数
//创建javaspark上下文对象
JavaSparkContext ctx= SparkUtil.createJavaSparkContext("Top10NonUnique");
//将topN广播到所有集群节点
final Broadcast<Integer> topN=ctx.broadcast(N);
//从输入创建RDD
JavaRDD<String> lines=ctx.textFile(inputPath,1);
lines.saveAsTextFile("/output/1");
//RDD分区
JavaRDD<String> rdd = lines.coalesce(9)
//输入映射成键值对
JavaPairRDD<String, Integer> kv = rdd.mapToPair(newPairFunction<String,String,Integer>(){
public Tuple2<String,Integer> call(String s){
String[] tokens = s.split(",");
return new Tuple2<String,Integer>(tokens[0], Integer.parseInt(tokens[1]));
}});
kv.saveAsTextFile("/output/2")
//规约重复的K
JavaPairRDD<String,Integer> uniqueKeys=kv.reduceByKey(new Function2<Integer,Integer,Integer>(){
public Integer call(Integer i1, Integer i2){
return i1+i2;}});
uniqueKeys.saveAsTextFile("/output/3");
//创建本地topN
JavaRDD<SortedMap<Integer,String>> patitions = uniqueKeys.mapPartitions(new FlatMapFunction<
Iterator<Tuple2<String,Integer>>,
SortedMap<Integer,String>>(){
@Override
public Iterable<SortedMap<Integer,String>>call(Iterator<Tuple2<String,Integer>> iter){
final int N=topN.value();
SortedMap<Integer,String> localTopN=new TreeMap<Integer,String>();
while(iter.hasNext()){
Tuple2<String,Integer> tuple =iter.next();
localTopN.put(tuple._2,tup;e._1);
if (localTopN.size()>N){
localTopN.remove(localTopN.firstKey());
}
}
return Collections.singletonList(localTopN);
}
});
patitions.saveAsTextFile("/output/4");
//查找最终topN
SortedMap<Intefer,String> finalTopN =new TreeMap<Integer,String>();
List<SortedMap<Integer,String>> allTopN=patitions.collect();
for (SortedMap<Integer,String> localTopN:allTopN){
for(Map.Entry<Integer,String>entry:localTopN.entrySet()){
finalTopN.put(entry.getKey(),entry.getValue());
if(finalTopN.size()>N){
finalTopN.remove(finalTopN.firstKey());}
}
}
//发出最终topN
}
}
Spark 实现(使用 takeOrdered())
public class TopUsingTakeOrdered implements Serializable{
public static void main(String[] args) throws Exception{
// 处理输入参数
//创建一个javaspark 上下文对象
JavaSparkContext ctx =SparkUtil.createJavaSparkContext("top-10")
//从输入创建一个RDD
JavaRDD<String> lines=ctx.textFile(inputpath,1);
//RDD分区
JavaRDD<String> rdd= lines.coalesce(9);
//从输入创建映射对
JavaPairRDD<String,Integer> kv = rdd.mapToPair(new PairFunction<String,String,Integer>(){
public Tuple2<String,Integer>call(String s){
String[] tokens=s.split(",");
return new Tuple2<String,Integer>(tokens[0],Integer.parseInt(tokens[1]));
}})
//规约 重复的K
JavaPairRDD<String,Integer> uniqueKeys=kv.reduceByKey(new Function2<Integer,Integer,Integer>(){
public Integer call(Integer i1, Integer i2){
return i1+i2
}});
//调用takeordered查找最终topN
List<Tuple2<String,Integer> >topNResult = uniqueKeys.takeOrdered(N, MyTupleComparator.INSTANCE)
//发出最终topN
}
}