spark 二次排序

通过代码实现spark的二次排序

1 自定义二次排序的 key:实现 Ordered 接口

/**
 * Custom composite key for a secondary sort.
 *
 * <p>Keys are ordered by {@code first}; ties are broken by {@code second}.
 * All comparison methods are derived from {@link #compare(SecondarySortKey)}
 * so that they can never disagree with one another.
 *
 * @author Administrator
 *
 */
public class SecondarySortKey implements Ordered<SecondarySortKey>, Serializable {

   private static final long serialVersionUID = -2366006422945129991L;
   
   // The columns that take part in the ordering, in priority order.
   private int first;
   private int second;
   
   /**
    * Creates a key from the two sort columns.
    *
    * @param first  primary sort column
    * @param second secondary sort column, used only when {@code first} ties
    */
   public SecondarySortKey(int first, int second) {
      this.first = first;
      this.second = second;
   }

   /**
    * @param other the key to compare against
    * @return {@code true} if this key sorts strictly after {@code other}
    */
   @Override
   public boolean $greater(SecondarySortKey other) {
      return compare(other) > 0;
   }
   
   /**
    * @param other the key to compare against
    * @return {@code true} if this key sorts after, or is equal to, {@code other}
    */
   @Override
   public boolean $greater$eq(SecondarySortKey other) {
      return compare(other) >= 0;
   }

   /**
    * @param other the key to compare against
    * @return {@code true} if this key sorts strictly before {@code other}
    */
   @Override
   public boolean $less(SecondarySortKey other) {
      return compare(other) < 0;
   }
   
   /**
    * @param other the key to compare against
    * @return {@code true} if this key sorts before, or is equal to, {@code other}
    */
   @Override
   public boolean $less$eq(SecondarySortKey other) {
      return compare(other) <= 0;
   }
   
   /**
    * Compares this key with {@code other}, first by {@code first} and then by
    * {@code second}.
    *
    * <p>Uses {@link Integer#compare(int, int)} rather than subtraction:
    * subtracting two ints overflows when the operands have opposite signs and
    * large magnitude (e.g. {@code Integer.MIN_VALUE - 1}), which would flip the
    * sign of the result and corrupt the sort order.
    *
    * @param other the key to compare against
    * @return a negative value, zero, or a positive value as this key is less
    *         than, equal to, or greater than {@code other}; only the sign is
    *         meaningful
    */
   @Override
   public int compare(SecondarySortKey other) {
      int byFirst = Integer.compare(this.first, other.getFirst());
      if (byFirst != 0) {
         return byFirst;
      }
      return Integer.compare(this.second, other.getSecond());
   }
   
   /**
    * Same ordering as {@link #compare(SecondarySortKey)}.
    *
    * @param other the key to compare against
    * @return a negative value, zero, or a positive value as this key is less
    *         than, equal to, or greater than {@code other}
    */
   @Override
   public int compareTo(SecondarySortKey other) {
      return compare(other);
   }
   
   // Getters and setters for the sort columns, followed by hashCode and equals.
   public int getFirst() {
      return first;
   }

   public void setFirst(int first) {
      this.first = first;
   }

   public int getSecond() {
      return second;
   }

   public void setSecond(int second) {
      this.second = second;
   }

   /**
    * Hash code computed from both sort columns.
    */
   @Override
   public int hashCode() {
      final int prime = 31;
      int result = 1;
      result = prime * result + first;
      result = prime * result + second;
      return result;
   }

   /**
    * Two keys are equal when they are of exactly the same class and both sort
    * columns match.
    */
   @Override
   public boolean equals(Object obj) {
      if (this == obj) {
         return true;
      }
      if (obj == null || getClass() != obj.getClass()) {
         return false;
      }
      SecondarySortKey other = (SecondarySortKey) obj;
      return first == other.first && second == other.second;
   }
   
}

2 实现二次排序的 Spark 程序

/**
 * Secondary sort.
 * 1. Implement a custom key that implements the Ordered and Serializable
 *    interfaces and carries the multi-column ordering logic.
 * 2. Map the RDD of text lines to a JavaPairRDD whose key is the custom key
 *    and whose value is the original line.
 * 3. Sort by the custom key with the sortByKey operator.
 * 4. Map again to drop the custom key and keep only the text lines.
 * @author jhp
 *
 */
public class SecondarySort {

   // Input file used when no path is supplied on the command line.
   private static final String DEFAULT_INPUT_PATH =
         "C://Users//Administrator//Desktop//sort.txt";

   /**
    * Reads a text file, sorts its lines by the two integers at the start of
    * each line (space-separated), and prints the sorted lines.
    *
    * @param args optional; {@code args[0]}, when present, is the input file
    *             path, otherwise {@link #DEFAULT_INPUT_PATH} is used
    */
   public static void main(String[] args) {
      String inputPath = (args != null && args.length > 0) ? args[0] : DEFAULT_INPUT_PATH;

      SparkConf conf = new SparkConf()
            .setAppName("SecondarySort") 
            .setMaster("local");
      JavaSparkContext sc = new JavaSparkContext(conf);

      // Close the context even when a step below throws, so it is not leaked.
      try {
         JavaRDD<String> lines = sc.textFile(inputPath);

         // Step 2: key each line by its first two space-separated integers.
         JavaPairRDD<SecondarySortKey, String> pairs = lines.mapToPair(

               new PairFunction<String, SecondarySortKey, String>() {

                  private static final long serialVersionUID = 1L;

                  @Override
                  public Tuple2<SecondarySortKey, String> call(String line) throws Exception {
                     String[] lineSplited = line.split(" ");  
                     SecondarySortKey key = new SecondarySortKey(
                           Integer.parseInt(lineSplited[0]), 
                           Integer.parseInt(lineSplited[1]));  
                     return new Tuple2<SecondarySortKey, String>(key, line);
                  }

               });

         // Step 3: sort by the custom key.
         JavaPairRDD<SecondarySortKey, String> sortedPairs = pairs.sortByKey();

         // Step 4: drop the key and keep only the original line.
         JavaRDD<String> sortedLines = sortedPairs.map(

               new Function<Tuple2<SecondarySortKey,String>, String>() {

                  private static final long serialVersionUID = 1L;

                  @Override
                  public String call(Tuple2<SecondarySortKey, String> v1) throws Exception {
                     return v1._2;
                  }

               });

         // Print every sorted line to standard output.
         sortedLines.foreach(new VoidFunction<String>() {

            private static final long serialVersionUID = 1L;

            @Override
            public void call(String t) throws Exception {
               System.out.println(t);  
            }

         });
      } finally {
         sc.close();
      }
   }
   
}

阅读更多
上一篇spark排序版本的wordcount
下一篇spark 高级算子
想对作者说点什么? 我来说一句

没有更多推荐了,返回首页

关闭
关闭