[Note Migration][Spark][8] Common Application Examples

Note: some of the examples below are implemented twice (once in Java, once in Scala) as practice.

1. WordCount with sorting

/**
 * WordCount with sorting
 * Requirements:
 * 1. Count the occurrences of every word in a text file
 * 2. Sort the words by occurrence count in descending order
 * @author Z-Jay
 *
 */
public class SortedWordCount {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("SortedWordCount").setMaster("local");

        JavaSparkContext sc = new JavaSparkContext(conf);

        JavaRDD<String> linesRDD = sc.textFile("C:\\Users\\Z-Jay\\Desktop\\spark.txt");

        JavaRDD<String> words = linesRDD.flatMap(new FlatMapFunction<String, String>() {

            private static final long serialVersionUID = 1L;

            @Override
            public Iterator<String> call(String line) throws Exception {
                String[] split = line.split(" ");
                return Arrays.asList(split).iterator();
            }
        });

        JavaPairRDD<String, Integer> pairs = words.mapToPair(new PairFunction<String, String, Integer>() {

            private static final long serialVersionUID = 1L;

            @Override
            public Tuple2<String, Integer> call(String word) throws Exception {
                return new Tuple2<>(word, 1);
            }
        });

        JavaPairRDD<String, Integer> wordCounts = pairs.reduceByKey(new Function2<Integer, Integer, Integer>() {

            private static final long serialVersionUID = 1L;

            @Override
            public Integer call(Integer v1, Integer v2) throws Exception {
                return v1 + v2;
            }
        });

        // At this point we have the count of every word; elements of wordCounts look like ("hello", 2).
        // To satisfy the new requirement of sorting by occurrence count in descending order, the RDD
        // must first be restructured into elements of the form (2, "hello") so that it can be sorted by key.

        // Swap K and V for sorting
        JavaPairRDD<Integer, String> countWords = wordCounts.mapToPair(new PairFunction<Tuple2<String, Integer>, Integer, String>() {

            private static final long serialVersionUID = 1L;

            @Override
            public Tuple2<Integer, String> call(Tuple2<String, Integer> tuple) throws Exception {
                return new Tuple2<>(tuple._2, tuple._1);
            }
        });

        // Sort by key, descending
        JavaPairRDD<Integer, String> sortedPairs = countWords.sortByKey(false);

        // Swap K and V back for display
        JavaPairRDD<String, Integer> resPairs = sortedPairs.mapToPair(new PairFunction<Tuple2<Integer, String>, String, Integer>() {

            private static final long serialVersionUID = 1L;

            @Override
            public Tuple2<String, Integer> call(Tuple2<Integer, String> tuple) throws Exception {
                return new Tuple2<>(tuple._2, tuple._1);
            }
        });

        resPairs.foreach(new VoidFunction<Tuple2<String, Integer>>() {

            private static final long serialVersionUID = 1L;

            @Override
            public void call(Tuple2<String, Integer> tuple) throws Exception {
                System.out.println(tuple._1 + ":" + tuple._2);
            }
        });

        sc.close();
    }
}

object SortedWordCount {

  def main(args: Array[String]) {

    val conf = new SparkConf().setAppName("SortedWordCount").setMaster("local")

    val sc = new SparkContext(conf)

    val lines = sc.textFile("C:\\Users\\Z-Jay\\Desktop\\spark.txt", 1)

    val words = lines.flatMap { line => line.split(" ") }

    val pairs = words.map { word => (word, 1) }

    // Sum the counts for each word
    val wordCounts = pairs.reduceByKey(_ + _)

    // Swap K and V so the count becomes the sort key
    val countWords = wordCounts.map { wordCount => (wordCount._2, wordCount._1) }

    val sortedCountWords = countWords.sortByKey(false)

    // Swap back for display
    val sortedWordCounts = sortedCountWords.map { countWord => (countWord._2, countWord._1) }

    sortedWordCounts.foreach { wordCount => println(wordCount._1 + " : " + wordCount._2) }
  }
}
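
The double key/value swap above is only needed because sortByKey sorts by key. RDD.sortBy accepts an arbitrary key-extractor function, so the same result can be reached without reshaping the pairs. A minimal Scala sketch of that alternative, assuming the same input file (the object name is a placeholder):

import org.apache.spark.{SparkConf, SparkContext}

// Minimal alternative sketch: sort the (word, count) pairs directly by the count
// field via sortBy, so no key/value swap is needed. Object name is a placeholder.
object SortedWordCountSortBy {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("SortedWordCountSortBy").setMaster("local"))

    val wordCounts = sc.textFile("C:\\Users\\Z-Jay\\Desktop\\spark.txt")
      .flatMap(_.split(" "))
      .map(word => (word, 1))
      .reduceByKey(_ + _)

    // sortBy takes any key-extractor; here we sort by the count, descending
    wordCounts.sortBy(_._2, ascending = false)
      .foreach { case (word, count) => println(word + " : " + count) }

    sc.stop()
  }
}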

2. Secondary sort (ORDER BY col1, col2)

/**
 * Secondary sort
 * Requirements:
 * 1. Sort by the first column of the file
 * 2. When the first column is equal, sort by the second column
 *
 * Conventions:
 * 1. Define a custom secondary-sort key that implements the Ordered<T> interface and Serializable
 * 2. In the custom key, declare the columns that take part in the sort
 * 3. Provide getters, setters, hashCode and equals for the sort columns
 * @author Z-Jay
 *
 */
public class DoubleSortKey implements Ordered<DoubleSortKey>, Serializable {

    private static final long serialVersionUID = 1L;

    // Columns that take part in the sort
    private Integer firstCol;

    private Integer secondCol;

    // Constructor used when wrapping each input line with the key
    public DoubleSortKey(Integer firstCol, Integer secondCol) {
        this.firstCol = firstCol;
        this.secondCol = secondCol;
    }

    // Definition of this > other
    @Override
    public boolean $greater(DoubleSortKey other) {
        if (this.firstCol > other.firstCol) {
            return true;
        } else if (this.firstCol.equals(other.firstCol) && this.secondCol > other.secondCol) {
            return true;
        }
        return false;
    }

    // Definition of this >= other
    @Override
    public boolean $greater$eq(DoubleSortKey other) {
        if (this.$greater(other)) {
            return true;
        } else if (this.firstCol.equals(other.firstCol) && this.secondCol.equals(other.secondCol)) {
            return true;
        }
        return false;
    }

    // Definition of this < other
    @Override
    public boolean $less(DoubleSortKey other) {
        if (this.firstCol < other.firstCol) {
            return true;
        } else if (this.firstCol.equals(other.firstCol) && this.secondCol < other.secondCol) {
            return true;
        }
        return false;
    }

    // Definition of this <= other
    @Override
    public boolean $less$eq(DoubleSortKey other) {
        if (this.$less(other)) {
            return true;
        } else if (this.firstCol.equals(other.firstCol) && this.secondCol.equals(other.secondCol)) {
            return true;
        }
        return false;
    }

    @Override
    public int compare(DoubleSortKey other) {
        if (this.firstCol - other.getFirstCol() != 0) {
            return this.firstCol - other.getFirstCol();
        } else {
            return this.secondCol - other.getSecondCol();
        }
    }

    // compareTo uses the same definition as compare
    @Override
    public int compareTo(DoubleSortKey other) {
        if (this.firstCol - other.getFirstCol() != 0) {
            return this.firstCol - other.getFirstCol();
        } else {
            return this.secondCol - other.getSecondCol();
        }
    }

    // Getters, setters, hashCode and equals for the sort columns
    public Integer getFirstCol() {
        return firstCol;
    }

    public void setFirstCol(Integer firstCol) {
        this.firstCol = firstCol;
    }

    public Integer getSecondCol() {
        return secondCol;
    }

    public void setSecondCol(Integer secondCol) {
        this.secondCol = secondCol;
    }

    @Override
    public int hashCode() {
        final int prime = 31;
        int result = 1;
        result = prime * result + ((firstCol == null) ? 0 : firstCol.hashCode());
        result = prime * result + ((secondCol == null) ? 0 : secondCol.hashCode());
        return result;
    }

    @Override
    public boolean equals(Object obj) {
        if (this == obj)
            return true;
        if (obj == null)
            return false;
        if (getClass() != obj.getClass())
            return false;
        DoubleSortKey other = (DoubleSortKey) obj;
        if (firstCol == null) {
            if (other.firstCol != null)
                return false;
        } else if (!firstCol.equals(other.firstCol))
            return false;
        if (secondCol == null) {
            if (other.secondCol != null)
                return false;
        } else if (!secondCol.equals(other.secondCol))
            return false;
        return true;
    }

}

public class DoubleSort {

    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("DoubleSort").setMaster("local");

        JavaSparkContext sc = new JavaSparkContext(conf);

        JavaRDD<String> lines = sc.textFile("C:\\Users\\Z-Jay\\Desktop\\sort.txt");

        // Wrap each line with the custom key, producing pairs of the form (DoubleSortKey, "{line}")
        JavaPairRDD<DoubleSortKey, String> pairs = lines.mapToPair(new PairFunction<String, DoubleSortKey, String>() {

            private static final long serialVersionUID = 1L;

            @Override
            public Tuple2<DoubleSortKey, String> call(String line) throws Exception {
                String[] cols = line.split(" ");
                DoubleSortKey doubleSortKey = new DoubleSortKey(Integer.parseInt(cols[0]), Integer.parseInt(cols[1]));
                return new Tuple2<>(doubleSortKey, line);
            }
        });

        // Sort according to the custom ordering
        JavaPairRDD<DoubleSortKey, String> sortedLines = pairs.sortByKey();

        // Restore the original text and drop the custom key: the key is only an
        // intermediate tool used for sorting
        JavaRDD<String> res = sortedLines.map(new Function<Tuple2<DoubleSortKey, String>, String>() {

            private static final long serialVersionUID = 1L;

            @Override
            public String call(Tuple2<DoubleSortKey, String> tuple) throws Exception {
                return tuple._2;
            }
        });

        res.foreach(new VoidFunction<String>() {

            private static final long serialVersionUID = 1L;

            @Override
            public void call(String line) throws Exception {
                System.out.println(line);
            }
        });

        sc.close();
    }

}

class DoubleSortKey(val first: Int, val second: Int) extends Ordered[DoubleSortKey] with Serializable {

  def compare(that: DoubleSortKey): Int = {
    // Sort by the first column; fall back to the second column when the first is equal
    if (this.first - that.first != 0) {
      this.first - that.first
    } else {
      this.second - that.second
    }
  }

}

object DoubleSort {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("DoubleSort").setMaster("local")

    val sc = new SparkContext(conf)

    val lines = sc.textFile("C:\\Users\\Z-Jay\\Desktop\\sort.txt", 1)

    val pairs = lines.map { line =>
      (new DoubleSortKey(line.split(" ")(0).toInt, line.split(" ")(1).toInt), line)
    }

    val sortedPairs = pairs.sortByKey()

    val sortedLines = sortedPairs.map { sortedPair => sortedPair._2 }

    sortedLines.foreach { sortedLine => println(sortedLine) }

  }
}
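
Since Scala already provides an Ordering for tuples (first element, then second), a secondary sort over two numeric columns can also be done without a custom key class. A minimal sketch under that assumption (the object name is a placeholder):

import org.apache.spark.{SparkConf, SparkContext}

// Minimal sketch: use an (Int, Int) tuple as the sort key instead of a custom Ordered class.
// Scala's built-in tuple Ordering compares the first element, then the second,
// which gives the ORDER BY col1, col2 effect. Object name is a placeholder.
object DoubleSortWithTupleKey {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("DoubleSortWithTupleKey").setMaster("local"))

    val lines = sc.textFile("C:\\Users\\Z-Jay\\Desktop\\sort.txt", 1)

    lines
      .map { line =>
        val cols = line.split(" ")
        ((cols(0).toInt, cols(1).toInt), line)   // key = (col1, col2)
      }
      .sortByKey()                               // sorted by col1, then col2
      .map(_._2)                                 // drop the key, keep the original line
      .foreach(println)

    sc.stop()
  }
}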

3. Top N

(1) Simple, ungrouped

/**
 * Simple Top N
 * Take the 3 largest numbers from a text file
 *
 * @author Z-Jay
 *
 */
public class TopN1 {

    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("TopN1").setMaster("local");

        JavaSparkContext sc = new JavaSparkContext(conf);

        JavaRDD<String> lines = sc.textFile("C:\\Users\\Z-Jay\\Desktop\\topn1.txt");

        // Convert every String element of the RDD into an Integer key (the String value is unused)
        JavaPairRDD<Integer, String> numbers = lines.mapToPair(new PairFunction<String, Integer, String>() {

            private static final long serialVersionUID = 1L;

            @Override
            public Tuple2<Integer, String> call(String numStr) throws Exception {
                return new Tuple2<>(Integer.parseInt(numStr), "");
            }
        });

        // Sort by key, descending
        JavaPairRDD<Integer, String> sortedNums = numbers.sortByKey(false);

        // Keep only the numeric keys
        JavaRDD<Integer> sortedNumsOnly = sortedNums.map(new Function<Tuple2<Integer, String>, Integer>() {

            private static final long serialVersionUID = 1L;

            @Override
            public Integer call(Tuple2<Integer, String> tuple) throws Exception {
                return tuple._1;
            }
        });

        // The first 3 elements are the 3 largest numbers
        List<Integer> res = sortedNumsOnly.take(3);

        System.out.println(res);

        sc.close();
    }

}

object TopN {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("TopN").setMaster("local")

    val sc = new SparkContext(conf)

    val numStrs = sc.textFile("C:\\Users\\Z-Jay\\Desktop\\topn1.txt")

    // Use the number itself as the sort key
    val pairs = numStrs.map { numStr => (numStr.toInt, numStr) }

    val sortedPairs = pairs.sortByKey(false)

    val sortedNums = sortedPairs.map { sortedPair => sortedPair._1 }

    val res = sortedNums.take(3)

    for (num <- res) {
      println(num)
    }

  }

}
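For the ungrouped case, the same result can also be obtained more directly with RDD.top, which returns the n largest elements under the implicit ordering. A minimal sketch (the object name is a placeholder):

import org.apache.spark.{SparkConf, SparkContext}

// Minimal sketch: RDD.top(n) returns the n largest elements according to the implicit
// Ordering, so no explicit sortByKey/take round trip is needed. Object name is a placeholder.
object TopNWithTop {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("TopNWithTop").setMaster("local"))

    val top3 = sc.textFile("C:\\Users\\Z-Jay\\Desktop\\topn1.txt")
      .map(_.trim.toInt)
      .top(3)            // Array of the 3 largest numbers, in descending order

    top3.foreach(println)

    sc.stop()
  }
}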

(2) Grouped

/**
 * Grouped Top N
 * Show the top 3 scores of each class
 * @author Z-Jay
 *
 */
public class TopN2 {

    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("TopN2").setMaster("local");

        JavaSparkContext sc = new JavaSparkContext(conf);

        JavaRDD<String> lines = sc.textFile("C:\\Users\\Z-Jay\\Desktop\\top2.txt");

        // Each line has the form "className score"; map it to a (className, score) pair
        JavaPairRDD<String, Integer> classScores = lines.mapToPair(new PairFunction<String, String, Integer>() {

            private static final long serialVersionUID = 1L;

            @Override
            public Tuple2<String, Integer> call(String line) throws Exception {
                String[] split = line.split(" ");
                return new Tuple2<>(split[0], Integer.parseInt(split[1]));
            }
        });

        // Group the scores by class
        JavaPairRDD<String, Iterable<Integer>> groupClassScores = classScores.groupByKey();

        // For each class, keep only the 3 largest scores via insertion into a fixed-size array
        JavaPairRDD<String, Iterable<Integer>> resScores = groupClassScores.mapToPair(new PairFunction<Tuple2<String, Iterable<Integer>>, String, Iterable<Integer>>() {

            private static final long serialVersionUID = 1L;

            @Override
            public Tuple2<String, Iterable<Integer>> call(Tuple2<String, Iterable<Integer>> tuple) throws Exception {
                Integer[] tempVal = new Integer[3];

                Iterator<Integer> tempScores = tuple._2.iterator();

                while (tempScores.hasNext()) {
                    Integer score = tempScores.next();

                    for (int i = 0; i < 3; i++) {
                        if (tempVal[i] == null) {
                            tempVal[i] = score;
                            break;
                        } else if (score > tempVal[i]) {
                            // Shift smaller scores down to make room for the new one
                            for (int j = 2; j > i; j--) {
                                tempVal[j] = tempVal[j - 1];
                            }
                            tempVal[i] = score;
                            break;
                        }
                    }
                }

                return new Tuple2<>(tuple._1, Arrays.asList(tempVal));
            }
        });

        resScores.foreach(new VoidFunction<Tuple2<String, Iterable<Integer>>>() {

            private static final long serialVersionUID = 1L;

            @Override
            public void call(Tuple2<String, Iterable<Integer>> t) throws Exception {
                System.out.println(t._1 + ":" + t._2);
            }
        });

        sc.close();
    }
}
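
A minimal Scala sketch of the same grouped Top N, assuming the same "className score" line format as the Java version (the object name is a placeholder). Sorting each group's scores in memory is fine for small groups but would not scale to very large ones:

import org.apache.spark.{SparkConf, SparkContext}

// Minimal Scala sketch of grouped Top N: group the scores by class, then sort each
// group's scores descending and keep the first 3. Object name is a placeholder.
object TopN2Scala {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("TopN2Scala").setMaster("local"))

    val classScores = sc.textFile("C:\\Users\\Z-Jay\\Desktop\\top2.txt")
      .map { line =>
        val cols = line.split(" ")
        (cols(0), cols(1).toInt)          // (className, score)
      }

    val top3PerClass = classScores
      .groupByKey()                        // (className, Iterable[score])
      .mapValues(scores => scores.toList.sortBy(-_).take(3))

    top3PerClass.foreach { case (className, scores) =>
      println(className + ":" + scores.mkString(", "))
    }

    sc.stop()
  }
}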