Spark编程实战

最新推荐文章于 2020-11-21 04:15:58 发布

疯狂呼呼呼

最新推荐文章于 2020-11-21 04:15:58 发布

阅读量827

点赞数 1

分类专栏： Spark 文章标签： spark

本文链接：https://blog.csdn.net/lemonZhaoTao/article/details/78063752

版权

Spark 专栏收录该内容

28 篇文章 2 订阅

订阅专栏

在前面的几篇博文中，介绍了RDD的创建，以及Spark几种常用transformation算子和action算子的使用方法。
在本篇文章中，将带来Spark核心编程的几个经典案例。

二次排序

案例需求及实现思路：
案例需求：
1.按照文件中的第一列排序
2.如果第一列相同，则按照第二列排序

实现思路：
1.实现自定义的key，要实现Ordered接口和Serializable接口，在key中实现自己对多个列的排序算法
2.将包含文本的RDD，映射成key为自定义key value为文本的JavaPairRDD
3.使用sortByKey算子按照自定义的key进行排序
4.再次映射，剔除自定义的key，只保留文本行

sort.txt文件内容（每行包含两个以单个空格分隔的整数，分别对应第一列和第二列）

二次排序(Java版本)

import java.io.Serializable;

import scala.math.Ordered;

/**
 * Custom composite key for secondary sort: keys are ordered by {@code first},
 * and ties are broken by {@code second}.
 *
 * <p>As noted by the original author, a Java class that implements
 * {@code scala.math.Ordered} provides the trait's methods itself. In Scala they
 * are declared as:
 *
 * <pre>{@code
 * def compare(that: A): Int
 * def <  (that: A): Boolean = (this compare that) <  0
 * def >  (that: A): Boolean = (this compare that) >  0
 * def <= (that: A): Boolean = (this compare that) <= 0
 * def >= (that: A): Boolean = (this compare that) >= 0
 * def compareTo(that: A): Int = compare(that)
 * }</pre>
 *
 * <p>Every operator below delegates to {@link #compare(SecondarySortKey)}, so the
 * ordering is defined in exactly one place and stays consistent with
 * {@link #equals(Object)}.
 *
 * <p>The setters are kept for compatibility; changing a key after it has been
 * handed to a sort or a hash-based collection leads to undefined ordering.
 *
 * @author 陶
 */
public class SecondarySortKey implements Ordered<SecondarySortKey>, Serializable {

    private static final long serialVersionUID = -3565395958411508357L;

    /** Primary sort column. */
    private int first;
    /** Secondary sort column, consulted only when {@code first} is equal. */
    private int second;

    /**
     * Creates a key from the two columns to sort by.
     *
     * @param first  primary sort column
     * @param second secondary sort column
     */
    public SecondarySortKey(int first, int second) {
        this.first = first;
        this.second = second;
    }

    /** @return the primary sort column */
    public int getFirst() {
        return first;
    }

    /** @param first new value of the primary sort column */
    public void setFirst(int first) {
        this.first = first;
    }

    /** @return the secondary sort column */
    public int getSecond() {
        return second;
    }

    /** @param second new value of the secondary sort column */
    public void setSecond(int second) {
        this.second = second;
    }

    @Override
    public int hashCode() {
        final int prime = 31;
        int result = 1;
        result = prime * result + first;
        result = prime * result + second;
        return result;
    }

    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        if (obj == null || getClass() != obj.getClass()) {
            return false;
        }
        SecondarySortKey other = (SecondarySortKey) obj;
        return first == other.first && second == other.second;
    }

    /**
     * {@code >} operator of {@code scala.math.Ordered}.
     *
     * @param other key to compare against
     * @return {@code true} if this key sorts strictly after {@code other}
     */
    @Override
    public boolean $greater(SecondarySortKey other) {
        return compare(other) > 0;
    }

    /**
     * {@code >=} operator of {@code scala.math.Ordered}.
     *
     * @param other key to compare against
     * @return {@code true} if this key sorts after, or equal to, {@code other}
     */
    @Override
    public boolean $greater$eq(SecondarySortKey other) {
        return compare(other) >= 0;
    }

    /**
     * {@code <} operator of {@code scala.math.Ordered}.
     *
     * @param other key to compare against
     * @return {@code true} if this key sorts strictly before {@code other}
     */
    @Override
    public boolean $less(SecondarySortKey other) {
        return compare(other) < 0;
    }

    /**
     * {@code <=} operator of {@code scala.math.Ordered}.
     *
     * @param other key to compare against
     * @return {@code true} if this key sorts before, or equal to, {@code other}
     */
    @Override
    public boolean $less$eq(SecondarySortKey other) {
        return compare(other) <= 0;
    }

    /**
     * Compares by {@code first}, then by {@code second}.
     *
     * <p>Uses {@link Integer#compare(int, int)} rather than subtraction: a
     * difference such as {@code Integer.MIN_VALUE - 1} overflows and would
     * report the wrong sign.
     *
     * @param other key to compare against
     * @return a negative value, zero, or a positive value when this key sorts
     *         before, equal to, or after {@code other}
     */
    @Override
    public int compare(SecondarySortKey other) {
        int byFirst = Integer.compare(this.first, other.first);
        if (byFirst != 0) {
            return byFirst;
        }
        return Integer.compare(this.second, other.second);
    }

    /**
     * Same ordering as {@link #compare(SecondarySortKey)}.
     *
     * @param other key to compare against
     * @return a negative value, zero, or a positive value when this key sorts
     *         before, equal to, or after {@code other}
     */
    @Override
    public int compareTo(SecondarySortKey other) {
        return compare(other);
    }

}

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;

import scala.Tuple2;

/**
 * Secondary sort driver. Approach:
 * <ol>
 *   <li>Define a custom key that implements {@code Ordered} and {@code Serializable}
 *       and encodes the multi-column ordering ({@code SecondarySortKey}).</li>
 *   <li>Map the RDD of text lines to a {@code JavaPairRDD} whose key is the custom
 *       key and whose value is the original line.</li>
 *   <li>Sort with {@code sortByKey}, which orders by the custom key.</li>
 *   <li>Map again to drop the key and keep only the text line.</li>
 * </ol>
 *
 * @author 陶
 */
public class SecondarySort {

    /**
     * Reads {@code sort.txt}, sorts its lines by the first and then the second
     * space-separated integer column, and prints the sorted lines.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        SparkConf conf = new SparkConf()
                .setAppName("SecondarySort")
                .setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);

        // Close the context even when the job fails (e.g. a malformed line).
        try {
            JavaRDD<String> lines = sc.textFile("sort.txt");

            // line -> (key built from the first two columns, original line)
            JavaPairRDD<SecondarySortKey, String> pairs = lines.mapToPair(

                    new PairFunction<String, SecondarySortKey, String>() {

                        private static final long serialVersionUID = 1L;

                        @Override
                        public Tuple2<SecondarySortKey, String> call(String line) throws Exception {
                            String[] columns = line.split(" ");
                            // parseInt yields the primitives the constructor takes, no boxing.
                            SecondarySortKey key = new SecondarySortKey(
                                    Integer.parseInt(columns[0]),
                                    Integer.parseInt(columns[1]));
                            return new Tuple2<SecondarySortKey, String>(key, line);
                        }
                    });

            JavaPairRDD<SecondarySortKey, String> sortedPairs = pairs.sortByKey();

            // Drop the key; only the original text line is needed from here on.
            JavaRDD<String> sortedLines = sortedPairs.map(

                    new Function<Tuple2<SecondarySortKey, String>, String>() {

                        private static final long serialVersionUID = 1L;

                        @Override
                        public String call(Tuple2<SecondarySortKey, String> pair) throws Exception {
                            return pair._2;
                        }
                    });

            sortedLines.foreach(new VoidFunction<String>() {

                private static final long serialVersionUID = 1L;

                @Override
                public void call(String sortedLine) throws Exception {
                    System.out.println(sortedLine);
                }
            });
        } finally {
            sc.close();
        }
    }

}

二次排序(Scala版本)

import scala.math.Ordered

/**
 * Custom key for secondary sort: ordered by `first`, ties broken by `second`.
 * Only `compare` from `Ordered` needs to be implemented; the comparison
 * operators are derived from it by the trait.
 *
 * @param first  primary sort column
 * @param second secondary sort column
 */
class SecondSortKey (val first: Int, val second: Int)
    extends Ordered[SecondSortKey] with Serializable{

  /**
   * Compares by `first`, then by `second`.
   *
   * Uses `Integer.compare` instead of subtraction, because a difference such
   * as `Int.MinValue - 1` overflows and would report the wrong sign.
   *
   * @return a negative value, zero, or a positive value when this key sorts
   *         before, equal to, or after `that`
   */
  def compare(that: SecondSortKey): Int = {
    val byFirst = Integer.compare(this.first, that.first)
    if (byFirst != 0) byFirst
    else Integer.compare(this.second, that.second)
  }

}

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

/**
 * Secondary sort driver: reads `sort.txt`, sorts its lines by the first and
 * then the second space-separated integer column, and prints the sorted lines.
 */
object SecondSort {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("SecondSort").setMaster("local")
    val sc = new SparkContext(conf)

    // Stop the context even when the job fails (e.g. a malformed line).
    try {
      val lines = sc.textFile("sort.txt", 1)

      // line -> (key built from the first two columns, original line);
      // the line is split once and both columns are read from that result.
      val pairs = lines.map { line =>
        val columns = line.split(" ")
        (new SecondSortKey(columns(0).toInt, columns(1).toInt), line)
      }

      val sortedPairs = pairs.sortByKey()
      // Drop the key; only the original text line is printed.
      val sortedLines = sortedPairs.map(pair => pair._2)

      sortedLines.foreach { sortedLine => println(sortedLine) }
    } finally {
      sc.stop()
    }
  }

}

Top N

案例需求及实现思路
案例需求：
1.对文本文件内的数字，取最大的前3个
实现思路:
1.读取文本文件，创建成为RDD
2.转换为Key-Value形式的PairRDD
Key为Integer类型的数字，Value则为文本类型
3.根据Key，调用sortByKey算子进行排序
4.调用take算子，取出Top3
5.将取出的数字（即key）进行打印

top.txt（每行一个整数）

Top N(Java版本)

import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;

import scala.Tuple2;

/**
 * Prints the three largest numbers of a text file (Top 3).
 * <ol>
 *   <li>Read the text file into an RDD.</li>
 *   <li>Convert it to a key-value pair RDD: the key is the line parsed as an
 *       {@code Integer}, the value is the original text.</li>
 *   <li>Sort by key in descending order with {@code sortByKey(false)}.</li>
 *   <li>Keep only the keys and fetch the first three with {@code take(3)}.</li>
 *   <li>Print those three numbers.</li>
 * </ol>
 *
 * @author 陶
 */
public class Top3 {

    /**
     * Runs the Top 3 job against {@code top.txt}.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        SparkConf conf = new SparkConf()
                .setAppName("Top3")
                .setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);

        // Close the context even when the job fails (e.g. a non-numeric line).
        try {
            JavaRDD<String> lines = sc.textFile("top.txt");

            // line -> (number parsed from the line, original line)
            JavaPairRDD<Integer, String> pairs = lines.mapToPair(

                    new PairFunction<String, Integer, String>() {

                        private static final long serialVersionUID = 1L;

                        @Override
                        public Tuple2<Integer, String> call(String line) throws Exception {
                            return new Tuple2<Integer, String>(Integer.valueOf(line), line);
                        }

                    });

            // false = descending, so the largest numbers come first.
            JavaPairRDD<Integer, String> sortedPairs = pairs.sortByKey(false);

            JavaRDD<Integer> sortedNumbers = sortedPairs.map(

                    new Function<Tuple2<Integer, String>, Integer>() {

                        private static final long serialVersionUID = 1L;

                        @Override
                        public Integer call(Tuple2<Integer, String> pair) throws Exception {
                            return pair._1;
                        }
                    });

            List<Integer> top3Numbers = sortedNumbers.take(3);

            for (Integer number : top3Numbers) {
                System.out.println(number);
            }
        } finally {
            sc.close();
        }
    }

}

Top N(Scala版本)

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

/**
 * Prints the three largest numbers found in `top.txt`: each line is parsed as
 * an Int, the pairs are sorted by that number in descending order, and the
 * first three keys are taken and printed.
 */
object Top3 {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("Top3").setMaster("local")
    val sc = new SparkContext(conf)

    // Stop the context even when the job fails (e.g. a non-numeric line).
    try {
      val lines = sc.textFile("top.txt", 1)
      // line -> (number parsed from the line, original line)
      val pairs = lines.map { line => (line.toInt, line) }
      // false = descending, so the largest numbers come first.
      val sortedPairs = pairs.sortByKey(false)
      val sortedNumbers = sortedPairs.map(sortedPair => sortedPair._1)
      val top3Number = sortedNumbers.take(3)

      for (num <- top3Number) {
        println(num)
      }
    } finally {
      sc.stop()
    }
  }

}

分组取Top N

案例需求及实现思路
案例需求：
1.对每个班级内的学生成绩，取出前3名
实现思路:
[分组取Top3–Java版本]
1.读取文本文件成为RDD，使用mapToPair算子映射为key-value形式的pair RDD
2.使用groupByKey算子，对key进行分组
3.再次使用mapToPair算子，映射为className-score形式的kv对
其中，score即为各组前三的成绩，取得前三成绩并未像scala中那样使用sortWith
而是自己实现了一套排序算法
4.使用foreach算子打印结果
[分组取Top3–Scala版本]
实现起来比Java更简单，因为scala可以直接使用sortWith进行排序
1.读取文本文件为RDD，使用map算子将其映射为key-value形式的RDD
2.使用groupByKey算子，按key进行分组
3.使用map算子，将分组之后的RDD映射为key-value形式的RDD
key为className，value为score(前3的成绩)
其中取得前3的成绩，先后使用sortWith(其中，sortWith为scala中的函数，可直接调用)和take算子
4.使用foreach算子，将结果进行打印

score.txt

class1 90
class2 56
class1 87
class1 76
class2 88
class1 95
class1 74
class2 87
class2 67
class2 77

分组取Top N(Java版本)

import java.util.Arrays;
import java.util.Iterator;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;

import scala.Tuple2;

/**
 * Top 3 scores per group (class).
 * <ol>
 *   <li>Read the text file into an RDD and map it with {@code mapToPair} to
 *       (className, score) pairs.</li>
 *   <li>Group the pairs by key with {@code groupByKey}.</li>
 *   <li>Map again with {@code mapToPair} to (className, top scores). Unlike the
 *       Scala version, which uses {@code sortWith}, the top scores are selected
 *       by a small hand-written insertion routine.</li>
 *   <li>Print the result with {@code foreach}.</li>
 * </ol>
 *
 * @author 陶
 */
public class GroupTop3 {

    /** Number of best scores kept per class. */
    private static final int TOP_N = 3;

    /**
     * Runs the grouped Top 3 job against {@code score.txt}, whose lines have the
     * form {@code "<className> <score>"}.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        SparkConf conf = new SparkConf()
                .setAppName("GroupTop3")
                .setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);

        // Close the context even when the job fails (e.g. a malformed line).
        try {
            JavaRDD<String> lines = sc.textFile("score.txt");

            // line -> (className, score)
            JavaPairRDD<String, Integer> pairs = lines.mapToPair(
                    new PairFunction<String, String, Integer>() {

                        private static final long serialVersionUID = 1L;

                        @Override
                        public Tuple2<String, Integer> call(String line) throws Exception {
                            String[] columns = line.split(" ");
                            return new Tuple2<String, Integer>(columns[0], Integer.valueOf(columns[1]));
                        }
                    });

            JavaPairRDD<String, Iterable<Integer>> groupedPairs = pairs.groupByKey();

            // (className, all scores) -> (className, best TOP_N scores, highest first)
            JavaPairRDD<String, Iterable<Integer>> top3Score = groupedPairs.mapToPair(

                    new PairFunction<Tuple2<String, Iterable<Integer>>, String, Iterable<Integer>>() {

                        private static final long serialVersionUID = 1L;

                        @Override
                        public Tuple2<String, Iterable<Integer>> call(Tuple2<String, Iterable<Integer>> classScores) throws Exception {
                            return new Tuple2<String, Iterable<Integer>>(
                                    classScores._1, selectTop(classScores._2, TOP_N));
                        }
                    });

            top3Score.foreach(new VoidFunction<Tuple2<String, Iterable<Integer>>>() {

                private static final long serialVersionUID = 1L;

                @Override
                public void call(Tuple2<String, Iterable<Integer>> classTop) throws Exception {
                    System.out.println("class:" + classTop._1);
                    for (Integer score : classTop._2) {
                        System.out.println(score);
                    }
                    System.out.println("======================");
                }
            });
        } finally {
            sc.close();
        }
    }

    /**
     * Returns the {@code limit} largest scores in descending order.
     *
     * <p>Each incoming score is compared with the scores kept so far, from the
     * highest down. At the first position where it is strictly larger (or at the
     * first free slot), the lower entries shift down by one and the score is
     * inserted there. A score equal to a kept one is placed after it.
     *
     * <p>When fewer than {@code limit} scores exist, only those are returned;
     * the result never contains {@code null} padding.
     *
     * @param scores all scores of one group
     * @param limit  maximum number of scores to keep
     * @return at most {@code limit} scores, highest first
     */
    private static Iterable<Integer> selectTop(Iterable<Integer> scores, int limit) {
        Integer[] best = new Integer[limit];
        int filled = 0;

        for (Integer score : scores) {
            // Skip every kept score that is >= the new one.
            int pos = 0;
            while (pos < filled && score <= best[pos]) {
                pos++;
            }
            if (pos >= limit) {
                continue; // not better than anything already kept
            }
            // Shift lower scores down by one; the last one falls off when full.
            int last = Math.min(filled, limit - 1);
            for (int j = last; j > pos; j--) {
                best[j] = best[j - 1];
            }
            best[pos] = score;
            if (filled < limit) {
                filled++;
            }
        }

        // Trim unused slots so callers never see null entries.
        return Arrays.asList(Arrays.copyOf(best, filled));
    }

}

分组取Top N(Scala版本)

import org.apache.spark.SparkConf
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

/*
 * Top 3 scores per group (class).
 * Simpler than the Java version because Scala can sort directly with sortWith.
 * 1. Read the text file into an RDD and map it to a key-value RDD.
 * 2. Group by key with groupByKey.
 * 3. Map the grouped RDD to a key-value RDD whose key is the className and
 *    whose value is the top 3 scores, obtained by sortWith (a Scala collection
 *    function) in descending order followed by take(3).
 * 4. Print the result with foreach.
 */
object GroupTop3 {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("GroupTop3").setMaster("local")
    val sc = new SparkContext(conf)

    // Stop the context even when the job fails (e.g. a malformed line).
    try {
      val lines = sc.textFile("score.txt", 1)

      // line -> (className, score); the line is split once.
      val pairs = lines.map { line =>
        val columns = line.split(" ")
        (columns(0), columns(1).toInt)
      }

      val groupedPairs = pairs.groupByKey()

      // (className, all scores) -> (className, best 3 scores, highest first)
      val top3Score = groupedPairs.map { classScores =>
        val className = classScores._1
        val topScores = classScores._2.toArray.sortWith(_ > _).take(3)
        (className, topScores)
      }

      top3Score.foreach { classTop =>
        println("class：" + classTop._1)
        classTop._2.foreach(topScore => println(topScore))
      }
    } finally {
      sc.stop()
    }
  }

}