6.Spark综合练习--二次排序、TopN

最新推荐文章于 2021-06-06 16:40:17 发布

挨踢正能量

最新推荐文章于 2021-06-06 16:40:17 发布

阅读量407

点赞数

分类专栏： # spark 文章标签： spark二次排序

本文链接：https://blog.csdn.net/wangguohe/article/details/79121967

版权

spark 专栏收录该内容

12 篇文章 0 订阅

订阅专栏

在此练习之前，要了解二次排序的使用。

java代码示例

package secondary;

import scala.math.Ordered;

import java.io.Serializable;

/**
 * Created by Administrator on 2018/1/18.
 */
/**
 * Two-field sort key: keys are ordered by {@code first}, and ties are broken
 * by {@code second}.
 *
 * <p>Every relational method is derived from {@link #compare(SecondarySortKey)}
 * so that they can never disagree with each other.
 */
public class SecondarySortKey implements Ordered<SecondarySortKey>,Serializable {
    private int first;
    private int second;

    public SecondarySortKey(int first, int second) {
        this.first = first;
        this.second = second;
    }

    public int getFirst() {
        return first;
    }

    public void setFirst(int first) {
        this.first = first;
    }

    public int getSecond() {
        return second;
    }

    public void setSecond(int second) {
        this.second = second;
    }

    /**
     * Compares by {@code first}, then by {@code second}.
     *
     * @param that the key to compare against
     * @return a negative value, zero, or a positive value when this key is
     *         smaller than, equal to, or greater than {@code that}
     */
    @Override
    public int compare(SecondarySortKey that) {
        // Integer.compare instead of subtraction: "a - b" overflows when the
        // operands are far apart (e.g. Integer.MIN_VALUE - 1) and flips the sign.
        int byFirst = Integer.compare(this.first, that.first);
        return byFirst != 0 ? byFirst : Integer.compare(this.second, that.second);
    }

    /** @return true when this key sorts strictly before {@code that} */
    @Override
    public boolean $less(SecondarySortKey that) {
        return compare(that) < 0;
    }

    /** @return true when this key sorts strictly after {@code that}; false for equal keys */
    @Override
    public boolean $greater(SecondarySortKey that) {
        // Must not be implemented as "!$less(that)": that is also true for equal keys.
        return compare(that) > 0;
    }

    /** @return true when this key sorts before, or is equal to, {@code that} */
    @Override
    public boolean $less$eq(SecondarySortKey that) {
        return compare(that) <= 0;
    }

    /** @return true when this key sorts after, or is equal to, {@code that} */
    @Override
    public boolean $greater$eq(SecondarySortKey that) {
        return compare(that) >= 0;
    }

    /** Same ordering as {@link #compare(SecondarySortKey)}. */
    @Override
    public int compareTo(SecondarySortKey that) {
        return compare(that);
    }
}

Test：

package secondary;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;
import scala.Tuple2;

/**
 * Created by Administrator on 2018/1/18.
 */
/**
 * Demo driver: reads a text file, sorts its lines in descending order of a
 * {@link SecondarySortKey} built from the first two comma-separated integers of
 * each line, and prints the sorted lines.
 */
public class SortSecondTest {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local").setAppName("Test");
        JavaSparkContext sc = new JavaSparkContext(conf);
        try {
            // Each line is parsed as "<int>,<int>[,...]"; the pair becomes the
            // sort key and the whole line is kept as the value.
            sc.textFile("D:\\sort.txt")
                    .mapToPair(new PairFunction<String, SecondarySortKey, String>() {
                        @Override
                        public Tuple2<SecondarySortKey, String> call(String line) throws Exception {
                            final String[] fields = line.split(",");
                            final SecondarySortKey secondarySortKey = new SecondarySortKey(
                                    Integer.parseInt(fields[0]), Integer.parseInt(fields[1]));
                            return new Tuple2<>(secondarySortKey, line);
                        }
                    })
                    // false = descending order
                    .sortByKey(false)
                    .foreach(new VoidFunction<Tuple2<SecondarySortKey, String>>() {
                        @Override
                        public void call(Tuple2<SecondarySortKey, String> tuple) throws Exception {
                            System.out.println(tuple._2());
                        }
                    });
        } finally {
            // Release the context even when the job fails.
            sc.stop();
        }
    }
}

接下来是scala代码示例：

package core.secondary

import org.apache.spark.{SparkConf, SparkContext}

/**
  * Created by Administrator on 2018/1/18.
  */
/**
  * Two-field sort key: ordered by `first`, with ties broken by `second`.
  *
  * @param first  primary sort field
  * @param second secondary sort field, only consulted when `first` is equal
  */
class SecondarySortKey(var first:Int,var second:Int) extends Ordered[SecondarySortKey] with Serializable{
  /**
    * @param that the key to compare against
    * @return a negative value, zero, or a positive value when this key is
    *         smaller than, equal to, or greater than `that`
    */
  override def compare(that: SecondarySortKey): Int = {
    // Integer.compare instead of subtraction: "a - b" overflows when the
    // operands are far apart (e.g. Int.MinValue - 1) and flips the sign.
    val byFirst = Integer.compare(this.first, that.first)
    if (byFirst != 0) byFirst else Integer.compare(this.second, that.second)
  }
}

/**
  * Demo driver: reads a text file, sorts its lines in ascending order of a
  * [[SecondarySortKey]] built from the first two comma-separated integers of
  * each line, and prints the sorted lines.
  */
object Test{
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("test").setMaster("local")
    val sc = new SparkContext(conf)
    try {
      sc.textFile("D:\\sort.txt")
        .map { line =>
          val fields = line.split(",")
          // The key drives the ordering; the original line is kept as the value.
          (new SecondarySortKey(fields(0).toInt, fields(1).toInt), line)
        }
        .sortByKey()
        .foreach { case (_, line) => println(line) }
    } finally {
      // Release the context even when the job fails.
      sc.stop()
    }
  }
}

是不是很简洁呢？

好啦，现在我们就可以做这个综合练习啦。

需求及需求分析：

题：手机端美团APP（饿了么，百度外卖）

指标：
1）获取 【点击】，【下单】，【支付】次数排名前十的品类（二次排序）
数据是存储在HDFS之上的（已经有人经过了ETL的操作了）
数据是按天进行存储（一个目录就是一天）
date:
    日期 2018 01 18
user_id:
    用户id
session_id
    会话ID
page_id
    页面
action_id
    访问的时间
city_id
    访问用户所在的城市
search_keywords
    用户搜索的时候写的关键词
【click_category_id】 这个地方只会有一个品类 1
    用户点击的品类的ID
click_product_id
    用户点击的产品ID
【order_category_id】  如果是下单有可能是多个品类 ，如果是多个品类是 1^A2
    用户下单的品类id
order_product_id
    用户下单的产品id
【pay_category_id】
    用户支付的品类id  如果是支付有可能是多个品类 ，如果是多个品类是 1^A2^A3
pay_product_id
    用户支付的产品ID

 ===============================================================================================
 思路分析：
 获取 【点击】，【下单】，【支付】次数排名前十的品类
关键字：【点击】，【下单】，【支付】  次数    【品类】
重点解释：点击、下单、支付是没有任何联系的，不是我们想象中的只有点击了，才有下单的可能，
           只有下单了才有支付的可能，现实业务处理就是这样。但是我们的需求是按照【点击】，
           【下单】，【支付】次数排名，如果点击相同，按下单，下单还相同的话按支付排序。
教室里面：
   1      2            3        4
   2      2            2        2
   3      3            2        1
需要我们把数据处理成如下的格式（各列依次为：品类ID、点击次数、下单次数、支付次数）：如何转换成这样的格式？
   1      2            3        4
   2      2            2        2
   3      3            2        1



1）获取到数据里面所涉及到的所有的品类的ID  RDD[(ID,ID)]  rdd1
    如何获取到所有的品类ID？
    a）获取到所有点击的品类 id  rdda

    b) 获取到所有的下单的品类id  rddb
    c）获取到所有的支付的品类 id rddc
    (rdda union rddb union rddc).distinct   思路一定要这样！！！
    ID,ID
    

2）分别计算出来每个品类的：
      点击的次数  1  2  单词计数  rdd2
      下单的次数  1  3  单词计数  rdd3
      支付的次数  1  4  单词计数  rdd4
3）
    rdd1.leftOuterJoin(rdd2).leftOuterJoin(rdd3).leftOuterJoin(rdd4)
    1 ) 如果能join上，join到的次数是多少就是多少次；如果join不上，就是出现零次。

直接上代码，自己领悟：

排序规则代码：

package core.demo3

/**
  * Created by Administrator on 2018/1/19.
  */
/**
  * Three-level sort key for ranking categories: ordered by click count, then
  * by order count, then by payment count.
  *
  * @param clickCount number of clicks
  * @param orderCount number of orders, consulted when click counts are equal
  * @param payCount   number of payments, consulted when click and order counts are equal
  */
class SortKey(var clickCount:Long,var orderCount:Long,var payCount:Long) extends Ordered[SortKey] with  Serializable{
  /**
    * @param that the key to compare against
    * @return a negative value, zero, or a positive value when this key is
    *         smaller than, equal to, or greater than `that`
    */
  override def compare(that: SortKey): Int = {
    // Long.compare avoids the overflow of "a - b" and, unlike a final
    // "else 1", reports 0 for equal keys as the Ordered contract requires.
    val byClick = java.lang.Long.compare(this.clickCount, that.clickCount)
    if (byClick != 0) {
      byClick
    } else {
      val byOrder = java.lang.Long.compare(this.orderCount, that.orderCount)
      if (byOrder != 0) byOrder
      else java.lang.Long.compare(this.payCount, that.payCount)
    }
  }
}

业务代码：

package core.demo3

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

import scala.collection.mutable

/**
  * Created by Administrator on 2018/1/18.
  */
/**
  * Spark job that ranks categories by click count, then order count, then
  * payment count, and prints the ten highest-ranked categories.
  *
  * Input lines are comma-separated. The click, order and payment category ids
  * are read from columns 7, 9 and 11 (zero-based); a column may hold several
  * ids separated by the two-character text `^A`.
  */
object TopN {

  /** Zero-based column of the clicked category id. */
  private val ClickCategoryIndex = 7

  /** Zero-based column of the ordered category ids. */
  private val OrderCategoryIndex = 9

  /** Zero-based column of the paid category ids. */
  private val PayCategoryIndex = 11

  /**
    * Regex that separates several ids inside one column. It matches the
    * literal characters '^' and 'A'.
    * NOTE(review): if the data actually uses the Ctrl-A control character
    * (often displayed as ^A), this must become "\u0001" — confirm against the data.
    */
  private val CategorySeparator = "\\^A"

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("topN").setMaster("local")
    val sc = new SparkContext(conf)
    try {
      val rdd: RDD[String] = sc.textFile("hdfs://hadoop1:9000/20180909")

      // Step 1: every category id that appears in any of the three columns.
      val allCategoryIDS: RDD[(Long, Long)] = getAllCategoryID(rdd)

      // Step 2: per-category click, order and payment counts.
      val clickCategoryCount: RDD[(Long, Long)] = getClickCategoryCount(rdd)
      val orderCategoryCount: RDD[(Long, Long)] = getOrderCategoryCount(rdd)
      val payCategoryCount: RDD[(Long, Long)] = getPayCategoryCount(rdd)

      // Step 3: left-join the id list with the three counts so that every
      // category gets one row holding its id and its three counts.
      val resultRDD: RDD[(Long, String)] =
        joinCategoryAndData(allCategoryIDS, clickCategoryCount, orderCategoryCount, payCategoryCount)

      // Step 4: sort with the custom multi-level key and report the top ten.
      top10(resultRDD)
    } finally {
      // Release the context even when the job fails.
      sc.stop()
    }
  }

  /**
    * Splits a raw input line into its columns.
    *
    * The limit -1 keeps trailing empty columns; a plain split(",") drops them,
    * which makes the later columns disappear on lines whose tail is empty.
    */
  private def splitColumns(line: String): Array[String] = line.split(",", -1)

  /**
    * Parses the category ids stored in one column.
    *
    * @param fields columns of one input line
    * @param index  zero-based column to read
    * @return the ids found; empty when the column is missing or blank
    */
  private def categoryIdsAt(fields: Array[String], index: Int): Array[Long] =
    if (index >= fields.length) {
      Array.empty[Long]
    } else {
      fields(index)
        .split(CategorySeparator)
        .map(_.trim)
        .filter(_.nonEmpty)
        .map(_.toLong)
    }

  /**
    * Counts how often each category id occurs in one column.
    *
    * @param rdd   raw input lines
    * @param index zero-based column holding the category ids
    * @return (category id, number of occurrences)
    */
  private def countCategoryIds(rdd: RDD[String], index: Int): RDD[(Long, Long)] =
    rdd
      .flatMap(line => categoryIdsAt(splitColumns(line), index))
      .map(categoryId => (categoryId, 1L))
      .reduceByKey(_ + _)

  /**
    * Collects every distinct category id found in the click, order or payment column.
    *
    * @param rdd raw input lines
    * @return one (id, id) pair per distinct category id
    */
  def getAllCategoryID(rdd: RDD[String]):RDD[(Long,Long)]={
    val categoryColumns = Seq(ClickCategoryIndex, OrderCategoryIndex, PayCategoryIndex)
    rdd
      .flatMap { line =>
        val fields = splitColumns(line)
        categoryColumns.flatMap(index => categoryIdsAt(fields, index))
      }
      // Each id must appear once, otherwise the joins below multiply rows.
      .distinct()
      .map(categoryId => (categoryId, categoryId))
  }

  /**
    * Counts clicks per category.
    *
    * @param rdd raw input lines
    * @return (category id, click count)
    */
  def getClickCategoryCount(rdd: RDD[String]):RDD[(Long,Long)]={
    countCategoryIds(rdd, ClickCategoryIndex)
  }

  /**
    * Counts orders per category.
    *
    * @param rdd raw input lines
    * @return (category id, order count)
    */
  def getOrderCategoryCount(rdd: RDD[String]):RDD[(Long,Long)]={
    countCategoryIds(rdd, OrderCategoryIndex)
  }

  /**
    * Counts payments per category.
    *
    * @param rdd raw input lines
    * @return (category id, payment count)
    */
  def getPayCategoryCount(rdd: RDD[String]):RDD[(Long,Long)]= {
    countCategoryIds(rdd, PayCategoryIndex)
  }

  /**
    * Left-joins the category ids with the three counts.
    *
    * A category that is absent from a count RDD gets the count 0.
    *
    * @param allCategoryIDS     (id, id) pairs of the categories to report
    * @param clickCategoryCount (id, click count)
    * @param orderCategoryCount (id, order count)
    * @param payCategoryCount   (id, payment count)
    * @return (id, text) where text has four '|'-separated "name=value" parts:
    *         the category id (named by constants.FIELDS_CATEGORY_ID, which is
    *         defined outside this file), click_category_count,
    *         order_category_count and pay_category_count
    */
  def joinCategoryAndData(
         allCategoryIDS: RDD[(Long, Long)],
         clickCategoryCount: RDD[(Long, Long)],
         orderCategoryCount: RDD[(Long, Long)],
         payCategoryCount: RDD[(Long, Long)]):RDD[(Long, String)]={
    allCategoryIDS
      .leftOuterJoin(clickCategoryCount)
      .map { case (categoryId, (_, clicks)) =>
        val value =
          s"${constants.FIELDS_CATEGORY_ID}=$categoryId|click_category_count=${clicks.getOrElse(0L)}"
        (categoryId, value)
      }
      .leftOuterJoin(orderCategoryCount)
      .map { case (categoryId, (value, orders)) =>
        (categoryId, s"$value|order_category_count=${orders.getOrElse(0L)}")
      }
      .leftOuterJoin(payCategoryCount)
      .map { case (categoryId, (value, pays)) =>
        (categoryId, s"$value|pay_category_count=${pays.getOrElse(0L)}")
      }
  }

  /**
    * Sorts the joined rows in descending order of (click, order, payment)
    * counts and prints the text of the first ten rows to standard output.
    *
    * @param resultRDD rows in the format produced by [[joinCategoryAndData]]
    */
  def top10(resultRDD:RDD[(Long,String)]):Unit={
    val topRows = resultRDD
      .map { case (_, value) =>
        // value has four '|'-separated parts; parts 1..3 are "name=count".
        val parts = value.split("\\|")
        def countAt(position: Int): Long = parts(position).split("=")(1).toLong
        (new SortKey(countAt(1), countAt(2), countAt(3)), value)
      }
      .sortByKey(ascending = false)
      .take(10)

    // The ranking used to be computed and then thrown away; report it.
    topRows.foreach { case (_, value) => println(value) }
  }

}

挨踢正能量

关注

0
点赞
踩
2

收藏

觉得还不错? 一键收藏
0
评论
6.Spark综合练习--二次排序、TopN

在此练习之前，要了解二次排序的使用。java代码示例package secondary;import scala.math.Ordered;import java.io.Serializable;/** * Created by Administrator on 2018/1/18. */public class SecondarySortKey implements O
复制链接

扫一扫