Handling complex join scenarios in Spark

package com.sf.gis.scala.base.spark

import org.apache.log4j.Logger
import org.apache.spark.rdd.RDD

import scala.collection.mutable.ArrayBuffer
import scala.util.Random


/**
  * Created by 01374443 on 2020/7/27.
  * Utilities for handling some complex join scenarios.
  */
object SparkJoin {
  @transient lazy val logger: Logger = Logger.getLogger(this.getClass)

  /**
    * Left outer join where a small number of keys in the left table are skewed;
    * those hot keys are split out and handled separately via salting.
    *
    * @param left    left RDD
    * @param right   right RDD
    * @param hashNum salting factor, i.e. how many prefixes the hot keys are spread across
    * @param topLean number of skewed (hot) keys to handle separately
    */
  def leftOuterJoinOfLeftLeanElem(left: RDD[(String, Object)], right: RDD[(String, Object)], hashNum: Int, topLean: Int = 10): RDD[(String, (Object, Option[Object]))] = {
    val keyCounts = left.countByKey().toArray.sortBy(-_._2).take(topLean)
    val keys = keyCounts.map(obj => obj._1)
    val counts = keyCounts.map(obj => obj._2).sum
    logger.error("单独处理的keys:" + keyCounts.mkString(","))
    logger.error("单独处理的总数量:" + counts)
    //split the data into the separately-handled (hot) keys and all remaining keys
    val leftHashKeyData = left.filter(obj => keys.contains(obj._1))
    val leftOtherData = left.filter(obj => !keys.contains(obj._1))
    val rightHashKeyData = right.filter(obj => keys.contains(obj._1))
    val rightOtherData = right.filter(obj => !keys.contains(obj._1))
    //join the non-skewed keys with a normal left outer join first
    val otherJoin = leftOtherData.leftOuterJoin(rightOtherData)
    //salt the left-side hot-key records with a random prefix in [0, hashNum)
    val leftHashKeyDataExpand = leftHashKeyData.map(obj => {
      val hashPrefix = Random.nextInt(hashNum)
      ((hashPrefix, obj._1), obj._2)
    })
    //replicate each right-side hot-key record once per prefix so every salted left key finds its match
    val rightHashKeyDataExpand = rightHashKeyData.flatMap(obj => {
      val dataArray = new ArrayBuffer[((Int, String), Object)]()
      for (i <- 0 until hashNum) {
        dataArray.append(((i, obj._1), obj._2))
      }
      dataArray.iterator
    })
    //join the salted data, strip the salt prefix, then union with the non-skewed result
    val hashKeyJoin = leftHashKeyDataExpand.leftOuterJoin(rightHashKeyDataExpand).map(obj => (obj._1._2, obj._2))
    hashKeyJoin.union(otherJoin)
  }

  def main(args: Array[String]): Unit = {
    val spark = Spark.getSparkSession(this.getClass.getSimpleName.replace("$", ""), null, true, 5)
    val list = Array("3333", "dd", "dd", "11", "222", "ddd1", "3333", "11", "dd", "dd", "11", "3333", "3333", "3333", "333")
    val left = spark.sparkContext.parallelize(list).map(obj => (obj, obj))
    //count records per key, sorted descending, and take the two hottest keys
    val keys = left.countByKey().toArray.sortBy(-_._2)
    val topKeys = keys.take(2)
    println("key counts: " + keys.mkString(","))
    println("top keys: " + topKeys.mkString(","))
  }
}
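
A note on how this works: the hot keys are joined on a composite key of (randomPrefix, originalKey), so a skewed key's records are spread across up to hashNum reduce tasks instead of piling into one, while the right side only pays a hashNum-fold replication of the few hot keys. Below is a minimal, self-contained sketch of how the helper might be called; SparkJoinExample, the sample data, and the reuse of the Spark.getSparkSession helper are illustrative assumptions, not part of the original code.

import org.apache.spark.rdd.RDD

object SparkJoinExample {
  def main(args: Array[String]): Unit = {
    //assumed session helper, same signature as in SparkJoin.main above
    val spark = Spark.getSparkSession("SparkJoinExample", null, true, 5)
    val sc = spark.sparkContext

    //left side: heavily skewed towards key "u1"
    val left: RDD[(String, Object)] = sc
      .parallelize(Seq.fill(100000)(("u1", "order")) ++ Seq(("u2", "order"), ("u3", "order")))
      .map(obj => (obj._1, obj._2: Object))
    //right side: one record per key
    val right: RDD[(String, Object)] = sc
      .parallelize(Seq(("u1", "userA"), ("u2", "userB")))
      .map(obj => (obj._1, obj._2: Object))

    //salt only the single hottest key across 10 prefixes; the rest goes through a plain leftOuterJoin
    val joined = SparkJoin.leftOuterJoinOfLeftLeanElem(left, right, hashNum = 10, topLean = 1)
    joined.take(5).foreach(println)
  }
}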