spark业务开发-join合并(join)

spark业务开发-join合并(join)

输入数据集1
order_number,order_date,purchaser,quantity,product_id,remark
10001,2016-01-16,1001,1,102,机q器w记e录r
10003,2016-01-17,1002,2,105,人工记录
10002,2016-01-19,1002,3,106,人工补录
10004,2016-02-21,1003,4,107,自然交易
10001,2016-01-16,1001,1,102,机器记录
输入数据集2
product_id,product_name
101,手机
102,平板
103,电脑
104,家具
105,书籍
106,电子数码
107,母婴
108,美妆
输出数据
+------------+-------------------+---------+--------+----------+------------+------------+
|order_number|         order_date|purchaser|quantity|product_id|      remark|product_name|
+------------+-------------------+---------+--------+----------+------------+------------+
|       10001|2016-01-16 00:00:00|     1001|       1|       102|机q器w记e录r|        平板|
|       10003|2016-01-17 00:00:00|     1002|       2|       105|    人工记录|        书籍|
|       10002|2016-01-19 00:00:00|     1002|       3|       106|    人工补录|    电子数码|
|       10004|2016-02-21 00:00:00|     1003|       4|       107|    自然交易|        母婴|
|       10001|2016-01-16 00:00:00|     1001|       1|       102|    机器记录|        平板|
+------------+-------------------+---------+--------+----------+------------+------------+
程序代码(DSL方式)
package com.cch.bigdata.spark.process.join

import com.cch.bigdata.spark.process.AbstractTransform
import org.apache.spark.sql.{Column, DataFrame}

import scala.collection.mutable.ListBuffer

//使用dsl编程方式执行join
//'inner', 'outer', 'full', 'fullouter', 'full_outer', 'leftouter', 'left', 'left_outer', 'rightouter', 'right', 'right_outer', 'leftsemi', 'left_semi', 'semi', 'leftanti', 'left_anti', 'anti', 'cross'.
class DSLJoiner extends AbstractTransform{

  // Join type passed to Dataset.join; any of Spark's supported join strings
  // ('inner', 'left', 'right', 'full', 'left_semi', 'left_anti', 'cross', ...).
  private val join_type = "left"

  // Logical name of the left table (for readability only; not used by the DSL join itself).
  private val left_table = "orders"

  // Columns projected from the left table in the final output.
  private val left_table_query_columns = Array[String]("order_number","order_date","purchaser","quantity","product_id","remark")

  // Logical name of the right table.
  private val right_table = "products"

  // Columns projected from the right table in the final output.
  private val right_table_query_columns = Array[String]("product_name")

  // Join key columns on the left table; paired positionally with right_table_join_columns.
  private val left_table_join_columns = Array[String]("product_id")

  // Join key columns on the right table.
  private val right_table_join_columns = Array[String]("product_id")

  /**
   * Loads both CSV datasets, joins them on the configured key columns using the
   * DataFrame DSL, and shows the selected columns.
   *
   * @throws RuntimeException if any required configuration value is missing or
   *                          the join-column arrays differ in length.
   */
  override def process(): Unit = {

    if(join_type.isEmpty){
      throw new RuntimeException("join类型不能为空!")
    }

    if(left_table.isEmpty){
      throw new RuntimeException("左表不能为空!")
    }

    if(right_table.isEmpty){
      throw new RuntimeException("右表不能为空!")
    }

    if(left_table_join_columns.isEmpty){
      throw new RuntimeException("左表关联字段不能为空!")
    }

    if(right_table_join_columns.isEmpty){
      throw new RuntimeException("右表关联字段不能为空!")
    }

    if(left_table_join_columns.length != right_table_join_columns.length){
      throw new RuntimeException("关联字段不匹配!")
    }

    // Left-side dataset.
    val leftDF: DataFrame = loadCsv("src/main/resources/csv/orders.csv",spark)

    // Right-side dataset.
    val rightDF: DataFrame = loadCsv("src/main/resources/csv/product.csv",spark)

    // Build the join condition by pairing the key columns positionally and
    // AND-ing the equality predicates together.
    // BUG FIX: the original looped with `joinColumn.and(...)` but discarded the
    // result (Column is immutable), so only the first condition ever applied
    // when joining on multiple columns.
    val joinColumn: Column = left_table_join_columns
      .zip(right_table_join_columns)
      .map { case (leftCol, rightCol) => leftDF.col(leftCol) === rightDF.col(rightCol) }
      .reduce(_ && _)

    // Perform the join.
    val df: DataFrame = leftDF.join(rightDF, joinColumn, join_type)

    // Select output columns via the source DataFrames' Column references so that
    // duplicated column names (e.g. product_id on both sides) stay unambiguous.
    val selectedColumns: Seq[Column] =
      left_table_query_columns.toSeq.map(leftDF.col) ++
      right_table_query_columns.toSeq.map(rightDF.col)

    df.select(selectedColumns: _*).show()
  }

  override def getAppName(): String = "dsl join"
}

/** Entry point for running the DSL-based join example. */
object DSLJoiner {
  def main(args: Array[String]): Unit =
    new DSLJoiner().process()
}

程序代码(sql方式)
package com.cch.bigdata.spark.process.join

import com.cch.bigdata.spark.process.AbstractTransform
import org.apache.spark.sql.{DataFrame, SparkSession}



class Joiner extends AbstractTransform{

  /** A registered table: its temp-view name and its comma-separated output columns ("*" for all). */
  case class JoinTable(name:String,queryColumn:String)

  // SQL join keyword inserted verbatim between the two table names.
  private val join_type = "left join"

  // Temp-view name for the left dataset.
  private val left_table = "orders"

  // Output columns taken from the left table.
  private val left_table_query_columns = Array("order_number","order_date","purchaser","quantity","product_id","remark")

  // Temp-view name for the right dataset.
  private val right_table = "products"

  // Output columns taken from the right table.
  private val right_table_query_columns = Array("product_name")

  // Join key columns of the left table; paired positionally with the right ones.
  private val left_table_join_columns = Array("product_id")

  // Join key columns of the right table.
  private val right_table_join_columns = Array("product_id")

  val spark: SparkSession = SparkSession.builder().appName("join").master("local[1]").getOrCreate()

  /**
   * Loads both CSV datasets, registers them as temp views, builds a SQL join
   * statement from the configured columns, and shows the result.
   *
   * @throws RuntimeException if any required configuration value is missing or
   *                          the join-column arrays differ in length.
   */
  override def process(): Unit = {

    if(join_type.isEmpty){
      throw new RuntimeException("join类型不能为空!")
    }

    if(left_table.isEmpty){
      throw new RuntimeException("左表不能为空!")
    }

    if(right_table.isEmpty){
      throw new RuntimeException("右表不能为空!")
    }

    if(left_table_join_columns.isEmpty){
      throw new RuntimeException("左表关联字段不能为空!")
    }

    if(right_table_join_columns.isEmpty){
      throw new RuntimeException("右表关联字段不能为空!")
    }

    if(left_table_join_columns.length != right_table_join_columns.length){
      throw new RuntimeException("关联字段不匹配!")
    }

    // Left-side dataset.
    val leftDF: DataFrame = loadCsv("src/main/resources/csv/orders.csv",spark)

    // Right-side dataset.
    val rightDF: DataFrame = loadCsv("src/main/resources/csv/product.csv",spark)

    // Describe each side; an empty column list means "select everything".
    val leftTable = JoinTable(left_table,
      if (left_table_query_columns.isEmpty) "*" else left_table_query_columns.mkString(","))
    leftDF.createOrReplaceTempView(leftTable.name)

    val rightTable = JoinTable(right_table,
      if (right_table_query_columns.isEmpty) "*" else right_table_query_columns.mkString(","))
    rightDF.createOrReplaceTempView(rightTable.name)

    // Select list entries are table-qualified ("table.col") so that columns with
    // the same name on both sides remain unambiguous after the join.
    val leftSelect =
      if (leftTable.queryColumn == "*") s"${leftTable.name}.*"
      else leftTable.queryColumn.split(",").map(col => s"${leftTable.name}.$col").mkString(",")

    // Right-side entries always carry a leading comma because they follow the left list.
    val rightSelect =
      if (rightTable.queryColumn == "*") s",${rightTable.name}.*"
      else rightTable.queryColumn.split(",").map(col => s",${rightTable.name}.$col").mkString

    // ON clause: pair the key columns positionally and AND the equalities together.
    val onClause = left_table_join_columns
      .zip(right_table_join_columns)
      .map { case (leftCol, rightCol) => s"${leftTable.name}.$leftCol = ${rightTable.name}.$rightCol" }
      .mkString(" and ")

    val sql =
      s"select $leftSelect$rightSelect from ${leftTable.name} $join_type ${rightTable.name} on $onClause"

    spark.sql(sql).show()
  }
}

package com.cch.bigdata.spark.process

import com.cch.bigdata.spark.process.join.Joiner

/** Entry point for running the SQL-based join example. */
object JoinTest {
  def main(args: Array[String]): Unit =
    new Joiner().process()
}

参数解释
  • left_table:左表数据集的一个命名
  • left_table_query_columns:左表输出列,使用逗号分隔的字符串
  • right_table:右表数据集的一个命名
  • right_table_query_columns:右表输出列,使用逗号分隔的字符串
  • join_type:join类型
    • left join:左连接:返回选择的左右表字段中,左表的全部数据和右表中满足关联条件的数据。
    • right join:右连接:返回选择的左右表字段中,右表的全部数据和左表中满足关联条件的数据。
    • inner join:内连接:返回选择的左右表字段中,左表中满足关联条件的数据和右表中满足关联条件的数据。
    • full join:全连接:返回选择的左右表字段的所有数据
  • left_table_join_columns:左表需要与右表关联的字段,字符串数组
  • right_table_join_columns:右表需要与左表做关联的字段,字符串数组
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值