spark业务开发-join合并(join)
输入数据集1
order_number,order_date,purchaser,quantity,product_id,remark
10001,2016-01-16,1001,1,102,机q器w记e录r
10003,2016-01-17,1002,2,105,人工记录
10002,2016-01-19,1002,3,106,人工补录
10004,2016-02-21,1003,4,107,自然交易
10001,2016-01-16,1001,1,102,机器记录
输入数据集2
product_id,product_name
101,手机
102,平板
103,电脑
104,家具
105,书籍
106,电子数码
107,母婴
108,美妆
输出数据
+------------+-------------------+---------+--------+----------+------------+------------+
|order_number| order_date|purchaser|quantity|product_id| remark|product_name|
+------------+-------------------+---------+--------+----------+------------+------------+
| 10001|2016-01-16 00:00:00| 1001| 1| 102|机q器w记e录r| 平板|
| 10003|2016-01-17 00:00:00| 1002| 2| 105| 人工记录| 书籍|
| 10002|2016-01-19 00:00:00| 1002| 3| 106| 人工补录| 电子数码|
| 10004|2016-02-21 00:00:00| 1003| 4| 107| 自然交易| 母婴|
| 10001|2016-01-16 00:00:00| 1001| 1| 102| 机器记录| 平板|
+------------+-------------------+---------+--------+----------+------------+------------+
程序代码(DSL方式)
package com.cch.bigdata.spark.process.join
import com.cch.bigdata.spark.process.AbstractTransform
import org.apache.spark.sql.{Column, DataFrame}
import scala.collection.mutable.ListBuffer
/**
 * Joins the orders dataset with the products dataset using the DataFrame DSL.
 *
 * Configuration is hard-coded below: join type, table names, the columns to
 * project from each side, and the pairwise join-key columns. `process()`
 * validates the configuration, loads both CSVs, builds the join condition and
 * prints the projected result.
 */
class DSLJoiner extends AbstractTransform {

  // Join configuration (hard-coded for this example).
  private val join_type = "left"
  private val left_table = "orders"
  private val left_table_query_columns =
    Array[String]("order_number", "order_date", "purchaser", "quantity", "product_id", "remark")
  private val right_table = "products"
  private val right_table_query_columns = Array[String]("product_name")
  private val left_table_join_columns = Array[String]("product_id")
  private val right_table_join_columns = Array[String]("product_id")

  /**
   * Validates the configuration, performs the join and shows the result.
   *
   * @throws RuntimeException if any required configuration value is missing
   *                          or the join-column arrays differ in length
   */
  override def process(): Unit = {
    // Fail fast on an incomplete configuration.
    if (join_type.isEmpty) {
      throw new RuntimeException("join类型不能为空!")
    }
    if (left_table.isEmpty) {
      throw new RuntimeException("左表不能为空!")
    }
    if (right_table.isEmpty) {
      throw new RuntimeException("右表不能为空!")
    }
    if (left_table_join_columns.isEmpty) {
      throw new RuntimeException("左表关联字段不能为空!")
    }
    if (right_table_join_columns.isEmpty) {
      throw new RuntimeException("右表关联字段不能为空!")
    }
    if (left_table_join_columns.length != right_table_join_columns.length) {
      throw new RuntimeException("关联字段不匹配!")
    }

    // `loadCsv` and `spark` come from AbstractTransform (defined elsewhere).
    val leftDF: DataFrame = loadCsv("src/main/resources/csv/orders.csv", spark)
    val rightDF: DataFrame = loadCsv("src/main/resources/csv/product.csv", spark)

    // BUG FIX: the original loop discarded the result of `Column.and` —
    // Column is immutable, so `joinColumn.and(...)` without reassignment had
    // no effect and only the FIRST join-column pair was ever applied.
    // Zip the pairs and fold them into one conjunctive condition instead.
    val joinColumn: Column = left_table_join_columns
      .zip(right_table_join_columns)
      .map { case (l, r) => leftDF.col(l) === rightDF.col(r) }
      .reduce(_ && _)

    val df: DataFrame = leftDF.join(rightDF, joinColumn, join_type)

    // Project the configured output columns, left-table columns first.
    val columnList: ListBuffer[Column] = new ListBuffer()
    left_table_query_columns.foreach(c => columnList.append(leftDF.col(c)))
    right_table_query_columns.foreach(c => columnList.append(rightDF.col(c)))
    df.select(columnList: _*).show()
  }

  override def getAppName(): String = "dsl join"
}
/** Entry point: runs the DataFrame-DSL join example. */
object DSLJoiner {
  def main(args: Array[String]): Unit = {
    val joiner = new DSLJoiner()
    joiner.process()
  }
}
程序代码(sql方式)
package com.cch.bigdata.spark.process.join
import com.cch.bigdata.spark.process.AbstractTransform
import org.apache.spark.sql.{DataFrame, SparkSession}
/**
 * Joins the orders dataset with the products dataset by generating and
 * executing a Spark SQL statement. Both DataFrames are registered as temp
 * views, a SELECT ... JOIN ... ON ... string is assembled from the
 * configuration below, and the result is printed with show().
 */
class Joiner extends AbstractTransform {

  /** Pairs a temp-view name with its comma-separated projection list ("*" means all columns). */
  case class JoinTable(name: String, queryColumn: String)

  // Join configuration (hard-coded for this example).
  private val join_type = "left join"
  private val left_table = "orders"
  private val left_table_query_columns =
    Array("order_number", "order_date", "purchaser", "quantity", "product_id", "remark")
  private val right_table = "products"
  private val right_table_query_columns = Array("product_name")
  private val left_table_join_columns = Array("product_id")
  private val right_table_join_columns = Array("product_id")

  val spark: SparkSession =
    SparkSession.builder().appName("join").master("local[1]").getOrCreate()

  /**
   * Validates the configuration, builds the join SQL and shows the result.
   *
   * @throws RuntimeException if any required configuration value is missing
   *                          or the join-column arrays differ in length
   */
  override def process(): Unit = {
    // Fail fast on an incomplete configuration.
    if (join_type.isEmpty) throw new RuntimeException("join类型不能为空!")
    if (left_table.isEmpty) throw new RuntimeException("左表不能为空!")
    if (right_table.isEmpty) throw new RuntimeException("右表不能为空!")
    if (left_table_join_columns.isEmpty) throw new RuntimeException("左表关联字段不能为空!")
    if (right_table_join_columns.isEmpty) throw new RuntimeException("右表关联字段不能为空!")
    if (left_table_join_columns.length != right_table_join_columns.length)
      throw new RuntimeException("关联字段不匹配!")

    // `loadCsv` comes from AbstractTransform (defined elsewhere).
    val leftDF: DataFrame = loadCsv("src/main/resources/csv/orders.csv", spark)
    val rightDF: DataFrame = loadCsv("src/main/resources/csv/product.csv", spark)

    // Register both sides as temp views; an empty column list means "project everything".
    val leftTable = JoinTable(
      left_table,
      if (left_table_query_columns.isEmpty) "*" else left_table_query_columns.mkString(","))
    leftDF.createOrReplaceTempView(leftTable.name)
    val rightTable = JoinTable(
      right_table,
      if (right_table_query_columns.isEmpty) "*" else right_table_query_columns.mkString(","))
    rightDF.createOrReplaceTempView(rightTable.name)

    // Qualify every projected column with its view name; "*" stays a single token.
    def qualified(t: JoinTable): String =
      if (t.queryColumn == "*") s"${t.name}.*"
      else t.queryColumn.split(",").map(c => s"${t.name}.$c").mkString(",")

    val selectList = s"${qualified(leftTable)},${qualified(rightTable)}"

    // Pairwise equality conditions, combined with " and ".
    val onClause = left_table_join_columns
      .zip(right_table_join_columns)
      .map { case (l, r) => s"${leftTable.name}.$l = ${rightTable.name}.$r" }
      .mkString(" and ")

    val sql =
      s"select $selectList from ${leftTable.name} $join_type ${rightTable.name} on $onClause"
    spark.sql(sql).show()
  }
}
package com.cch.bigdata.spark.process
import com.cch.bigdata.spark.process.join.Joiner
/** Entry point: runs the SQL-style join example. */
object JoinTest {
  def main(args: Array[String]): Unit =
    new Joiner().process()
}
参数解释
- left_table:左表数据集的一个命名
- left_table_query_columns:左表输出列,字符串数组
- right_table:右表数据集的一个命名
- right_table_query_columns:右表输出列,字符串数组
- join_type:join类型
- left join:左连接:返回选择的左右表字段中,左表的全部数据和右表中满足关联条件的数据。
- right join:右连接:返回选择的左右表字段中,右表的全部数据和左表中满足关联条件的数据。
- inner join:内连接:返回选择的左右表字段中,左表中满足关联条件的数据和右表中满足关联条件的数据。
- full join:全连接:返回选择的左右表字段的所有数据
- left_table_join_columns:左表需要与右表关联的字段,字符串数组
- right_table_join_columns:右表需要与左表做关联的字段,字符串数组