/**
 * Language: Scala
 * Scenario: joining two Hive tables with Spark SQL
 * Approach: rename the columns that the two tables have in common, then join them
 */
package test
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}
import scala.collection.mutable
import scala.collection.mutable.Set
/**
 * Table-join code for the actual business scenario
 */
object TableJoin {
  def main(args: Array[String]): Unit = {
    // Read the command-line arguments
    // Flag argument
    val read_num = args(0)
    // Input table names, comma-separated
    val inputTable = args(1)
    // View names used in the SQL, comma-separated
    val tableName = args(2)
    // The Spark SQL statement to execute
    val sql = args(3)
    // Output table name
    val outputTableName = args(4)
    // Flag indicating whether to save to Hive
    val saveToHive = args(5)
    val tableArr: Array[String] = inputTable.split(",")
    val nameArr: Array[String] = tableName.split(",")
    val spark: SparkSession = getSpark
    if (tableArr.length != nameArr.length) {
      throw new Exception("The number of input tables does not match the number of view names")
    } else if (tableArr.length < 2 || nameArr.length < 2) {
      throw new Exception("At least two input tables and two view names are required")
    } else {
      // Load the two input tables
      val dataFrame1: DataFrame = spark.sql("select * from " + tableArr(0))
      val dataFrame2: DataFrame = spark.sql("select * from " + tableArr(1))
      // Read the schema metadata (column name / type pairs) of each table
      val dataFrameTypes1: Array[(String, String)] = dataFrame1.dtypes
      val dataFrameTypes2: Array[(String, String)] = dataFrame2.dtypes
      // Collect the column names of each table
      val frameOneColumn = mutable.Set[String]()
      val frameTwoColumn = mutable.Set[String]()
      for (elem <- dataFrameTypes1) {
        frameOneColumn.add(elem._1)
      }
      for (elem <- dataFrameTypes2) {
        frameTwoColumn.add(elem._1)
      }
      // Intermediate DataFrame for the second table
      var frameMiddle = dataFrame2
      // Columns that appear in both tables
      val repeatColumn: mutable.Set[String] = frameOneColumn.intersect(frameTwoColumn)
      // Rename the duplicated columns in the second table by appending "_1"
      for (elem <- repeatColumn) {
        frameMiddle = frameMiddle.withColumnRenamed(elem, elem + "_1")
      }
      // Register temporary views for the SQL statement
      dataFrame1.createOrReplaceTempView(nameArr(0))
      frameMiddle.createOrReplaceTempView(nameArr(1))
    }
    // Execute the SQL statement
    val writeFrame: DataFrame = spark.sql(sql)
    // Save the result to Hive
    writeFrame.createOrReplaceTempView("temp_table1")
    spark.sql("drop table if exists " + outputTableName)
    spark.sql("create table " + outputTableName + " as select * from temp_table1")
    spark.sql("alter table " + outputTableName + " set serdeproperties('serialization.null.format'='')")
    spark.close()
  }
  // Build the SparkSession
  def getSpark: SparkSession = {
    val sparkSession: SparkSession = SparkSession.builder()
      .master("local[*]")
      .appName("test")
      .config("spark.sql.crossJoin.enabled", "true")
      // Hive support is required for the create/alter table statements against Hive
      .enableHiveSupport()
      .getOrCreate()
    sparkSession
  }
}
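
For reference, a minimal sketch of how the job might be invoked is shown below. The database, table, and column names, the SQL, and the flag values are all hypothetical and only illustrate the expected argument order; note that a column present in both input tables (here customer_id) has to be referenced through the second view with the _1 suffix.

// Hypothetical local invocation; in practice the same six arguments would be passed via spark-submit.
object TableJoinExample {
  def main(args: Array[String]): Unit = {
    TableJoin.main(Array(
      "0",                          // flag argument (read_num)
      "db.orders,db.customers",     // input tables (hypothetical names)
      "a,b",                        // view names used in the SQL
      "select a.order_id, b.name from a join b on a.customer_id = b.customer_id_1",
      "db.orders_with_customers",   // output Hive table
      "1"                           // save-to-Hive flag
    ))
  }
}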
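
To try the column-renaming idea on its own, without a Hive metastore, a self-contained sketch using in-memory DataFrames (all names made up) could look like this:

import org.apache.spark.sql.{DataFrame, SparkSession}

object RenameJoinSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("rename-join-sketch").getOrCreate()
    import spark.implicits._

    // Two toy tables that both contain an "id" column
    val left = Seq((1, "alice"), (2, "bob")).toDF("id", "name")
    val right = Seq((1, "2024-01-01"), (2, "2024-02-01")).toDF("id", "created_at")

    // Columns present in both DataFrames
    val common = left.columns.toSet.intersect(right.columns.toSet)
    // Rename the duplicates in the second DataFrame by appending "_1"
    val rightRenamed: DataFrame = common.foldLeft(right)((df, c) => df.withColumnRenamed(c, c + "_1"))

    left.createOrReplaceTempView("a")
    rightRenamed.createOrReplaceTempView("b")
    // The shared column is now unambiguous: a.id vs b.id_1
    spark.sql("select a.id, a.name, b.created_at from a join b on a.id = b.id_1").show()

    spark.stop()
  }
}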