package test
import org.apache.spark.sql.{SaveMode, SparkSession}
import org.apache.spark.sql.functions.{col, lit, max}
object ods1 {
def main(args: Array[String]): Unit = {
// Create the SparkSession
val spark = SparkSession.builder()
.enableHiveSupport()
.getOrCreate()
// Read MySQL's user_info table (no timestamp conversion applied)
val userInfoDF = spark.read.format("jdbc")
.option("url", "jdbc:mysql://10.11.1.66:3306/shtd_store")
.option("dbtable", "user_info1")
.option("user", "root")
.option("password", "123456")
.load()
// Read the user_info table from the Hive ods database
val hiveUserInfoDF = spark.sql("SELECT * FROM ods.user_info")
// Filter on the incremental fields
val maxCreateTime = hiveUserInfoDF.agg(max(col("create_time"))).first().getString(0)
val maxOperateTime = hiveUserInfoDF.agg(max(col("operate_time"))).first().getString(0)
val userInfoIncrementDF = userInfoDF.filter(
(col("create_time") > maxCreateTime) || (col("operate_time") > maxOperateTime)
)
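// Note (a sketch, not in the original code): on the very first run ods.user_info is empty,
// so maxCreateTime/maxOperateTime come back null and the filter above drops every row.
// A defensive variant substitutes an epoch default before comparing, e.g.:
//   val safeCreateTime = Option(maxCreateTime).getOrElse("1970-01-01 00:00:00")
//   val safeOperateTime = Option(maxOperateTime).getOrElse("1970-01-01 00:00:00")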
// Allow non-strict dynamic partitioning so the partitioned Hive write below succeeds
spark.sql("SET hive.exec.dynamic.partition.mode=nonstrict")
// Write the filtered data into the Hive table under a fixed partition value
userInfoIncrementDF.withColumn("partition_date", lit(20230906))
.write
.partitionBy("partition_date")
.mode(SaveMode.Append)
.saveAsTable("ods.user_info3") // write the data into the Hive table
// Stop the SparkSession
spark.stop()
}
}
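A quick sanity check after the append (a sketch, not from the original; run in a spark-shell attached to the same metastore, with table and partition names taken from the code above):

// Sketch: confirm the 20230906 partition exists and count how many rows landed in it
spark.sql("SHOW PARTITIONS ods.user_info3").show(truncate = false)
spark.sql("SELECT COUNT(*) FROM ods.user_info3 WHERE partition_date = '20230906'").show()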
1. Read MySQL and load it into the Hive ODS layer
2. Read the Hive ODS layer and load it into the DWD layer
package test
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{col, current_timestamp, date_format, lit, when}
import org.apache.spark.sql.SaveMode
object dwd1 {
def main(args: Array[String]): Unit = {
val warehouse="hdfs://10.11.1.66:9820/usr/hive/warehouse"
// Create the SparkSession
val spark = SparkSession.builder()
.config("spark.sql.warehouse.dir", warehouse)
.config("spark.sql.shuffle.partitions", 1000)
.config("hive.metastore.uris", "thrift://10.11.1.66:9083")
.enableHiveSupport()
.getOrCreate()
// Read the ODS layer
val ods = spark.sql("select * from ods.new_user_info where partition_date = '20230906'")
.withColumn(colName = "dwd_insert_user", lit("user1"))
.withColumn(colName = "dwd_insert_time", lit(lit(date_format(current_timestamp(), format = "yyyy-MM-dd HH:mm:ss"))))
.withColumn(colName = "dwd_modify_user", lit( "user1"))
.withColumn(colName = "dwd_modify_time", lit(lit(date_format(current_timestamp(), format = "yyyy-MM-dd HH:mm:ss"))))
.drop("partition_date")
// Take the latest partition's data; check which partition is latest via the Hive CLI
val dwd = spark.sql("select * from dwd.dim_user_info where partition_date = '20230906'").drop("partition_date")
// Union the two datasets; where operate_time is null, fill it with create_time
ods.unionByName(dwd).withColumn("operate_time", when(col("operate_time").isNull, col("create_time"))
.otherwise(col("operate_time")))
.createTempView("v")
spark.sql("select * from v").show(truncate = false)
// order by ... desc sorts descending (asc ascending), so the latest operate_time gets row_number 1
val pp=spark.sql(
"""
select *,
row_number() over (partition by v.id order by operate_time desc) as operate_time_num,
min(dwd_insert_time) over (partition by v.id) as min_dwd_insert_time
from v
""".stripMargin)
.withColumn("dwd_insert_time", when(col("operate_time_num") === 1, col("min_dwd_insert_time"))
.otherwise(col("dwd_insert_time")))
.filter(col( "operate_time_num") === 1) //过滤第一条数据,是通过降序排序,,符合条件的留下
.drop( "operate_time_num","min_dwd_insert_time") //删除多余字段
.withColumn( "birthday", col("birthday").cast(to = "timestamp")) //数据类型转换
// Save the merged result back into dwd.dim_user_info under its partition_date partition
pp.withColumn("partition_date", lit(20230405)).write
.partitionBy("partition_date")
.mode(SaveMode.Append)
.saveAsTable("dwd.dim_user_info")
spark.stop()
}
}
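To verify the merge kept exactly one row per id, a follow-up query (a sketch, not from the original; it should return no rows if the row_number() filter above worked):

// Sketch: any id appearing more than once in the new partition signals a dedup bug
spark.sql("SELECT id, COUNT(*) AS cnt FROM dwd.dim_user_info WHERE partition_date = '20230405' GROUP BY id HAVING COUNT(*) > 1").show()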
3. Write the processed data into HBase
package test
import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.hadoop.hbase.client.{ConnectionFactory, Put}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.sql.functions.count
import org.apache.spark.sql.{Row, SparkSession, functions}
object hbase {
def main(args: Array[String]): Unit = {
// Create the SparkSession
val spark = SparkSession.builder()
.appName("IncrementalDataETL")
.enableHiveSupport()
.getOrCreate()
// assuming create_time is formatted 'yyyy-MM-dd HH:mm:ss' (as elsewhere in this pipeline), the month starts at offset 6, not 5
val order = spark.sql("""SELECT *, SUBSTR(create_time, 1, 4) AS year, SUBSTR(create_time, 6, 2) AS month FROM dwd.fact_order_info""")
val region = spark.table("dwd.dim_region")
val province = spark.table("dwd.dim_province")
val data = order.join(province, province("id") === order("province_id"))
val newdata = data.join(region, region("id") === data("region_id"), "left_outer")
.groupBy("province_id", "name", "region_id", "region_name", "year", "month")
.agg(functions.sum("final_total_amount").cast("decimal(38, 18)").alias("totalconsumption"), count("province_id").alias("totalorder"))
// Define the HBase table name and column family
val hbaseTableName = "pro" // HBase table name
val columnFamily = "info" // HBase column family
// Write the data into HBase
newdata.foreachPartition((iter: Iterator[Row]) => { // the explicit Iterator[Row] type resolves the Scala 2.12 foreachPartition overload ambiguity
// Create one HBase connection per partition
val hbaseConf = HBaseConfiguration.create()
hbaseConf.set("hbase.zookeeper.quorum", "10.11.1.66")
hbaseConf.set("hbase.zookeeper.property.clientPort", "2181")
val connection = ConnectionFactory.createConnection(hbaseConf)
val table = connection.getTable(TableName.valueOf(hbaseTableName))
iter.foreach(row => {
val put = new Put(Bytes.toBytes(String.valueOf(row.getAs[Any]("province_id")))) // use the province id as the row key
put.addColumn(Bytes.toBytes(columnFamily), Bytes.toBytes("province_id"), Bytes.toBytes(String.valueOf(row.getAs[Any]("province_id")))) // String.valueOf avoids getAs[String] ClassCastExceptions on numeric ids
put.addColumn(Bytes.toBytes(columnFamily), Bytes.toBytes("name"), Bytes.toBytes(row.getAs[String]("name")))
put.addColumn(Bytes.toBytes(columnFamily), Bytes.toBytes("region_id"), Bytes.toBytes(String.valueOf(row.getAs[Any]("region_id"))))
put.addColumn(Bytes.toBytes(columnFamily), Bytes.toBytes("year"), Bytes.toBytes(row.getAs[String]("year")))
put.addColumn(Bytes.toBytes(columnFamily), Bytes.toBytes("month"), Bytes.toBytes(row.getAs[String]("month")))
put.addColumn(Bytes.toBytes(columnFamily), Bytes.toBytes("totalconsumption"), row.getAs[java.math.BigDecimal]("totalconsumption").unscaledValue().toByteArray)
put.addColumn(Bytes.toBytes(columnFamily), Bytes.toBytes("totalorder"), Bytes.toBytes(row.getAs[Long]("totalorder").toString))
// Add writes for any other columns in the same way
table.put(put)
})
// Close the per-partition connection
table.close()
connection.close()
})
// Stop the SparkSession
spark.stop()
}
}
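To spot-check what landed in HBase, a small read-back (a sketch, not from the original; the row key "1" is just an example province_id, and the quorum settings mirror the code above):

// Sketch: fetch one row from HBase and print a couple of its columns
import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.hadoop.hbase.client.{ConnectionFactory, Get}
import org.apache.hadoop.hbase.util.Bytes
val conf = HBaseConfiguration.create()
conf.set("hbase.zookeeper.quorum", "10.11.1.66")
conf.set("hbase.zookeeper.property.clientPort", "2181")
val conn = ConnectionFactory.createConnection(conf)
val table = conn.getTable(TableName.valueOf("pro"))
val result = table.get(new Get(Bytes.toBytes("1"))) // "1" is an example row key
println(Bytes.toString(result.getValue(Bytes.toBytes("info"), Bytes.toBytes("name"))))
println(Bytes.toString(result.getValue(Bytes.toBytes("info"), Bytes.toBytes("totalconsumption"))))
table.close()
conn.close()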