package com.jojo.spark.utils
import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.spark.SparkConf
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.client.{Connection, ConnectionFactory, Put, Table}
import org.apache.hadoop.hbase.mapreduce.TableOutputFormat
import org.apache.hadoop.mapreduce.Job
import scala.collection.JavaConverters._
/**
 * @author wenXin
 * @date 2024/04/01
 * @describe Write Hive data into Lindorm
 */
object XGBUserStratificationToLindorm {
def main(args: Array[String]): Unit = {
/** Create the SparkSession execution environment */
val sparkConf = new SparkConf().setAppName("SparkHiveToHbase")
val sparkSession = SparkSession.builder().enableHiveSupport().config(sparkConf).getOrCreate()
/** Build the DataFrame from the Hive query */
val dataFrame: DataFrame = sparkSession.sql(
s"""
|select cast(user_id as bigint) uid
| ,subject_type_name
| ,cast(lead_times as int) lead_times
| ,cast(user_subject_lead_cnt as int) user_subject_lead_cnt
| ,cast(current_year_cnt as int) as current_year_cnt
|from
|(select user_id
| ,subject_type_name
| ,lead_times
| ,user_subject_lead_cnt
| ,current_year_cnt
| ,row_number() over(partition by user_id, subject_type_name order by lead_times) rank
|from dw.table_name
|) t
|where t.rank=1
|""".stripMargin)
/** Write to Lindorm */
dataFrameWriteToLindormUat(dataFrame)
/** Stop the session */
sparkSession.stop()
}
/** Write the DataFrame into HBase/Lindorm */
def dataFrameWriteToLindormUat(dataFrame: DataFrame): Unit = {
// Alternative: use dataFrame.rdd.mapPartitions { partition => ... } and call .count() afterwards to trigger execution
dataFrame.rdd.foreachPartition { partition =>
val hbaseConf = HBaseConfiguration.create()
hbaseConf.set("hbase.zookeeper.quorum", "ld-xxxxxxxxxxxxxxxxx.lindorm.rds.aliyuncs.com")
hbaseConf.set("hbase.zookeeper.property.clientPort", "30020")
hbaseConf.set(TableOutputFormat.OUTPUT_TABLE, "namespace_name:table_name")
hbaseConf.set("hbase.client.username", "username")
hbaseConf.set("hbase.client.password", "password")
/** Note: the Job/TableOutputFormat setup is only required for saveAsNewAPIHadoopDataset;
 *  the code below writes through Table.put directly, so it is kept only for reference */
val job = Job.getInstance(hbaseConf)
job.setOutputFormatClass(classOf[TableOutputFormat[Put]])
/** Get the HBase Table object through the per-executor singleton connection */
val table = LindormConnectionManager.getConnection(hbaseConf).getTable(TableName.valueOf("mall", "mall_user_course_focus"))
/** Accumulate Puts in a list and flush them in batches */
var putList = List[Put]()
val batchSize: Int = 10000
partition.foreach { row =>
/** Use uid + subject_type_name as the rowkey */
val uid = row.getAs[Long]("uid").toString
val subject_type_name = row.getAs[String]("subject_type_name")
val rowkey = Bytes.toBytes(uid + subject_type_name)
val put = new Put(rowkey)
put.addColumn(Bytes.toBytes("focus_info"), Bytes.toBytes("lead_times"), Bytes.toBytes(row.getAs[Int]("lead_times")))
put.addColumn(Bytes.toBytes("focus_info"), Bytes.toBytes("user_subject_lead_cnt"), Bytes.toBytes(row.getAs[Int]("user_subject_lead_cnt")))
put.addColumn(Bytes.toBytes("focus_info"), Bytes.toBytes("current_year_cnt"), Bytes.toBytes(row.getAs[Int]("current_year_cnt")))
put.addColumn(Bytes.toBytes("focus_info"), Bytes.toBytes("subject_type_name"), Bytes.toBytes(row.getAs[String]("subject_type_name")))
/** Add the Put to the batch list */
putList = put :: putList
/** Batching: once the list holds batchSize Puts, flush them to the HBase table */
if (putList.size >= batchSize) {
table.put(putList.asJava)
/** Clear the list for the next batch */
putList = Nil
}
}
/** Flush any remaining Puts */
if (putList.nonEmpty) {
table.put(putList.asJava)
}
/** foreachPartition is itself an action, so nothing has to be returned or counted here;
 *  returning an iterator and calling .count() is only needed for the mapPartitions variant noted above */
}
/** Close the HBase connection. Note that this call runs on the driver, where no connection was opened;
 *  the per-executor connections remain cached until the executor JVM exits */
LindormConnectionManager.closeConnection()
}
}
/** Per-executor (per-JVM) singleton HBase connection; keeps the connection out of task closures and avoids serialization issues */
object LindormConnectionManager extends Serializable {
@transient private var connection: Connection = _
/** Lazily create and cache one connection per JVM; synchronized so concurrent tasks in an executor do not race */
def getConnection(hbaseConf: org.apache.hadoop.conf.Configuration): Connection = synchronized {
if (connection == null || connection.isClosed) {
connection = ConnectionFactory.createConnection(hbaseConf)
}
connection
}
def closeConnection(): Unit = {
if (connection != null && !connection.isClosed) {
connection.close()
}
}
}
```
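The writer above assumes that the target namespace and table already exist with a `focus_info` column family. Below is a minimal sketch of creating them through the HBase 2.x Admin API, assuming Lindorm's HBase-compatible endpoint accepts that client; the `CreateLindormTable` helper object is hypothetical, and the quorum address, port, credentials, and the `mall:mall_user_course_focus` name simply mirror the placeholders used in the code above.

```
import org.apache.hadoop.hbase.{HBaseConfiguration, NamespaceDescriptor, TableName}
import org.apache.hadoop.hbase.client.{ColumnFamilyDescriptorBuilder, ConnectionFactory, TableDescriptorBuilder}

object CreateLindormTable {
  def main(args: Array[String]): Unit = {
    val conf = HBaseConfiguration.create()
    conf.set("hbase.zookeeper.quorum", "ld-xxxxxxxxxxxxxxxxx.lindorm.rds.aliyuncs.com")
    conf.set("hbase.zookeeper.property.clientPort", "30020")
    conf.set("hbase.client.username", "username")
    conf.set("hbase.client.password", "password")
    val connection = ConnectionFactory.createConnection(conf)
    val admin = connection.getAdmin
    try {
      // Create the namespace if it does not exist yet
      if (!admin.listNamespaceDescriptors().exists(_.getName == "mall")) {
        admin.createNamespace(NamespaceDescriptor.create("mall").build())
      }
      // Create the table with the focus_info column family used by the writer
      val tableName = TableName.valueOf("mall", "mall_user_course_focus")
      if (!admin.tableExists(tableName)) {
        val descriptor = TableDescriptorBuilder.newBuilder(tableName)
          .setColumnFamily(ColumnFamilyDescriptorBuilder.of("focus_info"))
          .build()
        admin.createTable(descriptor)
      }
    } finally {
      admin.close()
      connection.close()
    }
  }
}
```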
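After the job finishes, a single row can be read back with a Get as a quick spot check. This is only an illustrative sketch: `VerifyLindormRow` is a hypothetical helper, `12345` and `subject_a` are made-up uid / subject_type_name values, and the rowkey layout (uid string concatenated directly with subject_type_name) follows the writer above.

```
import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.hadoop.hbase.client.{ConnectionFactory, Get}
import org.apache.hadoop.hbase.util.Bytes

object VerifyLindormRow {
  def main(args: Array[String]): Unit = {
    val conf = HBaseConfiguration.create()
    conf.set("hbase.zookeeper.quorum", "ld-xxxxxxxxxxxxxxxxx.lindorm.rds.aliyuncs.com")
    conf.set("hbase.zookeeper.property.clientPort", "30020")
    conf.set("hbase.client.username", "username")
    conf.set("hbase.client.password", "password")
    val connection = ConnectionFactory.createConnection(conf)
    val table = connection.getTable(TableName.valueOf("mall", "mall_user_course_focus"))
    try {
      // Hypothetical uid + subject_type_name, concatenated the same way the writer builds its rowkey
      val rowkey = Bytes.toBytes("12345" + "subject_a")
      val result = table.get(new Get(rowkey))
      val bytes = result.getValue(Bytes.toBytes("focus_info"), Bytes.toBytes("lead_times"))
      // lead_times was written with Bytes.toBytes(Int), so it reads back as a 4-byte Int
      if (bytes != null) println(s"lead_times = ${Bytes.toInt(bytes)}")
      else println("row not found")
    } finally {
      table.close()
      connection.close()
    }
  }
}
```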