package com.shein
import org.apache.hadoop.hbase.TableName
import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.sql.SparkSession
object Hive2HbaseGlobal {
def main(args: Array[String]): Unit = {
val spark: SparkSession = SparkSession.builder()
.appName("hive2hbase")
// .master("local[2]")
.enableHiveSupport() //开启支持hive
.getOrCreate()
val exportQuorum=TotalUtil.getSystemParameterByKey("exportQuorum")
val exportTableName = TotalUtil.getSystemParameterByKey("exportTableName")
var exportColumnFamily = TotalUtil.getSystemParameterByKey("exportColumnFamily").getBytes()
val conns = TotalUtil.getHbase(exportQuorum)
val cell = TotalUtil.getSystemParameterByKey("cell")
val cells = cell.split(",")
val index = TotalUtil.getSystemParameterByKey("rowKeyIndex")
val inserCount = spark.sparkContext.longAccumulator("fooCount")
val hql = TotalUtil.getSystemParameterByKey("hql")
TotalUtil.createTable(conns,exportTableName,TotalUtil.getSystemParameterByKey("exportColumnFamily"))
val dataDF = spark.sql(hql)
// 写入数据到hbase
dataDF.foreachPartition(it => {
val conn = TotalUtil.getHbase(exportQuorum)
val insertTable = conn.getTable(TableName.valueOf(exportTableName))
var list = new java.util.ArrayList[Put]
it.foreach(row => {
if (row(index) != null && (!row(index).equals(""))) {
val rowkey = row(index)
inserCount.add(1)
val put = new Put(Bytes.toBytes(rowkey.toString))
for (i <- 0 until row.size) {
if (row(i) != null && (!row(i).equals(""))) {
put.addColumn(exportColumnFamily, Bytes.toBytes(cells(i)), Bytes.toBytes(row(i).toString))
}
}
list.add(put)
if (inserCount.value % 10000 == 0) {
insertTable.put(list)
list = new java.util.ArrayList[Put]
}
}
})
try {
insertTable.put(list)
} catch {
case _: Exception =>
}
insertTable.close()
conn.close()
})
println("inserCount:" + inserCount.value)
spark.stop()
spark.close()
}
}
简化版hive2hbase 通过系统参数的形式获取;不需要修改代码;因为只涉及到一个列族;所以写的有点呆板;