package com.ws.bulkloader
import org.apache.hadoop.fs.Path
import org.apache.hadoop.hbase.client.ConnectionFactory
import org.apache.hadoop.hbase.{HBaseConfiguration, KeyValue, TableName}
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.HFileOutputFormat2
import org.apache.hadoop.hbase.tool.LoadIncrementalHFiles
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.mapreduce.Job
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
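/**
 * Bulk-loads a Hive aggregate table into HBase without going through the
 * normal write path: read the table with Spark SQL, explode each row into
 * (rowkey, family, qualifier, value) cells, sort them, write HFiles with
 * HFileOutputFormat2, then register the files via LoadIncrementalHFiles.
 */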
object HiveBulk2Hbase {
  def main(args: Array[String]): Unit = {
    // Read the source table from Hive
    val spark = SparkSession.builder()
      .enableHiveSupport()
      .appName("Hive bulkloader 2 Hbase")
      .master("local") // local mode for testing; set the real master via spark-submit
      .getOrCreate()
    import spark.implicits._
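    // enableHiveSupport() resolves tables through the Hive metastore, so
    // hive-site.xml must be on the application's classpath.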
    val hivetable = spark.read.table("dws17.app_trf_agr_session")
    val hbaserows: RDD[(String, String, String, String)] = hivetable.flatMap(row => {
      // Stringify every column; a null String column becomes the literal
      // "null", which the filter below uses to drop the row.
      val guid = row.getAs[String]("guid") + ""
      val session_id = row.getAs[String]("session_id") + ""
      val start_ts = row.getAs[Long]("start_ts") + ""
      val end_ts = row.getAs[Long]("end_ts") + ""
      val first_page_id = row.getAs[String]("first_page_id") + ""
      val last_page_id = row.getAs[String]("last_page_id") + ""
      val pv_cnt = row.getAs[Int]("pv_cnt") + ""
      val isnew = row.getAs[Int]("isnew") + ""
      val hour_itv = row.getAs[Int]("hour_itv") + ""
      val province = row.getAs[String]("province") + ""
      val city = row.getAs[String]("city") + ""
      val region = row.getAs[String]("region") + ""
      val device_type = row.getAs[String]("device_type") + ""
      // One Hive row fans out into one HBase cell per column, so expand it
      // into (rowkey, family, qualifier, value) tuples and let flatMap
      // flatten the lists into individual records.
      val rowkey = start_ts + "|" + session_id
      Seq(
        ("guid", guid), ("session_id", session_id), ("start_ts", start_ts),
        ("end_ts", end_ts), ("first_page_id", first_page_id),
        ("last_page_id", last_page_id), ("pv_cnt", pv_cnt), ("isnew", isnew),
        ("hour_itv", hour_itv), ("province", province), ("city", city),
        ("region", region), ("device_type", device_type)
      ).map { case (qualifier, value) => (rowkey, "f", qualifier, value) }
      // Drop rows with a null rowkey component, then sort by
      // (rowkey, family, qualifier) — HFiles must be written in order.
    }).filter(tp => !tp._1.contains("null")).rdd.sortBy(x => (x._1, x._2, x._3))
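    // Note: a time-prefixed rowkey like "start_ts|session_id" is fine for
    // bulk load, but it hotspots online writes; salt or reverse the key if
    // this table will also receive realtime puts.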
    // HBase bulk load expects pairs of (ImmutableBytesWritable, KeyValue)
    val writedata = hbaserows.map(tp => {
      val key = new ImmutableBytesWritable(Bytes.toBytes(tp._1))
      val value = new KeyValue(Bytes.toBytes(tp._1), Bytes.toBytes(tp._2), Bytes.toBytes(tp._3), Bytes.toBytes(tp._4))
      (key, value)
    })
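    // HFileOutputFormat2 partitions and orders records by the
    // ImmutableBytesWritable key, so it must hold the same rowkey bytes as
    // the KeyValue.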
    // The HDFS and ZooKeeper addresses below belong to this demo cluster
    // (dream1..dream3); point them at your own environment.
    val conf = HBaseConfiguration.create()
    conf.set("fs.defaultFS", "hdfs://dream1:9000")
    conf.set("hbase.zookeeper.quorum", "dream1:2181,dream2:2181,dream3:2181")
    val job = Job.getInstance(conf)
    val conn = ConnectionFactory.createConnection(conf)
    val app_trf_agr_session = TableName.valueOf("app_trf_agr_session")
    val table = conn.getTable(app_trf_agr_session)
    val locator = conn.getRegionLocator(app_trf_agr_session)
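    // The bulk load assumes the target table already exists with column
    // family "f". A minimal sketch to create it if missing (HBase 2.x
    // Admin API; pre-split points are left out and worth tuning):
    // import org.apache.hadoop.hbase.client.{ColumnFamilyDescriptorBuilder, TableDescriptorBuilder}
    // if (!conn.getAdmin.tableExists(app_trf_agr_session)) {
    //   conn.getAdmin.createTable(
    //     TableDescriptorBuilder.newBuilder(app_trf_agr_session)
    //       .setColumnFamily(ColumnFamilyDescriptorBuilder.of("f"))
    //       .build())
    // }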
    // Configure the job for incremental load: sets the output format,
    // total-order partitioning and sort reducer so HFiles line up with regions
    HFileOutputFormat2.configureIncrementalLoad(job, table, locator)
    val outPath = "/bulkload/session_agr"
    // Write the sorted cells out as HFiles
    writedata.saveAsNewAPIHadoopFile(outPath, classOf[ImmutableBytesWritable], classOf[KeyValue], classOf[HFileOutputFormat2], job.getConfiguration)
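    // outPath is a staging directory: it must not exist before the save
    // (FileOutputFormat fails on an existing path) and is consumed by
    // doBulkLoad below.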
    // Move the generated HFiles into the table's regions; HBase registers
    // them and the rows become visible atomically
    val load = new LoadIncrementalHFiles(conf)
    load.doBulkLoad(new Path(outPath), conn.getAdmin, table, locator)
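    // Optional sanity check (the rowkey below is a hypothetical example);
    // bulk-loaded rows are readable as soon as doBulkLoad returns:
    // val g = new org.apache.hadoop.hbase.client.Get(Bytes.toBytes("1619357000000|some_session_id"))
    // println(table.get(g))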
    table.close()
    conn.close()
    spark.close()
  }
}