Spark hive2hbase_bulkloader

package com.ws.bulkloader

import org.apache.hadoop.fs.Path
import org.apache.hadoop.hbase.client.ConnectionFactory
import org.apache.hadoop.hbase.{HBaseConfiguration, KeyValue, TableName}
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.HFileOutputFormat2
import org.apache.hadoop.hbase.tool.LoadIncrementalHFiles
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.mapreduce.Job
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession

import scala.collection.mutable.ListBuffer

object HiveBulk2Hbase {
  def main(args: Array[String]): Unit = {

    // Read the source data from Hive
    val spark = SparkSession.builder()
      .enableHiveSupport()
      .appName("Hive bulkloader 2 Hbase")
      .master("local")
      .getOrCreate()
    import spark.implicits._
    val hivetable = spark.read.table("dws17.app_trf_agr_session")
    val hbaserows: RDD[(String, String, String, String)] = hivetable.flatMap(row => {
      val guid = row.getAs[String]("guid") + ""
      val session_id = row.getAs[String]("session_id") + ""
      val start_ts = row.getAs[Long]("start_ts") + ""
      val end_ts = row.getAs[Long]("end_ts") + ""
      val first_page_id = row.getAs[String]("first_page_id") + ""
      val last_page_id = row.getAs[String]("last_page_id") + ""
      val pv_cnt = row.getAs[Int]("pv_cnt") + ""
      val isnew = row.getAs[Int]("isnew") + ""
      val hour_itv = row.getAs[Int]("hour_itv") + ""
      val province = row.getAs[String]("province") + ""
      val city = row.getAs[String]("city") + ""
      val region = row.getAs[String]("region") + ""
      val device_type = row.getAs[String]("device_type") + ""
      val lst = new ListBuffer[(String, String, String, String)]
      // (rowkey, family, qualifier, value)
      lst += ((start_ts + "|" + session_id, "f", "guid", guid))
      lst += ((start_ts + "|" + session_id, "f", "session_id", session_id))
      lst += ((start_ts + "|" + session_id, "f", "start_ts", start_ts))
      lst += ((start_ts + "|" + session_id, "f", "end_ts", end_ts))
      lst += ((start_ts + "|" + session_id, "f", "first_page_id", first_page_id))
      lst += ((start_ts + "|" + session_id, "f", "last_page_id", last_page_id))
      lst += ((start_ts + "|" + session_id, "f", "pv_cnt", pv_cnt))
      lst += ((start_ts + "|" + session_id, "f", "isnew", isnew))
      lst += ((start_ts + "|" + session_id, "f", "hour_itv", hour_itv))
      lst += ((start_ts + "|" + session_id, "f", "province", province))
      lst += ((start_ts + "|" + session_id, "f", "city", city))
      lst += ((start_ts + "|" + session_id, "f", "region", region))
      lst += ((start_ts + "|" + session_id, "f", "device_type", device_type))
      lst
      // One Hive row corresponds to multiple HBase cells, so each row is split by column name
      // into (rowkey, family, qualifier, value) tuples and flatMapped into separate records.
      // The records must also be sorted by rowkey, family and qualifier before HFiles can be generated.
    }).filter(tp => !tp._1.contains("null")).rdd.sortBy(x=>(x._1,x._2,x._3))
    // HFileOutputFormat2 expects (ImmutableBytesWritable, KeyValue) pairs
    val writedata = hbaserows.map(tp => {
      val key = new ImmutableBytesWritable(Bytes.toBytes(tp._1))
      val value = new KeyValue(Bytes.toBytes(tp._1),Bytes.toBytes(tp._2), Bytes.toBytes(tp._3), Bytes.toBytes(tp._4))
      (key, value)
    })
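    // HBase/Hadoop configuration for the bulk load job: HDFS NameNode and the HBase ZooKeeper quorum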
    val conf = HBaseConfiguration.create()
    conf.set("fs.defaultFS","hdfs://dream1:9000")
    conf.set("hbase.zookeeper.quorum","dream1:2181,dream2:2181,dream3:2181")
    val job = Job.getInstance(conf)

    val conn = ConnectionFactory.createConnection(conf)
    val app_trf_agr_session = TableName.valueOf("app_trf_agr_session")
    val table = conn.getTable(app_trf_agr_session)
    val locator = conn.getRegionLocator(app_trf_agr_session)
    // Configure the job for incremental load: the output format, total-order partitioner
    // and sort reducer are derived from the table's current region boundaries
    HFileOutputFormat2.configureIncrementalLoad(job,table,locator)
    val outPath = "/bulkload/session_agr"
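    // Note: the output directory must not already exist on HDFS, otherwise the job fails at output validation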
    // Write the HFiles to HDFS
    writedata.saveAsNewAPIHadoopFile(outPath,classOf[ImmutableBytesWritable],classOf[KeyValue],classOf[HFileOutputFormat2],job.getConfiguration)
    val load = new LoadIncrementalHFiles(conf)
    // Load the generated HFiles into the target table: the regions take ownership of the files and the metadata is updated
    load.doBulkLoad(new Path(outPath),conn.getAdmin,table,locator)

    spark.close()
  }
}
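
The bulk load assumes the target table already exists in HBase with the column family f used above. Below is a minimal sketch of pre-creating it with the HBase 2.x Admin API; the table name, family and ZooKeeper quorum are taken from the job above, while the CreateSessionTable object name is just for illustration. If the table is expected to be large, pass pre-split keys to createTable so the HFiles spread across multiple regions.

package com.ws.bulkloader

import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.hadoop.hbase.client.{ColumnFamilyDescriptorBuilder, ConnectionFactory, TableDescriptorBuilder}

object CreateSessionTable {
  def main(args: Array[String]): Unit = {
    val conf = HBaseConfiguration.create()
    conf.set("hbase.zookeeper.quorum", "dream1:2181,dream2:2181,dream3:2181")
    val conn = ConnectionFactory.createConnection(conf)
    val admin = conn.getAdmin
    val tableName = TableName.valueOf("app_trf_agr_session")
    // Create the table with the single column family "f" if it does not exist yet
    if (!admin.tableExists(tableName)) {
      val desc = TableDescriptorBuilder.newBuilder(tableName)
        .setColumnFamily(ColumnFamilyDescriptorBuilder.of("f"))
        .build()
      admin.createTable(desc)
    }
    conn.close()
  }
}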
