Bulk loading data into HBase from Scala with BulkLoad

For setting up the deployment environment, see the other chapters.
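As a rough sketch of the build setup (my assumption, not spelled out in the original): the code compiles against the Spark core/SQL APIs and the HBase 2.x client and MapReduce modules. The versions below are placeholders; match them to your cluster, and note that depending on the HBase release, `LoadIncrementalHFiles` may ship in `hbase-server` rather than `hbase-mapreduce`.

```scala
// build.sbt -- artifact versions are assumptions; align them with your cluster.
libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-core"      % "2.4.0" % "provided",
  "org.apache.spark" %% "spark-sql"       % "2.4.0" % "provided",
  "org.apache.hbase" %  "hbase-client"    % "2.1.0",
  "org.apache.hbase" %  "hbase-mapreduce" % "2.1.0",
  "org.apache.hbase" %  "hbase-server"    % "2.1.0"
)
```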

The complete code is as follows:

import java.util.UUID

import org.apache.hadoop.fs.Path
import org.apache.hadoop.hbase.{HBaseConfiguration, KeyValue, TableName}
import org.apache.hadoop.hbase.client.ConnectionFactory
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.HFileOutputFormat2
import org.apache.hadoop.hbase.tool.LoadIncrementalHFiles
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.sql.SparkSession


object HbaseOpe {
  def bulkLoadToHbase(): Unit = {
    val spark = SparkSession.builder().appName("HbaseBulkLoad").getOrCreate()
    val sc = spark.sparkContext

    val tbname = "yykj_pro:rm2018"

    // Column qualifier -> index of that field in each CSV line (index 0 is the row key).
    val ckey: Map[String, Int] = Map("humidity" -> 1, "prcp" -> 2, "sunlit" -> 3, "tavg" -> 4, "tmax" -> 5, "tmin" -> 6, "wind" -> 7)

    // One input file per day of the year; tm is e.g. "2018035" (year + zero-padded day of year).
    for (i <- 35 to 365) {
      val tm = f"2018$i%03d"
      val txtpath = "/user/datas/%s.txt".format(tm)
      val txtRdd = sc.textFile(txtpath)

      // Each input file looks like this (first field is the row key, then the seven measurements):
      // 1870,0,0,0,0,0,0,0
      // 1871,0,0,0,0,0,0,0
      // 1872,0,0,0,0,0,0,0
      // 1873,0,0,0,0,0,0,0
      // 1874,0,0,0,0,0,0,0
      // 8067,0,0,0,0,0,0,0
      // 8068,0,0,0,0,0,0,0
      // 8069,0,0,0,0,0,0,0
      // 8070,0,0,0,0,0,0,0
      // 8071,0,0,0,0,0,0,0
      //
      // Repeated attempts to bulk-load several columns in a single pass failed,
      // so each column is written in its own bulk-load iteration below.
      // Note the sortBy: bulk load requires row keys in lexicographic order,
      // otherwise it fails with "Added a key not lexically larger than previous".
      val sortRdd = txtRdd.map(_.split(",")).sortBy(arr => arr(0))

      for (c <- ckey) {
        val outRdd = sortRdd.map(arr => {
          val rowKey = Bytes.toBytes(arr(0))
          val immutableRowKey = new ImmutableBytesWritable(rowKey)

          // One KeyValue per row: family "dynamic", qualifier = measurement name,
          // cell timestamp = the numeric date code tm, value = that measurement's
          // field from the CSV line (arr(c._2), not the index itself).
          val kv = new KeyValue(
            rowKey,
            Bytes.toBytes("dynamic"),
            Bytes.toBytes(c._1),
            tm.toLong,
            Bytes.toBytes(arr(c._2))
          )
          (immutableRowKey, kv)
        })

        val hbaseConf = HBaseConfiguration.create()
        // HFileOutputFormat2 reads the target table name from this property.
        hbaseConf.set("hbase.mapreduce.hfileoutputformat.table.name", tbname)

        val tableName = TableName.valueOf(tbname)
        val conn = ConnectionFactory.createConnection(hbaseConf)
        val table = conn.getTable(tableName)
        val regionLocator = conn.getRegionLocator(tableName)

        // Write the HFiles to a unique temporary directory.
        val hFileOutput = "/tmp/hbase/" + UUID.randomUUID().toString

        outRdd.saveAsNewAPIHadoopFile(hFileOutput,
          classOf[ImmutableBytesWritable],
          classOf[KeyValue],
          classOf[HFileOutputFormat2],
          hbaseConf
        )

        // Load the generated HFiles into HBase.
        val bulkLoader = new LoadIncrementalHFiles(hbaseConf)
        bulkLoader.doBulkLoad(new Path(hFileOutput), conn.getAdmin, table, regionLocator)

        // Close the per-iteration HBase resources.
        table.close()
        conn.close()
      }
    }
  }
}
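The namespace and target table (with its `dynamic` column family) must exist before `doBulkLoad` runs. Below is a minimal driver sketch; `HbaseOpeDriver` and the pre-creation logic are illustrative additions, not part of the original code:

```scala
import org.apache.hadoop.hbase.{HBaseConfiguration, NamespaceDescriptor, TableName}
import org.apache.hadoop.hbase.client.{ColumnFamilyDescriptorBuilder, ConnectionFactory, TableDescriptorBuilder}

object HbaseOpeDriver {
  def main(args: Array[String]): Unit = {
    val conn = ConnectionFactory.createConnection(HBaseConfiguration.create())
    val admin = conn.getAdmin

    // Create the namespace if missing (shell equivalent: create_namespace 'yykj_pro').
    if (!admin.listNamespaceDescriptors().exists(_.getName == "yykj_pro")) {
      admin.createNamespace(NamespaceDescriptor.create("yykj_pro").build())
    }

    // Create the table with the "dynamic" family the bulk load writes to
    // (shell equivalent: create 'yykj_pro:rm2018', 'dynamic').
    val tableName = TableName.valueOf("yykj_pro:rm2018")
    if (!admin.tableExists(tableName)) {
      admin.createTable(
        TableDescriptorBuilder.newBuilder(tableName)
          .setColumnFamily(ColumnFamilyDescriptorBuilder.of("dynamic"))
          .build())
    }
    admin.close()
    conn.close()

    HbaseOpe.bulkLoadToHbase()
  }
}
```

Submit it like any Spark job (e.g. `spark-submit --class HbaseOpeDriver ...`) with the HBase configuration directory on the classpath, so that `HBaseConfiguration.create()` picks up the cluster settings.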

 

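To spot-check a load afterwards, a plain Get against one of the row keys is enough. This snippet is an illustrative sketch; the row key "1870" comes from the sample data above:

```scala
import org.apache.hadoop.hbase.{CellUtil, HBaseConfiguration, TableName}
import org.apache.hadoop.hbase.client.{ConnectionFactory, Get}
import org.apache.hadoop.hbase.util.Bytes

object VerifyLoad {
  def main(args: Array[String]): Unit = {
    val conn = ConnectionFactory.createConnection(HBaseConfiguration.create())
    val table = conn.getTable(TableName.valueOf("yykj_pro:rm2018"))

    // Fetch every "dynamic" cell for one sample row key and print
    // qualifier, timestamp (the date code), and value.
    val result = table.get(new Get(Bytes.toBytes("1870")).addFamily(Bytes.toBytes("dynamic")))
    result.rawCells().foreach { cell =>
      println(Bytes.toString(CellUtil.cloneQualifier(cell)) +
        " @ " + cell.getTimestamp + " = " +
        Bytes.toString(CellUtil.cloneValue(cell)))
    }

    table.close()
    conn.close()
  }
}
```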