Method 1: Insert directly with the HBase Table put method
import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.hadoop.hbase.client.{Connection, ConnectionFactory, Put, Table}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.{SparkConf, SparkContext}

/**
 * Description: Use the HBase client's Put to insert data into HBase from Spark.
 *
 * Author : Adore Chen
 * Created: 2017-12-22
 */
object SparkPut {

  /**
   * Inserting 100,000 rows took 20762 ms.
   *
   * @param args
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("SparkPut")
    val context = new SparkContext(conf)
    try {
      val rdd = context.makeRDD(1 to 100000, 4)
      // column family
      val family = Bytes.toBytes("cf")
      // column qualifier: counter --> ctr
      val column = Bytes.toBytes("ctr")
      println("count is :" + rdd.count())
      rdd.take(5).foreach(println)

      // mapPartitions vs. foreachPartition:
      // mapPartitions is a lazy transformation -- without an action nothing runs;
      // foreachPartition is an action, so the inserts below execute immediately.
      rdd.foreachPartition(iter => {
        // one connection and table per partition; close both when the partition is done
        val conn = createConnection()
        val table: Table = conn.getTable(TableName.valueOf("test_table"))
        iter.foreach(value => {
          val put = new Put(Bytes.toBytes(value))
          put.addImmutable(family, column, Bytes.toBytes(value))
          table.put(put)
        })
        table.close()
        conn.close()
      })
    } finally {
      context.stop()
    }
  }
  /**
   * Create an HBase Connection; the caller is responsible for closing it.
   *
   * @return
   */
  def createConnection(): Connection = {
    val hbaseConf = HBaseConfiguration.create()
    hbaseConf.set("hbase.zookeeper.quorum", "localhost")
    hbaseConf.set("hbase.zookeeper.property.clientPort", "2181")
    hbaseConf.set("hbase.defaults.for.version.skip", "true")
    ConnectionFactory.createConnection(hbaseConf)
  }
}
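All three examples assume an existing table test_table with a single column family cf. If it has not been created yet, one way to do that from code is the HBase Admin API; the following is only a hedged sketch (not from the original post) using the HBase 1.x descriptor classes that match the client calls above, with the object name CreateTestTable made up for illustration.
import org.apache.hadoop.hbase.{HBaseConfiguration, HColumnDescriptor, HTableDescriptor, TableName}
import org.apache.hadoop.hbase.client.ConnectionFactory

object CreateTestTable {
  def main(args: Array[String]): Unit = {
    val hbaseConf = HBaseConfiguration.create()
    hbaseConf.set("hbase.zookeeper.quorum", "localhost")
    hbaseConf.set("hbase.zookeeper.property.clientPort", "2181")
    val conn = ConnectionFactory.createConnection(hbaseConf)
    val admin = conn.getAdmin
    try {
      val tableName = TableName.valueOf("test_table")
      if (!admin.tableExists(tableName)) {
        // one column family "cf", matching the Put examples above
        val desc = new HTableDescriptor(tableName)
        desc.addFamily(new HColumnDescriptor("cf"))
        admin.createTable(desc)
      }
    } finally {
      admin.close()
      conn.close()
    }
  }
}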
Method 2: Put(List) batch insert
import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.hadoop.hbase.client.{ConnectionFactory, Put}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.{SparkConf, SparkContext}
import scala.collection.JavaConversions

/**
 * Description: Batch-insert with put lists and a BufferedMutator in a Spark context.
 *
 * Author : Adore Chen
 * Created: 2017-12-22
 */
object SparkPutList {

  /**
   * Batch insert of 100,000 rows with mutator.mutate(put), cost: 22369 ms
   * Put-list insert of 100,000 rows, cost: 25571 ms
   * Put list built via map transformation, 100,000 rows, cost: 21299 ms
   *
   * @param args
   */
  def main(args: Array[String]): Unit = {
    // putByList()
    // putByMutator()
    putByMap()
  }
  def putByMap(): Unit = {
    val conf = new SparkConf().setAppName(SparkPutList.getClass().getSimpleName())
    val context = new SparkContext(conf)
    // column family
    val family = Bytes.toBytes("cf")
    // column qualifier: counter --> ctr
    val column = Bytes.toBytes("ctr")
    try {
      val rdd = context.makeRDD(1 to 100000, 4)
      rdd.map(value => {
        val put = new Put(Bytes.toBytes(value))
        put.addImmutable(family, column, Bytes.toBytes(value))
        put
      }).foreachPartition(
        itr => {
          // relies on hbase-site.xml being available on the executor classpath
          val hbaseConf = HBaseConfiguration.create()
          val conn = ConnectionFactory.createConnection(hbaseConf)
          val table = conn.getTable(TableName.valueOf("test_table"))
          table.put(JavaConversions.seqAsJavaList(itr.toSeq))
          table.close()
          conn.close()
        })
    } finally {
      context.stop()
    }
  }
  def putByList(): Unit = {
    val conf = new SparkConf().setAppName(SparkPutList.getClass().getSimpleName())
    val context = new SparkContext(conf)
    // column family
    val family = Bytes.toBytes("cf")
    // column qualifier: counter --> ctr
    val column = Bytes.toBytes("ctr")
    try {
      val rdd = context.makeRDD(1 to 100000, 4)
      rdd.foreachPartition(list => {
        val hbaseConf = HBaseConfiguration.create()
        val conn = ConnectionFactory.createConnection(hbaseConf)
        val table = conn.getTable(TableName.valueOf("test_table"))
        val putList = new java.util.LinkedList[Put]()
        list.foreach(value => {
          val put = new Put(Bytes.toBytes(value))
          put.addImmutable(family, column, Bytes.toBytes(value))
          putList.add(put)
        })
        // a single put(List) call sends the whole batch for this partition
        table.put(putList)
        table.close()
        conn.close()
      })
    } finally {
      context.stop()
    }
  }
  def putByMutator(): Unit = {
    val conf = new SparkConf().setAppName(SparkPutList.getClass().getSimpleName())
    val context = new SparkContext(conf)
    // column family
    val family = Bytes.toBytes("cf")
    // column qualifier: counter --> ctr
    val column = Bytes.toBytes("ctr")
    try {
      val rdd = context.makeRDD(1 to 100000, 4)
      rdd.foreachPartition(list => {
        val hbaseConf = HBaseConfiguration.create()
        val conn = ConnectionFactory.createConnection(hbaseConf)
        val mutator = conn.getBufferedMutator(TableName.valueOf("test_table"))
        list.foreach(value => {
          val put = new Put(Bytes.toBytes(value))
          put.addImmutable(family, column, Bytes.toBytes(value))
          mutator.mutate(put)
        })
        // close() flushes any buffered mutations before releasing resources
        mutator.close()
        conn.close()
      })
    } finally {
      context.stop()
    }
  }
}
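putByMutator relies on the default client write buffer (the hbase.client.write.buffer setting, 2 MB by default). If you want to experiment with a larger buffer, the HBase client lets you pass one explicitly through BufferedMutatorParams; the following is only a sketch of that idea (the 8 MB value, the row count, and the object name are arbitrary choices for illustration, not from the original post).
import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.hadoop.hbase.client.{BufferedMutatorParams, ConnectionFactory, Put}
import org.apache.hadoop.hbase.util.Bytes

object BufferedMutatorWithParams {
  def main(args: Array[String]): Unit = {
    val conn = ConnectionFactory.createConnection(HBaseConfiguration.create())
    // ask for an 8 MB write buffer instead of the default hbase.client.write.buffer
    val params = new BufferedMutatorParams(TableName.valueOf("test_table"))
      .writeBufferSize(8 * 1024 * 1024)
    val mutator = conn.getBufferedMutator(params)
    try {
      (1 to 1000).foreach { value =>
        val put = new Put(Bytes.toBytes(value))
        put.addImmutable(Bytes.toBytes("cf"), Bytes.toBytes("ctr"), Bytes.toBytes(value))
        mutator.mutate(put)
      }
      // force any remaining buffered Puts to the server
      mutator.flush()
    } finally {
      mutator.close()
      conn.close()
    }
  }
}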
Method 3: Write to HBase with a MapReduce job
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.TableOutputFormat
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.mapreduce.Job
import org.apache.spark.{SparkConf, SparkContext}

/**
 * Description: Put data into HBase through a MapReduce job (TableOutputFormat).
 *
 * Author : Adore Chen
 * Created: 2017-12-22
 */
object SparkMapJob {

  /**
   * Inserting 100,000 rows took 21035 ms.
   *
   * @param args
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("SparkPutByMap")
    val context = new SparkContext(conf)

    val hbaseConf = HBaseConfiguration.create()
    hbaseConf.set(TableOutputFormat.OUTPUT_TABLE, "test_table")
    // IMPORTANT: must be set, otherwise the job fails with "can not create a Path from a null string"
    hbaseConf.set("mapreduce.output.fileoutputformat.outputdir", "/tmp")

    val job = Job.getInstance(hbaseConf)
    job.setOutputFormatClass(classOf[TableOutputFormat[ImmutableBytesWritable]])
    job.setOutputKeyClass(classOf[ImmutableBytesWritable])
    job.setOutputValueClass(classOf[Put])

    try {
      val rdd = context.makeRDD(1 to 100000)
      // column family
      val family = Bytes.toBytes("cf")
      // column qualifier: counter --> ctr
      val column = Bytes.toBytes("ctr")
      rdd.map(value => {
        val put = new Put(Bytes.toBytes(value))
        put.addImmutable(family, column, Bytes.toBytes(value))
        // TableOutputFormat ignores the key; only the Put value is written
        (new ImmutableBytesWritable(), put)
      })
        .saveAsNewAPIHadoopDataset(job.getConfiguration)
    } finally {
      context.stop()
    }
  }
}
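Whichever method wrote the data, a quick way to spot-check the result is to read a few rows back with Get. The snippet below is an illustrative sketch (the object name and sampled keys are arbitrary), assuming the same test_table / cf / ctr layout and the 4-byte int row keys used above.
import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.hadoop.hbase.client.{ConnectionFactory, Get}
import org.apache.hadoop.hbase.util.Bytes

object VerifyRows {
  def main(args: Array[String]): Unit = {
    val conn = ConnectionFactory.createConnection(HBaseConfiguration.create())
    val table = conn.getTable(TableName.valueOf("test_table"))
    try {
      // row keys are the 4-byte encoding of the int values written above
      Seq(1, 50000, 100000).foreach { key =>
        val result = table.get(new Get(Bytes.toBytes(key)))
        val bytes = result.getValue(Bytes.toBytes("cf"), Bytes.toBytes("ctr"))
        println(s"row $key -> " + (if (bytes == null) "missing" else Bytes.toInt(bytes)))
      }
    } finally {
      table.close()
      conn.close()
    }
  }
}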
References:
https://blog.csdn.net/adorechen/article/details/82465140
https://blog.csdn.net/koukan3/article/details/102721733