Spark: multi-threaded batch reads from HBase

Scala version 2.11.8, Spark version 2.1.0

 <!--*************************************************************************-->
      <dependency>
          <groupId>org.apache.spark</groupId>
          <artifactId>spark-core_2.11</artifactId>
          <version>2.1.0</version>
      </dependency>
      <dependency>
          <groupId>org.apache.hbase</groupId>
          <artifactId>hbase-server</artifactId>
          <version>1.1.2</version>
      </dependency>
      <dependency>
          <groupId>org.apache.spark</groupId>
          <artifactId>spark-streaming_2.11</artifactId>
          <version>2.1.0</version>
      </dependency>
      <dependency>
          <groupId>org.apache.spark</groupId>
          <artifactId>spark-sql_2.11</artifactId>
          <version>2.1.0</version>
      </dependency>
   <!--*************************************************************************-->
import java.util.concurrent.{Executors, Future}
import org.apache.hadoop.hbase.client.Result
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD
import scala.collection.mutable.ArrayBuffer


object ThreadReadHbase {
  // Create an elastic (cached) thread pool
  val pool = Executors.newCachedThreadPool()
  // The meaning of these configuration parameters is explained in another post: https://blog.csdn.net/u010916338/article/details/80949525
  val sparkConf = new SparkConf().setAppName("ThreadReadHbase").setMaster("yarn")
    .set("spark.executor.instances", "5") // number of executors
    .set("spark.driver.memory", "2g")
    .set("spark.executor.memory", "2g")
    .set("spark.driver.cores", "4")
    .set("spark.executor.cores", "4")
    .set("spark.streaming.blockInterval", "50")
    .set("spark.default.parallelism", "10")
    .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") // Kryo serialization; HBase stores everything as bytes

  val sc = new SparkContext(sparkConf)

  // Build the RDD by scanning a rowkey (timestamp) range for each user
  def getRDDbyTimestamp() = {

    // userids; each userid forms the leading segment of the rowkey
    val userArr = Array("981112", "991111")

    // Mutable buffer holding each thread's result; Callable results are retrieved through Future
    val results = new ArrayBuffer[Future[RDD[(ImmutableBytesWritable, Result)]]]()

    // Start one thread per user, each scanning one day of data
    for (user <- userArr) {
      // "18070500" stands for 2018-07-05 hour 00; it is the second rowkey segment, right after the userid.
      // Only one SparkContext may exist per application, so it is passed into each scanner.
      val hbaseDataScaner = new HbaseDataScaner(user + "18070500", user + "18070524", sc)
      pool.synchronized {
        val rdd: Future[RDD[(ImmutableBytesWritable, Result)]] = pool.submit(hbaseDataScaner)
        results += rdd
      }
    }

    // Holds the merged RDD
    var rdds: RDD[(ImmutableBytesWritable, Result)] = null

    // Take the first RDD
    if (results.size > 0) {
      rdds = results(0).get()
    }

    // Union the second and later RDDs onto it
    if (results.size > 1) {
      results.remove(0)
      for (x <- results) {
        println("merging " + x.get())
        rdds = rdds.union(x.get())
      }
    }

    // Print the RDD contents. Use collect() with care: it pulls all data back to the driver,
    // which only works for small result sets.
    rdds.collect().foreach { case (_, result) =>
      // row key
      val key = Bytes.toString(result.getRow)
      // fetch a cell by column family and qualifier
      val os_info = Bytes.toString(result.getValue("f1".getBytes, "c_name".getBytes))
      println("Row key:" + key + " Name:" + os_info)
    }
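    // A lighter-weight spot check for large tables (a sketch, not in the original): pull only a few rows
    //   rdds.take(10).foreach { case (_, result) => println(Bytes.toString(result.getRow)) }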


    // Re-assemble the RDD elements as pipe-delimited strings, per the business requirements
    val rdd = rdds.map(result => (
      // row key
      Bytes.toString(result._2.getRow)+"|"+
      // cells by column family and qualifier
      Bytes.toString(result._2.getValue("f1".getBytes, "c_name".getBytes))+"|"+
      Bytes.toString(result._2.getValue("f1".getBytes, "c_code".getBytes))+"|"+
      Bytes.toString(result._2.getValue("f1".getBytes, "c_content".getBytes))+"|"+
      Bytes.toString(result._2.getValue("f1".getBytes, "c_dept".getBytes))+"|"+
      Bytes.toString(result._2.getValue("f1".getBytes, "c_log_type".getBytes))+"|"+
      Bytes.toString(result._2.getValue("f1".getBytes, "c_result".getBytes))+"|"+
      Bytes.toString(result._2.getValue("f1".getBytes, "c_sn".getBytes))+"|"+
      Bytes.toString(result._2.getValue("f1".getBytes, "n_log_time".getBytes))
    ))

    // Save the RDD to HDFS
    rdd.saveAsTextFile("/rdd/hbase")
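    // Note: saveAsTextFile fails if /rdd/hbase already exists on HDFS; remove it or change the path between runs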

    // Return the RDD for downstream analysis
    rdd
  }



  def main(args: Array[String]): Unit = {
    val startTime = System.currentTimeMillis()
    val hbaseRDD = ThreadReadHbase.getRDDbyTimestamp()
    println("行数:" + hbaseRDD.count())
    val endTime = System.currentTimeMillis()
    println("运行时间:" + (endTime - startTime) / 1000 / 60 + "分钟")
    // Finally, stop the SparkContext
    sc.stop()
  }


}
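As a side note, when many userids are queried the pairwise union above builds a long chain of RDDs; SparkContext.union can merge them in a single call. A minimal sketch of that alternative, reusing the results buffer from the code above:

// Merge every per-thread RDD in one call instead of chaining union()
val merged: RDD[(ImmutableBytesWritable, Result)] = sc.union(results.map(_.get()))

The HbaseDataScaner class submitted to the thread pool lives in a separate file: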
import java.util.concurrent.Callable
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.{Result, Scan}
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.hadoop.hbase.protobuf.ProtobufUtil
import org.apache.hadoop.hbase.util.{Base64, Bytes}
import org.apache.spark.rdd.RDD
import org.apache.spark.SparkContext


// A thread that returns a value must implement the Callable interface; in Scala this is done with the extends keyword.
// The type is instantiated once per query, so it has to be a class rather than an object.
class HbaseDataScaner(startRowkey: String, endRowkey: String, sc: SparkContext) extends Callable[RDD[(ImmutableBytesWritable, Result)]] {
  val tablename = "usersaction"
  val conf = HBaseConfiguration.create()
  // ZooKeeper quorum; this could also come from an hbase-site.xml on the classpath, but setting it in code is recommended here
  conf.set("hbase.zookeeper.quorum", "192.168.35.206,192.168.35.207,192.168.35.208")
  // ZooKeeper client port, default 2181
  conf.set("hbase.zookeeper.property.clientPort", "2181")
  conf.set(TableInputFormat.INPUT_TABLE, tablename)

  // Rowkey range scan
  val scan = new Scan(Bytes.toBytes(startRowkey), Bytes.toBytes(endRowkey))
  // Enable block caching for the scan
  scan.setCacheBlocks(true)
  // Serialize the Scan to a Base64 string so it can be passed through the Hadoop Configuration
  val proto = ProtobufUtil.toScan(scan)
  val scan_str = Base64.encodeBytes(proto.toByteArray)
  conf.set(TableInputFormat.SCAN, scan_str)

  val hbaseRDD = sc.newAPIHadoopRDD(conf, classOf[TableInputFormat], classOf[ImmutableBytesWritable], classOf[Result])

  // The Callable's return type must be RDD[(ImmutableBytesWritable, Result)]
  override def call(): RDD[(ImmutableBytesWritable, Result)] = {
    hbaseRDD
  }


}
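For reference, the same rowkey range can be sanity-checked outside Spark with a plain HBase client scan. The snippet below is only a minimal sketch, assuming the same table, column family, rowkeys and ZooKeeper quorum as above; it is not part of the original job.

import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.hadoop.hbase.client.{ConnectionFactory, Scan}
import org.apache.hadoop.hbase.util.Bytes
import scala.collection.JavaConverters._

object ScanCheck {
  def main(args: Array[String]): Unit = {
    val conf = HBaseConfiguration.create()
    conf.set("hbase.zookeeper.quorum", "192.168.35.206,192.168.35.207,192.168.35.208")
    conf.set("hbase.zookeeper.property.clientPort", "2181")
    val connection = ConnectionFactory.createConnection(conf)
    val table = connection.getTable(TableName.valueOf("usersaction"))
    // Same userid + timestamp rowkey layout as in HbaseDataScaner
    val scan = new Scan(Bytes.toBytes("981112" + "18070500"), Bytes.toBytes("981112" + "18070524"))
    val scanner = table.getScanner(scan)
    try {
      for (result <- scanner.asScala) {
        println(Bytes.toString(result.getRow) + " -> " +
          Bytes.toString(result.getValue("f1".getBytes, "c_name".getBytes)))
      }
    } finally {
      scanner.close()
      table.close()
      connection.close()
    }
  }
}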