Based on HBase version 2.3.5
Filter overview
HBase ships with a rich set of filters to make data processing more efficient. Data can be filtered with built-in or custom filters, and every filter is evaluated on the server side, i.e. predicate push-down.
Benefit: rows that are filtered out are never shipped to the client, which reduces both network traffic and the processing load on the client.
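For example, here is a minimal standalone sketch of server-side filtering (assuming an HBase 2.x client; the table name TMP:DEMO, the column family i and the compared value are made up for illustration). The SingleColumnValueFilter travels with the Scan and is evaluated on the RegionServers, so only matching rows are returned to the client:
import org.apache.hadoop.hbase.{CompareOperator, HBaseConfiguration, TableName}
import org.apache.hadoop.hbase.client.{ConnectionFactory, Scan}
import org.apache.hadoop.hbase.filter.{BinaryComparator, SingleColumnValueFilter}
import org.apache.hadoop.hbase.util.Bytes

object ServerSideFilterSketch {
  def main(args: Array[String]): Unit = {
    val connection = ConnectionFactory.createConnection(HBaseConfiguration.create())
    val table = connection.getTable(TableName.valueOf("TMP:DEMO")) // hypothetical table

    // keep only rows whose i:task_type equals "scan"; the comparison runs on the RegionServer
    val filter = new SingleColumnValueFilter(
      Bytes.toBytes("i"),
      Bytes.toBytes("task_type"),
      CompareOperator.EQUAL,
      new BinaryComparator(Bytes.toBytes("scan")))
    filter.setFilterIfMissing(true) // also drop rows that do not contain the column at all

    val scanner = table.getScanner(new Scan().setFilter(filter))
    try {
      val it = scanner.iterator()
      while (it.hasNext) println(Bytes.toString(it.next().getRow))
    } finally {
      scanner.close()
      table.close()
      connection.close()
    }
  }
}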
Reading HBase from Spark with a Filter
TableInputFormat source code
package org.apache.hadoop.hbase.mapreduce;
public class TableInputFormat extends TableInputFormatBase
    implements Configurable {

  @SuppressWarnings("hiding")
  private static final Logger LOG = LoggerFactory.getLogger(TableInputFormat.class);

  /** Job parameter that specifies the input table. */
  public static final String INPUT_TABLE = "hbase.mapreduce.inputtable";
  /**
   * If specified, use start keys of this table to split.
   * This is useful when you are preparing data for bulkload.
   */
  private static final String SPLIT_TABLE = "hbase.mapreduce.splittable";
  /** Base-64 encoded scanner. All other SCAN_ confs are ignored if this is specified.
   * See {@link TableMapReduceUtil#convertScanToString(Scan)} for more details.
   */
  public static final String SCAN = "hbase.mapreduce.scan";
  /** Scan start row */
  public static final String SCAN_ROW_START = "hbase.mapreduce.scan.row.start";
  /** Scan stop row */
  public static final String SCAN_ROW_STOP = "hbase.mapreduce.scan.row.stop";
  ...
As the source shows, TableInputFormat exposes a SCAN parameter, but it is not obvious how to load a Scan into the HBaseConfiguration: Configuration.set only accepts strings, so the key to the problem is converting the constructed Scan into a String.
Newer HBase versions (TableMapReduceUtil)
public static String convertScanToString(Scan scan) throws IOException {
  ClientProtos.Scan proto = ProtobufUtil.toScan(scan);
  return Bytes.toString(Base64.getEncoder().encode(proto.toByteArray()));
}
Older versions
ClientProtos.Scan proto = ProtobufUtil.toScan(scan);
return Base64.encodeBytes(proto.toByteArray()); // uses the HBase-bundled org.apache.hadoop.hbase.util.Base64
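For reference, a short sketch of the round trip (assuming the 2.x hbase-mapreduce module, where both helpers are public): convertScanToString produces the Base64 string that is stored under TableInputFormat.SCAN, and convertStringToScan is the reverse helper that TableInputFormat relies on to rebuild the Scan from the configuration:
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.Scan
import org.apache.hadoop.hbase.mapreduce.{TableInputFormat, TableMapReduceUtil}
import org.apache.hadoop.hbase.util.Bytes

object ScanRoundTripSketch {
  def main(args: Array[String]): Unit = {
    val scan = new Scan().withStartRow(Bytes.toBytes("7d32875")) // any configured Scan

    // Scan -> Base64 string, stored in the job configuration
    val hconf = HBaseConfiguration.create()
    hconf.set(TableInputFormat.SCAN, TableMapReduceUtil.convertScanToString(scan))

    // TableInputFormat performs the equivalent of this reverse step when it reads the configuration
    val restored: Scan = TableMapReduceUtil.convertStringToScan(hconf.get(TableInputFormat.SCAN))
    println(Bytes.toString(restored.getStartRow))
  }
}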
代码示例
package hbase_dml

import com.alibaba.fastjson.{JSON, JSONObject}
import entity.TmpSampleInfo
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.hbase.client.{Result, Scan}
import org.apache.hadoop.hbase.filter.{FilterList, NullComparator, SingleColumnValueFilter}
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.{TableInputFormat, TableMapReduceUtil}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.{CompareOperator, HBaseConfiguration}
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, SparkSession}

import java.util
import scala.jdk.CollectionConverters._

/**
 * @author lzx
 * @description: Spark reads HBase with an HBase Filter applied server-side and converts the result into a DataFrame
 */
object SparkHbaseWithFilter2DF {

  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setAppName(this.getClass.getSimpleName)
      .setMaster("local[*]")
    val session: SparkSession = SparkSession.builder().config(conf).getOrCreate()

    // 1. Create the HBaseConfiguration
    val hconf: Configuration = HBaseConfiguration.create()
    hconf.set("hbase.zookeeper.property.clientPort", "2181")
    hconf.set("hbase.zookeeper.quorum", "xxx,xxx,xxx")
    hconf.set("hbase.master", "xxx:16000")
    hconf.set(org.apache.hadoop.hbase.mapreduce.TableInputFormat.INPUT_TABLE, "TMP:HR_DEFINED_SCAN")

    // 2. Attach the filter: serialize the Scan with TableMapReduceUtil.convertScanToString(getScan)
    hconf.set(org.apache.hadoop.hbase.mapreduce.TableInputFormat.SCAN, TableMapReduceUtil.convertScanToString(getScan))

    // 3. Build the RDD
    val hbaseRdd: RDD[(ImmutableBytesWritable, Result)] = session.sparkContext.newAPIHadoopRDD(
      hconf, classOf[TableInputFormat], classOf[ImmutableBytesWritable], classOf[Result])

    import session.implicits._

    // 4. Build the DataFrame
    val df: DataFrame = hbaseRdd.repartition(400).mapPartitions(iter => {
      val arrayList = new util.ArrayList[TmpSampleInfo]()
      iter.foreach(kv => {
        val jsonObj = new JSONObject()
        val sha1: String = Bytes.toString(kv._1.get())
        jsonObj.put("sha1", sha1)
        val familyMap: util.NavigableMap[Array[Byte], Array[Byte]] = kv._2.getFamilyMap(Bytes.toBytes("i"))
        val keyIter: util.Iterator[Array[Byte]] = familyMap.keySet().iterator()
        while (keyIter.hasNext) {
          val key: Array[Byte] = keyIter.next()
          val keyStr: String = Bytes.toString(key)
          println(sha1, keyStr) // debug output
          if (keyStr.contains("time") || keyStr == "task_id" || keyStr == "id") {
            // special handling for Phoenix-encoded BIGINTs (Phoenix flips the sign bit)
            val valueStr: Long = Bytes.toLong(familyMap.get(key)) - 9223372036854775807L - 1
            jsonObj.put(keyStr, valueStr)
          } else {
            val valueStr: String = Bytes.toString(familyMap.get(key))
            jsonObj.put(keyStr, valueStr)
          }
        }
        arrayList.add(JSON.parseObject(jsonObj.toJSONString, classOf[TmpSampleInfo]))
      })
      arrayList.asScala.iterator
    }).toDF()

    df.show(false)

    /****
     * business logic ...
     **/

    session.close()
  }

  /**
   * @Author: lzx
   * @Description: filter: where task_type is not null AND id is not null
   * @return: org.apache.hadoop.hbase.client.Scan
   */
  def getScan: Scan = {
    // create the Scan
    val scan = new Scan()
    val filterList = new FilterList()
    val singleColumnValueFilter1 = new SingleColumnValueFilter(
      Bytes.toBytes("i"),
      Bytes.toBytes("task_type"),
      CompareOperator.NOT_EQUAL,
      new NullComparator // or "".getBytes()
    )
    val singleColumnValueFilter2 = new SingleColumnValueFilter(
      Bytes.toBytes("i"),
      Bytes.toBytes("id"),
      CompareOperator.NOT_EQUAL,
      new NullComparator
    )
    // if true, the whole row is skipped when the column is not found
    singleColumnValueFilter2.setFilterIfMissing(true)
    singleColumnValueFilter1.setFilterIfMissing(true)
    // add the filters
    filterList.addFilter(singleColumnValueFilter2)
    filterList.addFilter(singleColumnValueFilter1)
    // attach the filter list and the row-key range to the Scan
    scan.setFilter(filterList)
    scan.withStartRow(Bytes.toBytes("7d32875"))
    scan.withStopRow(Bytes.toBytes("89a038d4"))
    scan
  }
}
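A design note on getScan: new FilterList() uses the default operator MUST_PASS_ALL, so the two column filters are AND-ed, which matches the intended "task_type is not null AND id is not null" predicate. If OR semantics were needed instead, the operator would be passed explicitly; a minimal sketch reusing the same column family and qualifiers:
import org.apache.hadoop.hbase.CompareOperator
import org.apache.hadoop.hbase.filter.{Filter, FilterList, NullComparator, SingleColumnValueFilter}
import org.apache.hadoop.hbase.util.Bytes

object OrFilterSketch {
  // a row passes if at least one of the two columns is non-null (OR instead of the default AND)
  def orFilter: Filter = {
    val orList = new FilterList(FilterList.Operator.MUST_PASS_ONE)
    orList.addFilter(new SingleColumnValueFilter(
      Bytes.toBytes("i"), Bytes.toBytes("task_type"), CompareOperator.NOT_EQUAL, new NullComparator))
    orList.addFilter(new SingleColumnValueFilter(
      Bytes.toBytes("i"), Bytes.toBytes("id"), CompareOperator.NOT_EQUAL, new NullComparator))
    orList
  }
}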