Based on HBase version 2.3.5
Filter overview
HBase ships with a rich set of filters to make data processing more efficient. Data can be filtered with built-in or custom filters, and every filter is evaluated on the server side, i.e. predicate push-down.
Benefit: rows that are filtered out are never shipped to the client, which reduces both network traffic and the processing load on the client.
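For example, here is a minimal standalone sketch of server-side filtering (assuming an HBase 2.x client; the table name TMP:DEMO, the column family i and the compared value are made up for illustration). The SingleColumnValueFilter travels with the Scan and is evaluated on the RegionServers, so only matching rows are returned to the client:
import org.apache.hadoop.hbase.{CompareOperator, HBaseConfiguration, TableName}
import org.apache.hadoop.hbase.client.{ConnectionFactory, Scan}
import org.apache.hadoop.hbase.filter.{BinaryComparator, SingleColumnValueFilter}
import org.apache.hadoop.hbase.util.Bytes

object ServerSideFilterSketch {
  def main(args: Array[String]): Unit = {
    val connection = ConnectionFactory.createConnection(HBaseConfiguration.create())
    val table = connection.getTable(TableName.valueOf("TMP:DEMO")) // hypothetical table

    // keep only rows whose i:task_type equals "scan"; the comparison runs on the RegionServer
    val filter = new SingleColumnValueFilter(
      Bytes.toBytes("i"),
      Bytes.toBytes("task_type"),
      CompareOperator.EQUAL,
      new BinaryComparator(Bytes.toBytes("scan")))
    filter.setFilterIfMissing(true) // also drop rows that do not contain the column at all

    val scanner = table.getScanner(new Scan().setFilter(filter))
    try {
      val it = scanner.iterator()
      while (it.hasNext) println(Bytes.toString(it.next().getRow))
    } finally {
      scanner.close()
      table.close()
      connection.close()
    }
  }
}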
Reading HBase from Spark with a Filter
TableInputFormat source code
package org.apache.hadoop.hbase.mapreduce;
public class TableInputFormat extends TableInputFormatBase
    implements Configurable {

  @SuppressWarnings("hiding")
  private static final Logger LOG = LoggerFactory.getLogger(TableInputFormat.class);

  /** Job parameter that specifies the input table. */
  public static final String INPUT_TABLE = "hbase.mapreduce.inputtable";
  /**
   * If specified, use start keys of this table to split.
   * This is useful when you are preparing data for bulkload.
   */
  private static final String SPLIT_TABLE = "hbase.mapreduce.splittable";
  /** Base-64 encoded scanner. All other SCAN_ confs are ignored if this is specified.
   * See {@link TableMapReduceUtil#convertScanToString(Scan)} for more details.
   */
  public static final String SCAN = "hbase.mapreduce.scan";
  /** Scan start row */
  public static final String SCAN_ROW_START = "hbase.mapreduce.scan.row.start";
  /** Scan stop row */
  public static final String SCAN_ROW_STOP = "hbase.mapreduce.scan.row.stop";
  ...
As the source shows, TableInputFormat exposes a SCAN parameter, but it is not obvious how to load a Scan into the HBaseConfiguration: Configuration.set only accepts strings, so the key to the problem is converting the constructed Scan into a String.
Newer HBase versions (TableMapReduceUtil)
public static String convertScanToString(Scan scan) throws IOException {
  ClientProtos.Scan proto = ProtobufUtil.toScan(scan);
  return Bytes.toString(Base64.getEncoder().encode(proto.toByteArray()));
}
Older versions
ClientProtos.Scan proto = ProtobufUtil.toScan(scan);
return Base64.encodeBytes(proto.toByteArray()); // uses the HBase-bundled org.apache.hadoop.hbase.util.Base64
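For reference, a short sketch of the round trip (assuming the 2.x hbase-mapreduce module, where both helpers are public): convertScanToString produces the Base64 string that is stored under TableInputFormat.SCAN, and convertStringToScan is the reverse helper that TableInputFormat relies on to rebuild the Scan from the configuration:
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.Scan
import org.apache.hadoop.hbase.mapreduce.{TableInputFormat, TableMapReduceUtil}
import org.apache.hadoop.hbase.util.Bytes

object ScanRoundTripSketch {
  def main(args: Array[String]): Unit = {
    val scan = new Scan().withStartRow(Bytes.toBytes("7d32875")) // any configured Scan

    // Scan -> Base64 string, stored in the job configuration
    val hconf = HBaseConfiguration.create()
    hconf.set(TableInputFormat.SCAN, TableMapReduceUtil.convertScanToString(scan))

    // TableInputFormat performs the equivalent of this reverse step when it reads the configuration
    val restored: Scan = TableMapReduceUtil.convertStringToScan(hconf.get(TableInputFormat.SCAN))
    println(Bytes.toString(restored.getStartRow))
  }
}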
代码示例
package hbase_dml

import com.alibaba.fastjson.{JSON, JSONObject}
import entity.TmpSampleInfo
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.hbase.client.{Result, Scan}
import org.apache.hadoop.hbase.filter.{FilterList, NullComparator, SingleColumnValueFilter}
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.{TableInputFormat, TableMapReduceUtil}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.{CompareOperator, HBaseConfiguration}
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, SparkSession}

import java.util
import scala.jdk.CollectionConverters._

/**
 * @author lzx
 * @description: Spark reads HBase with an HBase Filter applied server-side and converts the result into a DataFrame
 */
object SparkHbaseWithFilter2DF {

  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setAppName(this.getClass.getSimpleName)
      .setMaster("local[*]")
    val session: SparkSession = SparkSession.builder().config(conf).getOrCreate()

    // 1. Create the HBaseConfiguration
    val hconf: Configuration = HBaseConfiguration.create()
    hconf.set("hbase.zookeeper.property.clientPort", "2181")
    hconf.set("hbase.zookeeper.quorum", "xxx,xxx,xxx")
    hconf.set("hbase.master", "xxx:16000")
    hconf.set(org.apache.hadoop.hbase.mapreduce.TableInputFormat.INPUT_TABLE, "TMP:HR_DEFINED_SCAN")

    // 2. Attach the filter: serialize the Scan with TableMapReduceUtil.convertScanToString(getScan)
    hconf.set(org.apache.hadoop.hbase.mapreduce.TableInputFormat.SCAN, TableMapReduceUtil.convertScanToString(getScan))

    // 3. Build the RDD
    val hbaseRdd: RDD[(ImmutableBytesWritable, Result)] = session.sparkContext.newAPIHadoopRDD(
      hconf, classOf[TableInputFormat], classOf[ImmutableBytesWritable], classOf[Result])

    import session.implicits._

    // 4. Build the DataFrame
    val df: DataFrame = hbaseRdd.repartition(400).mapPartitions(iter => {
      val arrayList = new util.ArrayList[TmpSampleInfo]()
      iter.foreach(kv => {
        val jsonObj = new JSONObject()
        val sha1: String = Bytes.toString(kv._1.get())
        jsonObj.put("sha1", sha1)
        val familyMap: util.NavigableMap[Array[Byte], Array[Byte]] = kv._2.getFamilyMap(Bytes.toBytes("i"))
        val keyIter: util.Iterator[Array[Byte]] = familyMap.keySet().iterator()
        while (keyIter.hasNext) {
          val key: Array[Byte] = keyIter.next()
          val keyStr: String = Bytes.toString(key)
          println(sha1, keyStr) // debug output
          if (keyStr.contains("time") || keyStr == "task_id" || keyStr == "id") {
            // special handling for Phoenix-encoded BIGINTs (Phoenix flips the sign bit)
            val valueStr: Long = Bytes.toLong(familyMap.get(key)) - 9223372036854775807L - 1
            jsonObj.put(keyStr, valueStr)
          } else {
            val valueStr: String = Bytes.toString(familyMap.get(key))
            jsonObj.put(keyStr, valueStr)
          }
        }
        arrayList.add(JSON.parseObject(jsonObj.toJSONString, classOf[TmpSampleInfo]))
      })
      arrayList.asScala.iterator
    }).toDF()

    df.show(false)

    /****
     * business logic ...
     **/

    session.close()
  }

  /**
   * @Author: lzx
   * @Description: filter: where task_type is not null AND id is not null
   * @return: org.apache.hadoop.hbase.client.Scan
   */
  def getScan: Scan = {
    // create the Scan
    val scan = new Scan()
    val filterList = new FilterList()
    val singleColumnValueFilter1 = new SingleColumnValueFilter(
      Bytes.toBytes("i"),
      Bytes.toBytes("task_type"),
      CompareOperator.NOT_EQUAL,
      new NullComparator // or "".getBytes()
    )
    val singleColumnValueFilter2 = new SingleColumnValueFilter(
      Bytes.toBytes("i"),
      Bytes.toBytes("id"),
      CompareOperator.NOT_EQUAL,
      new NullComparator
    )
    // if true, the whole row is skipped when the column is not found
    singleColumnValueFilter2.setFilterIfMissing(true)
    singleColumnValueFilter1.setFilterIfMissing(true)
    // add the filters
    filterList.addFilter(singleColumnValueFilter2)
    filterList.addFilter(singleColumnValueFilter1)
    // attach the filter list and the row-key range to the Scan
    scan.setFilter(filterList)
    scan.withStartRow(Bytes.toBytes("7d32875"))
    scan.withStopRow(Bytes.toBytes("89a038d4"))
    scan
  }
}
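A design note on getScan: new FilterList() uses the default operator MUST_PASS_ALL, so the two column filters are AND-ed, which matches the intended "task_type is not null AND id is not null" predicate. If OR semantics were needed instead, the operator would be passed explicitly; a minimal sketch reusing the same column family and qualifiers:
import org.apache.hadoop.hbase.CompareOperator
import org.apache.hadoop.hbase.filter.{Filter, FilterList, NullComparator, SingleColumnValueFilter}
import org.apache.hadoop.hbase.util.Bytes

object OrFilterSketch {
  // a row passes if at least one of the two columns is non-null (OR instead of the default AND)
  def orFilter: Filter = {
    val orList = new FilterList(FilterList.Operator.MUST_PASS_ONE)
    orList.addFilter(new SingleColumnValueFilter(
      Bytes.toBytes("i"), Bytes.toBytes("task_type"), CompareOperator.NOT_EQUAL, new NullComparator))
    orList.addFilter(new SingleColumnValueFilter(
      Bytes.toBytes("i"), Bytes.toBytes("id"), CompareOperator.NOT_EQUAL, new NullComparator))
    orList
  }
}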