Requirement:
We need to keep only the 10 most recent records per key, which would normally mean a window operation over the whole table. Loading the entire table just to run that window might not fit in memory, so instead I created a multi-version HBase table and leaned on HBase's versioning to solve the requirement. Writing the data was straightforward, but how do you read those versions back in Spark? The table DDL is shown below, followed by a sketch of how rows are written.
- Table creation statement
create 'USDP:HASHSIG_SHA1_TEST', {NAME => 'i', VERSIONS => 10}
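With VERSIONS => 10, HBase keeps up to the 10 most recent cell versions per column, so simply re-writing the same rowkey/column accumulates the history we need. A minimal write sketch, assuming the HBase 2.x client API and a hypothetical rowkey and ZooKeeper quorum (adjust to your environment):

import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.hadoop.hbase.client.{ConnectionFactory, Put}
import org.apache.hadoop.hbase.util.Bytes

val conf = HBaseConfiguration.create()
conf.set("hbase.zookeeper.quorum", "zk-host:2181") // hypothetical quorum
val connection = ConnectionFactory.createConnection(conf)
val table = connection.getTable(TableName.valueOf("USDP:HASHSIG_SHA1_TEST"))
try {
  // Each put to the same rowkey/column creates a new cell version;
  // versions beyond 10 are dropped thanks to the table's VERSIONS setting.
  val put = new Put(Bytes.toBytes("some-hashsig-rowkey")) // hypothetical rowkey
  put.addColumn(Bytes.toBytes("i"), Bytes.toBytes("sha1"), Bytes.toBytes("some-sha1-value"))
  put.addColumn(Bytes.toBytes("i"), Bytes.toBytes("addtime"), Bytes.toBytes(System.currentTimeMillis()))
  table.put(put)
} finally {
  table.close()
  connection.close()
}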
Core logic
- scan
// Configure the Scan to read multiple versions
def getScan: Scan = {
  val scan = new Scan()
  scan.readVersions(10) // returns the Scan itself, so this is the method's result
}
// Put the serialized Scan into hconf; it takes effect when the RDD is constructed
hconf.set(org.apache.hadoop.hbase.mapreduce.TableInputFormat.SCAN, TableMapReduceUtil.convertScanToString(getScan))
- convertScanToString
// Writes the given Scan out as a Base64-encoded string (source of TableMapReduceUtil.convertScanToString)
public static String convertScanToString(Scan scan) throws IOException {
  ClientProtos.Scan proto = ProtobufUtil.toScan(scan);
  return Bytes.toString(Base64.getEncoder().encode(proto.toByteArray()));
}
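As an alternative to serializing a Scan yourself, TableInputFormat can also build the Scan internally from plain configuration keys. A minimal sketch, assuming the same hconf as in the example below (the Scan-based approach stays more flexible if you later need filters or time ranges):

// Let TableInputFormat construct the Scan from individual config keys
hconf.set(TableInputFormat.SCAN_COLUMN_FAMILY, "i")  // restrict the scan to the 'i' family
hconf.set(TableInputFormat.SCAN_MAXVERSIONS, "10")   // read up to 10 versions per cell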
- getColumnCells
val sha1Cells = kv._2.getColumnCells(Bytes.toBytes("i"), Bytes.toBytes("sha1")).asScala
val addTimeCells = kv._2.getColumnCells(Bytes.toBytes("i"), Bytes.toBytes("addtime")).asScala
// Zip the two columns together by index to build one row per version
sha1Cells.zip(addTimeCells).foreach(t2 => {
  list.add((hashsigRk, Bytes.toString(CellUtil.cloneValue(t2._1)), Bytes.toLong(CellUtil.cloneValue(t2._2))))
})
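One caveat: getColumnCells returns versions newest-first, and pairing by index assumes sha1 and addtime are always written together in the same Put; if their version counts ever diverge, zip will silently drop or mis-pair cells. A defensive variant of the loop body above (a sketch reusing the names from the snippet, with log assumed to be the same logger used later in the example):

// Warn when the two columns have a different number of versions, then pair index by index
if (sha1Cells.size != addTimeCells.size) {
  log.warn(s"version count mismatch for $hashsigRk: sha1=${sha1Cells.size}, addtime=${addTimeCells.size}")
}
sha1Cells.zip(addTimeCells).foreach { case (sha1Cell, addTimeCell) =>
  list.add((hashsigRk,
    Bytes.toString(CellUtil.cloneValue(sha1Cell)),
    Bytes.toLong(CellUtil.cloneValue(addTimeCell))))
}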
Example code
// Imports needed by the example
import java.util

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.hbase.{Cell, CellUtil, HBaseConfiguration}
import org.apache.hadoop.hbase.client.{Result, Scan}
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.{TableInputFormat, TableMapReduceUtil}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, SparkSession}

import scala.collection.JavaConverters._
import scala.collection.mutable

// Build a Scan that reads up to 10 versions per cell
def getScan: Scan = {
  val scan = new Scan()
  scan.readVersions(10)
}

def main(args: Array[String]): Unit = {
  val session: SparkSession = SparkSession.builder().appName(this.getClass.getSimpleName)
    .master("local[*]")
    .getOrCreate()
  val hconf: Configuration = HBaseConfiguration.create()
  val TABLE_NAME: String = "USDP:HASHSIG_SHA1_10"
  // ZOOKEEPER_QUORUM, ZOOKEEPER_QUORUM_CLIENT, HBASE_MASTER and log are defined elsewhere in the project
  hconf.set("hbase.zookeeper.quorum", ZOOKEEPER_QUORUM)
  hconf.set("hbase.zookeeper.property.clientPort", ZOOKEEPER_QUORUM_CLIENT)
  hconf.set("hbase.master", HBASE_MASTER)
  hconf.set(TableInputFormat.INPUT_TABLE, TABLE_NAME)
  hconf.set(TableInputFormat.SCAN, TableMapReduceUtil.convertScanToString(getScan))
  val hbaseRdd: RDD[(ImmutableBytesWritable, Result)] = session.sparkContext.newAPIHadoopRDD(hconf, classOf[TableInputFormat], classOf[ImmutableBytesWritable], classOf[Result])
  val mapRdd: RDD[(String, String, Long)] = hbaseRdd.mapPartitions(iter => {
    val list = new util.ArrayList[(String, String, Long)]()
    iter.foreach(kv => {
      val hashsigRk: String = Bytes.toString(kv._1.get())
      val sha1Cells = kv._2.getColumnCells(Bytes.toBytes("i"), Bytes.toBytes("sha1")).asScala
      val addTimeCells: mutable.Seq[Cell] = kv._2.getColumnCells(Bytes.toBytes("i"), Bytes.toBytes("addtime")).asScala
      // Zip the two columns by index: one output row per version
      sha1Cells.zip(addTimeCells).foreach(t2 => {
        list.add((hashsigRk, Bytes.toString(CellUtil.cloneValue(t2._1)), Bytes.toLong(CellUtil.cloneValue(t2._2))))
      })
    })
    list.iterator().asScala
  })
  println(mapRdd.count())
  log.info("Finished reading from HBase!")
  // Convert the RDD to a DataFrame
  import session.implicits._
  val dataFrame: DataFrame = mapRdd.toDF("hashsig", "sha1", "addtime")
  dataFrame.show()
  session.close()
}
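For reference, TableInputFormat and TableMapReduceUtil ship in the hbase-mapreduce module in HBase 2.x (in earlier releases they lived in hbase-server). A rough sbt dependency sketch, with placeholder versions to adjust to your cluster:

// build.sbt sketch -- versions below are illustrative placeholders
libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-sql"       % "2.4.8" % Provided,
  "org.apache.hbase" %  "hbase-client"    % "2.1.0",
  "org.apache.hbase" %  "hbase-mapreduce" % "2.1.0"
)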