hbase建表语句
create 'test',{NAME=>'CF_1',COMPRESSION=>'SNAPPY'},{NUMREGIONS => 30, SPLITALGO => 'HexStringSplit'}
bulk load 数据到上面建的表
1、自定义spark的partitioner函数,使得rdd的分区和hbase hfile要求的region分区一致
/**
 * Spark Partitioner that mirrors the HBase HexStringSplit region boundaries,
 * so each RDD partition maps 1:1 onto one pre-split region of the target
 * table (required for clean HFile bulk loading).
 *
 * Keys are expected to be (rowkey, qualifier) tuples whose rowkey begins
 * with an 8-char hex prefix matching HexStringSplit's split-point format.
 *
 * @param partitions number of partitions; must equal the table's region count
 */
class MyPartitioner(partitions: Int) extends Partitioner {
  override def numPartitions: Int = partitions

  // Region split points produced by HexStringSplit: partitions - 1 boundary
  // strings. Region 0 covers (-inf, splits(0)); region i covers
  // [splits(i-1), splits(i)); the last region covers [splits(last), +inf).
  val splits = new HexStringSplit().split(partitions).map(s => Bytes.toString(s))

  /**
   * Map a (rowkey, qualifier) key to the index of the region whose range
   * contains its 8-char rowkey prefix: the index of the first split point
   * strictly greater than the prefix, or the last region when none is.
   *
   * FIX(review): the previous version incremented `i` after finding the
   * boundary and returned foundIndex + 1, shifting every record (except
   * those past the last split) one region too far — partition 0 never
   * received data and HFiles straddled region boundaries.
   */
  def getPartitionNum(splits: Array[String], key: Any): Int = {
    // take(8) tolerates rowkeys shorter than 8 chars (substring(0, 8) threw).
    val prefix = key.asInstanceOf[(String, String)]._1.take(8)
    val idx = splits.indexWhere(prefix < _)
    if (idx < 0) splits.length else idx // beyond the last split -> last region
  }

  override def getPartition(key: Any): Int = key match {
    case null => 0
    case _    => getPartitionNum(splits, key)
  }
}
对spark产生的rdd进行重新分区
// Repartition and sort so that every output partition is (a) aligned with one
// target region's key range and (b) internally sorted by the (rowkey, cq)
// tuple key — both preconditions for writing HFiles that bulk-load cleanly.
// NOTE(review): `partition` is defined elsewhere; it must equal the table's
// region count (30 per the create statement above) — confirm at the caller.
val saltedRDD = result.repartitionAndSortWithinPartitions(new MyPartitioner(partition))
//构造hfile
// Build the (ImmutableBytesWritable, KeyValue) pairs consumed by
// HFileOutputFormat2. Input records are ((rowkey, qualifier), value) tuples
// already sorted by repartitionAndSortWithinPartitions above.
val rdd = saltedRDD.map(r => {
  val rowkey = r._1._1
  val cq = r._1._2
  val value = r._2
  // FIX(review): encode every component via Bytes.toBytes (always UTF-8).
  // The original mixed Bytes.toBytes(rowkey) with String.getBytes(), which
  // uses the JVM default charset and corrupts non-ASCII data on non-UTF-8
  // platforms inside the same KeyValue.
  val kv: KeyValue = new KeyValue(Bytes.toBytes(rowkey), Bytes.toBytes("CF_1"), Bytes.toBytes(cq), Bytes.toBytes(value))
  // The writable key appends the qualifier to keep the (rowkey, cq) sort
  // order visible to the writer, while the KeyValue's row is rowkey only.
  // NOTE(review): assumes the HFile writer derives the row from the cell,
  // not from this key — confirm against the HFileOutputFormat2 version used.
  (new ImmutableBytesWritable(Bytes.add(Bytes.toBytes(rowkey), Bytes.toBytes(cq))), kv)
})
ps:其他的bulkload hfile代码就不上传了