Spark读取HBase示例
Spark读取HBase需要理清HBase的配置,这里给出一个实际的示例。
Spark读取HBase时要注意单次读取的记录数量:需要参考HBase机器的QPS和业务的并发使用情况,设置一次最多读取多少条记录比较合适;否则在业务并发较大时,很可能会把HBase集群压垮。
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.{Get, HTable, Result}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import scala.collection.mutable
/**
 * Example of reading values from HBase inside a Spark job.
 *
 * Row keys arrive as an `RDD[String]`; each partition opens one `HTable`,
 * issues multi-gets of at most `hbaseBatchNum` rows at a time and closes the
 * table before handing its results back to Spark.
 */
object HbaseRead {
  // Local imports keep this object self-contained (no change to the file's import block).
  import scala.collection.JavaConverters._
  import scala.util.control.NonFatal

  val tblName = "sample_tbl_name"
  val ZookeeperQuorum = "ss-11,ss-12,ss-13,ss-14,ss-15"
  val ZookeeperPropertyClientPort = "1234"
  val zookeeperZnodeParent = "/hbase_data_parent"
  val ClientRetriesNumber = "3"
  val ClientIpcPoolSize = "3"

  // Column read for every row key.
  private val ColumnFamily = "myRowFamily"
  private val ColumnQualifier = "myvalue"

  /**
   * Looks up one column for every row key in `keys`.
   *
   * @param sc            Spark context, used to broadcast the table name and batch size
   * @param keys          HBase row keys to look up
   * @param hbaseBatchNum maximum number of Gets sent to HBase in a single multi-get; must be positive
   * @return `(rowKey, value)` pairs; `value` is `Bytes.toString` applied to the bytes returned by
   *         `Result.getValue` (presumably `null` when the cell does not exist -- confirm against
   *         the HBase client version in use)
   * @throws IllegalArgumentException if `hbaseBatchNum` is not positive
   */
  def readFromHbase(sc: SparkContext, keys: RDD[String], hbaseBatchNum: Int): RDD[(String, String)] = {
    // A zero batch size would otherwise surface as an ArithmeticException/IllegalArgumentException
    // on the executors; fail fast on the driver instead.
    require(hbaseBatchNum > 0, s"hbaseBatchNum must be positive, got $hbaseBatchNum")

    val hbaseBatchNumBc = sc.broadcast(hbaseBatchNum)
    // Author's note: without broadcasting, `new HTable(conf, tblName)` could not read the table name.
    val hbaseTblNameBc = sc.broadcast(tblName)

    keys.mapPartitions { partition =>
      if (partition.isEmpty) {
        // Do not open an HBase connection for partitions that have nothing to read.
        Iterator.empty
      } else {
        val conf = HBaseConfiguration.create()
        conf.set("hbase.zookeeper.quorum", ZookeeperQuorum)
        conf.set("hbase.zookeeper.property.clientPort", ZookeeperPropertyClientPort)
        conf.set("zookeeper.znode.parent", zookeeperZnodeParent)
        conf.set("hbase.client.retries.number", ClientRetriesNumber)
        conf.set("hbase.client.ipc.pool.size", ClientIpcPoolSize)

        val table = new HTable(conf, hbaseTblNameBc.value)
        try {
          val family = Bytes.toBytes(ColumnFamily)
          val qualifier = Bytes.toBytes(ColumnQualifier)
          // `.toList` forces every lookup to run now: the table is closed in `finally`,
          // so nothing lazy may escape this block.
          val rows = partition
            .grouped(hbaseBatchNumBc.value)
            .flatMap(batch => fetchBatch(table, batch, family, qualifier))
            .toList
          rows.iterator
        } finally {
          table.close()
        }
      }
    }
  }

  /**
   * Runs one multi-get for `batchKeys` and pairs each key with its column value.
   * Keys for which a `Get` cannot be built are printed and skipped.
   */
  private def fetchBatch(
      table: HTable,
      batchKeys: Seq[String],
      family: Array[Byte],
      qualifier: Array[Byte]): Seq[(String, String)] = {
    val keyedGets = batchKeys.flatMap { key =>
      try {
        val get = new Get(Bytes.toBytes(key))
        get.addColumn(family, qualifier)
        Some((key, get))
      } catch {
        // Best effort: a bad key is reported and dropped, the rest of the batch proceeds.
        case NonFatal(ex) =>
          println(ex)
          None
      }
    }

    if (keyedGets.isEmpty) {
      Seq.empty
    } else {
      // HTable.get expects a java.util.List; results are paired with keys by position.
      val results = table.get(keyedGets.map(_._2).asJava)
      keyedGets.zip(results).map { case ((key, _), res) =>
        (key, Bytes.toString(res.getValue(family, qualifier)))
      }
    }
  }
}