Spark Interacting with HBase
Spark can read and write data in HBase tables. Under the hood it uses TableInputFormat and TableOutputFormat, exactly the same InputFormat and OutputFormat classes used in the MapReduce-HBase integration.
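Both examples below assume the Spark core and HBase client jars are on the classpath. A minimal sbt sketch, assuming HBase 1.x, where TableInputFormat/TableOutputFormat ship in hbase-server (in HBase 2.x they moved to the hbase-mapreduce artifact); all version numbers are assumptions and should match your cluster:
libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-core"   % "2.4.8", // assumed Spark version
  "org.apache.hbase" %  "hbase-client" % "1.2.0", // assumed HBase version
  "org.apache.hbase" %  "hbase-server" % "1.2.0"  // provides TableInput/OutputFormat on HBase 1.x
)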
1. Writing Data
package com.yyds.tags.hbase.write
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.TableOutputFormat
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
/**
 * @DESC: save user-profile tag data to an HBase table
 */
object HBaseWriteTest {
def main(args: Array[String]): Unit = {
// a. Build the SparkContext instance
val sparkConf = new SparkConf()
.setAppName("SparkHBaseWrite")
.setMaster("local[4]")
.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
.registerKryoClasses(Array(classOf[ImmutableBytesWritable], classOf[Put])) // register the classes serialized with Kryo; ideally register the types carried in RDDs
val sc: SparkContext = new SparkContext(sparkConf)
// b. Mock dataset
val tagsRDD: RDD[(String, String)] = sc.parallelize(
List(("1001", "gender:男,job:教师"),
("1002", "gender:女,job:工人"),
("1003", "gender:男,job:学生"),
("1004", "gender:男,job:工人")
),
numSlices = 2
)
// TODO: save the RDD to an HBase table; the RDD elements must be pairs of Key: ImmutableBytesWritable, Value: Put
/*
HBase table: htb_tags
RowKey: userId
CF: user
Column: tagName
create 'htb_tags', 'user'
*/
val datasRDD: RDD[(ImmutableBytesWritable, Put)] =
tagsRDD.map { case (userId, tags) =>
// a. Build the RowKey
val rowKey: Array[Byte] = Bytes.toBytes(userId)
// b. Build the Put object
val put = new Put(rowKey)
// set the columns
put.addColumn(
Bytes.toBytes("user"),
Bytes.toBytes("userId"),
Bytes.toBytes(userId)
)
tags.split(",").foreach { tag =>
val Array(field, value) = tag.split(":")
put.addColumn(
Bytes.toBytes("user"),
Bytes.toBytes(field),
Bytes.toBytes(value)
)
}
(new ImmutableBytesWritable(rowKey), put)
}
// 1. Set the ZooKeeper connection info that the HBase client depends on
val conf: Configuration = HBaseConfiguration.create()
conf.set("hbase.zookeeper.quorum","192.168.42.7")
conf.set("hbase.zookeeper.property.clientPort","2181")
conf.set("zookeeper.znode.parent","/hbase")
// 2. Name of the table the data is written to
conf.set(TableOutputFormat.OUTPUT_TABLE, "htb_tags")
// 3. Save the RDD via TableOutputFormat; the data goes through the HBase client,
//    so the output path below is only a placeholder required by the API
datasRDD.saveAsNewAPIHadoopFile(
  s"datas/hbase/output-${System.nanoTime()}",
  classOf[ImmutableBytesWritable],
  classOf[Put],
  classOf[TableOutputFormat[ImmutableBytesWritable]],
  conf
)
// application finished, release resources
sc.stop()
}
}
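After the job runs, the rows can be checked from the HBase shell with scan 'htb_tags'. An equivalent way to write, without the placeholder path, is saveAsNewAPIHadoopDataset, which reads everything from the Configuration. A minimal sketch, reusing the conf and datasRDD built above and carrying the output-format settings through a Hadoop Job:
import org.apache.hadoop.mapreduce.Job

val job = Job.getInstance(conf)
// declare the output format and key/value types in the job configuration
job.setOutputFormatClass(classOf[TableOutputFormat[ImmutableBytesWritable]])
job.setOutputKeyClass(classOf[ImmutableBytesWritable])
job.setOutputValueClass(classOf[Put])
datasRDD.saveAsNewAPIHadoopDataset(job.getConfiguration)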
2. Reading Data
package com.yyds.tags.hbase.read
import org.apache.hadoop.hbase.{CellUtil, HBaseConfiguration}
import org.apache.hadoop.hbase.client.Result
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
/**
 * Demo: how Spark Core reads data from an HBase table
 */
object HBaseReadTest {
def main(args: Array[String]): Unit = {
// Create the SparkContext instance
val sparkConf = new SparkConf()
.setMaster("local[4]")
.setAppName("HBaseReadTest")
// use Kryo serialization
.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
// register the classes serialized with Kryo; ideally register the types carried in RDDs
.registerKryoClasses(Array(classOf[ImmutableBytesWritable], classOf[Result]))
val sc: SparkContext = SparkContext.getOrCreate(sparkConf)
// Read data
/*
def newAPIHadoopRDD[K, V, F <: NewInputFormat[K, V]](
  conf: Configuration = hadoopConfiguration,
  fClass: Class[F], kClass: Class[K], vClass: Class[V]
): RDD[(K, V)]
*/
// 1. Create the HBase client configuration (mainly the ZooKeeper quorum and port)
val conf = HBaseConfiguration.create()
conf.set("hbase.zookeeper.quorum", "192.168.42.7")
conf.set("hbase.zookeeper.property.clientPort", "2181")
conf.set("zookeeper.znode.parent", "/hbase")
// 2. Set the table name to read
conf.set(TableInputFormat.INPUT_TABLE, "tbl_users")
// 3. Load data from the HBase table
val hbaseRDD: RDD[(ImmutableBytesWritable, Result)] =
sc.newAPIHadoopRDD(conf,
classOf[TableInputFormat],
classOf[ImmutableBytesWritable],
classOf[Result])
println(s"count = ${hbaseRDD.count()}")
hbaseRDD.take(2).foreach { case (_, result) =>
  println(s"RowKey = ${Bytes.toString(result.getRow)}")
  for (cell <- result.rawCells()) {
    // column family (CF)
    val cf = Bytes.toString(CellUtil.cloneFamily(cell))
    // column qualifier
    val column = Bytes.toString(CellUtil.cloneQualifier(cell))
    // cell value
    val value = Bytes.toString(CellUtil.cloneValue(cell))
    println(s"\t ${cf}:${column} = ${value}, version -> ${cell.getTimestamp}")
  }
}
// application finished, release resources
sc.stop()
}
}
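By default TableInputFormat scans the whole table. The scan can be narrowed before calling newAPIHadoopRDD through string-valued properties that TableInputFormat reads from the Configuration. A minimal sketch; the column family "detail" and the row-key range are assumptions chosen for illustration:
// restrict the scan to one column family and a row-key range
conf.set(TableInputFormat.SCAN_COLUMN_FAMILY, "detail") // assumed CF name
conf.set(TableInputFormat.SCAN_ROW_START, "100")        // start row key (inclusive)
conf.set(TableInputFormat.SCAN_ROW_STOP, "200")         // stop row key (exclusive)
conf.set(TableInputFormat.SCAN_CACHEDROWS, "500")       // rows fetched per scanner RPC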