@羲凡 -- Just to live a better life
Spark Read/Write HBase (Scala) -- 2. Batch Operations
I. Prerequisites
1. Create the namespace and table
create_namespace 'testdata'
create 'testdata:hb_staff','info'
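If you would rather create the namespace and table from code instead of the HBase shell, the snippet below is a minimal sketch using the HBase Admin API. It is not part of the original example; the ZooKeeper quorum and client port are the same assumed values used in the main program, and the 1.x-style HTableDescriptor/HColumnDescriptor match the client code further down.
import org.apache.hadoop.hbase.{HBaseConfiguration, HColumnDescriptor, HTableDescriptor, NamespaceDescriptor, TableName}
import org.apache.hadoop.hbase.client.ConnectionFactory
object CreateHbStaff {
  def main(args: Array[String]): Unit = {
    val conf = HBaseConfiguration.create()
    conf.set("hbase.zookeeper.quorum", "deptest33,deptest34,deptest75") // assumed quorum, same as below
    conf.set("hbase.zookeeper.property.clientPort", "2181")
    val conn = ConnectionFactory.createConnection(conf)
    val admin = conn.getAdmin
    // create the namespace if it does not exist yet
    if (!admin.listNamespaceDescriptors().exists(_.getName == "testdata")) {
      admin.createNamespace(NamespaceDescriptor.create("testdata").build())
    }
    // create the table with a single column family "info"
    val tableName = TableName.valueOf("testdata:hb_staff")
    if (!admin.tableExists(tableName)) {
      admin.createTable(new HTableDescriptor(tableName).addFamily(new HColumnDescriptor("info")))
    }
    admin.close()
    conn.close()
  }
}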
2. Add the following dependencies to pom.xml (the ${hbase.version} property must be defined in your POM's <properties>)
<dependency>
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-client</artifactId>
<version>${hbase.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-server</artifactId>
<version>${hbase.version}</version>
</dependency>
II. Straight to the code
import java.net.URI
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.hbase._
import org.apache.hadoop.hbase.client._
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapred.TableOutputFormat
import org.apache.hadoop.hbase.mapreduce.{HFileOutputFormat2,LoadIncrementalHFiles}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.mapred.JobConf
import org.apache.log4j.{Level, Logger}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
object HbaseDemo2 {
Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
lazy val logger:Logger = Logger.getLogger(this.getClass().getName())
val hbaseQuorum = "deptest33,deptest34,deptest75"
val hbaseClientPort = "2181"
// Get the HBase configuration
def getHbaseConf(): Configuration = {
var hbaseConf: Configuration = null
try {
hbaseConf = HBaseConfiguration.create()
hbaseConf.set("hbase.zookeeper.quorum", hbaseQuorum)
hbaseConf.set("hbase.zookeeper.property.clientPort", hbaseClientPort)
} catch {
case e: Exception => logger.error("==========连接hbase失败:," + e)
}
hbaseConf
}
// Get a Table object
def getTable(tableName: String): Table = {
var table: Table = null
try {
val hbaseConf = getHbaseConf()
val conn = ConnectionFactory.createConnection(hbaseConf)
table = conn.getTable(TableName.valueOf(tableName))
} catch {
case e: Exception => logger.error("==========获取Table对象失败:" + e)
}
table
}
// Batch insert or update a small amount of data into HBase (one Put per row via TableOutputFormat)
def insertSmallData(tableName:String,rdd:RDD[(String, String, String)])={
val jobConf : JobConf = new JobConf(getHbaseConf())
jobConf.setOutputFormat(classOf[TableOutputFormat])
jobConf.set(TableOutputFormat.OUTPUT_TABLE, tableName)
try {
rdd.map(t=>{
val put = new Put(Bytes.toBytes(t._1))
put.addColumn("info".getBytes, "name".getBytes, t._2.getBytes)
put.addColumn("info".getBytes, "age".getBytes, t._3.getBytes)
put.setDurability(Durability.ASYNC_WAL)
(new ImmutableBytesWritable(),put)
}).saveAsHadoopDataset(jobConf)
} catch {
case e:Exception =>println("小数据批量插入失败:"+e)
}
}
// Bulk insert or update a large amount of data into HBase (write HFiles, then bulk load them)
def insertBigData(tableName:String,rdd:RDD[(ImmutableBytesWritable,KeyValue)])={
val conf = getHbaseConf()
conf.set(TableOutputFormat.OUTPUT_TABLE, tableName)
val load = new LoadIncrementalHFiles(conf)
val hdfsFile="/testdata/tmp/bulkInsertHbase"
val path = new Path(hdfsFile)
val fileSystem = FileSystem.get(URI.create(hdfsFile), new Configuration())
if(fileSystem.exists(path)){
fileSystem.delete(new Path(hdfsFile),true)
}
rdd.saveAsNewAPIHadoopFile(hdfsFile,
classOf[ImmutableBytesWritable],
classOf[KeyValue],
classOf[HFileOutputFormat2],conf)
//Thread.sleep(20000)
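// doBulkLoad moves the generated HFiles from the staging directory straight into the table's region directories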
load.doBulkLoad(new Path(hdfsFile),new HTable(conf,tableName))
}
// Batch scan of HBase by rowkey range
def scanData(tableName:String,startRowKey:String,stopRowKey:String)={
val table = getTable(tableName)
val scan = new Scan().withStartRow(Bytes.toBytes(startRowKey))
.withStopRow(Bytes.toBytes(stopRowKey))
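// withStartRow is inclusive and withStopRow is exclusive by default, so scanning "1" to "7" covers rows 1 through 6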
val rs = table.getScanner(scan)
try {
val resultScan = rs.iterator()
while(resultScan.hasNext){
val result = resultScan.next().rawCells()
for(i <- 0.until(result.length)){
val family = Bytes.toString(CellUtil.cloneFamily(result(i)))
val rowKey = Bytes.toString(CellUtil.cloneRow(result(i)))
val column = Bytes.toString(CellUtil.cloneQualifier(result(i)))
val value = Bytes.toString(CellUtil.cloneValue(result(i)))
println(s"$family:$rowKey,$column:$value")
}
}
} catch {
case e:Exception =>println("批量查询操作失败:"+e)
} finally {
rs.close()
table.close()
}
}
def main(args: Array[String]): Unit = {
val spark = SparkSession.builder()
.appName("HbaseDemo")
.master("local[*]")
.getOrCreate()
val rdd = spark.sparkContext.makeRDD(Array(
"1,曹操,101","2,张辽,102","3,夏侯惇,103"
)).map(t=>{
val arr = t.split(",")
(arr(0),arr(1),arr(2))
})
insertSmallData("testdata:hb_staff",rdd)
val rdd2: RDD[(ImmutableBytesWritable, KeyValue)] = spark.sparkContext.makeRDD(Array(
"4,刘备,104","5,关羽,105","6,张飞,106")).map(t=>{
var kvList : Seq[KeyValue] = List()
val arr = t.split(",")
val kvName=new KeyValue(arr(0).getBytes,"info".getBytes,"name".getBytes,arr(1).getBytes)
val kvAge=new KeyValue(arr(0).getBytes,"info".getBytes,"age".getBytes,arr(2).getBytes)
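// HFileOutputFormat2 expects the cells of a row in column (family:qualifier) order, so "age" is appended before "name"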
kvList = kvList :+ kvAge
kvList = kvList :+ kvName
(new ImmutableBytesWritable(arr(0).getBytes),kvList)
}).flatMapValues(_.iterator)
insertBigData("testdata:hb_staff",rdd2)
scanData("testdata:hb_staff","1","7")
println("=============Succeed=================")
}
}
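One caveat worth calling out for the bulk-load path: HFileOutputFormat2 expects cells in total order, first by rowkey, then by family and qualifier. The sample data above happens to be ordered already; for unordered input you would sort the RDD first. Below is a minimal sketch reusing the names from the listing above (same assumed "rowkey,name,age" input format as in main):
// Sketch only: sort unordered input by rowkey before writing HFiles
val unsorted = spark.sparkContext.makeRDD(Array("6,张飞,106", "4,刘备,104", "5,关羽,105"))
val sortedRdd: RDD[(ImmutableBytesWritable, KeyValue)] = unsorted
  .map(_.split(","))
  .sortBy(arr => arr(0)) // order rows by rowkey
  .flatMap(arr => {
    val rowKey = arr(0).getBytes
    Seq(
      (new ImmutableBytesWritable(rowKey), new KeyValue(rowKey, "info".getBytes, "age".getBytes, arr(2).getBytes)),
      (new ImmutableBytesWritable(rowKey), new KeyValue(rowKey, "info".getBytes, "name".getBytes, arr(1).getBytes)))
  })
insertBigData("testdata:hb_staff", sortedRdd)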
III. Results
info:1,age:101
info:1,name:曹操
info:2,age:102
info:2,name:张辽
info:3,age:103
info:3,name:夏侯惇
info:4,age:104
info:4,name:刘备
info:5,age:105
info:5,name:关羽
info:6,age:106
info:6,name:张飞
=============Succeed=================
====================================================================
@羲凡 -- Just to live a better life
If you have any questions about this post, feel free to leave a comment.