Reference:
"SparkSQL program in Scala that reads an HBase table, registers it as a table, and queries it with SQL"
https://blog.csdn.net/qq_21383435/article/details/77328805
Approach 1:
Use the newAPIHadoopRDD method, which exposes any Hadoop InputFormat (here HBase's TableInputFormat) to Spark as an RDD of key/value pairs.
package com.zhbr.mainClass

import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.{Result, Scan}
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.hadoop.hbase.protobuf.ProtobufUtil
import org.apache.hadoop.hbase.util.{Base64, Bytes}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Row, RowFactory, SparkSession}
import org.apache.spark.sql.types.{DataTypes, StructField}
import scala.collection.mutable.ListBuffer

// case class used by DataFrame conversion (2) below
case class dlzzBean(P1: Double, P2: Double)
object Test {

  def main(args: Array[String]): Unit = {

    // create the SparkSession
    val sparkSession = SparkSession.builder().appName(this.getClass.getSimpleName.filter(!_.equals('$'))).getOrCreate()

    // get the SparkContext
    val sparkContext = sparkSession.sparkContext

    // set the log level
    sparkContext.setLogLevel("WARN")

    // the HBase table to read
    val tableName = "DWD_AMR_GS_METER-E-CURVE_201902"

    // create the HBase configuration
    val hbaseConf = HBaseConfiguration.create()
    // set the ZooKeeper quorum addresses
    hbaseConf.set("hbase.zookeeper.quorum", "10.213.111.XXX,10.213.111.XXX,10.213.111.XXX")
    // set the ZooKeeper client port
    hbaseConf.set("hbase.zookeeper.property.clientPort", "2181")
    // set the table to read
    hbaseConf.set(TableInputFormat.INPUT_TABLE, tableName)

    // build the Scan, restricted to the two columns we need
    val scan = new Scan()
    scan.addColumn(Bytes.toBytes("DATA"), Bytes.toBytes("meterID-1"))
    scan.addColumn(Bytes.toBytes("DATA"), Bytes.toBytes("meterID-2"))

    // serialize the Scan to a Base64 string so TableInputFormat can pick it up from the configuration
    val proto = ProtobufUtil.toScan(scan)
    val scanToString = Base64.encodeBytes(proto.toByteArray())
    hbaseConf.set(TableInputFormat.SCAN, scanToString)

    // read the table as an RDD of (ImmutableBytesWritable, Result) pairs
    val hbaseRDD = sparkContext.newAPIHadoopRDD(hbaseConf,
      classOf[TableInputFormat],
      classOf[ImmutableBytesWritable],
      classOf[Result])
    // The three blocks below are alternative ways to turn hbaseRDD into a DataFrame; use whichever one fits.

    //-------------------------- DataFrame conversion (1): Row objects plus an explicit schema ------------------------------------------------
    val rowRDD: RDD[Row] = hbaseRDD.map(result => {
      // cell values arrive as Array[Byte]; they were stored as strings, so decode before parsing
      val P1 = Bytes.toString(result._2.getValue(Bytes.toBytes("DATA"), Bytes.toBytes("meterID-1")))
      val P2 = Bytes.toString(result._2.getValue(Bytes.toBytes("DATA"), Bytes.toBytes("meterID-2")))
      RowFactory.create(java.lang.Double.valueOf(P1), java.lang.Double.valueOf(P2))
    })
    // build the schema and create the DataFrame
    val structFields = new ListBuffer[StructField]()
    structFields.append(DataTypes.createStructField("meterID-1", DataTypes.DoubleType, true))
    structFields.append(DataTypes.createStructField("meterID-2", DataTypes.DoubleType, true))
    val schema = DataTypes.createStructType(structFields.toArray)
    val dataDF1 = sparkSession.createDataFrame(rowRDD, schema)

    //-------------------------- DataFrame conversion (2): map to the dlzzBean case class ------------------------------------------------
    val beanRDD: RDD[dlzzBean] = hbaseRDD.map(result => {
      val P1 = Bytes.toString(result._2.getValue(Bytes.toBytes("DATA"), Bytes.toBytes("meterID-1")))
      val P2 = Bytes.toString(result._2.getValue(Bytes.toBytes("DATA"), Bytes.toBytes("meterID-2")))
      dlzzBean(P1.toDouble, P2.toDouble)
    })
    val dataDF2 = sparkSession.createDataFrame(beanRDD)
    //-------------------------- DataFrame conversion (3): tuples plus toDF ------------------------------------------------
    // String values can be read with Bytes.toString directly;
    // int, double, etc. must be decoded to String first and then parsed.
    import sparkSession.implicits._
    val dataDF3 = hbaseRDD.map(result => (
      java.lang.Double.valueOf(Bytes.toString(result._2.getValue(Bytes.toBytes("DATA"), Bytes.toBytes("meterID-1")))),
      java.lang.Double.valueOf(Bytes.toString(result._2.getValue(Bytes.toBytes("DATA"), Bytes.toBytes("meterID-2"))))
    )).toDF("meterID-1", "meterID-2")

    //-------------------------- query with Spark SQL -----------------------------------------------------
    // register a temporary view (any of the three DataFrames will do)
    dataDF1.createTempView("DLZZ")
    sparkSession.sql("select * from DLZZ").show()

    // stop the SparkSession
    sparkSession.stop()
  }
}
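The snippets above only read cell values; if the row key should appear as a column as well, it can be decoded from the ImmutableBytesWritable key of each pair. A minimal sketch that would continue the main method above (it assumes row keys were written as UTF-8 strings; "rowKey" is just an illustrative column name):

    // sketch: include the row key as a column (assumes UTF-8 string row keys)
    val withKeyDF = hbaseRDD.map { case (key, result) => (
      Bytes.toString(key.get(), key.getOffset, key.getLength),
      java.lang.Double.valueOf(Bytes.toString(result.getValue(Bytes.toBytes("DATA"), Bytes.toBytes("meterID-1")))),
      java.lang.Double.valueOf(Bytes.toString(result.getValue(Bytes.toBytes("DATA"), Bytes.toBytes("meterID-2"))))
    )}.toDF("rowKey", "meterID-1", "meterID-2")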
Approach 2:
Use sparkSession's JSON reader: scan HBase with the client API on the driver, turn each row into a JSON string, and let Spark infer the schema from the JSON.
Custom utility class:
package com.zhbr.hbase.utils

import org.apache.hadoop.hbase.{Cell, CellUtil, HBaseConfiguration, HConstants, TableName}
import org.apache.hadoop.hbase.client.{Connection, ConnectionFactory, Scan, Table}
import org.apache.hadoop.hbase.filter.Filter
import org.apache.hadoop.hbase.util.Bytes
import scala.collection.mutable
import scala.collection.mutable.ListBuffer

object HBaseClient {

  /**
    * Scan HBase and return the result rows
    * @param tableName table name
    * @param filter    filter condition (may be null)
    * @param startRow  start row key (may be null)
    * @param stopRow   stop row key (may be null)
    * @return scan results, one Map(qualifier -> value) per row
    */
  def scan(tableName: String, filter: Filter, startRow: String, stopRow: String): List[Map[String, String]] = {
    val s = buildScan(filter, startRow, stopRow)
    val t = getConnection.getTable(TableName.valueOf(tableName))
    try {
      scanTable(t, s)
    } finally {
      t.close()
    }
  }

  /**
    * Get a connection
    */
  private def getConnection: Connection = {
    val conf = HBaseConfiguration.create()
    conf.set(HConstants.ZOOKEEPER_QUORUM, "10.213.111.XXX,10.213.111.XXX,10.213.111.XXX")
    conf.set(HConstants.ZOOKEEPER_CLIENT_PORT, "2181")
    ConnectionFactory.createConnection(conf)
  }
  /**
    * Execute the scan
    * @param table table
    * @param scan  scan
    */
  private def scanTable(table: Table, scan: Scan): List[Map[String, String]] = {
    val scanner = table.getScanner(scan)
    val ite = scanner.iterator()
    val result = new ListBuffer[Map[String, String]]
    import scala.collection.JavaConversions._
    while (ite.hasNext) {
      val map = new mutable.ListMap[String, String]
      val cells = ite.next().listCells()
      for (cell <- cells) {
        map += readCell(cell)
      }
      result += map.toMap
    }
    scanner.close()
    result.toList
  }
  /**
    * Read one cell into a (qualifier, value) pair
    * @param cell cell
    */
  private def readCell(cell: Cell) = {
    val qualifier = Bytes.toString(CellUtil.cloneQualifier(cell))
    val value = Bytes.toString(CellUtil.cloneValue(cell))
    (qualifier, value)
  }

  /**
    * Build the Scan instance
    * @param filter   filter condition
    * @param startRow start row key
    * @param stopRow  stop row key
    */
  private def buildScan(filter: Filter, startRow: String, stopRow: String): Scan = {
    val scan = new Scan()
    scan.setMaxVersions()
    scan.setCaching(2000)
    scan.setCacheBlocks(false)
    scan.addColumn(Bytes.toBytes("DATA"), Bytes.toBytes("meterID-1"))
    scan.addColumn(Bytes.toBytes("DATA"), Bytes.toBytes("meterID-2"))
    if (filter != null)
      scan.setFilter(filter)
    if (startRow != null)
      scan.setStartRow(Bytes.toBytes(startRow))
    if (stopRow != null)
      scan.setStopRow(Bytes.toBytes(stopRow))
    scan
  }
}
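For reference, a hypothetical call that narrows the scan with a filter and a row-key range (the PrefixFilter, prefix, and key bounds below are illustrative, not from the original post):

import org.apache.hadoop.hbase.filter.PrefixFilter
import org.apache.hadoop.hbase.util.Bytes

// sketch: scan only row keys starting with "0001" (made-up example values)
val rows = HBaseClient.scan(
  "DWD_AMR_GS_METER-E-CURVE_201902",
  new PrefixFilter(Bytes.toBytes("0001")),
  "0001",
  "0002")
rows.foreach(println) // each row is a Map(qualifier -> value)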
Main program:
package com.zhbr.mainClass

import com.zhbr.hbase.utils.HBaseClient
import org.apache.spark.sql.SparkSession
import scala.util.parsing.json.JSONObject

object DataProcessMain {

  def main(args: Array[String]): Unit = {

    // create the SparkSession
    val sparkSession = SparkSession.builder().appName(this.getClass.getSimpleName.filter(!_.equals('$'))).getOrCreate()

    // get the SparkContext
    val sparkContext = sparkSession.sparkContext

    // set the log level
    sparkContext.setLogLevel("WARN")

    // the HBase table to read
    val tableName = "DWD_AMR_GS_METER-E-CURVE_201902"

    // scan the HBase table on the driver and collect the result rows
    val result = HBaseClient.scan(tableName, null, null, null)

    // turn each row map into a JSON string
    val jsonString = result.map(e => JSONObject(e).toString())

    // parallelize the JSON strings into an RDD
    val jsonRDD = sparkContext.parallelize(jsonString)

    // let Spark infer the schema and build the DataFrame
    val dlzzDF = sparkSession.read.json(jsonRDD)

    // register a temporary view
    dlzzDF.createTempView("DLZZ")
    sparkSession.sql("select count(1) from DLZZ").show()

    // stop the SparkSession
    sparkSession.stop()
  }
}
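Two caveats on this approach: the whole scan result is materialized on the driver before parallelize, so it only suits tables that fit in driver memory, and read.json(RDD[String]) is deprecated since Spark 2.2. A sketch of the Dataset-based variant that would replace the jsonRDD and read.json lines above:

import org.apache.spark.sql.Encoders

// sketch: build the DataFrame from a Dataset[String] instead of the deprecated RDD[String] overload
val jsonDS = sparkSession.createDataset(jsonString)(Encoders.STRING)
val dlzzDF = sparkSession.read.json(jsonDS)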