1. Create the HBase source table

node1> bin/hbase shell
create 'spark_hbase_sql', 'cf'
put 'spark_hbase_sql', '0001', 'cf:name', 'zhangsan'
put 'spark_hbase_sql', '0001', 'cf:score', '80'
put 'spark_hbase_sql', '0002', 'cf:name', 'lisi'
put 'spark_hbase_sql', '0002', 'cf:score', '60'
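Before moving on, you can confirm the rows landed as expected:

scan 'spark_hbase_sql'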
2. Create the HBase sink table

bin/hbase shell
create 'spark_hbase_write', 'cf'
3. Write the code (note that the org.apache.spark.sql.sources.v2 interfaces used below belong to the DataSource V2 API as it shipped in Spark 2.3.x)
package com.travel.programApp

import java.util
import java.util.Optional

import com.travel.utils.HbaseTools
import org.apache.hadoop.hbase.TableName
import org.apache.hadoop.hbase.client.{Connection, Put, Result, ResultScanner, Scan, Table}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.SparkConf
import org.apache.spark.sql.{DataFrame, Row, SaveMode, SparkSession}
import org.apache.spark.sql.sources.v2.reader.{DataReader, DataReaderFactory, DataSourceReader}
import org.apache.spark.sql.sources.v2.writer.{DataSourceWriter, DataWriter, DataWriterFactory, WriterCommitMessage}
import org.apache.spark.sql.sources.v2.{DataSourceOptions, DataSourceV2, ReadSupport, WriteSupport}
import org.apache.spark.sql.types.StructType

object HBaseSourceAndSink {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
    conf.setMaster("local[2]").setAppName("sparkSqlSourceAndSink")
    val spark: SparkSession = SparkSession.builder().config(conf).getOrCreate()

    // Read from HBase through the custom source defined below; the format string
    // must be the fully qualified name of the HBaseSource class.
    val df: DataFrame = spark.read.format("com.travel.programApp.HBaseSource")
      .option("hbase.table.name", "spark_hbase_sql")
      .option("cf.cc", "cf:name,cf:score")
      .option("schema", "`name` STRING, `score` STRING")
      .load()

    df.createOrReplaceTempView("sparkHBaseSQL")
    df.printSchema()

    // Filter with Spark SQL, then write the result back out through the same source.
    val resultDF: DataFrame = spark.sql("select * from sparkHBaseSQL where score > 70")
    resultDF.write.format("com.travel.programApp.HBaseSource")
      .mode(SaveMode.Overwrite)
      .option("hbase.table.name", "spark_hbase_write")
      .option("cf", "cf")
      .save()
  }
}
class HBaseSource extends DataSourceV2 with ReadSupport with WriteSupport {
  // Entry point for spark.read: pass the table name, columns, and schema to the reader.
  override def createReader(options: DataSourceOptions): DataSourceReader = {
    val tableName: String = options.get("hbase.table.name").get()
    val cfAndCC: String = options.get("cf.cc").get()
    val schema: String = options.get("schema").get()
    new HBaseDataSourceReader(tableName, cfAndCC, schema)
  }

  // Entry point for df.write: pass the table name and column family to the writer.
  override def createWriter(jobId: String, schema: StructType, mode: SaveMode, options: DataSourceOptions): Optional[DataSourceWriter] = {
    val tableName: String = options.get("hbase.table.name").get()
    val family: String = options.get("cf").get()
    Optional.of(new HBaseDataSourceWriter(tableName, family))
  }
}
class HBaseDataSourceWriter(tableName: String, family: String) extends DataSourceWriter {
  override def createWriterFactory(): DataWriterFactory[Row] = {
    new HBaseDataWriterFactory(tableName, family)
  }

  // Nothing to do globally: each task writes and commits its own puts.
  override def commit(messages: Array[WriterCommitMessage]): Unit = {}

  override def abort(messages: Array[WriterCommitMessage]): Unit = {}
}

class HBaseDataWriterFactory(tableName: String, family: String) extends DataWriterFactory[Row] {
  override def createDataWriter(partitionId: Int, attemptNumber: Int): DataWriter[Row] = {
    new HBaseDataWriter(tableName, family)
  }
}

class HBaseDataWriter(tableName: String, family: String) extends DataWriter[Row] {
  private val conn: Connection = HbaseTools.getHbaseConn
  private val table: Table = conn.getTable(TableName.valueOf(tableName))

  override def write(record: Row): Unit = {
    val name: String = record.getString(0)
    val score: String = record.getString(1)
    // Use the name as the row key: a hard-coded row key would make
    // every record overwrite the same HBase row.
    val put = new Put(Bytes.toBytes(name))
    put.addColumn(Bytes.toBytes(family), Bytes.toBytes("name"), Bytes.toBytes(name))
    put.addColumn(Bytes.toBytes(family), Bytes.toBytes("score"), Bytes.toBytes(score))
    table.put(put)
  }

  override def commit(): WriterCommitMessage = {
    table.close()
    conn.close()
    null
  }

  override def abort(): Unit = {}
}
class HBaseDataSourceReader(tableName: String, cfAndCC: String, schema: String) extends DataSourceReader {
  // Build the Spark schema from the DDL string passed in as an option.
  override def readSchema(): StructType = {
    StructType.fromDDL(schema)
  }

  // One factory here means one partition: a single task scans the whole table.
  override def createDataReaderFactories(): util.List[DataReaderFactory[Row]] = {
    import scala.collection.JavaConverters._
    Seq(new HBaseDataReaderFactory(tableName, cfAndCC).asInstanceOf[DataReaderFactory[Row]]).asJava
  }
}

class HBaseDataReaderFactory(tableName: String, cfAndCC: String) extends DataReaderFactory[Row] {
  override def createDataReader(): DataReader[Row] = {
    new HBaseDataReader(tableName, cfAndCC)
  }
}

class HBaseDataReader(tableName: String, cfAndCC: String) extends DataReader[Row] {
  private var conn: Connection = _
  private var table: Table = _
  private val scan = new Scan()
  private var resultScanner: ResultScanner = _

  // Scan the table and turn each HBase Result into a (name, score) pair.
  private def getIterator: Iterator[Seq[AnyRef]] = {
    conn = HbaseTools.getHbaseConn
    table = conn.getTable(TableName.valueOf(tableName))
    resultScanner = table.getScanner(scan)
    import scala.collection.JavaConverters._
    resultScanner.iterator().asScala.map { eachResult =>
      val name: String = Bytes.toString(eachResult.getValue(Bytes.toBytes("cf"), Bytes.toBytes("name")))
      val score: String = Bytes.toString(eachResult.getValue(Bytes.toBytes("cf"), Bytes.toBytes("score")))
      // Debug output so the scanned rows are visible when running locally.
      println(s"read from HBase: name=$name, score=$score")
      Seq(name, score)
    }
  }

  private val data: Iterator[Seq[AnyRef]] = getIterator

  override def next(): Boolean = data.hasNext

  override def get(): Row = Row.fromSeq(data.next())

  override def close(): Unit = {
    table.close()
    conn.close()
  }
}
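The helper com.travel.utils.HbaseTools is referenced above but not shown; its real implementation is project-specific. Below is a minimal sketch, assuming getHbaseConn returns a fresh Connection on each call (both the reader and the writer close the connection they obtain) and assuming a ZooKeeper quorum of node1,node2,node3, which should be replaced with your own cluster addresses:

package com.travel.utils

import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.{Connection, ConnectionFactory}

object HbaseTools {
  // Assumed cluster settings; adjust the quorum and port for your environment.
  private val conf = HBaseConfiguration.create()
  conf.set("hbase.zookeeper.quorum", "node1,node2,node3")
  conf.set("hbase.zookeeper.property.clientPort", "2181")

  // Return a new Connection per call; callers are responsible for closing it.
  def getHbaseConn: Connection = ConnectionFactory.createConnection(conf)
}

After the job runs, only the row whose score passes the score > 70 filter (zhangsan, with a score of 80) should appear in the sink table, which you can check from the HBase shell:

scan 'spark_hbase_write'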