Source code
GitHub - apache/hbase-connectors: Apache HBase Connectors
HBase official documentation
https://hbase.apache.org/book.html#_sparksqldataframes
Compiling the source:
- 1. Download the source code
- 2. Download Maven (version 3.6 or later)
- 3. Build the parent project
cd hbase-connectors-master
mvn clean install -DskipTests   # skip tests; this stage can take quite a long time
- 4. Build the spark sub-project
cd hbase-connectors-master/spark
# adjust the versions to match your own environment
mvn -Dspark.version=3.0.3 -Dscala.version=2.12.10 -Dhadoop-three.version=3.1.1 -Dscala.binary.version=2.12 -Dhbase.version=2.3.5 clean install
- 5. Install the jar into the local Maven repository
# install into the local repository (adjust the jar path and coordinates to your own setup)
mvn install:install-file -DgroupId=cn.huorong.hbase.connectors.spark -DartifactId=hbase-spark -Dversion=2.3.5_spark-3.0.3 -Dpackaging=jar -Dfile=C:\Maven\repository3.8\huorong\hbase-spark-1.0.1-SNAPSHOT.jar
Then reference it in your pom.xml:
<dependency>
    <groupId>cn.huorong.hbase.connectors.spark</groupId>
    <artifactId>hbase-spark</artifactId>
    <version>2.3.5_spark-3.0.3</version>
</dependency>
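Once the dependency is installed, the classes used in the demo below (HBaseContext, HBaseTableCatalog, HBaseSparkConf) should resolve from the connector jar. A quick classpath sanity check, sketched under the assumption that the hbase-connectors spark module keeps its standard package layout:
import org.apache.hadoop.hbase.spark.HBaseContext
import org.apache.hadoop.hbase.spark.datasources.{HBaseSparkConf, HBaseTableCatalog}

object DependencyCheck {
  def main(args: Array[String]): Unit = {
    // if these resolve and print, the locally installed artifact is on the classpath
    println(classOf[HBaseContext].getName)
    println(HBaseTableCatalog.tableCatalog)   // option key used for the catalog in the demo below
    println(HBaseSparkConf.TIMERANGE_START)   // option key used for the time-range query later
  }
}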
Spark SQL + HBase integration demo
- HBaseRecord
package hbase_connector

case class HBaseRecord(col0: String,
                       col1: Boolean,
                       col2: Double,
                       col3: Float,
                       col4: Int,
                       col5: Long,
                       col6: Short,
                       col7: String,
                       col8: Byte)

object HBaseRecord {
  def apply(i: Int, t: String): HBaseRecord = {
    val s = s"""row${"%03d".format(i)}"""
    HBaseRecord(s,
      i % 2 == 0,
      i.toDouble,
      i.toFloat,
      i,
      i.toLong,
      i.toShort,
      s"String$i: $t",
      i.toByte)
  }
}
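A quick sanity check of the generator (not part of the original code): HBaseRecord(0, "extra") produces the first demo row, whose col0 value later becomes the HBase row key.
object HBaseRecordCheck {
  def main(args: Array[String]): Unit = {
    // the row key is zero-padded to three digits; the boolean flag follows the parity of i
    println(HBaseRecord(0, "extra"))
    // HBaseRecord(row000,true,0.0,0.0,0,0,0,String0: extra,0)
    println(HBaseRecord(1, "extra"))
    // HBaseRecord(row001,false,1.0,1.0,1,1,1,String1: extra,1)
  }
}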
- df_hbase
package hbase_connector

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.spark.HBaseContext
import org.apache.hadoop.hbase.spark.datasources.{HBaseSparkConf, HBaseTableCatalog}
import org.apache.spark.SparkContext
import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession}

import scala.collection.immutable

object df_hbase {
  def main(args: Array[String]): Unit = {
    val session: SparkSession = SparkSession.builder()
      .appName(this.getClass.getSimpleName)
      .master("local")
      .getOrCreate()
    val sc: SparkContext = session.sparkContext

    val hbaseConf: Configuration = HBaseConfiguration.create()
    hbaseConf.set("hbase.zookeeper.quorum", "192.168.1.76,192.168.1.73,192.168.1.77")
    hbaseConf.set("hbase.zookeeper.property.clientPort", "2181")

    // An HBaseContext must be created before using the data source
    val hbaseContext = new HBaseContext(sc, hbaseConf, null)

    val records: immutable.IndexedSeq[HBaseRecord] = (0 to 255).map { i => HBaseRecord(i, "extra") }
    // Build a DataFrame from the generated records
    val recordDF: DataFrame = session.createDataFrame(records)
    recordDF.show(false)

    // Define the catalog that maps DataFrame columns to the HBase row key and column families
    val catalog = s"""{
                     |"table":{"namespace":"default", "name":"table1"},
                     |"rowkey":"key",
                     |"columns":{
                     |"col0":{"cf":"rowkey", "col":"key", "type":"string"},
                     |"col1":{"cf":"cf1", "col":"col1", "type":"boolean"},
                     |"col2":{"cf":"cf2", "col":"col2", "type":"double"},
                     |"col3":{"cf":"cf3", "col":"col3", "type":"float"},
                     |"col4":{"cf":"cf4", "col":"col4", "type":"int"},
                     |"col5":{"cf":"cf5", "col":"col5", "type":"bigint"},
                     |"col6":{"cf":"cf6", "col":"col6", "type":"smallint"},
                     |"col7":{"cf":"cf7", "col":"col7", "type":"string"},
                     |"col8":{"cf":"cf8", "col":"col8", "type":"tinyint"}
                     |}
                     |}""".stripMargin

    // Write the data to HBase
    recordDF.write
      .format("org.apache.hadoop.hbase.spark")
      .option(HBaseTableCatalog.tableCatalog, catalog)
      .option(HBaseTableCatalog.newTable, "5") // pre-split the newly created table into 5 regions
      .mode(SaveMode.Overwrite) // overwrite mode
      .save()

    // Read the data back
    val table1DF: DataFrame = session.read
      .format("org.apache.hadoop.hbase.spark")
      .option(HBaseTableCatalog.tableCatalog, catalog)
      .option("hbase.spark.pushdown.columnfilter", false) // needed when filtering on columns, see below
      .load()
    table1DF.show(400, false)
  }
}
+------+-----+----+----+----+----+----+---------------+----+
|col0 |col1 |col2|col3|col4|col5|col6|col7 |col8|
+------+-----+----+----+----+----+----+---------------+----+
|row000|true |0.0 |0.0 |0 |0 |0 |String0: extra |0 |
|row001|false|1.0 |1.0 |1 |1 |1 |String1: extra |1 |
|row002|true |2.0 |2.0 |2 |2 |2 |String2: extra |2 |
|row003|false|3.0 |3.0 |3 |3 |3 |String3: extra |3 |
|row004|true |4.0 |4.0 |4 |4 |4 |String4: extra |4 |
|row005|false|5.0 |5.0 |5 |5 |5 |String5: extra |5 |
|row006|true |6.0 |6.0 |6 |6 |6 |String6: extra |6 |
|row007|false|7.0 |7.0 |7 |7 |7 |String7: extra |7 |
|row008|true |8.0 |8.0 |8 |8 |8 |String8: extra |8 |
|row009|false|9.0 |9.0 |9 |9 |9 |String9: extra |9 |
|row010|true |10.0|10.0|10 |10 |10 |String10: extra|10 |
|row011|false|11.0|11.0|11 |11 |11 |String11: extra|11 |
|row012|true |12.0|12.0|12 |12 |12 |String12: extra|12 |
|row013|false|13.0|13.0|13 |13 |13 |String13: extra|13 |
|row014|true |14.0|14.0|14 |14 |14 |String14: extra|14 |
|row015|false|15.0|15.0|15 |15 |15 |String15: extra|15 |
|row016|true |16.0|16.0|16 |16 |16 |String16: extra|16 |
|row017|false|17.0|17.0|17 |17 |17 |String17: extra|17 |
|row018|true |18.0|18.0|18 |18 |18 |String18: extra|18 |
|row019|false|19.0|19.0|19 |19 |19 |String19: extra|19 |
+------+-----+----+----+----+----+----+---------------+----+
only showing top 20 rows
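Since the point of the demo is Spark SQL integration, the HBase-backed DataFrame can also be registered as a temporary view and queried with SQL. A small follow-up sketch, continuing inside df_hbase.main after table1DF.show (it relies on the read above having column-filter pushdown disabled, see the next section):
    // query the HBase-backed table through Spark SQL
    table1DF.createOrReplaceTempView("table1")
    session.sql("SELECT col0, col7 FROM table1 WHERE col4 > 250").show(false)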
Column filtering
table1DF.filter("col4==20").show(false)                           // throws an error when column-filter pushdown is enabled
table1DF.filter(row => row.getAs[Int]("col4") == 20).show(false)  // typed filter, does not throw
After adding the following option to the read, both versions work without errors:
.option("hbase.spark.pushdown.columnfilter", false) // required when filtering on columns
Time-range query
session.read
  .format("org.apache.hadoop.hbase.spark")
  .option(HBaseTableCatalog.tableCatalog, catalog)
  .option(HBaseSparkConf.TIMERANGE_START, "1661238243000")
  .option(HBaseSparkConf.TIMERANGE_END, "1661238244000")
  .load()
  .show(false)
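The bounds are HBase cell timestamps, assumed here to be epoch milliseconds (the hard-coded demo values are in that unit). A small sketch deriving them from wall-clock time instead of hard-coding, continuing with the session and catalog above:
    import java.time.Instant

    // bounds for "cells written in the last hour", passed as strings like the demo values
    val endMs: Long = Instant.now().toEpochMilli
    val startMs: Long = endMs - 60 * 60 * 1000L

    val recentDF: DataFrame = session.read
      .format("org.apache.hadoop.hbase.spark")
      .option(HBaseTableCatalog.tableCatalog, catalog)
      .option(HBaseSparkConf.TIMERANGE_START, startMs.toString)
      .option(HBaseSparkConf.TIMERANGE_END, endMs.toString)
      .load()
    recentDF.show(false)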