Reading and Writing HBase Data with Spark
1. pom.xml dependency configuration
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com</groupId>
    <artifactId>atguigu</artifactId>
    <version>1.0-SNAPSHOT</version>

    <dependencies>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.12</artifactId>
            <version>3.0.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-server</artifactId>
            <version>2.0.5</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-client</artifactId>
            <version>2.0.5</version>
        </dependency>
        <!-- Provides the TableInputFormat / TableOutputFormat used below -->
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-mapreduce</artifactId>
            <version>2.0.5</version>
        </dependency>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.12</version>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>com.fasterxml.jackson.core</groupId>
            <artifactId>jackson-core</artifactId>
            <version>2.10.1</version>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <!-- This plugin compiles the Scala sources to class files -->
            <plugin>
                <groupId>net.alchim31.maven</groupId>
                <artifactId>scala-maven-plugin</artifactId>
                <version>3.4.6</version>
                <executions>
                    <execution>
                        <!-- Bind the goal to Maven's compile phase -->
                        <goals>
                            <goal>compile</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
</project>
2. Connection configuration
The hbase-site.xml file holds the HBase connection information. HBaseConfiguration.create() loads it from the classpath, so place it under src/main/resources.
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
    <property>
        <name>hbase.zookeeper.quorum</name>
        <value>hadoop102,hadoop103,hadoop104</value>
    </property>
</configuration>
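If you would rather not ship an hbase-site.xml with the jar, the same setting can be applied in code; a minimal sketch (the hostnames are the ones assumed above):

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.hbase.HBaseConfiguration

// Equivalent to the hbase-site.xml entry above: override the ZooKeeper quorum in code
val hbaseConf: Configuration = HBaseConfiguration.create()
hbaseConf.set("hbase.zookeeper.quorum", "hadoop102,hadoop103,hadoop104")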
3. Reading HBase data with Spark
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.hbase.{Cell, CellUtil, HBaseConfiguration}
import org.apache.hadoop.hbase.client.Result
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.rdd.NewHadoopRDD
import org.apache.spark.{SparkConf, SparkContext}

object Hbase {
  def main(args: Array[String]): Unit = {
    val sparkConf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("Myapp")
    val context: SparkContext = new SparkContext(sparkConf)
    // All HBase settings travel in hbaseConf, e.g. which table to read
    val hbaseConf: Configuration = HBaseConfiguration.create()
    // Specify the table to read
    hbaseConf.set(TableInputFormat.INPUT_TABLE, "student")
    // Optionally narrow the scan, e.g. the start row key
    //hbaseConf.set(TableInputFormat.SCAN_ROW_START, "student")
    // Read the HBase table as a Spark RDD
    /*
    class NewHadoopRDD[K, V](
        sc: SparkContext,                                  the SparkContext
        inputFormatClass: Class[_ <: InputFormat[K, V]],   an InputFormat that can read HBase
            TableInputFormat:
                getSplits: one split per HBase region
                RecordReader<ImmutableBytesWritable, Result>: the record format
                    ImmutableBytesWritable: the rowkey as byte[]
                    Result: one row, made up of multiple Cells
        keyClass: Class[K],                                the K type of the input format
        valueClass: Class[V],                              the V type of the input format
        @transient private val _conf: Configuration)       the configuration
      extends RDD[(K, V)]
    */
    val readRDD: NewHadoopRDD[ImmutableBytesWritable, Result] = new NewHadoopRDD[ImmutableBytesWritable, Result](
      context,
      classOf[TableInputFormat],
      classOf[ImmutableBytesWritable],
      classOf[Result],
      hbaseConf
    )
    readRDD.foreach {
      case (rowkey, result) =>
        val cells: Array[Cell] = result.rawCells()
        for (elem <- cells) {
          println("rowkey:" + Bytes.toString(CellUtil.cloneRow(elem)) +
            " family:" + Bytes.toString(CellUtil.cloneFamily(elem)) +
            " qualifier:" + Bytes.toString(CellUtil.cloneQualifier(elem)) +
            " value:" + Bytes.toString(CellUtil.cloneValue(elem)))
        }
    }
    context.stop()
  }
}
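Constructing NewHadoopRDD by hand makes the pieces explicit, but SparkContext also exposes newAPIHadoopRDD, which builds the same RDD through the public API; a minimal equivalent sketch, reusing the context, hbaseConf, and imports from above:

import org.apache.spark.rdd.RDD

// Same read expressed through SparkContext.newAPIHadoopRDD
val readRDD: RDD[(ImmutableBytesWritable, Result)] = context.newAPIHadoopRDD(
  hbaseConf,
  classOf[TableInputFormat],
  classOf[ImmutableBytesWritable],
  classOf[Result]
)

One caveat on the example above: println inside foreach runs on the executors, so the output shows up on the driver console only because the job runs with master local[*].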
4. Writing data to HBase with Spark
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.{Mutation, Put}
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.TableOutputFormat
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.mapreduce.Job
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object Write {
  def main(args: Array[String]): Unit = {
    val sparkConf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("Myapp")
    val context: SparkContext = new SparkContext(sparkConf)
    // All HBase settings travel in hbaseConf, e.g. which table to write to
    val hbaseConf: Configuration = HBaseConfiguration.create()
    // Specify the target table
    hbaseConf.set(TableOutputFormat.OUTPUT_TABLE, "student")
    // Use a Job to configure the output format
    val job: Job = Job.getInstance(hbaseConf)
    // Configure the wrapped Configuration through the Job API:
    // the output format and the key/value types it expects
    job.setOutputFormatClass(classOf[TableOutputFormat[ImmutableBytesWritable]])
    job.setOutputKeyClass(classOf[ImmutableBytesWritable])
    job.setOutputValueClass(classOf[Mutation])
    // Prepare the data: (rowkey, column family, qualifier, value)
    val datas = List(("r1", "info", "name", "thw"), ("r1", "info", "age", "23"))
    val dataRDD: RDD[(String, String, String, String)] = context.makeRDD(datas)
    // Convert to key-value pairs; only a K-V RDD can be written to HBase
    /*
    def saveAsNewAPIHadoopDataset(conf: Configuration)
        Callable only on K-V RDDs.
        The Configuration carries the output table name and the output format to use.
        The output format here is TableOutputFormat, which writes MR-style output into an HBase table.
        The KEY type is arbitrary; the VALUE must be a Mutation
        (the parent class of write operations such as Put and Delete).
            key:   the rowkey, as ImmutableBytesWritable
            value: a Put
    */
    val result = dataRDD.map {
      case (rowkey, cf, cq, value) =>
        // The key to write out
        val key: ImmutableBytesWritable = new ImmutableBytesWritable()
        key.set(Bytes.toBytes(rowkey))
        // The value to write out
        val put: Put = new Put(Bytes.toBytes(rowkey))
        put.addColumn(Bytes.toBytes(cf), Bytes.toBytes(cq), Bytes.toBytes(value))
        (key, put)
    }
    // Write out
    result.saveAsNewAPIHadoopDataset(job.getConfiguration)
    context.stop()
  }
}
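Note that TableOutputFormat writes into an existing table; it does not create one. A minimal sketch for pre-creating the student table with column family info via the HBase 2.x client API (table and family names match the example above; run once before the Spark job):

import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.hadoop.hbase.client.{ColumnFamilyDescriptorBuilder, Connection, ConnectionFactory, TableDescriptorBuilder}

object CreateTable {
  def main(args: Array[String]): Unit = {
    val connection: Connection = ConnectionFactory.createConnection(HBaseConfiguration.create())
    val admin = connection.getAdmin
    val tableName: TableName = TableName.valueOf("student")
    // Create 'student' with column family 'info' if it does not exist yet
    if (!admin.tableExists(tableName)) {
      val descriptor = TableDescriptorBuilder.newBuilder(tableName)
        .setColumnFamily(ColumnFamilyDescriptorBuilder.of("info"))
        .build()
      admin.createTable(descriptor)
    }
    connection.close()
  }
}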