Spark3.X 读写 HBase2.X

最新推荐文章于 2024-07-07 11:58:05 发布

月亮给我抄代码

最新推荐文章于 2024-07-07 11:58:05 发布

阅读量635

点赞数

文章标签： hbase 大数据 spark

本文链接：https://blog.csdn.net/weixin_46389691/article/details/129031989

版权

我所使用的版本信息如下：

Spark 3.0.0
HBase 2.2.3
Scala 2.12
JDK 1.8

pom 依赖：

注意改成自己对应的版本。

        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.12</artifactId>
            <version>3.0.0</version>
        </dependency>

        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_2.12</artifactId>
            <version>3.0.0</version>
        </dependency>
        
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-server</artifactId>
            <version>2.2.3</version>
        </dependency>

        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-mapreduce</artifactId>
            <version>2.2.3</version>
        </dependency>

Spark 写入 HBase

我们先创建一个用于写入的测试文件，我这里将文件名取为：test.csv。注意：下面的示例代码是通过 spark.read.option("header", "true").csv(...) 读取该文件的，因此文件内容需要是带表头的逗号分隔文本，并且列的顺序要与下面的示例保持一致。

city_name,province_name,total_amount,total_count,sequence,year,month
上海市,上海市,121773688,28456,1,2022,4
上海市,上海市,37040482,8635,1,2022,3
上海市,上海市,31189394,7429,1,2022,5
浙江省杭州市,浙江省,14136866,3344,1,2022,4
江苏省南京市,江苏省,12868037,2999,1,2022,4

写入代码文件示例：

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.hbase.client._
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.{HBaseConfiguration, NamespaceDescriptor, TableName}
import org.apache.spark.sql.{DataFrame, SparkSession}

/**
 * Loads `test.csv` with Spark and writes each row into the HBase table `demo:order`
 * under the column family `info`, creating the namespace and the table first when
 * they do not exist yet.
 *
 * Every row is stored under a freshly generated random UUID row key, so running the
 * job twice inserts the data twice.
 */
object MyWriteHBase {

    // Local import: row keys are generated with java.util.UUID, which the file-level
    // imports do not bring into scope.
    import java.util.UUID

    // HBase client configuration shared by the driver-side setup and the per-partition writers.
    private val hbaseConf: Configuration = HBaseConfiguration.create()
    // ZooKeeper quorum hosts
    hbaseConf.set("hbase.zookeeper.quorum","master,slave1,slave2")
    // ZooKeeper client port
    hbaseConf.set("hbase.zookeeper.property.clientPort","2181")
    // Skip the hbase-default.xml version check
    hbaseConf.set("hbase.default.for.version.skip","true")

    // Namespace, fully qualified table name ("namespace:table") and column family.
    private val Namespace = "demo"
    private val QualifiedTableName = "demo:order"
    private val ColumnFamily = "info"

    // Column qualifiers, listed in the same order as the columns of test.csv.
    private val Columns: Seq[String] =
        Seq("city_name", "province_name", "total_amount", "total_count", "sequence", "year", "month")

    /**
     * Creates the namespace and the table (with its single column family) when the
     * table is missing. The namespace is only created if it is not already present,
     * so a pre-existing namespace without the table no longer makes the setup fail.
     */
    private def ensureTableExists(): Unit = {
        val conn: Connection = ConnectionFactory.createConnection(hbaseConf)
        try {
            val admin: Admin = conn.getAdmin
            try {
                val tableName: TableName = TableName.valueOf(QualifiedTableName)
                if (!admin.tableExists(tableName)) {
                    val namespaceExists = admin.listNamespaceDescriptors().exists(_.getName == Namespace)
                    if (!namespaceExists) {
                        admin.createNamespace(NamespaceDescriptor.create(Namespace).build())
                    }
                    val family: ColumnFamilyDescriptor =
                        ColumnFamilyDescriptorBuilder.newBuilder(Bytes.toBytes(ColumnFamily)).build()
                    val descriptor: TableDescriptor =
                        TableDescriptorBuilder.newBuilder(tableName).setColumnFamily(family).build()
                    admin.createTable(descriptor)
                }
            } finally {
                admin.close()
            }
        } finally {
            conn.close()
        }
    }

    def main(args: Array[String]): Unit = {

        val spark: SparkSession = SparkSession.
                builder()
                .appName("MyWriteHBase")
                .master("local[*]")
                .getOrCreate()

        try {
            // Read the test data set; the first line of the file is the header.
            val dataFrame: DataFrame = spark.read.option("header", "true").csv("test.csv")

            ensureTableExists()

            // Copied into locals so the closure below only captures plain strings.
            val qualifiedTableName = QualifiedTableName
            val columnFamily = ColumnFamily
            val columns = Columns

            dataFrame.rdd.foreachPartition(iter => {
                // One connection per partition: HBase connections cannot be shipped from the driver.
                val conn: Connection = ConnectionFactory.createConnection(hbaseConf)
                try {
                    val table: Table = conn.getTable(TableName.valueOf(qualifiedTableName))
                    try {
                        val familyBytes = Bytes.toBytes(columnFamily)
                        iter.foreach(r => {
                            val put = new Put(Bytes.toBytes(UUID.randomUUID().toString)) // RowKey
                            columns.zipWithIndex.foreach { case (qualifier, idx) =>
                                // Missing CSV values arrive as null; skip them instead of
                                // failing inside Bytes.toBytes.
                                if (idx < r.length && !r.isNullAt(idx)) {
                                    put.addColumn(familyBytes, Bytes.toBytes(qualifier), Bytes.toBytes(r.getString(idx)))
                                }
                            }
                            // HBase rejects a Put without any cell. A failed write is allowed to
                            // propagate so the Spark task fails visibly instead of dropping rows.
                            if (!put.isEmpty) {
                                table.put(put)
                            }
                        })
                    } finally {
                        // Close the table before the connection it was obtained from.
                        table.close()
                    }
                } finally {
                    conn.close()
                }
            })
        } finally {
            spark.stop()
        }

    }

}

Spark 读取 HBase

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client._
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession

/**
 * Reads the HBase table `demo:order` through `TableInputFormat`, extracts the row key
 * and the string columns of the `info` column family from every row, and displays the
 * result as a DataFrame via `show()`.
 */
object MyReadHBase {

    def main(args: Array[String]): Unit = {

        val spark: SparkSession = SparkSession
                .builder()
                .appName("MyReadHBase")
                .master("local[*]")
                .getOrCreate()

        // HBase settings, applied in order: ZooKeeper quorum, ZooKeeper client port,
        // skipping of the version check, and the table that TableInputFormat scans.
        val hbaseConf: Configuration = HBaseConfiguration.create()
        Seq(
            "hbase.zookeeper.quorum" -> "master,slave1,slave2",
            "hbase.zookeeper.property.clientPort" -> "2181",
            "hbase.default.for.version.skip" -> "true",
            TableInputFormat.INPUT_TABLE -> "demo:order"
        ).foreach { case (key, value) => hbaseConf.set(key, value) }

        // RDD of (row key, row result) pairs backed by the HBase table.
        val hbaseRdd: RDD[(ImmutableBytesWritable, Result)] = spark.sparkContext.newAPIHadoopRDD(
            hbaseConf,
            classOf[TableInputFormat],
            classOf[ImmutableBytesWritable],
            classOf[Result])

        // Column family that holds all the columns read below.
        val familyName = "info"

        import spark.implicits._

        val rows = hbaseRdd.map { pair =>
            val result: Result = pair._2

            // Value of one column of the family, decoded as a string.
            def cell(qualifier: String): String =
                Bytes.toString(result.getValue(familyName.getBytes(), qualifier.getBytes()))

            (
                Bytes.toString(result.getRow), // RowKey
                cell("city_name"),
                cell("province_name"),
                cell("total_amount"),
                cell("total_count"),
                cell("sequence"),
                cell("year"),
                cell("month")
            )
        }

        val frame = rows.toDF("row","city_name","province_name","total_amount","total_count","sequence","year","month")
        frame.show()

        spark.stop()
    }

}