scala版本,spark将HDFS上的数据同步到hbase

最新推荐文章于 2025-05-14 16:24:51 发布

原创

最新推荐文章于 2025-05-14 16:24:51 发布 · 2.3k 阅读

6 ·

CC 4.0 BY-SA版权

文章标签：

#hbase #spark #hdfs

1 spark将HDFS上的数据同步到hbase

我们知道，HBase 底层的数据是以 HFile 文件的形式存储在 HDFS 上的。
将 HDFS 上的数据转换为 HFile，再通过 bulkload 快速导入 HBase。当然，这里面有很多坑。
比如 : 版本不一致.
还有就是本地版本和集群版本不一致，导致 class 不存在。写 HBase 代码最好使用 Java 或 Scala。我这里使用的是 Spark 2.4 + HBase 2.1，切记不同版本使用的方法不一样。
我这里也保留了一些老的class希望对你有用。

2 代码

话不多说直接上代码。我这里只是同步，所以都是写在了一个类上面。

package com.test.task

import java.net.URI

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{
   
   FileSystem, Path}
import org.apache.hadoop.hbase.client.{
   
   ConnectionFactory, Result, Table}
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.{
   
   HFileOutputFormat2, LoadIncrementalHFiles, TableOutputFormat}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase._
import org.apache.hadoop.mapreduce.Job
import org.apache.spark.{
   
   SparkConf, SparkContext}

/**
 * Author: say
 * Description:
 * Create: 2021/5/1 13:14
 */
object Hdfs2Hbase {
   
   

    var cdhPath = ""
    var zookeeperQuorum = ""
    var dataSourcePath = ""
    var hdfsRootPath = ""
    var hFilePath = ""
    val tableName = "api:real_time_label"
    val familyName = "baseInfo"

    def main(args: Array[String]): Unit = {
   
   

        //设置用户
        System.setProperty("HADOOP_USER_NAME", "say")

        //  运行shell 传参执行环境
        //  生产运行记得设置运行参数
        if (args.length >= 1) {
   
   
            println("设置参数,运行环境:"+args(0))
            if ("online".equals(args(0))) {
   
   
                cdhPath = "hdfs://say-hdfs-cluster"
                zookeeperQuorum = "192.168.1.101:2181,192.168.1.102:2181,192.168.1.103:2181"
                dataSourcePath = cdhPath+"/user/say/hbase/txt/"
                hdfsRootPath = cdhPath+"/user/say/"
                hFilePath = cdhPath+"/user/say/hbase/hfile"
            } else {
   
   
                cdhPath = "hdfs://say-cdh-master02.net:8020"
                zookeeperQuorum = "192.168.2.101:2181,192.168.2.102:2181,192.168.2.103:2181"
                dataSourcePath = cdhPath+"/user/say/hbase/txt/"
                hdfsRootPath = cdhPath+"/user/say/"
                hFilePath = cdhPath+"/user/say/hbase/hfile"
            }
        } else {
   
   
            println("运行环境: test")
            cdhPath = "hdfs://say-cdh-master02.net:8020"
            zookeeperQuorum = "192.168.1.101:2181,192.168.1.102:2181,192.168.1.103:2181"
            dataSourcePath = cdhPath+"/user/say/hbase/txt/"
            hdfsRootPath = cdhPath+"/user/say/"
            hFilePath = cdhPath+"/user/say/hbase/hfile"
        }

        val sparkConf = new SparkConf()
          .setAppName("hive2Hbase")
//          .setMaster("local[*]")    //本地运行打开,也可以设置参数 ,记得设置运行参数
        sparkConf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
        sparkConf.registerKryoClasses(Array(classOf[org.apache.hadoop.hbase.io.ImmutableBytesWritable]))

        val sc = new SparkContext(sparkConf)
        val hadoopConf = new Configuration()
        hadoopConf.set("fs.defaultFS", hdfsRootPath)
        hadoopConf.set("fs.hdfs.impl", "org.apache.hadoop.hdfs.DistributedFileSystem")
        val fileSystem = FileSystem.get(new URI(cdhPath), hadoopConf)
        val hbaseConf = HBaseConfiguration.create(hadoopConf)
        println("我在这里")
        hbaseConf.set(HConstants.ZOOKEEPER_QUORUM, zookeeperQuorum)
        hbaseConf.set(TableOutputFormat.OUTPUT_TABLE, tableName)
        val hbaseConn = ConnectionFactory.createConnection(hbaseConf)

        val admin = hbaseConn.getAdmin
        println("连接成功啦~")

        // 0. 准备程序运行的环境
        // 如果 HBase 表不存在，就创建一个新表
        if (!admin.tableExists(TableName.valueOf(tableName))) {
   
   
            val desc = new HTableDescriptor(TableName.valueOf(tableName))
            val hcd = new HColumnDescriptor(familyName)
            desc.addFamily(hcd)
            admin.createTable(desc)
        }
        // 如果存放 HFile文件的路径已经存在，就删除掉
        if (fileSystem.exists(new Path(hFilePath))) {
   
   
            fileSystem.delete(new Path(hFilePath), true)
        }

        // 1. 清洗需要存放到 HFile 中的数据，rowKey 一定要排序，否则会报错：
        // java.io.IOException: Added a key not lexically larger than previous.

        val data = sc.textFile(dataSourcePath)
          .map(str => {
   
   
              val valueStr: Array[String] = str.split("\\|")

最低0.47元/天解锁文章