Loading Hive data into an HBase table with bulkload

  1. The Hive external table stores its data in HDFS under /warehouse/aaa/nsrxx.
  2. Create the HBase table nsrxx and its column family. No namespace is created, so the table lives in the default namespace and its data is stored under /apps/hbase/data/data/default/nsrxx. (A programmatic sketch of this step follows the pom below.)
  3. Add the required configuration.
  4. Add the pom dependencies:
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.xxx</groupId>
    <artifactId>hbasebulkload</artifactId>
    <version>1.0-SNAPSHOT</version>

    <properties>
        <scala.version>2.12.10</scala.version>
        <hbase.version>2.0.2</hbase.version>
        <slf4j.version>1.7.22</slf4j.version>
        <hadoop-version>3.1.1</hadoop-version>
        <spark.version>3.0.2</spark.version>
        <maven-compiler-plugin.version>3.1</maven-compiler-plugin.version>
        <build-helper-plugin.version>3.0.0</build-helper-plugin.version>
        <scala-compiler-plugin.version>3.2.0</scala-compiler-plugin.version>
        <maven-shade-plugin.version>3.2.1</maven-shade-plugin.version>
    </properties>
    <repositories>
        <repository>
            <id>aliyunmaven</id>
            <url>http://maven.aliyun.com/nexus/content/groups/public/</url>
        </repository>
    </repositories>
    <dependencies>

<!--        <dependency>-->
<!--            <groupId>org.apache.hive</groupId>-->
<!--            <artifactId>hive-jdbc</artifactId>-->
<!--            <version>3.1.0</version>-->
<!--            <exclusions>-->
<!--                <exclusion>-->
<!--                    <groupId>org.glassfish</groupId>-->
<!--                    <artifactId>javax.el</artifactId>-->
<!--                </exclusion>-->
<!--                <exclusion>-->
<!--                    <groupId>org.eclipse.jetty</groupId>-->
<!--                    <artifactId>jetty-runner</artifactId>-->
<!--                </exclusion>-->
<!--            </exclusions>-->
<!--        </dependency>-->
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.12</artifactId>
            <version>${spark.version}</version>
<!--                                    <scope>provided</scope>-->
            <!-- With provided scope the jar is only on the compile classpath: it is not available at runtime and is not packaged into the final artifact. -->
        </dependency>


        <!-- HBase -->
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-client</artifactId>
            <version>${hbase.version}</version>
            <!--                        <scope>provided</scope>-->
        </dependency>

        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-common</artifactId>
            <version>${hbase.version}</version>
            <!--                        <scope>provided</scope>-->
        </dependency>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-server</artifactId>
            <version>${hbase.version}</version>
            <!--                        <scope>provided</scope>-->
        </dependency>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-mapreduce</artifactId>
            <version>${hbase.version}</version>
            <!--                        <scope>provided</scope>-->
        </dependency>

        <!-- Logging -->
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-api</artifactId>
            <version>${slf4j.version}</version>
        </dependency>
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-simple</artifactId>
            <version>${slf4j.version}</version>
        </dependency>
        <dependency>
            <groupId>com.typesafe</groupId>
            <artifactId>config</artifactId>
            <version>1.3.4</version>
        </dependency>

    </dependencies>
    <build>
        <resources>
            <resource>
                <directory>src/main/resources</directory>
            </resource>
        </resources>
        <plugins>
            <plugin>
                <groupId>net.alchim31.maven</groupId>
                <artifactId>scala-maven-plugin</artifactId>
                <version>3.2.2</version>
                <configuration>
                    <recompileMode>incremental</recompileMode>
                </configuration>
                <executions>
                    <execution>
                        <goals>
                            <goal>compile</goal>
                            <goal>testCompile</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-assembly-plugin</artifactId>
                <version>2.4.1</version>
                <configuration>
                    <!-- get all project dependencies -->
                    <descriptorRefs>
                        <descriptorRef>jar-with-dependencies</descriptorRef>
                    </descriptorRefs>
                </configuration>
                <executions>
                    <execution>
                        <id>make-assembly</id>
                        <!-- bind to the packaging phase -->
                        <phase>package</phase>
                        <goals>
                            <goal>single</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>


</project>
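For step 2, the nsrxx table and its basicInfo column family can be created either from the HBase shell (create 'nsrxx', 'basicInfo') or programmatically with the same hbase-client dependency declared above. The following is a minimal sketch of the programmatic route, assuming the default namespace and a single column family named basicInfo; the object name CreateNsrxxTable is only illustrative.

import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.hadoop.hbase.client.{ColumnFamilyDescriptorBuilder, ConnectionFactory, TableDescriptorBuilder}

// Illustrative sketch: create the nsrxx table with the basicInfo column family
// in the default namespace, if it does not exist yet.
object CreateNsrxxTable {
  def main(args: Array[String]): Unit = {
    val conn = ConnectionFactory.createConnection(HBaseConfiguration.create())
    val admin = conn.getAdmin
    val tableName = TableName.valueOf("nsrxx")
    if (!admin.tableExists(tableName)) {
      val descriptor = TableDescriptorBuilder
        .newBuilder(tableName)
        .setColumnFamily(ColumnFamilyDescriptorBuilder.of("basicInfo"))
        .build()
      admin.createTable(descriptor)
    }
    admin.close()
    conn.close()
  }
}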
  5. Write HbaseBulkLoader and the table field-name class TableFieldNames.
    HbaseBulkLoader:
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.hbase.client.{ConnectionFactory, Table}
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.{
  HFileOutputFormat2,
  LoadIncrementalHFiles
}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.{HBaseConfiguration, KeyValue, TableName}
import org.apache.hadoop.mapreduce.Job
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
import scala.collection.immutable.TreeMap


/**
 * Converts the text files holding the Hive data into HFiles and loads them into the HBase table.
 */
object HbaseBulkLoader {
  def main(args: Array[String]): Unit = {
    // data type, HBase table name, column family, input path and output path
    val Array(dataType, tableName, family, inputDir, outputDir)
    = Array("1", "nsrxx", "basicInfo", "/warehouse/aaa/nsrxx", "/apps/hbase/data/data/default/nsrxx")


    val fieldNames = dataType.toInt match {
      case 1 => TableFieldNames.NSRXX_FIELD_NAMES
    }

    val sc: SparkContext = {

      val sparkConf = new SparkConf()
        .setAppName(this.getClass.getSimpleName.stripSuffix("$"))
        .set("spark.serializer",
          "org.apache.spark.serializer.KryoSerializer")
      SparkContext.getOrCreate(sparkConf)
    }

    // 2. Read the text files
    val keyValuesRDD: RDD[(ImmutableBytesWritable, KeyValue)] = sc
      .textFile(inputDir)
      // filter out null lines
      .filter(line => null != line)
      .flatMap { line => getLineToData(line, family, fieldNames) }
      .sortByKey()

    // a. Load the HBase configuration
    val conf: Configuration = HBaseConfiguration.create()
    // Without the following line the job fails with a "table name empty" error
    conf.set("hbase.mapreduce.hfileoutputformat.table.name", "nsrxx")

    // b. Delete the output directory if it already exists
    val dfs = FileSystem.get(conf)
    val outputPath: Path = new Path(outputDir)
    if (dfs.exists(outputPath)) {
      dfs.delete(outputPath, true)
    }

    val conn = ConnectionFactory.createConnection(conf)
    val htableName = TableName.valueOf(tableName)
    val table: Table = conn.getTable(htableName)
    HFileOutputFormat2.configureIncrementalLoad(
      Job.getInstance(conf),
      table,
      conn.getRegionLocator(htableName)
    )
    // Keep dfs.close() commented out when the jar is submitted to Spark on YARN: the FileSystem
    // instance is shared there, and closing it here later triggers a "FileSystem closed" error.
    // (Closing it is only safe when running locally.)
    //    dfs.close()


    // 3. Save the data as HFiles
    keyValuesRDD.saveAsNewAPIHadoopFile(
      outputDir,
      classOf[ImmutableBytesWritable],
      classOf[KeyValue],
      classOf[HFileOutputFormat2],
      conf
    )

    // 4. Load the generated HFiles into the HBase table
    val load = new LoadIncrementalHFiles(conf)
    load.doBulkLoad(outputPath, conn.getAdmin, table,
      conn.getRegionLocator(htableName))
    // Job finished; release resources
    sc.stop()
  }

  def getLineToData(line: String, family: String, fieldNames: TreeMap[String, Int]): List[(ImmutableBytesWritable, KeyValue)] = {
    val length = fieldNames.size
    // Split the fields; this Hive table uses \001 as the field delimiter
    val fieldValues: Array[String] = line.split("\001", -1)
    // For each split line, derive the rowkey and build the KeyValue objects to be stored in HBase
    if (null == fieldValues || fieldValues.length != length) return Nil
    // After splitting, the 16th field (index 15, nsrsbh) is used as the rowkey
    val nsrsbh: String = fieldValues(15)
    val rowKey = Bytes.toBytes(nsrsbh)
    val ibw: ImmutableBytesWritable = new ImmutableBytesWritable(rowKey)

    val columnFamily: Array[Byte] = Bytes.toBytes(family)

    fieldNames.toList.map { case (fieldName, fieldIndex) =>
      // Build a KeyValue for this column
      val keyValue = new KeyValue(
        rowKey,
        columnFamily,
        Bytes.toBytes(fieldName),
        Bytes.toBytes(fieldValues(fieldIndex))
      )
      // Return the (rowkey, KeyValue) pair
      (ibw, keyValue)
    }

  }

}

import scala.collection.immutable.TreeMap
// TreeMap is used so that the generated HFile entries are written in lexicographic order:
// not only the rowkeys but also the column qualifiers must be sorted, otherwise the bulk load fails.
object TableFieldNames {
  val NSRXX_FIELD_NAMES: TreeMap[String, Int] = TreeMap(
    ("cbrs", 0),
    ("clsj", 1),
    ("cym", 2),
    ("dz", 3),
    ("frdb", 4)
    ......
    ("nsrsbh", 11)
    ......
      )
}
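The sorted-iteration property that makes TreeMap the right choice here can be checked in isolation with a throwaway snippet (the object name and field subset below are illustrative only): no matter what order the fields are listed in, TreeMap hands them back in lexicographic key order, which is the per-row qualifier order the HFile writer expects.

import scala.collection.immutable.TreeMap

// Standalone check: keys are inserted unsorted, but TreeMap iterates them in
// lexicographic order, so KeyValues built from it come out in sorted qualifier order.
object TreeMapOrderCheck {
  def main(args: Array[String]): Unit = {
    val fields = TreeMap("dz" -> 3, "cbrs" -> 0, "cym" -> 2, "clsj" -> 1)
    println(fields.keys.mkString(", ")) // prints: cbrs, clsj, cym, dz
  }
}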

  6. After the packaged jar has been submitted and run successfully, the loaded data can be seen in the HBase table (a small verification sketch follows).

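To confirm the load without opening the HBase shell, a few rows can be scanned back with the same hbase-client API. This is a minimal verification sketch; the object name VerifyNsrxx and the 5-row limit are arbitrary choices, not part of the job itself.

import org.apache.hadoop.hbase.{CellUtil, HBaseConfiguration, TableName}
import org.apache.hadoop.hbase.client.{ConnectionFactory, Scan}
import org.apache.hadoop.hbase.util.Bytes
import scala.collection.JavaConverters._

// Illustrative sketch: scan a handful of rows from nsrxx and print each cell,
// just to confirm the bulk-loaded data is visible.
object VerifyNsrxx {
  def main(args: Array[String]): Unit = {
    val conn = ConnectionFactory.createConnection(HBaseConfiguration.create())
    val table = conn.getTable(TableName.valueOf("nsrxx"))
    val scanner = table.getScanner(new Scan().setLimit(5))
    scanner.asScala.foreach { result =>
      val row = Bytes.toString(result.getRow)
      result.rawCells().foreach { cell =>
        val qualifier = Bytes.toString(CellUtil.cloneQualifier(cell))
        val value = Bytes.toString(CellUtil.cloneValue(cell))
        println(s"$row  $qualifier=$value")
      }
    }
    scanner.close()
    table.close()
    conn.close()
  }
}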
