- The Hive external table's data is stored in HDFS at /warehouse/aaa/nsrxx. (Hive's default field delimiter is \u0001, i.e. Ctrl-A, which is why the loader below splits each line on "\001".)
- Create the HBase table nsrxx and its column family (no dedicated namespace was created, so the table lands in the default namespace); the HBase table data is stored at /apps/hbase/data/data/default/nsrxx. A sketch of the table creation follows.
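The table creation itself is not shown in this section; below is a minimal sketch using the HBase Admin API, assuming the column family name basicInfo that the loader code further down writes to (pre-splitting is omitted for brevity):

import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.hadoop.hbase.client.{ColumnFamilyDescriptorBuilder, ConnectionFactory, TableDescriptorBuilder}

object CreateNsrxxTable {
  def main(args: Array[String]): Unit = {
    val conf = HBaseConfiguration.create() // reads hbase-site.xml from the classpath
    val conn = ConnectionFactory.createConnection(conf)
    val admin = conn.getAdmin
    try {
      // No namespace prefix, so the table is created in "default",
      // matching the /apps/hbase/data/data/default/nsrxx path above.
      val tableName = TableName.valueOf("nsrxx")
      if (!admin.tableExists(tableName)) {
        val table = TableDescriptorBuilder
          .newBuilder(tableName)
          .setColumnFamily(ColumnFamilyDescriptorBuilder.of("basicInfo"))
          .build()
        admin.createTable(table)
      }
    } finally {
      admin.close()
      conn.close()
    }
  }
}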
- Add the required configuration so the Spark job can reach HBase and HDFS, as sketched below.
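The original does not spell out which configuration is meant; typically the client just needs hbase-site.xml (plus core-site.xml/hdfs-site.xml) on the classpath so it can find ZooKeeper. If those files are not available, the essentials can be set in code; the hostnames and port below are placeholders:

import org.apache.hadoop.hbase.HBaseConfiguration

object HBaseConf {
  def create() = {
    val conf = HBaseConfiguration.create()
    // Only needed when hbase-site.xml is not on the classpath;
    // node1,node2,node3 and 2181 are placeholder values.
    conf.set("hbase.zookeeper.quorum", "node1,node2,node3")
    conf.set("hbase.zookeeper.property.clientPort", "2181")
    conf
  }
}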
- Add the pom dependencies.
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.xxx</groupId>
    <artifactId>hbasebulkload</artifactId>
    <version>1.0-SNAPSHOT</version>

    <properties>
        <scala.version>2.12.10</scala.version>
        <hbase.version>2.0.2</hbase.version>
        <slf4j.version>1.7.22</slf4j.version>
        <hadoop.version>3.1.1</hadoop.version>
        <spark.version>3.0.2</spark.version>
        <maven-compiler-plugin.version>3.1</maven-compiler-plugin.version>
        <build-helper-plugin.version>3.0.0</build-helper-plugin.version>
        <scala-compiler-plugin.version>3.2.0</scala-compiler-plugin.version>
        <maven-shade-plugin.version>3.2.1</maven-shade-plugin.version>
    </properties>

    <repositories>
        <repository>
            <id>aliyunmaven</id>
            <!-- https: recent Maven versions refuse plain-http repositories -->
            <url>https://maven.aliyun.com/nexus/content/groups/public/</url>
        </repository>
    </repositories>

    <dependencies>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.12</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-client</artifactId>
            <version>${hbase.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-common</artifactId>
            <version>${hbase.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-server</artifactId>
            <version>${hbase.version}</version>
        </dependency>
        <!-- Provides HFileOutputFormat2 and LoadIncrementalHFiles -->
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-mapreduce</artifactId>
            <version>${hbase.version}</version>
        </dependency>
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-api</artifactId>
            <version>${slf4j.version}</version>
        </dependency>
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-simple</artifactId>
            <version>${slf4j.version}</version>
        </dependency>
        <dependency>
            <groupId>com.typesafe</groupId>
            <artifactId>config</artifactId>
            <version>1.3.4</version>
        </dependency>
    </dependencies>

    <build>
        <resources>
            <resource>
                <directory>src/main/resources</directory>
            </resource>
        </resources>
        <plugins>
            <!-- Compiles the Scala sources -->
            <plugin>
                <groupId>net.alchim31.maven</groupId>
                <artifactId>scala-maven-plugin</artifactId>
                <version>3.2.2</version>
                <configuration>
                    <recompileMode>incremental</recompileMode>
                </configuration>
                <executions>
                    <execution>
                        <goals>
                            <goal>compile</goal>
                            <goal>testCompile</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
            <!-- Builds the jar-with-dependencies artifact used for spark-submit -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-assembly-plugin</artifactId>
                <version>2.4.1</version>
                <configuration>
                    <descriptorRefs>
                        <descriptorRef>jar-with-dependencies</descriptorRef>
                    </descriptorRefs>
                </configuration>
                <executions>
                    <execution>
                        <id>make-assembly</id>
                        <phase>package</phase>
                        <goals>
                            <goal>single</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
</project>
- Write HbaseBulkLoader and the table-field class. Bulk loading uses Spark to write HFiles in HBase's native format and then hands them to the region servers, bypassing the Put write path (WAL and memstore).
HbaseBulkLoader
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.hbase.client.{ConnectionFactory, Table}
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.{HFileOutputFormat2, LoadIncrementalHFiles}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.{HBaseConfiguration, KeyValue, TableName}
import org.apache.hadoop.mapreduce.Job
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

import scala.collection.immutable.TreeMap

object HbaseBulkLoader {

  def main(args: Array[String]): Unit = {
    // Hardcoded for the demo; in practice these would come from args.
    // Note: the HFile staging directory must NOT be HBase's own data
    // directory (/apps/hbase/data/...) -- it is deleted below and then
    // consumed by doBulkLoad. /tmp/bulkload/nsrxx is a placeholder.
    val Array(dataType, tableName, family, inputDir, outputDir) =
      Array("1", "nsrxx", "basicInfo", "/warehouse/aaa/nsrxx", "/tmp/bulkload/nsrxx")

    // Pick the field layout for this data type.
    val fieldNames = dataType.toInt match {
      case 1 => TableFieldNames.NSRXX_FIELD_NAMES
    }

    val sc: SparkContext = {
      val sparkConf = new SparkConf()
        .setAppName(this.getClass.getSimpleName.stripSuffix("$"))
        .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      SparkContext.getOrCreate(sparkConf)
    }

    // Turn every Hive line into (rowkey, KeyValue) pairs and sort by row
    // key -- HFileOutputFormat2 requires its input in sorted order.
    val keyValuesRDD: RDD[(ImmutableBytesWritable, KeyValue)] = sc
      .textFile(inputDir)
      .filter(line => null != line && line.trim.nonEmpty)
      .flatMap(line => getLineToData(line, family, fieldNames))
      .sortByKey()

    val conf: Configuration = HBaseConfiguration.create()

    // Remove a stale staging directory left over from a previous run.
    val dfs = FileSystem.get(conf)
    val outputPath: Path = new Path(outputDir)
    if (dfs.exists(outputPath)) {
      dfs.delete(outputPath, true)
    }

    val conn = ConnectionFactory.createConnection(conf)
    val htableName = TableName.valueOf(tableName)
    val table: Table = conn.getTable(htableName)
    try {
      // configureIncrementalLoad writes the table layout (regions,
      // compression, table name) into the job's configuration; that
      // configuration -- not the original conf -- must be handed to
      // saveAsNewAPIHadoopFile.
      val job = Job.getInstance(conf)
      HFileOutputFormat2.configureIncrementalLoad(
        job,
        table,
        conn.getRegionLocator(htableName)
      )

      keyValuesRDD.saveAsNewAPIHadoopFile(
        outputDir,
        classOf[ImmutableBytesWritable],
        classOf[KeyValue],
        classOf[HFileOutputFormat2],
        job.getConfiguration
      )

      // Hand the generated HFiles over to the region servers.
      val load = new LoadIncrementalHFiles(conf)
      load.doBulkLoad(outputPath, conn.getAdmin, table, conn.getRegionLocator(htableName))
    } finally {
      table.close()
      conn.close()
      sc.stop()
    }
  }

  /** Splits one \001-delimited Hive line into one KeyValue per column. */
  def getLineToData(
      line: String,
      family: String,
      fieldNames: TreeMap[String, Int]
  ): List[(ImmutableBytesWritable, KeyValue)] = {
    val length = fieldNames.size
    val fieldValues: Array[String] = line.split("\001", -1)
    if (null == fieldValues || fieldValues.length != length) return Nil

    // Row key: the taxpayer ID, looked up by name instead of a
    // hardcoded index so it stays in sync with NSRXX_FIELD_NAMES.
    val nsrsbh: String = fieldValues(fieldNames("nsrsbh"))
    val rowKey = Bytes.toBytes(nsrsbh)
    val ibw: ImmutableBytesWritable = new ImmutableBytesWritable(rowKey)
    val columnFamily: Array[Byte] = Bytes.toBytes(family)

    fieldNames.toList.map { case (fieldName, fieldIndex) =>
      val keyValue = new KeyValue(
        rowKey,
        columnFamily,
        Bytes.toBytes(fieldName),
        Bytes.toBytes(fieldValues(fieldIndex))
      )
      (ibw, keyValue)
    }
  }
}
TableFieldNames
import scala.collection.immutable.TreeMap

object TableFieldNames {
  // TreeMap iterates in qualifier order, which HFiles also require
  // within a row.
  val NSRXX_FIELD_NAMES: TreeMap[String, Int] = TreeMap(
    ("cbrs", 0),
    ("clsj", 1),
    ("cym", 2),
    ("dz", 3),
    ("frdb", 4),
    ......
    ("nsrsbh", 11),
    ......
  )
}
- After the jar is built and submitted (e.g. via spark-submit) and the run succeeds, the data is visible in the HBase table; a quick check from code is sketched below.
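Besides scanning in the HBase shell, here is a minimal verification sketch that prints the first few rows; basicInfo and dz are the column family and one qualifier from the loader above:

import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.hadoop.hbase.client.{ConnectionFactory, Scan}
import org.apache.hadoop.hbase.util.Bytes

import scala.collection.JavaConverters._

object VerifyNsrxx {
  def main(args: Array[String]): Unit = {
    val conn = ConnectionFactory.createConnection(HBaseConfiguration.create())
    val table = conn.getTable(TableName.valueOf("nsrxx"))
    try {
      // Scan a handful of rows to confirm the bulk load landed.
      val scanner = table.getScanner(new Scan().setLimit(5))
      for (result <- scanner.asScala) {
        val row = Bytes.toString(result.getRow)
        val dz = Bytes.toString(result.getValue(Bytes.toBytes("basicInfo"), Bytes.toBytes("dz")))
        println(s"rowkey=$row dz=$dz")
      }
      scanner.close()
    } finally {
      table.close()
      conn.close()
    }
  }
}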