Preliminaries:
1. Use Spark to process structured data and rely on HBase for deduplication (rows that share a rowkey collapse into a single row).
2. General approach: shape the data into structured form -> have Spark call the Hadoop API to write it to HDFS as HFiles -> bulk-load the HFiles into HBase.
The following uses a CDH 5.16.2 production environment as an example:
Hadoop version: 2.6.0
HBase version: 1.6.0
Spark2 version: 2.4.0
ZooKeeper version: 3.4.5
1. Required dependencies:
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>cn.spark.hbase</groupId>
<artifactId>hbase_test</artifactId>
<version>1.0-SNAPSHOT</version>
<properties>
<hadoop.version>2.6.0</hadoop.version>
<hbase.version>1.6.0</hbase.version>
<spark.version>2.4.0</spark.version>
</properties>
<dependencies>
<!-- spark-->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
<version>${spark.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-hive_2.11</artifactId>
<version>${spark.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<!--spark-sql-->
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
<version>${spark.version}</version>
<scope>provided</scope>
</dependency>
<!-- Hadoop client -->
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>${hadoop.version}</version>
<scope>provided</scope>
</dependency>
<!-- HBase -->
<dependency>
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-common</artifactId>
<version>${hbase.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-client</artifactId>
<version>${hbase.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-server</artifactId>
<version>${hbase.version}</version>
<scope>provided</scope>
</dependency>
<!--lombok-->
<!-- <dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
<version>1.18.2</version>
</dependency>-->
<!-- <dependency>
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-mapreduce</artifactId>
<version>2.0.4</version>
</dependency>-->
<!-- https://mvnrepository.com/artifact/org.glassfish/javax.el -->
<dependency>
<groupId>org.glassfish</groupId>
<artifactId>javax.el</artifactId>
<version>3.0.1-b06</version>
</dependency>
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>fastjson</artifactId>
<version>1.2.3</version>
</dependency>
</dependencies>
<build>
<!-- <sourceDirectory>src/main/scala</sourceDirectory>-->
<!--<testSourceDirectory>src/test/scala</testSourceDirectory>-->
<plugins>
<!-- Scala compiler plugin -->
<plugin>
<groupId>net.alchim31.maven</groupId>
<artifactId>scala-maven-plugin</artifactId>
<version>3.2.2</version>
<executions>
<execution>
<goals>
<goal>compile</goal>
<goal>testCompile</goal>
</goals>
<configuration>
<args>
<arg>-dependencyfile</arg>
<arg>${project.build.directory}/.scala_dependencies</arg>
</args>
</configuration>
</execution>
</executions>
</plugin>
<plugin>
<!-- Java compiler plugin -->
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.5.1</version>
</plugin>
<!-- Shade plugin for packaging the jar -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-shade-plugin</artifactId>
<version>2.4.3</version>
<executions>
<execution>
<phase>package</phase>
<goals>
<goal>shade</goal>
</goals>
<configuration>
<filters>
<filter>
<artifact>*:*</artifact>
<excludes>
<exclude>META-INF/*.SF</exclude>
<exclude>META-INF/*.DSA</exclude>
<exclude>META-INF/*.RSA</exclude>
</excludes>
</filter>
</filters>
<transformers>
<transformer
implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
<mainClass/>
</transformer>
</transformers>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
</project>
2. Code:
package cm.sean.hbase
import java.net.URI
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.hbase.client.ConnectionFactory
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.{HFileOutputFormat2, LoadIncrementalHFiles}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.{HBaseConfiguration, KeyValue, TableName}
import org.apache.hadoop.mapreduce.Job
import org.apache.spark.sql.SparkSession
/**
* @date: 2020/5/6 18:09
* @author: sean.li
* @ClassPath: cm.sean.hbase.HbaseTest
* @description:
*/
object HbaseTest {
  /**
   * Replace null or empty fields with the literal string "NULL"
   * @param str the raw field value
   * @return the original value, or "NULL" if it was null or empty
   */
  def nullHandle(str: String): String = {
    if (str == null || "".equals(str)) "NULL" else str
  }
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("import")
      .master("local[*]")
      .enableHiveSupport()
      .getOrCreate()
    // Set the log level
    spark.sparkContext.setLogLevel("warn")
    // Read the source data from Hive (the files live on HDFS). The Hive table here is
    // external, but a managed (internal) table works exactly the same way.
    val data = spark.sql("select id,name,age from sean.test")
    data.show(10, false)
    // id becomes the rowkey, "cf" is the column family, name and age are the columns
    val dataRdd = data.rdd.flatMap(row => {
      val rowkey = row.getAs[Long]("id").toString
      Array(
        (rowkey, ("cf", "name", nullHandle(row.getAs[String]("name")))),
        (rowkey, ("cf", "age", nullHandle(row.getAs[Long]("age").toString)))
      )
    })
    // The rowkey, column family and column qualifier must be sorted as a whole before the
    // HFiles are written, so filter out null rowkeys and sort first to avoid load failures.
    val rdds = dataRdd.filter(x => x._1 != null).sortBy(x => (x._1, x._2._1, x._2._2)).map(x => {
      // Convert the RDD into the shape an HFile expects: the key must be an
      // ImmutableBytesWritable instance and the value a KeyValue instance.
      val rowKey = Bytes.toBytes(x._1)
      val family = Bytes.toBytes(x._2._1)
      val column = Bytes.toBytes(x._2._2)
      val value = Bytes.toBytes(x._2._3)
      (new ImmutableBytesWritable(rowKey), new KeyValue(rowKey, family, column, value))
    })
    rdds.take(10).foreach(print)
    // Temporary output directory on HDFS for the generated HFiles
    val tmpdir = "/tmp/test"
    val hconf = new Configuration()
    hconf.set("fs.defaultFS", "hdfs://mycluster")
    // The third argument is the user the FileSystem is accessed as; replace it with your own user if needed
    val fs = FileSystem.get(new URI("hdfs://mycluster"), hconf, "hdfs")
    // The HFile output directory must not exist yet, so delete it if it is already there
    if (fs.exists(new Path(tmpdir))) {
      println("Deleting existing temporary directory")
      fs.delete(new Path(tmpdir), true)
    }
    // Create the HBase configuration
    val conf = HBaseConfiguration.create()
    conf.set("hbase.zookeeper.quorum", "cdh01,cdh02,cdh03")
    conf.set("hbase.zookeeper.property.clientPort", "2181")
    // Raise this limit so the import does not fail when too many HFiles are generated per region and family
    conf.setInt("hbase.mapreduce.bulkload.max.hfiles.perRegion.perFamily", 5000)
    // Once this call finishes, the HFiles have been written under tmpdir
    rdds.saveAsNewAPIHadoopFile(tmpdir,
      classOf[ImmutableBytesWritable],
      classOf[KeyValue],
      classOf[HFileOutputFormat2],
      conf)
    // Now load the generated HFiles into HBase; everything below uses the HBase API
    val load = new LoadIncrementalHFiles(conf)
    // Target HBase table name
    val tableName = "sean.li"
    // Create the HBase connection; the default configuration files supply the HBase master address
    val conn = ConnectionFactory.createConnection(conf)
    // Get the table by name
    val table = conn.getTable(TableName.valueOf(tableName))
    try {
      // Get the region distribution of the HBase table
      val regionLocator = conn.getRegionLocator(TableName.valueOf(tableName))
      // Create a Hadoop MapReduce job
      val job = Job.getInstance(conf)
      // Any job name will do
      job.setJobName(this.getClass.getSimpleName)
      // Important: because HFiles are being generated, the output key must be ImmutableBytesWritable
      job.setMapOutputKeyClass(classOf[ImmutableBytesWritable])
      // and the output value must be KeyValue
      job.setMapOutputValueClass(classOf[KeyValue])
      // Configure HFileOutputFormat2 for the target table. (Many examples apply this *before*
      // saveAsNewAPIHadoopFile and pass job.getConfiguration to it, so the HFiles pick up the
      // table's compression and encoding settings; the order used here still loads correctly.)
      HFileOutputFormat2.configureIncrementalLoad(job, table, regionLocator)
      // Run the bulk load from the directory written above
      load.doBulkLoad(new Path("hdfs://mycluster/tmp/test"), conn.getAdmin, table, regionLocator)
    } finally {
      table.close()
      conn.close()
    }
    spark.close()
  }
}
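Note that the job assumes the target table already exists in HBase. If it does not, a minimal sketch of creating it with the HBase 1.x client API, assuming the table name sean.li and column family cf used above:

import org.apache.hadoop.hbase.{HBaseConfiguration, HColumnDescriptor, HTableDescriptor, TableName}
import org.apache.hadoop.hbase.client.ConnectionFactory

object CreateTargetTable {
  def main(args: Array[String]): Unit = {
    val conf = HBaseConfiguration.create()
    conf.set("hbase.zookeeper.quorum", "cdh01,cdh02,cdh03")
    conf.set("hbase.zookeeper.property.clientPort", "2181")
    val conn = ConnectionFactory.createConnection(conf)
    val admin = conn.getAdmin
    val tableName = TableName.valueOf("sean.li")
    try {
      if (!admin.tableExists(tableName)) {
        // Single column family "cf", matching the KeyValues built in the job above
        val desc = new HTableDescriptor(tableName)
        desc.addFamily(new HColumnDescriptor("cf"))
        admin.createTable(desc)
      }
    } finally {
      admin.close()
      conn.close()
    }
  }
}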
3. Results:
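The load can be verified by scanning the target table. A minimal verification sketch, assuming the same ZooKeeper quorum, table name and column family as above:

import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.hadoop.hbase.client.{ConnectionFactory, Scan}
import org.apache.hadoop.hbase.util.Bytes
import scala.collection.JavaConverters._

object ScanCheck {
  def main(args: Array[String]): Unit = {
    val conf = HBaseConfiguration.create()
    conf.set("hbase.zookeeper.quorum", "cdh01,cdh02,cdh03")
    conf.set("hbase.zookeeper.property.clientPort", "2181")
    val conn = ConnectionFactory.createConnection(conf)
    val table = conn.getTable(TableName.valueOf("sean.li"))
    val scanner = table.getScanner(new Scan())
    try {
      // Print the first 10 rows: rowkey, cf:name, cf:age
      scanner.iterator().asScala.take(10).foreach { result =>
        val key  = Bytes.toString(result.getRow)
        val name = Bytes.toString(result.getValue(Bytes.toBytes("cf"), Bytes.toBytes("name")))
        val age  = Bytes.toString(result.getValue(Bytes.toBytes("cf"), Bytes.toBytes("age")))
        println(s"$key\t$name\t$age")
      }
    } finally {
      scanner.close()
      table.close()
      conn.close()
    }
  }
}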
4. Map the HBase table to Hive
CREATE EXTERNAL TABLE li(
  key string,
  name string comment "name",
  age string comment "age"
)
STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
WITH SERDEPROPERTIES ("hbase.columns.mapping" = ":key,cf:name,cf:age")
TBLPROPERTIES("hbase.table.name" = "sean.li");
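With the mapping table in place (and the hive-hbase-handler and HBase client jars from section 5 on the Spark classpath), the bulk-loaded data can be queried through Hive like any other table. A minimal check from a Spark session, assuming the table li created by the DDL above:

import org.apache.spark.sql.SparkSession

object QueryMappedTable {
  def main(args: Array[String]): Unit = {
    // Hive support is required so Spark can resolve the HBaseStorageHandler table
    val spark = SparkSession.builder()
      .appName("query-hbase-via-hive")
      .enableHiveSupport()
      .getOrCreate()
    // Read the HBase-backed Hive table like any other Hive table
    spark.sql("select key, name, age from li limit 10").show(10, false)
    spark.sql("select count(*) from li").show()
    spark.close()
  }
}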
5. Issues encountered
5.1 Jar dependency problems
Fix:
1. Copy the HBase jars into the Spark2 jars directory.
2. Copy htrace-core-3.2.0-incubating.jar from the CDH jars directory into the Spark2 jars directory as well.
3. If that still fails, also copy the jars above into the Hadoop lib directory.
4. Add to hadoop-env.sh: export HADOOP_CLASSPATH=$HADOOP_CLASSPATH:/opt/cloudera/parcels/CDH/lib/hbase/*
The exact commands used:
cp /opt/cloudera/parcels/CDH/jars/hive-hbase-handler-1.1.0-cdh5.16.2.jar /opt/cloudera/parcels/SPARK2/lib/spark2/jars
cp /opt/cloudera/parcels/CDH/jars/metrics-core-2.2.0.jar /opt/cloudera/parcels/SPARK2/lib/spark2/jars
cp /opt/cloudera/parcels/CDH/jars/htrace-core-3.2.0-incubating.jar /opt/cloudera/parcels/SPARK2/lib/spark2/jars
cp /opt/cloudera/parcels/CDH/lib/hbase/hbase-annotations-1.2.0-cdh5.16.2.jar /opt/cloudera/parcels/SPARK2/lib/spark2/jars/
cp /opt/cloudera/parcels/CDH/lib/hbase/hbase-annotations.jar /opt/cloudera/parcels/SPARK2/lib/spark2/jars/
cp /opt/cloudera/parcels/CDH/lib/hbase/hbase-client-1.2.0-cdh5.16.2.jar /opt/cloudera/parcels/SPARK2/lib/spark2/jars/
cp /opt/cloudera/parcels/CDH/lib/hbase/hbase-client.jar /opt/cloudera/parcels/SPARK2/lib/spark2/jars/
cp /opt/cloudera/parcels/CDH/lib/hbase/hbase-common-1.2.0-cdh5.16.2.jar /opt/cloudera/parcels/SPARK2/lib/spark2/jars/
cp /opt/cloudera/parcels/CDH/lib/hbase/hbase-common.jar /opt/cloudera/parcels/SPARK2/lib/spark2/jars/
cp /opt/cloudera/parcels/CDH/lib/hbase/hbase-examples-1.2.0-cdh5.16.2.jar /opt/cloudera/parcels/SPARK2/lib/spark2/jars/
cp /opt/cloudera/parcels/CDH/lib/hbase/hbase-examples.jar /opt/cloudera/parcels/SPARK2/lib/spark2/jars/
cp /opt/cloudera/parcels/CDH/lib/hbase/hbase-external-blockcache-1.2.0-cdh5.16.2.jar /opt/cloudera/parcels/SPARK2/lib/spark2/jars/
cp /opt/cloudera/parcels/CDH/lib/hbase/hbase-external-blockcache.jar /opt/cloudera/parcels/SPARK2/lib/spark2/jars/
cp /opt/cloudera/parcels/CDH/lib/hbase/hbase-hadoop2-compat-1.2.0-cdh5.16.2.jar /opt/cloudera/parcels/SPARK2/lib/spark2/jars/
cp /opt/cloudera/parcels/CDH/lib/hbase/hbase-hadoop2-compat.jar /opt/cloudera/parcels/SPARK2/lib/spark2/jars/
cp /opt/cloudera/parcels/CDH/lib/hbase/hbase-hadoop-compat-1.2.0-cdh5.16.2.jar /opt/cloudera/parcels/SPARK2/lib/spark2/jars/
cp /opt/cloudera/parcels/CDH/lib/hbase/hbase-hadoop-compat.jar /opt/cloudera/parcels/SPARK2/lib/spark2/jars/
cp /opt/cloudera/parcels/CDH/lib/hbase/hbase-it-1.2.0-cdh5.16.2.jar /opt/cloudera/parcels/SPARK2/lib/spark2/jars/
cp /opt/cloudera/parcels/CDH/lib/hbase/hbase-it.jar /opt/cloudera/parcels/SPARK2/lib/spark2/jars/
cp /opt/cloudera/parcels/CDH/lib/hbase/hbase-prefix-tree-1.2.0-cdh5.16.2.jar /opt/cloudera/parcels/SPARK2/lib/spark2/jars/
cp /opt/cloudera/parcels/CDH/lib/hbase/hbase-prefix-tree.jar /opt/cloudera/parcels/SPARK2/lib/spark2/jars/
cp /opt/cloudera/parcels/CDH/lib/hbase/hbase-procedure-1.2.0-cdh5.16.2.jar /opt/cloudera/parcels/SPARK2/lib/spark2/jars/
cp /opt/cloudera/parcels/CDH/lib/hbase/hbase-procedure.jar /opt/cloudera/parcels/SPARK2/lib/spark2/jars/
cp /opt/cloudera/parcels/CDH/lib/hbase/hbase-protocol-1.2.0-cdh5.16.2.jar /opt/cloudera/parcels/SPARK2/lib/spark2/jars/
cp /opt/cloudera/parcels/CDH/lib/hbase/hbase-protocol.jar /opt/cloudera/parcels/SPARK2/lib/spark2/jars/
cp /opt/cloudera/parcels/CDH/lib/hbase/hbase-resource-bundle-1.2.0-cdh5.16.2.jar /opt/cloudera/parcels/SPARK2/lib/spark2/jars/
cp /opt/cloudera/parcels/CDH/lib/hbase/hbase-resource-bundle.jar /opt/cloudera/parcels/SPARK2/lib/spark2/jars/
cp /opt/cloudera/parcels/CDH/lib/hbase/hbase-rest-1.2.0-cdh5.16.2.jar /opt/cloudera/parcels/SPARK2/lib/spark2/jars/
cp /opt/cloudera/parcels/CDH/lib/hbase/hbase-rest.jar /opt/cloudera/parcels/SPARK2/lib/spark2/jars/
cp /opt/cloudera/parcels/CDH/lib/hbase/hbase-rsgroup-1.2.0-cdh5.16.2.jar /opt/cloudera/parcels/SPARK2/lib/spark2/jars/
cp /opt/cloudera/parcels/CDH/lib/hbase/hbase-rsgroup.jar /opt/cloudera/parcels/SPARK2/lib/spark2/jars/
cp /opt/cloudera/parcels/CDH/lib/hbase/hbase-server-1.2.0-cdh5.16.2.jar /opt/cloudera/parcels/SPARK2/lib/spark2/jars/
cp /opt/cloudera/parcels/CDH/lib/hbase/hbase-server.jar /opt/cloudera/parcels/SPARK2/lib/spark2/jars/
cp /opt/cloudera/parcels/CDH/lib/hbase/hbase-shell-1.2.0-cdh5.16.2.jar /opt/cloudera/parcels/SPARK2/lib/spark2/jars/
cp /opt/cloudera/parcels/CDH/lib/hbase/hbase-shell.jar /opt/cloudera/parcels/SPARK2/lib/spark2/jars/
cp /opt/cloudera/parcels/CDH/lib/hbase/hbase-spark-1.2.0-cdh5.16.2.jar /opt/cloudera/parcels/SPARK2/lib/spark2/jars/
cp /opt/cloudera/parcels/CDH/lib/hbase/hbase-spark.jar /opt/cloudera/parcels/SPARK2/lib/spark2/jars/
cp /opt/cloudera/parcels/CDH/lib/hbase/hbase-thrift-1.2.0-cdh5.16.2.jar /opt/cloudera/parcels/SPARK2/lib/spark2/jars/
cp /opt/cloudera/parcels/CDH/lib/hbase/hbase-thrift.jar /opt/cloudera/parcels/SPARK2/lib/spark2/jars/