Writing to HBase from Spark (bulk load)

Preliminaries:
1. Use Spark on structured data and rely on HBase for deduplication.
2. Overall approach: process the data into structured form -> have Spark call the Hadoop API to write the data to HDFS as HFiles -> bulk-load those HFiles into HBase.
The example below is from a CDH 5.16.2 production environment:
Hadoop version: 2.6.0
HBase version: 1.6.0
Spark2 version: 2.4.0
ZooKeeper version: 3.4.5
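
One prerequisite the post does not show explicitly: the target HBase table (sean.li with column family cf, used by the code in section 2) has to exist before the bulk load. Below is a minimal sketch of creating it with the HBase 1.x Admin API, reusing the ZooKeeper quorum from section 2; the object name is arbitrary and the table is not pre-split here, so adjust both for your cluster.

import org.apache.hadoop.hbase.client.ConnectionFactory
import org.apache.hadoop.hbase.{HBaseConfiguration, HColumnDescriptor, HTableDescriptor, TableName}

object CreateHbaseTable {
  def main(args: Array[String]): Unit = {
    val conf = HBaseConfiguration.create()
    conf.set("hbase.zookeeper.quorum", "cdh01,cdh02,cdh03")
    conf.set("hbase.zookeeper.property.clientPort", "2181")

    val conn = ConnectionFactory.createConnection(conf)
    val admin = conn.getAdmin
    try {
      val tableName = TableName.valueOf("sean.li")
      if (!admin.tableExists(tableName)) {
        // Single column family "cf", matching the columns written by the bulk-load job
        val desc = new HTableDescriptor(tableName)
        desc.addFamily(new HColumnDescriptor("cf"))
        admin.createTable(desc)
      }
    } finally {
      admin.close()
      conn.close()
    }
  }
}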

1. Required dependencies

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>


    <groupId>cn.spark.hbase</groupId>
    <artifactId>hbase_test</artifactId>
    <version>1.0-SNAPSHOT</version>
    <properties>
        <hadoop.version>2.6.0</hadoop.version>
        <hbase.version>1.6.0</hbase.version>
        <spark.version>2.4.0</spark.version>
    </properties>




    <dependencies>
        <!--    spark-->
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.11</artifactId>
            <version>${spark.version}</version>
            <scope>provided</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-hive_2.11</artifactId>
            <version>${spark.version}</version>
            <scope>provided</scope>
        </dependency>
        <dependency>
            <!--spark-sql-->
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_2.11</artifactId>
            <version>${spark.version}</version>
            <scope>provided</scope>


        </dependency>


        <!-- Hadoop -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>${hadoop.version}</version>
            <scope>provided</scope>
        </dependency>




        <!-- HBase -->
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-common</artifactId>
            <version>${hbase.version}</version>
            <scope>provided</scope>
        </dependency>


        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-client</artifactId>
            <version>${hbase.version}</version>
            <scope>provided</scope>
        </dependency>


        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-server</artifactId>
            <version>${hbase.version}</version>
            <scope>provided</scope>
        </dependency>


        <!--lombok-->
       <!-- <dependency>
            <groupId>org.projectlombok</groupId>
            <artifactId>lombok</artifactId>
            <version>1.18.2</version>
        </dependency>-->
       <!-- <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-mapreduce</artifactId>
            <version>2.0.4</version>
        </dependency>-->
        <!-- https://mvnrepository.com/artifact/org.glassfish/javax.el -->
        <dependency>
            <groupId>org.glassfish</groupId>
            <artifactId>javax.el</artifactId>
            <version>3.0.1-b06</version>
        </dependency>


        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>1.2.3</version>
        </dependency>


    </dependencies>


    <build>
       <!-- <sourceDirectory>src/main/scala</sourceDirectory>-->
        <!--<testSourceDirectory>src/test/scala</testSourceDirectory>-->
        <plugins>
            <!-- Scala compiler plugin -->
            <plugin>
                <groupId>net.alchim31.maven</groupId>
                <artifactId>scala-maven-plugin</artifactId>
                <version>3.2.2</version>
                <executions>
                    <execution>
                        <goals>
                            <goal>compile</goal>
                            <goal>testCompile</goal>
                        </goals>
                        <configuration>
                            <args>
                                <arg>-dependencyfile</arg>
                                <arg>${project.build.directory}/.scala_dependencies</arg>
                            </args>
                        </configuration>
                    </execution>
                </executions>
            </plugin>
            <plugin>
                <!-- Java compiler plugin -->
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.5.1</version>
            </plugin>
            <!-- Shade plugin for building the fat jar -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-shade-plugin</artifactId>
                <version>2.4.3</version>
                <executions>
                    <execution>
                        <phase>package</phase>
                        <goals>
                            <goal>shade</goal>
                        </goals>
                        <configuration>
                            <filters>
                                <filter>
                                    <artifact>*:*</artifact>
                                    <excludes>
                                        <exclude>META-INF/*.SF</exclude>
                                        <exclude>META-INF/*.DSA</exclude>
                                        <exclude>META-INF/*.RSA</exclude>
                                    </excludes>
                                </filter>
                            </filters>
                            <transformers>
                                <transformer
                                        implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
                                    <mainClass/>
                                </transformer>
                            </transformers>
                        </configuration>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
</project>

2. Code

package cm.sean.hbase


import java.net.URI


import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.hbase.client.ConnectionFactory
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.{HFileOutputFormat2, LoadIncrementalHFiles}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.{HBaseConfiguration, KeyValue, TableName}
import org.apache.hadoop.mapreduce.Job
import org.apache.spark.sql.SparkSession
/**
 * @date: 2020/5/6 18:09
 * @author: sean.li
 * @ClassPath: cm.sean.hbase.HbaseTest
 * @description:
 */
object HbaseTest {


  /**
   * Replace a null or empty field with the literal string "NULL"
   * @param str the raw field value
   * @return the original value, or "NULL" if it was null or empty
   */
  def nullHandle(str: String): String = {
    if (str == null || "".equals(str)) "NULL" else str
  }
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("import")
      .master("local[*]")
      .enableHiveSupport()
      .getOrCreate()

    // Set the log level
    spark.sparkContext.setLogLevel("warn")

    // Read the source data from Hive (the data itself lives on HDFS).
    // The Hive table here is external, but a managed (internal) table works the same way.
    val data = spark.sql("select id, name, age from sean.test")
    data.show(10, false)

    // id becomes the rowkey, cf is the column family, name and age are the column qualifiers
    val dataRdd = data.rdd.flatMap(row => {
      val rowkey = row.getAs[Long]("id").toString
      Array(
        (rowkey, ("cf", "name", nullHandle(row.getAs[String]("name")))),
        (rowkey, ("cf", "age", nullHandle(row.getAs[Long]("age").toString)))
      )
    })


    // The rowkey, column family and column qualifier must be globally sorted before HFiles are
    // written, so filter out null rowkeys and sort first to avoid bulk-load failures.
    val rdds = dataRdd.filter(x => x._1 != null).sortBy(x => (x._1, x._2._1, x._2._2)).map(x => {
      // Convert the RDD into the format HFiles expect: the key must be an ImmutableBytesWritable
      // and the value a KeyValue.
      val rowKey = Bytes.toBytes(x._1)
      val family = Bytes.toBytes(x._2._1)
      val column = Bytes.toBytes(x._2._2)
      val value = Bytes.toBytes(x._2._3)
      (new ImmutableBytesWritable(rowKey), new KeyValue(rowKey, family, column, value))
    })
    rdds.take(10).foreach(print)




    // Temporary HFile output directory on HDFS
    val tmpdir = "/tmp/test"

    val hconf = new Configuration()
    hconf.set("fs.defaultFS", "hdfs://mycluster")

    // "hdfs" here is the HDFS user name; replace it with the user you run as
    val fs = FileSystem.get(new URI("hdfs://mycluster"), hconf, "hdfs")

    // The HFile output directory must not exist yet, so delete it if it is already there
    if (fs.exists(new Path(tmpdir))) {
      println("Deleting temporary directory")
      fs.delete(new Path(tmpdir), true)
    }


    println("hello")
    //创建HBase的配置
    val conf = HBaseConfiguration.create()
    conf.set("hbase.zookeeper.quorum", "cdh01,cdh02,cdh03")
    conf.set("hbase.zookeeper.property.clientPort", "2181")
    println("hello1")


    //为了预防hfile文件数过多无法进行导入,设置该参数值
    conf.setInt("hbase.mapreduce.bulkload.max.hfiles.perRegion.perFamily", 5000)


    println("hello2")
    //此处运行完成之后,在tmpdir生成的Hfile文件
    rdds.saveAsNewAPIHadoopFile(tmpdir,
      classOf[ImmutableBytesWritable],
      classOf[KeyValue],
      classOf[HFileOutputFormat2],
      conf)


    println("hello3")
    //开始即那个HFile导入到Hbase,此处都是hbase的api操作
    val load = new LoadIncrementalHFiles(conf)


    //---------------------------------




    //--------------------------------------
    //hbase的表名
    val tableName = "sean.li"
    println("hello4")


    //创建hbase的链接,利用默认的配置文件,实际上读取的hbase的master地址
    val conn = ConnectionFactory.createConnection(conf)


    //根据表名获取表
    val table = conn.getTable(TableName.valueOf(tableName))








    try {
      // Get the region distribution of the HBase table
      val regionLocator = conn.getRegionLocator(TableName.valueOf(tableName))

      // Create a Hadoop MapReduce job
      val job = Job.getInstance(conf)

      // Set the job name (any name will do)
      job.setJobName(this.getClass.getSimpleName)

      // Important: since we are generating HFiles, the map output key must be ImmutableBytesWritable
      job.setMapOutputKeyClass(classOf[ImmutableBytesWritable])

      // The map output value is KeyValue
      job.setMapOutputValueClass(classOf[KeyValue])

      // Configure HFileOutputFormat2 for the incremental load
      HFileOutputFormat2.configureIncrementalLoad(job, table, regionLocator)

      // Run the bulk load
      load.doBulkLoad(new Path("hdfs://mycluster/tmp/test"), conn.getAdmin, table, regionLocator)
    } finally {
      table.close()
      conn.close()
    }
    spark.close()
  }
}
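
Once the job has run, a quick spot check from the client side is a small scan of the target table. A minimal sketch, assuming the same ZooKeeper quorum, table name and column family as above (the object name is arbitrary):

import scala.collection.JavaConverters._

import org.apache.hadoop.hbase.client.{ConnectionFactory, Scan}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}

object VerifyBulkLoad {
  def main(args: Array[String]): Unit = {
    val conf = HBaseConfiguration.create()
    conf.set("hbase.zookeeper.quorum", "cdh01,cdh02,cdh03")
    conf.set("hbase.zookeeper.property.clientPort", "2181")

    val conn = ConnectionFactory.createConnection(conf)
    val table = conn.getTable(TableName.valueOf("sean.li"))
    val scanner = table.getScanner(new Scan())
    try {
      // Print the first 10 rows: rowkey, cf:name, cf:age
      scanner.iterator().asScala.take(10).foreach { result =>
        val rowkey = Bytes.toString(result.getRow)
        val name = Bytes.toString(result.getValue(Bytes.toBytes("cf"), Bytes.toBytes("name")))
        val age = Bytes.toString(result.getValue(Bytes.toBytes("cf"), Bytes.toBytes("age")))
        println(s"$rowkey\t$name\t$age")
      }
    } finally {
      scanner.close()
      table.close()
      conn.close()
    }
  }
}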

3. Results

(Result screenshots omitted.)

4. Mapping the HBase table into Hive

CREATE EXTERNAL TABLE li(
  key string,
  name string COMMENT "name",
  age string COMMENT "age"
)
STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
WITH SERDEPROPERTIES ("hbase.columns.mapping" = ":key,cf:name,cf:age")
TBLPROPERTIES("hbase.table.name" = "sean.li");
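
With the mapping in place, the HBase data can be queried like any other Hive table. A minimal Spark SQL check, assuming the external table li above was created in your current Hive database and that hive-hbase-handler is on the classpath (see 5.1):

import org.apache.spark.sql.SparkSession

object QueryHbaseViaHive {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("query-hbase-via-hive")
      .master("local[*]")
      .enableHiveSupport()
      .getOrCreate()

    // Reads through the Hive external table "li", which is backed by the HBase table "sean.li"
    spark.sql("select key, name, age from li limit 10").show(false)

    spark.close()
  }
}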


5. Issues encountered

5.1 Jar dependency problems

(Error screenshot omitted.)

Solution:

1. Copy the HBase jars into the Spark2 jars directory.

2. Copy htrace-core-3.2.0-incubating.jar from the CDH jars directory into the Spark2 jars directory.

3. If it still fails, also copy the jars above into the Hadoop lib directory.

4. Add the following to hadoop-env.sh: export HADOOP_CLASSPATH=$HADOOP_CLASSPATH:/opt/cloudera/parcels/CDH/lib/hbase/*

The exact commands used on this cluster:

cp /opt/cloudera/parcels/CDH/jars/hive-hbase-handler-1.1.0-cdh5.16.2.jar  /opt/cloudera/parcels/SPARK2/lib/spark2/jars
cp /opt/cloudera/parcels/CDH/jars/metrics-core-2.2.0.jar  /opt/cloudera/parcels/SPARK2/lib/spark2/jars
cp /opt/cloudera/parcels/CDH/jars/htrace-core-3.2.0-incubating.jar  /opt/cloudera/parcels/SPARK2/lib/spark2/jars
cp /opt/cloudera/parcels/CDH/lib/hbase/hbase-annotations-1.2.0-cdh5.16.2.jar                    /opt/cloudera/parcels/SPARK2/lib/spark2/jars/
cp /opt/cloudera/parcels/CDH/lib/hbase/hbase-annotations.jar                                    /opt/cloudera/parcels/SPARK2/lib/spark2/jars/
cp /opt/cloudera/parcels/CDH/lib/hbase/hbase-client-1.2.0-cdh5.16.2.jar                         /opt/cloudera/parcels/SPARK2/lib/spark2/jars/
cp /opt/cloudera/parcels/CDH/lib/hbase/hbase-client.jar                                         /opt/cloudera/parcels/SPARK2/lib/spark2/jars/
cp /opt/cloudera/parcels/CDH/lib/hbase/hbase-common-1.2.0-cdh5.16.2.jar                         /opt/cloudera/parcels/SPARK2/lib/spark2/jars/
cp /opt/cloudera/parcels/CDH/lib/hbase/hbase-common.jar                                         /opt/cloudera/parcels/SPARK2/lib/spark2/jars/
cp /opt/cloudera/parcels/CDH/lib/hbase/hbase-examples-1.2.0-cdh5.16.2.jar                       /opt/cloudera/parcels/SPARK2/lib/spark2/jars/
cp /opt/cloudera/parcels/CDH/lib/hbase/hbase-examples.jar                                       /opt/cloudera/parcels/SPARK2/lib/spark2/jars/
cp /opt/cloudera/parcels/CDH/lib/hbase/hbase-external-blockcache-1.2.0-cdh5.16.2.jar            /opt/cloudera/parcels/SPARK2/lib/spark2/jars/
cp /opt/cloudera/parcels/CDH/lib/hbase/hbase-external-blockcache.jar                            /opt/cloudera/parcels/SPARK2/lib/spark2/jars/
cp /opt/cloudera/parcels/CDH/lib/hbase/hbase-hadoop2-compat-1.2.0-cdh5.16.2.jar                 /opt/cloudera/parcels/SPARK2/lib/spark2/jars/
cp /opt/cloudera/parcels/CDH/lib/hbase/hbase-hadoop2-compat.jar                                 /opt/cloudera/parcels/SPARK2/lib/spark2/jars/
cp /opt/cloudera/parcels/CDH/lib/hbase/hbase-hadoop-compat-1.2.0-cdh5.16.2.jar                  /opt/cloudera/parcels/SPARK2/lib/spark2/jars/
cp /opt/cloudera/parcels/CDH/lib/hbase/hbase-hadoop-compat.jar                                  /opt/cloudera/parcels/SPARK2/lib/spark2/jars/
cp /opt/cloudera/parcels/CDH/lib/hbase/hbase-it-1.2.0-cdh5.16.2.jar                             /opt/cloudera/parcels/SPARK2/lib/spark2/jars/
cp /opt/cloudera/parcels/CDH/lib/hbase/hbase-it.jar                                             /opt/cloudera/parcels/SPARK2/lib/spark2/jars/
cp /opt/cloudera/parcels/CDH/lib/hbase/hbase-prefix-tree-1.2.0-cdh5.16.2.jar                    /opt/cloudera/parcels/SPARK2/lib/spark2/jars/
cp /opt/cloudera/parcels/CDH/lib/hbase/hbase-prefix-tree.jar                                    /opt/cloudera/parcels/SPARK2/lib/spark2/jars/
cp /opt/cloudera/parcels/CDH/lib/hbase/hbase-procedure-1.2.0-cdh5.16.2.jar                      /opt/cloudera/parcels/SPARK2/lib/spark2/jars/
cp /opt/cloudera/parcels/CDH/lib/hbase/hbase-procedure.jar                                      /opt/cloudera/parcels/SPARK2/lib/spark2/jars/
cp /opt/cloudera/parcels/CDH/lib/hbase/hbase-protocol-1.2.0-cdh5.16.2.jar                       /opt/cloudera/parcels/SPARK2/lib/spark2/jars/
cp /opt/cloudera/parcels/CDH/lib/hbase/hbase-protocol.jar                                       /opt/cloudera/parcels/SPARK2/lib/spark2/jars/
cp /opt/cloudera/parcels/CDH/lib/hbase/hbase-resource-bundle-1.2.0-cdh5.16.2.jar                /opt/cloudera/parcels/SPARK2/lib/spark2/jars/
cp /opt/cloudera/parcels/CDH/lib/hbase/hbase-resource-bundle.jar                                /opt/cloudera/parcels/SPARK2/lib/spark2/jars/
cp /opt/cloudera/parcels/CDH/lib/hbase/hbase-rest-1.2.0-cdh5.16.2.jar                           /opt/cloudera/parcels/SPARK2/lib/spark2/jars/
cp /opt/cloudera/parcels/CDH/lib/hbase/hbase-rest.jar                                           /opt/cloudera/parcels/SPARK2/lib/spark2/jars/
cp /opt/cloudera/parcels/CDH/lib/hbase/hbase-rsgroup-1.2.0-cdh5.16.2.jar                        /opt/cloudera/parcels/SPARK2/lib/spark2/jars/
cp /opt/cloudera/parcels/CDH/lib/hbase/hbase-rsgroup.jar                                        /opt/cloudera/parcels/SPARK2/lib/spark2/jars/
cp /opt/cloudera/parcels/CDH/lib/hbase/hbase-server-1.2.0-cdh5.16.2.jar                         /opt/cloudera/parcels/SPARK2/lib/spark2/jars/
cp /opt/cloudera/parcels/CDH/lib/hbase/hbase-server.jar                                         /opt/cloudera/parcels/SPARK2/lib/spark2/jars/
cp /opt/cloudera/parcels/CDH/lib/hbase/hbase-shell-1.2.0-cdh5.16.2.jar                          /opt/cloudera/parcels/SPARK2/lib/spark2/jars/
cp /opt/cloudera/parcels/CDH/lib/hbase/hbase-shell.jar                                          /opt/cloudera/parcels/SPARK2/lib/spark2/jars/
cp /opt/cloudera/parcels/CDH/lib/hbase/hbase-spark-1.2.0-cdh5.16.2.jar                          /opt/cloudera/parcels/SPARK2/lib/spark2/jars/
cp /opt/cloudera/parcels/CDH/lib/hbase/hbase-spark.jar                                          /opt/cloudera/parcels/SPARK2/lib/spark2/jars/
cp /opt/cloudera/parcels/CDH/lib/hbase/hbase-thrift-1.2.0-cdh5.16.2.jar                         /opt/cloudera/parcels/SPARK2/lib/spark2/jars/
cp /opt/cloudera/parcels/CDH/lib/hbase/hbase-thrift.jar                                         /opt/cloudera/parcels/SPARK2/lib/spark2/jars/
cp /opt/cloudera/parcels/CDH/jars/htrace-core-3.2.0-incubating.jar                                 /opt/cloudera/parcels/SPARK2/lib/spark2/jars/

5.2 Write errors

(Error screenshot omitted.)

5.3 Ordering problem: when HFiles are written for HBase, the data must be sorted, so make sure the records are ordered by rowkey, column family and column qualifier before generating the HFiles (a small standalone sketch of the required sort follows at the end of this section). (Error screenshot omitted.)

5.4 Maven packaging failures: (error screenshot omitted) these are almost always caused by dependency version conflicts; align the dependency versions with the jars actually used on the cluster and check how the dependencies relate to each other.
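
For the ordering problem in 5.3, the essential piece is the sortBy in the main job: cells must reach HFileOutputFormat2 sorted by rowkey, then column family, then column qualifier. A tiny standalone sketch of that sort with made-up rows (object name and sample values are illustrative only):

import org.apache.spark.sql.SparkSession

object HfileSortSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("hfile-sort-sketch").master("local[*]").getOrCreate()

    // (rowkey, (column family, column qualifier, value)) tuples, deliberately out of order
    val rows = spark.sparkContext.parallelize(Seq(
      ("2", ("cf", "name", "li")),
      ("1", ("cf", "name", "sean")),
      ("1", ("cf", "age", "18"))
    ))

    // HFileOutputFormat2 expects cells sorted by rowkey, then family, then qualifier
    val sorted = rows.sortBy { case (rk, (cf, col, _)) => (rk, cf, col) }
    sorted.collect().foreach(println)

    spark.stop()
  }
}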
