Spark -- Writing an RDD to HBase (Example)

POM

    <properties>
        <scala.version>2.11</scala.version>
        <spark.version>2.2.0</spark.version>
        <scope>compile</scope>
<!--        <scope>provided</scope>-->
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_${scala.version}</artifactId>
            <version>${spark.version}</version>
            <exclusions>
                <exclusion>
                    <artifactId>janino</artifactId>
                    <groupId>org.codehaus.janino</groupId>
                </exclusion>
            </exclusions>
            <scope>${scope}</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_${scala.version}</artifactId>
            <version>${spark.version}</version>
            <scope>${scope}</scope>
        </dependency>
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>1.2.60</version>
        </dependency>
        <dependency>
            <groupId>org.codehaus.janino</groupId>
            <artifactId>janino</artifactId>
            <version>3.0.8</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-server</artifactId>
            <version>1.3.1</version>
            <exclusions>
                <exclusion>
                    <groupId>org.apache.hadoop</groupId>
                    <artifactId>hadoop-client</artifactId>
                </exclusion>
            </exclusions>
        </dependency>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-client</artifactId>
            <version>1.3.1</version>
        </dependency>
    </dependencies>

    <build>
        <finalName>SparkPro</finalName>
        <plugins>
            <plugin>
                <groupId>net.alchim31.maven</groupId>
                <artifactId>scala-maven-plugin</artifactId>
                <version>3.2.2</version>
                <executions>
                    <execution>
                        <goals>
                            <goal>compile</goal>
                            <goal>testCompile</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-assembly-plugin</artifactId>
                <version>3.0.0</version>
                <configuration>
                    <archive>
                        <manifest>
                            <mainClass>com.wisers.spark.PubLisErrTask</mainClass>
                        </manifest>
                    </archive>
                    <descriptorRefs>
                        <descriptorRef>jar-with-dependencies</descriptorRef>
                    </descriptorRefs>
                </configuration>
                <executions>
                    <execution>
                        <id>make-assembly</id>
                        <phase>package</phase>
                        <goals>
                            <goal>single</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>

Example

package com.wisers.spark.hbase

import com.alibaba.fastjson.JSON
import com.wisers.spark.utils.{HBaseUtil, IsNullUtil}
import org.apache.commons.codec.digest.DigestUtils
import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

/**
 * Created By TheBigBlue on 2020/7/14
 * Description : read data from HDFS and write it into HBase
 */
object HtmlInsert {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("HtmlInsert").setMaster("local[2]")
    val context = new SparkContext(conf)
    context.setLogLevel("WARN")
    val userDir = System.getProperty("user.dir")
    val inputFile = if (IsNullUtil.isEmpty(args)) {
      userDir + "\\data\\input\\20200712000249list-hbase-little.data"
    } else {
      args(0)
    }
    val hbaseDts = String.valueOf(System.currentTimeMillis())
    val fileRDD = context.textFile(inputFile).map(x => {
      val obj = JSON.parseObject(x)
      val pubcode = obj.getString("pubcode")
      val url = obj.getString("url")
      val listing = obj.getString("listing")
      val content = obj.getString("content")
      val dts = obj.getString("dts")
      Data(pubcode, url, listing, content, dts, hbaseDts)
    })
    val hbaseRDD: RDD[(ImmutableBytesWritable, Put)] = fileRDD.map(data => {
      val put = new Put(Bytes.toBytes(DigestUtils.md5Hex(data.url)))
      put.addColumn(Bytes.toBytes("info"), Bytes.toBytes("pubcode"), Bytes.toBytes(data.pubcode))
      put.addColumn(Bytes.toBytes("info"), Bytes.toBytes("url"), Bytes.toBytes(data.url))
      if (!IsNullUtil.isEmpty(data.listing)) put.addColumn(Bytes.toBytes("info"), Bytes.toBytes("listing"), Bytes.toBytes(data.listing))
      if (!IsNullUtil.isEmpty(data.content)) put.addColumn(Bytes.toBytes("info"), Bytes.toBytes("content"), Bytes.toBytes(data.content))
      put.addColumn(Bytes.toBytes("info"), Bytes.toBytes("dts"), Bytes.toBytes(data.dts))
      put.addColumn(Bytes.toBytes("info"), Bytes.toBytes("hbaseDts"), Bytes.toBytes(data.hbaseDts))
      (new ImmutableBytesWritable, put)
    })
    //write the RDD to HBase
    hbaseRDD.saveAsHadoopDataset(HBaseUtil.getJobConf())
    context.stop()
  }
  case class Data(pubcode: String, url: String, listing: String, content: String, dts: String, hbaseDts: String)
}
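
HtmlInsert also imports IsNullUtil, which the original post does not include. A minimal sketch of what it presumably provides (empty checks for strings and for the args array) is shown below; treat it as an assumed helper, not the author's actual implementation.

package com.wisers.spark.utils

/**
 * Assumed implementation: null/empty checks used by HtmlInsert.
 */
object IsNullUtil {

  //true if the string is null or blank
  def isEmpty(str: String): Boolean = str == null || str.trim.isEmpty

  //true if the array is null or has no elements (used for the args check)
  def isEmpty(arr: Array[String]): Boolean = arr == null || arr.isEmpty
}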
package com.wisers.spark.utils

import java.util.Properties

import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapred.TableOutputFormat
import org.apache.hadoop.mapred.JobConf

/**
 * Created By TheBigBlue on 2020/7/14
 * Description : build the JobConf used by saveAsHadoopDataset to write to HBase
 */
object HBaseUtil {

  def getJobConf(): JobConf = {
    //load configuration from the classpath
    val inputStream = HBaseUtil.getClass.getClassLoader.getResourceAsStream("param.properties")
    val props = new Properties()
    props.load(inputStream)
    //set HBase ZooKeeper connection info
    val conf = HBaseConfiguration.create()
    conf.set("hbase.zookeeper.quorum", props.getProperty("hbase.zookeeper.quorum"))
    conf.set("hbase.zookeeper.property.clientPort", props.getProperty("hbase.zookeeper.property.clientPort"))
    //build the JobConf for the mapred TableOutputFormat
    val jobConf = new JobConf(conf)
    jobConf.setOutputFormat(classOf[TableOutputFormat])
    jobConf.set(TableOutputFormat.OUTPUT_TABLE, "tb_html")
    //set the output key class
    jobConf.setOutputKeyClass(classOf[ImmutableBytesWritable])
    //set the output value class (the RDD emits Put mutations)
    jobConf.setOutputValueClass(classOf[Put])

    jobConf
  }
}
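
HBaseUtil reads a param.properties file from the classpath; it needs at least the hbase.zookeeper.quorum and hbase.zookeeper.property.clientPort entries for your cluster.

After the job finishes, you can spot-check one row with the plain HBase client. The sketch below reuses the tb_html table and info column family configured in HBaseUtil; the ZooKeeper quorum and the URL are placeholders, so adjust them to your environment.

package com.wisers.spark.hbase

import org.apache.commons.codec.digest.DigestUtils
import org.apache.hadoop.hbase.client.{ConnectionFactory, Get}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}

/**
 * Optional read-back check (sketch): fetch one row written by HtmlInsert.
 */
object HtmlCheck {

  def main(args: Array[String]): Unit = {
    val conf = HBaseConfiguration.create()
    //placeholder quorum and port, replace with your own cluster
    conf.set("hbase.zookeeper.quorum", "zk1,zk2,zk3")
    conf.set("hbase.zookeeper.property.clientPort", "2181")
    val connection = ConnectionFactory.createConnection(conf)
    val table = connection.getTable(TableName.valueOf("tb_html"))
    try {
      //hypothetical URL; the row key is the MD5 hex of the url field, matching the Put above
      val rowKey = Bytes.toBytes(DigestUtils.md5Hex("http://example.com/page"))
      val result = table.get(new Get(rowKey))
      val pubcode = Option(result.getValue(Bytes.toBytes("info"), Bytes.toBytes("pubcode")))
        .map(Bytes.toString).getOrElse("<missing>")
      println(s"pubcode = $pubcode")
    } finally {
      table.close()
      connection.close()
    }
  }
}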

Notes

  1. The data I read is zip-compressed, Base64-encoded JSON, which is why the fastjson dependency is included; if you do not need it, you can leave it out.
  2. The zip-compression and Base64 utility classes are omitted here and can be ignored.
  3. For local runs the Spark dependencies use scope compile; when submitting to a cluster, switch the scope to provided, since the cluster already supplies the Spark runtime (a small sketch on keeping the master configurable follows this list).
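
A follow-up to point 3: the example hard-codes setMaster("local[2]"), which takes precedence over the --master option passed to spark-submit. A small sketch (replacing the first lines of main) that only sets a local master when none was supplied:

import org.apache.spark.{SparkConf, SparkContext}

//fall back to a local master only when spark-submit did not supply one,
//so the same jar runs locally and on the cluster without code changes
val conf = new SparkConf().setAppName("HtmlInsert")
if (!conf.contains("spark.master")) {
  conf.setMaster("local[2]")
}
val context = new SparkContext(conf)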