Reading CSV and Parquet files with Spark and writing the data into an HBase table

1. Reading a CSV file with Spark and writing the data into an HBase table: code implementation.
Scala code:
package com.cbp.spark_hbase

import org.apache.hadoop.fs.Path
import org.apache.hadoop.hbase.client.ConnectionFactory
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.{HBaseConfiguration, KeyValue, TableName}
import org.apache.hadoop.hbase.mapreduce.HFileOutputFormat2
import org.apache.hadoop.hbase.tool.LoadIncrementalHFiles
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.mapreduce.Job
import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.types.StructField
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.storage.StorageLevel
import scala.collection.mutable.ArrayBuffer

object SparkReadCsvOrParquetToHbase {
  Logger.getLogger("org").setLevel(Level.INFO)
  def main(args: Array[String]): Unit = {
    val tableName = args(0) // target HBase table
    val readPath = args(1)  // input path (CSV or Parquet files)
    val filePath = args(2)  // output path for the generated HFiles
    val columnf = "fpmx"    // column family

    val ss = SparkSession.builder().getOrCreate()
    val hconf = HBaseConfiguration.create()
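    // Bulk-load settings: tell HFileOutputFormat2 which table the HFiles are for
    // and raise the limit on HFiles per region per column family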
    hconf.set("hbase.mapreduce.hfileoutputformat.table.name", tableName)
    hconf.setInt("hbase.mapreduce.bulkload.max.hfiles.perRegion.perFamily", 5000)

    val conn = ConnectionFactory.createConnection(hconf)
    val regionLocator = conn.getRegionLocator(TableName.valueOf(tableName))
    val table = conn.getTable(TableName.valueOf(tableName))

    val job = Job.getInstance(hconf)
    job.setMapOutputKeyClass(classOf[ImmutableBytesWritable])
    job.setMapOutputValueClass(classOf[KeyValue])
    HFileOutputFormat2.configureIncrementalLoadMap(job, table.getDescriptor)

    val df: DataFrame = ss.read.csv(readPath)
//	val df: DataFrame = ss.read.parquet(readPath)
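    // Flatten each row into (rowkey, (columnFamily, qualifier, value)) tuples;
    // the first column of the input is used as the rowkey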
    val rdd1 = df.rdd.flatMap(row => {
      val fields: Array[StructField] = row.schema.fields
      val values = ArrayBuffer[(String, (String, String, String))]()
      val rowkey = row.get(0).toString
      fields.foreach(col => {
        values.append((rowkey, (columnf, col.name, row.getAs[String](col.name))))
      })
      values
    }).persist(StorageLevel.MEMORY_AND_DISK_SER)

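    // HFileOutputFormat2 expects KeyValues in sorted order, so sort by rowkey, family and qualifier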
    rdd1.sortBy(x => (x._1, x._2._1, x._2._2))
      .map(rdd => {
        val rowKey = Bytes.toBytes(rdd._1)
        val family = Bytes.toBytes(rdd._2._1)
        val colum = Bytes.toBytes(rdd._2._2)
        val value = Bytes.toBytes(rdd._2._3)
        (new ImmutableBytesWritable(rowKey), new KeyValue(rowKey, family, colum, value))
      }).saveAsNewAPIHadoopFile(filePath,
      classOf[ImmutableBytesWritable],
      classOf[KeyValue],
      classOf[HFileOutputFormat2],
      hconf)

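    // Bulk-load the generated HFiles into the table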
    val load = new LoadIncrementalHFiles(hconf)
    load.doBulkLoad(new Path(filePath), conn.getAdmin, table, regionLocator)

    table.close()
    conn.close()
    ss.close()
  }
}
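
The Parquet case only differs in the read call (the commented-out ss.read.parquet line above). A minimal sketch of choosing the format at runtime, assuming a hypothetical fourth argument, would replace the val df line:
val format = args(3) // hypothetical argument: "csv" or "parquet"
val df: DataFrame = format match {
  case "parquet" => ss.read.parquet(readPath)
  case _ => ss.read.csv(readPath)
}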

spark-submit script:
nohup spark-submit \
--master yarn \
--deploy-mode client \
--class com.cbp.spark_hbase.SparkReadCsvOrParquetToHbase \
--driver-memory 4G \
--executor-memory 20G \
--executor-cores 4 \
--num-executors 20 \
--conf spark.default.parallelism=240 \
--conf spark.speculation=true \
--conf spark.speculation.interval=100 \
--conf spark.speculation.quantile=0.75 \
--conf spark.speculation.multiplier=1.5 \
--conf spark.storage.memoryFraction=0.2 \
--conf spark.shuffle.memoryFraction=0.4 \
--conf spark.shuffle.service.enabled=true \
com.cbp.test-1.0-SNAPSHOT.jar \
"test" \
"./test1" \
"./temp" \
> ./log.file 2>&1 &
2. Importing a CSV file into an HBase table with the HBase ImportTsv tool.
Method 1: write directly with Puts (for a comma-separated file, also add the -Dimporttsv.separator="," option shown in Method 2; the default separator is a tab):
hbase org.apache.hadoop.hbase.mapreduce.ImportTsv \
-Dimporttsv.columns=HBASE_ROW_KEY,fpmx:xf_nsrsbh,fpmx:gf_nsrsbh \
test \
./test
Method 2: bulk load; first write HFiles to a temporary directory, then import them with the LoadIncrementalHFiles tool. The CSV file is comma-separated, so set the separator explicitly (ImportTsv splits on the tab character by default):
hbase org.apache.hadoop.hbase.mapreduce.ImportTsv \
-Dimporttsv.separator="," \
-Dimporttsv.bulk.output=./temp \
-Dimporttsv.columns=HBASE_ROW_KEY,fpmx:xf_nsrsbh,fpmx:gf_nsrsbh \
test \
./test

hbase org.apache.hadoop.hbase.mapreduce.LoadIncrementalHFiles \
-Dcreate.table=no \
./temp \
test
The -Dcreate.table=no option keeps the tool from creating the table if it does not already exist.
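
Both the Spark bulk load above and the -Dcreate.table=no variant assume the target table and its fpmx column family already exist. A minimal sketch for pre-creating it with the HBase 2.x client API (the table name "test" is taken from the examples above):

import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.hadoop.hbase.client.{ColumnFamilyDescriptorBuilder, ConnectionFactory, TableDescriptorBuilder}

object CreateTestTable {
  def main(args: Array[String]): Unit = {
    val conn = ConnectionFactory.createConnection(HBaseConfiguration.create())
    val admin = conn.getAdmin
    val name = TableName.valueOf("test")
    if (!admin.tableExists(name)) {
      // create the table with the single column family used above
      val desc = TableDescriptorBuilder.newBuilder(name)
        .setColumnFamily(ColumnFamilyDescriptorBuilder.of("fpmx"))
        .build()
      admin.createTable(desc)
    }
    admin.close()
    conn.close()
  }
}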