Spark Bulk Writing to HBase

The job below reads "urlId|url" lines from HDFS, packs each record into a JSON document, and writes it to the HBase table URLS through TableOutputFormat via saveAsNewAPIHadoopDataset.

import com.alibaba.fastjson.JSONObject
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.mapreduce.Job
import org.apache.spark.{HashPartitioner, SparkConf, SparkContext}

object InitUrlToHbase {
    def main(args: Array[String]): Unit = {
        val conf: SparkConf = new SparkConf().setAppName("InitUrlToHbase")
            .set("spark.hadoop.validateOutputSpecs", "false")
            .registerKryoClasses(Array[Class[_]](classOf[ImmutableBytesWritable]))
        val inputPath = args(0)
        val sparkContext: SparkContext = new SparkContext(conf)

        val configuration: Configuration = sparkContext.hadoopConfiguration
        configuration.set("hbase.zookeeper.quorum", "127.0.0.1")
        configuration.set("fs.defaultFS", "hdfs://127.0.0.1:8020")
        // hbase.mapreduce.bulkload.max.hfiles.perRegion.perFamily: maximum number of HFiles
        // allowed per region and column family during a bulk load; the default is 32.
        // hbase.hregion.max.filesize: maximum region size per column family; a region that
        // grows beyond this value is split automatically; the default is 1 GB.
        // With the defaults, a single bulk load is therefore capped at 32 * 1 GB = 32 GB of
        // data, and loads beyond that fail. Note that the HFile cap only applies on the
        // HFile bulk-load path (see the sketch after the listing); the Put-based writes
        // below go through the normal write path.
        configuration.set("hbase.mapreduce.bulkload.max.hfiles.perRegion.perFamily", "3200")
        configuration.set("hbase.hregion.max.filesize", "10737418240")
        configuration.set("zooleeper.znode.parent", "/hbase")
        configuration.set("hbase.zookeeper.property.clientPort", "2181")
        configuration.set(org.apache.hadoop.hbase.mapreduce.TableOutputFormat.OUTPUT_TABLE, "URLS")

        val job: Job = Job.getInstance(configuration)
        job.setOutputKeyClass(classOf[ImmutableBytesWritable])
        // TableOutputFormat consumes Mutations, so the value class is Put rather than Result
        job.setOutputValueClass(classOf[Put])
        job.setOutputFormatClass(classOf[org.apache.hadoop.hbase.mapreduce.TableOutputFormat[ImmutableBytesWritable]])

        sparkContext.textFile(inputPath).map(line => {
            // Each input line is expected to look like "urlId|url"
            val fields = line.split("\\|")
            val urlId = fields(0)
            val url = fields(1)
            val json = new JSONObject()
            json.put("urlId", urlId)
            json.put("url", url)
            // Store the record as a JSON string under column family D1, qualifier CONTENT
            val put: Put = new Put(Bytes.toBytes(urlId))
            put.addColumn(Bytes.toBytes("D1"), Bytes.toBytes("CONTENT"), Bytes.toBytes(json.toJSONString))
            (new ImmutableBytesWritable(Bytes.toBytes(urlId)), put)
        }).repartitionAndSortWithinPartitions(new HashPartitioner(200)) // 200 partitions, sorted by row key within each
            .saveAsNewAPIHadoopDataset(job.getConfiguration)

        sparkContext.stop()
    }
}
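
The two bulkload-related settings above only come into play on the HFile bulk-load path (HFileOutputFormat2 plus LoadIncrementalHFiles), not on the Put-based writes in the listing. Below is a minimal sketch of that variant, assuming HBase 1.x APIs; the object name BulkLoadUrlsSketch and the staging directory /tmp/urls_hfiles are hypothetical, while the table layout (URLS, family D1, qualifier CONTENT) matches the listing.

import org.apache.hadoop.fs.Path
import org.apache.hadoop.hbase.{KeyValue, TableName}
import org.apache.hadoop.hbase.client.ConnectionFactory
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.{HFileOutputFormat2, LoadIncrementalHFiles}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.mapreduce.Job
import org.apache.spark.{SparkConf, SparkContext}

object BulkLoadUrlsSketch {
    def main(args: Array[String]): Unit = {
        val sparkContext = new SparkContext(new SparkConf().setAppName("BulkLoadUrlsSketch"))
        val configuration = sparkContext.hadoopConfiguration
        configuration.set("hbase.zookeeper.quorum", "127.0.0.1")
        configuration.set("hbase.zookeeper.property.clientPort", "2181")

        val connection = ConnectionFactory.createConnection(configuration)
        val tableName = TableName.valueOf("URLS")
        val table = connection.getTable(tableName)
        val regionLocator = connection.getRegionLocator(tableName)

        // Aligns the output partitioning and compression with the target table's regions
        val job = Job.getInstance(configuration)
        HFileOutputFormat2.configureIncrementalLoad(job, table, regionLocator)

        // HFileOutputFormat2 requires row keys in sorted order; sort on the plain string
        // key before building KeyValues so the shuffle only moves serializable pairs
        val hfileRdd = sparkContext.textFile(args(0)).map { line =>
            val fields = line.split("\\|")
            (fields(0), fields(1))
        }.sortByKey().map { case (urlId, url) =>
            val rowKey = Bytes.toBytes(urlId)
            val kv = new KeyValue(rowKey, Bytes.toBytes("D1"), Bytes.toBytes("CONTENT"), Bytes.toBytes(url))
            (new ImmutableBytesWritable(rowKey), kv)
        }

        val hfilePath = "/tmp/urls_hfiles" // hypothetical staging directory
        hfileRdd.saveAsNewAPIHadoopFile(hfilePath,
            classOf[ImmutableBytesWritable], classOf[KeyValue],
            classOf[HFileOutputFormat2], job.getConfiguration)

        // Moving the generated HFiles into the regions is the step governed by
        // hbase.mapreduce.bulkload.max.hfiles.perRegion.perFamily
        new LoadIncrementalHFiles(configuration)
            .doBulkLoad(new Path(hfilePath), connection.getAdmin, table, regionLocator)

        connection.close()
        sparkContext.stop()
    }
}

Both the listing and this sketch take the HDFS input path as the first program argument when submitted with spark-submit.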

 
