spark-hbase-BulkLoad


Applies to HBase 2.x.

Background: there are roughly two ways to load data from a Hive table into an HBase table, each with its own trade-offs.

Method 1: create the HBase table first, then create a Hive external table mapped onto it, set up the column mapping, and run an MR job that inserts into the external table.
Pros: simple.
Cons: with a large data volume the writes put heavy pressure on the HBase RegionServers, which can lead to "region server not online" errors, crashes, and an unavailable service; small volumes are fine.

Create the HBase table

create 'dim_goods_xxxxx',{NAME=>'cf',VERSIONS=>'1',TTL => '604800',COMPRESSION =>'SNAPPY',CONFIGURATION=>{'hbase.hstore.blockingStoreFiles'=>'8'}},{NUMREGIONS => 30, SPLITALGO => 'HexStringSplit'}
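
To sanity-check that the pre-splits took effect (the table should come up with 30 regions), a minimal sketch with the HBase 2.x Java client can list them; the ZooKeeper quorum below is a placeholder, not a value from this post:

import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.RegionInfo;
import org.apache.hadoop.hbase.util.Bytes;

public class CheckRegions {
    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        conf.set("hbase.zookeeper.quorum", "xxxxxxxxxxx"); // placeholder
        try (Connection connection = ConnectionFactory.createConnection(conf);
             Admin admin = connection.getAdmin()) {
            // List every region of the newly created table with its start/end key
            List<RegionInfo> regions = admin.getRegions(TableName.valueOf("dim_goods_xxxxx"));
            System.out.println("region count = " + regions.size());
            for (RegionInfo r : regions) {
                System.out.println(Bytes.toStringBinary(r.getStartKey()) + " ~ " + Bytes.toStringBinary(r.getEndKey()));
            }
        }
    }
}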


Create the Hive external table on top of HBase

CREATE EXTERNAL TABLE `xxx.dim_goods_xxxxx`(    
  `key` string COMMENT '',     
  `data_from` string COMMENT '企业id',     
  `goods_id` string COMMENT '商品id')    
ROW FORMAT SERDE     
  'org.apache.hadoop.hive.hbase.HBaseSerDe'     
STORED BY     
  'org.apache.hadoop.hive.hbase.HBaseStorageHandler'     
WITH SERDEPROPERTIES (     
  'hbase.columns.mapping'=':key,cf:data_from,cf:goods_id',     
  'serialization.format'='1')    
TBLPROPERTIES (    
  'hbase.table.name'='dim_goods_xxxxx',     
  'last_modified_by'='hive',     
  'last_modified_time'='1722103676',     
  'transient_lastDdlTime'='1722103676')    
;

Note the hbase.columns.mapping property: the mapping is positional, so the Hive columns map to :key, cf:data_from, and cf:goods_id in that order.

The load job

insert into xxx.dim_goods_xxxxx select key,data_from,goods_id from xxx.xxxx;


Method 2: BulkLoad. Spark reads the Hive table, writes the data out as the HFile files HBase needs on HDFS, and then BulkLoad moves those files into the HBase table.
Pros: the HBase RegionServers feel no write pressure; the load is essentially invisible to them.
Cons: you have to write code.

Steps:
1. Create the HBase table with pre-split regions; choose the split count sensibly, since too many or too few regions are both bad.
2. Read the source data into a DataFrame with a SparkSession.
3. Wrap the DataFrame rows into the HFile key/value pairs HBase needs (JavaPairRDD<ImmutableBytesWritable, KeyValue> hbaseDF2) and write them to HDFS with saveAsNewAPIHadoopFile.
  Step 3 has some details that must be handled carefully, or the write will not succeed:
  3.1 The records must be sorted by rowkey + family + qualifier together. An unsorted write may happen to work on a small data set, but on a large one it will definitely fail. This is critical; see the sketch right after this list.
  3.2 Repartition and sort within partitions according to the HBase table's pre-split regions (repartitionAndSortWithinPartitions).
  3.3 Write the result with saveAsNewAPIHadoopFile.
4. Call LoadIncrementalHFiles.doBulkLoad to finish.
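
A quick illustration of the sort requirement in 3.1 (a standalone sketch, not part of the job code): KeyValue.COMPARATOR, the same comparator used in HBaseTableUtil below, orders cells by rowkey first, then family, then qualifier, which is exactly the order the HFiles must be written in.

import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.util.Bytes;

public class KeyValueOrderDemo {
    public static void main(String[] args) {
        byte[] cf = Bytes.toBytes("cf");
        KeyValue a = new KeyValue(Bytes.toBytes("row1"), cf, Bytes.toBytes("data_from"), Bytes.toBytes("1"));
        KeyValue b = new KeyValue(Bytes.toBytes("row1"), cf, Bytes.toBytes("goods_id"), Bytes.toBytes("2"));
        KeyValue c = new KeyValue(Bytes.toBytes("row2"), cf, Bytes.toBytes("data_from"), Bytes.toBytes("3"));

        // Same rowkey and family: the qualifier decides the order, so a sorts before b
        System.out.println(KeyValue.COMPARATOR.compare(a, b) < 0); // true
        // Different rowkeys: the rowkey decides the order, so b sorts before c
        System.out.println(KeyValue.COMPARATOR.compare(b, c) < 0); // true
    }
}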

The full code follows.

public class SparkHBaseSumit {
    private static SparkSession spark = null;
    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {

        HashMap<String, String> argMap = new HashMap<String, String>();
        if (args.length > 0) {
            for (String arg : args) {
                int firstDelimiterIndex = arg.trim().indexOf("=");
                if (firstDelimiterIndex != -1) {
                    String firstPart = arg.substring(0, firstDelimiterIndex);
                    String restPart = arg.substring(firstDelimiterIndex + 1);
                    argMap.put(firstPart, restPart);
                }
            }
        } else {
            return;
        }

        Long begin = System.currentTimeMillis();

        String sparkSql = (argMap.containsKey("sparkSql")) ? argMap.get("sparkSql") : "";
        String hbaseTable = (argMap.containsKey("hbaseTable")) ? argMap.get("hbaseTable") : "";
        String cf = (argMap.containsKey("family")) ? argMap.get("family") : "cf";
        String p = (argMap.containsKey("p")) ? argMap.get("p") : "";
        String appName = (argMap.containsKey("appName")) ? argMap.get("appName") : "spark_hbase_bluckload_" + hbaseTable;
        if ("".equals(sparkSql) || "".equals(hbaseTable)) {
            System.out.println("sparkSql hbaseTable is not null");
            return;
        }
        spark = getSparkSession(appName);
        if(!"".equals(p)){
            spark.conf().set("spark.sql.shuffle.partitions", p);
        }
        Dataset<Row> df = spark.sql(sparkSql);
        df.show();
        String[] columns = df.columns();
        String rowkey = columns[0];
        List<String> columnsList = new ArrayList<>(Arrays.asList(columns));
        columnsList.remove(rowkey);
        String[] cols = columnsList.toArray(new String[columnsList.size()]);
        Arrays.sort(cols);

        byte[] family = Bytes.toBytes(cf);
        byte[][] colBytes = new byte[cols.length][];
        for (int i = 0; i < cols.length; i++) {
            colBytes[i] = Bytes.toBytes(cols[i]);
        }

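        // Note on the pair layout below: the RDD key is (rowkey, "family:qualifier") rather than just
        // the rowkey, so that repartitionAndSortWithinPartitions can order records by rowkey AND
        // qualifier, matching the sort order HFileOutputFormat2 expects inside each HFile.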
        JavaPairRDD<Tuple2<ImmutableBytesWritable, byte[]>, KeyValue> hbaseDF = df.javaRDD().flatMapToPair(row -> HBaseTableUtil.generateHBaseRowWithQualifier(row, rowkey, cf, family, cols, colBytes).iterator());

        Long cost = (System.currentTimeMillis() - begin) / 60000;
        System.out.println("rdd cost time = " + cost + " m");

        FileSystem fileSystem = FileSystem.get(spark.sparkContext().hadoopConfiguration());
        // If the directory that will hold the HFiles already exists, delete it
        String hFilePath = "/tmp/" + appName;
        if (fileSystem.exists(new Path(hFilePath))) {
            fileSystem.delete(new Path(hFilePath), true);
            System.out.println("del hdfs dir" + hFilePath);
        }
        Configuration conf = HBaseConfiguration.create();
        conf.set("hbase.zookeeper.quorum", "xxxxxxxxxxx");
        conf.set("hbase.zookeeper.znode.parent", "/hbase");
        conf.set("hbase.zookeeper.property.clientPort", "2181");
        conf.set("hbase.client.ipc.pool.size", "10");
        conf.set("hbase.mapreduce.hfileoutputformat.table.name", hbaseTable);
        conf.set("hbase.mapreduce.hfileoutputformat.compression", "gz");
//        conf.set("hbase.mapreduce.hfileoutputformat.compression", "snappy");
//        conf.set("hbase.mapreduce.hfileoutputformat.compress", "true");
//        conf.set("hbase.mapreduce.hfileoutputformat.compress.type", "BLOCK");
//        conf.set("hbase.mapreduce.hfileoutputformat.compress.codec", "org.apache.hadoop.io.compress.SnappyCodec");
        conf.set("hhbase.bulkload.retries.number", "0");

        // To load the HFiles into HBase later, first create a Job instance
        // and set the map output key/value classes used by HFileOutputFormat2
        Job job = Job.getInstance(conf);
        job.setJobName(hbaseTable);
        job.setMapOutputKeyClass(ImmutableBytesWritable.class);
        job.setMapOutputValueClass(KeyValue.class);


        Connection connection = ConnectionFactory.createConnection(conf);
        Admin admin = connection.getAdmin();
        Table table = connection.getTable(TableName.valueOf(hbaseTable));
        RegionLocator regionLocator = connection.getRegionLocator(TableName.valueOf(hbaseTable));
        List<HRegionInfo> hRegionInfos = admin.getTableRegions(TableName.valueOf(hbaseTable));
        ArrayList<String> regionSplits = new ArrayList<>();
        for (HRegionInfo item : hRegionInfos) {
            regionSplits.add(new String(item.getEndKey()));
        }

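        // The last region's end key is empty, so drop it: the partitioner only needs the real split
        // boundaries between regions.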
        regionSplits.remove(regionSplits.size() - 1);
        JavaPairRDD<ImmutableBytesWritable, KeyValue> hbaseDF2 = null;
        if (regionSplits.size() > 0) {
            hbaseDF2 = hbaseDF.repartitionAndSortWithinPartitions(new RegionPartitioner(regionSplits.toArray(new String[regionSplits.size()])), new KeyQualifierComparator())
                    .mapToPair(row -> new Tuple2<>(row._1()._1(), row._2()));
        } else {
            hbaseDF2 = hbaseDF.mapToPair(row -> new Tuple2<>(row._1()._1(), row._2()));
        }

        hbaseDF2.saveAsNewAPIHadoopFile(hFilePath,
                ImmutableBytesWritable.class,
                KeyValue.class,
                HFileOutputFormat2.class,
                job.getConfiguration()
        );
        Long cost2 = (System.currentTimeMillis() - begin) / 60000;
        System.out.println("write cost time = " + (cost2 - cost) + " m");
        System.out.println("data write ok");

        HFileOutputFormat2.configureIncrementalLoad(job, table, regionLocator);
        LoadIncrementalHFiles load = new LoadIncrementalHFiles(conf);
        // Call doBulkLoad with the HFile output path, the Admin, the target Table, and its RegionLocator
        load.doBulkLoad(new Path(hFilePath), admin, table, regionLocator);

        Long cost3 = (System.currentTimeMillis() - begin) / 60000;
        System.out.println("load cost time = " + (cost3 - cost2) + " m");
        System.out.println("file load ok");

        // Clean up the HFile directory
        if (fileSystem.exists(new Path(hFilePath))) {
            fileSystem.delete(new Path(hFilePath), true);
            System.out.println("del hdfs dir" + hFilePath);
        }
        fileSystem.close();
        admin.close();
        connection.close();
        spark.stop();
    }

    /**
     * Build the SparkSession (Hive support enabled).
     *
     * @return the SparkSession
     */
    private static SparkSession getSparkSession(String appName) {
//        System.setProperty("HADOOP_USER_NAME", "hive");
        SparkSession spark = SparkSession.builder()
//                .master("local[*]")
                .appName(appName)
                .config(SQLConf.PARTITION_OVERWRITE_MODE().key(), "dynamic")
                .config("spark.hadoop." + METASTOREURIS.varname, "thrift://xxxx")
                .config("spark.sql.warehouse.dir", "/user/hive/warehouse")
                .config("spark.executor.heartbeatInterval", "100000")
                .config("spark.network.timeoutInterval", "100000")
                .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
                .config("spark.executor.extraJavaOptions", "-XX:+UseG1GC -XX:G1HeapRegionSize=32M -XX:+UseLargePages")

                .enableHiveSupport()
                .getOrCreate();

        // Set the HDFS HA failover proxy provider on the Hadoop configuration if your cluster needs it
        // (the exact property key depends on the nameservice; "nameservice2" here is assumed from the
        // submit script below):
        // spark.sparkContext().hadoopConfiguration().set(
        //         "dfs.client.failover.proxy.provider.nameservice2",
        //         "org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider");
        return spark;
    }
}

RDD mapping code

public class HBaseTableUtil {

    public static List<Tuple2<Tuple2<ImmutableBytesWritable, byte[]>, KeyValue>> generateHBaseRowWithQualifier(Row row, String rowkey, String cf, byte[] family, String[] cols, byte[][] colBytes) {
        List<Tuple2<Tuple2<ImmutableBytesWritable, byte[]>, KeyValue>> list = new LinkedList<>();
        String rowkeyVal = row.getAs(rowkey).toString();
        byte[] rowkeyByte = Bytes.toBytes(rowkeyVal);
        ImmutableBytesWritable writable = new ImmutableBytesWritable(rowkeyByte);
        Set<KeyValue> map = new TreeSet<KeyValue>(KeyValue.COMPARATOR);
        int j = 0;
        for (String col : cols) {
            String val = (row.getAs(col) != null) ? row.getAs(col).toString() : "";
//            KeyValue keyValue = new KeyValue(rowkeyByte, family, Bytes.toBytes(col), Bytes.toBytes(val));
            KeyValue keyValue = new KeyValue(rowkeyByte, family, colBytes[j], Bytes.toBytes(val));
            map.add(keyValue);
            j++;
        }
        // This is critical: the cells must be ordered by rowkey + family + qualifier together
        int i = 0;
        for (KeyValue kv : map) {
            String familyAndQualifier = cf + ":" + cols[i];
            list.add(new Tuple2<>(new Tuple2<>(writable, Bytes.toBytes(familyAndQualifier)), kv));
            i++;
        }
        return list;
    }
}

Partitioner code

public class RegionPartitioner extends Partitioner {

    private final String[] endKeys;
    private final int numPartitions;

    public RegionPartitioner(String[] endKeys){
        this.endKeys = endKeys;
        this.numPartitions = endKeys.length + 1;
    }
    @Override
    public int numPartitions() {
        return this.numPartitions;
    }

    @Override
    public int getPartition(Object key) {
        if (this.endKeys.length == 0) {
            // If the HBase table has no split information, write everything into a single partition.
            // In testing, data in that single partition does not get sorted, so avoid reaching this branch when using RegionPartitioner.
            return 0;
        }else if(key instanceof Tuple2 ){
            if (((Tuple2) key)._1() instanceof ImmutableBytesWritable) {
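                // Compare only the leading endKeys[0].length() characters of the rowkey against the
                // region boundaries; this assumes the rowkey prefix lines up with the HexStringSplit keys.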
                byte[] keyBytes = ((ImmutableBytesWritable) ((Tuple2) key)._1()).copyBytes();
                String comparedKey = new String(keyBytes).substring(0, endKeys[0].length());
                for (int i = 0; i < this.endKeys.length; i++) {
                    if (comparedKey.compareTo(endKeys[i]) < 0) {
                        return i;
                    }
                }
                return endKeys.length;
            }
        } else if(key instanceof ImmutableBytesWritable) {
            byte[] keyBytes = ((ImmutableBytesWritable) key).copyBytes();
            String comparedKey = new String(keyBytes).substring(0, endKeys[0].length());
            for (int i = 0; i < this.endKeys.length; i++) {
                if (comparedKey.compareTo(endKeys[i]) < 0) {
                    return i;
                }
            }
            return endKeys.length;
        }
        return 0;
    }
}

Comparator code

public class KeyQualifierComparator implements Comparator<Tuple2<ImmutableBytesWritable, byte[]>>, Serializable {
    @Override
    public int compare(Tuple2<ImmutableBytesWritable, byte[]> o1, Tuple2<ImmutableBytesWritable, byte[]> o2) {

        if (o1._1().compareTo(o2._1()) == 0) {
            return Bytes.compareTo(o1._2(), o2._2());
        } else {
            return o1._1().compareTo(o2._1());
        }
    }
}

pom.xml 

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>cn.gaojihealth.bdp</groupId>
    <artifactId>diff-data</artifactId>
    <version>1.0-SNAPSHOT</version>

    <properties>
        <maven.compiler.source>8</maven.compiler.source>
        <maven.compiler.target>8</maven.compiler.target>
        <spark.version>3.2.0</spark.version>
        <scala.version>2.12</scala.version>
        <flink.version>1.17.0</flink.version>
        <commons.lang3.version>3.7</commons.lang3.version>
        <commons.io.version>2.6</commons.io.version>
    </properties>


    <dependencies>

        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_${scala.version}</artifactId>
            <version>${spark.version}</version>
            <scope>provided</scope>
            <exclusions>
                <exclusion>
                    <groupId>org.xerial.snappy</groupId>
                    <artifactId>snappy-java</artifactId>
                </exclusion>
            </exclusions>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_${scala.version}</artifactId>
            <version>${spark.version}</version>
            <scope>provided</scope>
            <exclusions>
                <exclusion>
                    <groupId>org.xerial.snappy</groupId>
                    <artifactId>snappy-java</artifactId>
                </exclusion>
            </exclusions>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-hive_${scala.version}</artifactId>
            <version>${spark.version}</version>
            <scope>provided</scope>
            <exclusions>
                <exclusion>
                    <groupId>org.xerial.snappy</groupId>
                    <artifactId>snappy-java</artifactId>
                </exclusion>
            </exclusions>
        </dependency>

        <!-- hbase dependencies start -->
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-client</artifactId>
            <version>2.2.0</version>
        </dependency>


        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-common</artifactId>
            <version>2.2.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-server</artifactId>
            <version>2.2.0</version>
        </dependency>

        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-mapreduce</artifactId>
            <version>2.2.0</version>
        </dependency>

    </dependencies>

    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.8.0</version>
                <configuration>
                    <source>8</source>
                    <target>8</target>
                </configuration>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-assembly-plugin</artifactId>
                <version>3.0.0</version>
                <configuration>
                    <archive>
                    </archive>
                    <descriptorRefs>
                        <descriptorRef>jar-with-dependencies</descriptorRef>
                    </descriptorRefs>
                </configuration>
                <executions>
                    <execution>
                        <id>make-assembly</id> <!-- this is used for inheritance merges -->
                        <phase>package</phase> <!-- bind the assembly (jar merge) to the package phase -->
                        <goals>
                            <goal>single</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>

</project>

spark-submit command

#!/bin/bash

dt=`date -d "1 days ago" +%Y%m%d`
appName="spark_hbase_bluckload_dim_goods_xxxx"
sparkSql="select rowkey ,data_from ,goods_id ,is_otc ,atc1_new ,stjb ,main_goods_id ,flag_disease ,is_del ,dtp_atc ,is_gold ,class_code1 ,own_brand_flag from  tmp.b;"
hbaseTable="dim:dim_goods_category_manual_all2" 
family=cf

echo "$sparkSql"

export SPARK_HOME=/data/appdata/spark-3.2.0-bin-hadoop3.2 
spark-submit \
--conf spark.yarn.submit.waitAppCompletion=true \
--conf spark.network.maxRemoteBlockSizeFetchToMem=2147483000 \
--conf spark.network.timeout=300000 \
--conf spark.rpc.message.maxSize=1024 \
--conf spark.executor.heartbeatInterval=300000 \
--conf spark.driver.maxResultSize=8G \
--master yarn \
--deploy-mode cluster \
--class task.SparkHBaseSumit \
--driver-memory 8G \
--executor-memory 32G \
--executor-cores 1 \
--num-executors 1 \
--name ${appName} \
--queue root.ODS \
hdfs://nameservice2/BDP/spark/spark-hbase-bluckload.jar appName="${appName}" sparkSql="${sparkSql}" hbaseTable="${hbaseTable}" family="${family}"


spark-hbase-bluckload.jar is the jar built from the Java project above.
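
After the load completes, a quick spot check with the HBase client confirms the data is readable; a minimal sketch, with the ZooKeeper quorum and the rowkey as placeholders (use a rowkey that actually exists in the source query):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.util.Bytes;

public class BulkLoadSpotCheck {
    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        conf.set("hbase.zookeeper.quorum", "xxxxxxxxxxx"); // placeholder
        try (Connection connection = ConnectionFactory.createConnection(conf);
             Table table = connection.getTable(TableName.valueOf("dim:dim_goods_category_manual_all2"))) {
            // "someRowkey" is a placeholder; pick a rowkey returned by the Spark SQL query
            Result result = table.get(new Get(Bytes.toBytes("someRowkey")));
            for (Cell cell : result.rawCells()) {
                System.out.println(Bytes.toString(CellUtil.cloneQualifier(cell)) + " = "
                        + Bytes.toString(CellUtil.cloneValue(cell)));
            }
        }
    }
}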
