spark-hbase-BulkLoad


Applies to HBase 2.x.

Background: there are roughly two ways to load data from a Hive table into an HBase table, each with its own trade-offs.

Method 1: create the HBase table first, then create a Hive external table mapped onto it, set up the column mapping, and run an MR job that inserts into the external table.
Pros: simple.
Cons: with a large data volume the writes put heavy pressure on the HBase RegionServers, which can lead to "region server not online" errors, crashes, and an unavailable service; small volumes are fine.

Create the HBase table

create 'dim_goods_xxxxx',{NAME=>'cf',VERSIONS=>'1',TTL => '604800',COMPRESSION =>'SNAPPY',CONFIGURATION=>{'hbase.hstore.blockingStoreFiles'=>'8'}},{NUMREGIONS => 30, SPLITALGO => 'HexStringSplit'}
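
To sanity-check that the pre-splits took effect (the table should come up with 30 regions), a minimal sketch with the HBase 2.x Java client can list them; the ZooKeeper quorum below is a placeholder, not a value from this post:

import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.RegionInfo;
import org.apache.hadoop.hbase.util.Bytes;

public class CheckRegions {
    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        conf.set("hbase.zookeeper.quorum", "xxxxxxxxxxx"); // placeholder
        try (Connection connection = ConnectionFactory.createConnection(conf);
             Admin admin = connection.getAdmin()) {
            // List every region of the newly created table with its start/end key
            List<RegionInfo> regions = admin.getRegions(TableName.valueOf("dim_goods_xxxxx"));
            System.out.println("region count = " + regions.size());
            for (RegionInfo r : regions) {
                System.out.println(Bytes.toStringBinary(r.getStartKey()) + " ~ " + Bytes.toStringBinary(r.getEndKey()));
            }
        }
    }
}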


Create the Hive external table on top of HBase

CREATE EXTERNAL TABLE `xxx.dim_goods_xxxxx`(    
  `key` string COMMENT '',     
  `data_from` string COMMENT '企业id',     
  `goods_id` string COMMENT '商品id')    
ROW FORMAT SERDE     
  'org.apache.hadoop.hive.hbase.HBaseSerDe'     
STORED BY     
  'org.apache.hadoop.hive.hbase.HBaseStorageHandler'     
WITH SERDEPROPERTIES (     
  'hbase.columns.mapping'=':key,cf:data_from,cf:goods_id',     
  'serialization.format'='1')    
TBLPROPERTIES (    
  'hbase.table.name'='dim_goods_xxxxx',     
  'last_modified_by'='hive',     
  'last_modified_time'='1722103676',     
  'transient_lastDdlTime'='1722103676')    
;

Note the hbase.columns.mapping property: the mapping is positional, so the Hive columns map to :key, cf:data_from, and cf:goods_id in that order.

The load job

insert into xxx.dim_goods_xxxxx select key,data_from,goods_id from xxx.xxxx;


Method 2: BulkLoad. Spark reads the Hive table, writes the data out as the HFile files HBase needs on HDFS, and then BulkLoad moves those files into the HBase table.
Pros: the HBase RegionServers feel no write pressure; the load is essentially invisible to them.
Cons: you have to write code.

Steps:
1. Create the HBase table with pre-split regions; choose the split count sensibly, since too many or too few regions are both bad.
2. Read the source data into a DataFrame with a SparkSession.
3. Wrap the DataFrame rows into the HFile key/value pairs HBase needs (JavaPairRDD<ImmutableBytesWritable, KeyValue> hbaseDF2) and write them to HDFS with saveAsNewAPIHadoopFile.
  Step 3 has some details that must be handled carefully, or the write will not succeed:
  3.1 The records must be sorted by rowkey + family + qualifier together. An unsorted write may happen to work on a small data set, but on a large one it will definitely fail. This is critical; see the sketch right after this list.
  3.2 Repartition and sort within partitions according to the HBase table's pre-split regions (repartitionAndSortWithinPartitions).
  3.3 Write the result with saveAsNewAPIHadoopFile.
4. Call LoadIncrementalHFiles.doBulkLoad to finish.
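
A quick illustration of the sort requirement in 3.1 (a standalone sketch, not part of the job code): KeyValue.COMPARATOR, the same comparator used in HBaseTableUtil below, orders cells by rowkey first, then family, then qualifier, which is exactly the order the HFiles must be written in.

import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.util.Bytes;

public class KeyValueOrderDemo {
    public static void main(String[] args) {
        byte[] cf = Bytes.toBytes("cf");
        KeyValue a = new KeyValue(Bytes.toBytes("row1"), cf, Bytes.toBytes("data_from"), Bytes.toBytes("1"));
        KeyValue b = new KeyValue(Bytes.toBytes("row1"), cf, Bytes.toBytes("goods_id"), Bytes.toBytes("2"));
        KeyValue c = new KeyValue(Bytes.toBytes("row2"), cf, Bytes.toBytes("data_from"), Bytes.toBytes("3"));

        // Same rowkey and family: the qualifier decides the order, so a sorts before b
        System.out.println(KeyValue.COMPARATOR.compare(a, b) < 0); // true
        // Different rowkeys: the rowkey decides the order, so b sorts before c
        System.out.println(KeyValue.COMPARATOR.compare(b, c) < 0); // true
    }
}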

The full code follows.

public class SparkHBaseSumit {
    private static SparkSession spark = null;
    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {

        HashMap<String, String> argMap = new HashMap<String, String>();
        if (args.length > 0) {
            for (String arg : args) {
                int firstDelimiterIndex = arg.trim().indexOf("=");
                if (firstDelimiterIndex != -1) {
                    String firstPart = arg.substring(0, firstDelimiterIndex);
                    String restPart = arg.substring(firstDelimiterIndex + 1);
                    argMap.put(firstPart, restPart);
                }
            }
        } else {
            return;
        }

        Long begin = System.currentTimeMillis();

        String sparkSql = (argMap.containsKey("sparkSql")) ? argMap.get("sparkSql") : "";
        String hbaseTable = (argMap.containsKey("hbaseTable")) ? argMap.get("hbaseTable") : "";
        String cf = (argMap.containsKey("family")) ? argMap.get("family") : "cf";
        String p = (argMap.containsKey("p")) ? argMap.get("p") : "";
        String appName = (argMap.containsKey("appName")) ? argMap.get("appName") : "spark_hbase_bluckload_" + hbaseTable;
        if ("".equals(sparkSql) || "".equals(hbaseTable)) {
            System.out.println("sparkSql hbaseTable is not null");
            return;
        }
        spark = getSparkSession(appName);
        if(!"".equals(p)){
            spark.conf().set("spark.sql.shuffle.partitions", p);
        }
        Dataset<Row> df = spark.sql(sparkSql);
        df.show();
        String[] columns = df.columns();
        String rowkey = columns[0];
        List<String> columnsList = new ArrayList<>(Arrays.asList(columns));
        columnsList.remove(rowkey);
        String[] cols = columnsList.toArray(new String[columnsList.size()]);
        Arrays.sort(cols);

        byte[] family = Bytes.toBytes(cf);
        byte[][] colBytes = new byte[cols.length][];
        for (int i = 0; i < cols.length; i++) {
            colBytes[i] = Bytes.toBytes(cols[i]);
        }

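        // Note on the pair layout below: the RDD key is (rowkey, "family:qualifier") rather than just
        // the rowkey, so that repartitionAndSortWithinPartitions can order records by rowkey AND
        // qualifier, matching the sort order HFileOutputFormat2 expects inside each HFile.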
        JavaPairRDD<Tuple2<ImmutableBytesWritable, byte[]>, KeyValue> hbaseDF = df.javaRDD().flatMapToPair(row -> HBaseTableUtil.generateHBaseRowWithQualifier(row, rowkey, cf, family, cols, colBytes).iterator());

        Long cost = (System.currentTimeMillis() - begin) / 60000;
        System.out.println("rdd cost time = " + cost + " m");

        FileSystem fileSystem = FileSystem.get(spark.sparkContext().hadoopConfiguration());
        // If the directory that will hold the HFiles already exists, delete it
        String hFilePath = "/tmp/" + appName;
        if (fileSystem.exists(new Path(hFilePath))) {
            fileSystem.delete(new Path(hFilePath), true);
            System.out.println("del hdfs dir" + hFilePath);
        }
        Configuration conf = HBaseConfiguration.create();
        conf.set("hbase.zookeeper.quorum", "xxxxxxxxxxx");
        conf.set("hbase.zookeeper.znode.parent", "/hbase");
        conf.set("hbase.zookeeper.property.clientPort", "2181");
        conf.set("hbase.client.ipc.pool.size", "10");
        conf.set("hbase.mapreduce.hfileoutputformat.table.name", hbaseTable);
        conf.set("hbase.mapreduce.hfileoutputformat.compression", "gz");
//        conf.set("hbase.mapreduce.hfileoutputformat.compression", "snappy");
//        conf.set("hbase.mapreduce.hfileoutputformat.compress", "true");
//        conf.set("hbase.mapreduce.hfileoutputformat.compress.type", "BLOCK");
//        conf.set("hbase.mapreduce.hfileoutputformat.compress.codec", "org.apache.hadoop.io.compress.SnappyCodec");
        conf.set("hhbase.bulkload.retries.number", "0");

        // To load the HFiles into HBase later, first create a Job instance
        // and set the map output key/value classes used by HFileOutputFormat2
        Job job = Job.getInstance(conf);
        job.setJobName(hbaseTable);
        job.setMapOutputKeyClass(ImmutableBytesWritable.class);
        job.setMapOutputValueClass(KeyValue.class);


        Connection connection = ConnectionFactory.createConnection(conf);
        Admin admin = connection.getAdmin();
        Table table = connection.getTable(TableName.valueOf(hbaseTable));
        RegionLocator regionLocator = connection.getRegionLocator(TableName.valueOf(hbaseTable));
        List<HRegionInfo> hRegionInfos = admin.getTableRegions(TableName.valueOf(hbaseTable));
        ArrayList<String> regionSplits = new ArrayList<>();
        for (HRegionInfo item : hRegionInfos) {
            regionSplits.add(new String(item.getEndKey()));
        }

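        // The last region's end key is empty, so drop it: the partitioner only needs the real split
        // boundaries between regions.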
        regionSplits.remove(regionSplits.size() - 1);
        JavaPairRDD<ImmutableBytesWritable, KeyValue> hbaseDF2 = null;
        if (regionSplits.size() > 0) {
            hbaseDF2 = hbaseDF.repartitionAndSortWithinPartitions(new RegionPartitioner(regionSplits.toArray(new String[regionSplits.size()])), new KeyQualifierComparator())
                    .mapToPair(row -> new Tuple2<>(row._1()._1(), row._2()));
        } else {
            hbaseDF2 = hbaseDF.mapToPair(row -> new Tuple2<>(row._1()._1(), row._2()));
        }

        hbaseDF2.saveAsNewAPIHadoopFile(hFilePath,
                ImmutableBytesWritable.class,
                KeyValue.class,
                HFileOutputFormat2.class,
                job.getConfiguration()
        );
        Long cost2 = (System.currentTimeMillis() - begin) / 60000;
        System.out.println("write cost time = " + (cost2 - cost) + " m");
        System.out.println("data write ok");

        HFileOutputFormat2.configureIncrementalLoad(job, table, regionLocator);
        LoadIncrementalHFiles load = new LoadIncrementalHFiles(conf);
        // Call doBulkLoad with the HFile output path, the Admin, the target Table, and its RegionLocator
        load.doBulkLoad(new Path(hFilePath), admin, table, regionLocator);

        Long cost3 = (System.currentTimeMillis() - begin) / 60000;
        System.out.println("load cost time = " + (cost3 - cost2) + " m");
        System.out.println("file load ok");

        // Clean up the HFile directory
        if (fileSystem.exists(new Path(hFilePath))) {
            fileSystem.delete(new Path(hFilePath), true);
            System.out.println("del hdfs dir" + hFilePath);
        }
        fileSystem.close();
        admin.close();
        connection.close();
        spark.stop();
    }

    /**
     * Build the SparkSession (Hive support enabled).
     *
     * @return the SparkSession
     */
    private static SparkSession getSparkSession(String appName) {
//        System.setProperty("HADOOP_USER_NAME", "hive");
        SparkSession spark = SparkSession.builder()
//                .master("local[*]")
                .appName(appName)
                .config(SQLConf.PARTITION_OVERWRITE_MODE().key(), "dynamic")
                .config("spark.hadoop." + METASTOREURIS.varname, "thrift://xxxx")
                .config("spark.sql.warehouse.dir", "/user/hive/warehouse")
                .config("spark.executor.heartbeatInterval", "100000")
                .config("spark.network.timeoutInterval", "100000")
                .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
                .config("spark.executor.extraJavaOptions", "-XX:+UseG1GC -XX:G1HeapRegionSize=32M -XX:+UseLargePages")

                .enableHiveSupport()
                .getOrCreate();

        // Set the HDFS HA failover proxy provider on the Hadoop configuration if your cluster needs it
        // (the exact property key depends on the nameservice; "nameservice2" here is assumed from the
        // submit script below):
        // spark.sparkContext().hadoopConfiguration().set(
        //         "dfs.client.failover.proxy.provider.nameservice2",
        //         "org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider");
        return spark;
    }
}

RDD mapping code

public class HBaseTableUtil {

    public static List<Tuple2<Tuple2<ImmutableBytesWritable, byte[]>, KeyValue>> generateHBaseRowWithQualifier(Row row, String rowkey, String cf, byte[] family, String[] cols, byte[][] colBytes) {
        List<Tuple2<Tuple2<ImmutableBytesWritable, byte[]>, KeyValue>> list = new LinkedList<>();
        String rowkeyVal = row.getAs(rowkey).toString();
        byte[] rowkeyByte = Bytes.toBytes(rowkeyVal);
        ImmutableBytesWritable writable = new ImmutableBytesWritable(rowkeyByte);
        Set<KeyValue> map = new TreeSet<KeyValue>(KeyValue.COMPARATOR);
        int j = 0;
        for (String col : cols) {
            String val = (row.getAs(col) != null) ? row.getAs(col).toString() : "";
//            KeyValue keyValue = new KeyValue(rowkeyByte, family, Bytes.toBytes(col), Bytes.toBytes(val));
            KeyValue keyValue = new KeyValue(rowkeyByte, family, colBytes[j], Bytes.toBytes(val));
            map.add(keyValue);
            j++;
        }
        // This is critical: the cells must be ordered by rowkey + family + qualifier together
        int i = 0;
        for (KeyValue kv : map) {
            String familyAndQualifier = cf + ":" + cols[i];
            list.add(new Tuple2<>(new Tuple2<>(writable, Bytes.toBytes(familyAndQualifier)), kv));
            i++;
        }
        return list;
    }
}

Partitioner code

public class RegionPartitioner extends Partitioner {

    private final String[] endKeys;
    private final int numPartitions;

    public RegionPartitioner(String[] endKeys){
        this.endKeys = endKeys;
        this.numPartitions = endKeys.length + 1;
    }
    @Override
    public int numPartitions() {
        return this.numPartitions;
    }

    @Override
    public int getPartition(Object key) {
        if (this.endKeys.length == 0) {
            // If the HBase table has no split information, write everything into a single partition.
            // In testing, data in that single partition does not get sorted, so avoid reaching this branch when using RegionPartitioner.
            return 0;
        }else if(key instanceof Tuple2 ){
            if (((Tuple2) key)._1() instanceof ImmutableBytesWritable) {
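                // Compare only the leading endKeys[0].length() characters of the rowkey against the
                // region boundaries; this assumes the rowkey prefix lines up with the HexStringSplit keys.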
                byte[] keyBytes = ((ImmutableBytesWritable) ((Tuple2) key)._1()).copyBytes();
                String comparedKey = new String(keyBytes).substring(0, endKeys[0].length());
                for (int i = 0; i < this.endKeys.length; i++) {
                    if (comparedKey.compareTo(endKeys[i]) < 0) {
                        return i;
                    }
                }
                return endKeys.length;
            }
        } else if(key instanceof ImmutableBytesWritable) {
            byte[] keyBytes = ((ImmutableBytesWritable) key).copyBytes();
            String comparedKey = new String(keyBytes).substring(0, endKeys[0].length());
            for (int i = 0; i < this.endKeys.length; i++) {
                if (comparedKey.compareTo(endKeys[i]) < 0) {
                    return i;
                }
            }
            return endKeys.length;
        }
        return 0;
    }
}

Comparator code

public class KeyQualifierComparator implements Comparator<Tuple2<ImmutableBytesWritable, byte[]>>, Serializable {
    @Override
    public int compare(Tuple2<ImmutableBytesWritable, byte[]> o1, Tuple2<ImmutableBytesWritable, byte[]> o2) {

        if (o1._1().compareTo(o2._1()) == 0) {
            return Bytes.compareTo(o1._2(), o2._2());
        } else {
            return o1._1().compareTo(o2._1());
        }
    }
}

pom.xml 

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>cn.gaojihealth.bdp</groupId>
    <artifactId>diff-data</artifactId>
    <version>1.0-SNAPSHOT</version>

    <properties>
        <maven.compiler.source>8</maven.compiler.source>
        <maven.compiler.target>8</maven.compiler.target>
        <spark.version>3.2.0</spark.version>
        <scala.version>2.12</scala.version>
        <flink.version>1.17.0</flink.version>
        <commons.lang3.version>3.7</commons.lang3.version>
        <commons.io.version>2.6</commons.io.version>
    </properties>


    <dependencies>

        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_${scala.version}</artifactId>
            <version>${spark.version}</version>
            <scope>provided</scope>
            <exclusions>
                <exclusion>
                    <groupId>org.xerial.snappy</groupId>
                    <artifactId>snappy-java</artifactId>
                </exclusion>
            </exclusions>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_${scala.version}</artifactId>
            <version>${spark.version}</version>
            <scope>provided</scope>
            <exclusions>
                <exclusion>
                    <groupId>org.xerial.snappy</groupId>
                    <artifactId>snappy-java</artifactId>
                </exclusion>
            </exclusions>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-hive_${scala.version}</artifactId>
            <version>${spark.version}</version>
            <scope>provided</scope>
            <exclusions>
                <exclusion>
                    <groupId>org.xerial.snappy</groupId>
                    <artifactId>snappy-java</artifactId>
                </exclusion>
            </exclusions>
        </dependency>

        <!-- hbase dependencies start -->
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-client</artifactId>
            <version>2.2.0</version>
        </dependency>


        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-common</artifactId>
            <version>2.2.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-server</artifactId>
            <version>2.2.0</version>
        </dependency>

        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-mapreduce</artifactId>
            <version>2.2.0</version>
        </dependency>

    </dependencies>

    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.8.0</version>
                <configuration>
                    <source>8</source>
                    <target>8</target>
                </configuration>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-assembly-plugin</artifactId>
                <version>3.0.0</version>
                <configuration>
                    <archive>
                    </archive>
                    <descriptorRefs>
                        <descriptorRef>jar-with-dependencies</descriptorRef>
                    </descriptorRefs>
                </configuration>
                <executions>
                    <execution>
                        <id>make-assembly</id> <!-- this is used for inheritance merges -->
                        <phase>package</phase> <!-- bind the assembly (jar merge) to the package phase -->
                        <goals>
                            <goal>single</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>

</project>

spark-submit command

#!/bin/bash

dt=`date -d "1 days ago" +%Y%m%d`
appName="spark_hbase_bluckload_dim_goods_xxxx"
sparkSql="select rowkey ,data_from ,goods_id ,is_otc ,atc1_new ,stjb ,main_goods_id ,flag_disease ,is_del ,dtp_atc ,is_gold ,class_code1 ,own_brand_flag from  tmp.b;"
hbaseTable="dim:dim_goods_category_manual_all2" 
family=cf

echo "$sparkSql"

export SPARK_HOME=/data/appdata/spark-3.2.0-bin-hadoop3.2 
spark-submit \
--conf spark.yarn.submit.waitAppCompletion=true \
--conf spark.network.maxRemoteBlockSizeFetchToMem=2147483000 \
--conf spark.network.timeout=300000 \
--conf spark.rpc.message.maxSize=1024 \
--conf spark.executor.heartbeatInterval=300000 \
--conf spark.driver.maxResultSize=8G \
--master yarn \
--deploy-mode cluster \
--class task.SparkHBaseSumit \
--driver-memory 8G \
--executor-memory 32G \
--executor-cores 1 \
--num-executors 1 \
--name ${appName} \
--queue root.ODS \
hdfs://nameservice2/BDP/spark/spark-hbase-bluckload.jar appName="${appName}" sparkSql="${sparkSql}" hbaseTable="${hbaseTable}" family="${family}"


spark-hbase-bluckload.jar is the jar built from the Java project above.
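
After the load completes, a quick spot check with the HBase client confirms the data is readable; a minimal sketch, with the ZooKeeper quorum and the rowkey as placeholders (use a rowkey that actually exists in the source query):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.util.Bytes;

public class BulkLoadSpotCheck {
    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        conf.set("hbase.zookeeper.quorum", "xxxxxxxxxxx"); // placeholder
        try (Connection connection = ConnectionFactory.createConnection(conf);
             Table table = connection.getTable(TableName.valueOf("dim:dim_goods_category_manual_all2"))) {
            // "someRowkey" is a placeholder; pick a rowkey returned by the Spark SQL query
            Result result = table.get(new Get(Bytes.toBytes("someRowkey")));
            for (Cell cell : result.rawCells()) {
                System.out.println(Bytes.toString(CellUtil.cloneQualifier(cell)) + " = "
                        + Bytes.toString(CellUtil.cloneValue(cell)));
            }
        }
    }
}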
