Bulkload Hive表到HBase

最新推荐文章于 2022-12-27 15:36:58 发布

GatsbyNewton

最新推荐文章于 2022-12-27 15:36:58 发布

阅读量4.9k

点赞数 2

分类专栏： Hive HBase 文章标签： Hive HBase Bulkload

本文链接：https://blog.csdn.net/u010376788/article/details/51055352

版权

Hive 同时被 2 个专栏收录

10 篇文章 1 订阅

订阅专栏

HBase

4 篇文章 0 订阅

订阅专栏

1.描述

HBase可以随机读写海量的数据，但是如何把这些海量数据导入到HBase却是一个挑战。例如，将Hive表尽可能快地导入到HBase中。这里有以下三种解决方案：

使用API把数据一条一条地写入HBase。
用HBaseIntegration方法。
使用HBase自带的Bulkload功能。

但是，第一种方法明显是最低效的；第二种方法我之前已经提到过，同样比较慢；那么，第三种方法呢？毋庸置疑，Bulkload方法是最高效的解决方案。因为Bulkload方法可以让你绕过HBase正常的写入路径（WAL和MemStore），先直接生成HFile（HBase Data Files），再把已经生成的HFile加载到HBase中。如果你的Hive和HBase部署在同一个集群上，Bulkload的处理过程包括两个步骤：

用MapReduce生成HFile。
使用LoadIncrementalHFiles.doBulkLoad将HFile导入到HBase。

public class Driver extends Configured implements Tool{

    private static Configuration conf = new Configuration();
    private static Configuration hconf = null;
    private static HBaseAdmin hadmin = null;

    public static void connectHBase(){
        final String HBASE_CONFIG_ZOOKEEPER_CLIENT = "hbase.zookeeper.property.clientPort";
        final String HBASE_ZOOKEEPER_CLIENT_PORT = "2181";
        final String HBASE_CONFIG_ZOOKEEPER_QUORUM = "hbase.zookeeper.quorum";
        final String HBASE_ZOOKEEPER_SERVER = "hbase38,hbase43,hbase00";

        conf.set(HBASE_CONFIG_ZOOKEEPER_CLIENT, HBASE_ZOOKEEPER_CLIENT_PORT);
        conf.set(HBASE_CONFIG_ZOOKEEPER_QUORUM, HBASE_ZOOKEEPER_SERVER);
        hconf = HBaseConfiguration.create(conf);
        try{
            hadmin = new HBaseAdmin(hconf);
        }
        catch (Exception e){
            e.printStackTrace();
        }
    }


    public static void main(String[] args)throws Exception{
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if(otherArgs.length != 4){
            System.err.println("Usage: <rcfile> <hfile> <schemafile> <hbasetable>");
            System.exit(1);
        }

        String path = System.getProperty("user.dir") + otherArgs[2];
        List<String> fieldNames = HiveTableUtils.getFieldName(path);
        StringBuilder sb = new StringBuilder(fieldNames.get(0));
        int size = fieldNames.size();
        for(int i = 1; i < size; i++){
            sb.append(":").append(fieldNames.get(i));
        }

        conf.set("schema", sb.toString());
		
	if(ToolRunner.run(conf, new Driver(), otherArgs) == 0){
		// Importing the generated HFiles into a HBase table
		LoadIncrementalHFiles loader = new LoadIncrementalHFiles(conf);
		loader.doBulkLoad(new Path(otherArgs[1], otherArgs[3]);
		System.exit(0);
	}
	else{
		System.exit(1);
	}
    }

    @SuppressWarnings("deprecation")
    @Override
    public int run(String[] strings) throws Exception {

        Configuration config = getConf();
        Driver.connectHBase();

        Job job = new Job(config, "RCFile to HFile");
        job.setJarByClass(Driver.class);
        job.setMapperClass(RCFileToHFile.ParseMapper.class);
        job.setMapOutputKeyClass(ImmutableBytesWritable.class);
        job.setMapOutputValueClass(KeyValue.class);

        //Reduce's number is 0.
        job.setNumReduceTasks(0);

        job.setPartitionerClass(SimpleTotalOrderPartitioner.class);

        job.setInputFormatClass(RCFileMapReduceInputFormat.class);
//		job.setOutputFormatClass(HFileOutputFormat.class);

        HTable table = new HTable(config, strings[3]);
        HFileOutputFormat.configureIncrementalLoad(job, table);

        RCFileMapReduceInputFormat.addInputPath(job, new Path(strings[0]));
        FileOutputFormat.setOutputPath(job, new Path(strings[1]));

        return job.waitForCompletion(true) ? 0 : 1;
    }
}

然而，如果像我司那样——Hive和HBase部署在不同的集群上时，Bulkload的处理过程就需要三个步骤：

用MapReduce生成HFile。
将已生成的HFile从Hive集群拷贝到HBase集群。
用HBase的命令将HFile导入到HBase中（你也可以像上述代码那样用Java代码导入，但这样相对复杂）。

我们先看第一步的代码：

/**
 * MapReduce driver that converts a Hive RCFile table into HFiles. This variant only
 * generates the HFiles; copying them to the HBase cluster and loading them are
 * separate steps.
 *
 * <p>Usage: {@code <rcfile> <hfile> <schemafile> <hbasetable>}
 * <ul>
 *   <li>{@code rcfile} - input path of the RCFile data</li>
 *   <li>{@code hfile} - output directory for the generated HFiles</li>
 *   <li>{@code schemafile} - schema file, resolved against the working directory</li>
 *   <li>{@code hbasetable} - name of the HBase table the job is configured against</li>
 * </ul>
 */
public class Driver extends Configured implements Tool{

    private static Configuration conf = new Configuration();
    private static Configuration hconf = null;

    /**
     * Adds the ZooKeeper client port and quorum to the shared configuration and
     * builds {@code hconf}, the HBase configuration derived from it.
     *
     * <p>No connection is opened here: the {@code HBaseAdmin} that used to be created
     * was never used and never closed, and its failure was swallowed. Connection
     * problems now surface when the {@code HTable} is opened.
     */
    public static void connectHBase(){
        final String HBASE_CONFIG_ZOOKEEPER_CLIENT = "hbase.zookeeper.property.clientPort";
        final String HBASE_ZOOKEEPER_CLIENT_PORT = "2181";
        final String HBASE_CONFIG_ZOOKEEPER_QUORUM = "hbase.zookeeper.quorum";
        final String HBASE_ZOOKEEPER_SERVER = "hbase38,hbase43,hbase00";

        conf.set(HBASE_CONFIG_ZOOKEEPER_CLIENT, HBASE_ZOOKEEPER_CLIENT_PORT);
        conf.set(HBASE_CONFIG_ZOOKEEPER_QUORUM, HBASE_ZOOKEEPER_SERVER);
        hconf = HBaseConfiguration.create(conf);
    }


    /**
     * Parses the command line, publishes the Hive column names to the job as the
     * colon-separated {@code schema} property and runs the HFile-generating job.
     * Exits with the job's status code.
     *
     * @param args generic Hadoop options followed by the four positional arguments
     * @throws Exception if the job fails to run
     */
    public static void main(String[] args)throws Exception{
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if(otherArgs.length != 4){
            System.err.println("Usage: <rcfile> <hfile> <schemafile> <hbasetable>");
            System.exit(1);
        }

        // Join with a real path separator; plain string concatenation only worked
        // when the argument happened to start with '/'.
        String path = new java.io.File(System.getProperty("user.dir"), otherArgs[2]).getPath();
        List<String> fieldNames = HiveTableUtils.getFieldName(path);
        if(fieldNames.isEmpty()){
            System.err.println("No column names found in schema file: " + path);
            System.exit(1);
        }

        StringBuilder sb = new StringBuilder(fieldNames.get(0));
        int size = fieldNames.size();
        for(int i = 1; i < size; i++){
            sb.append(":").append(fieldNames.get(i));
        }

        conf.set("schema", sb.toString());

        System.exit(ToolRunner.run(conf, new Driver(), otherArgs));
    }

    /**
     * Configures and runs the "RCFile to HFile" job.
     *
     * @param strings positional arguments: input path, HFile output directory,
     *                schema file (unused here) and HBase table name
     * @return 0 when the job completes successfully, 1 otherwise
     * @throws Exception if the job cannot be configured or submitted
     */
    @SuppressWarnings("deprecation")
    @Override
    public int run(String[] strings) throws Exception {

        Configuration config = getConf();
        Driver.connectHBase();

        Job job = new Job(config, "RCFile to HFile");
        job.setJarByClass(Driver.class);
        job.setMapperClass(RCFileToHFile.ParseMapper.class);
        job.setMapOutputKeyClass(ImmutableBytesWritable.class);
        job.setMapOutputValueClass(KeyValue.class);
        job.setInputFormatClass(RCFileMapReduceInputFormat.class);

        RCFileMapReduceInputFormat.addInputPath(job, new Path(strings[0]));
        FileOutputFormat.setOutputPath(job, new Path(strings[1]));

        // configureIncrementalLoad installs the partitioner, reducer, reducer count
        // and output format itself, so none of them is set by hand here.
        HTable table = new HTable(hconf, strings[3]);
        try{
            HFileOutputFormat.configureIncrementalLoad(job, table);
            return job.waitForCompletion(true) ? 0 : 1;
        }
        finally{
            table.close();
        }
    }
}

第二步，拷贝命令 distcp：

# Copy the generated HFiles from HDFS on mycluster-hive to HDFS on mycluster-hbase
# using Hadoop's distributed copy (distcp).
hadoop distcp hdfs://mycluster-hive/hfile/hbase hdfs://mycluster-hbase/hbase/test

第三步，Bulkload：

# Bulk-load the HFiles under /hbase/test into the HBase table hbase_table
# (run on mycluster-hbase).
hbase org.apache.hadoop.hbase.mapreduce.LoadIncrementalHFiles /hbase/test hbase_table

2.生成HFile

MapReduce的作用是生成HFile。由于Hive采用的是RCFile的存储结构，所以我的MapReduce的输入数据是RCFile文件，你可以查看 MapReduce读/写RCFile文件了解更多细节。值得一提的是，在生成HFile之前，你需要得到Hive表的schema，即字段名称。你可以通过以下方式得到Hive表的字段名。

解析存储Hive表元数据的文件。
获取Hive表的元数据，
1. 访问MySQL
2. 用HCatalog访问Hive表

但是，我个人认为解析文件的方式更为高效，因为我们的Hive表一般有数千列。

/**
 * Helpers for reading Hive table metadata from a schema file.
 */
public class HiveTableUtils {

    /**
     * Extracts column names from a text file by parsing it line by line.
     *
     * <p>A line contributes a column name when it contains the word {@code COMMENT}
     * and at least two back-quotes; the text between the first and the last
     * back-quote on that line is taken as the name. Lines that do not match are
     * skipped, including lines with a single back-quote, which previously aborted
     * parsing of every following line.
     *
     * <p>If the file does not exist, a message is printed to stderr and the JVM exits
     * with status 1. If reading fails part-way, the stack trace is printed and the
     * names collected so far are returned, so this method never throws to its caller.
     *
     * @param filePath path of the schema file to parse
     * @return the column names in file order; possibly empty, never {@code null}
     */
    public static List<String>  getFieldName(String filePath){
        File file = new File(filePath);
        List<String> fieldName = new ArrayList<String>();

        if (!file.exists()) {
            System.err.println("The file doesn't exist!");
            System.exit(1);
        }

        BufferedReader reader = null;
        try {
            reader = new BufferedReader(new FileReader(file));
            String tmp = null;
            while ((tmp = reader.readLine()) != null) {
                if (!tmp.contains("COMMENT")) {
                    continue;
                }
                int start = tmp.indexOf("`");
                int end = tmp.lastIndexOf("`");
                // Require an opening and a distinct closing back-quote.
                if (start >= 0 && end > start) {
                    fieldName.add(tmp.substring(start + 1, end));
                }
            }
        }
        catch (Exception e) {
            // Best-effort by contract: report the problem and return what was read.
            e.printStackTrace();
        }
        finally {
            // Close on every path, including when reading failed.
            if (reader != null) {
                try {
                    reader.close();
                }
                catch (Exception ignored) {
                    // Nothing useful can be done if closing a read-only stream fails.
                }
            }
        }

        return fieldName;
    }
}

至于Mapper的输出类型，就是 ImmutableBytesWritable, KeyValue。随后的Partitioner和Reducer会使用这些类来创建HFile。当然，你不需要实现自己的Reducer，代码中的 HFileOutputFormat.configureIncrementalLoad()会自动为你设置合理的Partitioner和Reducer。

/**
 * Holds the mapper that turns rows of a Hive RCFile into HBase {@code KeyValue}s.
 */
public class RCFileToHFile {
	
	/**
	 * Emits one {@code KeyValue} per non-key column of each input row.
	 *
	 * <p>Column 0 of the row is used as the HBase row key. Column {@code i} (for
	 * {@code i >= 1}) is written to column family {@code cf} under the qualifier
	 * named by entry {@code i} of the colon-separated {@code schema} job property.
	 */
	public static class ParseMapper extends Mapper<LongWritable, BytesRefArrayWritable, ImmutableBytesWritable, KeyValue>{
		// Text decodes its bytes as UTF-8, so encode with the same charset instead of
		// whatever the platform default happens to be.
		private static final java.nio.charset.Charset UTF_8 = java.nio.charset.Charset.forName("UTF-8");
		private static final String COLUMN_FAMILY = "cf";

		private String[] fieldName = null;
		// Encoded once in setup() rather than once per cell.
		private byte[] family = null;
		private byte[][] qualifiers = null;

		/**
		 * Reads the column names from the {@code schema} job property and
		 * pre-encodes the column family and qualifiers.
		 *
		 * @throws IOException if the {@code schema} property is missing or empty
		 */
		@Override
		protected void setup(Context context)
				throws IOException, InterruptedException {
			super.setup(context);
			Configuration conf = context.getConfiguration();

			String schema = conf.get("schema");
			if(schema == null || schema.length() == 0){
				throw new IOException("Job configuration is missing the \"schema\" property");
			}
			fieldName = schema.split(":");

			family = COLUMN_FAMILY.getBytes(UTF_8);
			qualifiers = new byte[fieldName.length][];
			for(int i = 0; i < fieldName.length; i++){
				qualifiers[i] = fieldName[i].getBytes(UTF_8);
			}
		}
		
		/**
		 * Converts one RCFile row into {@code KeyValue}s keyed by the row's first column.
		 *
		 * <p>A row with no columns is skipped because it has no row key. A row with
		 * fewer columns than the schema has only its available columns written;
		 * both cases are counted under the {@code RCFileToHFile} counter group.
		 */
		@Override
		protected void map(LongWritable key, BytesRefArrayWritable values,
				Context context)
				throws IOException, InterruptedException {

			int size = values.size();
			if(size == 0){
				context.getCounter("RCFileToHFile", "EMPTY_ROWS").increment(1);
				return;
			}

			Text line = new Text();
			List<String> fields = new ArrayList<String>(size);
			for(int i = 0; i < size; i++){
				BytesRefWritable value = values.get(i);
				line.set(value.getData(), value.getStart(), value.getLength());
				fields.add(line.toString());
			}
			
			byte[] rowKey = fields.get(0).getBytes(UTF_8);
			ImmutableBytesWritable hKey = new ImmutableBytesWritable();
			hKey.set(rowKey);

			// Never index past the columns actually present in this row.
			int length = Math.min(qualifiers.length, size);
			if(length < qualifiers.length){
				context.getCounter("RCFileToHFile", "SHORT_ROWS").increment(1);
			}
			for(int i = 1; i < length; i++){
				KeyValue kv = new KeyValue(rowKey, family, qualifiers[i], fields.get(i).getBytes(UTF_8));
				context.write(hKey, kv);
			}
			
		}
	}

}

注意：

生成HFile时，连接的HBase表必须存在。
连接的HBase表可以与最后Bulkload的目标HBase表不同名，但是表的结构必须相同（如，Column Family的个数）。
注意各组件（Hadoop、Hive、HBase）版本之间的匹配。

完整的代码托管在GitHub中： https://github.com/GatsbyNewton/hive-bulkload-hbase

GatsbyNewton

关注

2
点赞
踩
9

收藏

觉得还不错? 一键收藏
1
评论
Bulkload Hive表到HBase

1.描述HBase可以随机读写海量的数据，但是如果把这海量数据导入到HBase却是一个挑战。如，将Hive表尽可能快的导入到HBase中。这里有以下三种解决方案：使用API把数据一条一条地写入HBase。用HBaseIntegration方法。使用HBase自带的Bulkload功能。但是，第一种方法明显是最低效的；第二种方法我之前已经提到过，同样比较慢；那么，第三种方
复制链接

扫一扫