1. Why use BulkLoad? What is wrong with writing to HBase through TableOutputFormat?
Let us first recall the normal HBase write path.
When MapReduce writes to HBase it usually goes through TableOutputFormat: the reducer builds Put objects and writes them straight into HBase. Under heavy write volume this is inefficient, because HBase starts blocking writes and performs frequent flushes, splits and compactions, all of which generate a lot of I/O. It also hurts the stability of the HBase nodes: long GC pauses and slow responses can make a node time out and drop from the cluster, triggering a chain of follow-on failures. HBase also supports a bulk load path. Since HBase ultimately stores its data on HDFS in a fixed file format, you can generate the persistent HFile files directly on HDFS and then move them into place, which is how huge volumes of data can be imported quickly. Driven by MapReduce this is efficient and convenient, does not consume region resources or add write load, and therefore greatly improves write throughput while relieving write pressure on the HBase nodes.
Replacing the direct TableOutputFormat write with "generate HFiles first, then BulkLoad them into HBase" brings the following benefits:
(1) It removes the insert pressure on the HBase cluster.
(2) It speeds up the job and shortens its execution time.
At the time of writing this approach only handles a single column family per job; newer HBase releases remove the single-column-family restriction.
2. The bulkload workflow in practice
The bulkload approach needs two jobs working together:
(1) The first job runs the original business logic, but instead of writing its results to HBase through TableOutputFormat it writes them to an intermediate directory on HDFS (e.g. middata).
(2) The second job takes the first job's output (middata) as input and converts it into HBase's underlying storage format, HFile.
(3) BulkLoad is then invoked to import the HFiles produced by the second job into the target HBase table.
The corresponding sample code is given below:
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.HFileOutputFormat;
import org.apache.hadoop.hbase.mapreduce.KeyValueSortReducer;
import org.apache.hadoop.hbase.mapreduce.LoadIncrementalHFiles;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class GeneratePutHFileAndBulkLoadToHBase {

    // Job 1: a plain word-count mapper.
    public static class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

        private Text wordText = new Text();
        private IntWritable one = new IntWritable(1);

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String line = value.toString();
            String[] wordArray = line.split(" ");
            for (String word : wordArray) {
                wordText.set(word);
                context.write(wordText, one);
            }
        }
    }

    // Job 1: sums the counts for each word.
    public static class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

        private IntWritable result = new IntWritable();

        @Override
        protected void reduce(Text key, Iterable<IntWritable> valueList, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable value : valueList) {
                sum += value.get();
            }
            result.set(sum);
            context.write(key, result);
        }
    }

    // Job 2: parses each "word\tcount" line of Job 1's output and emits a Put.
    public static class ConvertWordCountOutToHFileMapper
            extends Mapper<LongWritable, Text, ImmutableBytesWritable, Put> {

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String wordCountStr = value.toString();
            String[] wordCountArray = wordCountStr.split("\t");
            String word = wordCountArray[0];
            int count = Integer.valueOf(wordCountArray[1]);

            byte[] rowKey = Bytes.toBytes(word);
            ImmutableBytesWritable rowKeyWritable = new ImmutableBytesWritable(rowKey);
            byte[] family = Bytes.toBytes("cf");
            byte[] qualifier = Bytes.toBytes("count");
            byte[] hbaseValue = Bytes.toBytes(count);
            // A Put can carry several columns of a column family for one row;
            // with a single column a KeyValue would work as well:
            // KeyValue keyValue = new KeyValue(rowKey, family, qualifier, hbaseValue);
            Put put = new Put(rowKey);
            put.add(family, qualifier, hbaseValue);
            context.write(rowKeyWritable, put);
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration hadoopConfiguration = new Configuration();
        String[] dfsArgs = new GenericOptionsParser(hadoopConfiguration, args).getRemainingArgs();

        // Job 1 is an ordinary MapReduce job that writes to a plain HDFS directory.
        Job job = new Job(hadoopConfiguration, "wordCountJob");
        job.setJarByClass(GeneratePutHFileAndBulkLoadToHBase.class);
        job.setMapperClass(WordCountMapper.class);
        job.setReducerClass(WordCountReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.setInputPaths(job, new Path(dfsArgs[0]));
        FileOutputFormat.setOutputPath(job, new Path(dfsArgs[1]));
        int wordCountJobResult = job.waitForCompletion(true) ? 0 : 1;

        // Job 2 takes Job 1's output as input. Only a mapper is needed: it parses that
        // output and converts it into the <ImmutableBytesWritable, Put> pairs that
        // HFileOutputFormat expects.
        Job convertWordCountJobOutputToHFileJob = new Job(hadoopConfiguration, "wordCount_bulkload");
        convertWordCountJobOutputToHFileJob.setJarByClass(GeneratePutHFileAndBulkLoadToHBase.class);
        convertWordCountJobOutputToHFileJob.setMapperClass(ConvertWordCountOutToHFileMapper.class);
        // No reducer needs to be set: configureIncrementalLoad chooses
        // KeyValueSortReducer or PutSortReducer based on the map output value class.
        // convertWordCountJobOutputToHFileJob.setReducerClass(KeyValueSortReducer.class);
        convertWordCountJobOutputToHFileJob.setMapOutputKeyClass(ImmutableBytesWritable.class);
        convertWordCountJobOutputToHFileJob.setMapOutputValueClass(Put.class);

        // Job 1's output directory is Job 2's input; Job 2's output directory holds the HFiles.
        FileInputFormat.addInputPath(convertWordCountJobOutputToHFileJob, new Path(dfsArgs[1]));
        FileOutputFormat.setOutputPath(convertWordCountJobOutputToHFileJob, new Path(dfsArgs[2]));

        Configuration hbaseConfiguration = HBaseConfiguration.create();
        HTable wordCountTable = new HTable(hbaseConfiguration, "word_count");
        HFileOutputFormat.configureIncrementalLoad(convertWordCountJobOutputToHFileJob, wordCountTable);
        int convertWordCountJobOutputToHFileJobResult =
                convertWordCountJobOutputToHFileJob.waitForCompletion(true) ? 0 : 1;

        // After Job 2 finishes, bulk-load the generated HFiles into the table.
        LoadIncrementalHFiles loader = new LoadIncrementalHFiles(hbaseConfiguration);
        // First argument: Job 2's output directory (where the HFiles are);
        // second argument: the target table.
        loader.doBulkLoad(new Path(dfsArgs[2]), wordCountTable);

        // Finally exit with the second job's status.
        System.exit(convertWordCountJobOutputToHFileJobResult);
    }
}
For example, suppose the raw input data lives in /rawdata/test/wordcount/20131212,
the intermediate results are written to /middata/test/wordcount/20131212,
and the generated HFiles are stored under /resultdata/test/wordcount/20131212.
The jobs above are then run as:
hadoop jar test.jar /rawdata/test/wordcount/20131212 /middata/test/wordcount/20131212 /resultdata/test/wordcount/20131212
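Because the number of reduce tasks in the HFile job and the cost of the final load both depend on how many regions the target table has (see notes (1) and (6) below), it usually pays to create the target table pre-split before running the jobs. Below is a minimal sketch against the old HBaseAdmin API; the table name and column family match the example above, while the split keys are purely illustrative assumptions and should be derived from the real row-key distribution:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.client.HBaseAdmin;
import org.apache.hadoop.hbase.util.Bytes;

public class CreatePreSplitTable {
    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        HBaseAdmin admin = new HBaseAdmin(conf);
        // Table and family names match the word-count example above; the split
        // keys are illustrative only and should follow the actual key distribution.
        HTableDescriptor desc = new HTableDescriptor("word_count");
        desc.addFamily(new HColumnDescriptor("cf"));
        byte[][] splitKeys = new byte[][] {
                Bytes.toBytes("g"), Bytes.toBytes("n"), Bytes.toBytes("t")
        };
        if (!admin.tableExists("word_count")) {
            admin.createTable(desc, splitKeys);
        }
        admin.close();
    }
}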
3. Notes and caveats
(1) The HFile approach is the fastest of all the load methods, but it assumes the data is being imported for the first time and the table is empty. If the table already contains data, importing more HFiles into it can trigger split operations.
(2) Whichever stage produces the final output (map or reduce), its output key and value types must be <ImmutableBytesWritable, KeyValue> or <ImmutableBytesWritable, Put>.
Otherwise you will see an error like:
java.lang.IllegalArgumentException: Can't read partitions file
...
Caused by: java.io.IOException: wrong key class: org.apache.hadoop.io.*** is not class org.apache.hadoop.hbase.io.ImmutableBytesWritable
(3) When the final output value type is KeyValue or Put, the matching sort reducer is KeyValueSortReducer or PutSortReducer respectively. You do not need to set this reducer yourself, because the HBase source already chooses it inside configureIncrementalLoad:
if (KeyValue.class.equals(job.getMapOutputValueClass())) {
    job.setReducerClass(KeyValueSortReducer.class);
} else if (Put.class.equals(job.getMapOutputValueClass())) {
    job.setReducerClass(PutSortReducer.class);
} else {
    LOG.warn("Unknown map output value type:" + job.getMapOutputValueClass());
}
(4) The MR example uses job.setOutputFormatClass(HFileOutputFormat.class); HFileOutputFormat can only build HFiles for a single column family per job, so multiple column families require multiple jobs. Newer HBase releases have lifted this limitation.
(5) The HFiles produced by the MR example are stored on HDFS, with one subdirectory per column family under the output path. Bulk-loading them into HBase essentially moves the HFiles into the table's regions, after which the column-family subdirectories under the output path are left empty.
(6) The last job does not call setNumReduceTasks because configureIncrementalLoad sets the number of reduce tasks automatically, based on the number of regions in the target table.
(7) In the configuration below, the commented-out lines are optional: as the source shows, configureIncrementalLoad already applies all the fixed settings, and only the job-specific parts need to be configured by hand. (In the sketch, table names, column families, and the mapper/reducer bodies have been filled in with placeholder logic so that it compiles.)
// Imports are the same as in the previous example, plus
// org.apache.hadoop.hbase.client.Scan, org.apache.hadoop.hbase.client.Result,
// org.apache.hadoop.hbase.mapreduce.TableMapper and TableMapReduceUtil.
public class HFileOutput {

    // Table names, families and qualifier are placeholders for this sketch.
    private static final String inputTable = "input_table";
    private static final String outputTable = "output_table";
    private static final byte[] INPUT_FAMILY = Bytes.toBytes("cf");
    private static final byte[] OUTPUT_FAMILY = Bytes.toBytes("cf");
    private static final byte[] QUALIFIER = Bytes.toBytes("count");

    public static Job configureJob(Configuration conf, String outputPath) throws IOException {
        Job job = new Job(conf, "countUnite1");
        job.setJarByClass(HFileOutput.class);
        // The commented-out settings are already applied by configureIncrementalLoad:
        // job.setNumReduceTasks(2);
        // job.setOutputKeyClass(ImmutableBytesWritable.class);
        // job.setOutputValueClass(KeyValue.class);
        // job.setOutputFormatClass(HFileOutputFormat.class);

        Scan scan = new Scan();
        scan.addFamily(INPUT_FAMILY);
        TableMapReduceUtil.initTableMapperJob(inputTable, scan,
                HFileOutputMapper.class, ImmutableBytesWritable.class, LongWritable.class, job);
        // If no reducer is set here, configureIncrementalLoad picks
        // KeyValueSortReducer or PutSortReducer automatically.
        job.setReducerClass(HFileOutputReducer.class);
        // job.setOutputFormatClass(HFileOutputFormat.class);
        HFileOutputFormat.configureIncrementalLoad(job, new HTable(conf, outputTable));
        HFileOutputFormat.setOutputPath(job, new Path(outputPath));
        // FileOutputFormat.setOutputPath(job, new Path(outputPath)); // equivalent to the line above
        return job;
    }

    public static class HFileOutputMapper extends
            TableMapper<ImmutableBytesWritable, LongWritable> {
        public void map(ImmutableBytesWritable key, Result values,
                Context context) throws IOException, InterruptedException {
            // Placeholder logic: emit the row key with a count of 1.
            context.write(new ImmutableBytesWritable(values.getRow()), new LongWritable(1));
        }
    }

    public static class HFileOutputReducer extends
            Reducer<ImmutableBytesWritable, LongWritable, ImmutableBytesWritable, KeyValue> {
        public void reduce(ImmutableBytesWritable key, Iterable<LongWritable> values,
                Context context) throws IOException, InterruptedException {
            // Placeholder logic: sum the counts and emit one KeyValue per row.
            long count = 0;
            for (LongWritable value : values) {
                count += value.get();
            }
            KeyValue kv = new KeyValue(key.get(), OUTPUT_FAMILY, QUALIFIER,
                    Bytes.toBytes(count));
            context.write(key, kv);
        }
    }
}
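Once both jobs and the doBulkLoad call have finished, a quick sanity check is to read one of the counts back from the table. Below is a minimal sketch using the same old client API; the row key "hello" is only an assumed sample word, any word present in the input files would do:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.util.Bytes;

public class VerifyBulkLoad {
    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        HTable table = new HTable(conf, "word_count");
        // "hello" is an assumed sample word from the input data.
        Get get = new Get(Bytes.toBytes("hello"));
        Result result = table.get(get);
        byte[] value = result.getValue(Bytes.toBytes("cf"), Bytes.toBytes("count"));
        if (value != null) {
            // The count was written with Bytes.toBytes(int), so read it back as an int.
            System.out.println("count(hello) = " + Bytes.toInt(value));
        } else {
            System.out.println("row not found");
        }
        table.close();
    }
}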
4. References:
1. A comparison of several ways of loading data into HBase: http://blog.csdn.net/kirayuan/article/details/6371635
2. Generating HFiles with MapReduce and bulk-loading them into HBase, with source-code analysis: http://blog.pureisle.net/archives/1950.html
3. Generating HFiles with MapReduce and bulk-loading them into HBase: http://shitouer.cn/2013/02/hbase-hfile-bulk-load/