以下场景适合应用bulk load
1. 大批量数据导入时,可以使用 bulk load 来减轻 RegionServer 的写入负载
第一步,把 HDFS 上的文件转化成 HFile 文件;
第二步,把 HFile 文件 move 到 HBase 表里。
代码如下:
package com.cloudera.examples.hbase.bulkimport;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.HFileOutputFormat2;
import org.apache.hadoop.hbase.mapreduce.LoadIncrementalHFiles;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
/**
 * Bulk-loads delimited HDFS text data into HBase in two steps:
 * <ol>
 *   <li>A map-only MR job converts each input line into a {@link Put} and writes
 *       HFiles via {@link HFileOutputFormat2}.</li>
 *   <li>{@link LoadIncrementalHFiles} moves the generated HFiles into the target
 *       table, bypassing the RegionServer write path.</li>
 * </ol>
 * NOTE(review): the generated HFiles and the output directory must be
 * readable/owned by the {@code hbase} user, otherwise {@code doBulkLoad} hangs.
 */
public class GeneratePutHFileAndBulkLoadToHBase {

    /** Target HBase table name. */
    private static final String TABLE_NAME = "dd_b_basic_u1_app_user_profile_interest_hbase";

    /** Single column family every cell is written under. */
    private static final byte[] FAMILY = Bytes.toBytes("cf");

    /** Column qualifiers, in the order the fields appear after the row key. */
    private static final String[] QUALIFIERS =
            {"kgn", "kra", "cpcz", "teamcol", "click", "label", "pp"};

    /**
     * Maps one '\001'-delimited line to a {@link Put} keyed on the first field.
     * Expected layout: rowkey\001kgn\001kra\001cpcz\001teamcol\001click\001label\001pp.
     * No reducer is set: the framework chooses KeyValueSortReducer or
     * PutSortReducer from the map output value class.
     */
    public static class ConvertWordCountOutToHFileMapper
            extends Mapper<LongWritable, Text, ImmutableBytesWritable, Put> {

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String[] fields = value.toString().split("\001");
            // Skip malformed lines instead of failing the whole task with an
            // ArrayIndexOutOfBoundsException; surface the count via a counter.
            if (fields.length < QUALIFIERS.length + 1) {
                context.getCounter("bulkload", "malformed_lines").increment(1);
                return;
            }
            byte[] rowKey = Bytes.toBytes(fields[0]);
            Put put = new Put(rowKey);
            for (int i = 0; i < QUALIFIERS.length; i++) {
                put.add(FAMILY, Bytes.toBytes(QUALIFIERS[i]), Bytes.toBytes(fields[i + 1]));
            }
            context.write(new ImmutableBytesWritable(rowKey), put);
        }
    }

    /**
     * Runs the HFile-generation job and then bulk-loads the result.
     *
     * @param arg optional [inputPath, outputPath]; the historical hard-coded
     *            defaults are used when fewer than two paths are supplied, so
     *            existing invocations keep working.
     */
    public static void main(String[] arg) throws Exception {
        // BUG FIX: the original discarded command-line args entirely.
        String[] args = (arg != null && arg.length >= 2)
                ? arg
                : new String[] {"/user/zhoulh/input", "/user/zhoulh/output"};

        System.setProperty("HADOOP_USER_NAME", "hbase");
        Configuration conf = new Configuration();
        conf.set("hbase.zookeeper.quorum", "n1,n2,n3,n4");
        conf.set("hbase.zookeeper.property.clientPort", "2181");
        conf.set("fs.defaultFS", "hdfs://n1:8020");

        String[] dfsArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        Path outputPath = new Path(dfsArgs[1]);

        FileSystem fs = FileSystem.get(conf);
        // BUG FIX: deleteOnExit() only removes the path when the FS is closed,
        // so the job still failed on a pre-existing output dir. Delete it now.
        if (fs.exists(outputPath)) {
            fs.delete(outputPath, true);
        }

        Job job = new Job(conf, "app_user_profile_interest_bulkload");
        job.setJarByClass(GeneratePutHFileAndBulkLoadToHBase.class);
        job.setMapperClass(ConvertWordCountOutToHFileMapper.class);
        job.setMapOutputKeyClass(ImmutableBytesWritable.class);
        job.setMapOutputValueClass(Put.class);
        FileInputFormat.addInputPath(job, new Path(dfsArgs[0]));
        FileOutputFormat.setOutputPath(job, outputPath);

        HTable table = new HTable(conf, TABLE_NAME);
        try {
            // Wires up total-order partitioning/compression so the HFiles line
            // up with the table's current region boundaries.
            HFileOutputFormat2.configureIncrementalLoad(job, table);

            // BUG FIX: the original ignored the job result and bulk-loaded
            // anyway; abort instead of loading a partial/empty directory.
            if (!job.waitForCompletion(true)) {
                System.err.println("HFile generation job failed; skipping bulk load");
                System.exit(1);
            }

            System.out.println("outpath:" + dfsArgs[1]);
            LoadIncrementalHFiles loader = new LoadIncrementalHFiles(conf);
            loader.doBulkLoad(outputPath, table);
            System.out.println("done");
        } finally {
            // BUG FIX: the table handle was never closed.
            table.close();
        }
    }
}
如果程序在 loader.doBulkLoad(new Path(dfsArgs[1]), app_user_profile_interest_hbase); 这一步卡住,
可以查看 RegionServer 的日志;常见原因是 hbase 用户没有操作 output 目录的权限,修改该目录的权限即可解决。
另外,生成 output 目录下文件的用户也需要是 hbase 用户。