BulkLoad: bulk-importing data into HBase
Reference:
http://www.cnblogs.com/alexwu59/p/6635437.html
BulkLoad converts files on HDFS into HFiles and then loads them into HBase's table directory.
Advantages: fast bulk import into HBase; the process is simple and easy to control.
Disadvantages: it requires writing dedicated MapReduce code and packaging it as a jar, and the input data must be in a specific format that the code parses. In addition, a single run of this tool effectively behaves as VERSIONS=1: for a given cell only the last value is kept and the rest are lost, so one import cannot load multiple versions of the same cell. Running the import multiple times is unaffected, and multiple versions are then retained.
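If the target table is expected to accumulate multiple versions across repeated imports, it has to be created with VERSIONS greater than 1 beforehand. A minimal hbase shell sketch, assuming a placeholder table my_table and column family cf:
create 'my_table', {NAME => 'cf', VERSIONS => 3}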
Steps:
1) Package the code as hbase-bulkload-common-1.0.jar.
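If the project is a standard Maven project (an assumption; adjust to the actual build tool), the jar can be built with:
$mvn clean package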
2) Prepare the data. Each input line should have the format:
rowkey1 columnFamily:columnName1 cell1
rowkey2 columnFamily:columnName2 cell2
rowkey3 columnFamily:columnName3 cell3
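For example, assuming a column family cf and qualifiers name and age (all values here are hypothetical), an input file could contain:
user001 cf:name zhangsan
user001 cf:age 25
user002 cf:name lisi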
3) Run the following command to convert the input data into HFiles:
$HADOOP_CLASSPATH=`/opt/cloudera/parcels/CDH/bin/hbase classpath` hadoop jar hbase-bulkload-common-1.0.jar -D mapreduce.reduce.memory.mb=4096 ${inputPath}/data ${outputPath}/data_out ${HbaseTableName}
If the data volume is very large, or the data is unevenly distributed across row keys, individual reducers may run very slowly and need more memory.
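For example, to give each reducer 8 GB instead (the value is only illustrative and should be sized to the cluster):
$HADOOP_CLASSPATH=`/opt/cloudera/parcels/CDH/bin/hbase classpath` hadoop jar hbase-bulkload-common-1.0.jar -D mapreduce.reduce.memory.mb=8192 ${inputPath}/data ${outputPath}/data_out ${HbaseTableName}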
4) Change the permissions on the HFile output directory to avoid permission errors during the load:
$sudo -u hdfs hdfs dfs -chmod 777 ${outputPath}/data_out
5) Load the HFiles into HBase:
$HADOOP_CLASSPATH=`/opt/cloudera/parcels/CDH/bin/hbase classpath` hadoop jar hbase-server-*.jar completebulkload ${outputPath}/data_out ${HbaseTableName}
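After the load finishes, the result can be spot-checked in the hbase shell (the table name is a placeholder):
count 'my_table'
scan 'my_table', {LIMIT => 5}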
MapReduce job parameters can be set on the command line with -D, for example:
hadoop jar app.jar -D mapreduce.job.queuename=root.etl.distcp -D mapreduce.job.priority=HIGH
hadoop jar <jarName> -D mapreduce.map.memory.mb=5120
hadoop jar <jarName> -D mapreduce.reduce.memory.mb=4096
The code depends on the Hadoop client jars and the HBase client jars.
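A minimal dependency set, assuming a Maven build (the versions are not listed here and should match the cluster's Hadoop/HBase releases):
org.apache.hadoop:hadoop-client
org.apache.hbase:hbase-client
org.apache.hbase:hbase-server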
The source code is as follows:
package hbase.service;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
//import org.apache.hadoop.fs.FsShell;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.HFileOutputFormat2;
//import org.apache.hadoop.hbase.mapreduce.LoadIncrementalHFiles;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
//import java.net.URI;
public class BulkLoadJob extends Configured implements Tool{
static Logger logger = LoggerFactory.getLogger(BulkLoadJob.class);
public static class BulkLoadMap extends
Mapper<LongWritable, Text, ImmutableBytesWritable, Put> {
public void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
String[] valueStrSplit = value.toString().split("\\s+");
/**
* First validate the format of the input line:
* it must split on whitespace into exactly three parts,
* and the middle part must contain both a column family and a qualifier (separated by ':').
*/
if(valueStrSplit.length == 3 && valueStrSplit[1].split(":").length >= 2 ) {
String hkey = valueStrSplit[0];
String family = valueStrSplit[1].split(":",2)[0];
String column = valueStrSplit[1].split(":",2)[1];
String hvalue = valueStrSplit[2];
final byte[] rowKey = Bytes.toBytes(hkey);
final ImmutableBytesWritable HKey = new ImmutableBytesWritable(rowKey);
Put put = new Put(rowKey);
byte[] cell = Bytes.toBytes(hvalue);
put.addColumn(Bytes.toBytes(family), Bytes.toBytes(column), cell);
context.write(HKey, put);
}
}
}
public static void main(String[] args) throws Exception {
int exitCode = ToolRunner.run(new Configuration(), new BulkLoadJob(), args);
System.exit(exitCode);
}
public int run(String[] arg0) throws Exception {
if(arg0.length != 3) {
logger.error("parameters error,requested parameter is 3,but input is " + arg0.length + ";" );
logger.error("paramer list:inputPath, outputPath, hbaseTableName" );
return 1;
}
String inputPath = arg0[0];
String outputPath = arg0[1];
String hbaseTableName = arg0[2];
logger.info("------------------------------------------");
logger.info("inputPath:" + inputPath);
logger.info("outputPath:" + outputPath);
logger.info("hbaseTableName:" + hbaseTableName);
logger.info("------------------------------------------");
Configuration conf;
Connection connection = null;
Admin admin = null;
Table table = null;
try {
conf = HBaseConfiguration.create(getConf());
Job job = Job.getInstance(conf, "Bulk-HBaseLoad");
connection = ConnectionFactory.createConnection(conf);
admin = connection.getAdmin();
table = connection.getTable(TableName.valueOf(hbaseTableName));
job.setJarByClass(BulkLoadJob.class);
job.setMapperClass(BulkLoadJob.BulkLoadMap.class);
job.setMapOutputKeyClass(ImmutableBytesWritable.class);
job.setMapOutputValueClass(Put.class);
// speculation
job.setSpeculativeExecution(false);
job.setReduceSpeculativeExecution(false);
// in/out format
job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(HFileOutputFormat2.class);
FileInputFormat.setInputPaths(job, inputPath);
FileOutputFormat.setOutputPath(job, new Path(outputPath));
// configureIncrementalLoad sets the reducer count, sort order and TotalOrderPartitioner to match the table's current region boundaries
HFileOutputFormat2.configureIncrementalLoad(job,table,connection.getRegionLocator(TableName.valueOf(hbaseTableName)));
/**
* Change the permissions of the HDFS output directory to 777;
* the commented-out block below would chmod the output and then
* load the HFiles into HBase automatically.
*/
if (job.waitForCompletion(true)) {
/* FsShell shell = new FsShell(conf);
try {
shell.run(new String[]{"hdfs","dfs","-chmod", "-R", "777", outputPath});
} catch (Exception e) {
logger.error("Couldnt change the file permissions ", e);
throw new IOException(e);
}
//载入到hbase表
LoadIncrementalHFiles loader = new LoadIncrementalHFiles(conf);
loader.doBulkLoad(new Path(outputPath),
admin,table,connection.getRegionLocator(TableName.valueOf(hbaseTableName)));
*/
} else {
logger.error("loading failed.");
System.exit(1);
}
} catch (IllegalArgumentException e) {
e.printStackTrace();
} finally {
if (table != null) {
table.close();
}
if (admin != null) {
admin.close();
}
if (connection != null) {
connection.close();
}
}
return 0;
}
}