Use a MapReduce job to read a plain-text file, convert it into HFiles, and then load those HFiles into an HBase table. Because the data bypasses the normal write path (WAL and MemStore), this bulk load puts almost no pressure on HBase.
mapper
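The driver below references a MapperBulkLoad class that turns each input line into KeyValue pairs keyed by the row key. A minimal sketch is given here; the column family f1 and the qualifier names dt/item/num/info are assumptions, not taken from the original table definition, so adjust them to the actual schema of stu6.
package com.hbaseBulkLoad;
import java.io.IOException;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class MapperBulkLoad extends Mapper<LongWritable, Text, ImmutableBytesWritable, KeyValue> {
    // Assumed column family; must match the family of the target table stu6
    private static final byte[] CF = Bytes.toBytes("f1");
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Parse one comma-separated line: id,dt,item,num,info
        Hist hist = Hist.parse(value.toString());
        byte[] rowKey = Bytes.toBytes(hist.getId());
        ImmutableBytesWritable outKey = new ImmutableBytesWritable(rowKey);
        // Emit one KeyValue per column; the key/value types match the driver's
        // setMapOutputKeyClass / setMapOutputValueClass settings
        context.write(outKey, new KeyValue(rowKey, CF, Bytes.toBytes("dt"),
                Bytes.toBytes(String.valueOf(hist.getDt().getTime())))); // date stored as epoch millis here; a formatted string would also work
        context.write(outKey, new KeyValue(rowKey, CF, Bytes.toBytes("item"), Bytes.toBytes(hist.getItem())));
        context.write(outKey, new KeyValue(rowKey, CF, Bytes.toBytes("num"), Bytes.toBytes(String.valueOf(hist.getNum()))));
        context.write(outKey, new KeyValue(rowKey, CF, Bytes.toBytes("info"), Bytes.toBytes(hist.getInfo())));
    }
}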
Hist (entity class)
package com.hbaseBulkLoad;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
public class Hist {
private String id;
private Date dt;
private String item;
private Integer num;
private String info;
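// Note: parse() below always fills in and returns this single shared Hist instance, and SimpleDateFormat is not thread-safe;
// this only works because each map task runs single-threaded and uses the parsed record immediately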
private static Hist hist = new Hist();
private static SimpleDateFormat sdf =new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
public Hist() {
}
public Hist(String id, Date dt, String item, Integer num, String info) {
this.id = id;
this.dt = dt;
this.item = item;
this.num = num;
this.info = info;
}
public String getId() {
return id;
}
public void setId(String id) {
this.id = id;
}
public Date getDt() {
return dt;
}
public void setDt(Date dt) {
this.dt = dt;
}
public String getItem() {
return item;
}
public void setItem(String item) {
this.item = item;
}
public Integer getNum() {
return num;
}
public void setNum(Integer num) {
this.num = num;
}
public String getInfo() {
return info;
}
public void setInfo(String info) {
this.info = info;
}
@Override
public String toString() {
return "Hist{" +
"id='" + id + '\'' +
", dt=" + sdf.format(dt) +
", item='" + item + '\'' +
", num=" + num +
", info='" + info + '\'' +
'}';
}
public static Hist parse(String line){
String[] splitline = line.trim().split(",");
hist.setId(splitline[0]);
try {
hist.setDt(sdf.parse(splitline[1]));
} catch (ParseException e) {
e.printStackTrace();
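// Fall back to the current time when the date field cannot be parsed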
hist.setDt(new java.util.Date());
}
hist.setItem(splitline[2]);
hist.setNum(Integer.parseInt(splitline[3]));
hist.setInfo(splitline[4]);
return hist;
}
}
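parse expects comma-separated lines with the fields in the order id, dt, item, num, info, where dt is formatted as yyyy-MM-dd HH:mm:ss. A hypothetical input line (made-up values, only to illustrate the format):
1001,2020-06-01 10:30:00,apple,3,hello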
mainjob
package com.hbaseBulkLoad;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.*;
import org.apache.hadoop.hbase.client.*;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.HFileOutputFormat2;
import org.apache.hadoop.hbase.mapreduce.LoadIncrementalHFiles;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class joMain extends Configured implements Tool {
@Override
public int run(String[] args) throws Exception {
String hbaseTable="stu6";
Connection connection = ConnectionFactory.createConnection(getConf());
Admin admin = connection.getAdmin();
Table table = connection.getTable(TableName.valueOf(hbaseTable));
Job job = Job.getInstance(getConf(), "HbaseBulkLoad");
job.setJarByClass(joMain.class);
job.setMapperClass(MapperBulkLoad.class);
job.setMapOutputKeyClass(ImmutableBytesWritable.class);
job.setMapOutputValueClass(KeyValue.class);
job.setInputFormatClass(TextInputFormat.class);
FileInputFormat.setInputPaths(job,new Path("/tmp/a"));
job.setOutputFormatClass(TextOutputFormat.class); // overridden below: configureIncrementalLoad switches the job to HFileOutputFormat2
Path Outpath = new Path("/tmp/out1");
if (Outpath.getFileSystem(getConf()).exists(Outpath)) {
Outpath.getFileSystem(getConf()).delete(Outpath,true);
}
FileOutputFormat.setOutputPath(job,Outpath);
job.setNumReduceTasks(0); // configureIncrementalLoad below overrides this with one reducer per region
HFileOutputFormat2.configureIncrementalLoad(job,
table,
connection.getRegionLocator(TableName.valueOf(hbaseTable)));
// Run the MapReduce job that writes the HFiles under /tmp/out1
boolean b = job.waitForCompletion(true);
String AutoloadRegionFlag="false";
if(args.length==1){
AutoloadRegionFlag = args[0];
}
if ("true".equals(AutoloadRegionFlag)){
/* Alternatively, load the HFiles into the HBase table from the command line:
 * Option 1: hadoop jar hbase/lib/hbase-server-1.2.6.jar completebulkload /tmp/out1/ (HFile directory) stu6 (table name)
 * Option 2: hbase org.apache.hadoop.hbase.mapreduce.LoadIncrementalHFiles /tmp/out1/ stu6
 */
if (b){
System.out.println("==================Hbase BulkLoad=====================");
long start = System.currentTimeMillis(); // From here the HFiles are loaded straight into the table and disappear from the output directory; this step is fast because the files are moved (renamed) into the region directories rather than copied or rewritten
LoadIncrementalHFiles load = new LoadIncrementalHFiles(getConf());
load.doBulkLoad(Outpath,admin,table,connection.getRegionLocator(TableName.valueOf(hbaseTable)));
long end = System.currentTimeMillis();
System.out.println("BulkLoad耗时: "+(end-start)+" ms");
}
}
return b?0:1;
}
public static void main(String[] args) throws Exception {
Configuration hbaseconf = HBaseConfiguration.create();
Configuration conf = new Configuration(hbaseconf);
int run = ToolRunner.run(conf, new joMain(), args);
System.exit(run);
}
}
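To run the job, package the classes into a jar (the jar name below is hypothetical) and make the HBase client jars and hbase-site.xml visible to Hadoop; passing true as the single argument makes the driver bulk-load the generated HFiles itself:
export HADOOP_CLASSPATH=$(hbase classpath)
hadoop jar hbase-bulkload.jar com.hbaseBulkLoad.joMain true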
If the flag is not set to true, the HFiles can instead be loaded into the HBase table from the command line:
Option 1: hadoop jar hbase/lib/hbase-server-1.2.6.jar completebulkload /tmp/out1/ (HFile directory) stu6 (table name)
Option 2: hbase org.apache.hadoop.hbase.mapreduce.LoadIncrementalHFiles /tmp/out1/ stu6
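Either way, a quick sanity check can be done from the HBase shell (stu6 is the table used throughout; the LIMIT value is arbitrary):
hbase shell
scan 'stu6', {LIMIT => 5}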
The driver sets the number of reduce tasks to 0, yet a reduce phase shows up during execution, and it can take quite a long time. This is because HFileOutputFormat2.configureIncrementalLoad reconfigures the job: it installs a TotalOrderPartitioner and a sort reducer (KeyValueSortReducer here, since the map output value class is KeyValue), and sets the number of reducers to the number of regions of the target table, so that each reducer writes fully sorted HFiles for one region.
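For reference, here is a simplified paraphrase of what HFileOutputFormat2.configureIncrementalLoad sets up in HBase 1.x; it is a sketch of the library's behaviour, not its exact source, and it shows where the extra reduce phase comes from:
import java.io.IOException;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.client.RegionLocator;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.HFileOutputFormat2;
import org.apache.hadoop.hbase.mapreduce.KeyValueSortReducer;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner;
public class ConfigureIncrementalLoadSketch {
    // Roughly what configureIncrementalLoad(job, table, regionLocator) configures on the job
    static void configure(Job job, RegionLocator regionLocator) throws IOException {
        job.setOutputKeyClass(ImmutableBytesWritable.class);
        job.setOutputValueClass(KeyValue.class);
        job.setOutputFormatClass(HFileOutputFormat2.class);         // replaces the TextOutputFormat set by the driver
        // Total ordering of KeyValues: one reducer per region, partitioned on region start keys
        job.setPartitionerClass(TotalOrderPartitioner.class);
        job.setReducerClass(KeyValueSortReducer.class);              // chosen because the map output value class is KeyValue
        job.setNumReduceTasks(regionLocator.getStartKeys().length);  // overrides setNumReduceTasks(0)
        // The real implementation also writes a partitions file with the region boundaries,
        // copies compression/bloom/block-size settings from the table descriptor,
        // and adds the HBase dependency jars to the job.
    }
}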