Let's first take a look at the MapReduce example in the official HBase reference guide: http://hbase.apache.org/book.html#mapreduce
Here we read an existing HBase table and write a subset of its columns into another table. The source table is hive_emp and the target table is hive_emp_bak.
Straight to the code. It is simple, but first you need a working HBase development environment on your local machine, with the HBase configuration files (hbase-site.xml and friends) placed in the local project so the client can find them.
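If hbase-site.xml is not on the classpath, the client can also be pointed at the cluster in code. This is only a minimal sketch; the ZooKeeper host names are placeholders for your own cluster, and Configuration / HBaseConfiguration are the same classes imported in the full job below.

// only needed when hbase-site.xml is not on the classpath;
// "zk1,zk2,zk3" is a placeholder for your own ZooKeeper quorum
Configuration conf = HBaseConfiguration.create();
conf.set("hbase.zookeeper.quorum", "zk1,zk2,zk3");
conf.set("hbase.zookeeper.property.clientPort", "2181");

With that in place, here is the full job: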
package make.hbase_mapreduce.com;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.mapreduce.TableReducer;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class Table2other extends Configured implements Tool {

    // mapper class: reads each row of the source table and keeps only the
    // info:ename, info:job and info:sal cells
    public static class readMapper extends TableMapper<Text, Put> {

        @Override
        public void map(ImmutableBytesWritable key, Result value, Context context)
                throws IOException, InterruptedException {
            // output key: the rowkey as text
            Text mapoutputkey = new Text();
            String rowkey = Bytes.toString(key.get());
            mapoutputkey.set(rowkey);

            // output value: a Put holding the selected cells of this row
            Put put = new Put(key.get());
            for (Cell cell : value.rawCells()) {
                // keep only the "info" column family
                if ("info".equals(Bytes.toString(CellUtil.cloneFamily(cell)))) {
                    String qualifier = Bytes.toString(CellUtil.cloneQualifier(cell));
                    // column ename
                    if ("ename".equals(qualifier)) {
                        put.add(cell);
                    }
                    // column job
                    if ("job".equals(qualifier)) {
                        put.add(cell);
                    }
                    // column sal
                    if ("sal".equals(qualifier)) {
                        put.add(cell);
                    }
                }
            }
            // skip rows that have none of the selected columns;
            // an empty Put would fail when written to the target table
            if (!put.isEmpty()) {
                context.write(mapoutputkey, put);
            }
        }
    }

    // reducer class: writes the Puts to the target table
    public static class writereducer extends
            TableReducer<Text, Put, ImmutableBytesWritable> {

        @Override
        protected void reduce(Text key, Iterable<Put> value, Context context)
                throws IOException, InterruptedException {
            for (Put put : value) {
                // TableOutputFormat ignores the key, so null is fine here
                context.write(null, put);
            }
        }
    }

    // driver
    @Override
    public int run(String[] args) throws Exception {
        Job job = Job.getInstance(this.getConf(), "hbase_read2write");
        job.setJarByClass(Table2other.class); // class that contains mapper and reducer

        Scan scan = new Scan();
        scan.setCaching(500);       // 1 is the default in Scan, which will be bad for MapReduce jobs
        scan.setCacheBlocks(false); // don't set to true for MR jobs
        // set other scan attrs here if needed

        // set mapper
        TableMapReduceUtil.initTableMapperJob(
                "hbase_hive:hive_emp",  // input table
                scan,                   // Scan instance to control CF and attribute selection
                readMapper.class,       // mapper class
                Text.class,             // mapper output key
                Put.class,              // mapper output value
                job);

        // set reducer
        TableMapReduceUtil.initTableReducerJob(
                "hbase_hive:hive_emp_bak", // output table
                writereducer.class,        // reducer class
                job);
        job.setNumReduceTasks(1); // at least one, adjust as required

        boolean status = job.waitForCompletion(true);
        if (!status) {
            throw new IOException("error with job!");
        }
        return status ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        Configuration config = HBaseConfiguration.create();
        int status = ToolRunner.run(config, new Table2other(), args);
        System.exit(status);
    }
}
This is very similar to a plain Hadoop MapReduce job. A few classes are worth reading the source of: TableMapper, TableReducer and TableMapReduceUtil are essentially wrappers around Put, Delete and so on that have already been implemented for us. Here the table names and columns are hard-coded, but they can also be passed in as arguments, which is much more flexible; a sketch of that follows.
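As a minimal sketch, here is how run() could take the output table from args[0] instead of hard-coding it, matching the test run described further down; everything else stays exactly as in the full class above.

    // a sketch of run() only: the output table comes from args[0]
    @Override
    public int run(String[] args) throws Exception {
        if (args.length < 1) {
            System.err.println("Usage: Table2other <output-table>");
            return 2;
        }
        String outputTable = args[0]; // e.g. hbase_hive:hive_emp_bak

        Job job = Job.getInstance(this.getConf(), "hbase_read2write");
        job.setJarByClass(Table2other.class);

        Scan scan = new Scan();
        scan.setCaching(500);
        scan.setCacheBlocks(false);

        TableMapReduceUtil.initTableMapperJob(
                "hbase_hive:hive_emp", scan, readMapper.class,
                Text.class, Put.class, job);
        TableMapReduceUtil.initTableReducerJob(
                outputTable, writereducer.class, job);
        job.setNumReduceTasks(1);

        return job.waitForCompletion(true) ? 0 : 1;
    }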
Scan is also important and is one of the most frequently used APIs, so you should be familiar with its other settings as well. The ones commented out below come up often; filters are also worth understanding on your own (see the sketch after the list).
// scan.setCacheBlocks(cacheBlocks);
// scan.setCaching(caching);
// scan.setStartRow(Bytes.toBytes("7369"));
// scan.setStopRow(Bytes.toBytes("7654"));
// Scan scan = new Scan(startrowkey, stoprowkey);
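As a small illustration, here is a hedged sketch of a Scan that combines a rowkey range with a filter, written against the same 1.x-era client API used in the job above. The rowkey range and the "MANAGER" value are only examples; these lines would replace the Scan setup inside run(), and SingleColumnValueFilter / CompareFilter need to be imported from org.apache.hadoop.hbase.filter.

// scan only rowkeys 7369 (inclusive) to 7654 (exclusive)
Scan scan = new Scan();
scan.setStartRow(Bytes.toBytes("7369"));
scan.setStopRow(Bytes.toBytes("7654"));
// keep only rows whose info:job equals "MANAGER" (example value)
SingleColumnValueFilter jobFilter = new SingleColumnValueFilter(
        Bytes.toBytes("info"), Bytes.toBytes("job"),
        CompareFilter.CompareOp.EQUAL, Bytes.toBytes("MANAGER"));
jobFilter.setFilterIfMissing(true); // drop rows that have no info:job column at all
scan.setFilter(jobFilter);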
We package the code, upload it, and run it, as shown in the screenshot below (this run was a test of passing an argument: args[0] is the table the data is inserted into).
This is a scan of the result table: the columns we extracted are there and every row has data. That completes this simple MapReduce job, but there is a lot more you can try; design your MapReduce jobs according to your own needs.
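If you would rather check the target table from code than from the hbase shell, here is a minimal sketch, again against the 1.x-era client API, that scans hbase_hive:hive_emp_bak and prints the three copied columns. The class name VerifyBak is made up for this example.

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.ResultScanner;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.util.Bytes;

public class VerifyBak {
    public static void main(String[] args) throws IOException {
        Configuration conf = HBaseConfiguration.create();
        try (Connection conn = ConnectionFactory.createConnection(conf);
             Table table = conn.getTable(TableName.valueOf("hbase_hive:hive_emp_bak"));
             ResultScanner scanner = table.getScanner(new Scan())) {
            for (Result r : scanner) {
                // print rowkey plus the three columns the job copied over
                String rowkey = Bytes.toString(r.getRow());
                String ename = Bytes.toString(r.getValue(Bytes.toBytes("info"), Bytes.toBytes("ename")));
                String job = Bytes.toString(r.getValue(Bytes.toBytes("info"), Bytes.toBytes("job")));
                String sal = Bytes.toString(r.getValue(Bytes.toBytes("info"), Bytes.toBytes("sal")));
                System.out.println(rowkey + "\t" + ename + "\t" + job + "\t" + sal);
            }
        }
    }
}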
That's all.