Let's first take a look at the MapReduce example in the official HBase reference guide: http://hbase.apache.org/book.html#mapreduce
Here we read an existing HBase table and write a subset of its columns into another table. The source table is hive_emp and the target table is hive_emp_bak.
Straight to the code. It is simple, but first you need a working HBase development environment on your local machine, with the HBase configuration files (hbase-site.xml and friends) placed in the local project so the client can find them.
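If hbase-site.xml is not on the classpath, the client can also be pointed at the cluster in code. This is only a minimal sketch; the ZooKeeper host names are placeholders for your own cluster, and Configuration / HBaseConfiguration are the same classes imported in the full job below.

// only needed when hbase-site.xml is not on the classpath;
// "zk1,zk2,zk3" is a placeholder for your own ZooKeeper quorum
Configuration conf = HBaseConfiguration.create();
conf.set("hbase.zookeeper.quorum", "zk1,zk2,zk3");
conf.set("hbase.zookeeper.property.clientPort", "2181");

With that in place, here is the full job: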
package make.hbase_mapreduce.com;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.mapreduce.TableReducer;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class Table2other extends Configured implements Tool {

    // mapper class: reads each row of the source table and keeps only the
    // info:ename, info:job and info:sal cells
    public static class readMapper extends TableMapper<Text, Put> {

        @Override
        public void map(ImmutableBytesWritable key, Result value, Context context)
                throws IOException, InterruptedException {
            // output key: the rowkey as text
            Text mapoutputkey = new Text();
            String rowkey = Bytes.toString(key.get());
            mapoutputkey.set(rowkey);

            // output value: a Put holding the selected cells of this row
            Put put = new Put(key.get());
            for (Cell cell : value.rawCells()) {
                // keep only the "info" column family
                if ("info".equals(Bytes.toString(CellUtil.cloneFamily(cell)))) {
                    String qualifier = Bytes.toString(CellUtil.cloneQualifier(cell));
                    // column ename
                    if ("ename".equals(qualifier)) {
                        put.add(cell);
                    }
                    // column job
                    if ("job".equals(qualifier)) {
                        put.add(cell);
                    }
                    // column sal
                    if ("sal".equals(qualifier)) {
                        put.add(cell);
                    }
                }
            }
            // skip rows that have none of the selected columns;
            // an empty Put would fail when written to the target table
            if (!put.isEmpty()) {
                context.write(mapoutputkey, put);
            }
        }
    }

    // reducer class: writes the Puts to the target table
    public static class writereducer extends
            TableReducer<Text, Put, ImmutableBytesWritable> {

        @Override
        protected void reduce(Text key, Iterable<Put> value, Context context)
                throws IOException, InterruptedException {
            for (Put put : value) {
                // TableOutputFormat ignores the key, so null is fine here
                context.write(null, put);
            }
        }
    }

    // driver
    @Override
    public int run(String[] args) throws Exception {
        Job job = Job.getInstance(this.getConf(), "hbase_read2write");
        job.setJarByClass(Table2other.class); // class that contains mapper and reducer

        Scan scan = new Scan();
        scan.setCaching(500);       // 1 is the default in Scan, which will be bad for MapReduce jobs
        scan.setCacheBlocks(false); // don't set to true for MR jobs
        // set other scan attrs here if needed

        // set mapper
        TableMapReduceUtil.initTableMapperJob(
                "hbase_hive:hive_emp",  // input table
                scan,                   // Scan instance to control CF and attribute selection
                readMapper.class,       // mapper class
                Text.class,             // mapper output key
                Put.class,              // mapper output value
                job);

        // set reducer
        TableMapReduceUtil.initTableReducerJob(
                "hbase_hive:hive_emp_bak", // output table
                writereducer.class,        // reducer class
                job);
        job.setNumReduceTasks(1); // at least one, adjust as required

        boolean status = job.waitForCompletion(true);
        if (!status) {
            throw new IOException("error with job!");
        }
        return status ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        Configuration config = HBaseConfiguration.create();
        int status = ToolRunner.run(config, new Table2other(), args);
        System.exit(status);
    }
}
This is very similar to a plain Hadoop MapReduce job. A few classes are worth reading the source of: TableMapper, TableReducer and TableMapReduceUtil are essentially wrappers around Put, Delete and so on that have already been implemented for us. Here the table names and columns are hard-coded, but they can also be passed in as arguments, which is much more flexible; a sketch of that follows.
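As a minimal sketch, here is how run() could take the output table from args[0] instead of hard-coding it, matching the test run described further down; everything else stays exactly as in the full class above.

    // a sketch of run() only: the output table comes from args[0]
    @Override
    public int run(String[] args) throws Exception {
        if (args.length < 1) {
            System.err.println("Usage: Table2other <output-table>");
            return 2;
        }
        String outputTable = args[0]; // e.g. hbase_hive:hive_emp_bak

        Job job = Job.getInstance(this.getConf(), "hbase_read2write");
        job.setJarByClass(Table2other.class);

        Scan scan = new Scan();
        scan.setCaching(500);
        scan.setCacheBlocks(false);

        TableMapReduceUtil.initTableMapperJob(
                "hbase_hive:hive_emp", scan, readMapper.class,
                Text.class, Put.class, job);
        TableMapReduceUtil.initTableReducerJob(
                outputTable, writereducer.class, job);
        job.setNumReduceTasks(1);

        return job.waitForCompletion(true) ? 0 : 1;
    }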
Scan is also important and is one of the most frequently used APIs, so you should be familiar with its other settings as well. The ones commented out below come up often; filters are also worth understanding on your own (see the sketch after the list).
// scan.setCacheBlocks(cacheBlocks);
// scan.setCaching(caching);
// scan.setStartRow(Bytes.toBytes("7369"));
// scan.setStopRow(Bytes.toBytes("7654"));
// Scan scan = new Scan(startrowkey, stoprowkey);
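As a small illustration, here is a hedged sketch of a Scan that combines a rowkey range with a filter, written against the same 1.x-era client API used in the job above. The rowkey range and the "MANAGER" value are only examples; these lines would replace the Scan setup inside run(), and SingleColumnValueFilter / CompareFilter need to be imported from org.apache.hadoop.hbase.filter.

// scan only rowkeys 7369 (inclusive) to 7654 (exclusive)
Scan scan = new Scan();
scan.setStartRow(Bytes.toBytes("7369"));
scan.setStopRow(Bytes.toBytes("7654"));
// keep only rows whose info:job equals "MANAGER" (example value)
SingleColumnValueFilter jobFilter = new SingleColumnValueFilter(
        Bytes.toBytes("info"), Bytes.toBytes("job"),
        CompareFilter.CompareOp.EQUAL, Bytes.toBytes("MANAGER"));
jobFilter.setFilterIfMissing(true); // drop rows that have no info:job column at all
scan.setFilter(jobFilter);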
We package the code, upload it, and run it, as shown in the screenshot below (this run was a test of passing an argument: args[0] is the table the data is inserted into).
This is a scan of the result table: the columns we extracted are there and every row has data. That completes this simple MapReduce job, but there is a lot more you can try; design your MapReduce jobs according to your own needs.
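If you would rather check the target table from code than from the hbase shell, here is a minimal sketch, again against the 1.x-era client API, that scans hbase_hive:hive_emp_bak and prints the three copied columns. The class name VerifyBak is made up for this example.

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.ResultScanner;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.util.Bytes;

public class VerifyBak {
    public static void main(String[] args) throws IOException {
        Configuration conf = HBaseConfiguration.create();
        try (Connection conn = ConnectionFactory.createConnection(conf);
             Table table = conn.getTable(TableName.valueOf("hbase_hive:hive_emp_bak"));
             ResultScanner scanner = table.getScanner(new Scan())) {
            for (Result r : scanner) {
                // print rowkey plus the three columns the job copied over
                String rowkey = Bytes.toString(r.getRow());
                String ename = Bytes.toString(r.getValue(Bytes.toBytes("info"), Bytes.toBytes("ename")));
                String job = Bytes.toString(r.getValue(Bytes.toBytes("info"), Bytes.toBytes("job")));
                String sal = Bytes.toString(r.getValue(Bytes.toBytes("info"), Bytes.toBytes("sal")));
                System.out.println(rowkey + "\t" + ename + "\t" + job + "\t" + sal);
            }
        }
    }
}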
That's all.