MapReduce Integration with HBase
MR + HBase: Rules for Reading from HBase
Goal
- Understand the development rules for reading HBase data in MapReduce

Analysis
- Input is determined by the InputFormat
  - TextInputFormat: reads the contents of files, returning one KV pair per line
    - K: the offset of the line: LongWritable
    - V: the content of the line: Text
  - TableInputFormat: reads HBase data, converting each rowkey's data into one KV pair (see the sketch below)
    - K: the rowkey bytes: ImmutableBytesWritable
    - V: the data stored under that rowkey: Result
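For orientation, here is a minimal sketch of how the two InputFormats shape the Mapper's generic types; the class names and the emitted values are illustrative only, not part of the original example:

import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

public class InputFormatSketch {

    // With TextInputFormat: map() is called once per line of the file.
    public static class FileMapper extends Mapper<LongWritable, Text, Text, Text> {
        @Override
        protected void map(LongWritable offset, Text line, Context context)
                throws IOException, InterruptedException {
            context.write(new Text(offset.toString()), line);
        }
    }

    // With TableInputFormat: map() is called once per rowkey read from HBase.
    // TableMapper<KEYOUT, VALUEOUT> fixes the input types to
    // Mapper<ImmutableBytesWritable, Result, KEYOUT, VALUEOUT>.
    public static class HbaseMapper extends TableMapper<Text, Text> {
        @Override
        protected void map(ImmutableBytesWritable rowkey, Result result, Context context)
                throws IOException, InterruptedException {
            context.write(new Text(Bytes.toString(rowkey.get())),
                          new Text(String.valueOf(result.size())));
        }
    }
}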

Implementation
- step1: Call the utility method to initialize the input and map stages
  - The HBase MapReduce integration provides a utility class that sets up reading from HBase:
TableMapReduceUtil.initTableMapperJob
public static void initTableMapperJob(
    String table,
    Scan scan,
    Class<? extends TableMapper> mapper,
    Class<?> outputKeyClass,
    Class<?> outputValueClass,
    Job job
);
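As a sketch, a typical call looks like the following; it assumes the itcast:t1 table used later in this section, and the column family and rowkey prefix are hypothetical examples of pre-filtering with the Scan:

// inside run(), with an existing org.apache.hadoop.mapreduce.Job named "job"
Scan scan = new Scan();
scan.addFamily(Bytes.toBytes("info"));                  // hypothetical: load only one column family
scan.setFilter(new PrefixFilter(Bytes.toBytes("00")));  // hypothetical: only rowkeys starting with "00"
TableMapReduceUtil.initTableMapperJob(
    "itcast:t1",         // the table to read from
    scan,                // the Scan carrying the filters
    ReadHbaseMap.class,  // the TableMapper subclass from step2
    Text.class,          // map output key type
    Text.class,          // map output value type
    job
);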

- step2: Define a Mapper class that extends TableMapper
/**
 * Extends the base <code>Mapper</code> class to add the required input key
 * and value classes.
 *
 * @param <KEYOUT> The type of the key.
 * @param <VALUEOUT> The type of the value.
 * @see org.apache.hadoop.mapreduce.Mapper
 */
@InterfaceAudience.Public
public abstract class TableMapper<KEYOUT, VALUEOUT>
    extends Mapper<ImmutableBytesWritable, Result, KEYOUT, VALUEOUT> {
}
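Inside map(), the Result carries all cells of one rowkey. A minimal sketch of pulling a single column out of it; the info:name column is an assumption for illustration:

public static class SingleColumnMapper extends TableMapper<Text, Text> {
    @Override
    protected void map(ImmutableBytesWritable key, Result value, Context context)
            throws IOException, InterruptedException {
        // getValue returns the latest version of one column, or null if the row has none
        byte[] name = value.getValue(Bytes.toBytes("info"), Bytes.toBytes("name"));
        if (name != null) {
            context.write(new Text(Bytes.toString(key.get())), new Text(Bytes.toString(name)));
        }
    }
}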

Summary
- The API for reading HBase data from MapReduce is already wrapped up; just call the utility class
MR + HBase: Reading from HBase in Practice

Goal
- Read data from HBase and write it to a file

Analysis
- step1: Read the HBase data with TableInputFormat
- step2: Write the output to a file with TextOutputFormat

Implementation
package bigdata.itcast.cn.hbase.mr;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import java.io.IOException;

/**
 * @ClassName ReadHbaseTable
 * @Description Read data from an HBase table with MapReduce
 * @Create By Frank
 */
public class ReadHbaseTable extends Configured implements Tool {

    public int run(String[] args) throws Exception {
        // todo:1 - create the job
        Job job = Job.getInstance(this.getConf(), "read");
        job.setJarByClass(ReadHbaseTable.class);

        // todo:2 - configure the job
        // input & map: the generic file-based setup is not used here
        // job.setInputFormatClass(TextInputFormat.class);
        // TextInputFormat.setInputPaths(job, new Path(""));
        // job.setMapperClass(null);
        // job.setMapOutputKeyClass(null);
        // job.setMapOutputValueClass(null);

        // input & map via the utility method:
        /**
         * public static void initTableMapperJob(
         *     String table,                        the table to read from
         *     Scan scan,                           the Scan used to read HBase, with any custom filters
         *     Class<? extends TableMapper> mapper, the Mapper class
         *     Class<?> outputKeyClass,             the map output key type
         *     Class<?> outputValueClass,           the map output value type
         *     Job job                              the current job
         * )
         */
        // Build the Scan object that TableInputFormat uses to read from HBase;
        // you supply your own Scan so you can attach filters and pre-filter the
        // data before it is handed to MapReduce.
        Scan scan = new Scan();
        // filters set on the scan determine which data is loaded into the MapReduce program
        TableMapReduceUtil.initTableMapperJob(
            "itcast:t1",
            scan,
            ReadHbaseMap.class,
            Text.class,
            Text.class,
            job
        );

        // reduce: map-only job
        job.setNumReduceTasks(0);

        // output
        TextOutputFormat.setOutputPath(job, new Path("datas/output/hbase"));

        // todo:3 - submit
        return job.waitForCompletion(true) ? 0 : -1;
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        // point the client at the HBase cluster via the ZooKeeper quorum
        conf.set("hbase.zookeeper.quorum", "node1:2181,node2:2181,node3:2181");
        int status = ToolRunner.run(conf, new ReadHbaseTable(), args);
        System.exit(status);
    }

    /**
     * TableMapper<KEYOUT, VALUEOUT>
     *     extends Mapper<ImmutableBytesWritable, Result, KEYOUT, VALUEOUT>
     */
    public static class ReadHbaseMap extends TableMapper<Text, Text> {

        // the rowkey
        Text outputKey = new Text();
        // the data of each column
        Text outputValue = new Text();

        /**
         * map() is called once per KV pair, i.e. once per rowkey
         * @param key     the rowkey
         * @param value   the data stored under that rowkey
         * @param context
         * @throws IOException
         * @throws InterruptedException
         */
        @Override
        protected void map(ImmutableBytesWritable key, Result value, Context context)
                throws IOException, InterruptedException {
            // set the output key from the rowkey
            String rowkey = Bytes.toString(key.get());
            this.outputKey.set(rowkey);
            // build the output value from each cell
            for (Cell cell : value.rawCells()) {
                // extract the family, qualifier, value and timestamp of each column
                String family = Bytes.toString(CellUtil.cloneFamily(cell));
                String column = Bytes.toString(CellUtil.cloneQualifier(cell));
                String val = Bytes.toString(CellUtil.cloneValue(cell));
                long ts = cell.getTimestamp();
                this.outputValue.set(family + "\t" + column + "\t" + val + "\t" + ts);
                // emit one record per column
                context.write(this.outputKey, this.outputValue);
            }
        }
    }
}

Summary
- Under the hood this still calls the HBase Java API
- A Scan reads the table's data, which is returned to the MapReduce program
MR + HBase: Rules for Writing to HBase

Goal
- Understand the development rules for writing HBase data from MapReduce

Analysis
- Output is determined by the OutputFormat
  - TextOutputFormat: writes the KV output to files
  - TableOutputFormat: writes the upstream KV data into an HBase table
/**
 * Convert Map/Reduce output and write it to an HBase table. The KEY is ignored
 * while the output value <u>must</u> be either a {@link Put} or a
 * {@link Delete} instance.
 */
@InterfaceAudience.Public
public class TableOutputFormat<KEY> extends OutputFormat<KEY, Mutation>
- The output value type must be a Mutation: Put / Delete (see the sketch below)
- The key type does not matter; it is discarded during the write
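A minimal sketch of the two Mutation types that TableOutputFormat accepts; the rowkeys, family, and column names are illustrative:

// a Put inserts or updates cells under a rowkey
Put put = new Put(Bytes.toBytes("row1"));
put.addColumn(Bytes.toBytes("info"), Bytes.toBytes("name"), Bytes.toBytes("laoda"));

// a Delete removes a whole row, or specific columns if added
Delete delete = new Delete(Bytes.toBytes("row2"));

// either can be written as the output value; the key is ignored:
// context.write(anyKey, put);
// context.write(anyKey, delete);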

Implementation
- step1: Call the utility method to initialize the reduce and output stages
  - The HBase MapReduce integration provides a utility class that sets up writing into HBase:
TableMapReduceUtil.initTableReducerJob
/**
 * Use this before submitting a TableReduce job. It will
 * appropriately set up the JobConf.
 *
 * @param table The output table.
 * @param reducer The reducer class to use.
 * @param job The current job to adjust.
 * @throws IOException When determining the region count fails.
 */
public static void initTableReducerJob(
    String table,
    Class<? extends TableReducer> reducer, // no key/value types needed: the key is ignored, the value type is fixed
    Job job
);
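A sketch of the corresponding call, using the itcast:mrwrite table created later in this section:

TableMapReduceUtil.initTableReducerJob(
    "itcast:mrwrite",         // the table to write into
    WriteToHbaseReduce.class, // the TableReducer subclass from step2
    job                       // the current job
);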

- step2: Define a Reducer class that extends TableReducer
/**
 * Extends the basic <code>Reducer</code> class to add the required key and
 * value input/output classes.
 *
 * @param <KEYIN> The type of the input key.
 * @param <VALUEIN> The type of the input value.
 * @param <KEYOUT> The type of the output key.
 * @see org.apache.hadoop.mapreduce.Reducer
 */
@InterfaceAudience.Public
public abstract class TableReducer<KEYIN, VALUEIN, KEYOUT>
    extends Reducer<KEYIN, VALUEIN, KEYOUT, Mutation> {
}
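A minimal sketch of such a reducer, assuming the map stage already emits Put objects keyed by rowkey (the same shape as the full implementation below):

public static class PassThroughReduce extends TableReducer<Text, Put, Text> {
    @Override
    protected void reduce(Text key, Iterable<Put> values, Context context)
            throws IOException, InterruptedException {
        // the output value type is fixed to Mutation, so each Put is forwarded as-is
        for (Put value : values) {
            context.write(key, value);
        }
    }
}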

Summary
- The API for writing HBase data from MapReduce is already wrapped up; just call the utility class
MR + HBase: Writing to HBase in Practice

Goal
- Read data from a file and write it into HBase

Analysis
- step1: Read the file data with TextInputFormat
- step2: Build Put objects that wrap the rowkey and the columns
- step3: Write the data into the HBase table with TableOutputFormat

Implementation

Create the table in HBase:
create 'itcast:mrwrite','info'

MapReduce implementation:
package bigdata.itcast.cn.hbase.mr;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.mapreduce.TableReducer;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import java.io.IOException;

/**
 * @ClassName WriteHbaseTable
 * @Description Write data into HBase with MapReduce
 * @Create By Frank
 */
public class WriteHbaseTable extends Configured implements Tool {

    public int run(String[] args) throws Exception {
        // todo:1 - create the job
        Job job = Job.getInstance(this.getConf(), "write");
        job.setJarByClass(WriteHbaseTable.class);

        // todo:2 - configure the job
        // input
        TextInputFormat.setInputPaths(job, new Path("datas/hbase/writeHbase.txt"));
        // map
        job.setMapperClass(WriteToHbaseMap.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Put.class);
        // shuffle
        // reduce & output via the utility method:
        /**
         * public static void initTableReducerJob(
         *     String table,                          the HBase table to write into
         *     Class<? extends TableReducer> reducer, the reducer class
         *     Job job)                               the current job
         *
         * Compared with the usual output setup:
         *   job.setOutputKeyClass is unnecessary: the key can be anything, it is ignored
         *   job.setOutputValueClass is unnecessary: TableReducer fixes the output value type
         */
        TableMapReduceUtil.initTableReducerJob(
            "itcast:mrwrite",
            WriteToHbaseReduce.class,
            job
        );

        // output & reduce: the generic file-based setup is not used here
        // job.setReducerClass(null);
        // job.setOutputKeyClass(null);
        // job.setOutputValueClass(null);
        // job.setOutputFormatClass(TextOutputFormat.class);
        // TextOutputFormat.setOutputPath(job, new Path(""));

        // todo:3 - submit
        return job.waitForCompletion(true) ? 0 : -1;
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        conf.set("hbase.zookeeper.quorum", "node1:2181,node2:2181,node3:2181");
        int status = ToolRunner.run(conf, new WriteHbaseTable(), args);
        System.exit(status);
    }

    /**
     * Reads the file; the id becomes the key and every other column is wrapped
     * in its own Put object.
     */
    public static class WriteToHbaseMap extends Mapper<LongWritable, Text, Text, Put> {

        Text rowkey = new Text();

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // value: 1    liudehua    18    male
            String[] split = value.toString().split("\t");
            String row = split[0];
            String name = split[1];
            String age = split[2];
            String sex = split[3];
            // use the id as the rowkey and emit it as the key
            this.rowkey.set(row);
            // build the output values: one Put per column
            Put putname = new Put(Bytes.toBytes(row));
            putname.addColumn(Bytes.toBytes("info"), Bytes.toBytes("name"), Bytes.toBytes(name));
            context.write(rowkey, putname);
            Put putage = new Put(Bytes.toBytes(row));
            putage.addColumn(Bytes.toBytes("info"), Bytes.toBytes("age"), Bytes.toBytes(age));
            context.write(rowkey, putage);
            Put putsex = new Put(Bytes.toBytes(row));
            putsex.addColumn(Bytes.toBytes("info"), Bytes.toBytes("sex"), Bytes.toBytes(sex));
            context.write(rowkey, putsex);
        }
    }

    /**
     * public abstract class TableReducer<KEYIN, VALUEIN, KEYOUT>
     *     extends Reducer<KEYIN, VALUEIN, KEYOUT, Mutation>
     * The reduce output value must be a Put (a Mutation) for the data
     * to be written into HBase.
     */
    public static class WriteToHbaseReduce extends TableReducer<Text, Put, Text> {

        /**
         * All Puts that share a rowkey arrive in one iterator.
         * @param key
         * @param values
         * @param context
         * @throws IOException
         * @throws InterruptedException
         */
        @Override
        protected void reduce(Text key, Iterable<Put> values, Context context)
                throws IOException, InterruptedException {
            // simply iterate over the Put objects and emit each one
            for (Put value : values) {
                context.write(key, value);
            }
        }
    }
}
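To confirm the write, one option is a quick read-back with the plain HBase client; a minimal sketch, assuming the input file contained a row with id 1:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.util.Bytes;

public class VerifyWrite {
    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        conf.set("hbase.zookeeper.quorum", "node1:2181,node2:2181,node3:2181");
        try (Connection conn = ConnectionFactory.createConnection(conf);
             Table table = conn.getTable(TableName.valueOf("itcast:mrwrite"))) {
            // fetch the row whose rowkey is "1" (assumed to exist in the input file)
            Result result = table.get(new Get(Bytes.toBytes("1")));
            byte[] name = result.getValue(Bytes.toBytes("info"), Bytes.toBytes("name"));
            System.out.println("name = " + (name == null ? "<missing>" : Bytes.toString(name)));
        }
    }
}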

Summary
- Under the hood this is still implemented with the HBase Java API
- A Table object is built and all the Put objects are executed against it to write the data into HBase
Appendix I: Maven Dependencies
<repositories>
    <repository>
        <id>aliyun</id>
        <url>http://maven.aliyun.com/nexus/content/groups/public/</url>
    </repository>
</repositories>

<properties>
    <hadoop.version>2.7.3</hadoop.version>
    <hbase.version>2.1.2</hbase.version>
</properties>

<dependencies>
    <dependency>
        <groupId>org.apache.hbase</groupId>
        <artifactId>hbase-client</artifactId>
        <version>${hbase.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hbase</groupId>
        <artifactId>hbase-mapreduce</artifactId>
        <version>${hbase.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-mapreduce-client-jobclient</artifactId>
        <version>${hadoop.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-common</artifactId>
        <version>${hadoop.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-mapreduce-client-core</artifactId>
        <version>${hadoop.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-auth</artifactId>
        <version>${hadoop.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-hdfs</artifactId>
        <version>${hadoop.version}</version>
    </dependency>
    <dependency>
        <groupId>commons-io</groupId>
        <artifactId>commons-io</artifactId>
        <version>2.6</version>
    </dependency>
</dependencies>