MapReduce Integration with HBase

MR Integration with HBase: Rules for Reading from HBase

  • Goal

    • Understand the development rules for reading HBase data in MapReduce
  • Analysis

    • What gets read is determined by the InputFormat
      • TextInputFormat: reads the contents of a file and returns one KV pair per line
        • K: the byte offset of the line: LongWritable
        • V: the content of the line: Text
    • TableInputFormat: reads data from HBase and converts the data of each Rowkey into one KV pair
      • K: the Rowkey as a byte object: ImmutableBytesWritable
      • V: the data stored under that Rowkey: Result
  • Implementation

    • step1: call the utility method to initialize the Input and the Map

      • A utility class is provided that wraps reading HBase data in MapReduce
      TableMapReduceUtil.initTableMapperJob
      
      public static void initTableMapperJob(
            String table, 
            Scan scan,
            Class<? extends TableMapper> mapper,
            Class<?> outputKeyClass,
            Class<?> outputValueClass, 
            Job job
      );
      
    • step2: build a Map class that extends TableMapper

      /**
       * Extends the base <code>Mapper</code> class to add the required input key
       * and value classes.
       *
       * @param <KEYOUT>  The type of the key.
       * @param <VALUEOUT>  The type of the value.
       * @see org.apache.hadoop.mapreduce.Mapper
       */
      @InterfaceAudience.Public
      public abstract class TableMapper<KEYOUT, VALUEOUT>
      extends Mapper<ImmutableBytesWritable, Result, KEYOUT, VALUEOUT> {
      
      }
      
  • Summary

    • The API for reading HBase data in MapReduce is already wrapped up; you only need to call the utility class

MR Integration with HBase: Reading from HBase in Practice

  • Goal

    • Read data from HBase and write it out to a file
  • Analysis

    • step1: use TableInputFormat to read the HBase data
    • step2: use TextOutputFormat to write the results to a file
  • Implementation

    package bigdata.itcast.cn.hbase.mr;
    
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.conf.Configured;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.hbase.Cell;
    import org.apache.hadoop.hbase.CellUtil;
    import org.apache.hadoop.hbase.HBaseConfiguration;
    import org.apache.hadoop.hbase.client.Result;
    import org.apache.hadoop.hbase.client.Scan;
    import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
    import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
    import org.apache.hadoop.hbase.mapreduce.TableMapper;
    import org.apache.hadoop.hbase.util.Bytes;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
    import org.apache.hadoop.util.Tool;
    import org.apache.hadoop.util.ToolRunner;
    
    import java.io.IOException;
    
    /**
     * @ClassName ReadHbaseTable
     * @Description TODO read data from an HBase table via MapReduce
     * @Create By     Frank
     */
    public class ReadHbaseTable extends Configured implements Tool {
    
        public int run(String[] args) throws Exception {
            //todo: 1 - create the job
            Job job =  Job.getInstance(this.getConf(),"read");
            job.setJarByClass(ReadHbaseTable.class);
            //todo: 2 - configure input, map, reduce, and output
            //input&map
    //        job.setInputFormatClass(TextInputFormat.class);
    //        TextInputFormat.setInputPaths(job,new Path(""));
    //        job.setMapperClass(null);
    //        job.setMapOutputKeyClass(null);
    //        job.setMapOutputValueClass(null);
            //input&map
            /**
             * public static void initTableMapperJob(
             *       String table,                              the table to read from
             *       Scan scan,                                 the Scan object used to read the HBase data; custom filters can be attached
             *       Class<? extends TableMapper> mapper,       the Mapper class
             *       Class<?> outputKeyClass,                   the Map output Key type
             *       Class<?> outputValueClass,                 the Map output Value type
             *       Job job                                    the current job
             *  )
             */
            //build the Scan object that TableInputFormat uses to read from HBase
            Scan scan = new Scan();//a custom Scan is passed in so that filters can pre-filter the data before it is handed to MapReduce
            //filters can be set on the scan so that only the filtered data is loaded into the MapReduce program
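            //for example, a minimal illustrative sketch (the prefix value is an assumption and would
            //additionally require importing org.apache.hadoop.hbase.filter.PrefixFilter):
            //scan.setFilter(new PrefixFilter(Bytes.toBytes("2021")));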
            TableMapReduceUtil.initTableMapperJob(
                    "itcast:t1",
                    scan,
                    ReadHbaseMap.class,
                    Text.class,
                    Text.class,
                    job
            );
            //reduce
            job.setNumReduceTasks(0);
            //output
            TextOutputFormat.setOutputPath(job,new Path("datas/output/hbase"));
            //todo: 3 - submit
            return job.waitForCompletion(true) ? 0:-1;
        }
    
        public static void main(String[] args) throws Exception {
            Configuration conf = HBaseConfiguration.create();
            //specify the ZooKeeper quorum used to locate the HBase servers
            conf.set("hbase.zookeeper.quorum", "node1:2181,node2:2181,node3:2181");
            int status = ToolRunner.run(conf, new ReadHbaseTable(), args);
            System.exit(status);
        }
    
        /**
         * TableMapper<KEYOUT, VALUEOUT>
         * extends Mapper<ImmutableBytesWritable, Result, KEYOUT, VALUEOUT>
         */
        public static class ReadHbaseMap extends TableMapper<Text, Text>{
            //rowkey
            Text outputKey = new Text();
            //the data of each column
            Text outputValue = new Text();
    
    
            /**
             * the map method is called once per KV pair, i.e., once per Rowkey
             * @param key: the rowkey
             * @param value: the data stored under this rowkey
             * @param context
             * @throws IOException
             * @throws InterruptedException
             */
            @Override
            protected void map(ImmutableBytesWritable key, Result value, Context context) throws IOException, InterruptedException {
                //set the output key from the rowkey
                String rowkey = Bytes.toString(key.get());
                this.outputKey.set(rowkey);
                //set the output value for each column
                for(Cell cell : value.rawCells()){
                    //extract the family, qualifier, value, and timestamp of the cell
                    String family = Bytes.toString(CellUtil.cloneFamily(cell));
                    String column = Bytes.toString(CellUtil.cloneQualifier(cell));
                    String val  = Bytes.toString(CellUtil.cloneValue(cell));
                    long ts = cell.getTimestamp();
                    this.outputValue.set(family+"\t"+column+"\t"+val+"\t"+ts);
                    //emit one record per column
                    context.write(this.outputKey,this.outputValue);
                }
            }
        }
    }
    
    
  • Summary

    • Under the hood this still calls the HBase Java API
    • A Scan is used to read the table's data, which is then returned into the MapReduce program (see the sketch below)
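    • The sketch below (a minimal, illustrative example; the class name ScanSketch is made up, while the quorum and the table itcast:t1 are taken from the code above) shows the direct Java API calls that TableInputFormat wraps: a Scan returns one Result per Rowkey, which is exactly the (ImmutableBytesWritable, Result) pair handed to the Mapper

      import org.apache.hadoop.conf.Configuration;
      import org.apache.hadoop.hbase.HBaseConfiguration;
      import org.apache.hadoop.hbase.TableName;
      import org.apache.hadoop.hbase.client.Connection;
      import org.apache.hadoop.hbase.client.ConnectionFactory;
      import org.apache.hadoop.hbase.client.Result;
      import org.apache.hadoop.hbase.client.ResultScanner;
      import org.apache.hadoop.hbase.client.Scan;
      import org.apache.hadoop.hbase.client.Table;
      import org.apache.hadoop.hbase.util.Bytes;
      
      public class ScanSketch {
          public static void main(String[] args) throws Exception {
              Configuration conf = HBaseConfiguration.create();
              conf.set("hbase.zookeeper.quorum", "node1:2181,node2:2181,node3:2181");
              try (Connection conn = ConnectionFactory.createConnection(conf);
                   Table table = conn.getTable(TableName.valueOf("itcast:t1"));
                   ResultScanner scanner = table.getScanner(new Scan())) {
                  //each Result corresponds to one Rowkey, like one call of the map method above
                  for (Result result : scanner) {
                      System.out.println(Bytes.toString(result.getRow()));
                  }
              }
          }
      }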

MR Integration with HBase: Rules for Writing to HBase

  • Goal

    • Understand the development rules for writing to HBase from MapReduce
  • Analysis

    • The output is determined by the OutputFormat

      • TextOutputFormat: writes the KV output to a file
    • TableOutputFormat: writes the KV data produced by the previous stage into an HBase table

      /**
       * Convert Map/Reduce output and write it to an HBase table. The KEY is ignored
       * while the output value <u>must</u> be either a {@link Put} or a
       * {@link Delete} instance.
       */
      @InterfaceAudience.Public
      public class TableOutputFormat<KEY> extends OutputFormat<KEY, Mutation>
      
      • The output Value type must be a Mutation: either a Put or a Delete (see the sketch below)
      • The Key type does not matter; the key is discarded during the write
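      • As a reference, the sketch below (minimal and illustrative; the class name MutationSketch, the rowkeys, and the values are made up) builds both allowed Mutation types

      import org.apache.hadoop.hbase.client.Delete;
      import org.apache.hadoop.hbase.client.Put;
      import org.apache.hadoop.hbase.util.Bytes;
      
      public class MutationSketch {
          public static void main(String[] args) {
              //Put: insert or update the column info:name for rowkey "1"
              Put put = new Put(Bytes.toBytes("1"));
              put.addColumn(Bytes.toBytes("info"), Bytes.toBytes("name"), Bytes.toBytes("liudehua"));
              //Delete: remove all versions of the column info:age for rowkey "2"
              Delete delete = new Delete(Bytes.toBytes("2"));
              delete.addColumns(Bytes.toBytes("info"), Bytes.toBytes("age"));
          }
      }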
  • Implementation

    • step1: call the utility method to initialize the Reduce and the Output

      • A utility class is provided that wraps writing data into HBase
      TableMapReduceUtil.initTableReducerJob
      
      /**
         * Use this before submitting a TableReduce job. It will
         * appropriately set up the JobConf.
         *
         * @param table  The output table.
         * @param reducer  The reducer class to use.
         * @param job  The current job to adjust.
         * @throws IOException When determining the region count fails.
         */
        public static void initTableReducerJob(
            String table,
            Class<? extends TableReducer> reducer,  the Reducer class; Key/Value types are not passed, because the Key does not matter and the Value type is fixed
            Job job
        );
      
    • step2: build a Reduce class that extends TableReducer

      /**
       * Extends the basic <code>Reducer</code> class to add the required key and
       * value input/output classes. 
       *
       * @param <KEYIN>  The type of the input key.
       * @param <VALUEIN>  The type of the input value.
       * @param <KEYOUT>  The type of the output key.
       * @see org.apache.hadoop.mapreduce.Reducer
       */
      @InterfaceAudience.Public
      public abstract class TableReducer<KEYIN, VALUEIN, KEYOUT>
      	extends Reducer<KEYIN, VALUEIN, KEYOUT, Mutation> {
      }
      
  • Summary

    • The API for writing data to HBase from MapReduce is already wrapped up; you only need to call the utility class

MR Integration with HBase: Writing to HBase in Practice

  • Goal

    • Read data from a file and write it into HBase
  • Analysis

    • step1: use TextInputFormat to read the data from the file
    • step2: build Put objects that encapsulate the Rowkey and the columns
    • step3: use TableOutputFormat to write the data into the HBase table
  • Implementation

    • Create the table in HBase

      create 'itcast:mrwrite','info'
      
    • Implementation

      package bigdata.itcast.cn.hbase.mr;
      
      import org.apache.hadoop.conf.Configuration;
      import org.apache.hadoop.conf.Configured;
      import org.apache.hadoop.fs.Path;
      import org.apache.hadoop.hbase.HBaseConfiguration;
      import org.apache.hadoop.hbase.client.Put;
      import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
      import org.apache.hadoop.hbase.mapreduce.TableReducer;
      import org.apache.hadoop.hbase.util.Bytes;
      import org.apache.hadoop.io.LongWritable;
      import org.apache.hadoop.io.Text;
      import org.apache.hadoop.mapreduce.Job;
      import org.apache.hadoop.mapreduce.Mapper;
      import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
      import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
      import org.apache.hadoop.util.Tool;
      import org.apache.hadoop.util.ToolRunner;
      
      import java.io.IOException;
      
      /**
       * @ClassName WriteHbaseTable
       * @Description TODO write data into HBase via MapReduce
       * @Create By     Frank
       */
      public class WriteHbaseTable extends Configured implements Tool {
      
          public int run(String[] args) throws Exception {
              //todo: 1 - create the job
              Job job =  Job.getInstance(this.getConf(),"write");
              job.setJarByClass(WriteHbaseTable.class);
              //todo: 2 - configure input, map, reduce, and output
              //input
              TextInputFormat.setInputPaths(job,new Path("datas/hbase/writeHbase.txt"));
              //map
              job.setMapperClass(WriteToHbaseMap.class);
              job.setMapOutputKeyClass(Text.class);
              job.setMapOutputValueClass(Put.class);
              //shuffle
              //reduce&output
              /**
               *  public static void initTableReducerJob(
               *     String table,                                the HBase table the data will be written into
               *     Class<? extends TableReducer> reducer,       the Reducer class
               *     Job job)                                     the current job
               *
               *     compared with the usual output configuration:
               *      job.setOutputKeyClass is not needed: the Key can be anything, it is never used
               *      job.setOutputValueClass is not needed: TableReducer fixes the output Value type
               *
               */
              TableMapReduceUtil.initTableReducerJob(
                  "itcast:mrwrite",
                  WriteToHbaseReduce.class,
                  job
              );
              //output & reduce
      //        job.setReducerClass(null);
      //        job.setOutputKeyClass(null);
      //        job.setOutputValueClass(null);
      //        job.setOutputFormatClass(TextOutputFormat.class);
      //        TextOutputFormat.setOutputPath(job,new Path(""));
      
              //todo: 3 - submit
              return job.waitForCompletion(true) ? 0:-1;
          }
      
          public static void main(String[] args) throws Exception {
              Configuration conf = HBaseConfiguration.create();
              conf.set("hbase.zookeeper.quorum", "node1:2181,node2:2181,node3:2181");
              int status = ToolRunner.run(conf, new WriteHbaseTable(), args);
              System.exit(status);
          }
      
          /**
           * Read the file; use the id as the output key and build one Put object for each remaining column
           */
          public static class WriteToHbaseMap extends Mapper<LongWritable,Text,Text, Put>{
      
              Text rowkey = new Text();
      
              @Override
              protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
                  //value:1	liudehua	18	male
                  String[] split = value.toString().split("\t");
                  String row = split[0];
                  String name = split[1];
                  String age = split[2];
                  String sex = split[3];
                  //use the id as the rowkey and emit it as the key
                  this.rowkey.set(row);
                  //build the output Values: one Put per column
                  Put putname = new Put(Bytes.toBytes(row));
                  putname.addColumn(Bytes.toBytes("info"),Bytes.toBytes("name"),Bytes.toBytes(name));
                  context.write(rowkey,putname);
                  Put putage = new Put(Bytes.toBytes(row));
                  putage.addColumn(Bytes.toBytes("info"),Bytes.toBytes("age"),Bytes.toBytes(age));
                  context.write(rowkey,putage);
                  Put putsex = new Put(Bytes.toBytes(row));
                  putsex.addColumn(Bytes.toBytes("info"),Bytes.toBytes("sex"),Bytes.toBytes(sex));
                  context.write(rowkey,putsex);
              }
          }
      
          /**
           * public abstract class TableReducer<KEYIN, VALUEIN, KEYOUT>
           * extends Reducer<KEYIN, VALUEIN, KEYOUT, Mutation>
           *     the Value type emitted by the Reduce must be a Put (a Mutation) so that the data can be written into HBase
           */
          public static class WriteToHbaseReduce extends TableReducer<Text,Put,Text>{
              /**
               * all Puts with the same rowkey arrive in a single iterator
               * @param key
               * @param values
               * @param context
               * @throws IOException
               * @throws InterruptedException
               */
              @Override
              protected void reduce(Text key, Iterable<Put> values, Context context) throws IOException, InterruptedException {
                  //simply iterate over the Put objects and emit each one
                  for (Put value : values) {
                      context.write(key,value);
                  }
              }
          }
      
      }
      
      
  • Summary

    • Under the hood this is still implemented with the HBase Java API
    • A Table object is built and all the Put objects are executed against it to write the data into HBase (see the sketch below)
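    • The sketch below (a minimal, illustrative example; the class name PutSketch is made up, while the quorum and the table itcast:mrwrite are taken from the code above) shows the direct Java API write that TableOutputFormat performs for every Put emitted by the Reducer

      import org.apache.hadoop.conf.Configuration;
      import org.apache.hadoop.hbase.HBaseConfiguration;
      import org.apache.hadoop.hbase.TableName;
      import org.apache.hadoop.hbase.client.Connection;
      import org.apache.hadoop.hbase.client.ConnectionFactory;
      import org.apache.hadoop.hbase.client.Put;
      import org.apache.hadoop.hbase.client.Table;
      import org.apache.hadoop.hbase.util.Bytes;
      
      public class PutSketch {
          public static void main(String[] args) throws Exception {
              Configuration conf = HBaseConfiguration.create();
              conf.set("hbase.zookeeper.quorum", "node1:2181,node2:2181,node3:2181");
              try (Connection conn = ConnectionFactory.createConnection(conf);
                   Table table = conn.getTable(TableName.valueOf("itcast:mrwrite"))) {
                  //build one Put per column, exactly like the Mapper above, then execute it
                  Put put = new Put(Bytes.toBytes("1"));
                  put.addColumn(Bytes.toBytes("info"), Bytes.toBytes("name"), Bytes.toBytes("liudehua"));
                  table.put(put);
              }
          }
      }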

Appendix 1: Maven Dependencies

    <repositories>
        <repository>
            <id>aliyun</id>
            <url>http://maven.aliyun.com/nexus/content/groups/public/</url>
        </repository>
    </repositories>
    <properties>
        <hadoop.version>2.7.3</hadoop.version>
        <hbase.version>2.1.2</hbase.version>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-client</artifactId>
            <version>${hbase.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-mapreduce</artifactId>
            <version>${hbase.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-mapreduce-client-jobclient</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-mapreduce-client-core</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-auth</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-hdfs</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>commons-io</groupId>
            <artifactId>commons-io</artifactId>
            <version>2.6</version>
        </dependency>
    </dependencies>