1. Reading an HBase table and writing the results to HDFS
The task: count how many times each class appears. It is essentially another word-count example, except that the input is read from an HBase table and the output is written to HDFS.
The changes required are that the map side extends a different base class, and the Driver wires up the Mapper information a little differently.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
public class MRHbase01 {
/**
* public abstract class TableMapper<KEYOUT, VALUEOUT> extends Mapper<ImmutableBytesWritable, Result, KEYOUT, VALUEOUT>
* TableMapper fixes the Mapper's input key-value types, so we only need to declare the output types
*/
public static class ReadHbaseMapper extends TableMapper<Text, IntWritable> {
@Override
protected void map(ImmutableBytesWritable key, Result value, Mapper<ImmutableBytesWritable, Result, Text, IntWritable>.Context context) throws IOException, InterruptedException{
//the row key is the student id; it is not needed for this count
String id = Bytes.toString(key.get());
//since we are counting class occurrences, read the clazz column from the Result
String clazz = Bytes.toString(value.getValue("info".getBytes(), "clazz".getBytes()));
context.write(new Text(clazz),new IntWritable(1));
}
}
public static class MyReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
@Override
protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
int sum=0;
for (IntWritable value : values) {
sum+=value.get();
}
context.write(key,new IntWritable(sum));
}
}
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
//load the HBase client configuration (hbase-site.xml if present on the classpath)
Configuration conf = HBaseConfiguration.create();
conf.set("hbase.zookeeper.quorum","master:2181,node1:2181,node2:2181");
Job job = Job.getInstance(conf);
job.setJarByClass(MRHbase01.class);
job.setReducerClass(MyReducer.class);
//note that setting the Mapper class here differs from a plain MapReduce job:
//TableMapReduceUtil configures the map side (a reducer-side equivalent exists too)
//and sets the Mapper class and its output k-v types in one call, which a plain MR job does separately
TableMapReduceUtil.initTableMapperJob(
TableName.valueOf("student"),
new Scan(),
ReadHbaseMapper.class,
Text.class,
IntWritable.class,
job
);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
//delete the output path first if it already exists, otherwise the job fails
Path outputPath = new Path("/output");
FileSystem fs = FileSystem.get(conf);
if (fs.exists(outputPath)) {
fs.delete(outputPath, true);
}
FileOutputFormat.setOutputPath(job, outputPath);
//submit the job and wait for it to finish
job.waitForCompletion(true);
}
}
Once the program is written it needs to be packaged and shipped to the cluster, but first we have to add a plugin to the pom file so that the HBase dependencies are bundled into the jar:
<!-- plugin for building a jar that bundles all dependencies -->
<build>
<plugins>
<plugin>
<artifactId>maven-assembly-plugin</artifactId>
<configuration>
<descriptorRefs>
<descriptorRef>jar-with-dependencies</descriptorRef>
</descriptorRefs>
</configuration>
<executions>
<execution>
<id>make-assembly</id>
<phase>package</phase>
<goals>
<goal>single</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>
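With the plugin in place, build the project with the standard Maven package goal; the assembly plugin runs during the package phase:
mvn clean package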
Packaging produces two jars; upload them both, but run the larger one (the jar-with-dependencies).
Run command: hadoop jar <jar name (the larger one)> <class name (if the class is in a package, include the package prefix, or the class will not be found)>
hadoop jar HbaseApi-1.0-SNAPSHOT-jar-with-dependencies.jar MRHbase01
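For example, if the class were declared in a package (com.example is a hypothetical package name here):
hadoop jar HbaseApi-1.0-SNAPSHOT-jar-with-dependencies.jar com.example.MRHbase01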
Output:
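To inspect the result files on HDFS, assuming the default part-file naming in the /output directory used above:
hdfs dfs -cat /output/part-r-*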
Addendum:
When I wrote this, possibly because of my IDEA version, the generated override of TableMapper's map method had a Context parameter without the enclosing class name in front of it; the latest IDEA version generates it correctly.
We can also add it by hand: Ctrl-click TableMapper to jump into its source and see the Mapper<> it extends.
Copy Mapper<ImmutableBytesWritable, Result, KEYOUT, VALUEOUT> out,
then adjust it to your own input and output types.
public abstract class TableMapper<KEYOUT, VALUEOUT> extends Mapper<ImmutableBytesWritable, Result, KEYOUT, VALUEOUT> {
public TableMapper() {
}
}
2. Reading data from HBase and writing the results back to HBase
Since the results are written into HBase, the preparation step is to create an empty table to receive them:
create 'clazz_count','info'
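If you prefer to create the table from code rather than the HBase shell, here is a minimal sketch, assuming the HBase 2.x client API (the class name CreateClazzCount is just for illustration):
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.client.ColumnFamilyDescriptorBuilder;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.TableDescriptorBuilder;
public class CreateClazzCount {
    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        conf.set("hbase.zookeeper.quorum", "master:2181,node1:2181,node2:2181");
        //try-with-resources closes the connection and admin automatically
        try (Connection conn = ConnectionFactory.createConnection(conf);
             Admin admin = conn.getAdmin()) {
            TableName name = TableName.valueOf("clazz_count");
            //only create the table if it does not already exist
            if (!admin.tableExists(name)) {
                admin.createTable(TableDescriptorBuilder.newBuilder(name)
                        .setColumnFamily(ColumnFamilyDescriptorBuilder.of("info"))
                        .build());
            }
        }
    }
}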
The implementation code:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.mapreduce.TableReducer;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
public class MR06Hbase {
/**
* Reads the HBase student table with MapReduce,
* counts the number of students in each class, and saves the result back to HBase
*/
public static class ReadHBaseMapper extends TableMapper<Text, IntWritable> {
@Override
protected void map(ImmutableBytesWritable key, Result value, Mapper<ImmutableBytesWritable, Result, Text, IntWritable>.Context context) throws IOException, InterruptedException {
String clazz = Bytes.toString(value.getValue("info".getBytes(), "clazz".getBytes()));
context.write(new Text(clazz),new IntWritable(1));
}
}
/**
* public abstract class TableReducer<KEYIN, VALUEIN, KEYOUT> extends Reducer<KEYIN, VALUEIN, KEYOUT, Mutation> {
*     public TableReducer() {
*     }
* }
* TableReducer takes three type parameters: the input k-v types, which must match the
* map side's output types, plus the output key type; the fourth Reducer parameter is
* fixed to Mutation. Put is a subclass of Mutation, so we use Put as the output value;
* the output key is not needed, so NullWritable will do.
*/
public static class WriteHbaseReducer extends TableReducer<Text,IntWritable,NullWritable>{
@Override
protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
int sum=0;
for (IntWritable value : values) {
sum+=value.get();
}
//Text.getBytes() returns the backing array, which can be longer than the actual
//content, so go through toString() to get exactly the row-key bytes
Put put = new Put(Bytes.toBytes(key.toString()));
put.addColumn("info".getBytes(), "counts".getBytes(), Bytes.toBytes(String.valueOf(sum)));
context.write(NullWritable.get(),put);
}
}
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
Configuration conf = HBaseConfiguration.create();
conf.set("hbase.zookeeper.quorum", "master:2181,node1:2181,node2:2181");
Job job = Job.getInstance(conf);
job.setJobName("MR06Hbase");
job.setJarByClass(MR06Hbase.class);
//Ctrl+P in IDEA shows the parameter list, handy for filling these in
TableMapReduceUtil.initTableMapperJob(
TableName.valueOf("student"),
new Scan(),
ReadHBaseMapper.class,
Text.class,
IntWritable.class,
job
);
//note: unlike initTableMapperJob, the first parameter here is declared as String table, so passing a plain table-name string is enough
TableMapReduceUtil.initTableReducerJob(
"clazz_count",
WriteHbaseReducer.class,
job
);
job.waitForCompletion(true);
}
}
Package, upload, and run:
hadoop jar HbaseApi-1.0-SNAPSHOT-jar-with-dependencies.jar MR06Hbase
After the job succeeds, check the contents of the table in HBase.
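In the HBase shell, a scan shows one row per class with the count stored in the info:counts column:
scan 'clazz_count'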