In the previous post we operated HBase through the Java API; this time we operate HBase through MapReduce instead.
Requirement: compute word-count statistics over the data in the table below.
hbase(main):006:0> scan 'words'
ROW COLUMN+CELL
1 column=info:word, timestamp=1509345324263, value=hadoop,hdfs,mapreduce,hive,hbase
2 column=info:word, timestamp=1509345324263, value=hdfs,hive,hbase,storm,kafka
3 column=info:word, timestamp=1509345324263, value=hbase,storm,kafka,spark,mahout
The counts must be written to another table, words2, in the following format:
hbase(main):010:0> scan 'words2'
ROW COLUMN+CELL
hadoop column=info:word, timestamp=1509345777813, value=1
hbase column=info:word, timestamp=1509345777813, value=3
hdfs column=info:word, timestamp=1509345777813, value=2
hive column=info:word, timestamp=1509345777813, value=2
kafka column=info:word, timestamp=1509345777813, value=2
mahout column=info:word, timestamp=1509345777813, value=1
mapreduce column=info:word, timestamp=1509345777813, value=1
spark column=info:word, timestamp=1509345777813, value=1
storm column=info:word, timestamp=1509345777813, value=2
Note: all of the shell output above was copied after I had finished the whole exercise.
1. Create the source table 'words' with a single column family, 'info'.
Insert the data: put the comma-separated text into column 'word' of that family, one value per row, each row with a distinct row key.
2. Create the output table 'words2', also with a single column family, 'info'.
3. Run a MapReduce job against the HBase 'words' table, count the words found in 'info:word', and write each count to 'info:word' of 'words2', using the word itself as the row key.
Steps 1 and 2 are automated by the init() method in the program below; everything else is explained in the code comments.
package com.scu.hbase;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.HBaseAdmin;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.mapreduce.TableReducer;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;

/**
 * Word count over HBase using MapReduce.
 * @author 12706
 */
public class MrMachineHBase {
    // HBase configuration object, shared by init() and the MR job
    static Configuration config = null;
    static {
        config = HBaseConfiguration.create();
        // point the client at the ZooKeeper quorum that serves HBase
        config.set("hbase.zookeeper.quorum", "mini1,mini2,mini3");
        config.set("hbase.zookeeper.property.clientPort", "2181");
    }

    // table and column names
    public static final String TABLE_NAME = "words";
    public static final String TABLE_NAME_ = "words2";
    public static final String FAMILY = "info";
    public static final String COLUMN = "word";
    // (re)create both tables and load the sample rows into 'words'
    public static void init() {
        HTable table = null;
        HBaseAdmin admin = null;
        try {
            admin = new HBaseAdmin(config);
            // drop the tables if they already exist
            if (admin.tableExists(TABLE_NAME)) {
                admin.disableTable(TABLE_NAME);
                admin.deleteTable(TABLE_NAME);
            }
            if (admin.tableExists(TABLE_NAME_)) {
                admin.disableTable(TABLE_NAME_);
                admin.deleteTable(TABLE_NAME_);
            }
            // table descriptor for the source table
            TableName tableName = TableName.valueOf(TABLE_NAME);
            HTableDescriptor tableDescriptor = new HTableDescriptor(tableName);
            // column family descriptor, attached to the table descriptor
            HColumnDescriptor columnDescriptor = new HColumnDescriptor(FAMILY);
            tableDescriptor.addFamily(columnDescriptor);
            admin.createTable(tableDescriptor);
            // same again for the output table
            TableName tableName2 = TableName.valueOf(TABLE_NAME_);
            HTableDescriptor tableDescriptor2 = new HTableDescriptor(tableName2);
            HColumnDescriptor columnDescriptor2 = new HColumnDescriptor(FAMILY);
            tableDescriptor2.addFamily(columnDescriptor2);
            admin.createTable(tableDescriptor2);

            table = new HTable(config, TABLE_NAME);
            // buffer puts client-side instead of sending one RPC per put
            table.setAutoFlush(false);
            // write buffer size in bytes
            table.setWriteBufferSize(1000);
            // insert the three sample rows
            List<Put> putList = new ArrayList<>();
            Put put = new Put(Bytes.toBytes("1"));
            put.add(Bytes.toBytes(FAMILY), Bytes.toBytes(COLUMN),
                    Bytes.toBytes("hadoop,hdfs,mapreduce,hive,hbase"));
            putList.add(put);
            Put put2 = new Put(Bytes.toBytes("2"));
            put2.add(Bytes.toBytes(FAMILY), Bytes.toBytes(COLUMN),
                    Bytes.toBytes("hdfs,hive,hbase,storm,kafka"));
            putList.add(put2);
            Put put3 = new Put(Bytes.toBytes("3"));
            put3.add(Bytes.toBytes(FAMILY), Bytes.toBytes(COLUMN),
                    Bytes.toBytes("hbase,storm,kafka,spark,mahout"));
            putList.add(put3);
            table.put(putList);
            // flush the buffered puts to the server
            table.flushCommits();
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // close both handles
            if (table != null) {
                try {
                    table.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            if (admin != null) {
                try {
                    admin.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }
    /**
     * Text: mapper output key type (the word)
     * IntWritable: mapper output value type (always 1)
     * The mapper is invoked once per row (one rowkey = one Result).
     * @author 12706
     */
    static class WordCountMapper extends TableMapper<Text, IntWritable> {
        static Text k = new Text();
        static IntWritable v = new IntWritable(1);

        // key: the rowkey; value: the Result holding that row's cells
        @Override
        protected void map(ImmutableBytesWritable key, Result value,
                Context context) throws IOException, InterruptedException {
            // read the info:word cell, e.g. "hadoop,hdfs,mapreduce,hive,hbase"
            byte[] line = value.getValue(Bytes.toBytes(FAMILY), Bytes.toBytes(COLUMN));
            // split on commas: hadoop hdfs mapreduce hive hbase
            String[] words = Bytes.toString(line).split(",");
            // emit (word, 1) for every word on the line
            for (String w : words) {
                k.set(w);
                context.write(k, v);
            }
        }
    }
    /**
     * Text: input key type (the word)
     * IntWritable: input value type (the 1s emitted by the mapper)
     * ImmutableBytesWritable: output key type, the rowkey written to HBase
     * @author 12706
     */
    static class WordCountReducer extends TableReducer<Text, IntWritable, ImmutableBytesWritable> {
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values,
                Context context) throws IOException, InterruptedException {
            int count = 0;
            for (IntWritable intWritable : values) {
                count += intWritable.get();
            }
            // the word itself becomes the rowkey
            Put put = new Put(Bytes.toBytes(key.toString()));
            // store the count as a string so `scan` shows a readable value (e.g. value=2)
            put.add(Bytes.toBytes(FAMILY), Bytes.toBytes(COLUMN), Bytes.toBytes(String.valueOf(count)));
            // writing to HBase takes the rowkey plus the Put
            context.write(new ImmutableBytesWritable(Bytes.toBytes(key.toString())), put);
        }
    }
    public static void main(String[] args) throws Exception {
        // drop/create the tables and load the sample data
        init();
        // set up the job
        Job job = Job.getInstance(config);
        job.setJarByClass(MrMachineHBase.class);
        // the scan that feeds the mapper; restrict it to the one column we need
        Scan scan = new Scan();
        scan.addColumn(Bytes.toBytes(FAMILY), Bytes.toBytes(COLUMN));
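        // Optional scan tuning for MR jobs; commented out because the original run
        // did not use it, but both methods exist on Scan:
        // scan.setCaching(500);        // rows fetched per RPC, fewer round-trips on a full scan
        // scan.setCacheBlocks(false);  // keep a one-off full scan from churning the block cache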
        // wire up the HBase-reading mapper: table name, scan, mapper class,
        // mapper output key/value types, and the job
        TableMapReduceUtil.initTableMapperJob(TABLE_NAME, scan, WordCountMapper.class,
                Text.class, IntWritable.class, job);
        // wire up the HBase-writing reducer: output table name, reducer class, job
        // (no reduce output types set; they default to match the mapper's)
        TableMapReduceUtil.initTableReducerJob(TABLE_NAME_, WordCountReducer.class, job);
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
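One caveat on the code above: HBaseAdmin, HTable, Put.add(), and flushCommits() belong to the old pre-1.0 client API and are deprecated or removed in later HBase releases. On HBase 1.x and newer the same setup goes through a Connection; a minimal sketch of the equivalent calls, assuming the same config object as above:

import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Table;

// one Connection per application; Admin and Table are lightweight handles
try (Connection conn = ConnectionFactory.createConnection(config);
     Admin admin = conn.getAdmin();
     Table table = conn.getTable(TableName.valueOf(TABLE_NAME))) {
    Put put = new Put(Bytes.toBytes("1"));
    // addColumn() is the replacement for the deprecated Put.add()
    put.addColumn(Bytes.toBytes(FAMILY), Bytes.toBytes(COLUMN),
            Bytes.toBytes("hadoop,hdfs,mapreduce,hive,hbase"));
    table.put(put);
}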
Package the project into a jar and upload it to the Hadoop cluster.
Run the job:
[root@mini1 ~]# hadoop jar hbase.jar com.scu.hbase.MrMachineHBase
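A plain `hadoop jar` works here without hand-wiring the HBase jars onto the task classpath because, as far as I know, TableMapReduceUtil.initTableMapperJob() ships the HBase dependency jars with the job by default; there is an overload that makes the flag explicit:

// same call as in main(), with addDependencyJars spelled out
TableMapReduceUtil.initTableMapperJob(TABLE_NAME, scan, WordCountMapper.class,
        Text.class, IntWritable.class, job, true);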
After the job finishes, check the results from the HBase shell:
hbase(main):011:0> list
TABLE
user1
words
words2
3 row(s) in 0.0550 seconds
=> ["user1", "words", "words2"]
hbase(main):012:0> scan 'words'
ROW COLUMN+CELL
1 column=info:word, timestamp=1509345735252, value=hadoop,hdfs,mapreduce,hive,hbase
2 column=info:word, timestamp=1509345735252, value=hdfs,hive,hbase,storm,kafka
3 column=info:word, timestamp=1509345735252, value=hbase,storm,kafka,spark,mahout
3 row(s) in 0.1610 seconds
hbase(main):013:0> scan 'words2'
ROW COLUMN+CELL
hadoop column=info:word, timestamp=1509345777813, value=1
hbase column=info:word, timestamp=1509345777813, value=3
hdfs column=info:word, timestamp=1509345777813, value=2
hive column=info:word, timestamp=1509345777813, value=2
kafka column=info:word, timestamp=1509345777813, value=2
mahout column=info:word, timestamp=1509345777813, value=1
mapreduce column=info:word, timestamp=1509345777813, value=1
spark column=info:word, timestamp=1509345777813, value=1
storm column=info:word, timestamp=1509345777813, value=2
9 row(s) in 0.0860 seconds