1 RegionServer architecture
1.1 StoreFile
The physical files that hold the actual data; StoreFiles are persisted on HDFS in HFile format. Each Store has one or more StoreFiles (HFiles), and the data within each StoreFile is sorted.
1.2 MemStore
The write cache. Because the data in an HFile must be sorted, writes are first buffered and sorted in the MemStore; only when a flush is triggered is the data written out to an HFile, and each flush produces a new HFile.
1.3 WAL
Because data must pass through the MemStore before it can be flushed to an HFile, keeping it only in memory carries a high risk of data loss. To guard against this, every edit is first written to a Write-Ahead Log (WAL) file and only then inserted into the MemStore, so after a system failure the data can be rebuilt from the log.
Every hbase.regionserver.optionallogflushinterval (default 1 s), HBase writes buffered edits from memory to the WAL.
All regions on a RegionServer share a single WAL instance.
The WAL check interval is defined by hbase.regionserver.logroll.period, with a default of 1 hour. The check compares the operations in the current WAL with those already persisted to HDFS; operations that have been persisted are moved into the .oldlogs folder (also on HDFS). A WAL instance consists of multiple WAL files; the maximum number of WAL files is controlled by hbase.regionserver.maxlogs (default 32).
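For reference, these WAL-related settings live in hbase-site.xml on the RegionServer side. A minimal sketch, using only the default values quoted above:
<property>
    <name>hbase.regionserver.optionallogflushinterval</name>
    <value>1000</value>
</property>
<property>
    <name>hbase.regionserver.logroll.period</name>
    <value>3600000</value>
</property>
<property>
    <name>hbase.regionserver.maxlogs</name>
    <value>32</value>
</property>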
1.4 BlockCache
The read cache. Data returned by a query is cached in the BlockCache so that the next query for the same data can be served faster.
2 HBase API
2.1 Adding the dependencies
<dependency>
    <groupId>org.apache.hbase</groupId>
    <artifactId>hbase-server</artifactId>
    <version>1.3.1</version>
</dependency>
<dependency>
    <groupId>org.apache.hbase</groupId>
    <artifactId>hbase-client</artifactId>
    <version>1.3.1</version>
</dependency>
2.2 Initialization
public static Configuration conf;
static {
    // On Windows, hadoop.home.dir must point at a local Hadoop installation, otherwise the client throws an error
    System.setProperty("hadoop.home.dir", "F:\\hadoop\\hadoop-2.7.2");
    // Create the configuration via HBaseConfiguration's factory method
    conf = HBaseConfiguration.create();
    // ZooKeeper quorum address and client port
    conf.set("hbase.zookeeper.quorum", "192.168.199.101,192.168.199.102,192.168.199.103");
    conf.set("hbase.zookeeper.property.clientPort", "2181");
}
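Note that, to keep the examples short, each helper method below creates its own Connection and never closes it, and the data-access methods use the deprecated HTable(conf, tableName) constructor. In a real application a Connection is heavyweight and thread-safe, so it would normally be created once, shared, and closed on shutdown. A minimal sketch of that pattern (illustration only, not part of the original examples):
try (Connection connection = ConnectionFactory.createConnection(conf);
     Admin admin = connection.getAdmin()) {
    // use admin here, or obtain a table via connection.getTable(TableName.valueOf(tableName))
}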
2.3 Checking whether a table exists
public static boolean isExists(String tableName) throws IOException {
    // Managing and accessing tables in HBase requires an HBaseAdmin object
    Connection connection = ConnectionFactory.createConnection(conf);
    HBaseAdmin admin = (HBaseAdmin) connection.getAdmin();
    // Check whether the table exists
    return admin.tableExists(tableName);
}
2.4 Creating a table
public static void createTable(String tableName, String... columnFamily) throws IOException {
    // Managing and accessing tables in HBase requires an HBaseAdmin object
    Connection connection = ConnectionFactory.createConnection(conf);
    HBaseAdmin admin = (HBaseAdmin) connection.getAdmin();
    if (isExists(tableName)) {
        System.out.println(tableName + " already exists");
    } else {
        // Create the table descriptor; the table name must be wrapped in a TableName
        HTableDescriptor hTableDescriptor = new HTableDescriptor(TableName.valueOf(tableName));
        // Add a descriptor for each column family
        for (String cf : columnFamily) {
            hTableDescriptor.addFamily(new HColumnDescriptor(cf));
        }
        // Create the table from the descriptor
        admin.createTable(hTableDescriptor);
        System.out.println(tableName + " created successfully");
    }
}
2.5 Deleting a table
public static void deleteTable(String tableName) throws IOException {
    // Managing and accessing tables in HBase requires an HBaseAdmin object
    Connection connection = ConnectionFactory.createConnection(conf);
    HBaseAdmin admin = (HBaseAdmin) connection.getAdmin();
    if (!isExists(tableName)) {
        System.out.println(tableName + " does not exist, nothing to delete");
    } else {
        // The table must be disabled before it can be deleted
        admin.disableTable(tableName);
        admin.deleteTable(tableName);
    }
}
2.6 Inserting data into a table
public static void addRowData(String tableName, String rowKey, String columnFamily, String key, String value) throws IOException {
    Connection connection = ConnectionFactory.createConnection(conf);
    if (!isExists(tableName)) {
        System.out.println(tableName + " does not exist, cannot insert data");
    } else {
        // Create the HBase table object
        HTable hTable = new HTable(conf, tableName);
        // Build a Put from the row key
        Put put = new Put(Bytes.toBytes(rowKey));
        // Add the cell: column family, column qualifier, value
        put.addColumn(Bytes.toBytes(columnFamily), Bytes.toBytes(key), Bytes.toBytes(value));
        // Write the data
        hTable.put(put);
        hTable.close();
    }
}
Call this static method from main; the new row then shows up in the table, confirming the insert succeeded.
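For example, a throwaway main method like the one below exercises the helpers defined so far (the table name, column family, row key, and values are made up purely for this illustration):
public static void main(String[] args) throws IOException {
    // Hypothetical table and values, used only for this demo
    createTable("student", "info");
    addRowData("student", "1001", "info", "name", "zhangsan");
    System.out.println(isExists("student"));
}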
2.7 Deleting multiple rows
public static void deleteByRowkey(String tableName, String... rowKey) throws IOException {
    Connection connection = ConnectionFactory.createConnection(conf);
    if (!isExists(tableName)) {
        System.out.println(tableName + " does not exist, cannot delete");
    } else {
        // Create the table object
        HTable hTable = new HTable(conf, tableName);
        // Issue one Delete per row key
        for (String rk : rowKey) {
            hTable.delete(new Delete(Bytes.toBytes(rk)));
        }
        hTable.close();
    }
}
2.8 Scanning all rows of a table
public static void getAllRowData(String tableName) throws IOException {
    Connection connection = ConnectionFactory.createConnection(conf);
    if (!isExists(tableName)) {
        System.out.println(tableName + " does not exist");
    } else {
        // Create the table object
        HTable hTable = new HTable(conf, tableName);
        // A Scan with no start/stop row scans the whole table
        Scan scan = new Scan();
        ResultScanner results = hTable.getScanner(scan);
        // Iterate over every row returned by the scan
        for (Result rs : results) {
            // A column can hold multiple versions of a value; each version is a cell
            Cell[] cells = rs.rawCells();
            for (Cell cell : cells) {
                // Row key of the cell
                System.out.println("row key: " + Bytes.toString(CellUtil.cloneRow(cell)));
                // Column family
                System.out.println("column family: " + Bytes.toString(CellUtil.cloneFamily(cell)));
                // Column qualifier
                System.out.println("column: " + Bytes.toString(CellUtil.cloneQualifier(cell)));
                // Value
                System.out.println("value: " + Bytes.toString(CellUtil.cloneValue(cell)));
            }
        }
        hTable.close();
    }
}
2.9 Getting a single row
public static void getOneRowKeyData(String tableName, String rowKey) throws IOException {
    Connection connection = ConnectionFactory.createConnection(conf);
    if (!isExists(tableName)) {
        System.out.println(tableName + " does not exist");
    } else {
        // Create the table object
        HTable hTable = new HTable(conf, tableName);
        // Fetch the Result for the given row key
        Get get = new Get(Bytes.toBytes(rowKey));
        Result result = hTable.get(get);
        Cell[] cells = result.rawCells();
        for (Cell cell : cells) {
            System.out.println("row key: " + Bytes.toString(CellUtil.cloneRow(cell)));
            System.out.println("column family: " + Bytes.toString(CellUtil.cloneFamily(cell)));
            System.out.println("column: " + Bytes.toString(CellUtil.cloneQualifier(cell)));
            System.out.println("value: " + Bytes.toString(CellUtil.cloneValue(cell)));
        }
        hTable.close();
    }
}
2.10 Getting the value of a specific "column family:column" in a row
public static void getRowQualifier(String tableName, String rowKey, String family, String key) throws IOException {
    Connection connection = ConnectionFactory.createConnection(conf);
    if (!isExists(tableName)) {
        System.out.println(tableName + " does not exist");
    } else {
        HTable hTable = new HTable(conf, tableName);
        // Fetch the Result for the given row key
        Get get = new Get(Bytes.toBytes(rowKey));
        Result result = hTable.get(get);
        // Read the value of the requested column family:qualifier from the Result
        byte[] value = result.getValue(Bytes.toBytes(family), Bytes.toBytes(key));
        System.out.println(Bytes.toString(value));
        hTable.close();
    }
}
3 HBase-MapReduce
3.1 Official examples
1) Prerequisites
Hadoop needs to be able to load HBase's jars. The simplest approach is to copy the HBase jars into Hadoop's lib directory, or to add the HBase lib path to Hadoop's environment configuration.
To make this permanent, add the following to /etc/profile:
export HBASE_HOME=/root/software/hbase-1.3.1
export HADOOP_HOME=/root/software/hadoop-2.7.2
and add the following to hadoop-env.sh (note: place it after the for loop):
export HADOOP_CLASSPATH=$HADOOP_CLASSPATH:/root/software/hbase-1.3.1/lib/*
2) Running the official MapReduce jobs
Case 1: count how many rows the student table contains
bin/yarn jar lib/hbase-server-1.3.1.jar rowcounter student
The first run failed with an error: the cause was an extra space accidentally added after the colon in the hadoop-env.sh entry, which broke the classpath. Remove the space.
After fixing it, run the job again:
bin/yarn jar lib/hbase-server-1.3.1.jar rowcounter student
3.2 Case 2
Use MapReduce to import local data into HBase.
1) Create a tab-separated file locally: fruit.tsv
1001 Apple Red
1002 Pear Yellow
1003 Pineapple Yellow
2) Create the HBase table
hbase(main):001:0> create 'fruit','info'
3) Create the input_fruit directory on HDFS and upload fruit.tsv
$ hdfs dfs -mkdir /input_fruit/
$ hdfs dfs -put fruit.tsv /input_fruit/
4) Run the importtsv MapReduce job to load the file into the fruit table
$ bin/yarn jar lib/hbase-server-1.3.1.jar importtsv \
-Dimporttsv.columns=HBASE_ROW_KEY,info:name,info:color fruit \
hdfs://master-1:9000/input_fruit
5) Use the scan command to check the imported data
hbase(main):001:0> scan 'fruit'
3.3 Custom HBase MapReduce job
Goal: write data stored on HDFS into an HBase table.
Upload a new file, fruit2.tsv, to HDFS:
1004 Banana Yellow
1005 Orange Yellow
1006 Watermelon Green
1) Build ReadFruitFromHDFSMapper, which reads the file data from HDFS
package com.chen.hbase;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
public class ReadFruitFromHDFSMapper extends Mapper<LongWritable, Text, ImmutableBytesWritable, Put> {
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // One line of input read from HDFS
        String line = value.toString();
        // Each line is tab-separated; split it into fields
        String[] fruits = line.split("\t");
        // Pick out the fields by position
        String rowKey = fruits[0];
        String name = fruits[1];
        String color = fruits[2];
        // Build the output key from the row key
        ImmutableBytesWritable rowKeyWritable = new ImmutableBytesWritable(Bytes.toBytes(rowKey));
        // Build the Put for this row
        Put put = new Put(Bytes.toBytes(rowKey));
        // Arguments: column family, column qualifier, value
        put.addColumn(Bytes.toBytes("info"), Bytes.toBytes("name"), Bytes.toBytes(name));
        put.addColumn(Bytes.toBytes("info"), Bytes.toBytes("color"), Bytes.toBytes(color));
        context.write(rowKeyWritable, put);
    }
}
2) Build the WriteFruitMRFromTxtReducer class
package com.chen.hbase;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableReducer;
import org.apache.hadoop.io.NullWritable;
import java.io.IOException;
public class WriteFruitMRFromTxtReducer extends TableReducer<ImmutableBytesWritable, Put, NullWritable> {
    @Override
    protected void reduce(ImmutableBytesWritable key, Iterable<Put> values, Context context) throws IOException, InterruptedException {
        // Write every Put produced by the mapper into the target HBase table
        for (Put put : values) {
            context.write(NullWritable.get(), put);
        }
    }
}
3) Create Txt2FruitRunner to assemble the Job
package com.chen.hbase;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.Job;
import java.io.IOException;
public class Txt2FruitRunner {
    public static void main(String[] args) throws Exception {
        // Get the Configuration
        Configuration conf = HBaseConfiguration.create();
        // Create the Job
        Job job = Job.getInstance(conf);
        job.setJarByClass(Txt2FruitRunner.class);
        Path path = new Path("hdfs://master-1:9000/input_fruit/fruit2.tsv");
        FileInputFormat.addInputPath(job, path);
        // Configure the Mapper
        job.setMapperClass(ReadFruitFromHDFSMapper.class);
        job.setMapOutputKeyClass(ImmutableBytesWritable.class);
        job.setMapOutputValueClass(Put.class);
        // Configure the Reducer; output goes to the fruit table
        TableMapReduceUtil.initTableReducerJob("fruit", WriteFruitMRFromTxtReducer.class, job);
        // At least one reduce task
        job.setNumReduceTasks(1);
        boolean isSuccess = job.waitForCompletion(true);
        if (!isSuccess) {
            throw new IOException("Job running with error");
        }
    }
}
4) Package the project and run it on the cluster
/root/software/hadoop-2.7.2/bin/yarn jar Hive-0.0.1-SNAPSHOT.jar com.chen.hbase.Txt2FruitRunner
The MapReduce job completes successfully.
Now check the fruit table in HBase (for example with scan 'fruit', as in step 5 of 3.2) to confirm that the new rows have been written.