Apache HBase
Java API
Add the following dependencies to your Maven project:
<dependency>
    <groupId>org.apache.hbase</groupId>
    <artifactId>hbase-client</artifactId>
    <version>1.2.4</version>
</dependency>
<dependency>
    <groupId>org.apache.hbase</groupId>
    <artifactId>hbase-server</artifactId>
    <version>1.2.4</version>
</dependency>
When connecting to the HBase service we first create a Connection object. To execute DDL statements (data definition language: commands such as CREATE, ALTER, and DROP, used mostly when a table is first set up to define or change table structure, data types, relationships, and constraints), obtain an Admin object from the Connection; to execute DML statements (data manipulation language: SELECT, UPDATE, INSERT, and DELETE, the four commands that, as the name suggests, operate on the data itself), obtain a Table object.
public class HBaseDDLTest {
    private Admin admin;
    private Connection conn;
    private Table table;

    @Before
    public void before() throws IOException {
        Configuration conf = HBaseConfiguration.create();
        conf.set(HConstants.ZOOKEEPER_QUORUM, "CentOS");
        conf.set(HConstants.ZOOKEEPER_CLIENT_PORT, "2181");
        conn = ConnectionFactory.createConnection(conf);
        admin = conn.getAdmin();
        table = conn.getTable(TableName.valueOf("baizhi:t_user"));
    }

    @After
    public void after() throws IOException {
        table.close(); // release the Table before closing the Connection
        admin.close();
        conn.close();
    }
}
Regular Access
DDL
1. List all namespaces
NamespaceDescriptor[] descriptors = admin.listNamespaceDescriptors();
for (NamespaceDescriptor descriptor : descriptors) {
    System.out.println(descriptor.getName());
}
2. Create a namespace
//create_namespace 'zpark',{'creator'=>'zhangsan'}
NamespaceDescriptor namespaceDescriptor = NamespaceDescriptor.create("zpark")
        .addConfiguration("creator", "zhangsan")
        .build();
admin.createNamespace(namespaceDescriptor);
3. Modify a namespace
//alter_namespace 'zpark' ,{METHOD=>'unset',NAME=>'creator'}
NamespaceDescriptor namespaceDescriptor = NamespaceDescriptor.create("zpark")
        .removeConfiguration("creator")
        .build();
admin.modifyNamespace(namespaceDescriptor);
4. List the tables in a namespace
//list_namespace_tables 'baizhi'
TableName[] tables = admin.listTableNamesByNamespace("baizhi");
for (TableName tableName : tables) {
    System.out.println(tableName.getNameAsString());
}
5. Drop a namespace
//drop_namespace 'zpark'
admin.deleteNamespace("zpark");
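Note that only an empty namespace can be dropped; deleting one that still contains tables fails with an IOException. A minimal guard sketch, reusing the listing call from step 4:
// a namespace can only be dropped once it contains no tables
if (admin.listTableNamesByNamespace("zpark").length == 0) {
    admin.deleteNamespace("zpark");
}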
6. Create a table
//create 'zpark:t_user',{NAME=>'cf1',VERSIONS=>3,IN_MEMORY=>true,BLOOMFILTER=>'ROWCOL'},{NAME=>'cf2',TTL=>60}
HTableDescriptor tableDescriptor = new HTableDescriptor(TableName.valueOf("zpark:t_user"));
HColumnDescriptor cf1 = new HColumnDescriptor("cf1");
cf1.setMaxVersions(3);
cf1.setInMemory(true);
cf1.setBloomFilterType(BloomType.ROWCOL);
HColumnDescriptor cf2 = new HColumnDescriptor("cf2");
cf2.setTimeToLive(60); // TTL in seconds
tableDescriptor.addFamily(cf1);
tableDescriptor.addFamily(cf2);
admin.createTable(tableDescriptor);
7. Drop a table
//disable 'zpark:t_user'
//drop 'zpark:t_user'
TableName tableName = TableName.valueOf("zpark:t_user");
boolean exists = admin.tableExists(tableName);
if (!exists) {
    return;
}
boolean disabled = admin.isTableDisabled(tableName);
if (!disabled) {
    admin.disableTable(tableName); // a table must be disabled before it can be dropped
}
admin.deleteTable(tableName);
8. Truncate a table
TableName tableName = TableName.valueOf("baizhi:t_user");
boolean disabled = admin.isTableDisabled(tableName);
if (!disabled) {
    admin.disableTable(tableName);
}
admin.truncateTable(tableName, false); // false: do not preserve the existing region splits
DML
1. Insert/update - put
// assumes a static import: import static org.apache.hadoop.hbase.util.Bytes.toBytes;
String[] depts = new String[]{"search", "sale", "manager"};
for (int i = 0; i <= 1000; i++) {
    DecimalFormat format = new DecimalFormat("0000");
    String rowKey = format.format(i); // zero-padded row keys: 0000..1000
    Put put = new Put(toBytes(rowKey));
    put.addColumn(toBytes("cf1"), toBytes("name"), toBytes("user" + rowKey));
    put.addColumn(toBytes("cf1"), toBytes("salary"), toBytes(100.0 * i));
    put.addColumn(toBytes("cf1"), toBytes("dept"), toBytes(depts[new Random().nextInt(3)]));
    table.put(put);
}
String[] depts = new String[]{"search", "sale", "manager"};
// batch inserts/updates through a client-side write buffer
BufferedMutator bufferedMutator = conn.getBufferedMutator(TableName.valueOf("baizhi:t_user"));
for (int i = 1000; i <= 2000; i++) {
    DecimalFormat format = new DecimalFormat("0000");
    String rowKey = format.format(i);
    Put put = new Put(toBytes(rowKey));
    put.addColumn(toBytes("cf1"), toBytes("name"), toBytes("user" + rowKey));
    put.addColumn(toBytes("cf1"), toBytes("salary"), toBytes(100.0 * i));
    put.addColumn(toBytes("cf1"), toBytes("dept"), toBytes(depts[new Random().nextInt(3)]));
    bufferedMutator.mutate(put);
    if (i % 500 == 0 && i > 1000) { // flush the buffer every 500 rows
        bufferedMutator.flush();
    }
}
bufferedMutator.close(); // close() flushes any remaining buffered mutations
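The write buffer can also be sized explicitly through BufferedMutatorParams. A minimal sketch, assuming the default buffer is too small for your batch (the 8 MB value below is illustrative):
// size the client-side write buffer; buffered mutations are sent
// automatically whenever the buffer fills up, or on flush()/close()
BufferedMutatorParams params = new BufferedMutatorParams(TableName.valueOf("baizhi:t_user"))
        .writeBufferSize(8 * 1024 * 1024); // 8 MB (illustrative)
BufferedMutator mutator = conn.getBufferedMutator(params);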
2. Read a single row (containing multiple Cells) - get
Get get = new Get(toBytes("2000"));
Result result = table.get(get); // one row, possibly containing multiple Cells
byte[] bname = result.getValue(toBytes("cf1"), toBytes("name"));
byte[] bdept = result.getValue(toBytes("cf1"), toBytes("dept"));
byte[] bsalary = result.getValue(toBytes("cf1"), toBytes("salary"));
String name = Bytes.toString(bname);
String dept = Bytes.toString(bdept);
Double salary = Bytes.toDouble(bsalary);
System.out.println(name + " " + dept + " " + salary);
There are several ways to get Cells out of a Result. getValue is the most commonly used, and requires the caller to specify the column. To iterate over all Cells of a Result, use CellScanner or listCells (the snippets below assume static imports of CellUtil.cloneQualifier, cloneValue, and cloneRow, and of Bytes.toDouble).
Get get = new Get(toBytes("2000"));
Result result = table.get(get); // one row, possibly containing multiple Cells
CellScanner cellScanner = result.cellScanner();
while (cellScanner.advance()) {
    Cell cell = cellScanner.current();
    // column qualifier of the Cell
    String qualifier = Bytes.toString(cloneQualifier(cell));
    // value of the Cell
    Object value = null;
    if (qualifier.equals("salary")) {
        value = toDouble(cloneValue(cell));
    } else {
        value = Bytes.toString(cloneValue(cell));
    }
    // RowKey of the Cell
    String rowKey = Bytes.toString(cloneRow(cell));
    System.out.println(rowKey + " " + qualifier + " " + value);
}
Get get = new Get(toBytes("2000"));
Result result = table.get(get); // one row, possibly containing multiple Cells
List<Cell> cells = result.listCells();
for (Cell cell : cells) {
    // column qualifier of the Cell
    String qualifier = Bytes.toString(cloneQualifier(cell));
    // value of the Cell
    Object value = null;
    if (qualifier.equals("salary")) {
        value = toDouble(cloneValue(cell));
    } else {
        value = Bytes.toString(cloneValue(cell));
    }
    // RowKey of the Cell
    String rowKey = Bytes.toString(cloneRow(cell));
    System.out.println(rowKey + " " + qualifier + " " + value);
}
You can also use the getColumnCells method to retrieve multiple versions of a single Cell:
Get get = new Get(toBytes("2000"));
get.setMaxVersions(3);
get.setTimeStamp(1602299440060L); // restricts the read to cells with exactly this timestamp
Result result = table.get(get); // one row, possibly containing multiple Cells
List<Cell> salaryCells = result.getColumnCells(toBytes("cf1"), toBytes("salary"));
for (Cell salaryCell : salaryCells) {
    System.out.println(toDouble(cloneValue(salaryCell)));
}
3. Increment a Cell value - incr. Note that Increment interprets the target cell as an 8-byte long, so it must not be pointed at cf1:salary, which was written above as a double; the sketch below therefore targets a hypothetical long-valued counter column cf1:visits.
Increment increment = new Increment(toBytes("2000"));
// cf1:visits is a hypothetical counter column holding an 8-byte long;
// if the cell does not exist yet, HBase initializes it with the delta
increment.addColumn(toBytes("cf1"), toBytes("visits"), 1000L);
table.increment(increment);
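For a single column, Table also offers a shortcut with the same semantics, again against the hypothetical cf1:visits counter:
// returns the new counter value after applying the delta
long visits = table.incrementColumnValue(toBytes("2000"), toBytes("cf1"), toBytes("visits"), 1L);
System.out.println("visits = " + visits);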
4. Delete data - delete/deleteall
- deleteall
Delete delete = new Delete(toBytes("2000")); // no column specified: deletes the entire row
table.delete(delete);
- delete
Delete delete = new Delete(toBytes("2000"));
// addColumn deletes only the latest version of the cell; use addColumns to delete all versions
delete.addColumn(toBytes("cf1"), toBytes("salary"));
table.delete(delete);
5. Table scan - scan
Scan scan = new Scan();
ResultScanner scanner = table.getScanner(scan);
Iterator<Result> resultIterator = scanner.iterator();
while (resultIterator.hasNext()) {
    Result result = resultIterator.next();
    byte[] bname = result.getValue(toBytes("cf1"), toBytes("name"));
    byte[] bdept = result.getValue(toBytes("cf1"), toBytes("dept"));
    byte[] bsalary = result.getValue(toBytes("cf1"), toBytes("salary"));
    String name = Bytes.toString(bname);
    String dept = Bytes.toString(bdept);
    Double salary = Bytes.toDouble(bsalary);
    String rowKey = Bytes.toString(result.getRow());
    System.out.println(rowKey + " " + name + " " + dept + " " + salary);
}
We can configure the Scan object with custom conditions to satisfy more complex queries. The start row is inclusive and the stop row exclusive, and the FilterList below passes a row if it matches either filter (MUST_PASS_ONE is a logical OR):
Scan scan = new Scan();
scan.setStartRow(toBytes("1000")); // inclusive
scan.setStopRow(toBytes("1100")); // exclusive
//scan.setRowPrefixFilter(toBytes("108"));
Filter filter1 = new RowFilter(CompareFilter.CompareOp.EQUAL, new RegexStringComparator("09$"));
Filter filter2 = new RowFilter(CompareFilter.CompareOp.EQUAL, new SubstringComparator("80"));
FilterList filter = new FilterList(FilterList.Operator.MUST_PASS_ONE, filter1, filter2);
scan.setFilter(filter);
ResultScanner scanner = table.getScanner(scan);
Iterator<Result> resultIterator = scanner.iterator();
while (resultIterator.hasNext()) {
    Result result = resultIterator.next();
    byte[] bname = result.getValue(toBytes("cf1"), toBytes("name"));
    byte[] bdept = result.getValue(toBytes("cf1"), toBytes("dept"));
    byte[] bsalary = result.getValue(toBytes("cf1"), toBytes("salary"));
    String name = Bytes.toString(bname);
    String dept = Bytes.toString(bdept);
    Double salary = Bytes.toDouble(bsalary);
    String rowKey = Bytes.toString(result.getRow());
    System.out.println(rowKey + " " + name + " " + dept + " " + salary);
}
For more Filters see: https://www.jianshu.com/p/bcc54f63abe4
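As one further example, a value-based filter (SingleColumnValueFilter, from the same filter package as RowFilter) selects rows by a column's content rather than by the row key. A minimal sketch against the cf1:dept column written earlier; note that setFilterIfMissing(true) is needed so rows lacking the column are dropped:
// keep only the rows whose cf1:dept equals "sale"
SingleColumnValueFilter deptFilter = new SingleColumnValueFilter(
        toBytes("cf1"), toBytes("dept"),
        CompareFilter.CompareOp.EQUAL, toBytes("sale"));
deptFilter.setFilterIfMissing(true); // otherwise rows without cf1:dept would pass
Scan scan = new Scan();
scan.setFilter(deptFilter);
for (Result result : table.getScanner(scan)) {
    System.out.println(Bytes.toString(result.getRow()));
}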
MapReduce Integration
HBase ships with MapReduce integration in the form of the TableInputFormat/TableOutputFormat input and output formats. You only need to write your job against these formats.
Note that because TableInputFormat is used, the input splits are computed on the client at job-submission time, so the HBase classpath must be configured on the node that submits the job:
[root@CentOS ~]# vi .bashrc
JAVA_HOME=/usr/java/latest
HADOOP_HOME=/usr/hadoop-2.9.2/
HBASE_HOME=/usr/hbase-1.2.4/
PATH=$PATH:$JAVA_HOME/bin:$HADOOP_HOME/bin:$HADOOP_HOME/sbin:$HBASE_HOME/bin
CLASSPATH=.
export JAVA_HOME
export PATH
export CLASSPATH
export HADOOP_HOME
export HBASE_HOME
HBASE_CLASSPATH=$(/usr/hbase-1.2.4/bin/hbase classpath)
HADOOP_CLASSPATH=$HBASE_CLASSPATH:/root/mysql-connector-java-5.1.49.jar
export HADOOP_CLASSPATH
[root@CentOS ~]# source .bashrc
public class AvgSalaryApplication extends Configured implements Tool {
    public int run(String[] strings) throws Exception {
        Configuration conf = getConf();
        conf = HBaseConfiguration.create(conf);
        conf.set(HConstants.ZOOKEEPER_QUORUM, "CentOS");
        conf.setBoolean("mapreduce.map.output.compress", true);
        conf.setClass("mapreduce.map.output.compress.codec", GzipCodec.class, CompressionCodec.class);

        Job job = Job.getInstance(conf, "AvgSalaryApplication");
        job.setJarByClass(AvgSalaryApplication.class);
        job.setInputFormatClass(TableInputFormat.class);
        job.setOutputFormatClass(TableOutputFormat.class);

        TableMapReduceUtil.initTableMapperJob(
                "baizhi:t_user", new Scan(), AvgSalaryMapper.class,
                Text.class,
                DoubleWritable.class,
                job
        );
        TableMapReduceUtil.initTableReducerJob(
                "baizhi:t_result",
                AvgSalaryReducer.class,
                job
        );
        job.setNumReduceTasks(3);
        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        ToolRunner.run(new AvgSalaryApplication(), args);
    }
}
public class AvgSalaryMapper extends TableMapper<Text, DoubleWritable> {
    @Override
    protected void map(ImmutableBytesWritable key, Result value, Context context) throws IOException, InterruptedException {
        String dept = Bytes.toString(value.getValue(Bytes.toBytes("cf1"), Bytes.toBytes("dept")));
        Double salary = Bytes.toDouble(value.getValue(Bytes.toBytes("cf1"), Bytes.toBytes("salary")));
        context.write(new Text(dept), new DoubleWritable(salary));
    }
}
public class AvgSalaryReducer extends TableReducer<Text, DoubleWritable, NullWritable> {
    @Override
    protected void reduce(Text key, Iterable<DoubleWritable> values, Context context) throws IOException, InterruptedException {
        double sum = 0.0;
        int count = 0;
        for (DoubleWritable value : values) {
            count++;
            sum += value.get();
        }
        // key.copyBytes() trims the Text backing array to its actual length;
        // key.getBytes() may include trailing garbage bytes
        Put put = new Put(key.copyBytes());
        put.addColumn("cf1".getBytes(), "avg".getBytes(), ((sum / count) + "").getBytes());
        context.write(NullWritable.get(), put);
    }
}
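The job writes into baizhi:t_result, which must exist with a cf1 family before the job is submitted; TableOutputFormat does not create it. A minimal sketch using the Admin API from earlier, assuming the baizhi namespace already exists:
// create the output table if it is missing
TableName resultTable = TableName.valueOf("baizhi:t_result");
if (!admin.tableExists(resultTable)) {
    HTableDescriptor descriptor = new HTableDescriptor(resultTable);
    descriptor.addFamily(new HColumnDescriptor("cf1"));
    admin.createTable(descriptor);
}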
Note that by default TableInputFormat produces one input split (and therefore one map task) per Region of the source table.
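So to get more map parallelism the source table needs more Regions; one way is to pre-split the table when it is created. A minimal sketch, assuming a fresh table (the key range and region count below are illustrative):
// pre-split a fresh table into 4 regions over the row-key range 0000..2000,
// so TableInputFormat would produce 4 splits instead of 1
HTableDescriptor descriptor = new HTableDescriptor(TableName.valueOf("baizhi:t_user"));
descriptor.addFamily(new HColumnDescriptor("cf1"));
admin.createTable(descriptor, Bytes.toBytes("0000"), Bytes.toBytes("2000"), 4);
System.out.println("regions: " + admin.getTableRegions(TableName.valueOf("baizhi:t_user")).size());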
[root@CentOS ~]# hadoop jar HBase-1.0-SNAPSHOT.jar com.baizhi.mapreduce.AvgSalaryApplication
Coprocessors (Advanced)
As a column-family database, HBase is most often criticized for two things: it is hard to build secondary indexes on, and hard to use for sums, counts, sorting, and similar operations. For example, in older versions of HBase (< 0.92), counting the total rows of a table required running a full MapReduce job. Although HBase integrates MapReduce at the storage layer and can run distributed computations over tables effectively, in many cases of simple addition or aggregation, pushing the computation to the server side reduces communication overhead and yields a substantial performance gain. HBase therefore introduced coprocessors in 0.92, enabling some exciting new features: easy secondary indexing, complex filters (predicate pushdown), and access control.
Broadly speaking, there are two kinds of coprocessors: Observers and Endpoints.
Observer
An Observer is similar to a trigger in a traditional database: the server invokes this kind of coprocessor when certain events occur. Observer coprocessors are hooks scattered through the HBase server code that are called when fixed events fire. For example, the put operation has a prePut hook that the RegionServer calls before the put executes, and a postPut hook called after it.
1. Write an observer
public class UserAppendObServer extends BaseRegionObserver {
    private final static Log LOG = LogFactory.getLog(UserAppendObServer.class);

    @Override
    public Result preAppend(ObserverContext<RegionCoprocessorEnvironment> e, Append append) throws IOException {
        LOG.info("User Append SomeThing ~~~~~~~~~~~~~~");
        return null; // returning null lets HBase proceed with the normal append
    }
}
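A hook for the prePut event mentioned above looks very similar; a minimal sketch (this override is illustrative and not part of the packaged example):
// called on the RegionServer before every Put against a region of the table
@Override
public void prePut(ObserverContext<RegionCoprocessorEnvironment> e, Put put,
                   WALEdit edit, Durability durability) throws IOException {
    LOG.info("prePut row=" + Bytes.toString(put.getRow()));
}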
2. Package the code and upload it to HDFS
[root@CentOS ~]# hdfs dfs -mkdir /libs
[root@CentOS ~]# hdfs dfs -put HBase-1.0-SNAPSHOT.jar /libs/
3. Start HBase and tail the RegionServer log in real time
[root@CentOS ~]# rm -rf /usr/hbase-1.2.4/logs/*
[root@CentOS ~]# start-hbase.sh
[root@CentOS ~]# tail -f /usr/hbase-1.2.4/logs/hbase-root-regionserver-CentOS.log
4. Attach the coprocessor to zpark:t_user
[root@CentOS ~]# hbase shell
hbase(main):001:0> disable 'zpark:t_user'
hbase(main):003:0> alter 'zpark:t_user' , METHOD =>'table_att','coprocessor'=>'hdfs:///libs/HBase-1.0-SNAPSHOT.jar|com.baizhi.observer.UserAppendObServer|1001'
Updating all regions with the new schema...
1/1 regions updated.
Done.
0 row(s) in 2.0830 seconds
hbase(main):004:0> enable 'zpark:t_user'
0 row(s) in 1.2890 seconds
5. Verify that the observer took effect
hbase(main):005:0> desc 'zpark:t_user'
Table zpark:t_user is ENABLED
zpark:t_user, {TABLE_ATTRIBUTES => {coprocessor$1 => 'hdfs:///libs/HBase-1.0-SNAPSHOT.jar|com.baizhi.observer.UserAppendObServer|1001'}
COLUMN FAMILIES DESCRIPTION
{NAME => 'cf1', BLOOMFILTER => 'ROW', VERSIONS => '1', IN_MEMORY => 'false', KEEP_DELETED_CELLS => 'FALSE', DATA_BLOCK_ENCODING => 'NONE', TTL => 'FOREVER', COMPRESSION => 'NONE', MIN_VERSIONS => '0', BLO
CKCACHE => 'true', BLOCKSIZE => '65536', REPLICATION_SCOPE => '0'}
{NAME => 'cf2', BLOOMFILTER => 'ROW', VERSIONS => '1', IN_MEMORY => 'false', KEEP_DELETED_CELLS => 'FALSE', DATA_BLOCK_ENCODING => 'NONE', TTL => 'FOREVER', COMPRESSION => 'NONE', MIN_VERSIONS => '0', BLO
CKCACHE => 'true', BLOCKSIZE => '65536', REPLICATION_SCOPE => '0'}
2 row(s) in 0.0490 seconds
6. Run an Append command and watch the log output
hbase(main):003:0> append 'zpark:t_user','001','cf1:subscribe','002|'
0 row(s) in 0.2140 seconds
2020-10-10 17:23:20,847 INFO [B.defaultRpcServer.handler=3,queue=0,port=16020] observer.UserAppendObServer: User Append SomeThing ~~~~~~~~~~~~~~