api学习：hbase的filter

最新推荐文章于 2024-06-17 14:35:38 发布

猫耳山大王

最新推荐文章于 2024-06-17 14:35:38 发布

阅读量1.9k

点赞数

分类专栏： API学习：大数据及云计算文章标签： hadoop hbase api

本文链接：https://blog.csdn.net/blue__yeah/article/details/41040399

版权

API学习：大数据及云计算专栏收录该内容

1 篇文章 0 订阅

订阅专栏

1、下面是我去年学习hbase的filter的api时整理的笔记，放到了一个测试类中，方便查看

2、在此测试类中，假定有一个名字叫tableName的表，列族为all，all列族中有city，column1，column2，column3等字段
3、我的经验是应该每个filter都亲自测试一下，只要碰到某个问题时能确定这个场景能否用filter解决就可以了，然后现用现查即可。
4、但是hbase中最重要的还是行键的设计（因为只有这一级索引）

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.ResultScanner;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.filter.BinaryComparator;
import org.apache.hadoop.hbase.filter.ColumnCountGetFilter;
import org.apache.hadoop.hbase.filter.ColumnPaginationFilter;
import org.apache.hadoop.hbase.filter.ColumnPrefixFilter;
import org.apache.hadoop.hbase.filter.ColumnRangeFilter;
import org.apache.hadoop.hbase.filter.CompareFilter.CompareOp;
import org.apache.hadoop.hbase.filter.DependentColumnFilter;
import org.apache.hadoop.hbase.filter.FamilyFilter;
import org.apache.hadoop.hbase.filter.FilterList;
import org.apache.hadoop.hbase.filter.FirstKeyOnlyFilter;
import org.apache.hadoop.hbase.filter.FuzzyRowFilter;
import org.apache.hadoop.hbase.filter.InclusiveStopFilter;
import org.apache.hadoop.hbase.filter.KeyOnlyFilter;
import org.apache.hadoop.hbase.filter.MultipleColumnPrefixFilter;
import org.apache.hadoop.hbase.filter.PageFilter;
import org.apache.hadoop.hbase.filter.PrefixFilter;
import org.apache.hadoop.hbase.filter.QualifierFilter;
import org.apache.hadoop.hbase.filter.RandomRowFilter;
import org.apache.hadoop.hbase.filter.RowFilter;
import org.apache.hadoop.hbase.filter.SingleColumnValueExcludeFilter;
import org.apache.hadoop.hbase.filter.TimestampsFilter;
import org.apache.hadoop.hbase.filter.ValueFilter;
import org.apache.hadoop.hbase.filter.WhileMatchFilter;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.Pair;

/**
 * Collection of small, self-contained examples showing how to construct the
 * filters shipped in {@code org.apache.hadoop.hbase.filter}.
 *
 * <p>The examples assume a table named {@code tableName} with a single column
 * family {@code all} that holds qualifiers such as {@code city},
 * {@code column1}, {@code column2} and {@code column3}.
 *
 * <p>Only three examples actually talk to a cluster
 * ({@link #filterListAndColumnPrefixFilterTest()},
 * {@link #columnCountGetFilterTest()} and {@link #columnPaginationFilterTest()}).
 * All other methods merely build a {@link Scan}, attach the filter and discard
 * it; they exist to document constructor arguments.
 *
 * <p>Combining filters with AND/OR: to express "(A or B) and C", create an
 * outer {@code FilterList(FilterList.Operator.MUST_PASS_ALL)}, create an inner
 * {@code FilterList(FilterList.Operator.MUST_PASS_ONE)} holding the OR-ed
 * filters, and add the inner list to the outer one.
 */
public class FilterTest {

    /** Name of the demo table opened by the examples that hit a cluster. */
    private static final String TABLE_NAME = "tableName";

    /** The only column family of the demo table. */
    private static final byte[] FAMILY_ALL = Bytes.toBytes("all");

    private static final byte[] QUALIFIER_CITY = Bytes.toBytes("city");
    private static final byte[] QUALIFIER_COLUMN1 = Bytes.toBytes("column1");
    private static final byte[] QUALIFIER_COLUMN2 = Bytes.toBytes("column2");

    /**
     * Prints the latest value of {@code all:<qualifier>} as UTF-8 text, or
     * prints nothing when the row does not carry that column.
     */
    private static void printLatestValue(Result row, byte[] qualifier) {
        byte[] value = row.getValue(FAMILY_ALL, qualifier);
        if (value != null) {
            // Bytes.toString always decodes as UTF-8, independent of the platform charset.
            System.out.println(Bytes.toString(value));
        }
    }

    /**
     * 1 + 2. {@link FilterList} and {@link ColumnPrefixFilter}.
     *
     * <ul>
     *   <li>{@code ColumnPrefixFilter} keeps every column whose qualifier
     *       starts with the given prefix.</li>
     *   <li>Several {@code ColumnPrefixFilter}s can only be combined with
     *       {@code MUST_PASS_ONE}; with {@code MUST_PASS_ALL} the prefixes
     *       contradict each other and nothing is returned.</li>
     *   <li>Once {@code Scan.addColumn} is used, only the explicitly added
     *       columns come back. Here that means {@code city} is not returned
     *       even though the {@code "ci"} prefix would match it.</li>
     * </ul>
     *
     * <p>Scans the demo table and prints {@code all:column1} and
     * {@code all:city} of every returned row when present.
     *
     * @throws IOException if the table cannot be opened, scanned or closed
     */
    public static void filterListAndColumnPrefixFilterTest() throws IOException {
        Configuration conf = HBaseConfiguration.create();
        HTable table = new HTable(conf, TABLE_NAME);
        try {
            // The filters inside this list are OR-ed together.
            FilterList anyPrefix = new FilterList(FilterList.Operator.MUST_PASS_ONE);
            anyPrefix.addFilter(new ColumnPrefixFilter(Bytes.toBytes("column")));
            anyPrefix.addFilter(new ColumnPrefixFilter(Bytes.toBytes("ci")));

            Scan scan = new Scan();
            scan.setFilter(anyPrefix);
            // Restricts the result to these columns; add all:city here as well
            // if the city column should be returned.
            scan.addColumn(FAMILY_ALL, QUALIFIER_COLUMN1);
            scan.addColumn(FAMILY_ALL, QUALIFIER_COLUMN2);

            ResultScanner scanner = table.getScanner(scan);
            try {
                for (Result row : scanner) {
                    printLatestValue(row, QUALIFIER_COLUMN1);
                    printLatestValue(row, QUALIFIER_CITY);
                }
            } finally {
                // Releases the server-side scanner even if iteration fails.
                scanner.close();
            }
        } finally {
            table.close();
        }
    }

    /**
     * 3. {@link MultipleColumnPrefixFilter}.
     *
     * <p>Keeps every column whose qualifier starts with any of the prefixes in
     * the {@code byte[][]}; a column only has to match one of them. It is the
     * multi-prefix version of {@link ColumnPrefixFilter}.
     */
    public static void multipleColumnPrefixFilterTest() {
        byte[][] prefixes = {Bytes.toBytes("_a"), Bytes.toBytes("a")};
        Scan scan = new Scan();
        scan.setFilter(new MultipleColumnPrefixFilter(prefixes));
    }

    /**
     * 4. {@link ColumnCountGetFilter}.
     *
     * <ul>
     *   <li>Intended for {@link Get}; it is not suitable for a {@link Scan}.</li>
     *   <li>With a limit of 0 nothing is returned; with a limit of {@code n}
     *       the first {@code n} columns in storage order are returned.</li>
     *   <li>{@code Result.size()} shows how many cells came back.</li>
     * </ul>
     *
     * <p>Fetches row {@code ABCDEFGH} limited to three columns and prints the
     * row key followed by the number of returned cells. When the row does not
     * exist, {@code null} and {@code 0} are printed.
     *
     * @throws IOException if the table cannot be opened, read or closed
     */
    public static void columnCountGetFilterTest() throws IOException {
        Configuration conf = HBaseConfiguration.create();
        HTable table = new HTable(conf, TABLE_NAME);
        try {
            Get get = new Get(Bytes.toBytes("ABCDEFGH"));
            get.setFilter(new ColumnCountGetFilter(3));
            get.addFamily(FAMILY_ALL);

            Result result = table.get(get);

            // Bytes.toString tolerates the null row key of an empty Result.
            System.out.println(Bytes.toString(result.getRow()));
            // Number of cells returned; compare against the filter limit.
            System.out.println(result.size());
        } finally {
            table.close();
        }
    }

    /**
     * 5. {@link ColumnPaginationFilter}.
     *
     * <ul>
     *   <li>{@code limit}: maximum number of columns returned per row.</li>
     *   <li>{@code offset}: number of leading columns to skip per row;
     *       0 starts at the first column, 1 at the second, and so on.</li>
     * </ul>
     *
     * <p>Scans family {@code all} with {@code limit = 5, offset = 2} and prints
     * {@code all:city} of every row where that column is part of the page.
     *
     * @throws IOException if the table cannot be opened, scanned or closed
     */
    public static void columnPaginationFilterTest() throws IOException {
        Configuration conf = HBaseConfiguration.create();
        HTable table = new HTable(conf, TABLE_NAME);
        try {
            Scan scan = new Scan();
            scan.setFilter(new ColumnPaginationFilter(5, 2));
            // addFamily restricts the scan to the given column family.
            scan.addFamily(FAMILY_ALL);

            ResultScanner scanner = table.getScanner(scan);
            try {
                for (Result row : scanner) {
                    printLatestValue(row, QUALIFIER_CITY);
                }
            } finally {
                scanner.close();
            }
        } finally {
            table.close();
        }
    }

    /**
     * 6. {@link ColumnRangeFilter}.
     *
     * <p>Constructor arguments, in order: lower bound of the qualifier range,
     * whether the lower bound is inclusive, upper bound, whether the upper
     * bound is inclusive. The lower bound must sort before the upper bound.
     * This example selects qualifiers in {@code ["a", "b")}.
     */
    public static void columnRangeFilterTest() {
        byte[] lowerBound = Bytes.toBytes("a");
        byte[] upperBound = Bytes.toBytes("b");
        Scan scan = new Scan();
        scan.setFilter(new ColumnRangeFilter(lowerBound, true, upperBound, false));
    }

    /**
     * 7. {@link DependentColumnFilter} (not fully verified by the original author).
     *
     * <p>Takes a family and a qualifier and looks for that "dependent" column
     * in every row. For rows that contain it, the cells sharing its timestamp
     * are returned; rows that lack it return nothing. An optional boolean,
     * when {@code true}, drops the dependent column itself from the result.
     * An optional compare operator plus comparator additionally require the
     * dependent column's value to pass that check before its timestamp is
     * used for matching.
     */
    public static void dependentColumnFilterTest() {
        Scan scan = new Scan();
        scan.setFilter(
                new DependentColumnFilter(
                        FAMILY_ALL,
                        QUALIFIER_CITY,
                        true,
                        CompareOp.EQUAL,
                        new BinaryComparator(Bytes.toBytes("深圳"))));
    }

    /**
     * 8. {@link FamilyFilter}.
     *
     * <p>Selects cells by column family name. The first argument is the
     * compare operator, the second a comparator such as
     * {@code BinaryComparator}, {@code BinaryPrefixComparator},
     * {@code BitComparator}, {@code NullComparator},
     * {@code RegexStringComparator} or {@code SubstringComparator};
     * {@code BinaryComparator} is the most commonly used one.
     */
    public static void familyFilterTest() {
        Scan scan = new Scan();
        scan.setFilter(
                new FamilyFilter(CompareOp.LESS_OR_EQUAL, new BinaryComparator(FAMILY_ALL)));
    }

    /**
     * 9. {@link FirstKeyOnlyFilter}.
     *
     * <p>As the name says, only the first key/value of each row is returned.
     */
    public static void firstKeyOnlyFilterTest() {
        Scan scan = new Scan();
        scan.setFilter(new FirstKeyOnlyFilter());
    }

    /**
     * 10. {@link FuzzyRowFilter}: fuzzy row key matching.
     *
     * <p>Each pair holds a row key template and a mask of the <em>same
     * length</em>. A mask byte of 0 means the row key byte at that position
     * must equal the template byte, 1 means any byte is accepted there.
     *
     * <p>The mask is derived from the template length so the two can never
     * get out of sync. The last six positions are wildcards, matching the
     * six wildcard entries of the original hand-written mask.
     * NOTE(review): the original mask was two bytes shorter than the key, so
     * which six positions were meant to be fuzzy is an assumption - confirm.
     */
    public static void fuzzyRowFilterTest() {
        final int trailingWildcards = 6;

        byte[] rowKeyTemplate = Bytes.toBytes("000000000098764_1378035885");
        // A fresh byte[] is all zeros, i.e. every position is fixed ...
        byte[] mask = new byte[rowKeyTemplate.length];
        // ... except the trailing positions, which may hold any byte.
        Arrays.fill(mask, mask.length - trailingWildcards, mask.length, (byte) 1);

        Scan scan = new Scan();
        scan.setFilter(
                new FuzzyRowFilter(
                        Arrays.asList(new Pair<byte[], byte[]>(rowKeyTemplate, mask))));
    }

    /**
     * 11. {@link InclusiveStopFilter}.
     *
     * <p>The scan returns rows from its start up to the given stop row. Unlike
     * a plain stop row, the stop row itself is included before the scan ends.
     */
    public static void inclusiveStopFilterTest() {
        Scan scan = new Scan();
        scan.setFilter(new InclusiveStopFilter(Bytes.toBytes("CCCCCCCCCCCCCCCC")));
    }

    /**
     * 12. {@link KeyOnlyFilter}.
     *
     * <p>Returns only the key part of each cell. The number of cells is
     * unchanged; each value is rewritten to an empty (non-null) array. With
     * {@code lenAsVal = true} the value is replaced by the length of the
     * original value instead of being emptied, which is why it prints as
     * unreadable characters.
     */
    public static void keyOnlyFilterTest() {
        Scan scan = new Scan();
        scan.setFilter(new KeyOnlyFilter(false));
    }

    /**
     * 13. {@link PageFilter}.
     *
     * <p>Limits the number of rows returned to the given page size. The limit
     * is applied on each region server separately, so a scan that spans
     * several regions can return more rows than the page size.
     */
    public static void pageFilterTest() {
        Scan scan = new Scan();
        scan.setFilter(new PageFilter(10));
    }

    /**
     * 14. {@link PrefixFilter}.
     *
     * <p>Returns every row whose row key starts with the given prefix.
     */
    public static void prefixFilterTest() {
        Scan scan = new Scan();
        scan.setFilter(new PrefixFilter(Bytes.toBytes("bj_")));
    }

    /**
     * 15. {@link QualifierFilter}.
     *
     * <p>Works like {@link FamilyFilter} but compares column qualifiers: the
     * first argument is the compare operator, the second the comparator.
     */
    public static void qualifierFilterTest() {
        Scan scan = new Scan();
        scan.setFilter(
                new QualifierFilter(
                        CompareOp.LESS_OR_EQUAL, new BinaryComparator(QUALIFIER_CITY)));
    }

    /**
     * 16. {@link RandomRowFilter}.
     *
     * <p>The argument is the probability that a row is included. The useful
     * range is 0 to 1; negative values return no rows and values above 1
     * return every row (observations from the original author's tests).
     */
    public static void randomRowFilterTest() {
        Scan scan = new Scan();
        scan.setFilter(new RandomRowFilter((float) 0.0000000001));
    }

    /**
     * 17. {@link RowFilter}.
     *
     * <p>Constructor arguments work like those of {@link FamilyFilter}; every
     * row whose key satisfies the comparison is returned.
     *
     * <p>When the first and last row key are known, prefer
     * {@code scan.setStartRow(Bytes.toBytes("AAAAAAAAAAAA"))} together with
     * {@code scan.setStopRow(Bytes.toBytes("AAAAAAAAABBB"))}. A filter still
     * has to visit all rows, whereas start/stop rows let the scan begin and
     * end at the given keys; the original author measured this to be
     * considerably faster.
     */
    public static void rowFilterTest() {
        Scan scan = new Scan();
        scan.setFilter(
                new RowFilter(
                        CompareOp.LESS_OR_EQUAL,
                        new BinaryComparator(Bytes.toBytes("AAAAAAAAAAAA"))));
    }

    /**
     * 18. {@code SingleColumnValueFilter} and
     * {@link SingleColumnValueExcludeFilter}.
     *
     * <p>Both decide per row, based on the value of one column:
     * <ul>
     *   <li>If the row does not have the column, the whole row is returned
     *       (default behaviour of both filters).</li>
     *   <li>If the column exists but its value fails the comparison, nothing
     *       of the row is returned.</li>
     *   <li>If the column exists and matches, {@code SingleColumnValueFilter}
     *       returns the whole row including the tested column, while
     *       {@code SingleColumnValueExcludeFilter} returns the row without
     *       the tested column.</li>
     * </ul>
     *
     * <p>{@code SingleColumnValueFilter} takes the same constructor arguments
     * as the exclude variant used here.
     */
    public static void singleColumnValueFilterAndSingleColumnValueExcludeFilterTest() {
        Scan scan = new Scan();
        scan.setFilter(
                new SingleColumnValueExcludeFilter(
                        FAMILY_ALL,
                        QUALIFIER_CITY,
                        CompareOp.EQUAL,
                        new BinaryComparator(Bytes.toBytes("深圳"))));
    }

    /*
     * 19. SkipFilter (no example; untested by the original author).
     *
     * A value-based filter on its own returns the matching cells of every row.
     * Wrapped in a SkipFilter, a row is dropped entirely as soon as one of its
     * cells fails the wrapped filter.
     */

    /**
     * 20. {@link TimestampsFilter}.
     *
     * <p>Returns only cells whose timestamp equals one of the values in the
     * given list.
     * NOTE(review): the values below look like epoch seconds, while HBase
     * assigns millisecond timestamps by default - confirm the cells were
     * written with explicit second-based timestamps.
     */
    public static void timestampsFilterTest() {
        // Every timestamp that should be matched.
        List<Long> timestamps = new ArrayList<Long>();
        timestamps.add(1378035885L);
        timestamps.add(1378035000L);
        timestamps.add(1378035775L);
        timestamps.add(1378034964L);

        Scan scan = new Scan();
        scan.setFilter(new TimestampsFilter(timestamps));
    }

    /**
     * 21. {@link ValueFilter}.
     *
     * <p>Compares the value of every cell in every column, i.e. searches the
     * whole table by value.
     */
    public static void valueFilterTest() {
        Scan scan = new Scan();
        scan.setFilter(
                new ValueFilter(
                        CompareOp.EQUAL, new BinaryComparator(Bytes.toBytes("shanghai"))));
    }

    /**
     * 22. {@link WhileMatchFilter} (not fully verified by the original author).
     *
     * <p>Behaves like a {@code while} loop around the wrapped filter: results
     * are returned as long as the wrapped filter matches, and the scan stops
     * at the first cell that does not.
     */
    public static void whileMatchFilterTest() {
        ValueFilter shanghaiOnly =
                new ValueFilter(
                        CompareOp.EQUAL, new BinaryComparator(Bytes.toBytes("shanghai")));
        Scan scan = new Scan();
        scan.setFilter(new WhileMatchFilter(shanghaiOnly));
    }
}

猫耳山大王

关注

0
点赞
踩
1

收藏

觉得还不错? 一键收藏
0
评论
api学习：hbase的filter

在此测试类中，假定有一个名字叫tableName的表，列族为all，all列族中有city，column1，column2，column3等字段。我的经验是应该每个filter都亲自测试一下，只要碰到某个问题时能确定这个场景能否用filter解决就可以了，然后现用现查即可。但是hbase中最重要的还是行键的设计（因为只有这一级索引）。
复制链接

扫一扫