1月 25, 2019 |
Nix.Huang
基于lucene-core 4.10.3接口, 亲测ok
假设要自定义一个keyword分词器,以省、市、县、镇、区将地址分词。
1、扩展Tokenizer 实现自定义的分词器MyKeywordTokenizer,主要待实现的方法有
a)incrementToken()。 没有term时return false, 否则return true并将term的字符拷贝到CharTermAttribute中, 将该词的起止偏移量写入OffsetAttribute。OffsetAttribute记录的是词在原文中的位置, 它本身并不是term vector, 但偏移量可以存入term vector(或postings)中, 主要用于词组的高亮。
b)reset()。 reset重置该类的状态,必须要调用super.reset不然状态不一致。
2、protected createComponents 创建TokenStreamComponents对象, Analyzer 很多时候是Tokenizer+Filter的包装类,比如分完词后进行同义词处理等。
当手动操作TokenStream时,记得按 reset() → incrementToken() → end() → close() 的顺序调用,其中reset和close必不可少。
public class MyKeywordAnalyzer extends Analyzer {
@Override
protected TokenStreamComponents createComponents(String fieldName,
Reader reader) {
return new TokenStreamComponents(new MyKeywordTokenizer(reader));
}
//main 方法测试代码
public static void main(String[] args) throws IOException {
String arr[] = {"重庆市云阳县高阳镇","四川省成都市玉龙乡"};
MyKeywordAnalyzer analyzer = new MyKeywordAnalyzer();
for(int i = 0; i< arr.length; i++) {
TokenStream tokenStream = analyzer.tokenStream(null, new StringReader(arr[i]) );
CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
tokenStream.reset();//必须
while(tokenStream.incrementToken()) {
System.out.println(termAtt.toString());
}
System.out.println("===============================");
tokenStream.close();//必须
}
analyzer.close();
}
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
publicclassMyKeywordAnalyzerextendsAnalyzer{
@Override
protectedTokenStreamComponentscreateComponents(StringfieldName,
Readerreader){
returnnewTokenStreamComponents(newMyKeywordTokenizer(reader));
}
//main 方法测试代码
publicstaticvoidmain(String[]args)throwsIOException{
Stringarr[]={"重庆市云阳县高阳镇","四川省成都市玉龙乡"};
MyKeywordAnalyzeranalyzer=newMyKeywordAnalyzer();
for(inti=0;i
TokenStreamtokenStream=analyzer.tokenStream(null,newStringReader(arr[i]));
CharTermAttributetermAtt=tokenStream.addAttribute(CharTermAttribute.class);
tokenStream.reset();//必须
while(tokenStream.incrementToken()){
System.out.println(termAtt.toString());
}
System.out.println("===============================");
tokenStream.close();//必须
}
analyzer.close();
}
}
/**
 * Tokenizer that splits text after each administrative-level character
 * (省, 市, 县, 镇, 区). The split character is kept as the last character of the
 * token; any trailing text without a split character is emitted as a final token.
 */
public class MyKeywordTokenizer extends Tokenizer {

    /** Characters that terminate a token (inclusive). */
    private static final char[] LEVEL_CHARS = {'省', '市', '县', '镇', '区'};

    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

    /** Offsets, in chars read from {@code input}, of the most recently emitted token. */
    private int startoffset, endoffset;

    /**
     * @param input source of the text to tokenize
     */
    protected MyKeywordTokenizer(Reader input) {
        super(input);
    }

    /**
     * Reads characters until a split character is consumed or the input is exhausted,
     * then publishes them as the next term together with its offsets.
     *
     * <p>Declared {@code final} because Lucene asserts that a non-final TokenStream
     * subclass has a final {@code incrementToken()}.
     *
     * @return {@code true} if a token was produced, {@code false} at end of input
     * @throws IOException if reading from the input fails
     */
    @Override
    public final boolean incrementToken() throws IOException {
        // Required by the TokenStream contract: wipe state left over from the previous token.
        clearAttributes();

        char[] buffer = termAtt.buffer();
        int index = 0;
        int c;
        while ((c = input.read()) != -1) {
            if (index == buffer.length) {
                buffer = termAtt.resizeBuffer(8 + buffer.length);
            }
            buffer[index++] = (char) c;
            if (isSplitChar((char) c)) {
                break;
            }
        }

        // Nothing was read, so the loop exited on end of input: no more tokens.
        if (index == 0) {
            return false;
        }

        startoffset = endoffset;
        endoffset += index;
        termAtt.setLength(index);
        // correctOffset maps positions back to the original text when a CharFilter sits in front.
        offsetAtt.setOffset(correctOffset(startoffset), correctOffset(endoffset));
        return true;
    }

    /**
     * Sets the final offset to the total number of characters consumed.
     *
     * @throws IOException if the superclass end-of-stream handling fails
     */
    @Override
    public void end() throws IOException {
        super.end();
        int finalOffset = correctOffset(endoffset);
        offsetAtt.setOffset(finalOffset, finalOffset);
    }

    /**
     * Resets offset tracking so the tokenizer can be reused on a new input.
     * {@code super.reset()} must be called to keep the base class state consistent.
     *
     * @throws IOException if the superclass reset fails
     */
    @Override
    public void reset() throws IOException {
        super.reset();
        startoffset = 0;
        endoffset = 0;
    }

    /** Returns whether {@code c} is one of the token-terminating level characters. */
    private static boolean isSplitChar(char c) {
        for (char item : LEVEL_CHARS) {
            if (item == c) {
                return true;
            }
        }
        return false;
    }
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
publicclassMyKeywordTokenizerextendsTokenizer{
privatestaticchar[]levelArr={'省','市','县','镇','区'};
privatefinalCharTermAttributetermAtt=addAttribute(CharTermAttribute.class);
privateOffsetAttributeoffsetAtt=addAttribute(OffsetAttribute.class);
privateintstartoffset,endoffset;
protectedMyKeywordTokenizer(Readerinput){
super(input);
}
//从流中读取字符,直到遇到关键词或者流结束-1
@Override
publicbooleanincrementToken()throwsIOException{
char[]buffer=termAtt.buffer();
intindex=0;
intc=-1;
while((c=input.read())!=-1){
if(index==buffer.length)
buffer=termAtt.resizeBuffer(8+buffer.length);
buffer[index++]=(char)c;
if(isSplitChar((char)c)){
break;
}
}
startoffset=endoffset;
endoffset+=index;
if(startoffset!=endoffset){
termAtt.setLength(index);
offsetAtt.setOffset(startoffset,endoffset);
}
if(c==-1&&index<=0){
returnfalse;
}
returntrue;
}
//重置状态
@Override
publicvoidreset()throwsIOException{
super.reset();
startoffset=0;
endoffset=0;
}
privatebooleanisSplitChar(charc){
for(charitem:levelArr){
if(item==c){
returntrue;
}
}
returnfalse;
}
}
Posted in: Lucene