// After paoding tokenization, each resulting token is tokenized a second time;
// here that second pass is CJK bigram segmentation.
/**
 * A {@link TokenFilter} that re-tokenizes each incoming token (e.g. produced by a
 * paoding tokenizer) with {@link CJKAnalyzer}'s bigram segmentation.
 *
 * <p>For every upstream token it emits the original token first, followed by its
 * bigram sub-tokens. Sub-tokens whose text duplicates an already-buffered term
 * are skipped. All upstream tokens are buffered in one pass, then drained one
 * per {@code incrementToken()} call.
 */
public class MyCJKFilter extends TokenFilter {
    // A TokenFilter shares its AttributeSource with `input`, so these are the
    // same attribute instances the wrapped stream writes to — no need to
    // re-fetch them from `input` inside incrementToken().
    private final TermAttribute termAtt = (TermAttribute) addAttribute(TermAttribute.class);
    private final OffsetAttribute offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
    private final PositionIncrementAttribute posAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);

    protected MyCJKFilter(TokenStream input) {
        super(input);
    }

    /** Pending tokens (originals plus their bigram sub-tokens) awaiting emission. */
    private final Vector<Token> bufferToken = new Vector<Token>();
    /** Number of upstream tokens consumed in the current fill; > 0 with an empty buffer means the stream is exhausted. */
    private int count = 0;
    /** Secondary analyzer used to bigram-split each term. */
    private final CJKAnalyzer analyzer = new CJKAnalyzer();
    /** Terms already buffered in this fill, used to suppress duplicate sub-tokens. */
    private final Map<String, Token> map = new HashMap<String, Token>();

    /**
     * Emits the next buffered token, filling the buffer from {@code input} on
     * the first call. Returns {@code false} once both the buffer and the
     * upstream stream are exhausted.
     *
     * @throws IOException if reading from the underlying stream fails
     */
    @Override
    public final boolean incrementToken() throws IOException {
        // Drain previously buffered tokens first.
        if (this.bufferToken.size() > 0) {
            Token t = this.bufferToken.remove(0);
            this.termAtt.setTermBuffer(t.term());
            this.offsetAtt.setOffset(t.startOffset(), t.endOffset());
            this.posAtt.setPositionIncrement(t.getPositionIncrement());
            return true;
        }
        // Buffer is empty after a completed fill: signal end of stream.
        if (this.count > 0) {
            count = 0;
            return false;
        }
        map.clear();
        while (input.incrementToken()) {
            String term = this.termAtt.term();
            int start = this.offsetAtt.startOffset();
            Token tokenOri = new Token(term, start, this.offsetAtt.endOffset());
            this.bufferToken.add(tokenOri);
            map.put(term, tokenOri);
            // Bigram-split the term. The sub-stream's offsets are relative to
            // the term text, so shift them by `start` to keep them absolute
            // within the original input (needed for correct highlighting).
            TokenStream ts = this.analyzer.tokenStream("", new StringReader(term));
            try {
                TermAttribute ta = (TermAttribute) ts.getAttribute(TermAttribute.class);
                OffsetAttribute offa = (OffsetAttribute) ts.getAttribute(OffsetAttribute.class);
                while (ts.incrementToken()) {
                    if (map.containsKey(ta.term())) {
                        continue; // skip sub-tokens that duplicate a buffered term
                    }
                    this.bufferToken.add(new Token(ta.term(), start + offa.startOffset(), start + offa.endOffset()));
                }
            } finally {
                ts.close(); // always release the sub-stream's resources
            }
            count++;
        }
        // Recurse once to emit the first buffered token, if any.
        if (bufferToken.size() > 0) {
            return this.incrementToken();
        }
        return false;
    }
}