使用ansj、Stanford CoreNLP这类中文分词组件时,需要实现两个类:
一个是Tokenizer,一个是TokenizerFactory。
以下是Tokenizer的实现,这里使用的是ansj;如果要换用其它分词组件,只需修改分词那一部分的逻辑即可:
public class LAnsjTokenizer extends Tokenizer {
private final CharTermAttribute termAtt = (CharTermAttribute)this.addAttribute(CharTermAttribute.class);
private final OffsetAttribute offsetAtt = (OffsetAttribute)this.addAttribute(OffsetAttribute.class);
private final CharacterUtils.CharacterBuffer ioBuffer = CharacterUtils.newCharacterBuffer(4096);
private int offset = 0;//当前buffer的起始偏移距
private int dataLen = 0; //分词器当前缓存的数据长度
private int finalOffset = 0;
private int analysisType ;//使用的分词算法
private Iterator<Term> tokenIterator;
private List<Term> tokenBuffer;
public LAnsjTokenizer(AttributeFactory factory, int analysisType){
this.analysisType = analysisType;
}
public boolean incrementToken() throws IOException {
if (tokenIterator == null || !tokenIterator.hasNext()) {//当前读取的数据处理完了,再次读取新的数据
this.offset += this.dataLen;
CharacterUtils.fill(this.ioBuffer, this.input);
if (this.ioBuffer.getLength() == 0) {//没有数据了
this.dataLen = 0;
return false;
}
this.dataLen = this.ioBuffer.getLength();
String currentSentence = new String(ioBuffer.getBuffer(), 0, ioBuffer.getLength());
tokenBuffer = new ArrayList<Term>();
if (analysisType == 1) {
for (Term term : ToAnalysis.parse(currentSentence)) {
tokenBuffer.add(term);
}
} else if (analysisType == 0) {
for (Term term : IndexAnalysis.parse(currentSentence)) {
tokenBuffer.add(term);
}
} else if (analysisType == 2) {
for (Term term : NlpAnalysis.parse(currentSentence)) {
tokenBuffer.add(term);
}
}
tokenIterator = tokenBuffer.iterator();
if (!tokenIterator.hasNext()) {
return false;
}
}
this.clearAttributes();
Term term = tokenIterator.next();
termAtt.append(term.getName());
termAtt.setLength(term.getName().length());
int currentStart = this.offset + term.getOffe();
int currentEnd = this.offset + term.toValue();
this.offsetAtt.setOffset(currentStart, this.finalOffset = this.correctOffset(currentEnd));
return true;
}
public final void end() throws IOException {
super.end();
System.out.println(this.finalOffset);
this.offsetAtt.setOffset(this.finalOffset, this.finalOffset);
}
public void reset() throws IOException {
super.reset();
this.tokenIterator = null;
this.tokenBuffer =null;
this.offset = 0;
this.dataLen = 0;
this.finalOffset = 0;
this.ioBuffer.reset();
}
public CharTermAttribute getTermAtt() {
return termAtt;
}
}