Yesterday I spent some time studying the Stanford Parser, with the idea of plugging its intelligent word segmentation into a Lucene analyzer. The project schedule was tight, so part of the work is unfinished and the code is still rough; I hope readers interested in the same idea will help polish it.
Lucene version: 4.10.3. Required jars: stanford-parser-3.3.0-models.jar and stanford-parser.jar.
First, build a test class for the analyzer. The code is as follows:
package main.test;

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class AnalyzerTest {
    public static void analyzer(Analyzer analyzer, String text) {
        try {
            System.out.println("Analyzer class: " + analyzer.getClass());
            // obtain the TokenStream; the field name is unused by our analyzer, so "" is fine for testing
            TokenStream tokenStream = analyzer.tokenStream("", new StringReader(text));
            tokenStream.reset();
            while (tokenStream.incrementToken()) {
                CharTermAttribute cta1 = tokenStream.getAttribute(CharTermAttribute.class);
                OffsetAttribute ofa = tokenStream.getAttribute(OffsetAttribute.class);
                // position increment attribute, which stores the distance between terms
                // PositionIncrementAttribute pia = tokenStream.getAttribute(PositionIncrementAttribute.class);
                // System.out.print(pia.getPositionIncrement() + ":");
                System.out.print("[" + ofa.startOffset() + "-" + ofa.endOffset() + "]-->" + cta1.toString() + "\n");
            }
            tokenStream.end();
            tokenStream.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    public static void main(String[] args) {
        // a Chinese test sentence with deliberately ambiguous segmentation
        String chText = "清华大学生说正在研究生命起源";
        Analyzer analyzer = new NlpHhcAnalyzer();
        analyzer(analyzer, chText);
    }
}
Next, define a new analyzer by extending the Analyzer class and overriding its TokenStreamComponents createComponents method. Note that in Lucene 4.x, TokenStreamComponents packages together what Lucene 3.x exposed separately as a Tokenizer and a TokenFilter chain (a sketch of chaining a filter follows the analyzer code below).
package main.test;

import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;

public class NlpHhcAnalyzer extends Analyzer {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        return new TokenStreamComponents(new aaa(reader));
    }
}
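Since TokenStreamComponents holds both the source Tokenizer and the head of the filter chain, stacking standard filters onto the new tokenizer only changes createComponents. A minimal sketch, assuming Lucene 4.10.3's LowerCaseFilter and Version constants (the filter is purely illustrative here; it does nothing useful for pure-Chinese text, and the class name is made up):

package main.test;

import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.util.Version;

// hypothetical variant of NlpHhcAnalyzer that chains a TokenFilter after the tokenizer
public class NlpHhcFilteredAnalyzer extends Analyzer {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer source = new aaa(reader);
        // wrap the tokenizer; additional filters can be stacked the same way
        TokenStream result = new LowerCaseFilter(Version.LUCENE_4_10_3, source);
        return new TokenStreamComponents(source, result);
    }
}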
Then implement the new Tokenizer class, aaa. This part has not been fully debugged; anyone with time is welcome to test and improve it.
package main.test;

import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TypedDependency;
import edu.stanford.nlp.trees.international.pennchinese.ChineseGrammaticalStructure;

public class aaa extends Tokenizer {
    // term text attribute
    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    // term offset attribute
    private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
    // the full input text, read from the Reader in reset()
    private String str;
    // terms extracted by the parser, emitted one per incrementToken() call
    private List<String> terms;
    private int termIndex;
    // running position in str, used to compute offsets
    private int currentOffset;
    private final LexicalizedParser lp;

    public aaa(Reader in) {
        super(in);
        String modelpath = "edu/stanford/nlp/models/lexparser/xinhuaFactoredSegmenting.ser.gz";
        lp = LexicalizedParser.loadModel(modelpath);
    }

    @Override
    public void reset() throws IOException {
        super.reset();
        // the Reader is only usable after reset(), so consume it here rather than in
        // the constructor, and read to the end instead of a fixed 100 characters
        StringBuilder sb = new StringBuilder();
        char[] buf = new char[1024];
        int n;
        while ((n = input.read(buf)) != -1) {
            sb.append(buf, 0, n);
        }
        str = sb.toString();
        terms = null;
        termIndex = 0;
        currentOffset = 0;
    }

    @Override
    public boolean incrementToken() throws IOException {
        // parse once per reset(), not on every call
        if (terms == null) {
            terms = new ArrayList<String>();
            Tree t = lp.parse(str);
            ChineseGrammaticalStructure gs = new ChineseGrammaticalStructure(t);
            Collection<TypedDependency> tdl = gs.typedDependenciesCollapsed();
            for (TypedDependency td : tdl) {
                terms.add(td.dep().nodeString().trim());
            }
        }
        // return false to signal that all terms have been emitted
        if (termIndex >= terms.size()) {
            return false;
        }
        // clear all term attributes before filling in the next term
        clearAttributes();
        String term = terms.get(termIndex++);
        // set the term text
        termAtt.append(term);
        // set the term offsets: look the term up in the original text, falling back
        // to the running offset when it cannot be found (dependencies are not in
        // sentence order, and the parser may have normalized the text)
        int start = str.indexOf(term, currentOffset);
        if (start < 0) {
            start = currentOffset;
        }
        offsetAtt.setOffset(correctOffset(start), correctOffset(start + term.length()));
        currentOffset = start + term.length();
        // return true to signal that another term is available
        return true;
    }
}
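One caveat with this approach: typedDependenciesCollapsed() yields dependency relations rather than a plain token sequence, so the dependent words may come back out of sentence order and may not correspond one-to-one with the segmented tokens. To see what the segmenting model actually produces, independent of the Lucene plumbing, the leaves of the parse tree can be printed directly. A minimal sketch, assuming the parse tree's leaves are the segmented words (the class name is made up; the sample sentence is the same one used in the test class):

package main.test;

import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
import edu.stanford.nlp.trees.Tree;

// hypothetical standalone check, not part of the analyzer above
public class SegmentCheck {
    public static void main(String[] args) {
        LexicalizedParser lp = LexicalizedParser.loadModel(
                "edu/stanford/nlp/models/lexparser/xinhuaFactoredSegmenting.ser.gz");
        Tree tree = lp.parse("清华大学生说正在研究生命起源");
        // each leaf of the parse tree is one segmented word
        for (Tree leaf : tree.getLeaves()) {
            System.out.println(leaf.value());
        }
    }
}

If the leaves look right but the tokens coming out of the analyzer do not, the problem lies in how incrementToken() maps dependencies to terms rather than in the model itself.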