最近做一个项目,需要对给定文本中的句子做Parse:根据POS tag及句子成分信息找出词语/短语之间的dependency,然后根据dependency构建句子的parse tree。这需要用到Stanford Parser和OpenNLP中的Shallow Parser。这两个Parser都用Java实现,提供API方式调用,可以根据句子输出语法解析树。下面总结这两类Parser的作用及Java程序调用方法。
1 Shallow Parser
Shallow Parser的主要作用是找出句子中的短语信息,包括名词短语(NP)、动词短语(VP)、形容词短语(ADJP)、副词短语(ADVP)等。示例程序如下:
package edu.pku.yangliu.nlp.pdt;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringReader;
import java.util.HashMap;
import opennlp.tools.chunker.ChunkerME;
import opennlp.tools.chunker.ChunkerModel;
import opennlp.tools.cmdline.PerformanceMonitor;
import opennlp.tools.cmdline.postag.POSModelLoader;
import opennlp.tools.postag.POSModel;
import opennlp.tools.postag.POSSample;
import opennlp.tools.postag.POSTaggerME;
import opennlp.tools.tokenize.WhitespaceTokenizer;
import opennlp.tools.util.InvalidFormatException;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.PlainTextByLineStream;
/**a Shallow Parser based on opennlp
* @author yangliu
* @blog http://blog.csdn.net/yangliuy
* @mail yang.liu@pku.edu.cn
*/
public class ShallowParser {
// Shared parser instance handed out by getInstance(); stays null until the first call creates it.
private static ShallowParser instance = null ;
// POS tagging model; static, so it is shared by the whole class and overwritten by each constructor call.
private static POSModel model;
// Chunker model; static, so it is shared by the whole class and overwritten by each constructor call.
private static ChunkerModel cModel ;
//Singleton pattern
/**
 * Returns the shared {@code ShallowParser}, creating it on first use.
 *
 * <p>On the first call the POS model is loaded from {@code en-pos-maxent.bin} and the
 * chunker model from {@code en-chunker.bin} (both given as relative paths). Later calls
 * return the cached instance without reading the files again.
 *
 * <p>This method is not synchronized, so concurrent first calls may each load the models.
 *
 * @return the shared parser instance
 * @throws InvalidFormatException declared for the chunker model load; presumably raised
 *         when the model file content is malformed (confirm against the OpenNLP version in use)
 * @throws IOException if the chunker model file cannot be opened, read, or closed
 */
public static ShallowParser getInstance() throws InvalidFormatException, IOException{
    if(ShallowParser.instance == null){
        // Local names differ from the static fields on purpose, to avoid shadowing them.
        POSModel posModel = new POSModelLoader().load(new File("en-pos-maxent.bin"));
        InputStream chunkerStream = new FileInputStream("en-chunker.bin");
        ChunkerModel chunkerModel;
        try {
            chunkerModel = new ChunkerModel(chunkerStream);
        } finally {
            // Release the file handle whether or not the model loaded successfully.
            chunkerStream.close();
        }
        ShallowParser.instance = new ShallowParser(posModel, chunkerModel);
    }
    return ShallowParser.instance;
}
/**
 * Creates a parser around the given models.
 *
 * <p>Both models are stored in static fields, so each construction replaces the models
 * seen by the whole class rather than by this instance alone.
 *
 * @param model  the POS tagging model to store
 * @param cModel the chunker model to store
 */
public ShallowParser(POSModel model, ChunkerModel cModel){
    // The class-qualified names are required: the parameters shadow the static fields.
    ShallowParser.cModel = cModel;
    ShallowParser.model = model;
}
/** A shallow Parser, chunk a sentence and return a map for the phrase
* labels of words <wordsIndex, phraseLabel>
* Notice: There should be " " BEFORE and after ",", " ","(",")" etc.
* @param input The input sentence
* @param model The POSModel of the chunk
* @param cMode