目录
Part-of-Speech Tagger
词性标记器根据词本身及其上下文为每个词标注对应的词性。OpenNLP POS标记器使用概率模型从标签集中预测每个词最可能的词性标签。可以使用标签字典(tag dictionary)限制每个词可选的词性标签范围,从而提高标记器的标注效果和运行效率。
常用的词性简写参考:
https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
训练可以用以下带标签格式的语料进行:每个词与其词性用下划线“_”连接,词与词之间用空格分隔,每行一个句子。
About_IN 10_CD Euro_NNP ,_, I_PRP reckon_VBP ._.
That_DT sounds_VBZ good_JJ ._.
模型训练
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.charset.StandardCharsets;
import opennlp.tools.postag.POSEvaluator;
import opennlp.tools.postag.POSModel;
import opennlp.tools.postag.POSSample;
import opennlp.tools.postag.POSTaggerFactory;
import opennlp.tools.postag.POSTaggerME;
import opennlp.tools.postag.WordTagSampleStream;
import opennlp.tools.util.InputStreamFactory;
import opennlp.tools.util.MarkableFileInputStreamFactory;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.PlainTextByLineStream;
import opennlp.tools.util.TrainingParameters;
/**
 * Trains an OpenNLP part-of-speech model from a {@code word_TAG} corpus, writes the
 * model to disk, and prints the tagger's word accuracy.
 *
 * <p>The accuracy is measured on the same corpus that was used for training, so it is
 * an optimistic figure rather than a held-out evaluation.
 */
public class PartOfSpeechTaggingTrain {

    /**
     * Runs training, serialization and evaluation.
     *
     * <p>Paths are resolved relative to the {@code user.dir} system property: the corpus
     * is read from {@code resources/part-of-speech-taggin.txt} and the model is written
     * to {@code opennlpmodel/en-pos-maxent-my.bin}.
     *
     * @param args unused
     * @throws IOException if the corpus cannot be read or the model cannot be written
     */
    public static void main(String[] args) throws IOException {
        String rootDir = System.getProperty("user.dir") + File.separator;
        String fileResourcesDir = rootDir + "resources" + File.separator;
        String modelResourcesDir = rootDir + "opennlpmodel" + File.separator;
        // Path of the training data.
        String filePath = fileResourcesDir + "part-of-speech-taggin.txt";
        // Path where the trained model is saved.
        String modelPath = modelResourcesDir + "en-pos-maxent-my.bin";

        // Read the corpus line by line and parse each line into a POSSample.
        // Closing the sample stream also closes the wrapped line stream.
        InputStreamFactory inputStreamFactory = new MarkableFileInputStreamFactory(new File(filePath));
        try (ObjectStream<POSSample> sampleStream = new WordTagSampleStream(
                new PlainTextByLineStream(inputStreamFactory, StandardCharsets.UTF_8))) {

            // Train the model.
            POSTaggerFactory factory = new POSTaggerFactory();
            POSModel model = POSTaggerME.train("en", sampleStream, TrainingParameters.defaultParams(), factory);

            // Save the model; try-with-resources guarantees the file is flushed and closed.
            try (OutputStream modelOut = new BufferedOutputStream(new FileOutputStream(new File(modelPath)))) {
                model.serialize(modelOut);
            }

            // Training has consumed the stream; rewind it, otherwise the evaluator
            // would see no samples at all.
            sampleStream.reset();

            // Evaluate the model.
            POSEvaluator evaluator = new POSEvaluator(new POSTaggerME(model), null);
            evaluator.evaluate(sampleStream);
            // getWordAccuracy() is a ratio of correctly tagged words, not a count;
            // the printed label is kept unchanged for output compatibility.
            Double result = evaluator.getWordAccuracy();
            System.out.println("正确标记的次数:" + result.toString());
        }
    }
}
词性标注
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import opennlp.tools.postag.POSModel;
import opennlp.tools.postag.POSTaggerME;
/**
 * Loads a serialized OpenNLP part-of-speech model and tags a fixed, pre-tokenized
 * example sentence, printing the predicted tags followed by their probabilities.
 */
public class PartOfSpeechTaggingPredit {

    /**
     * Loads the model from {@code opennlpmodel/en-pos-maxent.bin} (relative to the
     * {@code user.dir} system property), tags the example sentence and prints the result.
     *
     * @param args unused
     * @throws IOException if the model file cannot be read
     */
    public static void main(String[] args) throws IOException {
        String rootDir = System.getProperty("user.dir") + File.separator;
        String modelResourcesDir = rootDir + "opennlpmodel" + File.separator;
        String modelPath = modelResourcesDir + "en-pos-maxent.bin";

        // Load the model; the stream is only needed while the model is being read,
        // so close it right afterwards instead of leaking the file handle.
        POSModel model;
        try (InputStream modelIn = new FileInputStream(modelPath)) {
            model = new POSModel(modelIn);
        }

        // Instantiate the tagger.
        POSTaggerME tagger = new POSTaggerME(model);

        // Tag the tokens; tag() returns one tag per input token.
        String[] sent = new String[]{"Most", "large", "cities", "in", "the", "US", "had", "morning", "and", "afternoon", "newspapers", "."};
        String[] tags = tagger.tag(sent);
        for (String tag : tags) {
            System.out.print(tag + ",");
        }
        System.out.println();

        // Print the probabilities reported by the tagger for the last tag() call.
        double[] probs = tagger.probs();
        for (double prob : probs) {
            System.out.print(prob + ",");
        }
    }
}