利用OpenNLP进行人名命名实体识别,代码来源于《驾驭文本》第五章。
import java.io.File;
import java.io.FileInputStream;
import opennlp.tools.namefind.NameFinderME;
import opennlp.tools.namefind.TokenNameFinderModel;
import opennlp.tools.tokenize.SimpleTokenizer;
import opennlp.tools.tokenize.Tokenizer;
import opennlp.tools.util.Span;
/**
 * Demo of person-name recognition with OpenNLP's {@link NameFinderME}.
 *
 * <p>Two hard-coded English sentences are tokenized, passed to a name finder backed by the
 * {@code en-ner-person.bin} model, and every detected name is printed together with its
 * token span, type and probability.
 */
public class NamedEntityExtraction {

    /**
     * Loads the person-name model, runs it over the sample sentences and prints the results.
     *
     * <p>The model is read from {@code ./nlpbin/en-ner-person.bin}, resolved against the
     * {@code model.dir} system property. The model file can be downloaded from
     * http://opennlp.sourceforge.net/models-1.5/
     *
     * @param args unused
     * @throws Exception if the model file cannot be opened or read
     */
    public static void main(String[] args) throws Exception {
        // Two sentences that are treated as one document.
        String[] sentences = {
            "Former first lady Nancy Reagan was taken to a " + "suburban Los Angeles "
                + "hospital \"as a precaution\" Sunday after a fall at " + "her home, an " + "aide said. ",
            "The 86-year-old Reagan will remain overnight for "
                + "observation at a hospital in Santa Monica, California, " + "said Joanne "
                + "Drake, chief of staff for the Reagan Foundation." };

        // NOTE(review): if "model.dir" is unset, java.io.File treats the null parent as absent
        // and the path is resolved relative to the working directory.
        File modelFile = new File(System.getProperty("model.dir"), "./nlpbin/en-ner-person.bin");

        // The model is fully loaded by the constructor, so the stream is closed right after
        // (the original code never closed it).
        TokenNameFinderModel model;
        try (FileInputStream modelStream = new FileInputStream(modelFile)) {
            model = new TokenNameFinderModel(modelStream);
        }
        NameFinderME finder = new NameFinderME(model);

        // SimpleTokenizer does more than split on spaces: punctuation and digits become
        // separate tokens, as the token listings below show.
        Tokenizer tokenizer = SimpleTokenizer.INSTANCE;

        for (String sentence : sentences) {
            // First sentence tokens:
            // [Former, first, lady, Nancy, Reagan, was, taken, to, a, suburban, Los, Angeles, hospital, ", as, a, precaution, ", Sunday, after, a, fall, at, her, home, ,, an, aide, said, .]
            // Second sentence tokens:
            // [The, 86, -, year, -, old, Reagan, will, remain, overnight, for, observation, at, a, hospital, in, Santa, Monica, ,, California, ,, said, Joanne, Drake, ,, chief, of, staff, for, the, Reagan, Foundation, .]
            String[] tokens = tokenizer.tokenize(sentence);
            Span[] names = finder.find(tokens);
            displayNames(names, tokens);
        }

        // Called once after all sentences of the document have been processed.
        finder.clearAdaptiveData();
    }

    /**
     * Prints each detected name followed by its start/end token index, type and probability.
     *
     * @param names  spans returned by the name finder; start is inclusive, end is exclusive
     * @param tokens the tokens of the sentence the spans refer to
     */
    public static void displayNames(Span[] names, String[] tokens) {
        for (Span name : names) {
            int start = name.getStart();
            int end = name.getEnd();

            // Join the covered tokens with single spaces. The separator is only added between
            // tokens, so a zero-length span prints an empty line instead of throwing.
            StringBuilder text = new StringBuilder();
            for (int ti = start; ti < end; ti++) {
                if (ti > start) {
                    text.append(" ");
                }
                text.append(tokens[ti]);
            }

            System.out.println(text);
            System.out.print("\tstart: " + start);
            System.out.println("\tend: " + end); // start and end position of the entity
            System.out.println("\ttype: " + name.getType()); // entity type
            System.out.println("\tprob: " + name.getProb()); // entity probability
        }
    }
}
Span 包含以下几个属性:start(实体第一个词元的下标,从 0 开始计数)、end(实体的结束下标,不包含该下标处的词元,例如 start: 3、end: 5 表示第 3、4 两个词元)、prob(该实体的识别概率)以及 type(实体的类型)。
输出结果如下:
Nancy Reagan
start: 3 end: 5
type: person
prob: 0.9704748832886989
Reagan
start: 6 end: 7
type: person
prob: 0.9996172457889334
Joanne Drake
start: 22 end: 24
type: person
prob: 0.9929295961937021
Reagan
start: 30 end: 31
type: person
prob: 0.9976318413669909