################################################ Demo ######################################
/*
* To change this license header, choose License Headers in Project Properties.
* To change this template file, choose Tools | Templates
* and open the template in the editor.
*/
package framework.webapp.commons.utils;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
/**
 * Extracts tags from text with Stanford CoreNLP: nouns (plural common nouns are
 * converted to their singular form) and named-entity phrases.
 *
 * @author mly
 */
public class NLPUtil {

    private static final Log log = LogFactory.getLog(NLPUtil.class);

    /**
     * Annotators the shared pipeline is built with. {@link #getTagsForSentence(String)} only
     * reads the sentence, token, part-of-speech and named-entity annotations; the remaining
     * annotators are kept because {@link #pipeline} is public and may be used by other code.
     */
    private static final String ANNOTATORS = "tokenize, ssplit, pos, lemma, ner, parse, dcoref";

    /** NER label of a token that does not belong to any named entity. */
    private static final String NO_ENTITY = "O";

    /** NER label of numeric tokens, which are skipped entirely. */
    private static final String NUMBER_ENTITY = "NUMBER";

    /** Shared pipeline instance; created once when the class is loaded. */
    public static StanfordCoreNLP pipeline;

    static {
        pipeline = createPipeline();
    }

    /** Builds a pipeline configured with {@link #ANNOTATORS}. */
    private static StanfordCoreNLP createPipeline() {
        Properties props = new Properties();
        props.setProperty("annotators", ANNOTATORS);
        return new StanfordCoreNLP(props);
    }

    /**
     * Returns the shared pipeline.
     *
     * <p>If {@link #pipeline} has been set to {@code null}, it is rebuilt with the same
     * annotator list the static initializer uses, so the annotations read by
     * {@link #getTagsForSentence(String)} are always produced.
     *
     * @return the shared {@link StanfordCoreNLP} instance, never {@code null}
     */
    public static StanfordCoreNLP getStanfordCoreNLP() {
        if (pipeline == null) {
            pipeline = createPipeline();
        }
        return pipeline;
    }

    /**
     * Extracts tags from the given text.
     *
     * <p>Every sentence is scanned token by token:
     * <ul>
     *   <li>tokens labelled {@code NUMBER} are skipped and do not interrupt an entity that is
     *       being collected;</li>
     *   <li>tokens outside any entity ({@code O}) whose POS tag starts with {@code NN} are added
     *       as tags, {@code NNS} tokens after singularization;</li>
     *   <li>consecutive tokens sharing the same entity label are joined with single spaces and
     *       added as one tag.</li>
     * </ul>
     * Tags are returned in the order in which they occur in the text.
     *
     * @param text the text to analyse; {@code null} yields an empty list
     * @return the extracted tags, possibly empty, never {@code null}
     */
    public static List<String> getTagsForSentence(String text) {
        List<String> tags = new ArrayList<String>();
        if (text == null) {
            return tags;
        }

        Annotation document = new Annotation(text);
        // Run all annotators on this text.
        getStanfordCoreNLP().annotate(document);

        // A CoreMap is essentially a map that uses class objects as keys.
        List<CoreMap> sentences = document.get(CoreAnnotations.SentencesAnnotation.class);
        if (sentences == null) {
            // No sentence annotation was produced (e.g. the public pipeline was replaced).
            return tags;
        }

        // Text of the entity currently being collected; empty when none is in progress.
        StringBuilder entity = new StringBuilder();
        for (CoreMap sentence : sentences) {
            // Label of the entity collected in `entity`; entities never span sentences.
            String entityType = NO_ENTITY;
            for (CoreLabel token : sentence.get(CoreAnnotations.TokensAnnotation.class)) {
                String ner = token.get(CoreAnnotations.NamedEntityTagAnnotation.class);
                String word = token.get(CoreAnnotations.TextAnnotation.class);
                String pos = token.get(CoreAnnotations.PartOfSpeechAnnotation.class);
                if (log.isDebugEnabled()) {
                    log.debug("word:" + word + " currNeToken:" + ner + " pos:" + pos);
                }

                if (NUMBER_ENTITY.equals(ner)) {
                    continue;
                }

                if (ner == null || NO_ENTITY.equals(ner)) {
                    // The entity (if any) ended on the previous token: emit it before the
                    // current noun so that tags stay in document order.
                    flushEntity(entity, entityType, tags);
                    addNounTag(word, pos, tags);
                    continue;
                }

                if (entity.length() > 0 && !ner.equals(entityType)) {
                    // Two entities of different types are adjacent: close the first one ...
                    flushEntity(entity, entityType, tags);
                }
                // ... and let the current word start (or extend) the entity being collected,
                // so the first word of a directly following entity is not lost.
                if (entity.length() > 0) {
                    entity.append(' ');
                }
                entity.append(word);
                entityType = ner;
            }
            // An entity may run up to the last token of the sentence.
            flushEntity(entity, entityType, tags);
        }
        log.info(tags.toString());
        return tags;
    }

    /** Adds {@code word} to {@code tags} if its POS tag marks a noun; singularizes NNS tokens. */
    private static void addNounTag(String word, String pos, List<String> tags) {
        if (pos == null || !pos.startsWith("NN")) {
            return;
        }
        if (pos.equals("NNS")) {
            tags.add(InflectorUtil.getInstance().singularize(word));
        } else {
            tags.add(word);
        }
    }

    /** Emits the collected entity text, if any, as a tag and resets the buffer. */
    private static void flushEntity(StringBuilder entity, String entityType, List<String> tags) {
        if (entity.length() == 0) {
            return;
        }
        String phrase = entity.toString();
        log.info("'" + phrase + "' is a " + entityType);
        tags.add(phrase);
        entity.setLength(0);
    }
}
################################################ Demo end ######################################
################################################ maven dependency ######################################
<!-- CoreNLP library classes (edu.stanford.nlp.*) imported by NLPUtil. -->
<dependency>
    <groupId>edu.stanford.nlp</groupId>
    <artifactId>stanford-corenlp</artifactId>
    <version>3.5.2</version>
</dependency>
<!-- Model files for the annotators; keep the version identical to the library above. -->
<dependency>
    <groupId>edu.stanford.nlp</groupId>
    <artifactId>stanford-corenlp</artifactId>
    <version>3.5.2</version>
    <classifier>models</classifier>
</dependency>
################################################ maven dependency end ######################################