Stanford NLP自然语言处理demo，附maven dependency

最新推荐文章于 2024-08-19 00:09:54 发布

mlymark

最新推荐文章于 2024-08-19 00:09:54 发布

阅读量855

点赞数

文章标签： java nlp

本文链接：https://blog.csdn.net/mlymark/article/details/49175269

版权

################################################ Demo ######################################

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package framework.webapp.commons.utils;

import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

/**
 * Extracts tags from a piece of text using Stanford CoreNLP: nouns that are not
 * part of a named entity (plural nouns are converted to singular form) and
 * named-entity phrases built from consecutive tokens sharing the same NER label.
 *
 * @author mly
 */
public class NLPUtil {
    private static final Log log = LogFactory.getLog(NLPUtil.class);

    /** NER label carried by tokens that do not belong to any named entity. */
    private static final String NO_ENTITY = "O";

    /** NER label of numeric tokens; such tokens are never emitted as tags. */
    private static final String NUMBER_ENTITY = "NUMBER";

    /** Shared pipeline, created when the class is loaded. Left public for existing callers. */
    public static StanfordCoreNLP pipeline;

    static {
        pipeline = createPipeline();
    }

    /**
     * Creates a StanfordCoreNLP object with POS tagging, lemmatization, NER,
     * parsing, and coreference resolution.
     *
     * <p>NOTE(review): {@link #getTagsForSentence(String)} only reads the token,
     * POS and NER annotations; the remaining annotators are kept because the
     * pipeline is publicly exposed and other callers may rely on them.
     */
    private static StanfordCoreNLP createPipeline() {
        Properties props = new Properties();
        props.setProperty("annotators", "tokenize, ssplit, pos, lemma, ner, parse, dcoref");
        return new StanfordCoreNLP(props);
    }

    /**
     * Returns the shared pipeline, re-creating it with the same annotator
     * configuration as the static initializer if the public field was cleared.
     *
     * @return the shared {@link StanfordCoreNLP} instance
     */
    public static StanfordCoreNLP getStanfordCoreNLP() {
        if (pipeline == null) {
            pipeline = createPipeline();
        }
        return pipeline;
    }

    /**
     * Annotates {@code text} and collects its tags in order of appearance.
     *
     * <ul>
     * <li>A token labelled {@code O} whose POS tag starts with {@code NN} is added
     *     as a tag; {@code NNS} tokens are first passed through
     *     {@code InflectorUtil.getInstance().singularize(word)}.</li>
     * <li>Consecutive tokens with the same non-{@code O} NER label are joined with
     *     a single space and added as one tag.</li>
     * <li>Tokens labelled {@code NUMBER} are skipped and do not interrupt an
     *     entity that is currently being collected.</li>
     * </ul>
     *
     * @param text the text to analyse; {@code null} or empty yields an empty list
     * @return the collected tags, never {@code null}
     */
    public static List<String> getTagsForSentence(String text) {
        List<String> tags = new ArrayList<String>();
        if (text == null || text.length() == 0) {
            return tags;
        }

        Annotation document = new Annotation(text);
        // run all Annotators on this text
        getStanfordCoreNLP().annotate(document);

        // a CoreMap is essentially a Map that uses class objects as keys and has values with custom types
        List<CoreMap> sentences = document.get(CoreAnnotations.SentencesAnnotation.class);
        if (sentences == null) {
            return tags;
        }

        // Text of the entity currently being collected; empty means "no entity in progress".
        StringBuilder entity = new StringBuilder();
        for (CoreMap sentence : sentences) {
            String entityType = NO_ENTITY;
            // a CoreLabel is a CoreMap with additional token-specific methods
            for (CoreLabel token : sentence.get(CoreAnnotations.TokensAnnotation.class)) {
                String neTag = token.get(CoreAnnotations.NamedEntityTagAnnotation.class);
                String word = token.get(CoreAnnotations.TextAnnotation.class);
                String pos = token.get(CoreAnnotations.PartOfSpeechAnnotation.class);
                if (log.isDebugEnabled()) {
                    log.debug("word:" + word + " currNeToken:" + neTag + " pos:" + pos);
                }

                if (NUMBER_ENTITY.equals(neTag)) {
                    continue;
                }

                if (neTag == null || NO_ENTITY.equals(neTag)) {
                    // Close the pending entity first so tags keep the order of the text.
                    flushEntity(entity, entityType, tags);
                    entityType = NO_ENTITY;
                    if (pos != null && pos.startsWith("NN")) {
                        if (pos.equals("NNS")) {
                            tags.add(InflectorUtil.getInstance().singularize(word));
                        } else {
                            tags.add(word);
                        }
                    }
                    continue;
                }

                if (entity.length() > 0 && neTag.equals(entityType)) {
                    // Same entity continues.
                    entity.append(' ').append(word);
                } else {
                    // A different entity type starts right after another one: emit the
                    // previous entity and begin the new one with the current word
                    // (the word must not be dropped).
                    flushEntity(entity, entityType, tags);
                    entityType = neTag;
                    entity.append(word);
                }
            }
            // An entity may run up to the end of the sentence.
            flushEntity(entity, entityType, tags);
        }
        log.info(tags.toString());
        return tags;
    }

    /** Adds the pending entity text (if any) to {@code tags}, logs it, and clears the buffer. */
    private static void flushEntity(StringBuilder entity, String entityType, List<String> tags) {
        if (entity.length() == 0) {
            return;
        }
        log.info("'" + entity.toString() + "' is a " + entityType);
        tags.add(entity.toString());
        entity.setLength(0);
    }

}

################################################ Demo end ######################################

################################################ maven dependency ######################################

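<dependency>
<groupId>edu.stanford.nlp</groupId>
<artifactId>stanford-corenlp</artifactId>
<version>3.5.2</version>
</dependency>
<dependency>
<groupId>edu.stanford.nlp</groupId>
<artifactId>stanford-corenlp</artifactId>
<version>3.5.2</version>
<classifier>models</classifier>
</dependency>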

################################################ maven dependency end ######################################