项目环境:Win10、JDK 8、IDEA。
获取以下相关JAR包,可以从 https://stanfordnlp.github.io/CoreNLP/ 下载:
slf4j-api.jar
slf4j-simple.jar
stanford-corenlp-3.6.0.jar
stanford-corenlp-3.6.0-javadoc.jar
stanford-corenlp-3.6.0-models.jar
stanford-corenlp-3.6.0-sources.jar
xom.jar
由于这些JAR包体积较大,且需要从国外站点下载,因此直接将NLP的JAR包引入到项目中。
package com.zhan.service.toefl;
import com.alibaba.fastjson.JSON;
import edu.stanford.nlp.ling.CoreAnnotations.LemmaAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;
import lombok.extern.slf4j.Slf4j;
import org.springframework.stereotype.Service;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;
/**
 * Natural-language processing service built on Stanford CoreNLP.
 * <p>CoreNLP offers tokenization (tokenize), sentence splitting (ssplit), part-of-speech
 * tagging (pos), lemmatization (lemma; not applicable to Chinese), named-entity recognition
 * (ner), parsing (parse), sentiment analysis (sentiment), coreference resolution and more.
 * This service configures only the tokenize, ssplit, pos and lemma annotators.</p>
 * @author bean.zhang
 * @date 2018/7/30
 */
@Slf4j
@Service
public class NplService {

    /** Annotators run, in order, on every paragraph. */
    private static final String ANNOTATORS = "tokenize, ssplit, pos, lemma";

    /**
     * Lazily created pipeline, reused across calls because constructing it loads the
     * annotator models and is far more expensive than annotating a paragraph.
     * NOTE(review): reuse across threads assumes {@code annotate} is safe for concurrent
     * calls on one pipeline instance - confirm against the CoreNLP 3.6.0 documentation.
     */
    private StanfordCoreNLP pipeline;

    /**
     * Tokenizes the given paragraph, lemmatizes every token and logs the resulting
     * word-to-lemma mapping as JSON.
     *
     * <p>If the same surface word occurs more than once, the lemma of its last occurrence
     * is the one kept in the map.</p>
     *
     * @param paragraph the text to analyse; when {@code null} nothing is analysed and a
     *                  warning is logged instead
     */
    public void precess(String paragraph) {
        if (paragraph == null) {
            log.warn("paragraph is null, nothing to process");
            return;
        }

        // Run tokenization, sentence splitting, POS tagging and lemmatization.
        Annotation document = new Annotation(paragraph);
        getPipeline().annotate(document);

        // Collect word -> lemma for every token of every sentence.
        Map<String, String> map = new HashMap<>();
        List<CoreMap> sentences = document.get(SentencesAnnotation.class);
        if (sentences != null) {
            for (CoreMap sentence : sentences) {
                for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
                    String word = token.get(TextAnnotation.class);
                    String lemma = token.get(LemmaAnnotation.class);
                    map.put(word, lemma);
                }
            }
        }
        log.info(" MAP = {}", JSON.toJSONString(map));
    }

    /**
     * Returns the shared pipeline, building it on first use.
     * Synchronized so that concurrent first calls build the pipeline only once.
     */
    private synchronized StanfordCoreNLP getPipeline() {
        if (pipeline == null) {
            Properties props = new Properties();
            props.setProperty("annotators", ANNOTATORS);
            pipeline = new StanfordCoreNLP(props);
        }
        return pipeline;
    }
}
以下是运行的结果