Stanford CoreNLP 进行中文分词

最新推荐文章于 2025-02-20 19:27:13 发布

Macanv

最新推荐文章于 2025-02-20 19:27:13 发布

阅读量1.4w

点赞数 3

分类专栏： nlp 文章标签： nlp

本文链接：https://blog.csdn.net/macanv/article/details/72993873

版权

nlp 专栏收录该内容

12 篇文章

订阅专栏

Stanford CoreNLP 进行中文分词

中文分词的工具有很多，使用斯坦福的CoreNLP进行分词的教程网上也不少，本篇博客是记录自己在使用Stanford CoreNLP进行中文分词的学习笔记。

1. 工具准备

1.1 下载NLP相关包：

网址： https://stanfordnlp.github.io/CoreNLP/index.html
需要下载的包看下图：
这里写图片描述

1.2 准备jar包

将下载下来的stanford-corenlp-full-2016-10-31解压，在工程中导入以下jar:
1、stanford-corenlp-full-2016-10-31/ejml-0.23.jar (斜杠前面是目录。。。)
2、stanford-corenlp-full-2016-10-31/stanford-corenlp-3.7.0.jar
3、stanford-chinese-corenlp-2016-10-31-models.jar

2.分词

本篇文章仅仅记录分词，其他的功能后续在更新,注意JDK要1.8以上。

package Seg;

import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.StringUtils;

import java.util.List;
import java.util.Properties;

/**
 * Created by dd on 2017/6/8.
 * 斯坦福NLP 包，中文分词和英文分词
 */
public class Segmentation {

    public void segInCh(String text){
        //载入properties 文件
//        StanfordCoreNLP pipline = new StanfordCoreNLP("StanfordCoreNLP-chinese.properties");

        //1.2 自定义功能 （1）
//        Properties properties = new Properties();
//        properties.setProperty("annotators", "tokenize, ssplit, pos, lemma, ner, parse, dcoref");
//        StanfordCoreNLP pipline = new StanfordCoreNLP(properties);

        //自定义功能(2) 自己在项目中建一个properties 文件，然后在文件中设置模型属性，可以参考1中的配置文件
        String[] args = new String[] {"-props", "properies/CoreNLP-Seg-CH.properties"};
        Properties properties = StringUtils.argsToProperties(args);
        StanfordCoreNLP pipline = new StanfordCoreNLP(properties);

        //自定义功能(3)
        /*
        StanfordCoreNLP pipline = new StanfordCoreNLP(PropertiesUtils.asProperties(
                "annotators", "tokenize,ssplit",
                "ssplit.isOneSentence", "true",
                "tokenize.language", "zh",
                "segment.model", "edu/stanford/nlp/models/segmenter/chinese/ctb.gz",
                "segment.sighanCorporaDict", "edu/stanford/nlp/models/segmenter/chinese",
                "segment.serDictionary", "edu/stanford/nlp/models/segmenter/chinese/dict-chris6.ser.gz",
                "segment.sighanPostProcessing", "true"
        ));
        */
        //创建一个解析器，传入的是需要解析的文本
        Annotation annotation = new Annotation(text);

        //解析
        pipline.annotate(annotation);

        //根据标点符号，进行句子的切分，每一个句子被转化为一个CoreMap的数据结构，保存了句子的信息()
        List<CoreMap> sentences = annotation.get(CoreAnnotations.SentencesAnnotation.class);

        //从CoreMap 中取出CoreLabel List ,打印
        for (CoreMap sentence : sentences){
            for (CoreLabel token : sentence.get(CoreAnnotations.TokensAnnotation.class)){
                String word = token.get(CoreAnnotations.TextAnnotation.class);
                System.out.println(word);
            }
        }
    }
}

2.2 测试

    String shortText = "硕士研究生产";
    @Test
    public void testSegCh(){
        Segmentation segmentation = new Segmentation();
        segmentation.segInCh(shortText);
    }