NLP分词

最新推荐文章于 2024-08-09 11:37:34 发布

liuwei063608

最新推荐文章于 2024-08-09 11:37:34 发布

阅读量491

点赞数 1

分类专栏：算法

本文链接：https://blog.csdn.net/liuwei063608/article/details/78122238

版权

算法专栏收录该内容

14 篇文章 0 订阅

订阅专栏

Stanford CoreNLP 分词所需的 jar 包体积很大（有几百 MB），不建议通过 Maven 下载，建议直接从官网下载。

package test

import edu.stanford.nlp.ling.{CoreAnnotations, CoreLabel}
import edu.stanford.nlp.pipeline.{Annotation, StanfordCoreNLP}
import org.apache.commons.lang.StringUtils
import util.EmojiFilter

import scala.collection.mutable.ListBuffer
import scala.util.control.Breaks.{break, breakable}
import scala.collection.JavaConverters._
import scala.collection.mutable.{ArrayBuffer, ListBuffer}

/**
  * Created by liuwei on 2017/8/23.
  *
  * Small demo that segments a Chinese sentence with Stanford CoreNLP and
  * prints the tokens that survive stop-word and stop-tag filtering.
  */
object NLPTest {

  /** Stop words that are always filtered out, in addition to the caller-supplied ones. */
  private val SystemStopWords: Set[String] = Set("了", "的")

  /**
    * Part-of-speech tags that are always filtered out, in addition to the
    * caller-supplied ones. NOTE(review): "PU" is presumably the punctuation
    * tag of the Chinese tag set used by the model -- confirm.
    */
  private val SystemStopNatures: Set[String] = Set("PU")

  /** Properties resource name handed to the `StanfordCoreNLP` constructor. */
  private val PipelineProps = "StanfordCoreNLP-chinese.properties"

  /**
    * Shared pipeline instance. Building a pipeline is the expensive part of
    * the work, so it is created once on first use instead of once per call.
    * Being lazy, it is never built when `nlp` is only called with empty input.
    */
  private lazy val pipeline: StanfordCoreNLP = new StanfordCoreNLP(PipelineProps)

  /** Entry point: segments a fixed sample sentence and prints the kept tokens. */
  def main(args: Array[String]): Unit = {
    val sample = "test环境服务器启动方式更新为supervisor启动"
    println(nlp(sample, List.empty[String], List.empty[String]))
  }

  /**
    * Tokenizes `content` and returns the token texts, in document order,
    * that are neither stop words nor tagged with a stop tag.
    *
    * @param content        text to segment; `null` or empty yields an empty list
    * @param stopWordList   extra stop words, merged with the built-in ones
    * @param stopNatureList extra stop tags; each entry may hold several tags
    *                       separated by commas (entries are split on ",")
    * @return the surviving token texts; tokens whose word or tag is `null`
    *         are dropped as well
    */
  private def nlp(content: String, stopWordList: List[String], stopNatureList: List[String]): List[String] =
    if (StringUtils.isEmpty(content)) List.empty[String]
    else {
      val stopWords: Set[String] = SystemStopWords ++ stopWordList
      val stopNatures: Set[String] = SystemStopNatures ++ stopNatureList.flatMap(_.split(","))

      // Emoji are stripped by the project helper before the text reaches the pipeline.
      val document: Annotation = new Annotation(EmojiFilter.filterEmoji(content))

      // Run all annotators configured in the properties file on this text.
      pipeline.annotate(document)

      // The annotation map returns null when no tokens were produced; treat that as "no tokens".
      val tokens: List[CoreLabel] =
        Option(document.get(classOf[CoreAnnotations.TokensAnnotation]))
          .map(_.asScala.toList)
          .getOrElse(List.empty[CoreLabel])

      // A token is kept only when both its word and tag are present and neither is a stop entry.
      def isKept(word: String, tag: String): Boolean =
        Option(word).exists(w => !stopWords.contains(w)) &&
          Option(tag).exists(t => !stopNatures.contains(t))

      tokens.collect {
        case token if isKept(token.word(), token.tag()) => token.word()
      }
    }

}