NLP分词

Stanford CoreNLP 分词所需的 jar 包(含中文模型)体积很大,有几百 MB,不建议通过 Maven 下载,建议直接从官网下载后手动引入项目。

package test

import edu.stanford.nlp.ling.{CoreAnnotations, CoreLabel}
import edu.stanford.nlp.pipeline.{Annotation, StanfordCoreNLP}
import org.apache.commons.lang.StringUtils
import util.EmojiFilter

import scala.collection.mutable.ListBuffer
import scala.util.control.Breaks.{break, breakable}
import scala.collection.JavaConverters._
import scala.collection.mutable.{ArrayBuffer, ListBuffer}

/**
  * Created by liuwei on 2017/8/23.
  *
  * Small demo of word segmentation with Stanford CoreNLP: segments a sample
  * sentence and prints the tokens that survive stop-word and part-of-speech
  * filtering.
  */
object NLPTest {

  /** Properties resource handed to the [[StanfordCoreNLP]] constructor. */
  private val PipelineProps = "StanfordCoreNLP-chinese.properties"

  /** Stop words that are always filtered, in addition to caller-supplied ones. */
  private val DefaultStopWords: Set[String] = Set("了", "的")

  /**
    * Part-of-speech tags that are always filtered, in addition to caller-supplied ones.
    * NOTE(review): "PU" is presumably the punctuation tag of the Chinese tag set - confirm.
    */
  private val DefaultStopNatures: Set[String] = Set("PU")

  // Built once on first use and shared by every call to `nlp`, instead of being
  // re-created per call.
  // NOTE(review): assumes the pipeline may be reused across calls; confirm
  // thread-safety before calling `nlp` concurrently.
  private lazy val pipeline: StanfordCoreNLP = new StanfordCoreNLP(PipelineProps)

  /** Segments a fixed sample sentence with no extra stop words/tags and prints the tokens. */
  def main(args: Array[String]): Unit = {
    val string = "test环境服务器启动方式更新为supervisor启动"
    val res = nlp(string, List.empty[String], List.empty[String])
    println(res)
  }

  /**
    * Tokenizes `content` and returns the words that are not filtered out.
    *
    * A token is dropped when its word is missing or is a stop word, or when its
    * part-of-speech tag is missing or is a stop tag.
    *
    * @param content        text to segment; `null` or empty yields an empty list
    * @param stopWordList   extra stop words, added to the built-in ones
    * @param stopNatureList extra stop part-of-speech tags; each element may hold
    *                       several tags separated by commas
    * @return the remaining words, in token order
    */
  private def nlp(content: String, stopWordList: List[String], stopNatureList: List[String]): List[String] = {
    if (StringUtils.isEmpty(content)) {
      List.empty[String]
    } else {
      // Sets give constant-time membership checks for every token.
      val stopWords: Set[String] = DefaultStopWords ++ stopWordList
      val stopNatures: Set[String] = DefaultStopNatures ++ stopNatureList.flatMap(_.split(","))

      // Emoji are stripped before the text is handed to the pipeline.
      val document: Annotation = new Annotation(EmojiFilter.filterEmoji(content))

      // Run all annotators on this text.
      pipeline.annotate(document)

      // Treat a missing token annotation as "no tokens" rather than failing on null.
      val tokens: List[CoreLabel] =
        Option(document.get(classOf[CoreAnnotations.TokensAnnotation]))
          .map(_.asScala.toList)
          .getOrElse(List.empty[CoreLabel])

      tokens
        .filterNot(token => isStopped(token.word, stopWords) || isStopped(token.tag, stopNatures))
        .map(_.word)
    }
  }

  /** True when `value` is missing (`null`) or is contained in `stopSet`. */
  private def isStopped(value: String, stopSet: Set[String]): Boolean =
    Option(value).forall(stopSet.contains)

}


评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值