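// Note: the Stanford CoreNLP word-segmentation jars are very large (several hundred MB), so fetching them through Maven is not recommended; download them directly from the official website instead.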
package test
import edu.stanford.nlp.ling.{CoreAnnotations, CoreLabel}
import edu.stanford.nlp.pipeline.{Annotation, StanfordCoreNLP}
import org.apache.commons.lang.StringUtils
import util.EmojiFilter
import scala.collection.mutable.ListBuffer
import scala.util.control.Breaks.{break, breakable}
import scala.collection.JavaConverters._
import scala.collection.mutable.{ArrayBuffer, ListBuffer}
/**
 * Created by liuwei on 2017/8/23.
 *
 * Small demo that tokenizes text with a Stanford CoreNLP pipeline configured by
 * `StanfordCoreNLP-chinese.properties` and returns the token words that are neither
 * stop words nor tagged with a stop part-of-speech ("nature").
 */
object NLPTest {

  /** Built-in stop words that are always filtered, on top of the caller-supplied ones. */
  private val SystemStopWords: Set[String] = Set("了", "的")

  /**
   * Built-in stop part-of-speech tags that are always filtered.
   * NOTE(review): "PU" is presumably the punctuation tag of the tag set used by the
   * Chinese models -- confirm against the model documentation.
   */
  private val SystemStopNatures: Set[String] = Set("PU")

  /** Properties resource handed to the `StanfordCoreNLP` constructor. */
  private val PipelineProps = "StanfordCoreNLP-chinese.properties"

  /**
   * Pipeline shared by every call to `nlp`. Constructing it is expensive, so it is built
   * once, lazily, on the first call that actually has text to annotate.
   * NOTE(review): this assumes `annotate` may be called repeatedly (and, if this is ever
   * used from several threads, concurrently) on one pipeline instance -- confirm against
   * the CoreNLP documentation for the configured annotators.
   */
  private lazy val pipeline: StanfordCoreNLP = new StanfordCoreNLP(PipelineProps)

  /** Runs the tokenizer on a fixed sample sentence and prints the kept words. */
  def main(args: Array[String]): Unit = {
    val sample = "test环境服务器启动方式更新为supervisor启动"
    println(nlp(sample, List.empty[String], List.empty[String]))
  }

  /**
   * Tokenizes `content` and drops stop words and tokens with a stop part-of-speech tag.
   *
   * @param content        text to tokenize; `null` or empty yields an empty list
   * @param stopWordList   extra words to drop, in addition to the built-in ones
   * @param stopNatureList extra part-of-speech tags to drop; each entry may hold several
   *                       tags separated by commas
   * @return the words of the remaining tokens, in document order
   */
  private def nlp(content: String, stopWordList: List[String], stopNatureList: List[String]): List[String] =
    if (StringUtils.isEmpty(content)) List.empty[String]
    else {
      // Sets give constant-time membership tests; the original lists were scanned per token.
      val stopWords: Set[String] = SystemStopWords ++ stopWordList
      val stopNatures: Set[String] = SystemStopNatures ++ stopNatureList.flatMap(_.split(","))

      // filterEmoji is defined elsewhere; presumably it strips emoji before annotation.
      val cleaned = EmojiFilter.filterEmoji(content)
      val document: Annotation = new Annotation(cleaned)
      // Run all configured annotators on this text.
      pipeline.annotate(document)

      // The annotation lookup returns null when no tokens were produced; treat that as "no tokens".
      val rawTokens: java.util.List[CoreLabel] = document.get(classOf[CoreAnnotations.TokensAnnotation])
      val tokens: List[CoreLabel] = Option(rawTokens).map(_.asScala.toList).getOrElse(List.empty[CoreLabel])

      tokens.collect {
        case token if isKept(token.word, token.tag, stopWords, stopNatures) => token.word
      }
    }

  /**
   * Decides whether a token survives filtering. Tokens with a `null` word or a `null` tag
   * are dropped, matching the earlier behavior of listing `null` among the stop entries.
   */
  private def isKept(word: String, tag: String, stopWords: Set[String], stopNatures: Set[String]): Boolean =
    word != null && tag != null && !stopWords.contains(word) && !stopNatures.contains(tag)
}