示例:对地址文本进行分词,并提取其中的地理位置名词(词性标注为 ns)。
// Demo driver: builds a small DataFrame of (id, address) rows, runs `wordSplit`
// on every address, and prints the rows for which at least one token was returned.
val spark = SparkSession
  .builder()
  .appName("Word2Vec")
  .master("local[*]")
  .getOrCreate()

// The session is stopped in `finally` so it is released even when the job fails.
try {
  // Brings the Dataset encoders (needed by `map`) and tuple `toDF` into scope.
  import spark.implicits._

  val df = spark.createDataFrame(Seq(
    ("1", "湖北武汉市汉口北大道12345号"),
    ("2", "成都青羊区清江中路"),
    ("3", "地址是乱输入的")
  )).toDF("id", "address")

  val tmpDf = df
    .map { row =>
      val id = row.getAs[String]("id")
      val address = row.getAs[String]("address")
      // Join the tokens returned by `wordSplit` with '|' so they fit in one column.
      val joined = address.wordSplit().mkString("|")
      (id, address, joined)
    }
    // Drop rows for which `wordSplit` returned no tokens (empty joined string).
    .filter(_._3.nonEmpty)
    .toDF("id", "address", "address_split")

  tmpDf.show()
} finally {
  spark.stop()
}
核心代码:
/**
 * Adds a `wordSplit` extension method to `String` that segments the text and
 * keeps only the tokens whose nature name is "ns" (geographic place names).
 *
 * Depends on `segments` (the segmenter) and `usenessWs` (characters to strip),
 * both of which are defined outside this block.
 */
implicit class WordSplit(word: String) extends Serializable {

  /**
   * Segments the wrapped string and returns its place-name tokens.
   *
   * @param flag when `true`, every character listed in `usenessWs` is also
   *             removed from each token; when `false` (the default) the tokens
   *             are returned as produced by the segmenter (minus spaces)
   * @return the distinct, non-empty place-name tokens; empty when the wrapped
   *         string is `null` or no token is tagged "ns"
   */
  def wordSplit(flag: Boolean = false): Seq[String] =
    Option(word).fold(Seq.empty[String]) { text =>
      val el = segments.seg(text.trim)
      val result =
        if (el.isEmpty) Seq.empty[String]
        else {
          // Keep only place-name tokens, strip spaces, drop blanks and duplicates.
          el.filter(_.nature.name() == "ns")
            .map(term => term.word.trim.replaceAll(" ", ""))
            .filterNot(_.isEmpty)
            .distinct
        }

      if (!flag) result
      else {
        // Built once instead of once per token. The entries are joined into a
        // regex character class, so each listed character is removed individually.
        // NOTE(review): entries are not regex-escaped; confirm `usenessWs` holds
        // no characters that are special inside a character class (e.g. ']', '\', '^').
        val stripPattern = usenessWs.mkString("[", " ", "]")
        // An empty list yields "[]", which java.util.regex rejects as an unclosed
        // character class; with nothing to strip, return the tokens unchanged.
        if (stripPattern == "[]") result
        else {
          // Stripping can leave empty or duplicate tokens, so restore the same
          // non-empty/distinct guarantee the unstripped result has.
          result
            .map(_.replaceAll(stripPattern, ""))
            .filterNot(_.isEmpty)
            .distinct
        }
      }
    }
}