启动spark-shell
## jieba-analysis-1.0.3-SNAPSHOT.jar 由bluemapleman提供,作者进行了打包。链接: https://pan.baidu.com/s/1FeSkrueoXB303_KnsExPog 提取码: negi
spark-shell --jars jieba-analysis-1.0.3-SNAPSHOT.jar
执行代码
import java.util
import com.qianxinyao.analysis.jieba.keyword.{Keyword, TFIDFAnalyzer}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.hive.HiveContext
object TitleJiebaParticiplePro {

  /**
   * Entry point: registers a jieba TF-IDF keyword-extraction UDF, applies it
   * to article titles read from Hive, and writes the flattened result rows to
   * HDFS as plain text.
   *
   * @param args currently unused — the original 8-argument parsing is kept
   *             commented out below for reference.
   */
  def main(args: Array[String]): Unit = {
    //
    // if (args.length < 8) {
    //   throw new IllegalArgumentException("Need to 8 args!!!")
    // }
    // val Array(output_tr, output_te, date1_no, date1_yes, date0, date7, series, hour0, series_flag) = args
    val conf = new SparkConf()
    val sc = new SparkContext(conf)
    sc.setLogLevel("INFO")
    val sqlContext = new HiveContext(sc)
    import sqlContext.implicits._

    // UDF registration: extract the top-5 TF-IDF keywords of a title and
    // return them tab-separated. Note: a tab follows EVERY keyword, including
    // the last one — kept intentionally to match the original output format.
    sqlContext.udf.register("jiebaParticipleUDF", (str: String) => {
      val topN: Int = 5
      val tfidfAnalyzer: TFIDFAnalyzer = new TFIDFAnalyzer
      val keywords: util.List[Keyword] = tfidfAnalyzer.analyze(str, topN)
      // Use JavaConverters.asScala (the implicit JavaConversions is
      // deprecated) and mkString instead of a var accumulator in a loop.
      import scala.collection.JavaConverters._
      keywords.asScala.map(_.getName + "\t").mkString
    })

    val train_data = sqlContext.sql("SELECT info_id,title, jiebaParticipleUDF(title), series_id FROM tmp.test1 WHERE dt = '2019-05-06' limit 10")
    // Row.toString renders as "[a,b,c]"; strip the surrounding brackets
    // before writing each row out as a text line.
    train_data.rdd
      .map(_.toString().replaceAll("\\[", "").replaceAll("\\]", ""))
      .saveAsTextFile("/user/xxx/tmp/20190507_jieba_test")
  }
}
jieba-analysis-1.0.3-SNAPSHOT.jar 源码地址:https://github.com/bluemapleman/jieba-analysis