1. The whole program works as-is: copy it and it runs. The `path` below must point to a directory containing a few English documents.
2. A Spark ML program: Spark 2.x, Scala 2.11.x, JDK 1.8 (a sample build.sbt sketch follows this list).
3. It basically performs keyword extraction on English documents.
4. Pay attention to the difference between spark.ml and spark.mllib; I try to stick with the newer stack: ml + DataFrame + Spark SQL.
5. Pipeline: read the English documents, tokenize, filter out stop words, build Word2Vec, then format and print the results.
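For reference, here is a minimal build.sbt sketch matching the versions in note 2. The exact Spark 2.x and ansj_seg version numbers are my assumptions; adjust them to whatever you actually use.

scalaVersion := "2.11.12"

libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-core"  % "2.3.0",
  "org.apache.spark" %% "spark-sql"   % "2.3.0",
  "org.apache.spark" %% "spark-mllib" % "2.3.0",  // contains both spark.ml and spark.mllib
  "org.ansj"         %  "ansj_seg"    % "5.1.6"   // ansj tokenizer used in the code below
)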
import org.ansj.recognition.impl.StopRecognition
import org.ansj.splitWord.analysis.ToAnalysis
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.ml.feature.Word2Vec
import org.apache.spark.sql.SQLContext
object word2vec_test01 {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[2]").setAppName("word2vec_test01")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)
    // Directory containing the English documents; adjust the path as needed
    val path = "D:\\soft\\IDEA\\data\\input\\news_data\\*"
    // (fileName, fileContent) pairs, one entry per document
    val rdd = sc.wholeTextFiles(path)
    val filename = rdd.map(_._1) // file names only; not used further below
    // Build the stop-word list from a local file
    import scala.collection.JavaConverters._
    val stopWords = sc.textFile("stop_words_eng.txt").collect().toSeq.asJava
    // Filter stop words (and the punctuation nature "w") during segmentation
    val filter = new StopRecognition().insertStopWords(stopWords)
    filter.insertStopNatures("w", null, "null")
    // Tokenize each document with ansj and drop the stop words
    val splitWordRdd = rdd.map(file => {
      val str = ToAnalysis.parse(file._2).recognition(filter).toStringWithOutNature(" ")
      (file._1, str.split(" "))
    })
    val df = sqlContext.createDataFrame(splitWordRdd).toDF("fileName", "words")
    df.rdd.map(x => x.toString()).foreach(println)
    // Build the Word2Vec model: learn a mapping from words to vectors
    val word2Vec = new Word2Vec()
      .setInputCol("words")
      .setOutputCol("result")
      .setVectorSize(4)
      .setMinCount(4)
    val model = word2Vec.fit(df)
    val result = model.transform(df)
    result.rdd.foreach(println)
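    // result now has one row per document; its "result" column is the average of the
    // document's word vectors (a 4-dimensional ml.linalg.Vector here). A quick way to
    // inspect the output layout (my addition, not in the original listing):
    result.printSchema()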
    // Show each vocabulary word's per-dimension weights across all documents
    val vocs = model.getVectors
    vocs.createOrReplaceTempView("vocs")
    val result1 = sqlContext.sql(
      """
        | select word,
        |        split(table2.values, ',')[0] as values01,
        |        split(table2.values, ',')[1] as values02,
        |        split(table2.values, ',')[2] as values03,
        |        split(table2.values, ',')[3] as values04
        | from (
        |     select word,
        |            substring(temp1, 2, length(temp1) - 2) as values
        |     from (
        |         select word,
        |                String(vector) as temp1
        |         from vocs
        |     ) table1
        | ) table2
      """.stripMargin)
    result1.show()
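    // Alternative sketch (my addition, not the original approach): pull the vector
    // components out with a UDF instead of parsing the stringified vector in SQL.
    // It assumes the spark.ml linalg Vector type returned by model.getVectors.
    import org.apache.spark.ml.linalg.Vector
    import org.apache.spark.sql.functions.{col, udf}
    val vecToArray = udf((v: Vector) => v.toArray)
    vocs.withColumn("vec", vecToArray(col("vector")))
      .selectExpr("word", "vec[0] as values01", "vec[1] as values02",
        "vec[2] as values03", "vec[3] as values04")
      .show()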
    // Show the top-N synonyms of a word. Note that in spark.ml,
    // Word2VecModel.findSynonyms returns a DataFrame of (word, similarity);
    // "data" below is only a placeholder query word.
    // val like = model.findSynonyms("data", 40)
    // like.show()
    // Official API docs for the resulting model:
    // http://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.ml.feature.Word2VecModel
  }
}
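Since the point of note 4 is to stay on the ml + DataFrame + Spark SQL side, below is a minimal sketch of the same preprocessing done purely with spark.ml transformers (RegexTokenizer + StopWordsRemover) instead of ansj. This is my own assumption-laden variant, not part of the original program; in particular, StopWordsRemover's built-in English stop-word list stands in for stop_words_eng.txt.

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.ml.feature.{RegexTokenizer, StopWordsRemover, Word2Vec}
import org.apache.spark.sql.SQLContext

object word2vec_ml_only_sketch {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[2]").setAppName("word2vec_ml_only_sketch")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)
    // Same input layout as above: a directory of English text files
    val docs = sqlContext
      .createDataFrame(sc.wholeTextFiles("D:\\soft\\IDEA\\data\\input\\news_data\\*"))
      .toDF("fileName", "text")
    // Split on runs of non-word characters (RegexTokenizer also lowercases by default)
    val tokenizer = new RegexTokenizer()
      .setInputCol("text").setOutputCol("rawWords").setPattern("\\W+")
    // Drop English stop words using Spark's built-in default list
    val remover = new StopWordsRemover().setInputCol("rawWords").setOutputCol("words")
    val words = remover.transform(tokenizer.transform(docs))
    // Same Word2Vec settings as in the main program
    val word2Vec = new Word2Vec()
      .setInputCol("words").setOutputCol("result").setVectorSize(4).setMinCount(4)
    word2Vec.fit(words).transform(words).show(false)
  }
}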