Spark 中有两个包提供了 Word2Vec 的实现，分别是：
org.apache.spark.mllib.feature.{Word2Vec, Word2VecModel}
org.apache.spark.ml.feature.Word2Vec
本质没有太大的区别,只是两个包的作用对象不一样
- spark.mllib contains the original API built on top of RDDs.
- spark.ml provides higher-level API built on top of DataFrames for constructing ML pipelines.
mllib直接用于RDD,ml用于DataFrames
以下是分别使用这两种 API 来跑 word2vec 的例子，都是官方给的例子，注意导入数据的类型
使用ml
// $example on$
import org.apache.spark.ml.feature.Word2Vec
// $example off$
import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}

/**
 * Minimal example of the DataFrame-based (`spark.ml`) Word2Vec estimator.
 *
 * Builds a one-column DataFrame from three hard-coded sentences (each split on
 * single spaces), fits a Word2Vec model on it, transforms the same DataFrame
 * and prints the first three rows of the output column.
 *
 * No master URL is set on the SparkConf here, so it has to be supplied
 * externally (e.g. by the launcher that submits the application).
 */
object Word2VecExample {

  /**
   * Entry point.
   *
   * @param args command-line arguments; not used by this example
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("Word2Vec example")
    val sc = new SparkContext(conf)

    // Always release the SparkContext, even if fitting or transforming fails.
    try {
      val sqlContext = new SQLContext(sc)

      // $example on$
      // Input data: Each row is a bag of words from a sentence or document.
      // Tuple1 wraps each Array[String] so that createDataFrame sees a
      // single-column product type; the column is then named "text".
      val documentDF = sqlContext.createDataFrame(Seq(
        "Hi I heard about Spark".split(" "),
        "I wish Java could use case classes".split(" "),
        "Logistic regression models are neat".split(" ")
      ).map(Tuple1.apply)).toDF("text")

      // Learn a mapping from words to Vectors.
      // The estimator reads column "text" and writes column "result",
      // configured with vector size 3 and min count 0.
      val word2Vec = new Word2Vec()
        .setInputCol("text")
        .setOutputCol("result")
        .setVectorSize(3)
        .setMinCount(0)
      val model = word2Vec.fit(documentDF)
      val result = model.transform(documentDF)
      result.select("result").take(3).foreach(println)
      // $example off$
    } finally {
      sc.stop()
    }
  }
}
其中 documentDF 是一个 DataFrame 对象。
使用mllib
package mllib import org.apache.spark.mllib.feature.{Word2Vec, Word2VecModel} //import org.apache.spark.ml.feature.Word2Vec import org.apache.spark.sql.SQLContext import org.apache.spark.{SparkConf, SparkContext} /** * Created by Zhili on 2016/12/8. */ object TestWord2Vec { def main(args: Array[String]) { val conf = new SparkConf().setMaster("local").setAppName("test word2vec ") val sc = new SparkContext(conf) val sqlContext = new SQLContext(sc) val input = sc.textFile("G:\\BigData\\spark-1.6.1\\spark-1.6.1\\data/mllib/sample_lda_data.txt") .map(line => line.split(" ").toSeq)val word2vec = new Word2Vec() val model = word2vec.fit(input)val synonyms = model.findSynonyms("1", 5) for((synonym, cosineSimilarity) <- synonyms) { println(s"