Spark Word2Vec 源码:Spark 文本特征提取(TF-IDF / Word2Vec / CountVectorizer)

import org.apache
import org.apache.spark
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.ml.feature._
import org.apache.spark.mllib.linalg._
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.linalg.{Matrices, Matrix}
import org.apache.spark.mllib.linalg.distributed.RowMatrix
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.stat.{MultivariateStatisticalSummary, Statistics}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SQLContext

/**
 * Demo of Spark text-feature extraction: basic MLlib linear algebra
 * (vectors, matrices, summary statistics, correlation) followed by
 * three ML feature extractors — Word2Vec, CountVectorizer and TF-IDF.
 *
 * Runs locally (`local[2]`); all results are printed to stdout.
 */
object test42 {

  def main(args: Array[String]): Unit = {

    val masterUrl = "local[2]"
    val appName = "tfidf_test"
    val sparkConf = new SparkConf().setMaster(masterUrl).setAppName(appName)

    @transient val sc = new SparkContext(sparkConf)
    val sqlContext = new SQLContext(sc)
    sc.setLogLevel("ERROR")

    // Scala imports scala.collection.immutable.Vector by default, so
    // org.apache.spark.mllib.linalg.Vector must be imported explicitly
    // for MLlib's Vector type to be usable here.

    // Dense vector.
    val dv: Vector = Vectors.dense(1.0, 0.0, 3.0)
    println(dv)

    // Sparse vector: length 3; Array(0, 2) holds the indices and
    // Array(1.0, 3.0) the matching values (index 0 -> 1.0, index 2 -> 3.0).
    val sv1: Vector = Vectors.sparse(3, Array(0, 2), Array(1.0, 3.0))
    println(sv1)

    // Sparse vector: length 3; each Seq entry is an (index, value) pair.
    val sv2: Vector = Vectors.sparse(3, Seq((0, 1.0), (2, 3.0)))
    println(sv2)

    // Labeled points (label + feature vector).
    val pos = LabeledPoint(1.0, Vectors.dense(1.0, 0.0, 3.0))
    val neg = LabeledPoint(0.0, Vectors.sparse(3, Array(0, 2), Array(1.0, 3.0)))

    // Dense matrix with 2 rows and 3 columns, values in column-major order.
    // NOTE: the original comment claimed "3 rows, 2 columns", but
    // Matrices.dense(2, 3, ...) builds a 2x3 matrix. Literals are written
    // as Doubles explicitly instead of relying on Int-to-Double widening.
    val dm: Matrix = Matrices.dense(2, 3, Array(1.0, 0.0, 2.0, 3.0, 4.0, 5.0))
    println("========dm========")
    println(dm)

    val v0 = Vectors.dense(1.0, 0.0, 3.0)
    val v1 = Vectors.sparse(3, Array(1), Array(2.5))
    val v2 = Vectors.sparse(3, Seq((0, 1.5), (1, 1.8)))

    val rows = sc.parallelize(Seq(v0, v1, v2))
    println("=========rows=======")
    println(rows.collect().toBuffer)

    // Distributed row-oriented matrix built from the vector RDD.
    val mat: RowMatrix = new RowMatrix(rows)

    val seriesX: RDD[Double] = sc.parallelize(List(1.0, 2.0, 3.0)) // a series
    // Must have the same number of partitions and cardinality as seriesX.
    val seriesY: RDD[Double] = sc.parallelize(List(4.0, 5.0, 6.0))
    val correlation: Double = Statistics.corr(seriesX, seriesY, "pearson")

    // Each Vector is treated as a row, not a column.
    val data: RDD[Vector] = rows
    val correlMatrix: Matrix = Statistics.corr(data, "pearson")
    println("========correlMatrix========")
    println(correlMatrix)

    // Column-wise summary statistics over the vector RDD.
    val summary: MultivariateStatisticalSummary = Statistics.colStats(rows)
    println("===================")
    println(summary.mean)        // dense vector of per-column means
    println(summary.variance)    // per-column variance
    println(summary.numNonzeros) // per-column count of non-zero entries

    /** Word2Vec */
    val documentDF = sqlContext.createDataFrame(Seq(
      "Hi I heard about Spark".split(" "),
      "I wish Java could use case classes".split(" "),
      "Logistic regression models are neat".split(" ")
    ).map(Tuple1.apply)).toDF("text")

    // Learn a mapping from words to vectors.
    val word2Vec = new Word2Vec()
      .setInputCol("text")
      .setOutputCol("result")
      .setVectorSize(3)
      .setMinCount(0)
    val model = word2Vec.fit(documentDF)
    val result = model.transform(documentDF)
    println("=======word2vec=========")
    result.show(10, false)

    /** CountVectorizer */
    val df = sqlContext.createDataFrame(Seq(
      (0, Array("a", "b", "c")),
      (1, Array("a", "b", "b", "c", "a"))
    )).toDF("id", "words")

    // Fit a CountVectorizerModel from the corpus.
    val cvModel: CountVectorizerModel = new CountVectorizer()
      .setInputCol("words")
      .setOutputCol("features")
      .setVocabSize(3)
      .setMinDF(2)
      .fit(df)

    // Alternatively, define a CountVectorizerModel with an a-priori vocabulary.
    val cvm = new CountVectorizerModel(Array("a", "b", "c"))
      .setInputCol("words")
      .setOutputCol("features")

    println("=======CountVectorizerModel=========")
    cvModel.transform(df).show(10, false)

    /** TF-IDF */
    val sentenceData = sqlContext.createDataFrame(Seq(
      (0, "Hi I heard about Spark"),
      (0, "I wish Java could use case classes"),
      (1, "Logistic regression models are neat")
    )).toDF("label", "sentence")

    val tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words")
    val wordsData = tokenizer.transform(sentenceData)

    val hashingTF = new HashingTF()
      .setInputCol("words").setOutputCol("rawFeatures").setNumFeatures(20)
    // CountVectorizer could also be used to obtain term-frequency vectors.
    val featurizedData = hashingTF.transform(wordsData)

    val idf = new IDF().setInputCol("rawFeatures").setOutputCol("features")
    val idfModel = idf.fit(featurizedData)
    val rescaledData = idfModel.transform(featurizedData)
    rescaledData.show(10, false)

    // Release Spark resources (was missing in the original).
    sc.stop()
  }
}

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值