package xxx
import org.apache.log4j.Logger
import org.apache.log4j.Level
import org.apache.spark.sql.SparkSession
import org.apache.spark.ml.feature.{CountVectorizer, CountVectorizerModel, HashingTF, IDF, Tokenizer}
/**
 * Demonstrates TF-IDF feature extraction with Spark ML on a toy corpus:
 * tokenize sentences, build term-frequency vectors two ways (HashingTF and
 * CountVectorizer), then rescale the hashed TF vectors with IDF.
 */
object TopicExtraction {
  def main(args: Array[String]): Unit = {
    // Suppress Spark's verbose INFO logging; show only errors.
    Logger.getLogger("org").setLevel(Level.ERROR)

    val spark = SparkSession
      .builder().master("local")
      .appName("TopicExtraction")
      .getOrCreate()

    // Toy corpus of (label, sentence) rows.
    val sourceData = spark.createDataFrame(Seq(
      (0, "soyo spark like spark hadoop spark and spark like spark"),
      (1, "i wish i can like java i"),
      (2, "but i dont know how to soyo"),
      (3, "spark is good spark tool")
    )).toDF("label", "sentence")

    // Split each sentence into words.
    val tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words")
    val wordsData = tokenizer.transform(sourceData)
    wordsData.show(false) // false = do not truncate column contents

    // Term-frequency vectors via feature hashing into a fixed 1000-dim space.
    val hashTF = new HashingTF().setInputCol("words").setOutputCol("rawsFeatures").setNumFeatures(1000)
    val featuredData = hashTF.transform(wordsData)
    featuredData.show(false)

    // Alternative TF vectors via CountVectorizer, which learns an explicit
    // vocabulary instead of hashing. (Output column differs from HashingTF's.)
    val cvModel: CountVectorizerModel = new CountVectorizer()
      .setInputCol("words")
      .setOutputCol("rawFeatures")
      .setMinTF(2) // keep only terms occurring at least twice within a document
      .fit(wordsData)
    val cvFeaturedData = cvModel.transform(wordsData)
    cvFeaturedData.show(false)
    // Print the learned vocabulary.
    println(cvModel.vocabulary.mkString(","))

    // Rescale the hashed TF vectors ("rawsFeatures") by inverse document frequency.
    val idf = new IDF().setInputCol("rawsFeatures").setOutputCol("features")
    val idfModel = idf.fit(featuredData)
    val result = idfModel.transform(featuredData)
    result.show(false)
    result.select("label", "features").show(false)

    spark.stop()
  }
}
// Spark implementation of TF-IDF
// (Latest recommended article published 2021-05-29 11:54:24)