- 输出将包含一系列 n-gram：每个 n-gram 由一个以空格分隔的字符串表示，即 n 个连续的单词。
/**
 * Demonstrates Spark ML's `NGram` feature transformer.
 *
 * Builds a small DataFrame of pre-tokenized sentences, applies an
 * `NGram` with n = 2, and prints the resulting bigram column. Each
 * output n-gram is a single string of n consecutive words joined by
 * spaces. The local SparkSession is always stopped when done.
 */
def ngramTest(): Unit = {
  import org.apache.spark.ml.feature.NGram
  val spark: SparkSession = SparkSession.builder().appName("implicits").master("local[2]").getOrCreate()
  try {
    // Each row: an id plus an already-tokenized word sequence.
    val wordDataFrame = spark.createDataFrame(Seq(
      (0, Array("Hi", "I", "heard", "about", "Spark")),
      (1, Array("I", "wish", "Java", "could", "use", "case", "classes")),
      (2, Array("Logistic", "regression", "models", "are", "neat"))
    )).toDF("id", "words")
    // n = 2 produces bigrams; rows with fewer than n tokens yield an empty array.
    val ngram = new NGram().setN(2).setInputCol("words").setOutputCol("ngrams")
    val ngramDataFrame = ngram.transform(wordDataFrame)
    ngramDataFrame.select("ngrams").show(truncate = false)
  } finally {
    spark.stop() // release the local Spark context even if the demo fails
  }
}
运行结果如下：