第二章
1、文本特征处理
(1)分词 demo
package com.dianyou.mllib.secondPage

import org.apache.spark.ml.feature.{RegexTokenizer, Tokenizer}
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.{SparkConf, SparkContext}

/**
 * Tokenization demo: runs the same small corpus through `Tokenizer`
 * (whitespace split) and `RegexTokenizer` (split on non-word characters)
 * and prints both results for comparison.
 */
object test {

  def main(args: Array[String]): Unit = {
    // Spark configuration: run locally on all available cores.
    val conf: SparkConf = new SparkConf().setAppName("SparkWC").setMaster("local[*]")

    // A single SparkSession is sufficient; it creates and owns the
    // SparkContext internally. (The original built a separate SparkContext
    // first, which is redundant — getOrCreate would just reuse it.)
    val spark: SparkSession = SparkSession
      .builder
      .config(conf)
      .getOrCreate()

    // Sample data. Note the third sentence is comma-separated: the plain
    // whitespace Tokenizer will NOT split it, but RegexTokenizer will.
    val test_DataFrame: DataFrame = spark.createDataFrame(Seq(
      (0, "Hi I heard about Spark"),
      (1, "I wish Java could use case classes"),
      (2, "Logistic,regression,models,are,neat")
    )).toDF("id", "sentence")

    // Simple whitespace tokenizer: declare input and output columns.
    val tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words")
    // transform returns a new DataFrame with the "words" column appended.
    val tokenized: DataFrame = tokenizer.transform(test_DataFrame)
    tokenized.select("sentence", "words").take(3).foreach(println)

    // RegexTokenizer is more flexible: with gaps = true the pattern "\\W"
    // is treated as a separator, so commas also break tokens apart.
    val regexTokenizer = new RegexTokenizer()
      .setInputCol("sentence")
      .setOutputCol("words")
      .setPattern("\\W")
      .setGaps(true)
    val regexTokenized: DataFrame = regexTokenizer.transform(test_DataFrame)
    regexTokenized.select("sentence", "words").show(false)
    regexTokenized.select("sentence", "words").take(3).foreach(println)

    // Stopping the session also stops the underlying SparkContext
    // (fixes the original, which only called sc.stop()).
    spark.stop()
  }
}
(2)