Scala FlatMap与Map操作
数据文件:
words.txt
book banana
monkey man
woman book
man monkey
banana book
book man
spark spark home
测试代码:
package com.saker.spark.rdd
import org.apache.spark.SparkContext
import org.apache.spark.sql.SparkSession
import org.apache.spark.SparkConf
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.mllib.classification.SVMWithSGD
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.linalg.Vector
object FlatMapAndMap {

  /**
   * Demonstrates the difference between `RDD.map` (one output element per
   * input element — here each line becomes ONE Array[String]) and
   * `RDD.flatMap` (the per-line arrays are flattened into a single
   * RDD[String]), first on a text file and then on an in-memory List.
   *
   * @param args optional: args(0) overrides the input file path
   *             (defaults to the original hard-coded "D:\\words.txt").
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
    // Use the canonical builder() factory rather than `new SparkSession.Builder()`.
    val spark = SparkSession.builder()
      .appName("SimpleSVMTest")
      .master("local")
      .config(conf)
      .getOrCreate()
    val sc = spark.sparkContext
    sc.setLogLevel("WARN")

    // Input path is now overridable from the command line; the default keeps
    // the original behavior for existing callers.
    val infile = if (args.nonEmpty) args(0) else "D:\\words.txt"

    // Load the data file: one String per line.
    val data = sc.textFile(infile)

    // --- map: each line maps to exactly one Array[String] (its words) ---
    println("map = ")
    val sparseData = data.map { line =>
      line.split(" ") // Array[String]: the words of this one line
    } // sparseData: RDD[Array[String]] — one array per input line
    // Printing an Array uses the JVM default toString ([Ljava.lang.String;@...).
    sparseData.foreach(arr => println(arr))
    sparseData.foreach(arr => println(arr(0))) // first word of each line

    // --- flatMap: the per-line arrays are flattened into one RDD[String] ---
    println("flatmap = ")
    val flatSparseData = data.flatMap { line =>
      line.split(" ") // each element of this array becomes its own RDD element
    } // flatSparseData: RDD[String]
    println("flatSparseData.foreach = ")
    flatSparseData.foreach(println)
    println("flatSparseData.collect() = ")
    flatSparseData.collect().foreach(println) // collect(): Array[String]
    val arrStrFlatSparseData = flatSparseData.collect() // Array[String]
    val s_0 = arrStrFlatSparseData(0)
    // The original `println("s_0 = ", s_0)` relied on deprecated auto-tupling
    // and printed `(s_0 = ,book)`; string interpolation is the intended form.
    println(s"s_0 = $s_0")
    val mapAfterFlatMapSparseData = flatSparseData.map(onestr => (onestr, 1)) // RDD[(String, Int)]
    val m_0 = mapAfterFlatMapSparseData.collect()(0)
    println(s"m_0 = $m_0")

    // --- the same demonstration on an in-memory List ---
    println("使用Array List = ")
    // Single source list shared by both RDD constructions (parallelize and
    // makeRDD are equivalent here).
    val seq = List("coffee panda", "happy panda", "happiest panda party")
    val rdd = sc.parallelize(seq)  // rdd: RDD[String]
    val rdd1 = sc.makeRDD(seq)     // rdd1: RDD[String]
    println("rdd = ")
    rdd.foreach(println)
    println("rdd1 = ")
    rdd1.foreach(println)
    println("rdd collect() = ")
    rdd.collect().foreach(println)
    println("rdd1 collect() = ")
    rdd1.collect().foreach(println) // collect(): Array[String]

    // flatMap: the classic first step of word count.
    val rddString = rdd.flatMap(oneStr => oneStr.split(" ")) // RDD[String]
    println("rddString flatMap = ")
    rddString.foreach(println)
    println("rddString flatMap collect = ")
    rddString.collect().foreach(println)

    // flatMap -> map: pair every word with the count 1.
    val rddMapAfterFlat = rddString.map(word => (word, 1)) // RDD[(String, Int)]
    println("rddMapAfterFlat foreach = ")
    rddMapAfterFlat.foreach(println)
    println("rddMapAfterFlat collect foreach = ")
    rddMapAfterFlat.collect().foreach(println) // collect(): Array[(String, Int)]

    // map WITHOUT flatMap keeps the nesting: RDD[Array[String]].
    val rddMapBeforeFlat = rdd.map(onestr => onestr.split(" ")) // RDD[Array[String]]
    println("rddMapBeforeFlat foreach = ")
    rddMapBeforeFlat.foreach(println)               // prints array identities, not contents
    rddMapBeforeFlat.foreach(arr => println(arr(0))) // first element of each array
    // collect() yields Array[Array[String]] — effectively a 2-D structure,
    // so (0)(0) is the first word of the first line ("coffee").
    val coffee = rddMapBeforeFlat.collect()(0)(0)
    // Fixed the same auto-tupling mistake as above (was `println("coffee = ", coffee)`).
    println(s"coffee = $coffee")
    println("rddMapBeforeFlat first = ")
    rddMapBeforeFlat.collect().foreach(arr => println(arr(0)))
    // Note: rddMapBeforeFlat (RDD of arrays) and its collect() (array of
    // arrays) iterate over the same per-line arrays.

    // Release the local Spark context.
    spark.stop()
  }
}
输出:
map =
[Ljava.lang.String;@baf5726
[Ljava.lang.String;@632ec19f
[Ljava.lang.String;@2f4c5520
[Ljava.lang.String;@3e12fa01
[Ljava.lang.String;@4ff58c82
[Ljava.lang.String;@5a428eda
[Ljava.lang.String;@176a35e7
book
monkey
woman
man
banana
book
spark
flatmap =
flatSparseData.foreach =
book
banana
monkey
man
woman
book
man
monkey
banana
book
book
man
spark
spark
home
flatSparseData.collect() =
book
banana
monkey
man
woman
book
man
monkey
banana
book
book
man
spark
spark
home
(s_0 = ,book)
(m_0 = ,(book,1))
使用Array List =
rdd =
coffee panda
happy panda
happiest panda party
rdd1 =
coffee panda
happy panda
happiest panda party
rdd collect() =
coffee panda
happy panda
happiest panda party
rdd1 collect() =
coffee panda
happy panda
happiest panda party
rddString flatMap =
coffee
panda
happy
panda
happiest
panda
party
rddString flatMap collect =
coffee
panda
happy
panda
happiest
panda
party
rddMapAfterFlat foreach =
(coffee,1)
(panda,1)
(happy,1)
(panda,1)
(happiest,1)
(panda,1)
(party,1)
rddMapAfterFlat collect foreach =
(coffee,1)
(panda,1)
(happy,1)
(panda,1)
(happiest,1)
(panda,1)
(party,1)
rddMapBeforeFlat foreach =
[Ljava.lang.String;@5bf646e7
[Ljava.lang.String;@68665502
[Ljava.lang.String;@11fe40f5
coffee
happy
happiest
(coffee = ,coffee)
rddMapBeforeFlat first =
coffee
happy
happiest