Scala FlatMap与Map操作

Scala 中 RDD 的 flatMap 与 map 操作

数据文件:
words.txt

book banana
monkey man
woman book
man monkey
banana book
book man
spark spark home

测试代码:

package com.saker.spark.rdd
import org.apache.spark.SparkContext
import org.apache.spark.sql.SparkSession
import org.apache.spark.SparkConf
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.mllib.classification.SVMWithSGD
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.linalg.Vector
/**
 * Side-by-side demonstration of `map` and `flatMap` on Spark RDDs.
 *
 * `map` yields exactly one output element per input element, so splitting each
 * line produces an `RDD[Array[String]]` (one array per line). `flatMap`
 * flattens the per-line arrays, so the same split produces an `RDD[String]`
 * (one element per word).
 */
object FlatMapAndMap {

  /**
   * Runs the demonstration and prints every intermediate result to stdout.
   *
   * @param args optional; `args(0)` overrides the default input file path
   *             (`D:\words.txt`). With no arguments the default is used.
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
    // NOTE(review): the application name mentions SVM although this program only
    // exercises map/flatMap; it is kept unchanged here — consider renaming.
    val spark = SparkSession.builder()
      .appName("SimpleSVMTest")
      .master("local")
      .config(conf)
      .getOrCreate()

    // Always release the session, even if one of the jobs below throws.
    try {
      val sc = spark.sparkContext
      sc.setLogLevel("WARN")

      // Load the input file: one RDD element per line of text.
      val infile = args.headOption.getOrElse("D:\\words.txt")
      val data = sc.textFile(infile) // data: RDD[String]

      // ---- map ----
      println("map = ")
      // One Array[String] per input line.
      val sparseData = data.map(line => line.split(" ")) // sparseData: RDD[Array[String]]
      // Printing an Array directly shows its JVM identity string
      // (e.g. "[Ljava.lang.String;@..."), not its contents.
      // RDD.foreach runs inside the tasks; with master("local") that output
      // lands on this console.
      sparseData.foreach(arr => println(arr))
      // First word of every line.
      sparseData.foreach(arr => println(arr(0)))

      // ---- flatMap ----
      println("flatmap = ")
      // The per-line arrays are flattened: every word becomes its own element.
      val flatSparseData = data.flatMap(line => line.split(" ")) // flatSparseData: RDD[String]
      println("flatSparseData.foreach = ")
      flatSparseData.foreach(println)

      println("flatSparseData.collect() = ")
      // Collect once and reuse the local array instead of launching a second job.
      val arrStrFlatSparseData = flatSparseData.collect() // Array[String]
      arrStrFlatSparseData.foreach(println)

      val s_0 = arrStrFlatSparseData(0)
      // Explicit tuple: prints "(s_0 = ,<value>)" without relying on argument auto-tupling.
      println(("s_0 = ", s_0))

      // flatMap followed by map: pair every word with the count 1.
      val mapAfterFlatMapSparseData = flatSparseData.map(onestr => (onestr, 1)) // RDD[(String, Int)]
      val m_0 = mapAfterFlatMapSparseData.collect()(0)
      println(("m_0 = ", m_0))

      // ---- RDDs built from an in-memory List ----
      println("使用Array List = ")
      val rdd = sc.parallelize(List("coffee panda", "happy panda", "happiest panda party")) // RDD[String]
      val seq = List("coffee panda", "happy panda", "happiest panda party")
      val rdd1 = sc.makeRDD(seq) // RDD[String]
      println("rdd = ")
      rdd.foreach(println)
      println("rdd1 = ")
      rdd1.foreach(println)

      // collect() brings the elements back to the driver as a local Array[String].
      println("rdd collect() = ")
      rdd.collect().foreach(println)
      println("rdd1 collect() = ")
      rdd1.collect().foreach(println)

      // flatMap: one element per word.
      val rddString = rdd.flatMap(oneStr => oneStr.split(" ")) // RDD[String]
      println("rddString flatMap = ")
      rddString.foreach(println)
      println("rddString flatMap collect = ")
      rddString.collect().foreach(println)

      // flatMap -> map: (word, 1) pairs.
      val rddMapAfterFlat = rddString.map(word => (word, 1)) // RDD[(String, Int)]
      println("rddMapAfterFlat foreach = ")
      rddMapAfterFlat.foreach(println)
      println("rddMapAfterFlat collect foreach = ")
      rddMapAfterFlat.collect().foreach(println) // Array[(String, Int)]

      // map without flatMap: each element is the array of words of one string.
      val rddMapBeforeFlat = rdd.map(onestr => onestr.split(" ")) // RDD[Array[String]]
      println("rddMapBeforeFlat foreach = ")
      rddMapBeforeFlat.foreach(println) // prints each array's identity string, not its contents
      rddMapBeforeFlat.foreach(arr => println(arr(0))) // first word of each array

      // Collect once into an Array[Array[String]] (a 2-D-array-like structure) and
      // reuse it; iterating it visits the same arrays as iterating the RDD.
      val collectedArrays = rddMapBeforeFlat.collect()
      val coffee = collectedArrays(0)(0) // "coffee"
      println(("coffee = ", coffee))
      println("rddMapBeforeFlat first = ")
      collectedArrays.foreach(arr => println(arr(0)))
    } finally {
      spark.stop()
    }
  }
}

输出:

map = 
[Ljava.lang.String;@baf5726
[Ljava.lang.String;@632ec19f
[Ljava.lang.String;@2f4c5520
[Ljava.lang.String;@3e12fa01
[Ljava.lang.String;@4ff58c82
[Ljava.lang.String;@5a428eda
[Ljava.lang.String;@176a35e7
book
monkey
woman
man
banana
book
spark
flatmap = 
flatSparseData.foreach = 
book
banana
monkey
man
woman
book
man
monkey
banana
book
book
man
spark
spark
home
flatSparseData.collect() = 
book
banana
monkey
man
woman
book
man
monkey
banana
book
book
man
spark
spark
home
(s_0 = ,book)
(m_0 = ,(book,1))
使用Array List = 
rdd = 
coffee panda
happy panda
happiest panda party
rdd1 = 
coffee panda
happy panda
happiest panda party
rdd collect() = 
coffee panda
happy panda
happiest panda party
rdd1 collect() = 
coffee panda
happy panda
happiest panda party
rddString flatMap = 
coffee
panda
happy
panda
happiest
panda
party
rddString flatMap collect = 
coffee
panda
happy
panda
happiest
panda
party
rddMapAfterFlat foreach = 
(coffee,1)
(panda,1)
(happy,1)
(panda,1)
(happiest,1)
(panda,1)
(party,1)
rddMapAfterFlat collect foreach = 
(coffee,1)
(panda,1)
(happy,1)
(panda,1)
(happiest,1)
(panda,1)
(party,1)
rddMapBeforeFlat foreach = 
[Ljava.lang.String;@5bf646e7
[Ljava.lang.String;@68665502
[Ljava.lang.String;@11fe40f5
coffee
happy
happiest
(coffee = ,coffee)
rddMapBeforeFlat first = 
coffee
happy
happiest

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值