import org.apache.spark.{SparkConf, SparkContext}
object test {
  def main(args: Array[String]): Unit = {
    // Spark configuration: run locally with the application name "test"
    val conf = new SparkConf().setMaster("local").setAppName("test")
    // SparkContext is the entry point for building RDDs
    val sc = new SparkContext(conf)
    // parallelize distributes a local collection as an RDD (generally used only for testing)
    val rdd = sc.parallelize(List(
      (1, "apple:hate", Array("a", "b", "c")),
      (2, "banana:like", Array("d", "e", "f")),
      (3, "strawberry:love", Array("g", "h", "i"))))
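    // Outside of tests you would usually build the RDD from storage instead,
    // e.g. with sc.textFile; a minimal sketch, assuming a hypothetical local
    // path (left commented out so this demo still runs as-is):
    // val lines = sc.textFile("data/input.txt") // RDD[String], one element per line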
    // Print the whole RDD; collect pulls every element back to the driver
    println(rdd.collect.toBuffer)
    // Print each tuple field by field; toBuffer renders the Array readably,
    // since an Array's default toString is only a JVM reference
    rdd.collect.foreach { x =>
      println(x._1 + ">>>" + x._2 + ">>>" + x._3.toBuffer)
    }
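    // The loop above prints:
    // 1>>>apple:hate>>>ArrayBuffer(a, b, c)
    // 2>>>banana:like>>>ArrayBuffer(d, e, f)
    // 3>>>strawberry:love>>>ArrayBuffer(g, h, i)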
    // map: transform every tuple, destructured with a pattern match
    val rdd2 = rdd.map { case (id: Int, name: String, information: Array[String]) =>
      val id2 = id + 1
      val name2 = name.split(":")                  // split: String -> Array[String]
      val information2 = information.mkString("#") // mkString: Array -> String
      (id2, name2, information2) // the last expression is the return value
    }
    println(rdd2.collect.toBuffer)
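    // Roughly: ArrayBuffer((2,[Ljava.lang.String;@...,a#b#c), (3,...,d#e#f), (4,...,g#h#i))
    // name2 is itself an Array, so it again prints as a bare reference;
    // append .toBuffer to it (as above) for readable output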
    // filter: keep only tuples whose incremented id is >= 3
    val rdd3 = rdd2.filter(x => x._1 >= 3)
    println(rdd3.collect.toBuffer)
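    // This drops the (2, apple...) element, leaving the tuples with ids 3 and 4.
    // Equivalently, the predicate can destructure the tuple with a pattern match,
    // mirroring the map above (a stylistic sketch, same behavior):
    // val rdd3b = rdd2.filter { case (id, _, _) => id >= 3 }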
    sc.stop() // shut down the SparkContext cleanly
  }
}