package dt.spark

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

/**
 * Demonstrates common RDD transformations — map, filter, flatMap, join,
 * groupByKey and cogroup — each in its own small method, run against a
 * local SparkContext.
 */
object Transformation {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("hehadf").setMaster("local")
    val sc = new SparkContext(conf)
    try {
      mapTransformation(sc)
      filterTransformation(sc)
      flatMapTransformation(sc)
      joinTransformation(sc)
      groupByKeyTransformation(sc) // was defined but never invoked in the original
      cogroupTransformation(sc)
    } finally {
      sc.stop() // release the SparkContext — the original leaked it
    }
  }

  /** Doubles each element of 1..10 and prints the results. */
  def mapTransformation(sc: SparkContext): Unit = {
    val nums = sc.parallelize(1 to 10)
    val mapped = nums.map(item => item * 2)
    mapped.collect().foreach(println)
  }

  /** Keeps only the even numbers of 1..10; the filter predicate must return Boolean. */
  def filterTransformation(sc: SparkContext): Unit = {
    val nums = sc.parallelize(1 to 10)
    val filtered = nums.filter(item => item % 2 == 0)
    filtered.collect().foreach(println)
  }

  /**
   * Splits each line into words. Conceptually: map each line to an array of
   * words, then flatten all those arrays into a single RDD of words.
   */
  def flatMapTransformation(sc: SparkContext): Unit = {
    // parallelize takes a Seq (here an Array) and distributes it as an RDD.
    val data = Array("jj sf", "haha f er ", "yyzz sdf d")
    val lines = sc.parallelize(data)
    val words = lines.flatMap(line => line.split(" "))
    words.collect().foreach(println)
  }

  /** Inner-joins two pair RDDs on their Int keys; prints each (key, (left, right)). */
  def joinTransformation(sc: SparkContext): Unit = {
    val left = Array((1, "xx"), (2, "yy"), (3, "zz"))
    val right = Array((1, "aa"), (2, "bb"), (3, "cc"))
    val leftRdd = sc.parallelize(left)
    val rightRdd = sc.parallelize(right)
    val joined = leftRdd.join(rightRdd)
    joined.collect().foreach(pair => println(pair._1 + " " + pair._2))
  }

  /** Groups values sharing the same key: (1,2) and (1,4) collapse to 1 -> [2, 4]. */
  def groupByKeyTransformation(sc: SparkContext): Unit = {
    val data = Array((1, 2), (2, 3), (3, 4), (1, 4))
    val grouped = sc.parallelize(data).groupByKey()
    grouped.collect().foreach(println)
  }

  /**
   * Cogroups two pair RDDs: for each key, pairs the Iterable of values from
   * each side; a key present on only one side gets an empty Iterable for the other.
   */
  def cogroupTransformation(sc: SparkContext): Unit = {
    // The original data mixed value types ((1,(1,2)) next to (2,3), and (3," ")
    // next to Int values), silently widening the pair value type to Any.
    // Use homogeneous Int values so both RDDs are cleanly (Int, Int).
    val data = Array((1, 2), (2, 3), (3, 4), (1, 4))
    val data1 = Array((1, 9), (2, 7), (3, 6), (1, 9), (8, 9))
    val rdd = sc.parallelize(data)
    val rdd1 = sc.parallelize(data1)
    val cogrouped = rdd.cogroup(rdd1)
    cogrouped.collect().foreach(pair => println(pair._1 + " " + pair._2))
  }
}
// RDD Transformation Ops
// (Adapted from a blog post; the original page's "latest recommended article" metadata dated it 2021-11-24 09:00:00.)