Creating RDDs
Creating an RDD from a collection
Method 1: parallelize
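The session below assumes arr and list were already defined in the shell; the original transcript does not show them, so the values here are a minimal hypothetical sketch:
scala> val arr = Array(1, 2, 3, 4, 5)    // hypothetical sample data
scala> val list = List(1, 2, 3, 4, 5)    // hypothetical sample data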
scala> val paraRDD = sc.parallelize(arr)
paraRDD: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[0] at parallelize at <console>:26
scala> val listRDD = sc.parallelize(list)
listRDD: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[1] at parallelize at <console>:26
Method 2: makeRDD
scala> val paraRDD = sc.makeRDD(arr)
paraRDD: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[2] at makeRDD at <console>:26
scala> val listRDD = sc.makeRDD(list)
listRDD: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[3] at makeRDD at <console>:26
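For plain Scala collections, makeRDD simply delegates to parallelize, so the two methods are interchangeable here. Both also accept an optional numSlices argument controlling how many partitions the resulting RDD has; a minimal sketch (the partition count 4 and the name paraRDD4 are illustrative):
scala> val paraRDD4 = sc.parallelize(arr, 4)   // explicitly request 4 partitions
scala> paraRDD4.getNumPartitions               // returns 4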
Creating an RDD from a file system
scala> val localFile = sc.textFile("/root/app/sparktest/test.txt")
localFile: org.apache.spark.rdd.RDD[String] = /root/app/sparktest/test.txt MapPartitionsRDD[7] at textFile at <console>:24
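textFile accepts any Hadoop-supported URI (local paths, HDFS, S3, etc.) and an optional minPartitions argument; a sketch assuming a hypothetical HDFS path:
scala> val hdfsFile = sc.textFile("hdfs://namenode:9000/data/test.txt", 4)   // hypothetical path, minPartitions = 4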
Top-N implementation
scala> val filerdd = sc.textFile("/root/app/sparktest/test2.txt")
filerdd: org.apache.spark.rdd.RDD[String] = /root/app/sparktest/test2.txt MapPartitionsRDD[1] at textFile at <console>:24
scala> val lines = filerdd.flatMap(_.split("\\s+")).map((_,1)).reduceByKey(_+_).map(t=>(t._2,t._1)).sortByKey(false).map(t=>(t._2,t._1))
lines: org.apache.spark.rdd.RDD[(String, Int)] = MapPartitionsRDD[9] at map at <console>:25
scala> lines.take(3).foreach(println)
(spark,6)
(hadoop,4)
(java,3)
#filerdd.flatMap(_.split("\\s+")): split each line on whitespace to produce a dataset of words
#map((_,1)): turn each word into a (key, value) pair, forming a PairRDD
#reduceByKey(_+_): add up the values for each identical key
#map(t=>(t._2,t._1)): swap key and value so the count becomes the key
#sortByKey(false): sort by key (the count) in descending order
#map(t=>(t._2,t._1)): swap key and value back to (word, count) after sorting
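As an alternative to the swap-sort-swap chain above, the same Top-N result can be obtained with RDD.top and a custom Ordering; a minimal sketch against the same input file (the name counts is introduced here for clarity):
scala> val counts = filerdd.flatMap(_.split("\\s+")).map((_, 1)).reduceByKey(_ + _)
scala> counts.top(3)(Ordering.by(_._2)).foreach(println)   // top 3 pairs by count, descending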