Spark RDD operators fall into two categories: Transformations and Actions.
Transformation: lazily evaluated. A Transformation only records metadata (the lineage); the actual computation starts only when an Action triggers a job.
Action: eagerly evaluated. An Action launches the computation and returns a result.
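A minimal sketch of this laziness (the values are illustrative):
scala> val nums = sc.parallelize(1 to 4)   // nothing runs yet
scala> val doubled = nums.map(_ * 2)       // Transformation: only lineage is recorded
scala> doubled.collect                     // Action: the job actually executes here
res0: Array[Int] = Array(2, 4, 6, 8)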
There are two ways to create an RDD:
1. From a file system: sc.textFile("/root/words.txt")
2. By parallelizing a Scala collection: val rdd1 = sc.parallelize(Array(1,2,3,4,5,6,7,8))
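A minimal sketch of the file-based path (the path is illustrative and must exist; the file is only read when an Action runs):
scala> val lines = sc.textFile("/root/words.txt")
lines: org.apache.spark.rdd.RDD[String] = /root/words.txt MapPartitionsRDD[1] at textFile at <console>:24
scala> lines.count   // Action: triggers the actual file read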
parallelize
Definition: def parallelize[T](seq: Seq[T], numSlices: Int)(implicit evidence$1: scala.reflect.ClassTag[T]): org.apache.spark.rdd.RDD[T]
scala> val rdd1 = sc.parallelize(List(1,2,3,4,5,6,7), 2)  // split into two partitions
scala> rdd1.map(_*2).collect
res6: Array[Int] = Array(2, 4, 6, 8, 10, 12, 14)
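The numSlices argument controls the partition count, which you can verify:
scala> rdd1.partitions.length
res7: Int = 2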
makeRDD
Definition: def makeRDD[T](seq: Seq[(T, Seq[String])])(implicit evidence$3: scala.reflect.ClassTag[T]): org.apache.spark.rdd.RDD[T]
def makeRDD[T](seq: Seq[T],numSlices: Int)(implicit evidence$2: scala.reflect.ClassTag[T]): org.apache.spark.rdd.RDD[T]
scala> val rdd3=sc.makeRDD(1 to 10)
rdd3: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[10] at makeRDD at <console>:24
scala> rdd3.collect
res40: Array[Int] = Array(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)
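The first makeRDD overload pairs each element with a list of preferred host locations (one element per partition); a minimal sketch, with hostnames purely illustrative:
scala> val rdd4 = sc.makeRDD(Seq((1, Seq("host1")), (2, Seq("host2"))))
scala> rdd4.preferredLocations(rdd4.partitions(0))
res41: Seq[String] = List(host1)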
map // processes one element (e.g. one line) at a time
Definition: def map[U](f: T => U)(implicit evidence$3: scala.reflect.ClassTag[U]): org.apache.spark.rdd.RDD[U]
scala> val rdd1 = sc.parallelize(List(1,2,3,4,5,6,7), 2)  // split into two partitions
scala> rdd1.map(_*2).collect
res6: Array[Int] = Array(2, 4, 6, 8, 10, 12, 14)
scala> val rdd = sc.parallelize(List(1,2,9,7,4))
scala> rdd.map(x => (x, 1)).collect  // builds key-value pairs; the tuple needs parentheses
res17: Array[(Int, Int)] = Array((1,1), (2,1), (9,1), (7,1), (4,1))
mapPartitions // processes one whole partition (a batch of data) at a time
Definition: def mapPartitions[U](f: Iterator[T] => Iterator[U], preservesPartitioning: Boolean)(implicit evidence$6: scala.reflect.ClassTag[U]): org.apache.spark.rdd.RDD[U]
scala> var rdd1 = sc.makeRDD(1 to 5,4)
rdd1: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[2] at makeRDD at <console>:24
scala> var rdd3 = rdd1.mapPartitions { x => {
     |   var result = List[Int]()
     |   var i = 0
     |   while (x.hasNext) {
     |     i += x.next()           // sum all elements in this partition
     |   }
     |   result.::(i).iterator     // return the partition sum as a one-element iterator
     | }}
rdd3: org.apache.spark.rdd.RDD[Int] = MapPartitionsRDD[3] at mapPartitions at <console>:25
scala> rdd3.collect  // the sum of 1 to 5, computed per partition across four partitions
res2: Array[Int] = Array(1, 2, 3, 9)
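mapPartitions pays off when per-element setup is expensive, because the setup can run once per partition instead of once per element. A hedged sketch — makeConnection and lookup are hypothetical helpers, not Spark API:
rdd1.mapPartitions { iter =>
  val conn = makeConnection()                          // hypothetical: open once per partition
  val results = iter.map(x => lookup(conn, x)).toList  // materialize before closing the connection
  conn.close()
  results.iterator
}
Note the .toList: the iterator is lazy, so results must be materialized before the connection is closed.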
mapPartitionsWithIndex
Definition: def mapPartitionsWithIndex[U](f: (Int, Iterator[T]) => Iterator[U], preservesPartitioning: Boolean)(implicit evidence$9: scala.reflect.ClassTag[U]): org.apache.spark.rdd.RDD[U]
scala> var rdd1 = sc.makeRDD(1 to 5, 2)  // rdd1 has two partitions
scala> var rdd2 = rdd1.mapPartitionsWithIndex { (x, iter) => {
     |   var result = List[String]()
     |   var i = 0
     |   while (iter.hasNext) {
     |     i += iter.next()                 // sum this partition's elements
     |   }
     |   result.::(x + "|" + i).iterator    // prepend "index|sum" and return as an iterator
     | }}
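Collecting shows each partition index paired with its sum; with the default slicing, 1 to 5 splits into [1,2] and [3,4,5], so this sketch would produce:
scala> rdd2.collect
res3: Array[String] = Array(0|3, 1|12)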