- foreach()
# Action foreach(function) : Applies a function to all elements of this RDD.
# Runs on the executors for its side effect only (here: printing); the return
# value of the function is discarded. Element order is not guaranteed.
# NOTE(review): rdd6 is defined earlier in the file — presumably a pair RDD of
# (count, word) tuples, judging from the output below.
rdd6.foreach(print)
# (27, 'anyone')
# (27, 'anywhere')
# (27, 'at')
# (27, 'and')
- count()
# Action - count() : Return the number of elements in this RDD.
# Triggers a full job; the result is a plain Python int on the driver.
print("Count : ", rdd6.count())
# Count : 4
- first()
- max()
# Action - first() : Return the first element in this RDD.
firstRec = rdd6.first()
# firstRec is a (count, word) tuple — assumes rdd6 is a pair RDD; TODO confirm
# against the earlier definition.
print("First Record : ", firstRec[0], firstRec[1])
# First Record : 27 anyone
# Action - max(): Find the maximum item in this RDD.
# key=str compares elements by their string representation, so the ordering is
# lexicographic on str((count, word)) — which is why (27, 'at') wins here.
# (The original author had marked this line as unclear.)
datMax = rdd6.max(key=str)
print("Max Record : ", datMax[0], datMax[1])
# Max Record : 27 at
- reduce()
- take()
- collect()
- saveAsTextFile()
# Action - reduce(f): merges all elements of the RDD in parallel using a
# commutative & associative binary function.
# Here: counts are summed (x[0]+y[0]) while the word of the left operand is
# kept (x[1]) — four elements with count 27 give a total of 108.
totalWordCount = rdd6.reduce(lambda x,y : (x[0]+y[0], x[1]))
print("dataReduce Record : ", totalWordCount)
# dataReduce Record : (108, 'anyone')
# Action - take(m): brings the first m elements back to the driver as a plain
# Python list, so RDD methods such as foreach() cannot be called on the result
# directly; the list is re-parallelized here just to demonstrate foreach again.
firstThree = rdd6.take(3)
takenRdd = spark.sparkContext.parallelize(firstThree)
takenRdd.foreach(lambda rec: print(rec[0], '1231231231', rec[1]))
# 27 1231231231 anywhere
# 27 1231231231 at
# 27 1231231231 anyone
# Action - collect(): materialises the entire RDD on the driver as a Python
# list (use with care on large datasets); re-parallelized below only to show
# foreach over the collected data.
allRecords = rdd6.collect()
collectedRdd = spark.sparkContext.parallelize(allRecords)
collectedRdd.foreach(lambda pair: print(pair[0], '1231231231', pair[1]))
# 27 1231231231 anywhere
# 27 1231231231 at
# 27 1231231231 anyone
# 27 1231231231 and
# Action - saveAsTextFile(path): writes the RDD to the directory `path`, one
# part-NNNNN file per partition; raises an error if the path already exists.
# NOTE(review): this line uses rdd5 while every other example uses rdd6 —
# rdd5 is presumably defined earlier in the file; verify this is intentional.
rdd5.saveAsTextFile("wordCount")
- Other important methods
# Build (or reuse) a SparkSession: 3 local worker threads, app name shown in
# the Spark UI.
spark = SparkSession.builder\
.appName('SparkByExamples.com')\
.master("local[3]")\
.getOrCreate()
# Sample pair RDD of (key, value) tuples — note that key "B" appears three times.
inputRDD = spark.sparkContext.parallelize([("Z", 1), ("A", 20), ("B", 30), ("C", 40), ("B", 30), ("B", 60)])
# Sample numeric RDD spread over 4 partitions (contains duplicate 2s and 3s).
listRdd = spark.sparkContext.parallelize((1, 2, 3, 4, 5, 3, 2), 4)
- aggregate()
# aggregate – action
# aggregate(zeroValue, seqOp, combOp): seqOp folds each element into the
# accumulator within a partition; combOp then merges the per-partition
# accumulators on the driver.
# Explanation (Chinese blog post): https://blog.csdn.net/qingyang0320/article/details/51603243
# Example 1: accumulator is a (running_sum, running_count) tuple.
seqOp = (lambda x, y: (x[0] + y, x[1] + 1))
combOp = (lambda x, y: (x[0] + y[0], x[1] + y[1]))
sc = spark.sparkContext
res = sc.parallelize([1, 2, 3, 4]).aggregate((0, 0), seqOp, combOp)
print(type(res)) # <class 'tuple'>
# Compute the mean: total / count -> (1+2+3+4) / 4 = 2.5
print(res[0]/res[1])
- treeAggregate()
# treeAggregate – action: produces the same result as aggregate(), but is safer
# and more efficient: it takes an extra `depth` parameter and combines partial
# results in a multi-level tree on the executors, instead of pulling every
# partition's seqOp result straight to the driver (which can cause driver OOM
# when partial results are large or partitions are numerous).
'''
与aggregate不同的是treeAggregate多了depth的参数,其他参数含义相同。aggregate在执行完SeqOp后会将计算结果拿到driver端使用CombOp遍历一次
SeqOp计算的结果,最终得到聚合结果。而treeAggregate不会一次就Comb得到最终结果,SeqOp得到的结果也许很大,直接拉到driver可能会OutOfMemory,
因此它会先把分区的结果做局部聚合(reduceByKey),如果分区数过多时会做分区合并,之后再把结果拿到driver端做reduce。
'''
# Both ops are plain addition here, so the tree aggregation simply sums the
# elements of listRdd: 1+2+3+4+5+3+2 = 20.
seqOp= (lambda x, y: (x + y))
combOp= (lambda x, y: (x + y))
print("treeAggregate : ", listRdd.treeAggregate(0, seqOp, combOp))
- fold()
# fold – action
# fold(zeroValue, op): like reduce but with an initial value. NOTE: the zero
# value is applied once per partition AND once for the final merge — harmless
# here because the op is addition with zero.
from operator import add
print("fold : ", listRdd.fold(0, add))
# Equivalent lambda form; x starts from the zero value 0.
print(listRdd.fold(0, lambda x,y:x+y))
# reduce is similar but takes no zero value.
print(listRdd.reduce(lambda x,y: x+y))
# or, using operator.add:
print(listRdd.reduce(add))
'''
reduce和fold方法,是对同种元素类型数据的RDD进行操作,即必须同构。其返回值返回一个同样类型的新元素。
'''
# (Note above: reduce/fold operate on RDDs of a single element type — the data
# must be homogeneous — and return a new value of that same type.)
# treeReduce: reduce performed in a multi-level tree pattern, combining partial
# results on the executors before they reach the driver.
print("treeReduce : ", listRdd.treeReduce(add))
- countByValue()
# countByValue, countByValueApprox
# countByValue: groups the RDD by element value and returns a dict-like map of
# {value: occurrence_count} on the driver, e.g. {1: 1, 2: 2, 3: 2, 4: 1, 5: 1}
# for listRdd.
# countByValueApprox: same semantics, but returns an approximate result.
print("countByValue : ", listRdd.countByValue())