RDD 算子分为 Transformation 算子(转换算子)和 Action 算子(行动算子)
一、Transformation 算子
通过转换算子,将一个 RDD 转换为另外一个 RDD,类似于 y=f(x) 的计算,比如:rddb = rdda.map(...)。
注意:转换算子不计算出结果,只记录作用到了哪些数据集上去,比如只记录了 map、filter,不算出结果。不触发提交作业,因此几乎不耗费多少计算时间。
常用的转换算子有:map、filter、flatMap、groupByKey、reduceByKey、sortByKey、union、distinct、join
1、map
# Transformation operator -- map, example 01:
# apply a function to every element, producing a new RDD of the same length.
import findspark
findspark.init()
from pyspark import SparkConf, SparkContext

config = SparkConf().setAppName("spark0401").setMaster("local[2]")
sc = SparkContext(conf=config)

numbers = [1, 2, 3, 4, 5]
source_rdd = sc.parallelize(numbers)
doubled_rdd = source_rdd.map(lambda n: n * 2)
print(doubled_rdd.collect())
# Result: [2, 4, 6, 8, 10]
# Transformation operator -- map, example 02:
# turn each word into a (word, 1) pair, the classic word-count preprocessing step.
import findspark
findspark.init()
from pyspark import SparkConf, SparkContext

config = SparkConf().setAppName("spark0401").setMaster("local[2]")
sc = SparkContext(conf=config)

animals = ["dog","tiger","lion","cat","panther","eagle"]
pair_rdd = sc.parallelize(animals).map(lambda word: (word, 1))
print(pair_rdd.collect())
# Result: [('dog', 1), ('tiger', 1), ('lion', 1), ('cat', 1), ('panther', 1), ('eagle', 1)]
2、filter
# Transformation operator -- filter:
# keep only the elements for which the predicate returns True.
import findspark
findspark.init()
from pyspark import SparkConf, SparkContext

config = SparkConf().setAppName("spark0401").setMaster("local[2]")
sc = SparkContext(conf=config)

numbers = [1, 2, 3, 4, 5]
base_rdd = sc.parallelize(numbers)
doubled = base_rdd.map(lambda n: n * 2)
large_only = doubled.filter(lambda n: n > 5)
print(large_only.collect())  # [6, 8, 10]
# Same pipeline expressed as a single chained call:
print(sc.parallelize(numbers).map(lambda n: n * 2).filter(lambda n: n > 5).collect())  # [6, 8, 10]
3、flatMap
# Transformation operator -- flatMap:
# map each element to a list, then flatten all lists into one RDD.
import findspark
findspark.init()
from pyspark import SparkConf, SparkContext

config = SparkConf().setAppName("spark0401").setMaster("local[2]")
sc = SparkContext(conf=config)

lines = ["hello spark","hello world","hello world"]
line_rdd = sc.parallelize(lines)
# Each line is split into words; the per-line word lists are flattened.
print(line_rdd.flatMap(lambda line: line.split(" ")).collect())
# ['hello', 'spark', 'hello', 'world', 'hello', 'world']
4、groupByKey
# Transformation operator -- groupByKey:
# gather all values sharing a key into one iterable per key.
import findspark
findspark.init()
from pyspark import SparkConf, SparkContext

config = SparkConf().setAppName("spark0401").setMaster("local[2]")
sc = SparkContext(conf=config)

lines = ["hello spark","hello world","hello world"]
word_pairs = sc.parallelize(lines).flatMap(lambda line: line.split(" ")).map(lambda w: (w, 1))
grouped = word_pairs.groupByKey()
# groupByKey yields (key, iterable) pairs; wrap the iterable in list() to print it readably.
print(grouped.map(lambda kv: {kv[0]: list(kv[1])}).collect())
# Output: [{'world': [1, 1]}, {'hello': [1, 1, 1]}, {'spark': [1]}]
5、reduceByKey
# Transformation operator -- reduceByKey:
# merge all values of each key with an associative binary function.
import findspark
findspark.init()
from pyspark import SparkConf, SparkContext

config = SparkConf().setAppName("spark0401").setMaster("local[2]")
sc = SparkContext(conf=config)

lines = ["hello spark", "hello world", "hello world"]
word_pairs = sc.parallelize(lines).flatMap(lambda line: line.split(" ")).map(lambda w: (w, 1))
print(word_pairs.collect())
# [('hello', 1), ('spark', 1), ('hello', 1), ('world', 1), ('hello', 1), ('world', 1)]
word_counts = word_pairs.reduceByKey(lambda left, right: left + right)  # sum the counts per word
print(word_counts.collect())
# [('world', 2), ('hello', 3), ('spark', 1)]
6、sort
# Transformation operator -- sortByKey:
# sort a pair RDD by its keys; to sort by count, swap key and value first.
import findspark
findspark.init()
from pyspark import SparkConf, SparkContext

config = SparkConf().setAppName("spark0401").setMaster("local[2]")
sc = SparkContext(conf=config)

lines = ["hello spark", "hello world", "hello world"]
word_pairs = sc.parallelize(lines).flatMap(lambda line: line.split(" ")).map(lambda w: (w, 1))
word_counts = word_pairs.reduceByKey(lambda left, right: left + right)
# Swap (word, count) -> (count, word) so sortByKey orders by the count.
print(word_counts.map(lambda kv: (kv[1], kv[0])).sortByKey().collect())
# [(1, 'spark'), (2, 'world'), (3, 'hello')]
# Swap back to (word, count) after sorting.
print(word_counts.map(lambda kv: (kv[1], kv[0])).sortByKey().map(lambda kv: (kv[1], kv[0])).collect())
# [('spark', 1), ('world', 2), ('hello', 3)]
7、union
# Transformation operator -- union:
# concatenate two RDDs; duplicates are kept.
import findspark
findspark.init()
from pyspark import SparkConf, SparkContext

config = SparkConf().setAppName("spark0401").setMaster("local[2]")
sc = SparkContext(conf=config)

left = sc.parallelize([1,2,3])
right = sc.parallelize([3,4,5])
print(left.union(right).collect())  # [1, 2, 3, 3, 4, 5]
8、distinct
# Transformation operator -- distinct:
# remove duplicate elements from an RDD.
import findspark
findspark.init()
from pyspark import SparkConf, SparkContext

config = SparkConf().setAppName("spark0401").setMaster("local[2]")
sc = SparkContext(conf=config)

left = sc.parallelize([1, 2, 3])
right = sc.parallelize([3, 4, 2])
# Union keeps duplicates; distinct removes them (result order is not guaranteed).
print(left.union(right).distinct().collect())  # e.g. [4, 1, 2, 3]
9、join
# Transformation operator -- join family:
# combine two pair RDDs on their keys (inner / left outer / right outer / full outer).
import findspark
findspark.init()
from pyspark import SparkConf, SparkContext

config = SparkConf().setAppName("spark0401").setMaster("local[2]")
sc = SparkContext(conf=config)

left = sc.parallelize([("A","a1"),("C","c1"),("D","d1"),("F","f1"),("F","f2")])
right = sc.parallelize([("A","a2"),("C","c2"),("C","c3"),("E","e1")])
# Inner join: only keys present on both sides.
print(left.join(right).collect())
# [('A', ('a1', 'a2')), ('C', ('c1', 'c2')), ('C', ('c1', 'c3'))]
# Left outer join: every key from the left; missing right values become None.
print(left.leftOuterJoin(right).collect())
# [('A', ('a1', 'a2')), ('F', ('f1', None)), ('F', ('f2', None)), ('C', ('c1', 'c2')), ('C', ('c1', 'c3')), ('D', ('d1', None))]
# Right outer join: every key from the right; missing left values become None.
print(left.rightOuterJoin(right).collect())
# [('A', ('a1', 'a2')), ('C', ('c1', 'c2')), ('C', ('c1', 'c3')), ('E', (None, 'e1'))]
# Full outer join: every key from either side.
print(left.fullOuterJoin(right).collect())
# [('A', ('a1', 'a2')), ('F', ('f1', None)), ('F', ('f2', None)), ('C', ('c1', 'c2')), ('C', ('c1', 'c3')), ('D', ('d1', None)), ('E', (None, 'e1'))]
二、Action 算子
return a value to the driver program after running a computation on the dataset
行动算子负责计算出结果,会触发 SparkContext 提交 Job 作业,因此比较耗费时间
常用的行动算子有:count、collect、take、max、min、sum、reduce、foreach、saveAsTextFile
# Action operators: trigger actual computation and return a value to the driver.
import findspark
findspark.init()
from pyspark import SparkConf, SparkContext

config = SparkConf().setAppName("spark0401").setMaster("local[2]")
sc = SparkContext(conf=config)

values = [1,2,3,4,5,6,7,8,9,10]
rdd = sc.parallelize(values)
print(rdd.collect())              # [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
print(rdd.count())                # 10
print(rdd.take(4))                # [1, 2, 3, 4]
print(rdd.max())                  # 10
print(rdd.min())                  # 1
print(rdd.sum())                  # 55
print(rdd.reduce(lambda x, y: x + y))  # 55
# foreach runs the function on the workers; prints 1..10, though the
# order across partitions is not guaranteed.
rdd.foreach(lambda x: print(x))