参考https://www.iteblog.com/archives/1395.html#map
[Map]
# map
# sc = spark context, parallelize creates an RDD from the passed object
x = sc.parallelize([1,2,3])
y = x.map(lambda x: (x,x**2))
# collect copies RDD elements to a list on the driver
print(x.collect())
print(y.collect())
[1, 2, 3]
[(1, 1), (2, 4), (3, 9)]
【FlatMap】
# flatMap
x = sc.parallelize([1,2,3])
y = x.flatMap(lambda x: (x, 100*x, x**2))
print(x.collect())
print(y.collect())
[1, 2, 3]
[1, 100, 1, 2, 200, 4, 3, 300, 9]
【mapParititions】
# mapPartitions
x = sc.parallelize([1,2,3], 2)
def f(iterator): yield sum(iterator)
y = x.mapPartitions(f)
# glom() flattens elements on the same partition
print(x.glom().collect())
print(y.glom()