#-*-coding=utf-8 -*-
# Demonstration of common PySpark RDD transformations.
#
# Each example builds a small RDD with sc.parallelize(), applies one
# transformation and prints the collected result.  The trailing "# -> ..."
# comments show example output recorded by the original author; element
# order after shuffling operations (and for the set-literal inputs) is not
# guaranteed and may differ between runs or Spark versions.
#
# Every print() call takes a single argument, so the script produces the same
# output under both Python 2 and Python 3.
from pyspark import SparkConf, SparkContext

# Run Spark locally inside this process.
sc = SparkContext('local')

# map(func): apply func to every element, producing a new RDD.  Input and
# output elements correspond one-to-one.
print(sc.parallelize([1, 2, 3, 4]).map(lambda x: x * x).collect())  # -> [1, 4, 9, 16]
print(sc.parallelize(["hello world", "hi"]).map(lambda line: line.split(" ")).collect())
# -> [['hello', 'world'], ['hi']]

# filter(func): keep only the elements for which func returns a true value.
print(sc.parallelize([1, 2, 3, 4]).filter(lambda x: x > 2).collect())  # -> [3, 4]

# flatMap(func): like map, but the per-element results are flattened into a
# single sequence.
print(sc.parallelize(["hello world", "hi"]).flatMap(lambda line: line.split(" ")).collect())
# -> ['hello', 'world', 'hi']

# sample(withReplacement, fraction, seed): randomly sample roughly `fraction`
# of the data using random seed `seed`; withReplacement selects sampling
# with (True) or without (False) replacement.
print(sc.parallelize([1, 2, 3, 4, 5, 6, 7]).sample(True, 0.2, 1).collect())  # -> [1, 7, 7]

# union(RDD): union of two RDDs (duplicates are kept).
print(sc.parallelize([1, 2, 3]).union(sc.parallelize([3, 4, 5])).collect())
# -> [1, 2, 3, 3, 4, 5]

# intersection(RDD): elements present in both RDDs.
print(sc.parallelize([1, 2, 3]).intersection(sc.parallelize([3, 4, 5])).collect())  # -> [3]

# subtract(RDD): elements of the first RDD that are not in the second.
print(sc.parallelize([1, 2, 3]).subtract(sc.parallelize([3, 4, 5])).collect())  # -> [2, 1]

# cartesian(RDD): Cartesian product of datasets T and U, i.e. every (T, U)
# pairing of their elements.
print(sc.parallelize([1, 2, 3]).cartesian(sc.parallelize([3, 4, 5])).collect())
# -> [(1, 3), (1, 4), (1, 5), (2, 3), (2, 4), (2, 5), (3, 3), (3, 4), (3, 5)]

# distinct(): remove duplicate elements.
print(sc.parallelize([1, 2, 3, 3]).distinct().collect())  # -> [1, 2, 3]

# groupByKey(): on a dataset of (K, V) pairs, gather the values that share a
# key, returning (K, Iterable) pairs.
a = sc.parallelize({(1, 2), (3, 4), (3, 6)}).groupByKey().collect()
# -> [(1, Iterable), (3, Iterable)]
for key, values in a:
    # The grouped values are an iterable; materialise them for printing.
    print(str(key) + ":" + str(list(values)))  # -> 1:[2] ; 3:[4, 6]

# reduceByKey(func): on (K, V) pairs, combine all values that share a key
# with func, yielding a single value per key.
print(sc.parallelize({(1, 2), (3, 4), (3, 6)}).reduceByKey(lambda x, y: x + y).collect())
# -> [(1, 2), (3, 10)]

# sortByKey([ascending=True], [numTasks]): sort by key; ascending defaults to
# True, and True/False select ascending or descending order.
print(sc.parallelize({(2, 2), (1, 4), (3, 6)}).sortByKey().collect())
# -> [(1, 4), (2, 2), (3, 6)]

# join(otherDataset, [numTasks]): like a SQL join; on (K, V) and (K, W) pairs
# it returns (K, (V, W)) tuples.  Spark also supports outer joins:
# leftOuterJoin, rightOuterJoin and fullOuterJoin.  Examples:
print(sc.parallelize({(1, 2), (3, 4), (3, 6)}).join(sc.parallelize({(3, 7), (4, 8)})).collect())
# -> [(3, (4, 7)), (3, (6, 7))]
print(sc.parallelize({(1, 2), (3, 4), (3, 6)}).leftOuterJoin(sc.parallelize({(3, 7), (4, 8)})).collect())
# -> [(1, (2, None)), (3, (4, 7)), (3, (6, 7))]
print(sc.parallelize({(1, 2), (3, 4), (3, 6)}).rightOuterJoin(sc.parallelize({(3, 7), (4, 8)})).collect())
# -> [(4, (None, 8)), (3, (4, 7)), (3, (6, 7))]
print(sc.parallelize({(1, 2), (3, 4), (3, 6)}).fullOuterJoin(sc.parallelize({(3, 7), (4, 8)})).collect())
# -> [(4, (None, 8)), (1, (2, None)), (3, (4, 7)), (3, (6, 7))]

# cogroup(otherDataset, [numTasks]): on (K, V) and (K, W) pairs, returns
# (K, (Iterable, Iterable)) tuples.  This operation is also called groupWith.
a = sc.parallelize({(1, 2), (3, 4), (3, 6)}).cogroup(sc.parallelize({(3, 7), (4, 8)})).collect()
# -> [(4, (Iterable, Iterable)), (1, (Iterable, Iterable)), (3, (Iterable, Iterable))]
for key, (left_values, right_values) in a:
    # Each side is an iterable of the values found for `key` in that RDD.
    print(str(key) + ":" + str(list(left_values)) + "," + str(list(right_values)))
    # -> 4:[],[8] ; 1:[2],[] ; 3:[4, 6],[7]

# mapValues(func): apply func to the value of each (K, V) pair, leaving the
# key unchanged (here each value is expanded into a (value, 1) tuple).
print(sc.parallelize({("panda", 0), ("pink", 3)}).mapValues(lambda x: (x, 1)).collect())
# -> [('pink', (3, 1)), ('panda', (0, 1))]