# Locate the local Spark installation and put pyspark on sys.path.
import findspark
findspark.init()
import pyspark
# Driver entry point for RDD operations; appName is shown in the Spark UI.
sc = pyspark.SparkContext(appName='rdd_excercise_1')
# 1. Create an RDD of (account_id, amount) pairs.
# Raw (account_id, amount) transaction pairs. Two records are deliberately
# malformed for the later exercises: "B10009" lacks the "SB" prefix, and
# "SB10011" carries a negative amount.
transactions = [
    ("SB10001", 1000),
    ("SB10002", 1200),
    ("SB10003", 8000),
    ("SB10004", 4000),
    ("SB10005", 300),
    ("SB10006", 10000),
    ("SB10007", 500),
    ("SB10008", 56),
    ("SB10009", 1000),
    ("B10009", 1000),
    ("SB10011", -1000),
]
acTransList = sc.parallelize(transactions)
acTransList.collect()
# Output of collect():
# [('SB10001', 1000),
#  ('SB10002', 1200),
#  ('SB10003', 8000),
#  ('SB10004', 4000),
#  ('SB10005', 300),
#  ('SB10006', 10000),
#  ('SB10007', 500),
#  ('SB10008', 56),
#  ('SB10009', 1000),
#  ('B10009', 1000),
#  ('SB10011', -1000)]
# 2. Find all well-formed records: account id starts with "SB" and amount > 0.
# Good records: "SB" prefix AND a strictly positive amount.
# BUG FIX: the original predicate was `x[1] and x[0].startswith("SB")`,
# which tests x[1] for truthiness — any non-zero number is truthy, so the
# bad record ('SB10011', -1000) slipped through. Compare against 0 instead.
acTransList.filter(lambda x: x[1] > 0 and x[0].startswith("SB")).collect()
# 3. Find all records with an amount greater than 1000.
# Pre-filter to well-formed records, then keep amounts above 1000.
# BUG FIX: `lambda x: x[1]` only tests truthiness, so negative amounts
# (e.g. -1000) passed the "good record" pre-filter; use an explicit > 0.
acTransList_3 = acTransList.filter(lambda x: x[1] > 0).filter(lambda x: x[0].startswith("SB"))
acTransList_3.filter(lambda x: x[1] > 1000).collect()
# Output: [('SB10002', 1200), ('SB10003', 8000), ('SB10004', 4000), ('SB10006', 10000)]
# 4. Find all malformed records.
# Bad records are the complement of rule 2: non-positive amount OR a
# prefix other than "SB". startswith is equivalent to the slice test
# x[0][0:2] == 'SB' and reads more clearly; the overall predicate is the
# De Morgan negation of (amount > 0 and prefix == "SB").
acTransList.filter(lambda rec: not (rec[1] > 0 and rec[0].startswith('SB'))).collect()
# Output: [('B10009', 1000), ('SB10011', -1000)]
# 5. Find records whose amount is equal to or less than 0.
# Records with a zero or negative transaction amount.
acTransList.filter(lambda rec: rec[1] <= 0).collect()
# Output: [('SB10011', -1000)]
# 6. Find records not prefixed with "SB", and records with amount <= 0, then union them.
# Union of the non-positive-amount records with the wrong-prefix records.
# BUG FIX: the original rdd2 line was missing its closing parenthesis,
# which made the following `rdd3 = ...` line raise a SyntaxError.
rdd1 = acTransList.filter(lambda x: x[1] <= 0)
rdd2 = acTransList.filter(lambda x: x[0][0:2] != 'SB')
rdd3 = rdd1.union(rdd2)
rdd3.collect()
# Original session error, caused by the missing ")" on the rdd2 line:
#   File "<ipython-input-138-a352a6850e3a>", line 3
#     rdd3 = rdd1.union(rdd2)
#                           ^
#   SyntaxError: invalid syntax
# 7. Compute the sum of all transaction amounts.
# Total of all amounts; values() extracts the second element of each
# (account, amount) pair, equivalent to map(lambda x: x[1]).
acTransList.values().sum()
# Output: 26056
# 8. Largest and smallest transactions.
# Largest transaction amount (values() == map(lambda x: x[1]) on pair RDDs).
acTransList.values().max()
# Output: 10000
# Smallest transaction amount.
acTransList.values().min()
# Output: -1000