RDD转换运算
# Create intRDD from a local list of integers (note the duplicated 5).
# NOTE(review): `sc` is not defined in this snippet - presumably the
# SparkContext supplied by the PySpark shell/notebook; confirm.
intRDD = sc.parallelize([3,1,2,5,5,6])
# Display the contents of the RDD as a list
intRDD.collect()
[3, 1, 2, 5, 5, 6]
# Create stringRDD from a local list of strings.
# NOTE(review): `sc` is not defined in this snippet - presumably the
# SparkContext supplied by the PySpark shell/notebook; confirm.
stringRDD = sc.parallelize(['apple','pen','banana'])
# Display the contents of the RDD as a list
stringRDD.collect()
['apple', 'pen', 'banana']
# map: apply an operation to every element
def addOne(x):
    """Return ``x`` multiplied by 3.

    NOTE(review): the name is misleading - the value is tripled, not
    incremented by one. The name and the tripling behaviour are both kept
    so that existing references to ``addOne`` keep producing the same
    results.
    """
    return x * 3
# Transform every element: first with the named function, then with
# anonymous functions (one numeric, one string-prefixing).
intRDD.map(addOne).collect()
intRDD.map(lambda n: n + 1).collect()
stringRDD.map(lambda s: 'first:' + s).collect()
[9, 3, 6, 15, 15, 18]
[4, 2, 3, 6, 6, 7]
['first:apple', 'first:pen', 'first:banana']
# filter: keep only the elements for which the predicate is true
# Numbers greater than 2
intRDD.filter(lambda n: n > 2).collect()
# Numbers strictly between 0 and 5
intRDD.filter(lambda n: n > 0 and n < 5).collect()
# Numbers below 3 or at least 5
intRDD.filter(lambda n: n < 3 or n >= 5).collect()
# Strings containing the letter 'a'
stringRDD.filter(lambda s: 'a' in s).collect()
[3, 5, 5, 6]
[3, 1, 2]
[1, 2, 5, 5, 6]
['apple', 'banana']
# distinct: remove duplicate elements (intRDD contains 5 twice)
intRDD.distinct().collect()
# randomSplit: randomly divide the whole collection into several RDDs
# according to the given proportions (here 0.4 and 0.6)
sRDD = intRDD.randomSplit([0.4,0.6])
# Show the contents of the first split
sRDD[0].collect()
# Show the contents of the second split
sRDD[1].collect()
# groupBy: split the data into several groups according to the key
# returned by the anonymous function ("even" or "odd" here)
gRDD = intRDD.groupBy(lambda n: "odd" if n % 2 else "even").collect()
# Show the collected (key, group) result
gRDD
[1, 5, 2, 6, 3]
[2, <