# # parallelize: distribute a local collection into an RDD
# data = [1, 2, 3, 4, 5]
# distData = sc.parallelize(data, numSlices=10)  # numSlices is the number of partitions; pick it to match the cluster
# # textFile: read external data
# rdd = sc.textFile("./c2.txt")  # reads the file line by line into an RDD
# print(rdd.collect())
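# All snippets below assume a live SparkContext bound to `sc`; a minimal
# local setup (a sketch, assuming a plain PySpark install; the master
# "local[*]" and the app name "rdd-demo" are illustrative):
from pyspark import SparkContext
sc = SparkContext("local[*]", "rdd-demo")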
# map: apply a function to every element of the dataset
def my_add(l):
    return (l, l)
data = [1, 2, 3, 4, 5, 7]
distData = sc.parallelize(data)  # distribute the dataset
result = distData.map(my_add)
print(result.collect())  # collect() brings the elements back to the driver as a local list
for ii in result.collect():
    print(ii[0])
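# The same map written inline with a lambda (a sketch; equivalent to my_add):
result = distData.map(lambda l: (l, l))
print(result.collect())  # [(1, 1), (2, 2), (3, 3), (4, 4), (5, 5), (7, 7)]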
# # filter: keep only the elements that satisfy a predicate
# def my_add(l):
#     result = False
#     if l > 2:
#         result = True
#     return result
# data = [1, 2, 3, 4, 5]
# distData = sc.parallelize(data)  # distribute the dataset into partitions
# result = distData.filter(my_add)
# print(result.collect())  # the elements that passed the predicate, as a local list
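# # The predicate can also be an inline lambda (a sketch, same behavior):
# result = distData.filter(lambda l: l > 2)
# print(result.collect())  # [3, 4, 5]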
# # zip: pair corresponding elements of two RDDs into tuples
# x = sc.parallelize(range(0, 5))
# y = sc.parallelize(range(1000, 1005))
# print(x.zip(y).collect())
# # union: concatenate two RDDs
# print(x.union(x).collect())
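# # Expected output; note that zip assumes both RDDs have the same number of
# # partitions and the same number of elements per partition, while union
# # simply concatenates and keeps duplicates:
# # x.zip(y)   -> [(0, 1000), (1, 1001), (2, 1002), (3, 1003), (4, 1004)]
# # x.union(x) -> [0, 1, 2, 3, 4, 0, 1, 2, 3, 4]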
# # Action operations
# # collect: return the RDD's data to the driver
# rdd = sc.parallelize(range(1, 10))
# print(rdd)
# print(rdd.collect())
# # collectAsMap: treat each element as a (key, value) tuple and return the RDD as a dict keyed by the first component
# m = sc.parallelize([('a', 2), (3, 4)]).collectAsMap()
# print(m['a'])
# print(m[3])
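# # Note: collectAsMap pulls the whole RDD to the driver; if a key repeats,
# # the value seen last wins when the dict is built (a small sketch):
# d = sc.parallelize([('a', 1), ('a', 2)]).collectAsMap()
# print(d['a'])  # 2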
# # groupBy: group the RDD's elements by the key computed by the given function
# rdd = sc.parallelize([1, 1, 2, 3, 5, 8])
# def fun(i):
#     return i % 2
# result = rdd.groupBy(fun).collect()
# print([(x, sorted(y)) for (x, y) in result])
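# # Expected contents (the order of the groups themselves may vary):
# # [(0, [2, 8]), (1, [1, 1, 3, 5])]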
# # reduce: aggregate the dataset with a binary operator
# rdd = sc.parallelize(range(1, 10))
# result = rdd.reduce(lambda a, b: a + b)
# print(result)
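# # reduce is an action: here it returns 45 (the sum of 1..9) to the driver.
# # For this particular aggregation PySpark also provides a built-in:
# print(rdd.sum())  # 45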