Spark Python API 函数学习:PySpark API (1)

PySpark.png

iteblog.png

如果想及时了解 Spark、Hadoop 相关信息,请关注微信公众号 iteblog_hadoop。

pyspark version # print Spark version

# Report the Spark version exposed by the active SparkContext.
spark_version = str(sc.version)
print("pyspark version:" + spark_version)

pyspark version:1.2.2

map

pyspark-page3.svg # map

# sc = spark context, parallelize creates an RDD from the passed object

# sc is the SparkContext; parallelize() distributes a local list as an RDD.
nums = sc.parallelize([1, 2, 3])
# map() pairs each element with its square (one output element per input).
squares = nums.map(lambda v: (v, v * v))
# collect() copies the RDD's elements back to a list on the driver.
print(nums.collect())
print(squares.collect())

[1, 2, 3]

[(1, 1), (2, 4), (3, 9)]

flatMap

pyspark-page4.svg # flatMap

# flatMap() emits several values per input element and flattens them
# into a single RDD (contrast with map(), which keeps one-per-input).
nums = sc.parallelize([1, 2, 3])
flattened = nums.flatMap(lambda v: (v, 100 * v, v * v))
print(nums.collect())
print(flattened.collect())

[1, 2, 3]

[1, 100, 1, 2, 200, 4, 3, 300, 9]

mapPartitions

pyspark-page5.svg # mapPartitions

# Split the data across 2 partitions, then reduce each partition to its sum.
rdd = sc.parallelize([1, 2, 3], 2)

def sum_partition(elements):
    # mapPartitions() hands us an iterator over one partition; yield one total.
    yield sum(elements)

partition_sums = rdd.mapPartitions(sum_partition)
# glom() groups the elements of each partition into a list.
print(rdd.glom().collect())
print(partition_sums.glom().collect())

[[1], [2, 3]]

[[1], [5]]

mapPartitionsWithIndex

pyspark-page6.svg # mapPartitionsWithIndex

# Like mapPartitions(), but the function also receives the partition index.
rdd = sc.parallelize([1, 2, 3], 2)

def tag_partition(index, elements):
    # Emit one (partition index, partition sum) pair per partition.
    yield (index, sum(elements))

tagged = rdd.mapPartitionsWithIndex(tag_partition)
# glom() groups the elements of each partition into a list.
print(rdd.glom().collect())
print(tagged.glom().collect())

[[1], [2, 3]]

[[(0, 1)], [(1, 5)]]

getNumPartitions

pyspark-page7.svg # getNumPartitions

# getNumPartitions() returns how many partitions back the RDD — here 2,
# because we asked for 2 when parallelizing.
rdd = sc.parallelize([1, 2, 3], 2)
partition_count = rdd.getNumPartitions()
print(rdd.glom().collect())
print(partition_count)

[[1], [2, 3]]

2

filter

pyspark-page8.svg # filter

# filter() keeps only the elements for which the predicate is true —
# here, the odd numbers.
nums = sc.parallelize([1, 2, 3])
odds = nums.filter(lambda n: n % 2 == 1)
print(nums.collect())
print(odds.collect())

[1, 2, 3]

[1, 3]

distinct

pyspark-page9.svg # distinct

# distinct() removes duplicate elements from the RDD.
letters = sc.parallelize(['A', 'A', 'B'])
unique_letters = letters.distinct()
print(letters.collect())
print(unique_letters.collect())

['A', 'A', 'B']

['A', 'B']

sample

pyspark-page10.svg # sample

# sample() is a transformation: it returns a new RDD holding a random
# subset (~`fraction` of the elements, without replacement here).
rdd = sc.parallelize(range(7))
# Draw 5 independent random samples.
samples = [rdd.sample(withReplacement=False, fraction=0.5) for _ in range(5)]
print('x = ' + str(rdd.collect()))
# Fixed: the print belongs inside the loop (indentation was lost), and
# enumerate() replaces the zip(range(len(...)), ...) idiom.
for cnt, y in enumerate(samples):
    # Each sample is itself an RDD, so collect() it on the driver.
    print('sample:' + str(cnt) + ' y = ' + str(y.collect()))

x = [0, 1, 2, 3, 4, 5, 6]

sample:0 y = [0, 2, 5, 6]

sample:1 y = [2, 6]

sample:2 y = [0, 4, 5, 6]

sample:3 y = [0, 2, 6]

sample:4 y = [0, 3, 4]

takeSample

pyspark-page11.svg # takeSample

# takeSample() is an action: it returns `num` random elements directly as
# a plain Python list on the driver (no collect() needed).
rdd = sc.parallelize(range(7))
# Fixed comment: this calls 'takeSample', not 'sample' — 5 times.
samples = [rdd.takeSample(withReplacement=False, num=3) for _ in range(5)]
print('x = ' + str(rdd.collect()))
# Fixed: the print belongs inside the loop (indentation was lost), and
# enumerate() replaces the zip(range(len(...)), ...) idiom.
for cnt, y in enumerate(samples):
    print('sample:' + str(cnt) + ' y = ' + str(y))  # y is already a list

x = [0, 1, 2, 3, 4, 5, 6]

sample:0 y = [0, 2, 6]

sample:1 y = [6, 4, 2]

sample:2 y = [2, 0, 4]

sample:3 y = [5, 4, 1]

sample:4 y = [3, 1, 4]

union

pyspark-page12.svg # union

# union() concatenates two RDDs; duplicates are kept.
left = sc.parallelize(['A', 'A', 'B'])
right = sc.parallelize(['D', 'C', 'A'])
combined = left.union(right)
print(left.collect())
print(right.collect())
print(combined.collect())

['A', 'A', 'B']

['D', 'C', 'A']

['A', 'A', 'B', 'D', 'C', 'A']

intersection

pyspark-page13.svg # intersection

# intersection() keeps only elements present in both RDDs
# (the result is de-duplicated: 'A' appears once).
left = sc.parallelize(['A', 'A', 'B'])
right = sc.parallelize(['A', 'C', 'D'])
common = left.intersection(right)
print(left.collect())
print(right.collect())
print(common.collect())

['A', 'A', 'B']

['A', 'C', 'D']

['A']

sortByKey

pyspark-page14.svg # sortByKey

# sortByKey() orders an RDD of (key, value) pairs by key (ascending by default).
pairs = sc.parallelize([('B', 1), ('A', 2), ('C', 3)])
ordered = pairs.sortByKey()
print(pairs.collect())
print(ordered.collect())

[('B', 1), ('A', 2), ('C', 3)]

[('A', 2), ('B', 1), ('C', 3)]

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值