PySpark Python 接口笔记
环境初始化:
# Build a Spark configuration and create the SparkContext from it.
# NOTE(review): assumes `appName` and `master` are defined earlier — confirm.
from pyspark import SparkContext, SparkConf

spark_conf = SparkConf()
spark_conf.setAppName(appName)   # setters mutate the conf in place
spark_conf.setMaster(master)
sc = SparkContext(conf=spark_conf)
创建 RDD (并行化本地集合):
# Distribute a small local list across the cluster as an RDD.
data = list(range(1, 6))  # [1, 2, 3, 4, 5]
distData = sc.parallelize(data)
键值对数据, groupByKey 示例:
# Key-value pairs: group all values sharing a key with groupByKey.
data2 = [('a', 1), ('b', 2), ('c', 3), ('a', 4), ('b', 5)]
# BUG FIX: the original parallelized `data` (a different list) instead of `data2`.
rdd2 = sc.parallelize(data2)
# groupByKey gathers the values per key; mapValues(list) makes them concrete,
# e.g. ('a', [1, 4]). This is a lazy transformation until an action runs.
grouped = rdd2.groupByKey().mapValues(list)
键值对数据, join 示例:
# Key-value pairs: inner join two RDDs on their keys.
data2 = [('a', 1), ('b', 2), ('c', 3), ('a', 4), ('b', 5)]
rdd2 = sc.parallelize(data2)
data3 = [('a', 1), ('b', 2)]
rdd3 = sc.parallelize(data3)
# BUG FIX: join() alone is a lazy transformation whose result was discarded;
# collect() is required to actually materialize and show the pairs. The
# expected-output line was bare code in the original and is now a comment.
print(rdd2.join(rdd3).collect())
# -> [('a', (1, 1)), ('a', (4, 1)), ('b', (2, 2)), ('b', (5, 2))]
#    ('c' is dropped: inner join keeps only keys present in both RDDs; order not guaranteed)
Spark 蒙特卡洛方法估算 π:
import random  # BUG FIX: `random` was used but never imported in the original

# Monte-Carlo estimate of pi: sample points uniformly in the unit square and
# count the fraction landing inside the quarter circle of radius 1; that
# fraction approximates pi/4.
NUM_SAMPLES = 1000000  # was referenced but undefined in the original snippet

def inside(p):
    """Return True if a fresh random point lies inside the unit quarter circle.

    The argument `p` (the sampled index) is ignored; each call draws its own
    (x, y) point.
    """
    x, y = random.random(), random.random()
    return x * x + y * y < 1

# BUG FIX: original used Python 2 `xrange`; `range` is the Python 3 equivalent.
count = sc.parallelize(range(0, NUM_SAMPLES)) \
          .filter(inside).count()
# BUG FIX: original used the Python 2 `print` statement.
print("Pi is roughly %f" % (4.0 * count / NUM_SAMPLES))