- Start Hadoop and the cluster
- Change into the spark-1.1.0-bin-hadoop directory and launch the PySpark shell:
cd spark-1.1.0-bin-hadoop
./bin/pyspark
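The shell pre-creates a SparkContext and binds it to the variable sc, which the commands below rely on. Outside the shell you would construct it yourself; a minimal sketch, assuming the Spark 1.x Python API (the master URL and app name here are illustrative):

from pyspark import SparkConf, SparkContext

# In the interactive shell this object already exists as `sc`;
# a standalone script has to build it explicitly.
conf = SparkConf().setMaster("local[2]").setAppName("pairRDDAverage")
sc = SparkContext(conf=conf)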
In [1]: pair = {("panda", 0), ("pink", 3), ("pirate", 3), ("panda", 1), ("pink", 4)}
In [2]: pairRDD = sc.parallelize(pair)
In [3]: print(pairRDD.collect())
[('panda', 1), ('pink', 3), ('pirate', 3), ('panda', 0), ('pink', 4)]
Because pair is a Python set, collect() returns the elements in arbitrary order.
In [4]: pairRDD1 = pairRDD.mapValues(lambda x: (x, 1))
In [5]: pairRDD2 = pairRDD1.reduceByKey(lambda x, y: (x[0] + y[0], x[1] + y[1]))
In [6]: print(pairRDD2.collect())
[('pink', (7, 2)), ('panda', (1, 2)), ('pirate', (3, 1))]
After the reduceByKey every key appears exactly once, so the average must come from mapValues over the (sum, count) pairs, not from another reduceByKey (whose function would never be invoked on unique keys):
In [7]: pairRDD3 = pairRDD2.mapValues(lambda x: float(x[0]) / x[1])
In [8]: print(pairRDD3.collect())
[('pink', 3.5), ('panda', 0.5), ('pirate', 3.0)]
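The same per-key average can also be computed in a single pass with combineByKey, which builds the (sum, count) pairs directly instead of going through mapValues first; a sketch against the pairRDD defined above (variable names are illustrative):

# createCombiner: the first value seen for a key becomes (value, 1)
# mergeValue: fold another value from the same partition into (sum, count)
# mergeCombiners: merge per-partition (sum, count) pairs
sumCount = pairRDD.combineByKey(
    lambda v: (v, 1),
    lambda acc, v: (acc[0] + v, acc[1] + 1),
    lambda a, b: (a[0] + b[0], a[1] + b[1]))
print(sumCount.mapValues(lambda x: float(x[0]) / x[1]).collect())
# e.g. [('pink', 3.5), ('panda', 0.5), ('pirate', 3.0)]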