# Code:
# coding: utf-8
# Compute simple purchase statistics from a CSV of user purchase history.
# Expected input format (no header): user,product,price — one purchase per line.
# Note: print(x) with a single argument behaves identically under Python 2 and 3.
from pyspark import SparkContext

sc = SparkContext("local[2]", "First Spark App")

# Parse each CSV line into a (user, product, price) record tuple.
data = (sc.textFile("data/UserPurchaseHistory.csv")
          .map(lambda line: line.split(","))
          .map(lambda record: (record[0], record[1], record[2])))

# Total number of purchase records.
numPurchases = data.count()
print(numPurchases)

# Number of distinct purchasing users (distinct on the user column).
uniqueUsers = data.map(lambda record: record[0]).distinct().count()
print(uniqueUsers)

# Total revenue: sum of the price column over all purchases.
totalRevenue = data.map(lambda record: float(record[2])).sum()
print(totalRevenue)

# reduceByKey merges the values of identical keys with the given binary
# function, so mapping each record to (product, 1.0) and summing yields a
# per-product purchase count.
products = data.map(lambda record: (record[1], 1.0)).reduceByKey(lambda a, b: a + b).collect()
print(products)

# Most purchased product: the (product, count) pair with the highest count.
# max() with a key is O(n) and returns the same element as the previous
# sorted(..., reverse=True)[0] (both pick the first maximal element).
mostPopular = max(products, key=lambda x: x[1])
print "Total purchases: %