# Data format: [id, district_code, district_sales]
import findspark
findspark.init()
from pyspark import SparkConf,SparkContext
#from pyspark.sql import SparkSession
# Configure Spark to run locally with application name 'sales'.
conf = SparkConf().setMaster('local').setAppName('sales')
sc = SparkContext(conf = conf)
# Load the raw CSV; each row is [id, district_code, district_sales].
lines = sc.textFile('sales.csv')# Sales data rows: [id, district_code, district_sales]
# Peek at the first 5 raw lines (returned as strings to the driver).
lines.take(5)
# Output:
# ['1,34,2334', '2,33,3433', '3,53,2324', '4,59,2232', '5,68,4444']
def parseLine(line):
    """Parse one CSV sales record into a (district, sales) key/value pair.

    Expects a line of the form 'id,district_code,district_sales',
    e.g. '1,34,2334'.  The leading id field (fields[0]) is ignored.

    Args:
        line (str): one comma-separated record from sales.csv.

    Returns:
        tuple[int, int]: (district_code, district_sales).

    Raises:
        ValueError: if the district or sales field is not an integer.
        IndexError: if the line has fewer than three fields.
    """
    fields = line.split(',')
    dist = int(fields[1])
    sales = int(fields[2])
    return (dist, sales)
# Build an RDD of (district_code, sales) pairs from the raw lines.
rdd =lines.map(parseLine)
rdd.take(5)
# Result: [(34, 2334), (33, 3433), (53, 2324), (59, 2232), (68, 4444)]
# Attach a count of 1 to each sales value: (district, (sales, 1)).
rdd.mapValues(lambda x: (x,1)).take(10)
# When a transformation does not change the key, prefer mapValues()/flatMapValues()
# over map()/flatMap(): the former keep the original partitioning and avoid a
# shuffle, so they are more efficient.
'''
[(34, (2334, 1)),
(33, (3433, 1)),
(53, (2324, 1)),
(59, (2232, 1)),
(68, (4444, 1)),
(78, (4522, 1)),
(87, (4600, 1)),
(97, (4678, 1)),
(16, (4756, 1)),
(16, (4834, 1))]
'''
# Sum the (sales, count) pairs per key: element-wise addition in reduceByKey.
rdd.mapValues(lambda x : (x,1)).reduceByKey(lambda x,y:(x[0] + y[0],x[1] + y[1])).take(10)
# Yields (district_code, (total_sales_for_district, number_of_rows_for_district)).
'''
[(34, (476038, 35)),
(33, (512389, 37)),
(53, (393748, 30)),
(59, (173806, 14)),
(68, (177032, 14)),
(78, (210880, 16)),
(87, (212128, 16)),
(97, (321940, 22)),
(16, (649028, 44)),
(23, (327088, 22))]
'''
# NOTE(review): the names say "ByAge" but the key here is a district code —
# likely copied from an age-averaging example; verify before renaming.
# (district, (total_sales, row_count)) per district.
totalsByAge =rdd.mapValues(lambda x : (x,1)).reduceByKey(lambda x,y:(x[0] + y[0],x[1] + y[1]))
# Average sales per district: total_sales / row_count.
averageByAge = totalsByAge.mapValues(lambda x: x[0] / x[1])
#averageByAge.take(5)
# Result: (district_code, total_sales / row_count) — i.e. the average sales.
'''
[(34, 13601.085714285715),
(33, 13848.351351351352),
(53, 13124.933333333332),
(59, 12414.714285714286),
(68, 12645.142857142857)]
'''
# Materialize the per-district averages on the driver and print each pair.
results = averageByAge.collect()
for result in results:
    # Fix: the loop body was not indented in the original, which is a
    # SyntaxError in Python.
    print(result)
'''
(34, 13601.085714285715)
(33, 13848.351351351352)
(53, 13124.933333333332)
(59, 12414.714285714286)
(68, 12645.142857142857)
(78, 13180.0)
(87, 13258.0)
(97, 14633.636363636364)
(16, 14750.636363636364)
(23, 14867.636363636364)
'''