一、数据源
xiaoliu 64
xiaoliu 69
xiaoliu 79
xiaoji 98
xiaoliu 100
xiaoji 99
xiaowang 27
xiaowang 69
xiaowang 64
xiaozhang 67
xiaozhang 38
xiaozhang 93
xiaozhang 29
xiaozhang 85
xiaoliu 19
xiaoliu 53
xiaoliu 93
xiaoji 90
xiaoji 85
xiaoji 73
xiaoji 64
xiaoji 39
二、编程pyspark
2.1 方法一:
from pyspark import SparkContext,SparkConf
import os
import random

# Configure the local Spark environment. Raw strings keep the Windows paths
# literal — "\P" and "\A" are invalid escape sequences in normal string
# literals (SyntaxWarning on modern Python), though their values are unchanged.
if 'SPARK_HOME' not in os.environ:
    os.environ['SPARK_HOME'] = r"E:\ProgramFiles\spark-2.2.1-bin-2.6.0-cdh5.14.2"
    os.environ['PYSPARK_PYTHON'] = r"E:\ProgramFiles\Anaconda3\python.exe"

config = SparkConf() \
    .setMaster("local[*]") \
    .setAppName("TopNDemo")
sc = SparkContext.getOrCreate(config)

path = "G:/liu/topn.txt"
rdd = sc.textFile(path)
# Parse "name score" lines into (name, int(score)) pairs; drop malformed rows.
rdd1 = rdd \
    .map(lambda line: line.split(" ")) \
    .filter(lambda arr: len(arr) == 2) \
    .map(lambda arr: (arr[0], int(arr[1])))
def top3(key, values):
    """Return the three largest scores in *values* as (key, score) pairs.

    *values* may be any iterable of comparable scores; fewer than three
    elements yields fewer pairs. (Renamed from ``iter``, which shadowed the
    builtin, and the old local ``top3`` shadowed this function's own name.)
    """
    largest = sorted(values, reverse=True)[:3]
    return [(key, score) for score in largest]
# Two-phase per-key top-3: salt each key with a random 1..10 prefix so the
# first groupByKey reduces each key's data in up to 10 sub-groups, then take
# the final top 3 per original key from those partial results.
salted = rdd1.map(lambda t: ((random.randint(1, 10), t[0]), t[1]))
partial_tops = salted.groupByKey().flatMap(lambda t: top3(t[0][1], t[1]))
result1 = partial_tops.groupByKey().flatMap(lambda t: top3(t[0], t[1]))
print(result1.collect())
结果:
[('xiaoliu', 100), ('xiaoliu', 93), ('xiaoliu', 79), ('xiaowang', 69), ('xiaowang', 64), ('xiaowang', 27), ('xiaozhang', 93), ('xiaozhang', 85), ('xiaozhang', 67), ('xiaoji', 99), ('xiaoji', 98), ('xiaoji', 90)]
2.2 方法二(aggregateByKey)
2.2.1 方法一
from pyspark import SparkContext,SparkConf
import os
from functools import reduce

# Configure the local Spark environment. Raw strings keep the Windows paths
# literal — "\P" and "\A" are invalid escape sequences in normal string
# literals (SyntaxWarning on modern Python), though their values are unchanged.
if 'SPARK_HOME' not in os.environ:
    os.environ['SPARK_HOME'] = r"E:\ProgramFiles\spark-2.2.1-bin-2.6.0-cdh5.14.2"
    os.environ['PYSPARK_PYTHON'] = r"E:\ProgramFiles\Anaconda3\python.exe"

config = SparkConf() \
    .setMaster("local[*]") \
    .setAppName("TopNDemo")
sc = SparkContext.getOrCreate(config)

path = "G:/liu/topn.txt"
rdd = sc.textFile(path)
# Parse "name score" lines into (name, int(score)) pairs; drop malformed rows.
rdd1 = rdd \
    .map(lambda line: line.split(" ")) \
    .filter(lambda arr: len(arr) == 2) \
    .map(lambda arr: (arr[0], int(arr[1])))
# Neutral accumulator for aggregateByKey: an empty per-key top-3 list.
zeroValue = []

def f(acc, score):
    """Seq op: fold one score into the running top-3 list.

    Returns a new list instead of mutating *acc* — the original appended to
    its argument, which mutated the shared ``zeroValue`` list.
    """
    return sorted(acc + [score], reverse=True)[:3]

def g(left, right):
    """Comb op: merge two partial top-3 lists into the overall top 3."""
    return sorted(list(left) + list(right), reverse=True)[:3]

# aggregateByKey accepts plain callables; the previous lambda wrappers
# (``lambda a, b: f(a, b)``) were redundant indirection.
seqFunc = f
combFunc = g
# Per-key top 3 via aggregateByKey: seqFunc folds individual scores into a
# partial top-3 list within each partition, combFunc merges partial lists
# across partitions. Result: (name, [top-3 scores]) pairs.
result2 = rdd1 \
.aggregateByKey(zeroValue,seqFunc,combFunc)
print(result2.collect())
2.2.2 方法二
from pyspark import SparkContext,SparkConf
import os
from functools import reduce

# Configure the local Spark environment. Raw strings keep the Windows paths
# literal — "\P" and "\A" are invalid escape sequences in normal string
# literals (SyntaxWarning on modern Python), though their values are unchanged.
if 'SPARK_HOME' not in os.environ:
    os.environ['SPARK_HOME'] = r"E:\ProgramFiles\spark-2.2.1-bin-2.6.0-cdh5.14.2"
    os.environ['PYSPARK_PYTHON'] = r"E:\ProgramFiles\Anaconda3\python.exe"

config = SparkConf() \
    .setMaster("local[*]") \
    .setAppName("TopNDemo")
sc = SparkContext.getOrCreate(config)

path = "G:/liu/topn.txt"
rdd = sc.textFile(path)
# Parse "name score" lines into (name, int(score)) pairs; drop malformed rows.
rdd1 = rdd \
    .map(lambda line: line.split(" ")) \
    .filter(lambda arr: len(arr) == 2) \
    .map(lambda arr: (arr[0], int(arr[1])))
# Neutral accumulator for aggregateByKey: an empty per-key top-3 list.
zeroValue = []

def f(acc, score):
    """Seq op: fold one score into the running top-3 list.

    Returns a new list instead of mutating *acc* — the original appended to
    its argument, which mutated the shared ``zeroValue`` list.
    """
    return sorted(acc + [score], reverse=True)[:3]

# aggregateByKey accepts plain callables; no lambda wrapper needed.
seqFunc = f

def combFunc(left, right):
    """Comb op: fold each score of *left* into *right*'s running top 3.

    ``reduce(f, left, right)`` starts from *right* and applies the seq op to
    every element of *left*; the old ``lambda x, y: f(x, y)`` wrapper inside
    the reduce was redundant.
    """
    return reduce(f, left, right)
# Per-key top 3 via aggregateByKey: seqFunc folds individual scores into a
# partial top-3 list within each partition, combFunc merges partial lists
# across partitions. Result: (name, [top-3 scores]) pairs.
result3 = rdd1 \
.aggregateByKey(zeroValue,seqFunc,combFunc)
print(result3.collect())