# Implementing reduceByKey with combineByKey
# NOTE(review): SparkConf/SparkContext were used without ever being imported.
from pyspark import SparkConf, SparkContext

# Local, single-threaded Spark context for this word-count example.
conf = SparkConf().setMaster("local").setAppName("WordCount")
# getOrCreate avoids "Cannot run multiple SparkContexts" errors — this file
# constructs a context in two places.
sc = SparkContext.getOrCreate(conf=conf)
def getSentences(nums):
    """Generate *nums* lorem-ipsum sentences to use as sample input text."""
    generator = LoremIpsum()
    return generator.get_sentences(nums)
def wordCountApp(data):
    """Build an RDD of (word, 1) pairs from an iterable of text lines.

    Blank lines are dropped, lines are split on single spaces, and periods
    and commas are stripped from each token.
    """
    lines = sc.parallelize(data)
    non_empty = lines.filter(lambda line: len(line.strip()) != 0)
    tokens = non_empty.flatMap(lambda line: line.strip().split(" "))
    cleaned = tokens.map(lambda w: w.replace(".", "").replace(",", ""))
    # One-step alternative:
    # cleaned.map(lambda k: (k, 1)).reduceByKey(lambda a, b: a + b).sortByKey()
    return cleaned.map(lambda w: (w, 1))
if __name__ == '__main__':
    nums = 1000
    data = list()
    data.append(getSentences(nums))
    result = wordCountApp(data)
    print(result.collect())

    # reduceByKey equivalent built from combineByKey:
    #   createCombiner : first value seen for a key starts the running count
    #   mergeValue     : fold another value (always 1 here) into the count
    #   mergeCombiners : add partial counts from different partitions
    # Fixed: the original createCombiner (`lambda word: 1`) discarded the
    # value, which only works because every value happens to be 1;
    # `lambda v: v` is the faithful reduceByKey equivalent. The trailing
    # identity `.map(lambda x: (x[0], x[1]))` was a no-op and was removed.
    resRDD = result.combineByKey(
        lambda v: v,
        lambda acc, v: acc + v,
        lambda acc1, acc2: acc1 + acc2,
    )
    print(resRDD.collect())
# Implementing groupByKey with combineByKey
# NOTE(review): SparkConf/SparkContext were used without ever being imported.
from pyspark import SparkConf, SparkContext

# Local Spark context for the groupByKey-via-combineByKey example.
conf = SparkConf().setMaster("local").setAppName("WordCount")
# This file already created a context above; a plain SparkContext(conf=conf)
# here would raise "Cannot run multiple SparkContexts" — reuse it instead.
sc = SparkContext.getOrCreate(conf=conf)
def getSentences(nums):
    """Return *nums* placeholder (lorem ipsum) sentences as sample text."""
    lorem = LoremIpsum()
    sentences = lorem.get_sentences(nums)
    return sentences
def wordCountApp(data):
    """Parallelize *data* and emit one (word, 1) pair per cleaned token.

    Empty lines are filtered out; tokens come from splitting on single
    spaces, with periods and commas removed.
    """
    rdd = sc.parallelize(data)
    words = (
        rdd.filter(lambda ln: len(ln.strip()) != 0)
           .flatMap(lambda ln: ln.strip().split(" "))
           .map(lambda w: w.replace(".", "").replace(",", ""))
    )
    # Equivalent single expression:
    # words.map(lambda k: (k, 1)).reduceByKey(lambda a, b: a + b).sortByKey()
    return words.map(lambda w: (w, 1))
if __name__ == '__main__':
    nums = 1000
    data = list()
    data.append(getSentences(nums))
    result = wordCountApp(data)
    print(result.collect())

    # groupByKey equivalent built from combineByKey.
    # Fixed: the original lambdas used `acc + [cnt]` / `acc1 + acc2`, which
    # copy the whole accumulator for every element (quadratic work per key).
    # In-place append/extend that return the list are the canonical
    # combineByKey combiners and do the same grouping in linear time.
    # (A bare `acc.append(cnt)` would be wrong: it returns None.)
    def to_list(value):
        # createCombiner: start a new group from the first value of a key.
        return [value]

    def append_value(acc, value):
        # mergeValue: add one more value to an existing group.
        acc.append(value)
        return acc

    def merge_groups(acc1, acc2):
        # mergeCombiners: combine partial groups from different partitions.
        acc1.extend(acc2)
        return acc1

    resRDD = result.combineByKey(to_list, append_value, merge_groups)
    print(resRDD.collect())