from mrjob.job import MRJob
class MRWordCounter(MRJob):
    """Classic MapReduce word count: emit (word, 1) per token, sum per word."""

    def mapper(self, key, line):
        # One (token, 1) pair for every whitespace-separated token.
        for token in line.split():
            yield token, 1

    def reducer(self, word, occurrences):
        # Total occurrences of each word across all mapper output.
        yield word, sum(occurrences)
if __name__ == '__main__':
    # mrjob parses the command line and drives the job steps.
    MRWordCounter.run()
# --- Top-N word count: multi-step MRJob example ---
import sys
from mrjob.job import MRJob,MRStep
import heapq
class TopNWords(MRJob):
def mapper(self, _, line):
if line.strip() != "":
for word in line.strip().split():
yield word,1
def combiner(self, word, counts): #介于mapper和reducer之间,用于临时的将mapper输出的数据进行统计
yield word,sum(counts)
def reducer_sum(self, word, counts):
yield None,(sum(counts),word)
def top_n_reducer(self,_,word_cnts): #利用heapq将数据进行排序,将最大的2个取出
for cnt,word in heapq