这是Udacity的课程 intro to hadoop and mapReduce里面Lesson4的练习
求总体的Top N。
首先在Mapper中求出局部的Top N,求Top N不能像word count那样来一句print一句,要把所有的line都读完,计数,排序,输入topN
然后再Reducer中求出全局的 Top N。
以下是Mapper 代码
#!/usr/bin/python
"""
Your mapper function should print out 10 lines containing longest posts, sorted in
ascending order from shortest to longest.
Please do not use global variables and do not change the "main" function.
"""
import sys
import csv
def mapper():
reader = csv.reader(sys.stdin, delimiter='\t')
writer = csv.writer(sys.stdout, delimiter='\t', quotechar='"', quoting=csv.QUOTE_ALL)
lines = []
for line in reader:
lines.append(line)
# YOUR CODE HERE
lines.sort(key = lambda x: len(x[4]), reverse = True)
for i in range(9, -1, -1):
writer.writerow(lines[i])
test_text = """\"\"\t\"\"\t\"\"\t\"\"\t\"333\"\t\"\"
\"\"\t\"\"\t\"\"\t\"\"\t\"88888888\"\t\"\"
\"\"\t\"\"\t\"\"\t\"\"\t\"1\"\t\"\"
\"\"\t\"\"\t\"\"\t\"\"\t\"11111111111\"\t\"\"
\"\"\t\"\"\t\"\"\t\"\"\t\"1000000000\"\t\"\"
\"\"\t\"\"\t\"\"\t\"\"\t\"22\"\t\"\"
\"\"\t\"\"\t\"\"\t\"\"\t\"4444\"\t\"\"
\"\"\t\"\"\t\"\"\t\"\"\t\"666666\"\t\"\"
\"\"\t\"\"\t\"\"\t\"\"\t\"55555\"\t\"\"
\"\"\t\"\"\t\"\"\t\"\"\t\"999999999\"\t\"\"
\"\"\t\"\"\t\"\"\t\"\"\t\"7777777\"\t\"\"
"""
# This function allows you to test the mapper with the provided test string
def main():
import StringIO
sys.stdin = StringIO.StringIO(test_text)
mapper()
sys.stdin = sys.__stdin__
main()