安装方式:pip install mrjob(旧方式:easy_install mrjob)
使用方法:
# -*- coding: utf8 -*-
"""Count word frequencies in Chinese text with a MapReduce job.

Each input line is scanned for runs of Chinese characters, every run is
segmented into words with jieba, and the job emits one total per word.
"""
import re
import sys  # only needed for the sys.argv alternative noted at the bottom

import jieba
from mrjob.job import MRJob

# Matches runs of characters in the range U+4E00..U+9FA5.
word_re = re.compile(u"[\u4E00-\u9FA5]+")


class TestMrJob(MRJob):
    """MapReduce job that counts the words jieba segments out of each line."""

    def mapper(self, _, line):
        """Yield ``(word, 1)`` for every segmented word on one input line.

        Args:
            _: Input key; ignored.
            line: One line of input, either UTF-8 encoded bytes or text.

        Yields:
            Tuples of ``(lower-cased word, 1)``.
        """
        # On Python 3 the line may already be text, and str has no
        # .decode(); only decode when we were actually handed raw bytes.
        if isinstance(line, bytes):
            line = line.decode("utf8")
        for chunk in word_re.findall(line):
            for new_word in jieba.cut(chunk):
                yield new_word.lower(), 1

    def reducer(self, word, counts):
        """Yield ``(word, total)`` where ``total`` is the sum of ``counts``.

        Args:
            word: The word emitted by the mapper.
            counts: Iterable of the per-occurrence counts for ``word``.
        """
        yield word, sum(counts)


if __name__ == '__main__':
    # Note: input paths are given on the command line, e.g.
    #   python test-mrjob.py data/2 > tmp
    # Alternatively, append a path to sys.argv (sys.argv.append("data/2"))
    # before calling run(). The result of run() is not used, so it is not
    # bound to a name.
    TestMrJob.run()
处理单个输入文件:
python test-mrjob.py data/2 > tmp
处理 data 目录下的所有文件:
python test-mrjob.py data/* > tmp
奇怪的是,tmp文件中的结果是这样子的....
碉堡了...
-