mrjob lets you:
- write multi-step MapReduce jobs in pure Python
- test them on your local machine
- run them on a Hadoop cluster
Install mrjob with pip:

pip install mrjob
The first job below started life as mrjob's stock word-frequency example; the word regex has been swapped for an IPv4 pattern, so it counts how often each IP address shows up in a DNS query log:
# coding: utf-8
from mrjob.job import MRJob
import re
# xiaorui.cc
# the original word pattern, swapped for an IPv4 pattern:
#WORD_RE = re.compile(r"[\w']+")
WORD_RE = re.compile(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}")

class MRWordFreqCount(MRJob):

    def mapper(self, _, line):
        # emit (ip, 1) for every IP address found on the line
        for word in WORD_RE.findall(line):
            yield word.lower(), 1

    def combiner(self, word, counts):
        # pre-aggregate counts on the mapper side
        yield word, sum(counts)

    def reducer(self, word, counts):
        yield word, sum(counts)

if __name__ == '__main__':
    MRWordFreqCount.run()
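Since the only real change from the stock example is the regular expression, it's worth checking the pattern on its own first. A minimal standalone check (the sample log line here is made up for illustration):

import re

# same pattern the job above uses
WORD_RE = re.compile(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}")

# hypothetical DNS-query log line, purely illustrative
line = "client 10.7.17.7#53911: query: example.com IN A"
print(WORD_RE.findall(line))  # -> ['10.7.17.7']

Note that the pattern is deliberately loose: it will match 999.999.999.999 just as happily as a real address, which is usually fine for counting entries in your own logs.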
Testing on the local machine is just a matter of running the script with the local runner and feeding it a log file; mrjob simulates the whole mapper/combiner/reducer pipeline under /tmp:
[root@kspc ~]# python mo.py -r local < 10.7.17.7-dnsquery.log.1 > output
no configs found; falling back on auto-configuration
no configs found; falling back on auto-configuration
creating tmp directory /tmp/mo.root.20131224.040935.241241
reading from STDIN
writing to /tmp/mo.root.20131224.040935.241241/step-0-mapper_part-00000
> /usr/bin/python mo.py --step-num=0 --mapper /tmp/mo.root.20131224.040935.241241/input_part-00000 | sort | /usr/bin/python mo.py --step-num=0 --combiner > /tmp/mo.root.20131224.040935.241241/step-0-mapper_part-00000
writing to /tmp/mo.root.20131224.040935.241241/step-0-mapper_part-00001
> /usr/bin/python mo.py --step-num=0 --mapper /tmp/mo.root.20131224.040935.241241/input_part-00001 | sort | /usr/bin/python mo.py --step-num=0 --combiner > /tmp/mo.root.20131224.040935.241241/step-0-mapper_part-00001
Counters from step 1:
(no counters found)
writing to /tmp/mo.root.20131224.040935.241241/step-0-mapper-sorted
> sort /tmp/mo.root.20131224.040935.241241/step-0-mapper_part-00000 /tmp/mo.root.20131224.040935.241241/step-0-mapper_part-00001
writing to /tmp/mo.root.20131224.040935.241241/step-0-reducer_part-00000
> /usr/bin/python mo.py --step-num=0 --reducer /tmp/mo.root.20131224.040935.241241/input_part-00000 > /tmp/mo.root.20131224.040935.241241/step-0-reducer_part-00000
writing to /tmp/mo.root.20131224.040935.241241/step-0-reducer_part-00001
> /usr/bin/python mo.py --step-num=0 --reducer /tmp/mo.root.20131224.040935.241241/input_part-00001 > /tmp/mo.root.20131224.040935.241241/step-0-reducer_part-00001
Counters from step 1:
(no counters found)
Moving /tmp/mo.root.20131224.040935.241241/step-0-reducer_part-00000 -> /tmp/mo.root.20131224.040935.241241/output/part-00000
Moving /tmp/mo.root.20131224.040935.241241/step-0-reducer_part-00001 -> /tmp/mo.root.20131224.040935.241241/output/part-00001
Streaming final output from /tmp/mo.root.20131224.040935.241241/output
removing tmp directory /tmp/mo.root.20131224.040935.241241
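The log is worth a close read, because it spells out exactly what the local runner does: it splits the input, runs mapper | sort | combiner for each split, merge-sorts the mapper output, runs the reducers, and finally moves the part files into output/. Ignoring the splitting into part files, step 0 collapses to a plain shell pipeline built from the very flags shown in the log:

python mo.py --step-num=0 --mapper 10.7.17.7-dnsquery.log.1 | sort | \
python mo.py --step-num=0 --combiner | sort | \
python mo.py --step-num=0 --reducer > output

This is also a handy way to debug a single stage in isolation.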
A second, simpler job is the classic counting example from the mrjob documentation, which tallies the characters, words, and lines of its input:
from mrjob.job import MRJob
# from xiaorui.cc

class MRWordFrequencyCount(MRJob):

    # emit one (metric, count) pair per line of input
    def mapper(self, _, line):
        yield "chars", len(line)
        yield "words", len(line.split())
        yield "lines", 1

    # sum up the values for each key
    def reducer(self, key, values):
        yield key, sum(values)

if __name__ == '__main__':
    MRWordFrequencyCount.run()
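By default mrjob JSON-encodes keys and values and separates them with a tab, so the job prints one metric per line. Saved as, say, word_count.py and run against a text file, the output looks like this (the counts are illustrative):

$ python word_count.py my_file.txt
"chars"	3654
"lines"	123
"words"	417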
Running your job different ways:

$ python my_job.py input.txt                              # read a local file
$ python my_job.py < input.txt                            # read from stdin
$ python my_job.py input1.txt input2.txt - < input3.txt   # files plus stdin ("-")
$ python my_job.py -r emr s3://my-inputs/input.txt        # run on Amazon EMR
$ python my_job.py -r hadoop hdfs://my_home/input.txt     # run on a Hadoop cluster
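None of the jobs above actually chains more than one step, so to make good on the first bullet at the top, here is a minimal multi-step sketch, assuming a version of mrjob that ships mrjob.step.MRStep (it mirrors the MRMostUsedWord example from the mrjob docs): the first step counts words, the second picks the most frequent one.

from mrjob.job import MRJob
from mrjob.step import MRStep
import re

WORD_RE = re.compile(r"[\w']+")

class MRMostUsedWord(MRJob):

    def steps(self):
        # two chained steps: count words, then find the max
        return [
            MRStep(mapper=self.mapper_get_words,
                   combiner=self.combiner_count_words,
                   reducer=self.reducer_count_words),
            MRStep(reducer=self.reducer_find_max_word)
        ]

    def mapper_get_words(self, _, line):
        # yield (word, 1) for each word in the line
        for word in WORD_RE.findall(line):
            yield word.lower(), 1

    def combiner_count_words(self, word, counts):
        # pre-sum the counts we've seen so far
        yield word, sum(counts)

    def reducer_count_words(self, word, counts):
        # send every (count, word) pair to the same reducer
        yield None, (sum(counts), word)

    def reducer_find_max_word(self, _, word_count_pairs):
        # word_count_pairs is an iterator of (count, word);
        # yielding the max gives key=count, value=word
        yield max(word_count_pairs)

if __name__ == '__main__':
    MRMostUsedWord.run()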