代码
run.sh
#!/bin/sh
# Hadoop Streaming whitelist word-count job.
# Ships map.py / red.py / white_list to every task node via -file, so the
# mapper can open "white_list" from its working directory.

HADOOP_CMD="/data/server/hadoop/bin/hadoop"
STREAM_JAR_PATH="/data/server/hadoop/share/hadoop/tools/lib/hadoop-streaming-2.6.5.jar"
INPUT_FILE_PATH_1="/The_Man_of_Property.txt"
OUTPUT_PATH="/output_file_broadcast"

# Remove a previous run's output; tolerate failure on the very first run
# when the directory does not exist yet.
"$HADOOP_CMD" fs -rm -r -skipTrash "$OUTPUT_PATH" || true

"$HADOOP_CMD" jar "$STREAM_JAR_PATH" \
  -input "$INPUT_FILE_PATH_1" \
  -output "$OUTPUT_PATH" \
  -mapper "python map.py mapper_func white_list" \
  -reducer "python red.py reduer_func" \
  -jobconf "mapred.reduce.tasks=2" \
  -file ./map.py \
  -file ./red.py \
  -file ./white_list
map.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys


def read_local_file(f):
    """Load the whitelist shipped via -file: one word per line.

    Blank lines are ignored; returns a set of stripped words.
    """
    word_set = set()
    # 'with' guarantees the descriptor is closed (the original leaked it).
    with open(f, 'r') as file_in:
        for line in file_in:
            word = line.strip()
            if word:
                word_set.add(word)
    return word_set


def mapper_func(white_list_fd):
    """Streaming mapper: for every stdin token found in the whitelist,
    emit one "word<TAB>1" line.

    white_list_fd: path to the whitelist file (shipped by -file).
    """
    word_set = read_local_file(white_list_fd)
    for line in sys.stdin:
        # split(' ') yields empty strings on consecutive spaces;
        # the word != "" check below filters them out.
        for s in line.strip().split(' '):
            word = s.strip()
            if word != "" and word in word_set:
                # sys.stdout.write instead of py2-only `print`, so the
                # script runs under either python2 or python3 on task nodes.
                sys.stdout.write(word + "\t1\n")
if __name__ == "__main__":
    # Dispatch: argv[1] names a function in this module; the remaining
    # arguments are passed positionally
    # (e.g. `python map.py mapper_func white_list`).
    if len(sys.argv) < 2:
        sys.stderr.write("usage: map.py <func_name> [args...]\n")
        sys.exit(1)
    module = sys.modules[__name__]
    func = getattr(module, sys.argv[1])
    # argv[2:] is simply [] when absent, so the original's `args = None`
    # guard was dead code; func(*[]) is always safe.
    func(*sys.argv[2:])
red.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys


def reduer_func():
    """Streaming reducer: stdin delivers sorted "word<TAB>count" lines;
    emit one "word<TAB>total" line per distinct word.
    """
    cur_word = None
    cnts_pool = []
    for line in sys.stdin:
        fields = line.strip().split("\t")
        # Skip blank or malformed lines (must be exactly two fields).
        if len(fields) != 2:
            continue
        word, val = fields
        if cur_word is None:
            cur_word = word
        if word != cur_word:
            # Key changed: flush the finished group's total.
            sys.stdout.write(cur_word + "\t" + str(sum(cnts_pool)) + "\n")
            cur_word = word
            cnts_pool = []
        cnts_pool.append(int(val))
    # A reducer can legitimately receive no input at all (with
    # mapred.reduce.tasks=2 one partition may be empty). The original
    # then printed a bogus "None\t0" line -- and an earlier revision
    # crashed with `TypeError: expected string, NoneType found`, the
    # exact failure in the job log. Only flush if a word was seen.
    if cur_word is not None:
        sys.stdout.write(cur_word + "\t" + str(sum(cnts_pool)) + "\n")
if __name__ == "__main__":
    # Dispatch: argv[1] names a function in this module; the remaining
    # arguments are passed positionally
    # (e.g. `python red.py reduer_func`).
    if len(sys.argv) < 2:
        sys.stderr.write("usage: red.py <func_name> [args...]\n")
        sys.exit(1)
    module = sys.modules[__name__]
    func = getattr(module, sys.argv[1])
    # argv[2:] is simply [] when absent, so the original's `args = None`
    # guard was dead code; func(*[]) is always safe.
    func(*sys.argv[2:])
本地调试运行
$ cat The_Man_of_Property.txt | python map.py mapper_func white_list | sort | uniq -c | python red.py reduer_func
93 against 1
2 recent 1
2 suitable 1
集群上跑
在集群上跑的话,会报错
相关报错
终端打印日志
java.lang.RuntimeException: PipeMapRed.waitOutputThreads(): subprocess failed with code 1
at org.apache.hadoop.streaming.PipeMapRed.waitOutputThreads(PipeMapRed.java:322)
at org.apache.hadoop.streaming.PipeMapRed.mapRedFinished(PipeMapRed.java:535)
at org.apache.hadoop.streaming.PipeReducer.close(PipeReducer.java:134)
at org.apache.hadoop.mapred.ReduceTask.runOldReducer(ReduceTask.java:453)
at org.apache.hadoop.mapred.ReduceTask.run(ReduceTask.java:392)
at org.apache.hadoop.mapred.YarnChild$2.run(YarnChild.java:163)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:422)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1692)
at org.apache.hadoop.mapred.YarnChild.main(YarnChild.java:158)
2019-09-03 18:39:47,284 INFO [main] org.apache.hadoop.streaming.PipeMapRed: PipeMapRed failed!
java.lang.RuntimeException: PipeMapRed.waitOutputThreads(): subprocess failed with code 1
at org.apache.hadoop.streaming.PipeMapRed.waitOutputThreads(PipeMapRed.java:322)
at org.apache.hadoop.streaming.PipeMapRed.mapRedFinished(PipeMapRed.java:535)
at org.apache.hadoop.streaming.PipeReducer.close(PipeReducer.java:134)
at org.apache.hadoop.io.IOUtils.cleanup(IOUtils.java:237)
at org.apache.hadoop.mapred.ReduceTask.runOldReducer(ReduceTask.java:459)
at org.apache.hadoop.mapred.ReduceTask.run(ReduceTask.java:392)
at org.apache.hadoop.mapred.YarnChild$2.run(YarnChild.java:163)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:422)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1692)
at org.apache.hadoop.mapred.YarnChild.main(YarnChild.java:158)
2019-09-03 18:39:47,284 WARN [main] org.apache.hadoop.mapred.YarnChild: Exception running child : java.lang.RuntimeException: PipeMapRed.waitOutputThreads(): subprocess failed with code 1
at org.apache.hadoop.streaming.PipeMapRed.waitOutputThreads(PipeMapRed.java:322)
at org.apache.hadoop.streaming.PipeMapRed.mapRedFinished(PipeMapRed.java:535)
at org.apache.hadoop.streaming.PipeReducer.close(PipeReducer.java:134)
at org.apache.hadoop.io.IOUtils.cleanup(IOUtils.java:237)
at org.apache.hadoop.mapred.ReduceTask.runOldReducer(ReduceTask.java:459)
at org.apache.hadoop.mapred.ReduceTask.run(ReduceTask.java:392)
at org.apache.hadoop.mapred.YarnChild$2.run(YarnChild.java:163)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:422)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1692)
at org.apache.hadoop.mapred.YarnChild.main(YarnChild.java:158)
Container: container_1567506933807_0002_01_000008 on hadoop-node-slave-02_33466_1567507210129
===============================================================================================
LogType:stderr
Log Upload Time:3-Sep-2019 18:40:10
LogLength:458
Log Contents:
Traceback (most recent call last):
File "red.py", line 45, in <module>
func(*args)
File "red.py", line 34, in reduer_func
print "\t".join([cur_word, str(num)])
TypeError: sequence item 0: expected string, NoneType found
log4j:WARN No appenders could be found for logger (org.apache.hadoop.hdfs.DFSClient).
log4j:WARN Please initialize the log4j system properly.
log4j:WARN See http://logging.apache.org/log4j/1.2/faq.html#noconfig for more info.
解决方法
使用格式化输出
- 尽量不要使用"".join()的方法,而是使用字符串格式化的方式
强制类型转换为 str
- 强制转换类型为字符串,保证 join 的时候不会报错