#!/bin/bash
# Submit a Hadoop Streaming job that computes the minimum value in the input.
# NOTE: the original shebang was "#!/usr/bin/env python" — wrong for a shell
# script; this file is bash and must be interpreted as such.
QUEUE="queue_1101_01"
JOBNAME="find_min"
Anaconda_python="/appcom/AnacondaInstall/anaconda-2.1.0/bin/python"
input_file_list="/apps-data/usr/"
output_file_list="/apps-data/usr_outfile/"
mapper_file="/home/usr/mapper.py"
reducer_file="/home/usr/reducer.py"

# Remove any previous output directory: streaming fails fast if it exists.
# (-rmr is the Hadoop 1.x form, matching the 1.2.1 streaming jar below;
#  on Hadoop 2+ use "hadoop fs -rm -r".)
hadoop fs -rmr "${output_file_list}"

# mapred.map.tasks (plural) is the real property name; the original
# "mapred.map.task" was silently ignored by the framework.
/appcom/hadoop/bin/hadoop jar /appcom/hadoop/contrib/streaming/hadoop-streaming-1.2.1.jar \
-D mapred.job.name="$JOBNAME" \
-D mapred.job.queue.name="$QUEUE" \
-D mapred.job.priority=NORMAL \
-D mapred.map.tasks=1 \
-D mapred.reduce.tasks=1 \
-D mapred.job.map.capacity=10 \
-D mapred.job.reduce.capacity=10 \
-D stream.map.output.field.separator="\t" \
-input "${input_file_list}" \
-output "${output_file_list}" \
-mapper "${Anaconda_python} mapper.py" \
-reducer "${Anaconda_python} reducer.py" \
-file "${mapper_file}" \
-file "${reducer_file}"
##mapper.py
#!/usr/bin/env python
"""Streaming mapper: emit the minimum of each input line.

Reads whitespace-separated integers from stdin and writes
"0<TAB><line minimum>" so every record shares the constant key 0
and is routed to a single reducer.
"""
import sys


def read_input(file):
    """Yield each line of *file* split into whitespace-separated tokens."""
    for line in file:
        yield line.split()


def main(separator='\t'):
    """Print key 0 and the per-line minimum for every non-empty stdin line.

    Blank lines are skipped: the original ``min([...])`` raised
    ValueError on an empty token list and killed the map task.
    """
    data = read_input(sys.stdin)
    for numbers in data:
        if not numbers:
            continue  # blank line — nothing to take the minimum of
        smallest = min(int(x) for x in numbers)
        # print() with one parenthesized argument is valid in Python 2 and 3.
        print('%s%s%s' % (0, separator, smallest))


if __name__ == "__main__":
    main()
##reducer.py
#!/usr/bin/env python
"""Streaming reducer: emit the global minimum over all mapper records."""
import sys


def read_mapper_output(file, separator='\t'):
    """Yield each line of *file* split on *separator* (tab by default)."""
    for line in file:
        yield line.strip().split(separator)


def main(separator='\t'):
    """Print key 0 and the minimum value across all "key<TAB>value" records.

    Skips blank or malformed lines (the original indexed ``numbers[1]``
    unconditionally and raised IndexError on them), and prints nothing
    when no records arrive (the original wrote the literal "0\\tNone"
    into the job output).
    """
    data = read_mapper_output(sys.stdin, separator=separator)
    smallest = None
    for numbers in data:
        if len(numbers) < 2 or not numbers[1]:
            continue  # blank or malformed record
        value = int(numbers[1])
        # "is None" (identity) is the idiomatic null check, not "== None".
        if smallest is None or value < smallest:
            smallest = value
    if smallest is not None:
        print('%s%s%s' % (0, separator, smallest))


if __name__ == "__main__":
    main()
Reposted from: https://my.oschina.net/kyo4321/blog/1358446