mapper:
import sys,hashlib,struct,os
from urllib import unquote
def map_lines(lines):
    """Yield one ``key<TAB>1`` record for every non-blank input line.

    The key is the first tab-separated field of the line after surrounding
    whitespace has been stripped.

    Blank (whitespace-only) lines are skipped: they have no key, and emitting
    them would produce the record ``"\\t1"``.  Any consumer that strips the
    record before splitting it (as the reducer in this file does) would then
    read that record as the key ``"1"``.

    :param lines: iterable of raw text lines (e.g. ``sys.stdin``).
    :return: generator of output records without a trailing newline.
    """
    for raw in lines:
        record = raw.strip()
        if not record:
            continue
        # After strip() the record starts with a non-whitespace character,
        # so the first field is guaranteed to be non-empty.
        yield record.split("\t")[0] + "\t1"


if __name__ == "__main__":
    # sys.stdout.write emits exactly what the original print statement did
    # and behaves the same under Python 2 and Python 3.
    for out_record in map_lines(sys.stdin):
        sys.stdout.write(out_record + "\n")
reducer:
import sys,re
def duplicate_keys(lines):
    """Yield the key of every record that repeats the previous record's key.

    The key is the first tab-separated field of the stripped line.  A key
    that occurs ``n`` times in a row is yielded ``n - 1`` times (once for
    every occurrence after the first), so the input must arrive with equal
    keys adjacent to each other.

    :param lines: iterable of raw text lines (e.g. ``sys.stdin``).
    :return: generator of duplicated keys, in input order.
    """
    # None can never equal a real key.  The previous '' sentinel made a blank
    # first record look like a repeat of an earlier record.
    previous_key = None
    for raw in lines:
        key = raw.strip().split("\t")[0]
        if key == previous_key:
            yield key
        else:
            previous_key = key


if __name__ == "__main__":
    # Guarded like the mapper so that importing this module does not start
    # consuming stdin.
    for dup in duplicate_keys(sys.stdin):
        sys.stdout.write(dup + "\n")
运行脚本:
# Launch the Hadoop streaming job (mapper.py + reducer.py) and verify that it
# completed.
#
# Usage: run.sh day input output
#   day    - date string accepted by `date -d`; normalized to YYYYMMDD and
#            used to tag the failure message
#   input  - input directory; the script aborts if the existence test fails
#   output - output directory; the script aborts if it already exists
#
# Exits with a non-zero status on bad usage, failed directory checks, or when
# the job log does not contain 'completed successfully'.
if [[ $# -ne 3 ]]
then
    echo $#
    echo "run.sh day input output"
    exit -1
fi

day=`date +%Y%m%d -d "${1}"`
input_dir=${2}
output=${3}
log_file=log.txt
streaming_jar=hadoop-streaming-2.3.0-cdh5.1.0.jar

# fs_kd is an external filesystem CLI; presumably `-test -d` exits 0 when the
# path is an existing directory (the messages below rely on that) -- confirm.
fs_kd -b task_rcm -u vitamin -test -d "${input_dir}"
if [[ $? -ne 0 ]]
then
    echo "input dir not exists: ${input_dir}"
    exit -1
fi

fs_kd -b task_rcm -u vitamin -test -d "${output}"
if [[ $? -eq 0 ]]
then
    echo "output dir exists: ${output}"
    exit -1
fi

# The map output key is two fields wide (key, 1).  reducer.py detects
# duplicates by comparing each record with the previous one, so equal keys
# must be adjacent: sort on field 1 first.  Sorting on field 2 alone (always
# "1") would leave the record order undefined.
#
# The job output is copied to ${log_file} because the completion check below
# reads that file.
hadoop jar "${streaming_jar}" -archives "${streaming_jar}" \
    -D mapred.reduce.tasks=1 \
    -D mapreduce.job.name=model_utils \
    -D mapreduce.job.queuename="task_rcm" \
    -D stream.num.map.output.key.fields=2 \
    -D num.key.fields.for.partition=1 \
    -D mapred.text.key.comparator.options="-k1,1 -k2,2nr" \
    -D mapred.output.key.comparator.class=org.apache.hadoop.mapred.lib.KeyFieldBasedComparator \
    -partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner \
    -mapper mapper.py \
    -file mapper.py \
    -reducer reducer.py \
    -file reducer.py \
    -input "${input_dir}" \
    -output "${output}" 2>&1 | tee "${log_file}"

# An empty or missing count is treated as 0 by the arithmetic test below.
is_completed=`grep -c -e 'completed successfully' "${log_file}"`
if [[ $is_completed -lt 1 ]]
then
    echo "${day}: failed"
    exit -1
fi