1. map.py (mapper):
#!/home/work/python2.7/bin/python
# -*- coding:utf-8 -*-
import string
import sys, re, time, getopt, random, os, json
import sys
reload(sys)
sys.setdefaultencoding('utf8')
def mapper():
    """Read NUL-delimited records from stdin and emit the query field.

    Each input line is expected to contain NUL-separated fields; the
    second field (index 1) is the query, emitted one per output line.

    FIX: the original indexed items[1] unconditionally, so a blank or
    malformed line (no NUL separator) raised IndexError and killed the
    whole streaming task.  Such lines are now skipped.
    """
    for line in sys.stdin:
        items = line.strip().split('\0')
        # Guard: lines with fewer than two fields have no query to emit.
        if len(items) < 2:
            continue
        output = [items[1]]
        # print(x) with a single argument behaves identically under
        # Python 2 (parenthesized expression) and Python 3 (function).
        print('\t'.join(output))


if __name__ == '__main__':
    mapper()
2. red.py (reducer):
# coding:utf-8
import sys, re, time, getopt, random, os
import urllib
import datetime
from collections import defaultdict
def reducer():
    """Echo tab-separated records from stdin to stdout.

    Currently a pass-through: each line is stripped, split on tabs and
    re-joined, which normalizes away the trailing newline/whitespace.

    FIX: removed the dead local ``output = {}`` — it was never read or
    written.  ``print(x)`` with one argument is identical in Python 2
    and Python 3.
    """
    for line in sys.stdin:
        items = line.strip().split('\t')
        print('\t'.join(items))


if __name__ == "__main__":
    reducer()
3. Shell driver script:
#!/bin/bash
# Driver for the Hadoop streaming job: runs map.py / red.py over ${INPUT_PATH}
# and writes gzip-compressed output to ${OUTPUT_PATH}.
LIB_DIR=/home/work/local/upi_bin/lib
export HADOOP_CLASSPATH=${LIB_DIR}/upi-mr.jar
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${LIB_DIR}/native/Linux-amd64-64/

# FIX: 'unset' takes a variable NAME, not its value.  The original
# 'unset ${HADOOP_HOME}' expanded the variable first and never actually
# unset HADOOP_HOME.
unset HADOOP_HOME
export HADOOP_HOME=/home/work/local/hadoop-client/hadoop
HADOOP_BIN=/home/work/local/hadoop-client/hadoop/bin/hadoop
STREAMING=/home/work/local/hadoop-client/hadoop/contrib/streaming/hadoop-2-streaming.jar

# Default to the date three days ago when no argument is given.
if [ -z "$1" ]; then
    today_date=$(date -d "3 days ago" +%Y%m%d)
else
    today_date=$1
fi
# NOTE(review): yesterday_date is computed but never used below —
# presumably intended for INPUT_PATH; confirm before removing.
yesterday_date=$(date -d "${today_date} -7 days" +%Y%m%d)

OUTPUT_PATH=""   # TODO: fill in the real HDFS output path
INPUT_PATH=""    # TODO: fill in the real HDFS input path
HADOOP=/home/work/dschedulerAgent/dscheduler-opera/hadoop-2.7.5/bin/hadoop

# FIX: only delete when the path is non-empty — 'fs -rmr ""' is an error,
# and the original also deleted ${delete_PATH}, which was never defined.
if [ -n "${OUTPUT_PATH}" ]; then
    ${HADOOP} fs -rmr "${OUTPUT_PATH}"
fi

# FIX: a repeated -libjars flag overrides the earlier one (only the last
# value survives GenericOptionsParser); pass both jars as one
# comma-separated list instead.
${HADOOP_BIN} streaming \
    -libjars ${LIB_DIR}/upi-mr.jar,${LIB_DIR}/hive-exec-1.2.1.jar \
    -D mapred.job.priority=VERY_HIGH \
    -D num.key.fields.for.partition=1 \
    -partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner \
    -input ${INPUT_PATH} \
    -output ${OUTPUT_PATH} \
    -cacheArchive '**/Python27.tar.gz#Python27' \
    -mapper "Python27/Python-2.7.8/python map.py " \
    -reducer "Python27/Python-2.7.8/python red.py " \
    -file map.py \
    -file red.py \
    -inputformat com.baidu.udw.mapred.MultiTableInputFormat \
    -jobconf mapred.map.tasks=100 \
    -jobconf mapred.reduce.tasks=100 \
    -jobconf mapred.job.map.capacity=100 \
    -jobconf mapred.job.reduce.capacity=400 \
    -jobconf stream.recordreader.compression=gzip \
    -jobconf mapred.output.compress=true \
    -jobconf mapred.output.compression.codec=org.apache.hadoop.io.compress.GzipCodec \
    -jobconf mapred.job.name="testjob"

# Propagate the streaming job's failure to the caller.
if [ $? -ne 0 ]; then
    echo "job failed"
    exit 1
fi