文章标题

run.sh

#!/bin/bash
# Launch the Hadoop streaming job (mapper1.py + reducer1.py) for one shard.
#
# Usage: run.sh SHARD
#   SHARD is spliced into the second input glob (part-00SHARD*) and into the
#   name of the output directory (may0SHARD).
#
# Abort early when SHARD is missing: otherwise the glob degrades to part-00*
# and the directory ".../may0" would be deleted and rewritten.
shard=${1:?usage: run.sh SHARD}

HADOOP_HOME=/usr/local/webserver/hadoop
INPUT_PATH=/data/archive/app_oeudjgn5872a7c3aaa54_datamine/george/inputs
# The trailing * is meant for Hadoop, not for the local shell, so every use of
# this variable below is quoted to suppress local pathname expansion.
INPUT_PATH2="/user/resys/projects/image/koudai_img/d_img/reduced/2015-taobao/part-00${shard}*"

# One output directory per shard (presumably SHARD runs 0-9, giving
# may00 .. may09 -- confirm against how the script is invoked).
OUTPUT_PATH="/data/archive/app_oeudjgn5872a7c3aaa54_datamine/george/output/may0${shard}"
#OUTPUT_PATH=output
#echo "Clearing output path: $OUTPUT_PATH"
# Remove any previous output for this shard before the job runs.
# "-rm -r" replaces the deprecated "-rmr"; "-f" keeps a missing directory from
# being reported as an error.
"${HADOOP_HOME}/bin/hadoop" fs -rm -r -f "${OUTPUT_PATH}"

# Both -input paths feed the same mapper; the reducer then sees records from
# both sources together.
"${HADOOP_HOME}/bin/hadoop" jar \
  "${HADOOP_HOME}/share/hadoop/tools/lib/hadoop-streaming-2.6.0.jar" \
  -file mapper1.py \
  -file reducer1.py \
  -mapper mapper1.py \
  -reducer reducer1.py \
  -input "${INPUT_PATH}" \
  -input "${INPUT_PATH2}" \
  -output "${OUTPUT_PATH}" \
  -jobconf mapreduce.jobtracker.split.metainfo.maxsize=-1 \
  -jobconf mapred.job.queue.name="offline" \
  -jobconf mapred.job.priority="NORMAL" \
  -jobconf mapred.reduce.tasks="50"

mapper1.py

#!/usr/bin/env python
import sys 
import base64

# Streaming mapper: split each stdin line on whitespace and emit its first
# and third fields as one output record.
for raw_line in sys.stdin:
    fields = raw_line.split()
    # Index 2 is read below, so at least three fields are required. Shorter
    # lines (including blank ones) are skipped instead of raising IndexError
    # and failing the whole map task.
    if len(fields) >= 3:
        # Same bytes the Python 2 statement `print a, "\t", b` produces:
        # a space follows the first field, and none follows the tab.
        # Using write() keeps the script runnable under Python 2 and 3.
        sys.stdout.write("%s \t%s\n" % (fields[0], fields[2]))

reducer1.py

#!/usr/bin/env python
import sys 

# Streaming reducer: walk runs of consecutive records that share a key and,
# for every run with at least two records, emit one record whose value is
# long enough to be the image payload.

# A value at least this long is treated as the image.
IMAGE_MIN_LEN = 1000


def emit_group(first, latest, count):
    """Write the image record of one finished key group, if it has one.

    first  -- (key, value) of the first record seen for the key
    latest -- (key, value) of the most recent record seen for the key
    count  -- number of records seen for the key

    Groups with a single record are dropped. Otherwise the latest record is
    preferred, falling back to the first one; nothing is written when neither
    value reaches IMAGE_MIN_LEN. Records between first and latest are never
    considered.
    """
    if count <= 1:
        return
    for key, value in (latest, first):
        if len(value) >= IMAGE_MIN_LEN:
            # Same bytes the Python 2 statement `print key, '\t', value`
            # produces: a space follows the key, and none follows the tab.
            sys.stdout.write("%s \t%s\n" % (key, value))
            return


first_rec = None   # first (key, value) of the group being accumulated
latest_rec = None  # most recent (key, value) of that group
group_size = 0     # number of records in that group

for raw_line in sys.stdin:
    fields = raw_line.split()
    # Blank or one-field lines cannot form a record; skip them instead of
    # raising IndexError and failing the whole reduce task.
    if len(fields) < 2:
        continue
    record = (fields[0], fields[1])

    if first_rec is not None and record[0] == first_rec[0]:
        # Same key as the current group: remember only the newest record.
        latest_rec = record
        group_size += 1
    else:
        # Key changed (or very first record): flush the finished group and
        # start a new one.
        if first_rec is not None:
            emit_group(first_rec, latest_rec, group_size)
        first_rec = latest_rec = record
        group_size = 1

# Flush the final group; nothing to do when stdin had no usable records.
if first_rec is not None:
    emit_group(first_rec, latest_rec, group_size)
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值