#!/usr/bin/env bash
# start.sh — submit the Hadoop Streaming job that computes, for each cid,
# its most-similar cids from a doc2vec model, then pull the result locally.
set -euo pipefail

readonly hadoop_bin="/usr/bin/hadoop"
readonly hadoop_streaming_file="/opt/cloudera/parcels/CDH-5.11.0-1.cdh5.11.0.p0.34/lib/hadoop-mapreduce/hadoop-streaming.jar"
readonly hadoop_input_dir="hdfs://nameservice1/user/cid_pv_dir_combine"
readonly hadoop_output_dir="hdfs://nameservice1/user/cid_most_simiar_combine"

# Streaming refuses to start if the output dir already exists, so remove it.
# '|| true' keeps 'set -e' from aborting on the first run, when the dir is absent.
"${hadoop_bin}" fs -rm -r "${hadoop_output_dir}" || true
# Submit the streaming job. The Python mapper/reducer and the doc2vec model
# files are shipped to every task via -file; test.tar carries a bundled Python
# runtime (unpacked by -archives) used to run the scripts on the task nodes.
#
# NOTE(review): mapred.reduce.tasks=0 makes this a map-only job, so the
# -reducer, -partitioner and key-field jobconfs below are currently inert —
# confirm whether the reduce phase was meant to be enabled.
"${hadoop_bin}" jar "${hadoop_streaming_file}" \
    -archives test.tar \
    -file mapper_cid_most_similar.py \
    -file reducer_cid_most_similar.py \
    -file uid_cids.model \
    -file uid_cids.bin \
    -file uid_cids.model.docvecs.doctag_syn0.npy \
    -file uid_cids.model.wv.syn0.npy \
    -file uid_cids.model.syn1neg.npy \
    -inputformat TextInputFormat \
    -input "${hadoop_input_dir}" \
    -output "${hadoop_output_dir}" \
    -mapper "test.tar/test/bin/python mapper_cid_most_similar.py" \
    -reducer "test.tar/test/bin/python reducer_cid_most_similar.py" \
    -jobconf mapred.job.map.capacity=100 \
    -jobconf mapred.job.reduce.capacity=100 \
    -jobconf mapreduce.map.memory.mb=38192 \
    -jobconf mapreduce.reduce.memory.mb=38192 \
    -jobconf mapred.map.tasks=100 \
    -jobconf mapred.reduce.tasks=0 \
    -jobconf mapred.map.maxrunning=200 \
    -jobconf mapred.reduce.maxrunning=200 \
    -jobconf mapred.job.name="doc2vec generate uid_rec_cids" \
    -jobconf mapred.reduce.slowstart.completed.maps=0.99 \
    -jobconf stream.num.map.output.key.fields=2 \
    -jobconf num.key.fields.for.partition=1 \
    -partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner \
    -jobconf mapred.job.priority=NORMAL

# Concatenate all job output parts into a local file. The glob is quoted so
# the local shell cannot mangle it (e.g. under nullglob) and 'hadoop fs'
# expands the HDFS pattern itself. Output name 'cid_most_simiar' (sic) is
# kept as-is in case downstream consumers depend on it.
"${hadoop_bin}" fs -text "${hadoop_output_dir}/*" > cid_most_simiar

 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值