#!/usr/bin/env bash
# Driver script: run a Hadoop Streaming job that computes the most similar
# cids from the combined pv input, then fetch the result to a local file.
set -euo pipefail

readonly hadoop_bin="/usr/bin/hadoop"
readonly hadoop_streaming_file="/opt/cloudera/parcels/CDH-5.11.0-1.cdh5.11.0.p0.34/lib/hadoop-mapreduce/hadoop-streaming.jar"
readonly hadoop_input_dir="hdfs://nameservice1/user/cid_pv_dir_combine"
# NOTE(review): "simiar" looks like a typo for "similar", but the HDFS path
# is kept as-is — other jobs may already reference this exact path.
readonly hadoop_output_dir="hdfs://nameservice1/user/cid_most_simiar_combine"

# Remove any previous job output; tolerate the path not existing yet
# (otherwise set -e would abort the script on the very first run).
"${hadoop_bin}" fs -rm -r "${hadoop_output_dir}" || true
# Launch the Hadoop Streaming job: ships a bundled Python runtime (test.tar)
# plus the doc2vec model files to each task, and runs the mapper/reducer
# scripts with that interpreter.
#
# BUG FIX: the original lines for uid_cids.model.wv.syn0.npy and
# uid_cids.model.syn1neg.npy ended in "npy\" with no space before the
# backslash, so shell line continuation glued the filename to the next
# line's option ("...npy-file ..."), corrupting the argument list.
#
# NOTE(review): mapred.reduce.tasks=0 disables the reduce phase entirely,
# which makes the -reducer, -partitioner, key-field and slowstart settings
# below inert — confirm whether reduces were meant to run.
# NOTE(review): 38192 MB (~37 GB) per map/reduce task is unusually large —
# possibly a typo for 8192; confirm against cluster container limits.
# NOTE(review): -file and -jobconf are deprecated spellings of -files and
# -D on this Hadoop version; kept as-is to avoid behavior changes.
"${hadoop_bin}" jar "${hadoop_streaming_file}" \
  -archives test.tar \
  -file mapper_cid_most_similar.py \
  -file reducer_cid_most_similar.py \
  -file uid_cids.model \
  -file uid_cids.bin \
  -file uid_cids.model.docvecs.doctag_syn0.npy \
  -file uid_cids.model.wv.syn0.npy \
  -file uid_cids.model.syn1neg.npy \
  -inputformat TextInputFormat \
  -input "${hadoop_input_dir}" \
  -output "${hadoop_output_dir}" \
  -mapper "test.tar/test/bin/python mapper_cid_most_similar.py" \
  -reducer "test.tar/test/bin/python reducer_cid_most_similar.py" \
  -jobconf mapred.job.map.capacity=100 \
  -jobconf mapred.job.reduce.capacity=100 \
  -jobconf mapreduce.map.memory.mb=38192 \
  -jobconf mapreduce.reduce.memory.mb=38192 \
  -jobconf mapred.map.tasks=100 \
  -jobconf mapred.reduce.tasks=0 \
  -jobconf mapred.map.maxrunning=200 \
  -jobconf mapred.reduce.maxrunning=200 \
  -jobconf mapred.job.name="doc2vec generate uid_rec_cids" \
  -jobconf mapred.reduce.slowstart.completed.maps=0.99 \
  -jobconf stream.num.map.output.key.fields=2 \
  -jobconf num.key.fields.for.partition=1 \
  -partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner \
  -jobconf mapred.job.priority=NORMAL
# Pull the (possibly compressed) job output from HDFS into a local file.
# The glob is quoted so the LOCAL shell cannot expand it — hadoop fs -text
# performs the globbing against HDFS itself.
# NOTE(review): local filename keeps the original "simiar" spelling since
# downstream consumers may depend on it.
"${hadoop_bin}" fs -text "${hadoop_output_dir}/*" > cid_most_simiar