#!/usr/bin/env bash
# Submit a Hadoop-streaming doc2vec job (most-similar cids per user) and
# pull the result back to a local file.
set -euo pipefail

readonly hadoop_bin="/usr/bin/hadoop"
readonly hadoop_streaming_file="/opt/cloudera/parcels/CDH-5.11.0-1.cdh5.11.0.p0.34/lib/hadoop-mapreduce/hadoop-streaming.jar"
readonly hadoop_input_dir="hdfs://nameservice1/user/cid_pv_dir_combine"
# NOTE(review): "simiar" looks like a typo for "similar", but it is part of the
# HDFS path contract shared with downstream consumers — do not rename blindly.
readonly hadoop_output_dir="hdfs://nameservice1/user/cid_most_simiar_combine"

# Remove a previous run's output; tolerate "does not exist" on the first run
# so set -e does not abort the script here.
"${hadoop_bin}" fs -rm -r "${hadoop_output_dir}" || true
# Submit the streaming job as ONE command.
# Fixes vs. the original:
#   * every continuation line now ends with "\" — without them each option
#     line was parsed (and failed) as a separate shell command;
#   * the Unicode curly quotes around -mapper/-reducer/-job.name were replaced
#     with ASCII double quotes — the shell does not treat “ ” as quoting
#     characters, so those arguments would have been split and mangled;
#   * variable expansions are quoted.
# NOTE(review): mapred.reduce.tasks=0 makes this a map-only job, so the
# -reducer / partitioner / slowstart settings below appear to be inert —
# confirm whether reduces were meant to be enabled.
"${hadoop_bin}" jar "${hadoop_streaming_file}" \
  -archives test.tar \
  -file mapper_cid_most_similar.py \
  -file reducer_cid_most_similar.py \
  -file uid_cids.model \
  -file uid_cids.bin \
  -file uid_cids.model.docvecs.doctag_syn0.npy \
  -file uid_cids.model.wv.syn0.npy \
  -file uid_cids.model.syn1neg.npy \
  -inputformat TextInputFormat \
  -input "${hadoop_input_dir}" \
  -output "${hadoop_output_dir}" \
  -mapper "test.tar/test/bin/python mapper_cid_most_similar.py" \
  -reducer "test.tar/test/bin/python reducer_cid_most_similar.py" \
  -jobconf mapred.job.map.capacity=100 \
  -jobconf mapred.job.reduce.capacity=100 \
  -jobconf mapreduce.map.memory.mb=38192 \
  -jobconf mapreduce.reduce.memory.mb=38192 \
  -jobconf mapred.map.tasks=100 \
  -jobconf mapred.reduce.tasks=0 \
  -jobconf mapred.map.maxrunning=200 \
  -jobconf mapred.reduce.maxrunning=200 \
  -jobconf mapred.job.name="doc2vec generate uid_rec_cids" \
  -jobconf mapred.reduce.slowstart.completed.maps=0.99 \
  -jobconf stream.num.map.output.key.fields=2 \
  -jobconf num.key.fields.for.partition=1 \
  -partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner \
  -jobconf mapred.job.priority=NORMAL
# Download the (possibly compressed) job output; `fs -text` decodes
# SequenceFile/compressed parts to plain text.
# NOTE(review): the "cid_most_simiar" filename typo is preserved on purpose —
# downstream steps may depend on this exact name.
"${hadoop_bin}" fs -text "${hadoop_output_dir}"/* > cid_most_simiar