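# Usage notes:
#   1. Set TOOLS_PATH below to the location of your Moses installation.
#   2. Change the file names (SRC_TEST / TGT_TEST) as needed.
#   3. If you get a permission error, cd into the tool folder and run: chmod 777 -R Moses
#   4. Finally, run: bash deal_smg.sh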
#
# Created by Jxu 2019-06-20
# Use Moses to process the '.sgm' files.
#
# For each of SRC_TEST and TGT_TEST, "<name>.sgm" is read, passed through the
# Moses input-from-sgm / normalize-punctuation / remove-non-printing-char /
# tokenizer scripts, and the result is written to "<name>" (overwriting it).
#
# Exit status: 0 on success, non-zero if an input file or a Moses script is
# missing, or if any stage of a pipeline fails.
#
# -e aborts on the first failing command; pipefail makes a failure in ANY stage
# of a pipeline count (without it only the last stage's status is checked, so a
# broken early stage would silently yield an empty output file).
set -eo pipefail

# Print a message on stderr and abort with a failing exit status.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

#
# Data preprocessing configuration
#
# NOTE: only N_THREADS is referenced in this part of the script; the other
# values are kept unchanged in case something else relies on them.
N_MONO=10000000 # number of monolingual sentences for each language
CODES=60000     # number of BPE codes
N_THREADS=48    # number of threads in data preprocessing
N_EPOCHS=10     # number of fastText epochs

#
# Initialize tools and data paths
#
# Everything is resolved relative to the directory the script is started from.
UMT_PATH="$PWD"
TOOLS_PATH="$PWD/transformer3-12/tool/Moses"
DATA_PATH="$PWD"
PARA_PATH="$DATA_PATH"

# moses
MOSES="$TOOLS_PATH/mosesdecoder-master"
TOKENIZER="$MOSES/scripts/tokenizer/tokenizer.perl"
NORM_PUNC="$MOSES/scripts/tokenizer/normalize-punctuation.perl"
INPUT_FROM_SGM="$MOSES/scripts/ems/support/input-from-sgm.perl"
REM_NON_PRINT_CHAR="$MOSES/scripts/tokenizer/remove-non-printing-char.perl"

# Output files; the corresponding inputs are these names with ".sgm" appended.
SRC_TEST="$PARA_PATH/newstest2019-zhen-ref.en"
TGT_TEST="$PARA_PATH/newstest2019-enzh-ref.zh"

#######################################
# Run the Moses pipeline on one '.sgm' file.
# Globals:   INPUT_FROM_SGM, NORM_PUNC, REM_NON_PRINT_CHAR, TOKENIZER,
#            N_THREADS (all read)
# Arguments: $1 - output path; the input is read from "$1.sgm"
#            $2 - language code passed to the Moses scripts via -l
# Outputs:   writes the tokenized text to "$1"
# Returns:   non-zero if any pipeline stage fails (pipefail)
#######################################
tokenize_sgm() {
  local base=$1
  local lang=$2
  "$INPUT_FROM_SGM" < "$base.sgm" \
    | "$NORM_PUNC" -l "$lang" \
    | "$REM_NON_PRINT_CHAR" \
    | "$TOKENIZER" -l "$lang" -no-escape -threads "$N_THREADS" \
    > "$base"
}

# Fail (with a non-zero status) when an input file is absent. A bare `exit`
# here would return the status of the preceding message command, i.e. success.
[[ -f "$SRC_TEST.sgm" ]] || die "$SRC_TEST.sgm is not found!"
[[ -f "$TGT_TEST.sgm" ]] || die "$TGT_TEST.sgm is not found!"

# The Moses scripts are executed directly, so they must exist and carry the
# execute bit; report that clearly instead of failing inside a pipeline.
for tool in "$INPUT_FROM_SGM" "$NORM_PUNC" "$REM_NON_PRINT_CHAR" "$TOKENIZER"; do
  [[ -x "$tool" ]] || die "$tool is missing or not executable!"
done

echo "Tokenizing valid and test data..."
tokenize_sgm "$SRC_TEST" en
tokenize_sgm "$TGT_TEST" zh