使用 fairseq 训练机器翻译模型时遇到了一个问题,输入的训练命令如下:
# Launch fairseq-train as a background job under nohup, sending both stdout
# and stderr to baselinetrain.log in the current directory. The arguments are
# collected in a bash array so each flag and value is passed as its own word.
train_args=(
  # Positional argument: the data directory given to fairseq-train.
  /data4/zxzhou/wnt22/ende/preprocessionresult

  # Model architecture.
  --arch transformer_iwslt_de_en
  --share-decoder-input-output-embed

  # Optimizer, learning-rate schedule and regularization.
  --optimizer adam
  --adam-betas '(0.9, 0.98)'
  --clip-norm 0.0
  --lr 5e-4
  --lr-scheduler inverse_sqrt
  --warmup-updates 512
  --dropout 0.3
  --weight-decay 0.0001

  # Training criterion.
  --criterion label_smoothed_cross_entropy
  --label-smoothing 0.1

  # Batch size limit and training length.
  --max-tokens 9216
  --max-epoch 50

  # Checkpoint saving and retention.
  --save-interval 5
  --keep-last-epochs 11

  # BLEU evaluation settings; the best checkpoint is chosen by the "bleu"
  # metric, which is maximized.
  --eval-bleu
  --eval-bleu-args '{"beam": 5, "max_len_a": 1.2, "max_len_b": 10}'
  --eval-bleu-detok moses
  --eval-bleu-remove-bpe
  --eval-bleu-print-samples
  --best-checkpoint-metric bleu
  --maximize-best-checkpoint-metric

  # Directory where checkpoints are written.
  --save-dir /data4/zxzhou/wnt22/ende/checkpoints/test_transformerbaseline/
)
nohup fairseq-train "${train_args[@]}" > baselinetrain.log 2>&1 &
训练完成后,程序在退出时报错,日志如下:
2024-04-27 13:45:21 | INFO | fairseq_cli.train | done training in 131604.6 seconds
Exception in thread Thread-3:
Exception in thread Thread-6:
Traceback (most recent call last):
Traceback (most recent call last):
File "/home/zxzhou/miniconda3/envs/fseq/lib/python3.10/threading.py", line 1009, in _bootstrap_inner
File "/home/zxzhou/miniconda3/envs/fseq/lib/python3.10/threading.py", line 1009, in _bootstrap_inner
Exception in thread Thread-7:
Traceback (most recent call last):
File "/home/zxzhou/miniconda3/envs/fseq/lib/python3.10/threading.py", line 1009, in _bootstrap_inner
self.run()
self.run()
self.run()
File "/home/zxzhou/miniconda3/envs/fseq/lib/python3.10/site-packages/tensorboardX/event_file_writer.py", line 202, in run
File "/home/zxzhou/miniconda3/envs/fseq/lib/python3.10/site-packages/tensorboardX/event_file_writer.py", line 202, in run
File "/home/zxzhou/miniconda3/envs/fseq/lib/python3.10/site-packages/tensorboardX/event_file_writer.py", line 202, in run
data = self._queue.get(True, queue_wait_duration)
data = self._queue.get(True, queue_wait_duration)
data = self._queue.get(True, queue_wait_duration)
File "/home/zxzhou/miniconda3/envs/fseq/lib/python3.10/multiprocessing/queues.py", line 117, in get
File "/home/zxzhou/miniconda3/envs/fseq/lib/python3.10/multiprocessing/queues.py", line 117, in get
File "/home/zxzhou/miniconda3/envs/fseq/lib/python3.10/multiprocessing/queues.py", line 117, in get
res = self._recv_bytes()
res = self._recv_bytes()
File "/home/zxzhou/miniconda3/envs/fseq/lib/python3.10/multiprocessing/connection.py", line 221, in recv_bytes
File "/home/zxzhou/miniconda3/envs/fseq/lib/python3.10/multiprocessing/connection.py", line 221, in recv_bytes
res = self._recv_bytes()
File "/home/zxzhou/miniconda3/envs/fseq/lib/python3.10/multiprocessing/connection.py", line 221, in recv_bytes
buf = self._recv_bytes(maxlength)
buf = self._recv_bytes(maxlength)
File "/home/zxzhou/miniconda3/envs/fseq/lib/python3.10/multiprocessing/connection.py", line 419, in _recv_bytes
File "/home/zxzhou/miniconda3/envs/fseq/lib/python3.10/multiprocessing/connection.py", line 419, in _recv_bytes
buf = self._recv_bytes(maxlength)
File "/home/zxzhou/miniconda3/envs/fseq/lib/python3.10/multiprocessing/connection.py", line 419, in _recv_bytes
buf = self._recv(4)
buf = self._recv(4)
File "/home/zxzhou/miniconda3/envs/fseq/lib/python3.10/multiprocessing/connection.py", line 388, in _recv
buf = self._recv(4)
File "/home/zxzhou/miniconda3/envs/fseq/lib/python3.10/multiprocessing/connection.py", line 388, in _recv
raise EOFError
File "/home/zxzhou/miniconda3/envs/fseq/lib/python3.10/multiprocessing/connection.py", line 388, in _recv
EOFError
raise EOFError
raise EOFError
EOFError
EOFError
/home/zxzhou/miniconda3/envs/fseq/lib/python3.10/multiprocessing/resource_tracker.py:224: UserWarning: resource_tracker: There appear to be 600 leaked semaphore objects to clean up at shutdown
warnings.warn('resource_tracker: There appear to be %d '