125| 0%| | 0/1 [00:00<?, ?it/s] 0%| | 0/1 [03:24<?, ?it/s]
126| Traceback (most recent call last):
127| File "/nfs/volume-826-2/carlos/0830_dector/ja/AA_gector-master/train.py", line 305, in <module>
128| args = parser.parse_args()
129| File "/nfs/volume-826-2/carlos/0830_dector/ja/AA_gector-master/train.py", line 174, in main
130| print("Start training")
131| File "/nfs/volume-826-2/carlos/0830_dector/ja/AA_gector-master/gector/trainer.py", line 689, in train
132| train_metrics = self._train_epoch(epoch)
133| File "/nfs/volume-826-2/carlos/0830_dector/ja/AA_gector-master/gector/trainer.py", line 477, in _train_epoch
134| loss = self.batch_loss(batch_group, for_training=True) / iter_len
135| File "/nfs/volume-826-2/carlos/0830_dector/ja/AA_gector-master/gector/trainer.py", line 381, in batch_loss
136| output_dict = training_util.data_parallel(batch_group, self.model, self._cuda_devices)
137| File "/usr/local/lib/python3.6/dist-packages/allennlp/training/util.py", line 332, in data_parallel
138| outputs = parallel_apply(replicas, inputs, moved, used_device_ids)
139| File "/usr/local/lib/python3.6/dist-packages/torch/nn/parallel/parallel_apply.py", line 86, in parallel_apply
140| output.reraise()
141| File "/usr/local/lib/python3.6/dist-packages/torch/_utils.py", line 425, in reraise
142| raise self.exc_type(msg)
143| StopIteration: Caught StopIteration in replica 1 on device 1.
144| Original Traceback (most recent call last):
145| File "/usr/local/lib/python3.6/dist-packages/torch/nn/parallel/parallel_apply.py", line 61, in _worker
146| output = module(*input, **kwargs)
147| File "/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py", line 1051, in _call_impl
148| return forward_call(*input, **kwargs)
149| File "/nfs/volume-826-2/carlos/0830_dector/ja/AA_gector-master/gector/seq2labels_model.py", line 130, in forward
150| encoded_text = self.text_field_embedder(tokens)
151| File "/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py", line 1051, in _call_impl
152| return forward_call(*input, **kwargs)
153| File "/usr/local/lib/python3.6/dist-packages/allennlp/modules/text_field_embedders/basic_text_field_embedder.py", line 110, in forward
154| token_vectors = embedder(*tensors)
155| File "/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py", line 1051, in _call_impl
156| return forward_call(*input, **kwargs)
157| File "/nfs/volume-826-2/carlos/0830_dector/ja/AA_gector-master/gector/bert_token_embedder.py", line 147, in forward
158| attention_mask=util.combine_initial_dims(input_mask),
159| File "/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py", line 1051, in _call_impl
160| return forward_call(*input, **kwargs)
161| File "/usr/local/lib/python3.6/dist-packages/transformers/modeling_bert.py", line 694, in forward
162| extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility
163| StopIteration
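这是我遇到的一个问题。一开始我以为是 P40 不支持半精度(fp16)导致的,后来换成了 2080 Ti,发现还是不行,于是开始重新考量错误的原因。从上面的堆栈可以看到,异常出现在 replica 1(device 1)上的 `next(self.parameters())` 处:该副本的 `parameters()` 迭代器为空,因此抛出了 StopIteration。最终发现是由于模型重复加载导致的。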