Skip to content

为什么训练到166个epoch之后报错了呢 #5

@BooleanYu

Description

@BooleanYu

File "train.py", line 30, in <module>
trainer_defaults={"plugins": DDPPlugin(find_unused_parameters=True)},
File "/root/miniconda3/envs/HMER/lib/python3.7/site-packages/pytorch_lightning/utilities/cli.py", line 289, in __init__
self.fit()
File "/root/miniconda3/envs/HMER/lib/python3.7/site-packages/pytorch_lightning/utilities/cli.py", line 432, in fit
self.trainer.fit(**self.fit_kwargs)
File "/root/miniconda3/envs/HMER/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 552, in fit
self._run(model)
File "/root/miniconda3/envs/HMER/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 922, in _run
self._dispatch()
File "/root/miniconda3/envs/HMER/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 990, in _dispatch
self.accelerator.start_training(self)
File "/root/miniconda3/envs/HMER/lib/python3.7/site-packages/pytorch_lightning/accelerators/accelerator.py", line 92, in start_training
self.training_type_plugin.start_training(trainer)
File "/root/miniconda3/envs/HMER/lib/python3.7/site-packages/pytorch_lightning/plugins/training_type/training_type_plugin.py", line 161, in start_training
self._results = trainer.run_stage()
File "/root/miniconda3/envs/HMER/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 1000, in run_stage
return self._run_train()
File "/root/miniconda3/envs/HMER/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 1049, in _run_train
self.fit_loop.run()
File "/root/miniconda3/envs/HMER/lib/python3.7/site-packages/pytorch_lightning/loops/base.py", line 111, in run
self.advance(*args, **kwargs)
File "/root/miniconda3/envs/HMER/lib/python3.7/site-packages/pytorch_lightning/loops/fit_loop.py", line 200, in advance
epoch_output = self.epoch_loop.run(train_dataloader)
File "/root/miniconda3/envs/HMER/lib/python3.7/site-packages/pytorch_lightning/loops/base.py", line 111, in run
self.advance(*args, **kwargs)
File "/root/miniconda3/envs/HMER/lib/python3.7/site-packages/pytorch_lightning/loops/epoch/training_epoch_loop.py", line 130, in advance
batch_output = self.batch_loop.run(batch, self.iteration_count, self._dataloader_idx)
File "/root/miniconda3/envs/HMER/lib/python3.7/site-packages/pytorch_lightning/loops/batch/training_batch_loop.py", line 100, in run
super().run(batch, batch_idx, dataloader_idx)
File "/root/miniconda3/envs/HMER/lib/python3.7/site-packages/pytorch_lightning/loops/base.py", line 111, in run
self.advance(*args, **kwargs)
File "/root/miniconda3/envs/HMER/lib/python3.7/site-packages/pytorch_lightning/loops/batch/training_batch_loop.py", line 147, in advance
result = self._run_optimization(batch_idx, split_batch, opt_idx, optimizer)
File "/root/miniconda3/envs/HMER/lib/python3.7/site-packages/pytorch_lightning/loops/batch/training_batch_loop.py", line 201, in _run_optimization
self._optimizer_step(optimizer, opt_idx, batch_idx, closure)
File "/root/miniconda3/envs/HMER/lib/python3.7/site-packages/pytorch_lightning/loops/batch/training_batch_loop.py", line 403, in _optimizer_step
using_lbfgs=is_lbfgs,
File "/root/miniconda3/envs/HMER/lib/python3.7/site-packages/pytorch_lightning/core/lightning.py", line 1616, in optimizer_step
optimizer.step(closure=optimizer_closure)
File "/root/miniconda3/envs/HMER/lib/python3.7/site-packages/pytorch_lightning/core/optimizer.py", line 206, in step
self.__optimizer_step(closure=closure, profiler_name=profiler_name, **kwargs)
File "/root/miniconda3/envs/HMER/lib/python3.7/site-packages/pytorch_lightning/core/optimizer.py", line 128, in __optimizer_step
trainer.accelerator.optimizer_step(self._optimizer, self._optimizer_idx, lambda_closure=closure, **kwargs)
File "/root/miniconda3/envs/HMER/lib/python3.7/site-packages/pytorch_lightning/accelerators/accelerator.py", line 296, in optimizer_step
self.run_optimizer_step(optimizer, opt_idx, lambda_closure, **kwargs)
File "/root/miniconda3/envs/HMER/lib/python3.7/site-packages/pytorch_lightning/accelerators/accelerator.py", line 303, in run_optimizer_step
self.training_type_plugin.optimizer_step(optimizer, lambda_closure=lambda_closure, **kwargs)
File "/root/miniconda3/envs/HMER/lib/python3.7/site-packages/pytorch_lightning/plugins/training_type/training_type_plugin.py", line 226, in optimizer_step
optimizer.step(closure=lambda_closure, **kwargs)
File "/root/miniconda3/envs/HMER/lib/python3.7/site-packages/torch/optim/optimizer.py", line 89, in wrapper
return func(*args, **kwargs)
File "/root/miniconda3/envs/HMER/lib/python3.7/site-packages/torch/autograd/grad_mode.py", line 27, in decorate_context
return func(*args, **kwargs)
File "/root/miniconda3/envs/HMER/lib/python3.7/site-packages/torch/optim/sgd.py", line 87, in step
loss = closure()
File "/root/miniconda3/envs/HMER/lib/python3.7/site-packages/pytorch_lightning/loops/batch/training_batch_loop.py", line 235, in _training_step_and_backward_closure
result = self.training_step_and_backward(split_batch, batch_idx, opt_idx, optimizer, hiddens)
File "/root/miniconda3/envs/HMER/lib/python3.7/site-packages/pytorch_lightning/loops/batch/training_batch_loop.py", line 548, in training_step_and_backward
self.backward(result, optimizer, opt_idx)
File "/root/miniconda3/envs/HMER/lib/python3.7/site-packages/pytorch_lightning/loops/batch/training_batch_loop.py", line 589, in backward
result.closure_loss = self.trainer.accelerator.backward(result.closure_loss, optimizer, *args, **kwargs)
File "/root/miniconda3/envs/HMER/lib/python3.7/site-packages/pytorch_lightning/accelerators/accelerator.py", line 276, in backward
self.precision_plugin.backward(self.lightning_module, closure_loss, optimizer, *args, **kwargs)
File "/root/miniconda3/envs/HMER/lib/python3.7/site-packages/pytorch_lightning/plugins/precision/precision_plugin.py", line 78, in backward
model.backward(closure_loss, optimizer, *args, **kwargs)
File "/root/miniconda3/envs/HMER/lib/python3.7/site-packages/pytorch_lightning/core/lightning.py", line 1479, in backward
loss.backward(*args, **kwargs)
File "/root/miniconda3/envs/HMER/lib/python3.7/site-packages/torch/tensor.py", line 245, in backward
torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
File "/root/miniconda3/envs/HMER/lib/python3.7/site-packages/torch/autograd/__init__.py", line 147, in backward
allow_unreachable=True, accumulate_grad=True) # allow_unreachable flag
RuntimeError: CUDA error: an illegal memory access was encountered

报错内容这样 是什么原因呢

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions