You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
Traceback (most recent call last):
File "/home/code/BEVStereo-master/exps/bev_stereo_lss_r50_256x704_128x128_20e_cbgs_2key_da_ema.py", line 90, in
run_cli()
File "/home/code/BEVStereo-master/exps/bev_stereo_lss_r50_256x704_128x128_20e_cbgs_2key_da_ema.py", line 86, in run_cli
main(args)
File "/home/code/BEVStereo-master/exps/bev_stereo_lss_r50_256x704_128x128_20e_cbgs_2key_da_ema.py", line 56, in main
trainer.fit(model)
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 772, in fit
self._fit_impl, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 724, in _call_and_handle_interrupt
return trainer_fn(*args, **kwargs)
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 812, in _fit_impl
results = self._run(model, ckpt_path=self.ckpt_path)
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 1237, in _run
results = self._run_stage()
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 1324, in _run_stage
return self._run_train()
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 1354, in _run_train
self.fit_loop.run()
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/loops/base.py", line 204, in run
self.advance(*args, **kwargs)
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/loops/fit_loop.py", line 269, in advance
self._outputs = self.epoch_loop.run(self._data_fetcher)
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/loops/base.py", line 204, in run
self.advance(*args, **kwargs)
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/loops/epoch/training_epoch_loop.py", line 208, in advance
batch_output = self.batch_loop.run(batch, batch_idx)
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/loops/base.py", line 204, in run
self.advance(*args, **kwargs)
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/loops/batch/training_batch_loop.py", line 88, in advance
outputs = self.optimizer_loop.run(split_batch, optimizers, batch_idx)
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/loops/base.py", line 204, in run
self.advance(*args, **kwargs)
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 207, in advance
self.optimizer_idx,
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 256, in _run_optimization
self._optimizer_step(optimizer, opt_idx, batch_idx, closure)
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 378, in _optimizer_step
using_lbfgs=is_lbfgs,
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 1596, in _call_lightning_module_hook
output = fn(*args, **kwargs)
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/core/lightning.py", line 1625, in optimizer_step
optimizer.step(closure=optimizer_closure)
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/core/optimizer.py", line 168, in step
step_output = self._strategy.optimizer_step(self._optimizer, self._optimizer_idx, closure, **kwargs)
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/strategies/ddp.py", line 278, in optimizer_step
optimizer_output = super().optimizer_step(optimizer, opt_idx, closure, model, **kwargs)
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/strategies/strategy.py", line 193, in optimizer_step
return self.precision_plugin.optimizer_step(model, optimizer, opt_idx, closure, **kwargs)
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/plugins/precision/native_amp.py", line 85, in optimizer_step
closure_result = closure()
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 148, in call
self._result = self.closure(*args, **kwargs)
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 143, in closure
self._backward_fn(step_output.closure_loss)
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 311, in backward_fn
self.trainer._call_strategy_hook("backward", loss, optimizer, opt_idx)
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 1766, in _call_strategy_hook
output = fn(*args, **kwargs)
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/strategies/strategy.py", line 168, in backward
self.precision_plugin.backward(self.lightning_module, closure_loss, *args, **kwargs)
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/plugins/precision/precision_plugin.py", line 80, in backward
model.backward(closure_loss, optimizer, *args, **kwargs)
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/core/lightning.py", line 1370, in backward
loss.backward(*args, **kwargs)
File "/opt/conda/lib/python3.7/site-packages/torch/_tensor.py", line 307, in backward
torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
File "/opt/conda/lib/python3.7/site-packages/torch/autograd/init.py", line 156, in backward
allow_unreachable=True, accumulate_grad=True) # allow_unreachable flag
File "/opt/conda/lib/python3.7/site-packages/torch/autograd/function.py", line 199, in apply
return user_fn(self, *args)
File "/opt/conda/lib/python3.7/site-packages/torch/utils/checkpoint.py", line 138, in backward
torch.autograd.backward(outputs_with_grad, args_with_grad)
File "/opt/conda/lib/python3.7/site-packages/torch/autograd/init.py", line 156, in backward
allow_unreachable=True, accumulate_grad=True) # allow_unreachable flag
RuntimeError: Expected to mark a variable ready only once. This error is caused by one of the following reasons: 1) Use of a module parameter outside the forward function. Please make sure model parameters are not shared across multiple concurrent forward-backward passes. or try to use _set_static_graph() as a workaround if this module graph does not change during training loop.2) Reused parameters in multiple reentrant backward passes. For example, if you use multiple checkpoint functions to wrap the same part of your model, it would result in the same set of parameters been used by different reentrant backward passes multiple times, and hence marking a variable ready multiple times. DDP does not support such use cases in default. You can try to use _set_static_graph() as a workaround if your module graph does not change over iterations.
Parameter at index 327 has been marked as ready twice. This means that multiple autograd engine hooks have fired for this particular parameter during this iteration. You can set the environment variable TORCH_DISTRIBUTED_DEBUG to either INFO or DETAIL to print parameter names for further debugging.
您好,我在将bacobone替换成vovnet(v2-99)的过程中,遇到了上述问题,具体报错信息如下:
Traceback (most recent call last):
File "/home/code/BEVStereo-master/exps/bev_stereo_lss_r50_256x704_128x128_20e_cbgs_2key_da_ema.py", line 90, in
run_cli()
File "/home/code/BEVStereo-master/exps/bev_stereo_lss_r50_256x704_128x128_20e_cbgs_2key_da_ema.py", line 86, in run_cli
main(args)
File "/home/code/BEVStereo-master/exps/bev_stereo_lss_r50_256x704_128x128_20e_cbgs_2key_da_ema.py", line 56, in main
trainer.fit(model)
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 772, in fit
self._fit_impl, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 724, in _call_and_handle_interrupt
return trainer_fn(*args, **kwargs)
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 812, in _fit_impl
results = self._run(model, ckpt_path=self.ckpt_path)
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 1237, in _run
results = self._run_stage()
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 1324, in _run_stage
return self._run_train()
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 1354, in _run_train
self.fit_loop.run()
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/loops/base.py", line 204, in run
self.advance(*args, **kwargs)
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/loops/fit_loop.py", line 269, in advance
self._outputs = self.epoch_loop.run(self._data_fetcher)
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/loops/base.py", line 204, in run
self.advance(*args, **kwargs)
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/loops/epoch/training_epoch_loop.py", line 208, in advance
batch_output = self.batch_loop.run(batch, batch_idx)
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/loops/base.py", line 204, in run
self.advance(*args, **kwargs)
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/loops/batch/training_batch_loop.py", line 88, in advance
outputs = self.optimizer_loop.run(split_batch, optimizers, batch_idx)
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/loops/base.py", line 204, in run
self.advance(*args, **kwargs)
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 207, in advance
self.optimizer_idx,
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 256, in _run_optimization
self._optimizer_step(optimizer, opt_idx, batch_idx, closure)
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 378, in _optimizer_step
using_lbfgs=is_lbfgs,
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 1596, in _call_lightning_module_hook
output = fn(*args, **kwargs)
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/core/lightning.py", line 1625, in optimizer_step
optimizer.step(closure=optimizer_closure)
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/core/optimizer.py", line 168, in step
step_output = self._strategy.optimizer_step(self._optimizer, self._optimizer_idx, closure, **kwargs)
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/strategies/ddp.py", line 278, in optimizer_step
optimizer_output = super().optimizer_step(optimizer, opt_idx, closure, model, **kwargs)
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/strategies/strategy.py", line 193, in optimizer_step
return self.precision_plugin.optimizer_step(model, optimizer, opt_idx, closure, **kwargs)
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/plugins/precision/native_amp.py", line 85, in optimizer_step
closure_result = closure()
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 148, in call
self._result = self.closure(*args, **kwargs)
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 143, in closure
self._backward_fn(step_output.closure_loss)
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 311, in backward_fn
self.trainer._call_strategy_hook("backward", loss, optimizer, opt_idx)
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 1766, in _call_strategy_hook
output = fn(*args, **kwargs)
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/strategies/strategy.py", line 168, in backward
self.precision_plugin.backward(self.lightning_module, closure_loss, *args, **kwargs)
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/plugins/precision/precision_plugin.py", line 80, in backward
model.backward(closure_loss, optimizer, *args, **kwargs)
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/core/lightning.py", line 1370, in backward
loss.backward(*args, **kwargs)
File "/opt/conda/lib/python3.7/site-packages/torch/_tensor.py", line 307, in backward
torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
File "/opt/conda/lib/python3.7/site-packages/torch/autograd/init.py", line 156, in backward
allow_unreachable=True, accumulate_grad=True) # allow_unreachable flag
File "/opt/conda/lib/python3.7/site-packages/torch/autograd/function.py", line 199, in apply
return user_fn(self, *args)
File "/opt/conda/lib/python3.7/site-packages/torch/utils/checkpoint.py", line 138, in backward
torch.autograd.backward(outputs_with_grad, args_with_grad)
File "/opt/conda/lib/python3.7/site-packages/torch/autograd/init.py", line 156, in backward
allow_unreachable=True, accumulate_grad=True) # allow_unreachable flag
RuntimeError: Expected to mark a variable ready only once. This error is caused by one of the following reasons: 1) Use of a module parameter outside the
forward
function. Please make sure model parameters are not shared across multiple concurrent forward-backward passes. or try to use _set_static_graph() as a workaround if this module graph does not change during training loop.2) Reused parameters in multiple reentrant backward passes. For example, if you use multiplecheckpoint
functions to wrap the same part of your model, it would result in the same set of parameters been used by different reentrant backward passes multiple times, and hence marking a variable ready multiple times. DDP does not support such use cases in default. You can try to use _set_static_graph() as a workaround if your module graph does not change over iterations.Parameter at index 327 has been marked as ready twice. This means that multiple autograd engine hooks have fired for this particular parameter during this iteration. You can set the environment variable TORCH_DISTRIBUTED_DEBUG to either INFO or DETAIL to print parameter names for further debugging.
麻烦您指导解决下。
我的环境版本如下:
Python: 3.7.11 (default, Jul 27 2021, 14:32:16) [GCC 7.5.0]
CUDA available: True
GPU 0,1: NVIDIA A100-PCIE-40GB
CUDA_HOME: /usr/local/cuda
NVCC: Cuda compilation tools, release 11.1, V11.1.105
GCC: gcc (Ubuntu 7.5.0-3ubuntu1~18.04) 7.5.0
PyTorch: 1.10.0
PyTorch compiling details: PyTorch built with:
TorchVision: 0.11.0
OpenCV: 4.6.0
MMCV: 1.5.2
MMCV Compiler: GCC 7.5
MMCV CUDA Compiler: 11.1
MMDetection: 2.24.0
MMSegmentation: 0.26.0
MMDetection3D: 1.0.0rc4+unknown
spconv2.0: False
超级期待您的回复。
The text was updated successfully, but these errors were encountered: