```
| time/ | |
| fps | 16 |
| iterations | 1 |
| time_elapsed | 200 |
| total_timesteps | 3355 |
Traceback (most recent call last):
File "scripts/training/train_text_generation.py", line 84, in
main(
File "scripts/training/train_text_generation.py", line 55, in main
trainer.train_and_eval()
File "/root/autodl-tmp/RL4LMs/rl4lms/envs/text_generation/training_utils.py", line 205, in train_and_eval
self._alg.learn(self._n_steps_per_iter)
File "/root/autodl-tmp/RL4LMs/rl4lms/algorithms/ppo/ppo.py", line 342, in learn
return super().learn(
File "/root/miniconda3/envs/rl/lib/python3.8/site-packages/stable_baselines3/common/on_policy_algorithm.py", line 267, in learn
self.train()
File "/root/autodl-tmp/RL4LMs/rl4lms/algorithms/ppo/ppo.py", line 215, in train
evaluation_output: EvaluateActionsOutput = self.policy.evaluate_actions(
File "/root/autodl-tmp/RL4LMs/rl4lms/envs/text_generation/policy/seq2seq_policy.py", line 242, in evaluate_actions
policy_outputs = self.forward_policy(obs=obs, actions=actions)
File "/root/autodl-tmp/RL4LMs/rl4lms/envs/text_generation/policy/seq2seq_policy.py", line 116, in forward_policy
past_model_kwargs = unwrap_model(
File "/root/autodl-tmp/RL4LMs/rl4lms/envs/text_generation/hf_generation_utils.py", line 534, in _prepare_encoder_decoder_kwargs_for_generation
model_kwargs["encoder_outputs"]: ModelOutput = encoder(
File "/root/miniconda3/envs/rl/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1110, in _call_impl
return forward_call(*input, **kwargs)
File "/root/miniconda3/envs/rl/lib/python3.8/site-packages/transformers/models/t5/modeling_t5.py", line 1033, in forward
layer_outputs = layer_module(
File "/root/miniconda3/envs/rl/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1110, in _call_impl
return forward_call(*input, **kwargs)
File "/root/miniconda3/envs/rl/lib/python3.8/site-packages/transformers/models/t5/modeling_t5.py", line 668, in forward
self_attention_outputs = self.layer[0](
File "/root/miniconda3/envs/rl/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1110, in _call_impl
return forward_call(*input, **kwargs)
File "/root/miniconda3/envs/rl/lib/python3.8/site-packages/transformers/models/t5/modeling_t5.py", line 574, in forward
attention_output = self.SelfAttention(
File "/root/miniconda3/envs/rl/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1110, in _call_impl
return forward_call(*input, **kwargs)
File "/root/miniconda3/envs/rl/lib/python3.8/site-packages/transformers/models/t5/modeling_t5.py", line 511, in forward
scores = torch.matmul(
RuntimeError: CUDA out of memory. Tried to allocate 768.00 MiB (GPU 0; 31.75 GiB total capacity; 28.32 GiB already allocated; 327.94 MiB free; 30.22 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
```
When I try the example, the environment is fully set up and I am running on a single V100 (32 GB). Training keeps failing with the RuntimeError above. I changed the parameters in the .yml file but still see this error. How can I solve this problem, or have I done something wrong?
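For reference, here is a minimal sketch of the two workarounds I am considering, based on the hint in the error message itself. The config file name and key names in it are assumptions on my part, not the exact RL4LMs config schema:

```python
# Sketch of two OOM workarounds (illustrative only; file name and config
# keys below are assumptions, not the exact RL4LMs schema).
import os

# 1) Reduce allocator fragmentation, as the error message suggests.
#    Must be set before torch initializes CUDA (e.g. before importing torch
#    or launching train_text_generation.py).
os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "max_split_size_mb:128")

# 2) Shrink the per-update memory footprint by lowering rollout/batch sizes
#    in the task .yml before passing it to the trainer.
import yaml

with open("t5_ppo.yml") as f:               # hypothetical config path
    cfg = yaml.safe_load(f)

# Illustrative keys only; the real layout of the .yml may differ.
cfg.setdefault("alg", {}).setdefault("args", {})["batch_size"] = 16
cfg["alg"]["args"]["n_steps"] = 64

with open("t5_ppo_small.yml", "w") as f:
    yaml.safe_dump(cfg, f)
```

My understanding is that lowering the batch/rollout sizes reduces the number of sequences processed in each forward pass, which is where the attention matmul in modeling_t5.py runs out of memory. Is one of these the right direction, or is there a recommended setting for a single 32 GB V100?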