From 384e36f87685501a427da5d5f89bc90daa1a1bec Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 14 Oct 2025 07:55:04 +0000 Subject: [PATCH 1/6] Initial plan From 20830159c25fa25a05c8316a260f442e5062ce7c Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 14 Oct 2025 08:07:26 +0000 Subject: [PATCH 2/6] Replace torch.cuda.amp.GradScaler with torch.amp.GradScaler Co-authored-by: vfdev-5 <2459423+vfdev-5@users.noreply.github.com> --- examples/cifar10/main.py | 5 ++-- .../benchmark_torch_cuda_amp.py | 5 ++-- examples/cifar10_qat/main.py | 5 ++-- .../CycleGAN_with_torch_cuda_amp.ipynb | 24 +++++++++---------- .../classification/imagenet/main.py | 5 ++-- .../segmentation/pascal_voc2012/main.py | 5 ++-- examples/transformers/main.py | 5 ++-- ignite/engine/__init__.py | 20 ++++++++-------- tests/ignite/engine/test_create_supervised.py | 14 +++++------ 9 files changed, 41 insertions(+), 47 deletions(-) diff --git a/examples/cifar10/main.py b/examples/cifar10/main.py index b8dbce5d9601..5ba566fec982 100644 --- a/examples/cifar10/main.py +++ b/examples/cifar10/main.py @@ -7,8 +7,7 @@ import torch.nn as nn import torch.optim as optim import utils -from torch.amp import autocast -from torch.cuda.amp import GradScaler +from torch.amp import autocast, GradScaler import ignite import ignite.distributed as idist @@ -289,7 +288,7 @@ def create_trainer(model, optimizer, criterion, lr_scheduler, train_sampler, con # - Two progress bars on epochs and optionally on iterations with_amp = config["with_amp"] - scaler = GradScaler(enabled=with_amp) + scaler = GradScaler('cuda', enabled=with_amp) def train_step(engine, batch): x, y = batch[0], batch[1] diff --git a/examples/cifar100_amp_benchmark/benchmark_torch_cuda_amp.py b/examples/cifar100_amp_benchmark/benchmark_torch_cuda_amp.py index 746d7eb54c49..cc045995cb90 100644 --- a/examples/cifar100_amp_benchmark/benchmark_torch_cuda_amp.py +++ b/examples/cifar100_amp_benchmark/benchmark_torch_cuda_amp.py @@ -1,7 +1,6 @@ import fire import torch -from torch.amp import autocast -from torch.cuda.amp import GradScaler +from torch.amp import autocast, GradScaler from torch.nn import CrossEntropyLoss from torch.optim import SGD from torchvision.models import wide_resnet50_2 @@ -26,7 +25,7 @@ def main(dataset_path, batch_size=256, max_epochs=10): optimizer = SGD(model.parameters(), lr=0.01) criterion = CrossEntropyLoss().to(device) - scaler = GradScaler() + scaler = GradScaler('cuda') def train_step(engine, batch): x = convert_tensor(batch[0], device, non_blocking=True) diff --git a/examples/cifar10_qat/main.py b/examples/cifar10_qat/main.py index 7b8366a2a63f..8158f5d71d62 100644 --- a/examples/cifar10_qat/main.py +++ b/examples/cifar10_qat/main.py @@ -6,8 +6,7 @@ import torch.nn as nn import torch.optim as optim import utils -from torch.amp import autocast -from torch.cuda.amp import GradScaler +from torch.amp import autocast, GradScaler import ignite import ignite.distributed as idist @@ -273,7 +272,7 @@ def create_trainer(model, optimizer, criterion, lr_scheduler, train_sampler, con # - Two progress bars on epochs and optionally on iterations with_amp = config["with_amp"] - scaler = GradScaler(enabled=with_amp) + scaler = GradScaler('cuda', enabled=with_amp) def train_step(engine, batch): x, y = batch[0], batch[1] diff --git a/examples/notebooks/CycleGAN_with_torch_cuda_amp.ipynb b/examples/notebooks/CycleGAN_with_torch_cuda_amp.ipynb index 
c687267d0d52..1228bfe40c09 100644 --- a/examples/notebooks/CycleGAN_with_torch_cuda_amp.ipynb +++ b/examples/notebooks/CycleGAN_with_torch_cuda_amp.ipynb @@ -875,10 +875,10 @@ "As suggested, we divide the objective by 2 while optimizing D, which slows down the rate at which D learns, relative to the rate of G. \n", "\n", "According to the paper:\n", - "- generator A is trained minimize $\\text{mean}_{x \\in A}[(D_B(G(x)) − 1)^2]$ and cycle loss $\\text{mean}_{x \\in A}\\left[ |F(G(x)) - x|_1 \\right]$\n", - "- generator B is trained minimize $\\text{mean}_{y \\in B}[(D_A(F(y)) − 1)^2]$ and cycle loss $\\text{mean}_{y \\in B}\\left[ |G(F(y)) - y|_1 \\right]$\n", - "- discriminators A is trained to minimize $\\text{mean}_{x \\in A}[(D_A(x) − 1)^2] + \\text{mean}_{y \\in B}[D_A(F(y))^2]$.\n", - "- discriminator B is trained to minimize $\\text{mean}_{y \\in B}[(D_B(y) − 1)^2] + \\text{mean}_{x \\in A}[D_B(G(x))^2]$." + "- generator A is trained minimize $\\text{mean}_{x \\in A}[(D_B(G(x)) \u2212 1)^2]$ and cycle loss $\\text{mean}_{x \\in A}\\left[ |F(G(x)) - x|_1 \\right]$\n", + "- generator B is trained minimize $\\text{mean}_{y \\in B}[(D_A(F(y)) \u2212 1)^2]$ and cycle loss $\\text{mean}_{y \\in B}\\left[ |G(F(y)) - y|_1 \\right]$\n", + "- discriminators A is trained to minimize $\\text{mean}_{x \\in A}[(D_A(x) \u2212 1)^2] + \\text{mean}_{y \\in B}[D_A(F(y))^2]$.\n", + "- discriminator B is trained to minimize $\\text{mean}_{y \\in B}[(D_B(y) \u2212 1)^2] + \\text{mean}_{x \\in A}[D_B(G(x))^2]$." ] }, { @@ -887,7 +887,7 @@ "id": "JE8dLeEfIl_Z" }, "source": [ - "We will use [`torch.amp.autocast`](https://pytorch.org/docs/master/amp.html#torch.amp.autocast) and [`torch.cuda.amp.GradScaler`](https://pytorch.org/docs/master/amp.html#torch.cuda.amp.GradScaler) to perform automatic mixed precision training. Our code follows a [typical mixed precision training example](https://pytorch.org/docs/master/notes/amp_examples.html#typical-mixed-precision-training)." + "We will use [`torch.amp.autocast`](https://pytorch.org/docs/master/amp.html#torch.amp.autocast) and [`torch.amp.GradScaler`](https://pytorch.org/docs/master/amp.html#torch.amp.GradScaler) to perform automatic mixed precision training. Our code follows a [typical mixed precision training example](https://pytorch.org/docs/master/notes/amp_examples.html#typical-mixed-precision-training)." 
] }, { @@ -896,7 +896,7 @@ "id": "vrJls4p-FRcA" }, "source": [ - "from torch.cuda.amp import GradScaler\n", + "from torch.amp import GradScaler\n", "from torch.amp import autocast\n", "\n", "from ignite.utils import convert_tensor\n", @@ -924,7 +924,7 @@ "\n", "\n", "def compute_loss_discriminator(decision_real, decision_fake):\n", - " # loss = mean (D_b(y) − 1)^2 + mean D_b(G(x))^2 \n", + " # loss = mean (D_b(y) \u2212 1)^2 + mean D_b(G(x))^2 \n", " loss = F.mse_loss(decision_fake, torch.zeros_like(decision_fake))\n", " loss += F.mse_loss(decision_real, torch.ones_like(decision_real))\n", " return loss\n", @@ -954,10 +954,10 @@ " decision_fake_b = discriminator_B(fake_b)\n", "\n", " # Compute loss for generators and update generators\n", - " # loss_a2b = GAN loss: mean (D_b(G(x)) − 1)^2 + Forward cycle loss: || F(G(x)) - x ||_1 \n", + " # loss_a2b = GAN loss: mean (D_b(G(x)) \u2212 1)^2 + Forward cycle loss: || F(G(x)) - x ||_1 \n", " loss_a2b = compute_loss_generator(decision_fake_b, real_a, rec_a, lambda_value) \n", "\n", - " # loss_b2a = GAN loss: mean (D_a(F(x)) − 1)^2 + Backward cycle loss: || G(F(y)) - y ||_1\n", + " # loss_b2a = GAN loss: mean (D_a(F(x)) \u2212 1)^2 + Backward cycle loss: || G(F(y)) - y ||_1\n", " loss_b2a = compute_loss_generator(decision_fake_a, real_b, rec_b, lambda_value)\n", "\n", " # total generators loss:\n", @@ -977,10 +977,10 @@ " decision_real_a, decision_fake_a = discriminator_forward_pass(discriminator_A, real_a, fake_a.detach(), fake_a_buffer) \n", " decision_real_b, decision_fake_b = discriminator_forward_pass(discriminator_B, real_b, fake_b.detach(), fake_b_buffer) \n", " # Compute loss for discriminators and update discriminators\n", - " # loss_a = mean (D_a(y) − 1)^2 + mean D_a(F(x))^2\n", + " # loss_a = mean (D_a(y) \u2212 1)^2 + mean D_a(F(x))^2\n", " loss_a = compute_loss_discriminator(decision_real_a, decision_fake_a)\n", "\n", - " # loss_b = mean (D_b(y) − 1)^2 + mean D_b(G(x))^2\n", + " # loss_b = mean (D_b(y) \u2212 1)^2 + mean D_b(G(x))^2\n", " loss_b = compute_loss_discriminator(decision_real_b, decision_fake_b)\n", " \n", " # total discriminators loss:\n", @@ -1578,4 +1578,4 @@ "outputs": [] } ] -} +} \ No newline at end of file diff --git a/examples/references/classification/imagenet/main.py b/examples/references/classification/imagenet/main.py index defb4ddc1510..e6eda1a4db95 100644 --- a/examples/references/classification/imagenet/main.py +++ b/examples/references/classification/imagenet/main.py @@ -6,8 +6,7 @@ import torch try: - from torch.amp import autocast - from torch.cuda.amp import GradScaler + from torch.amp import autocast, GradScaler except ImportError: raise RuntimeError("Please, use recent PyTorch version, e.g. 
>=1.12.0") @@ -140,7 +139,7 @@ def create_trainer(model, optimizer, criterion, train_sampler, config, logger, w model_output_transform = config.get("model_output_transform", lambda x: x) with_amp = config.get("with_amp", True) - scaler = GradScaler(enabled=with_amp) + scaler = GradScaler('cuda', enabled=with_amp) def training_step(engine, batch): model.train() diff --git a/examples/references/segmentation/pascal_voc2012/main.py b/examples/references/segmentation/pascal_voc2012/main.py index b6fbc7ad494a..5e1208822686 100644 --- a/examples/references/segmentation/pascal_voc2012/main.py +++ b/examples/references/segmentation/pascal_voc2012/main.py @@ -6,8 +6,7 @@ import torch try: - from torch.amp import autocast - from torch.cuda.amp import GradScaler + from torch.amp import autocast, GradScaler except ImportError: raise RuntimeError("Please, use recent PyTorch version, e.g. >=1.12.0") @@ -187,7 +186,7 @@ def create_trainer(model, optimizer, criterion, train_sampler, config, logger, w model_output_transform = config.get("model_output_transform", lambda x: x) with_amp = config.get("with_amp", True) - scaler = GradScaler(enabled=with_amp) + scaler = GradScaler('cuda', enabled=with_amp) def forward_pass(batch): model.train() diff --git a/examples/transformers/main.py b/examples/transformers/main.py index f8118eabf90e..53f52a06a843 100644 --- a/examples/transformers/main.py +++ b/examples/transformers/main.py @@ -7,8 +7,7 @@ import torch.nn as nn import torch.optim as optim import utils -from torch.amp import autocast -from torch.cuda.amp import GradScaler +from torch.amp import autocast, GradScaler import ignite import ignite.distributed as idist @@ -298,7 +297,7 @@ def create_trainer(model, optimizer, criterion, lr_scheduler, train_sampler, con # - Two progress bars on epochs and optionally on iterations with_amp = config["with_amp"] - scaler = GradScaler(enabled=with_amp) + scaler = GradScaler('cuda', enabled=with_amp) def train_step(engine, batch): input_batch = batch[0] diff --git a/ignite/engine/__init__.py b/ignite/engine/__init__.py index 6e82bc2f6bc7..b337e746f734 100644 --- a/ignite/engine/__init__.py +++ b/ignite/engine/__init__.py @@ -133,11 +133,11 @@ def supervised_training_step_amp( prepare_batch: Callable = _prepare_batch, model_transform: Callable[[Any], Any] = lambda output: output, output_transform: Callable[[Any, Any, Any, torch.Tensor], Any] = lambda x, y, y_pred, loss: loss.item(), - scaler: Optional["torch.cuda.amp.GradScaler"] = None, + scaler: Optional["torch.amp.GradScaler"] = None, gradient_accumulation_steps: int = 1, model_fn: Callable[[torch.nn.Module, Any], Any] = lambda model, x: model(x), ) -> Callable: - """Factory function for supervised training using ``torch.cuda.amp``. + """Factory function for supervised training using ``torch.amp``. Args: model: the model to train. @@ -170,7 +170,7 @@ def supervised_training_step_amp( model = ... optimizer = ... loss_fn = ... 
- scaler = torch.cuda.amp.GradScaler(2**10) + scaler = torch.amp.GradScaler('cuda', 2**10) update_fn = supervised_training_step_amp(model, optimizer, loss_fn, 'cuda', scaler=scaler) trainer = Engine(update_fn) @@ -185,7 +185,7 @@ def supervised_training_step_amp( """ try: - from torch.amp import autocast + from torch.amp import autocast, GradScaler except ImportError: raise ImportError("Please install torch>=1.12.0 to use amp_mode='amp'.") @@ -393,8 +393,8 @@ def update(engine: Engine, batch: Sequence[torch.Tensor]) -> Union[Any, Tuple[to def _check_arg( - on_tpu: bool, on_mps: bool, amp_mode: Optional[str], scaler: Optional[Union[bool, "torch.cuda.amp.GradScaler"]] -) -> Tuple[Optional[str], Optional["torch.cuda.amp.GradScaler"]]: + on_tpu: bool, on_mps: bool, amp_mode: Optional[str], scaler: Optional[Union[bool, "torch.amp.GradScaler"]] +) -> Tuple[Optional[str], Optional["torch.amp.GradScaler"]]: """Checking tpu, mps, amp and GradScaler instance combinations.""" if on_mps and amp_mode: raise ValueError("amp_mode cannot be used with mps device. Consider using amp_mode=None or device='cuda'.") @@ -410,10 +410,10 @@ def _check_arg( raise ValueError(f"scaler argument is {scaler}, but amp_mode is {amp_mode}. Consider using amp_mode='amp'.") elif amp_mode == "amp" and isinstance(scaler, bool): try: - from torch.cuda.amp import GradScaler + from torch.amp import GradScaler except ImportError: raise ImportError("Please install torch>=1.6.0 to use scaler argument.") - scaler = GradScaler(enabled=True) + scaler = GradScaler('cuda', enabled=True) if on_tpu: return "tpu", None @@ -434,7 +434,7 @@ def create_supervised_trainer( output_transform: Callable[[Any, Any, Any, torch.Tensor], Any] = lambda x, y, y_pred, loss: loss.item(), deterministic: bool = False, amp_mode: Optional[str] = None, - scaler: Union[bool, "torch.cuda.amp.GradScaler"] = False, + scaler: Union[bool, "torch.amp.GradScaler"] = False, gradient_accumulation_steps: int = 1, model_fn: Callable[[torch.nn.Module, Any], Any] = lambda model, x: model(x), ) -> Engine: @@ -459,7 +459,7 @@ def create_supervised_trainer( :class:`~ignite.engine.deterministic.DeterministicEngine`, otherwise :class:`~ignite.engine.engine.Engine` (default: False). amp_mode: can be ``amp`` or ``apex``, model and optimizer will be casted to float16 using - `torch.cuda.amp `_ for ``amp`` and + `torch.amp `_ for ``amp`` and using `apex `_ for ``apex``. (default: None) scaler: GradScaler instance for gradient scaling if `torch>=1.6.0` and ``amp_mode`` is ``amp``. If ``amp_mode`` is ``apex``, this argument will be ignored. 
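
For readers following the docstring change above, a minimal, self-contained sketch of the updated example (the model, optimizer, and loss function below are placeholders standing in for the "..." in the docstring, and torch>=2.3.1 with a CUDA device is assumed):

    import torch
    import torch.nn as nn
    from torch.amp import GradScaler

    from ignite.engine import Engine, supervised_training_step_amp

    # Placeholder model/optimizer/loss for illustration only.
    model = nn.Linear(10, 2).to("cuda")
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
    loss_fn = nn.CrossEntropyLoss()

    # GradScaler now comes from torch.amp; the second positional argument is init_scale.
    scaler = GradScaler("cuda", 2**10)
    update_fn = supervised_training_step_amp(model, optimizer, loss_fn, "cuda", scaler=scaler)
    trainer = Engine(update_fn)
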
diff --git a/tests/ignite/engine/test_create_supervised.py b/tests/ignite/engine/test_create_supervised.py index ba42baddddae..d17f58867e7e 100644 --- a/tests/ignite/engine/test_create_supervised.py +++ b/tests/ignite/engine/test_create_supervised.py @@ -48,7 +48,7 @@ def _default_create_supervised_trainer( trainer_device: Optional[str] = None, trace: bool = False, amp_mode: str = None, - scaler: Union[bool, "torch.cuda.amp.GradScaler"] = False, + scaler: Union[bool, "torch.amp.GradScaler"] = False, with_model_transform: bool = False, with_model_fn: bool = False, ): @@ -104,7 +104,7 @@ def _test_create_supervised_trainer( trainer_device: Optional[str] = None, trace: bool = False, amp_mode: str = None, - scaler: Union[bool, "torch.cuda.amp.GradScaler"] = False, + scaler: Union[bool, "torch.amp.GradScaler"] = False, with_model_transform: bool = False, with_model_fn: bool = False, ): @@ -170,10 +170,10 @@ def _(): @pytest.mark.skipif(Version(torch.__version__) < Version("1.12.0"), reason="Skip if < 1.12.0") def test_create_supervised_training_scalar_assignment(): with mock.patch("ignite.engine._check_arg") as check_arg_mock: - check_arg_mock.return_value = None, torch.cuda.amp.GradScaler(enabled=False) + check_arg_mock.return_value = None, torch.amp.GradScaler('cuda', enabled=False) trainer, _ = _default_create_supervised_trainer(model_device="cpu", trainer_device="cpu", scaler=True) assert hasattr(trainer.state, "scaler") - assert isinstance(trainer.state.scaler, torch.cuda.amp.GradScaler) + assert isinstance(trainer.state.scaler, torch.amp.GradScaler) def _test_create_mocked_supervised_trainer( @@ -181,7 +181,7 @@ def _test_create_mocked_supervised_trainer( trainer_device: Optional[str] = None, trace: bool = False, amp_mode: str = None, - scaler: Union[bool, "torch.cuda.amp.GradScaler"] = False, + scaler: Union[bool, "torch.amp.GradScaler"] = False, ): with mock.patch("ignite.engine.supervised_training_step_amp") as training_step_amp_mock: with mock.patch("ignite.engine.supervised_training_step_apex") as training_step_apex_mock: @@ -462,7 +462,7 @@ def test_create_supervised_trainer_amp_error(mock_torch_cuda_amp_module): @pytest.mark.skipif(Version(torch.__version__) < Version("1.12.0"), reason="Skip if < 1.12.0") def test_create_supervised_trainer_scaler_not_amp(): - scaler = torch.cuda.amp.GradScaler(enabled=torch.cuda.is_available()) + scaler = torch.amp.GradScaler('cuda', enabled=torch.cuda.is_available()) with pytest.raises(ValueError, match=f"scaler argument is {scaler}, but amp_mode is None."): _test_create_supervised_trainer(amp_mode=None, scaler=scaler) @@ -540,7 +540,7 @@ def test_create_supervised_trainer_on_cuda_amp_scaler(): _test_create_mocked_supervised_trainer( model_device=model_device, trainer_device=trainer_device, amp_mode="amp", scaler=True ) - scaler = torch.cuda.amp.GradScaler(enabled=torch.cuda.is_available()) + scaler = torch.amp.GradScaler('cuda', enabled=torch.cuda.is_available()) _test_create_supervised_trainer( gradient_accumulation_steps=1, model_device=model_device, From c430571bc57ea59df82893953c90ad3c640316c4 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 14 Oct 2025 08:36:35 +0000 Subject: [PATCH 3/6] Address review comments: remove cuda parameter from GradScaler, update PyTorch version requirements, revert notebook changes Co-authored-by: vfdev-5 <2459423+vfdev-5@users.noreply.github.com> --- examples/cifar10/main.py | 2 +- .../benchmark_torch_cuda_amp.py | 2 +- examples/cifar10_qat/main.py | 
2 +- .../CycleGAN_with_torch_cuda_amp.ipynb | 24 +++++++++---------- .../classification/imagenet/main.py | 4 ++-- .../segmentation/pascal_voc2012/main.py | 4 ++-- examples/transformers/main.py | 2 +- ignite/engine/__init__.py | 6 ++--- 8 files changed, 23 insertions(+), 23 deletions(-) diff --git a/examples/cifar10/main.py b/examples/cifar10/main.py index 5ba566fec982..37ffdbb70250 100644 --- a/examples/cifar10/main.py +++ b/examples/cifar10/main.py @@ -288,7 +288,7 @@ def create_trainer(model, optimizer, criterion, lr_scheduler, train_sampler, con # - Two progress bars on epochs and optionally on iterations with_amp = config["with_amp"] - scaler = GradScaler('cuda', enabled=with_amp) + scaler = GradScaler(enabled=with_amp) def train_step(engine, batch): x, y = batch[0], batch[1] diff --git a/examples/cifar100_amp_benchmark/benchmark_torch_cuda_amp.py b/examples/cifar100_amp_benchmark/benchmark_torch_cuda_amp.py index cc045995cb90..03d53752c4e3 100644 --- a/examples/cifar100_amp_benchmark/benchmark_torch_cuda_amp.py +++ b/examples/cifar100_amp_benchmark/benchmark_torch_cuda_amp.py @@ -25,7 +25,7 @@ def main(dataset_path, batch_size=256, max_epochs=10): optimizer = SGD(model.parameters(), lr=0.01) criterion = CrossEntropyLoss().to(device) - scaler = GradScaler('cuda') + scaler = GradScaler() def train_step(engine, batch): x = convert_tensor(batch[0], device, non_blocking=True) diff --git a/examples/cifar10_qat/main.py b/examples/cifar10_qat/main.py index 8158f5d71d62..48f38fa04a45 100644 --- a/examples/cifar10_qat/main.py +++ b/examples/cifar10_qat/main.py @@ -272,7 +272,7 @@ def create_trainer(model, optimizer, criterion, lr_scheduler, train_sampler, con # - Two progress bars on epochs and optionally on iterations with_amp = config["with_amp"] - scaler = GradScaler('cuda', enabled=with_amp) + scaler = GradScaler(enabled=with_amp) def train_step(engine, batch): x, y = batch[0], batch[1] diff --git a/examples/notebooks/CycleGAN_with_torch_cuda_amp.ipynb b/examples/notebooks/CycleGAN_with_torch_cuda_amp.ipynb index 1228bfe40c09..c687267d0d52 100644 --- a/examples/notebooks/CycleGAN_with_torch_cuda_amp.ipynb +++ b/examples/notebooks/CycleGAN_with_torch_cuda_amp.ipynb @@ -875,10 +875,10 @@ "As suggested, we divide the objective by 2 while optimizing D, which slows down the rate at which D learns, relative to the rate of G. \n", "\n", "According to the paper:\n", - "- generator A is trained minimize $\\text{mean}_{x \\in A}[(D_B(G(x)) \u2212 1)^2]$ and cycle loss $\\text{mean}_{x \\in A}\\left[ |F(G(x)) - x|_1 \\right]$\n", - "- generator B is trained minimize $\\text{mean}_{y \\in B}[(D_A(F(y)) \u2212 1)^2]$ and cycle loss $\\text{mean}_{y \\in B}\\left[ |G(F(y)) - y|_1 \\right]$\n", - "- discriminators A is trained to minimize $\\text{mean}_{x \\in A}[(D_A(x) \u2212 1)^2] + \\text{mean}_{y \\in B}[D_A(F(y))^2]$.\n", - "- discriminator B is trained to minimize $\\text{mean}_{y \\in B}[(D_B(y) \u2212 1)^2] + \\text{mean}_{x \\in A}[D_B(G(x))^2]$." 
+ "- generator A is trained minimize $\\text{mean}_{x \\in A}[(D_B(G(x)) − 1)^2]$ and cycle loss $\\text{mean}_{x \\in A}\\left[ |F(G(x)) - x|_1 \\right]$\n", + "- generator B is trained minimize $\\text{mean}_{y \\in B}[(D_A(F(y)) − 1)^2]$ and cycle loss $\\text{mean}_{y \\in B}\\left[ |G(F(y)) - y|_1 \\right]$\n", + "- discriminators A is trained to minimize $\\text{mean}_{x \\in A}[(D_A(x) − 1)^2] + \\text{mean}_{y \\in B}[D_A(F(y))^2]$.\n", + "- discriminator B is trained to minimize $\\text{mean}_{y \\in B}[(D_B(y) − 1)^2] + \\text{mean}_{x \\in A}[D_B(G(x))^2]$." ] }, { @@ -887,7 +887,7 @@ "id": "JE8dLeEfIl_Z" }, "source": [ - "We will use [`torch.amp.autocast`](https://pytorch.org/docs/master/amp.html#torch.amp.autocast) and [`torch.amp.GradScaler`](https://pytorch.org/docs/master/amp.html#torch.amp.GradScaler) to perform automatic mixed precision training. Our code follows a [typical mixed precision training example](https://pytorch.org/docs/master/notes/amp_examples.html#typical-mixed-precision-training)." + "We will use [`torch.amp.autocast`](https://pytorch.org/docs/master/amp.html#torch.amp.autocast) and [`torch.cuda.amp.GradScaler`](https://pytorch.org/docs/master/amp.html#torch.cuda.amp.GradScaler) to perform automatic mixed precision training. Our code follows a [typical mixed precision training example](https://pytorch.org/docs/master/notes/amp_examples.html#typical-mixed-precision-training)." ] }, { @@ -896,7 +896,7 @@ "id": "vrJls4p-FRcA" }, "source": [ - "from torch.amp import GradScaler\n", + "from torch.cuda.amp import GradScaler\n", "from torch.amp import autocast\n", "\n", "from ignite.utils import convert_tensor\n", @@ -924,7 +924,7 @@ "\n", "\n", "def compute_loss_discriminator(decision_real, decision_fake):\n", - " # loss = mean (D_b(y) \u2212 1)^2 + mean D_b(G(x))^2 \n", + " # loss = mean (D_b(y) − 1)^2 + mean D_b(G(x))^2 \n", " loss = F.mse_loss(decision_fake, torch.zeros_like(decision_fake))\n", " loss += F.mse_loss(decision_real, torch.ones_like(decision_real))\n", " return loss\n", @@ -954,10 +954,10 @@ " decision_fake_b = discriminator_B(fake_b)\n", "\n", " # Compute loss for generators and update generators\n", - " # loss_a2b = GAN loss: mean (D_b(G(x)) \u2212 1)^2 + Forward cycle loss: || F(G(x)) - x ||_1 \n", + " # loss_a2b = GAN loss: mean (D_b(G(x)) − 1)^2 + Forward cycle loss: || F(G(x)) - x ||_1 \n", " loss_a2b = compute_loss_generator(decision_fake_b, real_a, rec_a, lambda_value) \n", "\n", - " # loss_b2a = GAN loss: mean (D_a(F(x)) \u2212 1)^2 + Backward cycle loss: || G(F(y)) - y ||_1\n", + " # loss_b2a = GAN loss: mean (D_a(F(x)) − 1)^2 + Backward cycle loss: || G(F(y)) - y ||_1\n", " loss_b2a = compute_loss_generator(decision_fake_a, real_b, rec_b, lambda_value)\n", "\n", " # total generators loss:\n", @@ -977,10 +977,10 @@ " decision_real_a, decision_fake_a = discriminator_forward_pass(discriminator_A, real_a, fake_a.detach(), fake_a_buffer) \n", " decision_real_b, decision_fake_b = discriminator_forward_pass(discriminator_B, real_b, fake_b.detach(), fake_b_buffer) \n", " # Compute loss for discriminators and update discriminators\n", - " # loss_a = mean (D_a(y) \u2212 1)^2 + mean D_a(F(x))^2\n", + " # loss_a = mean (D_a(y) − 1)^2 + mean D_a(F(x))^2\n", " loss_a = compute_loss_discriminator(decision_real_a, decision_fake_a)\n", "\n", - " # loss_b = mean (D_b(y) \u2212 1)^2 + mean D_b(G(x))^2\n", + " # loss_b = mean (D_b(y) − 1)^2 + mean D_b(G(x))^2\n", " loss_b = compute_loss_discriminator(decision_real_b, decision_fake_b)\n", " \n", " # total 
discriminators loss:\n", @@ -1578,4 +1578,4 @@ "outputs": [] } ] -} \ No newline at end of file +} diff --git a/examples/references/classification/imagenet/main.py b/examples/references/classification/imagenet/main.py index e6eda1a4db95..3069523c1115 100644 --- a/examples/references/classification/imagenet/main.py +++ b/examples/references/classification/imagenet/main.py @@ -8,7 +8,7 @@ try: from torch.amp import autocast, GradScaler except ImportError: - raise RuntimeError("Please, use recent PyTorch version, e.g. >=1.12.0") + raise RuntimeError("Please, use recent PyTorch version, e.g. >=2.3.1") import dataflow as data import utils @@ -139,7 +139,7 @@ def create_trainer(model, optimizer, criterion, train_sampler, config, logger, w model_output_transform = config.get("model_output_transform", lambda x: x) with_amp = config.get("with_amp", True) - scaler = GradScaler('cuda', enabled=with_amp) + scaler = GradScaler(enabled=with_amp) def training_step(engine, batch): model.train() diff --git a/examples/references/segmentation/pascal_voc2012/main.py b/examples/references/segmentation/pascal_voc2012/main.py index 5e1208822686..34f43b895879 100644 --- a/examples/references/segmentation/pascal_voc2012/main.py +++ b/examples/references/segmentation/pascal_voc2012/main.py @@ -8,7 +8,7 @@ try: from torch.amp import autocast, GradScaler except ImportError: - raise RuntimeError("Please, use recent PyTorch version, e.g. >=1.12.0") + raise RuntimeError("Please, use recent PyTorch version, e.g. >=2.3.1") import dataflow as data import utils @@ -186,7 +186,7 @@ def create_trainer(model, optimizer, criterion, train_sampler, config, logger, w model_output_transform = config.get("model_output_transform", lambda x: x) with_amp = config.get("with_amp", True) - scaler = GradScaler('cuda', enabled=with_amp) + scaler = GradScaler(enabled=with_amp) def forward_pass(batch): model.train() diff --git a/examples/transformers/main.py b/examples/transformers/main.py index 53f52a06a843..8dc90a7ef9e3 100644 --- a/examples/transformers/main.py +++ b/examples/transformers/main.py @@ -297,7 +297,7 @@ def create_trainer(model, optimizer, criterion, lr_scheduler, train_sampler, con # - Two progress bars on epochs and optionally on iterations with_amp = config["with_amp"] - scaler = GradScaler('cuda', enabled=with_amp) + scaler = GradScaler(enabled=with_amp) def train_step(engine, batch): input_batch = batch[0] diff --git a/ignite/engine/__init__.py b/ignite/engine/__init__.py index b337e746f734..6300011fd79d 100644 --- a/ignite/engine/__init__.py +++ b/ignite/engine/__init__.py @@ -187,7 +187,7 @@ def supervised_training_step_amp( try: from torch.amp import autocast, GradScaler except ImportError: - raise ImportError("Please install torch>=1.12.0 to use amp_mode='amp'.") + raise ImportError("Please install torch>=2.3.1 to use amp_mode='amp'.") if gradient_accumulation_steps <= 0: raise ValueError( @@ -412,8 +412,8 @@ def _check_arg( try: from torch.amp import GradScaler except ImportError: - raise ImportError("Please install torch>=1.6.0 to use scaler argument.") - scaler = GradScaler('cuda', enabled=True) + raise ImportError("Please install torch>=2.3.1 to use scaler argument.") + scaler = GradScaler(enabled=True) if on_tpu: return "tpu", None From 942d8f4613fbc6d90238cec6b74d5a3fb92e3889 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 14 Oct 2025 09:01:21 +0000 Subject: [PATCH 4/6] Fix test failures: remove cuda parameter from test GradScaler instantiations and 
revert to correct PyTorch version requirements Co-authored-by: vfdev-5 <2459423+vfdev-5@users.noreply.github.com> --- examples/references/classification/imagenet/main.py | 2 +- examples/references/segmentation/pascal_voc2012/main.py | 2 +- ignite/engine/__init__.py | 4 ++-- tests/ignite/engine/test_create_supervised.py | 6 +++--- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/examples/references/classification/imagenet/main.py b/examples/references/classification/imagenet/main.py index 3069523c1115..efa0c8b146cb 100644 --- a/examples/references/classification/imagenet/main.py +++ b/examples/references/classification/imagenet/main.py @@ -8,7 +8,7 @@ try: from torch.amp import autocast, GradScaler except ImportError: - raise RuntimeError("Please, use recent PyTorch version, e.g. >=2.3.1") + raise RuntimeError("Please, use recent PyTorch version, e.g. >=1.12.0") import dataflow as data import utils diff --git a/examples/references/segmentation/pascal_voc2012/main.py b/examples/references/segmentation/pascal_voc2012/main.py index 34f43b895879..b52dcd468ae9 100644 --- a/examples/references/segmentation/pascal_voc2012/main.py +++ b/examples/references/segmentation/pascal_voc2012/main.py @@ -8,7 +8,7 @@ try: from torch.amp import autocast, GradScaler except ImportError: - raise RuntimeError("Please, use recent PyTorch version, e.g. >=2.3.1") + raise RuntimeError("Please, use recent PyTorch version, e.g. >=1.12.0") import dataflow as data import utils diff --git a/ignite/engine/__init__.py b/ignite/engine/__init__.py index 6300011fd79d..e8c4e8b6d835 100644 --- a/ignite/engine/__init__.py +++ b/ignite/engine/__init__.py @@ -187,7 +187,7 @@ def supervised_training_step_amp( try: from torch.amp import autocast, GradScaler except ImportError: - raise ImportError("Please install torch>=2.3.1 to use amp_mode='amp'.") + raise ImportError("Please install torch>=1.12.0 to use amp_mode='amp'.") if gradient_accumulation_steps <= 0: raise ValueError( @@ -412,7 +412,7 @@ def _check_arg( try: from torch.amp import GradScaler except ImportError: - raise ImportError("Please install torch>=2.3.1 to use scaler argument.") + raise ImportError("Please install torch>=1.6.0 to use scaler argument.") scaler = GradScaler(enabled=True) if on_tpu: diff --git a/tests/ignite/engine/test_create_supervised.py b/tests/ignite/engine/test_create_supervised.py index d17f58867e7e..6bd759f9c2b5 100644 --- a/tests/ignite/engine/test_create_supervised.py +++ b/tests/ignite/engine/test_create_supervised.py @@ -170,7 +170,7 @@ def _(): @pytest.mark.skipif(Version(torch.__version__) < Version("1.12.0"), reason="Skip if < 1.12.0") def test_create_supervised_training_scalar_assignment(): with mock.patch("ignite.engine._check_arg") as check_arg_mock: - check_arg_mock.return_value = None, torch.amp.GradScaler('cuda', enabled=False) + check_arg_mock.return_value = None, torch.amp.GradScaler(enabled=False) trainer, _ = _default_create_supervised_trainer(model_device="cpu", trainer_device="cpu", scaler=True) assert hasattr(trainer.state, "scaler") assert isinstance(trainer.state.scaler, torch.amp.GradScaler) @@ -462,7 +462,7 @@ def test_create_supervised_trainer_amp_error(mock_torch_cuda_amp_module): @pytest.mark.skipif(Version(torch.__version__) < Version("1.12.0"), reason="Skip if < 1.12.0") def test_create_supervised_trainer_scaler_not_amp(): - scaler = torch.amp.GradScaler('cuda', enabled=torch.cuda.is_available()) + scaler = torch.amp.GradScaler(enabled=torch.cuda.is_available()) with pytest.raises(ValueError, match=f"scaler 
argument is {scaler}, but amp_mode is None."): _test_create_supervised_trainer(amp_mode=None, scaler=scaler) @@ -540,7 +540,7 @@ def test_create_supervised_trainer_on_cuda_amp_scaler(): _test_create_mocked_supervised_trainer( model_device=model_device, trainer_device=trainer_device, amp_mode="amp", scaler=True ) - scaler = torch.amp.GradScaler('cuda', enabled=torch.cuda.is_available()) + scaler = torch.amp.GradScaler(enabled=torch.cuda.is_available()) _test_create_supervised_trainer( gradient_accumulation_steps=1, model_device=model_device, From b97301c8204c3ce5a9364c7cb52af7ee772422dd Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 14 Oct 2025 09:41:19 +0000 Subject: [PATCH 5/6] Apply torch.cuda.amp.GradScaler to torch.amp.GradScaler replacements in README, docs, and notebooks Co-authored-by: vfdev-5 <2459423+vfdev-5@users.noreply.github.com> --- README.md | 2 +- docs/source/conf.py | 2 +- .../CycleGAN_with_torch_cuda_amp.ipynb | 25 +++++++++---------- 3 files changed, 14 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index 2cfa25e36810..20dd3156e078 100644 --- a/README.md +++ b/README.md @@ -397,7 +397,7 @@ Few pointers to get you started: - [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/pytorch/ignite/blob/master/examples/notebooks/FastaiLRFinder_MNIST.ipynb) [Basic example of LR finder on MNIST](https://github.com/pytorch/ignite/blob/master/examples/notebooks/FastaiLRFinder_MNIST.ipynb) - [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/pytorch/ignite/blob/master/examples/notebooks/Cifar100_bench_amp.ipynb) [Benchmark mixed precision training on Cifar100: - torch.cuda.amp vs nvidia/apex](https://github.com/pytorch/ignite/blob/master/examples/notebooks/Cifar100_bench_amp.ipynb) + torch.amp vs nvidia/apex](https://github.com/pytorch/ignite/blob/master/examples/notebooks/Cifar100_bench_amp.ipynb) - [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/pytorch/ignite/blob/master/examples/notebooks/MNIST_on_TPU.ipynb) [MNIST training on a single TPU](https://github.com/pytorch/ignite/blob/master/examples/notebooks/MNIST_on_TPU.ipynb) - [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1E9zJrptnLJ_PKhmaP5Vhb6DTVRvyrKHx) [CIFAR10 Training on multiple TPUs](https://github.com/pytorch/ignite/tree/master/examples/cifar10) diff --git a/docs/source/conf.py b/docs/source/conf.py index 6fdadc7b34b4..5f28cba5bd19 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -354,7 +354,7 @@ def run(self): ("py:class", "torch.optim.optimizer.Optimizer"), ("py:class", "torch.utils.data.dataset.Dataset"), ("py:class", "torch.utils.data.sampler.BatchSampler"), - ("py:class", "torch.cuda.amp.grad_scaler.GradScaler"), + ("py:class", "torch.amp.grad_scaler.GradScaler"), ("py:class", "torch.optim.lr_scheduler._LRScheduler"), ("py:class", "torch.optim.lr_scheduler.LRScheduler"), ("py:class", "torch.utils.data.dataloader.DataLoader"), diff --git a/examples/notebooks/CycleGAN_with_torch_cuda_amp.ipynb b/examples/notebooks/CycleGAN_with_torch_cuda_amp.ipynb index c687267d0d52..4f280013b596 100644 --- a/examples/notebooks/CycleGAN_with_torch_cuda_amp.ipynb +++ 
b/examples/notebooks/CycleGAN_with_torch_cuda_amp.ipynb @@ -875,10 +875,10 @@ "As suggested, we divide the objective by 2 while optimizing D, which slows down the rate at which D learns, relative to the rate of G. \n", "\n", "According to the paper:\n", - "- generator A is trained minimize $\\text{mean}_{x \\in A}[(D_B(G(x)) − 1)^2]$ and cycle loss $\\text{mean}_{x \\in A}\\left[ |F(G(x)) - x|_1 \\right]$\n", - "- generator B is trained minimize $\\text{mean}_{y \\in B}[(D_A(F(y)) − 1)^2]$ and cycle loss $\\text{mean}_{y \\in B}\\left[ |G(F(y)) - y|_1 \\right]$\n", - "- discriminators A is trained to minimize $\\text{mean}_{x \\in A}[(D_A(x) − 1)^2] + \\text{mean}_{y \\in B}[D_A(F(y))^2]$.\n", - "- discriminator B is trained to minimize $\\text{mean}_{y \\in B}[(D_B(y) − 1)^2] + \\text{mean}_{x \\in A}[D_B(G(x))^2]$." + "- generator A is trained minimize $\\text{mean}_{x \\in A}[(D_B(G(x)) \u2212 1)^2]$ and cycle loss $\\text{mean}_{x \\in A}\\left[ |F(G(x)) - x|_1 \\right]$\n", + "- generator B is trained minimize $\\text{mean}_{y \\in B}[(D_A(F(y)) \u2212 1)^2]$ and cycle loss $\\text{mean}_{y \\in B}\\left[ |G(F(y)) - y|_1 \\right]$\n", + "- discriminators A is trained to minimize $\\text{mean}_{x \\in A}[(D_A(x) \u2212 1)^2] + \\text{mean}_{y \\in B}[D_A(F(y))^2]$.\n", + "- discriminator B is trained to minimize $\\text{mean}_{y \\in B}[(D_B(y) \u2212 1)^2] + \\text{mean}_{x \\in A}[D_B(G(x))^2]$." ] }, { @@ -887,7 +887,7 @@ "id": "JE8dLeEfIl_Z" }, "source": [ - "We will use [`torch.amp.autocast`](https://pytorch.org/docs/master/amp.html#torch.amp.autocast) and [`torch.cuda.amp.GradScaler`](https://pytorch.org/docs/master/amp.html#torch.cuda.amp.GradScaler) to perform automatic mixed precision training. Our code follows a [typical mixed precision training example](https://pytorch.org/docs/master/notes/amp_examples.html#typical-mixed-precision-training)." + "We will use [`torch.amp.autocast`](https://pytorch.org/docs/master/amp.html#torch.amp.autocast) and [`torch.amp.GradScaler`](https://pytorch.org/docs/master/amp.html#torch.amp.GradScaler) to perform automatic mixed precision training. Our code follows a [typical mixed precision training example](https://pytorch.org/docs/master/notes/amp_examples.html#typical-mixed-precision-training)." 
] }, { @@ -896,8 +896,7 @@ "id": "vrJls4p-FRcA" }, "source": [ - "from torch.cuda.amp import GradScaler\n", - "from torch.amp import autocast\n", + "from torch.amp import autocast, GradScaler\n", "\n", "from ignite.utils import convert_tensor\n", "import torch.nn.functional as F\n", @@ -924,7 +923,7 @@ "\n", "\n", "def compute_loss_discriminator(decision_real, decision_fake):\n", - " # loss = mean (D_b(y) − 1)^2 + mean D_b(G(x))^2 \n", + " # loss = mean (D_b(y) \u2212 1)^2 + mean D_b(G(x))^2 \n", " loss = F.mse_loss(decision_fake, torch.zeros_like(decision_fake))\n", " loss += F.mse_loss(decision_real, torch.ones_like(decision_real))\n", " return loss\n", @@ -954,10 +953,10 @@ " decision_fake_b = discriminator_B(fake_b)\n", "\n", " # Compute loss for generators and update generators\n", - " # loss_a2b = GAN loss: mean (D_b(G(x)) − 1)^2 + Forward cycle loss: || F(G(x)) - x ||_1 \n", + " # loss_a2b = GAN loss: mean (D_b(G(x)) \u2212 1)^2 + Forward cycle loss: || F(G(x)) - x ||_1 \n", " loss_a2b = compute_loss_generator(decision_fake_b, real_a, rec_a, lambda_value) \n", "\n", - " # loss_b2a = GAN loss: mean (D_a(F(x)) − 1)^2 + Backward cycle loss: || G(F(y)) - y ||_1\n", + " # loss_b2a = GAN loss: mean (D_a(F(x)) \u2212 1)^2 + Backward cycle loss: || G(F(y)) - y ||_1\n", " loss_b2a = compute_loss_generator(decision_fake_a, real_b, rec_b, lambda_value)\n", "\n", " # total generators loss:\n", @@ -977,10 +976,10 @@ " decision_real_a, decision_fake_a = discriminator_forward_pass(discriminator_A, real_a, fake_a.detach(), fake_a_buffer) \n", " decision_real_b, decision_fake_b = discriminator_forward_pass(discriminator_B, real_b, fake_b.detach(), fake_b_buffer) \n", " # Compute loss for discriminators and update discriminators\n", - " # loss_a = mean (D_a(y) − 1)^2 + mean D_a(F(x))^2\n", + " # loss_a = mean (D_a(y) \u2212 1)^2 + mean D_a(F(x))^2\n", " loss_a = compute_loss_discriminator(decision_real_a, decision_fake_a)\n", "\n", - " # loss_b = mean (D_b(y) − 1)^2 + mean D_b(G(x))^2\n", + " # loss_b = mean (D_b(y) \u2212 1)^2 + mean D_b(G(x))^2\n", " loss_b = compute_loss_discriminator(decision_real_b, decision_fake_b)\n", " \n", " # total discriminators loss:\n", @@ -1578,4 +1577,4 @@ "outputs": [] } ] -} +} \ No newline at end of file From 6046bc8867d9707f957bf0787ac9507762711d14 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 14 Oct 2025 09:54:45 +0000 Subject: [PATCH 6/6] Fix code style: add newline at end of CycleGAN notebook Co-authored-by: vfdev-5 <2459423+vfdev-5@users.noreply.github.com> --- examples/notebooks/CycleGAN_with_torch_cuda_amp.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/notebooks/CycleGAN_with_torch_cuda_amp.ipynb b/examples/notebooks/CycleGAN_with_torch_cuda_amp.ipynb index 4f280013b596..df18d041b81e 100644 --- a/examples/notebooks/CycleGAN_with_torch_cuda_amp.ipynb +++ b/examples/notebooks/CycleGAN_with_torch_cuda_amp.ipynb @@ -1577,4 +1577,4 @@ "outputs": [] } ] -} \ No newline at end of file +}
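
Taken together, the series lands on the device-agnostic torch.amp API. For reference, a minimal sketch of the resulting training-step pattern (not part of the patches themselves; the model, optimizer, and batch are placeholders, and torch>=2.3.1 is assumed so that GradScaler is importable from torch.amp):

    import torch
    import torch.nn as nn
    from torch.optim import SGD

    try:
        from torch.amp import autocast, GradScaler
    except ImportError:
        raise RuntimeError("Please, use recent PyTorch version, e.g. >=2.3.1")

    device = "cuda" if torch.cuda.is_available() else "cpu"
    with_amp = device == "cuda"  # mixed precision is only enabled when CUDA is available

    model = nn.Linear(10, 2).to(device)           # placeholder model
    optimizer = SGD(model.parameters(), lr=0.01)
    criterion = nn.CrossEntropyLoss().to(device)

    # No 'cuda' positional argument: GradScaler defaults to the CUDA device,
    # and enabled=False makes it a no-op on CPU-only machines.
    scaler = GradScaler(enabled=with_amp)

    def train_step(batch):
        model.train()
        x, y = (t.to(device) for t in batch)
        optimizer.zero_grad()
        with autocast(device_type=device, enabled=with_amp):
            y_pred = model(x)
            loss = criterion(y_pred, y)
        scaler.scale(loss).backward()  # backward pass on the (possibly) scaled loss
        scaler.step(optimizer)         # unscale gradients and run the optimizer step
        scaler.update()                # adjust the scale factor for the next iteration
        return loss.item()

    # Example usage with a random batch:
    print(train_step((torch.randn(8, 10), torch.randint(0, 2, (8,)))))
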