
Commit 2182063

alpha0422 authored and mmarcinkiewicz committed
[Flux] Add cuda_graph_scope and cache image ids for full iteration cuda graph.
Signed-off-by: Wil Kong <[email protected]>
1 parent 7d5c2fa · commit 2182063

File tree

1 file changed: +3 −0 lines changed

  • nemo/collections/diffusion/models/flux/model.py

nemo/collections/diffusion/models/flux/model.py

Lines changed: 3 additions & 0 deletions
@@ -16,6 +16,7 @@
 import os
 from contextlib import nullcontext
 from dataclasses import dataclass, field
+from functools import lru_cache
 from pathlib import Path
 from typing import Callable, Optional
 
@@ -97,6 +98,7 @@ class FluxConfig(TransformerConfig, io.IOMixin):
     use_cpu_initialization: bool = True
     gradient_accumulation_fusion: bool = False
     enable_cuda_graph: bool = False
+    cuda_graph_scope: Optional[str] = None  # full, full_iteration
     use_te_rng_tracker: bool = False
     cuda_graph_warmup_steps: int = 2
 
@@ -731,6 +733,7 @@ def _unpack_latents(self, latents, height, width):
 
         return latents
 
+    @lru_cache
     def _prepare_latent_image_ids(
         self, batch_size: int, height: int, width: int, device: torch.device, dtype: torch.dtype
     ):
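
A note on the cached helper: with @lru_cache, _prepare_latent_image_ids computes its result once per distinct (self, batch_size, height, width, device, dtype) tuple and returns the identical tensor object on later calls, which keeps the image-id buffer stable across steps when the full iteration is captured as a CUDA graph. A minimal sketch of the pattern follows; the class name and the id layout (a halved-resolution grid with 3-component ids, as in common Flux latent packing) are illustrative assumptions, since the method body is not part of this diff.

from functools import lru_cache

import torch


class LatentIdCacheSketch:
    # Illustrative stand-in for the Flux model class; not the NeMo code.

    @lru_cache
    def _prepare_latent_image_ids(
        self, batch_size: int, height: int, width: int, device: torch.device, dtype: torch.dtype
    ):
        # batch_size is unused here but still participates in the cache key.
        # Build positional ids for the packed latent grid once; repeat calls
        # with the same arguments return the same cached tensor, so a
        # full-iteration CUDA graph replays against a stable buffer.
        ids = torch.zeros(height // 2, width // 2, 3, device=device, dtype=dtype)
        ids[..., 1] += torch.arange(height // 2, device=device, dtype=dtype)[:, None]
        ids[..., 2] += torch.arange(width // 2, device=device, dtype=dtype)[None, :]
        return ids.reshape(-1, 3)


pipe = LatentIdCacheSketch()
a = pipe._prepare_latent_image_ids(1, 64, 64, torch.device("cpu"), torch.float32)
b = pipe._prepare_latent_image_ids(1, 64, 64, torch.device("cpu"), torch.float32)
assert a is b  # the second call hits the cache and returns the identical object

One caveat of lru_cache on an instance method is that self is part of the cache key and is held by a strong reference, so cached entries keep the instance (and its device tensors) alive until the cache is cleared via cache_clear(). For CUDA graph capture that pinning is arguably the point: the cached ids must outlive the captured graph.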
