Skip to content

Commit 7d4385b

Browse files
astachowicz-habana authored and ugolowic committed
Add option to use bf16 in PT sdp (#5) (huggingface#1514)
Co-authored-by: Urszula Golowicz <[email protected]>
1 parent 267ace3 commit 7d4385b

4 files changed

Lines changed: 22 additions & 0 deletions

File tree

examples/stable-diffusion/text_to_image_generation.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -228,6 +228,9 @@ def main():
228228
),
229229
)
230230
parser.add_argument("--bf16", action="store_true", help="Whether to perform generation in bf16 precision.")
231+
parser.add_argument(
232+
"--sdp_on_bf16", action="store_true", help="Allow pyTorch to use reduced precision in the SDPA math backend"
233+
)
231234
parser.add_argument(
232235
"--ldm3d", action="store_true", help="Use LDM3D to generate an image and a depth map from a given text prompt."
233236
)
@@ -344,6 +347,7 @@ def main():
344347
"use_habana": args.use_habana,
345348
"use_hpu_graphs": args.use_hpu_graphs,
346349
"gaudi_config": args.gaudi_config_name,
350+
"sdp_on_bf16": args.sdp_on_bf16,
347351
}
348352

349353
if scheduler is not None:

optimum/habana/diffusers/pipelines/pipeline_utils.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,8 @@ class GaudiDiffusionPipeline(DiffusionPipeline):
113113
bf16_full_eval (bool, defaults to `False`):
114114
Whether to use full bfloat16 evaluation instead of 32-bit.
115115
This will be faster and save memory compared to fp32/mixed precision but can harm generated images.
116+
sdp_on_bf16 (bool, defaults to `False`):
117+
Whether to allow PyTorch to use reduced precision in the SDPA math backend.
116118
"""
117119

118120
def __init__(
@@ -121,9 +123,13 @@ def __init__(
121123
use_hpu_graphs: bool = False,
122124
gaudi_config: Union[str, GaudiConfig] = None,
123125
bf16_full_eval: bool = False,
126+
sdp_on_bf16: bool = False,
124127
):
125128
DiffusionPipeline.__init__(self)
126129

130+
if sdp_on_bf16:
131+
torch._C._set_math_sdp_allow_fp16_bf16_reduction(True)
132+
127133
self.use_habana = use_habana
128134
if self.use_habana:
129135
self.use_hpu_graphs = use_hpu_graphs

optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -131,6 +131,8 @@ class GaudiStableDiffusionPipeline(GaudiDiffusionPipeline, StableDiffusionPipeli
131131
bf16_full_eval (bool, defaults to `False`):
132132
Whether to use full bfloat16 evaluation instead of 32-bit.
133133
This will be faster and save memory compared to fp32/mixed precision but can harm generated images.
134+
sdp_on_bf16 (bool, defaults to `False`):
135+
Whether to allow PyTorch to use reduced precision in the SDPA math backend.
134136
"""
135137

136138
def __init__(
@@ -148,13 +150,15 @@ def __init__(
148150
use_hpu_graphs: bool = False,
149151
gaudi_config: Union[str, GaudiConfig] = None,
150152
bf16_full_eval: bool = False,
153+
sdp_on_bf16: bool = False,
151154
):
152155
GaudiDiffusionPipeline.__init__(
153156
self,
154157
use_habana,
155158
use_hpu_graphs,
156159
gaudi_config,
157160
bf16_full_eval,
161+
sdp_on_bf16,
158162
)
159163

160164
# Workaround for Synapse 1.11 for full bf16

optimum/habana/transformers/training_args.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -305,6 +305,11 @@ class GaudiTrainingArguments(TrainingArguments):
305305
},
306306
)
307307

308+
sdp_on_bf16: bool = field(
309+
default=False,
310+
metadata={"help": "Allow pyTorch to use reduced precision in the SDPA math backend"},
311+
)
312+
308313
fp8: Optional[bool] = field(
309314
default=False,
310315
metadata={"help": "Whether to use fp8 for training."},
@@ -847,6 +852,9 @@ def _setup_devices(self) -> "torch.device":
847852
):
848853
gaudi_config.declare_autocast_bf16_fp32_ops()
849854

855+
if self.sdp_on_bf16:
856+
torch._C._set_math_sdp_allow_fp16_bf16_reduction(True)
857+
850858
logger.info("PyTorch: setting up devices")
851859
if not is_accelerate_available():
852860
raise ImportError(

0 commit comments

Comments (0)