Commit b27e801

sgugger authored and Magnus Pierrau committed
Migrate torchdynamo to torch.compile (huggingface#20634)
* Migrate torchdynamo to torch.compile
* Add docstring and generic option
* Properly use the function...
* Reorg args
1 parent af927c7 · commit b27e801

File tree

5 files changed (+70 −15 lines)


docs/source/en/perf_train_gpu_one.mdx

Lines changed: 3 additions & 3 deletions
```diff
@@ -718,11 +718,11 @@ For some applications, such as pretraining large language models, applying all t
 
 Another use case for training on many GPUs is if the model does not fit on a single GPU with all the mentioned tricks. There are still more methods we can apply although life starts to get a bit more complicated. This usually involves some form of pipeline or tensor parallelism where the model itself is distributed across several GPUs. One can also make use of DeepSpeed which implements some of these parallelism strategies along with some more optimization to reduce the memory footprint such as partitioning the optimizer states. You can read more about this in the ["Multi-GPU training" section](perf_train_gpu_many).
 
-## Inference with torchdynamo
+## Using torch.compile
 
-TorchDynamo is a new tracer that uses Python’s frame evaluation API to automatically create FX traces from existing PyTorch programs. After capturing the FX graph, different backends can be deployed to lower the graph to an optimized engine. You can choose one option below for performance boost.
+PyTorch 2.0 introduces a new compile function; you can learn more about it [in their documentation](https://pytorch.org/get-started/pytorch-2.0/). It uses Python’s frame evaluation API to automatically create a graph from existing PyTorch programs. After capturing the graph, different backends can be deployed to lower the graph to an optimized engine. You can choose one option below for a performance boost.
 
-TorchDynamo has a growing list of backends, which can be found in [backends.py](https://github.com/pytorch/pytorch/blob/master/torch/_dynamo/optimizations/backends.py)
+`torch.compile` has a growing list of backends, which can be found in [backends.py](https://github.com/pytorch/pytorch/blob/master/torch/_dynamo/optimizations/backends.py)
 or `torchdynamo.list_backends()` each of which with its optional dependencies.
 
 Some of the most commonly used backends are
```
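To make the documented flow concrete, here is a minimal sketch of direct `torch.compile` usage, assuming a PyTorch build that ships the function (the toy model and shapes are illustrative, not part of this commit):

```python
import torch
import torch.nn as nn

# Any PyTorch module works; this toy network stands in for a Transformers model.
model = nn.Sequential(nn.Linear(128, 256), nn.GELU(), nn.Linear(256, 2))

# Capture the graph and lower it with the default "inductor" backend.
compiled_model = torch.compile(model, backend="inductor")

# The first call triggers graph capture and compilation; later calls reuse it.
out = compiled_model(torch.randn(8, 128))
```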

src/transformers/trainer.py

Lines changed: 6 additions & 7 deletions
```diff
@@ -144,8 +144,8 @@
     is_ipex_available,
     is_sagemaker_dp_enabled,
     is_sagemaker_mp_enabled,
+    is_torch_compile_available,
     is_torch_tpu_available,
-    is_torchdynamo_available,
     logging,
 )
 from .utils.generic import ContextManagers
@@ -642,9 +642,9 @@ def __init__(
         # very last
         self._memory_tracker.stop_and_update_metrics()
 
-        # torchdynamo
-        if args.torchdynamo is not None and not is_torchdynamo_available():
-            raise RuntimeError("Using torchdynamo requires a nighly install of PyTorch.")
+        # torch.compile
+        if args.torch_compile and not is_torch_compile_available():
+            raise RuntimeError("Using torch.compile requires a nightly install of PyTorch.")
 
     def add_callback(self, callback):
         """
@@ -1321,10 +1321,9 @@ def ipex_optimize_model(self, model, training=False, dtype=torch.float32):
         return model
 
     def _wrap_model(self, model, training=True, dataloader=None):
-        if self.args.torchdynamo is not None:
-            import torch._dynamo as dynamo
+        if self.args.torch_compile:
+            model = torch.compile(model, backend=self.args.torch_compile_backend, mode=self.args.torch_compile_mode)
 
-            model = dynamo.optimize(self.args.torchdynamo)(model)
         if self.args.use_ipex:
             dtype = torch.bfloat16 if self.use_cpu_amp else torch.float32
             model = self.ipex_optimize_model(model, training, dtype=dtype)
```
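From a user's point of view, the `_wrap_model` change above is driven entirely by the new training arguments. A hedged sketch of opting in, assuming this version of `transformers` (the model and dataset wiring is elided):

```python
from transformers import TrainingArguments

# With `torch_compile=True`, `_wrap_model` calls
# `torch.compile(model, backend="inductor", mode=None)` under the hood,
# since the backend defaults to "inductor" and the mode stays unset.
args = TrainingArguments(
    output_dir="out",
    torch_compile=True,
    # torch_compile_backend="inductor",  # optional; setting it alone also enables compilation
    # torch_compile_mode="default",      # optional; same implicit opt-in
)

# trainer = Trainer(model=model, args=args, train_dataset=train_dataset)
# trainer.train()
```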

src/transformers/training_args.py

Lines changed: 51 additions & 5 deletions
```diff
@@ -73,7 +73,7 @@
 trainer_log_levels = dict(**log_levels, passive=-1)
 
 
-DYNAMO_BACKENDS = [
+TORCH_COMPILE_BACKENDS = [
     "eager",
     "aot_eager",
     "inductor",
@@ -514,6 +514,21 @@ class TrainingArguments:
             information.
         use_mps_device (`bool`, *optional*, defaults to `False`):
             Whether to use Apple Silicon chip based `mps` device.
+        torch_compile (`bool`, *optional*, defaults to `False`):
+            Whether or not to compile the model using PyTorch 2.0
+            [`torch.compile`](https://pytorch.org/get-started/pytorch-2.0/) (requires a nightly install of PyTorch).
+
+            If set, the backend will default to `"inductor"` (can be customized with `torch_compile_backend`) and the
+            mode will default to `"default"` (can be customized with `torch_compile_mode`).
+        torch_compile_backend (`str`, *optional*):
+            The backend to use in `torch.compile`. If set to any value, `torch_compile` will be set to `True`.
+
+            Possible choices are `"eager"`, `"aot_eager"`, `"inductor"`, `"nvfuser"`, `"aot_nvfuser"`,
+            `"aot_cudagraphs"`, `"ofi"`, `"fx2trt"`, `"onnxrt"` and `"ipex"`.
+        torch_compile_mode (`str`, *optional*):
+            The mode to use in `torch.compile`. If set to any value, `torch_compile` will be set to `True`.
+
+            Possible choices are `"default"`, `"reduce-overhead"` and `"max-autotune"`.
     """
 
     framework = "pt"
@@ -983,8 +998,8 @@ class TrainingArguments:
     torchdynamo: Optional[str] = field(
         default=None,
         metadata={
-            "help": "Sets up the backend compiler for TorchDynamo.",
-            "choices": DYNAMO_BACKENDS,
+            "help": "This argument is deprecated, use `--torch_compile_backend` instead.",
+            "choices": TORCH_COMPILE_BACKENDS,
         },
     )
     ray_scope: Optional[str] = field(
@@ -1006,6 +1021,23 @@
             "help": "Overrides the default timeout for distributed training (value should be given in seconds)."
         },
     )
+    torch_compile: bool = field(
+        default=False, metadata={"help": "If set to `True`, the model will be wrapped in `torch.compile`."}
+    )
+    torch_compile_backend: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": "Which backend to use with `torch.compile`, passing one will trigger a model compilation.",
+            "choices": TORCH_COMPILE_BACKENDS,
+        },
+    )
+    torch_compile_mode: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": "Which mode to use with `torch.compile`, passing one will trigger a model compilation.",
+            "choices": ["default", "reduce-overhead", "max-autotune"],
+        },
+    )
 
     def __post_init__(self):
         # Handle --use_env option in torch.distributed.launch (local_rank not passed as an arg then).
@@ -1148,10 +1180,24 @@ def __post_init__(self):
                 " (`--bf16_full_eval`) can only be used on CUDA or CPU devices."
             )
 
-        if self.framework == "pt" and is_torch_available() and self.torchdynamo is not None:
+        if self.torchdynamo is not None:
+            warnings.warn(
+                "`torchdynamo` is deprecated and will be removed in version 5 of 🤗 Transformers. Use"
+                " `torch_compile_backend` instead",
+                FutureWarning,
+            )
+            self.torch_compile_backend = self.torchdynamo
+        if (self.torch_compile_mode is not None or self.torch_compile_backend is not None) and not self.torch_compile:
+            self.torch_compile = True
+        if self.torch_compile and self.torch_compile_backend is None:
+            self.torch_compile_backend = "inductor"
+        if self.framework == "pt" and is_torch_available() and self.torch_compile:
             if is_torch_tf32_available():
                 if self.tf32 is None and not self.fp16 or self.bf16:
-                    logger.info("Setting TF32 in CUDA backends to speedup torchdynamo.")
+                    logger.info(
+                        "Setting TF32 in CUDA backends to speedup torch compile, you won't see any improvement"
+                        " otherwise."
+                    )
                     torch.backends.cuda.matmul.allow_tf32 = True
                 else:
                     logger.warning(
```
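The `__post_init__` changes above resolve the three new flags against each other and route the deprecated `torchdynamo` argument onto the new path. A small sketch of the resulting behavior, assuming this version of the library:

```python
from transformers import TrainingArguments

# Supplying only a backend implicitly enables compilation.
args = TrainingArguments(output_dir="out", torch_compile_backend="inductor")
assert args.torch_compile is True

# Enabling compilation without a backend falls back to "inductor".
args = TrainingArguments(output_dir="out", torch_compile=True)
assert args.torch_compile_backend == "inductor"

# The deprecated `torchdynamo` argument is forwarded (and emits a FutureWarning).
args = TrainingArguments(output_dir="out", torchdynamo="eager")
assert args.torch_compile and args.torch_compile_backend == "eager"
```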

src/transformers/utils/__init__.py

Lines changed: 1 addition & 0 deletions
```diff
@@ -148,6 +148,7 @@
     is_torch_bf16_available,
     is_torch_bf16_cpu_available,
     is_torch_bf16_gpu_available,
+    is_torch_compile_available,
     is_torch_cuda_available,
     is_torch_fx_available,
     is_torch_fx_proxy,
```

src/transformers/utils/import_utils.py

Lines changed: 9 additions & 0 deletions
```diff
@@ -455,6 +455,15 @@ def is_torchdynamo_available():
     return False
 
 
+def is_torch_compile_available():
+    if not is_torch_available():
+        return False
+
+    import torch
+
+    return hasattr(torch, "compile")
+
+
 def is_torch_tensorrt_fx_available():
     if importlib.util.find_spec("torch_tensorrt") is None:
         return False
```
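The new helper feature-detects `torch.compile` at runtime, which is what the Trainer's constructor check relies on. A minimal sketch of the same guard in user code (`maybe_compile` is a hypothetical helper, not part of the commit):

```python
from transformers.utils import is_torch_compile_available


def maybe_compile(model):
    # True only when the installed torch exposes `torch.compile`
    # (PyTorch 2.0 or a recent nightly build).
    if is_torch_compile_available():
        import torch

        return torch.compile(model)
    # On older PyTorch, fall back to eager execution unchanged.
    return model
```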
