
Commit 6764755

resolve conflicts and define patch function
1 parent b6369c0 commit 6764755

File tree

2 files changed: +61 additions, -26 deletions

plugins/accelerated-peft/src/fms_acceleration_peft/autogptq_utils.py

Lines changed: 19 additions & 9 deletions
@@ -16,20 +16,23 @@
 # https://spdx.dev/learn/handling-license-info/

 # Standard
-from typing import Callable, List, Any
+from typing import Any, Callable, List
 import importlib

 # Third Party
 from peft import LoraConfig
 from peft.tuners.lora.gptq import QuantLinear as LoraLinearGPTQ
 import torch

-def patch_target_module(
+
+# This function will be replaced after merging
+# https://github.com/foundation-model-stack/fms-acceleration/pull/25
+def _patch_target_module(
     to_patch: str,
     replace_with: Any,
     target_module: str = None,
 ):
-    to_patch = to_patch.split('.')
+    to_patch = to_patch.split(".")
     assert len(to_patch) > 1, "must have an object to patch"

     to_patch, obj_name_to_patch = to_patch[:-1], to_patch[-1]
@@ -46,6 +49,7 @@ def patch_target_module(
     # replace it
     setattr(source, obj_name_to_patch, original_obj)

+
 def make_sure_no_tensor_in_meta_device(
     model,
     use_triton: bool,
@@ -57,7 +61,11 @@ def make_sure_no_tensor_in_meta_device(
     use_marlin: bool = False,
     use_tritonv2: bool = False,
 ):
-    from auto_gptq.utils.import_utils import dynamically_import_QuantLinear #pylint: disable=import-outside-toplevel,import-error
+    # Third Party
+    from auto_gptq.utils.import_utils import ( # pylint: disable=import-outside-toplevel,import-error
+        dynamically_import_QuantLinear,
+    )
+
     QuantLinear = dynamically_import_QuantLinear(
         use_triton,
         desc_act,
@@ -66,15 +74,17 @@ def make_sure_no_tensor_in_meta_device(
         disable_exllama=disable_exllama,
         disable_exllamav2=disable_exllamav2,
         use_marlin=use_marlin,
-        use_tritonv2=use_tritonv2
-    )
-    for n, m in model.named_modules():
+        use_tritonv2=use_tritonv2,
+    )
+    for _, m in model.named_modules():
         bias = getattr(m, "bias", None)
         if bias:
             if isinstance(m, QuantLinear) and bias.device == torch.device("meta"):
                 m.register_buffer(
-                    "bias", torch.zeros((m.outfeatures), dtype=torch.float16, device="cpu")
-                )
+                    "bias",
+                    torch.zeros((m.outfeatures), dtype=torch.float16, device="cpu"),
+                )
+

 def replace_module_peft(self, parent_module, child_name, new_module, old_module):
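Note: `_patch_target_module` works by importing the module that owns the target attribute and overwriting that attribute in place, so downstream code picks up the replacement. A minimal, self-contained sketch of the same idea (the `patch_attribute` helper and the `math.sqrt` toy target are illustrative assumptions, not this repo's exact implementation):

# Minimal sketch of importlib-based attribute patching (illustrative only;
# patch_attribute and the math.sqrt example are hypothetical, not repo code).
import importlib
from typing import Any


def patch_attribute(to_patch: str, replace_with: Any) -> Any:
    # "pkg.module.attr" -> import "pkg.module", then overwrite "attr"
    parts = to_patch.split(".")
    assert len(parts) > 1, "must have an object to patch"
    module_path, attr_name = ".".join(parts[:-1]), parts[-1]
    module = importlib.import_module(module_path)
    original = getattr(module, attr_name)
    setattr(module, attr_name, replace_with)
    return original  # caller can restore it later


# Toy usage: replace math.sqrt, observe the change, then restore it.
import math

original_sqrt = patch_attribute("math.sqrt", lambda x: -1.0)
assert math.sqrt(4) == -1.0
math.sqrt = original_sqrt
assert math.sqrt(4) == 2.0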

plugins/accelerated-peft/src/fms_acceleration_peft/framework_plugin_autogptq.py

Lines changed: 42 additions & 17 deletions
@@ -31,6 +31,7 @@
 import torch
 import torch.distributed

+
 class AutoGPTQAccelerationPlugin(AccelerationPlugin):

     require_packages = ["auto_gptq"]
@@ -50,11 +51,18 @@ def __init__(self, configurations: Dict[str, Dict]):
     def model_loader(self, model_name: str, **kwargs):
         # guarded imports
         # Third Party
-        from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig #pylint: disable=import-outside-toplevel,import-error
-        from auto_gptq.nn_modules.qlinear.qlinear_tritonv2 import QuantLinear #pylint: disable=import-outside-toplevel,import-error
+        from auto_gptq import ( # pylint: disable=import-outside-toplevel,import-error
+            AutoGPTQForCausalLM,
+            BaseQuantizeConfig,
+        )
+        from auto_gptq.nn_modules.qlinear.qlinear_tritonv2 import ( # pylint: disable=import-outside-toplevel,import-error
+            QuantLinear,
+        )

         # Local
-        from .autogptq_utils import patch_forward_to_view_attributes_before_call #pylint: disable=import-outside-toplevel
+        from .autogptq_utils import ( # pylint: disable=import-outside-toplevel
+            patch_forward_to_view_attributes_before_call,
+        )

         # Currently we allow only a quantized checkpoint to be loaded, we do not
         # implement the quantization process here.
@@ -93,27 +101,36 @@ def model_loader(self, model_name: str, **kwargs):
         AutoModelForCausalLM.from_config = _from_config # patch

         if is_fsdp_enabled():
-            from .autogptq_utils import patch_target_module, make_sure_no_tensor_in_meta_device #pylint: disable=import-outside-toplevel
-            # We patch `make_sure_no_tensor_in_meta_device` from autogptq to avoid errors on models without bias
-            patch_target_module(
-                to_patch = "auto_gptq.modeling._utils.make_sure_no_tensor_in_meta_device",
-                replace_with = make_sure_no_tensor_in_meta_device,
-                target_module = "auto_gptq.modeling._base",
+            # Local
+            from .autogptq_utils import ( # pylint: disable=import-outside-toplevel
+                _patch_target_module,
+                make_sure_no_tensor_in_meta_device,
+            )
+
+            # We patch `make_sure_no_tensor_in_meta_device`
+            # from autogptq to avoid errors on models without bias
+            _patch_target_module(
+                to_patch="auto_gptq.modeling._utils.make_sure_no_tensor_in_meta_device",
+                replace_with=make_sure_no_tensor_in_meta_device,
+                target_module="auto_gptq.modeling._base",
             )
             low_cpu_mem_usage = True

         # NOTE: need to set the device map as below as we want to use AutoGPTQ for training.
-        # device_map is for inference only https://huggingface.co/docs/accelerate/en/concept_guides/big_model_inference
+        # device_map is for inference only
+        # https://huggingface.co/docs/accelerate/en/concept_guides/big_model_inference
         # For low_cpu_mem_usage = True, we have to set the device map to load checkpoints to "cpu"
         # to avoid gpu consumption before train
-        # This approach will divert consumption to cpu memory, a better approach would be to load the checkpoints to meta device
+        # This approach will divert consumption to cpu memory,
+        # a better approach would be to load the checkpoints to meta device
         # QLoRA is currently implemented by the former approach and will encounter the same issue.
         # see https://github.com/huggingface/transformers/pull/25107#issuecomment-2134833262
         device_map = {
             "": (
-                torch.cuda.current_device() if not low_cpu_mem_usage
-                else "cpu"
-            ) if torch.cuda.is_available() else None
+                (torch.cuda.current_device() if not low_cpu_mem_usage else "cpu")
+                if torch.cuda.is_available()
+                else None
+            )
         }

         # currently only enable triton_v2, because the triton kernels are the only ones
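Note: the NOTE comments in the hunk above explain the device-map choice. The sketch below restates that selection as a standalone helper so the three outcomes are explicit; `select_device_map` is a hypothetical name, since the plugin builds this dict inline inside model_loader:

# Sketch of the device_map selection above (illustrative; not a helper that
# exists in the plugin).
import torch


def select_device_map(low_cpu_mem_usage: bool) -> dict:
    return {
        "": (
            (torch.cuda.current_device() if not low_cpu_mem_usage else "cpu")
            if torch.cuda.is_available()
            else None
        )
    }


# With CUDA and low_cpu_mem_usage=False -> {"": 0}      (current device index)
# With CUDA and low_cpu_mem_usage=True  -> {"": "cpu"}  (defer GPU placement until training)
# Without CUDA                          -> {"": None}
print(select_device_map(low_cpu_mem_usage=True))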
@@ -202,11 +219,19 @@ def augmentation(
     ):
         # guarded imports
         # Third Party
-        from auto_gptq.nn_modules.qlinear.qlinear_tritonv2 import QuantLinear #pylint: disable=import-outside-toplevel,import-error
-        from auto_gptq.utils.peft_utils import GPTQLoraModel, get_gptq_peft_model #pylint: disable=import-outside-toplevel,import-error
+        from auto_gptq.nn_modules.qlinear.qlinear_tritonv2 import ( # pylint: disable=import-outside-toplevel,import-error
+            QuantLinear,
+        )
+        from auto_gptq.utils.peft_utils import ( # pylint: disable=import-outside-toplevel,import-error
+            GPTQLoraModel,
+            get_gptq_peft_model,
+        )

         # Local
-        from .autogptq_utils import create_new_module_peft, replace_module_peft #pylint: disable=import-outside-toplevel
+        from .autogptq_utils import ( # pylint: disable=import-outside-toplevel
+            create_new_module_peft,
+            replace_module_peft,
+        )

         (peft_config,) = modifiable_args # unpack modifiable args
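Note: like model_loader, augmentation keeps its auto_gptq imports inside the method (the "guarded imports" marked with pylint disables), so the plugin module stays importable even when auto_gptq is not installed. A hedged sketch of that pattern, using a hypothetical helper rather than the plugin's actual code:

# Sketch of the guarded-import pattern (illustrative; load_gptq_entrypoints is
# a hypothetical helper, not part of this repo).
def load_gptq_entrypoints():
    try:
        # Third Party, imported lazily so module import never requires auto_gptq
        # pylint: disable=import-outside-toplevel,import-error
        from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
    except ImportError as exc:
        raise RuntimeError(
            "auto_gptq must be installed to use the AutoGPTQ acceleration plugin"
        ) from exc
    return AutoGPTQForCausalLM, BaseQuantizeConfig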
