
Commit 75b52b6

Add NPU support for one model in one node
Signed-off-by: ChenTaoyu-SJTU <[email protected]>
1 parent cd06115 commit 75b52b6

File tree

3 files changed: +33 −4 lines

xfuser/core/distributed/parallel_state.py

Lines changed: 5 additions & 0 deletions
@@ -22,6 +22,11 @@
 except ModuleNotFoundError:
     pass
 
+try:
+    from torch.npu import set_device, device_count
+except ModuleNotFoundError:
+    pass
+
 from .utils import RankGenerator
 
 env_info = envs.PACKAGES_CHECKER.get_packages_info()
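
Note (not part of the commit): the guarded import above mirrors the pattern this file already uses for other optional backends. Below is a minimal sketch of the same probe, assuming the Ascend torch_npu plugin is what makes torch.npu importable; HAS_NPU is a hypothetical name used only for illustration.

# Minimal sketch (assumption): probe for torch.npu and degrade cleanly when
# the Ascend torch_npu plugin is not installed.
try:
    from torch.npu import set_device, device_count
    HAS_NPU = True
except ModuleNotFoundError:
    # No Ascend plugin installed; CUDA/MUSA/MPS code paths remain usable.
    HAS_NPU = False

if HAS_NPU:
    print(f"visible NPU devices: {device_count()}")
    set_device(0)  # bind this process to the first NPU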

xfuser/envs.py

Lines changed: 25 additions & 3 deletions
@@ -67,24 +67,36 @@ def _is_mps():
     return torch.backends.mps.is_available()
 
 
+def _is_npu():
+    try:
+        if hasattr(torch, "npu") and torch.npu.is_available():
+            return True
+    except ModuleNotFoundError:
+        return False
+
+
 def get_device(local_rank: int) -> torch.device:
-    if torch.cuda.is_available():
+    if _is_cuda():
         return torch.device("cuda", local_rank)
     elif _is_musa():
         return torch.device("musa", local_rank)
     elif _is_mps():
         return torch.device("mps")
+    elif _is_npu():
+        return torch.device("npu", local_rank)
     else:
         return torch.device("cpu")
 
 
 def get_device_name() -> str:
-    if torch.cuda.is_available():
+    if _is_cuda():
         return "cuda"
     elif _is_musa():
         return "musa"
     elif _is_mps():
         return "mps"
+    elif _is_npu():
+        return "npu"
     else:
         return "cpu"
 
@@ -100,19 +112,23 @@ def get_device_version():
         return torch.version.musa
     elif _is_mps():
         return None
+    elif _is_npu():
+        return None
     else:
         raise NotImplementedError(
             "No Accelerators(AMD/NV/MTT GPU, AMD MI instinct accelerators) available"
         )
 
 
 def get_torch_distributed_backend() -> str:
-    if torch.cuda.is_available():
+    if _is_cuda():
         return "nccl"
     elif _is_musa():
         return "mccl"
     elif _is_mps():
         return "gloo"
+    elif _is_npu():
+        return "hccl"
     else:
         raise NotImplementedError(
             "No Accelerators(AMD/NV/MTT GPU, AMD MI instinct accelerators) available"
@@ -191,6 +207,12 @@ def check_aiter(self):
     def check_flash_attn(self):
         if not torch.cuda.is_available():
             return False
+
+        # Check if torch_npu is available
+        if _is_npu():
+            logger.info("flash_attn is not ready on torch_npu for now")
+            return False
+
         if _is_musa():
             logger.info(
                 "Flash Attention library is not supported on MUSA for the moment."

xfuser/model_executor/pipelines/pipeline_flux.py

Lines changed: 3 additions & 1 deletion
@@ -40,6 +40,7 @@
 from xfuser.core.distributed.group_coordinator import GroupCoordinator
 from .base_pipeline import xFuserPipelineBaseWrapper
 from .register import xFuserPipelineWrapperRegister
+from ...envs import _is_npu
 
 if is_torch_xla_available():
     import torch_xla.core.xla_model as xm
@@ -75,13 +76,14 @@ def prepare_run(
         prompt = [""] * input_config.batch_size if input_config.batch_size > 1 else ""
         warmup_steps = get_runtime_state().runtime_config.warmup_steps
         get_runtime_state().runtime_config.warmup_steps = sync_steps
+        device = "npu" if _is_npu() else "cuda"
         self.__call__(
             height=input_config.height,
             width=input_config.width,
             prompt=prompt,
             num_inference_steps=steps,
             max_sequence_length=input_config.max_sequence_length,
-            generator=torch.Generator(device="cuda").manual_seed(42),
+            generator=torch.Generator(device=device).manual_seed(42),
             output_type=input_config.output_type,
         )
         get_runtime_state().runtime_config.warmup_steps = warmup_steps
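
Note (illustrative, not part of the commit): the only behavioral change here is that the warm-up generator is no longer hard-wired to CUDA. Below is a standalone sketch of the same pattern, assuming torch_npu is installed so that torch.Generator accepts an "npu" device string.

# Standalone sketch of the generator fix: torch.Generator(device="cuda") would
# fail on an NPU-only machine, so the device string follows the detected backend.
import torch
from xfuser.envs import _is_npu

device = "npu" if _is_npu() else "cuda"
generator = torch.Generator(device=device).manual_seed(42)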

0 commit comments
