5 changes: 5 additions & 0 deletions xfuser/core/distributed/group_coordinator.py
@@ -18,6 +18,11 @@
except ModuleNotFoundError:
    pass

import xfuser.envs as envs
if envs._is_npu():
    print("torch.npu synchronize")
    from torch.npu import synchronize

import xfuser.envs as envs
from xfuser.logger import init_logger

10 changes: 10 additions & 0 deletions xfuser/core/distributed/parallel_state.py
@@ -22,6 +22,11 @@
except ModuleNotFoundError:
    pass

try:
    from torch.npu import set_device, device_count
except ModuleNotFoundError:
    pass

from .utils import RankGenerator

env_info = envs.PACKAGES_CHECKER.get_packages_info()
@@ -396,6 +401,11 @@ def initialize_model_parallel(
f"sequence_parallel_degree is not equal to ring_degree * ulysses_degree, {sequence_parallel_degree} != {ring_degree} * {ulysses_degree}"
)

# FIXME: Since the async p2p communication operation of NPU is not same as cuda in torch,
# the pipefusion is not ready for npu yet
if envs._is_npu():
assert pipeline_parallel_degree == 1, "Current pipefusion is not ready for NPU"

dit_parallel_size = (
data_parallel_degree
* classifier_free_guidance_degree
4 changes: 4 additions & 0 deletions xfuser/core/distributed/runtime_state.py
@@ -17,6 +17,10 @@
except ModuleNotFoundError:
    pass

import xfuser.envs as envs
if envs._is_npu():
    from torch.npu import manual_seed as device_manual_seed
    from torch.npu import manual_seed_all as device_manual_seed_all
from xfuser.config.config import (
    ParallelConfig,
    RuntimeConfig,
4 changes: 4 additions & 0 deletions xfuser/core/utils/timer.py
@@ -9,6 +9,10 @@
except ModuleNotFoundError:
    pass

import xfuser.envs as envs
if envs._is_npu():
    from torch.npu import synchronize

def gpu_timer_decorator(func):
    def wrapper(*args, **kwargs):
        synchronize()
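
The timer hunk is truncated after the first synchronize() call. For context, here is a minimal sketch of what a device-agnostic timing decorator of this shape typically looks like; the _sync() helper and the timing/printing details are illustrative, not the exact body of gpu_timer_decorator in this PR.

import functools
import time

import torch


def _sync():
    # Illustrative helper: flush queued kernels on whichever accelerator is present,
    # mirroring the conditional "from torch.npu import synchronize" import above.
    if hasattr(torch, "npu") and torch.npu.is_available():
        torch.npu.synchronize()
    elif torch.cuda.is_available():
        torch.cuda.synchronize()


def gpu_timer_decorator(func):
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        _sync()  # make sure pending work is finished before timing starts
        start = time.time()
        result = func(*args, **kwargs)
        _sync()  # wait for the wrapped call's kernels to complete
        print(f"{func.__name__} took {time.time() - start:.3f} s")
        return result

    return wrapper
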
28 changes: 25 additions & 3 deletions xfuser/envs.py
@@ -67,24 +67,36 @@ def _is_mps():
    return torch.backends.mps.is_available()


def _is_npu():
    try:
        if hasattr(torch, "npu") and torch.npu.is_available():
            return True
    except ModuleNotFoundError:
        return False


def get_device(local_rank: int) -> torch.device:
-    if torch.cuda.is_available():
+    if _is_cuda():
        return torch.device("cuda", local_rank)
    elif _is_musa():
        return torch.device("musa", local_rank)
    elif _is_mps():
        return torch.device("mps")
    elif _is_npu():
        return torch.device("npu", local_rank)
    else:
        return torch.device("cpu")


def get_device_name() -> str:
-    if torch.cuda.is_available():
+    if _is_cuda():
        return "cuda"
    elif _is_musa():
        return "musa"
    elif _is_mps():
        return "mps"
    elif _is_npu():
        return "npu"
    else:
        return "cpu"

@@ -100,19 +112,23 @@ def get_device_version():
        return torch.version.musa
    elif _is_mps():
        return None
    elif _is_npu():
        return None
    else:
        raise NotImplementedError(
            "No Accelerators(AMD/NV/MTT GPU, AMD MI instinct accelerators) available"
        )


def get_torch_distributed_backend() -> str:
-    if torch.cuda.is_available():
+    if _is_cuda():
        return "nccl"
    elif _is_musa():
        return "mccl"
    elif _is_mps():
        return "gloo"
    elif _is_npu():
        return "hccl"
    else:
        raise NotImplementedError(
            "No Accelerators(AMD/NV/MTT GPU, AMD MI instinct accelerators) available"
@@ -191,6 +207,12 @@ def check_aiter(self):
    def check_flash_attn(self):
        if not torch.cuda.is_available():
            return False

        # Check if torch_npu is available
        if _is_npu():
            logger.info("flash_attn is not ready on torch_npu for now")
            return False

        if _is_musa():
            logger.info(
                "Flash Attention library is not supported on MUSA for the moment."
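
The new _is_npu() branch slots into the same dispatch chain as CUDA, MUSA and MPS, so callers can stay backend-agnostic. Below is a minimal sketch of how downstream code might combine these helpers when setting up a process; the init_distributed function name and the bare init_process_group call are illustrative, not part of xfuser.

import torch
import torch.distributed as dist

import xfuser.envs as envs


def init_distributed(local_rank: int) -> torch.device:
    # Resolve the accelerator once; get_device() falls back to CPU if none is found.
    device = envs.get_device(local_rank)            # e.g. npu:0, cuda:0, or cpu
    backend = envs.get_torch_distributed_backend()  # hccl on NPU, nccl on CUDA, mccl on MUSA

    # Bind the process to its local device before creating communicators.
    if envs.get_device_name() == "npu":
        torch.npu.set_device(local_rank)            # requires torch_npu to be installed
    elif envs.get_device_name() == "cuda":
        torch.cuda.set_device(local_rank)

    dist.init_process_group(backend=backend)
    return device
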
4 changes: 4 additions & 0 deletions xfuser/model_executor/layers/feedforward.py
@@ -11,6 +11,10 @@
except ModuleNotFoundError:
    pass

import xfuser.envs as envs
if envs._is_npu():
    from torch.npu import empty_cache

from xfuser.core.distributed.parallel_state import (
    get_tensor_model_parallel_world_size,
    get_tensor_model_parallel_rank,
4 changes: 3 additions & 1 deletion xfuser/model_executor/pipelines/pipeline_flux.py
@@ -40,6 +40,7 @@
from xfuser.core.distributed.group_coordinator import GroupCoordinator
from .base_pipeline import xFuserPipelineBaseWrapper
from .register import xFuserPipelineWrapperRegister
from ...envs import _is_npu

if is_torch_xla_available():
    import torch_xla.core.xla_model as xm
@@ -75,13 +76,14 @@ def prepare_run(
prompt = [""] * input_config.batch_size if input_config.batch_size > 1 else ""
warmup_steps = get_runtime_state().runtime_config.warmup_steps
get_runtime_state().runtime_config.warmup_steps = sync_steps
device = "npu" if _is_npu() else "cuda"
self.__call__(
height=input_config.height,
width=input_config.width,
prompt=prompt,
num_inference_steps=steps,
max_sequence_length=input_config.max_sequence_length,
generator=torch.Generator(device="cuda").manual_seed(42),
generator=torch.Generator(device=device).manual_seed(42),
output_type=input_config.output_type,
)
get_runtime_state().runtime_config.warmup_steps = warmup_steps
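
Both prepare_run hunks apply the same idea: resolve the generator device from _is_npu() instead of hardcoding "cuda". A small standalone sketch of that pattern follows; the helper name is hypothetical, and building a Generator on "npu" assumes torch_npu is installed.

import torch

from xfuser.envs import _is_npu


def make_warmup_generator(seed: int = 42) -> torch.Generator:
    # Hypothetical helper mirroring the pattern in both prepare_run hunks:
    # seed the generator on the accelerator that is actually present.
    device = "npu" if _is_npu() else "cuda"
    return torch.Generator(device=device).manual_seed(seed)
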
Changes to a second pipeline wrapper (file name not captured in this view)
@@ -41,6 +41,7 @@
)
from .base_pipeline import xFuserPipelineBaseWrapper
from .register import xFuserPipelineWrapperRegister
from ...envs import _is_npu

if is_torch_xla_available():
    import torch_xla.core.xla_model as xm
@@ -74,12 +75,15 @@ def prepare_run(
prompt = [""] * input_config.batch_size if input_config.batch_size > 1 else ""
warmup_steps = get_runtime_state().runtime_config.warmup_steps
get_runtime_state().runtime_config.warmup_steps = sync_steps
device = "cuda"
if _is_npu():
device = "npu"
self.__call__(
height=input_config.height,
width=input_config.width,
prompt=prompt,
num_inference_steps=steps,
generator=torch.Generator(device="cuda").manual_seed(42),
generator=torch.Generator(device=device).manual_seed(42),
output_type=input_config.output_type,
)
get_runtime_state().runtime_config.warmup_steps = warmup_steps