
Commit 8a1e046

[Core][AMD] Migrate fully transparent sleep mode to ROCm platform
Signed-off-by: Hollow Man <[email protected]>
1 parent 1769928 commit 8a1e046

File tree: 8 files changed (+154, -11 lines)

CMakeLists.txt

Lines changed: 8 additions & 3 deletions

@@ -208,10 +208,15 @@ set_gencode_flags_for_srcs(
   SRCS "${VLLM_CUMEM_EXT_SRC}"
   CUDA_ARCHS "${CUDA_ARCHS}")
 
-if(VLLM_GPU_LANG STREQUAL "CUDA")
+if(VLLM_GPU_LANG STREQUAL "CUDA" OR VLLM_GPU_LANG STREQUAL "HIP")
   message(STATUS "Enabling cumem allocator extension.")
-  # link against cuda driver library
-  list(APPEND CUMEM_LIBS CUDA::cuda_driver)
+  if(VLLM_GPU_LANG STREQUAL "CUDA")
+    # link against cuda driver library
+    list(APPEND CUMEM_LIBS CUDA::cuda_driver)
+  else()
+    # link against rocm driver library
+    list(APPEND CUMEM_LIBS amdhip64)
+  endif()
   define_gpu_extension_target(
     cumem_allocator
     DESTINATION vllm

csrc/cumem_allocator.cpp

Lines changed: 2 additions & 2 deletions

@@ -3,14 +3,14 @@
 // need to be unsigned long long
 #include <iostream>
 
+#include "cumem_allocator_compat.h"
+
 extern "C" {
 
 #define PY_SSIZE_T_CLEAN
 #include <Python.h>
 
 #include <sys/types.h>
-#include <cuda_runtime_api.h>
-#include <cuda.h>
 
 char error_msg[10240];  // 10KB buffer to store error messages
 CUresult no_error = CUresult(0);

csrc/cumem_allocator_compat.h

Lines changed: 109 additions & 0 deletions

@@ -0,0 +1,109 @@
+#pragma once
+
+#ifdef USE_ROCM
+////////////////////////////////////////
+// For compatibility with CUDA and ROCm
+////////////////////////////////////////
+#include <hip/hip_runtime_api.h>
+
+extern "C" {
+#ifndef CUDA_SUCCESS
+#define CUDA_SUCCESS hipSuccess
+#endif  // CUDA_SUCCESS
+
+// https://rocm.docs.amd.com/projects/HIPIFY/en/latest/tables/CUDA_Driver_API_functions_supported_by_HIP.html
+typedef unsigned long long CUdevice;
+typedef hipDeviceptr_t CUdeviceptr;
+typedef hipError_t CUresult;
+typedef hipCtx_t CUcontext;
+typedef hipStream_t CUstream;
+typedef hipMemGenericAllocationHandle_t CUmemGenericAllocationHandle;
+typedef hipMemAllocationGranularity_flags CUmemAllocationGranularity_flags;
+typedef hipMemAllocationProp CUmemAllocationProp;
+typedef hipMemAccessDesc CUmemAccessDesc;
+
+#define CU_MEM_ALLOCATION_TYPE_PINNED hipMemAllocationTypePinned
+#define CU_MEM_LOCATION_TYPE_DEVICE hipMemLocationTypeDevice
+#define CU_MEM_ACCESS_FLAGS_PROT_READWRITE hipMemAccessFlagsProtReadWrite
+#define CU_MEM_ALLOC_GRANULARITY_MINIMUM hipMemAllocationGranularityMinimum
+
+// https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TYPES.html
+#define CU_MEM_ALLOCATION_COMP_NONE 0x0
+
+// Error Handling
+// https://docs.nvidia.com/cuda/archive/11.4.4/cuda-driver-api/group__CUDA__ERROR.html
+CUresult cuGetErrorString(CUresult hipError, const char** pStr) {
+  *pStr = hipGetErrorString(hipError);
+  return CUDA_SUCCESS;
+}
+
+// Context Management
+// https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html
+CUresult cuCtxGetCurrent(CUcontext* ctx) {
+  // This API is deprecated on the AMD platform, only for equivalent cuCtx
+  // driver API on the NVIDIA platform.
+  return hipCtxGetCurrent(ctx);
+}
+
+CUresult cuCtxSetCurrent(CUcontext ctx) {
+  // This API is deprecated on the AMD platform, only for equivalent cuCtx
+  // driver API on the NVIDIA platform.
+  return hipCtxSetCurrent(ctx);
+}
+
+// Primary Context Management
+// https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__PRIMARY__CTX.html
+CUresult cuDevicePrimaryCtxRetain(CUcontext* ctx, CUdevice dev) {
+  return hipDevicePrimaryCtxRetain(ctx, dev);
+}
+
+// Virtual Memory Management
+// https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__VA.html
+CUresult cuMemAddressFree(CUdeviceptr ptr, size_t size) {
+  return hipMemAddressFree(ptr, size);
+}
+
+CUresult cuMemAddressReserve(CUdeviceptr* ptr, size_t size, size_t alignment,
+                             CUdeviceptr addr, unsigned long long flags) {
+  return hipMemAddressReserve(ptr, size, alignment, addr, flags);
+}
+
+CUresult cuMemCreate(CUmemGenericAllocationHandle* handle, size_t size,
+                     const CUmemAllocationProp* prop,
+                     unsigned long long flags) {
+  return hipMemCreate(handle, size, prop, flags);
+}
+
+CUresult cuMemGetAllocationGranularity(
+    size_t* granularity, const CUmemAllocationProp* prop,
+    CUmemAllocationGranularity_flags option) {
+  return hipMemGetAllocationGranularity(granularity, prop, option);
+}
+
+CUresult cuMemMap(CUdeviceptr dptr, size_t size, size_t offset,
+                  CUmemGenericAllocationHandle handle,
+                  unsigned long long flags) {
+  return hipMemMap(dptr, size, offset, handle, flags);
+}
+
+CUresult cuMemRelease(CUmemGenericAllocationHandle handle) {
+  return hipMemRelease(handle);
+}
+
+CUresult cuMemSetAccess(CUdeviceptr ptr, size_t size,
+                        const CUmemAccessDesc* desc, size_t count) {
+  return hipMemSetAccess(ptr, size, desc, count);
+}
+
+CUresult cuMemUnmap(CUdeviceptr ptr, size_t size) {
+  return hipMemUnmap(ptr, size);
+}
+}  // extern "C"
+
+#else
+////////////////////////////////////////
+// Import CUDA headers for NVIDIA GPUs
+////////////////////////////////////////
+#include <cuda_runtime_api.h>
+#include <cuda.h>
+#endif
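
Note: the header above aliases the CUDA driver-API symbols (cuGetErrorString, cuMemCreate, cuMemMap, ...) to their HIP equivalents, so csrc/cumem_allocator.cpp compiles unchanged on ROCm. The same one-to-one correspondence can be checked from Python with ctypes. A minimal sketch, assuming a standard ROCm install where libamdhip64.so is on the loader path (the same "amdhip64" library the CMake change links against):

import ctypes

# Load the HIP runtime library (name assumed from standard ROCm installs).
libhip = ctypes.CDLL("libamdhip64.so")

# hipGetErrorString(hipError_t) -> const char*, the HIP counterpart that
# cumem_allocator_compat.h wraps as cuGetErrorString.
libhip.hipGetErrorString.restype = ctypes.c_char_p
libhip.hipGetErrorString.argtypes = [ctypes.c_int]

print(libhip.hipGetErrorString(0))  # expected: b'hipSuccess'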

setup.py

Lines changed: 1 addition & 0 deletions

@@ -627,6 +627,7 @@ def _read_requirements(filename: str) -> list[str]:
 
 if _is_cuda() or _is_hip():
     ext_modules.append(CMakeExtension(name="vllm._moe_C"))
+    ext_modules.append(CMakeExtension(name="vllm.cumem_allocator"))
 
 if _is_hip():
     ext_modules.append(CMakeExtension(name="vllm._rocm_C"))

vllm/config.py

Lines changed: 4 additions & 2 deletions

@@ -321,8 +321,10 @@ def __init__(
 
         from vllm.platforms import current_platform
 
-        if self.enable_sleep_mode and not current_platform.is_cuda():
-            raise ValueError("Sleep mode is only supported on CUDA devices.")
+        if self.enable_sleep_mode and not (current_platform.is_cuda()
+                                           or current_platform.is_rocm()):
+            raise ValueError(
+                "Sleep mode is only supported on CUDA/ROCM devices.")
 
         hf_config = get_config(self.hf_config_path or self.model,
                                trust_remote_code, revision, code_revision,
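
With the check relaxed, enabling sleep mode on an AMD GPU now passes validation. A minimal usage sketch — LLM(enable_sleep_mode=...) and the sleep()/wake_up() methods are vLLM's existing entry points for this feature; running on ROCm hardware with this commit applied is the assumption:

from vllm import LLM

# Before this commit, this constructor raised
# "Sleep mode is only supported on CUDA devices." on ROCm.
llm = LLM(model="facebook/opt-125m", enable_sleep_mode=True)

llm.sleep(level=1)  # offload weights to CPU, discard KV cache
llm.wake_up()       # map memory back and reload the weights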

vllm/device_allocator/cumem.py

Lines changed: 1 addition & 1 deletion

@@ -53,7 +53,7 @@ def find_loaded_library(lib_name) -> Optional[str]:
     libcudart = CudaRTLibrary()
     cumem_available = True
 except ModuleNotFoundError:
-    # rocm platform does not support cumem allocator
+    # only cuda and rocm platforms support cumem allocator
     init_module = None
     python_create_and_map = None
     python_unmap_and_release = None
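
For context, the changed comment sits in a module-level try/except that degrades gracefully when the compiled extension is absent. A condensed sketch of that availability pattern, built from the names visible in this hunk (the exact surrounding code in cumem.py may differ):

try:
    # vllm.cumem_allocator is the C extension registered in setup.py above;
    # the import fails on platforms where it was not built.
    from vllm.cumem_allocator import (init_module, python_create_and_map,
                                      python_unmap_and_release)
    from vllm.distributed.device_communicators.cuda_wrapper import (
        CudaRTLibrary)
    libcudart = CudaRTLibrary()
    cumem_available = True
except ModuleNotFoundError:
    # only cuda and rocm platforms support cumem allocator
    init_module = None
    python_create_and_map = None
    python_unmap_and_release = None
    libcudart = None
    cumem_available = False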

vllm/distributed/device_communicators/cuda_wrapper.py

Lines changed: 28 additions & 2 deletions

@@ -95,6 +95,20 @@ class CudaRTLibrary:
         ]),
     ]
 
+    # https://rocm.docs.amd.com/projects/HIPIFY/en/latest/tables/CUDA_Runtime_API_functions_supported_by_HIP.html # noqa
+    cuda_to_hip_mapping = {
+        "cudaSetDevice": "hipSetDevice",
+        "cudaDeviceSynchronize": "hipDeviceSynchronize",
+        "cudaDeviceReset": "hipDeviceReset",
+        "cudaGetErrorString": "hipGetErrorString",
+        "cudaMalloc": "hipMalloc",
+        "cudaFree": "hipFree",
+        "cudaMemset": "hipMemset",
+        "cudaMemcpy": "hipMemcpy",
+        "cudaIpcGetMemHandle": "hipIpcGetMemHandle",
+        "cudaIpcOpenMemHandle": "hipIpcOpenMemHandle",
+    }
+
     # class attribute to store the mapping from the path to the library
     # to avoid loading the same library multiple times
     path_to_library_cache: Dict[str, Any] = {}
@@ -103,11 +117,21 @@ class CudaRTLibrary:
     # to the corresponding dictionary
     path_to_dict_mapping: Dict[str, Dict[str, Any]] = {}
 
+    # check if the current process is using ROCm
+    is_rocm = False
+
     def __init__(self, so_file: Optional[str] = None):
         if so_file is None:
             so_file = find_loaded_library("libcudart")
             if so_file is None:
-                so_file = envs.VLLM_CUDART_SO_PATH  # fallback to env var
+                # libcudart is not loaded in the current process, try hip
+                so_file = find_loaded_library("libamdhip64")
+                # should be safe to assume now that we are using ROCm
+                # as the following assertion should error out if the
+                # libhiprtc library is also not loaded
+                self.is_rocm = True
+            if so_file is None:
+                so_file = envs.VLLM_CUDART_SO_PATH  # fallback to env var
         assert so_file is not None, \
             (
                 "libcudart is not loaded in the current process, "
@@ -121,7 +145,9 @@ def __init__(self, so_file: Optional[str] = None):
         if so_file not in CudaRTLibrary.path_to_dict_mapping:
             _funcs = {}
             for func in CudaRTLibrary.exported_functions:
-                f = getattr(self.lib, func.name)
+                f = getattr(
+                    self.lib, CudaRTLibrary.cuda_to_hip_mapping[func.name]
+                    if self.is_rocm else func.name)
                 f.restype = func.restype
                 f.argtypes = func.argtypes
                 _funcs[func.name] = f
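
The three hunks above keep a single CUDA-named function table and only redirect symbol lookup when the HIP runtime turned out to be the loaded library. The same pattern in isolation — a toy, self-contained sketch, not the vLLM class itself (the Function record is a stand-in for the one cuda_wrapper.py defines):

import ctypes
from dataclasses import dataclass
from typing import Any, Dict, List

@dataclass
class Function:
    name: str
    restype: Any
    argtypes: List[Any]

# The API is declared once, under CUDA names.
exported_functions = [
    Function("cudaGetErrorString", ctypes.c_char_p, [ctypes.c_int]),
]
cuda_to_hip_mapping = {"cudaGetErrorString": "hipGetErrorString"}

def load_runtime(so_file: str, is_rocm: bool) -> Dict[str, Any]:
    lib = ctypes.CDLL(so_file)
    funcs = {}
    for func in exported_functions:
        # On ROCm, resolve the HIP symbol but keep the CUDA name as the
        # dictionary key, so callers stay platform-agnostic.
        f = getattr(lib, cuda_to_hip_mapping[func.name]
                    if is_rocm else func.name)
        f.restype = func.restype
        f.argtypes = func.argtypes
        funcs[func.name] = f
    return funcs

# e.g. load_runtime("libamdhip64.so", is_rocm=True)["cudaGetErrorString"](0)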

vllm/engine/arg_utils.py

Lines changed: 1 addition & 1 deletion

@@ -1042,7 +1042,7 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
                             action="store_true",
                             default=False,
                             help="Enable sleep mode for the engine. "
-                            "(only cuda platform is supported)")
+                            "(only cuda and hip platforms are supported)")
 
     parser.add_argument(
         '--calculate-kv-scales',
