vllm-project
diff --git a/.buildkite/run-amd-test.sh b/.buildkite/run-amd-test.sh
Lines changed: 1 addition & 0 deletions
diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
Lines changed: 2 additions & 1 deletion
diff --git a/.github/workflows/mypy.yaml b/.github/workflows/mypy.yaml
Lines changed: 0 additions & 1 deletion
diff --git a/CMakeLists.txt b/CMakeLists.txt
Lines changed: 5 additions & 0 deletions
diff --git a/benchmarks/launch_tgi_server.sh b/benchmarks/launch_tgi_server.sh
Lines changed: 1 addition & 1 deletion
@@ -75,6 +75,7 @@ docker run \
         --network host \
         --shm-size=16gb \
         --rm \
+        -e HIP_VISIBLE_DEVICES=0 \
         -e HF_TOKEN \
         -v ${HF_CACHE}:${HF_MOUNT} \
         -e HF_HOME=${HF_MOUNT} \
 
@@ -233,12 +233,13 @@ steps:
   parallelism: 4
 
 - label: Tensorizer Test # 11min
+  mirror_hardwares: [amd]
   soft_fail: true
   source_file_dependencies:
   - vllm/model_executor/model_loader
   - tests/tensorizer_loader
   commands:
-    - apt-get install -y curl libsodium23
+    - apt-get update && apt-get install -y curl libsodium23
     - export VLLM_WORKER_MULTIPROC_METHOD=spawn
     - pytest -v -s tensorizer_loader
 
 
@@ -35,7 +35,6 @@ jobs:
         mypy
         mypy tests --follow-imports skip
         mypy vllm/attention --follow-imports skip
-        mypy vllm/core --follow-imports skip
         mypy vllm/distributed --follow-imports skip
         mypy vllm/engine  --follow-imports skip
         mypy vllm/executor --follow-imports skip
 
@@ -296,6 +296,11 @@ set(VLLM_MOE_EXT_SRC
   "csrc/moe/torch_bindings.cpp"
   "csrc/moe/topk_softmax_kernels.cu")
 
+if(VLLM_GPU_LANG STREQUAL "CUDA")
+  list(APPEND VLLM_MOE_EXT_SRC
+      "csrc/moe/marlin_moe_ops.cu")
+endif()
+
 define_gpu_extension_target(
   _moe_C
   DESTINATION vllm
 
@@ -6,7 +6,7 @@ TOKENS=$2
 
 docker run -e HF_TOKEN=$HF_TOKEN --gpus all --shm-size 1g -p $PORT:80 \
            -v $PWD/data:/data \
-           ghcr.io/huggingface/text-generation-inference:1.4.0 \
+           ghcr.io/huggingface/text-generation-inference:2.2.0 \
            --model-id $MODEL \
            --sharded false  \
            --max-input-length 1024 \