Skip to content
Merged
Show file tree
Hide file tree
Changes from 27 commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
aeff487
Build and upstream latest base image on push event (#1355)
chensuyue Mar 3, 2025
beec699
Add timeout param for DocSum and FaqGen to deal with long context (#1…
XinyaoWa Mar 4, 2025
a18a91e
Megaservice / orchestrator metric testing + fixes (#1348)
eero-t Mar 4, 2025
31b5120
update image push machine (#1361)
chensuyue Mar 5, 2025
e892090
Add Dockerfile for build ROCm vLLM Docker image
Mar 6, 2025
76446a9
Add Dockerfile for build ROCm vLLM Docker image
Mar 6, 2025
b37ad61
Merge remote-tracking branch 'origin/feature/Add_ROCm_vLLM_Dockerfile…
Mar 6, 2025
d8df913
Add Dockerfile for build ROCm vLLM Docker image
Mar 6, 2025
c032a3e
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 6, 2025
f54c26c
Add Dockerfile for build ROCm vLLM Docker image
Mar 6, 2025
ac3b7c6
Merge remote-tracking branch 'origin/feature/Add_ROCm_vLLM_Dockerfile…
Mar 6, 2025
9d9cbb6
Add Dockerfile for build ROCm vLLM Docker image
Mar 6, 2025
42712b6
Add Dockerfile for build ROCm vLLM Docker image
Mar 7, 2025
a2929f1
Add Dockerfile for build ROCm vLLM Docker image
Mar 7, 2025
4677d1d
Add Dockerfile for build ROCm vLLM Docker image
Mar 7, 2025
5ce31c3
Add Dockerfile for build ROCm vLLM Docker image
Mar 7, 2025
94db1d2
Add Dockerfile for build ROCm vLLM Docker image
Mar 7, 2025
3d384a2
Add Dockerfile for build ROCm vLLM Docker image
Mar 7, 2025
b5eb8ad
Add Dockerfile for build ROCm vLLM Docker image
Mar 10, 2025
fb78fc3
Add Dockerfile for build ROCm vLLM Docker image
Mar 10, 2025
b5c0f9a
Add Dockerfile for build ROCm vLLM Docker image
Mar 10, 2025
9aea3fa
Add Dockerfile for build ROCm vLLM Docker image
Mar 10, 2025
f917cae
Add Dockerfile for build ROCm vLLM Docker image
Mar 10, 2025
14c68ff
Add Dockerfile for build ROCm vLLM Docker image
Mar 10, 2025
a6686ed
Merge branch 'main' into feature/Add_ROCm_vLLM_Dockerfile
chyundunovDatamonsters Mar 10, 2025
79ef45f
Add Dockerfile for build ROCm vLLM Docker image
Mar 10, 2025
a3a57f8
Merge remote-tracking branch 'origin/feature/Add_ROCm_vLLM_Dockerfile…
Mar 10, 2025
d82abc7
Add Dockerfile for build ROCm vLLM Docker image
Mar 11, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .github/workflows/docker/compose/third_parties-compose.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -45,3 +45,8 @@ services:
dockerfile: Dockerfile.hpu
shm_size: '128g'
image: ${REGISTRY:-opea}/vllm-gaudi:${TAG:-latest}
vllm-rocm:
build:
dockerfile: comps/third_parties/vllm/src/Dockerfile.amd_gpu
shm_size: '128g'
image: ${REGISTRY:-opea}/vllm-rocm:${TAG:-latest}
38 changes: 37 additions & 1 deletion comps/third_parties/vllm/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,43 @@ OpenVINO best known configuration for GPU is:
$ VLLM_OPENVINO_DEVICE=GPU VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=ON \
python3 vllm/benchmarks/benchmark_throughput.py --model meta-llama/Llama-2-7b-chat-hf --dataset vllm/benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json

### 2.4 Query the service
### 2.4 vLLM with ROCm (on AMD GPU)

#### Build docker image for ROCm vLLM

```bash
cd GenAIComps/comps/third_parties/vllm/src
docker build -f Dockerfile.amd_gpu -t opea/vllm-rocm:latest . --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy
```

#### Launch vLLM service with docker compose

```bash
cd GenAIComps/comps/third_parties/vllm/deployment/docker_compose
# IP port for vLLM service
export VLLM_SERVICE_PORT=8011
# HF token
export HUGGINGFACEHUB_API_TOKEN="your_hf_token"
# Cache dir
export HF_CACHE_DIR="./data"
# Model
export VLLM_LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"
# Specify the number of GPUs used
export TENSOR_PARALLEL_SIZE=1
# Run deploy
docker compose -f compose.yaml up vllm-rocm-server -d
```

#### Checking ROCm vLLM service

```bash
curl http://${host_ip}:${VLLM_SERVICE_PORT}/v1/chat/completions \
-X POST \
-H "Content-Type: application/json" \
-d '{"model": "Intel/neural-chat-7b-v3-3", "messages": [{"role": "user", "content": "What is Deep Learning?"}]}'
```

### 2.5 Query the service

And then you can make requests like below to check the service status:

Expand Down
30 changes: 30 additions & 0 deletions comps/third_parties/vllm/deployment/docker_compose/compose.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,36 @@ services:
interval: 10s
timeout: 10s
retries: 100
vllm-rocm-server:
  image: ${REGISTRY:-opea}/vllm-rocm:${TAG:-latest}
  container_name: vllm-server
  ports:
    - "${VLLM_SERVICE_PORT:-8081}:8011"
  environment:
    no_proxy: ${no_proxy}
    http_proxy: ${http_proxy}
    https_proxy: ${https_proxy}
    HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
    HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
    HF_HUB_DISABLE_PROGRESS_BARS: 1
    HF_HUB_ENABLE_HF_TRANSFER: 0
    # BUGFIX: was misspelled WILM_USE_TRITON_FLASH_ATTENTION, which vLLM
    # ignores; the Dockerfile sets the correctly-spelled variable, so the
    # typo was masked. Fixed to match.
    VLLM_USE_TRITON_FLASH_ATTENTION: 0
    PYTORCH_JIT: 0
  volumes:
    - "${HF_CACHE_DIR:-./data}:/data"
  shm_size: 20G
  # ROCm GPU access: kernel driver plus render devices.
  devices:
    - /dev/kfd:/dev/kfd
    - /dev/dri/:/dev/dri/
  cap_add:
    - SYS_PTRACE
  group_add:
    - video
  security_opt:
    - seccomp:unconfined
    - apparmor=unconfined
  command: "--model ${VLLM_LLM_MODEL_ID} --swap-space 16 --disable-log-requests --dtype float16 --tensor-parallel-size ${TENSOR_PARALLEL_SIZE} --host 0.0.0.0 --port 8011 --num-scheduler-steps 1 --distributed-executor-backend \"mp\""
  ipc: host

networks:
default:
Expand Down
20 changes: 20 additions & 0 deletions comps/third_parties/vllm/src/Dockerfile.amd_gpu
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Copyright (c) 2024 Advanced Micro Devices, Inc.

# ROCm vLLM serving image for AMD GPUs, based on AMD's prebuilt vLLM image.
# Refer to https://rocm.docs.amd.com/en/latest/how-to/rocm-for-ai/inference-optimization/index.html for further optimization

FROM rocm/vllm:rocm6.3.1_mi300_ubuntu22.04_py3.12_vllm_0.6.6

# Set the working directory
WORKDIR /workspace

# Expose the port used by the API server
EXPOSE 8011

# Set environment variables
# HF model cache lives in /workspace (mounted by compose at runtime).
ENV HUGGINGFACE_HUB_CACHE=/workspace
# Disable the Triton flash-attention path and PyTorch JIT on this stack.
ENV VLLM_USE_TRITON_FLASH_ATTENTION=0
ENV PYTORCH_JIT=0

# Set the entrypoint to the api_server.py script
# (copied out of site-packages so the entrypoint path is stable even if
# the vLLM package layout changes inside the base image)
RUN cp /usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py /workspace/api_server.py
ENTRYPOINT ["python3", "/workspace/api_server.py"]
117 changes: 117 additions & 0 deletions tests/third_parties/test_third_parties_vllm_on_amd_gpu.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
#!/bin/bash
# Copyright (c) 2024 Advanced Micro Devices, Inc.

# Integration test for the ROCm vLLM third-party image:
# builds the image, starts it via docker compose, probes the
# OpenAI-compatible API, then tears the stack down.
# Required env: HF_TOKEN (Hugging Face access token).

set -x

# Image registry/tag used both by the build and by compose.yaml
# (compose reads ${REGISTRY} and ${TAG}).
IMAGE_REPO=${IMAGE_REPO:-"opea"}
export REGISTRY=${IMAGE_REPO}
export TAG="comps"
echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
echo "TAG=${TAG}"

# WORKPATH: repository root (parent of the directory this script runs from).
WORKPATH=$(dirname "$PWD")
WORKDIR=${WORKPATH}/../
# First IP reported by hostname -I; used to reach the published service port.
export host_ip=$(hostname -I | awk '{print $1}')
LOG_PATH="$WORKPATH"
# Compose service name (used for `compose up/down <service>`); the running
# container itself is named via `container_name:` in compose.yaml.
service_name="vllm-rocm-server"
docker_container_name="vllm-server"

# Build the ROCm vLLM image from comps/third_parties/vllm/src.
# Globals: WORKPATH, REGISTRY, TAG, http(s)_proxy. Exits 1 on failure.
function build_container() {
    cd "$WORKPATH/comps/third_parties/vllm/src" || exit 1
    # `if ! cmd` instead of checking $? afterwards: robust under set -e
    # and cannot be broken by an intervening statement.
    if ! docker build --no-cache -t "${REGISTRY:-opea}/vllm-rocm:${TAG:-latest}" \
        -f Dockerfile.amd_gpu \
        . \
        --build-arg https_proxy="$https_proxy" \
        --build-arg http_proxy="$http_proxy"; then
        echo "vllm-rocm built fail"
        exit 1
    else
        echo "vllm-rocm built successful"
    fi
}

# Function to start Docker container
# Start the vllm-rocm-server compose service and wait until vLLM logs
# "Application startup complete." (model download can take a long time).
# Globals: WORKPATH, LOG_PATH, service_name, docker_container_name, HF_TOKEN.
start_container() {
    export VLLM_SERVICE_PORT=28011
    export HUGGINGFACEHUB_API_TOKEN=${HF_TOKEN}
    export HF_CACHE_DIR="./data"
    export VLLM_LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"
    export TENSOR_PARALLEL_SIZE=1

    cd "$WORKPATH/comps/third_parties/vllm/deployment/docker_compose" || exit 1
    docker compose -f compose.yaml up "${service_name}" -d > "${LOG_PATH}/start_services_with_compose.log"

    # Poll the container log every 10s, up to 300 tries (~50 min).
    # BUGFIX: the original pattern was ""Application startup complete"" —
    # the doubled quotes split it so grep got pattern "Application" plus
    # bogus file args "startup" and "complete"; now one quoted pattern.
    local n=0
    until [[ "$n" -ge 300 ]]; do
        docker logs "$docker_container_name" > "${LOG_PATH}/${docker_container_name}.log" 2>&1
        n=$((n+1))
        if grep -q "Application startup complete" "${LOG_PATH}/${docker_container_name}.log"; then
            break
        fi
        sleep 10s
    done

}

# Function to test API endpoint
function test_api_endpoint {
local endpoint="$1"
local expected_status="$2"

# Make the HTTP request
if test "$1" = "v1/completions"
then
local response=$(curl "http://${host_ip}:${VLLM_SERVICE_PORT}/$endpoint" \
-H "Content-Type: application/json" \
-d '{
"model": "Intel/neural-chat-7b-v3-3",
"prompt": "What is deep learning?",
"max_tokens": 300,
"temperature": 0.7
}' \
--write-out '%{http_code}' \
--silent \
--output /dev/null)
else
local response=$(curl "http://${host_ip}:${VLLM_SERVICE_PORT}/$endpoint" \
--write-out '%{http_code}' \
--silent \
--output /dev/null)
fi

# Assert the response status code
if [[ "$response" -eq "$expected_status" ]]; then
echo "PASS: $endpoint returned expected status code: $expected_status"
else
echo "FAIL: $endpoint returned unexpected status code: $response (expected: $expected_status)"
docker logs $service_name
exit 1
fi
}

# Tear down the compose service started by start_container.
# Globals: WORKPATH, service_name.
function stop_docker() {
    # BUGFIX: use $WORKPATH/comps/... — the same path build_container and
    # start_container use; the original's $WORKPATH/../comps/... pointed one
    # directory too high, so `down` ran against a missing compose file.
    cd "$WORKPATH/comps/third_parties/vllm/deployment/docker_compose" || exit 1
    docker compose -f compose.yaml down "${service_name}" --remove-orphans
}

# Main function
main() {

build_container
start_container

# Sleep to allow the container to start up fully
sleep 10
# Test the /v1/models API
test_api_endpoint "v1/models" 200

# Test the /v1/completions API
test_api_endpoint "v1/completions" 200

stop_docker
}

# Call main function
main
Loading