Skip to content
Merged
Show file tree
Hide file tree
Changes from 27 commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
aeff487
Build and upstream latest base image on push event (#1355)
chensuyue Mar 3, 2025
beec699
Add timeout param for DocSum and FaqGen to deal with long context (#1…
XinyaoWa Mar 4, 2025
a18a91e
Megaservice / orchestrator metric testing + fixes (#1348)
eero-t Mar 4, 2025
31b5120
update image push machine (#1361)
chensuyue Mar 5, 2025
e892090
Add Dockerfile for build ROCm vLLM Docker image
Mar 6, 2025
76446a9
Add Dockerfile for build ROCm vLLM Docker image
Mar 6, 2025
b37ad61
Merge remote-tracking branch 'origin/feature/Add_ROCm_vLLM_Dockerfile…
Mar 6, 2025
d8df913
Add Dockerfile for build ROCm vLLM Docker image
Mar 6, 2025
c032a3e
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 6, 2025
f54c26c
Add Dockerfile for build ROCm vLLM Docker image
Mar 6, 2025
ac3b7c6
Merge remote-tracking branch 'origin/feature/Add_ROCm_vLLM_Dockerfile…
Mar 6, 2025
9d9cbb6
Add Dockerfile for build ROCm vLLM Docker image
Mar 6, 2025
42712b6
Add Dockerfile for build ROCm vLLM Docker image
Mar 7, 2025
a2929f1
Add Dockerfile for build ROCm vLLM Docker image
Mar 7, 2025
4677d1d
Add Dockerfile for build ROCm vLLM Docker image
Mar 7, 2025
5ce31c3
Add Dockerfile for build ROCm vLLM Docker image
Mar 7, 2025
94db1d2
Add Dockerfile for build ROCm vLLM Docker image
Mar 7, 2025
3d384a2
Add Dockerfile for build ROCm vLLM Docker image
Mar 7, 2025
b5eb8ad
Add Dockerfile for build ROCm vLLM Docker image
Mar 10, 2025
fb78fc3
Add Dockerfile for build ROCm vLLM Docker image
Mar 10, 2025
b5c0f9a
Add Dockerfile for build ROCm vLLM Docker image
Mar 10, 2025
9aea3fa
Add Dockerfile for build ROCm vLLM Docker image
Mar 10, 2025
f917cae
Add Dockerfile for build ROCm vLLM Docker image
Mar 10, 2025
14c68ff
Add Dockerfile for build ROCm vLLM Docker image
Mar 10, 2025
a6686ed
Merge branch 'main' into feature/Add_ROCm_vLLM_Dockerfile
chyundunovDatamonsters Mar 10, 2025
79ef45f
Add Dockerfile for build ROCm vLLM Docker image
Mar 10, 2025
a3a57f8
Merge remote-tracking branch 'origin/feature/Add_ROCm_vLLM_Dockerfile…
Mar 10, 2025
d82abc7
Add Dockerfile for build ROCm vLLM Docker image
Mar 11, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .github/workflows/docker/compose/third_parties-compose.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -45,3 +45,8 @@ services:
dockerfile: Dockerfile.hpu
shm_size: '128g'
image: ${REGISTRY:-opea}/vllm-gaudi:${TAG:-latest}
vllm-rocm:
build:
dockerfile: comps/third_parties/vllm/src/Dockerfile.amd_gpu
shm_size: '128g'
image: ${REGISTRY:-opea}/vllm-rocm:${TAG:-latest}
38 changes: 37 additions & 1 deletion comps/third_parties/vllm/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,43 @@ OpenVINO best known configuration for GPU is:
$ VLLM_OPENVINO_DEVICE=GPU VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=ON \
python3 vllm/benchmarks/benchmark_throughput.py --model meta-llama/Llama-2-7b-chat-hf --dataset vllm/benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json

### 2.4 Query the service
### 2.4 vLLM with ROCm (on AMD GPU)

#### Build docker image for ROCm vLLM

```bash
cd GenAIComps/comps/third_parties/vllm/src
docker build -f Dockerfile.amd_gpu -t opea/vllm-rocm:latest . --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy
```

#### Launch vLLM service with docker compose

```bash
cd GenAIComps/comps/third_parties/vllm/deployment/docker_compose
# IP port for vLLM service
export VLLM_SERVICE_PORT=8011
# HF token
export HUGGINGFACEHUB_API_TOKEN="your_hf_token"
# Cache dir
export HF_CACHE_DIR="./data"
# Model
export VLLM_LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"
# Specify the number of GPUs used
export TENSOR_PARALLEL_SIZE=1
# Run deploy
docker compose -f compose.yaml up vllm-rocm-server -d
```

#### Checking ROCm vLLM service

```bash
curl http://${host_ip}:${VLLM_SERVICE_PORT}/v1/chat/completions \
-X POST \
-H "Content-Type: application/json" \
-d '{"model": "Intel/neural-chat-7b-v3-3", "messages": [{"role": "user", "content": "What is Deep Learning?"}]}'
```

### 2.5 Query the service

And then you can make requests like below to check the service status:

Expand Down
30 changes: 30 additions & 0 deletions comps/third_parties/vllm/deployment/docker_compose/compose.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,36 @@ services:
interval: 10s
timeout: 10s
retries: 100
vllm-rocm-server:
  image: ${REGISTRY:-opea}/vllm-rocm:${TAG:-latest}
  container_name: vllm-server
  ports:
    - "${VLLM_SERVICE_PORT:-8081}:8011"
  environment:
    no_proxy: ${no_proxy}
    http_proxy: ${http_proxy}
    https_proxy: ${https_proxy}
    HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
    HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
    HF_HUB_DISABLE_PROGRESS_BARS: 1
    HF_HUB_ENABLE_HF_TRANSFER: 0
    # BUGFIX: was misspelled WILM_USE_TRITON_FLASH_ATTENTION, which vLLM
    # ignores; the Dockerfile sets the correctly-spelled variable, so the
    # typo was masked. Fixed to match.
    VLLM_USE_TRITON_FLASH_ATTENTION: 0
    PYTORCH_JIT: 0
  volumes:
    - "${HF_CACHE_DIR:-./data}:/data"
  shm_size: 20G
  # ROCm GPU access: kernel driver plus render devices.
  devices:
    - /dev/kfd:/dev/kfd
    - /dev/dri/:/dev/dri/
  cap_add:
    - SYS_PTRACE
  group_add:
    - video
  security_opt:
    - seccomp:unconfined
    - apparmor=unconfined
  command: "--model ${VLLM_LLM_MODEL_ID} --swap-space 16 --disable-log-requests --dtype float16 --tensor-parallel-size ${TENSOR_PARALLEL_SIZE} --host 0.0.0.0 --port 8011 --num-scheduler-steps 1 --distributed-executor-backend \"mp\""
  ipc: host

networks:
default:
Expand Down
20 changes: 20 additions & 0 deletions comps/third_parties/vllm/src/Dockerfile.amd_gpu
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Copyright (c) 2024 Advanced Micro Devices, Inc.

# ROCm vLLM serving image for AMD GPUs, based on AMD's prebuilt vLLM image.
# Refer to https://rocm.docs.amd.com/en/latest/how-to/rocm-for-ai/inference-optimization/index.html for further optimization

FROM rocm/vllm:rocm6.3.1_mi300_ubuntu22.04_py3.12_vllm_0.6.6

# Set the working directory
WORKDIR /workspace

# Expose the port used by the API server
EXPOSE 8011

# Set environment variables
# HF model cache lives in /workspace (mounted by compose at runtime).
ENV HUGGINGFACE_HUB_CACHE=/workspace
# Disable the Triton flash-attention path and PyTorch JIT on this stack.
ENV VLLM_USE_TRITON_FLASH_ATTENTION=0
ENV PYTORCH_JIT=0

# Set the entrypoint to the api_server.py script
# (copied out of site-packages so the entrypoint path is stable even if
# the vLLM package layout changes inside the base image)
RUN cp /usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py /workspace/api_server.py
ENTRYPOINT ["python3", "/workspace/api_server.py"]
117 changes: 117 additions & 0 deletions tests/third_parties/test_third_parties_vllm_on_amd_gpu.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
#!/bin/bash
# Copyright (c) 2024 Advanced Micro Devices, Inc.

# Integration test for the ROCm vLLM third-party image:
# builds the image, starts it via docker compose, probes the
# OpenAI-compatible API, then tears the stack down.
# Required env: HF_TOKEN (Hugging Face access token).

set -x

# Image registry/tag used both by the build and by compose.yaml
# (compose reads ${REGISTRY} and ${TAG}).
IMAGE_REPO=${IMAGE_REPO:-"opea"}
export REGISTRY=${IMAGE_REPO}
export TAG="comps"
echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
echo "TAG=${TAG}"

# WORKPATH: repository root (parent of the directory this script runs from).
WORKPATH=$(dirname "$PWD")
WORKDIR=${WORKPATH}/../
# First IP reported by hostname -I; used to reach the published service port.
export host_ip=$(hostname -I | awk '{print $1}')
LOG_PATH="$WORKPATH"
# Compose service name (used for `compose up/down <service>`); the running
# container itself is named via `container_name:` in compose.yaml.
service_name="vllm-rocm-server"
docker_container_name="vllm-server"

# Build the ROCm vLLM image from comps/third_parties/vllm/src.
# Globals: WORKPATH, REGISTRY, TAG, http(s)_proxy. Exits 1 on failure.
function build_container() {
    cd "$WORKPATH/comps/third_parties/vllm/src" || exit 1
    # `if ! cmd` instead of checking $? afterwards: robust under set -e
    # and cannot be broken by an intervening statement.
    if ! docker build --no-cache -t "${REGISTRY:-opea}/vllm-rocm:${TAG:-latest}" \
        -f Dockerfile.amd_gpu \
        . \
        --build-arg https_proxy="$https_proxy" \
        --build-arg http_proxy="$http_proxy"; then
        echo "vllm-rocm built fail"
        exit 1
    else
        echo "vllm-rocm built successful"
    fi
}

# Function to start Docker container
# Start the vllm-rocm-server compose service and wait until vLLM logs
# "Application startup complete." (model download can take a long time).
# Globals: WORKPATH, LOG_PATH, service_name, docker_container_name, HF_TOKEN.
start_container() {
    export VLLM_SERVICE_PORT=28011
    export HUGGINGFACEHUB_API_TOKEN=${HF_TOKEN}
    export HF_CACHE_DIR="./data"
    export VLLM_LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"
    export TENSOR_PARALLEL_SIZE=1

    cd "$WORKPATH/comps/third_parties/vllm/deployment/docker_compose" || exit 1
    docker compose -f compose.yaml up "${service_name}" -d > "${LOG_PATH}/start_services_with_compose.log"

    # Poll the container log every 10s, up to 300 tries (~50 min).
    # BUGFIX: the original pattern was ""Application startup complete"" —
    # the doubled quotes split it so grep got pattern "Application" plus
    # bogus file args "startup" and "complete"; now one quoted pattern.
    local n=0
    until [[ "$n" -ge 300 ]]; do
        docker logs "$docker_container_name" > "${LOG_PATH}/${docker_container_name}.log" 2>&1
        n=$((n+1))
        if grep -q "Application startup complete" "${LOG_PATH}/${docker_container_name}.log"; then
            break
        fi
        sleep 10s
    done

}

# Function to test API endpoint
function test_api_endpoint {
local endpoint="$1"
local expected_status="$2"

# Make the HTTP request
if test "$1" = "v1/completions"
then
local response=$(curl "http://${host_ip}:${VLLM_SERVICE_PORT}/$endpoint" \
-H "Content-Type: application/json" \
-d '{
"model": "Intel/neural-chat-7b-v3-3",
"prompt": "What is deep learning?",
"max_tokens": 300,
"temperature": 0.7
}' \
--write-out '%{http_code}' \
--silent \
--output /dev/null)
else
local response=$(curl "http://${host_ip}:${VLLM_SERVICE_PORT}/$endpoint" \
--write-out '%{http_code}' \
--silent \
--output /dev/null)
fi

# Assert the response status code
if [[ "$response" -eq "$expected_status" ]]; then
echo "PASS: $endpoint returned expected status code: $expected_status"
else
echo "FAIL: $endpoint returned unexpected status code: $response (expected: $expected_status)"
docker logs $service_name
exit 1
fi
}

# Tear down the compose service started by start_container.
# Globals: WORKPATH, service_name.
function stop_docker() {
    # BUGFIX: use $WORKPATH/comps/... — the same path build_container and
    # start_container use; the original's $WORKPATH/../comps/... pointed one
    # directory too high, so `down` ran against a missing compose file.
    cd "$WORKPATH/comps/third_parties/vllm/deployment/docker_compose" || exit 1
    docker compose -f compose.yaml down "${service_name}" --remove-orphans
}

# Main function
main() {

build_container
start_container

# Sleep to allow the container to start up fully
sleep 10
# Test the /v1/models API
test_api_endpoint "v1/models" 200

# Test the /v1/completions API
test_api_endpoint "v1/completions" 200

stop_docker
}

# Call main function
main
Loading