From e1142bdebd0a582b6a200033643c9a41b3b83a77 Mon Sep 17 00:00:00 2001 From: Eero Tamminen Date: Tue, 13 May 2025 16:37:51 +0300 Subject: [PATCH 1/7] Minor KubeAI README improvement Signed-off-by: Eero Tamminen --- kubeai/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kubeai/README.md b/kubeai/README.md index 465589267..988928731 100644 --- a/kubeai/README.md +++ b/kubeai/README.md @@ -60,7 +60,7 @@ kubectl explain models.kubeai.org # Deploying the Models -This section describes how to deploy various models. All the examples below use Kubernetes Persistent Volumes and Claims (PV/PVC) to store the models. The Kubernetes Storage Class (SC) is called `standard`. You can tune the storage configuration to match your environment during the installation (see `opea-values.yaml`, `cacheProfiles` for more information). +This section describes how to deploy various models. All the examples below use Kubernetes Persistent Volumes and Claims (PV/PVC) to store the models. The Kubernetes Storage Class (SC) is called `standard`. You can tune the storage configuration to match your environment during the installation (see `cacheProfiles` in `opea-values.yaml`). The models in the examples below are deployed to `$NAMESPACE`. Please set that according to your needs. 
From 9a8646e529fca06672b0f3b0a1f9f15a40b9654f Mon Sep 17 00:00:00 2001 From: Eero Tamminen Date: Wed, 14 May 2025 21:56:27 +0300 Subject: [PATCH 2/7] Add KubeAI nodeSelector examples Signed-off-by: Eero Tamminen --- kubeai/opea-values.yaml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kubeai/opea-values.yaml b/kubeai/opea-values.yaml index f4fc29637..8d1339a55 100644 --- a/kubeai/opea-values.yaml +++ b/kubeai/opea-values.yaml @@ -28,3 +28,8 @@ resourceProfiles: requests: cpu: "2" memory: "2Gi" + nodeSelector: + #kubeai-inference: "true" + +nodeSelector: + #kubeai-frontend: "true" From ba0bf64b774a320cc519edd4bd7e522d3d73ef9e Mon Sep 17 00:00:00 2001 From: Eero Tamminen Date: Wed, 14 May 2025 21:57:05 +0300 Subject: [PATCH 3/7] Add autoscaling for 8b model Signed-off-by: Eero Tamminen --- kubeai/models/llama-3.1-8b-instruct-gaudi.yaml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/kubeai/models/llama-3.1-8b-instruct-gaudi.yaml b/kubeai/models/llama-3.1-8b-instruct-gaudi.yaml index 37908426f..2466c62fa 100644 --- a/kubeai/models/llama-3.1-8b-instruct-gaudi.yaml +++ b/kubeai/models/llama-3.1-8b-instruct-gaudi.yaml @@ -17,6 +17,9 @@ spec: - --max-num-seqs=256 - --max-seq-len-to-capture=2048 env: - OMPI_MCA_btl_vader_single_copy_mechanism: none + OMPI_MCA_btl_vader_single_copy_mechanism: "none" + # vLLM startup takes too long for autoscaling, especially with Gaudi + VLLM_SKIP_WARMUP: "true" minReplicas: 1 + maxReplicas: 4 resourceProfile: gaudi-for-text-generation:1 From 8c45f61eba85dda120e3e0598801f7771df3fa78 Mon Sep 17 00:00:00 2001 From: Eero Tamminen Date: Wed, 14 May 2025 21:58:10 +0300 Subject: [PATCH 4/7] Make 70b model scale-from-zero example Signed-off-by: Eero Tamminen --- kubeai/models/llama-3.3-70b-instruct-gaudi.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kubeai/models/llama-3.3-70b-instruct-gaudi.yaml b/kubeai/models/llama-3.3-70b-instruct-gaudi.yaml index 350d04eaf..86d967e7b 100644 --- 
a/kubeai/models/llama-3.3-70b-instruct-gaudi.yaml +++ b/kubeai/models/llama-3.3-70b-instruct-gaudi.yaml @@ -19,8 +19,10 @@ spec: env: OMPI_MCA_btl_vader_single_copy_mechanism: none PT_HPU_ENABLE_LAZY_COLLECTIVES: "true" + # vLLM startup takes too long for autoscaling, especially with Gaudi VLLM_SKIP_WARMUP: "true" - minReplicas: 1 + # scale-from-zero avoids idle instance occupying half a node, but causes long delay + minReplicas: 0 maxReplicas: 2 resourceProfile: gaudi-for-text-generation:4 From 461d83eb92b885fa5b11bd92a683a98042eaa092 Mon Sep 17 00:00:00 2001 From: Eero Tamminen Date: Fri, 16 May 2025 19:12:21 +0300 Subject: [PATCH 5/7] Update KubeAI README Signed-off-by: Eero Tamminen --- kubeai/README.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kubeai/README.md b/kubeai/README.md index 988928731..8c0244be6 100644 --- a/kubeai/README.md +++ b/kubeai/README.md @@ -13,7 +13,7 @@ The following features are available at the moment. - Persistent Volume cache for models - tested/working - Model downloading & inference engine deployment - tested/working - Scaling pods to/from zero - tested/working -- Load based autoscaling - not tested/included +- Load based autoscaling - tested/working - Integration with OPEA application - missing The following models are included. @@ -98,7 +98,9 @@ kubect apply -f models/llama-3.1-8b-instruct-gaudi.yaml -n $NAMESPACE kubect apply -f models/llama-3.3-70b-instruct-gaudi.yaml -n $NAMESPACE ``` -The rest is the same as in the previous example. You should see a pod running with the name `model-llama-3.1-8b-instruct-gpu-xxxx` and/or `model-llama-3.3-70b-instruct-gpu-xxxx`. +The rest is the same as in the previous example. You should see a pod running with the name `model-llama-3.1-8b-instruct-gaudi-xxxx`. When request load for that model increases enough, KubeAI will automatically deploy more instances (model `maxReplicas` > `minReplicas`). 
+ +The latter model is set to scale from zero (`minReplicas` = 0), so `model-llama-3.3-70b-instruct-gaudi-xxxx` pod(s) will be present only when KubeAI gets requests for that model (this avoids multiple devices being exclusively reserved for idle pods, but significantly slows down the first response). ## Text Embeddings with BGE on CPU From cc85b7fe51c093d24224a4cb46d9bb4b215d826e Mon Sep 17 00:00:00 2001 From: Eero Tamminen Date: Mon, 19 May 2025 16:56:24 +0300 Subject: [PATCH 6/7] Drop nodeSelector for KubeAI itself (Review request.) Signed-off-by: Eero Tamminen --- kubeai/opea-values.yaml | 3 --- 1 file changed, 3 deletions(-) diff --git a/kubeai/opea-values.yaml b/kubeai/opea-values.yaml index 8d1339a55..6f0ff7bfb 100644 --- a/kubeai/opea-values.yaml +++ b/kubeai/opea-values.yaml @@ -30,6 +30,3 @@ resourceProfiles: memory: "2Gi" nodeSelector: #kubeai-inference: "true" - -nodeSelector: - #kubeai-frontend: "true" From 6efd2fbb5e65d7ca11657dea93d6e4fee1edc246 Mon Sep 17 00:00:00 2001 From: Eero Tamminen Date: Tue, 20 May 2025 22:01:45 +0300 Subject: [PATCH 7/7] Add scaling request target example Signed-off-by: Eero Tamminen --- kubeai/models/llama-3.1-8b-instruct-gaudi.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/kubeai/models/llama-3.1-8b-instruct-gaudi.yaml b/kubeai/models/llama-3.1-8b-instruct-gaudi.yaml index 2466c62fa..34bdbeac7 100644 --- a/kubeai/models/llama-3.1-8b-instruct-gaudi.yaml +++ b/kubeai/models/llama-3.1-8b-instruct-gaudi.yaml @@ -22,4 +22,5 @@ spec: VLLM_SKIP_WARMUP: "true" minReplicas: 1 maxReplicas: 4 + targetRequests: 120 resourceProfile: gaudi-for-text-generation:1