From e1142bdebd0a582b6a200033643c9a41b3b83a77 Mon Sep 17 00:00:00 2001
From: Eero Tamminen <eero.t.tamminen@intel.com>
Date: Tue, 13 May 2025 16:37:51 +0300
Subject: [PATCH 1/7] Minor KubeAI README improvement

Signed-off-by: Eero Tamminen <eero.t.tamminen@intel.com>
---
 kubeai/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kubeai/README.md b/kubeai/README.md
index 465589267..988928731 100644
--- a/kubeai/README.md
+++ b/kubeai/README.md
@@ -60,7 +60,7 @@ kubectl explain models.kubeai.org
 
 # Deploying the Models
 
-This section describes how to deploy various models. All the examples below use Kubernetes Persistent Volumes and Claims (PV/PVC) to store the models. The Kubernetes Storage Class (SC) is called `standard`. You can tune the storage configuration to match your environment during the installation (see `opea-values.yaml`, `cacheProfiles` for more information).
+This section describes how to deploy various models. All the examples below use Kubernetes Persistent Volumes and Claims (PV/PVC) to store the models. The Kubernetes Storage Class (SC) is called `standard`. You can tune the storage configuration to match your environment during the installation (see `cacheProfiles` in `opea-values.yaml`).
 
 The models in the examples below are deployed to `$NAMESPACE`. Please set that according to your needs.
 

From 9a8646e529fca06672b0f3b0a1f9f15a40b9654f Mon Sep 17 00:00:00 2001
From: Eero Tamminen <eero.t.tamminen@intel.com>
Date: Wed, 14 May 2025 21:56:27 +0300
Subject: [PATCH 2/7] Add KubeAI nodeSelector examples

Signed-off-by: Eero Tamminen <eero.t.tamminen@intel.com>
---
 kubeai/opea-values.yaml | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/kubeai/opea-values.yaml b/kubeai/opea-values.yaml
index f4fc29637..8d1339a55 100644
--- a/kubeai/opea-values.yaml
+++ b/kubeai/opea-values.yaml
@@ -28,3 +28,8 @@ resourceProfiles:
     requests:
       cpu: "2"
       memory: "2Gi"
+    nodeSelector:
+      #kubeai-inference: "true"
+
+nodeSelector:
+  #kubeai-frontend: "true"

From ba0bf64b774a320cc519edd4bd7e522d3d73ef9e Mon Sep 17 00:00:00 2001
From: Eero Tamminen <eero.t.tamminen@intel.com>
Date: Wed, 14 May 2025 21:57:05 +0300
Subject: [PATCH 3/7] Add autoscaling for 8b model

Signed-off-by: Eero Tamminen <eero.t.tamminen@intel.com>
---
 kubeai/models/llama-3.1-8b-instruct-gaudi.yaml | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/kubeai/models/llama-3.1-8b-instruct-gaudi.yaml b/kubeai/models/llama-3.1-8b-instruct-gaudi.yaml
index 37908426f..2466c62fa 100644
--- a/kubeai/models/llama-3.1-8b-instruct-gaudi.yaml
+++ b/kubeai/models/llama-3.1-8b-instruct-gaudi.yaml
@@ -17,6 +17,9 @@ spec:
     - --max-num-seqs=256
     - --max-seq-len-to-capture=2048
   env:
-    OMPI_MCA_btl_vader_single_copy_mechanism: none
+    OMPI_MCA_btl_vader_single_copy_mechanism: "none"
+    # Skip warmup, as vLLM startup otherwise takes too long for autoscaling, especially with Gaudi
+    VLLM_SKIP_WARMUP: "true"
   minReplicas: 1
+  maxReplicas: 4
   resourceProfile: gaudi-for-text-generation:1

From 8c45f61eba85dda120e3e0598801f7771df3fa78 Mon Sep 17 00:00:00 2001
From: Eero Tamminen <eero.t.tamminen@intel.com>
Date: Wed, 14 May 2025 21:58:10 +0300
Subject: [PATCH 4/7] Make 70b model scale-from-zero example

Signed-off-by: Eero Tamminen <eero.t.tamminen@intel.com>
---
 kubeai/models/llama-3.3-70b-instruct-gaudi.yaml | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/kubeai/models/llama-3.3-70b-instruct-gaudi.yaml b/kubeai/models/llama-3.3-70b-instruct-gaudi.yaml
index 350d04eaf..86d967e7b 100644
--- a/kubeai/models/llama-3.3-70b-instruct-gaudi.yaml
+++ b/kubeai/models/llama-3.3-70b-instruct-gaudi.yaml
@@ -19,8 +19,10 @@ spec:
   env:
     OMPI_MCA_btl_vader_single_copy_mechanism: none
     PT_HPU_ENABLE_LAZY_COLLECTIVES: "true"
+    # Skip warmup, as vLLM startup otherwise takes too long for autoscaling, especially with Gaudi
     VLLM_SKIP_WARMUP: "true"
 
-  minReplicas: 1
+  # Scale-from-zero avoids an idle instance occupying half a node, but significantly delays the first response
+  minReplicas: 0
   maxReplicas: 2
   resourceProfile: gaudi-for-text-generation:4

From 461d83eb92b885fa5b11bd92a683a98042eaa092 Mon Sep 17 00:00:00 2001
From: Eero Tamminen <eero.t.tamminen@intel.com>
Date: Fri, 16 May 2025 19:12:21 +0300
Subject: [PATCH 5/7] Update KubeAI README

Signed-off-by: Eero Tamminen <eero.t.tamminen@intel.com>
---
 kubeai/README.md | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/kubeai/README.md b/kubeai/README.md
index 988928731..8c0244be6 100644
--- a/kubeai/README.md
+++ b/kubeai/README.md
@@ -13,7 +13,7 @@ The following features are available at the moment.
 - Persistent Volume cache for models - tested/working
 - Model downloading & inference engine deployment - tested/working
 - Scaling pods to/from zero - tested/working
-- Load based autoscaling - not tested/included
+- Load based autoscaling - tested/working
 - Integration with OPEA application - missing
 
 The following models are included.
@@ -98,7 +98,9 @@ kubect apply -f models/llama-3.1-8b-instruct-gaudi.yaml -n $NAMESPACE
 kubect apply -f models/llama-3.3-70b-instruct-gaudi.yaml -n $NAMESPACE
 ```
 
-The rest is the same as in the previous example. You should see a pod running with the name `model-llama-3.1-8b-instruct-gpu-xxxx` and/or `model-llama-3.3-70b-instruct-gpu-xxxx`.
+The rest is the same as in the previous example. You should see a pod running with the name `model-llama-3.1-8b-instruct-gaudi-xxxx`. When the request load for that model increases enough, KubeAI will automatically deploy more instances, because the model's `maxReplicas` is larger than its `minReplicas`.
+
+The latter model is set to scale from zero (`minReplicas` = 0), so `model-llama-3.3-70b-instruct-gaudi-xxxx` pod(s) will be present only when KubeAI gets requests for that model (this avoids multiple devices being exclusively reserved for idle pods, but significantly slows down the first response).
 
 ## Text Embeddings with BGE on CPU
 

From cc85b7fe51c093d24224a4cb46d9bb4b215d826e Mon Sep 17 00:00:00 2001
From: Eero Tamminen <eero.t.tamminen@intel.com>
Date: Mon, 19 May 2025 16:56:24 +0300
Subject: [PATCH 6/7] Drop nodeSelector for KubeAI itself

(Review request.)

Signed-off-by: Eero Tamminen <eero.t.tamminen@intel.com>
---
 kubeai/opea-values.yaml | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/kubeai/opea-values.yaml b/kubeai/opea-values.yaml
index 8d1339a55..6f0ff7bfb 100644
--- a/kubeai/opea-values.yaml
+++ b/kubeai/opea-values.yaml
@@ -30,6 +30,3 @@ resourceProfiles:
       memory: "2Gi"
     nodeSelector:
       #kubeai-inference: "true"
-
-nodeSelector:
-  #kubeai-frontend: "true"

From 6efd2fbb5e65d7ca11657dea93d6e4fee1edc246 Mon Sep 17 00:00:00 2001
From: Eero Tamminen <eero.t.tamminen@intel.com>
Date: Tue, 20 May 2025 22:01:45 +0300
Subject: [PATCH 7/7] Add scaling request target example

Signed-off-by: Eero Tamminen <eero.t.tamminen@intel.com>
---
 kubeai/models/llama-3.1-8b-instruct-gaudi.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/kubeai/models/llama-3.1-8b-instruct-gaudi.yaml b/kubeai/models/llama-3.1-8b-instruct-gaudi.yaml
index 2466c62fa..34bdbeac7 100644
--- a/kubeai/models/llama-3.1-8b-instruct-gaudi.yaml
+++ b/kubeai/models/llama-3.1-8b-instruct-gaudi.yaml
@@ -22,4 +22,5 @@ spec:
     VLLM_SKIP_WARMUP: "true"
   minReplicas: 1
   maxReplicas: 4
+  targetRequests: 120
   resourceProfile: gaudi-for-text-generation:1