From 0802ae8d235c47ba190b035dd9c6fb5f74e45fe1 Mon Sep 17 00:00:00 2001 From: Rantala Date: Fri, 4 Jul 2025 09:14:19 +0300 Subject: [PATCH 1/4] Added 2 new model files and updated model parameters * Added mistral and mixtral model yaml files * Updated model parameters more optimal based on benchmark-serving testing with default parameters: request-rate=800, Input/Output tokens=200/200 Signed-off-by: Rantala --- .../deepseek-r1-distill-llama-70b-gaudi.yaml | 25 ++++++++++----- .../deepseek-r1-distill-llama-8b-gaudi.yaml | 21 +++++++----- kubeai/models/llama-3.1-8b-instruct-cpu.yaml | 2 +- .../models/llama-3.1-8b-instruct-gaudi.yaml | 15 +++++---- .../models/llama-3.3-70b-instruct-gaudi.yaml | 20 +++++++----- .../mistral-7b-instruct-v0.3-gaudi.yaml | 32 +++++++++++++++++++ .../mixtral-8x7b-instruct-v0.1-gaudi.yaml | 31 ++++++++++++++++++ kubeai/models/qwen2.5-72b-instruct-gaudi.yaml | 25 +++++++++------ kubeai/models/qwen2.5-7b-instruct-gaudi.yaml | 22 +++++++------ 9 files changed, 143 insertions(+), 50 deletions(-) create mode 100644 kubeai/models/mistral-7b-instruct-v0.3-gaudi.yaml create mode 100644 kubeai/models/mixtral-8x7b-instruct-v0.1-gaudi.yaml diff --git a/kubeai/models/deepseek-r1-distill-llama-70b-gaudi.yaml b/kubeai/models/deepseek-r1-distill-llama-70b-gaudi.yaml index a694a4439..e8cbd058c 100644 --- a/kubeai/models/deepseek-r1-distill-llama-70b-gaudi.yaml +++ b/kubeai/models/deepseek-r1-distill-llama-70b-gaudi.yaml @@ -5,21 +5,30 @@ apiVersion: kubeai.org/v1 kind: Model metadata: + # Change the name to match your --model argument in the benchmark job name: deepseek-r1-distill-llama-70b-gaudi spec: features: [TextGeneration] url: hf://deepseek-ai/DeepSeek-R1-Distill-Llama-70B - cacheProfile: nfs + cacheProfile: default engine: VLLM args: - - --tensor-parallel-size=8 + - --tensor-parallel-size=2 + - --max-model-len=2048 + - --max-seq-len-to-capture=2048 + - --max-num-batched-tokens=16000 + - --max-num-seqs=64 + - --gpu-memory-utilization=0.9 + - 
--enable-auto-tool-choice + - --tool-call-parser=llama3_json + - --disable-log-requests env: OMPI_MCA_btl_vader_single_copy_mechanism: none PT_HPU_ENABLE_LAZY_COLLECTIVES: "true" - # vLLM startup takes too long for autoscaling, especially with Gaudi VLLM_SKIP_WARMUP: "true" - - # scale-from-zero avoids idle instance occupying a node, but causes long delay - minReplicas: 0 - maxReplicas: 1 - resourceProfile: gaudi-for-text-generation:8 + minReplicas: 1 + maxReplicas: 4 + # same as max-num-seqs (batch size) + targetRequests: 64 + # Change the resource profile to match your Gaudi node + resourceProfile: gaudi-node-2:2 diff --git a/kubeai/models/deepseek-r1-distill-llama-8b-gaudi.yaml b/kubeai/models/deepseek-r1-distill-llama-8b-gaudi.yaml index 0266dc261..90cfc07a2 100644 --- a/kubeai/models/deepseek-r1-distill-llama-8b-gaudi.yaml +++ b/kubeai/models/deepseek-r1-distill-llama-8b-gaudi.yaml @@ -5,22 +5,27 @@ apiVersion: kubeai.org/v1 kind: Model metadata: + # Change the name to match your --model argument in the benchmark job name: deepseek-r1-distill-llama-8b-gaudi spec: features: [TextGeneration] url: hf://deepseek-ai/DeepSeek-R1-Distill-Llama-8B - cacheProfile: nfs engine: VLLM + cacheProfile: default args: - --tensor-parallel-size=1 - - --block-size=128 - - --max-num-seqs=256 + - --max-model-len=2048 - --max-seq-len-to-capture=2048 + - --max-num-batched-tokens=2048 + - --max-num-seqs=512 + - --gpu-memory-utilization=0.9 + - --disable-log-requests env: - OMPI_MCA_btl_vader_single_copy_mechanism: "none" - # vLLM startup takes too long for autoscaling, especially with Gaudi + OMPI_MCA_btl_vader_single_copy_mechanism: none VLLM_SKIP_WARMUP: "true" minReplicas: 1 - maxReplicas: 4 - targetRequests: 120 - resourceProfile: gaudi-for-text-generation:1 + maxReplicas: 8 + # same as max-num-seqs (batch size) + targetRequests: 512 + # Change the resource profile to match your Gaudi node + resourceProfile: gaudi-node-1:1 diff --git a/kubeai/models/llama-3.1-8b-instruct-cpu.yaml 
b/kubeai/models/llama-3.1-8b-instruct-cpu.yaml index 6530af689..03e40ff06 100644 --- a/kubeai/models/llama-3.1-8b-instruct-cpu.yaml +++ b/kubeai/models/llama-3.1-8b-instruct-cpu.yaml @@ -9,7 +9,7 @@ metadata: spec: features: [TextGeneration] url: hf://meta-llama/Meta-Llama-3.1-8B-Instruct - cacheProfile: standard + cacheProfile: default engine: VLLM args: - --max-model-len=32768 diff --git a/kubeai/models/llama-3.1-8b-instruct-gaudi.yaml b/kubeai/models/llama-3.1-8b-instruct-gaudi.yaml index 34bdbeac7..6b63e0d6c 100644 --- a/kubeai/models/llama-3.1-8b-instruct-gaudi.yaml +++ b/kubeai/models/llama-3.1-8b-instruct-gaudi.yaml @@ -1,10 +1,10 @@ # Copyright (C) 2025 Intel Corporation # SPDX-License-Identifier: Apache-2.0 - # Source: models/templates/models.yaml apiVersion: kubeai.org/v1 kind: Model metadata: + # Change the name to match your --model argument in the benchmark job name: llama-3.1-8b-instruct-gaudi spec: features: [TextGeneration] @@ -16,11 +16,14 @@ spec: - --block-size=128 - --max-num-seqs=256 - --max-seq-len-to-capture=2048 + - --max-model-len=2048 + - --max-num-batched-tokens=16000 env: - OMPI_MCA_btl_vader_single_copy_mechanism: "none" - # vLLM startup takes too long for autoscaling, especially with Gaudi + OMPI_MCA_btl_vader_single_copy_mechanism: none VLLM_SKIP_WARMUP: "true" minReplicas: 1 - maxReplicas: 4 - targetRequests: 120 - resourceProfile: gaudi-for-text-generation:1 + maxReplicas: 8 + # same as max-num-seqs (batch size) + targetRequests: 256 + # Change the resource profile to match your Gaudi node + resourceProfile: gaudi-node-1:1 diff --git a/kubeai/models/llama-3.3-70b-instruct-gaudi.yaml b/kubeai/models/llama-3.3-70b-instruct-gaudi.yaml index 86d967e7b..70ea49570 100644 --- a/kubeai/models/llama-3.3-70b-instruct-gaudi.yaml +++ b/kubeai/models/llama-3.3-70b-instruct-gaudi.yaml @@ -9,20 +9,24 @@ metadata: spec: features: [TextGeneration] url: hf://meta-llama/Llama-3.3-70B-Instruct - cacheProfile: nfs + cacheProfile: default engine: VLLM 
args: - - --tensor-parallel-size=4 - - --max-seq-len-to-capture=16384 + - --tensor-parallel-size=2 + - --max-num-seqs=64 + - --max-seq-len-to-capture=2048 + - --max-model-len=2048 + - --max-num-batched-tokens=16000 - --enable-auto-tool-choice - --tool-call-parser=llama3_json + - --disable-log-requests env: OMPI_MCA_btl_vader_single_copy_mechanism: none PT_HPU_ENABLE_LAZY_COLLECTIVES: "true" # vLLM startup takes too long for autoscaling, especially with Gaudi VLLM_SKIP_WARMUP: "true" - - # scale-from-zero avoids idle instance occupying half a node, but causes long delay - minReplicas: 0 - maxReplicas: 2 - resourceProfile: gaudi-for-text-generation:4 + minReplicas: 1 + maxReplicas: 4 + # vLLM startup takes too long for autoscaling, especially with Gaudi + targetRequests: 64 + resourceProfile: gaudi-node-2:2 diff --git a/kubeai/models/mistral-7b-instruct-v0.3-gaudi.yaml b/kubeai/models/mistral-7b-instruct-v0.3-gaudi.yaml new file mode 100644 index 000000000..896d3a665 --- /dev/null +++ b/kubeai/models/mistral-7b-instruct-v0.3-gaudi.yaml @@ -0,0 +1,32 @@ +# Copyright (C) 2025 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +# Source: models/templates/models.yaml +apiVersion: kubeai.org/v1 +kind: Model +metadata: + name: mistral-7b-instruct-v0.3-gaudi +spec: + features: [TextGeneration] + url: hf://mistralai/Mistral-7B-Instruct-v0.3 + cacheProfile: default + engine: VLLM + args: + - --model=mistralai/Mistral-7B-Instruct-v0.3 + - --load_format=mistral + - --config_format=mistral + - --tensor-parallel-size=1 + - --block-size=128 + - --max-num-seqs=512 + - --max-seq-len-to-capture=2048 + - --max-model-len=2048 + - --max-num-batched-tokens=2048 + - --disable-log-requests + env: + OMPI_MCA_btl_vader_single_copy_mechanism: none + VLLM_SKIP_WARMUP: "true" + minReplicas: 1 + maxReplicas: 8 + # Equals to max-num-seqs (batch-size) parameter + targetRequests: 512 + resourceProfile: gaudi-node-2:1 diff --git a/kubeai/models/mixtral-8x7b-instruct-v0.1-gaudi.yaml 
b/kubeai/models/mixtral-8x7b-instruct-v0.1-gaudi.yaml new file mode 100644 index 000000000..7df284068 --- /dev/null +++ b/kubeai/models/mixtral-8x7b-instruct-v0.1-gaudi.yaml @@ -0,0 +1,31 @@ +# Copyright (C) 2025 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +# Source: models/templates/models.yaml +apiVersion: kubeai.org/v1 +kind: Model +metadata: + name: mixtral-8x7b-instruct-v0.1-gaudi +spec: + features: [TextGeneration] + url: hf://mistralai/Mixtral-8x7B-Instruct-v0.1 + cacheProfile: default + engine: VLLM + args: + - --model=mistralai/Mixtral-8x7B-Instruct-v0.1 + - --tensor-parallel-size=2 + - --block-size=128 + - --max-num-seqs=512 + - --max-model-len=32000 + - --max-seq-len-to-capture=32000 + - --max-num-batched-tokens=64000 + - --disable-log-requests + env: + OMPI_MCA_btl_vader_single_copy_mechanism: none + PT_HPU_ENABLE_LAZY_COLLECTIVES: "true" + VLLM_SKIP_WARMUP: "true" + minReplicas: 1 + maxReplicas: 4 + # same as max-num-seqs (batch size) + targetRequests: 512 + resourceProfile: gaudi-node-2:2 diff --git a/kubeai/models/qwen2.5-72b-instruct-gaudi.yaml b/kubeai/models/qwen2.5-72b-instruct-gaudi.yaml index 7079bfb4c..eb6bc7cfa 100644 --- a/kubeai/models/qwen2.5-72b-instruct-gaudi.yaml +++ b/kubeai/models/qwen2.5-72b-instruct-gaudi.yaml @@ -1,25 +1,30 @@ # Copyright (C) 2025 Intel Corporation # SPDX-License-Identifier: Apache-2.0 - # Source: models/templates/models.yaml apiVersion: kubeai.org/v1 kind: Model metadata: + # Change the name to match your --model argument in the benchmark job name: qwen2.5-72b-instruct-gaudi spec: features: [TextGeneration] url: hf://Qwen/Qwen2.5-72B-Instruct - cacheProfile: nfs + cacheProfile: default engine: VLLM args: - - --tensor-parallel-size=4 + - --tensor-parallel-size=2 + - --max-model-len=2048 + - --max-seq-len-to-capture=2048 + - --max-num-batched-tokens=16000 + - --max-num-seqs=128 + - --gpu-memory-utilization=0.9 + - --disable-log-requests env: OMPI_MCA_btl_vader_single_copy_mechanism: none - 
PT_HPU_ENABLE_LAZY_COLLECTIVES: "true" - # vLLM startup takes too long for autoscaling, especially with Gaudi VLLM_SKIP_WARMUP: "true" - - # scale-from-zero avoids idle instance occupying half a node, but causes long delay - minReplicas: 0 - maxReplicas: 2 - resourceProfile: gaudi-for-text-generation:4 + minReplicas: 1 + maxReplicas: 4 + # same as max-num-seqs (batch size) + targetRequests: 128 + # Change the resource profile to match your Gaudi node + resourceProfile: gaudi-node-2:2 diff --git a/kubeai/models/qwen2.5-7b-instruct-gaudi.yaml b/kubeai/models/qwen2.5-7b-instruct-gaudi.yaml index ec1772366..75d9161f8 100644 --- a/kubeai/models/qwen2.5-7b-instruct-gaudi.yaml +++ b/kubeai/models/qwen2.5-7b-instruct-gaudi.yaml @@ -1,26 +1,30 @@ # Copyright (C) 2025 Intel Corporation # SPDX-License-Identifier: Apache-2.0 - # Source: models/templates/models.yaml apiVersion: kubeai.org/v1 kind: Model metadata: + # Change the name to match your --model argument in the benchmark job name: qwen2.5-7b-instruct-gaudi spec: features: [TextGeneration] url: hf://Qwen/Qwen2.5-7B-Instruct - cacheProfile: nfs + cacheProfile: default engine: VLLM args: - --tensor-parallel-size=1 - - --block-size=128 - - --max-num-seqs=256 + - --max-model-len=2048 - --max-seq-len-to-capture=2048 + - --max-num-batched-tokens=2048 + - --max-num-seqs=512 + - --gpu-memory-utilization=0.9 + - --disable-log-requests env: - OMPI_MCA_btl_vader_single_copy_mechanism: "none" - # vLLM startup takes too long for autoscaling, especially with Gaudi + OMPI_MCA_btl_vader_single_copy_mechanism: none VLLM_SKIP_WARMUP: "true" minReplicas: 1 - maxReplicas: 4 - targetRequests: 120 - resourceProfile: gaudi-for-text-generation:1 + maxReplicas: 8 + # same as max-num-seqs (batch size) + targetRequests: 512 + # Change the resource profile to match your Gaudi node + resourceProfile: gaudi-node-1:1 From ce8ce09c1f421718864c29aa9707d8f4c50b4336 Mon Sep 17 00:00:00 2001 From: Rantala Date: Fri, 4 Jul 2025 11:40:49 +0300 Subject: 
[PATCH 2/4] Added README.md file and updated model parameters Signed-off-by: Rantala --- kubeai/models/README.md | 20 +++++++++++++++++++ .../deepseek-r1-distill-llama-70b-gaudi.yaml | 4 +--- .../deepseek-r1-distill-llama-8b-gaudi.yaml | 4 +--- .../models/llama-3.1-8b-instruct-gaudi.yaml | 6 ++---- .../models/llama-3.3-70b-instruct-gaudi.yaml | 5 ++--- .../mistral-7b-instruct-v0.3-gaudi.yaml | 4 ++-- .../mixtral-8x7b-instruct-v0.1-gaudi.yaml | 2 +- kubeai/models/qwen2.5-72b-instruct-gaudi.yaml | 4 +--- kubeai/models/qwen2.5-7b-instruct-gaudi.yaml | 3 +-- 9 files changed, 31 insertions(+), 21 deletions(-) create mode 100644 kubeai/models/README.md diff --git a/kubeai/models/README.md b/kubeai/models/README.md new file mode 100644 index 000000000..628694008 --- /dev/null +++ b/kubeai/models/README.md @@ -0,0 +1,20 @@ +# Model Files + +This directory contains YAML configuration files for various AI models designed to run on Kubernetes clusters. These files define the specifications, arguments, and resource profiles required for deploying and running the models efficiently. + +## Benchmarking +The parameters for the models were determined using the KubeAI benchmarking tool benchmark_serving.py The benchmarking script can be found [here](https://github.com/substratusai/kubeai/blob/main/benchmarks/chat-py/benchmark_serving.py). + +The following arguments were used during benchmarking: +- `--request-rate=800` +- `--max-concurrency=800` +- `--num-prompts=8000` +- `--max-conversations=800` + +These parameters were chosen to optimize the model's performance in terms of throughput. + +## Additional Notes +- The `cacheProfile` is set to `default`. +- The `targetRequests` value matches the `max-num-seqs` (batch size). + +For more details, refer to the individual YAML files in this directory. 
diff --git a/kubeai/models/deepseek-r1-distill-llama-70b-gaudi.yaml b/kubeai/models/deepseek-r1-distill-llama-70b-gaudi.yaml index e8cbd058c..7d2ba9cc9 100644 --- a/kubeai/models/deepseek-r1-distill-llama-70b-gaudi.yaml +++ b/kubeai/models/deepseek-r1-distill-llama-70b-gaudi.yaml @@ -5,7 +5,6 @@ apiVersion: kubeai.org/v1 kind: Model metadata: - # Change the name to match your --model argument in the benchmark job name: deepseek-r1-distill-llama-70b-gaudi spec: features: [TextGeneration] @@ -30,5 +29,4 @@ spec: maxReplicas: 4 # same as max-num-seqs (batch size) targetRequests: 64 - # Change the resource profile to match your Gaudi node - resourceProfile: gaudi-node-2:2 + resourceProfile: gaudi-for-text-generation:2 diff --git a/kubeai/models/deepseek-r1-distill-llama-8b-gaudi.yaml b/kubeai/models/deepseek-r1-distill-llama-8b-gaudi.yaml index 90cfc07a2..ac3b763fa 100644 --- a/kubeai/models/deepseek-r1-distill-llama-8b-gaudi.yaml +++ b/kubeai/models/deepseek-r1-distill-llama-8b-gaudi.yaml @@ -5,7 +5,6 @@ apiVersion: kubeai.org/v1 kind: Model metadata: - # Change the name to match your --model argument in the benchmark job name: deepseek-r1-distill-llama-8b-gaudi spec: features: [TextGeneration] @@ -27,5 +26,4 @@ spec: maxReplicas: 8 # same as max-num-seqs (batch size) targetRequests: 512 - # Change the resource profile to match your Gaudi node - resourceProfile: gaudi-node-1:1 + resourceProfile: gaudi-for-text-generation:1 diff --git a/kubeai/models/llama-3.1-8b-instruct-gaudi.yaml b/kubeai/models/llama-3.1-8b-instruct-gaudi.yaml index 6b63e0d6c..7b8c475ad 100644 --- a/kubeai/models/llama-3.1-8b-instruct-gaudi.yaml +++ b/kubeai/models/llama-3.1-8b-instruct-gaudi.yaml @@ -4,12 +4,11 @@ apiVersion: kubeai.org/v1 kind: Model metadata: - # Change the name to match your --model argument in the benchmark job name: llama-3.1-8b-instruct-gaudi spec: features: [TextGeneration] url: hf://meta-llama/Meta-Llama-3.1-8B-Instruct - cacheProfile: nfs + cacheProfile: default engine: 
VLLM args: - --tensor-parallel-size=1 @@ -25,5 +24,4 @@ spec: maxReplicas: 8 # same as max-num-seqs (batch size) targetRequests: 256 - # Change the resource profile to match your Gaudi node - resourceProfile: gaudi-node-1:1 + resourceProfile: gaudi-for-text-generation:1 diff --git a/kubeai/models/llama-3.3-70b-instruct-gaudi.yaml b/kubeai/models/llama-3.3-70b-instruct-gaudi.yaml index 70ea49570..0b9babade 100644 --- a/kubeai/models/llama-3.3-70b-instruct-gaudi.yaml +++ b/kubeai/models/llama-3.3-70b-instruct-gaudi.yaml @@ -23,10 +23,9 @@ spec: env: OMPI_MCA_btl_vader_single_copy_mechanism: none PT_HPU_ENABLE_LAZY_COLLECTIVES: "true" - # vLLM startup takes too long for autoscaling, especially with Gaudi VLLM_SKIP_WARMUP: "true" minReplicas: 1 maxReplicas: 4 - # vLLM startup takes too long for autoscaling, especially with Gaudi + # Equals to max-num-seqs (batch-size) targetRequests: 64 - resourceProfile: gaudi-node-2:2 + resourceProfile: gaudi-for-text-generation:2 diff --git a/kubeai/models/mistral-7b-instruct-v0.3-gaudi.yaml b/kubeai/models/mistral-7b-instruct-v0.3-gaudi.yaml index 896d3a665..fb9681df5 100644 --- a/kubeai/models/mistral-7b-instruct-v0.3-gaudi.yaml +++ b/kubeai/models/mistral-7b-instruct-v0.3-gaudi.yaml @@ -28,5 +28,5 @@ spec: minReplicas: 1 maxReplicas: 8 # Equals to max-num-seqs (batch-size) parameter - targetRequests: 512 - resourceProfile: gaudi-node-2:1 + targetRequests: 512 + resourceProfile: gaudi-for-text-generation:1 diff --git a/kubeai/models/mixtral-8x7b-instruct-v0.1-gaudi.yaml b/kubeai/models/mixtral-8x7b-instruct-v0.1-gaudi.yaml index 7df284068..907a2347f 100644 --- a/kubeai/models/mixtral-8x7b-instruct-v0.1-gaudi.yaml +++ b/kubeai/models/mixtral-8x7b-instruct-v0.1-gaudi.yaml @@ -28,4 +28,4 @@ spec: maxReplicas: 4 # same as max-num-seqs (batch size) targetRequests: 512 - resourceProfile: gaudi-node-2:2 + resourceProfile: gaudi-for-text-generation:2 diff --git a/kubeai/models/qwen2.5-72b-instruct-gaudi.yaml 
b/kubeai/models/qwen2.5-72b-instruct-gaudi.yaml index eb6bc7cfa..71134b6e9 100644 --- a/kubeai/models/qwen2.5-72b-instruct-gaudi.yaml +++ b/kubeai/models/qwen2.5-72b-instruct-gaudi.yaml @@ -4,7 +4,6 @@ apiVersion: kubeai.org/v1 kind: Model metadata: - # Change the name to match your --model argument in the benchmark job name: qwen2.5-72b-instruct-gaudi spec: features: [TextGeneration] @@ -26,5 +25,4 @@ spec: maxReplicas: 4 # same as max-num-seqs (batch size) targetRequests: 128 - # Change the resource profile to match your Gaudi node - resourceProfile: gaudi-node-2:2 + resourceProfile: gaudi-for-text-generation:2 diff --git a/kubeai/models/qwen2.5-7b-instruct-gaudi.yaml b/kubeai/models/qwen2.5-7b-instruct-gaudi.yaml index 75d9161f8..4cd5dbbb1 100644 --- a/kubeai/models/qwen2.5-7b-instruct-gaudi.yaml +++ b/kubeai/models/qwen2.5-7b-instruct-gaudi.yaml @@ -26,5 +26,4 @@ spec: maxReplicas: 8 # same as max-num-seqs (batch size) targetRequests: 512 - # Change the resource profile to match your Gaudi node - resourceProfile: gaudi-node-1:1 + resourceProfile: gaudi-for-text-generation:1 From 71473ce33063c9aae1eb91f486aa927bcefbadb3 Mon Sep 17 00:00:00 2001 From: Rantala Date: Fri, 4 Jul 2025 13:40:28 +0300 Subject: [PATCH 3/4] Fixed README link and updated model files according to review comments Signed-off-by: Rantala --- kubeai/README.md | 3 ++- kubeai/models/README.md | 3 ++- kubeai/models/qwen2.5-7b-instruct-gaudi.yaml | 3 +-- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/kubeai/README.md b/kubeai/README.md index 79b9c7f5a..cfcaa61d6 100644 --- a/kubeai/README.md +++ b/kubeai/README.md @@ -20,6 +20,7 @@ For now, OPEA enables a subset of the KubeAI features. In the future more KubeAI - [Configuration of Balloons Policy Plugin](#configuration-of-balloons-policy-plugin) - [Observability](#observability) + ## Features The following features are available at the moment. 
@@ -79,7 +80,7 @@ kubectl explain models.kubeai.org This section describes how to deploy various models. All the examples below use Kubernetes Persistent Volumes and Claims (PV/PVC) to store the models. The Kubernetes Storage Class (SC) is called `standard`. You can tune the storage configuration to match your environment during the installation (see `cacheProfiles` in `opea-values.yaml`). -The models in the examples below are deployed to `$NAMESPACE`. Please set that according to your needs. +The models in the examples below are deployed to `$NAMESPACE`. Please set that according to your needs. Model README is located here [models](models/README.md) ``` export NAMESPACE="kubeai" diff --git a/kubeai/models/README.md b/kubeai/models/README.md index 628694008..28bf4c548 100644 --- a/kubeai/models/README.md +++ b/kubeai/models/README.md @@ -3,7 +3,7 @@ This directory contains YAML configuration files for various AI models designed to run on Kubernetes clusters. These files define the specifications, arguments, and resource profiles required for deploying and running the models efficiently. ## Benchmarking -The parameters for the models were determined using the KubeAI benchmarking tool benchmark_serving.py The benchmarking script can be found [here](https://github.com/substratusai/kubeai/blob/main/benchmarks/chat-py/benchmark_serving.py). +The parameters for the models were determined using the KubeAI benchmarking tool 'benchmark_serving.py' The benchmarking script can be found [here](https://github.com/substratusai/kubeai/blob/main/benchmarks/chat-py/benchmark_serving.py). The following arguments were used during benchmarking: - `--request-rate=800` @@ -16,5 +16,6 @@ These parameters were chosen to optimize the model's performance in terms of thr ## Additional Notes - The `cacheProfile` is set to `default`. - The `targetRequests` value matches the `max-num-seqs` (batch size). 
+- Most Models enabled autoscaling (`maxReplicas` > `MinReplicas`) so vLLM warmup is disabled as it would slow down new Gaudi vLLM instances startup too much For more details, refer to the individual YAML files in this directory. diff --git a/kubeai/models/qwen2.5-7b-instruct-gaudi.yaml b/kubeai/models/qwen2.5-7b-instruct-gaudi.yaml index 4cd5dbbb1..ec83e51af 100644 --- a/kubeai/models/qwen2.5-7b-instruct-gaudi.yaml +++ b/kubeai/models/qwen2.5-7b-instruct-gaudi.yaml @@ -4,7 +4,6 @@ apiVersion: kubeai.org/v1 kind: Model metadata: - # Change the name to match your --model argument in the benchmark job name: qwen2.5-7b-instruct-gaudi spec: features: [TextGeneration] @@ -24,6 +23,6 @@ spec: VLLM_SKIP_WARMUP: "true" minReplicas: 1 maxReplicas: 8 - # same as max-num-seqs (batch size) + # Equals to max-num-seqs (batch-size) targetRequests: 512 resourceProfile: gaudi-for-text-generation:1 From be29b46f0a956a7ad3cf9429f5721fd69d8f162e Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 4 Jul 2025 10:41:04 +0000 Subject: [PATCH 4/4] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- kubeai/README.md | 1 - kubeai/models/README.md | 3 +++ kubeai/models/qwen2.5-72b-instruct-gaudi.yaml | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/kubeai/README.md b/kubeai/README.md index cfcaa61d6..20d26c56f 100644 --- a/kubeai/README.md +++ b/kubeai/README.md @@ -20,7 +20,6 @@ For now, OPEA enables a subset of the KubeAI features. In the future more KubeAI - [Configuration of Balloons Policy Plugin](#configuration-of-balloons-policy-plugin) - [Observability](#observability) - ## Features The following features are available at the moment. 
diff --git a/kubeai/models/README.md b/kubeai/models/README.md index 28bf4c548..d78f5eb99 100644 --- a/kubeai/models/README.md +++ b/kubeai/models/README.md @@ -3,9 +3,11 @@ This directory contains YAML configuration files for various AI models designed to run on Kubernetes clusters. These files define the specifications, arguments, and resource profiles required for deploying and running the models efficiently. ## Benchmarking + The parameters for the models were determined using the KubeAI benchmarking tool 'benchmark_serving.py' The benchmarking script can be found [here](https://github.com/substratusai/kubeai/blob/main/benchmarks/chat-py/benchmark_serving.py). The following arguments were used during benchmarking: + - `--request-rate=800` - `--max-concurrency=800` - `--num-prompts=8000` @@ -14,6 +16,7 @@ The following arguments were used during benchmarking: These parameters were chosen to optimize the model's performance in terms of throughput. ## Additional Notes + - The `cacheProfile` is set to `default`. - The `targetRequests` value matches the `max-num-seqs` (batch size). - Most Models enabled autoscaling (`maxReplicas` > `MinReplicas`) so vLLM warmup is disabled as it would slow down new Gaudi vLLM instances startup too much diff --git a/kubeai/models/qwen2.5-72b-instruct-gaudi.yaml b/kubeai/models/qwen2.5-72b-instruct-gaudi.yaml index 71134b6e9..87898c1a7 100644 --- a/kubeai/models/qwen2.5-72b-instruct-gaudi.yaml +++ b/kubeai/models/qwen2.5-72b-instruct-gaudi.yaml @@ -8,7 +8,7 @@ metadata: spec: features: [TextGeneration] url: hf://Qwen/Qwen2.5-72B-Instruct - cacheProfile: default + cacheProfile: default engine: VLLM args: - --tensor-parallel-size=2