diff --git a/README.md b/README.md
index e0e2446e..3d907919 100644
--- a/README.md
+++ b/README.md
@@ -58,37 +58,40 @@ Check Helm's [official docs](https://helm.sh/docs/intro/using_helm/) for more gu
## Values
Below are the values you can set.
-| Key | Description | Type | Default |
-|----------------------------------------|-------------------------------------------------------------------------------------------------------------------|--------------|---------------------------------------------|
-| `modelArtifacts.name` | name of model in the form namespace/modelId. Required. | string | N/A |
-| `modelArtifacts.uri` | Model artifacts URI. Current formats supported include `hf://`, `pvc://`, and `oci://` | string | N/A |
-| `modelArtifacts.size` | Size used to create an emptyDir volume for downloading the model. | string | N/A |
-| `modelArtifacts.authSecretName` | The name of the Secret containing `HF_TOKEN` for `hf://` artifacts that require a token for downloading a model. | string | N/A |
-| `modelArtifacts.mountPath` | Path to mount the volume created to store models | string | /model-cache |
-| `multinode` | Determines whether to create P/D using Deployments (false) or LeaderWorkerSets (true) | bool | `false` |
-| `routing.servicePort`                  | The port the routing proxy sidecar listens on. If there is no sidecar, this is the port the request goes to.       | int          | N/A                                          |
-| `routing.proxy.image` | Image used for the sidecar | string | `ghcr.io/llm-d/llm-d-routing-sidecar:0.0.6` |
-| `routing.proxy.targetPort`             | The port the vLLM decode container listens on. If proxy is present, it will forward request to this port.          | string       | N/A                                          |
-| `routing.proxy.debugLevel` | Debug level of the routing proxy | int | 5 |
-| `routing.proxy.parentRefs[*].name` | The name of the inference gateway | string | N/A |
-| `decode.create` | If true, creates decode Deployment or LeaderWorkerSet | List | `true` |
-| `decode.annotations` | Annotations that should be added to the Deployment or LeaderWorkerSet | Dict | {} |
-| `decode.tolerations` | Tolerations that should be added to the Deployment or LeaderWorkerSet | List | [] |
-| `decode.replicas` | Number of replicas for decode pods | int | 1 |
-| `decode.extraConfig` | Extra pod configuration | dict | {} |
-| `decode.containers[*].name` | Name of the container for the decode deployment/LWS | string | N/A |
-| `decode.containers[*].image` | Image of the container for the decode deployment/LWS | string | N/A |
-| `decode.containers[*].args` | List of arguments for the decode container. | List[string] | [] |
-| `decode.containers[*].modelCommand` | Nature of the command. One of `vllmServe`, `imageDefault` or `custom` | string | `imageDefault` |
-| `decode.containers[*].command` | List of commands for the decode container. | List[string] | [] |
-| `decode.containers[*].ports` | List of ports for the decode container. | List[Port] | [] |
-| `decode.containers[*].extraConfig` | Extra container configuration | dict | {} |
-| `decode.parallelism.data` | Amount of data parallelism | int | 1 |
-| `decode.parallelism.tensor` | Amount of tensor parallelism | int | 1 |
-| `decode.acceleratorTypes.labelKey` | Key of label on node that identifies the hosted GPU type | string | N/A |
-| `decode.acceleratorTypes.labelValue` | Value of label on node that identifies type of hosted GPU | string | N/A |
-| `prefill` | Same fields supported in `decode` | See above | See above |
-| `extraObjects` | Additional Kubernetes objects to be deployed alongside the main application | List | [] |
+| Key | Description | Type | Default |
+|----------------------------------------|-------------------------------------------------------------------------------------------------------------------|-----------------|---------------------------------------------|
+| `modelArtifacts.name`                   | Name of the model in the form `namespace/modelId`. Required.                                                        | string          | N/A                                          |
+| `modelArtifacts.uri` | Model artifacts URI. Current formats supported include `hf://`, `pvc://`, and `oci://` | string | N/A |
+| `modelArtifacts.size` | Size used to create an emptyDir volume for downloading the model. | string | N/A |
+| `modelArtifacts.authSecretName` | The name of the Secret containing `HF_TOKEN` for `hf://` artifacts that require a token for downloading a model. | string | N/A |
+| `modelArtifacts.mountPath` | Path to mount the volume created to store models | string | /model-cache |
+| `multinode` | Determines whether to create P/D using Deployments (false) or LeaderWorkerSets (true) | bool | `false` |
+| `routing.servicePort`                   | The port the routing proxy sidecar listens on. If there is no sidecar, this is the port the request goes to.        | int             | N/A                                          |
+| `routing.proxy.image` | Image used for the sidecar | string | `ghcr.io/llm-d/llm-d-routing-sidecar:0.0.6` |
+| `routing.proxy.targetPort`              | The port the vLLM decode container listens on. If the proxy is present, it forwards requests to this port.          | string          | N/A                                          |
+| `routing.proxy.debugLevel` | Debug level of the routing proxy | int | 5 |
+| `routing.proxy.parentRefs[*].name` | The name of the inference gateway | string | N/A |
+| `decode.create`                         | If true, creates the decode Deployment or LeaderWorkerSet                                                            | bool            | `true`                                       |
+| `decode.annotations`                    | Annotations that should be added to the Deployment or LeaderWorkerSet                                                | dict            | {}                                           |
+| `decode.tolerations` | Tolerations that should be added to the Deployment or LeaderWorkerSet | List | [] |
+| `decode.replicas` | Number of replicas for decode pods | int | 1 |
+| `decode.extraConfig` | Extra pod configuration | dict | {} |
+| `decode.containers[*].name` | Name of the container for the decode deployment/LWS | string | N/A |
+| `decode.containers[*].image` | Image of the container for the decode deployment/LWS | string | N/A |
+| `decode.containers[*].args` | List of arguments for the decode container. | List[string] | [] |
+| `decode.containers[*].modelCommand` | Nature of the command. One of `vllmServe`, `imageDefault` or `custom` | string | `imageDefault` |
+| `decode.containers[*].command` | List of commands for the decode container. | List[string] | [] |
+| `decode.containers[*].ports` | List of ports for the decode container. | List[Port] | [] |
+| `decode.containers[*].extraConfig` | Extra container configuration | dict | {} |
+| `decode.initContainers`                 | List of initContainers to add (in addition to the routing proxy, if enabled)                                         | List[Container] | N/A                                          |
+| `decode.parallelism.tensor` | Amount of tensor parallelism | int | 1 |
+| `decode.parallelism.data` | Amount of data parallelism | int | 1 |
+| `decode.parallelism.dataLocal`          | Amount of data parallelism local to each worker                                                                      | int             | 1                                            |
+| `decode.parallelism.workers` | Number of workers over which data parallelism is implemented | int | 1 |
+| `decode.acceleratorTypes.labelKey` | Key of label on node that identifies the hosted GPU type | string | N/A |
+| `decode.acceleratorTypes.labelValue` | Value of label on node that identifies type of hosted GPU | string | N/A |
+| `prefill`                               | Supports the same fields as `decode`                                                                                 | See above       | See above                                    |
+| `extraObjects` | Additional Kubernetes objects to be deployed alongside the main application | List | [] |
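
As an illustrative aside (a hypothetical snippet, not taken from the chart's bundled examples), the parallelism fields compose as `data = dataLocal * workers`, so a multinode decode section might look like:

```yaml
# Hypothetical values sketch; key names follow the table above.
multinode: true
decode:
  replicas: 1
  parallelism:
    tensor: 2      # rendered as --tensor-parallel-size "2"
    dataLocal: 2   # rendered as --data-parallel-size-local "2"
    workers: 4     # LeaderWorkerSet size; implies data = dataLocal * workers = 8
```

With these values each worker would request `dataLocal * tensor = 4` accelerators.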
## Contribute
diff --git a/charts/llm-d-modelservice/Chart.yaml b/charts/llm-d-modelservice/Chart.yaml
index bec901fd..87359da7 100644
--- a/charts/llm-d-modelservice/Chart.yaml
+++ b/charts/llm-d-modelservice/Chart.yaml
@@ -13,7 +13,7 @@ type: application
# This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates, including the app version.
# Versions are expected to follow Semantic Versioning (https://semver.org/)
-version: "v0.3.4"
+version: "v0.3.5"
# This is the version number of the application being deployed. This version number should be
# incremented each time you make changes to the application. Versions are not expected to
# follow Semantic Versioning. They should reflect the version the application is using.
diff --git a/charts/llm-d-modelservice/templates/_helpers.tpl b/charts/llm-d-modelservice/templates/_helpers.tpl
index ba6d287e..3477979b 100644
--- a/charts/llm-d-modelservice/templates/_helpers.tpl
+++ b/charts/llm-d-modelservice/templates/_helpers.tpl
@@ -122,14 +122,87 @@ initContainers:
{{- end }}
{{- end }}
-{{/* Desired P/D tensor parallelism -- user set or defaults to 1 */}}
+{{/* Desired tensor parallelism --
+- if tensor set, return it
+- else return 1
+*/}}
{{- define "llm-d-modelservice.tensorParallelism" -}}
-{{- if and . .tensor }}{{ .tensor }}{{ else }}1{{ end }}
+{{- if and . .tensor -}}
+{{ .tensor }}
+{{- else -}}
+1
+{{- end -}}
{{- end }}
-{{/* Desired P/D data parallelism -- user set or defaults to 1 */}}
+{{/*
+Desired data parallelism --
+- if data set, return it
+- else if dataLocal and workers set, return dataLocal * workers
+- else if dataLocal set, return dataLocal (w = 1)
+- else return 1 (dpl = 1, w = 1)
+*/}}
{{- define "llm-d-modelservice.dataParallelism" -}}
-{{- if and . .data }}{{ .data }}{{ else }}1{{ end }}
+{{- if and . .data -}}
+{{ .data }}
+{{- else if and . .dataLocal .workers -}}
+{{ mul .dataLocal .workers }}
+{{- else if and . .dataLocal -}}
+{{ .dataLocal }}
+{{- else -}}
+1
+{{- end -}}
+{{- end }}
+
+{{/*
+Desired data local parallelism --
+- if dataLocal set, return it
+- else if data and workers set, return data / workers
+- else if data set, return data (w = 1)
+- else return 1 (dp = 1, w = 1)
+*/}}
+{{- define "llm-d-modelservice.dataLocalParallelism" -}}
+{{- if and . .dataLocal -}}
+{{ .dataLocal }}
+{{- else if and . .data .workers -}}
+{{ $result := div (int .data) (int .workers) }}
+{{- if ne (int .data) (mul $result .workers) -}}
+{{- fail "parallelism.data must be a multiple of parallelism.workers" -}}
+{{- else -}}
+{{ $result }}
+{{- end -}}
+{{- else if and . .data -}}
+{{ .data }}
+{{- else -}}
+1
+{{- end -}}
+{{- end }}
+
+{{/*
+Desired number of workers --
+- if workers set, return it
+- else if data and dataLocal set, return data / dataLocal
+- else return 1 (dp = 1, dpl = 1)
+*/}}
+{{- define "llm-d-modelservice.numWorkers" -}}
+{{- if and . .workers -}}
+{{ .workers }}
+{{- else if and . .data .dataLocal -}}
+{{ $result := div (int .data) (int .dataLocal) }}
+{{- if ne (int .data) (mul $result .dataLocal) -}}
+{{- fail "parallelism.data must be a multiple of parallelism.dataLocal" -}}
+{{- else -}}
+{{ $result }}
+{{- end -}}
+{{- else -}}
+1
+{{- end -}}
+{{- end }}
+
+{{/*
+Required number of GPUs per worker -- dpl * tp
+*/}}
+{{- define "llm-d-modelservice.numGpuPerWorker" -}}
+{{ mul (include "llm-d-modelservice.dataLocalParallelism" .) (include "llm-d-modelservice.tensorParallelism" .) }}
{{- end }}
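+
+{{/*
+Worked example (illustrative only): for parallelism {data: 8, dataLocal: 2, tensor: 2},
+numWorkers = 8 / 2 = 4, dataLocalParallelism = 2, and numGpuPerWorker = 2 * 2 = 4,
+so each LeaderWorkerSet worker requests 4 accelerators.
+*/}}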
{{/*
@@ -172,21 +245,21 @@ nvidia.com/gpu
{{/* P/D deployment container resources */}}
{{- define "llm-d-modelservice.resources" -}}
-{{- $tensorParallelism := int (include "llm-d-modelservice.tensorParallelism" .parallelism) -}}
+{{- $numGpus := int (include "llm-d-modelservice.numGpuPerWorker" .parallelism) -}}
{{- $acceleratorResource := include "llm-d-modelservice.acceleratorResource" . -}}
{{- $limits := dict }}
{{- if and .resources .resources.limits }}
{{- $limits = deepCopy .resources.limits }}
{{- end }}
-{{- if and (ge (int $tensorParallelism) 1) (ne $acceleratorResource "") }}
-{{- $limits = mergeOverwrite $limits (dict $acceleratorResource (toString $tensorParallelism)) }}
+{{- if and (ge (int $numGpus) 1) (ne $acceleratorResource "") }}
+{{- $limits = mergeOverwrite $limits (dict $acceleratorResource (toString $numGpus)) }}
{{- end }}
{{- $requests := dict }}
{{- if and .resources .resources.requests }}
{{- $requests = deepCopy .resources.requests }}
{{- end }}
-{{- if and (ge (int $tensorParallelism) 1) (ne $acceleratorResource "") }}
-{{- $requests = mergeOverwrite $requests (dict $acceleratorResource (toString $tensorParallelism)) }}
+{{- if and (ge (int $numGpus) 1) (ne $acceleratorResource "") }}
+{{- $requests = mergeOverwrite $requests (dict $acceleratorResource (toString $numGpus)) }}
{{- end }}
resources:
limits:
@@ -417,6 +490,16 @@ args:
- --tensor-parallel-size
- {{ $tensorParallelism | quote }}
{{- end }}
+ {{- $dataParallelism := int (include "llm-d-modelservice.dataParallelism" .parallelism) -}}
+ {{- if gt (int $dataParallelism) 1 }}
+ - --data-parallel-size
+ - {{ $dataParallelism | quote }}
+ {{- end }}
+ {{- $dataLocalParallelism := int (include "llm-d-modelservice.dataLocalParallelism" .parallelism) -}}
+ {{- if gt (int $dataLocalParallelism) 1 }}
+ - --data-parallel-size-local
+ - {{ $dataLocalParallelism | quote }}
+ {{- end }}
- --served-model-name
- {{ .Values.modelArtifacts.name | quote }}
{{- with .container.args }}
@@ -435,6 +518,16 @@ args:
- --tensor-parallel-size
- {{ $tensorParallelism | quote }}
{{- end }}
+ {{- $dataParallelism := int (include "llm-d-modelservice.dataParallelism" .parallelism) -}}
+ {{- if gt (int $dataParallelism) 1 }}
+ - --data-parallel-size
+ - {{ $dataParallelism | quote }}
+ {{- end }}
+ {{- $dataLocalParallelism := int (include "llm-d-modelservice.dataLocalParallelism" .parallelism) -}}
+ {{- if gt (int $dataLocalParallelism) 1 }}
+ - --data-parallel-size-local
+ - {{ $dataLocalParallelism | quote }}
+ {{- end }}
- --served-model-name
- {{ .Values.modelArtifacts.name | quote }}
{{- with .container.args }}
@@ -516,4 +609,6 @@ context is a dict with helm root context plus:
value: {{ include "llm-d-modelservice.dataParallelism" .parallelism | quote }}
- name: TP_SIZE
value: {{ include "llm-d-modelservice.tensorParallelism" .parallelism | quote }}
+- name: DP_SIZE_LOCAL
+ value: {{ include "llm-d-modelservice.dataLocalParallelism" .parallelism | quote }}
{{- end }} {{- /* define "llm-d-modelservice.parallelismEnv" */}}
diff --git a/charts/llm-d-modelservice/templates/decode-lws.yaml b/charts/llm-d-modelservice/templates/decode-lws.yaml
index d6b28555..2172169b 100644
--- a/charts/llm-d-modelservice/templates/decode-lws.yaml
+++ b/charts/llm-d-modelservice/templates/decode-lws.yaml
@@ -20,7 +20,7 @@ spec:
replicas: {{ ternary .Values.decode.replicas 1 (hasKey .Values.decode "replicas") }}
{{- end }}
leaderWorkerTemplate:
- size: {{ int (include "llm-d-modelservice.dataParallelism" .Values.decode.parallelism) }}
+ size: {{ int (include "llm-d-modelservice.numWorkers" .Values.decode.parallelism) }}
{{- if .Values.decode.subGroupPolicy }}
subGroupPolicy:
{{- toYaml .Values.decode.subGroupPolicy | nindent 6 }}
diff --git a/charts/llm-d-modelservice/templates/prefill-lws.yaml b/charts/llm-d-modelservice/templates/prefill-lws.yaml
index 94e4f3ae..3ce797c4 100644
--- a/charts/llm-d-modelservice/templates/prefill-lws.yaml
+++ b/charts/llm-d-modelservice/templates/prefill-lws.yaml
@@ -20,7 +20,7 @@ spec:
replicas: {{ ternary .Values.prefill.replicas 1 (hasKey .Values.prefill "replicas") }}
{{- end }}
leaderWorkerTemplate:
- size: {{ int (include "llm-d-modelservice.dataParallelism" .Values.prefill.parallelism) }}
+ size: {{ int (include "llm-d-modelservice.numWorkers" .Values.prefill.parallelism) }}
{{- if .Values.prefill.subGroupPolicy }}
subGroupPolicy:
{{- toYaml .Values.prefill.subGroupPolicy | nindent 6 }}
diff --git a/examples/output-cpu.yaml b/examples/output-cpu.yaml
index a7c8cf6c..3ff57c0b 100644
--- a/examples/output-cpu.yaml
+++ b/examples/output-cpu.yaml
@@ -6,7 +6,7 @@ kind: ServiceAccount
metadata:
name: cpu-sim-llm-d-modelservice
labels:
- helm.sh/chart: llm-d-modelservice-v0.3.4
+ helm.sh/chart: llm-d-modelservice-v0.3.5
app.kubernetes.io/version: "v0.2.0"
app.kubernetes.io/managed-by: Helm
---
@@ -16,7 +16,7 @@ kind: Deployment
metadata:
name: cpu-sim-llm-d-modelservice-decode
labels:
- helm.sh/chart: llm-d-modelservice-v0.3.4
+ helm.sh/chart: llm-d-modelservice-v0.3.5
app.kubernetes.io/version: "v0.2.0"
app.kubernetes.io/managed-by: Helm
spec:
@@ -76,6 +76,8 @@ spec:
value: "1"
- name: TP_SIZE
value: "1"
+ - name: DP_SIZE_LOCAL
+ value: "1"
- name: HF_HOME
value: /model-cache
@@ -99,7 +101,7 @@ kind: Deployment
metadata:
name: cpu-sim-llm-d-modelservice-prefill
labels:
- helm.sh/chart: llm-d-modelservice-v0.3.4
+ helm.sh/chart: llm-d-modelservice-v0.3.5
app.kubernetes.io/version: "v0.2.0"
app.kubernetes.io/managed-by: Helm
spec:
@@ -142,6 +144,8 @@ spec:
value: "1"
- name: TP_SIZE
value: "1"
+ - name: DP_SIZE_LOCAL
+ value: "1"
- name: HF_HOME
value: /model-cache
diff --git a/examples/output-dra.yaml b/examples/output-dra.yaml
index e773db26..e545b657 100644
--- a/examples/output-dra.yaml
+++ b/examples/output-dra.yaml
@@ -6,7 +6,7 @@ kind: ServiceAccount
metadata:
name: dra-llm-d-modelservice
labels:
- helm.sh/chart: llm-d-modelservice-v0.3.4
+ helm.sh/chart: llm-d-modelservice-v0.3.5
app.kubernetes.io/version: "v0.2.0"
app.kubernetes.io/managed-by: Helm
---
@@ -16,7 +16,7 @@ kind: Deployment
metadata:
name: dra-llm-d-modelservice-decode
labels:
- helm.sh/chart: llm-d-modelservice-v0.3.4
+ helm.sh/chart: llm-d-modelservice-v0.3.5
app.kubernetes.io/version: "v0.2.0"
app.kubernetes.io/managed-by: Helm
spec:
@@ -78,6 +78,8 @@ spec:
value: "1"
- name: TP_SIZE
value: "1"
+ - name: DP_SIZE_LOCAL
+ value: "1"
- name: HF_HUB_CACHE
value: /model-cache/
diff --git a/examples/output-gaudi.yaml b/examples/output-gaudi.yaml
new file mode 100644
index 00000000..b2897a89
--- /dev/null
+++ b/examples/output-gaudi.yaml
@@ -0,0 +1,101 @@
+# generated by generate-example-output.sh
+---
+# Source: llm-d-modelservice/templates/serviceaccount.yaml
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+ name: gaudi-llm-d-modelservice
+ labels:
+ helm.sh/chart: llm-d-modelservice-v0.3.5
+ app.kubernetes.io/version: "v0.2.0"
+ app.kubernetes.io/managed-by: Helm
+---
+# Source: llm-d-modelservice/templates/decode-deployment.yaml
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+ name: gaudi-llm-d-modelservice-decode
+ labels:
+ helm.sh/chart: llm-d-modelservice-v0.3.5
+ app.kubernetes.io/version: "v0.2.0"
+ app.kubernetes.io/managed-by: Helm
+spec:
+ replicas: 1
+ selector:
+ matchLabels:
+ llm-d.ai/inferenceServing: "true"
+ llm-d.ai/model: random_model
+ llm-d.ai/role: decode
+ template:
+ metadata:
+ labels:
+ llm-d.ai/inferenceServing: "true"
+ llm-d.ai/model: random_model
+ llm-d.ai/role: decode
+ spec:
+
+
+ serviceAccountName: gaudi-llm-d-modelservice
+
+ volumes:
+ - emptyDir: {}
+ name: metrics-volume
+ - name: model-storage
+ persistentVolumeClaim:
+ claimName: model-pvc
+ readOnly: true
+ containers:
+ - name: vllm
+ image: opea/vllm-gaudi:1.22.0
+
+ args:
+ - --model
+ - meta-llama/Llama-3.1-8B-Instruct
+ - --port
+ - "8000"
+ - --served-model-name
+ - "meta-llama/Llama-3.1-8B-Instruct"
+
+ - --block-size=128
+ - --max-num-seqs=256
+ - --max-seq-len-to-capture=2048
+ - --max-model-len=2048
+ - --max-num-batched-token=16000
+ env:
+ - name: OMPI_MCA_btl_vader_single_copy_mechanism
+ value: none
+ - name: HABANA_LOGS
+ value: /tmp/habana_logs
+ - name: VLLM_SKIP_WARMUP
+ value: "true"
+ - name: DO_NOT_TRACK
+ value: "1"
+ - name: VLLM_USE_V1
+ value: "1"
+ - name: DP_SIZE
+ value: "1"
+ - name: TP_SIZE
+ value: "1"
+ - name: DP_SIZE_LOCAL
+ value: "1"
+
+ - name: HF_HUB_CACHE
+ value: /model-cache/
+ - name: HF_TOKEN
+ valueFrom:
+ secretKeyRef:
+ name: llm-d-hf-token
+ key: HF_TOKEN
+ ports:
+ - containerPort: 8200
+ protocol: TCP
+
+ resources:
+ limits:
+ habana.ai/gaudi: "1"
+ requests:
+ habana.ai/gaudi: "1"
+
+ volumeMounts:
+ - name: model-storage
+ mountPath: /model-cache
diff --git a/examples/output-pd.yaml b/examples/output-pd.yaml
index 3a9294f8..a5ecd9ba 100644
--- a/examples/output-pd.yaml
+++ b/examples/output-pd.yaml
@@ -6,7 +6,7 @@ kind: ServiceAccount
metadata:
name: pd-llm-d-modelservice
labels:
- helm.sh/chart: llm-d-modelservice-v0.3.4
+ helm.sh/chart: llm-d-modelservice-v0.3.5
app.kubernetes.io/version: "v0.2.0"
app.kubernetes.io/managed-by: Helm
---
@@ -16,7 +16,7 @@ kind: Deployment
metadata:
name: pd-llm-d-modelservice-decode
labels:
- helm.sh/chart: llm-d-modelservice-v0.3.4
+ helm.sh/chart: llm-d-modelservice-v0.3.5
app.kubernetes.io/version: "v0.2.0"
app.kubernetes.io/managed-by: Helm
spec:
@@ -92,6 +92,8 @@ spec:
value: "1"
- name: TP_SIZE
value: "1"
+ - name: DP_SIZE_LOCAL
+ value: "1"
- name: HF_HOME
value: /model-cache
@@ -121,7 +123,7 @@ kind: Deployment
metadata:
name: pd-llm-d-modelservice-prefill
labels:
- helm.sh/chart: llm-d-modelservice-v0.3.4
+ helm.sh/chart: llm-d-modelservice-v0.3.5
app.kubernetes.io/version: "v0.2.0"
app.kubernetes.io/managed-by: Helm
spec:
@@ -180,6 +182,8 @@ spec:
value: "1"
- name: TP_SIZE
value: "1"
+ - name: DP_SIZE_LOCAL
+ value: "1"
- name: HF_HOME
value: /model-cache
diff --git a/examples/output-pvc-hf.yaml b/examples/output-pvc-hf.yaml
index cb3d97b9..76c318cb 100644
--- a/examples/output-pvc-hf.yaml
+++ b/examples/output-pvc-hf.yaml
@@ -6,7 +6,7 @@ kind: ServiceAccount
metadata:
name: pvc-hf-llm-d-modelservice
labels:
- helm.sh/chart: llm-d-modelservice-v0.3.4
+ helm.sh/chart: llm-d-modelservice-v0.3.5
app.kubernetes.io/version: "v0.2.0"
app.kubernetes.io/managed-by: Helm
---
@@ -16,7 +16,7 @@ kind: Deployment
metadata:
name: pvc-hf-llm-d-modelservice-decode
labels:
- helm.sh/chart: llm-d-modelservice-v0.3.4
+ helm.sh/chart: llm-d-modelservice-v0.3.5
app.kubernetes.io/version: "v0.2.0"
app.kubernetes.io/managed-by: Helm
spec:
@@ -92,6 +92,8 @@ spec:
value: "1"
- name: TP_SIZE
value: "1"
+ - name: DP_SIZE_LOCAL
+ value: "1"
- name: HF_HUB_CACHE
value: /model-cache/path/to/hf_hub_cache
@@ -121,7 +123,7 @@ kind: Deployment
metadata:
name: pvc-hf-llm-d-modelservice-prefill
labels:
- helm.sh/chart: llm-d-modelservice-v0.3.4
+ helm.sh/chart: llm-d-modelservice-v0.3.5
app.kubernetes.io/version: "v0.2.0"
app.kubernetes.io/managed-by: Helm
spec:
@@ -180,6 +182,8 @@ spec:
value: "1"
- name: TP_SIZE
value: "1"
+ - name: DP_SIZE_LOCAL
+ value: "1"
- name: HF_HUB_CACHE
value: /model-cache/path/to/hf_hub_cache
diff --git a/examples/output-pvc.yaml b/examples/output-pvc.yaml
index 2a7dd867..0e5ff307 100644
--- a/examples/output-pvc.yaml
+++ b/examples/output-pvc.yaml
@@ -6,7 +6,7 @@ kind: ServiceAccount
metadata:
name: pvc-llm-d-modelservice
labels:
- helm.sh/chart: llm-d-modelservice-v0.3.4
+ helm.sh/chart: llm-d-modelservice-v0.3.5
app.kubernetes.io/version: "v0.2.0"
app.kubernetes.io/managed-by: Helm
---
@@ -16,7 +16,7 @@ kind: Deployment
metadata:
name: pvc-llm-d-modelservice-decode
labels:
- helm.sh/chart: llm-d-modelservice-v0.3.4
+ helm.sh/chart: llm-d-modelservice-v0.3.5
app.kubernetes.io/version: "v0.2.0"
app.kubernetes.io/managed-by: Helm
spec:
@@ -92,6 +92,8 @@ spec:
value: "1"
- name: TP_SIZE
value: "1"
+ - name: DP_SIZE_LOCAL
+ value: "1"
ports:
- containerPort: 8200
@@ -119,7 +121,7 @@ kind: Deployment
metadata:
name: pvc-llm-d-modelservice-prefill
labels:
- helm.sh/chart: llm-d-modelservice-v0.3.4
+ helm.sh/chart: llm-d-modelservice-v0.3.5
app.kubernetes.io/version: "v0.2.0"
app.kubernetes.io/managed-by: Helm
spec:
@@ -178,6 +180,8 @@ spec:
value: "1"
- name: TP_SIZE
value: "1"
+ - name: DP_SIZE_LOCAL
+ value: "1"
ports:
- containerPort: 8000
diff --git a/examples/output-requester.yaml b/examples/output-requester.yaml
index 29456785..28339e62 100644
--- a/examples/output-requester.yaml
+++ b/examples/output-requester.yaml
@@ -6,7 +6,7 @@ kind: ServiceAccount
metadata:
name: requester-llm-d-modelservice
labels:
- helm.sh/chart: llm-d-modelservice-v0.3.4
+ helm.sh/chart: llm-d-modelservice-v0.3.5
app.kubernetes.io/version: "v0.2.0"
app.kubernetes.io/managed-by: Helm
---
@@ -81,6 +81,8 @@ spec:
value: "1"
- name: TP_SIZE
value: "1"
+ - name: DP_SIZE_LOCAL
+ value: "1"
- name: HF_HOME
value: /model-cache
@@ -138,7 +140,7 @@ kind: Deployment
metadata:
name: requester-llm-d-modelservice-prefill
labels:
- helm.sh/chart: llm-d-modelservice-v0.3.4
+ helm.sh/chart: llm-d-modelservice-v0.3.5
app.kubernetes.io/version: "v0.2.0"
app.kubernetes.io/managed-by: Helm
spec:
@@ -197,6 +199,8 @@ spec:
value: "1"
- name: TP_SIZE
value: "1"
+ - name: DP_SIZE_LOCAL
+ value: "1"
- name: HF_HOME
value: /model-cache
diff --git a/examples/output-xpu-pd.yaml b/examples/output-xpu-pd.yaml
new file mode 100644
index 00000000..2ce8dd30
--- /dev/null
+++ b/examples/output-xpu-pd.yaml
@@ -0,0 +1,271 @@
+# generated by generate-example-output.sh
+---
+# Source: llm-d-modelservice/templates/serviceaccount.yaml
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+ name: xpu-pd-llm-d-modelservice
+ labels:
+ helm.sh/chart: llm-d-modelservice-v0.3.5
+ app.kubernetes.io/version: "v0.2.0"
+ app.kubernetes.io/managed-by: Helm
+---
+# Source: llm-d-modelservice/templates/decode-deployment.yaml
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+ name: xpu-pd-llm-d-modelservice-decode
+ labels:
+ helm.sh/chart: llm-d-modelservice-v0.3.5
+ app.kubernetes.io/version: "v0.2.0"
+ app.kubernetes.io/managed-by: Helm
+spec:
+ replicas: 1
+ selector:
+ matchLabels:
+ llm-d.ai/inferenceServing: "true"
+ llm-d.ai/model: microsoft-dialogpt-large
+ llm-d.ai/role: decode
+ template:
+ metadata:
+ labels:
+ llm-d.ai/inferenceServing: "true"
+ llm-d.ai/model: microsoft-dialogpt-large
+ llm-d.ai/role: decode
+ spec:
+ initContainers:
+ - name: routing-proxy
+ args:
+ - --port=8000
+ - --vllm-port=8200
+ - --connector=nixlv2
+ - -v=5
+ - --secure-proxy=false
+ image: ghcr.io/llm-d/llm-d-routing-sidecar:v0.2.0
+ imagePullPolicy: Always
+ ports:
+ - containerPort: 8000
+ resources: {}
+ restartPolicy: Always
+ securityContext:
+ allowPrivilegeEscalation: false
+ runAsNonRoot: true
+
+ serviceAccountName: xpu-pd-llm-d-modelservice
+
+ affinity:
+ nodeAffinity:
+ requiredDuringSchedulingIgnoredDuringExecution:
+ nodeSelectorTerms:
+ - matchExpressions:
+ - key: accelerator
+ operator: In
+ values:
+ - intel-xpu
+ volumes:
+ - emptyDir: {}
+ name: metrics-volume
+ - name: model-storage
+ emptyDir:
+ sizeLimit: 10Gi
+
+ containers:
+ - name: vllm
+ image: ghcr.io/llm-d/llm-d-xpu:v0.2.0
+ imagePullPolicy: Never
+
+ command:
+ - python3
+ - -m
+ - vllm.entrypoints.openai.api_server
+ args:
+ - --model
+ - microsoft/DialoGPT-large
+ - --enforce-eager
+ - --tensor-parallel-size
+ - "1"
+ - --port
+ - "8200"
+ - --host
+ - 0.0.0.0
+ - --kv-transfer-config
+ - '{"kv_connector":"NixlConnector", "kv_role":"kv_consumer"}'
+ env:
+ - name: ZE_AFFINITY_MASK
+ value: "0"
+ - name: ZE_ENABLE_PCI_ID_DEVICE_ORDER
+ value: "1"
+ - name: VLLM_NIXL_SIDE_CHANNEL_HOST
+ valueFrom:
+ fieldRef:
+ fieldPath: status.podIP
+ - name: VLLM_NIXL_SIDE_CHANNEL_PORT
+ value: "5557"
+ - name: VLLM_LOGGING_LEVEL
+ value: DEBUG
+ - name: TORCH_LLM_ALLREDUCE
+ value: "1"
+ - name: VLLM_USE_V1
+ value: "1"
+ - name: CCL_ZE_IPC_EXCHANGE
+ value: pidfd
+ - name: VLLM_ALLOW_LONG_MAX_MODEL_LEN
+ value: "1"
+ - name: VLLM_WORKER_MULTIPROC_METHOD
+ value: spawn
+ - name: DP_SIZE
+ value: "1"
+ - name: TP_SIZE
+ value: "1"
+ - name: DP_SIZE_LOCAL
+ value: "1"
+
+ - name: HF_HOME
+ value: /model-cache
+
+ - name: VLLM_USE_V1
+ value: "1"
+ - name: TORCH_LLM_ALLREDUCE
+ value: "1"
+ - name: VLLM_WORKER_MULTIPROC_METHOD
+ value: "spawn"
+ ports:
+ - containerPort: 8200
+ protocol: TCP
+ - containerPort: 5557
+ protocol: TCP
+
+ resources:
+ limits:
+ cpu: "8"
+ gpu.intel.com/i915: "1"
+ memory: 24Gi
+ requests:
+ cpu: "4"
+ gpu.intel.com/i915: "1"
+ memory: 12Gi
+
+ volumeMounts:
+ - name: model-storage
+ mountPath: /model-cache
+---
+# Source: llm-d-modelservice/templates/prefill-deployment.yaml
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+ name: xpu-pd-llm-d-modelservice-prefill
+ labels:
+ helm.sh/chart: llm-d-modelservice-v0.3.5
+ app.kubernetes.io/version: "v0.2.0"
+ app.kubernetes.io/managed-by: Helm
+spec:
+ replicas: 1
+ selector:
+ matchLabels:
+ llm-d.ai/inferenceServing: "true"
+ llm-d.ai/model: microsoft-dialogpt-large
+ llm-d.ai/role: prefill
+ template:
+ metadata:
+ labels:
+ llm-d.ai/inferenceServing: "true"
+ llm-d.ai/model: microsoft-dialogpt-large
+ llm-d.ai/role: prefill
+ spec:
+
+ serviceAccountName: xpu-pd-llm-d-modelservice
+
+ affinity:
+ nodeAffinity:
+ requiredDuringSchedulingIgnoredDuringExecution:
+ nodeSelectorTerms:
+ - matchExpressions:
+ - key: accelerator
+ operator: In
+ values:
+ - intel-xpu
+ volumes:
+ - emptyDir: {}
+ name: metrics-volume
+ - name: model-storage
+ emptyDir:
+ sizeLimit: 10Gi
+
+ containers:
+ - name: vllm
+ image: ghcr.io/llm-d/llm-d-xpu:v0.2.0
+
+ command:
+ - python3
+ - -m
+ - vllm.entrypoints.openai.api_server
+ args:
+ - --model
+ - microsoft/DialoGPT-large
+ - --enforce-eager
+ - --tensor-parallel-size
+ - "1"
+ - --port
+ - "8000"
+ - --host
+ - 0.0.0.0
+ - --kv-transfer-config
+ - '{"kv_connector":"NixlConnector", "kv_role":"kv_producer"}'
+ env:
+ - name: ZE_AFFINITY_MASK
+ value: "1"
+ - name: ZE_ENABLE_PCI_ID_DEVICE_ORDER
+ value: "1"
+ - name: VLLM_NIXL_SIDE_CHANNEL_PORT
+ value: "5557"
+ - name: VLLM_NIXL_SIDE_CHANNEL_HOST
+ valueFrom:
+ fieldRef:
+ fieldPath: status.podIP
+ - name: VLLM_LOGGING_LEVEL
+ value: DEBUG
+ - name: TORCH_LLM_ALLREDUCE
+ value: "1"
+ - name: VLLM_USE_V1
+ value: "1"
+ - name: CCL_ZE_IPC_EXCHANGE
+ value: pidfd
+ - name: VLLM_ALLOW_LONG_MAX_MODEL_LEN
+ value: "1"
+ - name: VLLM_WORKER_MULTIPROC_METHOD
+ value: spawn
+ - name: DP_SIZE
+ value: "1"
+ - name: TP_SIZE
+ value: "1"
+ - name: DP_SIZE_LOCAL
+ value: "1"
+
+ - name: HF_HOME
+ value: /model-cache
+
+ - name: VLLM_USE_V1
+ value: "1"
+ - name: TORCH_LLM_ALLREDUCE
+ value: "1"
+ - name: VLLM_WORKER_MULTIPROC_METHOD
+ value: "spawn"
+ ports:
+ - containerPort: 8000
+ protocol: TCP
+ - containerPort: 5557
+ protocol: TCP
+
+ resources:
+ limits:
+ cpu: "16"
+ gpu.intel.com/i915: "1"
+ memory: 32Gi
+ requests:
+ cpu: "8"
+ gpu.intel.com/i915: "1"
+ memory: 16Gi
+
+ volumeMounts:
+ - name: model-storage
+ mountPath: /model-cache
diff --git a/examples/output-xpu.yaml b/examples/output-xpu.yaml
new file mode 100644
index 00000000..148fbc61
--- /dev/null
+++ b/examples/output-xpu.yaml
@@ -0,0 +1,109 @@
+# generated by generate-example-output.sh
+---
+# Source: llm-d-modelservice/templates/serviceaccount.yaml
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+ name: xpu-llm-d-modelservice
+ labels:
+ helm.sh/chart: llm-d-modelservice-v0.3.5
+ app.kubernetes.io/version: "v0.2.0"
+ app.kubernetes.io/managed-by: Helm
+---
+# Source: llm-d-modelservice/templates/decode-deployment.yaml
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+ name: xpu-llm-d-modelservice-decode
+ labels:
+ helm.sh/chart: llm-d-modelservice-v0.3.5
+ app.kubernetes.io/version: "v0.2.0"
+ app.kubernetes.io/managed-by: Helm
+spec:
+ replicas: 1
+ selector:
+ matchLabels:
+ llm-d.ai/inferenceServing: "true"
+ llm-d.ai/model: deepseek-ai-deepSeek-r1-distill-qwen-1-5B
+ llm-d.ai/role: decode
+ template:
+ metadata:
+ labels:
+ llm-d.ai/inferenceServing: "true"
+ llm-d.ai/model: deepseek-ai-deepSeek-r1-distill-qwen-1-5B
+ llm-d.ai/role: decode
+ spec:
+
+
+ serviceAccountName: xpu-llm-d-modelservice
+
+ affinity:
+ nodeAffinity:
+ requiredDuringSchedulingIgnoredDuringExecution:
+ nodeSelectorTerms:
+ - matchExpressions:
+ - key: accelerator
+ operator: In
+ values:
+ - intel-xpu
+ - intel-gpu-max
+ volumes:
+ - emptyDir: {}
+ name: metrics-volume
+ - name: model-storage
+ emptyDir:
+ sizeLimit: 10Gi
+
+ containers:
+ - name: vllm
+ image: ghcr.io/llm-d/llm-d-xpu:v0.2.0
+
+ args:
+ - --model
+ - deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+ - --port
+ - "8000"
+ - --served-model-name
+ - "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
+
+ - --enforce-eager
+ - --dtype
+ - float16
+ - --disable-sliding-window
+ - --gpu-memory-util
+ - "0.9"
+ - --no-enable-prefix-caching
+ - --max-num-batched-tokens
+ - "4096"
+ - --disable-log-requests
+ - --max-model-len
+ - "4096"
+ - --block-size
+ - "64"
+ env:
+ - name: DP_SIZE
+ value: "1"
+ - name: TP_SIZE
+ value: "1"
+ - name: DP_SIZE_LOCAL
+ value: "1"
+
+ - name: HF_HOME
+ value: /model-cache
+ ports:
+ - containerPort: 8200
+ protocol: TCP
+
+ resources:
+ limits:
+ cpu: "8"
+ gpu.intel.com/xe: "1"
+ memory: 24Gi
+ requests:
+ cpu: "4"
+ gpu.intel.com/xe: "1"
+ memory: 12Gi
+
+ volumeMounts:
+ - name: model-storage
+ mountPath: /model-cache
diff --git a/examples/values-cpu.yaml b/examples/values-cpu.yaml
index c79a7e9f..561d6cdf 100644
--- a/examples/values-cpu.yaml
+++ b/examples/values-cpu.yaml
@@ -20,6 +20,9 @@ routing:
proxy:
secure: false
+accelerator:
+ type: cpu
+
# Decode pod configuration
decode:
replicas: 1
diff --git a/examples/values-pd.yaml b/examples/values-pd.yaml
index 7c1f8355..18097ee5 100644
--- a/examples/values-pd.yaml
+++ b/examples/values-pd.yaml
@@ -57,11 +57,9 @@ decode:
limits:
memory: 16Gi
cpu: "16"
- nvidia.com/gpu: "1"
requests:
cpu: "16"
memory: 16Gi
- nvidia.com/gpu: "1"
mountModelVolume: true
# Prefill pod configuration
@@ -98,9 +96,7 @@ prefill:
limits:
memory: 16Gi
cpu: "16"
- nvidia.com/gpu: "1"
requests:
cpu: "16"
memory: 16Gi
- nvidia.com/gpu: "1"
mountModelVolume: true
diff --git a/examples/values-xpu-pd.yaml b/examples/values-xpu-pd.yaml
index 74e4f8b2..42006cae 100644
--- a/examples/values-xpu-pd.yaml
+++ b/examples/values-xpu-pd.yaml
@@ -145,11 +145,9 @@ prefill:
limits:
memory: 32Gi
cpu: "16"
- gpu.intel.com/i915: "1"
requests:
cpu: "8"
memory: 16Gi
- gpu.intel.com/i915: "1"
mountModelVolume: true
acceleratorTypes:
diff --git a/examples/values-xpu.yaml b/examples/values-xpu.yaml
index 82719ebf..6b71b07d 100644
--- a/examples/values-xpu.yaml
+++ b/examples/values-xpu.yaml
@@ -53,11 +53,9 @@ decode:
limits:
memory: 24Gi
cpu: "8"
- gpu.intel.com/xe: "1"
requests:
cpu: "4"
memory: 12Gi
- gpu.intel.com/xe: "1"
mountModelVolume: true
diff --git a/hack/generate-example-output.sh b/hack/generate-example-output.sh
index 8646f406..3b852c03 100755
--- a/hack/generate-example-output.sh
+++ b/hack/generate-example-output.sh
@@ -19,9 +19,21 @@ generate_output() {
# Generate output-cpu.yaml (Simulated CPU deployment)
generate_output "cpu-sim" "examples/values-cpu.yaml" "examples/output-cpu.yaml"
+# Generate output-dra.yaml (Dynamic Resource Allocation deployment for Intel B50 GPU device)
+generate_output "dra" "examples/values-dra.yaml" "examples/output-dra.yaml" '--set modelArtifacts.uri=pvc+hf://model-pvc/meta-llama/Llama-3.1-8B-Instruct'
+
+# Generate output-gaudi.yaml (Intel Gaudi)
+generate_output "gaudi" "examples/values-gaudi.yaml" "examples/output-gaudi.yaml"
+
# Generate output-pd.yaml (PD deployment)
generate_output "pd" "examples/values-pd.yaml" "examples/output-pd.yaml"
+# Generate output-xpu-pd.yaml (Intel i915)
+generate_output "xpu-pd" "examples/values-xpu-pd.yaml" "examples/output-xpu-pd.yaml"
+
+# Generate output-xpu.yaml (Intel Xe GPU and affinity)
+generate_output "xpu" "examples/values-xpu.yaml" "examples/output-xpu.yaml"
+
# Generate output-requester.yaml (Requester deployment)
generate_output "requester" "examples/values-requester.yaml" "examples/output-requester.yaml"
@@ -30,6 +42,3 @@ generate_output "pvc" "examples/values-pd.yaml" "examples/output-pvc.yaml" '--se
# Generate output-pvc-hf.yaml (PVC HuggingFace model deployment)
generate_output "pvc-hf" "examples/values-pd.yaml" "examples/output-pvc-hf.yaml" '--set modelArtifacts.uri=pvc+hf://pvc-name/path/to/hf_hub_cache/facebook/opt-125m'
-
-# Generate output-dra.yaml (Dynamic Resource Allocation deployment for Intel B50 GPU device)
-generate_output "dra" "examples/values-dra.yaml" "examples/output-dra.yaml" '--set modelArtifacts.uri=pvc+hf://model-pvc/meta-llama/Llama-3.1-8B-Instruct'