diff --git a/README.md b/README.md index e0e2446e..3d907919 100644 --- a/README.md +++ b/README.md @@ -58,37 +58,40 @@ Check Helm's [official docs](https://helm.sh/docs/intro/using_helm/) for more gu ## Values Below are the values you can set. -| Key | Description | Type | Default | -|----------------------------------------|-------------------------------------------------------------------------------------------------------------------|--------------|---------------------------------------------| -| `modelArtifacts.name` | name of model in the form namespace/modelId. Required. | string | N/A | -| `modelArtifacts.uri` | Model artifacts URI. Current formats supported include `hf://`, `pvc://`, and `oci://` | string | N/A | -| `modelArtifacts.size` | Size used to create an emptyDir volume for downloading the model. | string | N/A | -| `modelArtifacts.authSecretName` | The name of the Secret containing `HF_TOKEN` for `hf://` artifacts that require a token for downloading a model. | string | N/A | -| `modelArtifacts.mountPath` | Path to mount the volume created to store models | string | /model-cache | -| `multinode` | Determines whether to create P/D using Deployments (false) or LeaderWorkerSets (true) | bool | `false` | -| `routing.servicePort` | The port the routing proxy sidecar listens on.
If there is no sidecar, this is the port the request goes to. | int | N/A | -| `routing.proxy.image` | Image used for the sidecar | string | `ghcr.io/llm-d/llm-d-routing-sidecar:0.0.6` | -| `routing.proxy.targetPort` | The port the vLLM decode container listens on.
If proxy is present, it will forward request to this port. | string | N/A | -| `routing.proxy.debugLevel` | Debug level of the routing proxy | int | 5 | -| `routing.proxy.parentRefs[*].name` | The name of the inference gateway | string | N/A | -| `decode.create` | If true, creates decode Deployment or LeaderWorkerSet | List | `true` | -| `decode.annotations` | Annotations that should be added to the Deployment or LeaderWorkerSet | Dict | {} | -| `decode.tolerations` | Tolerations that should be added to the Deployment or LeaderWorkerSet | List | [] | -| `decode.replicas` | Number of replicas for decode pods | int | 1 | -| `decode.extraConfig` | Extra pod configuration | dict | {} | -| `decode.containers[*].name` | Name of the container for the decode deployment/LWS | string | N/A | -| `decode.containers[*].image` | Image of the container for the decode deployment/LWS | string | N/A | -| `decode.containers[*].args` | List of arguments for the decode container. | List[string] | [] | -| `decode.containers[*].modelCommand` | Nature of the command. One of `vllmServe`, `imageDefault` or `custom` | string | `imageDefault` | -| `decode.containers[*].command` | List of commands for the decode container. | List[string] | [] | -| `decode.containers[*].ports` | List of ports for the decode container. | List[Port] | [] | -| `decode.containers[*].extraConfig` | Extra container configuration | dict | {} | -| `decode.parallelism.data` | Amount of data parallelism | int | 1 | -| `decode.parallelism.tensor` | Amount of tensor parallelism | int | 1 | -| `decode.acceleratorTypes.labelKey` | Key of label on node that identifies the hosted GPU type | string | N/A | -| `decode.acceleratorTypes.labelValue` | Value of label on node that identifies type of hosted GPU | string | N/A | -| `prefill` | Same fields supported in `decode` | See above | See above | -| `extraObjects` | Additional Kubernetes objects to be deployed alongside the main application | List | [] | +| Key | Description | Type | Default | +|----------------------------------------|-------------------------------------------------------------------------------------------------------------------|-----------------|---------------------------------------------| +| `modelArtifacts.name` | name of model in the form namespace/modelId. Required. | string | N/A | +| `modelArtifacts.uri` | Model artifacts URI. Current formats supported include `hf://`, `pvc://`, and `oci://` | string | N/A | +| `modelArtifacts.size` | Size used to create an emptyDir volume for downloading the model. | string | N/A | +| `modelArtifacts.authSecretName` | The name of the Secret containing `HF_TOKEN` for `hf://` artifacts that require a token for downloading a model. | string | N/A | +| `modelArtifacts.mountPath` | Path to mount the volume created to store models | string | /model-cache | +| `multinode` | Determines whether to create P/D using Deployments (false) or LeaderWorkerSets (true) | bool | `false` | +| `routing.servicePort` | The port the routing proxy sidecar listens on.
If there is no sidecar, this is the port the request goes to. | int | N/A | +| `routing.proxy.image` | Image used for the sidecar | string | `ghcr.io/llm-d/llm-d-routing-sidecar:0.0.6` | +| `routing.proxy.targetPort` | The port the vLLM decode container listens on.
If proxy is present, it will forward requests to this port. | string | N/A | +| `routing.proxy.debugLevel` | Debug level of the routing proxy | int | 5 | +| `routing.proxy.parentRefs[*].name` | The name of the inference gateway | string | N/A | +| `decode.create` | If true, creates decode Deployment or LeaderWorkerSet | bool | `true` | +| `decode.annotations` | Annotations that should be added to the Deployment or LeaderWorkerSet | Dict | {} | +| `decode.tolerations` | Tolerations that should be added to the Deployment or LeaderWorkerSet | List | [] | +| `decode.replicas` | Number of replicas for decode pods | int | 1 | +| `decode.extraConfig` | Extra pod configuration | dict | {} | +| `decode.containers[*].name` | Name of the container for the decode deployment/LWS | string | N/A | +| `decode.containers[*].image` | Image of the container for the decode deployment/LWS | string | N/A | +| `decode.containers[*].args` | List of arguments for the decode container. | List[string] | [] | +| `decode.containers[*].modelCommand` | Nature of the command. One of `vllmServe`, `imageDefault` or `custom` | string | `imageDefault` | +| `decode.containers[*].command` | List of commands for the decode container. | List[string] | [] | +| `decode.containers[*].ports` | List of ports for the decode container. | List[Port] | [] | +| `decode.containers[*].extraConfig` | Extra container configuration | dict | {} | +| `decode.initContainers` | List of initContainers that should be added (in addition to the routing proxy, if enabled) | List[Container] | N/A | +| `decode.parallelism.tensor` | Amount of tensor parallelism | int | 1 | +| `decode.parallelism.data` | Amount of data parallelism | int | 1 | +| `decode.parallelism.dataLocal` | Amount of data parallelism local to each worker | int | 1 | +| `decode.parallelism.workers` | Number of workers over which data parallelism is implemented | int | 1 | +| `decode.acceleratorTypes.labelKey` | Key of label on node that identifies the hosted GPU type | string | N/A | +| `decode.acceleratorTypes.labelValue` | Value of label on node that identifies type of hosted GPU | string | N/A | +| `prefill` | Same fields supported in `decode` | See above | See above | +| `extraObjects` | Additional Kubernetes objects to be deployed alongside the main application | List | [] | ## Contribute diff --git a/charts/llm-d-modelservice/Chart.yaml b/charts/llm-d-modelservice/Chart.yaml index bec901fd..87359da7 100644 --- a/charts/llm-d-modelservice/Chart.yaml +++ b/charts/llm-d-modelservice/Chart.yaml @@ -13,7 +13,7 @@ type: application # This is the chart version. This version number should be incremented each time you make changes # to the chart and its templates, including the app version. # Versions are expected to follow Semantic Versioning (https://semver.org/) -version: "v0.3.4" +version: "v0.3.5" # This is the version number of the application being deployed. This version number should be # incremented each time you make changes to the application. Versions are not expected to # follow Semantic Versioning. They should reflect the version the application is using.
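To make the new README fields concrete, here is a minimal, hypothetical `values.yaml` sketch using the `parallelism.dataLocal`, `parallelism.workers`, and `initContainers` keys documented in the table above. The model name, image, and sizes are placeholders, not values taken from this repository.

```yaml
# Illustrative only -- model, image, and sizes are placeholders.
modelArtifacts:
  name: example-org/example-model          # namespace/modelId
  uri: hf://example-org/example-model
  size: 50Gi

multinode: true                            # render LeaderWorkerSets instead of Deployments

decode:
  replicas: 1
  parallelism:
    tensor: 2        # tensor parallelism per rank
    dataLocal: 4     # data-parallel ranks local to each worker
    workers: 2       # workers per replica; total data parallelism = dataLocal * workers = 8
  initContainers:    # added in addition to the routing proxy, if enabled
    - name: prefetch-model
      image: busybox:1.36
      command: ["sh", "-c", "echo prefetching model artifacts"]
```

Under the helpers introduced in the `_helpers.tpl` changes below, these values would be expected to resolve to a LeaderWorkerSet of size 2 per replica, with dataLocal × tensor = 8 accelerators requested per worker.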
diff --git a/charts/llm-d-modelservice/templates/_helpers.tpl b/charts/llm-d-modelservice/templates/_helpers.tpl index ba6d287e..3477979b 100644 --- a/charts/llm-d-modelservice/templates/_helpers.tpl +++ b/charts/llm-d-modelservice/templates/_helpers.tpl @@ -122,14 +122,87 @@ initContainers: {{- end }} {{- end }} -{{/* Desired P/D tensor parallelism -- user set or defaults to 1 */}} +{{/* Desired tensor parallelism -- +- if tensor set, return it +- else return 1 +*/}} {{- define "llm-d-modelservice.tensorParallelism" -}} -{{- if and . .tensor }}{{ .tensor }}{{ else }}1{{ end }} +{{- if and . .tensor -}} +{{ .tensor }} +{{- else -}} +1 +{{- end -}} {{- end }} -{{/* Desired P/D data parallelism -- user set or defaults to 1 */}} +{{/* +Desired data parallelism -- +- if data set, return it +- else if dataLocal and workers set, return dataLocal * workers +- else if dataLocal set, return dataLocal (w = 1) +- else return 1 (dpl = 1, w = 1) +*/}} {{- define "llm-d-modelservice.dataParallelism" -}} -{{- if and . .data }}{{ .data }}{{ else }}1{{ end }} +{{- if and . .data -}} +{{ .data }} +{{- else if and . .dataLocal .workers -}} +{{ mul .dataLocal .workers }} +{{- else if and . .dataLocal -}} +{{ .dataLocal }} +{{- else -}} +1 +{{- end -}} +{{- end }} + +{{/* +Desired data local parallelism -- +- if dataLocal set, return it +- else if data and workers set, return data / workers +- else if data set, return data (w = 1) +- else return 1 (dp = 1, w = 1) +*/}} +{{- define "llm-d-modelservice.dataLocalParallelism" -}} +{{- if and . .dataLocal -}} +{{ .dataLocal }} +{{- else if and . .data .workers -}} +{{ $result := div (int .data) (int .workers) }} +{{- if ne (int .data) (mul $result .workers) -}} +{{- fail "parallelism.data must be a multiple of parallelism.workers" -}} +{{- else -}} +{{ $result }} +{{- end -}} +{{- else if and . .data -}} +{{ .data }} +{{- else -}} +1 +{{- end -}} +{{- end }} + +{{/* +Desired number of workers -- +- if workers set, return it +- else if data and dataLocal set, return data / dataLocal +- else return 1 (dp = 1, dpl = 1) +*/}} +{{- define "llm-d-modelservice.numWorkers" -}} +{{- if and . .workers -}} +{{ .workers }} +{{- else if and . .data .dataLocal -}} +{{ $result := div (int .data) (int .dataLocal) }} +{{- if ne (int .data) (mul $result .dataLocal) -}} +{{- fail "parallelism.data must be a multiple of parallelism.dataLocal" -}} +{{- else -}} +{{ $result }} +{{- end -}} +{{- else -}} +1 +{{- end -}} +{{- end }} + +{{/* +Required number of GPU per worker -- dpl * tp +*/}} +{{- define "llm-d-modelservice.numGpuPerWorker" -}} +{{ mul (include "llm-d-modelservice.dataLocalParallelism" .) (include "llm-d-modelservice.tensorParallelism" .) }} {{- end }} {{/* @@ -172,21 +245,21 @@ nvidia.com/gpu {{/* P/D deployment container resources */}} {{- define "llm-d-modelservice.resources" -}} -{{- $tensorParallelism := int (include "llm-d-modelservice.tensorParallelism" .parallelism) -}} +{{- $numGpus := int (include "llm-d-modelservice.numGpuPerWorker" .parallelism) -}} {{- $acceleratorResource := include "llm-d-modelservice.acceleratorResource" . 
-}} {{- $limits := dict }} {{- if and .resources .resources.limits }} {{- $limits = deepCopy .resources.limits }} {{- end }} -{{- if and (ge (int $tensorParallelism) 1) (ne $acceleratorResource "") }} -{{- $limits = mergeOverwrite $limits (dict $acceleratorResource (toString $tensorParallelism)) }} +{{- if and (ge (int $numGpus) 1) (ne $acceleratorResource "") }} +{{- $limits = mergeOverwrite $limits (dict $acceleratorResource (toString $numGpus)) }} {{- end }} {{- $requests := dict }} {{- if and .resources .resources.requests }} {{- $requests = deepCopy .resources.requests }} {{- end }} -{{- if and (ge (int $tensorParallelism) 1) (ne $acceleratorResource "") }} -{{- $requests = mergeOverwrite $requests (dict $acceleratorResource (toString $tensorParallelism)) }} +{{- if and (ge (int $numGpus) 1) (ne $acceleratorResource "") }} +{{- $requests = mergeOverwrite $requests (dict $acceleratorResource (toString $numGpus)) }} {{- end }} resources: limits: @@ -417,6 +490,16 @@ args: - --tensor-parallel-size - {{ $tensorParallelism | quote }} {{- end }} + {{- $dataParallelism := int (include "llm-d-modelservice.dataParallelism" .parallelism) -}} + {{- if gt (int $dataParallelism) 1 }} + - --data-parallel-size + - {{ $dataParallelism | quote }} + {{- end }} + {{- $dataLocalParallelism := int (include "llm-d-modelservice.dataLocalParallelism" .parallelism) -}} + {{- if gt (int $dataLocalParallelism) 1 }} + - --data-parallel-size-local + - {{ $dataLocalParallelism | quote }} + {{- end }} - --served-model-name - {{ .Values.modelArtifacts.name | quote }} {{- with .container.args }} @@ -435,6 +518,16 @@ args: - --tensor-parallel-size - {{ $tensorParallelism | quote }} {{- end }} + {{- $dataParallelism := int (include "llm-d-modelservice.dataParallelism" .parallelism) -}} + {{- if gt (int $dataParallelism) 1 }} + - --data-parallel-size + - {{ $dataParallelism | quote }} + {{- end }} + {{- $dataLocalParallelism := int (include "llm-d-modelservice.dataLocalParallelism" .parallelism) -}} + {{- if gt (int $dataLocalParallelism) 1 }} + - --data-parallel-size-local + - {{ $dataLocalParallelism | quote }} + {{- end }} - --served-model-name - {{ .Values.modelArtifacts.name | quote }} {{- with .container.args }} @@ -516,4 +609,6 @@ context is a dict with helm root context plus: value: {{ include "llm-d-modelservice.dataParallelism" .parallelism | quote }} - name: TP_SIZE value: {{ include "llm-d-modelservice.tensorParallelism" .parallelism | quote }} +- name: DP_SIZE_LOCAL + value: {{ include "llm-d-modelservice.dataLocalParallelism" .parallelism | quote }} {{- end }} {{- /* define "llm-d-modelservice.parallelismEnv" */}} diff --git a/charts/llm-d-modelservice/templates/decode-lws.yaml b/charts/llm-d-modelservice/templates/decode-lws.yaml index d6b28555..2172169b 100644 --- a/charts/llm-d-modelservice/templates/decode-lws.yaml +++ b/charts/llm-d-modelservice/templates/decode-lws.yaml @@ -20,7 +20,7 @@ spec: replicas: {{ ternary .Values.decode.replicas 1 (hasKey .Values.decode "replicas") }} {{- end }} leaderWorkerTemplate: - size: {{ int (include "llm-d-modelservice.dataParallelism" .Values.decode.parallelism) }} + size: {{ int (include "llm-d-modelservice.numWorkers" .Values.decode.parallelism) }} {{- if .Values.decode.subGroupPolicy }} subGroupPolicy: {{- toYaml .Values.decode.subGroupPolicy | nindent 6 }} diff --git a/charts/llm-d-modelservice/templates/prefill-lws.yaml b/charts/llm-d-modelservice/templates/prefill-lws.yaml index 94e4f3ae..3ce797c4 100644 --- 
a/charts/llm-d-modelservice/templates/prefill-lws.yaml +++ b/charts/llm-d-modelservice/templates/prefill-lws.yaml @@ -20,7 +20,7 @@ spec: replicas: {{ ternary .Values.prefill.replicas 1 (hasKey .Values.prefill "replicas") }} {{- end }} leaderWorkerTemplate: - size: {{ int (include "llm-d-modelservice.dataParallelism" .Values.prefill.parallelism) }} + size: {{ int (include "llm-d-modelservice.numWorkers" .Values.prefill.parallelism) }} {{- if .Values.prefill.subGroupPolicy }} subGroupPolicy: {{- toYaml .Values.prefill.subGroupPolicy | nindent 6 }} diff --git a/examples/output-cpu.yaml b/examples/output-cpu.yaml index a7c8cf6c..3ff57c0b 100644 --- a/examples/output-cpu.yaml +++ b/examples/output-cpu.yaml @@ -6,7 +6,7 @@ kind: ServiceAccount metadata: name: cpu-sim-llm-d-modelservice labels: - helm.sh/chart: llm-d-modelservice-v0.3.4 + helm.sh/chart: llm-d-modelservice-v0.3.5 app.kubernetes.io/version: "v0.2.0" app.kubernetes.io/managed-by: Helm --- @@ -16,7 +16,7 @@ kind: Deployment metadata: name: cpu-sim-llm-d-modelservice-decode labels: - helm.sh/chart: llm-d-modelservice-v0.3.4 + helm.sh/chart: llm-d-modelservice-v0.3.5 app.kubernetes.io/version: "v0.2.0" app.kubernetes.io/managed-by: Helm spec: @@ -76,6 +76,8 @@ spec: value: "1" - name: TP_SIZE value: "1" + - name: DP_SIZE_LOCAL + value: "1" - name: HF_HOME value: /model-cache @@ -99,7 +101,7 @@ kind: Deployment metadata: name: cpu-sim-llm-d-modelservice-prefill labels: - helm.sh/chart: llm-d-modelservice-v0.3.4 + helm.sh/chart: llm-d-modelservice-v0.3.5 app.kubernetes.io/version: "v0.2.0" app.kubernetes.io/managed-by: Helm spec: @@ -142,6 +144,8 @@ spec: value: "1" - name: TP_SIZE value: "1" + - name: DP_SIZE_LOCAL + value: "1" - name: HF_HOME value: /model-cache diff --git a/examples/output-dra.yaml b/examples/output-dra.yaml index e773db26..e545b657 100644 --- a/examples/output-dra.yaml +++ b/examples/output-dra.yaml @@ -6,7 +6,7 @@ kind: ServiceAccount metadata: name: dra-llm-d-modelservice labels: - helm.sh/chart: llm-d-modelservice-v0.3.4 + helm.sh/chart: llm-d-modelservice-v0.3.5 app.kubernetes.io/version: "v0.2.0" app.kubernetes.io/managed-by: Helm --- @@ -16,7 +16,7 @@ kind: Deployment metadata: name: dra-llm-d-modelservice-decode labels: - helm.sh/chart: llm-d-modelservice-v0.3.4 + helm.sh/chart: llm-d-modelservice-v0.3.5 app.kubernetes.io/version: "v0.2.0" app.kubernetes.io/managed-by: Helm spec: @@ -78,6 +78,8 @@ spec: value: "1" - name: TP_SIZE value: "1" + - name: DP_SIZE_LOCAL + value: "1" - name: HF_HUB_CACHE value: /model-cache/ diff --git a/examples/output-gaudi.yaml b/examples/output-gaudi.yaml new file mode 100644 index 00000000..b2897a89 --- /dev/null +++ b/examples/output-gaudi.yaml @@ -0,0 +1,101 @@ +# generated by generate-example-output.sh +--- +# Source: llm-d-modelservice/templates/serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: gaudi-llm-d-modelservice + labels: + helm.sh/chart: llm-d-modelservice-v0.3.5 + app.kubernetes.io/version: "v0.2.0" + app.kubernetes.io/managed-by: Helm +--- +# Source: llm-d-modelservice/templates/decode-deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: gaudi-llm-d-modelservice-decode + labels: + helm.sh/chart: llm-d-modelservice-v0.3.5 + app.kubernetes.io/version: "v0.2.0" + app.kubernetes.io/managed-by: Helm +spec: + replicas: 1 + selector: + matchLabels: + llm-d.ai/inferenceServing: "true" + llm-d.ai/model: random_model + llm-d.ai/role: decode + template: + metadata: + labels: + llm-d.ai/inferenceServing: "true" + llm-d.ai/model: 
random_model + llm-d.ai/role: decode + spec: + + + serviceAccountName: gaudi-llm-d-modelservice + + volumes: + - emptyDir: {} + name: metrics-volume + - name: model-storage + persistentVolumeClaim: + claimName: model-pvc + readOnly: true + containers: + - name: vllm + image: opea/vllm-gaudi:1.22.0 + + args: + - --model + - meta-llama/Llama-3.1-8B-Instruct + - --port + - "8000" + - --served-model-name + - "meta-llama/Llama-3.1-8B-Instruct" + + - --block-size=128 + - --max-num-seqs=256 + - --max-seq-len-to-capture=2048 + - --max-model-len=2048 + - --max-num-batched-token=16000 + env: + - name: OMPI_MCA_btl_vader_single_copy_mechanism + value: none + - name: HABANA_LOGS + value: /tmp/habana_logs + - name: VLLM_SKIP_WARMUP + value: "true" + - name: DO_NOT_TRACK + value: "1" + - name: VLLM_USE_V1 + value: "1" + - name: DP_SIZE + value: "1" + - name: TP_SIZE + value: "1" + - name: DP_SIZE_LOCAL + value: "1" + + - name: HF_HUB_CACHE + value: /model-cache/ + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: llm-d-hf-token + key: HF_TOKEN + ports: + - containerPort: 8200 + protocol: TCP + + resources: + limits: + habana.ai/gaudi: "1" + requests: + habana.ai/gaudi: "1" + + volumeMounts: + - name: model-storage + mountPath: /model-cache diff --git a/examples/output-pd.yaml b/examples/output-pd.yaml index 3a9294f8..a5ecd9ba 100644 --- a/examples/output-pd.yaml +++ b/examples/output-pd.yaml @@ -6,7 +6,7 @@ kind: ServiceAccount metadata: name: pd-llm-d-modelservice labels: - helm.sh/chart: llm-d-modelservice-v0.3.4 + helm.sh/chart: llm-d-modelservice-v0.3.5 app.kubernetes.io/version: "v0.2.0" app.kubernetes.io/managed-by: Helm --- @@ -16,7 +16,7 @@ kind: Deployment metadata: name: pd-llm-d-modelservice-decode labels: - helm.sh/chart: llm-d-modelservice-v0.3.4 + helm.sh/chart: llm-d-modelservice-v0.3.5 app.kubernetes.io/version: "v0.2.0" app.kubernetes.io/managed-by: Helm spec: @@ -92,6 +92,8 @@ spec: value: "1" - name: TP_SIZE value: "1" + - name: DP_SIZE_LOCAL + value: "1" - name: HF_HOME value: /model-cache @@ -121,7 +123,7 @@ kind: Deployment metadata: name: pd-llm-d-modelservice-prefill labels: - helm.sh/chart: llm-d-modelservice-v0.3.4 + helm.sh/chart: llm-d-modelservice-v0.3.5 app.kubernetes.io/version: "v0.2.0" app.kubernetes.io/managed-by: Helm spec: @@ -180,6 +182,8 @@ spec: value: "1" - name: TP_SIZE value: "1" + - name: DP_SIZE_LOCAL + value: "1" - name: HF_HOME value: /model-cache diff --git a/examples/output-pvc-hf.yaml b/examples/output-pvc-hf.yaml index cb3d97b9..76c318cb 100644 --- a/examples/output-pvc-hf.yaml +++ b/examples/output-pvc-hf.yaml @@ -6,7 +6,7 @@ kind: ServiceAccount metadata: name: pvc-hf-llm-d-modelservice labels: - helm.sh/chart: llm-d-modelservice-v0.3.4 + helm.sh/chart: llm-d-modelservice-v0.3.5 app.kubernetes.io/version: "v0.2.0" app.kubernetes.io/managed-by: Helm --- @@ -16,7 +16,7 @@ kind: Deployment metadata: name: pvc-hf-llm-d-modelservice-decode labels: - helm.sh/chart: llm-d-modelservice-v0.3.4 + helm.sh/chart: llm-d-modelservice-v0.3.5 app.kubernetes.io/version: "v0.2.0" app.kubernetes.io/managed-by: Helm spec: @@ -92,6 +92,8 @@ spec: value: "1" - name: TP_SIZE value: "1" + - name: DP_SIZE_LOCAL + value: "1" - name: HF_HUB_CACHE value: /model-cache/path/to/hf_hub_cache @@ -121,7 +123,7 @@ kind: Deployment metadata: name: pvc-hf-llm-d-modelservice-prefill labels: - helm.sh/chart: llm-d-modelservice-v0.3.4 + helm.sh/chart: llm-d-modelservice-v0.3.5 app.kubernetes.io/version: "v0.2.0" app.kubernetes.io/managed-by: Helm spec: @@ -180,6 +182,8 @@ spec: value: 
"1" - name: TP_SIZE value: "1" + - name: DP_SIZE_LOCAL + value: "1" - name: HF_HUB_CACHE value: /model-cache/path/to/hf_hub_cache diff --git a/examples/output-pvc.yaml b/examples/output-pvc.yaml index 2a7dd867..0e5ff307 100644 --- a/examples/output-pvc.yaml +++ b/examples/output-pvc.yaml @@ -6,7 +6,7 @@ kind: ServiceAccount metadata: name: pvc-llm-d-modelservice labels: - helm.sh/chart: llm-d-modelservice-v0.3.4 + helm.sh/chart: llm-d-modelservice-v0.3.5 app.kubernetes.io/version: "v0.2.0" app.kubernetes.io/managed-by: Helm --- @@ -16,7 +16,7 @@ kind: Deployment metadata: name: pvc-llm-d-modelservice-decode labels: - helm.sh/chart: llm-d-modelservice-v0.3.4 + helm.sh/chart: llm-d-modelservice-v0.3.5 app.kubernetes.io/version: "v0.2.0" app.kubernetes.io/managed-by: Helm spec: @@ -92,6 +92,8 @@ spec: value: "1" - name: TP_SIZE value: "1" + - name: DP_SIZE_LOCAL + value: "1" ports: - containerPort: 8200 @@ -119,7 +121,7 @@ kind: Deployment metadata: name: pvc-llm-d-modelservice-prefill labels: - helm.sh/chart: llm-d-modelservice-v0.3.4 + helm.sh/chart: llm-d-modelservice-v0.3.5 app.kubernetes.io/version: "v0.2.0" app.kubernetes.io/managed-by: Helm spec: @@ -178,6 +180,8 @@ spec: value: "1" - name: TP_SIZE value: "1" + - name: DP_SIZE_LOCAL + value: "1" ports: - containerPort: 8000 diff --git a/examples/output-requester.yaml b/examples/output-requester.yaml index 29456785..28339e62 100644 --- a/examples/output-requester.yaml +++ b/examples/output-requester.yaml @@ -6,7 +6,7 @@ kind: ServiceAccount metadata: name: requester-llm-d-modelservice labels: - helm.sh/chart: llm-d-modelservice-v0.3.4 + helm.sh/chart: llm-d-modelservice-v0.3.5 app.kubernetes.io/version: "v0.2.0" app.kubernetes.io/managed-by: Helm --- @@ -81,6 +81,8 @@ spec: value: "1" - name: TP_SIZE value: "1" + - name: DP_SIZE_LOCAL + value: "1" - name: HF_HOME value: /model-cache @@ -138,7 +140,7 @@ kind: Deployment metadata: name: requester-llm-d-modelservice-prefill labels: - helm.sh/chart: llm-d-modelservice-v0.3.4 + helm.sh/chart: llm-d-modelservice-v0.3.5 app.kubernetes.io/version: "v0.2.0" app.kubernetes.io/managed-by: Helm spec: @@ -197,6 +199,8 @@ spec: value: "1" - name: TP_SIZE value: "1" + - name: DP_SIZE_LOCAL + value: "1" - name: HF_HOME value: /model-cache diff --git a/examples/output-xpu-pd.yaml b/examples/output-xpu-pd.yaml new file mode 100644 index 00000000..2ce8dd30 --- /dev/null +++ b/examples/output-xpu-pd.yaml @@ -0,0 +1,271 @@ +# generated by generate-example-output.sh +--- +# Source: llm-d-modelservice/templates/serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: xpu-pd-llm-d-modelservice + labels: + helm.sh/chart: llm-d-modelservice-v0.3.5 + app.kubernetes.io/version: "v0.2.0" + app.kubernetes.io/managed-by: Helm +--- +# Source: llm-d-modelservice/templates/decode-deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: xpu-pd-llm-d-modelservice-decode + labels: + helm.sh/chart: llm-d-modelservice-v0.3.5 + app.kubernetes.io/version: "v0.2.0" + app.kubernetes.io/managed-by: Helm +spec: + replicas: 1 + selector: + matchLabels: + llm-d.ai/inferenceServing: "true" + llm-d.ai/model: microsoft-dialogpt-large + llm-d.ai/role: decode + template: + metadata: + labels: + llm-d.ai/inferenceServing: "true" + llm-d.ai/model: microsoft-dialogpt-large + llm-d.ai/role: decode + spec: + initContainers: + - name: routing-proxy + args: + - --port=8000 + - --vllm-port=8200 + - --connector=nixlv2 + - -v=5 + - --secure-proxy=false + image: ghcr.io/llm-d/llm-d-routing-sidecar:v0.2.0 + 
imagePullPolicy: Always + ports: + - containerPort: 8000 + resources: {} + restartPolicy: Always + securityContext: + allowPrivilegeEscalation: false + runAsNonRoot: true + + serviceAccountName: xpu-pd-llm-d-modelservice + + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: accelerator + operator: In + values: + - intel-xpu + volumes: + - emptyDir: {} + name: metrics-volume + - name: model-storage + emptyDir: + sizeLimit: 10Gi + + containers: + - name: vllm + image: ghcr.io/llm-d/llm-d-xpu:v0.2.0 + imagePullPolicy: Never + + command: + - python3 + - -m + - vllm.entrypoints.openai.api_server + args: + - --model + - microsoft/DialoGPT-large + - --enforce-eager + - --tensor-parallel-size + - "1" + - --port + - "8200" + - --host + - 0.0.0.0 + - --kv-transfer-config + - '{"kv_connector":"NixlConnector", "kv_role":"kv_consumer"}' + env: + - name: ZE_AFFINITY_MASK + value: "0" + - name: ZE_ENABLE_PCI_ID_DEVICE_ORDER + value: "1" + - name: VLLM_NIXL_SIDE_CHANNEL_HOST + valueFrom: + fieldRef: + fieldPath: status.podIP + - name: VLLM_NIXL_SIDE_CHANNEL_PORT + value: "5557" + - name: VLLM_LOGGING_LEVEL + value: DEBUG + - name: TORCH_LLM_ALLREDUCE + value: "1" + - name: VLLM_USE_V1 + value: "1" + - name: CCL_ZE_IPC_EXCHANGE + value: pidfd + - name: VLLM_ALLOW_LONG_MAX_MODEL_LEN + value: "1" + - name: VLLM_WORKER_MULTIPROC_METHOD + value: spawn + - name: DP_SIZE + value: "1" + - name: TP_SIZE + value: "1" + - name: DP_SIZE_LOCAL + value: "1" + + - name: HF_HOME + value: /model-cache + + - name: VLLM_USE_V1 + value: "1" + - name: TORCH_LLM_ALLREDUCE + value: "1" + - name: VLLM_WORKER_MULTIPROC_METHOD + value: "spawn" + ports: + - containerPort: 8200 + protocol: TCP + - containerPort: 5557 + protocol: TCP + + resources: + limits: + cpu: "8" + gpu.intel.com/i915: "1" + memory: 24Gi + requests: + cpu: "4" + gpu.intel.com/i915: "1" + memory: 12Gi + + volumeMounts: + - name: model-storage + mountPath: /model-cache +--- +# Source: llm-d-modelservice/templates/prefill-deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: xpu-pd-llm-d-modelservice-prefill + labels: + helm.sh/chart: llm-d-modelservice-v0.3.5 + app.kubernetes.io/version: "v0.2.0" + app.kubernetes.io/managed-by: Helm +spec: + replicas: 1 + selector: + matchLabels: + llm-d.ai/inferenceServing: "true" + llm-d.ai/model: microsoft-dialogpt-large + llm-d.ai/role: prefill + template: + metadata: + labels: + llm-d.ai/inferenceServing: "true" + llm-d.ai/model: microsoft-dialogpt-large + llm-d.ai/role: prefill + spec: + + serviceAccountName: xpu-pd-llm-d-modelservice + + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: accelerator + operator: In + values: + - intel-xpu + volumes: + - emptyDir: {} + name: metrics-volume + - name: model-storage + emptyDir: + sizeLimit: 10Gi + + containers: + - name: vllm + image: ghcr.io/llm-d/llm-d-xpu:v0.2.0 + + command: + - python3 + - -m + - vllm.entrypoints.openai.api_server + args: + - --model + - microsoft/DialoGPT-large + - --enforce-eager + - --tensor-parallel-size + - "1" + - --port + - "8000" + - --host + - 0.0.0.0 + - --kv-transfer-config + - '{"kv_connector":"NixlConnector", "kv_role":"kv_producer"}' + env: + - name: ZE_AFFINITY_MASK + value: "1" + - name: ZE_ENABLE_PCI_ID_DEVICE_ORDER + value: "1" + - name: VLLM_NIXL_SIDE_CHANNEL_PORT + value: "5557" + - name: VLLM_NIXL_SIDE_CHANNEL_HOST + valueFrom: + fieldRef: + fieldPath: status.podIP 
+ - name: VLLM_LOGGING_LEVEL + value: DEBUG + - name: TORCH_LLM_ALLREDUCE + value: "1" + - name: VLLM_USE_V1 + value: "1" + - name: CCL_ZE_IPC_EXCHANGE + value: pidfd + - name: VLLM_ALLOW_LONG_MAX_MODEL_LEN + value: "1" + - name: VLLM_WORKER_MULTIPROC_METHOD + value: spawn + - name: DP_SIZE + value: "1" + - name: TP_SIZE + value: "1" + - name: DP_SIZE_LOCAL + value: "1" + + - name: HF_HOME + value: /model-cache + + - name: VLLM_USE_V1 + value: "1" + - name: TORCH_LLM_ALLREDUCE + value: "1" + - name: VLLM_WORKER_MULTIPROC_METHOD + value: "spawn" + ports: + - containerPort: 8000 + protocol: TCP + - containerPort: 5557 + protocol: TCP + + resources: + limits: + cpu: "16" + gpu.intel.com/i915: "1" + memory: 32Gi + requests: + cpu: "8" + gpu.intel.com/i915: "1" + memory: 16Gi + + volumeMounts: + - name: model-storage + mountPath: /model-cache diff --git a/examples/output-xpu.yaml b/examples/output-xpu.yaml new file mode 100644 index 00000000..148fbc61 --- /dev/null +++ b/examples/output-xpu.yaml @@ -0,0 +1,109 @@ +# generated by generate-example-output.sh +--- +# Source: llm-d-modelservice/templates/serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: xpu-llm-d-modelservice + labels: + helm.sh/chart: llm-d-modelservice-v0.3.5 + app.kubernetes.io/version: "v0.2.0" + app.kubernetes.io/managed-by: Helm +--- +# Source: llm-d-modelservice/templates/decode-deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: xpu-llm-d-modelservice-decode + labels: + helm.sh/chart: llm-d-modelservice-v0.3.5 + app.kubernetes.io/version: "v0.2.0" + app.kubernetes.io/managed-by: Helm +spec: + replicas: 1 + selector: + matchLabels: + llm-d.ai/inferenceServing: "true" + llm-d.ai/model: deepseek-ai-deepSeek-r1-distill-qwen-1-5B + llm-d.ai/role: decode + template: + metadata: + labels: + llm-d.ai/inferenceServing: "true" + llm-d.ai/model: deepseek-ai-deepSeek-r1-distill-qwen-1-5B + llm-d.ai/role: decode + spec: + + + serviceAccountName: xpu-llm-d-modelservice + + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: accelerator + operator: In + values: + - intel-xpu + - intel-gpu-max + volumes: + - emptyDir: {} + name: metrics-volume + - name: model-storage + emptyDir: + sizeLimit: 10Gi + + containers: + - name: vllm + image: ghcr.io/llm-d/llm-d-xpu:v0.2.0 + + args: + - --model + - deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B + - --port + - "8000" + - --served-model-name + - "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B" + + - --enforce-eager + - --dtype + - float16 + - --disable-sliding-window + - --gpu-memory-util + - "0.9" + - --no-enable-prefix-caching + - --max-num-batched-tokens + - "4096" + - --disable-log-requests + - --max-model-len + - "4096" + - --block-size + - "64" + env: + - name: DP_SIZE + value: "1" + - name: TP_SIZE + value: "1" + - name: DP_SIZE_LOCAL + value: "1" + + - name: HF_HOME + value: /model-cache + ports: + - containerPort: 8200 + protocol: TCP + + resources: + limits: + cpu: "8" + gpu.intel.com/xe: "1" + memory: 24Gi + requests: + cpu: "4" + gpu.intel.com/xe: "1" + memory: 12Gi + + volumeMounts: + - name: model-storage + mountPath: /model-cache diff --git a/examples/values-cpu.yaml b/examples/values-cpu.yaml index c79a7e9f..561d6cdf 100644 --- a/examples/values-cpu.yaml +++ b/examples/values-cpu.yaml @@ -20,6 +20,9 @@ routing: proxy: secure: false +accelerator: + type: cpu + # Decode pod configuation decode: replicas: 1 diff --git a/examples/values-pd.yaml b/examples/values-pd.yaml 
index 7c1f8355..18097ee5 100644 --- a/examples/values-pd.yaml +++ b/examples/values-pd.yaml @@ -57,11 +57,9 @@ decode: limits: memory: 16Gi cpu: "16" - nvidia.com/gpu: "1" requests: cpu: "16" memory: 16Gi - nvidia.com/gpu: "1" mountModelVolume: true # Prefill pod configuation @@ -98,9 +96,7 @@ prefill: limits: memory: 16Gi cpu: "16" - nvidia.com/gpu: "1" requests: cpu: "16" memory: 16Gi - nvidia.com/gpu: "1" mountModelVolume: true diff --git a/examples/values-xpu-pd.yaml b/examples/values-xpu-pd.yaml index 74e4f8b2..42006cae 100644 --- a/examples/values-xpu-pd.yaml +++ b/examples/values-xpu-pd.yaml @@ -145,11 +145,9 @@ prefill: limits: memory: 32Gi cpu: "16" - gpu.intel.com/i915: "1" requests: cpu: "8" memory: 16Gi - gpu.intel.com/i915: "1" mountModelVolume: true acceleratorTypes: diff --git a/examples/values-xpu.yaml b/examples/values-xpu.yaml index 82719ebf..6b71b07d 100644 --- a/examples/values-xpu.yaml +++ b/examples/values-xpu.yaml @@ -53,11 +53,9 @@ decode: limits: memory: 24Gi cpu: "8" - gpu.intel.com/xe: "1" requests: cpu: "4" memory: 12Gi - gpu.intel.com/xe: "1" mountModelVolume: true diff --git a/hack/generate-example-output.sh b/hack/generate-example-output.sh index 8646f406..3b852c03 100755 --- a/hack/generate-example-output.sh +++ b/hack/generate-example-output.sh @@ -19,9 +19,21 @@ generate_output() { # Generate output-cpu.yaml (Simulated CPU deployment) generate_output "cpu-sim" "examples/values-cpu.yaml" "examples/output-cpu.yaml" +# Generate output-dra.yaml (Dynamic Resource Allocation deployment for Intel B50 GPU device) +generate_output "dra" "examples/values-dra.yaml" "examples/output-dra.yaml" '--set modelArtifacts.uri=pvc+hf://model-pvc/meta-llama/Llama-3.1-8B-Instruct' + +# Generate output-gaudi.yaml (Intel Gaudi) +generate_output "gaudi" "examples/values-gaudi.yaml" "examples/output-gaudi.yaml" + # Generate output-pd.yaml (PD deployment) generate_output "pd" "examples/values-pd.yaml" "examples/output-pd.yaml" +# Generate output-xpu-pd.yaml (Intel i915) +generate_output "xpu-pd" "examples/values-xpu-pd.yaml" "examples/output-xpu-pd.yaml" + +# Generate output-xpu.yaml (Intel Xe GPU and affinity) +generate_output "xpu" "examples/values-xpu.yaml" "examples/output-xpu.yaml" + # Generate output-requester.yaml (Requester deployment) generate_output "requester" "examples/values-requester.yaml" "examples/output-requester.yaml" @@ -30,6 +42,3 @@ generate_output "pvc" "examples/values-pd.yaml" "examples/output-pvc.yaml" '--se # Generate output-pvc-hf.yaml (PVC HuggingFace model deployment) generate_output "pvc-hf" "examples/values-pd.yaml" "examples/output-pvc-hf.yaml" '--set modelArtifacts.uri=pvc+hf://pvc-name/path/to/hf_hub_cache/facebook/opt-125m' - -# Generate output-dra.yaml (Dynamic Resource Allocation deployment for Intel B50 GPU device) -generate_output "dra" "examples/values-dra.yaml" "examples/output-dra.yaml" '--set modelArtifacts.uri=pvc+hf://model-pvc/meta-llama/Llama-3.1-8B-Instruct'
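As a sanity check on the parallelism helpers added above, here is a hedged, worked example of how a hypothetical `parallelism` block should resolve in the rendered decode manifests. The input values are not from this repository's example files, and the results are derived by hand from the helper definitions.

```yaml
# Hypothetical input (not one of this repo's example values files):
# decode:
#   parallelism:
#     tensor: 2
#     data: 8
#     workers: 4
#
# Hand-derived resolution per the new helpers:
#   tensorParallelism    = 2                      # tensor as given
#   dataParallelism      = 8                      # data as given
#   dataLocalParallelism = data / workers = 2     # templating fails if data is not a multiple of workers
#   numWorkers           = 4                      # workers as given
#   numGpuPerWorker      = dataLocalParallelism * tensorParallelism = 4
#
# Expected to surface in the rendered decode pod roughly as:
env:
  - name: DP_SIZE
    value: "8"
  - name: TP_SIZE
    value: "2"
  - name: DP_SIZE_LOCAL
    value: "2"
resources:
  limits:
    nvidia.com/gpu: "4"    # numGpuPerWorker, assuming the default NVIDIA accelerator resource
  requests:
    nvidia.com/gpu: "4"
# and, when multinode is true, leaderWorkerTemplate.size: 4 (numWorkers)
```

For the vLLM-serve style commands, `--data-parallel-size 8` and `--data-parallel-size-local 2` would also be appended to the container args, since the new arg blocks only emit those flags when the respective values exceed 1.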