From 1b7be9f14c3435daa527488db6a1895a090555f1 Mon Sep 17 00:00:00 2001 From: lazy1 <674194901@qq.com> Date: Fri, 1 Aug 2025 00:11:49 +0800 Subject: [PATCH 1/2] feat: add helm chart for models --- charts/ome-predefined-models/Chart.yaml | 6 + charts/ome-predefined-models/README.md | 231 ++++++++++++ .../templates/models.yaml | 187 +++++++++ .../templates/srt-runtimes.yaml | 354 ++++++++++++++++++ .../templates/vllm-runtimes.yaml | 199 ++++++++++ charts/ome-predefined-models/values.yaml | 134 +++++++ 6 files changed, 1111 insertions(+) create mode 100644 charts/ome-predefined-models/Chart.yaml create mode 100644 charts/ome-predefined-models/README.md create mode 100644 charts/ome-predefined-models/templates/models.yaml create mode 100644 charts/ome-predefined-models/templates/srt-runtimes.yaml create mode 100644 charts/ome-predefined-models/templates/vllm-runtimes.yaml create mode 100644 charts/ome-predefined-models/values.yaml diff --git a/charts/ome-predefined-models/Chart.yaml b/charts/ome-predefined-models/Chart.yaml new file mode 100644 index 00000000..ac9f2bb9 --- /dev/null +++ b/charts/ome-predefined-models/Chart.yaml @@ -0,0 +1,6 @@ +apiVersion: v2 +name: ome-predefined-models +description: OME Predefined Models and Serving Runtimes +type: application +version: 0.1.0 +appVersion: "1.16.0" diff --git a/charts/ome-predefined-models/README.md b/charts/ome-predefined-models/README.md new file mode 100644 index 00000000..0f15e0c2 --- /dev/null +++ b/charts/ome-predefined-models/README.md @@ -0,0 +1,231 @@ +# ome-predefined-models + +![Version: 0.1.0](https://img.shields.io/badge/Version-0.1.0-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: 1.16.0](https://img.shields.io/badge/AppVersion-1.16.0-informational?style=flat-square) + +OME Predefined Models and Serving Runtimes + +## Description + +This Helm chart provides a collection of predefined models and serving 
runtimes for OME (Open Model Engine). Instead of manually managing these resources through kustomize, users can now deploy them natively using Helm with fine-grained control over which models and runtimes to enable. + +## Features + +- **Predefined Models**: Deploy popular models from various vendors (Meta, DeepSeek, Intfloat, Microsoft, Moonshot AI, NVIDIA) +- **Serving Runtimes**: Support for both vLLM and SRT (SGLang Runtime) configurations +- **Selective Deployment**: Enable/disable specific models and runtimes through values configuration +- **Production Ready**: Includes proper resource limits, health checks, and monitoring configurations + +## Installation + +### Prerequisites + +- Kubernetes cluster with GPU nodes +- OME CRDs already installed (`ome-crd` chart) +- OME controller running (`ome-resources` chart) + +### Install the chart + +```bash +# Add the repository (if using a Helm repository) +helm repo add ome +helm repo update + +# Install with default values +helm install ome-predefined-models ome/ome-predefined-models + +# Or install from local chart +helm install ome-predefined-models ./charts/ome-predefined-models +``` + +### Custom Configuration + +Create a `custom-values.yaml` file to customize which models and runtimes to enable: + +```yaml +# Enable all resources +global: + enableAll: false + +# Enable specific models +models: + meta: + enabled: true + llama_3_3_70b_instruct: + enabled: true + llama_4_maverick_17b_128e_instruct_fp8: + enabled: false + + deepseek: + enabled: true + deepseek_v3: + enabled: true + deepseek_r1: + enabled: false + + intfloat: + enabled: true + e5_mistral_7b_instruct: + enabled: true + +# Enable specific runtimes +runtimes: + vllm: + enabled: true + e5_mistral_7b_instruct: + enabled: true + llama_3_3_70b_instruct: + enabled: true + + srt: + enabled: true + deepseek_rdma: + enabled: true + e5_mistral_7b_instruct: + enabled: true +``` + +Then install with your custom values: + +```bash +helm install 
ome-predefined-models ./charts/ome-predefined-models -f custom-values.yaml +``` + +## Supported Models + +### Meta/Llama Models + +- `llama-3-3-70b-instruct` - Llama 3.3 70B Instruct model +- `llama-4-maverick-17b-128e-instruct-fp8` - Llama 4 Maverick 17B model (FP8) +- `llama-4-scout-17b-16e-instruct` - Llama 4 Scout 17B model + +### DeepSeek Models + +- `deepseek-v3` - DeepSeek V3 model +- `deepseek-r1` - DeepSeek R1 model + +### Intfloat Models + +- `e5-mistral-7b-instruct` - E5 Mistral 7B Instruct model + +### Microsoft Models + +- `phi-3-vision-128k-instruct` - Phi-3 Vision 128K Instruct model + +### Moonshot AI Models + +- `kimi-k2-instruct` - Kimi K2 Instruct model + +### NVIDIA Models + +- `llama-3-1-nemotron-ultra-253b-v1` - Llama 3.1 Nemotron Ultra 253B +- `llama-3-3-nemotron-super-49b-v1` - Llama 3.3 Nemotron Super 49B +- `llama-3-1-nemotron-nano-8b-v1` - Llama 3.1 Nemotron Nano 8B + +## Supported Runtimes + +### vLLM Runtimes + +- Optimized for inference workloads +- Built-in OpenAI-compatible API server +- Efficient memory utilization + +### SRT (SGLang Runtime) Runtimes + +- Advanced serving capabilities +- Support for complex multi-node deployments +- RDMA support for high-performance networking + +## Configuration Values + +| Key | Type | Default | Description | +|-----|------|---------|-------------| +| global.enableAll | bool | `false` | Enable all predefined resources | +| models.meta.enabled | bool | `true` | Enable Meta/Llama models | +| models.deepseek.enabled | bool | `true` | Enable DeepSeek models | +| models.intfloat.enabled | bool | `true` | Enable Intfloat models | +| models.microsoft.enabled | bool | `false` | Enable Microsoft models | +| models.moonshotai.enabled | bool | `false` | Enable Moonshot AI models | +| models.nvidia.enabled | bool | `false` | Enable NVIDIA models | +| runtimes.vllm.enabled | bool | `true` | Enable vLLM runtimes | +| runtimes.srt.enabled | bool | `true` | Enable SRT runtimes | + +## Usage Examples + +### 
Deploy Only Essential Models + +```yaml +global: + enableAll: false + +models: + meta: + enabled: true + llama_3_3_70b_instruct: + enabled: true + + intfloat: + enabled: true + e5_mistral_7b_instruct: + enabled: true + +runtimes: + vllm: + enabled: true + llama_3_3_70b_instruct: + enabled: true + e5_mistral_7b_instruct: + enabled: true +``` + +### High-Performance Setup with RDMA + +```yaml +models: + deepseek: + enabled: true + deepseek_v3: + enabled: true + +runtimes: + srt: + enabled: true + deepseek_rdma: + enabled: true +``` + +## Monitoring + +All deployed runtimes include Prometheus metrics endpoints configured for monitoring: + +- Metrics endpoint: `/metrics` +- Health check endpoint: `/health` +- Generate health check: `/health_generate` (for SRT runtimes) + +## Troubleshooting + +### Common Issues + +1. **Models not downloading**: Ensure proper Hugging Face token is configured +2. **GPU resources**: Verify GPU nodes have sufficient resources +3. **RDMA configuration**: For RDMA-enabled runtimes, ensure proper network setup + +### Debugging Commands + +```bash +# Check deployed models +kubectl get clusterbasemodels + +# Check deployed runtimes +kubectl get clusterservingruntimes + +# Check pod logs +kubectl logs -l app=ome-predefined-models +``` + +## Contributing + +To add new models or runtimes: + +1. Add the configuration to the appropriate template file +2. Update the `values.yaml` with the new configuration options +3. 
Update this README with the new resource information diff --git a/charts/ome-predefined-models/templates/models.yaml b/charts/ome-predefined-models/templates/models.yaml new file mode 100644 index 00000000..ff2b90b8 --- /dev/null +++ b/charts/ome-predefined-models/templates/models.yaml @@ -0,0 +1,187 @@ +{{- if or .Values.global.enableAll .Values.models.meta.enabled }} +{{- if or .Values.global.enableAll .Values.models.meta.llama_3_3_70b_instruct.enabled }} +--- +apiVersion: ome.io/v1beta1 +kind: ClusterBaseModel +metadata: + name: llama-3-3-70b-instruct +spec: + disabled: false + displayName: meta.llama-3.3-70b-instruct + storage: + storageUri: hf://meta-llama/Llama-3.3-70B-Instruct + path: /raid/models/meta/llama-3-3-70b-instruct + key: "hf-token" + vendor: meta + version: "1.0.0" +{{- end }} +{{- if or .Values.global.enableAll .Values.models.meta.llama_4_maverick_17b_128e_instruct_fp8.enabled }} +--- +apiVersion: ome.io/v1beta1 +kind: ClusterBaseModel +metadata: + name: llama-4-maverick-17b-128e-instruct-fp8 +spec: + vendor: meta + disabled: false + displayName: meta.llama-4-maverick-17b-128e-instruct-fp8 + version: "1.0.0" + modelFormat: + name: safetensors + version: "1.0.0" + modelFramework: + name: transformers + version: "4.51.0.dev0" + modelType: llama + modelArchitecture: Llama4ForConditionalGeneration + storage: + storageUri: hf://meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 + path: /raid/models/meta/llama-4-maverick-17b-128e-instruct-fp8 + key: "hf-token" +{{- end }} +{{- if or .Values.global.enableAll .Values.models.meta.llama_4_scout_17b_16e_instruct.enabled }} +--- +apiVersion: ome.io/v1beta1 +kind: ClusterBaseModel +metadata: + name: llama-4-scout-17b-16e-instruct +spec: + disabled: false + displayName: meta.llama-4-scout-17b-16e-instruct + vendor: meta + version: "1.0.0" + storage: + storageUri: hf://meta-llama/Llama-4-Scout-17B-16E-Instruct + path: /raid/models/meta/llama-4-scout-17b-16e-instruct + key: "hf-token" +{{- end }} +{{- end }} + 
+{{- if or .Values.global.enableAll .Values.models.deepseek.enabled }} +{{- if or .Values.global.enableAll .Values.models.deepseek.deepseek_v3.enabled }} +--- +apiVersion: ome.io/v1beta1 +kind: ClusterBaseModel +metadata: + name: deepseek-v3 +spec: + vendor: deepseek-ai + disabled: false + version: "1.0.0" + storage: + storageUri: hf://deepseek-ai/DeepSeek-V3 + path: /raid/models/deepseek-ai/deepseek-v3 +{{- end }} +{{- if or .Values.global.enableAll .Values.models.deepseek.deepseek_r1.enabled }} +--- +apiVersion: ome.io/v1beta1 +kind: ClusterBaseModel +metadata: + name: deepseek-r1 +spec: + vendor: deepseek-ai + disabled: false + version: "1.0.0" + storage: + storageUri: hf://deepseek-ai/DeepSeek-R1 + path: /raid/models/deepseek-ai/deepseek-r1 +{{- end }} +{{- end }} + +{{- if or .Values.global.enableAll .Values.models.intfloat.enabled }} +{{- if or .Values.global.enableAll .Values.models.intfloat.e5_mistral_7b_instruct.enabled }} +--- +apiVersion: ome.io/v1beta1 +kind: ClusterBaseModel +metadata: + name: e5-mistral-7b-instruct +spec: + disabled: false + displayName: intfloat.e5-mistral-7b-instruct + storage: + storageUri: hf://intfloat/e5-mistral-7b-instruct + path: /raid/models/intfloat/e5-mistral-7b-instruct + vendor: intfloat + version: "0.0" +{{- end }} +{{- end }} + +{{- if or .Values.global.enableAll .Values.models.microsoft.enabled }} +{{- if or .Values.global.enableAll .Values.models.microsoft.phi_3_vision_128k_instruct.enabled }} +--- +apiVersion: ome.io/v1beta1 +kind: ClusterBaseModel +metadata: + name: phi-3-vision-128k-instruct +spec: + disabled: false + displayName: microsoft.phi-3-vision-128k-instruct + storage: + storageUri: hf://microsoft/Phi-3-vision-128k-instruct + path: /raid/models/microsoft/phi-3-vision-128k-instruct + vendor: microsoft + version: "0.1" +{{- end }} +{{- end }} + +{{- if or .Values.global.enableAll .Values.models.moonshotai.enabled }} +{{- if or .Values.global.enableAll .Values.models.moonshotai.kimi_k2_instruct.enabled }} 
+
+---
+apiVersion: ome.io/v1beta1
+kind: ClusterBaseModel
+metadata:
+  name: kimi-k2-instruct
+spec:
+  vendor: moonshotai
+  disabled: false
+  version: "1.0.0"
+  storage:
+    storageUri: hf://moonshotai/Kimi-K2-Instruct
+    path: /raid/models/moonshotai/Kimi-K2-Instruct
+{{- end }}
+{{- end }}
+
+{{- if or .Values.global.enableAll .Values.models.nvidia.enabled }}
+{{- if or .Values.global.enableAll .Values.models.nvidia.llama_3_1_nemotron_ultra_253b_v1.enabled }}
+---
+apiVersion: ome.io/v1beta1
+kind: ClusterBaseModel
+metadata:
+  name: llama-3-1-nemotron-ultra-253b-v1
+spec:
+  vendor: nvidia
+  disabled: false
+  version: "1.0.0"
+  storage:
+    storageUri: hf://nvidia/Llama-3_1-Nemotron-Ultra-253B-v1
+    path: /raid/models/nvidia/llama-3-1-nemotron-ultra-253b-v1
+{{- end }}
+{{- if or .Values.global.enableAll .Values.models.nvidia.llama_3_3_nemotron_super_49b_v1.enabled }}
+---
+apiVersion: ome.io/v1beta1
+kind: ClusterBaseModel
+metadata:
+  name: llama-3-3-nemotron-super-49b-v1
+spec:
+  vendor: nvidia
+  disabled: false
+  version: "1.0.0"
+  storage:
+    storageUri: hf://nvidia/Llama-3.3-Nemotron-Super-49B-v1
+    path: /raid/models/nvidia/llama-3-3-nemotron-super-49b-v1
+{{- end }}
+{{- if or .Values.global.enableAll .Values.models.nvidia.llama_3_1_nemotron_nano_8b_v1.enabled }}
+---
+apiVersion: ome.io/v1beta1
+kind: ClusterBaseModel
+metadata:
+  name: llama-3-1-nemotron-nano-8b-v1
+spec:
+  vendor: nvidia
+  disabled: false
+  version: "1.0.0"
+  storage:
+    storageUri: hf://nvidia/Llama-3.1-Nemotron-Nano-8B-v1
+    path: /raid/models/nvidia/llama-3-1-nemotron-nano-8b-v1
+{{- end }}
+{{- end }}
\ No newline at end of file
diff --git a/charts/ome-predefined-models/templates/srt-runtimes.yaml b/charts/ome-predefined-models/templates/srt-runtimes.yaml
new file mode 100644
index 00000000..28ec3220
--- /dev/null
+++ b/charts/ome-predefined-models/templates/srt-runtimes.yaml
@@ -0,0 +1,354 @@
+{{- if or .Values.global.enableAll .Values.runtimes.srt.enabled }}
+{{- if or
.Values.global.enableAll .Values.runtimes.srt.e5_mistral_7b_instruct.enabled }} +--- +apiVersion: ome.io/v1beta1 +kind: ClusterServingRuntime +metadata: + name: srt-e5-7b-mistral-instruct +spec: + disabled: false + supportedModelFormats: + - modelFramework: + name: transformers + version: "4.34.0" + modelFormat: + name: safetensors + version: "1.0.0" + modelArchitecture: MistralModel + autoSelect: true + priority: 1 + modelSizeRange: + min: 5B + max: 10B + protocolVersions: + - openAI + engineConfig: + runner: + name: ome-container + image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126 + ports: + - containerPort: 8080 + name: http1 + protocol: TCP + command: + - /bin/bash + - '-lc' + - -- + args: + - | + python3 -m sglang.launch_server \ + --host=0.0.0.0 \ + --port=8080 \ + --enable-metrics \ + --model-path="$MODEL_PATH" \ + --tp-size 1 \ + --is-embedding + volumeMounts: + - mountPath: /dev/shm + name: dshm + resources: + requests: + cpu: 10 + memory: 80Gi + nvidia.com/gpu: 1 + limits: + cpu: 10 + memory: 80Gi + nvidia.com/gpu: 1 + readinessProbe: + httpGet: + path: /health_generate + port: 8080 + failureThreshold: 3 + successThreshold: 1 + periodSeconds: 60 + timeoutSeconds: 200 + livenessProbe: + httpGet: + path: /health + port: 8080 + failureThreshold: 5 + successThreshold: 1 + periodSeconds: 60 + timeoutSeconds: 60 + startupProbe: + httpGet: + path: /health_generate + port: 8080 + failureThreshold: 150 + successThreshold: 1 + periodSeconds: 6 + initialDelaySeconds: 60 + timeoutSeconds: 30 + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "8080" + prometheus.io/path: "/metrics" + labels: + logging-forward: enabled + tolerations: + - key: "nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" + volumes: + - name: dshm + emptyDir: + medium: Memory +{{- end }} +{{- if or .Values.global.enableAll .Values.runtimes.srt.deepseek_rdma.enabled }} +--- +apiVersion: ome.io/v1beta1 +kind: ClusterServingRuntime +metadata: + name: srt-deepseek-rdma 
+spec: + disabled: false + modelSizeRange: + min: 650B + max: 700B + supportedModelFormats: + - modelFormat: + name: safetensors + version: "1.0.0" + version: "1.0.0" + modelFramework: + name: transformers + version: "4.46.3" + modelArchitecture: DeepseekV3ForCausalLM + quantization: "fp8" + autoSelect: true + priority: 1 + - modelFormat: + name: safetensors + version: "1.0.0" + version: "1.0.0" + modelFramework: + name: transformers + version: "4.33.1" + modelArchitecture: DeepseekV3ForCausalLM + quantization: "fp8" + autoSelect: true + priority: 1 + protocolVersions: + - openAI + routerConfig: + runner: + name: router + image: ghcr.io/moirai-internal/sgl-router:0.1.4.30f2a44 + resources: + limits: + cpu: "1" + memory: "2Gi" + ports: + - containerPort: 8080 + name: http + command: + - sh + - -c + - > + python3 -m sglang_router.launch_router + --host "0.0.0.0" + --port "8080" + --service-discovery + --service-discovery-namespace "${NAMESPACE}" + --service-discovery-port 8080 + --selector component=engine leaderworkerset.sigs.k8s.io/worker-index=0 ome.io/inferenceservice=${INFERENCESERVICE_NAME} + env: + - name: NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: INFERENCESERVICE_NAME + valueFrom: + fieldRef: + fieldPath: metadata.labels['ome.io/inferenceservice'] + engineConfig: + annotations: + rdma.ome.io/auto-inject: "true" + rdma.ome.io/profile: "oci-roce" + rdma.ome.io/container-name: "ome-container" + leader: + tolerations: + - key: "nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" + dnsPolicy: ClusterFirstWithHostNet + hostNetwork: true + enableServiceLinks: false + hostIPC: true + volumes: + - name: dshm + emptyDir: + medium: Memory + - name: devinf + hostPath: + path: /dev/infiniband + runner: + name: ome-container + image: docker.io/lmsysorg/sglang:dev + ports: + - containerPort: 30000 + protocol: TCP + command: + - /bin/bash + - '-lc' + - -- + args: + - | + python3 -m sglang.launch_server \ + --host=0.0.0.0 \ + 
--port=30000 \ + --model-path="$MODEL_PATH" \ + --tp-size "$TP_SIZE" \ + --dp-size "$DP_SIZE" \ + --nccl-init-addr "$NCCL_INIT_ADDR" \ + --nnodes "$NNODES" \ + --node-rank "$NODE_RANK" \ + --disable-disk-cache \ + --enable-metrics \ + --api-key sgl + volumeMounts: + - mountPath: /dev/shm + name: dshm + - name: devinf + mountPath: /dev/infiniband + resources: + requests: + cpu: 100 + memory: 500Gi + nvidia.com/gpu: 8 + limits: + cpu: 100 + memory: 500Gi + nvidia.com/gpu: 8 + readinessProbe: + httpGet: + path: /health_generate + port: 30000 + httpHeaders: + - name: Authorization + value: "Bearer sgl" + failureThreshold: 10 + successThreshold: 1 + periodSeconds: 60 + timeoutSeconds: 200 + livenessProbe: + httpGet: + path: /health + port: 30000 + httpHeaders: + - name: Authorization + value: "Bearer sgl" + failureThreshold: 5 + successThreshold: 1 + periodSeconds: 60 + timeoutSeconds: 60 + startupProbe: + httpGet: + path: /health_generate + port: 30000 + httpHeaders: + - name: Authorization + value: "Bearer sgl" + failureThreshold: 150 + successThreshold: 1 + periodSeconds: 6 + initialDelaySeconds: 60 + timeoutSeconds: 30 +{{- end }} +{{- if or .Values.global.enableAll .Values.runtimes.srt.llama_3_3_70b_instruct.enabled }} +--- +apiVersion: ome.io/v1beta1 +kind: ClusterServingRuntime +metadata: + name: srt-llama-3-3-70b-instruct +spec: + disabled: false + modelSizeRange: + min: 60B + max: 80B + supportedModelFormats: + - modelFormat: + name: safetensors + version: "1.0.0" + modelFramework: + name: transformers + version: "4.48.3" + modelArchitecture: LlamaForCausalLM + autoSelect: true + priority: 1 + protocolVersions: + - openAI + engineConfig: + runner: + name: ome-container + image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126 + ports: + - containerPort: 8080 + name: http1 + protocol: TCP + command: + - /bin/bash + - '-lc' + - -- + args: + - | + python3 -m sglang.launch_server \ + --host=0.0.0.0 \ + --port=8080 \ + --enable-metrics \ + --model-path="$MODEL_PATH" \ + 
--tp-size 4 + volumeMounts: + - mountPath: /dev/shm + name: dshm + resources: + requests: + cpu: 40 + memory: 320Gi + nvidia.com/gpu: 4 + limits: + cpu: 40 + memory: 320Gi + nvidia.com/gpu: 4 + readinessProbe: + httpGet: + path: /health_generate + port: 8080 + failureThreshold: 3 + successThreshold: 1 + periodSeconds: 60 + timeoutSeconds: 200 + livenessProbe: + httpGet: + path: /health + port: 8080 + failureThreshold: 5 + successThreshold: 1 + periodSeconds: 60 + timeoutSeconds: 60 + startupProbe: + httpGet: + path: /health_generate + port: 8080 + failureThreshold: 150 + successThreshold: 1 + periodSeconds: 6 + initialDelaySeconds: 60 + timeoutSeconds: 30 + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "8080" + prometheus.io/path: "/metrics" + labels: + logging-forward: enabled + tolerations: + - key: "nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" + volumes: + - name: dshm + emptyDir: + medium: Memory +{{- end }} +{{- end }} \ No newline at end of file diff --git a/charts/ome-predefined-models/templates/vllm-runtimes.yaml b/charts/ome-predefined-models/templates/vllm-runtimes.yaml new file mode 100644 index 00000000..c9be947c --- /dev/null +++ b/charts/ome-predefined-models/templates/vllm-runtimes.yaml @@ -0,0 +1,199 @@ +{{- if or .Values.global.enableAll .Values.runtimes.vllm.enabled }} +{{- if or .Values.global.enableAll .Values.runtimes.vllm.e5_mistral_7b_instruct.enabled }} +--- +apiVersion: ome.io/v1beta1 +kind: ClusterServingRuntime +metadata: + name: vllm-e5-mistral-7b-instruct +spec: + disabled: false + supportedModelFormats: + - modelFramework: + name: transformers + version: "4.34.0" + modelFormat: + name: safetensors + version: "1.0.0" + modelArchitecture: MistralModel + autoSelect: true + priority: 1 + version: "0.0" + protocolVersions: + - openAI + modelSizeRange: + min: 5B + max: 10B + engineConfig: + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "8080" + prometheus.io/path: "/metrics" + labels: + 
logging-forward: enabled + tolerations: + - key: "nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" + volumes: + - name: dshm + emptyDir: + medium: Memory + runner: + name: ome-container + image: docker.io/vllm/vllm-openai:v0.9.0.1 + ports: + - containerPort: 8080 + name: http1 + protocol: TCP + command: + - /bin/bash + - '-lc' + - -- + args: + - | + python3 -m vllm.entrypoints.openai.api_server \ + --port=8080 \ + --model="$MODEL_PATH" \ + --middleware=vllm.entrypoints.openai.middleware.log_opc_header \ + --max-log-len=0 \ + --served-model-name=vllm-model \ + --tensor-parallel-size=1 \ + --gpu-memory-utilization=0.8 \ + --enforce-eager + volumeMounts: + - mountPath: /dev/shm + name: dshm + resources: + requests: + cpu: 10 + memory: 80Gi + nvidia.com/gpu: 1 + limits: + cpu: 10 + memory: 80Gi + nvidia.com/gpu: 1 + readinessProbe: + httpGet: + path: /health + port: 8080 + failureThreshold: 3 + successThreshold: 1 + periodSeconds: 60 + timeoutSeconds: 200 + livenessProbe: + httpGet: + path: /health + port: 8080 + failureThreshold: 5 + successThreshold: 1 + periodSeconds: 60 + timeoutSeconds: 60 + startupProbe: + httpGet: + path: /health + port: 8080 + failureThreshold: 150 + successThreshold: 1 + periodSeconds: 6 + initialDelaySeconds: 60 + timeoutSeconds: 30 +{{- end }} +{{- if or .Values.global.enableAll .Values.runtimes.vllm.llama_3_3_70b_instruct.enabled }} +--- +apiVersion: ome.io/v1beta1 +kind: ClusterServingRuntime +metadata: + name: vllm-llama-3-3-70b-instruct +spec: + disabled: false + supportedModelFormats: + - modelFramework: + name: transformers + version: "4.48.3" + modelFormat: + name: safetensors + version: "1.0.0" + modelArchitecture: LlamaForCausalLM + autoSelect: true + priority: 1 + protocolVersions: + - openAI + modelSizeRange: + min: 60B + max: 80B + engineConfig: + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "8080" + prometheus.io/path: "/metrics" + labels: + logging-forward: enabled + tolerations: + - key: 
"nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" + volumes: + - name: dshm + emptyDir: + medium: Memory + runner: + name: ome-container + image: docker.io/vllm/vllm-openai:v0.9.0.1 + ports: + - containerPort: 8080 + name: http1 + protocol: TCP + command: + - /bin/bash + - '-lc' + - -- + args: + - | + python3 -m vllm.entrypoints.openai.api_server \ + --port=8080 \ + --model="$MODEL_PATH" \ + --middleware=vllm.entrypoints.openai.middleware.log_opc_header \ + --max-log-len=0 \ + --served-model-name=vllm-model \ + --tensor-parallel-size=4 \ + --gpu-memory-utilization=0.8 \ + --enforce-eager + volumeMounts: + - mountPath: /dev/shm + name: dshm + resources: + requests: + cpu: 40 + memory: 320Gi + nvidia.com/gpu: 4 + limits: + cpu: 40 + memory: 320Gi + nvidia.com/gpu: 4 + readinessProbe: + httpGet: + path: /health + port: 8080 + failureThreshold: 3 + successThreshold: 1 + periodSeconds: 60 + timeoutSeconds: 200 + livenessProbe: + httpGet: + path: /health + port: 8080 + failureThreshold: 5 + successThreshold: 1 + periodSeconds: 60 + timeoutSeconds: 60 + startupProbe: + httpGet: + path: /health + port: 8080 + failureThreshold: 150 + successThreshold: 1 + periodSeconds: 6 + initialDelaySeconds: 60 + timeoutSeconds: 30 +{{- end }} +{{- end }} \ No newline at end of file diff --git a/charts/ome-predefined-models/values.yaml b/charts/ome-predefined-models/values.yaml new file mode 100644 index 00000000..9b572dd4 --- /dev/null +++ b/charts/ome-predefined-models/values.yaml @@ -0,0 +1,134 @@ +# OME Predefined Models and Serving Runtimes Configuration + +# Global settings +global: + # Set to true to enable all predefined resources by default + enableAll: false + +# Predefined Models Configuration +models: + # Meta/Llama models + meta: + enabled: true + llama_3_3_70b_instruct: + enabled: true + llama_4_maverick_17b_128e_instruct_fp8: + enabled: false + llama_4_scout_17b_16e_instruct: + enabled: false + + # DeepSeek models + deepseek: + enabled: true + deepseek_v3: + 
enabled: true + deepseek_r1: + enabled: false + + # Intfloat models + intfloat: + enabled: true + e5_mistral_7b_instruct: + enabled: true + + # Microsoft models + microsoft: + enabled: false + phi_3_vision_128k_instruct: + enabled: false + + # Moonshot AI models + moonshotai: + enabled: false + kimi_k2_instruct: + enabled: false + + # NVIDIA models + nvidia: + enabled: false + llama_3_1_nemotron_ultra_253b_v1: + enabled: false + llama_3_3_nemotron_super_49b_v1: + enabled: false + llama_3_1_nemotron_nano_8b_v1: + enabled: false + +# Serving Runtimes Configuration +runtimes: + # SRT (Shared Runtime) configurations + srt: + enabled: true + deepseek_rdma: + enabled: true + deepseek_rdma_pd: + enabled: false + llama_4_maverick_17b_128e_instruct_fp8: + enabled: false + llama_4_maverick_17b_128e_instruct_fp8_pd: + enabled: false + llama_4_scout_17b_16e_instruct: + enabled: false + llama_4_scout_17b_16e_instruct_pd: + enabled: false + e5_mistral_7b_instruct: + enabled: true + llama_3_3_70b_instruct: + enabled: true + llama_3_3_70b_instruct_pd: + enabled: false + mistral_7b_instruct: + enabled: false + mistral_7b_instruct_pd: + enabled: false + mixtral_8x7b_instruct: + enabled: false + mixtral_8x7b_instruct_pd: + enabled: false + llama_3_2_1b_instruct: + enabled: false + llama_3_2_3b_instruct: + enabled: false + llama_3_2_3b_instruct_pd: + enabled: false + llama_3_2_90b_vision_instruct: + enabled: false + llama_3_1_70b_instruct: + enabled: false + llama_3_1_70b_instruct_pd: + enabled: false + llama_3_2_11b_vision_instruct: + enabled: false + llama_3_2_1b_instruct_pd: + enabled: false + kimi_k2_pd: + enabled: false + + # vLLM runtime configurations + vllm: + enabled: true + mistral_7b_instruct: + enabled: false + mixtral_8x7b_instruct: + enabled: false + e5_mistral_7b_instruct: + enabled: true + llama_3_1_405b_instruct_fp8: + enabled: false + llama_3_1_nemotron_nano_8b_v1: + enabled: false + llama_3_1_nemotron_ultra_253b_v1: + enabled: false + llama_3_2_11b_vision_instruct: 
+ enabled: false + llama_3_2_1b_instruct: + enabled: false + llama_3_2_3b_instruct: + enabled: false + llama_3_3_70b_instruct: + enabled: true + llama_3_3_nemotron_super_49b_v1: + enabled: false + llama_4_maverick_17b_128e_instruct_fp8: + enabled: false + llama_4_scout_17b_16e_instruct: + enabled: false From 24e263a0acf3e259a4d604651e72dda57e749d1c Mon Sep 17 00:00:00 2001 From: lazy1 <674194901@qq.com> Date: Fri, 1 Aug 2025 00:25:00 +0800 Subject: [PATCH 2/2] remove useless lines --- charts/ome-predefined-models/README.md | 32 +------------------------- 1 file changed, 1 insertion(+), 31 deletions(-) diff --git a/charts/ome-predefined-models/README.md b/charts/ome-predefined-models/README.md index 0f15e0c2..c8461646 100644 --- a/charts/ome-predefined-models/README.md +++ b/charts/ome-predefined-models/README.md @@ -26,8 +26,7 @@ This Helm chart provides a collection of predefined models and serving runtimes ### Install the chart ```bash -# Add the repository (if using a Helm repository) -helm repo add ome +helm repo add ome https://sgl-project.github.io/ome helm repo update # Install with default values @@ -193,35 +192,6 @@ runtimes: enabled: true ``` -## Monitoring - -All deployed runtimes include Prometheus metrics endpoints configured for monitoring: - -- Metrics endpoint: `/metrics` -- Health check endpoint: `/health` -- Generate health check: `/health_generate` (for SRT runtimes) - -## Troubleshooting - -### Common Issues - -1. **Models not downloading**: Ensure proper Hugging Face token is configured -2. **GPU resources**: Verify GPU nodes have sufficient resources -3. **RDMA configuration**: For RDMA-enabled runtimes, ensure proper network setup - -### Debugging Commands - -```bash -# Check deployed models -kubectl get clusterbasemodels - -# Check deployed runtimes -kubectl get clusterservingruntimes - -# Check pod logs -kubectl logs -l app=ome-predefined-models -``` - ## Contributing To add new models or runtimes: