diff --git a/config/charts/inferencepool/README.md b/config/charts/inferencepool/README.md
index 3bac9b613..5bbbac7cb 100644
--- a/config/charts/inferencepool/README.md
+++ b/config/charts/inferencepool/README.md
@@ -83,18 +83,20 @@ $ helm install triton-llama3-8b-instruct \
 
 To deploy the EndpointPicker in a high-availability (HA) active-passive configuration, you can enable leader election. When enabled, the EPP deployment will have multiple replicas, but only one "leader" replica will be active and ready to process traffic at any given time. If the leader pod fails, another pod will be elected as the new leader, ensuring service continuity.
 
-To enable HA, set `inferenceExtension.enableLeaderElection` to `true` and increase the number of replicas in your `values.yaml` file:
+To enable HA, add the `has-enable-leader-election` flag to `inferenceExtension.flags` with value `true` and increase the number of replicas in your `values.yaml` file:
 
 ```yaml
 inferenceExtension:
   replicas: 3
-  enableLeaderElection: true
+  flags:
+  - name: has-enable-leader-election
+    value: true
 ```
 
 Then apply it with:
 
 ```txt
-helm install vllm-llama3-8b-instruct ./config/charts/inferencepool -f values.yaml \
+helm install vllm-llama3-8b-instruct ./config/charts/inferencepool -f values.yaml
 ```
 
 ## Uninstall
@@ -122,10 +122,9 @@ The following table list the configurable parameters of the chart.
 | `inferenceExtension.env` | List of environment variables to set in the endpoint picker container as free-form YAML. Defaults to `[]`. |
 | `inferenceExtension.extraContainerPorts` | List of additional container ports to expose. Defaults to `[]`. |
 | `inferenceExtension.extraServicePorts` | List of additional service ports to expose. Defaults to `[]`. |
-| `inferenceExtension.flags` | List of flags which are passed through to endpoint picker. |
+| `inferenceExtension.flags` | List of flags passed through to the endpoint picker, for example `enable-pprof` and `grpc-port`. See [runner.go](https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/cmd/epp/runner/runner.go) for the complete list. |
+| `inferenceExtension.flags.has-enable-leader-election` | Enable leader election for high availability. When enabled, only one EPP pod (the leader) will be ready to serve traffic. It is recommended to set `inferenceExtension.replicas` to a value greater than 1 when this flag is enabled. |
 | `provider.name` | Name of the Inference Gateway implementation being used. Possible values: `gke`. Defaults to `none`. |
-| `inferenceExtension.enableLeaderElection` | Enable leader election for high availability. When enabled, only one EPP pod (the leader) will be ready to serve traffic. It is recommended to set `inferenceExtension.replicas` to a value greater than 1 when this is set to `true`. Defaults to `false`. |
-
 
 ## Notes
 
diff --git a/config/charts/inferencepool/values.yaml b/config/charts/inferencepool/values.yaml
index 8d483656e..443085ef5 100644
--- a/config/charts/inferencepool/values.yaml
+++ b/config/charts/inferencepool/values.yaml
@@ -32,51 +32,9 @@ inferenceExtension:
   #   ENABLE_EXPERIMENTAL_FEATURE: "true"
 
   flags:
-  - name: grpc-port
-    value: 9002
-  - name: grpc-health-port
-    value: 9003
-  - name: metrics-port
-    value: 9090
-  - name: enable-pprof
-    value: "true" # Enable pprof handlers for profiling and debugging
-  - name: pool-group
-    value: "inference.networking.k8s.io"
   # Log verbosity
   - name: v
     value: 1
-  - name: secure-serving
-    value: "true"
-  - name: health-checking
-    value: "false"
-  - name: cert-path
-    value: ""
-  - name: total-queued-requests-metric
-    value: "vllm:num_requests_waiting"
-  - name: kv-cache-usage-percentage-metric
-    value: "vllm:gpu_cache_usage_perc"
-  - name: lora-info-metric
-    value: "vllm:lora_requests_info"
-  - name: refresh-metrics-interval
-    value: "50ms"
-  - name: refresh-prometheus-metrics-interval
-    value: "5s"
-  - name: metrics-staleness-threshold
-    value: "2s"
-  - name: config-file
-    value: ""
-  - name: config-text
-    value: ""
-  - name: model-server-metrics-port
-    value: 0
-  - name: model-server-metrics-path
-    value: "/metrics"
-  - name: model-server-metrics-scheme
-    value: "http"
-  - name: model-server-metrics-https-insecure-skip-verify
-    value: "true"
-  - name: has-enable-leader-election
-    value: false
 
 inferencePool:
   targetPorts:
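
A note for chart users adapting the HA example above: because `inferenceExtension.flags` is a plain YAML list, Helm replaces the whole list when it is overridden from a user values file or `--set`; it does not merge entries with the chart defaults. A user values file that enables leader election should therefore restate any default flags it still wants to keep. This is a minimal sketch, not part of the diff; the `v` entry mirrors the single default flag left in `values.yaml` above:

```yaml
# values.yaml (user override) -- minimal HA sketch.
inferenceExtension:
  replicas: 3
  flags:
  # Restated chart default: user-supplied lists replace the chart's
  # flags list wholesale, so carry over any defaults you still want.
  - name: v
    value: 1
  # New in this change: run the EPP in HA active-passive mode.
  - name: has-enable-leader-election
    value: true
```

This file would be applied with the same `helm install ... -f values.yaml` command shown in the README hunk.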