Skip to content

Commit 1e9ea89

Browse files
authored
Read metrics Datasource configuration from config file (#2441)
* Read metrics Datasource configuration from config file Signed-off-by: mohamedmahameed <[email protected]> * Add MetricsDataSource to values, update go.mod Signed-off-by: mohamedmahameed <[email protected]> * rename plugin, keep supporting command line flags for depracated flags Signed-off-by: mohamedmahameed <[email protected]> * revert pflag import alias Signed-off-by: mohamedmahameed <[email protected]> * Add test for config precedence, add f.Changed check to check cli flag set/unset Signed-off-by: mohamedmahameed <[email protected]> * rename factory and New method to match new naming Signed-off-by: mohamedmahameed <[email protected]> --------- Signed-off-by: mohamedmahameed <[email protected]>
1 parent 5f9b768 commit 1e9ea89

12 files changed

Lines changed: 269 additions & 80 deletions

File tree

cmd/epp/runner/runner.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -462,7 +462,7 @@ func (r *Runner) registerInTreePlugins() {
462462
fwkplugin.Register(testresponsereceived.DestinationEndpointServedVerifierType, testresponsereceived.DestinationEndpointServedVerifierFactory)
463463
// register datalayer metrics collection plugins
464464
fwkplugin.Register(sourcemetrics.MetricsDataSourceType, sourcemetrics.MetricsDataSourceFactory)
465-
fwkplugin.Register(extractormetrics.MetricsExtractorType, extractormetrics.ModelServerExtractorFactory)
465+
fwkplugin.Register(extractormetrics.MetricsExtractorType, extractormetrics.CoreMetricsExtractorFactory)
466466
// register datalayer k8s notification source plugin
467467
fwkplugin.Register(sourcenotifications.NotificationSourceType, sourcenotifications.NotificationSourceFactory)
468468
// register request control pluigns

config/charts/epplib/templates/_config.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,12 @@ data:
1212
- type: queue-scorer
1313
- type: kv-cache-utilization-scorer
1414
- type: prefix-cache-scorer
15+
- type: metrics-data-source
16+
parameters:
17+
scheme: {{ .Values.inferenceExtension.metricsDataSource.scheme | default "http" | quote }}
18+
path: {{ .Values.inferenceExtension.metricsDataSource.path | default "/metrics" | quote }}
19+
insecureSkipVerify: {{ .Values.inferenceExtension.metricsDataSource.insecureSkipVerify | default true }}
20+
- type: core-metrics-extractor
1521
{{- if .Values.inferenceExtension.latencyPredictor.enabled }}
1622
- type: predicted-latency-scorer
1723
parameters:

config/charts/inferencepool/values.yaml

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,16 @@ inferenceExtension:
9494
# common latencyPredictor setting exists in config/charts/inference-extension/values.yaml
9595
enabled: false
9696

97+
# Metrics DataSource Configuration
98+
# These values configure how the EPP scrapes metrics from model server pods.
99+
metricsDataSource:
100+
# scheme is the HTTP scheme used to scrape metrics (http or https).
101+
scheme: "http"
102+
# path is the URL path on the model server pod that exposes Prometheus metrics.
103+
path: "/metrics"
104+
# insecureSkipVerify disables TLS certificate verification when scheme is https.
105+
insecureSkipVerify: true
106+
97107
inferencePool:
98108
targetPorts:
99109
- number: 8000

config/charts/standalone/values.yaml

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -301,6 +301,16 @@ inferenceExtension:
301301
# common latencyPredictor setting exists in config/charts/inference-extension/values.yaml
302302
enabled: false
303303

304+
# Metrics DataSource Configuration
305+
# These values configure how the EPP scrapes metrics from model server pods.
306+
metricsDataSource:
307+
# scheme is the HTTP scheme used to scrape metrics (http or https).
308+
scheme: "http"
309+
# path is the URL path on the model server pod that exposes Prometheus metrics.
310+
path: "/metrics"
311+
# insecureSkipVerify disables TLS certificate verification when scheme is https.
312+
insecureSkipVerify: true
313+
304314
# Options: ["gke"]
305315
provider:
306316
name: none

pkg/epp/backend/metrics/podmetrics_parity_test.go

Lines changed: 11 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ package metrics
1818

1919
import (
2020
"context"
21+
"encoding/json"
2122
"errors"
2223
"fmt"
2324
"net/http"
@@ -30,7 +31,6 @@ import (
3031
"github.com/google/go-cmp/cmp/cmpopts"
3132
"github.com/prometheus/client_golang/prometheus"
3233
"github.com/prometheus/client_golang/prometheus/promhttp"
33-
"github.com/spf13/pflag"
3434
"github.com/stretchr/testify/assert"
3535

3636
fwkdl "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/framework/interface/datalayer"
@@ -316,15 +316,13 @@ func parseWithDatalayerMetrics(t *testing.T, ctx context.Context, urlStr string)
316316
return nil, fmt.Errorf("failed to parse URL: %w", err)
317317
}
318318

319-
cleanup := setupTestFlags(t) // set-up test flags and restore on cleanup
320-
defer cleanup()
321-
322-
// CLI flags to match the test server URL
323-
if err := pflag.CommandLine.Set("model-server-metrics-scheme", parsedURL.Scheme); err != nil {
324-
return nil, fmt.Errorf("failed to set scheme flag: %w", err)
325-
}
326-
if err := pflag.CommandLine.Set("model-server-metrics-path", parsedURL.Path); err != nil {
327-
return nil, fmt.Errorf("failed to set path flag: %w", err)
319+
// Pass scheme and path directly as plugin parameters — no CLI flags needed.
320+
params, err := json.Marshal(map[string]any{
321+
"scheme": parsedURL.Scheme,
322+
"path": parsedURL.Path,
323+
})
324+
if err != nil {
325+
return nil, fmt.Errorf("failed to marshal datasource parameters: %w", err)
328326
}
329327

330328
mapping, err := metricextractor.NewMapping(
@@ -343,15 +341,15 @@ func parseWithDatalayerMetrics(t *testing.T, ctx context.Context, urlStr string)
343341
return nil, fmt.Errorf("failed to register mapping: %w", err)
344342
}
345343

346-
extractor, err := metricextractor.NewModelServerExtractor(registry, metricextractor.DefaultEngineTypeLabelKey)
344+
extractor, err := metricextractor.NewCoreMetricsExtractor(registry, metricextractor.DefaultEngineTypeLabelKey)
347345
if err != nil {
348346
return nil, fmt.Errorf("failed to create extractor: %w", err)
349347
}
350348

351349
plugin, err := sourcemetrics.MetricsDataSourceFactory(
352350
"test-metrics-source",
353-
nil, // use default parameters from flags
354-
nil, // no plugin handle needed for test
351+
params, // configure scheme and path via parameters
352+
nil, // no plugin handle needed for test
355353
)
356354
if err != nil {
357355
return nil, fmt.Errorf("failed to create data source: %w", err)
@@ -381,22 +379,6 @@ func parseWithDatalayerMetrics(t *testing.T, ctx context.Context, urlStr string)
381379
return endpoint.GetMetrics(), nil
382380
}
383381

384-
// setupTestFlags creates a temporary FlagSet for testing and returns a cleanup function
385-
func setupTestFlags(t *testing.T) func() {
386-
t.Helper()
387-
originalFlags := pflag.CommandLine
388-
testFlags := pflag.NewFlagSet("test", pflag.ContinueOnError)
389-
pflag.CommandLine = testFlags
390-
391-
testFlags.String("model-server-metrics-scheme", "http", "Protocol scheme used in scraping metrics from endpoints")
392-
testFlags.String("model-server-metrics-path", "/metrics", "URL path used in scraping metrics from endpoints")
393-
testFlags.Bool("model-server-metrics-https-insecure-skip-verify", false, "Skip TLS verification for HTTPS metrics endpoints")
394-
395-
return func() {
396-
pflag.CommandLine = originalFlags
397-
}
398-
}
399-
400382
// createMockServer creates an HTTP test server that serves Prometheus metrics
401383
func createMockServer(metrics []MetricMock) *httptest.Server {
402384
reg := prometheus.NewRegistry()

pkg/epp/framework/plugins/datalayer/extractor/metrics/extractor.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -76,9 +76,9 @@ func Produces() map[string]any {
7676
}
7777
}
7878

79-
// NewModelServerExtractor returns a new model server protocol (MSP) metrics extractor,
79+
// NewCoreMetricsExtractor returns a new model server protocol (MSP) metrics extractor,
8080
// configured with the given metrics' registry.
81-
func NewModelServerExtractor(registry *MappingRegistry, engineLabelKey string) (*Extractor, error) {
81+
func NewCoreMetricsExtractor(registry *MappingRegistry, engineLabelKey string) (*Extractor, error) {
8282
if registry == nil {
8383
return nil, errors.New("mapping registry cannot be nil")
8484
}

pkg/epp/framework/plugins/datalayer/extractor/metrics/extractor_test.go

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ const (
4343
func TestExtractorExtract(t *testing.T) {
4444
ctx := context.Background()
4545

46-
if _, err := NewModelServerExtractor(nil, ""); err == nil {
46+
if _, err := NewCoreMetricsExtractor(nil, ""); err == nil {
4747
t.Error("expected to fail to create extractor with nil registry")
4848
}
4949

@@ -57,7 +57,7 @@ func TestExtractorExtract(t *testing.T) {
5757
t.Fatalf("failed to register mapping: %v", err)
5858
}
5959

60-
extractor, err := NewModelServerExtractor(registry, "")
60+
extractor, err := NewCoreMetricsExtractor(registry, "")
6161
if err != nil {
6262
t.Fatalf("failed to create extractor: %v", err)
6363
}
@@ -227,7 +227,7 @@ func TestExtractorMultiEngine(t *testing.T) {
227227
mSgl, _ := NewMapping("sglang:num_queue_reqs", "sglang:num_running_reqs", "", "", "")
228228
_ = registry.Register("sglang", mSgl)
229229

230-
extractor, _ := NewModelServerExtractor(registry, "")
230+
extractor, _ := NewCoreMetricsExtractor(registry, "")
231231

232232
// Sample metric data
233233
data := sourcemetrics.PrometheusMetricMap{
@@ -276,7 +276,7 @@ func TestBackwardCompatibility(t *testing.T) {
276276
mDef, _ := NewMapping("vllm:num_requests_waiting", "", "", "", "")
277277
_ = registry.Register(DefaultEngineType, mDef)
278278

279-
extractor, _ := NewModelServerExtractor(registry, "")
279+
extractor, _ := NewCoreMetricsExtractor(registry, "")
280280

281281
data := sourcemetrics.PrometheusMetricMap{
282282
"vllm:num_requests_waiting": &dto.MetricFamily{
@@ -306,7 +306,7 @@ func TestBackwardCompatibility(t *testing.T) {
306306
}
307307
}
308308

309-
func TestModelServerExtractorFactoryDefaultEngine(t *testing.T) {
309+
func TestCoreMetricsExtractorFactoryDefaultEngine(t *testing.T) {
310310
tests := []struct {
311311
name string
312312
params map[string]any
@@ -475,7 +475,7 @@ func TestModelServerExtractorFactoryDefaultEngine(t *testing.T) {
475475
}
476476
}
477477

478-
plugin, err := ModelServerExtractorFactory("test", params, nil)
478+
plugin, err := CoreMetricsExtractorFactory("test", params, nil)
479479

480480
if tt.wantErr {
481481
if err == nil {

pkg/epp/framework/plugins/datalayer/extractor/metrics/factories.go

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ import (
2525
)
2626

2727
const (
28-
MetricsExtractorType = "model-server-protocol-metrics"
28+
MetricsExtractorType = "core-metrics-extractor"
2929
)
3030

3131
// Configuration parameters for metrics data source and extractor.
@@ -83,9 +83,9 @@ var defaultEngineConfigs = []engineConfigParams{
8383
// defaultEngineName is the default engine used when defaultEngine is not specified.
8484
const defaultEngineName = "vllm"
8585

86-
// ModelServerExtractorFactory is a factory function used to instantiate data layer's metrics
86+
// CoreMetricsExtractorFactory is a factory function used to instantiate data layer's metrics
8787
// Extractor plugins specified in a configuration.
88-
func ModelServerExtractorFactory(name string, parameters json.RawMessage, handle fwkplugin.Handle) (fwkplugin.Plugin, error) {
88+
func CoreMetricsExtractorFactory(name string, parameters json.RawMessage, handle fwkplugin.Handle) (fwkplugin.Plugin, error) {
8989
cfg := defaultExtractorConfigParams()
9090

9191
if parameters != nil { // overlay the defaults with configured values
@@ -157,7 +157,7 @@ func ModelServerExtractorFactory(name string, parameters json.RawMessage, handle
157157
return nil, fmt.Errorf("failed to register default mapping: %w", err)
158158
}
159159

160-
extractor, err := NewModelServerExtractor(registry, cfg.EngineLabelKey)
160+
extractor, err := NewCoreMetricsExtractor(registry, cfg.EngineLabelKey)
161161
if err != nil {
162162
return nil, err
163163
}

pkg/epp/framework/plugins/datalayer/source/metrics/datasource.go

Lines changed: 49 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,15 @@ import (
3232

3333
const MetricsDataSourceType = "metrics-data-source"
3434

35-
// Data source configuration parameters
35+
// Default values for the metrics data source configuration.
36+
const (
37+
defaultMetricsScheme = "http"
38+
defaultMetricsPath = "/metrics"
39+
defaultMetricsInsecureSkipVerify = true
40+
)
41+
42+
// metricsDatasourceParams holds the configuration parameters for the metrics data source plugin.
43+
// These values can be specified in the EndpointPickerConfig under the plugin's `parameters` field.
3644
type metricsDatasourceParams struct {
3745
// Scheme defines the protocol scheme used in metrics retrieval (e.g., "http").
3846
Scheme string `json:"scheme"`
@@ -60,68 +68,75 @@ func MetricsDataSourceFactory(name string, parameters json.RawMessage, handle fw
6068
name, parseMetrics, PrometheusMetricType)
6169
}
6270

63-
// Names of CLI flags in main
64-
//
65-
// TODO:
71+
// These flags are registered in options.go (server package) and marked as deprecated there.
72+
// They are kept for one release cycle to give users time to migrate their configuration
73+
// to the EndpointPickerConfig `parameters` field (metricsDatasourceParams).
74+
// They will be removed in a future release.
6675
//
67-
// 1. Consider having a cli package with all flag names and constants?
68-
// Can't use values from runserver as this creates an import cycle with datalayer.
69-
// Given that relevant issues/PRs have been closed so may be able to remove the cycle?
70-
// Comment from runserver package (regarding TestPodMetricsClient *backendmetrics.FakePodMetricsClient)
71-
// This should only be used in tests. We won't need this once we do not inject metrics in the tests.
72-
// TODO:(https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/432) Cleanup
73-
//
74-
// 2. Deprecation notice on these flags being moved to the configuration file
76+
// TODO: Remove these constants and defaultDataSourceConfigParams() once the deprecated flags
77+
// are removed from options.go.
78+
// Note: these flag names are duplicated here (rather than imported from the server package)
79+
// to avoid an import cycle between the datalayer plugin and the server/runserver packages.
7580
const (
7681
modelServerMetricsPathFlag = "model-server-metrics-path"
7782
modelServerMetricsSchemeFlag = "model-server-metrics-scheme"
7883
modelServerMetricsInsecureSkipVerifyFlag = "model-server-metrics-https-insecure-skip-verify"
7984
)
8085

81-
// return the default configuration state. The defaults are populated from
82-
// existing command line flags.
86+
// DataSource parameters values - Priority (lowest → highest):
87+
// 1. Built-in defaults (defaultMetricsScheme / defaultMetricsPath / defaultMetricsInsecureSkipVerify)
88+
// 2. Deprecated CLI flag value (when the flag is registered and has been set by the operator)
89+
// 3. Explicit plugin `parameters` in EndpointPickerConfig
8390
func defaultDataSourceConfigParams() (*metricsDatasourceParams, error) {
84-
cfg := &metricsDatasourceParams{}
91+
cfg := &metricsDatasourceParams{
92+
Scheme: defaultMetricsScheme,
93+
Path: defaultMetricsPath,
94+
InsecureSkipVerify: defaultMetricsInsecureSkipVerify,
95+
}
8596

86-
scheme, err := fromStringFlag(modelServerMetricsSchemeFlag)
87-
if err != nil {
88-
return nil, err
97+
if scheme, ok := fromStringFlag(modelServerMetricsSchemeFlag); ok {
98+
cfg.Scheme = scheme
8999
}
90-
cfg.Scheme = scheme
91100

92-
path, err := fromStringFlag(modelServerMetricsPathFlag)
93-
if err != nil {
94-
return nil, err
101+
if path, ok := fromStringFlag(modelServerMetricsPathFlag); ok {
102+
cfg.Path = path
95103
}
96-
cfg.Path = path
97104

98-
insecure, err := fromBoolFlag(modelServerMetricsInsecureSkipVerifyFlag)
99-
if err != nil {
105+
if insecure, ok, err := fromBoolFlag(modelServerMetricsInsecureSkipVerifyFlag); err != nil {
100106
return nil, err
107+
} else if ok {
108+
cfg.InsecureSkipVerify = insecure
101109
}
102-
cfg.InsecureSkipVerify = insecure
103110

104111
return cfg, nil
105112
}
106113

107-
func fromStringFlag(name string) (string, error) {
114+
// fromStringFlag returns the value of a registered pflag string flag.
115+
// The second return value is false when the flag is not registered; no error is returned in that case.
116+
func fromStringFlag(name string) (string, bool) {
108117
f := pflag.Lookup(name)
109-
if f == nil {
110-
return "", fmt.Errorf("flag not found: %s", name)
118+
if f == nil || !f.Changed {
119+
return "", false
111120
}
112-
return f.Value.String(), nil
121+
return f.Value.String(), true
113122
}
114123

115-
func fromBoolFlag(name string) (bool, error) {
124+
// fromBoolFlag returns the value of a registered pflag bool flag.
125+
// The second return value is false when the flag is not registered; no error is returned in that case.
126+
// An error is returned only when the flag exists but its value cannot be parsed as a bool.
127+
func fromBoolFlag(name string) (bool, bool, error) {
116128
f := pflag.Lookup(name)
117129
if f == nil {
118-
return false, fmt.Errorf("flag not found: %s", name)
130+
return false, false, nil
131+
}
132+
if !f.Changed {
133+
return false, false, nil // user did NOT provide it
119134
}
120135
b, err := strconv.ParseBool(f.Value.String())
121136
if err != nil {
122-
return false, fmt.Errorf("invalid bool flag %q: %w", name, err)
137+
return false, false, fmt.Errorf("invalid bool flag %q: %w", name, err)
123138
}
124-
return b, nil
139+
return b, true, nil
125140
}
126141

127142
func parseMetrics(data io.Reader) (any, error) {

0 commit comments

Comments
 (0)