Skip to content
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
147 changes: 61 additions & 86 deletions cmd/epp/runner/runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@ package runner

import (
"context"
"crypto/tls"
"errors"
"flag"
"fmt"
Expand Down Expand Up @@ -119,6 +118,8 @@ type Runner struct {
requestControlConfig *requestcontrol.Config
schedulerConfig *scheduling.SchedulerConfig
customCollectors []prometheus.Collector

testOverrideSkipNameValidation bool
}

// WithExecutableName sets the name of the executable containing the runner.
Expand Down Expand Up @@ -184,42 +185,73 @@ func (r *Runner) Run(ctx context.Context) error {
return err
}

rawConfig, err := r.parseConfigurationPhaseOne(ctx, opts)
pmc, err := backendmetrics.NewPodMetricsClientImpl(setupLog, backendmetrics.Config{
ModelServerMetricsScheme: opts.ModelServerMetricsScheme,
ModelServerMetricsHTTPSInsecure: opts.ModelServerMetricsHTTPSInsecure,
ModelServerMetricsPath: opts.ModelServerMetricsPath,

TotalQueuedRequestsMetric: opts.TotalQueuedRequestsMetric,
TotalRunningRequestsMetric: opts.TotalRunningRequestsMetric,
KVCacheUsagePercentageMetric: opts.KVCacheUsagePercentageMetric,
LoRAInfoMetric: opts.LoRAInfoMetric,
CacheInfoMetric: opts.CacheInfoMetric,
})
if err != nil {
setupLog.Error(err, "Failed to parse configuration")
return err
}

// --- Setup Datastore ---
epf, err := r.setupMetricsCollection(r.featureGates[datalayer.ExperimentalDatalayerFeatureGate], opts)
mgr, _, err := r.setup(ctx, cfg, opts, pmc)
if err != nil {
return err
}

// --- Start Manager ---
// This blocks until a signal is received.
setupLog.Info("Controller manager starting")
if err := mgr.Start(ctx); err != nil {
setupLog.Error(err, "Error starting controller manager")
return err
}
setupLog.Info("Controller manager terminated")
return nil
}

// setup configures the internal state of the Runner, including the manager,
// datastore, and other server components. It returns the initialized Manager
// without starting it, so that integration tests can decide when to start it.
//
// The returned Datastore is **only** meant to be used by integration tests.
func (r *Runner) setup(ctx context.Context, cfg *rest.Config, opts *runserver.Options, pmc backendmetrics.PodMetricsClient) (ctrl.Manager, datastore.Datastore, error) {
rawConfig, err := r.parseConfigurationPhaseOne(ctx, opts)
if err != nil {
setupLog.Error(err, "Failed to parse configuration")
return nil, nil, err
}

epf := r.setupMetricsCollection(r.featureGates[datalayer.ExperimentalDatalayerFeatureGate], opts, pmc)
gknn, err := extractGKNN(opts.PoolName, opts.PoolGroup, opts.PoolNamespace, opts.EndpointSelector)
if err != nil {
setupLog.Error(err, "Failed to extract GKNN")
return err
return nil, nil, err
}

startCrdReconcilers := opts.EndpointSelector == "" // If endpointSelector is empty, it means it's not in the standalone mode. Then we should start the inferencePool and other CRD Reconciler.
controllerCfg := runserver.NewControllerConfig(startCrdReconcilers)

if err := controllerCfg.PopulateControllerConfig(cfg); err != nil {
setupLog.Error(err, "Failed to populate controller config")
return err
return nil, nil, err
}

ds, err := setupDatastore(ctx, epf, int32(opts.ModelServerMetricsPort), startCrdReconcilers,
opts.PoolName, opts.PoolNamespace, opts.EndpointSelector, opts.EndpointTargetPorts)
opts.PoolNamespace, opts.PoolName, opts.EndpointSelector, opts.EndpointTargetPorts)
if err != nil {
setupLog.Error(err, "Failed to setup datastore")
return err
return nil, nil, err
}
eppConfig, err := r.parseConfigurationPhaseTwo(ctx, rawConfig, ds)
if err != nil {
setupLog.Error(err, "Failed to parse configuration")
return err
return nil, nil, err
}

// --- Setup Metrics Server ---
Expand All @@ -245,10 +277,13 @@ func (r *Runner) Run(ctx context.Context) error {
isLeader := &atomic.Bool{}
isLeader.Store(false)

mgr, err := runserver.NewDefaultManager(controllerCfg, *gknn, cfg, metricsServerOptions, opts.EnableLeaderElection)
mgr, err := runserver.NewDefaultManager(controllerCfg, *gknn, cfg, metricsServerOptions, opts.EnableLeaderElection, r.testOverrideSkipNameValidation)
if r.testOverrideSkipNameValidation {
setupLog.Info("Warning: testOverrideSkipNameValidation is set to true, this should be only used in test.")
}
if err != nil {
setupLog.Error(err, "Failed to create controller manager")
return err
return nil, nil, err
}

if opts.EnableLeaderElection {
Expand All @@ -267,15 +302,15 @@ func (r *Runner) Run(ctx context.Context) error {
setupLog.Info("Setting pprof handlers")
if err = profiling.SetupPprofHandlers(mgr); err != nil {
setupLog.Error(err, "Failed to setup pprof handlers")
return err
return nil, nil, err
}
}

// --- Initialize Core EPP Components ---
if r.schedulerConfig == nil {
err := errors.New("scheduler config must be set either by config api or through code")
setupLog.Error(err, "failed to create scheduler")
return err
return nil, nil, err
}

setupLog.Info("parsed config", "scheduler-config", r.schedulerConfig)
Expand All @@ -285,7 +320,7 @@ func (r *Runner) Run(ctx context.Context) error {
datalayerMetricsEnabled := r.featureGates[datalayer.ExperimentalDatalayerFeatureGate]
if err := r.setupDataLayer(datalayerMetricsEnabled, eppConfig.DataConfig, epf, mgr); err != nil {
setupLog.Error(err, "failed to initialize data layer")
return err
return nil, nil, err
}

saturationDetector := utilizationdetector.NewDetector(eppConfig.SaturationDetectorConfig, setupLog)
Expand All @@ -299,7 +334,7 @@ func (r *Runner) Run(ctx context.Context) error {
setupLog.Info("Initializing experimental Flow Control layer")
registry, err := fcregistry.NewFlowRegistry(eppConfig.FlowControlConfig.Registry, setupLog)
if err != nil {
return fmt.Errorf("failed to initialize Flow Registry: %w", err)
return nil, nil, fmt.Errorf("failed to initialize Flow Registry: %w", err)
}
fc, err := fccontroller.NewFlowController(
ctx,
Expand All @@ -308,7 +343,7 @@ func (r *Runner) Run(ctx context.Context) error {
locator,
)
if err != nil {
return fmt.Errorf("failed to initialize Flow Controller: %w", err)
return nil, nil, fmt.Errorf("failed to initialize Flow Controller: %w", err)
}
go registry.Run(ctx)
admissionController = requestcontrol.NewFlowControlAdmissionController(fc, opts.PoolName)
Expand All @@ -335,31 +370,23 @@ func (r *Runner) Run(ctx context.Context) error {
SaturationDetector: saturationDetector,
UseExperimentalDatalayerV2: r.featureGates[datalayer.ExperimentalDatalayerFeatureGate], // pluggable data layer feature flag
}

if err := serverRunner.SetupWithManager(mgr); err != nil {
setupLog.Error(err, "Failed to setup EPP controllers")
return err
return nil, nil, err
}

// --- Add Runnables to Manager ---
// Register health server.
if err := registerHealthServer(mgr, ctrl.Log.WithName("health"), ds, opts.GRPCHealthPort, isLeader, opts.EnableLeaderElection); err != nil {
return err
return nil, nil, err
}

// Register ext-proc server.
if err := registerExtProcServer(mgr, serverRunner, ctrl.Log.WithName("ext-proc")); err != nil {
return err
}

// --- Start Manager ---
// This blocks until a signal is received.
setupLog.Info("Controller manager starting")
if err := mgr.Start(ctx); err != nil {
setupLog.Error(err, "Error starting controller manager")
return err
return nil, nil, err
}
setupLog.Info("Controller manager terminated")
return nil
return mgr, ds, nil
}

// NewEndpointPoolFromOptions constructs an EndpointPool from standalone options.
Expand Down Expand Up @@ -596,48 +623,11 @@ func (r *Runner) setupDataLayer(enableNewMetrics bool, cfg *datalayer.Config,
return nil
}

func (r *Runner) setupMetricsCollection(enableNewMetrics bool, opts *runserver.Options) (datalayer.EndpointFactory, error) {
func (r *Runner) setupMetricsCollection(enableNewMetrics bool, opts *runserver.Options, pmc backendmetrics.PodMetricsClient) datalayer.EndpointFactory {
if enableNewMetrics {
return datalayer.NewEndpointFactory(nil, opts.RefreshMetricsInterval), nil
}
return setupMetricsV1(opts)
}

func setupMetricsV1(opts *runserver.Options) (datalayer.EndpointFactory, error) {
mapping, err := backendmetrics.NewMetricMapping(
opts.TotalQueuedRequestsMetric,
opts.TotalRunningRequestsMetric,
opts.KVCacheUsagePercentageMetric,
opts.LoRAInfoMetric,
opts.CacheInfoMetric,
)
if err != nil {
setupLog.Error(err, "Failed to create metric mapping from flags.")
return nil, err
}
verifyMetricMapping(*mapping)

var metricsHttpClient *http.Client
if opts.ModelServerMetricsScheme == "https" {
metricsHttpClient = &http.Client{
Transport: &http.Transport{
TLSClientConfig: &tls.Config{
InsecureSkipVerify: opts.ModelServerMetricsHTTPSInsecure,
},
},
}
} else {
metricsHttpClient = http.DefaultClient
return datalayer.NewEndpointFactory(nil, opts.RefreshMetricsInterval)
}

pmf := backendmetrics.NewPodMetricsFactory(&backendmetrics.PodMetricsClientImpl{
MetricMapping: mapping,
ModelServerMetricsPath: opts.ModelServerMetricsPath,
ModelServerMetricsScheme: opts.ModelServerMetricsScheme,
Client: metricsHttpClient,
},
opts.RefreshMetricsInterval)
return pmf, nil
return backendmetrics.NewPodMetricsFactory(pmc, opts.RefreshMetricsInterval)
}

// registerExtProcServer adds the ExtProcServerRunner as a Runnable to the manager.
Expand Down Expand Up @@ -667,21 +657,6 @@ func registerHealthServer(mgr manager.Manager, logger logr.Logger, ds datastore.
return nil
}

func verifyMetricMapping(mapping backendmetrics.MetricMapping) {
if mapping.TotalQueuedRequests == nil {
setupLog.Info("Not scraping metric: TotalQueuedRequests")
}
if mapping.KVCacheUtilization == nil {
setupLog.Info("Not scraping metric: KVCacheUtilization")
}
if mapping.LoraRequestInfo == nil {
setupLog.Info("Not scraping metric: LoraRequestInfo")
}
if mapping.CacheConfigInfo == nil {
setupLog.Info("Not scraping metric: CacheConfigInfo")
}
}

func extractDeploymentName(podName string) (string, error) {
regex := regexp.MustCompile(`^(.+)-[a-z0-9]+-[a-z0-9]+$`)

Expand Down
34 changes: 34 additions & 0 deletions cmd/epp/runner/test_runner.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
/*
Copyright 2025 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package runner

import (
"context"

"k8s.io/client-go/rest"
ctrl "sigs.k8s.io/controller-runtime"
backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics"
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore"
runserver "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/server"
)

// NewTestRunnerSetup is the integration-test entry point into Runner.setup.
// It creates a fresh Runner via NewRunner, turns on
// testOverrideSkipNameValidation, and then runs setup with the supplied
// arguments, handing back exactly what setup returns: the manager, the
// datastore, and any setup error.
func NewTestRunnerSetup(ctx context.Context, cfg *rest.Config, opts *runserver.Options, pmc backendmetrics.PodMetricsClient) (ctrl.Manager, datastore.Datastore, error) {
	r := NewRunner()
	// Must be set before setup runs, because setup reads the flag.
	r.testOverrideSkipNameValidation = true

	mgr, ds, err := r.setup(ctx, cfg, opts, pmc)
	return mgr, ds, err
}
62 changes: 62 additions & 0 deletions pkg/epp/backend/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,13 @@ package metrics

import (
"context"
"crypto/tls"
"fmt"
"net/http"
"strconv"
"strings"

"github.com/go-logr/logr"
dto "github.com/prometheus/client_model/go"
"github.com/prometheus/common/expfmt"
"github.com/prometheus/common/model"
Expand All @@ -49,6 +51,66 @@ type PodMetricsClientImpl struct {
Client *http.Client
}

// Config carries the settings that NewPodMetricsClientImpl reads when it
// builds a PodMetricsClient.
type Config struct {
// ModelServerMetricsScheme is copied onto the resulting client. When it is
// exactly "https", NewPodMetricsClientImpl builds a dedicated HTTP client
// with its own TLS configuration; for any other value it uses
// http.DefaultClient.
ModelServerMetricsScheme string
// ModelServerMetricsHTTPSInsecure becomes tls.Config.InsecureSkipVerify on
// the dedicated client. It is only consulted when the scheme is "https".
ModelServerMetricsHTTPSInsecure bool
// ModelServerMetricsPath is copied unchanged onto the resulting client.
ModelServerMetricsPath string

// The five fields below are passed, in this order, to NewMetricMapping.
TotalQueuedRequestsMetric string
TotalRunningRequestsMetric string
KVCacheUsagePercentageMetric string
LoRAInfoMetric string
CacheInfoMetric string
}

// NewPodMetricsClientImpl builds a PodMetricsClient backed by
// PodMetricsClientImpl from the given config.
//
// It first creates the metric mapping from the five metric fields of config
// and returns the error from NewMetricMapping unchanged if that fails. The
// mapping is then passed to verifyMetricMapping, which logs through logger.
//
// When config.ModelServerMetricsScheme is "https", the client gets its own
// http.Client whose TLS verification is controlled by
// config.ModelServerMetricsHTTPSInsecure. For any other scheme the shared
// http.DefaultClient is used.
func NewPodMetricsClientImpl(logger logr.Logger, config Config) (PodMetricsClient, error) {
	metricMapping, err := NewMetricMapping(
		config.TotalQueuedRequestsMetric,
		config.TotalRunningRequestsMetric,
		config.KVCacheUsagePercentageMetric,
		config.LoRAInfoMetric,
		config.CacheInfoMetric,
	)
	if err != nil {
		return nil, err
	}
	verifyMetricMapping(logger, *metricMapping)

	// Start from the shared default client and replace it only for HTTPS,
	// which needs its own TLS settings.
	httpClient := http.DefaultClient
	if config.ModelServerMetricsScheme == "https" {
		tlsConfig := &tls.Config{
			InsecureSkipVerify: config.ModelServerMetricsHTTPSInsecure,
		}
		httpClient = &http.Client{
			Transport: &http.Transport{TLSClientConfig: tlsConfig},
		}
	}

	client := &PodMetricsClientImpl{
		MetricMapping:            metricMapping,
		ModelServerMetricsPath:   config.ModelServerMetricsPath,
		ModelServerMetricsScheme: config.ModelServerMetricsScheme,
		Client:                   httpClient,
	}
	return client, nil
}

// verifyMetricMapping logs one informational message through logger for each
// inspected entry of mapping that is nil. It checks four entries, in this
// order: TotalQueuedRequests, KVCacheUtilization, LoraRequestInfo, and
// CacheConfigInfo. It never fails and does not modify mapping.
//
// NOTE(review): there is no check here for the running-requests metric that
// NewPodMetricsClientImpl also passes to NewMetricMapping — confirm whether
// that omission is intentional.
func verifyMetricMapping(logger logr.Logger, mapping MetricMapping) {
	checks := []struct {
		unset bool
		msg   string
	}{
		{mapping.TotalQueuedRequests == nil, "Not scraping metric: TotalQueuedRequests"},
		{mapping.KVCacheUtilization == nil, "Not scraping metric: KVCacheUtilization"},
		{mapping.LoraRequestInfo == nil, "Not scraping metric: LoraRequestInfo"},
		{mapping.CacheConfigInfo == nil, "Not scraping metric: CacheConfigInfo"},
	}
	for _, check := range checks {
		if check.unset {
			logger.Info(check.msg)
		}
	}
}

// FetchMetrics fetches metrics from a given pod, clones the existing metrics object and returns an updated one.
func (p *PodMetricsClientImpl) FetchMetrics(ctx context.Context, metadata *fwkdl.EndpointMetadata, existing *MetricsState) (*MetricsState, error) {
url := p.getMetricEndpoint(metadata)
Expand Down
Loading