diff --git a/api/v1/inferencepool_types.go b/api/v1/inferencepool_types.go
index 92e7aff14b..a4d2d32189 100644
--- a/api/v1/inferencepool_types.go
+++ b/api/v1/inferencepool_types.go
@@ -80,6 +80,19 @@ type InferencePoolSpec struct {
// +required
TargetPorts []Port `json:"targetPorts,omitempty"`
+ // AppProtocol describes the application protocol for all the target ports.
+ //
+ // If unspecified, the protocol defaults to HTTP/1.1.
+ //
+ // Supported values include:
+ // * "http": HTTP/1.1. This is the default.
+ // * "kubernetes.io/h2c": HTTP/2 over cleartext.
+ //
+ // +kubebuilder:validation:Enum=http;"kubernetes.io/h2c"
+ // +kubebuilder:default="http"
+ // +optional
+ AppProtocol AppProtocol `json:"appProtocol,omitempty"`
+
// EndpointPickerRef is a reference to the Endpoint Picker extension and its
// associated configuration.
//
@@ -96,6 +109,20 @@ type Port struct {
Number PortNumber `json:"number,omitempty"`
}
+// AppProtocol describes the application protocol for a port.
+type AppProtocol string
+
+const (
+ // AppProtocolHTTP represents the HTTP/1.1 protocol.
+ // This is the default protocol if AppProtocol is unspecified.
+ AppProtocolHTTP AppProtocol = "http"
+
+ // AppProtocolH2C represents HTTP/2 over cleartext (h2c).
+ // This protocol is typically used for gRPC workloads where TLS is terminated
+ // at the Gateway or not used within the cluster.
+ AppProtocolH2C AppProtocol = "kubernetes.io/h2c"
+)
+
// EndpointPickerRef specifies a reference to an Endpoint Picker extension and its
// associated configuration.
// +kubebuilder:validation:XValidation:rule="self.kind != 'Service' || has(self.port)",message="port is required when kind is 'Service' or unspecified (defaults to 'Service')"
diff --git a/client-go/applyconfiguration/api/v1/inferencepoolspec.go b/client-go/applyconfiguration/api/v1/inferencepoolspec.go
index ca44987492..b9b258e3f6 100644
--- a/client-go/applyconfiguration/api/v1/inferencepoolspec.go
+++ b/client-go/applyconfiguration/api/v1/inferencepoolspec.go
@@ -18,11 +18,16 @@ limitations under the License.
package v1
+import (
+ apiv1 "sigs.k8s.io/gateway-api-inference-extension/api/v1"
+)
+
// InferencePoolSpecApplyConfiguration represents a declarative configuration of the InferencePoolSpec type for use
// with apply.
type InferencePoolSpecApplyConfiguration struct {
Selector *LabelSelectorApplyConfiguration `json:"selector,omitempty"`
TargetPorts []PortApplyConfiguration `json:"targetPorts,omitempty"`
+ AppProtocol *apiv1.AppProtocol `json:"appProtocol,omitempty"`
EndpointPickerRef *EndpointPickerRefApplyConfiguration `json:"endpointPickerRef,omitempty"`
}
@@ -53,6 +58,14 @@ func (b *InferencePoolSpecApplyConfiguration) WithTargetPorts(values ...*PortApp
return b
}
+// WithAppProtocol sets the AppProtocol field in the declarative configuration to the given value
+// and returns the receiver, so that objects can be built by chaining "With" function invocations.
+// If called multiple times, the AppProtocol field is set to the value of the last call.
+func (b *InferencePoolSpecApplyConfiguration) WithAppProtocol(value apiv1.AppProtocol) *InferencePoolSpecApplyConfiguration {
+ b.AppProtocol = &value
+ return b
+}
+
// WithEndpointPickerRef sets the EndpointPickerRef field in the declarative configuration to the given value
// and returns the receiver, so that objects can be built by chaining "With" function invocations.
// If called multiple times, the EndpointPickerRef field is set to the value of the last call.
diff --git a/config/crd/bases/inference.networking.k8s.io_inferencepools.yaml b/config/crd/bases/inference.networking.k8s.io_inferencepools.yaml
index a3f7696333..67cbdb1354 100644
--- a/config/crd/bases/inference.networking.k8s.io_inferencepools.yaml
+++ b/config/crd/bases/inference.networking.k8s.io_inferencepools.yaml
@@ -42,6 +42,20 @@ spec:
spec:
description: Spec defines the desired state of the InferencePool.
properties:
+ appProtocol:
+ default: http
+ description: |-
+ AppProtocol describes the application protocol for all the target ports.
+
+ If unspecified, the protocol defaults to HTTP/1.1.
+
+ Supported values include:
+ * "http": HTTP/1.1. This is the default.
+ * "kubernetes.io/h2c": HTTP/2 over cleartext.
+ enum:
+ - http
+ - kubernetes.io/h2c
+ type: string
endpointPickerRef:
description: |-
EndpointPickerRef is a reference to the Endpoint Picker extension and its
diff --git a/docs/proposals/2162-grpc-support/README.md b/docs/proposals/2162-grpc-support/README.md
new file mode 100644
index 0000000000..9406be052f
--- /dev/null
+++ b/docs/proposals/2162-grpc-support/README.md
@@ -0,0 +1,110 @@
+# gRPC support
+
+Author(s): @zetxqx, @ahg-g
+
+For the full, detailed proposal, please see the [original proposal](https://docs.google.com/document/d/1H-WazsrSQOVi8bGgfBLuQ7RTypwa__EncVNu-yRBw1U/edit?tab=t.4i912lhthtwx#heading=h.cvvvoep0ljs9).
+
+## Motivation
+Model servers (such as vLLM [gRPC](https://github.com/vllm-project/vllm/blob/main/vllm/grpc/vllm_engine.proto) and SGLang [gRPC](https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/grpc/sglang_scheduler.proto)) now offer gRPC endpoints. As the gateway layer, the Inference Gateway extension needs to support gRPC, not only for compatibility with those gRPC model servers but also for the following benefits:
+* gRPC protocol efficiency: gRPC uses binary framing, which is more compact than text-based JSON, reducing payload size and parsing overhead.
+* Flexibility: gRPC gives us the flexibility to support tokenization at the GAIE level, because these gRPC endpoints support tokens-in, tokens-out.
+
+## Goal
+* InferencePool API changes to support gRPC
+* EPP changes to support gRPC including:
+ * gRPC-in, gRPC-out
+ * HTTP-in, gRPC-out
+
+## Proposed API Changes
+The current InferencePool implementation defaults to HTTP/1.1 communication. To support gRPC, which runs over HTTP/2, a field must be introduced so that the gateway controller can identify the appropriate application protocol to use when communicating with the model servers.
+
+This proposal introduces an `AppProtocol` field (similar to the Kubernetes [ServicePort](https://github.com/kubernetes/api/blob/82d2200b6363cca3aba07c043b95d88704c2ddb3/core/v1/types.go#L6204C1-L6220C92) `appProtocol`) to the `InferencePoolSpec` struct of the `InferencePool`. This field applies to all `TargetPorts`.
+
+```go
+// InferencePoolSpec defines the desired state of the InferencePool.
+type InferencePoolSpec struct {
+ // ... other fields
+
+ // ... omitted
+ TargetPorts []Port `json:"targetPorts,omitempty"`
+
+ // AppProtocol describes the application protocol for all the target ports.
+ //
+ // If unspecified, the protocol defaults to HTTP/1.1.
+ //
+ // Supported values include:
+ // * "http": HTTP/1.1. This is the default.
+ // * "kubernetes.io/h2c": HTTP/2 over cleartext.
+ //
+ // +kubebuilder:validation:Enum=http;"kubernetes.io/h2c"
+ // +optional
+ AppProtocol AppProtocol `json:"appProtocol,omitempty"`
+
+ // ... omitted
+}
+```
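+
+As a usage illustration, the minimal Go sketch below builds a spec for a pool whose backends expose gRPC. The port number is a placeholder, and required fields such as the selector and `EndpointPickerRef` are elided.
+
+```go
+package example
+
+import (
+    v1 "sigs.k8s.io/gateway-api-inference-extension/api/v1"
+)
+
+// grpcPoolSpec is a minimal InferencePool spec whose backends speak gRPC over
+// cleartext HTTP/2 (h2c). The target port number is illustrative only.
+var grpcPoolSpec = v1.InferencePoolSpec{
+    TargetPorts: []v1.Port{{Number: 8033}},
+    AppProtocol: v1.AppProtocolH2C,
+}
+```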
+
+## EndpointPicker (EPP) Enhancements
+The current implementation of `pkg/epp/handlers/server.go` (the ext_proc streaming server) is limited to handling HTTP/JSON payloads. To accommodate gRPC model servers, the EPP needs to be updated to support two primary traffic patterns:
+
+1. **gRPC-in, gRPC-out:** Both the client and the model server use gRPC. In this scenario, the EPP primarily needs to decode gRPC protobuf messages.
+2. **HTTP-in, gRPC-out:** The client sends HTTP/JSON requests (OpenAI API), while the model server expects gRPC. This requires the EPP to perform the following transcoding tasks (a sketch of the streaming case follows this list):
+    1. Transcode incoming HTTP/JSON requests to gRPC.
+    2. Transcode gRPC responses back to HTTP/JSON.
+    3. For streaming workloads, transcode gRPC response streams into Server-Sent Events (SSE) format.
+
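+To make the streaming task (2.3) concrete, the sketch below frames one decoded gRPC response chunk as an OpenAI-style SSE event. The `GenerateStreamChunk` type and the JSON field layout are illustrative stand-ins, not the model server's actual proto or a finalized schema.
+
+```go
+package codec
+
+import (
+    "encoding/json"
+    "fmt"
+)
+
+// GenerateStreamChunk is a hypothetical stand-in for one decoded message of a
+// model server's streaming Generate gRPC response.
+type GenerateStreamChunk struct {
+    Text         string
+    FinishReason string
+}
+
+// toSSE renders a single decoded chunk as a Server-Sent Events frame carrying
+// an OpenAI-style chat.completion.chunk payload ("data: <json>\n\n").
+func toSSE(chunk GenerateStreamChunk) ([]byte, error) {
+    choice := map[string]any{
+        "index": 0,
+        "delta": map[string]any{"content": chunk.Text},
+    }
+    if chunk.FinishReason != "" {
+        choice["finish_reason"] = chunk.FinishReason
+    }
+    payload := map[string]any{
+        "object":  "chat.completion.chunk",
+        "choices": []map[string]any{choice},
+    }
+    body, err := json.Marshal(payload)
+    if err != nil {
+        return nil, err
+    }
+    return []byte(fmt.Sprintf("data: %s\n\n", body)), nil
+}
+```
+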
+The sequence diagram below outlines the proposed changes to the ext_proc streaming server (simplified, with some components omitted to focus on the key changes), highlighting the new protocol detection and transcoding logic in green:
+
+![EPP ext_proc sequence diagram with protocol detection and transcoding](./images/epp_envoy_grpc.svg)
+
+Specifically, the key components within the EPP codebase that require modification are (see the sketch after this list):
+* `pkg/epp/handlers/server.go`: Update the main `Process` loop to detect the content type and delegate parsing.
+* `pkg/epp/codec` (new package): Implement parsers for JSON (existing logic) and gRPC (new logic).
+
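+One possible shape for the new `pkg/epp/codec` package and the content-type detection step is sketched below. The `Codec` interface, the function names, and the model-name-only parse result are assumptions for illustration, not the final API.
+
+```go
+package codec
+
+import (
+    "fmt"
+    "strings"
+)
+
+// Codec parses a request body for one wire format.
+type Codec interface {
+    // ParseModelName extracts the requested model from the raw request body,
+    // which is the minimum the EPP needs for endpoint picking.
+    ParseModelName(body []byte) (string, error)
+}
+
+// ForContentType selects a codec based on the request's content-type header,
+// mirroring the detection step proposed for the Process loop.
+func ForContentType(contentType string) (Codec, error) {
+    switch {
+    case strings.HasPrefix(contentType, "application/grpc"):
+        return grpcCodec{}, nil
+    case contentType == "" || strings.HasPrefix(contentType, "application/json"):
+        return jsonCodec{}, nil
+    default:
+        return nil, fmt.Errorf("unsupported content type %q", contentType)
+    }
+}
+
+type jsonCodec struct{}
+
+func (jsonCodec) ParseModelName(body []byte) (string, error) {
+    // The existing HTTP/JSON parsing logic would move here.
+    return "", nil
+}
+
+type grpcCodec struct{}
+
+func (grpcCodec) ParseModelName(body []byte) (string, error) {
+    // New logic: strip the 5-byte gRPC length prefix and decode the protobuf request.
+    return "", nil
+}
+```
+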
+**More implementation details:**
+1. The EPP should determine when transcoding is required for HTTP-in, gRPC-out scenarios. This can be achieved through one of the following methods (a sketch follows this list):
+    1. Implementing a configuration flag or environment variable within the EPP to explicitly signal the need for transcoding.
+    2. **(Preferred)** The EPP can inspect the observed InferencePool specification. If `InferencePoolSpec.AppProtocol` is set to `kubernetes.io/h2c`, transcoding should be enabled.
+2. The EPP needs to know how to perform the protocol conversion. This will mainly be driven by the header differences between HTTP/JSON and gRPC requests.
+3. A designated folder will be required to maintain copies of the vLLM and SGLang protocol buffers. To ensure production stability, a compatibility matrix will be needed for users, mapping supported GAIE versions to model server proto versions.
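+
+To illustrate the preferred option (1.2), here is a minimal sketch of the transcoding decision. Only the `AppProtocol` field and the `AppProtocolH2C` constant come from the API change above; the function name, package placement, and content-type check are assumptions.
+
+```go
+package handlers
+
+import (
+    "strings"
+
+    v1 "sigs.k8s.io/gateway-api-inference-extension/api/v1"
+)
+
+// requiresTranscoding reports whether the EPP should transcode an incoming
+// HTTP/JSON request into gRPC for the backend: the backend protocol comes from
+// the observed InferencePool, the client protocol from the content-type header.
+func requiresTranscoding(pool *v1.InferencePool, requestContentType string) bool {
+    // The backend pool is declared as gRPC over cleartext HTTP/2 (h2c)...
+    backendIsGRPC := pool.Spec.AppProtocol == v1.AppProtocolH2C
+    // ...but the client spoke plain HTTP/JSON (e.g. the OpenAI-compatible API).
+    clientIsHTTP := !strings.HasPrefix(requestContentType, "application/grpc")
+    return backendIsGRPC && clientIsHTTP
+}
+```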
+
+## Implementation Plan
+There are several dimensions along which the work can be parallelized:
+* **Support for different protocol patterns:**
+ * gRPC-in, gRPC-out
+ * HTTP-in, gRPC-out
+* **Support for Key APIs** (listing the most critical pathways):
+ * Generate (Non-Streaming) - `/v1/chat/completions`
+ * Generate (Streaming)
+ * GetModelInfo - `/v1/models`
+
+The initial plan prioritizes the following:
+1. The initial focus will be on vLLM gRPC. Meanwhile, efforts will continue to establish a separate package for common proto definitions across model servers.
+2. Implement gRPC-in, gRPC-out first, as it presents the simplest path forward. However, rapid support for HTTP-in, gRPC-out is crucial, given that the majority of users rely on the OpenAI-compatible endpoint.
+3. The Generate API must be prioritized among all supported APIs.
+
+### Gateway Implementation Requirements
+* Modify the InferencePool API to incorporate an `appProtocol` field.
+* Introduce conformance tests to validate gRPC support.
+* Await the necessary gateway layer support for specifying the `appProtocol` within InferencePool definitions.
+
+### Phase 1: gRPC-to-gRPC Protocol Support
+* Implement the Generate API in non-streaming mode.
+* Implement the streaming mode for the Generate API.
+* Provide practical examples, including EPP and vLLM gRPC manifests, and update the Helm chart as necessary.
+* Conduct performance benchmarking to confirm no degradation compared to the existing HTTP support.
+
+### Phase 2: HTTP-to-gRPC Conversion Support
+* Support non-streaming `/v1/chat/completions`.
+* Support streaming `/v1/chat/completions`.
+* Update examples and the Helm chart accordingly.
+* Perform benchmarking to ensure minimal performance impact relative to existing HTTP support.
+
+### Phase 3: Additional API Support
+* Implement support for GetModelInfo and the `/v1/models` endpoint.
+* Implement support for GetServerInfo (if necessary).
+* Address metrics scraping specifically for gRPC endpoints (if necessary).
+
+### Future Plan
+* Introduce support for SGLang gRPC.
+* Integrate disaggregated tokenization capabilities.
diff --git a/docs/proposals/2162-grpc-support/images/epp_envoy_grpc.svg b/docs/proposals/2162-grpc-support/images/epp_envoy_grpc.svg
new file mode 100644
index 0000000000..25b7685ad2
--- /dev/null
+++ b/docs/proposals/2162-grpc-support/images/epp_envoy_grpc.svg
@@ -0,0 +1,102 @@
+
\ No newline at end of file
diff --git a/test/cel/inferencepool_test.go b/test/cel/inferencepool_test.go
index deccb6e03f..4d3a473cd0 100644
--- a/test/cel/inferencepool_test.go
+++ b/test/cel/inferencepool_test.go
@@ -47,7 +47,7 @@ func TestValidateInferencePool(t *testing.T) {
EndpointPickerRef: v1.EndpointPickerRef{
Name: "epp",
Kind: "Service",
- Port: ptrTo(v1.Port{Number: 9002}),
+ Port: &v1.Port{Number: 9002},
},
},
}
@@ -63,6 +63,13 @@ func TestValidateInferencePool(t *testing.T) {
},
wantErrors: nil,
},
+ {
+ desc: "passes validation with an appProtocol configured",
+ mutate: func(ip *v1.InferencePool) {
+ ip.Spec.AppProtocol = v1.AppProtocolH2C
+ },
+ wantErrors: nil,
+ },
{
desc: "fails validation when kind is unset (defaults to Service) and port is missing",
mutate: func(ip *v1.InferencePool) {
diff --git a/test/cel/main_test.go b/test/cel/main_test.go
index 2758a56320..7ead528182 100644
--- a/test/cel/main_test.go
+++ b/test/cel/main_test.go
@@ -97,10 +97,6 @@ func TestMain(m *testing.M) {
os.Exit(rc)
}
-func ptrTo[T any](a T) *T {
- return &a
-}
-
func celErrorStringMatches(got, want string) bool {
gotL := strings.ToLower(got)
wantL := strings.ToLower(want)