Commit 7c05120

Enable streaming usage metrics for OpenAI-compatible providers
Inject stream_options={"include_usage": True} when streaming and OpenTelemetry telemetry is active. Telemetry always overrides any caller preference to ensure complete and consistent observability metrics.

Changes:
- Add conditional stream_options injection to OpenAIMixin (benefits OpenAI, Bedrock, Runpod, Together, Fireworks providers)
- Add conditional stream_options injection to LiteLLMOpenAIMixin (benefits litellm-based providers that call parent methods)
- Add telemetry-gated stream_options injection to WatsonX via helper method (WatsonX bypasses LiteLLMOpenAIMixin by calling litellm.acompletion directly, so it uses _inject_stream_options_for_telemetry helper to avoid code duplication)
- Check telemetry status using trace.get_current_span().is_recording()
- Override include_usage=False when telemetry active to prevent metric gaps
- Unit tests for this functionality
- Remove legacy ungated stream_options from Bedrock and Runpod providers (pre-#4127 code that bypassed telemetry gating)

Fixes #3981
1 parent ff375f1 commit 7c05120

7 files changed

Lines changed: 435 additions & 44 deletions
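The same telemetry gate is repeated across the providers below. As a rough standalone sketch of the pattern the commit message describes (the function name is illustrative and not code from the repository; it assumes the opentelemetry-api package is installed):

from opentelemetry import trace


def stream_options_for_request(stream: bool, stream_options: dict | None) -> dict | None:
    # Non-streaming requests are never modified.
    if not stream:
        return stream_options
    span = trace.get_current_span()
    if span and span.is_recording():
        # Active telemetry takes precedence, even over an explicit include_usage=False.
        return {**(stream_options or {}), "include_usage": True}
    return stream_options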

src/llama_stack/providers/remote/inference/bedrock/bedrock.py

Lines changed: 1 addition & 8 deletions
@@ -81,14 +81,7 @@ async def openai_chat_completion(
         self,
         params: OpenAIChatCompletionRequestWithExtraBody,
     ) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]:
-        """Override to enable streaming usage metrics and handle authentication errors."""
-        # Enable streaming usage metrics when telemetry is active
-        if params.stream:
-            if params.stream_options is None:
-                params.stream_options = {"include_usage": True}
-            elif "include_usage" not in params.stream_options:
-                params.stream_options = {**params.stream_options, "include_usage": True}
-
+        """Override to handle authentication errors and null responses."""
         try:
             logger.debug(f"Calling Bedrock OpenAI API with model={params.model}, stream={params.stream}")
             result = await super().openai_chat_completion(params=params)

src/llama_stack/providers/remote/inference/runpod/runpod.py

Lines changed: 0 additions & 19 deletions
@@ -4,14 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from collections.abc import AsyncIterator
-
 from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
-from llama_stack_api import (
-    OpenAIChatCompletion,
-    OpenAIChatCompletionChunk,
-    OpenAIChatCompletionRequestWithExtraBody,
-)
 
 from .config import RunpodImplConfig
 
@@ -29,15 +22,3 @@ class RunpodInferenceAdapter(OpenAIMixin):
     def get_base_url(self) -> str:
         """Get base URL for OpenAI client."""
         return str(self.config.base_url)
-
-    async def openai_chat_completion(
-        self,
-        params: OpenAIChatCompletionRequestWithExtraBody,
-    ) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]:
-        """Override to add RunPod-specific stream_options requirement."""
-        params = params.model_copy()
-
-        if params.stream and not params.stream_options:
-            params.stream_options = {"include_usage": True}
-
-        return await super().openai_chat_completion(params)

src/llama_stack/providers/remote/inference/watsonx/watsonx.py

Lines changed: 46 additions & 9 deletions
@@ -47,6 +47,40 @@ def __init__(self, config: WatsonXConfig):
             openai_compat_api_base=self.get_base_url(),
         )
 
+    def _inject_stream_options_for_telemetry(
+        self,
+        stream_options: dict | None,
+        is_streaming: bool,
+    ) -> dict | None:
+        """
+        Inject stream_options when streaming and telemetry is active.
+
+        Active telemetry takes precedence over caller preference to ensure
+        complete and consistent observability metrics.
+
+        Args:
+            stream_options: Original stream_options from params
+            is_streaming: Whether this is a streaming request
+
+        Returns:
+            Modified stream_options with include_usage=True if telemetry active,
+            otherwise returns original stream_options unchanged
+        """
+        if not is_streaming:
+            return stream_options
+
+        from opentelemetry import trace
+
+        span = trace.get_current_span()
+        if not span or not span.is_recording():
+            return stream_options
+
+        # Telemetry is active - inject include_usage
+        if stream_options is None:
+            return {"include_usage": True}
+        else:
+            return {**stream_options, "include_usage": True}
+
     async def openai_chat_completion(
         self,
         params: OpenAIChatCompletionRequestWithExtraBody,
@@ -55,14 +89,11 @@ async def openai_chat_completion(
         Override parent method to add timeout and inject usage object when missing.
         This works around a LiteLLM defect where usage block is sometimes dropped.
         """
-
-        # Add usage tracking for streaming when telemetry is active
-        stream_options = params.stream_options
-        if params.stream:
-            if stream_options is None:
-                stream_options = {"include_usage": True}
-            elif "include_usage" not in stream_options:
-                stream_options = {**stream_options, "include_usage": True}
+        # Inject stream_options when streaming and telemetry is active
+        stream_options = self._inject_stream_options_for_telemetry(
+            params.stream_options,
+            params.stream,
+        )
 
         model_obj = await self.model_store.get_model(params.model)
 
@@ -183,6 +214,12 @@ async def openai_completion(
         """
        from llama_stack.providers.utils.inference.openai_compat import prepare_openai_completion_params
 
+        # Inject stream_options when streaming and telemetry is active
+        stream_options = self._inject_stream_options_for_telemetry(
+            params.stream_options,
+            params.stream,
+        )
+
         model_obj = await self.model_store.get_model(params.model)
 
         request_params = await prepare_openai_completion_params(
@@ -199,7 +236,7 @@ async def openai_completion(
             seed=params.seed,
             stop=params.stop,
             stream=params.stream,
-            stream_options=params.stream_options,
+            stream_options=stream_options,
             temperature=params.temperature,
             top_p=params.top_p,
             user=params.user,
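Illustrative calls against the new WatsonX helper, assuming adapter is a constructed WatsonX adapter instance and an OpenTelemetry span is currently recording; the expected results in the comments follow from the helper's logic above:

adapter._inject_stream_options_for_telemetry(None, is_streaming=False)
# -> None: non-streaming requests are returned unchanged

adapter._inject_stream_options_for_telemetry(None, is_streaming=True)
# -> {"include_usage": True}

adapter._inject_stream_options_for_telemetry({"include_usage": False}, is_streaming=True)
# -> {"include_usage": True}: telemetry overrides the caller's opt-out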

src/llama_stack/providers/utils/inference/litellm_openai_mixin.py

Lines changed: 26 additions & 7 deletions
@@ -179,6 +179,20 @@ async def openai_completion(
         self,
         params: OpenAICompletionRequestWithExtraBody,
     ) -> OpenAICompletion:
+        # Inject stream_options when streaming and telemetry is active
+        stream_options = params.stream_options
+        if params.stream:
+            from opentelemetry import trace
+
+            span = trace.get_current_span()
+            if span and span.is_recording():
+                if stream_options is None:
+                    stream_options = {"include_usage": True}
+                else:
+                    # Active telemetry takes precedence over caller preference.
+                    # This ensures complete and consistent observability metrics.
+                    stream_options = {**stream_options, "include_usage": True}
+
         if not self.model_store:
             raise ValueError("Model store is not initialized")
 
@@ -201,7 +215,7 @@ async def openai_completion(
             seed=params.seed,
             stop=params.stop,
             stream=params.stream,
-            stream_options=params.stream_options,
+            stream_options=stream_options,
             temperature=params.temperature,
             top_p=params.top_p,
             user=params.user,
@@ -216,14 +230,19 @@ async def openai_chat_completion(
         self,
         params: OpenAIChatCompletionRequestWithExtraBody,
     ) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]:
-        # Add usage tracking for streaming when telemetry is active
-
+        # Inject stream_options when streaming and telemetry is active
         stream_options = params.stream_options
         if params.stream:
-            if stream_options is None:
-                stream_options = {"include_usage": True}
-            elif "include_usage" not in stream_options:
-                stream_options = {**stream_options, "include_usage": True}
+            from opentelemetry import trace
+
+            span = trace.get_current_span()
+            if span and span.is_recording():
+                if stream_options is None:
+                    stream_options = {"include_usage": True}
+                else:
+                    # Active telemetry takes precedence over caller preference.
+                    # This ensures complete and consistent observability metrics.
+                    stream_options = {**stream_options, "include_usage": True}
 
         if not self.model_store:
             raise ValueError("Model store is not initialized")

src/llama_stack/providers/utils/inference/openai_mixin.py

Lines changed: 28 additions & 0 deletions
@@ -271,6 +271,20 @@ async def openai_completion(
         """
         Direct OpenAI completion API call.
         """
+        # Inject stream_options when streaming and telemetry is active
+        if params.stream:
+            from opentelemetry import trace
+
+            span = trace.get_current_span()
+            if span and span.is_recording():
+                params = params.model_copy()
+                if params.stream_options is None:
+                    params.stream_options = {"include_usage": True}
+                else:
+                    # Active telemetry takes precedence over caller preference.
+                    # This ensures complete and consistent observability metrics.
+                    params.stream_options = {**params.stream_options, "include_usage": True}
+
         # TODO: fix openai_completion to return type compatible with OpenAI's API response
         provider_model_id = await self._get_provider_model_id(params.model)
         self._validate_model_allowed(provider_model_id)
@@ -308,6 +322,20 @@ async def openai_chat_completion(
         """
         Direct OpenAI chat completion API call.
         """
+        # Inject stream_options when streaming and telemetry is active
+        if params.stream:
+            from opentelemetry import trace
+
+            span = trace.get_current_span()
+            if span and span.is_recording():
+                params = params.model_copy()
+                if params.stream_options is None:
+                    params.stream_options = {"include_usage": True}
+                else:
+                    # Active telemetry takes precedence over caller preference.
+                    # This ensures complete and consistent observability metrics.
+                    params.stream_options = {**params.stream_options, "include_usage": True}
+
         provider_model_id = await self._get_provider_model_id(params.model)
         self._validate_model_allowed(provider_model_id)
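The commit message mentions unit tests for this behavior, but the test file itself is not part of the diffs shown here. A minimal sketch of how the gate can be exercised in isolation — the inject_stream_options function below is a local stand-in that mirrors the injected logic, not the repository's test code, and it assumes pytest and the opentelemetry-api package are installed:

from unittest.mock import MagicMock, patch


def inject_stream_options(stream_options: dict | None, stream: bool) -> dict | None:
    # Local stand-in for the telemetry-gated injection added in this commit.
    if not stream:
        return stream_options
    from opentelemetry import trace

    span = trace.get_current_span()
    if not span or not span.is_recording():
        return stream_options
    return {**(stream_options or {}), "include_usage": True}


def test_injects_usage_when_span_is_recording():
    span = MagicMock()
    span.is_recording.return_value = True
    with patch("opentelemetry.trace.get_current_span", return_value=span):
        assert inject_stream_options(None, stream=True) == {"include_usage": True}
        # Telemetry overrides an explicit opt-out from the caller.
        assert inject_stream_options({"include_usage": False}, stream=True) == {"include_usage": True}


def test_leaves_options_alone_when_no_span_is_recording():
    span = MagicMock()
    span.is_recording.return_value = False
    with patch("opentelemetry.trace.get_current_span", return_value=span):
        assert inject_stream_options(None, stream=True) is None
        assert inject_stream_options({"include_usage": False}, stream=True) == {"include_usage": False}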
