Skip to content

Commit 1323e8c

Browse files
authored
ci: Deprecate vllm "guided_decoding" with "structured_outputs" (#8508)
1 parent 967a175 commit 1323e8c

File tree

4 files changed

+6
-14
lines changed

4 files changed

+6
-14
lines changed

python/openai/README.md

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -610,10 +610,9 @@ tool calling result: The weather in Dallas, Texas is 85 degrees fahrenheit. It i
610610

611611
#### Named Tool Calling
612612

613-
The OpenAI frontend supports named function calling, utilizing guided decoding in the vLLM and TensorRT-LLM backends. Users can specify one of the tools in `tool_choice` to force the model to select a specific tool for function calling.
613+
The OpenAI frontend supports named function calling, utilizing structured outputs in the vLLM backend and guided decoding in the TensorRT-LLM backend. Users can specify one of the tools in `tool_choice` to force the model to select a specific tool for function calling.
614614

615615
> [!NOTE]
616-
> The latest release of TensorRT-LLM (v0.18.0) does not yet support guided decoding. To enable this feature, use a build from the main branch of TensorRT-LLM.
617616
> For instructions on enabling guided decoding in the TensorRT-LLM backend, please refer to [this guide](https://github.com/triton-inference-server/tensorrtllm_backend/blob/main/docs/guided_decoding.md)
618617
619618
Example for making a named tool calling request:

python/openai/openai_frontend/engine/triton_engine.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -678,7 +678,8 @@ def _get_streaming_response_delta(
678678
# check to make sure we haven't "forgotten" to stream
679679
# any tokens that were generated but previously
680680
# matched by partial json parsing, such as '}'.
681-
# only happens if we are NOT using guided decoding
681+
# only happens if we are NOT using structured outputs
682+
# or guided decoding
682683
if (
683684
self._should_check_for_unstreamed_tool_arg_tokens(
684685
response_delta=response_delta,

python/openai/openai_frontend/engine/utils/triton.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -113,11 +113,11 @@ def _create_vllm_generate_request(
113113

114114
guided_json = _get_guided_json_from_tool(request)
115115
if guided_json is not None:
116-
from vllm.sampling_params import GuidedDecodingParams
116+
from vllm.sampling_params import StructuredOutputsParams
117117

118118
sampling_parameters_json = json.loads(sampling_parameters)
119-
sampling_parameters_json["guided_decoding"] = json.dumps(
120-
asdict(GuidedDecodingParams.from_optional(json=guided_json))
119+
sampling_parameters_json["structured_outputs"] = json.dumps(
120+
asdict(StructuredOutputsParams.from_optional(json=guided_json))
121121
)
122122
sampling_parameters = json.dumps(sampling_parameters_json)
123123

python/openai/tests/test_tool_calling.py

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -341,10 +341,6 @@ async def test_tool_call_with_reply_response(
341341
# validate if streaming and non-streaming generate the same content
342342
assert "".join(chunks) == choice.message.content
343343

344-
@pytest.mark.skipif(
345-
os.environ.get("IMAGE_KIND") == "TRTLLM",
346-
reason="latest release version of Tensorrt LLM 0.18 doesn't support guided decoding",
347-
)
348344
@pytest.mark.asyncio
349345
async def test_tool_call_with_named_tool_choice(
350346
self, client: openai.AsyncOpenAI, model: str
@@ -448,10 +444,6 @@ async def test_tool_call_with_named_tool_choice(
448444
assert choice.message.role == role_name
449445
assert choice.message.tool_calls[0].function.name == function_name
450446

451-
@pytest.mark.skipif(
452-
os.environ.get("IMAGE_KIND") == "TRTLLM",
453-
reason="latest release version of Tensorrt LLM 0.18 doesn't support guided decoding",
454-
)
455447
@pytest.mark.asyncio
456448
async def test_tool_call_with_required_tool_choice(
457449
self, client: openai.AsyncOpenAI, model: str

0 commit comments

Comments (0)