From 6623be7ac6ea444c3498c4585e7199f4f2d3f3c9 Mon Sep 17 00:00:00 2001
From: Travis Johnson
Date: Thu, 3 Oct 2024 11:26:35 -0600
Subject: [PATCH] fix: do not use existence of tool_parser as proxy for
 tool_choice_auto

Signed-off-by: Travis Johnson
---
 vllm/entrypoints/openai/serving_chat.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index 41f131f56b51..1ef3471240b8 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -505,15 +505,18 @@ async def chat_completion_stream_generator(
                         # any tokens that were generated but previously
                         # matched by partial json parsing
                         # only happens if we are NOT using guided decoding
-                        if tool_parser:
+                        if tool_choice_auto:
+                            assert tool_parser is not None
                             index = len(
                                 tool_parser.prev_tool_call_arr) - 1 if len(
                                     tool_parser.prev_tool_call_arr) > 0 else 0
                         else:
                             index = 0
 
-                    if self._should_check_for_unstreamed_tool_arg_tokens(
-                            delta_message, output) and tool_parser:
+                    if tool_choice_auto and \
+                        self._should_check_for_unstreamed_tool_arg_tokens(
+                            delta_message, output):
+                        assert tool_parser is not None
                         # get the expected call based on partial JSON
                         # parsing which "autocompletes" the JSON
                         expected_call = json.dumps(