Skip to content

Commit 8d06bf3

Browse files
authored
feat(huggingface): support reasoning tokens (#558)
## Description <!-- What does this PR do? --> ## PR Type <!-- Delete the types that don't apply --> 🆕 New Feature ## Relevant issues <!-- e.g. "Fixes #123" --> ## Checklist - [x] I have added unit tests that prove my fix/feature works - [x] New and existing tests pass locally - [x] Documentation was updated where necessary - [x] I have read and followed the [contribution guidelines](https://github.com/mozilla-ai/any-llm/blob/main/CONTRIBUTING.md)
1 parent d7a9e26 commit 8d06bf3

File tree

6 files changed

+271
-14
lines changed

6 files changed

+271
-14
lines changed

src/any_llm/constants.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,13 @@
55

66
INSIDE_NOTEBOOK = hasattr(builtins, "__IPYTHON__")
77

8+
# Names under which providers surface reasoning / chain-of-thought text,
# either as extra message fields (e.g. msg["reasoning_content"]) or as
# <tag>...</tag> markers embedded in the message content.
REASONING_FIELD_NAMES = [
    "reasoning_content",
    "thinking",
    "think",
    "chain_of_thought",
]
14+
815

916
class LLMProvider(StrEnum):
1017
"""String enum for supported providers."""

src/any_llm/providers/huggingface/huggingface.py

Lines changed: 96 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
from typing import TYPE_CHECKING, Any
44

55
from any_llm.any_llm import AnyLLM
6+
from any_llm.constants import REASONING_FIELD_NAMES
67
from any_llm.types.completion import (
78
ChatCompletion,
89
ChatCompletionChunk,
@@ -11,6 +12,7 @@
1112
CompletionParams,
1213
CompletionUsage,
1314
CreateEmbeddingResponse,
15+
Reasoning,
1416
)
1517

1618
MISSING_PACKAGES_ERROR = None
@@ -21,6 +23,7 @@
2123
_convert_models_list,
2224
_convert_params,
2325
_create_openai_chunk_from_huggingface_chunk,
26+
_normalize_reasoning_on_message,
2427
)
2528
except ImportError as e:
2629
MISSING_PACKAGES_ERROR = e
@@ -47,7 +50,7 @@ class HuggingfaceProvider(AnyLLM):
4750
SUPPORTS_RESPONSES = False
4851
SUPPORTS_COMPLETION_IMAGE = False
4952
SUPPORTS_COMPLETION_PDF = False
50-
SUPPORTS_COMPLETION_REASONING = False
53+
SUPPORTS_COMPLETION_REASONING = True
5154
SUPPORTS_EMBEDDING = False
5255
SUPPORTS_LIST_MODELS = True
5356

@@ -101,14 +104,96 @@ def _init_client(self, api_key: str | None = None, api_base: str | None = None,
101104
**kwargs,
102105
)
103106

107+
@staticmethod
108+
def _find_reasoning_tag(text: str, opening: bool = True) -> tuple[int, str] | None:
109+
"""Find the first reasoning tag (opening or closing) in text.
110+
111+
Returns (position, tag_name) or None if no tag found.
112+
"""
113+
earliest_pos = len(text)
114+
earliest_tag = None
115+
116+
for tag_name in REASONING_FIELD_NAMES:
117+
tag = f"<{tag_name}>" if opening else f"</{tag_name}>"
118+
pos = text.find(tag)
119+
if pos != -1 and pos < earliest_pos:
120+
earliest_pos = pos
121+
earliest_tag = tag_name
122+
123+
return (earliest_pos, earliest_tag) if earliest_tag else None
124+
125+
@staticmethod
126+
def _is_partial_reasoning_tag(text: str, opening: bool = True) -> bool:
127+
"""Check if text could be the start of any reasoning tag."""
128+
for tag_name in REASONING_FIELD_NAMES:
129+
tag = f"<{tag_name}>" if opening else f"</{tag_name}>"
130+
for i in range(1, len(tag) + 1):
131+
if text.startswith(tag[:i]):
132+
return True
133+
return False
134+
104135
async def _stream_completion_async(
105136
self,
106137
**kwargs: Any,
107138
) -> AsyncIterator[ChatCompletionChunk]:
108139
response: AsyncIterator[HuggingFaceChatCompletionStreamOutput] = await self.client.chat_completion(**kwargs)
109140

141+
buffer = ""
142+
current_tag = None
143+
reasoning_buffer = ""
144+
110145
async for chunk in response:
111-
yield self._convert_completion_chunk_response(chunk)
146+
original_chunk = self._convert_completion_chunk_response(chunk)
147+
148+
if not (len(original_chunk.choices) > 0 and original_chunk.choices[0].delta.content):
149+
yield original_chunk
150+
continue
151+
152+
buffer += original_chunk.choices[0].delta.content
153+
content_parts = []
154+
reasoning_parts = []
155+
156+
while buffer:
157+
if current_tag is None:
158+
tag_info = self._find_reasoning_tag(buffer, opening=True)
159+
if tag_info:
160+
tag_start, tag_name = tag_info
161+
if tag_start > 0:
162+
content_parts.append(buffer[:tag_start])
163+
tag_full = f"<{tag_name}>"
164+
buffer = buffer[tag_start + len(tag_full) :]
165+
current_tag = tag_name
166+
elif self._is_partial_reasoning_tag(buffer, opening=True):
167+
break
168+
else:
169+
content_parts.append(buffer)
170+
buffer = ""
171+
else:
172+
tag_close = f"</{current_tag}>"
173+
tag_end = buffer.find(tag_close)
174+
if tag_end != -1:
175+
reasoning_parts.append(reasoning_buffer + buffer[:tag_end])
176+
reasoning_buffer = ""
177+
buffer = buffer[tag_end + len(tag_close) :]
178+
current_tag = None
179+
elif self._is_partial_reasoning_tag(buffer, opening=False):
180+
reasoning_buffer += buffer
181+
buffer = ""
182+
break
183+
else:
184+
reasoning_buffer += buffer
185+
buffer = ""
186+
187+
if content_parts or reasoning_parts:
188+
modified_chunk = original_chunk.model_copy(deep=True)
189+
modified_chunk.choices[0].delta.content = "".join(content_parts) if content_parts else None
190+
if reasoning_parts:
191+
modified_chunk.choices[0].delta.reasoning = Reasoning(content="".join(reasoning_parts))
192+
yield modified_chunk
193+
elif not buffer:
194+
modified_chunk = original_chunk.model_copy(deep=True)
195+
modified_chunk.choices[0].delta.content = None
196+
yield modified_chunk
112197

113198
async def _acompletion(
114199
self,
@@ -127,10 +212,19 @@ async def _acompletion(
127212
choices_out: list[Choice] = []
128213
for i, ch in enumerate(data.get("choices", [])):
129214
msg = ch.get("message", {})
215+
216+
_normalize_reasoning_on_message(msg)
217+
218+
reasoning_obj = None
219+
if msg.get("reasoning") and isinstance(msg["reasoning"], dict):
220+
if "content" in msg["reasoning"]:
221+
reasoning_obj = Reasoning(content=msg["reasoning"]["content"])
222+
130223
message = ChatCompletionMessage(
131224
role="assistant",
132225
content=msg.get("content"),
133226
tool_calls=msg.get("tool_calls"),
227+
reasoning=reasoning_obj,
134228
)
135229
choices_out.append(Choice(index=i, finish_reason=ch.get("finish_reason"), message=message))
136230

src/any_llm/providers/huggingface/utils.py

Lines changed: 62 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import re
12
import uuid
23
from collections.abc import Iterable
34
from typing import Any, Literal, cast
@@ -8,16 +9,54 @@
89
)
910
from openai.lib._parsing import type_to_response_format_param
1011

12+
from any_llm.constants import REASONING_FIELD_NAMES
1113
from any_llm.types.completion import (
1214
ChatCompletionChunk,
1315
ChoiceDelta,
1416
ChunkChoice,
1517
CompletionParams,
1618
CompletionUsage,
19+
Reasoning,
1720
)
1821
from any_llm.types.model import Model
1922

2023

24+
def _normalize_reasoning_on_message(message_dict: dict[str, Any]) -> None:
    """Mutate a message dict to extract reasoning from content tags and provider-specific fields."""
    existing = message_dict.get("reasoning")
    if isinstance(existing, dict) and "content" in existing:
        # Already in normalized form; nothing to do.
        return

    # Provider-specific top-level fields take priority; first match wins.
    reasoning_content = next(
        (message_dict[name] for name in REASONING_FIELD_NAMES if message_dict.get(name) is not None),
        None,
    )
    if reasoning_content is None and isinstance(existing, str):
        reasoning_content = existing

    # Pull <tag>...</tag> reasoning spans out of the textual content.
    content = message_dict.get("content")
    if isinstance(content, str):
        for tag_name in REASONING_FIELD_NAMES:
            pattern = re.escape(f"<{tag_name}>") + r"(.*?)" + re.escape(f"</{tag_name}>")
            found = re.findall(pattern, content, re.DOTALL)
            if not found:
                continue
            extracted = "\n".join(found)
            reasoning_content = f"{reasoning_content}\n{extracted}" if reasoning_content else extracted
            content = re.sub(pattern, "", content, flags=re.DOTALL).strip()

    message_dict["content"] = content

    if reasoning_content is not None:
        message_dict["reasoning"] = {"content": str(reasoning_content)}
58+
59+
2160
def _create_openai_chunk_from_huggingface_chunk(chunk: HuggingFaceChatCompletionStreamOutput) -> ChatCompletionChunk:
2261
"""Convert a HuggingFace streaming chunk to OpenAI ChatCompletionChunk format."""
2362

@@ -30,14 +69,31 @@ def _create_openai_chunk_from_huggingface_chunk(chunk: HuggingFaceChatCompletion
3069

3170
for i, hf_choice in enumerate(hf_choices):
3271
hf_delta = hf_choice.delta
33-
content = hf_delta.content
34-
role = hf_delta.role
3572

36-
openai_role = None
37-
if role:
38-
openai_role = cast("Literal['developer', 'system', 'user', 'assistant', 'tool']", role)
73+
delta_dict: dict[str, Any] = {}
74+
if hf_delta.content is not None:
75+
delta_dict["content"] = hf_delta.content
76+
if hf_delta.role is not None:
77+
delta_dict["role"] = hf_delta.role
78+
if hasattr(hf_delta, "reasoning"):
79+
delta_dict["reasoning"] = hf_delta.reasoning
3980

40-
delta = ChoiceDelta(content=content, role=openai_role)
81+
_normalize_reasoning_on_message(delta_dict)
82+
83+
openai_role = None
84+
if delta_dict.get("role"):
85+
openai_role = cast("Literal['developer', 'system', 'user', 'assistant', 'tool']", delta_dict["role"])
86+
87+
reasoning_obj = None
88+
if delta_dict.get("reasoning") and isinstance(delta_dict["reasoning"], dict):
89+
if "content" in delta_dict["reasoning"]:
90+
reasoning_obj = Reasoning(content=delta_dict["reasoning"]["content"])
91+
92+
delta = ChoiceDelta(
93+
content=delta_dict.get("content"),
94+
role=openai_role,
95+
reasoning=reasoning_obj,
96+
)
4197

4298
choice = ChunkChoice(
4399
index=i,

src/any_llm/providers/openai/utils.py

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
from openai.types.chat.chat_completion import ChatCompletion as OpenAIChatCompletion
66

7+
from any_llm.constants import REASONING_FIELD_NAMES
78
from any_llm.logging import logger
89
from any_llm.types.completion import ChatCompletion
910

@@ -13,11 +14,7 @@ def _normalize_reasoning_on_message(message_dict: dict[str, Any]) -> None:
1314
if isinstance(message_dict.get("reasoning"), dict) and "content" in message_dict["reasoning"]:
1415
return
1516

16-
possible_fields = [
17-
"reasoning_content",
18-
"thinking",
19-
"chain_of_thought",
20-
]
17+
possible_fields = REASONING_FIELD_NAMES
2118
value: Any | None = None
2219
for field_name in possible_fields:
2320
if field_name in message_dict and message_dict[field_name] is not None:

tests/conftest.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ def provider_reasoning_model_map() -> dict[LLMProvider, str]:
3030
LLMProvider.MOONSHOT: "kimi-thinking-preview",
3131
LLMProvider.DATABRICKS: "databricks-gpt-oss-20b", # Untested, needs to be verified once we get a Databricks account
3232
LLMProvider.BEDROCK: "us.anthropic.claude-haiku-4-5-20251001-v1:0",
33+
LLMProvider.HUGGINGFACE: "huggingface/tgi",
3334
}
3435

3536

0 commit comments

Comments
 (0)