feat(retry): add comprehensive tracking of all failed attempts and exceptions (#1802)

jxnl · web-flow · commit 9584ce7d5c52 · 2025-09-04T15:56:51.000-04:00
## Description This PR enhances the retry mechanism in Instructor to track all failed completions and exceptions across the entire range of retries, not just the final failure. This provides much better debugging capabilities and insight into retry patterns. ## Changes ### Core Changes - **New `FailedAttempt` data structure**: Tracks individual retry failures with attempt number, exception, and completion response - **Enhanced `InstructorRetryException`**: Now includes a `failed_attempts` list containing all retry failures - **Updated retry functions**: Both `retry_sync` and `retry_async` now collect comprehensive failure information ### Key Features - Track attempt number for each failure - Store the actual exception that occurred - Preserve completion responses (when available) for analysis - Maintain backward compatibility with existing exception handling ### Benefits - **Better debugging**: See exactly what failed at each retry attempt - **Pattern analysis**: Identify if failures are consistent or changing across retries - **Completion inspection**: Access raw LLM responses that failed validation - **Comprehensive error reporting**: Full visibility into the retry process ## Usage Example ```python try: response = client.chat.completions.create( response_model=MyModel, messages=messages, max_retries=3 ) except InstructorRetryException as e: print(f"Failed after {e.n_attempts} attempts") # New: Access all failed attempts for attempt in e.failed_attempts: print(f"Attempt {attempt.attempt_number}: {attempt.exception}") if attempt.completion: # Analyze the raw completion that failed analyze_completion(attempt.completion) ``` ## Testing - All existing tests pass (backward compatibility maintained) - Linting and formatting checks pass - Example demonstrates the new functionality ## Backward Compatibility This change is fully backward compatible. The new `failed_attempts` field is optional and defaults to an empty list if not provided. 
This PR was written by [Cursor](https://cursor.com)  ---- > [!IMPORTANT] > Enhances retry mechanism to track all failed attempts and exceptions for improved debugging and analysis. > > - **Behavior**: > - Introduces `FailedAttempt` in `exceptions.py` to track retry attempts with attempt number, exception, and completion. > - Updates `InstructorRetryException` to include `failed_attempts` list. > - Modifies `retry_sync` and `retry_async` in `retry.py` to populate `failed_attempts` with each failed attempt. > - **Benefits**: > - Provides detailed tracking of all retry attempts for better debugging and analysis. > - Maintains backward compatibility by defaulting `failed_attempts` to an empty list if not provided. > > <sup>This description was created by </sup>[<img alt="Ellipsis" src="https://img.shields.io/badge/Ellipsis-blue?color=175173">](https://www.ellipsis.dev?ref=567-labs%2Finstructor&utm_source=github&utm_medium=referral)<sup> for 69f2017. You can [customize](https://app.ellipsis.dev/567-labs/settings/summaries) this summary. It will automatically update as commits are pushed.</sup>
diff --git a/instructor/core/exceptions.py b/instructor/core/exceptions.py
@@ -1,6 +1,6 @@
 from __future__ import annotations
 
-from typing import Any
+from typing import Any, NamedTuple
 
 
 class InstructorError(Exception):
@@ -9,6 +9,14 @@ class InstructorError(Exception):
     pass
 
 
+class FailedAttempt(NamedTuple):
+    """Represents a single failed retry attempt."""
+
+    attempt_number: int
+    exception: Exception
+    completion: Any | None = None
+
+
 class IncompleteOutputException(InstructorError):
     """Exception raised when the output from LLM is incomplete due to max tokens limit reached."""
 
@@ -34,13 +42,15 @@ def __init__(
         n_attempts: int,
         total_usage: int,
         create_kwargs: dict[str, Any] | None = None,
+        failed_attempts: list[FailedAttempt] | None = None,
         **kwargs: dict[str, Any],
     ):
         self.last_completion = last_completion
         self.messages = messages
         self.n_attempts = n_attempts
         self.total_usage = total_usage
         self.create_kwargs = create_kwargs
+        self.failed_attempts = failed_attempts or []
         super().__init__(*args, **kwargs)
 
 
diff --git a/instructor/core/retry.py b/instructor/core/retry.py
@@ -6,7 +6,7 @@
 from json import JSONDecodeError
 from typing import Any, Callable, TypeVar
 
-from .exceptions import InstructorRetryException, AsyncValidationError
+from .exceptions import InstructorRetryException, AsyncValidationError, FailedAttempt
 from .hooks import Hooks
 from ..mode import Mode
 from ..processing.response import (
@@ -175,6 +175,9 @@ def retry_sync(
     # Pre-extract stream flag to avoid repeated lookup
     stream = kwargs.get("stream", False)
 
+    # Track all failed attempts
+    failed_attempts: list[FailedAttempt] = []
+
     try:
         response = None
         for attempt in max_retries:
@@ -200,6 +203,15 @@ def retry_sync(
                     logger.debug(f"Parse error: {e}")
                     hooks.emit_parse_error(e)
 
+                    # Track this failed attempt
+                    failed_attempts.append(
+                        FailedAttempt(
+                            attempt_number=attempt.retry_state.attempt_number,
+                            exception=e,
+                            completion=response,
+                        )
+                    )
+
                     # Check if this is the last attempt
                     if isinstance(max_retries, Retrying) and hasattr(
                         max_retries, "stop"
@@ -231,6 +243,15 @@ def retry_sync(
                     logger.debug(f"Completion error: {e}")
                     hooks.emit_completion_error(e)
 
+                    # Track this failed attempt
+                    failed_attempts.append(
+                        FailedAttempt(
+                            attempt_number=attempt.retry_state.attempt_number,
+                            exception=e,
+                            completion=response,
+                        )
+                    )
+
                     # Check if this is the last attempt for completion errors
                     if isinstance(max_retries, Retrying) and hasattr(
                         max_retries, "stop"
@@ -261,6 +282,7 @@ def retry_sync(
             ),  # Use the optimized function instead of nested lookups
             create_kwargs=kwargs,
             total_usage=total_usage,
+            failed_attempts=failed_attempts,
         ) from e
 
 
@@ -304,6 +326,9 @@ async def retry_async(
     # Pre-extract stream flag to avoid repeated lookup
     stream = kwargs.get("stream", False)
 
+    # Track all failed attempts
+    failed_attempts: list[FailedAttempt] = []
+
     try:
         response = None
         async for attempt in max_retries:
@@ -333,6 +358,15 @@ async def retry_async(
                     logger.debug(f"Parse error: {e}")
                     hooks.emit_parse_error(e)
 
+                    # Track this failed attempt
+                    failed_attempts.append(
+                        FailedAttempt(
+                            attempt_number=attempt.retry_state.attempt_number,
+                            exception=e,
+                            completion=response,
+                        )
+                    )
+
                     # Check if this is the last attempt
                     if isinstance(max_retries, AsyncRetrying) and hasattr(
                         max_retries, "stop"
@@ -364,6 +398,15 @@ async def retry_async(
                     logger.debug(f"Completion error: {e}")
                     hooks.emit_completion_error(e)
 
+                    # Track this failed attempt
+                    failed_attempts.append(
+                        FailedAttempt(
+                            attempt_number=attempt.retry_state.attempt_number,
+                            exception=e,
+                            completion=response,
+                        )
+                    )
+
                     # Check if this is the last attempt for completion errors
                     if isinstance(max_retries, AsyncRetrying) and hasattr(
                         max_retries, "stop"
@@ -394,4 +437,5 @@ async def retry_async(
             ),  # Use the optimized function instead of nested lookups
             create_kwargs=kwargs,
             total_usage=total_usage,
+            failed_attempts=failed_attempts,
         ) from e