Skip to content

Commit 34afd55

Browse files
committed
Improve error handling
1 parent c6ce239 commit 34afd55

File tree

3 files changed

+104
-44
lines changed

3 files changed

+104
-44
lines changed

constants.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,9 +9,6 @@
99
# Rate limit error code
1010
RATE_LIMIT_ERROR_CODE = 429
1111

12-
# Rate limit error message
13-
RATE_LIMIT_ERROR_MESSAGE = "Rate limit exceeded: free-models-per-day"
14-
1512
# Public endpoints that don't require authentication
1613
PUBLIC_ENDPOINTS = ["/api/v1/models"]
1714

routes.py

Lines changed: 36 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
from key_manager import KeyManager
1717
from utils import (
1818
verify_access_key,
19-
check_rate_limit_openai,
19+
check_rate_limit_chat,
2020
check_rate_limit
2121
)
2222

@@ -128,8 +128,10 @@ async def proxy_endpoint(
128128
request, path, api_key, is_stream, is_completion
129129
)
130130

131-
except Exception as e:
131+
except (Exception, HTTPException) as e:
132132
logger.error("Error proxying request: %s", str(e))
133+
if isinstance(e, HTTPException):
134+
raise e
133135
raise HTTPException(status_code=500, detail=f"Proxy error: {str(e)}") from e
134136

135137

@@ -143,25 +145,27 @@ async def handle_completions(
143145
"""Handle chat completions using the OpenAI client."""
144146
try:
145147
# Extract headers to forward
146-
forward_headers = {}
147-
for k, v in request.headers.items():
148-
if k.lower() in ["http-referer", "x-title"]:
149-
forward_headers[k] = v
148+
forward_headers = {
149+
k: v
150+
for k, v in request.headers.items()
151+
if k.lower()
152+
not in ["host", "content-length", "connection", "authorization"]
153+
}
150154

151155
# Create a copy of the request body to modify
152156
completion_args = request_body.copy()
153157

158+
# Ensure we don't pass 'stream' twice
159+
if "stream" in completion_args:
160+
del completion_args["stream"]
161+
154162
# Move non-standard parameters that OpenAI SDK doesn't support directly to extra_body
155163
extra_body = {}
156164
openai_unsupported_params = ["include_reasoning", "transforms", "route", "provider"]
157165
for param in openai_unsupported_params:
158166
if param in completion_args:
159167
extra_body[param] = completion_args.pop(param)
160168

161-
# Ensure we don't pass 'stream' twice
162-
if "stream" in completion_args:
163-
del completion_args["stream"]
164-
165169
# Create a properly formatted request to the OpenAI API
166170
if is_stream:
167171
logger.info("Making streaming chat completion request")
@@ -186,7 +190,7 @@ async def stream_response() -> AsyncGenerator[bytes, None]:
186190
logger.error("Error in streaming response: %s", err)
187191
# Check if this is a rate limit error
188192
if api_key:
189-
has_rate_limit_error_, reset_time_ms_ = check_rate_limit_openai(err)
193+
has_rate_limit_error_, reset_time_ms_ = check_rate_limit_chat(err)
190194
if has_rate_limit_error_:
191195
logger.warning("Rate limit detected in stream. Disabling key.")
192196
await key_manager.disable_key(
@@ -221,26 +225,30 @@ async def stream_response() -> AsyncGenerator[bytes, None]:
221225
)
222226
except (APIError, Exception) as e:
223227
logger.error("Error in chat completions: %s", str(e))
224-
# Check if this is a rate limit error
225-
if api_key and isinstance(e, APIError):
226-
has_rate_limit_error, reset_time_ms = check_rate_limit_openai(e)
227-
if has_rate_limit_error:
228-
logger.warning("Rate limit detected in stream. Disabling key.")
229-
await key_manager.disable_key(api_key, reset_time_ms)
230-
231-
# Try again with a new key
232-
new_api_key = await key_manager.get_next_key()
233-
if new_api_key:
234-
new_client = await get_openai_client(new_api_key)
235-
return await handle_completions(
236-
new_client, request, request_body, new_api_key, is_stream
237-
)
238-
228+
code = 500
229+
detail = f"Error processing chat completion: {str(e)}"
230+
if isinstance(e, APIError):
231+
# Check if this is a rate limit error
232+
if api_key:
233+
has_rate_limit_error, reset_time_ms = check_rate_limit_chat(e)
234+
if has_rate_limit_error:
235+
logger.warning("Rate limit detected in stream. Disabling key.")
236+
await key_manager.disable_key(api_key, reset_time_ms)
237+
238+
# Try again with a new key
239+
new_api_key = await key_manager.get_next_key()
240+
if new_api_key:
241+
new_client = await get_openai_client(new_api_key)
242+
return await handle_completions(
243+
new_client, request, request_body, new_api_key, is_stream
244+
)
245+
code = e.code or code
246+
detail = e.body or detail
239247
# Raise the exception
240-
raise HTTPException(500, f"Error processing chat completion: {str(e)}") from e
248+
raise HTTPException(code, detail) from e
241249

242250

243-
async def _check_httpx_err(body: str or bytes, api_key: str or None):
251+
async def _check_httpx_err(body: str | bytes, api_key: str | None):
244252
if api_key and (isinstance(body, str) and body.startswith("data: ") or (
245253
isinstance(body, bytes) and body.startswith(b"data: "))):
246254
body = body[6:]

utils.py

Lines changed: 68 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -4,14 +4,15 @@
44
"""
55

66
import socket
7+
import time
78
import json
89
from typing import Optional, Tuple
910

1011
from fastapi import Header, HTTPException
1112
from openai import APIError
1213

1314
from config import config, logger
14-
from constants import RATE_LIMIT_ERROR_MESSAGE, RATE_LIMIT_ERROR_CODE
15+
from constants import RATE_LIMIT_ERROR_CODE
1516

1617

1718
def get_local_ip() -> str:
@@ -55,7 +56,58 @@ async def verify_access_key(
5556

5657
return True
5758

58-
def check_rate_limit_openai(err: APIError) -> Tuple[bool, Optional[int]]:
59+
60+
def parse_google_rate_error(data: str) -> Optional[int]:
    """Compute the rate-limit reset time from a Google error payload.

    The payload is expected to be a JSON document shaped like::

        {
          "error": {
            "code": 429,
            "message": "You exceeded your current quota, ...",
            "status": "RESOURCE_EXHAUSTED",
            "details": [
              {"@type": "type.googleapis.com/google.rpc.QuotaFailure", ...},
              {"@type": "type.googleapis.com/google.rpc.Help", ...},
              {"@type": "type.googleapis.com/google.rpc.RetryInfo",
               "retryDelay": "5s"}
            ]
          }
        }

    Args:
        data: Raw JSON text of the upstream error.

    Returns:
        The current time plus the advertised retry delay, as epoch
        milliseconds, or ``None`` when the payload cannot be parsed or
        carries no positive ``retryDelay``.
    """
    # Multipliers converting a retryDelay unit suffix to milliseconds.
    # Unknown or missing suffixes are treated as seconds.
    time_units = {'ms': 1, 's': 1000, 'm': 60000, 'h': 3600000}

    try:
        data = json.loads(data)
    except (TypeError, ValueError) as e:
        logger.info("Json.loads error %s", e)
        return None

    try:
        error = data["error"]
        message = error.get("message", "")

        retry_info = next(
            (item for item in error['details']
             if item.get('@type') == 'type.googleapis.com/google.rpc.RetryInfo'),
            {},
        )
        retry_delay = str(retry_info.get('retryDelay', '0s')).strip()

        # Split "<number><unit>" at the first letter so that a fractional
        # delay such as "1.5s" keeps its decimal point instead of being
        # collapsed into the digits "15".
        split_at = next(
            (i for i, c in enumerate(retry_delay) if c.isalpha()),
            len(retry_delay),
        )
        num_part = retry_delay[:split_at].strip()
        unit_part = retry_delay[split_at:].strip()

        if num_part:
            retry_delay_ms = int(float(num_part) * time_units.get(unit_part, 1000))
        else:
            retry_delay_ms = 0
    except (TypeError, KeyError, AttributeError, ValueError) as err:
        # Malformed payload: best effort, report "no reset time known".
        logger.info("google reply parsing error %s", err)
        return None

    logger.info("google rate limit %s, retry: %s", message, retry_delay)

    if retry_delay_ms <= 0:
        return None
    return int(time.time() * 1000) + retry_delay_ms
109+
110+
def check_rate_limit_chat(err: APIError) -> Tuple[bool, Optional[int]]:
59111
"""
60112
Check for rate limit error.
61113
@@ -68,15 +120,15 @@ def check_rate_limit_openai(err: APIError) -> Tuple[bool, Optional[int]]:
68120
has_rate_limit_error = False
69121
reset_time_ms = None
70122

71-
if err.code == RATE_LIMIT_ERROR_CODE and isinstance(err.body, dict):
72-
try:
73-
reset_time_ms = int(err.body["metadata"]["headers"]["X-RateLimit-Reset"])
74-
has_rate_limit_error = True
75-
except (TypeError, KeyError):
76-
pass
77-
78-
if reset_time_ms is None and RATE_LIMIT_ERROR_MESSAGE in err.message:
123+
if err.code == RATE_LIMIT_ERROR_CODE:
79124
has_rate_limit_error = True
125+
if isinstance(err.body, dict):
126+
try:
127+
reset_time_ms = int(err.body["metadata"]["headers"]["X-RateLimit-Reset"])
128+
except (TypeError, KeyError):
129+
raw = err.body.get("metadata", {}).get("raw", "")
130+
if raw and has_rate_limit_error:
131+
reset_time_ms = parse_google_rate_error(raw)
80132

81133
return has_rate_limit_error, reset_time_ms
82134

@@ -100,16 +152,19 @@ def check_rate_limit(data: str or bytes) -> Tuple[bool, Optional[int]]:
100152
else:
101153
if isinstance(err, dict) and "error" in err:
102154
code = err["error"].get("code", 0)
103-
msg = err["error"].get("message", 0)
104155
try:
105156
x_rate_limit = int(err["error"]["metadata"]["headers"]["X-RateLimit-Reset"])
106157
except (TypeError, KeyError):
107-
x_rate_limit = 0
158+
raw = err["error"].get("metadata", {}).get("raw", "")
159+
if raw and code == RATE_LIMIT_ERROR_CODE:
160+
x_rate_limit = parse_google_rate_error(raw)
161+
else:
162+
x_rate_limit = 0
108163

109164
if x_rate_limit > 0:
110165
has_rate_limit_error = True
111166
reset_time_ms = x_rate_limit
112-
elif code == RATE_LIMIT_ERROR_CODE and msg == RATE_LIMIT_ERROR_MESSAGE:
167+
elif code == RATE_LIMIT_ERROR_CODE:
113168
has_rate_limit_error = True
114169

115170
return has_rate_limit_error, reset_time_ms

0 commit comments

Comments
 (0)