feat(ratelimit): add global delay for upstream rate limits #1

Aculeasis · Aculeasis · commit 0c2c5563d4bd · 2025-07-22T08:38:14.000+03:00
Introduces a mechanism to handle upstream rate limits from OpenRouter, which can occur when a specific model is overloaded. This generalizes the previous handling that was specific to Google's 'RESOURCE_EXHAUSTED' errors.

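BREAKING CHANGE: The configuration option `openrouter.google_rate_delay` has been renamed to `openrouter.global_rate_delay` to reflect its broader purpose. Existing config.yml files must be updated: the old key is no longer read, and a missing or invalid `global_rate_delay` falls back to the default of 0 (no delay) with a warning.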
diff --git a/README.md b/README.md
@@ -1,6 +1,6 @@
 # OpenRouter Proxy
 
-A simple proxy server for OpenRouter API that helps bypass rate limits on free API keys 
+A simple proxy server for OpenRouter API that helps bypass rate limits on free API keys
 by rotating through multiple API keys in a round-robin fashion.
 
 ## Features
@@ -64,10 +64,12 @@ openrouter:
   # Time in seconds to temporarily disable a key when rate limit is reached by default
   rate_limit_cooldown: 14400  # 4 hours
   free_only: false # try to show only free models
-  # Google sometimes returns 429 RESOURCE_EXHAUSTED errors repeatedly, which can cause Roo Code to stop.
-  # This prevents repeated failures by introducing a delay before retrying.
-  # google_rate_delay: 10 # in sec
-  google_rate_delay: 0
+  # OpenRouter can return a 429 error if a model is overloaded.
+  # Additionally, Google sometimes returns 429 RESOURCE_EXHAUSTED errors repeatedly,
+  # which can cause Roo Code to stop.
+  # This option prevents repeated failures by introducing a delay before retrying.
+  # global_rate_delay: 10 # in seconds
+  global_rate_delay: 0
 
 # Proxy settings for outgoing requests to OpenRouter
 requestProxy:
diff --git a/config.py b/config.py
@@ -121,14 +121,14 @@ def normalize_and_validate_config(config_data: Dict[str, Any]):
          )
          openrouter_config["free_only"] = default_free_only
 
-    default_google_rate_delay = 0
-    if not isinstance(openrouter_config.get("google_rate_delay"), (int, float)):
+    default_global_rate_delay = 0
+    if not isinstance(openrouter_config.get("global_rate_delay"), (int, float)):
          logger.warning(
-             "'openrouter.google_rate_delay' missing or invalid in config.yml. "
+             "'openrouter.global_rate_delay' missing or invalid in config.yml. "
              "Using default: %s",
-             default_google_rate_delay
+             default_global_rate_delay
          )
-         openrouter_config["google_rate_delay"] = default_google_rate_delay
+         openrouter_config["global_rate_delay"] = default_global_rate_delay
 
     # --- Request Proxy Section ---
     if not isinstance(config_data.get("requestProxy"), dict):
diff --git a/config.yml.example b/config.yml.example
@@ -31,10 +31,12 @@ openrouter:
   # Time in seconds to temporarily disable a key when rate limit is reached by default
   rate_limit_cooldown: 14400  # 4 hours
   free_only: false # try to show only free models
-  # Google sometimes returns 429 RESOURCE_EXHAUSTED errors repeatedly, which can cause Roo Code to stop.
-  # This prevents repeated failures by introducing a delay before retrying.
-  # google_rate_delay: 10 # in sec
-  google_rate_delay: 0
+  # OpenRouter can return a 429 error if a model is overloaded.
+  # Additionally, Google sometimes returns 429 RESOURCE_EXHAUSTED errors repeatedly,
+  # which can cause Roo Code to stop.
+  # This option prevents repeated failures by introducing a delay before retrying.
+  # global_rate_delay: 10 # in seconds
+  global_rate_delay: 0
 
 # Proxy settings for outgoing requests to OpenRouter
 requestProxy:
diff --git a/constants.py b/constants.py
@@ -10,3 +10,8 @@
 RATE_LIMIT_ERROR_CODE = 429
 
 MODELS_ENDPOINTS = ["/api/v1/models"]
+
+GLOBAL_LIMIT_PATTERN = "is temporarily rate-limited upstream"
+
+GOOGLE_LIMIT_ERROR = "Google returned RESOURCE_EXHAUSTED code"
+GLOBAL_LIMIT_ERROR = "Model is temporarily rate-limited upstream"
diff --git a/utils.py b/utils.py
@@ -11,7 +11,7 @@
 from fastapi import Header, HTTPException
 
 from config import config, logger
-from constants import RATE_LIMIT_ERROR_CODE
+from constants import RATE_LIMIT_ERROR_CODE, GOOGLE_LIMIT_ERROR, GLOBAL_LIMIT_ERROR, GLOBAL_LIMIT_PATTERN
 
 
 def get_local_ip() -> str:
@@ -57,7 +57,22 @@ async def verify_access_key(
     return True
 
 
-async def is_google_error(data: str) -> bool:
+def check_global_limit(data: str) -> Optional[str]:
+    """Check `data` for OpenRouter's upstream (global) rate-limit message.
+
+    Returns GLOBAL_LIMIT_ERROR when `data` is a str containing GLOBAL_LIMIT_PATTERN,
+    else None; on a match, logs the text before the first space as the model. E.g.:
+    "google/gemini-2.0-flash-exp:free is temporarily rate-limited upstream.
+    Please retry shortly, or add your own key to accumulate your rate limits:
+    https://openrouter.ai/settings/integrations"
+    """
+    if isinstance(data, str) and GLOBAL_LIMIT_PATTERN in data:
+        logger.warning("Model %s is overloaded.", data.split(' ', 1)[0])
+        return GLOBAL_LIMIT_ERROR
+    return None
+
+
+def check_google_error(data: str) -> Optional[str]:
     # data = {
     #     'error': {
     #         'code': 429,
@@ -84,23 +99,9 @@ async def is_google_error(data: str) -> bool:
         except Exception as e:
             logger.info("Json.loads error %s", e)
         else:
-            if data["error"].get("status", "") == "RESOURCE_EXHAUSTED":
-                if config["openrouter"]["google_rate_delay"]:
-                    # I think this is global rate limit, so 'retryDelay' is useless
-                    # try:
-                    #     retry_info = next(
-                    #         (item for item in data['error']['details']
-                    #          if item.get('@type') == 'type.googleapis.com/google.rpc.RetryInfo'), {}
-                    #     )
-                    #     retry_delay = retry_info['retryDelay']
-                    #     retry_delay_s = int(''.join(c for c in retry_delay if c.isdigit()))
-                    # except (TypeError, KeyError, ValueError) as _:
-                    #     retry_delay_s = GOOGLE_DELAY
-                    logger.info("Google returned RESOURCE_EXHAUSTED, wait %s sec",
-                                config["openrouter"]["google_rate_delay"])
-                    await asyncio.sleep(config["openrouter"]["google_rate_delay"])
-                return True
-    return False
+            if data.get("error", {}).get("status", "") == "RESOURCE_EXHAUSTED":
+                return GOOGLE_LIMIT_ERROR
+    return None
 
 
 async def check_rate_limit(data: str or bytes) -> Tuple[bool, Optional[int]]:
@@ -125,9 +126,13 @@ async def check_rate_limit(data: str or bytes) -> Tuple[bool, Optional[int]]:
             try:
                 x_rate_limit = int(err["error"]["metadata"]["headers"]["X-RateLimit-Reset"])
             except (TypeError, KeyError):
-                if (code == RATE_LIMIT_ERROR_CODE and
-                        await is_google_error(err["error"].get("metadata", {}).get("raw", ""))):
-                    return False, None
+                if code == RATE_LIMIT_ERROR_CODE and (raw := err["error"].get("metadata", {}).get("raw", "")):
+                    issue = check_global_limit(raw) or check_google_error(raw)
+                    if issue:
+                        if config["openrouter"]["global_rate_delay"]:
+                            logger.info("%s, waiting %s seconds.", issue, config["openrouter"]["global_rate_delay"])
+                            await asyncio.sleep(config["openrouter"]["global_rate_delay"])
+                        return False, None
                 x_rate_limit = 0
 
             if x_rate_limit > 0: