Skip to content

Commit f8924d2

Browse files
committed
feat: Add OpenRouter prompt caching support with usage tracking
Backend (api.rs):
- Add usage: {include: true} to OpenRouter requests (streaming and non-streaming)
- Extract and log cached_tokens from prompt_tokens_details
- Log cache hit ratio when tokens are cached

SDK (llm.py):
- Add cached_tokens, prompt_tokens, completion_tokens fields to LLMResponse
- Extract cached_tokens from usage.prompt_tokens_details
- Log cache hit percentage when available

This enables proper cost tracking with Anthropic prompt caching via OpenRouter. The cost returned by OpenRouter already includes the cache discount.
1 parent 2b469ee commit f8924d2

2 files changed

Lines changed: 82 additions & 5 deletions

File tree

sdk/python/term_sdk/llm.py

Lines changed: 46 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -103,14 +103,31 @@ def __str__(self) -> str:
103103

104104
@dataclass
105105
class LLMResponse:
106-
"""Response from LLM."""
106+
"""Response from LLM.
107+
108+
Attributes:
109+
text: The response text content
110+
model: The model used
111+
tokens: Total tokens used
112+
cost: Cost in USD (after cache discount if applicable)
113+
latency_ms: Response latency in milliseconds
114+
function_calls: List of function/tool calls
115+
raw: Raw response data
116+
cached_tokens: Number of tokens read from cache (reduces cost)
117+
prompt_tokens: Number of input/prompt tokens
118+
completion_tokens: Number of output/completion tokens
119+
"""
107120
text: str
108121
model: str
109122
tokens: int = 0
110123
cost: float = 0.0
111124
latency_ms: int = 0
112125
function_calls: List[FunctionCall] = field(default_factory=list)
113126
raw: Optional[Dict[str, Any]] = None
127+
# Cache info (OpenRouter with usage: {include: true})
128+
cached_tokens: int = 0
129+
prompt_tokens: int = 0
130+
completion_tokens: int = 0
114131

115132
def json(self) -> Optional[Dict]:
116133
"""Parse response text as JSON."""
@@ -1939,15 +1956,24 @@ def _parse_platform_response(self, data: Dict, model: str, start: float) -> LLMR
19391956
completion_tokens = usage.get("completion_tokens", 0)
19401957
total_tokens = usage.get("total_tokens", prompt_tokens + completion_tokens)
19411958

1942-
cost = data.get("cost_usd", 0.0)
1959+
# Extract cached tokens from prompt_tokens_details (OpenRouter with usage: {include: true})
1960+
prompt_details = usage.get("prompt_tokens_details", {}) or {}
1961+
cached_tokens = prompt_details.get("cached_tokens", 0) or 0
1962+
1963+
cost = data.get("cost_usd", 0.0) or 0.0
19431964
latency_ms = int((time.time() - start) * 1000)
19441965

19451966
self.total_tokens += total_tokens
19461967
self.total_cost += cost
19471968
self.request_count += 1
19481969
self._update_model_stats(response_model, total_tokens, cost)
19491970

1950-
_log(f"[platform] {response_model}: {total_tokens} tokens, ${cost:.4f}, {latency_ms}ms")
1971+
# Log with cache info if available
1972+
if cached_tokens > 0:
1973+
cache_pct = (cached_tokens / prompt_tokens * 100) if prompt_tokens > 0 else 0
1974+
_log(f"[platform] {response_model}: {total_tokens} tokens ({cached_tokens} cached, {cache_pct:.0f}%), ${cost:.4f}, {latency_ms}ms")
1975+
else:
1976+
_log(f"[platform] {response_model}: {total_tokens} tokens, ${cost:.4f}, {latency_ms}ms")
19511977

19521978
# Parse function calls / tool calls if present in platform response
19531979
function_calls = []
@@ -2010,6 +2036,9 @@ def _parse_platform_response(self, data: Dict, model: str, start: float) -> LLMR
20102036
latency_ms=latency_ms,
20112037
function_calls=function_calls,
20122038
raw=data,
2039+
cached_tokens=cached_tokens,
2040+
prompt_tokens=prompt_tokens,
2041+
completion_tokens=completion_tokens,
20132042
)
20142043

20152044
def _parse_response(self, data: Dict, model: str, start: float) -> LLMResponse:
@@ -2038,17 +2067,26 @@ def _parse_response(self, data: Dict, model: str, start: float) -> LLMResponse:
20382067
completion_tokens = usage.get("completion_tokens", 0)
20392068
total_tokens = prompt_tokens + completion_tokens
20402069

2070+
# Extract cached tokens from prompt_tokens_details (OpenRouter with usage: {include: true})
2071+
prompt_details = usage.get("prompt_tokens_details", {}) or {}
2072+
cached_tokens = prompt_details.get("cached_tokens", 0) or 0
2073+
20412074
# Use provider-reported cost if available (OpenRouter returns usage.cost)
20422075
# OpenAI doesn't return cost, so default to 0
2043-
cost = usage.get("cost", 0.0)
2076+
cost = usage.get("cost", 0.0) or 0.0
20442077
latency_ms = int((time.time() - start) * 1000)
20452078

20462079
self.total_tokens += total_tokens
20472080
self.total_cost += cost
20482081
self.request_count += 1
20492082
self._update_model_stats(model, total_tokens, cost)
20502083

2051-
_log(f"{model}: {total_tokens} tokens, ${cost:.4f}, {latency_ms}ms")
2084+
# Log with cache info if available
2085+
if cached_tokens > 0:
2086+
cache_pct = (cached_tokens / prompt_tokens * 100) if prompt_tokens > 0 else 0
2087+
_log(f"{model}: {total_tokens} tokens ({cached_tokens} cached, {cache_pct:.0f}%), ${cost:.4f}, {latency_ms}ms")
2088+
else:
2089+
_log(f"{model}: {total_tokens} tokens, ${cost:.4f}, {latency_ms}ms")
20522090

20532091
return LLMResponse(
20542092
text=text,
@@ -2058,6 +2096,9 @@ def _parse_response(self, data: Dict, model: str, start: float) -> LLMResponse:
20582096
latency_ms=latency_ms,
20592097
function_calls=function_calls,
20602098
raw=data,
2099+
cached_tokens=cached_tokens,
2100+
prompt_tokens=prompt_tokens,
2101+
completion_tokens=completion_tokens,
20612102
)
20622103

20632104
def _update_model_stats(self, model: str, tokens: int, cost: float):

src/api.rs

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3892,6 +3892,15 @@ async fn make_llm_request(
38923892
}
38933893
}
38943894

3895+
// For OpenRouter: add usage: {include: true} to get cost and cache info in response
3896+
// This enables prompt_tokens_details.cached_tokens and usage.cost fields
3897+
// See: https://openrouter.ai/docs/guides/guides/usage-accounting
3898+
if provider == "openrouter" {
3899+
if let Some(base) = body.as_object_mut() {
3900+
base.insert("usage".to_string(), serde_json::json!({"include": true}));
3901+
}
3902+
}
3903+
38953904
// Transform request for Anthropic Messages API format
38963905
// Only for direct Anthropic API - OpenRouter handles the transformation itself
38973906
// OpenRouter uses OpenAI-compatible format (messages array with system role)
@@ -4016,6 +4025,24 @@ async fn make_llm_request(
40164025
// If provider doesn't report cost, it will be None (SDK will use 0)
40174026
let cost_usd = provider_cost;
40184027

4028+
// Log cache information if available (OpenRouter with usage: {include: true})
4029+
// cached_tokens = tokens read from cache (reduces cost)
4030+
let cached_tokens = json["usage"]["prompt_tokens_details"]["cached_tokens"]
4031+
.as_u64()
4032+
.unwrap_or(0);
4033+
if cached_tokens > 0 {
4034+
let prompt_tokens = json["usage"]["prompt_tokens"].as_u64().unwrap_or(0);
4035+
let cache_hit_ratio = if prompt_tokens > 0 {
4036+
(cached_tokens as f64 / prompt_tokens as f64) * 100.0
4037+
} else {
4038+
0.0
4039+
};
4040+
info!(
4041+
"LLM cache hit: {} cached of {} prompt tokens ({:.1}% hit rate)",
4042+
cached_tokens, prompt_tokens, cache_hit_ratio
4043+
);
4044+
}
4045+
40194046
// Extract tool_calls if present (OpenAI/OpenRouter format)
40204047
let tool_calls = json["choices"][0]["message"]["tool_calls"]
40214048
.as_array()
@@ -4372,6 +4399,15 @@ async fn make_llm_stream_request(
43724399
}
43734400
}
43744401

4402+
// For OpenRouter: add usage: {include: true} to get cost and cache info in final SSE chunk
4403+
// This enables prompt_tokens_details.cached_tokens and usage.cost fields
4404+
// See: https://openrouter.ai/docs/guides/guides/usage-accounting
4405+
if provider == "openrouter" {
4406+
if let Some(base) = body.as_object_mut() {
4407+
base.insert("usage".to_string(), serde_json::json!({"include": true}));
4408+
}
4409+
}
4410+
43754411
// Transform request for Anthropic Messages API format
43764412
// (system messages must be top-level `system` param, not in messages array)
43774413
// Skip if using Responses API

0 commit comments

Comments
 (0)