@@ -103,14 +103,31 @@ def __str__(self) -> str:
103103
104104@dataclass
105105class LLMResponse :
106- """Response from LLM."""
106+ """Response from LLM.
107+
108+ Attributes:
109+ text: The response text content
110+ model: The model used
111+ tokens: Total tokens used
112+ cost: Cost in USD (after cache discount if applicable)
113+ latency_ms: Response latency in milliseconds
114+ function_calls: List of function/tool calls
115+ raw: Raw response data
116+ cached_tokens: Number of tokens read from cache (reduces cost)
117+ prompt_tokens: Number of input/prompt tokens
118+ completion_tokens: Number of output/completion tokens
119+ """
107120 text : str
108121 model : str
109122 tokens : int = 0
110123 cost : float = 0.0
111124 latency_ms : int = 0
112125 function_calls : List [FunctionCall ] = field (default_factory = list )
113126 raw : Optional [Dict [str , Any ]] = None
127+ # Token usage breakdown; cached_tokens is reported by OpenRouter with usage: {include: true}
128+ cached_tokens : int = 0
129+ prompt_tokens : int = 0
130+ completion_tokens : int = 0
114131
115132 def json (self ) -> Optional [Dict ]:
116133 """Parse response text as JSON."""
@@ -1939,15 +1956,24 @@ def _parse_platform_response(self, data: Dict, model: str, start: float) -> LLMR
19391956 completion_tokens = usage .get ("completion_tokens" , 0 )
19401957 total_tokens = usage .get ("total_tokens" , prompt_tokens + completion_tokens )
19411958
1942- cost = data .get ("cost_usd" , 0.0 )
1959+ # Extract cached tokens from prompt_tokens_details (OpenRouter with usage: {include: true})
1960+ prompt_details = usage .get ("prompt_tokens_details" , {}) or {}
1961+ cached_tokens = prompt_details .get ("cached_tokens" , 0 ) or 0
1962+
1963+ cost = data .get ("cost_usd" , 0.0 ) or 0.0
19431964 latency_ms = int ((time .time () - start ) * 1000 )
19441965
19451966 self .total_tokens += total_tokens
19461967 self .total_cost += cost
19471968 self .request_count += 1
19481969 self ._update_model_stats (response_model , total_tokens , cost )
19491970
1950- _log (f"[platform] { response_model } : { total_tokens } tokens, ${ cost :.4f} , { latency_ms } ms" )
1971+ # Log with cache info if available
1972+ if cached_tokens > 0 :
1973+ cache_pct = (cached_tokens / prompt_tokens * 100 ) if prompt_tokens > 0 else 0
1974+ _log (f"[platform] { response_model } : { total_tokens } tokens ({ cached_tokens } cached, { cache_pct :.0f} %), ${ cost :.4f} , { latency_ms } ms" )
1975+ else :
1976+ _log (f"[platform] { response_model } : { total_tokens } tokens, ${ cost :.4f} , { latency_ms } ms" )
19511977
19521978 # Parse function calls / tool calls if present in platform response
19531979 function_calls = []
@@ -2010,6 +2036,9 @@ def _parse_platform_response(self, data: Dict, model: str, start: float) -> LLMR
20102036 latency_ms = latency_ms ,
20112037 function_calls = function_calls ,
20122038 raw = data ,
2039+ cached_tokens = cached_tokens ,
2040+ prompt_tokens = prompt_tokens ,
2041+ completion_tokens = completion_tokens ,
20132042 )
20142043
20152044 def _parse_response (self , data : Dict , model : str , start : float ) -> LLMResponse :
@@ -2038,17 +2067,26 @@ def _parse_response(self, data: Dict, model: str, start: float) -> LLMResponse:
20382067 completion_tokens = usage .get ("completion_tokens" , 0 )
20392068 total_tokens = prompt_tokens + completion_tokens
20402069
2070+ # Extract cached tokens from prompt_tokens_details (OpenRouter with usage: {include: true})
2071+ prompt_details = usage .get ("prompt_tokens_details" , {}) or {}
2072+ cached_tokens = prompt_details .get ("cached_tokens" , 0 ) or 0
2073+
20412074 # Use provider-reported cost if available (OpenRouter returns usage.cost)
20422075 # OpenAI doesn't return cost, so default to 0
2043- cost = usage .get ("cost" , 0.0 )
2076+ cost = usage .get ("cost" , 0.0 ) or 0.0
20442077 latency_ms = int ((time .time () - start ) * 1000 )
20452078
20462079 self .total_tokens += total_tokens
20472080 self .total_cost += cost
20482081 self .request_count += 1
20492082 self ._update_model_stats (model , total_tokens , cost )
20502083
2051- _log (f"{ model } : { total_tokens } tokens, ${ cost :.4f} , { latency_ms } ms" )
2084+ # Log with cache info if available
2085+ if cached_tokens > 0 :
2086+ cache_pct = (cached_tokens / prompt_tokens * 100 ) if prompt_tokens > 0 else 0
2087+ _log (f"{ model } : { total_tokens } tokens ({ cached_tokens } cached, { cache_pct :.0f} %), ${ cost :.4f} , { latency_ms } ms" )
2088+ else :
2089+ _log (f"{ model } : { total_tokens } tokens, ${ cost :.4f} , { latency_ms } ms" )
20522090
20532091 return LLMResponse (
20542092 text = text ,
@@ -2058,6 +2096,9 @@ def _parse_response(self, data: Dict, model: str, start: float) -> LLMResponse:
20582096 latency_ms = latency_ms ,
20592097 function_calls = function_calls ,
20602098 raw = data ,
2099+ cached_tokens = cached_tokens ,
2100+ prompt_tokens = prompt_tokens ,
2101+ completion_tokens = completion_tokens ,
20612102 )
20622103
20632104 def _update_model_stats (self , model : str , tokens : int , cost : float ):
0 commit comments