@@ -29,6 +29,55 @@ use serde::{Deserialize, Serialize};
2929
3030const SCOREBOARD_VERSION : u32 = 1 ;
3131
/// Current wall-clock time as whole milliseconds since the Unix epoch.
///
/// Returns `0` when the system clock reads earlier than the epoch, so
/// callers never have to deal with a clock error.
fn now_unix_ms() -> u64 {
    use std::time::{SystemTime, UNIX_EPOCH};

    match SystemTime::now().duration_since(UNIX_EPOCH) {
        Ok(since_epoch) => {
            // `as_millis` yields a u128; narrowing to u64 is deliberate so the
            // rest of the module can work with a plain u64 epoch-ms value.
            #[allow(clippy::cast_possible_truncation)]
            let millis = since_epoch.as_millis() as u64;
            millis
        }
        Err(_) => 0,
    }
}
43+
44+ /// Multiply `successes` / `failures` / `total_latency_ms` by
45+ /// `0.5 ^ (elapsed_ms / half_life_ms)`. Rows without a prior
46+ /// observation (loaded from older scoreboards or freshly inserted) are
47+ /// skipped so decay doesn't punish data that predates the feature.
48+ fn apply_decay ( entry : & mut ModelStats , now_ms : u64 , half_life_ms : u64 ) {
49+ if entry. last_observation_ms == 0 || half_life_ms == 0 || now_ms <= entry. last_observation_ms {
50+ return ;
51+ }
52+ let elapsed_ms = now_ms. saturating_sub ( entry. last_observation_ms ) ;
53+ // `0.5 ^ k` = `exp(-k * ln 2)`. We never use more than `elapsed /
54+ // half_life` half-lives of decay; even across 100 years of elapsed
55+ // time with a 1-hour half-life the exponent is finite and safe in
56+ // f64 (factor ≈ 0 well before any overflow).
57+ #[ allow( clippy:: cast_precision_loss) ]
58+ let ratio = ( elapsed_ms as f64 ) / ( half_life_ms as f64 ) ;
59+ let factor = ( -ratio * std:: f64:: consts:: LN_2 ) . exp ( ) ;
60+ entry. successes = decay_count ( entry. successes , factor) ;
61+ entry. failures = decay_count ( entry. failures , factor) ;
62+ entry. total_latency_ms = decay_count ( entry. total_latency_ms , factor) ;
63+ }
64+
/// Scale the counter `value` by `factor`, rounding to the nearest integer.
///
/// - `value == 0` or `factor <= 0.0` yields 0.
/// - `factor >= 1.0` yields `value` unchanged, so a counter never grows.
/// - Otherwise the product is rounded with `f64::round` (halves round away
///   from zero) and converted back to `u64`. A NaN factor also lands in
///   this branch, where the float-to-integer cast produces 0.
// The u64 -> f64 -> u64 round trip is intentional: the product is
// non-negative and no larger than `value` in the only branch that casts.
#[allow(
    clippy::cast_precision_loss,
    clippy::cast_possible_truncation,
    clippy::cast_sign_loss
)]
fn decay_count(value: u64, factor: f64) -> u64 {
    if value == 0 || factor <= 0.0 {
        0
    } else if factor >= 1.0 {
        value
    } else {
        let scaled = (value as f64) * factor;
        scaled.round() as u64
    }
}
80+
3281/// Coarse prompt bucket used as the scoreboard key. Keeping the alphabet
3382/// small means the scoreboard converges fast even on small session
3483/// corpora; finer-grained bucketing (topic, tool density, …) is a natural
@@ -74,6 +123,13 @@ impl Display for PromptBucket {
74123}
75124
76125/// Aggregated statistics for one candidate model inside a single bucket.
126+ ///
/// Counts decay toward zero over time when outcomes are recorded with a
/// non-`None` `half_life_ms`: the decay factor
/// `0.5 ^ (elapsed_ms / half_life_ms)` is applied to `successes`,
/// `failures`, and `total_latency_ms` before the new sample is
/// added. Token totals are not decayed (they're only kept for cost
/// analysis, not the selection decision).
77133#[ derive( Debug , Clone , Default , PartialEq , Serialize , Deserialize ) ]
78134pub struct ModelStats {
79135 #[ serde( default ) ]
@@ -86,6 +142,19 @@ pub struct ModelStats {
86142 pub total_input_tokens : u64 ,
87143 #[ serde( default ) ]
88144 pub total_output_tokens : u64 ,
145+ /// Cumulative provider cost in micro-dollars (10⁻⁶ USD) for this
146+ /// (bucket, model) row. Storing an integer keeps the JSON schema
147+ /// free of floating-point values; a u64 of micro-dollars holds up to
148+ /// ~18 trillion dollars, which is enough headroom. Not decayed — we
149+ /// want a faithful lifetime cost figure for the `claw eval` report.
150+ #[ serde( default ) ]
151+ pub total_cost_micros : u64 ,
152+ /// Unix-millis of the last observation merged into this row.
153+ /// Default 0 on rows loaded from pre-decay scoreboards; the first
154+ /// recorded outcome after load skips decay so we don't penalize
155+ /// historical data simply because it predates the feature.
156+ #[ serde( default ) ]
157+ pub last_observation_ms : u64 ,
89158}
90159
91160impl ModelStats {
@@ -115,6 +184,14 @@ impl ModelStats {
115184 self . total_latency_ms / samples
116185 }
117186 }
187+
188+ /// Cumulative cost in dollars for this (bucket, model) row. Derived
189+ /// from `total_cost_micros`.
190+ #[ must_use]
191+ #[ allow( clippy:: cast_precision_loss) ]
192+ pub fn total_cost_usd ( & self ) -> f64 {
193+ ( self . total_cost_micros as f64 ) / 1_000_000.0
194+ }
118195}
119196
120197/// Reason the selector returned a particular model. Surfaced to stderr for
@@ -345,6 +422,9 @@ impl RouterScoreboard {
345422 /// Record a turn's outcome. `input_tokens` / `output_tokens` are purely
346423 /// informational (cost analysis); the routing decision itself only
347424 /// depends on `success`.
425+ ///
426+ /// Equivalent to [`record_outcome_with_decay`](Self::record_outcome_with_decay)
427+ /// called with `half_life_ms = None` and `cost_micros = 0`.
348428 pub fn record_outcome (
349429 & mut self ,
350430 bucket : PromptBucket ,
@@ -354,17 +434,57 @@ impl RouterScoreboard {
354434 input_tokens : u32 ,
355435 output_tokens : u32 ,
356436 ) {
437+ self . record_outcome_with_decay (
438+ bucket,
439+ model,
440+ success,
441+ latency_ms,
442+ input_tokens,
443+ output_tokens,
444+ 0 ,
445+ None ,
446+ ) ;
447+ }
448+
449+ /// Record a turn's outcome, optionally decaying prior counts first
450+ /// and crediting cost against the row.
451+ ///
452+ /// - `cost_micros`: provider cost for this turn in micro-dollars.
453+ /// Added to the row's lifetime cumulative cost; not decayed.
454+ /// - `half_life_ms`: if `Some(h)` and the row has a recorded
455+ /// `last_observation_ms`, existing `successes`, `failures`, and
456+ /// `total_latency_ms` are multiplied by `0.5 ^ (elapsed_ms / h)`
457+ /// before the new sample lands. Passing `None` preserves the
458+ /// pre-decay append-forever behavior.
459+ #[ allow( clippy:: too_many_arguments) ]
460+ pub fn record_outcome_with_decay (
461+ & mut self ,
462+ bucket : PromptBucket ,
463+ model : & str ,
464+ success : bool ,
465+ latency_ms : u64 ,
466+ input_tokens : u32 ,
467+ output_tokens : u32 ,
468+ cost_micros : u64 ,
469+ half_life_ms : Option < u64 > ,
470+ ) {
471+ let now_ms = now_unix_ms ( ) ;
357472 let entry = self
358473 . state
359474 . buckets
360475 . entry ( bucket)
361476 . or_default ( )
362477 . entry ( model. to_string ( ) )
363478 . or_default ( ) ;
479+
480+ if let Some ( half_life) = half_life_ms {
481+ apply_decay ( entry, now_ms, half_life) ;
482+ }
483+
364484 if success {
365- entry. successes += 1 ;
485+ entry. successes = entry . successes . saturating_add ( 1 ) ;
366486 } else {
367- entry. failures += 1 ;
487+ entry. failures = entry . failures . saturating_add ( 1 ) ;
368488 }
369489 entry. total_latency_ms = entry. total_latency_ms . saturating_add ( latency_ms) ;
370490 entry. total_input_tokens = entry
@@ -373,6 +493,8 @@ impl RouterScoreboard {
373493 entry. total_output_tokens = entry
374494 . total_output_tokens
375495 . saturating_add ( u64:: from ( output_tokens) ) ;
496+ entry. total_cost_micros = entry. total_cost_micros . saturating_add ( cost_micros) ;
497+ entry. last_observation_ms = now_ms;
376498 }
377499
378500 /// Pick the model for the next turn.
@@ -620,6 +742,156 @@ mod tests {
620742 assert_eq ! ( short. failures, 0 ) ;
621743 }
622744
#[test]
fn decay_halves_counts_after_one_half_life() {
    // `decay_count` does the actual arithmetic, so check it directly with
    // fixed factors rather than involving a real clock.
    // Each case is (value, factor, expected).
    let cases: [(u64, f64, u64); 5] = [
        (100, 0.5, 50),
        (10, 0.25, 3), // 2.5 rounds to 3
        (0, 0.5, 0),
        (1, 1.0, 1),
        (5, 0.0, 0),
    ];
    for (value, factor, expected) in cases {
        assert_eq!(decay_count(value, factor), expected);
    }
}
755+
#[test]
fn apply_decay_skips_rows_with_no_prior_observation() {
    // `last_observation_ms: 0` marks a row with no prior observation.
    let mut row = ModelStats {
        last_observation_ms: 0,
        successes: 10,
        failures: 5,
        total_latency_ms: 1_000,
        ..Default::default()
    };

    let (now_ms, half_life_ms) = (1_000_000, 100_000);
    apply_decay(&mut row, now_ms, half_life_ms);

    // Decay must be skipped entirely: every decayable field keeps its value.
    assert_eq!(
        (row.successes, row.failures, row.total_latency_ms),
        (10, 5, 1_000)
    );
}
770+
#[test]
fn apply_decay_scales_counts_by_half_life_exponent() {
    const HALF_LIFE_MS: u64 = 3_600_000;
    const LAST_SEEN_MS: u64 = 1_000;

    let mut row = ModelStats {
        successes: 100,
        failures: 40,
        total_latency_ms: 10_000,
        // Token and cost totals are expected to pass through unchanged.
        total_input_tokens: 7_777,
        total_output_tokens: 3_333,
        total_cost_micros: 5_000_000, // $5
        last_observation_ms: LAST_SEEN_MS,
    };

    // Exactly one half-life after the last observation: factor ~0.5.
    apply_decay(&mut row, LAST_SEEN_MS + HALF_LIFE_MS, HALF_LIFE_MS);

    assert!(
        matches!(row.successes, 49..=51),
        "expected ~50, got {}",
        row.successes
    );
    assert!(
        matches!(row.failures, 19..=21),
        "expected ~20, got {}",
        row.failures
    );
    assert!(
        matches!(row.total_latency_ms, 4_900..=5_100),
        "expected ~5000, got {}",
        row.total_latency_ms
    );

    // Decay only touches the three counters above.
    assert_eq!(row.total_input_tokens, 7_777);
    assert_eq!(row.total_output_tokens, 3_333);
    assert_eq!(row.total_cost_micros, 5_000_000);
}
804+
#[test]
fn record_outcome_with_decay_forgets_old_samples() {
    const MODEL: &str = "claude-haiku-4-5";
    const HALF_LIFE_MS: u64 = 3_600_000; // 1-hour half-life

    // Backdate the row by 30 half-lives (30 hours) instead of waiting on
    // the wall clock. The decay factor is then 2^-30 ≈ 1e-9, so every
    // historical count rounds to zero.
    let stale_ms = now_unix_ms().saturating_sub(30 * HALF_LIFE_MS);

    let mut board = RouterScoreboard::in_memory();
    {
        let seeded = board
            .state
            .buckets
            .entry(PromptBucket::Short)
            .or_default()
            .entry(MODEL.to_string())
            .or_default();
        seeded.successes = 100;
        seeded.failures = 0;
        seeded.last_observation_ms = stale_ms;
    }

    board.record_outcome_with_decay(
        PromptBucket::Short,
        MODEL,
        true,
        100,
        10,
        5,
        0,
        Some(HALF_LIFE_MS),
    );

    // The 100 seeded successes are gone; only the success recorded just
    // now is left.
    let stats = board.stats(PromptBucket::Short, MODEL);
    assert_eq!(stats.successes, 1);
    assert_eq!(stats.failures, 0);
}
843+
#[test]
fn record_outcome_with_decay_accumulates_cost_without_decaying_it() {
    const MODEL: &str = "claude-haiku-4-5";
    let half_life_ms: Option<u64> = Some(3_600_000);

    // Each turn is (latency_ms, input_tokens, output_tokens, cost_micros).
    let turns: [(u64, u32, u32, u64); 2] = [
        (50, 100, 40, 125_000), // $0.125 for this turn
        (75, 200, 80, 250_000), // another $0.25
    ];

    let mut board = RouterScoreboard::in_memory();
    for (latency_ms, input_tokens, output_tokens, cost_micros) in turns {
        board.record_outcome_with_decay(
            PromptBucket::Short,
            MODEL,
            true,
            latency_ms,
            input_tokens,
            output_tokens,
            cost_micros,
            half_life_ms,
        );
    }

    // Decay was enabled on both calls, yet the cost total is the plain sum
    // of the two turns: cost is not one of the decayed fields.
    let stats = board.stats(PromptBucket::Short, MODEL);
    assert_eq!(stats.total_cost_micros, 375_000);
    assert!((stats.total_cost_usd() - 0.375).abs() < 1e-9);
}
873+
#[test]
fn record_outcome_wrapper_does_not_decay() {
    const MODEL: &str = "claude-opus-4-6";

    let mut board = RouterScoreboard::in_memory();

    // Seed existing successes with an ancient timestamp (1 ms after the
    // epoch), which would decay heavily if decay were applied.
    let models = board.state.buckets.entry(PromptBucket::Short).or_default();
    let seeded = models.entry(MODEL.to_string()).or_default();
    seeded.successes = 50;
    seeded.last_observation_ms = 1;

    // The plain wrapper records without decay, so the seeded count must
    // survive and simply grow by one.
    board.record_outcome(PromptBucket::Short, MODEL, true, 100, 10, 5);

    let successes = board.stats(PromptBucket::Short, MODEL).successes;
    assert_eq!(successes, 51);
}
894+
623895 #[ test]
624896 fn save_and_load_roundtrip_preserves_scores ( ) {
625897 let dir = tempdir ( ) ;
0 commit comments