-
Notifications
You must be signed in to change notification settings - Fork 828
fix(compaction): estimate context usage after compaction and show 0.1% precision #1269
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -4,6 +4,7 @@ | |
| from typing import TYPE_CHECKING, NamedTuple, Protocol, runtime_checkable | ||
|
|
||
| import kosong | ||
| from kosong.chat_provider import TokenUsage | ||
| from kosong.message import Message | ||
| from kosong.tooling.empty import EmptyToolset | ||
|
|
||
|
|
@@ -14,9 +15,47 @@ | |
| from kimi_cli.wire.types import ContentPart, TextPart, ThinkPart | ||
|
|
||
|
|
||
| class CompactionResult(NamedTuple): | ||
| messages: Sequence[Message] | ||
| usage: TokenUsage | None | ||
|
|
||
| @property | ||
| def estimated_token_count(self) -> int: | ||
| """Estimate the token count of the compacted messages. | ||
|
|
||
| When LLM usage is available, ``usage.output`` gives the exact token count | ||
| of the generated summary (the first message). Preserved messages (all | ||
| subsequent messages) are estimated from their text length. | ||
|
|
||
| When usage is not available (no compaction LLM call was made), all | ||
| messages are estimated from text length. | ||
|
|
||
| The estimate is intentionally conservative — it will be replaced by the | ||
| real value on the next LLM call. | ||
| """ | ||
| if self.usage is not None and len(self.messages) > 0: | ||
| summary_tokens = self.usage.output | ||
| preserved_tokens = _estimate_text_tokens(self.messages[1:]) | ||
| return summary_tokens + preserved_tokens | ||
|
|
||
| return _estimate_text_tokens(self.messages) | ||
|
|
||
|
|
||
| def _estimate_text_tokens(messages: Sequence[Message]) -> int: | ||
| """Estimate tokens from message text content using a character-based heuristic.""" | ||
| total_chars = 0 | ||
| for msg in messages: | ||
| for part in msg.content: | ||
| if isinstance(part, TextPart): | ||
| total_chars += len(part.text) | ||
| # ~4 chars per token for English; somewhat underestimates for CJK text, | ||
| # but this is a temporary estimate that gets corrected on the next LLM call. | ||
| return total_chars // 4 | ||
|
Comment on lines
+51
to
+53
|
||
|
|
||
|
|
||
| @runtime_checkable | ||
| class Compaction(Protocol): | ||
| async def compact(self, messages: Sequence[Message], llm: LLM) -> Sequence[Message]: | ||
| async def compact(self, messages: Sequence[Message], llm: LLM) -> CompactionResult: | ||
| """ | ||
| Compact a sequence of messages into a new sequence of messages. | ||
|
|
||
|
|
@@ -25,7 +64,7 @@ async def compact(self, messages: Sequence[Message], llm: LLM) -> Sequence[Messa | |
| llm (LLM): The LLM to use for compaction. | ||
|
|
||
| Returns: | ||
| Sequence[Message]: The compacted messages. | ||
| CompactionResult: The compacted messages and token usage from the compaction LLM call. | ||
|
|
||
| Raises: | ||
| ChatProviderError: When the chat provider returns an error. | ||
|
|
@@ -43,10 +82,10 @@ class SimpleCompaction: | |
| def __init__(self, max_preserved_messages: int = 2) -> None: | ||
| self.max_preserved_messages = max_preserved_messages | ||
|
|
||
| async def compact(self, messages: Sequence[Message], llm: LLM) -> Sequence[Message]: | ||
| async def compact(self, messages: Sequence[Message], llm: LLM) -> CompactionResult: | ||
| compact_message, to_preserve = self.prepare(messages) | ||
| if compact_message is None: | ||
| return to_preserve | ||
| return CompactionResult(messages=to_preserve, usage=None) | ||
|
|
||
| # Call kosong.step to get the compacted context | ||
| # TODO: set max completion tokens | ||
|
|
@@ -73,7 +112,7 @@ async def compact(self, messages: Sequence[Message], llm: LLM) -> Sequence[Messa | |
| content.extend(part for part in compacted_msg.content if not isinstance(part, ThinkPart)) | ||
| compacted_messages: list[Message] = [Message(role="user", content=content)] | ||
| compacted_messages.extend(to_preserve) | ||
| return compacted_messages | ||
| return CompactionResult(messages=compacted_messages, usage=result.usage) | ||
|
|
||
| class PrepareResult(NamedTuple): | ||
| compact_message: Message | None | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -32,7 +32,7 @@ | |
| wire_send, | ||
| ) | ||
| from kimi_cli.soul.agent import Agent, Runtime | ||
| from kimi_cli.soul.compaction import SimpleCompaction | ||
| from kimi_cli.soul.compaction import CompactionResult, SimpleCompaction | ||
| from kimi_cli.soul.context import Context | ||
| from kimi_cli.soul.message import check_message, system, tool_result_to_message | ||
| from kimi_cli.soul.slash import registry as soul_slash_registry | ||
|
|
@@ -555,7 +555,7 @@ async def compact_context(self) -> None: | |
|
|
||
| chat_provider = self._runtime.llm.chat_provider if self._runtime.llm is not None else None | ||
|
|
||
| async def _run_compaction_once() -> Sequence[Message]: | ||
| async def _run_compaction_once() -> CompactionResult: | ||
| if self._runtime.llm is None: | ||
| raise LLMNotSet() | ||
| return await self._compaction.compact(self._context.history, self._runtime.llm) | ||
|
|
@@ -567,18 +567,22 @@ async def _run_compaction_once() -> Sequence[Message]: | |
| stop=stop_after_attempt(self._loop_control.max_retries_per_step), | ||
| reraise=True, | ||
| ) | ||
| async def _compact_with_retry() -> Sequence[Message]: | ||
| async def _compact_with_retry() -> CompactionResult: | ||
| return await self._run_with_connection_recovery( | ||
| "compaction", | ||
| _run_compaction_once, | ||
| chat_provider=chat_provider, | ||
| ) | ||
|
|
||
| wire_send(CompactionBegin()) | ||
| compacted_messages = await _compact_with_retry() | ||
| compaction_result = await _compact_with_retry() | ||
| await self._context.clear() | ||
| await self._checkpoint() | ||
| await self._context.append_message(compacted_messages) | ||
| await self._context.append_message(compaction_result.messages) | ||
|
|
||
| # Estimate token count so context_usage is not reported as 0% | ||
| await self._context.update_token_count(compaction_result.estimated_token_count) | ||
|
|
||
|
Comment on lines
579
to
+585
|
||
| wire_send(CompactionEnd()) | ||
|
|
||
| @staticmethod | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
`_estimate_text_tokens` currently only counts `TextPart` in `message.content` and ignores other token-bearing fields like `Message.tool_calls` (function names/arguments) and any non-text content that still consumes tokens (e.g., images). Since `Context.token_count` is used to decide when to compact (`token_count + reserved >= max_context_size`), this underestimation can prevent compaction and lead to provider context-limit errors. Consider extending the estimator to include tool call names/arguments (and optionally apply a fallback cost for non-text parts) so the estimate is biased high rather than low.