Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 44 additions & 5 deletions src/kimi_cli/soul/compaction.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from typing import TYPE_CHECKING, NamedTuple, Protocol, runtime_checkable

import kosong
from kosong.chat_provider import TokenUsage
from kosong.message import Message
from kosong.tooling.empty import EmptyToolset

Expand All @@ -14,9 +15,47 @@
from kimi_cli.wire.types import ContentPart, TextPart, ThinkPart


class CompactionResult(NamedTuple):
    # The compacted history: summary message (if any) followed by preserved tail.
    messages: Sequence[Message]
    # Token usage from the compaction LLM call, or None when no call was made.
    usage: TokenUsage | None

    @property
    def estimated_token_count(self) -> int:
        """Estimate the token count of the compacted messages.

        When LLM usage is available, ``usage.output`` is the exact token count
        of the generated summary (the first message); the preserved tail (all
        subsequent messages) is estimated from its text length. When usage is
        not available (no compaction LLM call was made), every message is
        estimated from text length.

        The estimate is intentionally conservative — it will be replaced by the
        real value on the next LLM call.
        """
        if self.usage is None or not self.messages:
            # No exact figure available: fall back to a pure text-length heuristic.
            return _estimate_text_tokens(self.messages)
        # Exact summary tokens plus a heuristic estimate for the preserved tail.
        return self.usage.output + _estimate_text_tokens(self.messages[1:])


def _estimate_text_tokens(messages: Sequence[Message]) -> int:
    """Estimate tokens from message text content using a character-based heuristic.

    Only ``TextPart`` content is counted; other part types (think parts, etc.)
    are intentionally ignored so the estimate stays cheap and predictable.

    Args:
        messages: Messages whose textual content should be estimated.

    Returns:
        int: Approximate token count at ~4 characters per token, rounded up.
    """
    total_chars = 0
    for msg in messages:
        for part in msg.content:
            if isinstance(part, TextPart):
                total_chars += len(part.text)
    # ~4 chars per token for English; somewhat underestimates for CJK text,
    # but this is a temporary estimate that gets corrected on the next LLM call.
    # Ceiling division so short-but-non-empty text never rounds down to 0,
    # which would under-report context usage and could delay compaction.
    return (total_chars + 3) // 4
Comment on lines +45 to +53
Copy link

Copilot AI Feb 27, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

_estimate_text_tokens currently only counts TextPart in message.content and ignores other token-bearing fields like Message.tool_calls (function names/arguments) and any non-text content that still consumes tokens (e.g., images). Since Context.token_count is used to decide when to compact (token_count + reserved >= max_context_size), this underestimation can prevent compaction and lead to provider context-limit errors. Consider extending the estimator to include tool call names/arguments (and optionally apply a fallback cost for non-text parts) so the estimate is biased high rather than low.

Suggested change
"""Estimate tokens from message text content using a character-based heuristic."""
total_chars = 0
for msg in messages:
for part in msg.content:
if isinstance(part, TextPart):
total_chars += len(part.text)
# ~4 chars per token for English; somewhat underestimates for CJK text,
# but this is a temporary estimate that gets corrected on the next LLM call.
return total_chars // 4
"""Estimate tokens from message content using a character-based heuristic.
This includes:
- Textual content parts (TextPart, ThinkPart) counted by character length.
- Tool call payloads, approximated from their string representation.
- A conservative fallback token cost for any non-text content parts.
"""
total_chars = 0
extra_tokens = 0
for msg in messages:
# Count textual content parts.
for part in getattr(msg, "content", []) or []:
if isinstance(part, TextPart):
total_chars += len(part.text)
elif isinstance(part, ThinkPart):
total_chars += len(part.text)
else:
# Non-text parts (e.g., images, custom structures) still consume tokens
# at the provider. Assign a small conservative cost so we bias high.
extra_tokens += 32
# Roughly account for tool call names/arguments, which are serialized as text.
for tool_call in getattr(msg, "tool_calls", []) or []:
# Use repr() to capture both the function name and arguments textually.
total_chars += len(repr(getattr(tool_call, "function", tool_call)))
# ~4 chars per token for English; somewhat underestimates for CJK text,
# but this is a temporary estimate that gets corrected on the next LLM call.
# Add extra_tokens so that non-text parts are not underestimated to zero.
return total_chars // 4 + extra_tokens

Copilot uses AI. Check for mistakes.
Comment on lines +51 to +53
Copy link

Copilot AI Feb 27, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The estimator uses floor division (total_chars // 4), which can return 0 for short-but-non-empty text (and will always round down). Because this value is written into Context.token_count and drives both UI context usage and compaction triggering, rounding down is risky. Consider using math.ceil(total_chars / 4) (and possibly max(1, …) when total_chars > 0) so the estimate is not systematically under-reporting.

Copilot uses AI. Check for mistakes.


@runtime_checkable
class Compaction(Protocol):
async def compact(self, messages: Sequence[Message], llm: LLM) -> Sequence[Message]:
async def compact(self, messages: Sequence[Message], llm: LLM) -> CompactionResult:
"""
Compact a sequence of messages into a new sequence of messages.

Expand All @@ -25,7 +64,7 @@ async def compact(self, messages: Sequence[Message], llm: LLM) -> Sequence[Messa
llm (LLM): The LLM to use for compaction.

Returns:
Sequence[Message]: The compacted messages.
CompactionResult: The compacted messages and token usage from the compaction LLM call.

Raises:
ChatProviderError: When the chat provider returns an error.
Expand All @@ -43,10 +82,10 @@ class SimpleCompaction:
def __init__(self, max_preserved_messages: int = 2) -> None:
self.max_preserved_messages = max_preserved_messages

async def compact(self, messages: Sequence[Message], llm: LLM) -> Sequence[Message]:
async def compact(self, messages: Sequence[Message], llm: LLM) -> CompactionResult:
compact_message, to_preserve = self.prepare(messages)
if compact_message is None:
return to_preserve
return CompactionResult(messages=to_preserve, usage=None)

# Call kosong.step to get the compacted context
# TODO: set max completion tokens
Expand All @@ -73,7 +112,7 @@ async def compact(self, messages: Sequence[Message], llm: LLM) -> Sequence[Messa
content.extend(part for part in compacted_msg.content if not isinstance(part, ThinkPart))
compacted_messages: list[Message] = [Message(role="user", content=content)]
compacted_messages.extend(to_preserve)
return compacted_messages
return CompactionResult(messages=compacted_messages, usage=result.usage)

class PrepareResult(NamedTuple):
compact_message: Message | None
Expand Down
14 changes: 9 additions & 5 deletions src/kimi_cli/soul/kimisoul.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
wire_send,
)
from kimi_cli.soul.agent import Agent, Runtime
from kimi_cli.soul.compaction import SimpleCompaction
from kimi_cli.soul.compaction import CompactionResult, SimpleCompaction
from kimi_cli.soul.context import Context
from kimi_cli.soul.message import check_message, system, tool_result_to_message
from kimi_cli.soul.slash import registry as soul_slash_registry
Expand Down Expand Up @@ -555,7 +555,7 @@ async def compact_context(self) -> None:

chat_provider = self._runtime.llm.chat_provider if self._runtime.llm is not None else None

async def _run_compaction_once() -> Sequence[Message]:
async def _run_compaction_once() -> CompactionResult:
if self._runtime.llm is None:
raise LLMNotSet()
return await self._compaction.compact(self._context.history, self._runtime.llm)
Expand All @@ -567,18 +567,22 @@ async def _run_compaction_once() -> Sequence[Message]:
stop=stop_after_attempt(self._loop_control.max_retries_per_step),
reraise=True,
)
async def _compact_with_retry() -> Sequence[Message]:
async def _compact_with_retry() -> CompactionResult:
return await self._run_with_connection_recovery(
"compaction",
_run_compaction_once,
chat_provider=chat_provider,
)

wire_send(CompactionBegin())
compacted_messages = await _compact_with_retry()
compaction_result = await _compact_with_retry()
await self._context.clear()
await self._checkpoint()
await self._context.append_message(compacted_messages)
await self._context.append_message(compaction_result.messages)

# Estimate token count so context_usage is not reported as 0%
await self._context.update_token_count(compaction_result.estimated_token_count)

Comment on lines 579 to +585
Copy link

Copilot AI Feb 27, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

update_token_count is fed from compaction_result.estimated_token_count, but this method calls _checkpoint() between clear() and append_message(). When _checkpoint_with_user_message is enabled, _checkpoint() appends a CHECKPOINT … user message into the context history; that message’s tokens are not included in estimated_token_count, so Context.token_count becomes inconsistent with Context.history. Consider estimating from the full post-compaction history (or adding the checkpoint message cost) before updating the token count.

Copilot uses AI. Check for mistakes.
Comment on lines +583 to +585
Copy link

Copilot AI Feb 27, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This change updates Context.token_count during compact_context(), which also affects the compaction trigger logic (token_count + reserved >= max_context_size). There doesn’t appear to be a test exercising compact_context() end-to-end to ensure token counts are updated as expected (including the checkpoint message case). Consider adding a unit/integration test around KimiSoul.compact_context() to prevent regressions in context usage reporting and compaction behavior.

Copilot uses AI. Check for mistakes.
wire_send(CompactionEnd())

@staticmethod
Expand Down
54 changes: 53 additions & 1 deletion tests/core/test_simple_compaction.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
from __future__ import annotations

from inline_snapshot import snapshot
from kosong.chat_provider import TokenUsage
from kosong.message import Message

import kimi_cli.prompts as prompts
from kimi_cli.soul.compaction import SimpleCompaction
from kimi_cli.soul.compaction import CompactionResult, SimpleCompaction
from kimi_cli.wire.types import TextPart, ThinkPart


Expand Down Expand Up @@ -74,3 +75,54 @@ def test_prepare_builds_compact_message_and_preserves_tail():
Message(role="assistant", content=[TextPart(text="Latest answer")]),
]
)


# --- CompactionResult.estimated_token_count tests ---


def test_estimated_token_count_with_usage_uses_output_tokens_for_summary():
    """When usage is available, the summary (first message) uses exact output tokens
    and preserved messages (remaining) use character-based estimation."""
    preserved_text = "a" * 80  # 80 chars → 20 estimated tokens
    result = CompactionResult(
        messages=[
            Message(role="user", content=[TextPart(text="compacted summary")]),
            Message(role="user", content=[TextPart(text=preserved_text)]),
        ],
        usage=TokenUsage(input_other=1000, output=150, input_cache_read=0),
    )

    # Exact summary tokens (150) plus the heuristic for the preserved tail (20).
    assert result.estimated_token_count == 150 + 20


def test_estimated_token_count_without_usage_estimates_all_from_text():
    """Without usage (no LLM call), all messages are estimated from text content."""
    result = CompactionResult(
        messages=[
            Message(role="user", content=[TextPart(text="a" * 100)]),
            Message(role="assistant", content=[TextPart(text="b" * 200)]),
        ],
        usage=None,
    )

    # 100 + 200 characters at ~4 chars per token.
    assert result.estimated_token_count == 300 // 4


def test_estimated_token_count_ignores_non_text_parts():
    """Non-text parts (think, etc.) should not inflate the estimate."""
    mixed_content_message = Message(
        role="user",
        content=[
            TextPart(text="a" * 40),
            ThinkPart(think="internal reasoning " * 100),
        ],
    )
    result = CompactionResult(messages=[mixed_content_message], usage=None)

    # Only the 40 TextPart characters count; the ThinkPart is ignored.
    assert result.estimated_token_count == 40 // 4


def test_estimated_token_count_empty_messages():
    """Empty message list should return 0."""
    empty_result = CompactionResult(messages=[], usage=None)

    assert empty_result.estimated_token_count == 0
2 changes: 1 addition & 1 deletion web/src/features/chat/chat.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,7 @@ export const ChatWorkspace = memo(function ChatWorkspaceComponent({

const maxTokens = maxContextSize ?? 64000;
const usedTokens = Math.round(contextUsage * maxTokens);
const usagePercent = Math.round(contextUsage * 100);
const usagePercent = Math.round(contextUsage * 1000) / 10;

const canSendMessage = true;
const isStreaming = status === "streaming";
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ export const ToolbarContextIndicator = memo(
)}
>
<ContextProgressIcon usedPercent={usedPercent} size={14} />
<span>{usagePercent}% context</span>
<span>{usagePercent.toFixed(1)}% context</span>
</button>
</HoverCardTrigger>
<HoverCardContent
Expand All @@ -64,7 +64,7 @@ export const ToolbarContextIndicator = memo(
>
<div className="w-full space-y-2 p-3">
<div className="flex items-center justify-between gap-3 text-xs">
<p>{usagePercent}%</p>
<p>{usagePercent.toFixed(1)}%</p>
<p className="font-mono text-muted-foreground">
{used} / {total}
</p>
Expand Down
Loading