-
Notifications
You must be signed in to change notification settings - Fork 4.1k
Refactor/base layer message split #436
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 8 commits
94a1b86
e03124d
e35a827
0a9d24e
4ccee85
f38ce0d
82a2fae
dfc3dff
7d8894d
a46fe14
98afd39
0d6b22f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| @@ -0,0 +1,173 @@ | ||||||||||||||||||
| package utils | ||||||||||||||||||
|
|
||||||||||||||||||
| import ( | ||||||||||||||||||
| "strings" | ||||||||||||||||||
| ) | ||||||||||||||||||
|
|
||||||||||||||||||
| const defaultCodeBlockBuffer = 500 | ||||||||||||||||||
|
|
||||||||||||||||||
| // SplitMessage splits long messages into chunks, preserving code block integrity. | ||||||||||||||||||
| // The function prefers to split at maxLen - defaultCodeBlockBuffer to leave room for code blocks, | ||||||||||||||||||
| // but may extend up to maxLen when needed to avoid breaking incomplete code blocks. | ||||||||||||||||||
| // Call SplitMessage with the full text content and the maximum allowed length of a single message; | ||||||||||||||||||
| // it returns a slice of message chunks that each respect maxLen and avoid splitting fenced code blocks. | ||||||||||||||||||
|
huaaudio marked this conversation as resolved.
Outdated
|
||||||||||||||||||
| func SplitMessage(content string, maxLen int) []string { | ||||||||||||||||||
| var messages []string | ||||||||||||||||||
| codeBlockBuffer := defaultCodeBlockBuffer | ||||||||||||||||||
|
|
||||||||||||||||||
| for len(content) > 0 { | ||||||||||||||||||
| if len(content) <= maxLen { | ||||||||||||||||||
| messages = append(messages, content) | ||||||||||||||||||
| break | ||||||||||||||||||
| } | ||||||||||||||||||
|
|
||||||||||||||||||
| // Effective split point: maxLen minus buffer, to leave room for code blocks | ||||||||||||||||||
| effectiveLimit := maxLen - codeBlockBuffer | ||||||||||||||||||
|
huaaudio marked this conversation as resolved.
|
||||||||||||||||||
| if effectiveLimit < maxLen/2 { | ||||||||||||||||||
| effectiveLimit = maxLen / 2 | ||||||||||||||||||
| } | ||||||||||||||||||
|
|
||||||||||||||||||
|
huaaudio marked this conversation as resolved.
|
||||||||||||||||||
| // Find natural split point within the effective limit | ||||||||||||||||||
| msgEnd := findLastNewline(content[:effectiveLimit], 200) | ||||||||||||||||||
| if msgEnd <= 0 { | ||||||||||||||||||
| msgEnd = findLastSpace(content[:effectiveLimit], 100) | ||||||||||||||||||
| } | ||||||||||||||||||
| if msgEnd <= 0 { | ||||||||||||||||||
| msgEnd = effectiveLimit | ||||||||||||||||||
| } | ||||||||||||||||||
|
Comment on lines
+36
to
+43
|
||||||||||||||||||
|
|
||||||||||||||||||
| // Check if this would end with an incomplete code block | ||||||||||||||||||
| candidate := content[:msgEnd] | ||||||||||||||||||
| unclosedIdx := findLastUnclosedCodeBlock(candidate) | ||||||||||||||||||
|
|
||||||||||||||||||
| if unclosedIdx >= 0 { | ||||||||||||||||||
| // Message would end with incomplete code block | ||||||||||||||||||
| // Try to extend up to maxLen to include the closing ``` | ||||||||||||||||||
| if len(content) > msgEnd { | ||||||||||||||||||
| closingIdx := findNextClosingCodeBlock(content, msgEnd) | ||||||||||||||||||
| if closingIdx > 0 && closingIdx <= maxLen { | ||||||||||||||||||
| // Extend to include the closing ``` | ||||||||||||||||||
| msgEnd = closingIdx | ||||||||||||||||||
| } else { | ||||||||||||||||||
| // Code block is too long to fit in one chunk or missing closing fence. | ||||||||||||||||||
| // Try to split inside by injecting closing and reopening fences. | ||||||||||||||||||
| headerEnd := strings.Index(content[unclosedIdx:], "\n") | ||||||||||||||||||
| if headerEnd == -1 { | ||||||||||||||||||
| headerEnd = unclosedIdx + 3 | ||||||||||||||||||
| } else { | ||||||||||||||||||
| headerEnd += unclosedIdx | ||||||||||||||||||
| } | ||||||||||||||||||
| header := strings.TrimSpace(content[unclosedIdx:headerEnd]) | ||||||||||||||||||
|
|
||||||||||||||||||
|
Comment on lines
+60
to
+67
|
||||||||||||||||||
| // If we have a reasonable amount of content after the header, split inside | ||||||||||||||||||
| if msgEnd > headerEnd+20 { | ||||||||||||||||||
| // Find a better split point closer to maxLen | ||||||||||||||||||
| innerLimit := maxLen - 5 // Leave room for "\n```" | ||||||||||||||||||
| betterEnd := findLastNewline(content[:innerLimit], 200) | ||||||||||||||||||
| if betterEnd > headerEnd { | ||||||||||||||||||
| msgEnd = betterEnd | ||||||||||||||||||
| } else { | ||||||||||||||||||
| msgEnd = innerLimit | ||||||||||||||||||
| } | ||||||||||||||||||
| messages = append(messages, strings.TrimRight(content[:msgEnd], " \t\n\r")+"\n```") | ||||||||||||||||||
| content = strings.TrimSpace(header + "\n" + content[msgEnd:]) | ||||||||||||||||||
| continue | ||||||||||||||||||
|
Comment on lines
+78
to
+80
|
||||||||||||||||||
| } | ||||||||||||||||||
|
|
||||||||||||||||||
| // Otherwise, try to split before the code block starts | ||||||||||||||||||
| newEnd := findLastNewline(content[:unclosedIdx], 200) | ||||||||||||||||||
| if newEnd <= 0 { | ||||||||||||||||||
| newEnd = findLastSpace(content[:unclosedIdx], 100) | ||||||||||||||||||
| } | ||||||||||||||||||
| if newEnd > 0 { | ||||||||||||||||||
| msgEnd = newEnd | ||||||||||||||||||
| } else { | ||||||||||||||||||
| // If we can't split before, we MUST split inside (last resort) | ||||||||||||||||||
| if unclosedIdx > 20 { | ||||||||||||||||||
| msgEnd = unclosedIdx | ||||||||||||||||||
| } else { | ||||||||||||||||||
| msgEnd = maxLen - 5 | ||||||||||||||||||
| messages = append(messages, strings.TrimRight(content[:msgEnd], " \t\n\r")+"\n```") | ||||||||||||||||||
| content = strings.TrimSpace(header + "\n" + content[msgEnd:]) | ||||||||||||||||||
| continue | ||||||||||||||||||
|
Comment on lines
+95
to
+98
|
||||||||||||||||||
| } | ||||||||||||||||||
| } | ||||||||||||||||||
| } | ||||||||||||||||||
| } | ||||||||||||||||||
| } | ||||||||||||||||||
|
|
||||||||||||||||||
| if msgEnd <= 0 { | ||||||||||||||||||
| msgEnd = effectiveLimit | ||||||||||||||||||
| } | ||||||||||||||||||
|
||||||||||||||||||
| } | |
| } | |
| // Ensure msgEnd is a valid, positive index so the loop always makes progress. | |
| if msgEnd <= 0 { | |
| msgEnd = 1 | |
| } else if msgEnd > len(content) { | |
| msgEnd = len(content) | |
| } |
Copilot
AI
Feb 18, 2026
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The SplitMessage function uses byte-based indexing (string slicing) which can split multi-byte UTF-8 characters, potentially creating invalid UTF-8 sequences. When slicing at positions found by findLastNewline, findLastSpace, or at arbitrary positions like effectiveLimit or innerLimit, the function may split in the middle of a multi-byte character. Consider using rune-based indexing or validating split points to ensure they fall on character boundaries. The test case "Preserve Unicode characters" acknowledges this issue but doesn't verify the output is valid UTF-8.
Copilot
AI
Feb 18, 2026
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
SplitMessage introduces non-trivial behavior (natural boundaries, code block fence handling, and the 500-char buffer) but there are no unit tests verifying chunk sizes, code-block integrity, and edge cases (e.g., long code blocks without closing fences, no whitespace, Unicode input). Given the repo has existing Go unit tests, please add focused tests in pkg/utils for this helper.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
`Send` converts the full message to `[]rune` only to check emptiness (`len(runes) == 0`), which allocates and can be expensive for long messages. This can be replaced with a simple `msg.Content == ""` (or `len(msg.Content) == 0`) check without changing behavior.