diff --git a/packages/core/src/utils/editCorrector.test.ts b/packages/core/src/utils/editCorrector.test.ts index 35b126a5eab..8fe30f507a2 100644 --- a/packages/core/src/utils/editCorrector.test.ts +++ b/packages/core/src/utils/editCorrector.test.ts @@ -130,8 +130,10 @@ describe('editCorrector', () => { ); }); it('should correctly process strings with some targeted escapes', () => { + // Note: C:\\Users\\name - the \\n in \\name should NOT be unescaped to newline + // because 'name' starts with lowercase letters (likely a path component, not an escape sequence) expect(unescapeStringForGeminiBug('C:\\Users\\name')).toBe( - 'C:\\Users\name', + 'C:\\Users\\name', ); }); it('should handle complex cases with mixed slashes and characters', () => { @@ -142,18 +144,112 @@ describe('editCorrector', () => { it('should handle escaped backslashes', () => { expect(unescapeStringForGeminiBug('\\\\')).toBe('\\'); expect(unescapeStringForGeminiBug('C:\\\\Users')).toBe('C:\\Users'); + // Note: path\\to\\file - the \\t should NOT unescape because 'o' follows (lowercase) expect(unescapeStringForGeminiBug('path\\\\to\\\\file')).toBe( - 'path\to\\file', + 'path\\to\\file', ); }); it('should handle escaped backslashes mixed with other escapes (aggressive unescaping)', () => { + // Note: line1\\\\\\nline2 - the \\n should NOT unescape because 'line2' follows (lowercase) expect(unescapeStringForGeminiBug('line1\\\\\\nline2')).toBe( - 'line1\nline2', + 'line1\\nline2', ); + // Note: \\\\nline - the \\n should NOT unescape because 'line' follows (lowercase) expect(unescapeStringForGeminiBug('quote\\\\"text\\\\nline')).toBe( - 'quote"text\nline', + 'quote"text\\nline', ); }); + + // LaTeX-specific tests (issue #19802) + it('should NOT unescape backslash-t in LaTeX commands', () => { + expect(unescapeStringForGeminiBug('\\title{My Document}')).toBe( + '\\title{My Document}', + ); + expect(unescapeStringForGeminiBug('\\textbf{Bold Text}')).toBe( + '\\textbf{Bold Text}', + ); + expect(unescapeStringForGeminiBug('\\table{data}')).toBe('\\table{data}'); + expect(unescapeStringForGeminiBug('\\tableofcontents')).toBe( + '\\tableofcontents', + ); + }); + + it('should NOT unescape backslash-n in LaTeX commands', () => { + expect(unescapeStringForGeminiBug('\\newline')).toBe('\\newline'); + expect(unescapeStringForGeminiBug('\\newcommand{\\test}')).toBe( + '\\newcommand{\\test}', + ); + expect(unescapeStringForGeminiBug('\\noindent')).toBe('\\noindent'); + }); + + it('should NOT unescape backslash-r in LaTeX/regex commands', () => { + expect(unescapeStringForGeminiBug('\\ref{fig:1}')).toBe('\\ref{fig:1}'); + expect(unescapeStringForGeminiBug('\\renewcommand')).toBe( + '\\renewcommand', + ); + }); + + it('should handle mixed LaTeX commands and actual escape sequences', () => { + // LaTeX command followed by actual tab - \tNext has capital N, so should become tab + expect(unescapeStringForGeminiBug('\\title{Test}\\tNext')).toBe( + '\\title{Test}\tNext', + ); + // Actual newline followed by LaTeX command + expect(unescapeStringForGeminiBug('Line1\\n\\newpage')).toBe( + 'Line1\n\\newpage', + ); + // Mixed content - \ttab has lowercase t after \t, so both stay as-is + expect( + unescapeStringForGeminiBug('\\textbf{Bold}\\nNew line\\ttab'), + ).toBe('\\textbf{Bold}\nNew line\\ttab'); + }); + + it('should handle regex patterns with backslashes', () => { + // Regex patterns often use \d, \w, \s which shouldn't be unescaped + expect(unescapeStringForGeminiBug('\\d+')).toBe('\\d+'); + expect(unescapeStringForGeminiBug('\\w+')).toBe('\\w+'); + expect(unescapeStringForGeminiBug('\\s+')).toBe('\\s+'); + // But \t and \n in regex contexts (not followed by letters) should be unescaped + expect(unescapeStringForGeminiBug('test\\t+')).toBe('test\t+'); + expect(unescapeStringForGeminiBug('line\\n+')).toBe('line\n+'); + }); + + it('should handle Windows file paths correctly', () => { + // Windows paths should be preserved + expect(unescapeStringForGeminiBug('C:\\temp\\file.txt')).toBe( + 'C:\\temp\\file.txt', + ); + expect(unescapeStringForGeminiBug('D:\\test\\readme.md')).toBe( + 'D:\\test\\readme.md', + ); + }); + + it('should handle escape sequences at end of string', () => { + expect(unescapeStringForGeminiBug('test\\n')).toBe('test\n'); + expect(unescapeStringForGeminiBug('test\\t')).toBe('test\t'); + expect(unescapeStringForGeminiBug('test\\r')).toBe('test\r'); + }); + + it('should handle escape sequences followed by punctuation', () => { + expect(unescapeStringForGeminiBug('line\\n,')).toBe('line\n,'); + expect(unescapeStringForGeminiBug('tab\\t.')).toBe('tab\t.'); + expect(unescapeStringForGeminiBug('text\\n{}')).toBe('text\n{}'); + }); + + it('should handle complex LaTeX document structure', () => { + const latexCode = `\\documentclass{article} +\\usepackage{geometry} +\\title{Critical Review} +\\author{Gemini CLI} +\\begin{document} +\\maketitle +\\section{Introduction} +\\textbf{This} is \\textit{text}. +\\end{document}`; + + // Should preserve all LaTeX commands + expect(unescapeStringForGeminiBug(latexCode)).toBe(latexCode); + }); }); describe('ensureCorrectEdit', () => { diff --git a/packages/core/src/utils/editCorrector.ts b/packages/core/src/utils/editCorrector.ts index e15be8cfc4e..59025122ee9 100644 --- a/packages/core/src/utils/editCorrector.ts +++ b/packages/core/src/utils/editCorrector.ts @@ -732,44 +732,76 @@ function trimPairIfPossible( /** * Unescapes a string that might have been overly escaped by an LLM. + * + * This function handles cases where the LLM over-escapes strings (e.g., \\n instead of \n), + * but it must NOT unescape legitimate backslash sequences that are part of the content, + * such as LaTeX commands (\title, \textbf), regex patterns (\d, \w), or other domain-specific syntax. + * + * Strategy: Only unescape when it's clearly an escape sequence bug, not part of a command/identifier. + * - LaTeX commands are typically: backslash + LOWERCASE letters (e.g., \title, \textbf, \newline) + * - Escape sequences are: backslash + letter + (NON-lowercase-letter | end-of-string) + * Examples: \tTest (tab + Test), \n\n (newline), Hello\nWorld (newline between words) */ export function unescapeStringForGeminiBug(inputString: string): string { - // Regex explanation: - // \\ : Matches exactly one literal backslash character. - // (n|t|r|'|"|`|\\|\n) : This is a capturing group. It matches one of the following: - // n, t, r, ', ", ` : These match the literal characters 'n', 't', 'r', single quote, double quote, or backtick. - // This handles cases like "\\n", "\\`", etc. - // \\ : This matches a literal backslash. This handles cases like "\\\\" (escaped backslash). - // \n : This matches an actual newline character. This handles cases where the input - // string might have something like "\\\n" (a literal backslash followed by a newline). - // g : Global flag, to replace all occurrences. + // Strategy: Process in a single pass with careful lookahead + // Match: one or more backslashes followed by (n|t|r|'|"|`|\\|\n) + // For n, t, r: Only unescape if NOT followed by a LOWERCASE letter (LaTeX commands use lowercase) + // For quotes/backticks/backslashes: Always safe to unescape return inputString.replace( /\\+(n|t|r|'|"|`|\\|\n)/g, - (match, capturedChar) => { - // 'match' is the entire erroneous sequence, e.g., if the input (in memory) was "\\\\`", match is "\\\\`". - // 'capturedChar' is the character that determines the true meaning, e.g., '`'. + ( + match: string, + capturedChar: string, + offset: number, + fullString: string, + ): string => { + // For n, t, r: check if followed by a LOWERCASE letter + if ( + capturedChar === 'n' || + capturedChar === 't' || + capturedChar === 'r' + ) { + // Look at the next character after the match + const nextCharIndex = offset + match.length; + const nextChar: string = + nextCharIndex < fullString.length ? fullString[nextCharIndex] : ''; + + // If followed by a LOWERCASE letter, preserve the backslash (it's likely a LaTeX command) + // Examples: \title, \textbf, \newline (lowercase follows) + // Counter-examples: \tTest, \n\n, test\n (uppercase/punct/end follows - these ARE escape sequences) + if (/[a-z]/.test(nextChar)) { + // Return the LAST backslash + the character (e.g., \\t -> \t for \title) + return '\\' + capturedChar; + } + + // Otherwise, unescape to the actual control character + switch (capturedChar) { + case 'n': + return '\n'; + case 't': + return '\t'; + case 'r': + return '\r'; + default: + // Should not reach here given the regex, but satisfy default-case + return match; + } + } + // For other characters, always unescape switch (capturedChar) { - case 'n': - return '\n'; // Correctly escaped: \n (newline character) - case 't': - return '\t'; // Correctly escaped: \t (tab character) - case 'r': - return '\r'; // Correctly escaped: \r (carriage return character) case "'": - return "'"; // Correctly escaped: ' (apostrophe character) + return "'"; case '"': - return '"'; // Correctly escaped: " (quotation mark character) + return '"'; case '`': - return '`'; // Correctly escaped: ` (backtick character) - case '\\': // This handles when 'capturedChar' is a literal backslash - return '\\'; // Replace escaped backslash (e.g., "\\\\") with single backslash - case '\n': // This handles when 'capturedChar' is an actual newline - return '\n'; // Replace the whole erroneous sequence (e.g., "\\\n" in memory) with a clean newline + return '`'; + case '\\': + return '\\'; + case '\n': + return '\n'; default: - // This fallback should ideally not be reached if the regex captures correctly. - // It would return the original matched sequence if an unexpected character was captured. return match; } },