Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
104 changes: 100 additions & 4 deletions packages/core/src/utils/editCorrector.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -130,8 +130,10 @@ describe('editCorrector', () => {
);
});
it('should correctly process strings with some targeted escapes', () => {
// Note: C:\\Users\\name - the \\n in \\name should NOT be unescaped to newline
// because 'name' starts with lowercase letters (likely a path component, not an escape sequence)
expect(unescapeStringForGeminiBug('C:\\Users\\name')).toBe(
'C:\\Users\name',
'C:\\Users\\name',
);
});
it('should handle complex cases with mixed slashes and characters', () => {
Expand All @@ -142,18 +144,112 @@ describe('editCorrector', () => {
it('should handle escaped backslashes', () => {
expect(unescapeStringForGeminiBug('\\\\')).toBe('\\');
expect(unescapeStringForGeminiBug('C:\\\\Users')).toBe('C:\\Users');
// Note: path\\to\\file - the \\t should NOT unescape because 'o' follows (lowercase)
expect(unescapeStringForGeminiBug('path\\\\to\\\\file')).toBe(
'path\to\\file',
'path\\to\\file',
);
});
it('should handle escaped backslashes mixed with other escapes (aggressive unescaping)', () => {
// Note: line1\\\\\\nline2 - the \\n should NOT unescape because 'line2' follows (lowercase)
expect(unescapeStringForGeminiBug('line1\\\\\\nline2')).toBe(
'line1\nline2',
'line1\\nline2',
);
// Note: \\\\nline - the \\n should NOT unescape because 'line' follows (lowercase)
expect(unescapeStringForGeminiBug('quote\\\\"text\\\\nline')).toBe(
'quote"text\nline',
'quote"text\\nline',
);
});

// LaTeX-specific tests (issue #19802)
it('should NOT unescape backslash-t in LaTeX commands', () => {
expect(unescapeStringForGeminiBug('\\title{My Document}')).toBe(
'\\title{My Document}',
);
expect(unescapeStringForGeminiBug('\\textbf{Bold Text}')).toBe(
'\\textbf{Bold Text}',
);
expect(unescapeStringForGeminiBug('\\table{data}')).toBe('\\table{data}');
expect(unescapeStringForGeminiBug('\\tableofcontents')).toBe(
'\\tableofcontents',
);
});

it('should NOT unescape backslash-n in LaTeX commands', () => {
expect(unescapeStringForGeminiBug('\\newline')).toBe('\\newline');
expect(unescapeStringForGeminiBug('\\newcommand{\\test}')).toBe(
'\\newcommand{\\test}',
);
expect(unescapeStringForGeminiBug('\\noindent')).toBe('\\noindent');
});

it('should NOT unescape backslash-r in LaTeX/regex commands', () => {
expect(unescapeStringForGeminiBug('\\ref{fig:1}')).toBe('\\ref{fig:1}');
expect(unescapeStringForGeminiBug('\\renewcommand')).toBe(
'\\renewcommand',
);
});

it('should handle mixed LaTeX commands and actual escape sequences', () => {
// LaTeX command followed by actual tab - \tNext has capital N, so should become tab
expect(unescapeStringForGeminiBug('\\title{Test}\\tNext')).toBe(
'\\title{Test}\tNext',
);
// Actual newline followed by LaTeX command
expect(unescapeStringForGeminiBug('Line1\\n\\newpage')).toBe(
'Line1\n\\newpage',
);
// Mixed content - \ttab has lowercase t after \t, so both stay as-is
expect(
unescapeStringForGeminiBug('\\textbf{Bold}\\nNew line\\ttab'),
).toBe('\\textbf{Bold}\nNew line\\ttab');
});

it('should handle regex patterns with backslashes', () => {
// Regex patterns often use \d, \w, \s which shouldn't be unescaped
expect(unescapeStringForGeminiBug('\\d+')).toBe('\\d+');
expect(unescapeStringForGeminiBug('\\w+')).toBe('\\w+');
expect(unescapeStringForGeminiBug('\\s+')).toBe('\\s+');
// But \t and \n in regex contexts (not followed by letters) should be unescaped
expect(unescapeStringForGeminiBug('test\\t+')).toBe('test\t+');
expect(unescapeStringForGeminiBug('line\\n+')).toBe('line\n+');
});

it('should handle Windows file paths correctly', () => {
// Windows paths should be preserved
expect(unescapeStringForGeminiBug('C:\\temp\\file.txt')).toBe(
'C:\\temp\\file.txt',
);
expect(unescapeStringForGeminiBug('D:\\test\\readme.md')).toBe(
'D:\\test\\readme.md',
);
});

it('should handle escape sequences at end of string', () => {
expect(unescapeStringForGeminiBug('test\\n')).toBe('test\n');
expect(unescapeStringForGeminiBug('test\\t')).toBe('test\t');
expect(unescapeStringForGeminiBug('test\\r')).toBe('test\r');
});

it('should handle escape sequences followed by punctuation', () => {
expect(unescapeStringForGeminiBug('line\\n,')).toBe('line\n,');
expect(unescapeStringForGeminiBug('tab\\t.')).toBe('tab\t.');
expect(unescapeStringForGeminiBug('text\\n{}')).toBe('text\n{}');
});

it('should handle complex LaTeX document structure', () => {
const latexCode = `\\documentclass{article}
\\usepackage{geometry}
\\title{Critical Review}
\\author{Gemini CLI}
\\begin{document}
\\maketitle
\\section{Introduction}
\\textbf{This} is \\textit{text}.
\\end{document}`;

// Should preserve all LaTeX commands
expect(unescapeStringForGeminiBug(latexCode)).toBe(latexCode);
});
});

describe('ensureCorrectEdit', () => {
Expand Down
86 changes: 59 additions & 27 deletions packages/core/src/utils/editCorrector.ts
Original file line number Diff line number Diff line change
Expand Up @@ -732,44 +732,76 @@ function trimPairIfPossible(

/**
* Unescapes a string that might have been overly escaped by an LLM.
*
* This function handles cases where the LLM over-escapes strings (e.g., \\n instead of \n),
* but it must NOT unescape legitimate backslash sequences that are part of the content,
* such as LaTeX commands (\title, \textbf), regex patterns (\d, \w), or other domain-specific syntax.
*
* Strategy: Only unescape when it's clearly an escape sequence bug, not part of a command/identifier.
* - LaTeX commands are typically: backslash + LOWERCASE letters (e.g., \title, \textbf, \newline)
* - Escape sequences are: backslash + letter + (NON-lowercase-letter | end-of-string)
* Examples: \tTest (tab + Test), \n\n (newline), Hello\nWorld (newline between words)
*/
export function unescapeStringForGeminiBug(inputString: string): string {
// Regex explanation:
// \\ : Matches exactly one literal backslash character.
// (n|t|r|'|"|`|\\|\n) : This is a capturing group. It matches one of the following:
// n, t, r, ', ", ` : These match the literal characters 'n', 't', 'r', single quote, double quote, or backtick.
// This handles cases like "\\n", "\\`", etc.
// \\ : This matches a literal backslash. This handles cases like "\\\\" (escaped backslash).
// \n : This matches an actual newline character. This handles cases where the input
// string might have something like "\\\n" (a literal backslash followed by a newline).
// g : Global flag, to replace all occurrences.
// Strategy: Process in a single pass with careful lookahead
// Match: one or more backslashes followed by (n|t|r|'|"|`|\\|\n)
// For n, t, r: Only unescape if NOT followed by a LOWERCASE letter (LaTeX commands use lowercase)
// For quotes/backticks/backslashes: Always safe to unescape

return inputString.replace(
/\\+(n|t|r|'|"|`|\\|\n)/g,
(match, capturedChar) => {
// 'match' is the entire erroneous sequence, e.g., if the input (in memory) was "\\\\`", match is "\\\\`".
// 'capturedChar' is the character that determines the true meaning, e.g., '`'.
(
match: string,
capturedChar: string,
offset: number,
fullString: string,
): string => {
// For n, t, r: check if followed by a LOWERCASE letter
if (
capturedChar === 'n' ||
capturedChar === 't' ||
capturedChar === 'r'
) {
// Look at the next character after the match
const nextCharIndex = offset + match.length;
const nextChar: string =
nextCharIndex < fullString.length ? fullString[nextCharIndex] : '';

// If followed by a LOWERCASE letter, preserve the backslash (it's likely a LaTeX command)
// Examples: \title, \textbf, \newline (lowercase follows)
// Counter-examples: \tTest, \n\n, test\n (uppercase/punct/end follows - these ARE escape sequences)
if (/[a-z]/.test(nextChar)) {
// Return the LAST backslash + the character (e.g., \\t -> \t for \title)
return '\\' + capturedChar;
}

// Otherwise, unescape to the actual control character
switch (capturedChar) {
case 'n':
return '\n';
case 't':
return '\t';
case 'r':
return '\r';
default:
// Should not reach here given the regex, but satisfy default-case
return match;
}
}

// For other characters, always unescape
switch (capturedChar) {
case 'n':
return '\n'; // Correctly escaped: \n (newline character)
case 't':
return '\t'; // Correctly escaped: \t (tab character)
case 'r':
return '\r'; // Correctly escaped: \r (carriage return character)
case "'":
return "'"; // Correctly escaped: ' (apostrophe character)
return "'";
case '"':
return '"'; // Correctly escaped: " (quotation mark character)
return '"';
case '`':
return '`'; // Correctly escaped: ` (backtick character)
case '\\': // This handles when 'capturedChar' is a literal backslash
return '\\'; // Replace escaped backslash (e.g., "\\\\") with single backslash
case '\n': // This handles when 'capturedChar' is an actual newline
return '\n'; // Replace the whole erroneous sequence (e.g., "\\\n" in memory) with a clean newline
return '`';
case '\\':
return '\\';
case '\n':
return '\n';
default:
// This fallback should ideally not be reached if the regex captures correctly.
// It would return the original matched sequence if an unexpected character was captured.
return match;
}
},
Expand Down