Skip to content

Commit 24fdf31

Browse files
authored
fix(tarko): improve markdown link parsing edge cases (#1398)
1 parent c583e7e commit 24fdf31

File tree

1 file changed

+34
-6
lines changed

1 file changed

+34
-6
lines changed

multimodal/tarko/agent-web-ui/src/sdk/markdown-renderer/utils/linkPreprocessor.ts

Lines changed: 34 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -11,13 +11,16 @@
1111
* Regular expression to match URLs that are not already in markdown link format
1212
* Matches http/https URLs that are:
1313
* 1. Not preceded by ]( (to avoid double-processing existing markdown links)
14-
* 2. Not already wrapped in markdown link syntax
15-
* 3. Followed by word boundary, Chinese characters, or other non-URL characters
14+
* 2. Not preceded by < or ` (to avoid processing HTML links or code blocks)
15+
* 3. Not preceded by [ (to avoid creating nested brackets)
16+
* 4. Not already wrapped in markdown link syntax
17+
* 5. Followed by Chinese characters or other non-URL characters
18+
* 6. Excludes trailing punctuation that shouldn't be part of URLs
1619
*
17-
* Updated to be more precise and avoid false positives
20+
* Updated to handle more edge cases and avoid false positives
1821
*/
1922
const URL_REGEX =
20-
/(?<!\]\()\b(https?:\/\/[^\s\u4e00-\u9fff\u3000-\u303f\uff00-\uffef\)\]]+)(?=[\s\u4e00-\u9fff\u3000-\u303f\uff00-\uffef\)\]]|$)/g;
23+
/(?<!\]\(|<|`|\[)\b(https?:\/\/[^\s\u4e00-\u9fff\u3000-\u303f\uff00-\uffef\)\]<>`]+?)(?:[.,;:!?](?=[\s\u4e00-\u9fff\u3000-\u303f\uff00-\uffef])|(?=[\s\u4e00-\u9fff\u3000-\u303f\uff00-\uffef\)\]])|$)/g;
2124

2225
/**
2326
* Preprocess markdown content to fix URL parsing issues
@@ -30,8 +33,33 @@ export function preprocessMarkdownLinks(content: string): string {
3033
return content;
3134
}
3235

33-
// Replace bare URLs with markdown link format [url](url)
34-
return content.replace(URL_REGEX, '[$1]($1)');
36+
// Split content on fenced code blocks and inline code spans so URLs inside them are left untouched
37+
const codeBlockRegex = /```[\s\S]*?```|`[^`]*`/g;
38+
const parts: Array<{ text: string; isCodeBlock: boolean }> = [];
39+
let lastIndex = 0;
40+
let match;
41+
42+
while ((match = codeBlockRegex.exec(content)) !== null) {
43+
// Add text before code block
44+
if (match.index > lastIndex) {
45+
parts.push({ text: content.slice(lastIndex, match.index), isCodeBlock: false });
46+
}
47+
// Add code block
48+
parts.push({ text: match[0], isCodeBlock: true });
49+
lastIndex = match.index + match[0].length;
50+
}
51+
52+
// Add remaining text
53+
if (lastIndex < content.length) {
54+
parts.push({ text: content.slice(lastIndex), isCodeBlock: false });
55+
}
56+
57+
// Process only non-code-block parts
58+
return parts
59+
.map(part =>
60+
part.isCodeBlock ? part.text : part.text.replace(URL_REGEX, '[$1]($1)')
61+
)
62+
.join('');
3563
}
3664

3765
/**

0 commit comments

Comments
 (0)