Skip to content

Commit dfb19de

Browse files
committed
fix: enhance strong and emphasis token handling in parser
1 parent 310f4ef commit dfb19de

File tree

3 files changed

+129
-3
lines changed

3 files changed

+129
-3
lines changed

packages/markdown-parser/src/parser/inline-parsers/index.ts

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -165,12 +165,20 @@ export function parseInlineTokens(
165165
result.push(currentTextNode)
166166
}
167167
}
168-
const emphasisContent = content.slice(idx)
168+
const closeIndex = content.indexOf('*', idx + 1)
169+
const emphasisContent = content.slice(idx, closeIndex > -1 ? closeIndex + 1 : undefined)
169170
const { node } = parseEmphasisToken([
170171
{ type: 'em_open', tag: 'em', content: '', markup: '*', info: '', meta: null },
171172
{ type: 'text', tag: '', content: emphasisContent.replace(/\*/g, ''), markup: '', info: '', meta: null },
172173
{ type: 'em_close', tag: 'em', content: '', markup: '*', info: '', meta: null },
173174
], 0, options as any)
175+
176+
if (closeIndex !== -1 && closeIndex < content.length - 1) {
177+
const afterContent = content.slice(closeIndex + 1)
178+
if (afterContent) {
179+
handleToken({ type: 'text', content: afterContent, raw: afterContent } as unknown as MarkdownToken)
180+
}
181+
}
174182
resetCurrentTextNode()
175183
pushNode(node)
176184
i++

packages/markdown-parser/src/plugins/fixStrongTokens.ts

Lines changed: 89 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,15 +25,97 @@ export function applyFixStrongTokens(md: MarkdownIt) {
2525
}
2626

2727
function fixStrongTokens(tokens: MarkdownToken[]): MarkdownToken[] {
28+
let strongIndex = 0
29+
const cleansStrong = new Set<number>()
30+
const cleansEm = new Set<number>()
31+
let emIndex = 0
2832
for (let i = 0; i < tokens.length; i++) {
2933
const t = tokens[i]
30-
if (t.type === 'strong_open' || t.type === 'strong_close') {
34+
const type = t.type
35+
if (type === 'strong_open') {
36+
strongIndex++
3137
const markup = String(t.markup ?? '')
32-
if (markup !== '**') {
38+
let j = i - 1
39+
while (j >= 0 && tokens[j].type === 'text' && tokens[j].content === '') {
40+
j--
41+
}
42+
const preToken = tokens[j]
43+
let k = i + 1
44+
while (k < tokens.length && tokens[k].type === 'text' && tokens[k].content === '') {
45+
k++
46+
}
47+
const postToken = tokens[k]
48+
49+
if (markup === '__' && (preToken?.content?.endsWith('_') || postToken?.content?.startsWith('_') || postToken?.markup?.includes('_'))) {
3350
t.type = 'text'
51+
t.tag = ''
3452
t.content = markup
53+
t.raw = markup
54+
t.markup = ''
55+
t.attrs = null
56+
t.map = null
57+
t.info = ''
58+
t.meta = null
59+
cleansStrong.add(strongIndex)
60+
}
61+
}
62+
else if (type === 'strong_close') {
63+
if (cleansStrong.has(strongIndex) && t.markup === '__') {
64+
t.type = 'text'
65+
t.content = t.markup
66+
t.raw = String(t.markup ?? '')
67+
t.tag = ''
68+
t.markup = ''
69+
t.attrs = null
70+
t.map = null
71+
t.info = ''
72+
t.meta = null
73+
}
74+
strongIndex--
75+
if (strongIndex < 0)
76+
strongIndex = 0
77+
}
78+
else if (type === 'em_open') {
79+
emIndex++
80+
const markup = String(t.markup ?? '')
81+
let j = i - 1
82+
while (j >= 0 && tokens[j].type === 'text' && tokens[j].content === '') {
83+
j--
84+
}
85+
const preToken = tokens[j]
86+
let k = i + 1
87+
while (k < tokens.length && tokens[k].type === 'text' && tokens[k].content === '') {
88+
k++
89+
}
90+
const postToken = tokens[k]
91+
if (markup === '_' && (preToken?.content?.endsWith('_') || postToken?.content?.startsWith('_') || postToken?.markup?.includes('_'))) {
92+
t.type = 'text'
93+
t.tag = ''
94+
t.content = markup
95+
t.raw = markup
96+
t.markup = ''
97+
t.attrs = null
98+
t.map = null
99+
t.info = ''
100+
t.meta = null
101+
cleansEm.add(emIndex)
102+
}
103+
}
104+
else if (type === 'em_close') {
105+
if (cleansEm.has(emIndex) && t.markup === '_') {
106+
t.type = 'text'
107+
t.content = t.markup
108+
t.raw = String(t.markup ?? '')
109+
t.tag = ''
35110
t.markup = ''
111+
t.attrs = null
112+
t.map = null
113+
t.info = ''
114+
t.meta = null
36115
}
116+
emIndex--
117+
if (emIndex < 0)
118+
emIndex = 0
37119
}
38120
}
39121
if (tokens.length < 5)
@@ -57,6 +139,7 @@ function fixStrongTokens(tokens: MarkdownToken[]): MarkdownToken[] {
57139
markup: '**',
58140
info: '',
59141
meta: null,
142+
raw: '',
60143
},
61144
tokens[i],
62145
tokens[i + 1],
@@ -71,6 +154,7 @@ function fixStrongTokens(tokens: MarkdownToken[]): MarkdownToken[] {
71154
markup: '**',
72155
info: '',
73156
meta: null,
157+
raw: '',
74158
},
75159
]
76160
if (textContent) {
@@ -97,10 +181,12 @@ function fixStrongTokens(tokens: MarkdownToken[]): MarkdownToken[] {
97181
markup: '**',
98182
info: '',
99183
meta: null,
184+
raw: '',
100185
},
101186
{
102187
type: 'text',
103188
content: _nextToken?.type === 'text' ? String(_nextToken.content ?? '') : '',
189+
raw: _nextToken?.type === 'text' ? String(_nextToken.content ?? '') : '',
104190
},
105191
{
106192
type: 'strong_close',
@@ -112,6 +198,7 @@ function fixStrongTokens(tokens: MarkdownToken[]): MarkdownToken[] {
112198
markup: '**',
113199
info: '',
114200
meta: null,
201+
raw: '',
115202
},
116203
] as MarkdownToken[]
117204
const beforeText = tokenContent.slice(0, -1)
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
import { getMarkdown, parseMarkdownToStructure } from 'stream-markdown-parser'
2+
import { describe, expect, it } from 'vitest'
3+
4+
describe('fixStrongTokens plugin (parse-token assertions)', () => {
5+
it('produces strong token around inner underscore for `a __b_c__ d`', () => {
6+
const md = getMarkdown('t')
7+
const content = 'a __b_c__ d'
8+
const nodes = parseMarkdownToStructure(content, md)
9+
// top-level should be a paragraph
10+
const para = nodes[0] as any
11+
expect(para).toBeDefined()
12+
expect(para.type).toBe('paragraph')
13+
const strong = para.children?.find((c: any) => c.type === 'strong')
14+
expect(strong).toBeDefined()
15+
const text = strong.children?.[0]
16+
expect(text).toBeDefined()
17+
expect(text.type).toBe('text')
18+
expect(text.content).toBe('b_c')
19+
})
20+
21+
it('parses malformed emphasis without throwing and returns tokens', () => {
22+
const md = getMarkdown('t')
23+
const content = 'this is *a test * with unmatched star'
24+
const nodes = parseMarkdownToStructure(content, md)
25+
// the malformed `*a test *` span should still yield an emphasis node whose first child is 'a test ' and whose raw text is '*a test *'
26+
const emphasis = nodes[0].children?.find((c: any) => c.type === 'emphasis')
27+
expect(emphasis.type).toBe('emphasis')
28+
expect(emphasis.children?.[0].content).toBe('a test ')
29+
expect(emphasis.raw).toBe('*a test *')
30+
})
31+
})

0 commit comments

Comments
 (0)