fix: enhance strong and emphasis token handling in parser

Simon-He95 · Simon-He95 · commit dfb19de48c0e · 2025-11-26T19:22:05.000+08:00
diff --git a/packages/markdown-parser/src/parser/inline-parsers/index.ts b/packages/markdown-parser/src/parser/inline-parsers/index.ts
@@ -165,12 +165,20 @@ export function parseInlineTokens(
           result.push(currentTextNode)
         }
       }
-      const emphasisContent = content.slice(idx)
+      const closeIndex = content.indexOf('*', idx + 1)
+      const emphasisContent = content.slice(idx, closeIndex > -1 ? closeIndex + 1 : undefined)
       const { node } = parseEmphasisToken([
         { type: 'em_open', tag: 'em', content: '', markup: '*', info: '', meta: null },
         { type: 'text', tag: '', content: emphasisContent.replace(/\*/g, ''), markup: '', info: '', meta: null },
         { type: 'em_close', tag: 'em', content: '', markup: '*', info: '', meta: null },
       ], 0, options as any)
+
+      if (closeIndex !== -1 && closeIndex < content.length - 1) {
+        const afterContent = content.slice(closeIndex + 1)
+        if (afterContent) {
+          handleToken({ type: 'text', content: afterContent, raw: afterContent } as unknown as MarkdownToken)
+        }
+      }
       resetCurrentTextNode()
       pushNode(node)
       i++
diff --git a/packages/markdown-parser/src/plugins/fixStrongTokens.ts b/packages/markdown-parser/src/plugins/fixStrongTokens.ts
@@ -25,15 +25,97 @@ export function applyFixStrongTokens(md: MarkdownIt) {
 }
 
 function fixStrongTokens(tokens: MarkdownToken[]): MarkdownToken[] {
+  let strongIndex = 0
+  const cleansStrong = new Set<number>()
+  const cleansEm = new Set<number>()
+  let emIndex = 0
   for (let i = 0; i < tokens.length; i++) {
     const t = tokens[i]
-    if (t.type === 'strong_open' || t.type === 'strong_close') {
+    const type = t.type
+    if (type === 'strong_open') {
+      strongIndex++
       const markup = String(t.markup ?? '')
-      if (markup !== '**') {
+      let j = i - 1
+      while (j >= 0 && tokens[j].type === 'text' && tokens[j].content === '') {
+        j--
+      }
+      const preToken = tokens[j]
+      let k = i + 1
+      while (k < tokens.length && tokens[k].type === 'text' && tokens[k].content === '') {
+        k++
+      }
+      const postToken = tokens[k]
+
+      if (markup === '__' && (preToken?.content?.endsWith('_') || postToken?.content?.startsWith('_') || postToken?.markup?.includes('_'))) {
         t.type = 'text'
+        t.tag = ''
         t.content = markup
+        t.raw = markup
+        t.markup = ''
+        t.attrs = null
+        t.map = null
+        t.info = ''
+        t.meta = null
+        cleansStrong.add(strongIndex)
+      }
+    }
+    else if (type === 'strong_close') {
+      if (cleansStrong.has(strongIndex) && t.markup === '__') {
+        t.type = 'text'
+        t.content = t.markup
+        t.raw = String(t.markup ?? '')
+        t.tag = ''
+        t.markup = ''
+        t.attrs = null
+        t.map = null
+        t.info = ''
+        t.meta = null
+      }
+      strongIndex--
+      if (strongIndex < 0)
+        strongIndex = 0
+    }
+    else if (type === 'em_open') {
+      emIndex++
+      const markup = String(t.markup ?? '')
+      let j = i - 1
+      while (j >= 0 && tokens[j].type === 'text' && tokens[j].content === '') {
+        j--
+      }
+      const preToken = tokens[j]
+      let k = i + 1
+      while (k < tokens.length && tokens[k].type === 'text' && tokens[k].content === '') {
+        k++
+      }
+      const postToken = tokens[k]
+      if (markup === '_' && (preToken?.content?.endsWith('_') || postToken?.content?.startsWith('_') || postToken?.markup?.includes('_'))) {
+        t.type = 'text'
+        t.tag = ''
+        t.content = markup
+        t.raw = markup
+        t.markup = ''
+        t.attrs = null
+        t.map = null
+        t.info = ''
+        t.meta = null
+        cleansEm.add(emIndex)
+      }
+    }
+    else if (type === 'em_close') {
+      if (cleansEm.has(emIndex) && t.markup === '_') {
+        t.type = 'text'
+        t.content = t.markup
+        t.raw = String(t.markup ?? '')
+        t.tag = ''
         t.markup = ''
+        t.attrs = null
+        t.map = null
+        t.info = ''
+        t.meta = null
       }
+      emIndex--
+      if (emIndex < 0)
+        emIndex = 0
     }
   }
   if (tokens.length < 5)
@@ -57,6 +139,7 @@ function fixStrongTokens(tokens: MarkdownToken[]): MarkdownToken[] {
         markup: '**',
         info: '',
         meta: null,
+        raw: '',
       },
       tokens[i],
       tokens[i + 1],
@@ -71,6 +154,7 @@ function fixStrongTokens(tokens: MarkdownToken[]): MarkdownToken[] {
         markup: '**',
         info: '',
         meta: null,
+        raw: '',
       },
     ]
     if (textContent) {
@@ -97,10 +181,12 @@ function fixStrongTokens(tokens: MarkdownToken[]): MarkdownToken[] {
         markup: '**',
         info: '',
         meta: null,
+        raw: '',
       },
       {
         type: 'text',
         content: _nextToken?.type === 'text' ? String(_nextToken.content ?? '') : '',
+        raw: _nextToken?.type === 'text' ? String(_nextToken.content ?? '') : '',
       },
       {
         type: 'strong_close',
@@ -112,6 +198,7 @@ function fixStrongTokens(tokens: MarkdownToken[]): MarkdownToken[] {
         markup: '**',
         info: '',
         meta: null,
+        raw: '',
       },
     ] as MarkdownToken[]
     const beforeText = tokenContent.slice(0, -1)
diff --git a/test/fix-strong-tokens-parse.test.ts b/test/fix-strong-tokens-parse.test.ts
@@ -0,0 +1,31 @@
+import { getMarkdown, parseMarkdownToStructure } from 'stream-markdown-parser'
+import { describe, expect, it } from 'vitest'
+
+describe('fixStrongTokens plugin (parse-token assertions)', () => {
+  it('produces strong token around inner underscore for `a __b_c__ d`', () => {
+    const md = getMarkdown('t')
+    const content = 'a __b_c__ d'
+    const nodes = parseMarkdownToStructure(content, md)
+    // top-level should be a paragraph
+    const para = nodes[0] as any
+    expect(para).toBeDefined()
+    expect(para.type).toBe('paragraph')
+    const strong = para.children?.find((c: any) => c.type === 'strong')
+    expect(strong).toBeDefined()
+    const text = strong.children?.[0]
+    expect(text).toBeDefined()
+    expect(text.type).toBe('text')
+    expect(text.content).toBe('b_c')
+  })
+
+  it('parses malformed emphasis without throwing and returns tokens', () => {
+    const md = getMarkdown('t')
+    const content = 'this is *a test * with unmatched star'
+    const nodes = parseMarkdownToStructure(content, md)
+    // the text between the two asterisks should become one emphasis node whose raw keeps both `*` markers
+    const emphasis = nodes[0].children?.find((c: any) => c.type === 'emphasis')
+    expect(emphasis.type).toBe('emphasis')
+    expect(emphasis.children?.[0].content).toBe('a test ')
+    expect(emphasis.raw).toBe('*a test *')
+  })
+})