Skip to content

Commit 6182a6d

Browse files
authored
improvement: support 2-space markdown and paragraphs without newline before lists (#6512)
Fixes #6510. This loosens some Markdown limitations so that rendering behaves more like GitHub Flavored Markdown (GFM). We now allow:
- 2-space indentation for nested list items (in addition to 4-space)
- lists that start directly after a paragraph, without an empty line in between

To try it out, run: `uv run marimo edit marimo/_smoke_tests/markdown/sane_lists.py`
1 parent 9ed2b19 commit 6182a6d

File tree

7 files changed

+548
-14
lines changed

7 files changed

+548
-14
lines changed

marimo/_output/md.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,13 @@
1414
import pymdownx.emoji # type: ignore
1515

1616
from marimo._output.hypertext import Html
17+
from marimo._output.md_extensions.breakless_lists import (
18+
BreaklessListsExtension,
19+
)
1720
from marimo._output.md_extensions.external_links import ExternalLinksExtension
21+
from marimo._output.md_extensions.flexible_indent import (
22+
FlexibleIndentExtension,
23+
)
1824
from marimo._output.md_extensions.iconify import IconifyExtension
1925
from marimo._output.rich_help import mddoc
2026
from marimo._utils.url import is_url
@@ -196,6 +202,10 @@ def _get_extensions() -> list[Union[str, markdown.Extension]]:
196202
"footnotes",
197203
# Sane lists, to include <ol start="n">
198204
"sane_lists",
205+
# Flexible indentation - supports 2 or 4 space indentation
206+
FlexibleIndentExtension(),
207+
# Breakless lists - more compact list formatting
208+
BreaklessListsExtension(),
199209
# Links
200210
ExternalLinksExtension(),
201211
# Iconify

marimo/_output/md_extensions/__init__.py

Lines changed: 0 additions & 1 deletion
This file was deleted.
Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,109 @@
1+
# Copyright 2024 Marimo. All rights reserved.
2+
3+
import re
4+
from xml.etree.ElementTree import Element
5+
6+
from markdown import ( # type: ignore
7+
Extension,
8+
Markdown,
9+
preprocessors,
10+
treeprocessors,
11+
)
12+
13+
14+
class BreaklessListsPreprocessor(preprocessors.Preprocessor):  # type: ignore[misc]
    """
    Let a list start directly after a paragraph, as CommonMark/GFM allow.

    Python-Markdown only recognises a list when a blank line precedes it,
    so this preprocessor inserts one whenever a non-empty line is
    immediately followed by a line that looks like a list item. The check
    does not tell paragraph lines from list items, so a blank line is also
    inserted between consecutive list items.

    Lines inside a closed fenced code block are never separated from their
    predecessor, so list-like code (YAML, diffs, ...) is passed through
    unchanged.
    """

    # Pattern to match lines that start list items (ordered or unordered)
    LIST_START_PATTERN = re.compile(r"^(\s*)([*+-]|\d+\.)(\s+)", re.MULTILINE)
    # Opening fence: optional indentation and list markers, then three or
    # more backticks/tildes, then the info string.
    _FENCE_OPEN_PATTERN = re.compile(
        r"^[ \t]*(?:(?:[*+-]|\d+\.)[ \t]+)*(`{3,}|~{3,})(.*)$"
    )
    # Closing fence: optional indentation, the fence characters, nothing else.
    _FENCE_CLOSE_PATTERN = re.compile(r"^[ \t]*(`{3,}|~{3,})\s*$")

    def __init__(self, md: Markdown) -> None:
        super().__init__(md)

    @classmethod
    def _fenced_code_mask(cls, lines: list[str]) -> list[bool]:
        """
        Flag the lines that sit strictly inside a closed fenced code block.

        Args:
            lines: The document, one entry per line.

        Returns:
            A list parallel to ``lines``; ``True`` marks a code line. The
            fence lines themselves, and everything after a fence that is
            never closed, stay ``False``.
        """
        mask = [False] * len(lines)
        opener_index = -1
        opener = ""
        for index, line in enumerate(lines):
            if not opener:
                opened = cls._FENCE_OPEN_PATTERN.match(line)
                if opened is None:
                    continue
                fence, info = opened.groups()
                # A backtick fence cannot carry backticks in its info
                # string; such a line is inline code, not a fence.
                if fence[0] == "`" and "`" in info:
                    continue
                opener_index, opener = index, fence
                continue

            closed = cls._FENCE_CLOSE_PATTERN.match(line)
            if (
                closed is not None
                and closed.group(1)[0] == opener[0]
                and len(closed.group(1)) >= len(opener)
            ):
                for inner in range(opener_index + 1, index):
                    mask[inner] = True
                opener = ""
        return mask

    def run(self, lines: list[str]) -> list[str]:
        """
        Insert a blank line before every list item that follows content.

        Args:
            lines: The document, one entry per line.

        Returns:
            A new list of lines with the separating blank lines added; the
            input list itself is returned when it is empty.
        """
        if not lines:
            return lines

        in_code = self._fenced_code_mask(lines)
        result_lines: list[str] = []

        for index, current_line in enumerate(lines[:-1]):
            result_lines.append(current_line)
            next_index = index + 1

            # Never split code: a list-looking line inside a fence is code.
            if in_code[next_index]:
                continue

            if current_line.strip() and self.LIST_START_PATTERN.match(
                lines[next_index]
            ):
                # Blank line lets Python-Markdown see the start of a list.
                result_lines.append("")

        # The last line has no successor to inspect.
        result_lines.append(lines[-1])
        return result_lines
58+
59+
60+
class BreaklessListsTreeProcessor(treeprocessors.Treeprocessor):  # type: ignore[misc]
    """
    Unwrap ``<li><p>...</p></li>`` into ``<li>...</li>`` for compact lists.

    Only list items whose single child is an attribute-free ``<p>`` are
    unwrapped; items with several children are left as they are.
    """

    def run(self, root: Element) -> None:
        """
        Unwrap sole-paragraph list items in place.

        Args:
            root: Root of the parsed document tree; modified in place.
        """
        # Snapshot the matches first: the loop edits the tree it walks.
        for item in list(root.iter(tag="li")):
            if len(item) != 1:
                continue

            paragraph = item[0]
            if paragraph.tag != "p" or paragraph.attrib:
                continue

            # Real text in front of the paragraph would be lost by
            # unwrapping, so such items are left alone.
            if item.text and item.text.strip():
                continue

            moved = list(paragraph)
            item.text = paragraph.text
            item.remove(paragraph)
            item.extend(moved)

            # The paragraph's tail is content of the list item, not text
            # that follows it, so the item's own tail is left untouched.
            # Whitespace-only tails are dropped to keep the item compact.
            trailing = paragraph.tail
            if trailing and trailing.strip():
                if moved:
                    moved[-1].tail = (moved[-1].tail or "") + trailing
                else:
                    item.text = (item.text or "") + trailing
84+
85+
class BreaklessListsExtension(Extension):  # type: ignore[misc]
    """
    Markdown extension that lets lists follow a paragraph without a blank line.

    It installs two processors: a preprocessor that inserts the blank line
    Python-Markdown needs before a list, and a tree processor that unwraps
    paragraph tags inside list items so the lists render compactly.
    """

    def extendMarkdown(self, md: Markdown) -> None:
        """
        Register both processors on the given Markdown instance.

        Args:
            md: The Markdown instance to extend.
        """
        # Preprocessor, priority 30: intended to run early in preprocessing.
        list_interrupter = BreaklessListsPreprocessor(md)
        md.preprocessors.register(
            list_interrupter, "breakless_lists_preproc", 30
        )

        # Tree processor, priority 10: works on the parsed tree, so the
        # lists already exist as elements when it runs.
        list_compactor = BreaklessListsTreeProcessor(md)
        md.treeprocessors.register(list_compactor, "breakless_lists_tree", 10)
Lines changed: 135 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,135 @@
1+
# Copyright 2024 Marimo. All rights reserved.
2+
3+
import re
4+
5+
from markdown import Extension, Markdown, preprocessors # type: ignore
6+
7+
8+
class FlexibleIndentPreprocessor(preprocessors.Preprocessor):  # type: ignore[misc]
    """
    Rewrite list-item indentation so 2-space and 4-space nesting both work.

    The base indent of the document (2 or 4) is detected from its indented
    list items, each item's nesting level is derived from that base, and
    the item is re-indented with four spaces per level. Lines that are not
    list items, and lines inside a closed fenced code block, are passed
    through unchanged.
    """

    # Pattern to match lines that start list items (ordered or unordered)
    # Captures: (indentation, list_marker, trailing_space, content)
    LIST_PATTERN = re.compile(r"^(\s*)([*+-]|\d+\.)(\s+)(.*)$", re.MULTILINE)
    INDENT_LEVELS = [2, 4]
    BASE_INDENT_SIZE = 4
    # Expansion used for a tab when measuring indentation.
    FOUR_SPACES = "    "
    # Opening fence: optional indentation and list markers, then three or
    # more backticks/tildes, then the info string.
    _FENCE_OPEN_PATTERN = re.compile(
        r"^[ \t]*(?:(?:[*+-]|\d+\.)[ \t]+)*(`{3,}|~{3,})(.*)$"
    )
    # Closing fence: optional indentation, the fence characters, nothing else.
    _FENCE_CLOSE_PATTERN = re.compile(r"^[ \t]*(`{3,}|~{3,})\s*$")

    def __init__(self, md: Markdown) -> None:
        super().__init__(md)

    @classmethod
    def _fenced_code_mask(cls, lines: list[str]) -> list[bool]:
        """
        Flag the lines that sit strictly inside a closed fenced code block.

        Args:
            lines: The document, one entry per line.

        Returns:
            A list parallel to ``lines``; ``True`` marks a code line. The
            fence lines themselves, and everything after a fence that is
            never closed, stay ``False``.
        """
        mask = [False] * len(lines)
        opener_index = -1
        opener = ""
        for index, line in enumerate(lines):
            if not opener:
                opened = cls._FENCE_OPEN_PATTERN.match(line)
                if opened is None:
                    continue
                fence, info = opened.groups()
                # A backtick fence cannot carry backticks in its info
                # string; such a line is inline code, not a fence.
                if fence[0] == "`" and "`" in info:
                    continue
                opener_index, opener = index, fence
                continue

            closed = cls._FENCE_CLOSE_PATTERN.match(line)
            if (
                closed is not None
                and closed.group(1)[0] == opener[0]
                and len(closed.group(1)) >= len(opener)
            ):
                for inner in range(opener_index + 1, index):
                    mask[inner] = True
                opener = ""
        return mask

    def _detect_base_indent(self, lines: list[str]) -> int:
        """
        Detect the base indentation level used by the given lines.

        Args:
            lines: Lines to inspect; only indented list items are counted.

        Returns:
            2 when the smallest list-item indent is at most two columns,
            otherwise ``BASE_INDENT_SIZE`` (also when nothing is indented).
        """
        widths = [
            len(found.group(1).replace("\t", self.FOUR_SPACES))
            for found in map(self.LIST_PATTERN.match, lines)
            if found and found.group(1)  # skip non-indented items
        ]
        if not widths:
            return self.BASE_INDENT_SIZE

        # The smallest indent is taken to be one nesting step.
        return 2 if min(widths) <= 2 else self.BASE_INDENT_SIZE

    def _normalize_indentation(self, indent_str: str, base_level: int) -> str:
        """
        Convert an item's indentation to four spaces per nesting level.

        Both 2-space and 4-space source indentation therefore produce the
        same output for the same nesting level.

        Args:
            indent_str: The original indentation string
            base_level: The detected base indentation level (2 or 4)

        Returns:
            The replacement indentation, empty for a non-indented item
        """
        return " " * (4 * self._get_list_depth(indent_str, base_level))

    def _get_list_depth(self, indent_str: str, base_level: int = 2) -> int:
        """
        Calculate the nesting depth of a list item.

        Args:
            indent_str: The item's indentation; tabs count as FOUR_SPACES
            base_level: Columns that make up one nesting step

        Returns:
            0 for no indentation, otherwise at least 1
        """
        indent_count = len(indent_str.replace("\t", self.FOUR_SPACES))
        if indent_count == 0:
            return 0

        # round() resolves exact .5 ties to the nearest even level.
        return max(1, round(indent_count / base_level))

    def run(self, lines: list[str]) -> list[str]:
        """
        Normalize the indentation of every list item outside code fences.

        Args:
            lines: The document, one entry per line.

        Returns:
            A new list of lines with list items re-indented; the input
            list itself is returned when it is empty.
        """
        if not lines:
            return lines

        in_code = self._fenced_code_mask(lines)

        # Code lines must not influence the detected base indent either.
        base_level = self._detect_base_indent(
            [line for line, is_code in zip(lines, in_code) if not is_code]
        )

        result_lines: list[str] = []
        for line, is_code in zip(lines, in_code):
            match = None if is_code else self.LIST_PATTERN.match(line)
            if match is None:
                result_lines.append(line)
                continue

            indent, marker, space, content = match.groups()
            normalized_indent = self._normalize_indentation(indent, base_level)
            result_lines.append(f"{normalized_indent}{marker}{space}{content}")

        return result_lines
120+
121+
122+
class FlexibleIndentExtension(Extension):  # type: ignore[misc]
    """
    Markdown extension that accepts flexibly indented nested lists.
    """

    def extendMarkdown(self, md: Markdown) -> None:
        """
        Install the indentation-normalizing preprocessor.

        Args:
            md: The Markdown instance to extend.
        """
        # Priority 35 is higher than the 30 used by breakless_lists, so
        # indentation is normalized before that preprocessor runs.
        indent_normalizer = FlexibleIndentPreprocessor(md)
        md.preprocessors.register(indent_normalizer, "flexible_indent", 35)

0 commit comments

Comments
 (0)