Skip to content

Commit 53e2d69

Browse files
committed
feat(IDocTags): add fine-grained content serialization filtering
Signed-off-by: Panos Vagenas <[email protected]>
1 parent d8bc627 commit 53e2d69

8 files changed

Lines changed: 880 additions & 162 deletions

docling_core/experimental/idoctags.py

Lines changed: 126 additions & 87 deletions
Large diffs are not rendered by default.

examples/convert_to_idoctags.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -358,7 +358,7 @@ def _count_yes(key: str) -> int:
358358
for content in [True, False]:
359359
try:
360360
params_probe = IDocTagsParams()
361-
params_probe.add_content = content
361+
params_probe.add_text_content = content
362362
params_probe.mode = mode
363363
params_probe.escape_mode = esc_mode
364364
params_probe.pretty_indentation = " " if mode==IDocTagsSerializationMode.HUMAN_FRIENDLY else None
Lines changed: 186 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,186 @@
1+
<doctag version="1.0.0">
2+
<list ordered="false">
3+
<list_text>item of leading list</list_text>
4+
</list>
5+
<title>Title of the Document</title>
6+
<text>Author 1
7+
Affiliation 1</text>
8+
<text>Author 2
9+
Affiliation 2</text>
10+
<heading level="1">1. Introduction</heading>
11+
<text>This paper introduces the biggest invention ever made. ...</text>
12+
<list ordered="false">
13+
<list_text>list item 1</list_text>
14+
<list_text>list item 2</list_text>
15+
<list_text>list item 3</list_text>
16+
<list ordered="true">
17+
<list_text>list item 3.a</list_text>
18+
<list_text>list item 3.b</list_text>
19+
<list_text>list item 3.c</list_text>
20+
<list ordered="true">
21+
<list_text>list item 3.c.i</list_text>
22+
</list>
23+
</list>
24+
<list_text>list item 4</list_text>
25+
</list>
26+
<floating_group class="table">
27+
<caption>This is the caption of table 1.</caption>
28+
<otsl>
29+
<fcel/>
30+
Product
31+
<fcel/>
32+
Years
33+
<lcel/>
34+
<nl/>
35+
<ucel/>
36+
<fcel/>
37+
2016
38+
<fcel/>
39+
2017
40+
<nl/>
41+
<fcel/>
42+
Apple
43+
<fcel/>
44+
49823
45+
<fcel/>
46+
695944
47+
<nl/>
48+
</otsl>
49+
</floating_group>
50+
<floating_group class="picture">
51+
<caption>This is the caption of figure 1.</caption>
52+
</floating_group>
53+
<floating_group class="picture">
54+
<caption>This is the caption of figure 2.</caption>
55+
</floating_group>
56+
<list ordered="false">
57+
<list_text>item 1 of list</list_text>
58+
</list>
59+
<list ordered="false">
60+
<list_text>item 1 of list after empty list</list_text>
61+
<list_text>item 2 of list after empty list</list_text>
62+
</list>
63+
<list ordered="false">
64+
<list_text>item 1 of neighboring list</list_text>
65+
<list_text>item 2 of neighboring list</list_text>
66+
<list ordered="false">
67+
<list_text>item 1 of sub list</list_text>
68+
<list_text>
69+
<inline>
70+
<text>Here a code snippet:</text>
71+
<code class="Python"><![CDATA[print("Hello world")]]></code>
72+
<text>(to be displayed inline)</text>
73+
</inline>
74+
</list_text>
75+
<list_text>
76+
<inline>
77+
<text>Here a formula:</text>
78+
<formula>E=mc^2</formula>
79+
<text>(to be displayed inline)</text>
80+
</inline>
81+
</list_text>
82+
</list>
83+
</list>
84+
<text>Here a code block:</text>
85+
<code class="Python"><![CDATA[print("Hello world")]]></code>
86+
<text>Here a formula block:</text>
87+
<formula>E=mc^2</formula>
88+
<inline>
89+
<text>Some formatting chops:</text>
90+
<text>
91+
<bold>bold</bold>
92+
</text>
93+
<text>
94+
<italic>italic</italic>
95+
</text>
96+
<text>
97+
<underline>underline</underline>
98+
</text>
99+
<text>
100+
<strikethrough>strikethrough</strikethrough>
101+
</text>
102+
<text>
103+
<subscript>subscript</subscript>
104+
</text>
105+
<text>
106+
<superscript>superscript</superscript>
107+
</text>
108+
<text>hyperlink</text>
109+
<text><![CDATA[&]]></text>
110+
<text>
111+
<strikethrough>
112+
<underline>
113+
<italic>
114+
<bold>everything at the same time.</bold>
115+
</italic>
116+
</underline>
117+
</strikethrough>
118+
</text>
119+
</inline>
120+
<list ordered="true">
121+
<list_text>Item 1 in A</list_text>
122+
<list_text>Item 2 in A</list_text>
123+
<list_text>Item 3 in A</list_text>
124+
<list ordered="true">
125+
<list_text>Item 1 in B</list_text>
126+
<list_text>Item 2 in B</list_text>
127+
<list ordered="true">
128+
<list_text>Item 1 in C</list_text>
129+
<list_text>Item 2 in C</list_text>
130+
</list>
131+
<list_text>Item 3 in B</list_text>
132+
</list>
133+
<list_text>Item 4 in A</list_text>
134+
</list>
135+
<list ordered="false">
136+
<list_text>List item without parent list group</list_text>
137+
</list>
138+
<text>The end.</text>
139+
<floating_group class="picture">
140+
<caption>Picture Caption</caption>
141+
<picture>
142+
<meta>
143+
<summary>Picture Summary</summary>
144+
<description>Picture Description</description>
145+
</meta>
146+
<location value="5" resolution="512"/>
147+
<location value="492" resolution="512"/>
148+
<location value="15" resolution="512"/>
149+
<location value="502" resolution="512"/>
150+
</picture>
151+
</floating_group>
152+
<floating_group class="picture">
153+
<caption>Picture Caption</caption>
154+
<picture>
155+
<meta>
156+
<summary>Picture Summary</summary>
157+
<description>Picture Description</description>
158+
<classification>Pie chart</classification>
159+
</meta>
160+
<location value="5" resolution="512"/>
161+
<location value="492" resolution="512"/>
162+
<location value="15" resolution="512"/>
163+
<location value="502" resolution="512"/>
164+
<otsl>
165+
<fcel/>
166+
Foo
167+
<fcel/>
168+
Bar
169+
<nl/>
170+
<fcel/>
171+
One
172+
<fcel/>
173+
Two
174+
<nl/>
175+
</otsl>
176+
</picture>
177+
</floating_group>
178+
<code>0 == 0</code>
179+
<code>
180+
<location value="5" resolution="512"/>
181+
<location value="492" resolution="512"/>
182+
<location value="15" resolution="512"/>
183+
<location value="502" resolution="512"/>
184+
with location
185+
</code>
186+
</doctag>
Lines changed: 164 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,164 @@
1+
<doctag version="1.0.0">
2+
<list ordered="false">
3+
<list_text>item of leading list</list_text>
4+
</list>
5+
<title>Title of the Document</title>
6+
<text>Author 1
7+
Affiliation 1</text>
8+
<text>Author 2
9+
Affiliation 2</text>
10+
<heading level="1">1. Introduction</heading>
11+
<text>This paper introduces the biggest invention ever made. ...</text>
12+
<list ordered="false">
13+
<list_text>list item 1</list_text>
14+
<list_text>list item 2</list_text>
15+
<list_text>list item 3</list_text>
16+
<list ordered="true">
17+
<list_text>list item 3.a</list_text>
18+
<list_text>list item 3.b</list_text>
19+
<list_text>list item 3.c</list_text>
20+
<list ordered="true">
21+
<list_text>list item 3.c.i</list_text>
22+
</list>
23+
</list>
24+
<list_text>list item 4</list_text>
25+
</list>
26+
<floating_group class="table">
27+
<caption>This is the caption of table 1.</caption>
28+
</floating_group>
29+
<floating_group class="picture">
30+
<caption>This is the caption of figure 1.</caption>
31+
</floating_group>
32+
<floating_group class="picture">
33+
<caption>This is the caption of figure 2.</caption>
34+
</floating_group>
35+
<list ordered="false">
36+
<list_text>item 1 of list</list_text>
37+
</list>
38+
<list ordered="false">
39+
<list_text>item 1 of list after empty list</list_text>
40+
<list_text>item 2 of list after empty list</list_text>
41+
</list>
42+
<list ordered="false">
43+
<list_text>item 1 of neighboring list</list_text>
44+
<list_text>item 2 of neighboring list</list_text>
45+
<list ordered="false">
46+
<list_text>item 1 of sub list</list_text>
47+
<list_text>
48+
<inline>
49+
<text>Here a code snippet:</text>
50+
<code class="Python"></code>
51+
<text>(to be displayed inline)</text>
52+
</inline>
53+
</list_text>
54+
<list_text>
55+
<inline>
56+
<text>Here a formula:</text>
57+
<formula>E=mc^2</formula>
58+
<text>(to be displayed inline)</text>
59+
</inline>
60+
</list_text>
61+
</list>
62+
</list>
63+
<text>Here a code block:</text>
64+
<code class="Python"></code>
65+
<text>Here a formula block:</text>
66+
<formula>E=mc^2</formula>
67+
<inline>
68+
<text>Some formatting chops:</text>
69+
<text>
70+
<bold>bold</bold>
71+
</text>
72+
<text>
73+
<italic>italic</italic>
74+
</text>
75+
<text>
76+
<underline>underline</underline>
77+
</text>
78+
<text>
79+
<strikethrough>strikethrough</strikethrough>
80+
</text>
81+
<text>
82+
<subscript>subscript</subscript>
83+
</text>
84+
<text>
85+
<superscript>superscript</superscript>
86+
</text>
87+
<text>hyperlink</text>
88+
<text><![CDATA[&]]></text>
89+
<text>
90+
<strikethrough>
91+
<underline>
92+
<italic>
93+
<bold>everything at the same time.</bold>
94+
</italic>
95+
</underline>
96+
</strikethrough>
97+
</text>
98+
</inline>
99+
<list ordered="true">
100+
<list_text>Item 1 in A</list_text>
101+
<list_text>Item 2 in A</list_text>
102+
<list_text>Item 3 in A</list_text>
103+
<list ordered="true">
104+
<list_text>Item 1 in B</list_text>
105+
<list_text>Item 2 in B</list_text>
106+
<list ordered="true">
107+
<list_text>Item 1 in C</list_text>
108+
<list_text>Item 2 in C</list_text>
109+
</list>
110+
<list_text>Item 3 in B</list_text>
111+
</list>
112+
<list_text>Item 4 in A</list_text>
113+
</list>
114+
<list ordered="false">
115+
<list_text>List item without parent list group</list_text>
116+
</list>
117+
<text>The end.</text>
118+
<floating_group class="picture">
119+
<caption>Picture Caption</caption>
120+
<picture>
121+
<meta>
122+
<summary>Picture Summary</summary>
123+
<description>Picture Description</description>
124+
</meta>
125+
<location value="5" resolution="512"/>
126+
<location value="492" resolution="512"/>
127+
<location value="15" resolution="512"/>
128+
<location value="502" resolution="512"/>
129+
</picture>
130+
</floating_group>
131+
<floating_group class="picture">
132+
<caption>Picture Caption</caption>
133+
<picture>
134+
<meta>
135+
<summary>Picture Summary</summary>
136+
<description>Picture Description</description>
137+
<classification>Pie chart</classification>
138+
</meta>
139+
<location value="5" resolution="512"/>
140+
<location value="492" resolution="512"/>
141+
<location value="15" resolution="512"/>
142+
<location value="502" resolution="512"/>
143+
<otsl>
144+
<fcel/>
145+
Foo
146+
<fcel/>
147+
Bar
148+
<nl/>
149+
<fcel/>
150+
One
151+
<fcel/>
152+
Two
153+
<nl/>
154+
</otsl>
155+
</picture>
156+
</floating_group>
157+
<code></code>
158+
<code>
159+
<location value="5" resolution="512"/>
160+
<location value="492" resolution="512"/>
161+
<location value="15" resolution="512"/>
162+
<location value="502" resolution="512"/>
163+
</code>
164+
</doctag>

0 commit comments

Comments
 (0)