Skip to content

Commit 576bada

Browse files
authored
fix: security vulnerabilities with XML External Entity and related attacks (#3009)
* fix(uspto): disable external entity resolution in SAX parser to prevent XXE

  Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>

* style(uspto): use vertical bar annotation instead of Optional and Union

  Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>

* fix(jats): add parser options to prevent XXE attacks

  Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>

* style(jats): use vertical bar annotation instead of Optional and Union

  Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>

---------

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
1 parent bf417e6 commit 576bada

4 files changed

Lines changed: 177 additions & 70 deletions

File tree

docling/backend/xml/jats_backend.py

Lines changed: 48 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,27 @@
1+
"""Backend to parse articles in JATS (Journal Article Tag Suite) XML format.
2+
3+
JATS is a standard XML format used by publishers and journal archives including
4+
PubMed Central (PMC), bioRxiv, and medRxiv for representing journal articles.
5+
6+
Security Note:
7+
This module uses lxml.etree.XMLParser with secure configuration to protect
8+
against XML External Entity (XXE) attacks and XML bombs. The parser is
9+
configured with:
10+
11+
- resolve_entities: False (prevents entity resolution attacks)
12+
- no_network: True (prevents network access when looking up external documents)
13+
- dtd_validation: False (disables DTD validation)
14+
- load_dtd: False (prevents loading external DTDs)
15+
16+
This configuration ensures safe parsing of JATS XML files while blocking
17+
external entity fetching and preventing XXE attacks.
18+
"""
19+
120
import logging
221
import traceback
322
from io import BytesIO
423
from pathlib import Path
5-
from typing import Final, Optional, Union, cast
24+
from typing import Final, cast
625

726
from bs4 import BeautifulSoup, NavigableString, Tag
827
from docling_core.types.doc import (
@@ -26,11 +45,11 @@
2645

2746
_log = logging.getLogger(__name__)
2847

29-
JATS_DTD_URL: Final = ["JATS-journalpublishing", "JATS-archive"]
30-
DEFAULT_HEADER_ACKNOWLEDGMENTS: Final = "Acknowledgments"
31-
DEFAULT_HEADER_ABSTRACT: Final = "Abstract"
32-
DEFAULT_HEADER_REFERENCES: Final = "References"
33-
DEFAULT_TEXT_ETAL: Final = "et al."
48+
JATS_DTD_URL: Final[list[str]] = ["JATS-journalpublishing", "JATS-archive"]
49+
DEFAULT_HEADER_ACKNOWLEDGMENTS: Final[str] = "Acknowledgments"
50+
DEFAULT_HEADER_ABSTRACT: Final[str] = "Abstract"
51+
DEFAULT_HEADER_REFERENCES: Final[str] = "References"
52+
DEFAULT_TEXT_ETAL: Final[str] = "et al."
3453

3554

3655
class Abstract(TypedDict):
@@ -87,20 +106,26 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
87106
"""
88107

89108
@override
90-
def __init__(
91-
self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]
92-
) -> None:
109+
def __init__(self, in_doc: "InputDocument", path_or_stream: BytesIO | Path) -> None:
93110
super().__init__(in_doc, path_or_stream)
94111
self.path_or_stream = path_or_stream
95112

96113
# Initialize the root of the document hierarchy
97-
self.root: Optional[NodeItem] = None
114+
self.root: NodeItem | None = None
98115
self.hlevel: int = 0
99116
self.valid: bool = False
100117
try:
101118
if isinstance(self.path_or_stream, BytesIO):
102119
self.path_or_stream.seek(0)
103-
self.tree: etree._ElementTree = etree.parse(self.path_or_stream)
120+
parser = etree.XMLParser(
121+
resolve_entities=False,
122+
load_dtd=False,
123+
no_network=True,
124+
dtd_validation=False,
125+
)
126+
self.tree: etree._ElementTree = etree.parse(
127+
self.path_or_stream, parser=parser
128+
)
104129

105130
doc_info: etree.DocInfo = self.tree.docinfo
106131
if doc_info.system_url and any(
@@ -172,7 +197,7 @@ def convert(self) -> DoclingDocument:
172197
return doc
173198

174199
@staticmethod
175-
def _get_text(node: etree._Element, sep: Optional[str] = None) -> str:
200+
def _get_text(node: etree._Element, sep: str | None = None) -> str:
176201
skip_tags = ["term", "disp-formula", "inline-formula"]
177202
text: str = (
178203
node.text.replace("\n", " ")
@@ -189,9 +214,9 @@ def _get_text(node: etree._Element, sep: Optional[str] = None) -> str:
189214

190215
return text
191216

192-
def _find_metadata(self) -> Optional[etree._Element]:
217+
def _find_metadata(self) -> etree._Element | None:
193218
meta_names: list[str] = ["article-meta", "book-part-meta"]
194-
meta: Optional[etree._Element] = None
219+
meta: etree._Element | None = None
195220
for name in meta_names:
196221
node = self.tree.xpath(f".//{name}")
197222
if len(node) > 0:
@@ -222,7 +247,7 @@ def _parse_abstract(self) -> list[Abstract]:
222247
def _parse_authors(self) -> list[Author]:
223248
# Get mapping between affiliation ids and names
224249
authors: list[Author] = []
225-
meta: Optional[etree._Element] = self._find_metadata()
250+
meta: etree._Element | None = self._find_metadata()
226251
if meta is None:
227252
return authors
228253

@@ -390,7 +415,7 @@ def _parse_element_citation(self, node: etree._Element) -> str:
390415
"part-title",
391416
"trans-title",
392417
]
393-
title_node: Optional[etree._Element] = None
418+
title_node: etree._Element | None = None
394419
for name in titles:
395420
name_node = node.xpath(name)
396421
if len(name_node) > 0:
@@ -493,12 +518,12 @@ def _add_figure_captions(
493518
self, doc: DoclingDocument, parent: NodeItem, node: etree._Element
494519
) -> None:
495520
label_node = node.xpath("label")
496-
label: Optional[str] = (
521+
label: str | None = (
497522
JatsDocumentBackend._get_text(label_node[0]).strip() if label_node else ""
498523
)
499524

500525
caption_node = node.xpath("caption")
501-
caption: Optional[str]
526+
caption: str | None
502527
if len(caption_node) > 0:
503528
caption = ""
504529
for caption_par in list(caption_node[0]):
@@ -511,7 +536,7 @@ def _add_figure_captions(
511536

512537
# TODO: format label vs caption once styling is supported
513538
fig_text: str = f"{label}{' ' if label and caption else ''}{caption}"
514-
fig_caption: Optional[TextItem] = (
539+
fig_caption: TextItem | None = (
515540
doc.add_text(label=DocItemLabel.CAPTION, text=fig_text)
516541
if fig_text
517542
else None
@@ -538,7 +563,7 @@ def _add_metadata(
538563
return
539564

540565
@staticmethod
541-
def parse_table_data(element: Tag) -> Optional[TableData]:
566+
def parse_table_data(element: Tag) -> TableData | None:
542567
# TODO, see how to implement proper support for rich tables from HTML backend
543568
nested_tables = element.find("table")
544569
if nested_tables is not None:
@@ -654,7 +679,7 @@ def _add_table(
654679
label = table_xml_component["label"]
655680
caption = table_xml_component["caption"]
656681
table_text: str = f"{label}{' ' if label and caption else ''}{caption}"
657-
table_caption: Optional[TextItem] = (
682+
table_caption: TextItem | None = (
658683
doc.add_text(label=DocItemLabel.CAPTION, text=table_text)
659684
if table_text
660685
else None
@@ -681,7 +706,7 @@ def _add_tables(
681706

682707
# Caption
683708
caption_node = node.xpath("caption")
684-
caption: Optional[str]
709+
caption: str | None
685710
if caption_node:
686711
caption = ""
687712
for caption_par in list(caption_node[0]):
@@ -738,7 +763,7 @@ def _walk_linear(
738763
# add elements and decide whether to stop walking
739764
if child.tag in ("sec", "ack"):
740765
header = child.xpath("title|label")
741-
text: Optional[str] = None
766+
text: str | None = None
742767
if len(header) > 0:
743768
text = JatsDocumentBackend._get_text(header[0])
744769
elif child.tag == "ack":

0 commit comments

Comments
 (0)