docling-project
diff --git a/‎docling/backend/xml/jats_backend.py‎
Lines changed: 48 additions & 23 deletions b/‎docling/backend/xml/jats_backend.py‎
Lines changed: 48 additions & 23 deletions
@@ -1,8 +1,27 @@
+"""Backend to parse articles in JATS (Journal Article Tag Suite) XML format.
+
+JATS is a standard XML format used by publishers and journal archives including
+PubMed Central (PMC), bioRxiv, and medRxiv for representing journal articles.
+
+Security Note:
+    This module uses lxml.etree.XMLParser with a secure configuration to protect
+    against XML External Entity (XXE) attacks and XML bombs. The parser is
+    configured with:
+
+    - resolve_entities: False (prevents entity resolution attacks)
+    - no_network: True (prevents network access when looking up external documents)
+    - dtd_validation: False (disables DTD validation)
+    - load_dtd: False (prevents loading external DTDs)
+
+    This configuration ensures safe parsing of JATS XML files while blocking
+    external entity fetching and preventing XXE attacks.
+"""
+
 import logging
 import traceback
 from io import BytesIO
 from pathlib import Path
-from typing import Final, Optional, Union, cast
+from typing import Final, cast
 
 from bs4 import BeautifulSoup, NavigableString, Tag
 from docling_core.types.doc import (
@@ -26,11 +45,11 @@
 
 _log = logging.getLogger(__name__)
 
-JATS_DTD_URL: Final = ["JATS-journalpublishing", "JATS-archive"]
-DEFAULT_HEADER_ACKNOWLEDGMENTS: Final = "Acknowledgments"
-DEFAULT_HEADER_ABSTRACT: Final = "Abstract"
-DEFAULT_HEADER_REFERENCES: Final = "References"
-DEFAULT_TEXT_ETAL: Final = "et al."
+JATS_DTD_URL: Final[list[str]] = ["JATS-journalpublishing", "JATS-archive"]
+DEFAULT_HEADER_ACKNOWLEDGMENTS: Final[str] = "Acknowledgments"
+DEFAULT_HEADER_ABSTRACT: Final[str] = "Abstract"
+DEFAULT_HEADER_REFERENCES: Final[str] = "References"
+DEFAULT_TEXT_ETAL: Final[str] = "et al."
 
 
 class Abstract(TypedDict):
@@ -87,20 +106,26 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
     """
 
     @override
-    def __init__(
-        self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]
-    ) -> None:
+    def __init__(self, in_doc: "InputDocument", path_or_stream: BytesIO | Path) -> None:
         super().__init__(in_doc, path_or_stream)
         self.path_or_stream = path_or_stream
 
         # Initialize the root of the document hierarchy
-        self.root: Optional[NodeItem] = None
+        self.root: NodeItem | None = None
         self.hlevel: int = 0
         self.valid: bool = False
         try:
             if isinstance(self.path_or_stream, BytesIO):
                 self.path_or_stream.seek(0)
-            self.tree: etree._ElementTree = etree.parse(self.path_or_stream)
+            parser = etree.XMLParser(
+                resolve_entities=False,
+                load_dtd=False,
+                no_network=True,
+                dtd_validation=False,
+            )
+            self.tree: etree._ElementTree = etree.parse(
+                self.path_or_stream, parser=parser
+            )
 
             doc_info: etree.DocInfo = self.tree.docinfo
             if doc_info.system_url and any(
@@ -172,7 +197,7 @@ def convert(self) -> DoclingDocument:
         return doc
 
     @staticmethod
-    def _get_text(node: etree._Element, sep: Optional[str] = None) -> str:
+    def _get_text(node: etree._Element, sep: str | None = None) -> str:
         skip_tags = ["term", "disp-formula", "inline-formula"]
         text: str = (
             node.text.replace("\n", " ")
@@ -189,9 +214,9 @@ def _get_text(node: etree._Element, sep: Optional[str] = None) -> str:
 
         return text
 
-    def _find_metadata(self) -> Optional[etree._Element]:
+    def _find_metadata(self) -> etree._Element | None:
         meta_names: list[str] = ["article-meta", "book-part-meta"]
-        meta: Optional[etree._Element] = None
+        meta: etree._Element | None = None
         for name in meta_names:
             node = self.tree.xpath(f".//{name}")
             if len(node) > 0:
@@ -222,7 +247,7 @@ def _parse_abstract(self) -> list[Abstract]:
     def _parse_authors(self) -> list[Author]:
         # Get mapping between affiliation ids and names
         authors: list[Author] = []
-        meta: Optional[etree._Element] = self._find_metadata()
+        meta: etree._Element | None = self._find_metadata()
         if meta is None:
             return authors
 
@@ -390,7 +415,7 @@ def _parse_element_citation(self, node: etree._Element) -> str:
             "part-title",
             "trans-title",
         ]
-        title_node: Optional[etree._Element] = None
+        title_node: etree._Element | None = None
         for name in titles:
             name_node = node.xpath(name)
             if len(name_node) > 0:
@@ -493,12 +518,12 @@ def _add_figure_captions(
         self, doc: DoclingDocument, parent: NodeItem, node: etree._Element
     ) -> None:
         label_node = node.xpath("label")
-        label: Optional[str] = (
+        label: str | None = (
             JatsDocumentBackend._get_text(label_node[0]).strip() if label_node else ""
         )
 
         caption_node = node.xpath("caption")
-        caption: Optional[str]
+        caption: str | None
         if len(caption_node) > 0:
             caption = ""
             for caption_par in list(caption_node[0]):
@@ -511,7 +536,7 @@ def _add_figure_captions(
 
         # TODO: format label vs caption once styling is supported
         fig_text: str = f"{label}{' ' if label and caption else ''}{caption}"
-        fig_caption: Optional[TextItem] = (
+        fig_caption: TextItem | None = (
             doc.add_text(label=DocItemLabel.CAPTION, text=fig_text)
             if fig_text
             else None
@@ -538,7 +563,7 @@ def _add_metadata(
         return
 
     @staticmethod
-    def parse_table_data(element: Tag) -> Optional[TableData]:
+    def parse_table_data(element: Tag) -> TableData | None:
         # TODO, see how to implement proper support for rich tables from HTML backend
         nested_tables = element.find("table")
         if nested_tables is not None:
@@ -654,7 +679,7 @@ def _add_table(
         label = table_xml_component["label"]
         caption = table_xml_component["caption"]
         table_text: str = f"{label}{' ' if label and caption else ''}{caption}"
-        table_caption: Optional[TextItem] = (
+        table_caption: TextItem | None = (
             doc.add_text(label=DocItemLabel.CAPTION, text=table_text)
             if table_text
             else None
@@ -681,7 +706,7 @@ def _add_tables(
 
         # Caption
         caption_node = node.xpath("caption")
-        caption: Optional[str]
+        caption: str | None
         if caption_node:
             caption = ""
             for caption_par in list(caption_node[0]):
@@ -738,7 +763,7 @@ def _walk_linear(
             # add elements and decide whether to stop walking
             if child.tag in ("sec", "ack"):
                 header = child.xpath("title|label")
-                text: Optional[str] = None
+                text: str | None = None
                 if len(header) > 0:
                     text = JatsDocumentBackend._get_text(header[0])
                 elif child.tag == "ack":