1+ """Backend to parse articles in JATS (Journal Article Tag Suite) XML format.
2+
3+ JATS is a standard XML format used by publishers and journal archives including
4+ PubMed Central (PMC), bioRxiv, and medRxiv for representing journal articles.
5+
6+ Security Note:
7+ This module uses lxml.etree.XMLParser with a secure configuration to protect
8+ against XML External Entity (XXE) attacks and XML bombs. The parser is
9+ configured with:
10+
11+ - resolve_entities: False (prevents entity resolution attacks)
12+ - no_network: True (prevents network access when looking up external documents)
13+ - dtd_validation: False (disables DTD validation)
14+ - load_dtd: False (prevents loading external DTDs)
15+
16+ This configuration ensures safe parsing of JATS XML files while blocking
17+ external entity fetching and preventing XXE attacks.
18+ """
19+
120import logging
221import traceback
322from io import BytesIO
423from pathlib import Path
5- from typing import Final , Optional , Union , cast
24+ from typing import Final , cast
625
726from bs4 import BeautifulSoup , NavigableString , Tag
827from docling_core .types .doc import (
2645
2746_log = logging .getLogger (__name__ )
2847
29- JATS_DTD_URL : Final = ["JATS-journalpublishing" , "JATS-archive" ]
30- DEFAULT_HEADER_ACKNOWLEDGMENTS : Final = "Acknowledgments"
31- DEFAULT_HEADER_ABSTRACT : Final = "Abstract"
32- DEFAULT_HEADER_REFERENCES : Final = "References"
33- DEFAULT_TEXT_ETAL : Final = "et al."
48+ JATS_DTD_URL : Final [ list [ str ]] = ["JATS-journalpublishing" , "JATS-archive" ]
49+ DEFAULT_HEADER_ACKNOWLEDGMENTS : Final [ str ] = "Acknowledgments"
50+ DEFAULT_HEADER_ABSTRACT : Final [ str ] = "Abstract"
51+ DEFAULT_HEADER_REFERENCES : Final [ str ] = "References"
52+ DEFAULT_TEXT_ETAL : Final [ str ] = "et al."
3453
3554
3655class Abstract (TypedDict ):
@@ -87,20 +106,26 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
87106 """
88107
89108 @override
90- def __init__ (
91- self , in_doc : "InputDocument" , path_or_stream : Union [BytesIO , Path ]
92- ) -> None :
109+ def __init__ (self , in_doc : "InputDocument" , path_or_stream : BytesIO | Path ) -> None :
93110 super ().__init__ (in_doc , path_or_stream )
94111 self .path_or_stream = path_or_stream
95112
96113 # Initialize the root of the document hierarchy
97- self .root : Optional [ NodeItem ] = None
114+ self .root : NodeItem | None = None
98115 self .hlevel : int = 0
99116 self .valid : bool = False
100117 try :
101118 if isinstance (self .path_or_stream , BytesIO ):
102119 self .path_or_stream .seek (0 )
103- self .tree : etree ._ElementTree = etree .parse (self .path_or_stream )
120+ parser = etree .XMLParser (
121+ resolve_entities = False ,
122+ load_dtd = False ,
123+ no_network = True ,
124+ dtd_validation = False ,
125+ )
126+ self .tree : etree ._ElementTree = etree .parse (
127+ self .path_or_stream , parser = parser
128+ )
104129
105130 doc_info : etree .DocInfo = self .tree .docinfo
106131 if doc_info .system_url and any (
@@ -172,7 +197,7 @@ def convert(self) -> DoclingDocument:
172197 return doc
173198
174199 @staticmethod
175- def _get_text (node : etree ._Element , sep : Optional [ str ] = None ) -> str :
200+ def _get_text (node : etree ._Element , sep : str | None = None ) -> str :
176201 skip_tags = ["term" , "disp-formula" , "inline-formula" ]
177202 text : str = (
178203 node .text .replace ("\n " , " " )
@@ -189,9 +214,9 @@ def _get_text(node: etree._Element, sep: Optional[str] = None) -> str:
189214
190215 return text
191216
192- def _find_metadata (self ) -> Optional [ etree ._Element ] :
217+ def _find_metadata (self ) -> etree ._Element | None :
193218 meta_names : list [str ] = ["article-meta" , "book-part-meta" ]
194- meta : Optional [ etree ._Element ] = None
219+ meta : etree ._Element | None = None
195220 for name in meta_names :
196221 node = self .tree .xpath (f".//{ name } " )
197222 if len (node ) > 0 :
@@ -222,7 +247,7 @@ def _parse_abstract(self) -> list[Abstract]:
222247 def _parse_authors (self ) -> list [Author ]:
223248 # Get mapping between affiliation ids and names
224249 authors : list [Author ] = []
225- meta : Optional [ etree ._Element ] = self ._find_metadata ()
250+ meta : etree ._Element | None = self ._find_metadata ()
226251 if meta is None :
227252 return authors
228253
@@ -390,7 +415,7 @@ def _parse_element_citation(self, node: etree._Element) -> str:
390415 "part-title" ,
391416 "trans-title" ,
392417 ]
393- title_node : Optional [ etree ._Element ] = None
418+ title_node : etree ._Element | None = None
394419 for name in titles :
395420 name_node = node .xpath (name )
396421 if len (name_node ) > 0 :
@@ -493,12 +518,12 @@ def _add_figure_captions(
493518 self , doc : DoclingDocument , parent : NodeItem , node : etree ._Element
494519 ) -> None :
495520 label_node = node .xpath ("label" )
496- label : Optional [ str ] = (
521+ label : str | None = (
497522 JatsDocumentBackend ._get_text (label_node [0 ]).strip () if label_node else ""
498523 )
499524
500525 caption_node = node .xpath ("caption" )
501- caption : Optional [ str ]
526+ caption : str | None
502527 if len (caption_node ) > 0 :
503528 caption = ""
504529 for caption_par in list (caption_node [0 ]):
@@ -511,7 +536,7 @@ def _add_figure_captions(
511536
512537 # TODO: format label vs caption once styling is supported
513538 fig_text : str = f"{ label } { ' ' if label and caption else '' } { caption } "
514- fig_caption : Optional [ TextItem ] = (
539+ fig_caption : TextItem | None = (
515540 doc .add_text (label = DocItemLabel .CAPTION , text = fig_text )
516541 if fig_text
517542 else None
@@ -538,7 +563,7 @@ def _add_metadata(
538563 return
539564
540565 @staticmethod
541- def parse_table_data (element : Tag ) -> Optional [ TableData ] :
566+ def parse_table_data (element : Tag ) -> TableData | None :
542567 # TODO, see how to implement proper support for rich tables from HTML backend
543568 nested_tables = element .find ("table" )
544569 if nested_tables is not None :
@@ -654,7 +679,7 @@ def _add_table(
654679 label = table_xml_component ["label" ]
655680 caption = table_xml_component ["caption" ]
656681 table_text : str = f"{ label } { ' ' if label and caption else '' } { caption } "
657- table_caption : Optional [ TextItem ] = (
682+ table_caption : TextItem | None = (
658683 doc .add_text (label = DocItemLabel .CAPTION , text = table_text )
659684 if table_text
660685 else None
@@ -681,7 +706,7 @@ def _add_tables(
681706
682707 # Caption
683708 caption_node = node .xpath ("caption" )
684- caption : Optional [ str ]
709+ caption : str | None
685710 if caption_node :
686711 caption = ""
687712 for caption_par in list (caption_node [0 ]):
@@ -738,7 +763,7 @@ def _walk_linear(
738763 # add elements and decide whether to stop walking
739764 if child .tag in ("sec" , "ack" ):
740765 header = child .xpath ("title|label" )
741- text : Optional [ str ] = None
766+ text : str | None = None
742767 if len (header ) > 0 :
743768 text = JatsDocumentBackend ._get_text (header [0 ])
744769 elif child .tag == "ack" :
0 commit comments