|
| 1 | +import os |
| 2 | +import zipfile |
| 3 | +import xml.dom.minidom as minidom |
| 4 | + |
| 5 | +from typing import BinaryIO, Any, Dict, List |
| 6 | + |
| 7 | +from ._html_converter import HtmlConverter |
| 8 | +from .._base_converter import DocumentConverter, DocumentConverterResult |
| 9 | +from .._stream_info import StreamInfo |
| 10 | + |
# MIME type prefixes that identify a stream as an EPUB.
# NOTE(review): the bare "application/epub" prefix already matches the two
# "+zip" variants below; they are kept for explicitness.
ACCEPTED_MIME_TYPE_PREFIXES = [
    "application/epub",
    "application/epub+zip",
    "application/x-epub+zip",
]

# File extensions treated as EPUB regardless of mimetype.
ACCEPTED_FILE_EXTENSIONS = [".epub"]

# Maps spine-document file extensions to the mimetype handed to HtmlConverter.
MIME_TYPE_MAPPING = {
    ".html": "text/html",
    ".xhtml": "application/xhtml+xml",
}
| 23 | + |
| 24 | + |
class EpubConverter(HtmlConverter):
    """
    Converts EPUB files to Markdown. Style information (e.g., headings) and
    tables are preserved where possible.
    """

    def __init__(self):
        super().__init__()
        # Dedicated converter instance used for each document in the spine.
        self._html_converter = HtmlConverter()

    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> bool:
        """Return True if the stream looks like an EPUB, by extension or mimetype."""
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()

        if extension in ACCEPTED_FILE_EXTENSIONS:
            return True

        return any(
            mimetype.startswith(prefix) for prefix in ACCEPTED_MIME_TYPE_PREFIXES
        )

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> DocumentConverterResult:
        """Convert an EPUB archive to Markdown.

        Reads the OPF package document to extract metadata (title, authors,
        language, publisher, date, description, identifier), converts each spine
        document to Markdown in reading order, and prepends the formatted
        metadata block.
        """
        with zipfile.ZipFile(file_stream, "r") as z:
            # Locate content.opf via the standard OCF container file.
            container_dom = minidom.parse(z.open("META-INF/container.xml"))
            opf_path = container_dom.getElementsByTagName("rootfile")[0].getAttribute(
                "full-path"
            )

            # Parse the OPF package document for its Dublin Core metadata.
            opf_dom = minidom.parse(z.open(opf_path))
            metadata: Dict[str, Any] = {
                "title": self._get_text_from_node(opf_dom, "dc:title"),
                "authors": self._get_all_texts_from_nodes(opf_dom, "dc:creator"),
                "language": self._get_text_from_node(opf_dom, "dc:language"),
                "publisher": self._get_text_from_node(opf_dom, "dc:publisher"),
                "date": self._get_text_from_node(opf_dom, "dc:date"),
                "description": self._get_text_from_node(opf_dom, "dc:description"),
                "identifier": self._get_text_from_node(opf_dom, "dc:identifier"),
            }

            # Extract manifest items (ID → href mapping)
            manifest = {
                item.getAttribute("id"): item.getAttribute("href")
                for item in opf_dom.getElementsByTagName("item")
            }

            # Extract spine order (ID refs)
            spine_order = [
                item.getAttribute("idref")
                for item in opf_dom.getElementsByTagName("itemref")
            ]

            # Manifest hrefs are relative to the directory containing content.opf.
            base_path = "/".join(opf_path.split("/")[:-1])
            spine = [
                f"{base_path}/{manifest[item_id]}" if base_path else manifest[item_id]
                for item_id in spine_order
                if item_id in manifest
            ]

            # Hoisted out of the loop: namelist() re-scans the archive directory
            # on every call, which made the original membership test O(n^2).
            archive_names = set(z.namelist())

            # Extract and convert the content, in reading order.
            markdown_content: List[str] = []
            for file in spine:
                if file not in archive_names:
                    continue  # Skip manifest entries missing from the archive.
                with z.open(file) as f:
                    filename = os.path.basename(file)
                    extension = os.path.splitext(filename)[1].lower()
                    mimetype = MIME_TYPE_MAPPING.get(extension)
                    converted_content = self._html_converter.convert(
                        f,
                        StreamInfo(
                            mimetype=mimetype,
                            extension=extension,
                            filename=filename,
                        ),
                    )
                    markdown_content.append(converted_content.markdown.strip())

            # Format the metadata as a leading Markdown block.
            metadata_markdown = []
            for key, value in metadata.items():
                if isinstance(value, list):
                    value = ", ".join(value)
                if value:
                    metadata_markdown.append(f"**{key.capitalize()}:** {value}")

            markdown_content.insert(0, "\n".join(metadata_markdown))

            return DocumentConverterResult(
                markdown="\n\n".join(markdown_content), title=metadata["title"]
            )

    def _get_text_from_node(self, dom: minidom.Document, tag_name: str) -> str | None:
        """Extract the first occurrence of a tag's text (e.g., title), or None."""
        texts = self._get_all_texts_from_nodes(dom, tag_name)
        return texts[0] if texts else None

    def _get_all_texts_from_nodes(
        self, dom: minidom.Document, tag_name: str
    ) -> List[str]:
        """Extract the text of all occurrences of a tag (e.g., multiple authors)."""
        texts: List[str] = []
        for node in dom.getElementsByTagName(tag_name):
            first = node.firstChild
            # Guard on the value, not hasattr: minidom nodes always expose
            # nodeValue, but it is None for non-text children, and the original
            # would crash calling .strip() on None.
            if first is not None and getattr(first, "nodeValue", None):
                texts.append(first.nodeValue.strip())
        return texts
0 commit comments