Skip to content

Commit a93e056

Browse files
authored
EPub Support. Adapted #123 to not use epublib. (#1131)
* Adapted #123 to not use epublib. * Updated README.md
1 parent c5f70b9 commit a93e056

File tree

5 files changed

+171
-1
lines changed

5 files changed

+171
-1
lines changed

README.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ MarkItDown is a lightweight Python utility for converting various files to Markd
1414
At present, MarkItDown supports:
1515

1616
- PDF
17-
- PowerPoint (reading in top-to-bottom, left-to-right order)
17+
- PowerPoint
1818
- Word
1919
- Excel
2020
- Images (EXIF metadata and OCR)
@@ -23,6 +23,7 @@ At present, MarkItDown supports:
2323
- Text-based formats (CSV, JSON, XML)
2424
- ZIP files (iterates over contents)
2525
- Youtube URLs
26+
- EPubs
2627
- ... and more!
2728

2829
## Why Markdown?

packages/markitdown/src/markitdown/_markitdown.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
AudioConverter,
3939
OutlookMsgConverter,
4040
ZipConverter,
41+
EpubConverter,
4142
DocumentIntelligenceConverter,
4243
)
4344

@@ -191,6 +192,7 @@ def enable_builtins(self, **kwargs) -> None:
191192
self.register_converter(IpynbConverter())
192193
self.register_converter(PdfConverter())
193194
self.register_converter(OutlookMsgConverter())
195+
self.register_converter(EpubConverter())
194196

195197
# Register Document Intelligence converter at the top of the stack if endpoint is provided
196198
docintel_endpoint = kwargs.get("docintel_endpoint")

packages/markitdown/src/markitdown/converters/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
from ._outlook_msg_converter import OutlookMsgConverter
1919
from ._zip_converter import ZipConverter
2020
from ._doc_intel_converter import DocumentIntelligenceConverter
21+
from ._epub_converter import EpubConverter
2122

2223
__all__ = [
2324
"PlainTextConverter",
@@ -37,4 +38,5 @@
3738
"OutlookMsgConverter",
3839
"ZipConverter",
3940
"DocumentIntelligenceConverter",
41+
"EpubConverter",
4042
]
Lines changed: 147 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,147 @@
1+
import os
2+
import zipfile
3+
import xml.dom.minidom as minidom
4+
5+
from typing import BinaryIO, Any, Dict, List
6+
7+
from ._html_converter import HtmlConverter
8+
from .._base_converter import DocumentConverter, DocumentConverterResult
9+
from .._stream_info import StreamInfo
10+
11+
# MIME type prefixes that identify an EPUB stream. They are matched with
# ``str.startswith`` against the lower-cased MIME type of the incoming stream.
ACCEPTED_MIME_TYPE_PREFIXES = [
    "application/epub",
    "application/epub+zip",
    "application/x-epub+zip",
]

# File extensions (lower-case, including the leading dot) treated as EPUB.
ACCEPTED_FILE_EXTENSIONS = [".epub"]

# MIME type reported for each content document inside the archive, keyed by
# its lower-cased file extension. Extensions not listed here map to ``None``.
MIME_TYPE_MAPPING = {
    ".html": "text/html",
    ".xhtml": "application/xhtml+xml",
}
23+
24+
25+
class EpubConverter(HtmlConverter):
    """
    Converts EPUB files to Markdown. Style information (e.g., headings) and tables are preserved where possible.

    The EPUB is read as a ZIP archive: ``META-INF/container.xml`` points at the
    OPF package file, whose Dublin Core metadata is rendered as a header and
    whose spine gives the reading order of the content documents. Each content
    document is converted with an ``HtmlConverter`` and the results are joined
    with blank lines.
    """

    def __init__(self):
        super().__init__()
        # Dedicated converter used for the individual (X)HTML spine documents.
        self._html_converter = HtmlConverter()

    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> bool:
        """Return True when the stream's extension or MIME type denotes an EPUB."""
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()

        if extension in ACCEPTED_FILE_EXTENSIONS:
            return True

        return any(mimetype.startswith(prefix) for prefix in ACCEPTED_MIME_TYPE_PREFIXES)

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> DocumentConverterResult:
        """
        Convert an EPUB stream to Markdown.

        Returns a ``DocumentConverterResult`` whose markdown starts with the
        available metadata (title, authors, language, publisher, date,
        description, identifier) followed by every spine document found in the
        archive, in spine order. The result title is the ``dc:title`` value,
        or ``None`` when the package declares none.

        Raises whatever ``zipfile`` / ``minidom`` raise for a malformed archive
        (e.g. ``KeyError`` for a missing ``container.xml`` or OPF file,
        ``IndexError`` when ``container.xml`` has no ``rootfile`` element).
        """
        # Imported locally so this block does not depend on module-level imports
        # beyond those the file already has.
        import posixpath
        from urllib.parse import unquote

        with zipfile.ZipFile(file_stream, "r") as z:
            # NOTE(security): xml.dom.minidom is documented as not secure
            # against maliciously constructed XML; EPUBs from untrusted sources
            # should be parsed with a hardened parser. Flagged, not replaced.

            # Locate content.opf
            with z.open("META-INF/container.xml") as container_file:
                container_dom = minidom.parse(container_file)
            opf_path = container_dom.getElementsByTagName("rootfile")[0].getAttribute(
                "full-path"
            )

            # Parse content.opf
            with z.open(opf_path) as opf_file:
                opf_dom = minidom.parse(opf_file)

            # Extract metadata from the Dublin Core elements of the package.
            metadata: Dict[str, Any] = {
                "title": self._get_text_from_node(opf_dom, "dc:title"),
                "authors": self._get_all_texts_from_nodes(opf_dom, "dc:creator"),
                "language": self._get_text_from_node(opf_dom, "dc:language"),
                "publisher": self._get_text_from_node(opf_dom, "dc:publisher"),
                "date": self._get_text_from_node(opf_dom, "dc:date"),
                "description": self._get_text_from_node(opf_dom, "dc:description"),
                "identifier": self._get_text_from_node(opf_dom, "dc:identifier"),
            }

            # Extract manifest items (ID → href mapping)
            manifest = {
                item.getAttribute("id"): item.getAttribute("href")
                for item in opf_dom.getElementsByTagName("item")
            }

            # Extract spine order (ID refs)
            spine_order = [
                item.getAttribute("idref")
                for item in opf_dom.getElementsByTagName("itemref")
            ]

            # Manifest hrefs are relative to the directory holding content.opf.
            base_path = "/".join(opf_path.split("/")[:-1])

            # Build the member lookup once instead of calling namelist() per item.
            archive_names = set(z.namelist())

            # Extract and convert the content
            markdown_content: List[str] = []
            for item_id in spine_order:
                if item_id not in manifest:
                    continue

                href = manifest[item_id]
                file = f"{base_path}/{href}" if base_path else href
                if file not in archive_names:
                    # Fall back to the percent-decoded, normalized path so
                    # hrefs such as "../Text/ch%201.xhtml" still resolve.
                    file = posixpath.normpath(posixpath.join(base_path, unquote(href)))
                    if file not in archive_names:
                        continue

                with z.open(file) as f:
                    filename = os.path.basename(file)
                    extension = os.path.splitext(filename)[1].lower()
                    mimetype = MIME_TYPE_MAPPING.get(extension)
                    converted_content = self._html_converter.convert(
                        f,
                        StreamInfo(
                            mimetype=mimetype,
                            extension=extension,
                            filename=filename,
                        ),
                    )
                    markdown_content.append(converted_content.markdown.strip())

            # Format and add the metadata
            metadata_markdown = []
            for key, value in metadata.items():
                if isinstance(value, list):
                    value = ", ".join(value)
                if value:
                    metadata_markdown.append(f"**{key.capitalize()}:** {value}")

            markdown_content.insert(0, "\n".join(metadata_markdown))

            return DocumentConverterResult(
                markdown="\n\n".join(markdown_content), title=metadata["title"]
            )

    def _get_text_from_node(self, dom: minidom.Document, tag_name: str) -> str | None:
        """Convenience function to extract a single occurrence of a tag (e.g., title)."""
        texts = self._get_all_texts_from_nodes(dom, tag_name)
        return texts[0] if texts else None

    def _get_all_texts_from_nodes(
        self, dom: minidom.Document, tag_name: str
    ) -> List[str]:
        """Helper function to extract all occurrences of a tag (e.g., multiple authors)."""
        texts: List[str] = []
        for node in dom.getElementsByTagName(tag_name):
            first_child = node.firstChild
            # Element children have nodeValue None; skip them rather than
            # failing on ``None.strip()``.
            value = getattr(first_child, "nodeValue", None)
            if value is None:
                continue
            texts.append(value.strip())
        return texts

packages/markitdown/tests/_test_vectors.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -211,4 +211,22 @@ class FileTestVector(object):
211211
],
212212
must_not_include=[],
213213
),
214+
FileTestVector(
215+
filename="test.epub",
216+
mimetype="application/epub+zip",
217+
charset=None,
218+
url=None,
219+
must_include=[
220+
"**Authors:** Test Author",
221+
"A test EPUB document for MarkItDown testing",
222+
"# Chapter 1: Test Content",
223+
"This is a **test** paragraph with some formatting",
224+
"* A bullet point",
225+
"* Another point",
226+
"# Chapter 2: More Content",
227+
"*different* style",
228+
"> This is a blockquote for testing",
229+
],
230+
must_not_include=[],
231+
),
214232
]

0 commit comments

Comments
 (0)