Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
99 changes: 79 additions & 20 deletions packages/markitdown/src/markitdown/converters/_pdf_converter.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,18 @@
import sys
import io

from typing import BinaryIO, Any


from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE


# Try loading optional (but in this case, required) dependencies
# Save reporting of any exceptions for later
# Load dependencies
_dependency_exc_info = None
try:
import pdfminer
import pdfminer.high_level
import pdfplumber
except ImportError:
# Preserve the error and stack trace for later
_dependency_exc_info = sys.exc_info()


Expand All @@ -28,16 +24,43 @@
ACCEPTED_FILE_EXTENSIONS = [".pdf"]


def _to_markdown_table(table: list[list[str]]) -> str:
"""Convert a 2D list (rows/columns) into a nicely aligned Markdown table."""
if not table:
return ""

# Normalize None → ""
table = [[cell if cell is not None else "" for cell in row] for row in table]

# Column widths
col_widths = [max(len(str(cell)) for cell in col) for col in zip(*table)]

def fmt_row(row):
return "| " + " | ".join(
str(cell).ljust(width) for cell, width in zip(row, col_widths)
) + " |"

header, *rows = table
md = [fmt_row(header)]
md.append("| " + " | ".join("-" * w for w in col_widths) + " |")
for row in rows:
md.append(fmt_row(row))

return "\n".join(md)


class PdfConverter(DocumentConverter):
"""
Converts PDFs to Markdown. Most style information is ignored, so the results are essentially plain-text.
Converts PDFs to Markdown.
Supports extracting tables into aligned Markdown format (via pdfplumber).
Falls back to pdfminer if pdfplumber fails or extracts no text.
"""

def accepts(
self,
file_stream: BinaryIO,
stream_info: StreamInfo,
**kwargs: Any, # Options to pass to the converter
**kwargs: Any,
) -> bool:
mimetype = (stream_info.mimetype or "").lower()
extension = (stream_info.extension or "").lower()
def convert(
    self,
    file_stream: BinaryIO,
    stream_info: StreamInfo,
    **kwargs: Any,
) -> DocumentConverterResult:
    """Convert a PDF stream to Markdown, rendering detected tables as Markdown tables.

    Each page is read with pdfplumber: its plain text is emitted first (with
    lines that duplicate table rows removed), followed by every table found
    on that page. If pdfplumber raises, or nothing at all is extracted, the
    document is extracted as plain text with pdfminer instead.

    Args:
        file_stream: Binary stream containing the PDF.
        stream_info: Metadata about the stream; not used by this method.
        **kwargs: Extra converter options; not used by this method.

    Returns:
        A DocumentConverterResult whose ``markdown`` holds the extracted content.

    Raises:
        MissingDependencyException: If the module-level import of the PDF
            dependencies failed.
    """
    if _dependency_exc_info is not None:
        raise MissingDependencyException(
            MISSING_DEPENDENCY_MESSAGE.format(
                converter=type(self).__name__,
                extension=".pdf",
                feature="pdf",
            )
        ) from _dependency_exc_info[1].with_traceback(  # type: ignore[union-attr]
            _dependency_exc_info[2]
        )

    assert isinstance(file_stream, io.IOBase)  # for mypy

    markdown = ""
    try:
        markdown_chunks: list[str] = []
        with pdfplumber.open(file_stream) as pdf:
            for page in pdf.pages:
                text = page.extract_text() or ""
                page_tables = page.extract_tables()

                # Remove table rows from the page text to avoid duplication.
                # Cells can be None (_to_markdown_table normalises them as
                # well); joining them unfiltered raises TypeError, which the
                # handler below would turn into "discard every table".
                # NOTE(review): this is a plain substring heuristic and
                # assumes the page text shows a row as its non-empty cells
                # separated by single spaces -- confirm against real PDFs.
                for table in page_tables:
                    for row in table or []:
                        row_line = " ".join(str(cell) for cell in row if cell)
                        if row_line:
                            text = text.replace(row_line, "")

                # Normalize whitespace: drop blank lines and trim each line.
                lines = [line.strip() for line in text.splitlines() if line.strip()]
                clean_text = "\n".join(lines)
                if clean_text:
                    markdown_chunks.append(clean_text)

                # Append the page's tables as aligned Markdown.
                for table in page_tables:
                    md_table = _to_markdown_table(table)
                    if md_table:
                        markdown_chunks.append(md_table)

        markdown = "\n\n".join(markdown_chunks).strip()
    except Exception:
        # Deliberate best-effort: any pdfplumber failure defers to pdfminer.
        markdown = ""

    # Single fallback path, used both when pdfplumber failed and when it
    # produced nothing, so pdfminer never runs twice on the same stream.
    if not markdown:
        markdown = pdfminer.high_level.extract_text(file_stream)

    return DocumentConverterResult(markdown=markdown)