Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 35 additions & 0 deletions packages/markitdown/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,34 @@ pip install -e packages/markitdown[all]
markitdown path-to-file.pdf > document.md
```

#### PDF Table Extraction

By default, PDF conversion outputs plain text (table structure is not preserved). You can enable experimental
table detection with the `--pdf-tables` flag:

```bash
markitdown --pdf-tables plumber invoice.pdf
markitdown --pdf-tables auto report.pdf
```

Modes:

* `none` (default): plain text via pdfminer.
* `plumber`: use `pdfplumber` if installed (general-purpose detection).
* `camelot`: use `camelot` if installed (works best on ruled tables; requires a real file path — when reading from stdin, it falls back to plain text).
* `auto`: try plumber first, then camelot; fall back to plain text.

Install optional dependencies:

```bash
pip install "markitdown[pdf-tables]"
```

Notes:
* Camelot may need Ghostscript for lattice mode (`apt-get install ghostscript` on Debian/Ubuntu).
* If dependencies are missing, MarkItDown silently falls back to plain text.
* Output is best-effort; tables with complex or merged cells may not be reproduced faithfully.

### Python API

```python
Expand All @@ -39,6 +67,13 @@ result = md.convert("test.xlsx")
print(result.text_content)
```

Enable PDF tables in code:

```python
result = md.convert("sample.pdf", pdf_tables="auto")
print(result.markdown)
```

### More Information

For more information, and full documentation, see the project [README.md](https://github.com/microsoft/markitdown) on GitHub.
Expand Down
1 change: 1 addition & 0 deletions packages/markitdown/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ docx = ["mammoth", "lxml"]
xlsx = ["pandas", "openpyxl"]
xls = ["pandas", "xlrd"]
pdf = ["pdfminer.six"]
pdf-tables = ["pdfminer.six", "pdfplumber", "camelot-py" ]
outlook = ["olefile"]
audio-transcription = ["pydub", "SpeechRecognition"]
youtube-transcription = ["youtube-transcript-api"]
Expand Down
17 changes: 16 additions & 1 deletion packages/markitdown/src/markitdown/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,17 @@ def main():
help="Keep data URIs (like base64-encoded images) in the output. By default, data URIs are truncated.",
)

parser.add_argument(
"--pdf-tables",
dest="pdf_tables",
choices=["none", "auto", "plumber", "camelot"],
default="none",
help=(
"PDF table extraction mode: 'none' (default, plain text), 'plumber' (use pdfplumber if installed), "
"'camelot' (use camelot if installed), or 'auto' (try plumber then camelot)."
),
)

parser.add_argument("filename", nargs="?")
args = parser.parse_args()

Expand Down Expand Up @@ -191,10 +202,14 @@ def main():
sys.stdin.buffer,
stream_info=stream_info,
keep_data_uris=args.keep_data_uris,
pdf_tables=args.pdf_tables,
)
else:
result = markitdown.convert(
args.filename, stream_info=stream_info, keep_data_uris=args.keep_data_uris
args.filename,
stream_info=stream_info,
keep_data_uris=args.keep_data_uris,
pdf_tables=args.pdf_tables,
)

_handle_output(args, result)
Expand Down
2 changes: 2 additions & 0 deletions packages/markitdown/src/markitdown/_markitdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -553,6 +553,8 @@ def _convert(
), "File stream position should NOT change between guess iterations"

_kwargs = {k: v for k, v in kwargs.items()}
# Note: caller-supplied options such as 'pdf_tables' (from the CLI or the Python API) are passed
# through unchanged to converters via **kwargs. Converters should document any custom keys they use.

# Copy any additional global options
if "llm_client" not in _kwargs and self._llm_client is not None:
Expand Down
176 changes: 152 additions & 24 deletions packages/markitdown/src/markitdown/converters/_pdf_converter.py
Original file line number Diff line number Diff line change
@@ -1,24 +1,35 @@
import sys
import io

from typing import BinaryIO, Any
from typing import BinaryIO, Any, List, Optional


from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE


# Try loading optional (but in this case, required) dependencies
# Save reporting of any exceptions for later
_dependency_exc_info = None
try:
import pdfminer
import pdfminer.high_level
except ImportError:
# Preserve the error and stack trace for later
# pdfminer is required for basic PDF text extraction
_dependency_exc_info = None # Holds exception info if pdfminer missing
try: # pragma: no cover - dependency import
import pdfminer # type: ignore
import pdfminer.high_level # type: ignore
except ImportError: # pragma: no cover
_dependency_exc_info = sys.exc_info()

# Optional: pdfplumber for table extraction
try: # pragma: no cover - optional dependency
import pdfplumber # type: ignore
_pdfplumber_available = True
except Exception: # pragma: no cover
_pdfplumber_available = False

# Optional: camelot for table extraction (only works on stream-based pages or lattice with ghostscript)
try: # pragma: no cover - optional dependency
import camelot # type: ignore
_camelot_available = True
except Exception: # pragma: no cover
_camelot_available = False


ACCEPTED_MIME_TYPE_PREFIXES = [
"application/pdf",
Expand All @@ -28,9 +39,45 @@
ACCEPTED_FILE_EXTENSIONS = [".pdf"]


class PdfConverter(DocumentConverter):
def _format_md_table(rows: List[List[Optional[str]]]) -> str:
    """Render a 2D list of cells as a GitHub-flavored Markdown table.

    The first row becomes the header row. Columns are left-aligned and padded to the
    widest cell in the column; the separator uses at least three dashes per column.

    Cell normalization:
      * ``None`` becomes an empty cell; any other value is converted with ``str``.
      * Leading/trailing whitespace is stripped, and line breaks inside a cell are
        collapsed to single spaces, since a Markdown table row must fit on one line.
      * Literal ``|`` characters are escaped as ``\\|`` so they are not read as
        column delimiters.

    Rows shorter than the widest row are padded with empty cells instead of raising
    ``IndexError``.

    Args:
        rows: Table rows, each a list of cell values.

    Returns:
        The Markdown table as a single string, or ``""`` when ``rows`` is empty or
        contains no columns.
    """
    if not rows:
        return ""

    def clean(cell: object) -> str:
        if cell is None:
            return ""
        parts = (part.strip() for part in str(cell).splitlines())
        return " ".join(part for part in parts if part).replace("|", "\\|")

    norm = [[clean(c) for c in r] for r in rows]
    # Extractors sometimes emit blank trailing rows; drop them but always keep the header.
    while len(norm) > 1 and not any(norm[-1]):
        norm.pop()

    n_cols = max(len(r) for r in norm)
    if n_cols == 0:
        return ""
    # Pad ragged rows so every row has exactly one cell per column.
    norm = [r + [""] * (n_cols - len(r)) for r in norm]
    widths = [max(len(r[i]) for r in norm) for i in range(n_cols)]

    def fmt_row(r: List[str]) -> str:
        return "| " + " | ".join(cell.ljust(w) for cell, w in zip(r, widths)) + " |"

    header = fmt_row(norm[0])
    separator = "| " + " | ".join("-" * max(3, w) for w in widths) + " |"
    body = [fmt_row(r) for r in norm[1:]]
    # Keep the existing output shape: a header-only table gets one empty body row.
    if not body:
        body.append(fmt_row([""] * n_cols))
    return "\n".join([header, separator, *body])


class PdfConverter(DocumentConverter):
"""Convert PDFs to Markdown.

Table extraction (when enabled) is best-effort and relies on optional dependencies:
- pdfplumber: generic table detection using pdfminer layout analysis
- camelot: stronger detection for ruling-line (lattice) or stream tables

Modes (selectable via kwarg `pdf_tables` passed through from CLI):
none (default): Plain text via pdfminer
plumber: Use pdfplumber only
camelot: Use camelot only
auto: Try pdfplumber first, then camelot, else fallback
"""

def accepts(
Expand All @@ -55,23 +102,104 @@ def convert(
self,
file_stream: BinaryIO,
stream_info: StreamInfo,
**kwargs: Any, # Options to pass to the converter
**kwargs: Any,
) -> DocumentConverterResult:
# Check the dependencies
# Dependency check for baseline pdfminer
if _dependency_exc_info is not None:
raise MissingDependencyException(
MISSING_DEPENDENCY_MESSAGE.format(
converter=type(self).__name__,
extension=".pdf",
feature="pdf",
converter=type(self).__name__, extension=".pdf", feature="pdf"
)
) from _dependency_exc_info[
1
].with_traceback( # type: ignore[union-attr]
) from _dependency_exc_info[1].with_traceback( # type: ignore[union-attr]
_dependency_exc_info[2]
)

assert isinstance(file_stream, io.IOBase) # for mypy
return DocumentConverterResult(
markdown=pdfminer.high_level.extract_text(file_stream),
)
mode = (kwargs.get("pdf_tables") or "none").lower()
if mode not in {"none", "auto", "plumber", "camelot"}:
mode = "none"

# Ensure we can seek back after optional libs consume the stream
if not file_stream.seekable():
# Should normally be seekable by the time we get here, but safeguard
buffer = io.BytesIO(file_stream.read())
file_stream = buffer
cur_pos = file_stream.tell()

extracted_tables: List[str] = []
body_chunks: List[str] = []

def append_tables(tables: List[List[List[Optional[str]]]]) -> None:
    """Format extracted tables as Markdown and collect the non-empty ones.

    Accepts either a list of tables (each table being a list of rows) or a single
    table (a list of rows). The single-table form is what the camelot branch passes
    (``tbl.df.values.tolist()``); without this normalization every row would be
    handed to ``_format_md_table`` as if it were a whole table.

    Each non-blank rendered table is appended to the enclosing ``extracted_tables``
    list.
    """

    def looks_like_row(item: Any) -> bool:
        # A row holds cells (str/None/...); a table holds rows (lists/tuples).
        return (
            isinstance(item, (list, tuple))
            and len(item) > 0
            and not isinstance(item[0], (list, tuple))
        )

    if any(looks_like_row(item) for item in tables):
        tables = [tables]  # type: ignore[list-item]

    for table in tables:
        rendered = _format_md_table(table)
        if rendered.strip():
            extracted_tables.append(rendered)

tried_any = False

# Try pdfplumber if requested/auto
if mode in {"plumber", "auto"} and _pdfplumber_available:
tried_any = True
try: # pragma: no cover - logic covered indirectly
file_stream.seek(cur_pos)
with pdfplumber.open(file_stream) as pdf: # type: ignore
for page in pdf.pages:
page_text = page.extract_text() or ""
tables = page.extract_tables() or []
if page_text.strip():
body_chunks.append(page_text.rstrip())
if tables:
append_tables(tables) # type: ignore[arg-type]
# Success path: combine text + tables appended in order encountered
if extracted_tables:
markdown = "\n\n".join(
[c for c in body_chunks if c] + extracted_tables
)
else:
markdown = "\n\n".join([c for c in body_chunks if c])
if markdown.strip():
return DocumentConverterResult(markdown=markdown)
except Exception:
# Swallow and fall through to other options
pass

# Try camelot if requested/auto
if mode in {"camelot", "auto"} and _camelot_available:
tried_any = True
try: # pragma: no cover - optional dependency path
file_stream.seek(cur_pos)
# Camelot expects a file path; if we have a local_path in stream_info use it
if stream_info.local_path:
# Try both lattice then stream to maximize recall
tables_all: List[Any] = []
try:
tables_all.extend(camelot.read_pdf(stream_info.local_path, pages="all", flavor="lattice")) # type: ignore
except Exception:
pass
try:
tables_all.extend(camelot.read_pdf(stream_info.local_path, pages="all", flavor="stream")) # type: ignore
except Exception:
pass
for tbl in tables_all:
try:
data = tbl.df.values.tolist() # pandas DataFrame
append_tables(data) # type: ignore[arg-type]
except Exception:
continue
if extracted_tables:
# Fallback body text via pdfminer
file_stream.seek(cur_pos)
plain = pdfminer.high_level.extract_text(file_stream)
markdown = plain.strip()
markdown = "\n\n".join(
[markdown] + [t for t in extracted_tables if t]
)
return DocumentConverterResult(markdown=markdown)
except Exception:
pass

# Final fallback to plain pdfminer text
file_stream.seek(cur_pos)
plain_text = pdfminer.high_level.extract_text(file_stream)
if tried_any and extracted_tables:
plain_text = "\n\n".join([plain_text.strip()] + extracted_tables)
return DocumentConverterResult(markdown=plain_text)
68 changes: 68 additions & 0 deletions packages/markitdown/tests/test_pdf_tables.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
import os
import io
import pytest

from markitdown import MarkItDown

try:
import reportlab # type: ignore
from reportlab.lib.pagesizes import letter # type: ignore
from reportlab.pdfgen import canvas # type: ignore
_have_reportlab = True
except Exception: # pragma: no cover
_have_reportlab = False

# We only run tests if reportlab is present locally; it's not a hard dependency.
skip_no_reportlab = pytest.mark.skipif(not _have_reportlab, reason="reportlab not installed")


def _build_pdf_with_table() -> bytes:
    """Return the bytes of a one-page PDF holding a 3x3 grid of text cells.

    The "table" is just strings drawn at fixed positions in 12pt Helvetica; no ruling
    lines are drawn.
    """
    cells = [
        ["ColA", "ColB", "ColC"],
        ["1", "2", "3"],
        ["4", "5", "6"],
    ]
    origin_x, origin_y = 72, 720

    out = io.BytesIO()
    pdf = canvas.Canvas(out, pagesize=letter)
    pdf.setFont("Helvetica", 12)
    for row_idx, row in enumerate(cells):
        # Rows step down the page by 18 units; columns step right by 80 units.
        for col_idx, text in enumerate(row):
            pdf.drawString(origin_x + col_idx * 80, origin_y - row_idx * 18, text)
    pdf.showPage()
    pdf.save()
    return out.getvalue()


@skip_no_reportlab
@pytest.mark.parametrize("mode", ["none", "plumber", "auto"])  # camelot requires file path + ghostscript
def test_pdf_tables_modes(mode):
    """Every supported mode must keep the cell text; table markup is checked only if present."""
    pdf_bytes = _build_pdf_with_table()
    converter = MarkItDown()

    output = converter.convert_stream(io.BytesIO(pdf_bytes), pdf_tables=mode).text_content

    # Header labels and all six numbers must survive conversion in any mode.
    for label in ("ColA", "ColB", "ColC"):
        assert label in output
    for digit in ("1", "2", "3", "4", "5", "6"):
        assert digit in output

    # Table markup is optional: extraction may silently fall back to plain text
    # (e.g., pdfplumber not installed), so only validate it when a header row shows up.
    if mode not in ("plumber", "auto") or "| ColA" not in output:
        return
    for header_cell in ("| ColA", "| ColB", "| ColC"):
        assert header_cell in output
    assert "| ---" in output or "|---" in output


@skip_no_reportlab
def test_pdf_tables_invalid_mode():
    """An unrecognized `pdf_tables` value is expected to behave like "none" (no table markup)."""
    source = io.BytesIO(_build_pdf_with_table())
    converted = MarkItDown().convert_stream(source, pdf_tables="weird")
    # No Markdown table header row should be present in plain-text output.
    assert "| ColA" not in converted.text_content