microsoft · FranciscoJBL · Sep 17, 2025
diff --git a/packages/markitdown/README.md b/packages/markitdown/README.md
@@ -29,6 +29,34 @@ pip install -e packages/markitdown[all]
 markitdown path-to-file.pdf > document.md
 ```
 
+#### PDF Table Extraction
+
+By default, PDF conversion outputs plain text (table structure is not preserved). You can enable experimental
+table detection with the `--pdf-tables` flag:
+
+```bash
+markitdown --pdf-tables plumber invoice.pdf
+markitdown --pdf-tables auto report.pdf
+```
+
+Modes:
+
+* `none` (default): plain text via pdfminer.
+* `plumber`: use `pdfplumber` if installed (general-purpose detection).
+* `camelot`: use `camelot` if installed (works best on ruled tables; requires a real file path, not stdin).
+* `auto`: try plumber first, then camelot; fall back to plain text.
+
+Install optional dependencies:
+
+```bash
+pip install "markitdown[pdf-tables]"
+```
+
+Notes:
+* Camelot may need Ghostscript for lattice mode (`apt-get install ghostscript` on Debian/Ubuntu).
+* If dependencies are missing, MarkItDown silently falls back to plain text.
+* Output is best-effort; complex or merged cells may not be reproduced faithfully.
+
 ### Python API
 
 ```python
@@ -39,6 +67,13 @@ result = md.convert("test.xlsx")
 print(result.text_content)
 ```
 
+Enable PDF tables in code:
+
+```python
+result = md.convert("sample.pdf", pdf_tables="auto")
+print(result.markdown)
+```
+
 ### More Information
 
 For more information, and full documentation, see the project [README.md](https://github.com/microsoft/markitdown) on GitHub.

diff --git a/packages/markitdown/pyproject.toml b/packages/markitdown/pyproject.toml
@@ -54,6 +54,7 @@ docx = ["mammoth", "lxml"]
 xlsx = ["pandas", "openpyxl"]
 xls = ["pandas", "xlrd"]
 pdf = ["pdfminer.six"]
+pdf-tables = ["pdfminer.six", "pdfplumber", "camelot-py" ]
 outlook = ["olefile"]
 audio-transcription = ["pydub", "SpeechRecognition"]
 youtube-transcription = ["youtube-transcript-api"]

diff --git a/packages/markitdown/src/markitdown/__main__.py b/packages/markitdown/src/markitdown/__main__.py
@@ -110,6 +110,17 @@ def main():
         help="Keep data URIs (like base64-encoded images) in the output. By default, data URIs are truncated.",
     )
 
+    parser.add_argument(
+        "--pdf-tables",
+        dest="pdf_tables",
+        choices=["none", "auto", "plumber", "camelot"],
+        default="none",
+        help=(
+            "PDF table extraction mode: 'none' (default, plain text), 'plumber' (use pdfplumber if installed), "
+            "'camelot' (use camelot if installed), or 'auto' (try plumber then camelot)."
+        ),
+    )
+
     parser.add_argument("filename", nargs="?")
     args = parser.parse_args()
 
@@ -191,10 +202,14 @@ def main():
             sys.stdin.buffer,
             stream_info=stream_info,
             keep_data_uris=args.keep_data_uris,
+            pdf_tables=args.pdf_tables,
         )
     else:
         result = markitdown.convert(
-            args.filename, stream_info=stream_info, keep_data_uris=args.keep_data_uris
+            args.filename,
+            stream_info=stream_info,
+            keep_data_uris=args.keep_data_uris,
+            pdf_tables=args.pdf_tables,
         )
 
     _handle_output(args, result)

diff --git a/packages/markitdown/src/markitdown/_markitdown.py b/packages/markitdown/src/markitdown/_markitdown.py
@@ -553,6 +553,8 @@ def _convert(
                 ), "File stream position should NOT change between guess iterations"
 
                 _kwargs = {k: v for k, v in kwargs.items()}
+                # Note: caller-supplied options like 'pdf_tables' (CLI or Python API) are passed through
+                # unchanged to converters via **kwargs. Converters should document any custom keys they use.
 
                 # Copy any additional global options
                 if "llm_client" not in _kwargs and self._llm_client is not None:

diff --git a/packages/markitdown/src/markitdown/converters/_pdf_converter.py b/packages/markitdown/src/markitdown/converters/_pdf_converter.py
@@ -1,24 +1,35 @@
 import sys
 import io
-
-from typing import BinaryIO, Any
+from typing import BinaryIO, Any, List, Optional
 
 
 from .._base_converter import DocumentConverter, DocumentConverterResult
 from .._stream_info import StreamInfo
 from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
 
 
-# Try loading optional (but in this case, required) dependencies
-# Save reporting of any exceptions for later
-_dependency_exc_info = None
-try:
-    import pdfminer
-    import pdfminer.high_level
-except ImportError:
-    # Preserve the error and stack trace for later
+# pdfminer is required for basic PDF text extraction
+_dependency_exc_info = None  # Holds exception info if pdfminer missing
+try:  # pragma: no cover - dependency import
+    import pdfminer  # type: ignore
+    import pdfminer.high_level  # type: ignore
+except ImportError:  # pragma: no cover
     _dependency_exc_info = sys.exc_info()
 
+# Optional: pdfplumber for table extraction
+try:  # pragma: no cover - optional dependency
+    import pdfplumber  # type: ignore
+    _pdfplumber_available = True
+except Exception:  # pragma: no cover
+    _pdfplumber_available = False
+
+# Optional: camelot for table extraction (only works on stream-based pages or lattice with ghostscript)
+try:  # pragma: no cover - optional dependency
+    import camelot  # type: ignore
+    _camelot_available = True
+except Exception:  # pragma: no cover
+    _camelot_available = False
+
 
 ACCEPTED_MIME_TYPE_PREFIXES = [
     "application/pdf",
@@ -28,9 +39,63 @@
 ACCEPTED_FILE_EXTENSIONS = [".pdf"]
 
 
-class PdfConverter(DocumentConverter):
+def _format_md_table(rows: List[List[Optional[str]]]) -> str:
+    """Render a 2D list of cells as a GitHub-flavored Markdown table.
+
+    The first row becomes the header and every column is left-aligned.
+
+    Args:
+        rows: Table rows, each a list of cells. ``None`` cells are rendered as
+            empty and other values are converted with ``str()``. Rows may differ
+            in length; short rows are padded with empty cells.
+
+    Returns:
+        The Markdown table, or an empty string when there are no rows or no columns.
     """
-    Converts PDFs to Markdown. Most style information is ignored, so the results are essentially plain-text.
+    if not rows:
+        return ""
+
+    def clean(cell: Any) -> str:
+        # A literal pipe would split the cell and a line break would end the row,
+        # so escape the former and flatten the latter.
+        text = "" if cell is None else str(cell)
+        return " ".join(text.splitlines()).strip().replace("|", "\\|")
+
+    norm = [[clean(c) for c in r] for r in rows]
+    # Drop completely empty trailing rows that sometimes appear from extractors
+    while len(norm) > 1 and not any(norm[-1]):
+        norm.pop()
+    n_cols = max(len(r) for r in norm)
+    if n_cols == 0:
+        return ""
+    # Rows can be ragged; pad them so indexing by column never runs off a row
+    norm = [r + [""] * (n_cols - len(r)) for r in norm]
+    # Columns are at least 3 wide so cells line up with the "---" separator
+    widths = [max(3, *(len(r[i]) for r in norm)) for i in range(n_cols)]
+
+    def fmt_row(r: List[str]) -> str:
+        return "| " + " | ".join(c.ljust(w) for c, w in zip(r, widths)) + " |"
+
+    lines = [fmt_row(norm[0]), "| " + " | ".join("-" * w for w in widths) + " |"]
+    lines.extend(fmt_row(r) for r in norm[1:])
+    if len(norm) == 1:
+        # A header-only table still gets one (empty) body row
+        lines.append(fmt_row([""] * n_cols))
+    return "\n".join(lines)
+
+
+class PdfConverter(DocumentConverter):
+    """Convert PDFs to Markdown.
+
+    Table extraction (when enabled) is best-effort and relies on optional dependencies:
+    - pdfplumber: generic table detection using pdfminer layout analysis
+    - camelot: stronger detection for ruling-line (lattice) or stream tables
+
+    Modes (selectable via kwarg `pdf_tables` passed through from CLI):
+        none (default): Plain text via pdfminer
+        plumber: Use pdfplumber only
+        camelot: Use camelot only
+        auto: Try pdfplumber first, then camelot, else fallback
     """
 
     def accepts(
@@ -55,23 +102,104 @@ def convert(
         self,
         file_stream: BinaryIO,
         stream_info: StreamInfo,
-        **kwargs: Any,  # Options to pass to the converter
+        **kwargs: Any,
     ) -> DocumentConverterResult:
+        """Convert a PDF stream to Markdown, optionally extracting tables.
+
+        Args:
+            file_stream: Binary PDF stream; it is read from its current position.
+            stream_info: Stream metadata. Camelot is only used when `local_path` is set.
+            **kwargs: Converter options. `pdf_tables` selects the table mode ("none",
+                "auto", "plumber" or "camelot"); any other value is treated as "none".
+
+        Returns:
+            The extracted text, with any detected tables rendered as Markdown tables.
+
+        Raises:
+            MissingDependencyException: If pdfminer could not be imported.
+        """
-        # Check the dependencies
+        # Dependency check for baseline pdfminer
         if _dependency_exc_info is not None:
             raise MissingDependencyException(
                 MISSING_DEPENDENCY_MESSAGE.format(
-                    converter=type(self).__name__,
-                    extension=".pdf",
-                    feature="pdf",
+                    converter=type(self).__name__, extension=".pdf", feature="pdf"
                 )
-            ) from _dependency_exc_info[
-                1
-            ].with_traceback(  # type: ignore[union-attr]
+            ) from _dependency_exc_info[1].with_traceback(  # type: ignore[union-attr]
                 _dependency_exc_info[2]
             )
 
-        assert isinstance(file_stream, io.IOBase)  # for mypy
-        return DocumentConverterResult(
-            markdown=pdfminer.high_level.extract_text(file_stream),
-        )
+        # Unknown or non-string modes degrade to plain text instead of failing
+        mode = str(kwargs.get("pdf_tables") or "none").lower()
+        if mode not in {"none", "auto", "plumber", "camelot"}:
+            mode = "none"
+
+        # The optional libraries consume the stream, so it must be rewindable
+        if not file_stream.seekable():
+            file_stream = io.BytesIO(file_stream.read())
+        cur_pos = file_stream.tell()
+
+        # Try pdfplumber if requested/auto
+        plumber_markdown = ""
+        plumber_found_tables = False
+        if mode in {"plumber", "auto"} and _pdfplumber_available:
+            chunks: List[str] = []
+            try:  # pragma: no cover - logic covered indirectly
+                file_stream.seek(cur_pos)
+                with pdfplumber.open(file_stream) as pdf:  # type: ignore
+                    for page in pdf.pages:
+                        page_text = page.extract_text() or ""
+                        if page_text.strip():
+                            chunks.append(page_text.rstrip())
+                        # Keep each table next to the text of the page it came from
+                        for table in page.extract_tables() or []:
+                            table_md = _format_md_table(table)
+                            if table_md.strip():
+                                chunks.append(table_md)
+                                plumber_found_tables = True
+                plumber_markdown = "\n\n".join(chunks)
+            except Exception:
+                # Best-effort: discard partial output and fall through to other options
+                plumber_markdown = ""
+                plumber_found_tables = False
+            # In 'auto' mode a table-less result is only a fallback, so that camelot
+            # still gets a chance to find tables pdfplumber missed.
+            if plumber_markdown.strip() and (plumber_found_tables or mode == "plumber"):
+                return DocumentConverterResult(markdown=plumber_markdown)
+
+        # Try camelot if requested/auto. Camelot reads from a file path, so it is
+        # skipped for streams that have no local_path.
+        if mode in {"camelot", "auto"} and _camelot_available and stream_info.local_path:
+            camelot_tables: List[str] = []
+            # Try both lattice and stream to maximize recall
+            for flavor in ("lattice", "stream"):
+                try:  # pragma: no cover - optional dependency path
+                    detected = camelot.read_pdf(stream_info.local_path, pages="all", flavor=flavor)  # type: ignore
+                except Exception:
+                    # One flavor failing must not prevent the other from running
+                    continue
+                for tbl in detected:
+                    try:
+                        # tbl.df is a pandas DataFrame holding ONE table; pass its rows directly
+                        table_md = _format_md_table(tbl.df.values.tolist())
+                    except Exception:
+                        continue
+                    # Both flavors may detect the same table; skip exact duplicates
+                    if table_md.strip() and table_md not in camelot_tables:
+                        camelot_tables.append(table_md)
+            if camelot_tables:
+                # Camelot only yields tables, so the body text comes from pdfminer
+                file_stream.seek(cur_pos)
+                plain = pdfminer.high_level.extract_text(file_stream).strip()
+                return DocumentConverterResult(
+                    markdown="\n\n".join(([plain] if plain else []) + camelot_tables)
+                )
+
+        # No tables found by any extractor: reuse pdfplumber's text if there is any
+        if plumber_markdown.strip():
+            return DocumentConverterResult(markdown=plumber_markdown)
+
+        # Final fallback to plain pdfminer text
+        file_stream.seek(cur_pos)
+        return DocumentConverterResult(
+            markdown=pdfminer.high_level.extract_text(file_stream)
+        )
diff --git a/packages/markitdown/tests/test_pdf_tables.py b/packages/markitdown/tests/test_pdf_tables.py
@@ -0,0 +1,70 @@
+import os
+import io
+import pytest
+
+from markitdown import MarkItDown
+
+try:
+    import reportlab  # type: ignore
+    from reportlab.lib.pagesizes import letter  # type: ignore
+    from reportlab.pdfgen import canvas  # type: ignore
+    _have_reportlab = True
+except Exception:  # pragma: no cover
+    _have_reportlab = False
+
+# reportlab is only used to build the fixture PDF; it's not a hard dependency.
+skip_no_reportlab = pytest.mark.skipif(not _have_reportlab, reason="reportlab not installed")
+
+
+def _build_pdf_with_table() -> bytes:
+    """Return the bytes of a one-page PDF holding a 3x3 grid of text cells.
+
+    The cells are drawn as positioned strings only (no vector table lines).
+    """
+    cells = [
+        ["ColA", "ColB", "ColC"],
+        ["1", "2", "3"],
+        ["4", "5", "6"],
+    ]
+    left, top = 72, 720
+    col_step, row_step = 80, 18
+
+    out = io.BytesIO()
+    pdf = canvas.Canvas(out, pagesize=letter)
+    pdf.setFont("Helvetica", 12)
+    for row_idx, row in enumerate(cells):
+        y = top - row_idx * row_step
+        for col_idx, value in enumerate(row):
+            pdf.drawString(left + col_idx * col_step, y, value)
+    pdf.showPage()
+    pdf.save()
+    return out.getvalue()
+
+
+@skip_no_reportlab
+@pytest.mark.parametrize("mode", ["none", "plumber", "auto"])  # camelot requires file path + ghostscript
+def test_pdf_tables_modes(mode):
+    """Cell text must survive in every mode; table markup is validated only if present."""
+    converted = MarkItDown().convert_stream(
+        io.BytesIO(_build_pdf_with_table()), pdf_tables=mode
+    )
+    text = converted.text_content
+
+    # Headers and numbers appear regardless of mode
+    for expected in ["ColA", "ColB", "ColC", "1", "2", "3", "4", "5", "6"]:
+        assert expected in text
+
+    # Not a hard failure if table extraction fell back silently (e.g., pdfplumber
+    # not installed), so the markdown table is only validated when it is present.
+    if mode in ("plumber", "auto") and "| ColA" in text:
+        assert "| ColB" in text and "| ColC" in text
+        assert "| ---" in text or "|---" in text
+
+
+@skip_no_reportlab
+def test_pdf_tables_invalid_mode():
+    """An unrecognized mode falls back to "none": no table markup is produced."""
+    converted = MarkItDown().convert_stream(
+        io.BytesIO(_build_pdf_with_table()), pdf_tables="weird"
+    )
+    assert "| ColA" not in converted.text_content