diff --git a/packages/markitdown/README.md b/packages/markitdown/README.md index edd27016..72e573ae 100644 --- a/packages/markitdown/README.md +++ b/packages/markitdown/README.md @@ -29,6 +29,34 @@ pip install -e packages/markitdown[all] markitdown path-to-file.pdf > document.md ``` +#### PDF Table Extraction + +By default, PDF conversion outputs plain text (table structure is not preserved). You can enable experimental +table detection with the `--pdf-tables` flag: + +```bash +markitdown --pdf-tables plumber invoice.pdf +markitdown --pdf-tables auto report.pdf +``` + +Modes: + +* `none` (default): plain text via pdfminer. +* `plumber`: use `pdfplumber` if installed (general-purpose detection). +* `camelot`: use `camelot` if installed (works best on ruled tables; requires a real file path, not stdin). +* `auto`: try plumber first, then camelot; fall back to plain text. + +Install optional dependencies: + +```bash +pip install "markitdown[pdf-tables]" +``` + +Notes: +* Camelot may need Ghostscript for lattice mode (`apt-get install ghostscript` on Debian/Ubuntu). +* If dependencies are missing, MarkItDown silently falls back to plain text. +* Output is best-effort; complex or merged cells may not be reproduced exactly. + ### Python API ```python @@ -39,6 +67,13 @@ result = md.convert("test.xlsx") print(result.text_content) ``` +Enable PDF tables in code: + +```python +result = md.convert("sample.pdf", pdf_tables="auto") +print(result.markdown) +``` + ### More Information For more information, and full documentation, see the project [README.md](https://github.com/microsoft/markitdown) on GitHub.
diff --git a/packages/markitdown/pyproject.toml b/packages/markitdown/pyproject.toml index 0d9bb616..1e02f46c 100644 --- a/packages/markitdown/pyproject.toml +++ b/packages/markitdown/pyproject.toml @@ -54,6 +54,7 @@ docx = ["mammoth", "lxml"] xlsx = ["pandas", "openpyxl"] xls = ["pandas", "xlrd"] pdf = ["pdfminer.six"] +pdf-tables = ["pdfminer.six", "pdfplumber", "camelot-py" ] outlook = ["olefile"] audio-transcription = ["pydub", "SpeechRecognition"] youtube-transcription = ["youtube-transcript-api"] diff --git a/packages/markitdown/src/markitdown/__main__.py b/packages/markitdown/src/markitdown/__main__.py index 6085ad6b..2d7f1f7d 100644 --- a/packages/markitdown/src/markitdown/__main__.py +++ b/packages/markitdown/src/markitdown/__main__.py @@ -110,6 +110,17 @@ def main(): help="Keep data URIs (like base64-encoded images) in the output. By default, data URIs are truncated.", ) + parser.add_argument( + "--pdf-tables", + dest="pdf_tables", + choices=["none", "auto", "plumber", "camelot"], + default="none", + help=( + "PDF table extraction mode: 'none' (default, plain text), 'plumber' (use pdfplumber if installed), " + "'camelot' (use camelot if installed), or 'auto' (try plumber then camelot)." 
+ ), + ) + parser.add_argument("filename", nargs="?") args = parser.parse_args() @@ -191,10 +202,14 @@ def main(): sys.stdin.buffer, stream_info=stream_info, keep_data_uris=args.keep_data_uris, + pdf_tables=args.pdf_tables, ) else: result = markitdown.convert( - args.filename, stream_info=stream_info, keep_data_uris=args.keep_data_uris + args.filename, + stream_info=stream_info, + keep_data_uris=args.keep_data_uris, + pdf_tables=args.pdf_tables, ) _handle_output(args, result) diff --git a/packages/markitdown/src/markitdown/_markitdown.py b/packages/markitdown/src/markitdown/_markitdown.py index 702b10c6..b51c5be1 100644 --- a/packages/markitdown/src/markitdown/_markitdown.py +++ b/packages/markitdown/src/markitdown/_markitdown.py @@ -553,6 +553,8 @@ def _convert( ), "File stream position should NOT change between guess iterations" _kwargs = {k: v for k, v in kwargs.items()} + # Note: CLI-supplied options like 'pdf_tables' are passed through transparently + # to converters via **kwargs. Converters should document any custom keys they use. 
# Copy any additional global options if "llm_client" not in _kwargs and self._llm_client is not None: diff --git a/packages/markitdown/src/markitdown/converters/_pdf_converter.py b/packages/markitdown/src/markitdown/converters/_pdf_converter.py index 63162d52..e9c68ea2 100644 --- a/packages/markitdown/src/markitdown/converters/_pdf_converter.py +++ b/packages/markitdown/src/markitdown/converters/_pdf_converter.py @@ -1,7 +1,6 @@ import sys import io - -from typing import BinaryIO, Any +from typing import BinaryIO, Any, List, Optional from .._base_converter import DocumentConverter, DocumentConverterResult @@ -9,16 +8,28 @@ from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE -# Try loading optional (but in this case, required) dependencies -# Save reporting of any exceptions for later -_dependency_exc_info = None -try: - import pdfminer - import pdfminer.high_level -except ImportError: - # Preserve the error and stack trace for later +# pdfminer is required for basic PDF text extraction +_dependency_exc_info = None # Holds exception info if pdfminer missing +try: # pragma: no cover - dependency import + import pdfminer # type: ignore + import pdfminer.high_level # type: ignore +except ImportError: # pragma: no cover _dependency_exc_info = sys.exc_info() +# Optional: pdfplumber for table extraction +try: # pragma: no cover - optional dependency + import pdfplumber # type: ignore + _pdfplumber_available = True +except Exception: # pragma: no cover + _pdfplumber_available = False + +# Optional: camelot for table extraction (only works on stream-based pages or lattice with ghostscript) +try: # pragma: no cover - optional dependency + import camelot # type: ignore + _camelot_available = True +except Exception: # pragma: no cover + _camelot_available = False + ACCEPTED_MIME_TYPE_PREFIXES = [ "application/pdf", @@ -28,9 +39,45 @@ ACCEPTED_FILE_EXTENSIONS = [".pdf"] -class PdfConverter(DocumentConverter): +def _format_md_table(rows: 
List[List[Optional[str]]]) -> str: + """Render a 2D list into GitHub-flavored markdown table. + + Very small formatting helper: left-align all columns, normalize None/whitespace. """ - Converts PDFs to Markdown. Most style information is ignored, so the results are essentially plain-text. + if not rows: + return "" + norm = [[(c or "").strip() for c in r] for r in rows] + # Drop completely empty trailing rows that sometimes appear from extractors + while len(norm) > 1 and all(len(c) == 0 for c in norm[-1]): + norm.pop() + if not norm or len(norm[0]) == 0: + return "" + widths = [max(len(r[i]) for r in norm) for i in range(len(norm[0]))] + + def fmt_row(r: List[str]) -> str: + return "| " + " | ".join(r[i].ljust(widths[i]) for i in range(len(widths))) + " |" + + header = fmt_row(norm[0]) + separator = "| " + " | ".join("-" * max(3, widths[i]) for i in range(len(widths))) + " |" + body = [fmt_row(r) for r in norm[1:]] + # Ensure at least header + separator; if only one row, duplicate header as a body row copy + if len(body) == 0: + body.append(fmt_row(["" for _ in widths])) + return "\n".join([header, separator, *body]) + + +class PdfConverter(DocumentConverter): + """Convert PDFs to Markdown. 
def convert(
    self,
    file_stream: BinaryIO,
    stream_info: StreamInfo,
    **kwargs: Any,
) -> DocumentConverterResult:
    """Convert a PDF stream to Markdown, optionally extracting tables.

    Args:
        file_stream: Binary stream holding the PDF. A non-seekable stream is
            buffered in memory so that each extractor can re-read it from the
            same start position.
        stream_info: Stream metadata. Only ``local_path`` is consulted here,
            and only for camelot, which is given a file path rather than the
            stream.
        **kwargs: Converter options. ``pdf_tables`` selects the table mode:
            ``"none"`` (default), ``"plumber"``, ``"camelot"`` or ``"auto"``.
            Unknown or non-string values are treated as ``"none"``.

    Returns:
        A ``DocumentConverterResult``. With table extraction the markdown is
        the document text followed by the rendered tables; otherwise (mode
        ``none``, optional library missing, or extraction failed) it is the
        plain pdfminer text.

    Raises:
        MissingDependencyException: If pdfminer is not installed.
    """
    # Local import: keeps this method self-contained without touching the
    # module's import block.
    import logging

    log = logging.getLogger(__name__)

    # Dependency check for baseline pdfminer
    if _dependency_exc_info is not None:
        raise MissingDependencyException(
            MISSING_DEPENDENCY_MESSAGE.format(
                converter=type(self).__name__, extension=".pdf", feature="pdf"
            )
        ) from _dependency_exc_info[1].with_traceback(  # type: ignore[union-attr]
            _dependency_exc_info[2]
        )

    requested = kwargs.get("pdf_tables")
    # Guard against non-string values (e.g. True) instead of failing on .lower()
    mode = requested.strip().lower() if isinstance(requested, str) else "none"
    if mode not in {"none", "auto", "plumber", "camelot"}:
        mode = "none"

    # Ensure we can seek back after optional libs consume the stream
    if not file_stream.seekable():
        file_stream = io.BytesIO(file_stream.read())
    cur_pos = file_stream.tell()

    def render(tables: List[Any]) -> List[str]:
        """Render each table (a list of rows) and keep the non-empty results."""
        rendered = (_format_md_table(t) for t in tables)
        return [md for md in rendered if md.strip()]

    # Each extractor collects into its own list so that a partly failed
    # pdfplumber run cannot leak duplicate tables into the camelot output.
    plumber_tables: List[str] = []
    camelot_tables: List[str] = []

    # Try pdfplumber if requested/auto
    if mode in {"plumber", "auto"} and _pdfplumber_available:
        body_chunks: List[str] = []
        try:
            file_stream.seek(cur_pos)
            with pdfplumber.open(file_stream) as pdf:  # type: ignore
                for page in pdf.pages:
                    page_text = page.extract_text() or ""
                    page_tables = page.extract_tables() or []
                    if page_text.strip():
                        body_chunks.append(page_text.rstrip())
                    plumber_tables.extend(render(page_tables))
            # Page text first, then every table in the order encountered
            markdown = "\n\n".join(body_chunks + plumber_tables)
            if markdown.strip():
                return DocumentConverterResult(markdown=markdown)
        except Exception:
            # Best-effort: fall through to the other options, but leave a trace
            log.debug("pdfplumber table extraction failed", exc_info=True)

    # Try camelot if requested/auto. It is given a file path, so it is skipped
    # when the stream has no local_path.
    if mode in {"camelot", "auto"} and _camelot_available and stream_info.local_path:
        for flavor in ("lattice", "stream"):
            try:
                found = camelot.read_pdf(  # type: ignore
                    stream_info.local_path, pages="all", flavor=flavor
                )
            except Exception:
                log.debug("camelot (%s) failed", flavor, exc_info=True)
                continue
            for tbl in found:
                try:
                    # tbl.df holds ONE table; wrap it so that render() sees a
                    # list of tables rather than treating each row as a table.
                    camelot_tables.extend(render([tbl.df.values.tolist()]))
                except Exception:
                    log.debug("skipping unreadable camelot table", exc_info=True)
            if camelot_tables:
                # Lattice already found tables; also running stream would
                # report the same tables a second time.
                break

    # Plain pdfminer text, followed by whatever tables were recovered.
    # plumber_tables is only non-empty here when pdfplumber failed part-way.
    tables = camelot_tables or plumber_tables
    file_stream.seek(cur_pos)
    plain_text = pdfminer.high_level.extract_text(file_stream)
    if tables:
        plain_text = "\n\n".join([plain_text.strip(), *tables])
    return DocumentConverterResult(markdown=plain_text)
"""Tests for the experimental ``pdf_tables`` option of the PDF converter."""
import os
import io
import pytest

from markitdown import MarkItDown

try:
    import reportlab  # type: ignore
    from reportlab.lib.pagesizes import letter  # type: ignore
    from reportlab.pdfgen import canvas  # type: ignore

    _have_reportlab = True
except Exception:  # pragma: no cover
    _have_reportlab = False

# We only run tests if reportlab is present locally; it's not a hard dependency.
skip_no_reportlab = pytest.mark.skipif(
    not _have_reportlab, reason="reportlab not installed"
)


def _build_pdf_with_table() -> bytes:
    """Generate a one-page PDF holding a 3x3 grid of text (no ruling lines)."""
    buffer = io.BytesIO()
    c = canvas.Canvas(buffer, pagesize=letter)
    c.setFont("Helvetica", 12)
    # Simple table headers and rows at fixed positions
    start_x, start_y = 72, 720
    data = [
        ["ColA", "ColB", "ColC"],
        ["1", "2", "3"],
        ["4", "5", "6"],
    ]
    for r, row in enumerate(data):
        for col, cell in enumerate(row):
            c.drawString(start_x + col * 80, start_y - r * 18, cell)
    c.showPage()
    c.save()
    return buffer.getvalue()


@skip_no_reportlab
@pytest.mark.parametrize("mode", ["none", "plumber", "auto"])  # camelot requires file path + ghostscript
def test_pdf_tables_modes(mode):
    """Every mode must keep the cell text; table markup is checked if present."""
    pdf_bytes = _build_pdf_with_table()
    markitdown = MarkItDown()

    result = markitdown.convert_stream(io.BytesIO(pdf_bytes), pdf_tables=mode)
    text = result.text_content

    # Base assertions: headers appear
    assert "ColA" in text and "ColB" in text and "ColC" in text
    # Numbers appear
    for n in ["1", "2", "3", "4", "5", "6"]:
        assert n in text

    # Table markup is optional: extraction silently falls back to plain text
    # (e.g. pdfplumber not installed), so only check it when a table header
    # row was actually emitted.
    if mode in ("plumber", "auto") and "| ColA" in text:
        assert "| ColB" in text and "| ColC" in text
        assert "| ---" in text or "|---" in text


@skip_no_reportlab
def test_pdf_tables_invalid_mode():
    """An unknown mode behaves like ``none``: plain text, no table markup."""
    pdf_bytes = _build_pdf_with_table()
    markitdown = MarkItDown()
    # Invalid mode should fallback to none
    result = markitdown.convert_stream(io.BytesIO(pdf_bytes), pdf_tables="weird")
    text = result.text_content
    # The fallback must still produce the document text; without this check
    # the test would also pass on empty output.
    assert "ColA" in text
    assert "| ColA" not in text  # no table formatting expected