11import sys
22import io
3-
43from typing import BinaryIO , Any
54
6-
75from .._base_converter import DocumentConverter , DocumentConverterResult
86from .._stream_info import StreamInfo
97from .._exceptions import MissingDependencyException , MISSING_DEPENDENCY_MESSAGE
108
11-
12- # Try loading optional (but in this case, required) dependencies
13- # Save reporting of any exceptions for later
9+ # Load dependencies
1410_dependency_exc_info = None
1511try :
1612 import pdfminer
1713 import pdfminer .high_level
14+ import pdfplumber
1815except ImportError :
19- # Preserve the error and stack trace for later
2016 _dependency_exc_info = sys .exc_info ()
2117
2218
2824ACCEPTED_FILE_EXTENSIONS = [".pdf" ]
2925
3026
27+ def _to_markdown_table (table : list [list [str ]]) -> str :
28+ """Convert a 2D list (rows/columns) into a nicely aligned Markdown table."""
29+ if not table :
30+ return ""
31+
32+ # Normalize None → ""
33+ table = [[cell if cell is not None else "" for cell in row ] for row in table ]
34+
35+ # Column widths
36+ col_widths = [max (len (str (cell )) for cell in col ) for col in zip (* table )]
37+
38+ def fmt_row (row ):
39+ return "| " + " | " .join (
40+ str (cell ).ljust (width ) for cell , width in zip (row , col_widths )
41+ ) + " |"
42+
43+ header , * rows = table
44+ md = [fmt_row (header )]
45+ md .append ("| " + " | " .join ("-" * w for w in col_widths ) + " |" )
46+ for row in rows :
47+ md .append (fmt_row (row ))
48+
49+ return "\n " .join (md )
50+
51+
3152class PdfConverter (DocumentConverter ):
3253 """
33- Converts PDFs to Markdown. Most style information is ignored, so the results are essentially plain-text.
54+ Converts PDFs to Markdown.
55+ Supports extracting tables into aligned Markdown format (via pdfplumber).
56+ Falls back to pdfminer if pdfplumber raises or extracts nothing; both packages must be installed.
3457 """
3558
3659 def accepts (
3760 self ,
3861 file_stream : BinaryIO ,
3962 stream_info : StreamInfo ,
40- ** kwargs : Any , # Options to pass to the converter
63+ ** kwargs : Any ,
4164 ) -> bool :
4265 mimetype = (stream_info .mimetype or "" ).lower ()
4366 extension = (stream_info .extension or "" ).lower ()
@@ -55,23 +78,59 @@ def convert(
5578 self ,
5679 file_stream : BinaryIO ,
5780 stream_info : StreamInfo ,
58- ** kwargs : Any , # Options to pass to the converter
81+ ** kwargs : Any ,
5982 ) -> DocumentConverterResult :
60- # Check the dependencies
6183 if _dependency_exc_info is not None :
6284 raise MissingDependencyException (
6385 MISSING_DEPENDENCY_MESSAGE .format (
6486 converter = type (self ).__name__ ,
6587 extension = ".pdf" ,
6688 feature = "pdf" ,
6789 )
68- ) from _dependency_exc_info [
69- 1
70- ].with_traceback ( # type: ignore[union-attr]
71- _dependency_exc_info [2 ]
72- )
73-
74- assert isinstance (file_stream , io .IOBase ) # for mypy
75- return DocumentConverterResult (
76- markdown = pdfminer .high_level .extract_text (file_stream ),
77- )
90+ ) from _dependency_exc_info [1 ].with_traceback (_dependency_exc_info [2 ]) # type: ignore[union-attr]
91+
92+ assert isinstance (file_stream , io .IOBase )
93+
94+ markdown_chunks : list [str ] = []
95+
96+ try :
97+ with pdfplumber .open (file_stream ) as pdf :
98+ for page in pdf .pages :
99+ text = page .extract_text () or ""
100+ page_tables = page .extract_tables ()
101+
102+ # Remove table rows from text to avoid duplication
103+ for table in page_tables :
104+ if not table :
105+ continue
106+ header_line = " " .join (table [0 ])
107+ if header_line in text :
108+ text = text .replace (header_line , "" )
109+ for row in table [1 :]:
110+ row_line = " " .join (row )
111+ if row_line in text :
112+ text = text .replace (row_line , "" )
113+
114+ # Normalize whitespace: collapse multiple blank lines
115+ lines = [line .strip () for line in text .splitlines () if line .strip ()]
116+ clean_text = "\n " .join (lines )
117+ if clean_text :
118+ markdown_chunks .append (clean_text )
119+
120+ # Append tables as aligned Markdown
121+ for table in page_tables :
122+ md_table = _to_markdown_table (table )
123+ if md_table :
124+ markdown_chunks .append (md_table )
125+
126+ markdown = "\n \n " .join (markdown_chunks ).strip ()
127+
128+ except Exception :
129+ # Fallback if pdfplumber fails
130+ markdown = pdfminer .high_level .extract_text (file_stream )
131+
132+ # Fallback if still empty
133+ if not markdown :
134+ markdown = pdfminer .high_level .extract_text (file_stream )
135+
136+ return DocumentConverterResult (markdown = markdown )
0 commit comments