Skip to content

Commit 59cc0d8

Browse files
author
Ashok
committed
Added PDF table extraction feature with aligned Markdown tables and deduplication
1 parent 8a9d8f1 commit 59cc0d8

File tree

1 file changed

+79
-20
lines changed

1 file changed

+79
-20
lines changed
Lines changed: 79 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,18 @@
11
import sys
22
import io
3-
43
from typing import BinaryIO, Any
54

6-
75
from .._base_converter import DocumentConverter, DocumentConverterResult
86
from .._stream_info import StreamInfo
97
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
108

11-
12-
# Try loading optional (but in this case, required) dependencies
13-
# Save reporting of any exceptions for later
9+
# Load dependencies
1410
_dependency_exc_info = None
1511
try:
1612
import pdfminer
1713
import pdfminer.high_level
14+
import pdfplumber
1815
except ImportError:
19-
# Preserve the error and stack trace for later
2016
_dependency_exc_info = sys.exc_info()
2117

2218

@@ -28,16 +24,43 @@
2824
ACCEPTED_FILE_EXTENSIONS = [".pdf"]
2925

3026

27+
def _to_markdown_table(table: list[list[str]]) -> str:
28+
"""Convert a 2D list (rows/columns) into a nicely aligned Markdown table."""
29+
if not table:
30+
return ""
31+
32+
# Normalize None → ""
33+
table = [[cell if cell is not None else "" for cell in row] for row in table]
34+
35+
# Column widths
36+
col_widths = [max(len(str(cell)) for cell in col) for col in zip(*table)]
37+
38+
def fmt_row(row):
39+
return "| " + " | ".join(
40+
str(cell).ljust(width) for cell, width in zip(row, col_widths)
41+
) + " |"
42+
43+
header, *rows = table
44+
md = [fmt_row(header)]
45+
md.append("| " + " | ".join("-" * w for w in col_widths) + " |")
46+
for row in rows:
47+
md.append(fmt_row(row))
48+
49+
return "\n".join(md)
50+
51+
3152
class PdfConverter(DocumentConverter):
3253
"""
33-
Converts PDFs to Markdown. Most style information is ignored, so the results are essentially plain-text.
54+
Converts PDFs to Markdown.
55+
Supports extracting tables into aligned Markdown format (via pdfplumber).
56+
Falls back to pdfminer if pdfplumber is missing or fails.
3457
"""
3558

3659
def accepts(
3760
self,
3861
file_stream: BinaryIO,
3962
stream_info: StreamInfo,
40-
**kwargs: Any, # Options to pass to the converter
63+
**kwargs: Any,
4164
) -> bool:
4265
mimetype = (stream_info.mimetype or "").lower()
4366
extension = (stream_info.extension or "").lower()
def convert(
    self,
    file_stream: BinaryIO,
    stream_info: StreamInfo,
    **kwargs: Any,
) -> DocumentConverterResult:
    """Convert a PDF stream to Markdown, rendering detected tables as Markdown.

    Each page is read with pdfplumber: the page text is emitted first (with
    text matching table rows removed to avoid duplication), followed by that
    page's tables as aligned Markdown tables. If pdfplumber raises, or the
    result is empty, the text is extracted with pdfminer instead.

    Args:
        file_stream: Binary stream containing the PDF.
        stream_info: Metadata about the stream (not used by this method).
        **kwargs: Extra converter options (not used by this method).

    Returns:
        A DocumentConverterResult whose ``markdown`` holds the extracted
        content.

    Raises:
        MissingDependencyException: If the PDF dependencies failed to import.
    """
    if _dependency_exc_info is not None:
        raise MissingDependencyException(
            MISSING_DEPENDENCY_MESSAGE.format(
                converter=type(self).__name__,
                extension=".pdf",
                feature="pdf",
            )
        ) from _dependency_exc_info[1].with_traceback(  # type: ignore[union-attr]
            _dependency_exc_info[2]
        )

    assert isinstance(file_stream, io.IOBase)  # for mypy

    # Remember where the PDF starts so the stream can be rewound before the
    # pdfminer fallback, after pdfplumber has already read from it.
    start_pos = file_stream.tell() if file_stream.seekable() else None

    markdown = ""
    try:
        markdown_chunks: list[str] = []
        with pdfplumber.open(file_stream) as pdf:
            for page in pdf.pages:
                text = page.extract_text() or ""

                # Empty cells may come back as None; normalize them up front
                # so " ".join() below cannot raise TypeError and abort table
                # extraction for the whole document.
                page_tables = [
                    [
                        [cell if cell is not None else "" for cell in row]
                        for row in table
                    ]
                    for table in (page.extract_tables() or [])
                    if table
                ]

                # Remove table rows from the text to avoid duplication.
                for table in page_tables:
                    for row in table:
                        row_line = " ".join(row)
                        # A row of empty cells joins to whitespace only;
                        # replacing that would strip every space on the page.
                        if row_line.strip():
                            text = text.replace(row_line, "")

                # Drop blank lines and surrounding whitespace.
                clean_text = "\n".join(
                    line.strip() for line in text.splitlines() if line.strip()
                )
                if clean_text:
                    markdown_chunks.append(clean_text)

                # Append the page's tables as aligned Markdown.
                for table in page_tables:
                    md_table = _to_markdown_table(table)
                    if md_table:
                        markdown_chunks.append(md_table)

        markdown = "\n\n".join(markdown_chunks).strip()
    except Exception:
        # Deliberate best-effort: any pdfplumber failure falls through to
        # the plain-text pdfminer path below.
        markdown = ""

    # Fallback if pdfplumber failed or produced nothing.
    if not markdown:
        if start_pos is not None:
            file_stream.seek(start_pos)
        markdown = pdfminer.high_level.extract_text(file_stream)

    return DocumentConverterResult(markdown=markdown)

0 commit comments

Comments
 (0)