Skip to content

Commit d7d357f

Browse files
authored
Add support for reference highlighting
1 parent 858ca87 commit d7d357f

3 files changed

Lines changed: 552 additions & 49 deletions

File tree

examples/example_notebook_reference_highlight.ipynb

Lines changed: 227 additions & 0 deletions
Large diffs are not rendered by default.

lexoid/core/parse_type/static_parser.py

Lines changed: 105 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
import tempfile
44
from functools import wraps
55
from time import time
6-
from typing import Dict, List
6+
from typing import Dict, List, Tuple
77

88
import pandas as pd
99
import pdfplumber
@@ -173,34 +173,6 @@ def parse_with_pdfminer(path: str, **kwargs) -> Dict:
173173
}
174174

175175

176-
def process_table(table) -> str:
    """Render an extracted pdfplumber table as a GitHub-style pipe table.

    Args:
        table: A pdfplumber table object exposing ``extract()``.

    Returns:
        str: The table as markdown wrapped in blank lines, or "" when the
        table contains no usable content.
    """
    rows = table.extract()
    if not rows or not rows[0]:  # nothing extracted, or a header-less shell
        return ""

    # Normalise empty strings to NA so fully-empty rows/columns drop out.
    frame = pd.DataFrame(rows).replace("", pd.NA)
    frame = frame.dropna(how="all", axis=0).dropna(how="all", axis=1).fillna("")
    if frame.empty:
        return ""

    # Promote the first surviving row to the header row.
    frame.columns = frame.iloc[0]
    frame = frame.drop(frame.index[0])
    # Hard newlines inside cells would break the pipe table; use <br> instead.
    frame = frame.replace(r"\n", "<br>", regex=True)

    body = frame.to_markdown(index=False, tablefmt="pipe")
    return f"\n{body}\n\n"
204176
def embed_links_in_text(page, text, links):
205177
"""
206178
Embed hyperlinks inline within the text, matching their position based on rectangles.
@@ -266,7 +238,9 @@ def embed_email_links(text: str) -> str:
266238
return email_pattern.sub(lambda match: f"<{match.group('email')}>", text)
267239

268240

269-
def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
241+
def process_pdf_page_with_pdfplumber(
242+
page, uri_rects, **kwargs
243+
) -> Tuple[str, List[Tuple[str, Tuple[float, float, float, float]]]]:
270244
"""
271245
Process a single page's content and return formatted markdown text.
272246
"""
@@ -277,6 +251,10 @@ def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
277251
x_tolerance = kwargs.get("x_tolerance", 1)
278252
y_tolerance = kwargs.get("y_tolerance", 5)
279253
next_h_line_idx = 0
254+
word_bboxes = []
255+
256+
page_width = float(page.width)
257+
page_height = float(page.height)
280258

281259
# First detect horizontal lines that could be markdown rules
282260
horizontal_lines = []
@@ -302,6 +280,57 @@ def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
302280
snap_x_tolerance = kwargs.get("snap_x_tolerance", 10)
303281
snap_y_tolerance = kwargs.get("snap_y_tolerance", 0)
304282

283+
def process_table(table):
    """Convert one pdfplumber table to markdown and collect word bboxes.

    Args:
        table: A pdfplumber table object (``extract()`` and ``cells``).

    Returns:
        Tuple[str, list]: ``(markdown_table, table_bboxes)`` where
        ``table_bboxes`` is a list of ``(word_text, (x0, top, x1, bottom))``
        with coordinates normalized to 0-1 by the page size. Both values
        are empty when the table has no usable content.

    Relies on ``page``, ``page_width`` and ``page_height`` from the
    enclosing scope.
    """
    table_data = table.extract()
    if not table_data or not table_data[0]:
        return "", []

    df = pd.DataFrame(table_data)
    df.replace("", pd.NA, inplace=True)
    df = df.dropna(how="all", axis=0).dropna(how="all", axis=1)
    df = df.fillna("")
    if len(df) == 0:
        return "", []

    # First surviving row becomes the header; <br> keeps cells single-line.
    df.columns = df.iloc[0]
    df = df.drop(df.index[0])
    df.replace(r"\n", "<br>", regex=True, inplace=True)

    markdown_table = df.to_markdown(index=False, tablefmt="pipe")
    markdown_table = f"\n{markdown_table}\n\n"

    # NOTE(review): extract_words() is re-run for every table on the page;
    # it is loop-invariant and could be hoisted by the caller.
    words_on_page = page.extract_words(
        extra_attrs=["top", "bottom", "fontname", "size"],
    )

    def intersects(word_bbox, cell_bbox):
        # Closed-interval overlap test on both axes.
        wx0, wtop, wx1, wbot = word_bbox
        cx0, ctop, cx1, cbot = cell_bbox
        x_overlap = (wx0 <= cx1) and (wx1 >= cx0)
        y_overlap = (wtop <= cbot) and (wbot >= ctop)
        return x_overlap and y_overlap

    table_bboxes = []
    # Words sitting on a shared cell border intersect several cells; track
    # which word indices were already emitted so each word appears once
    # (first intersecting cell wins, preserving cell-major order).
    seen_words = set()
    for cell in table.cells:  # cell is a tuple: (x0, top, x1, bottom)
        cell_bbox = tuple(cell)

        for idx, w in enumerate(words_on_page):
            if idx in seen_words:
                continue
            word_bbox = (w["x0"], w["top"], w["x1"], w["bottom"])
            if intersects(word_bbox, cell_bbox):
                text = (w.get("text") or "").strip()
                if not text:
                    continue
                seen_words.add(idx)
                norm_bbox = (
                    w["x0"] / page_width,
                    w["top"] / page_height,
                    w["x1"] / page_width,
                    w["bottom"] / page_height,
                )
                table_bboxes.append((text, norm_bbox))

    return markdown_table, table_bboxes
333+
305334
tables = page.find_tables(
306335
table_settings={
307336
"vertical_strategy": vertical_strategy,
@@ -310,11 +339,14 @@ def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
310339
"snap_y_tolerance": snap_y_tolerance,
311340
}
312341
)
313-
table_zones = [(table.bbox, process_table(table)) for table in tables]
342+
table_zones = []
343+
for table in tables:
344+
table_md, table_bboxes = process_table(table)
345+
table_zones.append((table.bbox, table_md, table_bboxes))
314346

315347
# Create a filtered page excluding table areas
316348
filtered_page = page
317-
for table_bbox, _ in table_zones:
349+
for table_bbox, _, _ in table_zones:
318350
filtered_page = filtered_page.filter(
319351
lambda obj: get_bbox_overlap(obj_to_bbox(obj), table_bbox) is None
320352
)
@@ -395,6 +427,16 @@ def apply_markdown_formatting(text, formatting):
395427
text = f"*{text}*"
396428
return text
397429

430+
def normalize_bbox(bbox):
    """Scale an absolute PDF bbox into page-relative (0-1) coordinates.

    Relies on ``page_width`` and ``page_height`` from the enclosing scope.
    """
    left, top, right, bottom = bbox
    return (
        left / page_width,
        top / page_height,
        right / page_width,
        bottom / page_height,
    )
439+
398440
def format_paragraph(text_elements):
399441
"""
400442
Format a paragraph with styling applied to individual words.
@@ -415,12 +457,12 @@ def format_paragraph(text_elements):
415457
formatting = get_text_formatting(element)
416458

417459
if formatting.get("monospace", False):
418-
# Wrap monospace words with backticks
419-
formatted_words.append(f"`{text}`")
460+
formatted_word = f"`{text}`"
420461
else:
421462
all_monospace = False
422-
# Apply other markdown formatting
423-
formatted_words.append(apply_markdown_formatting(text, formatting))
463+
formatted_word = apply_markdown_formatting(text, formatting)
464+
formatted_words.append(formatted_word)
465+
word_bboxes.append((formatted_word, normalize_bbox(obj_to_bbox(element))))
424466

425467
# If all words are monospace, format as a code block
426468
if all_monospace:
@@ -457,14 +499,15 @@ def detect_heading_level(font_size, body_font_size):
457499
return None
458500

459501
tables = []
460-
for bbox, table_md in table_zones:
502+
for bbox, table_md, table_bboxes in table_zones:
461503
tables.append(
462504
(
463505
"table",
464506
{
465507
"top": bbox[1],
466508
"bottom": bbox[3],
467509
"content": table_md,
510+
"bboxes": table_bboxes,
468511
},
469512
)
470513
)
@@ -512,6 +555,7 @@ def detect_heading_level(font_size, body_font_size):
512555
current_paragraph = []
513556
# Add the table
514557
markdown_content.append(element["content"])
558+
word_bboxes.extend(element["bboxes"])
515559
last_y = element["bottom"]
516560
elif element_type == "horizontal_line":
517561
while (next_h_line_idx < len(horizontal_lines)) and (
@@ -548,8 +592,9 @@ def detect_heading_level(font_size, body_font_size):
548592
markdown_content.append(format_paragraph(current_paragraph))
549593
current_paragraph = []
550594

551-
indent_level = detect_indentation_level(word, base_left)
552-
current_paragraph.append(("indent", indent_level))
595+
if heading_level is None:
596+
indent_level = detect_indentation_level(word, base_left)
597+
current_paragraph.append(("indent", indent_level))
553598

554599
# Add word to appropriate collection
555600
if heading_level:
@@ -600,14 +645,19 @@ def detect_heading_level(font_size, body_font_size):
600645
.replace("\n```\n\n```", "")
601646
)
602647

603-
return content
648+
return content, word_bboxes
604649

605650

606-
def process_pdf_with_pdfplumber(path: str, **kwargs) -> List[str]:
651+
def process_pdf_with_pdfplumber(
652+
path: str, **kwargs
653+
) -> List[Tuple[str, List[Tuple[str, Tuple[float, float, float, float]]]]]:
607654
"""
608-
Process PDF and return a list of markdown-formatted strings, one per page.
655+
Process PDF and return a list of (markdown, word_bboxes) per page.
656+
657+
Returns: List[Tuple[str, List[Tuple[str, Tuple[float, float, float, float]]]]]
658+
Each page returns a (markdown_text, [(word, (x0, top, x1, bottom))]) tuple for both content and bounding box mapping.
609659
"""
610-
page_texts = []
660+
page_data = []
611661

612662
with tempfile.TemporaryDirectory() as temp_dir:
613663
paths = split_pdf(path, temp_dir, pages_per_split=1)
@@ -616,12 +666,12 @@ def process_pdf_with_pdfplumber(path: str, **kwargs) -> List[str]:
616666
uri_rects = get_uri_rect(split_path)
617667
with pdfplumber.open(split_path) as pdf:
618668
for page in pdf.pages:
619-
page_content = process_pdf_page_with_pdfplumber(
669+
page_content, word_bboxes = process_pdf_page_with_pdfplumber(
620670
page, uri_rects, **kwargs
621671
)
622-
page_texts.append(page_content.strip())
672+
page_data.append((page_content.strip(), word_bboxes))
623673

624-
return page_texts
674+
return page_data
625675

626676

627677
def parse_with_pdfplumber(path: str, **kwargs) -> Dict:
@@ -631,9 +681,16 @@ def parse_with_pdfplumber(path: str, **kwargs) -> Dict:
631681
Returns:
632682
Dict: Dictionary containing parsed document data
633683
"""
634-
page_texts = process_pdf_with_pdfplumber(path)
684+
page_data = process_pdf_with_pdfplumber(path)
685+
page_texts = [p[0] for p in page_data]
686+
page_bboxes = [p[1] for p in page_data]
687+
635688
segments = [
636-
{"metadata": {"page": kwargs["start"] + page_num}, "content": page_text}
689+
{
690+
"metadata": {"page": kwargs["start"] + page_num},
691+
"content": page_text,
692+
"bboxes": page_bboxes[page_num - 1],
693+
}
637694
for page_num, page_text in enumerate(page_texts, start=1)
638695
]
639696

0 commit comments

Comments (0)