33import tempfile
44from functools import wraps
55from time import time
6- from typing import Dict , List
6+ from typing import Dict , List , Tuple
77
88import pandas as pd
99import pdfplumber
@@ -173,34 +173,6 @@ def parse_with_pdfminer(path: str, **kwargs) -> Dict:
173173 }
174174
175175
def process_table(table) -> str:
    """
    Convert a table to markdown format.

    Args:
        table: Object whose ``extract()`` method returns the table as a list
            of rows, each row being a list of cell values.

    Returns:
        A pipe-style markdown table surrounded by newlines, or an empty
        string when the table has no data left after blank rows and columns
        are removed.
    """
    # Extract table data
    table_data = table.extract()
    if not table_data or not table_data[0]:  # Check if table is empty
        return ""

    # Treat empty strings as missing so fully blank rows/columns drop out.
    df = pd.DataFrame(table_data).replace("", pd.NA)
    df = df.dropna(how="all", axis=0)
    df = df.dropna(how="all", axis=1)
    df = df.fillna("")
    if len(df) == 0:
        return ""

    # Replace newlines BEFORE promoting the first row to the header:
    # DataFrame.replace only touches cell values, never column labels, so a
    # multi-line header cell would otherwise keep its raw newline and break
    # the markdown table.
    df = df.replace(r"\n", "<br>", regex=True)

    # Use first row as header
    df.columns = df.iloc[0]
    df = df.drop(df.index[0])

    # Convert to markdown with some formatting options
    markdown_table = df.to_markdown(index=False, tablefmt="pipe")
    return f"\n{markdown_table}\n\n"
203-
204176def embed_links_in_text (page , text , links ):
205177 """
206178 Embed hyperlinks inline within the text, matching their position based on rectangles.
@@ -266,7 +238,9 @@ def embed_email_links(text: str) -> str:
266238 return email_pattern .sub (lambda match : f"<{ match .group ('email' )} >" , text )
267239
268240
269- def process_pdf_page_with_pdfplumber (page , uri_rects , ** kwargs ):
241+ def process_pdf_page_with_pdfplumber (
242+ page , uri_rects , ** kwargs
243+ ) -> Tuple [str , List [Tuple [str , Tuple [float , float , float , float ]]]]:
270244 """
271245 Process a single page's content and return formatted markdown text.
272246 """
@@ -277,6 +251,10 @@ def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
277251 x_tolerance = kwargs .get ("x_tolerance" , 1 )
278252 y_tolerance = kwargs .get ("y_tolerance" , 5 )
279253 next_h_line_idx = 0
254+ word_bboxes = []
255+
256+ page_width = float (page .width )
257+ page_height = float (page .height )
280258
281259 # First detect horizontal lines that could be markdown rules
282260 horizontal_lines = []
@@ -302,6 +280,57 @@ def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
302280 snap_x_tolerance = kwargs .get ("snap_x_tolerance" , 10 )
303281 snap_y_tolerance = kwargs .get ("snap_y_tolerance" , 0 )
304282
def process_table(table):
    """
    Convert a table to markdown and collect the bounding boxes of its words.

    Reads ``page``, ``page_width`` and ``page_height`` from the enclosing
    scope.

    Args:
        table: Object exposing ``extract()`` (rows of cell values) and
            ``cells`` (an iterable of ``(x0, top, x1, bottom)`` tuples).

    Returns:
        A ``(markdown_table, table_bboxes)`` tuple. ``markdown_table`` is a
        pipe-style markdown table surrounded by newlines. ``table_bboxes`` is
        a list of ``(text, (x0, top, x1, bottom))`` entries whose coordinates
        are divided by the page width/height. Returns ``("", [])`` when the
        table has no data left after blank rows and columns are removed.
    """
    table_data = table.extract()
    if not table_data or not table_data[0]:
        return "", []

    # Treat empty strings as missing so fully blank rows/columns drop out.
    df = pd.DataFrame(table_data).replace("", pd.NA)
    df = df.dropna(how="all", axis=0).dropna(how="all", axis=1)
    df = df.fillna("")
    if len(df) == 0:
        return "", []

    # Replace newlines BEFORE promoting the first row to the header:
    # DataFrame.replace only touches cell values, never column labels, so a
    # multi-line header cell would otherwise keep its raw newline and break
    # the markdown table.
    df = df.replace(r"\n", "<br>", regex=True)
    df.columns = df.iloc[0]
    df = df.drop(df.index[0])

    markdown_table = df.to_markdown(index=False, tablefmt="pipe")
    markdown_table = f"\n{markdown_table}\n\n"

    words_on_page = page.extract_words(
        extra_attrs=["top", "bottom", "fontname", "size"],
    )

    def intersects(word_bbox, cell_bbox):
        """Return True when the two boxes overlap or touch on both axes."""
        wx0, wtop, wx1, wbot = word_bbox
        cx0, ctop, cx1, cbot = cell_bbox
        x_overlap = (wx0 <= cx1) and (wx1 >= cx0)
        y_overlap = (wtop <= cbot) and (wbot >= ctop)
        return x_overlap and y_overlap

    # Filter blank words and build their boxes once, not once per cell.
    candidates = []
    for w in words_on_page:
        text = (w.get("text") or "").strip()
        if not text:
            continue
        candidates.append((text, (w["x0"], w["top"], w["x1"], w["bottom"])))

    table_bboxes = []
    # A word touching or overlapping several cells must be reported only
    # once; the first matching cell (in table.cells order) claims it.
    emitted = set()
    for cell in table.cells:  # cell is a tuple: (x0, top, x1, bottom)
        cx0, ctop, cx1, cbot = cell
        cell_bbox = (cx0, ctop, cx1, cbot)

        for idx, (text, word_bbox) in enumerate(candidates):
            if idx in emitted or not intersects(word_bbox, cell_bbox):
                continue
            emitted.add(idx)
            wx0, wtop, wx1, wbot = word_bbox
            norm_bbox = (
                wx0 / page_width,
                wtop / page_height,
                wx1 / page_width,
                wbot / page_height,
            )
            table_bboxes.append((text, norm_bbox))

    return markdown_table, table_bboxes
333+
305334 tables = page .find_tables (
306335 table_settings = {
307336 "vertical_strategy" : vertical_strategy ,
@@ -310,11 +339,14 @@ def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
310339 "snap_y_tolerance" : snap_y_tolerance ,
311340 }
312341 )
313- table_zones = [(table .bbox , process_table (table )) for table in tables ]
342+ table_zones = []
343+ for table in tables :
344+ table_md , table_bboxes = process_table (table )
345+ table_zones .append ((table .bbox , table_md , table_bboxes ))
314346
315347 # Create a filtered page excluding table areas
316348 filtered_page = page
317- for table_bbox , _ in table_zones :
349+ for table_bbox , _ , _ in table_zones :
318350 filtered_page = filtered_page .filter (
319351 lambda obj : get_bbox_overlap (obj_to_bbox (obj ), table_bbox ) is None
320352 )
@@ -395,6 +427,16 @@ def apply_markdown_formatting(text, formatting):
395427 text = f"*{ text } *"
396428 return text
397429
def normalize_bbox(bbox):
    """
    Scale a bbox by the page dimensions.

    The horizontal values are divided by ``page_width`` and the vertical
    values by ``page_height`` (both read from the enclosing scope), and the
    result is returned as an ``(x0, top, x1, bottom)`` tuple.
    """
    left, upper, right, lower = bbox
    scaled_left = left / page_width
    scaled_upper = upper / page_height
    scaled_right = right / page_width
    scaled_lower = lower / page_height
    return (scaled_left, scaled_upper, scaled_right, scaled_lower)
439+
398440 def format_paragraph (text_elements ):
399441 """
400442 Format a paragraph with styling applied to individual words.
@@ -415,12 +457,12 @@ def format_paragraph(text_elements):
415457 formatting = get_text_formatting (element )
416458
417459 if formatting .get ("monospace" , False ):
418- # Wrap monospace words with backticks
419- formatted_words .append (f"`{ text } `" )
460+ formatted_word = f"`{ text } `"
420461 else :
421462 all_monospace = False
422- # Apply other markdown formatting
423- formatted_words .append (apply_markdown_formatting (text , formatting ))
463+ formatted_word = apply_markdown_formatting (text , formatting )
464+ formatted_words .append (formatted_word )
465+ word_bboxes .append ((formatted_word , normalize_bbox (obj_to_bbox (element ))))
424466
425467 # If all words are monospace, format as a code block
426468 if all_monospace :
@@ -457,14 +499,15 @@ def detect_heading_level(font_size, body_font_size):
457499 return None
458500
459501 tables = []
460- for bbox , table_md in table_zones :
502+ for bbox , table_md , table_bboxes in table_zones :
461503 tables .append (
462504 (
463505 "table" ,
464506 {
465507 "top" : bbox [1 ],
466508 "bottom" : bbox [3 ],
467509 "content" : table_md ,
510+ "bboxes" : table_bboxes ,
468511 },
469512 )
470513 )
@@ -512,6 +555,7 @@ def detect_heading_level(font_size, body_font_size):
512555 current_paragraph = []
513556 # Add the table
514557 markdown_content .append (element ["content" ])
558+ word_bboxes .extend (element ["bboxes" ])
515559 last_y = element ["bottom" ]
516560 elif element_type == "horizontal_line" :
517561 while (next_h_line_idx < len (horizontal_lines )) and (
@@ -548,8 +592,9 @@ def detect_heading_level(font_size, body_font_size):
548592 markdown_content .append (format_paragraph (current_paragraph ))
549593 current_paragraph = []
550594
551- indent_level = detect_indentation_level (word , base_left )
552- current_paragraph .append (("indent" , indent_level ))
595+ if heading_level is None :
596+ indent_level = detect_indentation_level (word , base_left )
597+ current_paragraph .append (("indent" , indent_level ))
553598
554599 # Add word to appropriate collection
555600 if heading_level :
@@ -600,14 +645,19 @@ def detect_heading_level(font_size, body_font_size):
600645 .replace ("\n ```\n \n ```" , "" )
601646 )
602647
603- return content
648+ return content , word_bboxes
604649
605650
606- def process_pdf_with_pdfplumber (path : str , ** kwargs ) -> List [str ]:
651+ def process_pdf_with_pdfplumber (
652+ path : str , ** kwargs
653+ ) -> List [Tuple [str , List [Tuple [str , Tuple [float , float , float , float ]]]]]:
607654 """
608- Process PDF and return a list of markdown-formatted strings, one per page.
655+ Process PDF and return a list of (markdown, word_bboxes) per page.
656+
657+ Returns: List[Tuple[str, List[Tuple[str, Tuple[float, float, float, float]]]]]
658+ Each page returns a (markdown_text, [(word, (x0, top, x1, bottom))]) tuple for both content and bounding box mapping.
609659 """
610- page_texts = []
660+ page_data = []
611661
612662 with tempfile .TemporaryDirectory () as temp_dir :
613663 paths = split_pdf (path , temp_dir , pages_per_split = 1 )
@@ -616,12 +666,12 @@ def process_pdf_with_pdfplumber(path: str, **kwargs) -> List[str]:
616666 uri_rects = get_uri_rect (split_path )
617667 with pdfplumber .open (split_path ) as pdf :
618668 for page in pdf .pages :
619- page_content = process_pdf_page_with_pdfplumber (
669+ page_content , word_bboxes = process_pdf_page_with_pdfplumber (
620670 page , uri_rects , ** kwargs
621671 )
622- page_texts .append (page_content .strip ())
672+ page_data .append (( page_content .strip (), word_bboxes ))
623673
624- return page_texts
674+ return page_data
625675
626676
627677def parse_with_pdfplumber (path : str , ** kwargs ) -> Dict :
@@ -631,9 +681,16 @@ def parse_with_pdfplumber(path: str, **kwargs) -> Dict:
631681 Returns:
632682 Dict: Dictionary containing parsed document data
633683 """
634- page_texts = process_pdf_with_pdfplumber (path )
684+ page_data = process_pdf_with_pdfplumber (path )
685+ page_texts = [p [0 ] for p in page_data ]
686+ page_bboxes = [p [1 ] for p in page_data ]
687+
635688 segments = [
636- {"metadata" : {"page" : kwargs ["start" ] + page_num }, "content" : page_text }
689+ {
690+ "metadata" : {"page" : kwargs ["start" ] + page_num },
691+ "content" : page_text ,
692+ "bboxes" : page_bboxes [page_num - 1 ],
693+ }
637694 for page_num , page_text in enumerate (page_texts , start = 1 )
638695 ]
639696
0 commit comments