316 changes: 274 additions & 42 deletions examples/example_notebook_reference_highlight.ipynb

Large diffs are not rendered by default.

28 changes: 21 additions & 7 deletions lexoid/api.py
@@ -29,6 +29,7 @@
LATEX_USER_PROMPT,
)
from lexoid.core.utils import (
bbox_router,
create_sub_pdf,
download_file,
get_webpage_soup,
@@ -55,7 +56,7 @@ def wrapper(*args, **kwargs):
if len(args) > 1:
if args[1] == ParserType.AUTO:
router_priority = kwargs.get("router_priority", "speed")
autoselect_llm = kwargs.get("autoselect_llm", True)
autoselect_llm = kwargs.get("autoselect_llm", False)
routed_parser_type, model = router(
kwargs["path"], router_priority, autoselect_llm=autoselect_llm
)
@@ -69,18 +70,16 @@ def wrapper(*args, **kwargs):
kwargs["parser_type"] = parser_type
return func(**kwargs)
except Exception as e:
if kwargs.get("parser_type") == ParserType.LLM_PARSE and kwargs.get(
"routed", False
):
parse_type = kwargs.get("parser_type")
routed = kwargs.get("routed", False)
if parse_type == ParserType.LLM_PARSE and routed:
logger.warning(
f"LLM_PARSE failed with error: {e}. Retrying with STATIC_PARSE."
)
kwargs["parser_type"] = ParserType.STATIC_PARSE
kwargs["routed"] = False
return func(**kwargs)
elif kwargs.get("parser_type") == ParserType.STATIC_PARSE and kwargs.get(
"routed", False
):
elif parse_type == ParserType.STATIC_PARSE and routed:
logger.warning(
f"STATIC_PARSE failed with error: {e}. Retrying with LLM_PARSE."
)
@@ -128,6 +127,21 @@ def parse_chunk(path: str, parser_type: ParserType, **kwargs) -> Dict:
result = parse_llm_doc(path, **kwargs)

result["parser_used"] = parser_type

return_bboxes = kwargs.get("return_bboxes", False)
has_bboxes = bool(result["segments"] and result["segments"][0].get("bboxes"))
bbox_framework_different = kwargs.get("bbox_framework") != kwargs.get("framework")
if return_bboxes and (not has_bboxes or bbox_framework_different):
logger.debug("Extracting bounding boxes...")
if kwargs.get("bbox_framework", "auto") == "auto":
kwargs["bbox_framework"] = bbox_router(path)
kwargs["parser_type"] = ParserType.STATIC_PARSE
kwargs["framework"] = kwargs["bbox_framework"]
result_static = parse_static_doc(path, **kwargs)
for i, segment in enumerate(result["segments"]):
if i < len(result_static["segments"]):
segment["bboxes"] = result_static["segments"][i].get("bboxes", [])

return result


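Note: the new `return_bboxes` path in `parse_chunk` is reached through the public `parse` entry point. A minimal usage sketch, assuming `parse()` forwards keyword arguments to `parse_chunk()` as elsewhere in the library (the input file name is hypothetical, and the bbox entry shape follows `parse_with_paddleocr` in this diff):

```python
# Hypothetical usage of the bounding-box flow added in this PR.
# Assumes lexoid's top-level parse() forwards these kwargs to parse_chunk().
from lexoid.api import parse, ParserType

result = parse(
    "sample.pdf",            # hypothetical input file
    ParserType.AUTO,         # router picks LLM_PARSE or STATIC_PARSE
    return_bboxes=True,      # new kwarg: request per-word bounding boxes
    bbox_framework="auto",   # new kwarg: "auto" defers to bbox_router()
)

for segment in result["segments"]:
    page = segment["metadata"]["page"]
    # Per parse_with_paddleocr below, each entry is a
    # (word_text, [x0, top, x1, bottom]) pair, normalized to the page size.
    for word_text, bbox in segment.get("bboxes", []):
        print(page, word_text, bbox)
```

Because bboxes are backfilled from a STATIC_PARSE pass whenever the primary parser does not produce them, the `bbox_framework != framework` check above also forces a re-extraction when the text and the boxes would otherwise come from different frameworks.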
18 changes: 15 additions & 3 deletions lexoid/core/conversion_utils.py
@@ -61,10 +61,22 @@ def convert_doc_to_base64_images(path: str) -> List[Tuple[int, str]]:
return [(0, f"data:image/png;base64,{image_base64}")]


def base64_to_cv2_image(b64_string: str) -> np.ndarray:
def base64_to_bytesio(b64_string: str) -> io.BytesIO:
image_data = base64.b64decode(b64_string.split(",")[1])
image = Image.open(io.BytesIO(image_data)).convert("L") # grayscale
return np.array(image)
return io.BytesIO(image_data)


def base64_to_pil_image(b64_string: str) -> Image.Image:
return Image.open(base64_to_bytesio(b64_string))


def base64_to_cv2_image(b64_string: str, gray_scale: bool = True) -> np.ndarray:
pil_image = base64_to_pil_image(b64_string)
if gray_scale:
image = pil_image.convert("L")
return np.array(image)
else:
return np.array(pil_image)


def cv2_to_pil(cv2_image: np.ndarray) -> Image.Image:
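Note: a short sketch of how the refactored helpers compose (the input path is hypothetical; `base64_to_cv2_image` keeps grayscale as its default, matching the previous behavior, while `gray_scale=False` preserves color for OCR):

```python
# Sketch of the refactored conversion helpers from this diff.
from lexoid.core.conversion_utils import (
    base64_to_cv2_image,
    base64_to_pil_image,
    convert_doc_to_base64_images,
)

# convert_doc_to_base64_images() yields (page_number, data-URI) pairs.
page_num, b64_page = convert_doc_to_base64_images("sample.pdf")[0]

pil_img = base64_to_pil_image(b64_page)                   # PIL.Image.Image
gray = base64_to_cv2_image(b64_page)                      # 2-D grayscale ndarray
color = base64_to_cv2_image(b64_page, gray_scale=False)   # H x W x C ndarray

print(pil_img.size, gray.shape, color.shape)
```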
95 changes: 84 additions & 11 deletions lexoid/core/parse_type/static_parser.py
@@ -9,16 +9,21 @@
import pdfplumber
from docx import Document
from loguru import logger
from paddleocr import PaddleOCR
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer
from pdfplumber.utils import get_bbox_overlap, obj_to_bbox
from pptx2md import ConversionConfig, convert


from lexoid.core.conversion_utils import (
base64_to_cv2_image,
convert_doc_to_base64_images,
)
from lexoid.core.utils import (
get_file_type,
get_uri_rect,
html_to_markdown,
split_bbox_by_word_length,
split_md_by_headings,
split_pdf,
)
@@ -30,20 +35,17 @@ def wrapper(*args, **kwargs):
try:
return func(*args, **kwargs)
except Exception as e:
if "pdfplumber" in kwargs.get("framework", "pdfplumber") and not kwargs.get(
"routed", False
):
kwargs["framework"] = "pdfminer"
framework = kwargs.get("framework", "pdfplumber")
if framework != "pdfplumber":
kwargs["framework"] = "pdfplumber"
logger.warning(
f"Retrying with pdfminer due to error: {e}. Original framework: {kwargs['framework']}"
f"Retrying with pdfplumber due to error: {e}. Original framework: {framework}"
)
return func(*args, **kwargs)
elif "pdfminer" in kwargs.get("framework", "pdfplumber") and not kwargs.get(
"routed", False
):
kwargs["framework"] = "pdfplumber"
elif framework != "paddleocr":
kwargs["framework"] = "paddleocr"
logger.warning(
f"Retrying with pdfplumber due to error: {e}. Original framework: {kwargs['framework']}"
f"Retrying with paddleocr due to error: {e}. Original framework: {framework}"
)
return func(*args, **kwargs)
else:
@@ -81,8 +83,12 @@ def parse_static_doc(path: str, **kwargs) -> Dict:
return parse_with_pdfplumber(path, **kwargs)
elif framework == "pdfminer":
return parse_with_pdfminer(path, **kwargs)
elif framework == "paddleocr":
return parse_with_paddleocr(path, **kwargs)
else:
raise ValueError(f"Unsupported framework: {framework}")
elif "image" in file_type:
return parse_with_paddleocr(path, **kwargs)
elif "wordprocessing" in file_type:
return parse_with_docx(path, **kwargs)
elif file_type == "text/html":
@@ -722,3 +728,70 @@ def parse_with_docx(path: str, **kwargs) -> Dict:
"parent_title": kwargs.get("parent_title", ""),
"recursive_docs": [],
}


def parse_with_paddleocr(path: str, **kwargs) -> Dict:
"""
Parse a document using PaddleOCR and return per-word bounding boxes.

Args:
path (str): Path to the PDF or image file to parse.

Returns:
Dict: Dictionary containing parsed document data with segments per page.
"""
ocr = PaddleOCR(use_angle_cls=False, lang="en")

base64_images = convert_doc_to_base64_images(path)

segments = []
all_texts = []

for page_num, base64_img_str in base64_images:
image_np = base64_to_cv2_image(base64_img_str, gray_scale=False)

results = ocr.predict(image_np, use_doc_unwarping=False)

page_texts = []
page_bboxes = []

height_img, width_img = image_np.shape[:2]

for text, bbox in zip(results[0]["rec_texts"], results[0]["rec_polys"]):
x_coords = bbox[:, 0]
y_coords = bbox[:, 1]
x_min = x_coords.min().item()
y_min = y_coords.min().item()
x_max = x_coords.max().item()
y_max = y_coords.max().item()

top = y_min / height_img
bottom = y_max / height_img
x0 = x_min / width_img
x1 = x_max / width_img

split_words = split_bbox_by_word_length([x0, top, x1, bottom], text)

for word_bbox, word_text in split_words:
page_texts.append(word_text)
page_bboxes.append((word_text, word_bbox))

page_text_str = " ".join(page_texts)
all_texts.append(page_text_str)

segments.append(
{
"metadata": {"page": kwargs.get("start", 1) + page_num},
"content": page_text_str,
"bboxes": page_bboxes,
}
)

return {
"raw": "\n\n".join(all_texts),
"segments": segments,
"title": kwargs.get("title", ""),
"url": kwargs.get("url", ""),
"parent_title": kwargs.get("parent_title", ""),
"recursive_docs": [],
}
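Note: since `parse_with_paddleocr` normalizes every box to the page dimensions, callers must scale back to pixels before rendering. A sketch of that step using PIL for illustration (the drawing code is not part of this PR):

```python
# Illustrative only: scale the normalized [x0, top, x1, bottom] boxes emitted
# above back to pixel coordinates and outline them on a page image.
from PIL import Image, ImageDraw

def draw_word_boxes(page_image: Image.Image, bboxes) -> Image.Image:
    """Outline each (word_text, [x0, top, x1, bottom]) pair on a copy of the page."""
    width, height = page_image.size
    canvas = page_image.copy()
    draw = ImageDraw.Draw(canvas)
    for _word_text, (x0, top, x1, bottom) in bboxes:
        draw.rectangle(
            (x0 * width, top * height, x1 * width, bottom * height),
            outline="red",
            width=2,
        )
    return canvas
```

This is the kind of reference highlighting the updated example notebook is named for.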