Skip to content
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 23 additions & 9 deletions pypdf/_text_extraction/_layout_mode/_fixed_width_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -364,23 +364,37 @@ def fixed_width_page(
"""
lines: list[str] = []
last_y_coord = 0
table = str.maketrans(dict.fromkeys(range(14, 32), " "))
for y_coord, line_data in ty_groups.items():
if space_vertically and lines:
fh = line_data[0]["font_height"]
blank_lines = 0 if fh == 0 else (
int(abs(y_coord - last_y_coord) / (fh * font_height_weight)) - 1
)
lines.extend([""] * blank_lines)
line = ""

line_parts = [] # It uses a list to construct the line, avoiding string concatenation.
current_len = 0 # Track the size with int instead of len(str) giant.
last_disp = 0.0
for bt_op in line_data:
offset = int(bt_op["tx"] // char_width)
spaces = (offset - len(line)) * (ceil(last_disp) < int(bt_op["tx"]))
line = f"{line}{' ' * spaces}{bt_op['text']}"
tx = bt_op["tx"]
offset = int(tx // char_width)
needed_spaces = offset - current_len
if needed_spaces > 0 and ceil(last_disp) < int(tx):
padding = " " * needed_spaces
line_parts.append(padding)
current_len += needed_spaces

raw_text = bt_op["text"]
text = raw_text.translate(table)
line_parts.append(text)
current_len += len(text)
last_disp = bt_op["displaced_tx"]
if line.strip() or lines:
lines.append(
"".join(c if ord(c) < 14 or ord(c) > 31 else " " for c in line)
)

full_line = "".join(line_parts).rstrip()
if full_line.strip() or (space_vertically and lines):
lines.append(full_line)

last_y_coord = y_coord
return "\n".join(ln.rstrip() for ln in lines if space_vertically or ln.strip())

return "\n".join(lines)
13 changes: 13 additions & 0 deletions tests/test_text_extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -456,3 +456,16 @@ def test_extract_text__restore_cm_stack_pop_error():
# check for the message explicitly here.
with pytest.raises(IndexError, match="list index out of range"):
page.extract_text()


@pytest.mark.timeout(60)
@pytest.mark.enable_socket
def test_slow_huge_string():
    """
    Regression test for #3541.

    Runs layout-mode text extraction on the first page of the attached
    PDF. The extracted text is not inspected; the test only has to finish
    without raising before the timeout marker above aborts it.
    """
    pdf_bytes = get_data_from_url(
        "https://github.com/user-attachments/files/23855795/file.pdf",
        name="issue-3541.pdf",
    )
    first_page = PdfReader(BytesIO(pdf_bytes)).pages[0]

    # Only completion matters here, so the result is deliberately discarded.
    first_page.extract_text(extraction_mode="layout")