Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 36 additions & 9 deletions src/biotite/structure/io/pdbx/cif.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@
__author__ = "Patrick Kunzmann"
__all__ = ["CIFFile", "CIFBlock", "CIFCategory", "CIFColumn", "CIFData"]

import re
import itertools
import shlex
from collections.abc import MutableMapping, Sequence
import numpy as np
from .component import _Component, MaskValue
Expand Down Expand Up @@ -449,7 +449,7 @@ def _deserialize_single(lines):
"""
category_dict = {}
for line in lines:
parts = shlex.split(line)
parts = _split_one_line(line)
column_name = parts[0].split(".")[1]
column = parts[1]
category_dict[column_name] = CIFColumn(column)
Expand Down Expand Up @@ -480,12 +480,11 @@ def _deserialize_looped(lines, expect_whitespace):
column_names = itertools.cycle(column_names)
for data_line in data_lines:
# If whitespace is expected in quote protected values,
# use standard shlex split
# use regex-based _split_one_line() to split
# Otherwise use much faster whitespace split
# and quote removal if applicable,
# bypassing the slow shlex module
# and quote removal if applicable.
if expect_whitespace:
values = shlex.split(data_line)
values = _split_one_line(data_line)
else:
values = data_line.split()
for k in range(len(values)):
Expand Down Expand Up @@ -652,7 +651,7 @@ def __getitem__(self, key):
# Special optimization for "atom_site":
# Even if the values are quote protected,
# no whitespace is expected in escaped values
# Therefore slow shlex.split() call is not necessary
# Therefore slow regex-based _split_one_line() call is not necessary
if key == "atom_site":
expect_whitespace = False
else:
Expand Down Expand Up @@ -973,11 +972,11 @@ def _to_single(lines, is_looped):
j += 1
if is_looped:
# Create a line for the multiline string only
processed_lines[out_i] = shlex.quote(multi_line_str)
processed_lines[out_i] = f"'{multi_line_str}'"
out_i += 1
else:
# Append multiline string to previous line
processed_lines[out_i - 1] += " " + shlex.quote(multi_line_str)
processed_lines[out_i - 1] += " " + f"'{multi_line_str}'"
in_i = j + 1

elif not is_looped and lines[in_i][0] != "_":
Expand Down Expand Up @@ -1024,6 +1023,34 @@ def _multiline(value):
return value


def _split_one_line(line):
"""
Split a line into its fields.
Supporting embedded quotes (' or "), like `'a dog's life'` to `a dog's life`
"""
# Define the patterns for different types of fields
single_quote_pattern = r"('(?:'(?! )|[^'])*')(?:\s|$)"
double_quote_pattern = r'("(?:"(?! )|[^"])*")(?:\s|$)'
unquoted_pattern = r"([^\s]+)"

# Combine the patterns using alternation
combined_pattern = (
f"{single_quote_pattern}|{double_quote_pattern}|{unquoted_pattern}"
)

# Find all matches
matches = re.findall(combined_pattern, line)

# Extract non-empty groups from the matches
fields = []
for match in matches:
field = next(group for group in match if group)
if field[0] == field[-1] == "'" or field[0] == field[-1] == '"':
field = field[1:-1]
fields.append(field)
return fields


def _arrayfy(data):
if not isinstance(data, (Sequence, np.ndarray)) or isinstance(data, str):
data = [data]
Expand Down
16 changes: 16 additions & 0 deletions tests/structure/test_pdbx.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,22 @@ def test_escape(string, looped):
assert test_value == ref_value


@pytest.mark.parametrize(
    "cif_line, expected_fields",
    [
        ("'' 'embed'quote' ", ['', "embed'quote"]),
        ('2 "embed"quote" "\t\n"', ['2', 'embed"quote', '\t\n']),
        (
            " 3 '' \"\" 'spac e' 'embed\"quote'",
            ['3', '', '', 'spac e', 'embed"quote'],
        ),
        ("''' \"\"\" ''quoted''", ["'", '"', "'quoted'"]),
    ],
)
def test_split_one_line(cif_line, expected_fields):
    """
    Check that a CIF line is split into the expected fields,
    including quote-protected values that contain an embedded quote,
    whitespace or nothing at all.
    """
    parsed_fields = pdbx.cif._split_one_line(cif_line)
    assert parsed_fields == expected_fields


@pytest.mark.parametrize(
"format, path, model",
itertools.product(
Expand Down