Skip to content

Enable quoted string with whitespace in CIF file 'atom_site' category. #673

@xvlaurent

Description

@xvlaurent

I have to work with in-house CIF files that have label_atom_id including whitespaces (which is allowed by the official mmCIF dictionary), but I found that biotite does not expect whitespaces in atom_site category to optimize parsing performance.

I worked a bit on this performance issue and I can propose a function that should handle unquoting faster than the present regex implementation.
I reused the benchmark from PR #619 to compare every solutions:

run number=1000000 times timeit.timeit('test_str_split(line)', number=number, globals=locals()) = 2.673100476997206 s
run number=1000000 times timeit.timeit('test_shlex_split(line)', number=number, globals=locals()) = 33.558549724999466 s
run number=1000000 times timeit.timeit('test_re_split(line)', number=number, globals=locals()) = 17.103288242000417 s
run number=1000000 times timeit.timeit('test_partitioned_split(line)', number=number, globals=locals()) = 4.747100908996799 s

Here is the testing code:

import re
import shlex

# Declare quoting patern and pre-compile regex once
single_quote_pattern = r"('(?:'(?! )|[^'])*')(?:\s|$)"
double_quote_pattern = r'("(?:"(?! )|[^"])*")(?:\s|$)'
unquoted_pattern = r"([^\s]+)"

# Combine the patterns using alternation
combined_pattern = f"{single_quote_pattern}|{double_quote_pattern}|{unquoted_pattern}"
FINAL_REGEX = re.compile(combined_pattern)


def split_line(line):
    values = line.split()
    for k in range(len(values)):
        # Remove quotes
        if (values[k][0] == '"' and values[k][-1] == '"') or (
            values[k][0] == "'" and values[k][-1] == "'"
        ):
            values[k] = values[k][1:-1]

def _split_one_line(line):
    """
    Split a line into its fields.
    Supporting embedded quotes (' or "), like `'a dog's life'` to  `a dog's life`
    """
    # Special case of multiline value, where the line starts with ';'
    if line[0] == ";":
        return [line[1:]]
    # Find all matches
    matches = FINAL_REGEX.findall(line)

    # Extract non-empty groups from the matches
    fields = []
    for match in matches:
        field = next(group for group in match if group)
        if field[0] == field[-1] == "'" or field[0] == field[-1] == '"':
            field = field[1:-1]
        fields.append(field)
    return fields


def partitioned_split_line(line: str) -> list[str]:
    # Special case of multiline value, where the line starts with ';'
    if line[0] == ";":
        return [line[1:]]

    # Initialize loop variables
    res = []
    concat_flag = False
    quote = ""

    # Loop over the line
    while line:
        # Split the line on whitespace
        word, _, line = line.partition(" ")
        # Handle the case where the word strat with a quote...
        if not concat_flag and word.startswith(("'", '"')):
            quote = word[0]
            # ... and ends with a quote as well, meaning we have a single quoted word
            if word.endswith(quote):
                res.append(word[1:-1])
                continue
            # ... and does not end with a quote, meaning we are in a multiword
            # quoted string.
            # We will need to concatenated the words until we find the closing quote
            concat_flag = True
            to_concat = word[1:]
        # Handle the case where we are in the middle of a multiword quoted string
        elif concat_flag and to_concat and quote:
            # Check if the word ends with the matching closing quote
            if word.endswith(quote):
                concat_flag = False
                to_concat += " " + word[:-1]
                quote = ""
                res.append(to_concat)
            # If not, we continue to concatenate the words
            else:
                to_concat += " " + word
        # Handle the easiest case, where we have an unquoted word
        else:
            res.append(word)
    return res


line = """ATOM 1 N PRO A 1 '1.400' "10.00" 10.000 1.00 0.00 N """


def test_re_split(line):
    _split_one_line(line)


def test_str_split(line):
    split_line(line)


def test_shlex_split(line):
    shlex.split(line)


def test_partitioned_split(line):
    partitioned_split_line(line)


if __name__ == "__main__":
    import timeit

    number = 1000000
    print(
        f"run {number=} times {timeit.timeit('test_str_split(line)', number=number, globals=locals()) = } s"
    )
    print(
        f"run {number=} times {timeit.timeit('test_shlex_split(line)', number=number, globals=locals()) = } s"
    )
    print(
        f"run {number=} times {timeit.timeit('test_re_split(line)', number=number, globals=locals()) = } s"
    )
    print(
        f"run {number=} times {timeit.timeit('test_partitioned_split(line)', number=number, globals=locals()) = } s"
    )

I copied the function _split_one_line presently used in biotite to test the perf improvement of a pre-compiled regex vs standard findall.

Would it be possible with the better perfs of partitioned_split_line to use this whitespace safe function to read atom_site blocks by default? Or at least to expose a parameter in the reader to enable it?

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions