biotite-dev · padix-key · Jun 28, 2024 · Jun 16, 2024 · Jun 17, 2024 · Jun 19, 2024
diff --git a/CONTRIB.rst b/CONTRIB.rst
@@ -17,4 +17,5 @@ CONTRIBUTORS
 - Jincai Yang <https://github.com/0ut0fcontrol>
 - Tom Eulenfeld <https://github.com/trichter>
 - Benjamin E. Mayer <https://github.com/entropybit>
-- Natasha Jaffe <https://github.com/tcjaffe>
+- Natasha Jaffe <https://github.com/tcjaffe>
+- Tristen J. Mier <https://github.com/tjmier>
diff --git a/doc/examples/scripts/sequence/homology/residue_coevolution.py b/doc/examples/scripts/sequence/homology/residue_coevolution.py
@@ -58,7 +58,7 @@
 
 # Get structure and sequence
 pdbx_file = pdbx.CIFFile.read(rcsb.fetch("1GUU", "mmcif"))
-sequence = pdbx.get_sequence(pdbx_file)[0]
+sequence = pdbx.get_sequence(pdbx_file)['A']
 # 'use_author_fields' is set to false,
 # to ensure that values in the 'res_id' annotation point to the sequence
 structure = pdbx.get_structure(pdbx_file, model=1, use_author_fields=False)

diff --git a/src/biotite/structure/io/pdbx/convert.py b/src/biotite/structure/io/pdbx/convert.py
@@ -134,21 +134,42 @@ def get_sequence(pdbx_file, data_block=None):
 
     Returns
     -------
-    sequences : list of Sequence
-        The protein and nucleotide sequences for each entity
-        (equivalent to chains in most cases).
+    sequence_dict : dict (str -> Sequence)
+        Dictionary keys are derived from ``entity_poly.pdbx_strand_id``
+        (in most cases equivalent to ``chain_id`` and
+        ``atom_site.auth_asym_id``). Dictionary values are sequences.
+
+    Notes
+    -----
+    The ``entity_poly.pdbx_seq_one_letter_code_can`` field contains the
+    initial complete sequence. If the structure represents a truncated or
+    spliced version of this sequence, the structure itself will include
+    only a subset of it. Use :func:`biotite.structure.get_residues()` to
+    retrieve only the residues that are represented in the structure.
     """
+
     block = _get_block(pdbx_file, data_block)
-
     poly_category= block["entity_poly"]
+
     seq_string = poly_category["pdbx_seq_one_letter_code_can"].as_array(str)
     seq_type = poly_category["type"].as_array(str)
-    sequences = []
-    for string, stype in zip(seq_string, seq_type):
-        sequence = _convert_string_to_sequence(string, stype)
-        if sequence is not None:
-            sequences.append(sequence)
-    return sequences
+
+    sequences = [
+        _convert_string_to_sequence(string, stype)
+        for string, stype in zip(seq_string, seq_type)
+    ]
+
+    strand_ids = poly_category['pdbx_strand_id'].as_array(str)
+    strand_ids = [strand_id.split(",") for strand_id in strand_ids]
+
+    sequence_dict = {
+        strand_id: sequence
+        for sequence, strand_ids in zip(sequences, strand_ids)
+        for strand_id in strand_ids
+        if sequence is not None
+    }
+
+    return sequence_dict
 
 
 def get_model_count(pdbx_file, data_block=None):

diff --git a/tests/database/test_rcsb.py b/tests/database/test_rcsb.py
@@ -142,7 +142,7 @@ def test_search_field(field, molecular_definition, params, ref_ids):
 def test_search_sequence():
     IDENTIY_CUTOFF = 0.9
     pdbx_file = pdbx.PDBxFile.read(join(data_dir("structure"), "1l2y.cif"))
-    ref_sequence = pdbx.get_sequence(pdbx_file)[0]
+    ref_sequence = pdbx.get_sequence(pdbx_file)['A']
     query = rcsb.SequenceQuery(
         ref_sequence, "protein", min_identity=IDENTIY_CUTOFF
     )

diff --git a/tests/structure/test_pdbx.py b/tests/structure/test_pdbx.py
@@ -457,32 +457,32 @@ def test_get_sequence(format):
         File = pdbx.BinaryCIFFile
 
     pdbx_file = File.read(join(data_dir("structure"), f"5ugo.{format}"))
-    sequences = pdbx.get_sequence(pdbx_file)
+    sequences_1 = pdbx.get_sequence(pdbx_file)
     pdbx_file = File.read(join(data_dir("structure"), f"4gxy.{format}"))
-    sequences += pdbx.get_sequence(pdbx_file)
-    assert str(sequences[0]) == "CCGACGGCGCATCAGC"
-    assert type(sequences[0]) is seq.NucleotideSequence
-    assert str(sequences[1]) == "GCTGATGCGCC"
-    assert type(sequences[1]) is seq.NucleotideSequence
-    assert str(sequences[2]) == "GTCGG"
-    assert type(sequences[2]) is seq.NucleotideSequence
+    sequences_2 = pdbx.get_sequence(pdbx_file)
+    assert str(sequences_1['T']) == "CCGACGGCGCATCAGC"
+    assert type(sequences_1['T']) is seq.NucleotideSequence
+    assert str(sequences_1['P']) == "GCTGATGCGCC"
+    assert type(sequences_1['P']) is seq.NucleotideSequence
+    assert str(sequences_1['D']) == "GTCGG"
+    assert type(sequences_1['D']) is seq.NucleotideSequence
     assert (
-        str(sequences[3]) == "MSKRKAPQETLNGGITDMLTELANFEKNVSQAIHKYN"
+        str(sequences_1['A']) == "MSKRKAPQETLNGGITDMLTELANFEKNVSQAIHKYN"
         "AYRKAASVIAKYPHKIKSGAEAKKLPGVGTKIAEKIDEFLATGKLRKLEKIRQD"
         "DTSSSINFLTRVSGIGPSAARKFVDEGIKTLEDLRKNEDKLNHHQRIGLKYFGD"
         "FEKRIPREEMLQMQDIVLNEVKKVDSEYIATVCGSFRRGAESSGDMDVLLTHPS"
         "FTSESTKQPKLLHQVVEQLQKVHFITDTLSKGETKFMGVCQLPSKNDEKEYPHR"
         "RIDIRLIPKDQYYCGVLYFTGSDIFNKNMRAHALEKGFTINEYTIRPLGVTGVA"
         "GEPLPVDSEKDIFDYIQWKYREPKDRSE"
     )
-    assert type(sequences[3]) is seq.ProteinSequence
+    assert type(sequences_1['A']) is seq.ProteinSequence
     assert (
-        str(sequences[4]) == "GGCGGCAGGTGCTCCCGACCCTGCGGTCGGGAGTTAA"
+        str(sequences_2['A']) == "GGCGGCAGGTGCTCCCGACCCTGCGGTCGGGAGTTAA"
         "AAGGGAAGCCGGTGCAAGTCCGGCACGGTCCCGCCACTGTGACGGGGAGTCGCC"
         "CCTCGGGATGTGCCACTGGCCCGAAGGCCGGGAAGGCGGAGGGGCGGCGAGGAT"
         "CCGGAGTCAGGAAACCTGCCTGCCGTC"
     )
-    assert type(sequences[4]) is seq.NucleotideSequence
+    assert type(sequences_2['A']) is seq.NucleotideSequence
 
 
 def test_bcif_encoding():

diff --git a/tests/structure/test_sequence.py b/tests/structure/test_sequence.py
@@ -53,7 +53,7 @@ def test_pdbx_sequence_consistency(path):
 def _find_best_match(sequence, ref_sequences):
     best_alignment = None
     best_identity = 0.0
-    for ref_sequence in ref_sequences:
+    for ref_sequence in ref_sequences.values():
         if type(sequence) != type(ref_sequence):
             continue
         if isinstance(sequence, seq.ProteinSequence):