Skip to content
3 changes: 2 additions & 1 deletion CONTRIB.rst
Original file line number Diff line number Diff line change
Expand Up @@ -17,4 +17,5 @@ CONTRIBUTORS
- Jincai Yang <https://github.com/0ut0fcontrol>
- Tom Eulenfeld <https://github.com/trichter>
- Benjamin E. Mayer <https://github.com/entropybit>
- Natasha Jaffe <https://github.com/tcjaffe>
- Natasha Jaffe <https://github.com/tcjaffe>
- Tristen J. Mier <https://github.com/tjmier>
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@

# Get structure and sequence
pdbx_file = pdbx.CIFFile.read(rcsb.fetch("1GUU", "mmcif"))
sequence = pdbx.get_sequence(pdbx_file)[0]
sequence = pdbx.get_sequence(pdbx_file)['A']
# 'use_author_fields' is set to false,
# to ensure that values in the 'res_id' annotation point to the sequence
structure = pdbx.get_structure(pdbx_file, model=1, use_author_fields=False)
Expand Down
41 changes: 31 additions & 10 deletions src/biotite/structure/io/pdbx/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,21 +134,42 @@ def get_sequence(pdbx_file, data_block=None):

Returns
-------
sequences : list of Sequence
The protein and nucleotide sequences for each entity
(equivalent to chains in most cases).
sequence_dict : Dictionary of Sequences
Dictionary keys are the individual IDs from the comma-separated
``entity_poly.pdbx_strand_id`` field (in most cases equivalent to
``chain_id`` and ``atom_site.auth_asym_id``).
Dictionary values are the corresponding sequences.

Notes
-----
The ``entity_poly.pdbx_seq_one_letter_code_can`` field contains the initial
complete sequence. If the structure represents a truncated or spliced
version of this sequence, the structure contains only a subset of its
residues. Use ``biotite.structure.get_residues`` to retrieve only
the residues that are actually represented in the structure.
"""

block = _get_block(pdbx_file, data_block)

poly_category= block["entity_poly"]

seq_string = poly_category["pdbx_seq_one_letter_code_can"].as_array(str)
seq_type = poly_category["type"].as_array(str)
sequences = []
for string, stype in zip(seq_string, seq_type):
sequence = _convert_string_to_sequence(string, stype)
if sequence is not None:
sequences.append(sequence)
return sequences

sequences = [
_convert_string_to_sequence(string, stype)
for string, stype in zip(seq_string, seq_type)
]

strand_ids = poly_category['pdbx_strand_id'].as_array(str)
strand_ids = [strand_id.split(",") for strand_id in strand_ids]

sequence_dict = {
strand_id: sequence
for sequence, strand_ids in zip(sequences, strand_ids)
for strand_id in strand_ids
if sequence is not None
}

return sequence_dict


def get_model_count(pdbx_file, data_block=None):
Expand Down
2 changes: 1 addition & 1 deletion tests/database/test_rcsb.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,7 @@ def test_search_field(field, molecular_definition, params, ref_ids):
def test_search_sequence():
IDENTIY_CUTOFF = 0.9
pdbx_file = pdbx.PDBxFile.read(join(data_dir("structure"), "1l2y.cif"))
ref_sequence = pdbx.get_sequence(pdbx_file)[0]
ref_sequence = pdbx.get_sequence(pdbx_file)['A']
query = rcsb.SequenceQuery(
ref_sequence, "protein", min_identity=IDENTIY_CUTOFF
)
Expand Down
24 changes: 12 additions & 12 deletions tests/structure/test_pdbx.py
Original file line number Diff line number Diff line change
Expand Up @@ -457,32 +457,32 @@ def test_get_sequence(format):
File = pdbx.BinaryCIFFile

pdbx_file = File.read(join(data_dir("structure"), f"5ugo.{format}"))
sequences = pdbx.get_sequence(pdbx_file)
sequences_1 = pdbx.get_sequence(pdbx_file)
pdbx_file = File.read(join(data_dir("structure"), f"4gxy.{format}"))
sequences += pdbx.get_sequence(pdbx_file)
assert str(sequences[0]) == "CCGACGGCGCATCAGC"
assert type(sequences[0]) is seq.NucleotideSequence
assert str(sequences[1]) == "GCTGATGCGCC"
assert type(sequences[1]) is seq.NucleotideSequence
assert str(sequences[2]) == "GTCGG"
assert type(sequences[2]) is seq.NucleotideSequence
sequences_2 = pdbx.get_sequence(pdbx_file)
assert str(sequences_1['T']) == "CCGACGGCGCATCAGC"
assert type(sequences_1['T']) is seq.NucleotideSequence
assert str(sequences_1['P']) == "GCTGATGCGCC"
assert type(sequences_1['P']) is seq.NucleotideSequence
assert str(sequences_1['D']) == "GTCGG"
assert type(sequences_1['D']) is seq.NucleotideSequence
assert (
str(sequences[3]) == "MSKRKAPQETLNGGITDMLTELANFEKNVSQAIHKYN"
str(sequences_1['A']) == "MSKRKAPQETLNGGITDMLTELANFEKNVSQAIHKYN"
"AYRKAASVIAKYPHKIKSGAEAKKLPGVGTKIAEKIDEFLATGKLRKLEKIRQD"
"DTSSSINFLTRVSGIGPSAARKFVDEGIKTLEDLRKNEDKLNHHQRIGLKYFGD"
"FEKRIPREEMLQMQDIVLNEVKKVDSEYIATVCGSFRRGAESSGDMDVLLTHPS"
"FTSESTKQPKLLHQVVEQLQKVHFITDTLSKGETKFMGVCQLPSKNDEKEYPHR"
"RIDIRLIPKDQYYCGVLYFTGSDIFNKNMRAHALEKGFTINEYTIRPLGVTGVA"
"GEPLPVDSEKDIFDYIQWKYREPKDRSE"
)
assert type(sequences[3]) is seq.ProteinSequence
assert type(sequences_1['A']) is seq.ProteinSequence
assert (
str(sequences[4]) == "GGCGGCAGGTGCTCCCGACCCTGCGGTCGGGAGTTAA"
str(sequences_2['A']) == "GGCGGCAGGTGCTCCCGACCCTGCGGTCGGGAGTTAA"
"AAGGGAAGCCGGTGCAAGTCCGGCACGGTCCCGCCACTGTGACGGGGAGTCGCC"
"CCTCGGGATGTGCCACTGGCCCGAAGGCCGGGAAGGCGGAGGGGCGGCGAGGAT"
"CCGGAGTCAGGAAACCTGCCTGCCGTC"
)
assert type(sequences[4]) is seq.NucleotideSequence
assert type(sequences_2['A']) is seq.NucleotideSequence


def test_bcif_encoding():
Expand Down
2 changes: 1 addition & 1 deletion tests/structure/test_sequence.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ def test_pdbx_sequence_consistency(path):
def _find_best_match(sequence, ref_sequences):
best_alignment = None
best_identity = 0.0
for ref_sequence in ref_sequences:
for ref_sequence in ref_sequences.values():
if type(sequence) != type(ref_sequence):
continue
if isinstance(sequence, seq.ProteinSequence):
Expand Down