From 055a123f8fc691041e983f40d4bfb91406d47d71 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ramon=20Vi=C3=B1as=20Torn=C3=A9?= Date: Tue, 27 Jun 2023 12:57:16 +0200 Subject: [PATCH 1/9] Script to retrieve mapping PDB <-> RFAM family --- graphein/rna/download_rfam.py | 80 +++++++++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) create mode 100644 graphein/rna/download_rfam.py diff --git a/graphein/rna/download_rfam.py b/graphein/rna/download_rfam.py new file mode 100644 index 000000000..f1302525f --- /dev/null +++ b/graphein/rna/download_rfam.py @@ -0,0 +1,80 @@ +from typing import List, Optional + +import requests +import pandas as pd +from tqdm import tqdm + +# RFAM API endpoint to retrieve family information +RFAM_API_URL = 'https://rfam.org/family' + + +class FamilyNotFound(ValueError): + pass + + +def _get_RFAM_family_df(family_id: str): + """ + Downloads DataFrame of PDB IDs annotated with RFAM families + :param family_id: RFAM ID + :type family_id: str + :return: Pandas DataFrame with information about the structures of the RFAM family + :rtype: pd.DataFrame + """ + # Send an HTTP GET request to retrieve the data + response = requests.get(f'{RFAM_API_URL}/{family_id}/structures?content-type=application/json') + + # Check if the request was successful (status code 200) + if response.status_code == 200: + # Extract the family names from the response + data = response.json() + df = pd.DataFrame(data['mapping']) + else: + raise FamilyNotFound( + f'Error occurred while retrieving data for family {family_id} (status code: {response.status_code})') + + return df + + +def RFAM_families_df(family_ids: Optional[List[str]] = None, + max_id: int = 4236, + verbose=True): + """ + Retrieves a DataFrame of PDB IDs annotated with RFAM families + :param family_ids: List of families to retrieve. If None, retrieves all families: RF00001, RF00002, ..., RF04236 + (we assume that RFAM family IDs are in increasing order, see: http://http.ebi.ac.uk/pub/databases/Rfam/CURRENT/fasta_files/) + :type family_ids: Optional[List[str]] + :param max_id: Maximum identifier to try. If family_ids is None, it'll query RFAM families RF00001 .. f'RF{max_id:05d}' + :type max_id: int + :param verbose: Whether to print messages + :type verbose: bool + :return: Pandas DataFrame with information about the structures of each RFAM family + :rtype: pd.DataFrame + """ + if family_ids is None: + family_ids = [f'RF{i:05d}' for i in range(1, max_id + 1)] + + families_df = None + if verbose: + print('Retrieving RFAM families ...') + for family_id in tqdm(family_ids): + # Retrieve DF for a single family + try: + df = _get_RFAM_family_df(family_id) + + # Concatenate + if families_df is None: + families_df = df + else: + families_df = pd.concat([families_df, df]) + except FamilyNotFound as e: + if verbose: + print(e) + continue + return families_df + + +if __name__ == '__main__': + family_IDs = None # ['RF10000'] + families_df = RFAM_families_df(family_IDs) + print(families_df) + # families_df.to_csv('RFAM_families_27062023.csv', index=False) From efb683cd922a07d9a2b84faa38094c0779813242 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ramon=20Vi=C3=B1as=20Torn=C3=A9?= Date: Tue, 27 Jun 2023 15:15:31 +0200 Subject: [PATCH 2/9] Script to retrieve mapping PDB <-> RFAM family --- graphein/rna/download_rfam.py | 45 +++++++++++++++++++++++++++++++++-- 1 file changed, 43 insertions(+), 2 deletions(-) diff --git a/graphein/rna/download_rfam.py b/graphein/rna/download_rfam.py index f1302525f..c275d09ef 100644 --- a/graphein/rna/download_rfam.py +++ b/graphein/rna/download_rfam.py @@ -25,18 +25,50 @@ def _get_RFAM_family_df(family_id: str): # Check if the request was successful (status code 200) if response.status_code == 200: - # Extract the family names from the response + # Extract the mapping from response data = response.json() df = pd.DataFrame(data['mapping']) else: raise FamilyNotFound( - f'Error occurred while retrieving data for family {family_id} (status code: {response.status_code})') + f'Error occurred while retrieving family <-> structures mapping for family {family_id} (status code: {response.status_code})') return df +def _get_RFAM_family_info(family_id: str, + keys: Optional[List[str]] = None): + """ + Downloads DataFrame of PDB IDs annotated with RFAM families + :param family_id: RFAM ID + :type family_id: str + :param keys: List of keys from the family endpoint to annotate (https://docs.rfam.org/en/latest/api.html#family) + :type keys: Optional[List[str]] + :return: Pandas DataFrame with information about the structures of the RFAM family + :rtype: pd.DataFrame + """ + if keys is None: + keys = ['id', 'description'] + + # Send an HTTP GET request to retrieve the data + response = requests.get(f'{RFAM_API_URL}/{family_id}?content-type=application/json') + + # Check if the request was successful (status code 200) + out = {} + if response.status_code == 200: + # Extract the family information from the response + data = response.json() + for k in keys: + out[k] = data['rfam'][k] + else: + raise FamilyNotFound( + f'Error occurred while retrieving data for family {family_id} (status code: {response.status_code})') + + return out + + def RFAM_families_df(family_ids: Optional[List[str]] = None, max_id: int = 4236, + family_info_keys: Optional[List[str]] = None, verbose=True): """ Retrieves a DataFrame of PDB IDs annotated with RFAM families @@ -45,6 +77,8 @@ def RFAM_families_df(family_ids: Optional[List[str]] = None, :type family_ids: Optional[List[str]] :param max_id: Maximum identifier to try. If family_ids is None, it'll query RFAM families RF00001 .. f'RF{max_id:05d}' :type max_id: int + :param family_info_keys: List of keys from the family endpoint to annotate (https://docs.rfam.org/en/latest/api.html#family) + :type family_info_keys: Optional[List[str]] :param verbose: Whether to print messages :type verbose: bool :return: Pandas DataFrame with information about the structures of each RFAM family @@ -59,8 +93,15 @@ def RFAM_families_df(family_ids: Optional[List[str]] = None, for family_id in tqdm(family_ids): # Retrieve DF for a single family try: + # Get family <-> structures mapping df = _get_RFAM_family_df(family_id) + # Annotate family with descriptions + out = _get_RFAM_family_info(family_id, + keys=family_info_keys) + for k, v in out.items(): + df[k] = v + # Concatenate if families_df is None: families_df = df From 0f2ce29497474c37005ee0617695026c164868c5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ramon=20Vi=C3=B1as=20Torn=C3=A9?= Date: Tue, 27 Jun 2023 15:18:10 +0200 Subject: [PATCH 3/9] Script to retrieve mapping PDB <-> RFAM family --- graphein/rna/download_rfam.py | 44 ++++++++++++++++++++--------------- 1 file changed, 25 insertions(+), 19 deletions(-) diff --git a/graphein/rna/download_rfam.py b/graphein/rna/download_rfam.py index c275d09ef..029c92e0a 100644 --- a/graphein/rna/download_rfam.py +++ b/graphein/rna/download_rfam.py @@ -5,7 +5,7 @@ from tqdm import tqdm # RFAM API endpoint to retrieve family information -RFAM_API_URL = 'https://rfam.org/family' +RFAM_API_URL = "https://rfam.org/family" class FamilyNotFound(ValueError): @@ -21,22 +21,24 @@ def _get_RFAM_family_df(family_id: str): :rtype: pd.DataFrame """ # Send an HTTP GET request to retrieve the data - response = requests.get(f'{RFAM_API_URL}/{family_id}/structures?content-type=application/json') + response = requests.get( + f"{RFAM_API_URL}/{family_id}/structures?content-type=application/json" + ) # Check if the request was successful (status code 200) if response.status_code == 200: # Extract the mapping from response data = response.json() - df = pd.DataFrame(data['mapping']) + df = pd.DataFrame(data["mapping"]) else: raise FamilyNotFound( - f'Error occurred while retrieving family <-> structures mapping for family {family_id} (status code: {response.status_code})') + f"Error occurred while retrieving family <-> structures mapping for family {family_id} (status code: {response.status_code})" + ) return df -def _get_RFAM_family_info(family_id: str, - keys: Optional[List[str]] = None): +def _get_RFAM_family_info(family_id: str, keys: Optional[List[str]] = None): """ Downloads DataFrame of PDB IDs annotated with RFAM families :param family_id: RFAM ID @@ -47,10 +49,12 @@ def _get_RFAM_family_info(family_id: str, :rtype: pd.DataFrame """ if keys is None: - keys = ['id', 'description'] + keys = ["id", "description"] # Send an HTTP GET request to retrieve the data - response = requests.get(f'{RFAM_API_URL}/{family_id}?content-type=application/json') + response = requests.get( + f"{RFAM_API_URL}/{family_id}?content-type=application/json" + ) # Check if the request was successful (status code 200) out = {} @@ -58,18 +62,21 @@ def _get_RFAM_family_info(family_id: str, # Extract the family information from the response data = response.json() for k in keys: - out[k] = data['rfam'][k] + out[k] = data["rfam"][k] else: raise FamilyNotFound( - f'Error occurred while retrieving data for family {family_id} (status code: {response.status_code})') + f"Error occurred while retrieving data for family {family_id} (status code: {response.status_code})" + ) return out -def RFAM_families_df(family_ids: Optional[List[str]] = None, - max_id: int = 4236, - family_info_keys: Optional[List[str]] = None, - verbose=True): +def RFAM_families_df( + family_ids: Optional[List[str]] = None, + max_id: int = 4236, + family_info_keys: Optional[List[str]] = None, + verbose=True, +): """ Retrieves a DataFrame of PDB IDs annotated with RFAM families :param family_ids: List of families to retrieve. If None, retrieves all families: RF00001, RF00002, ..., RF04236 @@ -85,11 +92,11 @@ def RFAM_families_df(family_ids: Optional[List[str]] = None, :rtype: pd.DataFrame """ if family_ids is None: - family_ids = [f'RF{i:05d}' for i in range(1, max_id + 1)] + family_ids = [f"RF{i:05d}" for i in range(1, max_id + 1)] families_df = None if verbose: - print('Retrieving RFAM families ...') + print("Retrieving RFAM families ...") for family_id in tqdm(family_ids): # Retrieve DF for a single family try: @@ -97,8 +104,7 @@ def RFAM_families_df(family_ids: Optional[List[str]] = None, df = _get_RFAM_family_df(family_id) # Annotate family with descriptions - out = _get_RFAM_family_info(family_id, - keys=family_info_keys) + out = _get_RFAM_family_info(family_id, keys=family_info_keys) for k, v in out.items(): df[k] = v @@ -114,7 +120,7 @@ def RFAM_families_df(family_ids: Optional[List[str]] = None, return families_df -if __name__ == '__main__': +if __name__ == "__main__": family_IDs = None # ['RF10000'] families_df = RFAM_families_df(family_IDs) print(families_df) From 4949531ea102cda1064e9efb0254a07e6b7ab4b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ramon=20Vi=C3=B1as=20Torn=C3=A9?= Date: Tue, 27 Jun 2023 15:19:32 +0200 Subject: [PATCH 4/9] Script to retrieve mapping PDB <-> RFAM family --- graphein/rna/download_rfam.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/graphein/rna/download_rfam.py b/graphein/rna/download_rfam.py index 029c92e0a..bb872fdae 100644 --- a/graphein/rna/download_rfam.py +++ b/graphein/rna/download_rfam.py @@ -1,7 +1,7 @@ from typing import List, Optional -import requests import pandas as pd +import requests from tqdm import tqdm # RFAM API endpoint to retrieve family information From a41eabdbdd7aa574bf30d222e0f5aa245d1459d4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ramon=20Vi=C3=B1as=20Torn=C3=A9?= Date: Wed, 28 Jun 2023 09:27:04 +0200 Subject: [PATCH 5/9] Retrieve mapping PDB <-> RFAM family via FTM. PDBManager-like interface --- graphein/rna/download_rfam.py | 246 ++++++++++++++++++---------------- 1 file changed, 127 insertions(+), 119 deletions(-) diff --git a/graphein/rna/download_rfam.py b/graphein/rna/download_rfam.py index bb872fdae..334da74f3 100644 --- a/graphein/rna/download_rfam.py +++ b/graphein/rna/download_rfam.py @@ -1,127 +1,135 @@ -from typing import List, Optional +import gzip +import os +import shutil +from pathlib import Path import pandas as pd -import requests -from tqdm import tqdm - -# RFAM API endpoint to retrieve family information -RFAM_API_URL = "https://rfam.org/family" - - -class FamilyNotFound(ValueError): - pass - - -def _get_RFAM_family_df(family_id: str): - """ - Downloads DataFrame of PDB IDs annotated with RFAM families - :param family_id: RFAM ID - :type family_id: str - :return: Pandas DataFrame with information about the structures of the RFAM family - :rtype: pd.DataFrame - """ - # Send an HTTP GET request to retrieve the data - response = requests.get( - f"{RFAM_API_URL}/{family_id}/structures?content-type=application/json" - ) - - # Check if the request was successful (status code 200) - if response.status_code == 200: - # Extract the mapping from response - data = response.json() - df = pd.DataFrame(data["mapping"]) - else: - raise FamilyNotFound( - f"Error occurred while retrieving family <-> structures mapping for family {family_id} (status code: {response.status_code})" - ) +import wget + +from loguru import logger as log + + +class RFAMManager: + """ A utility for downloading RFAM families and their PDB structure IDs.""" - return df - - -def _get_RFAM_family_info(family_id: str, keys: Optional[List[str]] = None): - """ - Downloads DataFrame of PDB IDs annotated with RFAM families - :param family_id: RFAM ID - :type family_id: str - :param keys: List of keys from the family endpoint to annotate (https://docs.rfam.org/en/latest/api.html#family) - :type keys: Optional[List[str]] - :return: Pandas DataFrame with information about the structures of the RFAM family - :rtype: pd.DataFrame - """ - if keys is None: - keys = ["id", "description"] - - # Send an HTTP GET request to retrieve the data - response = requests.get( - f"{RFAM_API_URL}/{family_id}?content-type=application/json" - ) - - # Check if the request was successful (status code 200) - out = {} - if response.status_code == 200: - # Extract the family information from the response - data = response.json() - for k in keys: - out[k] = data["rfam"][k] - else: - raise FamilyNotFound( - f"Error occurred while retrieving data for family {family_id} (status code: {response.status_code})" + def __init__( + self, + root_dir: str = ".", + ): + # Arguments + self.root_dir = Path(root_dir) + + # Constants + self.rfam_families_url = "https://ftp.ebi.ac.uk/pub/databases/Rfam/CURRENT/database_files/family.txt.gz" + self.rfam_pdb_mapping_url = ( + "https://ftp.ebi.ac.uk/pub/databases/Rfam/CURRENT/Rfam.pdb.gz" ) - return out - - -def RFAM_families_df( - family_ids: Optional[List[str]] = None, - max_id: int = 4236, - family_info_keys: Optional[List[str]] = None, - verbose=True, -): - """ - Retrieves a DataFrame of PDB IDs annotated with RFAM families - :param family_ids: List of families to retrieve. If None, retrieves all families: RF00001, RF00002, ..., RF04236 - (we assume that RFAM family IDs are in increasing order, see: http://http.ebi.ac.uk/pub/databases/Rfam/CURRENT/fasta_files/) - :type family_ids: Optional[List[str]] - :param max_id: Maximum identifier to try. If family_ids is None, it'll query RFAM families RF00001 .. f'RF{max_id:05d}' - :type max_id: int - :param family_info_keys: List of keys from the family endpoint to annotate (https://docs.rfam.org/en/latest/api.html#family) - :type family_info_keys: Optional[List[str]] - :param verbose: Whether to print messages - :type verbose: bool - :return: Pandas DataFrame with information about the structures of each RFAM family - :rtype: pd.DataFrame - """ - if family_ids is None: - family_ids = [f"RF{i:05d}" for i in range(1, max_id + 1)] - - families_df = None - if verbose: - print("Retrieving RFAM families ...") - for family_id in tqdm(family_ids): - # Retrieve DF for a single family - try: - # Get family <-> structures mapping - df = _get_RFAM_family_df(family_id) - - # Annotate family with descriptions - out = _get_RFAM_family_info(family_id, keys=family_info_keys) - for k, v in out.items(): - df[k] = v - - # Concatenate - if families_df is None: - families_df = df - else: - families_df = pd.concat([families_df, df]) - except FamilyNotFound as e: - if verbose: - print(e) - continue - return families_df + self.rfam_dir = self.root_dir / "rfam" + if not os.path.exists(self.rfam_dir): + os.makedirs(self.rfam_dir) + + self.rfam_families_archive_filename = Path(self.rfam_families_url).name + self.rfam_families_filename = Path(self.rfam_families_url).stem + self.rfam_pdb_mapping_archive_filename = Path( + self.rfam_pdb_mapping_url + ).name + self.rfam_pdb_mapping_filename = Path(self.rfam_pdb_mapping_url).stem + + self.download_metadata() + + def download_metadata(self): + """ Download metadata mapping PDB structures to RFAM families """ + self._download_rfam_families() + self._download_rfam_pdb_mapping() + + def _download_rfam_families(self): + """Download RFAM families from + https://ftp.ebi.ac.uk/pub/databases/Rfam/CURRENT/database_files/family.txt.gz + """ + if not os.path.exists(self.rfam_dir / self.rfam_families_filename): + log.info("Downloading RFAM families...") + wget.download(self.rfam_families_url, out=str(self.rfam_dir)) + log.info("Downloaded RFAM families") + + # Unzip all collected families + if not os.path.exists(self.rfam_dir / self.rfam_families_filename): + log.info("Unzipping RFAM sequences...") + with gzip.open( + self.rfam_dir / self.rfam_families_archive_filename, "rb" + ) as f_in: + with open( + self.rfam_dir / self.rfam_families_filename, "wb" + ) as f_out: + shutil.copyfileobj(f_in, f_out) + log.info("Unzipped RFAM families") + + def _download_rfam_pdb_mapping(self): + """Download RFAM families from + https://ftp.ebi.ac.uk/pub/databases/Rfam/CURRENT/database_files/Rfam.pdb.gz + """ + if not os.path.exists(self.rfam_dir / self.rfam_pdb_mapping_filename): + log.info("Downloading RFAM family - PDB structure ID mapping ...") + wget.download(self.rfam_pdb_mapping_url, out=str(self.rfam_dir)) + log.info("Downloaded RFAM family - PDB structure ID mapping") + + # Unzip all collected mappings + if not os.path.exists(self.rfam_dir / self.rfam_pdb_mapping_filename): + log.info("Unzipping RFAM family - PDB structure ID mapping...") + with gzip.open( + self.rfam_dir / self.rfam_pdb_mapping_archive_filename, "rb" + ) as f_in: + with open( + self.rfam_dir / self.rfam_pdb_mapping_filename, "wb" + ) as f_out: + shutil.copyfileobj(f_in, f_out) + log.info("Unzipped RFAM family - PDB structure ID mapping") + + def _parse_rfam_families(self) -> pd.DataFrame: + """Parse the RFAM families metadata + + :return: Pandas DataFrame with information about the RFAM families + :rtype: pd.DataFrame + """ + df = pd.read_csv( + self.rfam_dir / self.rfam_families_filename, + sep="\t", + header=None, + encoding="ISO-8859-1", + ) + # Selecting accession, ID, and description + df = df[ + [0, 1, 3] + ] # TODO: Could select other fields such as comment for an extended description of the family? + df.columns = ["rfam_acc", "id", "description"] + df = df.set_index("rfam_acc") + return df + + def _parse_rfam_pdb_mapping(self) -> pd.DataFrame: + """Parse the PDB IDs annotated with RFAM families + + :return: Pandas DataFrame with information about the structures of the RFAM family + :rtype: pd.DataFrame + """ + df = pd.read_csv( + self.rfam_dir / self.rfam_pdb_mapping_filename, sep="\t", header=0 + ) + return df + + def parse_rfam(self) -> pd.DataFrame: + """Parse mapping between PDB structures and RFAM families """ + family_info_df = self._parse_rfam_families() + rfam_pdb_mapping_df = self._parse_rfam_pdb_mapping() + df = pd.merge( + rfam_pdb_mapping_df, + family_info_df, + left_on="rfam_acc", + right_index=True, + ) + return df if __name__ == "__main__": - family_IDs = None # ['RF10000'] - families_df = RFAM_families_df(family_IDs) - print(families_df) - # families_df.to_csv('RFAM_families_27062023.csv', index=False) + rfam_manager = RFAMManager() + df = rfam_manager.parse_rfam() + print(df.head()) From 40a5e84e7fdf32f3408d9eb9e4e3222da94b9b2d Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 28 Jun 2023 07:27:45 +0000 Subject: [PATCH 6/9] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- graphein/rna/download_rfam.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/graphein/rna/download_rfam.py b/graphein/rna/download_rfam.py index 334da74f3..3aa91e8f3 100644 --- a/graphein/rna/download_rfam.py +++ b/graphein/rna/download_rfam.py @@ -5,12 +5,11 @@ import pandas as pd import wget - from loguru import logger as log class RFAMManager: - """ A utility for downloading RFAM families and their PDB structure IDs.""" + """A utility for downloading RFAM families and their PDB structure IDs.""" def __init__( self, @@ -39,7 +38,7 @@ def __init__( self.download_metadata() def download_metadata(self): - """ Download metadata mapping PDB structures to RFAM families """ + """Download metadata mapping PDB structures to RFAM families""" self._download_rfam_families() self._download_rfam_pdb_mapping() @@ -117,7 +116,7 @@ def _parse_rfam_pdb_mapping(self) -> pd.DataFrame: return df def parse_rfam(self) -> pd.DataFrame: - """Parse mapping between PDB structures and RFAM families """ + """Parse mapping between PDB structures and RFAM families""" family_info_df = self._parse_rfam_families() rfam_pdb_mapping_df = self._parse_rfam_pdb_mapping() df = pd.merge( From da9a8e17edd1265f6e211d3b2c0720a2a8c31f18 Mon Sep 17 00:00:00 2001 From: Arian Jamasb Date: Tue, 4 Jul 2023 07:36:44 +0200 Subject: [PATCH 7/9] Add source and df attributes to mirror PDBManager --- graphein/rna/download_rfam.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/graphein/rna/download_rfam.py b/graphein/rna/download_rfam.py index 3aa91e8f3..45981e89a 100644 --- a/graphein/rna/download_rfam.py +++ b/graphein/rna/download_rfam.py @@ -1,8 +1,8 @@ import gzip import os import shutil -from pathlib import Path - +import pathlib +import copy import pandas as pd import wget from loguru import logger as log @@ -16,7 +16,7 @@ def __init__( root_dir: str = ".", ): # Arguments - self.root_dir = Path(root_dir) + self.root_dir = pathlib.Path(root_dir) # Constants self.rfam_families_url = "https://ftp.ebi.ac.uk/pub/databases/Rfam/CURRENT/database_files/family.txt.gz" @@ -28,15 +28,18 @@ def __init__( if not os.path.exists(self.rfam_dir): os.makedirs(self.rfam_dir) - self.rfam_families_archive_filename = Path(self.rfam_families_url).name - self.rfam_families_filename = Path(self.rfam_families_url).stem - self.rfam_pdb_mapping_archive_filename = Path( + self.rfam_families_archive_filename = pathlib.Path(self.rfam_families_url).name + self.rfam_families_filename = pathlib.Path(self.rfam_families_url).stem + self.rfam_pdb_mapping_archive_filename = pathlib.Path( self.rfam_pdb_mapping_url ).name - self.rfam_pdb_mapping_filename = Path(self.rfam_pdb_mapping_url).stem + self.rfam_pdb_mapping_filename = pathlib.Path(self.rfam_pdb_mapping_url).stem self.download_metadata() + self.source: pd.DataFrame = self.parse_rfam() + self.df: pd.DataFrame = copy.deepcopy(self.source) + def download_metadata(self): """Download metadata mapping PDB structures to RFAM families""" self._download_rfam_families() From b9b635c7b6b9d09387e6c57aaec6e9fc5fbbcd39 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 4 Jul 2023 05:37:02 +0000 Subject: [PATCH 8/9] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- graphein/rna/download_rfam.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/graphein/rna/download_rfam.py b/graphein/rna/download_rfam.py index 45981e89a..def291fce 100644 --- a/graphein/rna/download_rfam.py +++ b/graphein/rna/download_rfam.py @@ -1,8 +1,9 @@ +import copy import gzip import os -import shutil import pathlib -import copy +import shutil + import pandas as pd import wget from loguru import logger as log @@ -28,12 +29,16 @@ def __init__( if not os.path.exists(self.rfam_dir): os.makedirs(self.rfam_dir) - self.rfam_families_archive_filename = pathlib.Path(self.rfam_families_url).name + self.rfam_families_archive_filename = pathlib.Path( + self.rfam_families_url + ).name self.rfam_families_filename = pathlib.Path(self.rfam_families_url).stem self.rfam_pdb_mapping_archive_filename = pathlib.Path( self.rfam_pdb_mapping_url ).name - self.rfam_pdb_mapping_filename = pathlib.Path(self.rfam_pdb_mapping_url).stem + self.rfam_pdb_mapping_filename = pathlib.Path( + self.rfam_pdb_mapping_url + ).stem self.download_metadata() From ef9e4633e79995e8634f821eabd60551f87d0292 Mon Sep 17 00:00:00 2001 From: Arian Jamasb Date: Wed, 27 Mar 2024 11:56:05 +0100 Subject: [PATCH 9/9] bump changelog --- CHANGELOG.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 71ed5a4b7..f3885ed66 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,9 @@ +### 1.7.7 - UNRELEASED + + +#### New Features +* Adds RFAM Manager [#324](https://github.com/a-r-j/graphein/pull/324) + ### 1.7.1 - UNRELEASED #### API Changes