-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathextract_cds_gbk.py
More file actions
42 lines (38 loc) · 1.63 KB
/
extract_cds_gbk.py
File metadata and controls
42 lines (38 loc) · 1.63 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
#22/08/23, AKF, with assistance from ChatGPT
#Use-case: extracting cds from Genbank file (tested with genbanks generated by Prokka with the --compliant flag)
#Requires: Python3, Biopython
#Sample implementation with bash:
##!/bin/sh
##activate conda env with biopython enabled
#source activate my_conda_env
##initialise empty output file
#touch my_cds_hits.fasta
##list all genbank files (second level of directory) and extract cds to output
#for i in $(ls */*.gbk); do
# python3 extract_cds.py "$i" Beta-lactamase >> my_cds_hits.fasta
#done
#conda deactivate
from Bio import SeqIO
import sys
import os
if len(sys.argv) != 3:
print("Usage: python3 extract_cds.py <genbank_file> <product>")
sys.exit(1)
genbank_path = sys.argv[1]
product = sys.argv[2]
base_filename = os.path.splitext(os.path.basename(genbank_path))[0]
base_filename = base_filename.replace("_prokka", "") # Remove _prokka suffix -> mod here for other use cases
# If input is a directory, extract the directory name as part of the header
if os.path.isdir(genbank_path):
dir_name = os.path.basename(os.path.normpath(genbank_path))
fasta_header = f">{dir_name} {product}\n"
else:
fasta_header = f">{base_filename} {product}\n"
# Parse the GenBank file(s) and extract translations
for record in SeqIO.parse(genbank_path, "genbank"):
for feature in record.features:
if feature.type == "CDS":
if "product" in feature.qualifiers and product in feature.qualifiers["product"][0]:
if "translation" in feature.qualifiers:
translation = feature.qualifiers["translation"][0]
print(fasta_header + translation)