diff --git a/phylopypruner/msa.py b/phylopypruner/msa.py index 4b03267..6ac9b39 100644 --- a/phylopypruner/msa.py +++ b/phylopypruner/msa.py @@ -59,17 +59,16 @@ def add_sequence(self, seq_record=None, description="", sequence_data=""): seq_record = sequence.Sequence() seq_record.description = description seq_record.sequence_data = sequence_data - seq_record.otu = re.split(r"\||@", seq_record.description)[0] + seq_record.otu = re.split(r"\||@|_", seq_record.description)[0] try: - seq_record.identifier = re.split( - r"\||@", seq_record.description)[1] - except IndexError: - report.warning("no description found on split with | or @") + seq_record.identifier = re.search(r"[|@_]([^ ]*)", seq_record.description).group(1) + except AttributeError: + report.warning("no description found on split with | , _ or @") seq_record.identifier = None if description: - seq_record.otu = re.split(r"\||@", seq_record.description)[0] + seq_record.otu = re.split(r"\||@|_", seq_record.description)[0] if sequence_data: - seq_record.identifier = re.split(r"\||@", seq_record.description)[1] + seq_record.identifier = re.search(r"[|@_]([^ ]*)", seq_record.description).group(1) self.sequences.append(seq_record) return seq_record diff --git a/phylopypruner/run.py b/phylopypruner/run.py index 27c870d..8064390 100644 --- a/phylopypruner/run.py +++ b/phylopypruner/run.py @@ -16,14 +16,17 @@ def validate_input(msa, tree, tree_path): "Test to see if MSA and tree entries matches." - descriptions = list(msa.iter_descriptions()) - names = list(tree.iter_names()) - - if set(descriptions).intersection(names) < set(descriptions): - print("example tree names:", names[:2], file=sys.stderr) - print("example sequences:", descriptions[:2], file=sys.stderr) - report.error("MSA names don't match tree \n {}\n {}".format( - msa.filename, tree_path)) + descriptions = set(msa.iter_descriptions()) + names = set(tree.iter_names()) + + # Find items in descriptions not in names and vice versa + missing_in_names = descriptions - names + missing_in_descriptions = names - descriptions + + if missing_in_names or missing_in_descriptions: + print("Missing in tree names:", list(missing_in_names), file=sys.stderr) + print("Missing in sequences:", list(missing_in_descriptions), file=sys.stderr) + report.error("MSA names don't match tree \n {}\n {}".format(msa.filename, tree_path)) def run(settings, msa, tree): diff --git a/phylopypruner/sequence.py b/phylopypruner/sequence.py index 76fceec..0e9996d 100644 --- a/phylopypruner/sequence.py +++ b/phylopypruner/sequence.py @@ -21,7 +21,8 @@ def __init__(self, description="", sequence_data=""): self._sequence_data = str(sequence_data) self._is_alignment = bool(self.is_alignment) if description: - self._otu, self._identifier = re.split(r"\||@", description) + self._otu = re.split(r"\||@|_", description)[0] + self._identifier = re.search(r"[|@_]([^ ]*)", description).group(1) else: self._otu = "" self._identifier = "" diff --git a/phylopypruner/tree_node.py b/phylopypruner/tree_node.py index d6907e9..98d24e3 100644 --- a/phylopypruner/tree_node.py +++ b/phylopypruner/tree_node.py @@ -108,7 +108,7 @@ def iter_sisters(self): def otu(self): "Returns the OTU to which this node belongs." - return re.split(r"\||@", self.name)[0] + return re.split(r"\||@|_", self.name)[0] def is_root(self): "Returns True if this node lacks a parent." @@ -526,7 +526,7 @@ def iter_names(self): def iter_otus(self): "Returns an iterator object that includes all OTUs within this node." for name in self.iter_names(): - otu = re.split(r"\||@", name)[0] + otu = re.split(r"\||@|_", name)[0] yield otu def iter_identifiers(self): @@ -534,7 +534,7 @@ def iter_identifiers(self): Returns an iterator object that includes all identifiers in this node. """ for name in self.iter_names(): - identifier = re.split(r"\||@", name)[1] + identifier = re.search(r"[|@_]([^ ]*)", name).group(1) yield identifier def view(self):