Skip to content
Closed
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions modules/nf-core/blast/cdddownloader/environment.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
---
# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json
# Conda environment for the BLAST_CDDDOWNLOADER module.
# The gnu-wget pin below matches the 1.18 container tags referenced in main.nf.
channels:
  - conda-forge
  - bioconda
dependencies:
  - bioconda::gnu-wget=1.18
70 changes: 70 additions & 0 deletions modules/nf-core/blast/cdddownloader/main.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
process BLAST_CDDDOWNLOADER {
tag "$db_prefix"
label 'process_single'

conda "${moduleDir}/environment.yml"
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container
? 'https://depot.galaxyproject.org/singularity/gnu-wget:1.18--h36e9172_9'
: 'biocontainers/gnu-wget:1.18--h36e9172_9'}"

input:
val db_prefix

output:
path "cdd_databases/", emit: db
tuple val("${task.process}"), val('wget'), eval("wget --version | head -1 | cut -d ' ' -f 3"), topic: versions, emit: versions_wget
tuple val("${task.process}"), val('untar'), eval("tar --version 2>&1 | grep -oE '[0-9]+\\.[0-9]+\\.[0-9]+' | head -1"), topic: versions, emit: versions_tar

when:
task.ext.when == null || task.ext.when

script:
// Database prefixes accepted by this module. Each one maps to an archive named
// "<prefix>_LE.tar.gz" in the NCBI little_endian directory fetched further down.
def known_prefixes = ['Cdd', 'Cdd_NCBI', 'Cog', 'Kog', 'Pfam', 'Prk', 'Smart', 'Tigr']

// Compare as a plain String: a GString never equals a String inside List.contains().
// Anything not in the list (including an empty or null value) falls back to Cdd_NCBI.
// 'Cdd_NCBI' itself is a valid choice and must not trigger the warning.
if ( !known_prefixes.contains("${db_prefix}".toString()) ) {
    log.warn("Unknown CDD database name (${db_prefix}): falling back to the default Cdd_NCBI database")
    // The script below names the output sub-directory after db_prefix, so it is reset here.
    db_prefix = 'Cdd_NCBI'
}

// Archive base name used by the wget/tar commands in the script body.
def db_name = "${db_prefix}_LE"

"""
mkdir -p cdd_databases/${db_prefix}
cd cdd_databases/
mkdir data

echo "Downloading ${db_prefix} database into ${db_prefix} dir"

wget https://ftp.ncbi.nlm.nih.gov/pub/mmdb/cdd/little_endian/${db_name}.tar.gz
tar -xzf ${db_name}.tar.gz -C ./${db_prefix}
rm -f ${db_name}.tar.gz

echo "Downloading metadata files"

wget https://ftp.ncbi.nih.gov/pub/mmdb/cdd/cddid.tbl.gz -O ./data/cddid.tbl.gz && gzip -d ./data/cddid.tbl.gz
wget https://ftp.ncbi.nih.gov/pub/mmdb/cdd/cdtrack.txt -O ./data/cdtrack.txt
wget https://ftp.ncbi.nih.gov/pub/mmdb/cdd/family_superfamily_links -O ./data/family_superfamily_links
wget https://ftp.ncbi.nih.gov/pub/mmdb/cdd/cddannot.dat.gz -O ./data/cddannot.dat.gz && gzip -d ./data/cddannot.dat.gz
wget https://ftp.ncbi.nih.gov/pub/mmdb/cdd/cddannot_generic.dat.gz -O ./data/cddannot_generic.dat.gz && gzip -d ./data/cddannot_generic.dat.gz
wget https://ftp.ncbi.nih.gov/pub/mmdb/cdd/bitscore_specific.txt -O ./data/bitscore_specific.txt
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are these necessary to be downloaded? Next to every db every time?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

One set of them is enough for any database. These are "universal"

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am wondering how this module will handle multiple dbs. You'll have to call it as many times as dbs you want, and then all these files will be downloaded again, and then you'll have multiple copies of the cdd_databases folder.
I am not familiar with the tree structure, but what's the plan for moving everything in a single cdd_databases folder to be used by the pipeline for multiple dbs? Or is the idea to only use one db at a time?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The rpsblast will run with one database at a time. The Cdd database contains all the rest of the DBs. In case someone wants to run the analysis versus all the databases, they should choose Cdd or the corresponding subset. The idea is to check whether the database exists in the calling workflow and run the downloader only when the database is not present. I am not sure how to tell the pipeline to fix the location of the downloaded DB. We probably should do something like this:
https://nf-co.re/funcscan/3.0.0/parameters/#database-downloading-options

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I will add an input for the downloading dir and checks for existing dbs

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have added handlers to deal with existing db folders


echo "Finish"

"""

stub:
"""
mkdir cdd_databases/
"""
}
71 changes: 71 additions & 0 deletions modules/nf-core/blast/cdddownloader/meta.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json
# Metadata for the BLAST_CDDDOWNLOADER module: describes the single string input,
# the downloaded database directory, and the two version tuples (wget and tar)
# that main.nf publishes to the "versions" topic.
name: "blast_cdddownloader"
description: CDD databases downloader
keywords:
  - cdd
  - rpsblast
  - databases
  - metadata
  - wget
  - tar
tools:
  - wget:
      description: "wget is a free utility for non-interactive download of files from
        the Web."
      homepage: "https://www.gnu.org/software/wget/"
      documentation: "https://www.gnu.org/software/wget/manual/wget.html"
      licence: ["GPL-3.0-or-later"]
      identifier: ""
  - untar:
      description: |
        Extract tar, tar.gz, tar.bz2, tar.xz files.
      documentation: https://www.gnu.org/software/tar/manual/
      licence: ["GPL-3.0-or-later"]
      identifier: ""
input:
  - db_prefix:
      type: string
      description: |
        Specify the database to be downloaded from https://ftp.ncbi.nih.gov/pub/mmdb/cdd/little_endian/
        Any value outside the listed pattern falls back to Cdd_NCBI.
      pattern: "Cdd|Cdd_NCBI|Cog|Kog|Pfam|Prk|Smart|Tigr"
output:
  db:
    - cdd_databases/:
        type: directory
        description: Directory containing the CDD database files
        pattern: "cdd_databases/"
  versions_wget:
    - - ${task.process}:
          type: string
          description: The name of the process
      - wget:
          type: string
          description: The name of the tool
      - wget --version | head -1 | cut -d ' ' -f 3:
          type: eval
          description: The expression to obtain the version of the tool
  versions_tar:
    - - ${task.process}:
          type: string
          description: The name of the process
      - untar:
          type: string
          description: The name of the tool
      - tar --version 2>&1 | grep -oE '[0-9]+\\.[0-9]+\\.[0-9]+' | head -1:
          type: eval
          description: The expression to obtain the version of the tool
# Both version tuples above are emitted with "topic: versions" in main.nf,
# so both are listed here.
topics:
  versions:
    - - ${task.process}:
          type: string
          description: The name of the process
      - wget:
          type: string
          description: The name of the tool
      - wget --version | head -1 | cut -d ' ' -f 3:
          type: eval
          description: The expression to obtain the version of the tool
    - - ${task.process}:
          type: string
          description: The name of the process
      - untar:
          type: string
          description: The name of the tool
      - tar --version 2>&1 | grep -oE '[0-9]+\\.[0-9]+\\.[0-9]+' | head -1:
          type: eval
          description: The expression to obtain the version of the tool
authors:
  - "@Ales-ibt"
maintainers:
  - "@Ales-ibt"
102 changes: 102 additions & 0 deletions modules/nf-core/blast/cdddownloader/tests/main.nf.test
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
// nf-test suite for BLAST_CDDDOWNLOADER.
// Covers one named database ('Smart') and the fallback path (empty prefix),
// each as a real run and as a stub run. Every test snapshots the `db` output
// together with all output channels whose name starts with "versions".
// `process.success` is asserted inside assertAll so that a failed process and a
// snapshot mismatch are reported together instead of the first one aborting the test.
nextflow_process {

    name "Test Process BLAST_CDDDOWNLOADER"
    script "../main.nf"
    process "BLAST_CDDDOWNLOADER"

    tag "modules"
    tag "modules_nfcore"
    tag "blast"
    tag "blast/cdddownloader"

    // Real run: the module script fetches its files with wget from the NCBI FTP server.
    // NOTE(review): the snapshot covers downloaded content, so it presumably breaks
    // whenever NCBI republishes the archive - confirm whether file names alone should be snapshotted.
    test("cdddownload - smart") {
        when {
            process {
                """
                input[0] = 'Smart'
                """
            }
        }

        then {
            assertAll(
                { assert process.success },
                { assert snapshot(
                    process.out.db,
                    process.out.findAll { key, val -> key.startsWith("versions")}
                    ).match() }
            )
        }

    }

    // Real run with an empty prefix: exercises the fallback to the Cdd_NCBI database.
    test("cdddownload - default Cdd_NCBI") {
        when {
            process {
                """
                input[0] = ''
                """
            }
        }

        then {
            assertAll(
                { assert process.success },
                { assert snapshot(
                    process.out.db,
                    process.out.findAll { key, val -> key.startsWith("versions")}
                    ).match() }
            )
        }

    }

    // Stub run: only the stub block of the process is executed.
    test("cdddownload - smart - stub") {

        options "-stub"

        when {
            process {
                """
                input[0] = 'Smart'
                """
            }
        }

        then {
            assertAll(
                { assert process.success },
                { assert snapshot(
                    process.out.db,
                    process.out.findAll { key, val -> key.startsWith("versions")}
                    ).match() }
            )
        }

    }

    // Stub run with an empty prefix.
    test("cdddownload - default Cdd_NCBI - stub") {

        options "-stub"

        when {
            process {
                """
                input[0] = ''
                """
            }
        }

        then {
            assertAll(
                { assert process.success },
                { assert snapshot(
                    process.out.db,
                    process.out.findAll { key, val -> key.startsWith("versions")}
                    ).match() }
            )
        }

    }

}
Loading
Loading