Skip to content

Commit fd08c25

Browse files
authored
adding seqkit grep (#3500)
* adding seqkit grep * removing todo * Implementing suggestions and test * Fixing tests
1 parent ecdc480 commit fd08c25

6 files changed

Lines changed: 180 additions & 0 deletions

File tree

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
// Filter a FASTA/FASTQ file with `seqkit grep`.
//
// Inputs:
//   meta     - Groovy map with sample information; `meta.id` is used as the task tag
//              and as the default output prefix.
//   sequence - FASTA/FASTQ file (optionally gzipped) to search.
//   pattern  - optional pattern file passed to seqkit via `-f`; pass `[]` to omit it and
//              supply a pattern through `task.ext.args` instead (e.g. `-p <string>`).
//
// Outputs:
//   filter   - tuple of meta and the gzipped result, `<prefix>.<suffix>.gz`.
//   versions - versions.yml recording the seqkit version.
//
// NOTE: the `filter` output glob only captures `*.fa.gz` / `*.fq.gz`, so a custom
// `task.ext.suffix` must be either `fa` or `fq` for the output to be collected.
process SEQKIT_GREP {
    tag "$meta.id"
    label 'process_low'

    conda "bioconda::seqkit=2.4.0"
    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
        'https://depot.galaxyproject.org/singularity/seqkit:2.4.0--h9ee0642_0':
        'biocontainers/seqkit:2.4.0--h9ee0642_0' }"

    input:
    tuple val(meta), path(sequence)
    path pattern

    output:
    tuple val(meta), path("*.{fa,fq}.gz")  , emit: filter
    path "versions.yml"                    , emit: versions

    when:
    task.ext.when == null || task.ext.when

    script:
    def args   = task.ext.args ?: ''
    def prefix = task.ext.prefix ?: "${meta.id}"
    // Input whose name ends in f + any of [a,s,t,n] + a (fa, fasta, fna, faa, ...),
    // optionally followed by a literal ".gz", is treated as FASTA; anything else as FASTQ.
    def suffix = task.ext.suffix ?: (("${sequence}" ==~ /(.*f[astn]*a(\.gz)?$)/) ? "fa" : "fq")
    // Only add `-f` when a pattern file was actually supplied (an empty list is falsy).
    def pattern_file = pattern ? "-f ${pattern}" : ""
    // The input is staged in the task directory; writing to the same name would make
    // seqkit read and write one file at once.
    if ("${sequence}" == "${prefix}.${suffix}.gz") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!"

    """
    seqkit \\
        grep \\
        $args \\
        --threads $task.cpus \\
        ${pattern_file} \\
        ${sequence} \\
        -o ${prefix}.${suffix}.gz

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        seqkit: \$( seqkit version | sed 's/seqkit v//' )
    END_VERSIONS
    """

    stub:
    def prefix = task.ext.prefix ?: "${meta.id}"
    // Same FASTA/FASTQ detection as in the script section, so stub runs emit the same file name.
    def suffix = task.ext.suffix ?: (("${sequence}" ==~ /(.*f[astn]*a(\.gz)?$)/) ? "fa" : "fq")
    if ("${sequence}" == "${prefix}.${suffix}.gz") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!"

    """
    touch ${prefix}.${suffix}.gz

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        seqkit: \$( seqkit version | sed 's/seqkit v//' )
    END_VERSIONS
    """
}
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
---
2+
# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/yaml-schema.json
3+
name: "seqkit_grep"
4+
description: Select sequences from a large file based on name/ID
5+
keywords:
6+
- filter
7+
- seqkit
8+
- subseq
9+
- grep
10+
tools:
11+
- "seqkit":
12+
description: Cross-platform and ultrafast toolkit for FASTA/Q file manipulation, written by Wei Shen.
13+
homepage: https://bioinf.shenwei.me/seqkit/usage/
14+
documentation: https://bioinf.shenwei.me/seqkit/usage/
15+
tool_dev_url: https://github.com/shenwei356/seqkit/
16+
doi: "10.1371/journal.pone.0163962"
17+
licence: ["MIT"]
18+
19+
input:
20+
- meta:
21+
type: map
22+
description: >
23+
Groovy Map containing sample information
24+
e.g. [ id:'test', single_end:false ]
25+
26+
- sequence:
27+
type: file
28+
description: >
29+
Fasta or fastq file containing sequences to be filtered
30+
pattern: "*.{fa,fna,faa,fasta,fq,fastq}[.gz]"
31+
32+
- pattern:
33+
type: file
34+
description: >
35+
pattern file (one record per line). If no pattern file is given, a pattern string can be specified within the args using '-p pattern_string'
36+
pattern: "*.{txt,tsv}"
37+
38+
output:
39+
- meta:
40+
type: map
41+
description: >
42+
Groovy Map containing sample information
43+
e.g. [ id:'test', single_end:false ]
44+
45+
- versions:
46+
type: file
47+
description: File containing software versions
48+
pattern: "versions.yml"
49+
50+
- filter:
51+
type: file
52+
description: >
53+
Fasta or fastq file containing the filtered sequences
54+
pattern: "*.{fa,fq}[.gz]"
55+
56+
authors:
57+
- "@Joon-Klaps"

tests/config/pytest_modules.yml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3139,6 +3139,10 @@ sentieon/varcal:
31393139
- modules/nf-core/sentieon/varcal/**
31403140
- tests/modules/nf-core/sentieon/varcal/**
31413141

3142+
seqkit/grep:
3143+
- modules/nf-core/seqkit/grep/**
3144+
- tests/modules/nf-core/seqkit/grep/**
3145+
31423146
seqkit/pair:
31433147
- modules/nf-core/seqkit/pair/**
31443148
- tests/modules/nf-core/seqkit/pair/**
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
#!/usr/bin/env nextflow
2+
3+
nextflow.enable.dsl = 2
4+
5+
include { SEQKIT_GREP as SEQKIT_GREP_FILE } from '../../../../../modules/nf-core/seqkit/grep/main.nf'
6+
include { SEQKIT_GREP as SEQKIT_GREP_STRING } from '../../../../../modules/nf-core/seqkit/grep/main.nf'
7+
8+
// Runs SEQKIT_GREP_FILE on the chr21 genome FASTA, taking the patterns from a file.
workflow test_seqkit_grep_file {

    // Sample metadata and the FASTA file that together form the first input tuple.
    meta  = [ id:'test', single_end:false ]
    fasta = file(params.test_data['homo_sapiens']['genome']['genome_21_fasta'], checkIfExists: true)

    // Pattern file handed to the module's second input.
    ids = file(params.test_data['homo_sapiens']['genome']['genome_header'], checkIfExists: true)

    SEQKIT_GREP_FILE ( [ meta, fasta ], [ ids ] )
}
21+
22+
// Runs SEQKIT_GREP_STRING on the gzipped genome FASTA without a pattern file.
workflow test_seqkit_grep_string {

    // Sample metadata and the gzipped FASTA that together form the first input tuple.
    meta  = [ id:'test', single_end:false ]
    fasta = file(params.test_data['homo_sapiens']['genome']['genome_fasta_gz'], checkIfExists: true)

    // An empty list is passed as the pattern input, i.e. no pattern file.
    SEQKIT_GREP_STRING ( [ meta, fasta ], [] )
}
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
process {

    // Publish into <outdir>/<tool>, where <tool> is the lower-cased text before the first
    // underscore of the last ':'-separated component of the process name.
    publishDir = {
        def simple_name = task.process.tokenize(':')[-1]
        def tool_name   = simple_name.tokenize('_')[0].toLowerCase()
        "${params.outdir}/${tool_name}"
    }

    // Extra command-line arguments applied only to the SEQKIT_GREP_STRING process.
    withName: SEQKIT_GREP_STRING {
        ext.args = "-p chr22"
    }

}
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
- name: seqkit grep test_seqkit_grep_file
2+
command: nextflow run ./tests/modules/nf-core/seqkit/grep -entry test_seqkit_grep_file -c ./tests/config/nextflow.config -c ./tests/modules/nf-core/seqkit/grep/nextflow.config
3+
tags:
4+
- seqkit
5+
- seqkit/grep
6+
files:
7+
- path: output/seqkit/test.fa.gz
8+
md5sum: d2c774388d0cffb4663a2c4b5b6dd5df
9+
- path: output/seqkit/versions.yml
10+
11+
- name: seqkit grep test_seqkit_grep_string
12+
command: nextflow run ./tests/modules/nf-core/seqkit/grep -entry test_seqkit_grep_string -c ./tests/config/nextflow.config -c ./tests/modules/nf-core/seqkit/grep/nextflow.config
13+
tags:
14+
- seqkit
15+
- seqkit/grep
16+
files:
17+
- path: output/seqkit/test.fa.gz
18+
md5sum: 9179d23d51c48c03c0f68479440db36c
19+
- path: output/seqkit/versions.yml

0 commit comments

Comments
 (0)