diff --git a/subworkflows/nf-core/fastq_qc_trim_filter_setstrandedness/main.nf b/subworkflows/nf-core/fastq_qc_trim_filter_setstrandedness/main.nf index c655af41588b..f4cc1fa15a0a 100644 --- a/subworkflows/nf-core/fastq_qc_trim_filter_setstrandedness/main.nf +++ b/subworkflows/nf-core/fastq_qc_trim_filter_setstrandedness/main.nf @@ -1,13 +1,17 @@ import groovy.json.JsonSlurper -include { BBMAP_BBSPLIT } from '../../../modules/nf-core/bbmap/bbsplit' -include { CAT_FASTQ } from '../../../modules/nf-core/cat/fastq/main' -include { SORTMERNA } from '../../../modules/nf-core/sortmerna/main' -include { SORTMERNA as SORTMERNA_INDEX } from '../../../modules/nf-core/sortmerna/main' - -include { FASTQ_SUBSAMPLE_FQ_SALMON } from '../fastq_subsample_fq_salmon' -include { FASTQ_FASTQC_UMITOOLS_TRIMGALORE } from '../fastq_fastqc_umitools_trimgalore' -include { FASTQ_FASTQC_UMITOOLS_FASTP } from '../fastq_fastqc_umitools_fastp' +include { BBMAP_BBSPLIT } from '../../../modules/nf-core/bbmap/bbsplit' +include { CAT_FASTQ } from '../../../modules/nf-core/cat/fastq/main' +include { SORTMERNA } from '../../../modules/nf-core/sortmerna/main' +include { SORTMERNA as SORTMERNA_INDEX } from '../../../modules/nf-core/sortmerna/main' +include { FQ_LINT } from '../../../modules/nf-core/fq/lint/main' +include { FQ_LINT as FQ_LINT_AFTER_TRIMMING } from '../../../modules/nf-core/fq/lint/main' +include { FQ_LINT as FQ_LINT_AFTER_BBMAP } from '../../../modules/nf-core/fq/lint/main' +include { FQ_LINT as FQ_LINT_AFTER_SORTMERNA } from '../../../modules/nf-core/fq/lint/main' + +include { FASTQ_SUBSAMPLE_FQ_SALMON } from '../fastq_subsample_fq_salmon' +include { FASTQ_FASTQC_UMITOOLS_TRIMGALORE } from '../fastq_fastqc_umitools_trimgalore' +include { FASTQ_FASTQC_UMITOOLS_FASTP } from '../fastq_fastqc_umitools_fastp' def pass_trimmed_reads = [:] @@ -106,6 +110,7 @@ workflow FASTQ_QC_TRIM_FILTER_SETSTRANDEDNESS { umi_discard_read // integer: 0, 1 or 2 stranded_threshold // float: The fraction of 
stranded reads that must be assigned to a strandedness for confident assignment. Must be at least 0.5 unstranded_threshold // float: The difference in fraction of stranded reads assigned to 'forward' and 'reverse' below which a sample is classified as 'unstranded' + skip_linting // boolean: true/false main: @@ -113,6 +118,19 @@ workflow FASTQ_QC_TRIM_FILTER_SETSTRANDEDNESS { ch_filtered_reads = Channel.empty() ch_trim_read_count = Channel.empty() ch_multiqc_files = Channel.empty() + ch_lint_log = Channel.empty() + + // + // MODULE: Lint FastQ files + // + if(!skip_linting) { + FQ_LINT ( + ch_reads.map{ meta, fastqs -> [meta, fastqs.flatten()] } + ) + ch_versions = ch_versions.mix(FQ_LINT.out.versions.first()) + ch_lint_log = ch_lint_log.mix(FQ_LINT.out.lint) + ch_reads = ch_reads.join(FQ_LINT.out.lint.map{it[0]}) + } ch_reads .branch { @@ -212,6 +230,14 @@ workflow FASTQ_QC_TRIM_FILTER_SETSTRANDEDNESS { .map { [[:], it] } ) + if((!skip_linting) && (!skip_trimming)) { + FQ_LINT_AFTER_TRIMMING ( + ch_filtered_reads + ) + ch_lint_log = ch_lint_log.mix(FQ_LINT_AFTER_TRIMMING.out.lint) + ch_filtered_reads = ch_filtered_reads.join(FQ_LINT_AFTER_TRIMMING.out.lint.map{it[0]}) + } + // // MODULE: Remove genome contaminant reads // @@ -228,6 +254,14 @@ workflow FASTQ_QC_TRIM_FILTER_SETSTRANDEDNESS { .set { ch_filtered_reads } ch_versions = ch_versions.mix(BBMAP_BBSPLIT.out.versions.first()) + + if(!skip_linting) { + FQ_LINT_AFTER_BBMAP ( + ch_filtered_reads + ) + ch_lint_log = ch_lint_log.mix(FQ_LINT_AFTER_BBMAP.out.lint) + ch_filtered_reads = ch_filtered_reads.join(FQ_LINT_AFTER_BBMAP.out.lint.map{it[0]}) + } } // @@ -260,6 +294,14 @@ workflow FASTQ_QC_TRIM_FILTER_SETSTRANDEDNESS { .mix(SORTMERNA.out.log) ch_versions = ch_versions.mix(SORTMERNA.out.versions.first()) + + if(!skip_linting) { + FQ_LINT_AFTER_SORTMERNA ( + ch_filtered_reads + ) + ch_lint_log = ch_lint_log.mix(FQ_LINT_AFTER_SORTMERNA.out.lint) + ch_filtered_reads = 
ch_filtered_reads.join(FQ_LINT_AFTER_SORTMERNA.out.lint.map{it[0]}) + } } // Branch FastQ channels if 'auto' specified to infer strandedness @@ -312,6 +354,7 @@ workflow FASTQ_QC_TRIM_FILTER_SETSTRANDEDNESS { emit: + lint_log = ch_lint_log reads = ch_strand_inferred_fastq trim_read_count = ch_trim_read_count diff --git a/subworkflows/nf-core/fastq_qc_trim_filter_setstrandedness/meta.yml b/subworkflows/nf-core/fastq_qc_trim_filter_setstrandedness/meta.yml index 6f92f56a0a02..433837d709ca 100644 --- a/subworkflows/nf-core/fastq_qc_trim_filter_setstrandedness/meta.yml +++ b/subworkflows/nf-core/fastq_qc_trim_filter_setstrandedness/meta.yml @@ -1,6 +1,5 @@ -# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/subworkflows/yaml-schema.json name: "fastq_qc_trim_filter_setstrandedness" -description: Basic FASTQ preprocessing for RNA-seq +description: Performs linting, quality control, trimming, filtering, and strandedness determination on RNA-seq FASTQ files, preparing them for downstream analysis. keywords: - fastq - rnaseq @@ -19,39 +18,84 @@ components: - fastq_fastqc_umitools_trimgalore - fastq_fastqc_umitools_fastp input: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test' ] - ch_reads: - type: file - description: | - Channel with input FastQ files of size 1 and 2 for single-end and - paired-end data, respectively. + description: Channel with input FastQ files + structure: + - meta: + type: map + description: Groovy Map containing sample information e.g. 
[ id:'test' ] + - reads: + type: file + description: FastQ files + pattern: "*.{fq,fastq}{,.gz}" - ch_fasta: - type: file description: Channel with genome sequence in fasta format + structure: + - meta: + type: map + description: Metadata for the fasta file + - fasta: + type: file + description: Genome fasta file + pattern: "*.{fa,fasta}" - ch_transcript_fasta: - type: file description: Channel with transcriptome sequence in fasta format + structure: + - meta: + type: map + description: Metadata for the transcript fasta file + - fasta: + type: file + description: Transcript fasta file + pattern: "*.{fa,fasta}" - ch_gtf: - type: file description: Channel with features in GTF format + structure: + - meta: + type: map + description: Metadata for the GTF file + - gtf: + type: file + description: GTF file + pattern: "*.gtf" - ch_salmon_index: - type: file description: Directory containing Salmon index + structure: + - meta: + type: map + description: Metadata for the Salmon index + - index: + type: directory + description: Salmon index directory - ch_sortmerna_index: - type: file description: Directory containing sortmerna index + structure: + - meta: + type: map + description: Metadata for the SortMeRNA index + - index: + type: directory + description: SortMeRNA index directory - ch_bbsplit_index: - type: file description: Path to directory or tar.gz archive for pre-built BBSplit index + structure: + - meta: + type: map + description: Metadata for the BBSplit index + - index: + type: file + description: BBSplit index directory or tar.gz archive + pattern: "{*,*.tar.gz}" - ch_rrna_fastas: - type: file - description: | - Channel containing one or more FASTA files containing rRNA sequences - for use with SortMeRNA + description: Channel containing one or more FASTA files containing rRNA sequences for use with SortMeRNA + structure: + - meta: + type: map + description: Metadata for the rRNA fasta files + - fasta: + type: file + description: rRNA fasta files + pattern: 
"*.{fa,fasta}" - skip_bbsplit: type: boolean description: Whether to skip BBSplit for removal of non-reference genome reads @@ -63,9 +107,7 @@ input: description: Whether to skip trimming - skip_umi_extract: type: boolean - description: | - Skip the UMI extraction from the read in case the UMIs have been moved - to the headers in advance of the pipeline run + description: Skip the UMI extraction from the read in case the UMIs have been moved to the headers in advance of the pipeline run - make_salmon_index: type: boolean description: Whether to create salmon index before running salmon quant @@ -74,14 +116,10 @@ input: description: Whether to create sortmerna index before running sortmerna - trimmer: type: string - description: | - Specifies the trimming tool to use - available options are 'trimgalore' - and 'fastp' + description: Specifies the trimming tool to use - available options are 'trimgalore' and 'fastp' - min_trimmed_reads: type: integer - description: | - Minimum number of trimmed reads below which samples are removed from - further processing + description: Minimum number of trimmed reads below which samples are removed from further processing - save_trimmed: type: boolean description: Save the trimmed FastQ files in the results directory? @@ -93,39 +131,66 @@ input: description: Enable UMI-based read deduplication - umi_discard_read: type: integer - description: | - After UMI barcode extraction discard either R1 or R2 by setting this - parameter to 1 or 2, respectively + description: After UMI barcode extraction discard either R1 or R2 by setting this parameter to 1 or 2, respectively - stranded_threshold: type: float min: 0.5 - description: | - The fraction of stranded reads that must be assigned to a strandedness - for confident assignment. Must be at least 0.5. + description: The fraction of stranded reads that must be assigned to a strandedness for confident assignment. Must be at least 0.5. 
- unstranded_threshold: type: float - description: | - The difference in fraction of stranded reads assigned to 'forward' and - 'reverse' below which a sample is classified as 'unstranded'. + description: The difference in fraction of stranded reads assigned to 'forward' and 'reverse' below which a sample is classified as 'unstranded'. + - skip_linting: + type: boolean + description: Whether to skip linting of FastQ files output: - reads: - type: file description: Preprocessed fastq reads - pattern: "*.{fq,fastq}{,.gz}" + structure: + - meta: + type: map + description: Metadata for the preprocessed reads + - reads: + type: file + description: Preprocessed FastQ files + pattern: "*.{fq,fastq}{,.gz}" - multiqc_files: - type: file - description: MultiQC-compatible output files from tools used in prepreocessing - pattern: "*" + description: MultiQC-compatible output files from tools used in preprocessing + structure: + - meta: + type: map + description: Metadata for the MultiQC files + - mqc: + type: file + description: MultiQC-compatible files + pattern: "*" - trim_read_count: - type: integer description: Number of reads remaining after trimming for all input samples + structure: + - meta: + type: map + description: Metadata for the trim read count + - count: + type: integer + description: Number of reads after trimming - versions: - type: file - description: | - File containing software versions - Structure: [ path(versions.yml) ] - pattern: "versions.yml" + description: File containing software versions + structure: + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - lint_log: + description: Log files from FastQ linting + structure: + - meta: + type: map + description: Metadata for the lint log + - log: + type: file + description: FastQ lint log file + pattern: "*.log" + authors: - "@pinin4fjords" maintainers: diff --git a/subworkflows/nf-core/fastq_qc_trim_filter_setstrandedness/tests/main.nf.test 
b/subworkflows/nf-core/fastq_qc_trim_filter_setstrandedness/tests/main.nf.test index 5242f2bee6c3..99c4d931a17d 100644 --- a/subworkflows/nf-core/fastq_qc_trim_filter_setstrandedness/tests/main.nf.test +++ b/subworkflows/nf-core/fastq_qc_trim_filter_setstrandedness/tests/main.nf.test @@ -40,7 +40,7 @@ nextflow_workflow { input[4] = [] // ch_salmon_index input[5] = [] // ch_sortmerna_index input[6] = [] // ch_bbsplit_index - input[7] = Channel.of(file('https://raw.githubusercontent.com/biocore/sortmerna/v4.3.4/data/rRNA_databases/rfam-5.8s-database-id98.fasta', checkIfExists: true)) // ch_rrna_fastas + input[7] = Channel.of(file('https://raw.githubusercontent.com/biocore/sortmerna/v4.3.4/data/rRNA_databases/rfam-5.8s-database-id98.fasta', checkIfExists: true)) // ch_rrna_fastas input[8] = true // skip_bbsplit input[9] = false // skip_fastqc input[10] = false // skip_trimming @@ -55,6 +55,7 @@ nextflow_workflow { input[19] = 0 // umi_discard_read input[20] = 0.8 // stranded_threshold input[21] = 0.1 // unstranded_threshold + input[22] = false // skip_linting """ } } @@ -62,13 +63,22 @@ nextflow_workflow { then { def pelines1 = path(workflow.out.reads[0][1][0]).linesGzip def pelines2 = path(workflow.out.reads[0][1][1]).linesGzip + + // First part of each fq lint report line is a timestamp, remove it before snapshotting + def processed_sortmerna_lint_report = path(workflow.out.lint_log.find { entry -> entry[1].contains('sortmerna')}?.getAt(1)) + .getText() + .readLines() + .collect { line -> line.split(' ', 2)[1] } // Split by the first space and take everything after it + .join('\n') // Join the processed lines back into a single text block + assertAll( { assert workflow.success}, { assert snapshot(pelines1).md5().match("fastp_test_pe_reads_1_lines") }, { assert snapshot(pelines1.size()).match("fastp_test_pe_reads_1_size") }, { assert snapshot(pelines2).md5().match("fastp_test_pe_reads_2_lines") }, { assert snapshot(pelines2.size()).match("fastp_test_pe_reads_2_size") 
}, - { assert snapshot(workflow.out.trim_read_count).match("fastp_read_count") } + { assert snapshot(workflow.out.trim_read_count).match("fastp_read_count") }, + { assert snapshot(processed_sortmerna_lint_report).md5().match("fastp_lint") } // This doesn't work- 'cat' changes between Conda and Docker - // leaving it here until we find a way to address that // { assert snapshot(workflow.out.versions).match("fastp_versions") } @@ -95,7 +105,7 @@ nextflow_workflow { input[4] = [] // ch_salmon_index input[5] = [] // ch_sortmerna_index input[6] = [] // ch_bbsplit_index - input[7] = Channel.of(file('https://raw.githubusercontent.com/biocore/sortmerna/v4.3.4/data/rRNA_databases/rfam-5.8s-database-id98.fasta', checkIfExists: true)) // ch_rrna_fastas + input[7] = Channel.of(file('https://raw.githubusercontent.com/biocore/sortmerna/v4.3.4/data/rRNA_databases/rfam-5.8s-database-id98.fasta', checkIfExists: true)) // ch_rrna_fastas input[8] = true // skip_bbsplit input[9] = false // skip_fastqc input[10] = false // skip_trimming @@ -110,6 +120,7 @@ nextflow_workflow { input[19] = 0 // umi_discard_read input[20] = 0.8 // stranded_threshold input[21] = 0.1 // unstranded_threshold + input[22] = false // skip_linting """ } } @@ -117,19 +128,28 @@ nextflow_workflow { then { def pelines1 = path(workflow.out.reads[0][1][0]).linesGzip def pelines2 = path(workflow.out.reads[0][1][1]).linesGzip + + // First part of each fq lint report line is a timestamp, remove it before snapshotting + def processed_sortmerna_lint_report = path(workflow.out.lint_log.find { entry -> entry[1].contains('sortmerna')}?.getAt(1)) + .getText() + .readLines() + .collect { line -> line.split(' ', 2)[1] } // Split by the first space and take everything after it + .join('\n') // Join the processed lines back into a single text block + assertAll( { assert workflow.success}, { assert snapshot(pelines1).md5().match("trimgalore_test_pe_reads_1_lines") }, { assert 
snapshot(pelines1.size()).match("trimgalore_test_pe_reads_1_size") }, { assert snapshot(pelines2).md5().match("trimgalore_test_pe_reads_2_lines") }, { assert snapshot(pelines2.size()).match("trimgalore_test_pe_reads_2_size") }, - { assert snapshot(workflow.out.trim_read_count).match("trimgalore_read_count") } + { assert snapshot(workflow.out.trim_read_count).match("trimgalore_read_count") }, + { assert snapshot(processed_sortmerna_lint_report).md5().match("trimgalore_lint") } // This doesn't work- 'cat' changes between Conda and Docker - // leaving it here until we find a way to address that //{ assert snapshot(workflow.out.versions).match("trimgalore_versions") } ) } } - + } diff --git a/subworkflows/nf-core/fastq_qc_trim_filter_setstrandedness/tests/main.nf.test.snap b/subworkflows/nf-core/fastq_qc_trim_filter_setstrandedness/tests/main.nf.test.snap index 98534931769d..b024506593b3 100644 --- a/subworkflows/nf-core/fastq_qc_trim_filter_setstrandedness/tests/main.nf.test.snap +++ b/subworkflows/nf-core/fastq_qc_trim_filter_setstrandedness/tests/main.nf.test.snap @@ -13,82 +13,98 @@ ] ], "meta": { - "nf-test": "0.8.4", - "nextflow": "24.04.3" + "nf-test": "0.9.2", + "nextflow": "24.10.2" }, - "timestamp": "2024-07-17T10:24:00.044553245" + "timestamp": "2024-12-02T12:56:09.941793" }, "trimgalore_test_pe_reads_2_lines": { "content": "eccf3e9e74589ff01c77fce7f4548e41", "meta": { - "nf-test": "0.8.4", - "nextflow": "24.04.3" + "nf-test": "0.9.2", + "nextflow": "24.10.2" }, - "timestamp": "2024-07-17T10:24:26.838793051" + "timestamp": "2024-12-02T12:56:34.948679" }, "fastp_test_pe_reads_1_size": { "content": [ 4508 ], "meta": { - "nf-test": "0.8.4", - "nextflow": "24.04.3" + "nf-test": "0.9.2", + "nextflow": "24.10.2" }, - "timestamp": "2024-07-17T10:23:59.889337984" + "timestamp": "2024-12-02T12:56:09.889501" }, "trimgalore_test_pe_reads_1_size": { "content": [ 4508 ], "meta": { - "nf-test": "0.8.4", - "nextflow": "24.04.3" + "nf-test": "0.9.2", + "nextflow": "24.10.2" 
}, - "timestamp": "2024-07-17T10:24:26.778599725" + "timestamp": "2024-12-02T12:56:34.91497" + }, + "trimgalore_lint": { + "content": "daec499818124330ef90e5af47383f00", + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.2" + }, + "timestamp": "2024-12-02T12:56:34.952929" }, "trimgalore_test_pe_reads_1_lines": { "content": "3868fc1caf09367141d2bbf47e158823", "meta": { - "nf-test": "0.8.4", - "nextflow": "24.04.3" + "nf-test": "0.9.2", + "nextflow": "24.10.2" + }, + "timestamp": "2024-12-02T12:56:34.913312" + }, + "fastp_lint": { + "content": "daec499818124330ef90e5af47383f00", + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.2" }, - "timestamp": "2024-07-17T10:24:26.774975135" + "timestamp": "2024-12-02T12:56:09.943788" }, "fastp_test_pe_reads_2_lines": { "content": "eccf3e9e74589ff01c77fce7f4548e41", "meta": { - "nf-test": "0.8.4", - "nextflow": "24.04.3" + "nf-test": "0.9.2", + "nextflow": "24.10.2" }, - "timestamp": "2024-07-17T10:23:59.997625278" + "timestamp": "2024-12-02T12:56:09.93962" }, "fastp_test_pe_reads_2_size": { "content": [ 4508 ], "meta": { - "nf-test": "0.8.4", - "nextflow": "24.04.3" + "nf-test": "0.9.2", + "nextflow": "24.10.2" }, - "timestamp": "2024-07-17T10:24:00.042449965" + "timestamp": "2024-12-02T12:56:09.940673" }, "trimgalore_test_pe_reads_2_size": { "content": [ 4508 ], "meta": { - "nf-test": "0.8.4", - "nextflow": "24.04.3" + "nf-test": "0.9.2", + "nextflow": "24.10.2" }, - "timestamp": "2024-07-17T10:24:26.841434261" + "timestamp": "2024-12-02T12:56:34.949981" }, "fastp_test_pe_reads_1_lines": { "content": "3868fc1caf09367141d2bbf47e158823", "meta": { - "nf-test": "0.8.4", - "nextflow": "24.04.3" + "nf-test": "0.9.2", + "nextflow": "24.10.2" }, - "timestamp": "2024-07-17T10:23:59.882844295" + "timestamp": "2024-12-02T12:56:09.887546" }, "trimgalore_read_count": { "content": [ @@ -104,9 +120,9 @@ ] ], "meta": { - "nf-test": "0.8.4", - "nextflow": "24.04.3" + "nf-test": "0.9.2", + "nextflow": "24.10.2" }, - "timestamp": 
"2024-07-17T10:24:26.84402498" + "timestamp": "2024-12-02T12:56:34.951137" } } \ No newline at end of file diff --git a/subworkflows/nf-core/fastq_qc_trim_filter_setstrandedness/tests/nextflow.config b/subworkflows/nf-core/fastq_qc_trim_filter_setstrandedness/tests/nextflow.config index 9e33e4b338fa..71f2d0d25ae1 100644 --- a/subworkflows/nf-core/fastq_qc_trim_filter_setstrandedness/tests/nextflow.config +++ b/subworkflows/nf-core/fastq_qc_trim_filter_setstrandedness/tests/nextflow.config @@ -3,6 +3,19 @@ // process { + + withName: 'FQ_LINT_AFTER_TRIMMING' { + ext.prefix = { "${meta.id}.trimmed" } + } + + withName: 'FQ_LINT_AFTER_BBMAP' { + ext.prefix = { "${meta.id}.bbmap" } + } + + withName: 'FQ_LINT_AFTER_SORTMERNA' { + ext.prefix = { "${meta.id}.sortmerna" } + } + withName: 'FQ_SUBSAMPLE' { ext.args = '--record-count 1000000 --seed 1' ext.prefix = { "${meta.id}.subsampled" }