diff --git a/CHANGELOG.md b/CHANGELOG.md index cb6c6cc05..10e88e4b8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - [PR #1640](https://github.com/nf-core/rnaseq/pull/1640) - Bump version after release 3.22.0 - [PR #1641](https://github.com/nf-core/rnaseq/pull/1641) - Add arm-based CI tests and fix arm-related issues +- [PR #1642](https://github.com/nf-core/rnaseq/pull/1642) - Add long format to rsem merge ## [[3.22.0](https://github.com/nf-core/rnaseq/releases/tag/3.22.0)] - 2025-11-26 @@ -20,7 +21,7 @@ Special thanks to the following for their contributions to the release: - [Ahwan Pandey](https://github.com/ahwanpandey) - [Cristina Tuñí i Domínguez](https://github.com/ctuni) -- [Elad Herzog](https://github.com/EladH1) +- [Elad Herz](https://github.com/EladH1) - [Emily Miyoshi](https://github.com/emilymiyoshi) - [Jonathan Manning](https://github.com/pinin4fjords) - [Pontus Höjer](https://github.com/pontushojer) diff --git a/docs/output.md b/docs/output.md index 77b921fb4..04b9c71cd 100644 --- a/docs/output.md +++ b/docs/output.md @@ -278,6 +278,8 @@ The STAR section of the MultiQC report shows a bar plot with alignment rates: go - `rsem.merged.gene_tpm.tsv`: Matrix of gene-level TPM values across all samples. - `rsem.merged.transcript_counts.tsv`: Matrix of isoform-level raw counts across all samples. - `rsem.merged.transcript_tpm.tsv`: Matrix of isoform-level TPM values across all samples. + - `rsem.merged.genes_long.tsv`: Long-format table of gene-level results with one row per gene per sample, containing the sample name, gene and transcript IDs, length, effective_length, expected_count, TPM and FPKM. + - `rsem.merged.isoforms_long.tsv`: Long-format table of isoform-level results with one row per transcript per sample, containing the sample name, transcript and gene IDs, length, effective_length, expected_count, TPM, FPKM and IsoPct. - `*.genes.results`: RSEM gene-level quantification results for each sample. - `*.isoforms.results`: RSEM isoform-level quantification results for each sample. 
 - `*.STAR.genome.bam`: If `--save_align_intermeds` is specified the BAM file from STAR alignment containing read alignments to the reference genome will be placed in this directory. These files can be reused as `genome_bam` input in future pipeline runs. diff --git a/modules/local/rsem_merge_counts/main.nf b/modules/local/rsem_merge_counts/main.nf index c220e31de..4daf6d97c 100644 --- a/modules/local/rsem_merge_counts/main.nf +++ b/modules/local/rsem_merge_counts/main.nf @@ -15,6 +15,8 @@ process RSEM_MERGE_COUNTS { path "rsem.merged.gene_tpm.tsv" , emit: tpm_gene path "rsem.merged.transcript_counts.tsv", emit: counts_transcript path "rsem.merged.transcript_tpm.tsv" , emit: tpm_transcript + path "rsem.merged.genes_long.tsv" , emit: genes_long + path "rsem.merged.isoforms_long.tsv" , emit: isoforms_long path "versions.yml" , emit: versions when: @@ -47,6 +49,20 @@ process RSEM_MERGE_COUNTS { paste transcript_ids.txt tmp/isoforms/*.counts.txt > rsem.merged.transcript_counts.tsv paste transcript_ids.txt tmp/isoforms/*.tpm.txt > rsem.merged.transcript_tpm.tsv + # Create long format for genes (sample name followed by all 7 columns of each genes.results file, header row skipped) + echo -e "sample_name\tgene_id\ttranscript_id(s)\tlength\teffective_length\texpected_count\tTPM\tFPKM" > rsem.merged.genes_long.tsv + for fileid in `ls ./genes/*`; do + samplename=`basename \$fileid | sed s/\\.genes.results\$//g` + tail -n+2 \$fileid | awk -v sample=\$samplename 'BEGIN{OFS="\t"}{print sample,\$1,\$2,\$3,\$4,\$5,\$6,\$7}' >> rsem.merged.genes_long.tsv + done + + # Create long format for isoforms (sample name followed by all 8 columns of each isoforms.results file, header row skipped) + echo -e "sample_name\ttranscript_id\tgene_id\tlength\teffective_length\texpected_count\tTPM\tFPKM\tIsoPct" > rsem.merged.isoforms_long.tsv + for fileid in `ls ./isoforms/*`; do + samplename=`basename \$fileid | sed s/\\.isoforms.results\$//g` + tail -n+2 \$fileid | awk -v sample=\$samplename 'BEGIN{OFS="\t"}{print sample,\$1,\$2,\$3,\$4,\$5,\$6,\$7,\$8}' >> rsem.merged.isoforms_long.tsv + done + cat 
<<-END_VERSIONS > versions.yml "${task.process}": sed: \$(echo \$(sed --version 2>&1) | sed 's/^.*GNU sed) //; s/ .*\$//') @@ -59,6 +75,8 @@ process RSEM_MERGE_COUNTS { touch rsem.merged.gene_tpm.tsv touch rsem.merged.transcript_counts.tsv touch rsem.merged.transcript_tpm.tsv + touch rsem.merged.genes_long.tsv + touch rsem.merged.isoforms_long.tsv cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/rsem_merge_counts/tests/main.nf.test.snap b/modules/local/rsem_merge_counts/tests/main.nf.test.snap index 1715d1279..da991326f 100644 --- a/modules/local/rsem_merge_counts/tests/main.nf.test.snap +++ b/modules/local/rsem_merge_counts/tests/main.nf.test.snap @@ -15,6 +15,12 @@ "rsem.merged.transcript_tpm.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" ], "4": [ + "rsem.merged.genes_long.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ], + "5": [ + "rsem.merged.isoforms_long.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ], + "6": [ "versions.yml:md5,48ca3e12c91829af8019462b3f6aa29c" ], "counts_gene": [ @@ -23,6 +29,12 @@ "counts_transcript": [ "rsem.merged.transcript_counts.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" ], + "genes_long": [ + "rsem.merged.genes_long.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ], + "isoforms_long": [ + "rsem.merged.isoforms_long.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ], "tpm_gene": [ "rsem.merged.gene_tpm.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" ], @@ -35,10 +47,10 @@ } ], "meta": { - "nf-test": "0.8.4", - "nextflow": "24.04.2" + "nf-test": "0.9.3", + "nextflow": "25.04.6" }, - "timestamp": "2024-06-21T11:55:29.45389" + "timestamp": "2025-11-27T17:09:36.795867708" }, "Should run without failures": { "content": [ @@ -56,6 +68,12 @@ "rsem.merged.transcript_tpm.tsv:md5,abbaac45f9938716c58d604299ea284e" ], "4": [ + "rsem.merged.genes_long.tsv:md5,e26cd2b3b381432a011eed98f3ad4e6d" + ], + "5": [ + "rsem.merged.isoforms_long.tsv:md5,33b189595600493b917d786a542de8e9" + ], + "6": [ 
"versions.yml:md5,48ca3e12c91829af8019462b3f6aa29c" ], "counts_gene": [ @@ -64,6 +82,12 @@ "counts_transcript": [ "rsem.merged.transcript_counts.tsv:md5,e40bba0aafc5904361513b3513c217ad" ], + "genes_long": [ + "rsem.merged.genes_long.tsv:md5,e26cd2b3b381432a011eed98f3ad4e6d" + ], + "isoforms_long": [ + "rsem.merged.isoforms_long.tsv:md5,33b189595600493b917d786a542de8e9" + ], "tpm_gene": [ "rsem.merged.gene_tpm.tsv:md5,39bad606eb012456bba1d995fe0feb5f" ], @@ -76,9 +100,9 @@ } ], "meta": { - "nf-test": "0.8.4", - "nextflow": "23.10.1" + "nf-test": "0.9.3", + "nextflow": "25.04.6" }, - "timestamp": "2024-03-09T17:13:37.377348" + "timestamp": "2025-11-27T17:09:28.461621756" } } \ No newline at end of file diff --git a/subworkflows/local/quantify_rsem/tests/main.nf.test.snap b/subworkflows/local/quantify_rsem/tests/main.nf.test.snap index 7c133f1fb..3c85e78c2 100644 --- a/subworkflows/local/quantify_rsem/tests/main.nf.test.snap +++ b/subworkflows/local/quantify_rsem/tests/main.nf.test.snap @@ -1,4 +1,119 @@ { + "homo_sapiens - sentieon - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "strandedness": "forward" + }, + "test.genes.results:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + [ + { + "id": "test", + "strandedness": "forward" + }, + "test.isoforms.results:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "2": [ + [ + { + "id": "test", + "strandedness": "forward" + }, + "test.stat:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "3": [ + [ + { + "id": "test", + "strandedness": "forward" + }, + "test.log:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "4": [ + "rsem.merged.gene_counts.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ], + "5": [ + "rsem.merged.gene_tpm.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ], + "6": [ + "rsem.merged.transcript_counts.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ], + "7": [ + "rsem.merged.transcript_tpm.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ], + "8": [ + 
"versions.yml:md5,2aa5252eb2ffb409cf556a165d40f8a9", + "versions.yml:md5,773c15c4ecb7d486a4bdd8ef73e7ac5d" + ], + "counts_gene": [ + [ + { + "id": "test", + "strandedness": "forward" + }, + "test.genes.results:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "counts_transcript": [ + [ + { + "id": "test", + "strandedness": "forward" + }, + "test.isoforms.results:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "logs": [ + [ + { + "id": "test", + "strandedness": "forward" + }, + "test.log:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "merged_counts_gene": [ + "rsem.merged.gene_counts.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ], + "merged_counts_transcript": [ + "rsem.merged.transcript_counts.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ], + "merged_tpm_gene": [ + "rsem.merged.gene_tpm.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ], + "merged_tpm_transcript": [ + "rsem.merged.transcript_tpm.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ], + "stat": [ + [ + { + "id": "test", + "strandedness": "forward" + }, + "test.stat:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,2aa5252eb2ffb409cf556a165d40f8a9", + "versions.yml:md5,773c15c4ecb7d486a4bdd8ef73e7ac5d" + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.04.6" + }, + "timestamp": "2025-11-27T16:26:57.604104651" + }, "homo_sapiens - stub": { "content": [ { @@ -109,10 +224,10 @@ } ], "meta": { - "nf-test": "0.9.2", + "nf-test": "0.9.3", "nextflow": "25.04.6" }, - "timestamp": "2025-09-15T16:56:01.229068" + "timestamp": "2025-11-27T16:26:03.792661891" }, "homo_sapiens": { "content": [ @@ -165,9 +280,9 @@ ] ], "meta": { - "nf-test": "0.9.2", - "nextflow": "25.04.3" + "nf-test": "0.9.3", + "nextflow": "25.04.6" }, - "timestamp": "2025-09-16T08:19:00.078928064" + "timestamp": "2025-11-27T17:10:37.825495549" } }