diff --git a/CHANGELOG.md b/CHANGELOG.md index 4ee0debae..5ed1820ad 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 Special thanks to the following for their contributions to the release: +- [Ahwan Pandey](https://github.com/ahwanpandey) - [Elad Herzog](https://github.com/EladH1) - [Emily Miyoshi](https://github.com/emilymiyoshi) - [Pontus Höjer](https://github.com/pontushojer) @@ -29,6 +30,7 @@ Special thanks to the following for their contributions to the release: - [PR #1628](https://github.com/nf-core/rnaseq/pull/1628) - Template update for nf-core/tools v3.5.1 - [PR #1630](https://github.com/nf-core/rnaseq/pull/1630) - Fix arm64 profile to use pre-built ARM containers and update documentation - [PR #1631](https://github.com/nf-core/rnaseq/pull/1631) - Fix bbsplit index staging by using symlinks instead of full copy +- [PR #1635](https://github.com/nf-core/rnaseq/pull/1635) - Fix `--gtf_extra_attributes` to support multiple comma-separated values and correct deprecated parameter name in docs ([#1626](https://github.com/nf-core/rnaseq/issues/1626)) ## [[3.21.0](https://github.com/nf-core/rnaseq/releases/tag/3.21.0)] - 2025-09-18 diff --git a/docs/usage.md b/docs/usage.md index 495929a5c..41a5320ce 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -358,7 +358,7 @@ Remember to note the genome and annotation versions as well as the versions of t If you are using [GENCODE](https://www.gencodegenes.org/) reference genome files please specify the `--gencode` parameter because the format of these files is slightly different to ENSEMBL genome files: -- The `--gtf_group_features_type` parameter will automatically be set to `gene_type` as opposed to `gene_biotype`, respectively. +- The `--featurecounts_group_type` parameter will automatically be set to `gene_type` instead of `gene_biotype`.
- If you are running Salmon, the `--gencode` flag will also be passed to the index building step to overcome parsing issues resulting from the transcript IDs in GENCODE fasta files being separated by vertical pipes (`|`) instead of spaces (see [this issue](https://github.com/COMBINE-lab/salmon/issues/15)). As well as the standard annotations, GENCODE also provides "basic" annotations, which include only representative transcripts, but we do not recommend using these. diff --git a/modules.json b/modules.json index 3c4ea02bf..87d4bd1fa 100644 --- a/modules.json +++ b/modules.json @@ -37,7 +37,7 @@ }, "custom/tx2gene": { "branch": "master", - "git_sha": "05954dab2ff481bcb999f24455da29a5828af08d", + "git_sha": "e0bdf8637721e27792a38c6b622f0a2345f3cbc9", "installed_by": ["modules", "quantify_pseudo_alignment"] }, "dupradar": { diff --git a/modules/nf-core/custom/tx2gene/meta.yml b/modules/nf-core/custom/tx2gene/meta.yml index 8254afa08..07b1f0f4b 100644 --- a/modules/nf-core/custom/tx2gene/meta.yml +++ b/modules/nf-core/custom/tx2gene/meta.yml @@ -26,26 +26,27 @@ input: type: file description: An annotation file of the reference genome in GTF format pattern: "*.gtf" + ontologies: [] - - meta2: type: map description: | Groovy Map containing information related to the experiment as a whole e.g. 
`[ id:'SRP123456' ]` - - '"quants/*"': + - quants/*: type: file description: quants file - - - quant_type: - type: string - description: Quantification type, 'kallisto' or 'salmon' - - - id: - type: string - description: Gene ID attribute in the GTF file (default= gene_id) - - - extra: - type: string - description: Extra gene attribute in the GTF file (default= gene_name) + - quant_type: + type: string + description: Quantification type, 'kallisto' or 'salmon' + - id: + type: string + description: Gene ID attribute in the GTF file (default= gene_id) + - extra: + type: string + description: Extra gene attribute(s) in the GTF file, comma-separated for multiple (default= gene_name) output: - - tx2gene: - - meta: + tx2gene: + - - meta: type: map description: | Groovy Map containing reference information related to the GTF file @@ -54,11 +55,15 @@ output: type: file description: A transcript/ gene mapping table in TSV format pattern: "*.tx2gene.tsv" - - versions: - - versions.yml: - type: file - description: File containing software versions - pattern: "versions.yml" + ontologies: + - edam: http://edamontology.org/format_3475 # TSV + versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" + ontologies: + - edam: http://edamontology.org/format_3750 # YAML authors: - "@pinin4fjords" maintainers: diff --git a/modules/nf-core/custom/tx2gene/templates/tx2gene.py b/modules/nf-core/custom/tx2gene/templates/tx2gene.py index 4d769513d..a90c63e39 100755 --- a/modules/nf-core/custom/tx2gene/templates/tx2gene.py +++ b/modules/nf-core/custom/tx2gene/templates/tx2gene.py @@ -128,7 +128,7 @@ def map_transcripts_to_gene( gtf_file: str, quant_dir: str, gene_id: str, - extra_id_field: str, + extra_id_fields: str, output_file: str, ) -> bool: """ @@ -139,7 +139,7 @@ def map_transcripts_to_gene( gtf_file (str): Path to the GTF file. quant_dir (str): Directory where quantification files are located. 
gene_id (str): The gene ID attribute in the GTF file. - extra_id_field (str): Additional ID field in the GTF file. + extra_id_fields (str): Additional ID field(s) in the GTF file, comma-separated for multiple. output_file (str): The output file path. Returns: @@ -150,12 +150,17 @@ def map_transcripts_to_gene( # Discover the attribute that corresponds to transcripts in the GTF transcript_attribute = discover_transcript_attribute(gtf_file, transcripts) + # Parse comma-separated extra ID fields + extra_fields = [field.strip() for field in extra_id_fields.split(",")] + # Open GTF and output file to write the mappings # Initialize the set to track seen combinations seen = set() with open(gtf_file) as inh, open(output_file, "w") as output_handle: - output_handle.write(f"{transcript_attribute}\\t{gene_id}\\t{extra_id_field}\\n") + # Write header with all extra fields as separate columns + header_fields = [transcript_attribute, gene_id] + extra_fields + output_handle.write("\\t".join(header_fields) + "\\n") # Parse each line of the GTF, mapping transcripts to genes for line in filter(lambda x: not x.startswith("#"), inh): cols = line.split("\\t") @@ -170,8 +175,10 @@ def map_transcripts_to_gene( # Check if the combination has already been seen if transcript_gene_pair not in seen: # If it's a new combination, write it to the output and add to the seen set - extra_id = attr_dict.get(extra_id_field, attr_dict[gene_id]) - output_handle.write(f"{attr_dict[transcript_attribute]}\\t{attr_dict[gene_id]}\\t{extra_id}\\n") + # Extract values for all extra fields, falling back to gene_id if not present + extra_values = [attr_dict.get(field, attr_dict[gene_id]) for field in extra_fields] + output_fields = [attr_dict[transcript_attribute], attr_dict[gene_id]] + extra_values + output_handle.write("\\t".join(output_fields) + "\\n") seen.add(transcript_gene_pair) return True diff --git a/modules/nf-core/custom/tx2gene/tests/main.nf.test b/modules/nf-core/custom/tx2gene/tests/main.nf.test 
index 2d45b7646..49518f797 100644 --- a/modules/nf-core/custom/tx2gene/tests/main.nf.test +++ b/modules/nf-core/custom/tx2gene/tests/main.nf.test @@ -4,6 +4,11 @@ nextflow_process { script "../main.nf" process "CUSTOM_TX2GENE" + tag "modules" + tag "modules_nfcore" + tag "custom" + tag "custom/tx2gene" + tag "untar" test("saccharomyces_cerevisiae - gtf") { @@ -44,6 +49,45 @@ nextflow_process { } } + test("saccharomyces_cerevisiae - gtf - multiple extra attributes") { + + setup { + run("UNTAR") { + script "../../../untar/main.nf" + process { + """ + input[0] = Channel.of([ + [ id:'test'], // meta map + file(params.modules_testdata_base_path + 'genomics/eukaryotes/saccharomyces_cerevisiae/kallisto_results.tar.gz', checkIfExists: true) + ]) + """ + } + } + } + + when { + process { + """ + input[0] = Channel.of([ + [ id:'test'], // meta map + file(params.modules_testdata_base_path + 'genomics/eukaryotes/saccharomyces_cerevisiae/genome_gfp.gtf', checkIfExists: true) + ]) + input[1] = UNTAR.out.untar.map { meta, dir -> [ meta, dir.listFiles().collect() ] } + input[2] = 'kallisto' + input[3] = 'gene_id' + input[4] = 'gene_name,gene_biotype' + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + test("saccharomyces_cerevisiae - gtf - stub") { options "-stub" @@ -84,4 +128,4 @@ nextflow_process { ) } } -} \ No newline at end of file +} diff --git a/modules/nf-core/custom/tx2gene/tests/main.nf.test.snap b/modules/nf-core/custom/tx2gene/tests/main.nf.test.snap index 2be5fe547..63f319e90 100644 --- a/modules/nf-core/custom/tx2gene/tests/main.nf.test.snap +++ b/modules/nf-core/custom/tx2gene/tests/main.nf.test.snap @@ -32,6 +32,39 @@ }, "timestamp": "2024-10-18T10:24:12.19104487" }, + "saccharomyces_cerevisiae - gtf - multiple extra attributes": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.tx2gene.tsv:md5,97223927dc2e0dae6c38bad96aaa6f49" + ] + ], + "1": [ + 
"versions.yml:md5,e504b95d76ef4cf65ba0b38cddce2840" + ], + "tx2gene": [ + [ + { + "id": "test" + }, + "test.tx2gene.tsv:md5,97223927dc2e0dae6c38bad96aaa6f49" + ] + ], + "versions": [ + "versions.yml:md5,e504b95d76ef4cf65ba0b38cddce2840" + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.10.0" + }, + "timestamp": "2025-11-25T19:28:57.610922" + }, "saccharomyces_cerevisiae - gtf - stub": { "content": [ {