nf-core · LouisLeNezet · Nov 26, 2025 · Nov 25, 2025 · Nov 25, 2025 · Nov 25, 2025
diff --git a/README.md b/README.md
@@ -364,6 +364,12 @@ The earth sciences folder contain subfolders for different data formats encounte
       - sequence/chr22_23800000-23980000.fa: Fasta file containing a section of chr22
     - chr22_chr22_KI270734v1_random: directory for reference files using chr22 and chr22_KI270734v1_random, for paraphase
       - sequence/genome.fa.gz: Gzipped fasta file from GRCh38 with bases not within chr22:18912282-18936793 and chr22_KI270734v1_random:137587-162092 hard masked to N.
+    - genetic_map: directory containing genetic map of GRCh38 in various format
+      - .21/22.eagle.map.gz: tab delimited gzipped map with header: chr (no prefix), position, COMBINED_rate(cM/Mb) Genetic_Map(cM)
+      - .chr21/22.stitch.map: space delimited map with header: position, COMBINED_rate(cM/Mb) Genetic_Map(cM)
+      - .chr21/22.glimpse.map: tab delimited with header: pos (position), chr, cM (centiMorgan)
+      - .chr21/22.plink.map: space delimited map without header: chr, id (all empty "."), centiMorgan, position
+      - .chr21/22.minimac.map: tab delimited with header: #chr, position, chr, Genetic_Map(cM)
     - vcf
       - dbsnp: DBSnp file downsampled based on reference position
       - gnomAD: gnomAD file downsampled based on reference position
@@ -386,6 +392,9 @@ The earth sciences folder contain subfolders for different data formats encounte
     - genome.fasta.gz.gzi: index file for 'genome.fasta.gz'
     - genome2.fasta: Reference fasta based on chr22:16600000-16800000
     - genome3.fasta: Reference fasta based on chr19:45760000-45770300
+    - genomeGRCh38_chr21_22.fa.gz: bgzipped version of GRCh38 fasta file containing whole chr21 and chr22
+    - genomeGRCh38_chr21_22.fa.gz.fai: index file for 'genomeGRCh38_chr21_22.fa.gz'
+    - genomeGRCh38_chr21_22.fa.gz.gzi: index file for 'genomeGRCh38_chr21_22.fa.gz'
     - genome_motifs.txt: TF motifs used for cellranger-atac
     - genome.NC_012920_1.gb: Contains mtDNA reference genome in Genbank format
     - transcriptome.fasta: Reference transcriptome based on `genome.fasta`
@@ -441,6 +450,10 @@ The earth sciences folder contain subfolders for different data formats encounte
       - 'example_hla_pe.sorted.bam': Sorted BAM file for HLATyping workflow / OptiType module.
       - 'example_hla_pe.sorted.bam.bai': Sorted BAM file index for HLATyping workflow / OptiType module.
       - mitochon_standin.recalibrated.sorted: copy of the old, smaller test2.paired_end.recalibrated.sorted, this is to be used to test mutect2's mitochondria mode, as the current recal bams are far too big. This should be replaced once rarediseases obtain an actual mitochondria sample.
+      - NA12878.chr21_22.1X.bam{.bai}: Full coverage (32X) bam file of individual NA12878 for chr21 and 22 between position 16570000-16610000 with index.
+      - NA12878.chr21_22.1X.bam{.bai}: Downsampled at 1X bam file of individual NA12878 for chr21 and 22 between position 16570000-16610000 with index.
+      - NA19401.chr21_22.1X.bam{.bai}: Full coverage (32X) bam file of individual NA19401 for chr21 and 22 between position 16570000-16610000 with index.
+      - NA19401.chr21_22.1X.bam{.bai}: Downsampled at 1X bam file of individual NA19401 for chr21 and 22 between position 16570000-16610000 with index.
       - test_illumina_mt: bam file containing mt data, to test eklipse
       - 'test3.single_end.markduplicates.sorted.bam': Mapped, sorted, and duplicate removed reads from ancient DNA across all human chromosomes on the hs37d5 human reference. Data from [ERR2857053](https://www.ebi.ac.uk/ena/browser/view/ERR2857053) downsampled to 10% of original reads.
       - 'test.rna.paired_end.bam': STAR-aligned, unsorted, paired-end RNAseq bam file from the test*rnaseq*{1,2}.fastq.gz: chr22 of sample GM12878 (SRA accession: SRX2900878)
@@ -563,6 +576,9 @@ The earth sciences folder contain subfolders for different data formats encounte
     - vcf:
       - test.rnaseq.vcf: RNAseq vcf corresponding to `test.rnaseq_{1,2}` reads
       - test.genome_21.somatic_sv.vcf: Indels VCF corresponding to `test.paired_end.recalibrated.sorted` and `genome_21.fasta` generated with Manta
+      - NA12878.chr21_22.1X.glimpse2.vcf.gz{.csi}: Imputed variants of NA12878.chr21_22.1X.bam file with glimpse2 for chr21 and 22 between position 16570000-16610000 with index.
+      - NA19401.chr21_22.1X.glimpse2.vcf.gz{.csi}: Imputed variants of NA19401.chr21_22.1X.bam file with glimpse2 for chr21 and 22 between position 16570000-16610000 with index.
+      - NA12878_GIAB.chr21_22.vcf.gz{.csi}: Benchmarking variants file from Genome In A Bottle initiative for individual NA12878 for chr21 and 22 between position 16570000-16610000 with index.
       - NA12878*chrM.vcf.gz: mitochondrial variants corresponding to `testdata/NA12878_mito*{1,2}.fq.gz`from the`rarediseases` branch.
       - empty.vcf.gz: The RNAseq VCF with all variants removed
       - empty.vcf.gz.tbi: The index of the empty vcf
@@ -642,6 +658,11 @@ The earth sciences folder contain subfolders for different data formats encounte
     - plink_simulated.pgen: case-control simulated variants dataset in PLINK 2 binary format
     - plink_simulated.psam: case-control simulated variants dataset in PLINK 2 binary format
     - plink_simulated.pvar: case-control simulated variants dataset in PLINK 2 binary format
+    - 1000GP.chr*.vcf.gz: Reference panel phased VCF of the 1000 Genome Project in various format to be used for imputation for chr21 and 22 between position 16570000-16610000 with index.
+    - 1000GP.chr*.hap/legend/samples.gz: Same as 1000GP.chr*.vcf.gz but converted to hap/legend/samples format with bcftools
+    - 1000GP.chr*.sites.vcf.gz: Variants informations present in 1000GP.chr*.vcf.gz obtain with `bcftools view -G -m 2 -M 2`
+    - 1000GP.chr*.posfile: Variants position in tabulated format without header: chr, position, ref, alt
+    - 1000GP.chr*.chunks.txt: chunks of the chromosome obtain with GLIMPSE_chunk
   - svsig:
 
     - NA03697B2_new.pbmm2.repeats.svsig.gz: structural variant file for NA03697B2_new.pbmm2.repeats.bam, created with PBSV discover version (2.9.0 default settings)

diff --git a/data/genomics/homo_sapiens/README.md b/data/genomics/homo_sapiens/README.md
@@ -69,14 +69,10 @@ Following 'reference' vcf files are generated. All found in igenomes at `s3://ng
 
 ### Fasta
 
-As base reference `s3://ngi-igenomes/igenomes/Homo_sapiens/GATK/GRCh38/Sequence/Chromosomes/chr22.fasta` was used.
+As base reference `s3://ngi-igenomes/igenomes/Homo_sapiens/GATK/GRCh38/Sequence/Chromosomes/chr22.fasta` was used, then compressed and indexed.
 
 ```bash
 samtools faidx chr22.fasta chr22:16570000-16610000  > genome.fasta
-```
-The corresponding compressed fasta and index files:
-
-```bash
 bgzip genome.fasta
 samtools faidx genome.fasta.gz
 ```
@@ -95,8 +91,19 @@ gatk BedToIntervalList -I genome.bed -SD genome.dict -O genome.interval_list
 
 A StrTableFile zip folder was created using GATK4:
 
-````bash
+```bash
 gatk ComposeSTRTableFile --reference genome.fasta --output genome_strtablefile.zip
+```
+
+For parallelization purpose a fasta with both chr21 and chr22 is obtain with
+
+```bash
+REF_PATH="data/genomics/homo_sapiens/genome/genomeGRCh38"
+wget -c -O- https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.fa.gz | gunzip | bgzip  > ${REF_PATH}.fa.bgz
+samtools faidx ${REF_PATH}.fa.bgz chr21 chr22 | bgzip > ${REF_PATH}_chr21_22.fa.gz
+samtools faidx ${REF_PATH}_chr21_22.fa.gz
+rm ${REF_PATH}.fa.bgz*
+```
 
 ### SDF
 
@@ -105,7 +112,7 @@ An SDF folder of the reference FASTA of chromosome 21 was created using:
 ```bash
 rtg format -o genome_sdf genome.fasta
 tar -czf genome_sdf.tar.gz genome_sdf/
-````
+```
 
 ### GTF/GFF
 
@@ -176,14 +183,37 @@ The genome map of GRCh38 have been generated as follow:
 
 ```bash
 # Download the reference genome map
-PATH_GENOME=data/genomics/homo_sapiens/genome/
-MAP_GRCH38=genome.GRCh38
-wget https://storage.googleapis.com/broad-alkesgroup-public/Eagle/downloads/tables/genetic_map_hg38_withX.txt.gz -O ${PATH_GENOME}${MAP_GRCH38}.map.txt.gz
-zcat ${PATH_GENOME}${MAP_GRCH38}.map.txt.gz | awk 'NR==1 { print $0 } NR>1 && /^22/ { print $1, $2, $3, $4 }' > ${PATH_GENOME}${MAP_GRCH38}.eagle.22.map
-zcat ${PATH_GENOME}${MAP_GRCH38}.map.txt.gz | grep "^22" | awk -F' ' '{ print "chr"$1, $2, $4 }' > ${PATH_GENOME}${MAP_GRCH38}.glimpse.chr22.map
-gzip ${PATH_GENOME}${MAP_GRCH38}.eagle.22.map
-gzip ${PATH_GENOME}${MAP_GRCH38}.glimpse.chr22.map
-wget https://github.com/rwdavies/QUILT/raw/refs/heads/master/maps/hg38/CEU-chr21-final.b38.txt.gz -O ${PATH_GENOME}chr21/sequence/${MAP_GRCH38}.stitch.chr21.txt.gz
+MAP_GRCH38=data/genomics/homo_sapiens/genome/genetic_map/genome.GRCh38
+wget https://storage.googleapis.com/broad-alkesgroup-public/Eagle/downloads/tables/genetic_map_hg38_withX.txt.gz -O ${MAP_GRCH38}.eagle.map.gz
+wget --no-check-certificate https://bochet.gcc.biostat.washington.edu/beagle/genetic_maps/plink.GRCh38.map.zip -O ${MAP_GRCH38}.plink.map.zip
+
+for chr in 21 22; do
+   # Eagle / hapmap format
+   zcat ${MAP_GRCH38}.eagle.map.gz \
+   | awk -v CHR="$chr" 'BEGIN {OFS="\t"} NR==1 {print; next} $1 == CHR {print $1,$2,$3,$4}' \
+   > ${MAP_GRCH38}.${chr}.eagle.map
+
+   # Stitch or quilt format
+   awk 'BEGIN {OFS=" "} {print $2, $3, $4}' ${MAP_GRCH38}.${chr}.eagle.map \
+   > ${MAP_GRCH38}.chr${chr}.stitch.map
+
+   # Plink / bealge5 format
+   unzip -p ${MAP_GRCH38}.plink.map.zip chr_in_chrom_field/plink.chrchr${chr}.GRCh38.map > ${MAP_GRCH38}.chr${chr}.plink.map
+
+   # Minimac format
+   awk 'BEGIN {OFS="\t"} NR==1 {print "#chr", "position", "Genetic_Map(cM)"} NR>1  {print $1, $4, $3}' \
+      ${MAP_GRCH38}.chr${chr}.plink.map > ${MAP_GRCH38}.chr${chr}.minimac.map
+
+   # Glimpse format
+   awk 'BEGIN {OFS="\t"} NR==1 {print "pos", "chr", "cM"} NR>1  {print $4, $1, $3}' \
+      ${MAP_GRCH38}.chr${chr}.plink.map > ${MAP_GRCH38}.chr${chr}.glimpse.map
+
+   # Compress files
+   gzip ${MAP_GRCH38}.${chr}.eagle.map
+done
+
+rm ${MAP_GRCH38}.plink.map.zip
+rm ${MAP_GRCH38}.eagle.map.gz
 ```
 
 ## Alleles, Loci, GC and RT for `ASCAT`
@@ -224,58 +254,82 @@ We focus on the sample GM12878 also known as NA12878 or HG001 depending on the p
 We need its data in different format for the chromosome 22.
 
 ```bash
-REGION=chr22:16570000-16610000
+REGION_STRING="chr21:16570000-16610000 chr22:16570000-16610000"
 DIR_IND=data/genomics/homo_sapiens/illumina
-# Beware huge initial file
-wget -c ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR323/{} -O $DIR_IND/NA12878.{cram,cram.crai}
-
-# Keep only the wanted region
-samtools view \
-   -bo $DIR_IND/bam/NA12878.chr22.bam $DIR_IND/NA12878.cram \
-   ${REGION}
-samtools index $DIR_IND/bam/NA12878.chr22.bam
-
-# Compute depth on region
-MEAN_DEPTH=$(samtools coverage $DIR_IND/bam/NA12878.chr22.bam -r ${REGION} | \
-   awk -F'\t' '(NR==2){ print $7}')
-FRAC_DEPTH=$(echo "scale=5; 1/$MEAN_DEPTH" | bc)
-
-# Downsample the file to 1X
-samtools view \
-   -s 1${FRAC_DEPTH} \
-   -bo $DIR_IND/bam/NA12878.chr22.1X.bam $DIR_IND/bam/NA12878.chr22.bam
-samtools index $DIR_IND/bam/NA12878.chr22.1X.bam
-
-# Impute the data with GLIMPSE2
-PANEL_FILE=data/genomics/homo_sapiens/popgen/1000GP.chr22
-MAP_GRCH38=data/genomics/homo_sapiens/genome/genome.GRCh38
-GLIMPSE2_phase \
-   --bam-file $DIR_IND/bam/NA12878.chr22.1X.bam \
-   --ind-name NA12878 \
-   --input-region $REGION \
-   --output-region $REGION \
-   --reference $PANEL_FILE.vcf.gz \
-   --output $DIR_IND/vcf/NA12878.chr22.1X.vcf.gz
-
-bcftools index $DIR_IND/vcf/NA12878.chr22.1X.vcf.gz
+
+# Keep only the wanted region but might take some time to retrieve
+samtools view -bo $DIR_IND/bam/NA12878.chr21_22.bam \
+   ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR323/ERR3239334/NA12878.final.cram \
+   $REGION_STRING
+
+samtools view -bo $DIR_IND/bam/NA19401.chr21_22.bam \
+   ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR323/ERR3239749/NA19401.final.cram \
+   $REGION_STRING
+
+for sample in NA19401 NA12878; do
+
+   samtools index $DIR_IND/bam/$sample.chr21_22.bam
+
+   # Mean depth is approximately of 32X
+   FRAC_DEPTH=$(echo "scale=5; 1/32" | bc)
+   echo "Mean depth: 32X → Downsampling fraction: $FRAC_DEPTH"
+
+   # Downsample the file to 1X
+   samtools view \
+      -s 1${FRAC_DEPTH} \
+      -bo $DIR_IND/bam/$sample.chr21_22.1X.bam $DIR_IND/bam/$sample.chr21_22.bam
+   samtools index $DIR_IND/bam/$sample.chr21_22.1X.bam
+
+   # Impute the data with GLIMPSE2
+   for chr in chr21 chr22; do
+      PANEL_FILE=data/genomics/homo_sapiens/popgen/1000GP.$chr.vcf.gz
+      MAP_GRCH38=data/genomics/homo_sapiens/genome/genetic_map/genome.GRCh38.$chr.glimpse.map
+      GLIMPSE2_phase \
+         --bam-file $DIR_IND/bam/$sample.chr21_22.1X.bam \
+         --ind-name $sample \
+         --input-region $chr:16570000-16610000 \
+         --output-region $chr:16570000-16610000 \
+         --map $MAP_GRCH38 \
+         --reference $PANEL_FILE \
+         --output $DIR_IND/vcf/$sample.$chr.1X.glimpse2.vcf.gz
+
+      bcftools index $DIR_IND/vcf/$sample.$chr.1X.glimpse2.vcf.gz
+   done
+
+   # Concatenate the VCF files
+   bcftools concat -O z -o $DIR_IND/vcf/$sample.1X.glimpse2.vcf.gz \
+      $DIR_IND/vcf/$sample.chr21.1X.glimpse2.vcf.gz \
+      $DIR_IND/vcf/$sample.chr22.1X.glimpse2.vcf.gz
+
+   # Index the combined VCF
+   bcftools index $DIR_IND/vcf/$sample.chr21_22.1X.glimpse2.vcf.gz
+
+   rm $DIR_IND/vcf/$sample.chr*
+done
 
 # Variants benchmarking
+# Download the VCF file
 wget -c ftp://ftp-trace.ncbi.nlm.nih.gov/ReferenceSamples/giab/release/NA12878_HG001/NISTv4.2.1/GRCh38/HG001_GRCh38_1_22_v4.2.1_benchmark.vcf.gz -O $DIR_IND/NA12878.vcf.gz
+
+# Download the TBI file
 wget -c ftp://ftp-trace.ncbi.nlm.nih.gov/ReferenceSamples/giab/release/NA12878_HG001/NISTv4.2.1/GRCh38/HG001_GRCh38_1_22_v4.2.1_benchmark.vcf.gz.tbi -O $DIR_IND/NA12878.vcf.gz.tbi
 
 # Renaming sample file
 echo "HG001 NA12878" > sample_renaming.txt
 
+REGION_STRING="chr21:16570000-16610000,chr22:16570000-16610000"
+
 # Normalize and keep only rename variants ID
 bcftools norm -m +any $DIR_IND/NA12878.vcf.gz \
-      --regions ${REGION} --threads 4 -Ov | \
+      -r ${REGION_STRING} --threads 4 -Ov | \
    bcftools reheader --samples sample_renaming.txt | \
    bcftools annotate --threads 4 -Oz \
       --set-id '%CHROM\:%POS\:%REF\:%FIRST_ALT' \
-      -o $DIR_IND/vcf/NA12878_GIAB.chr22.vcf.gz
+      -o $DIR_IND/vcf/NA12878_GIAB.chr21_22.vcf.gz
 
 # Index the file
-bcftools index -f $DIR_IND/vcf/NA12878_GIAB.chr22.vcf.gz --threads 4
+bcftools index -f $DIR_IND/vcf/NA12878_GIAB.chr21_22.vcf.gz --threads 4
+rm $DIR_IND/NA12878.vcf.gz*
 ```
 
 #### `CNVKIT` data

diff --git a/data/genomics/homo_sapiens/genome/genetic_map/genome.GRCh38.21.eagle.map.gz b/data/genomics/homo_sapiens/genome/genetic_map/genome.GRCh38.21.eagle.map.gz
diff --git a/data/genomics/homo_sapiens/genome/genetic_map/genome.GRCh38.22.eagle.map.gz b/data/genomics/homo_sapiens/genome/genetic_map/genome.GRCh38.22.eagle.map.gz