Merged
29 commits
cb07f36
Bug fix: install_tools.sh
bintriz Nov 24, 2018
844f81f
Add a script for resource download
bintriz Nov 24, 2018
a843671
Change configuration system
bintriz Nov 25, 2018
73dfd02
pileup.py update as configuration system changes
bintriz Nov 25, 2018
d6b1279
synapse_login refactoring
bintriz Nov 26, 2018
f67e494
Add nda_login
bintriz Nov 26, 2018
475d6f9
Refactoring: config, run_info
bintriz Nov 26, 2018
6ae30d5
Library name change: utils -> misc
bintriz Nov 26, 2018
7787b73
Refactoring sample list parser & Fix sample list format
bintriz Nov 26, 2018
1a8c017
Add synapse login in the download_resources.sh
bintriz Nov 26, 2018
31161f3
Add NDA download function
bintriz Nov 26, 2018
213aceb
Minor change in job scripts
bintriz Nov 26, 2018
3d25701
Refactoring: rename analysis_utils -> utils
bintriz Nov 26, 2018
4c1e75c
Add an option for turning off bam upload to synapse
bintriz Nov 28, 2018
aad38a2
Add wrapper scripts for pipeline runners
bintriz Nov 28, 2018
2818e1c
Bug fix: NDA credential setting
bintriz Nov 28, 2018
9675dea
Minor change of the argument help in runners
bintriz Nov 28, 2018
99d7a68
Simplify aws download log
bintriz Nov 28, 2018
936b310
Removing a conflict of tmp collate file names when running bam2fq on …
bintriz Nov 28, 2018
7ea731b
Refactoring: run_info, log_dir
bintriz Dec 4, 2018
e64ab24
Change location to save run_info
bintriz Dec 4, 2018
11f638f
Improve error handling in job_scripts
bintriz Dec 4, 2018
6c379fb
Add user's S3Uri and LocalPath options for data source; Multiple try …
bintriz Dec 4, 2018
380ab5e
Bug fix: Not updating the return code of multiple download tries
bintriz Dec 4, 2018
b449050
Bug fix: Make sample dir before saving run_info
bintriz Dec 4, 2018
1216c22
Adjust threaded and java Xmx to match with m5.12xlarge
bintriz Dec 5, 2018
8f2cf6b
Java memory adjustment
bintriz Dec 9, 2018
3b45165
Update the README file
bintriz Dec 10, 2018
69347c5
Update the README file
bintriz Dec 10, 2018
3 changes: 2 additions & 1 deletion .gitignore
@@ -3,5 +3,6 @@ __pycache__/
*.py[cod]
*$py.class

# Installed tools directory
# tools / resources directories
/tools/
/resources/
66 changes: 52 additions & 14 deletions README.md
@@ -2,36 +2,74 @@
BSMN common data processing pipeline

# Setup and installation
This pipeline can be run on any cluster system that uses the SGE job scheduler. I would recommend setting up your own cluster in AWS using AWS ParallelCluster.

## cfncluster
## AWS ParallelCluster
For installing and setting up ParallelCluster, please see the [`Getting Started Guide`](https://aws-parallelcluster.readthedocs.io/en/latest/getting_started.html) for AWS ParallelCluster.

This pipeline runs using [`cfncluster`](https://cfncluster.readthedocs.io).

## Installing cfncluster
## Installing pipeline
Check out bsmn_pipeline wherever you want it installed, on the AWS ParallelCluster you set up or on the cluster system you are using.
```
$ git clone https://github.com/bsmn/bsmn_pipeline
```

It's recommended to use a Python virtual environment (https://virtualenv.pypa.io/en/stable/).
Install dependent tools by running the following script. For GATK 3.7-0, which this pipeline uses, you should manually download it and put it into the appropriate location following the instructions that the install script gives.
```
$ cd bsmn_pipeline
$ ./install_tools.sh
```

To install `cfncluster`:
Download required resource files, including the reference sequence. This step requires a Synapse account that can access the Synapse page syn17062535.
```
$ ./download_resources.sh
```

## Extra setup for SGE
The pipeline requires a parallel environment named "threaded" in your SGE system. If your SGE system doesn't have this parallel environment, you should add it.
```
$ sudo su
# qconf -Ap << END
pe_name threaded
slots 99999
user_lists NONE
xuser_lists NONE
start_proc_args NONE
stop_proc_args NONE
allocation_rule $pe_slots
control_slaves FALSE
job_is_first_task TRUE
urgency_slots min
accounting_summary TRUE
qsort_args NONE
END
```
```
pip install cfncluster
# qconf -mattr queue pe_list threaded all.q
```

To get the pipeline software installed on the cluster, a post-install script is run after the cluster starts. You can see this file as a GitHub Gist [here](https://gist.github.com/kdaily/1e0a2d1fcef1c6847f743f637301a3d5).

# Usage
## genome_mapping
Run the pipeline using a wrapper shell script.
```bash
genome_mapping/run.py sample_list.txt
genome_mapping.sh sample_list.txt
```

### sample_list.txt format
The first line should be a header line. Eg.
Lines starting with # are treated as comments and ignored. The header line should start with # as well. E.g.
```
#sample_id file_name location
5154_brain-BSMN_REF_brain-534-U01MH106876 bulk_sorted.bam syn10639574
5154_fibroblast-BSMN_REF_fibroblasts-534-U01MH106876 fibroblasts_sorted.bam syn10639575
5154_NeuN_positive-BSMN_REF_NeuN+_E12-677-U01MH106876 E12_MDA_common_sorted.bam s3://nda-bsmn/abyzova_1497485007384/data/E12_MDA_common_sorted.bam
5154_NeuN_positive-BSMN_REF_NeuN+_C12-677-U01MH106876 C12_MDA_common_sorted.bam /efs/data/C12_MDA_common_sorted.bam
```
The "location" column can be a Synapse ID, an S3Uri (of the NDA or a user), or a LocalPath. For data download, the synapse client, the aws client, or a symbolic link will be used, respectively.
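The dispatch on the "location" column can be sketched roughly as follows. This is a hypothetical helper for illustration only; the actual internals of `genome_mapping/run.py` may differ.

```python
def classify_location(location):
    """Classify a sample_list "location" value into its download method.

    Illustrative sketch, not the pipeline's actual code:
    - Synapse IDs (syn...)  -> fetched with the synapse client
    - S3Uris (s3://...)     -> fetched with the aws client
    - anything else         -> treated as a LocalPath and symlinked
    """
    if location.startswith("syn"):
        return "synapse"
    if location.startswith("s3://"):
        return "aws"
    return "symlink"
```

For example, `classify_location("syn10639574")` returns `"synapse"`, while an `/efs/...` path falls through to the symlink case.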

### options
```
sample_id file synapse_id
5154_brain-BSMN_REF_brain-534-U01MH106876 bulk_sorted.bam syn10639574
5154_fibroblast-BSMN_REF_fibroblasts-534-U01MH106876 fibroblasts_sorted.bam syn10639575
--parentid syn123
```
With the --parentid option, you can specify the Synapse ID of a project or folder to which the result bam files should be uploaded. If it is set, the result bam files will be uploaded into Synapse and then deleted from the cluster. Otherwise, they will be kept locally.

# Contributing

28 changes: 28 additions & 0 deletions config.ini
@@ -0,0 +1,28 @@
[TOOLS]
PYTHON3 = tools/python/3.6.2/bin/python3
SYNAPSE = tools/python/3.6.2/bin/synapse
AWS = tools/python/3.6.2/bin/aws
JAVA = tools/java/jdk1.8.0_191/bin/java
BWA = tools/bwa/0.7.16a/bin/bwa
SAMTOOLS = tools/samtools/1.7/bin/samtools
SAMBAMBA = tools/sambamba/v0.6.7/bin/sambamba
GATK = tools/gatk/3.7-0/GenomeAnalysisTK.jar
PICARD = tools/picard/2.12.1/picard.jar
BGZIP = tools/htslib/1.7/bin/bgzip
TABIX = tools/htslib/1.7/bin/tabix
VT = tools/vt/2018-06-07/bin/vt
BCFTOOLS = tools/bcftools/1.7/bin/bcftools
ROOTSYS = tools/root/6.14.00
CNVNATOR = tools/cnvnator/2018-07-09/bin/cnvnator

[RESOURCES]
REFDIR = resources
REF = resources/hs37d5.fa
DBSNP = resources/dbsnp_138.b37.vcf
MILLS = resources/Mills_and_1000G_gold_standard.indels.b37.vcf
INDEL1KG = resources/1000G_phase1.indels.b37.vcf
OMNI = resources/1000G_omni2.5.b37.vcf
HAPMAP = resources/hapmap_3.3.b37.vcf
SNP1KG = resources/1000G_phase1.snps.high_confidence.b37.vcf
KNOWN_GERM_SNP = resources/gnomAD.1KG.ExAC.ESP6500.Kaviar.snps.txt.gz
MASK1KG = resources/20141020.strict_mask.whole_genome.fasta.gz
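Since config.ini uses standard INI syntax, its entries can be read with Python's `configparser`; a minimal sketch (the inline INI snippet and variable names here are illustrative, with only two of the entries shown):

```python
import configparser

# A small excerpt of config.ini, inlined for the sketch;
# the pipeline itself would use config.read("config.ini") instead.
ini_text = """
[TOOLS]
GATK = tools/gatk/3.7-0/GenomeAnalysisTK.jar

[RESOURCES]
REF = resources/hs37d5.fa
"""

config = configparser.ConfigParser()
config.read_string(ini_text)

# Section + key lookups return the path strings as written.
gatk = config["TOOLS"]["GATK"]
ref = config["RESOURCES"]["REF"]
```

Note that the paths are relative to the pipeline root, which is why wrappers such as genome_mapping.sh prepend `$PIPE_HOME` when resolving them.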
32 changes: 32 additions & 0 deletions download_resources.sh
@@ -0,0 +1,32 @@
#!/bin/bash

mkdir -p resources

# Synapse login
tools/python/3.6.2/bin/synapse login --remember-me

# Download and index the human ref genome
tools/python/3.6.2/bin/synapse get syn10347383 --downloadLocation resources/
gunzip resources/hs37d5.fa.gz
tools/bwa/0.7.16a/bin/bwa index resources/hs37d5.fa
tools/samtools/1.7/bin/samtools faidx resources/hs37d5.fa
tools/java/jdk1.8.0_191/bin/java -jar tools/picard/2.12.1/picard.jar \
CreateSequenceDictionary R=resources/hs37d5.fa O=resources/hs37d5.dict

# Download mapping resources
tools/python/3.6.2/bin/synapse get syn17062535 -r --downloadLocation resources/
gunzip resources/*vcf.gz resources/*vcf.idx.gz
rm resources/SYNAPSE_METADATA_MANIFEST.tsv

# Split the ref genome by chromosome
awk '{
r = match($1, "^>");
if (r != 0) {
filename = "resources/chr"substr($1, 2, length($1))".fa";
print $0 > filename;
}
else {
print $0 >> filename;
}
}' resources/hs37d5.fa
rm resources/chrGL* resources/chrhs37d5.fa resources/chrNC_007605.fa
6 changes: 6 additions & 0 deletions genome_mapping.sh
@@ -0,0 +1,6 @@
#!/bin/bash

PIPE_HOME=$(dirname $(readlink -f ${BASH_SOURCE[0]}))
PYTHON3=$PIPE_HOME/$(grep PYTHON3 $PIPE_HOME/config.ini |cut -f2 -d=|sed 's/^ \+//')

$PYTHON3 $PIPE_HOME/genome_mapping/run.py $@
19 changes: 11 additions & 8 deletions genome_mapping/job_scripts/aln_1.align_sort.sh
@@ -1,27 +1,30 @@
#!/bin/bash
#$ -cwd
#$ -pe threaded 36
#$ -pe threaded 24

set -eu -o pipefail
trap "exit 100" ERR

if [[ $# -lt 2 ]]; then
echo "Usage: $(basename $0) [sample name] [PU info]"
exit 1
false
fi

source $(pwd)/run_info

SM=$1
PU=$2

printf -- "[$(date)] Start align_sort.\n---\n"
source $(pwd)/$SM/run_info

set -o nounset
set -o pipefail

printf -- "---\n[$(date)] Start align_sort.\n"

mkdir -p $SM/bam
$BWA mem -M -t 32 \
$BWA mem -M -t $((NSLOTS - 4)) \
-R "@RG\tID:$SM.$PU\tSM:$SM\tPL:illumina\tLB:$SM\tPU:$PU" \
$REF $SM/fastq/$SM.$PU.R{1,2}.fastq.gz \
|$SAMBAMBA view -S -f bam -l 0 /dev/stdin \
|$SAMBAMBA sort -m 24GB -t 3 -o $SM/bam/$SM.$PU.sorted.bam --tmpdir=tmp /dev/stdin
rm $SM/fastq/$SM.$PU.R{1,2}.fastq.gz

printf -- "---\n[$(date)] Finish align_sort.\n"
printf -- "[$(date)] Finish align_sort.\n---\n"
19 changes: 11 additions & 8 deletions genome_mapping/job_scripts/aln_2.merge_bam.sh
@@ -1,26 +1,29 @@
#!/bin/bash
#$ -cwd
#$ -pe threaded 18
#$ -pe threaded 24

set -eu -o pipefail
trap "exit 100" ERR

if [[ $# -lt 1 ]]; then
echo "Usage: $(basename $0) [sample name]"
exit 1
false
fi

source $(pwd)/run_info

SM=$1

printf -- "[$(date)] Start merge_bam.\n---\n"
source $(pwd)/$SM/run_info

set -o nounset
set -o pipefail

printf -- "---\n[$(date)] Start merge_bam.\n"

if [[ $(ls $SM/bam/$SM.*.sorted.bam|wc -l) == 1 ]]; then
mv $SM/bam/$SM.*.sorted.bam $SM/bam/$SM.merged.bam
rm $SM/bam/$SM.*.sorted.bam.bai
else
$SAMBAMBA merge -t 18 $SM/bam/$SM.merged.bam $SM/bam/$SM.*.sorted.bam
$SAMBAMBA merge -t $NSLOTS $SM/bam/$SM.merged.bam $SM/bam/$SM.*.sorted.bam
rm $SM/bam/$SM.*.sorted.bam{,.bai}
fi

printf -- "---\n[$(date)] Finish merge_bam.\n"
printf -- "[$(date)] Finish merge_bam.\n---\n"
17 changes: 10 additions & 7 deletions genome_mapping/job_scripts/aln_3.markdup.sh
@@ -1,19 +1,22 @@
#!/bin/bash
#$ -cwd
#$ -pe threaded 18
#$ -pe threaded 12

set -eu -o pipefail
trap "exit 100" ERR

if [[ $# -lt 1 ]]; then
echo "Usage: $(basename $0) [sample name]"
exit 1
false
fi

source $(pwd)/run_info

SM=$1

printf -- "[$(date)] Start markdup.\n---\n"
source $(pwd)/$SM/run_info

set -o nounset
set -o pipefail

printf -- "---\n[$(date)] Start markdup.\n"

$JAVA -Xmx26G -jar $PICARD MarkDuplicates \
I=$SM/bam/$SM.merged.bam \
@@ -25,4 +28,4 @@ $JAVA -Xmx26G -jar $PICARD MarkDuplicates \

rm $SM/bam/$SM.merged.bam{,.bai}

printf -- "---\n[$(date)] Finish markdup.\n"
printf -- "[$(date)] Finish markdup.\n---\n"
19 changes: 11 additions & 8 deletions genome_mapping/job_scripts/aln_4.indel_realign.sh
@@ -1,22 +1,25 @@
#!/bin/bash
#$ -cwd
#$ -pe threaded 36
#$ -pe threaded 24

set -eu -o pipefail
trap "exit 100" ERR

if [[ $# -lt 1 ]]; then
echo "Usage: $(basename $0) [sample name]"
exit 1
false
fi

source $(pwd)/run_info

SM=$1

printf -- "[$(date)] Start RealignerTargetCreator.\n---\n"
source $(pwd)/$SM/run_info

set -o nounset
set -o pipefail

printf -- "---\n[$(date)] Start RealignerTargetCreator.\n"

$JAVA -Xmx58G -Djava.io.tmpdir=tmp -jar $GATK \
-T RealignerTargetCreator -nt 36 \
-T RealignerTargetCreator -nt $NSLOTS \
-R $REF -known $MILLS -known $INDEL1KG \
-I $SM/bam/$SM.markduped.bam \
-o $SM/realigner.intervals
@@ -32,4 +35,4 @@ $JAVA -Xmx58G -Djava.io.tmpdir=tmp -jar $GATK \
-o $SM/bam/$SM.realigned.bam
rm $SM/bam/$SM.markduped.{bam,bai} $SM/realigner.intervals

printf -- "---\n[$(date)] Finish IndelRealigner.\n"
printf -- "[$(date)] Finish IndelRealigner.\n---\n"
21 changes: 12 additions & 9 deletions genome_mapping/job_scripts/aln_5.bqsr.sh
@@ -1,22 +1,25 @@
#!/bin/bash
#$ -cwd
#$ -pe threaded 36
#$ -pe threaded 24

set -eu -o pipefail
trap "exit 100" ERR

if [[ $# -lt 1 ]]; then
echo "Usage: $(basename $0) [sample name]"
exit 1
false
fi

source $(pwd)/run_info

SM=$1

printf -- "[$(date)] Start BQSR recal_table.\n---\n"
source $(pwd)/$SM/run_info

set -o nounset
set -o pipefail

printf -- "---\n[$(date)] Start BQSR recal_table.\n"

$JAVA -Xmx58G -jar $GATK \
-T BaseRecalibrator -nct 36 \
-T BaseRecalibrator -nct $NSLOTS \
-R $REF -knownSites $DBSNP -knownSites $MILLS -knownSites $INDEL1KG \
-I $SM/bam/$SM.realigned.bam \
-o $SM/recal_data.table
@@ -25,11 +28,11 @@ printf -- "---\n[$(date)] Start BQSR recal_table.\n"
printf -- "---\n[$(date)] Start BQSR PrintReads.\n---\n"

$JAVA -Xmx58G -jar $GATK \
-T PrintReads -nct 36 \
-T PrintReads -nct $NSLOTS \
--emit_original_quals \
-R $REF -BQSR $SM/recal_data.table \
-I $SM/bam/$SM.realigned.bam \
-o $SM/bam/$SM.bam
rm $SM/bam/$SM.realigned.{bam,bai}

printf -- "---\n[$(date)] Finish BQSR PrintReads.\n"
printf -- "[$(date)] Finish BQSR PrintReads.\n---\n"
15 changes: 9 additions & 6 deletions genome_mapping/job_scripts/aln_6.upload_bam.sh
@@ -2,18 +2,21 @@
#$ -cwd
#$ -pe threaded 1

set -eu -o pipefail
trap "exit 100" ERR

if [[ $# -lt 1 ]]; then
echo "Usage: $(basename $0) [sample name]"
exit 1
false
fi

source $(pwd)/run_info

SM=$1

printf -- "[$(date)] Start flagstat: $SM.bam \n---\n"
source $(pwd)/$SM/run_info

set -o nounset
set -o pipefail

printf -- "---\n[$(date)] Start flagstat: $SM.bam\n"

$SAMTOOLS flagstat $SM/bam/$SM.bam > $SM/flagstat.txt

@@ -28,4 +31,4 @@ cd ..
rmdir downloads fastq bam
touch done

printf -- "---\n[$(date)] Finish upload: $SM.{bam,bai}\n"
printf -- "[$(date)] Finish upload: $SM.{bam,bai}\n---\n"