bsmn · YifanWang0801 · Jan 20, 2018 · Jan 20, 2018 · Jan 20, 2018 · Feb 14, 2018
diff --git a/.gitignore b/.gitignore
@@ -2,3 +2,8 @@
 __pycache__/
 *.py[cod]
 *$py.class
+*.swq
+
+# tools / resources directories
+/tools/
+/resources/
diff --git a/README.md b/README.md
@@ -0,0 +1,80 @@
+# bsmn_pipeline
+BSMN common data processing pipeline
+
+# Setup and installation
+This pipeline can be run on any cluster system that uses the SGE job scheduler. I would recommend setting up your own cluster on AWS using AWS ParallelCluster.
+
+## AWS ParallelCluster
+For installing and setting up ParallelCluster, please see the [`Getting Started Guide`](https://aws-parallelcluster.readthedocs.io/en/latest/getting_started.html) for AWS ParallelCluster.
+
+## Installing pipeline
+Check out bsmn_pipeline where you want it installed, either on the AWS ParallelCluster you set up or on the cluster system you are using.
+```
+$ git clone https://github.com/bsmn/bsmn_pipeline
+```
+
+Install the software dependencies into `bsmn_pipeline/tools` by running the following script.
+```
+$ cd bsmn_pipeline
+$ ./install_tools.sh
+```
+
+Download the required resource files, including the reference sequence. This step requires a Synapse account that can access the Synapse page syn17062535.
+```
+$ ./download_resources.sh
+```
+
+## Extra set up for SGE
+The pipeline requires a parallel environment named "threaded" in your SGE system. If your SGE system doesn't have this parallel environment, you should add it.
+```
+$ sudo su
+# qconf -Ap << END
+pe_name            threaded
+slots              99999
+user_lists         NONE
+xuser_lists        NONE
+start_proc_args    NONE
+stop_proc_args     NONE
+allocation_rule    $pe_slots
+control_slaves     FALSE
+job_is_first_task  TRUE
+urgency_slots      min
+accounting_summary TRUE
+qsort_args         NONE
+END
+```
+```
+# qconf -mattr queue pe_list threaded all.q
+```
+
+# Usage
+## genome_mapping
+Run the pipeline using a wrapper shell script.
+```bash
+genome_mapping.sh sample_list.txt
+```
+
+### sample_list.txt format
+Lines starting with # are treated as comments and ignored. The header line should start with # as well. E.g.:
+```
+#sample_id	file_name	location
+5154_brain-BSMN_REF_brain-534-U01MH106876	bulk_sorted.bam	syn10639574
+5154_fibroblast-BSMN_REF_fibroblasts-534-U01MH106876	fibroblasts_sorted.bam	syn10639575
+5154_NeuN_positive-BSMN_REF_NeuN+_E12-677-U01MH106876	E12_MDA_common_sorted.bam	s3://nda-bsmn/abyzova_1497485007384/data/E12_MDA_common_sorted.bam
+5154_NeuN_positive-BSMN_REF_NeuN+_C12-677-U01MH106876	C12_MDA_common_sorted.bam	/efs/data/C12_MDA_common_sorted.bam
+```
+The "location" column can be a Synapse ID, an S3Uri of the NDA or of a user, or a LocalPath. To fetch the data, the synapse client, the aws client, or a symbolic link will be used, respectively.
+
+### options
+```
+--parentid syn123
+```
+With the parentid option, you can specify the Synapse ID of a project or folder to which the result bam files are uploaded. If it is set, the result bam files will be uploaded to Synapse and then deleted locally. Otherwise, they will be kept locally.
+
+# Contributing
+
+The `master` branch is protected. To introduce changes:
+
+1. Fork this repository
+2. Open a branch with your github username and a short descriptive statement (like `kdaily-update-readme`). If there is an open issue on this repository, name your branch after the issue (like `kdaily-issue-7`).
+3. Open a pull request and request a review.
diff --git a/config.ini b/config.ini
@@ -0,0 +1,31 @@
+[TOOLS]
+PYTHON3  = tools/python/3.6.2/bin/python3
+SYNAPSE  = tools/python/3.6.2/bin/synapse
+AWS      = tools/python/3.6.2/bin/aws
+JAVA     = tools/java/jdk1.8.0_191/bin/java
+BWA      = tools/bwa/0.7.16a/bin/bwa
+SAMTOOLS = tools/samtools/1.7/bin/samtools
+SAMBAMBA = tools/sambamba/v0.6.7/bin/sambamba
+GATK     = tools/gatk/3.7-0/GenomeAnalysisTK.jar
+GATK4    = tools/gatk/4.1-2/gatk-package-4.1.2.0-local.jar
+PICARD   = tools/picard/2.12.1/picard.jar
+BGZIP    = tools/htslib/1.7/bin/bgzip
+TABIX    = tools/htslib/1.7/bin/tabix
+VT       = tools/vt/2018-06-07/bin/vt
+BCFTOOLS = tools/bcftools/1.7/bin/bcftools
+ROOTSYS  = tools/root/6.14.00
+CNVNATOR = tools/cnvnator/2018-07-09/bin/cnvnator
+STRELKA = tools/strelka/strelka-2.9.2.centos6_x86_64/bin/configureStrelkaSomaticWorkflow.py
+
+[RESOURCES]
+REFDIR         = resources
+REF            = resources/hs37d5.fa
+DBSNP          = resources/dbsnp_138.b37.vcf
+MILLS          = resources/Mills_and_1000G_gold_standard.indels.b37.vcf
+INDEL1KG       = resources/1000G_phase1.indels.b37.vcf
+OMNI           = resources/1000G_omni2.5.b37.vcf
+HAPMAP         = resources/hapmap_3.3.b37.vcf
+SNP1KG         = resources/1000G_phase1.snps.high_confidence.b37.vcf
+KNOWN_GERM_SNP = resources/gnomAD.1KG.ExAC.ESP6500.Kaviar.snps.txt.gz
+MASK1KG        = resources/20141020.strict_mask.whole_genome.fasta.gz
+GNOMAD         = resources/af-only-gnomad.raw.sites.b37.vcf.gz
diff --git a/download_resources.sh b/download_resources.sh
@@ -0,0 +1,55 @@
+#!/bin/bash
+#
+# Download the reference genome and the known-variant resources that
+# config.ini points at, then build the indexes the pipeline needs.
+#
+# Usage: ./download_resources.sh
+# Requires the tools installed under tools/ (see install_tools.sh) and a
+# Synapse account that can read syn17062535.
+
+# Stop at the first failing step rather than indexing or deleting
+# half-downloaded files.
+set -euo pipefail
+
+# tools/ and resources/ are addressed relative to the pipeline root, so
+# work from the directory that holds this script.
+cd "$(dirname "$(readlink -f "${BASH_SOURCE[0]}")")"
+
+readonly SYNAPSE=tools/python/3.6.2/bin/synapse
+readonly GNOMAD_FTP=ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/Mutect2
+
+mkdir -p resources
+
+# Synapse login
+"$SYNAPSE" login --remember-me
+
+# Download and index the human ref genome
+"$SYNAPSE" get syn10347383 --downloadLocation resources/
+gunzip resources/hs37d5.fa.gz
+tools/bwa/0.7.16a/bin/bwa index resources/hs37d5.fa
+tools/samtools/1.7/bin/samtools faidx resources/hs37d5.fa
+tools/java/jdk1.8.0_191/bin/java -jar tools/picard/2.12.1/picard.jar \
+    CreateSequenceDictionary R=resources/hs37d5.fa O=resources/hs37d5.dict
+
+# Download mapping resources
+"$SYNAPSE" get syn17062535 -r --downloadLocation resources/
+gunzip resources/*vcf.gz resources/*vcf.idx.gz
+rm -f resources/SYNAPSE_METADATA_MANIFEST.tsv
+
+## Download GATK gnomAD
+# Fetched after the gunzip above so this VCF stays compressed, which is
+# the form config.ini (GNOMAD) and the .tbi index refer to.
+wget -P resources "$GNOMAD_FTP/af-only-gnomad.raw.sites.b37.vcf.gz"
+wget -P resources "$GNOMAD_FTP/af-only-gnomad.raw.sites.b37.vcf.gz.tbi"
+
+# Split the ref genome by chromosome: every ">name ..." header starts a
+# new resources/chr<name>.fa. The previous output is closed first so awk
+# never keeps all contig files open at once.
+awk '
+    $1 ~ /^>/ {
+        if (filename != "") close(filename)
+        filename = "resources/chr" substr($1, 2) ".fa"
+    }
+    { print > filename }
+' resources/hs37d5.fa
+rm resources/chrGL* resources/chrhs37d5.fa resources/chrNC_007605.fa
diff --git a/genome_mapping.sh b/genome_mapping.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+#
+# Wrapper that runs genome_mapping/run.py with the Python interpreter
+# named by the PYTHON3 entry of config.ini.
+#
+# Usage: genome_mapping.sh [arguments passed through to run.py]
+
+PIPE_HOME=$(dirname "$(readlink -f "${BASH_SOURCE[0]}")")
+
+# Value of the PYTHON3 entry: the text after "=", trimmed on both sides so
+# the quoted path below carries no stray blanks.
+PYTHON3=$PIPE_HOME/$(grep PYTHON3 "$PIPE_HOME/config.ini" \
+    | cut -f2 -d= | sed 's/^[[:space:]]*//; s/[[:space:]]*$//')
+
+# Quoted so that a pipeline path or an argument containing spaces is
+# handed to run.py intact.
+"$PYTHON3" "$PIPE_HOME/genome_mapping/run.py" "$@"
diff --git a/genome_mapping/job_scripts/aln_1.align_sort.sh b/genome_mapping/job_scripts/aln_1.align_sort.sh
@@ -1,27 +1,30 @@
 #!/bin/bash
 #$ -cwd
-#$ -pe threaded 18
+#$ -pe threaded 24 
 
-set -eu -o pipefail
+trap "exit 100" ERR  # exit with status 100 when a command fails (ERR trap)
 
 if [[ $# -lt 2 ]]; then
     echo "Usage: $(basename $0) [sample name] [PU info]"
-    exit 1
+    false  # fails on purpose so the ERR trap ends the job with status 100
 fi
 
-source $(pwd)/run_info
-
 SM=$1
 PU=$2
 
-printf -- "[$(date)] Start align_sort.\n---\n"
+source $(pwd)/$SM/run_info  # per-sample settings; presumably defines BWA, SAMBAMBA and REF used below (confirm)
+
+set -o nounset 
+set -o pipefail  # strict options are enabled only after run_info has been sourced
+
+printf -- "---\n[$(date)] Start align_sort.\n"
 
 mkdir -p $SM/bam
-$BWA mem -M -t 14 \
+$BWA mem -M -t $((NSLOTS - 4)) \
     -R "@RG\tID:$SM.$PU\tSM:$SM\tPL:illumina\tLB:$SM\tPU:$PU" \
     $REF $SM/fastq/$SM.$PU.R{1,2}.fastq.gz \
     |$SAMBAMBA view -S -f bam -l 0 /dev/stdin \
     |$SAMBAMBA sort -m 24GB -t 3 -o $SM/bam/$SM.$PU.sorted.bam --tmpdir=tmp /dev/stdin 
 rm $SM/fastq/$SM.$PU.R{1,2}.fastq.gz
 
-printf -- "---\n[$(date)] Finish align_sort.\n"
+printf -- "[$(date)] Finish align_sort.\n---\n"
diff --git a/genome_mapping/job_scripts/aln_2.merge_bam.sh b/genome_mapping/job_scripts/aln_2.merge_bam.sh
@@ -1,25 +1,29 @@
 #!/bin/bash
 #$ -cwd
-#$ -pe threaded 18
+#$ -pe threaded 24
 
-set -eu -o pipefail
+trap "exit 100" ERR  # exit with status 100 when a command fails (ERR trap)
 
 if [[ $# -lt 1 ]]; then
     echo "Usage: $(basename $0) [sample name]"
-    exit 1
+    false  # fails on purpose so the ERR trap ends the job with status 100
 fi
 
-source $(pwd)/run_info
-
 SM=$1
 
-printf -- "[$(date)] Start merge_bam.\n---\n"
+source $(pwd)/$SM/run_info  # per-sample settings; presumably defines SAMBAMBA used below (confirm)
+
+set -o nounset
+set -o pipefail  # strict options are enabled only after run_info has been sourced
+
+printf -- "---\n[$(date)] Start merge_bam.\n"
 
 if [[ $(ls $SM/bam/$SM.*.sorted.bam|wc -l) == 1 ]]; then
     mv $SM/bam/$SM.*.sorted.bam $SM/bam/$SM.merged.bam
+    rm $SM/bam/$SM.*.sorted.bam.bai  # the single BAM was renamed above, so only its index is left to remove
 else
-    $SAMBAMBA merge -t 18 $SM/bam/$SM.merged.bam $SM/bam/$SM.*.sorted.bam
+    $SAMBAMBA merge -t $NSLOTS $SM/bam/$SM.merged.bam $SM/bam/$SM.*.sorted.bam  # NSLOTS is not set in this script; presumably provided by SGE (confirm)
+    rm $SM/bam/$SM.*.sorted.bam{,.bai}  # remove the merge inputs together with their .bai indexes
 fi
-rm $SM/bam/$SM.*.sorted.bam{,.bai}
 
-printf -- "---\n[$(date)] Finish merge_bam.\n"
+printf -- "[$(date)] Finish merge_bam.\n---\n"
diff --git a/genome_mapping/job_scripts/aln_3.markdup.sh b/genome_mapping/job_scripts/aln_3.markdup.sh
@@ -1,19 +1,22 @@
 #!/bin/bash
 #$ -cwd
-#$ -pe threaded 18 
+#$ -pe threaded 12
 
-set -eu -o pipefail
+trap "exit 100" ERR  # exit with status 100 when a command fails (ERR trap)
 
 if [[ $# -lt 1 ]]; then
     echo "Usage: $(basename $0) [sample name]"
-    exit 1
+    false  # fails on purpose so the ERR trap ends the job with status 100
 fi
 
-source $(pwd)/run_info
-
 SM=$1
 
-printf -- "[$(date)] Start markdup.\n---\n"
+source $(pwd)/$SM/run_info  # per-sample settings; presumably defines JAVA and PICARD used below (confirm)
+
+set -o nounset
+set -o pipefail  # strict options are enabled only after run_info has been sourced
+
+printf -- "---\n[$(date)] Start markdup.\n"
 
 $JAVA -Xmx26G -jar $PICARD MarkDuplicates \
     I=$SM/bam/$SM.merged.bam \
@@ -25,4 +28,4 @@ $JAVA -Xmx26G -jar $PICARD MarkDuplicates \
 
 rm $SM/bam/$SM.merged.bam{,.bai}
 
-printf -- "---\n[$(date)] Finish markdup.\n"
+printf -- "[$(date)] Finish markdup.\n---\n"
diff --git a/genome_mapping/job_scripts/aln_4.indel_realign.sh b/genome_mapping/job_scripts/aln_4.indel_realign.sh
@@ -1,23 +1,26 @@
 #!/bin/bash
 #$ -cwd
-#$ -pe threaded 36 
+#$ -pe threaded 24
 
-set -eu -o pipefail
+trap "exit 100" ERR  # exit with status 100 when a command fails (ERR trap)
 
 if [[ $# -lt 1 ]]; then
     echo "Usage: $(basename $0) [sample name]"
-    exit 1
+    false  # fails on purpose so the ERR trap ends the job with status 100
 fi
 
-source $(pwd)/run_info
-
 SM=$1
 
-printf -- "[$(date)] Start RealignerTargetCreator.\n---\n"
+source $(pwd)/$SM/run_info  # per-sample settings; presumably defines JAVA, GATK, REF, MILLS and INDEL1KG used below (confirm)
+
+set -o nounset
+set -o pipefail  # strict options are enabled only after run_info has been sourced
+
+printf -- "---\n[$(date)] Start RealignerTargetCreator.\n"
 
 $JAVA -Xmx58G -Djava.io.tmpdir=tmp -jar $GATK \
-    -T RealignerTargetCreator -nt 36 \
-    -R $REF -known $MILLS -known $ONEKG \
+    -T RealignerTargetCreator -nt $NSLOTS \
+    -R $REF -known $MILLS -known $INDEL1KG \
     -I $SM/bam/$SM.markduped.bam \
     -o $SM/realigner.intervals
 
@@ -26,10 +29,10 @@ printf -- "---\n[$(date)] Start IndelRealigner.\n---\n"
 
 $JAVA -Xmx58G -Djava.io.tmpdir=tmp -jar $GATK \
     -T IndelRealigner \
-    -R $REF -known $MILLS -known $ONEKG \
+    -R $REF -known $MILLS -known $INDEL1KG \
     -targetIntervals $SM/realigner.intervals \
     -I $SM/bam/$SM.markduped.bam \
     -o $SM/bam/$SM.realigned.bam
 rm $SM/bam/$SM.markduped.{bam,bai} $SM/realigner.intervals
 
-printf -- "---\n[$(date)] Finish IndelRealigner.\n"
+printf -- "[$(date)] Finish IndelRealigner.\n---\n"
diff --git a/genome_mapping/job_scripts/aln_5.bqsr.sh b/genome_mapping/job_scripts/aln_5.bqsr.sh
@@ -1,35 +1,38 @@
 #!/bin/bash
 #$ -cwd
-#$ -pe threaded 36 
+#$ -pe threaded 24
 
-set -eu -o pipefail
+trap "exit 100" ERR  # exit with status 100 when a command fails (ERR trap)
 
 if [[ $# -lt 1 ]]; then
     echo "Usage: $(basename $0) [sample name]"
-    exit 1
+    false  # fails on purpose so the ERR trap ends the job with status 100
 fi
 
-source $(pwd)/run_info
-
 SM=$1
 
-printf -- "[$(date)] Start BQSR recal_table.\n---\n"
+source $(pwd)/$SM/run_info  # per-sample settings; presumably defines JAVA, GATK, REF, DBSNP, MILLS and INDEL1KG used below (confirm)
+
+set -o nounset
+set -o pipefail  # strict options are enabled only after run_info has been sourced
+
+printf -- "---\n[$(date)] Start BQSR recal_table.\n"
 
 $JAVA -Xmx58G -jar $GATK \
-    -T BaseRecalibrator -nct 36 \
-    -R $REF -knownSites $DBSNP -knownSites $MILLS -knownSites $ONEKG \
+    -T BaseRecalibrator -nct $NSLOTS \
+    -R $REF -knownSites $DBSNP -knownSites $MILLS -knownSites $INDEL1KG \
     -I $SM/bam/$SM.realigned.bam \
     -o $SM/recal_data.table
 
 printf -- "---\n[$(date)] Start BQSR recal_table.\n"
 printf -- "---\n[$(date)] Start BQSR PrintReads.\n---\n"
 
 $JAVA -Xmx58G -jar $GATK \
-    -T PrintReads -nct 36 \
+    -T PrintReads -nct $NSLOTS \
     --emit_original_quals \
     -R $REF -BQSR $SM/recal_data.table \
     -I $SM/bam/$SM.realigned.bam \
     -o $SM/bam/$SM.bam
 rm $SM/bam/$SM.realigned.{bam,bai}
 
-printf -- "---\n[$(date)] Finish BQSR PrintReads.\n"
+printf -- "[$(date)] Finish BQSR PrintReads.\n---\n"