From 812f07b906fb5d4dec95a627305e5d38a11aefe3 Mon Sep 17 00:00:00 2001
From: Taejeong Bae
Date: Sat, 20 Jan 2018 15:10:00 -0600
Subject: [PATCH 01/70] Create README.md

---
 README.md | 15 +++++++++++++++
 1 file changed, 15 insertions(+)
 create mode 100644 README.md

diff --git a/README.md b/README.md
new file mode 100644
index 0000000..b3e0dad
--- /dev/null
+++ b/README.md
@@ -0,0 +1,15 @@
+# bsmn_pipeline
+BSMN common data processing pipeline
+
+# Usage
+```bash
+genome_mapping/run.py sample_list.txt
+```
+
+## sample_list.txt format
+The first line should be a header line. E.g.
+```
+sample_id file synapse_id
+5154_brain-BSMN_REF_brain-534-U01MH106876 bulk_sorted.bam syn10639574
+5154_fibroblast-BSMN_REF_fibroblasts-534-U01MH106876 fibroblasts_sorted.bam syn10639575
+```

From 9fd1b601d434fc9c7e47790812d23b9786afde68 Mon Sep 17 00:00:00 2001
From: cfncluster user
Date: Sat, 20 Jan 2018 15:14:26 -0600
Subject: [PATCH 02/70] Minor fix: remove the downloaded BAM after fastq
 conversion.

---
 genome_mapping/job_scripts/pre_1b.bam2fastq.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/genome_mapping/job_scripts/pre_1b.bam2fastq.sh b/genome_mapping/job_scripts/pre_1b.bam2fastq.sh
index 1840146..ed128fe 100755
--- a/genome_mapping/job_scripts/pre_1b.bam2fastq.sh
+++ b/genome_mapping/job_scripts/pre_1b.bam2fastq.sh
@@ -29,6 +29,6 @@ fi
 $SAMTOOLS collate -uOn $TMP_N $SM/downloads/$FNAME $SM/tmp.collate \
     |$SAMTOOLS fastq -F 0x900 -1 $SM/fastq/$SM.R1.fastq.gz -2 $SM/fastq/$SM.R2.fastq.gz -
 
-#rm $SM/downloads/$FNAME
+rm $SM/downloads/$FNAME
 
 printf -- "---\n[$(date)] Finish bam2fastq: $FNAME\n"

From 39f271ba6e30b9fc4a9e61bc90f2f627c1fe38f3 Mon Sep 17 00:00:00 2001
From: cfncluster user
Date: Sat, 20 Jan 2018 15:17:42 -0600
Subject: [PATCH 03/70] Edit README.

---
 README.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index b3e0dad..93443c7 100644
--- a/README.md
+++ b/README.md
@@ -2,11 +2,12 @@ BSMN common data processing pipeline
 
 # Usage
+## genome_mapping
 ```bash
 genome_mapping/run.py sample_list.txt
 ```
 
-## sample_list.txt format
+### sample_list.txt format
 The first line should be a header line. E.g.
 ```
 sample_id file synapse_id
 5154_brain-BSMN_REF_brain-534-U01MH106876 bulk_sorted.bam syn10639574

From 19fe1f7ab07e6068d2c1b6a8de0170ee80fb15b4 Mon Sep 17 00:00:00 2001
From: cfncluster user
Date: Wed, 14 Feb 2018 13:05:42 -0600
Subject: [PATCH 04/70] A small change in removing intermediate files.

---
 genome_mapping/job_scripts/aln_2.merge_bam.sh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/genome_mapping/job_scripts/aln_2.merge_bam.sh b/genome_mapping/job_scripts/aln_2.merge_bam.sh
index 3eb2e4f..ff53fc5 100755
--- a/genome_mapping/job_scripts/aln_2.merge_bam.sh
+++ b/genome_mapping/job_scripts/aln_2.merge_bam.sh
@@ -17,9 +17,10 @@ printf -- "[$(date)] Start merge_bam.\n---\n"
 
 if [[ $(ls $SM/bam/$SM.*.sorted.bam|wc -l) == 1 ]]; then
     mv $SM/bam/$SM.*.sorted.bam $SM/bam/$SM.merged.bam
+    rm $SM/bam/$SM.*.sorted.bam.bai
 else
     $SAMBAMBA merge -t 18 $SM/bam/$SM.merged.bam $SM/bam/$SM.*.sorted.bam
+    rm $SM/bam/$SM.*.sorted.bam{,.bai}
 fi
-rm $SM/bam/$SM.*.sorted.bam{,.bai}
 
 printf -- "---\n[$(date)] Finish merge_bam.\n"

From eb333f4c7ebb195e488ac9d095fe903eb2dc1fdb Mon Sep 17 00:00:00 2001
From: cfncluster user
Date: Wed, 14 Feb 2018 13:06:36 -0600
Subject: [PATCH 05/70] Adjust the threaded option.
---
 genome_mapping/job_scripts/aln_1.align_sort.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/genome_mapping/job_scripts/aln_1.align_sort.sh b/genome_mapping/job_scripts/aln_1.align_sort.sh
index 6d1f534..e16874e 100755
--- a/genome_mapping/job_scripts/aln_1.align_sort.sh
+++ b/genome_mapping/job_scripts/aln_1.align_sort.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 #$ -cwd
-#$ -pe threaded 18
+#$ -pe threaded 36
 
 set -eu -o pipefail
 
@@ -17,7 +17,7 @@ PU=$2
 printf -- "[$(date)] Start align_sort.\n---\n"
 
 mkdir -p $SM/bam
-$BWA mem -M -t 14 \
+$BWA mem -M -t 32 \
     -R "@RG\tID:$SM.$PU\tSM:$SM\tPL:illumina\tLB:$SM\tPU:$PU" \
     $REF $SM/fastq/$SM.$PU.R{1,2}.fastq.gz \
     |$SAMBAMBA view -S -f bam -l 0 /dev/stdin \

From 2ede371ed6eeaa570ed430fc1e15ce28a967120b Mon Sep 17 00:00:00 2001
From: cfncluster user
Date: Wed, 14 Feb 2018 13:18:50 -0600
Subject: [PATCH 06/70] Add an analysis script calculating VAF.

---
 analysis_utils/snv_af.py | 153 +++++++++++++++++++++++++++++++++
 1 file changed, 153 insertions(+)
 create mode 100755 analysis_utils/snv_af.py

diff --git a/analysis_utils/snv_af.py b/analysis_utils/snv_af.py
new file mode 100755
index 0000000..a6b3f89
--- /dev/null
+++ b/analysis_utils/snv_af.py
@@ -0,0 +1,153 @@
+#!/shared/apps/pyenv/versions/3.6.2/bin/python
+
+import argparse
+import re
+import subprocess
+import sys
+import os
+
+
+def print_af(args):
+    af_routines = [calc_af(pileup(args.clone, args.min_MQ, args.min_BQ, clean(count())))]
+    header = '#chr\tpos\tref\talt\tcl_af\tcl_depth\tcl_ref_n\tcl_alt_n\tcl_base_count'
+    if args.tissue != None:
+        af_routines.append(calc_af(pileup(args.tissue, args.min_MQ, args.min_BQ, clean(count()))))
+        header = header + '\tti_af\tti_depth\tti_ref_n\tti_alt_n\tti_base_count'
+    printer(header)
+    for snv in args.infile:
+        if snv[0] == '#':
+            continue
+        chrom, pos, ref, alt = snv.strip().split()[:4]
+        af = '\t'.join([af_routine.send((chrom, pos, ref, alt)) for af_routine in af_routines])
+        printer('{}\t{}\t{}\t{}\t{}'.format(chrom, pos, ref.upper(), alt.upper(), af))
+
+
+def coroutine(func):
+    def start(*args, **kwargs):
+        g = func(*args, **kwargs)
+        g.__next__()
+        return g
+    return start
+
+@coroutine
+def calc_af(target):
+    result = None
+    while True:
+        chrom, pos, ref, alt = (yield result)
+        base_n = target.send((chrom, pos))
+        total = sum(base_n.values())
+        ref_n = base_n[ref.upper()] + base_n[ref.lower()]
+        alt_n = base_n[alt.upper()] + base_n[alt.lower()]
+        try:
+            af = alt_n / total
+        except ZeroDivisionError:
+            af = 0
+        result = '{:f}\t{}\t{}\t{}\tA={A},C={C},G={G},T={T},a={a},c={c},g={g},t={t},dels={dels}'.format(
+            af, total, ref_n, alt_n, **base_n)
+
+@coroutine
+def pileup(bam, min_MQ, min_BQ, target):
+    result = None
+    while True:
+        chrom, pos = (yield result)
+        cmd = ['samtools', 'mpileup', '-d', '8000',
+               '-q', str(min_MQ), '-Q', str(min_BQ),
+               '-r', '{}:{}-{}'.format(chrom, pos, pos), bam]
+        cmd_out = subprocess.run(
+            cmd, universal_newlines=True,
+            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+        try:
+            cmd_out.check_returncode()
+        except subprocess.CalledProcessError:
+            sys.exit(cmd_out.stderr)
+        try:
+            bases = cmd_out.stdout.split()[4]
+        except IndexError:
+            bases = ''
+        result = target.send(bases)
+
+@coroutine
+def clean(target):
+    result = None
+    while True:
+        bases = (yield result)
+        bases = re.sub('\^.', '', bases)
+        bases = re.sub('\$', '', bases)
+        for n in set(re.findall('-(\d+)', bases)):
+            bases = re.sub('-{0}[ACGTNacgtn]{{{0}}}'.format(n), '', bases)
+        for n in set(re.findall('\+(\d+)', bases)):
+            bases = re.sub('\+{0}[ACGTNacgtn]{{{0}}}'.format(n), '', bases)
+        result = target.send(bases)
+
+@coroutine
+def count():
+    result = None
+    while True:
+        bases = (yield result)
+        base_n = {}
+        base_n['A'] = bases.count('A')
+        base_n['C'] = bases.count('C')
+        base_n['G'] = bases.count('G')
+        base_n['T'] = bases.count('T')
+        base_n['a'] = bases.count('a')
+        base_n['c'] = bases.count('c')
+        base_n['g'] = bases.count('g')
+        base_n['t'] = bases.count('t')
+        base_n['dels'] = bases.count('*')
+        result = base_n
+
+def printer(out):
+    try:
+        print(out, flush=True)
+    except BrokenPipeError:
+        try:
+            sys.stdout.close()
+        except BrokenPipeError:
+            pass
+        try:
+            sys.stderr.close()
+        except BrokenPipeError:
+            pass
+
+def main():
+    parser = argparse.ArgumentParser(
+        description='Calculate allele freqeuency for SNV')
+
+    parser.add_argument(
+        '-c', '--clone', metavar='FILE',
+        help='clone bam file',
+        required=True)
+
+    parser.add_argument(
+        '-t', '--tissue', metavar='FILE',
+        help='tissue bam file')
+
+    parser.add_argument(
+        '-q', '--min-MQ', metavar='INT',
+        help='mapQ cutoff value [20]',
+        type=int, default=20)
+
+    parser.add_argument(
+        '-Q', '--min-BQ', metavar='INT',
+        help='baseQ/BAQ cutoff value [13]',
+        type=int, default=13)
+
+    parser.add_argument(
+        'infile', metavar='snv_list.txt',
+        help='''SNV list.
+            Each line format is "chr\\tpos\\tref\\talt".
+            Trailing columns will be ignored. [STDIN]''',
+        nargs='?', type=argparse.FileType('r'),
+        default=sys.stdin)
+
+    parser.set_defaults(func=print_af)
+
+    args = parser.parse_args()
+
+    if(len(vars(args)) == 0):
+        parser.print_help()
+    else:
+        args.func(args)
+
+if __name__ == "__main__":
+    main()

From dd2c8be001568f09a619deb095e9198603171c9b Mon Sep 17 00:00:00 2001
From: cfncluster user
Date: Wed, 14 Feb 2018 13:19:30 -0600
Subject: [PATCH 07/70] Add nda related utils.

---
 analysis_utils/nda_aws_token.sh | 99 +++++++++++++++++++++++++++
 analysis_utils/nda_s3_path.py   | 19 +++++++
 2 files changed, 118 insertions(+)
 create mode 100755 analysis_utils/nda_aws_token.sh
 create mode 100755 analysis_utils/nda_s3_path.py

diff --git a/analysis_utils/nda_aws_token.sh b/analysis_utils/nda_aws_token.sh
new file mode 100755
index 0000000..23b012d
--- /dev/null
+++ b/analysis_utils/nda_aws_token.sh
@@ -0,0 +1,99 @@
+#!/bin/bash
+
+show_usage() {
+    cat < $OPTARG; exit 0;;
+        \?) echo "Unknown option: -$OPTARG" >&2
+            show_usage >&2; exit 1;;
+        : ) echo "Missing option argument for -$OPTARG" >&2
+            show_usage >&2; exit 1;;
+    esac
+done
+
+if [ -z $nda_cred_f ]; then
+    read_credential
+elif [ ! -e $nda_cred_f ]; then
+    echo "$nda_cred_f file doesn't exist." >&2; exit 1
+else
+    source $nda_cred_f
+fi
+
+#username=baetj
+#password=fb0dc634e9179ebc3460d7a2a8c05cc3905fa00f
+server="https://ndar.nih.gov/DataManager/dataManager"
+
+##############################################################################
+# Make Request
+##############################################################################
+REQUEST_XML=$(cat <
+
+
+
+
+0
+${username}
+${password}
+0
+
+
+
+
+EOF
+)
+RESPONSE_XML="$(curl -k -s --request POST -H "Content-Type: text/xml" -H "SOAPAction: \"generateToken\"" -d "$REQUEST_XML" $server)"
+
+##############################################################################
+# Handle Response
+##############################################################################
+ERROR=$(echo $RESPONSE_XML | grep -oP '(?<=<errorMessage>).*(?=</errorMessage>)')
+if [ -n "$ERROR" ]; then
+    echo "Error requesting token: $ERROR"
+    exit 1;
+fi
+
+accessKey=$(echo $RESPONSE_XML | grep -oP '(?<=<accessKey>).*(?=</accessKey>)')
+secretKey=$(echo $RESPONSE_XML | grep -oP '(?<=<secretKey>).*(?=</secretKey>)')
+sessionToken=$(echo $RESPONSE_XML | grep -oP '(?<=<sessionToken>).*(?=</sessionToken>)')
+expirationDate=$(echo $RESPONSE_XML | grep -oP '(?<=<expirationDate>).*(?=</expirationDate>)')
+
+
+##############################################################################
+# Write Token
+##############################################################################
+echo "export AWS_ACCESS_KEY_ID=$accessKey"
+echo "export AWS_SECRET_ACCESS_KEY=$secretKey"
+echo "export AWS_SESSION_TOKEN=$sessionToken"

diff --git a/analysis_utils/nda_s3_path.py b/analysis_utils/nda_s3_path.py
new file mode 100755
index 0000000..ddfa2f9
--- /dev/null
+++ b/analysis_utils/nda_s3_path.py
@@ -0,0 +1,19 @@
+#!/shared/apps/pyenv/versions/3.6.2/bin/python
+
+import synapseclient
+import sys
+import argparse
+
+parser = argparse.ArgumentParser()
+parser.add_argument('synid', help='synapse id')
+args = parser.parse_args()
+
+synid = args.synid
+syn = synapseclient.login(silent=True)
+ent = syn.get(synid, downloadFile=False)
+fh = syn._getFileHandleDownload(
+    fileHandleId=ent.properties.dataFileHandleId,
+    objectId=ent.properties.id)
+print("s3://{bucketName}/{key}".format(
+    bucketName=fh['fileHandle']['bucketName'],
+    key=fh['fileHandle']['key']))

From 41df67f165de3b6156d8f7cd7faaf48ff511a3fa Mon Sep 17 00:00:00 2001
From: cfncluster user
Date: Mon, 19 Feb 2018 11:50:31 -0600
Subject: [PATCH 08/70] Add a script for strand bias analysis.
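
The two per-site tests the script reports can be sketched in plain scipy as
follows. The fisher_exact call is the same one strand_bias.py makes; the
binomial form stands in for R's poisson.test(c(fwd, rev)), which the script
calls through rpy2 and which reduces to an equal-split binomial test when the
expected rate ratio is 1. All counts here are invented for illustration.

```python
from scipy.stats import fisher_exact, binom_test

ref_fwd, ref_rev = 30, 28   # reference-allele reads per strand (example values)
alt_fwd, alt_rev = 12, 1    # alternate-allele reads per strand (example values)

# Overall strand balance: are forward/reverse totals compatible with 1:1?
total_fwd = ref_fwd + alt_fwd
total_rev = ref_rev + alt_rev
p_poisson = binom_test(total_fwd, total_fwd + total_rev, 0.5)

# Allele-specific bias: does the alt allele favor one strand more than ref does?
_, p_fisher = fisher_exact([[ref_fwd, alt_fwd], [ref_rev, alt_rev]])

print('p_poisson={:e}\tp_fisher={:e}'.format(p_poisson, p_fisher))
```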
--- analysis_utils/strand_bias.py | 182 ++++++++++++++++++++++++++++++++++ 1 file changed, 182 insertions(+) create mode 100755 analysis_utils/strand_bias.py diff --git a/analysis_utils/strand_bias.py b/analysis_utils/strand_bias.py new file mode 100755 index 0000000..7f76e5b --- /dev/null +++ b/analysis_utils/strand_bias.py @@ -0,0 +1,182 @@ +#!/shared/apps/pyenv/versions/3.6.2/bin/python + +import argparse +import re +import subprocess +import sys +import os +import math +from rpy2.robjects import r +from scipy.stats import fisher_exact + +def run(args): + s_info = strand_info(pileup(args.bam, args.min_MQ, args.min_BQ, clean(count()))) + header = ('#chr\tpos\tref\talt\t' + + 'total\ttotal_fwd\ttotal_rev\ttotal_ratio\t' + + 'p_poisson\t' + + 'ref_n\tref_fwd\tref_rev\tref_ratio\t' + + 'alt_n\talt_fwd\talt_rev\talt_ratio\t' + + 'p_fisher') + printer(header) + for snv in args.infile: + if snv[0] == '#': + continue + chrom, pos, ref, alt = snv.strip().split()[:4] + printer('{chrom}\t{pos}\t{ref}\t{alt}\t{strand_info}'.format( + chrom=chrom, pos=pos, ref=ref.upper(), alt=alt.upper(), + strand_info=s_info.send((chrom, pos, ref, alt)))) + +def coroutine(func): + def start(*args, **kwargs): + g = func(*args, **kwargs) + g.__next__() + return g + return start + +@coroutine +def strand_info(target): + result = None + while True: + chrom, pos, ref, alt = (yield result) + base_n = target.send((chrom, pos)) + total = sum(base_n.values()) + total_fwd = sum(list(base_n.values())[:4]) + total_rev = sum(list(base_n.values())[4:8]) + try: + total_ratio = total_fwd/total_rev + except ZeroDivisionError: + total_ratio = math.inf + ref_n = base_n[ref.upper()] + base_n[ref.lower()] + ref_fwd = base_n[ref.upper()] + ref_rev = base_n[ref.lower()] + try: + ref_ratio = ref_fwd/ref_rev + except ZeroDivisionError: + ref_ratio = math.inf + alt_n = base_n[alt.upper()] + base_n[alt.lower()] + alt_fwd = base_n[alt.upper()] + alt_rev = base_n[alt.lower()] + try: + alt_ratio = alt_fwd/alt_rev + except ZeroDivisionError: + alt_ratio = math.inf + + result = ('{total}\t{total_fwd}\t{total_rev}\t{total_ratio:f}\t' + + '{p_poisson:f}\t' + + '{ref_n}\t{ref_fwd}\t{ref_rev}\t{ref_ratio:f}\t' + + '{alt_n}\t{alt_fwd}\t{alt_rev}\t{alt_ratio:f}\t' + + '{p_fisher:f}').format( + total=total, total_fwd=total_fwd, total_rev=total_rev, total_ratio=total_ratio, + p_poisson=p_poisson(total_fwd, total_rev), + ref_n=ref_n, ref_fwd=ref_fwd, ref_rev=ref_rev, ref_ratio=ref_ratio, + alt_n=alt_n, alt_fwd=alt_fwd, alt_rev=alt_rev, alt_ratio=alt_ratio, + p_fisher=p_fisher(ref_fwd, alt_fwd, ref_rev, alt_rev)) + +@coroutine +def pileup(bam, min_MQ, min_BQ, target): + result = None + while True: + chrom, pos = (yield result) + cmd = ['samtools', 'mpileup', '-d', '8000', + '-q', str(min_MQ), '-Q', str(min_BQ), + '-r', '{}:{}-{}'.format(chrom, pos, pos), bam] + cmd_out = subprocess.run( + cmd, universal_newlines=True, + stdout=subprocess.PIPE, stderr=subprocess.PIPE) + try: + cmd_out.check_returncode() + except subprocess.CalledProcessError: + sys.exit(cmd_out.stderr) + try: + bases = cmd_out.stdout.split()[4] + except IndexError: + bases = '' + result = target.send(bases) + +@coroutine +def clean(target): + result = None + while True: + bases = (yield result) + bases = re.sub('\^.', '', bases) + bases = re.sub('\$', '', bases) + for n in set(re.findall('-(\d+)', bases)): + bases = re.sub('-{0}[ACGTNacgtn]{{{0}}}'.format(n), '', bases) + for n in set(re.findall('\+(\d+)', bases)): + bases = re.sub('\+{0}[ACGTNacgtn]{{{0}}}'.format(n), '', bases) + result 
= target.send(bases) + +@coroutine +def count(): + result = None + while True: + bases = (yield result) + base_n = {} + base_n['A'] = bases.count('A') + base_n['C'] = bases.count('C') + base_n['G'] = bases.count('G') + base_n['T'] = bases.count('T') + base_n['a'] = bases.count('a') + base_n['c'] = bases.count('c') + base_n['g'] = bases.count('g') + base_n['t'] = bases.count('t') + base_n['dels'] = bases.count('*') + result = base_n + +def printer(out): + try: + print(out, flush=True) + except BrokenPipeError: + try: + sys.stdout.close() + except BrokenPipeError: + pass + try: + sys.stderr.close() + except BrokenPipeError: + pass + +def p_poisson(n_fwd, n_rev): + return r("poisson.test(c({},{}))$p.value".format(n_fwd, n_rev))[0] + +def p_fisher(ref_fwd, alt_fwd, ref_rev, alt_rev): + return fisher_exact([[ref_fwd, alt_fwd], [ref_rev, alt_rev]])[1] + +def main(): + parser = argparse.ArgumentParser( + description='Check strand bias for SNV') + + parser.add_argument( + '-b', '--bam', metavar='FILE', + help='bam file', + required=True) + + parser.add_argument( + '-q', '--min-MQ', metavar='INT', + help='mapQ cutoff value [20]', + type=int, default=20) + + parser.add_argument( + '-Q', '--min-BQ', metavar='INT', + help='baseQ/BAQ cutoff value [13]', + type=int, default=13) + + parser.add_argument( + 'infile', metavar='snv_list.txt', + help='''SNV list. + Each line format is "chr\\tpos\\tref\\talt". + Trailing columns will be ignored. [STDIN]''', + nargs='?', type=argparse.FileType('r'), + default=sys.stdin) + + parser.set_defaults(func=run) + + args = parser.parse_args() + + if(len(vars(args)) == 0): + parser.print_help() + else: + args.func(args) + +if __name__ == "__main__": + main() From ecac128a2f2f9b016391a6ccd5fc72c300ef1326 Mon Sep 17 00:00:00 2001 From: cfncluster user Date: Wed, 21 Feb 2018 10:58:19 -0600 Subject: [PATCH 09/70] Change VAF script. 
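
The main behavioral change in the shared pileup coroutine is a bounded retry
around the samtools mpileup call instead of exiting on the first failure. A
condensed sketch of that pattern, with a placeholder command and BAM path:

```python
import subprocess
import sys

cmd = ['samtools', 'mpileup', '-r', '1:1000000-1000000', 'sample.bam']  # placeholder

max_retries = 5
n_retries = 0
while True:
    cmd_out = subprocess.run(cmd, universal_newlines=True,
                             stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    try:
        cmd_out.check_returncode()
        break                                  # success: stop retrying
    except subprocess.CalledProcessError:
        sys.stderr.write(cmd_out.stderr)
        if n_retries < max_retries:
            n_retries += 1
            sys.stderr.write("Retry pileup...\n")
        else:
            sys.exit("Failed in pileup.")
```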
--- analysis_utils/library/__init__.py | 0 analysis_utils/library/pileup.py | 66 +++++++++++++ analysis_utils/library/utils.py | 21 ++++ analysis_utils/snv_af.py | 153 ----------------------------- analysis_utils/somatic_vaf.py | 78 +++++++++++++++ 5 files changed, 165 insertions(+), 153 deletions(-) create mode 100644 analysis_utils/library/__init__.py create mode 100644 analysis_utils/library/pileup.py create mode 100644 analysis_utils/library/utils.py delete mode 100755 analysis_utils/snv_af.py create mode 100755 analysis_utils/somatic_vaf.py diff --git a/analysis_utils/library/__init__.py b/analysis_utils/library/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/analysis_utils/library/pileup.py b/analysis_utils/library/pileup.py new file mode 100644 index 0000000..f5bdb7c --- /dev/null +++ b/analysis_utils/library/pileup.py @@ -0,0 +1,66 @@ +import re +import subprocess +from .utils import coroutine + +@coroutine +def pileup(bam, min_MQ, min_BQ, target): + result = None + while True: + chrom, pos = (yield result) + cmd = ['samtools', 'mpileup', '-d', '8000', + '-q', str(min_MQ), '-Q', str(min_BQ), + '-r', '{}:{}-{}'.format(chrom, pos, pos), bam] + + max_retries = 5 + n_retries = 0 + while True: + cmd_out = subprocess.run( + cmd, universal_newlines=True, + stdout=subprocess.PIPE, stderr=subprocess.PIPE) + try: + cmd_out.check_returncode() + break + except subprocess.CalledProcessError: + if n_retries == 0: sys.stderr.write("\n") + sys.stderr.write("{pileup_err}source bam: {bam}\npileup location: {chrom}:{pos}-{pos}\n".format( + pileup_err=cmd_out.stderr, bam=bam, chrom=chrom, pos=pos)) + if n_retries < max_retries: + n_retries += 1 + sys.stderr.write("Retry pileup...\n") + else: + sys.exit("Failed in pileup.") + try: + bases = cmd_out.stdout.split()[4] + except IndexError: + bases = '' + result = target.send(bases) + +@coroutine +def clean(target): + result = None + while True: + bases = (yield result) + bases = re.sub('\^.', '', bases) + bases = re.sub('\$', '', bases) + for n in set(re.findall('-(\d+)', bases)): + bases = re.sub('-{0}[ACGTNacgtn]{{{0}}}'.format(n), '', bases) + for n in set(re.findall('\+(\d+)', bases)): + bases = re.sub('\+{0}[ACGTNacgtn]{{{0}}}'.format(n), '', bases) + result = target.send(bases) + +@coroutine +def count(): + result = None + while True: + bases = (yield result) + base_n = {} + base_n['A'] = bases.count('A') + base_n['C'] = bases.count('C') + base_n['G'] = bases.count('G') + base_n['T'] = bases.count('T') + base_n['a'] = bases.count('a') + base_n['c'] = bases.count('c') + base_n['g'] = bases.count('g') + base_n['t'] = bases.count('t') + base_n['dels'] = bases.count('*') + result = base_n diff --git a/analysis_utils/library/utils.py b/analysis_utils/library/utils.py new file mode 100644 index 0000000..e37a497 --- /dev/null +++ b/analysis_utils/library/utils.py @@ -0,0 +1,21 @@ +import sys + +def coroutine(func): + def start(*args, **kwargs): + g = func(*args, **kwargs) + g.__next__() + return g + return start + +def printer(out): + try: + print(out, flush=True) + except BrokenPipeError: + try: + sys.stdout.close() + except BrokenPipeError: + pass + try: + sys.stderr.close() + except BrokenPipeError: + pass diff --git a/analysis_utils/snv_af.py b/analysis_utils/snv_af.py deleted file mode 100755 index a6b3f89..0000000 --- a/analysis_utils/snv_af.py +++ /dev/null @@ -1,153 +0,0 @@ -#!/shared/apps/pyenv/versions/3.6.2/bin/python - -import argparse -import re -import subprocess -import sys -import os - - -def print_af(args): - 
af_routines = [calc_af(pileup(args.clone, args.min_MQ, args.min_BQ, clean(count())))] - header = '#chr\tpos\tref\talt\tcl_af\tcl_depth\tcl_ref_n\tcl_alt_n\tcl_base_count' - if args.tissue != None: - af_routines.append(calc_af(pileup(args.tissue, args.min_MQ, args.min_BQ, clean(count())))) - header = header + '\tti_af\tti_depth\tti_ref_n\tti_alt_n\tti_base_count' - printer(header) - for snv in args.infile: - if snv[0] == '#': - continue - chrom, pos, ref, alt = snv.strip().split()[:4] - af = '\t'.join([af_routine.send((chrom, pos, ref, alt)) for af_routine in af_routines]) - printer('{}\t{}\t{}\t{}\t{}'.format(chrom, pos, ref.upper(), alt.upper(), af)) - - -def coroutine(func): - def start(*args, **kwargs): - g = func(*args, **kwargs) - g.__next__() - return g - return start - -@coroutine -def calc_af(target): - result = None - while True: - chrom, pos, ref, alt = (yield result) - base_n = target.send((chrom, pos)) - total = sum(base_n.values()) - ref_n = base_n[ref.upper()] + base_n[ref.lower()] - alt_n = base_n[alt.upper()] + base_n[alt.lower()] - try: - af = alt_n / total - except ZeroDivisionError: - af = 0 - result = '{:f}\t{}\t{}\t{}\tA={A},C={C},G={G},T={T},a={a},c={c},g={g},t={t},dels={dels}'.format( - af, total, ref_n, alt_n, **base_n) - -@coroutine -def pileup(bam, min_MQ, min_BQ, target): - result = None - while True: - chrom, pos = (yield result) - cmd = ['samtools', 'mpileup', '-d', '8000', - '-q', str(min_MQ), '-Q', str(min_BQ), - '-r', '{}:{}-{}'.format(chrom, pos, pos), bam] - cmd_out = subprocess.run( - cmd, universal_newlines=True, - stdout=subprocess.PIPE, stderr=subprocess.PIPE) - try: - cmd_out.check_returncode() - except subprocess.CalledProcessError: - sys.exit(cmd_out.stderr) - try: - bases = cmd_out.stdout.split()[4] - except IndexError: - bases = '' - result = target.send(bases) - -@coroutine -def clean(target): - result = None - while True: - bases = (yield result) - bases = re.sub('\^.', '', bases) - bases = re.sub('\$', '', bases) - for n in set(re.findall('-(\d+)', bases)): - bases = re.sub('-{0}[ACGTNacgtn]{{{0}}}'.format(n), '', bases) - for n in set(re.findall('\+(\d+)', bases)): - bases = re.sub('\+{0}[ACGTNacgtn]{{{0}}}'.format(n), '', bases) - result = target.send(bases) - -@coroutine -def count(): - result = None - while True: - bases = (yield result) - base_n = {} - base_n['A'] = bases.count('A') - base_n['C'] = bases.count('C') - base_n['G'] = bases.count('G') - base_n['T'] = bases.count('T') - base_n['a'] = bases.count('a') - base_n['c'] = bases.count('c') - base_n['g'] = bases.count('g') - base_n['t'] = bases.count('t') - base_n['dels'] = bases.count('*') - result = base_n - -def printer(out): - try: - print(out, flush=True) - except BrokenPipeError: - try: - sys.stdout.close() - except BrokenPipeError: - pass - try: - sys.stderr.close() - except BrokenPipeError: - pass - -def main(): - parser = argparse.ArgumentParser( - description='Calculate allele freqeuency for SNV') - - parser.add_argument( - '-c', '--clone', metavar='FILE', - help='clone bam file', - required=True) - - parser.add_argument( - '-t', '--tissue', metavar='FILE', - help='tissue bam file') - - parser.add_argument( - '-q', '--min-MQ', metavar='INT', - help='mapQ cutoff value [20]', - type=int, default=20) - - parser.add_argument( - '-Q', '--min-BQ', metavar='INT', - help='baseQ/BAQ cutoff value [13]', - type=int, default=13) - - parser.add_argument( - 'infile', metavar='snv_list.txt', - help='''SNV list. - Each line format is "chr\\tpos\\tref\\talt". 
- Trailing columns will be ignored. [STDIN]''', - nargs='?', type=argparse.FileType('r'), - default=sys.stdin) - - parser.set_defaults(func=print_af) - - args = parser.parse_args() - - if(len(vars(args)) == 0): - parser.print_help() - else: - args.func(args) - -if __name__ == "__main__": - main() diff --git a/analysis_utils/somatic_vaf.py b/analysis_utils/somatic_vaf.py new file mode 100755 index 0000000..4a6ff56 --- /dev/null +++ b/analysis_utils/somatic_vaf.py @@ -0,0 +1,78 @@ +#!/shared/apps/pyenv/versions/3.6.2/bin/python + +import argparse +import re +import subprocess +import sys +from scipy.stats import binom_test +from library.utils import coroutine, printer +from library.pileup import pileup, clean, count + +def run(args): + v_info = vaf_info(pileup(args.bam, args.min_MQ, args.min_BQ, clean(count()))) + header = ('#chr\tpos\tref\talt\tvaf\t' + + 'depth\tref_n\talt_n\tp_binom') + printer(header) + for snv in args.infile: + if snv[0] == '#': + continue + chrom, pos, ref, alt = snv.strip().split()[:4] + printer('{chrom}\t{pos}\t{ref}\t{alt}\t{vaf_info}'.format( + chrom=chrom, pos=pos, ref=ref.upper(), alt=alt.upper(), + vaf_info=v_info.send((chrom, pos, ref, alt)))) + +@coroutine +def vaf_info(target): + result = None + while True: + chrom, pos, ref, alt = (yield result) + base_n = target.send((chrom, pos)) + depth = sum(base_n.values()) + ref_n = base_n[ref.upper()] + base_n[ref.lower()] + alt_n = base_n[alt.upper()] + base_n[alt.lower()] + try: + vaf = alt_n/depth + except ZeroDivisionError: + vaf = 0 + + result = '{vaf:f}\t{depth}\t{ref_n}\t{alt_n}\t{p_binom:f}'.format( + vaf=vaf, depth=depth, ref_n=ref_n, alt_n=alt_n, p_binom=binom_test(alt_n, depth)) + +def main(): + parser = argparse.ArgumentParser( + description='Test whether VAF of each SNV is somatic or germline.') + + parser.add_argument( + '-b', '--bam', metavar='FILE', + help='bam file', + required=True) + + parser.add_argument( + '-q', '--min-MQ', metavar='INT', + help='mapQ cutoff value [20]', + type=int, default=20) + + parser.add_argument( + '-Q', '--min-BQ', metavar='INT', + help='baseQ/BAQ cutoff value [13]', + type=int, default=13) + + parser.add_argument( + 'infile', metavar='snv_list.txt', + help='''SNV list. + Each line format is "chr\\tpos\\tref\\talt". + Trailing columns will be ignored. [STDIN]''', + nargs='?', type=argparse.FileType('r'), + default=sys.stdin) + + parser.set_defaults(func=run) + + args = parser.parse_args() + + if(len(vars(args)) == 0): + parser.print_help() + else: + args.func(args) + +if __name__ == "__main__": + main() From b2054695d110265141b718f0fa4d989b926ae722 Mon Sep 17 00:00:00 2001 From: cfncluster user Date: Wed, 21 Feb 2018 11:03:31 -0600 Subject: [PATCH 10/70] Refactoring. 
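
With the duplicated definitions gone, both analysis scripts assemble the same
coroutine chain from the shared library. A minimal usage sketch, assuming a
hypothetical BAM and position and samtools available on PATH:

```python
from library.pileup import pileup, clean, count
from library.utils import printer

# sites flow pileup -> clean -> count; the caller drives the chain with send()
chain = pileup('clone.bam', 20, 13, clean(count()))   # min_MQ=20, min_BQ=13
base_n = chain.send(('1', '1000000'))                 # per-strand base counts at 1:1000000
printer(base_n)
```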
--- analysis_utils/strand_bias.py | 74 +---------------------------------- 1 file changed, 2 insertions(+), 72 deletions(-) diff --git a/analysis_utils/strand_bias.py b/analysis_utils/strand_bias.py index 7f76e5b..8122e2a 100755 --- a/analysis_utils/strand_bias.py +++ b/analysis_utils/strand_bias.py @@ -4,10 +4,11 @@ import re import subprocess import sys -import os import math from rpy2.robjects import r from scipy.stats import fisher_exact +from library.utils import coroutine, printer +from library.pileup import pileup, clean, count def run(args): s_info = strand_info(pileup(args.bam, args.min_MQ, args.min_BQ, clean(count()))) @@ -26,13 +27,6 @@ def run(args): chrom=chrom, pos=pos, ref=ref.upper(), alt=alt.upper(), strand_info=s_info.send((chrom, pos, ref, alt)))) -def coroutine(func): - def start(*args, **kwargs): - g = func(*args, **kwargs) - g.__next__() - return g - return start - @coroutine def strand_info(target): result = None @@ -72,70 +66,6 @@ def strand_info(target): alt_n=alt_n, alt_fwd=alt_fwd, alt_rev=alt_rev, alt_ratio=alt_ratio, p_fisher=p_fisher(ref_fwd, alt_fwd, ref_rev, alt_rev)) -@coroutine -def pileup(bam, min_MQ, min_BQ, target): - result = None - while True: - chrom, pos = (yield result) - cmd = ['samtools', 'mpileup', '-d', '8000', - '-q', str(min_MQ), '-Q', str(min_BQ), - '-r', '{}:{}-{}'.format(chrom, pos, pos), bam] - cmd_out = subprocess.run( - cmd, universal_newlines=True, - stdout=subprocess.PIPE, stderr=subprocess.PIPE) - try: - cmd_out.check_returncode() - except subprocess.CalledProcessError: - sys.exit(cmd_out.stderr) - try: - bases = cmd_out.stdout.split()[4] - except IndexError: - bases = '' - result = target.send(bases) - -@coroutine -def clean(target): - result = None - while True: - bases = (yield result) - bases = re.sub('\^.', '', bases) - bases = re.sub('\$', '', bases) - for n in set(re.findall('-(\d+)', bases)): - bases = re.sub('-{0}[ACGTNacgtn]{{{0}}}'.format(n), '', bases) - for n in set(re.findall('\+(\d+)', bases)): - bases = re.sub('\+{0}[ACGTNacgtn]{{{0}}}'.format(n), '', bases) - result = target.send(bases) - -@coroutine -def count(): - result = None - while True: - bases = (yield result) - base_n = {} - base_n['A'] = bases.count('A') - base_n['C'] = bases.count('C') - base_n['G'] = bases.count('G') - base_n['T'] = bases.count('T') - base_n['a'] = bases.count('a') - base_n['c'] = bases.count('c') - base_n['g'] = bases.count('g') - base_n['t'] = bases.count('t') - base_n['dels'] = bases.count('*') - result = base_n - -def printer(out): - try: - print(out, flush=True) - except BrokenPipeError: - try: - sys.stdout.close() - except BrokenPipeError: - pass - try: - sys.stderr.close() - except BrokenPipeError: - pass - def p_poisson(n_fwd, n_rev): return r("poisson.test(c({},{}))$p.value".format(n_fwd, n_rev))[0] From d6e79d97d8661ed9233b4438ecefa4d86d1a94e0 Mon Sep 17 00:00:00 2001 From: cfncluster user Date: Thu, 22 Feb 2018 12:12:32 -0600 Subject: [PATCH 11/70] Bug fix: import sys was omitted. --- analysis_utils/library/pileup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/analysis_utils/library/pileup.py b/analysis_utils/library/pileup.py index f5bdb7c..9a9f1da 100644 --- a/analysis_utils/library/pileup.py +++ b/analysis_utils/library/pileup.py @@ -1,3 +1,4 @@ +import sys import re import subprocess from .utils import coroutine From ad1ac805cf629cb2caa2bacb81f66b6432dea42d Mon Sep 17 00:00:00 2001 From: cfncluster user Date: Sat, 24 Feb 2018 16:59:40 -0600 Subject: [PATCH 12/70] Change p-val format into scientific notation. 
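
A two-line illustration of why the fixed-point format was a problem for small
p-values (the value is arbitrary):

```python
p = 3.2e-12
print('{:f}'.format(p))   # -> 0.000000  (the signal is rounded away)
print('{:e}'.format(p))   # -> 3.200000e-12
```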
--- analysis_utils/somatic_vaf.py | 2 +- analysis_utils/strand_bias.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/analysis_utils/somatic_vaf.py b/analysis_utils/somatic_vaf.py index 4a6ff56..974cf18 100755 --- a/analysis_utils/somatic_vaf.py +++ b/analysis_utils/somatic_vaf.py @@ -35,7 +35,7 @@ def vaf_info(target): except ZeroDivisionError: vaf = 0 - result = '{vaf:f}\t{depth}\t{ref_n}\t{alt_n}\t{p_binom:f}'.format( + result = '{vaf:f}\t{depth}\t{ref_n}\t{alt_n}\t{p_binom:e}'.format( vaf=vaf, depth=depth, ref_n=ref_n, alt_n=alt_n, p_binom=binom_test(alt_n, depth)) def main(): diff --git a/analysis_utils/strand_bias.py b/analysis_utils/strand_bias.py index 8122e2a..c6978e8 100755 --- a/analysis_utils/strand_bias.py +++ b/analysis_utils/strand_bias.py @@ -56,10 +56,10 @@ def strand_info(target): alt_ratio = math.inf result = ('{total}\t{total_fwd}\t{total_rev}\t{total_ratio:f}\t' - + '{p_poisson:f}\t' + + '{p_poisson:e}\t' + '{ref_n}\t{ref_fwd}\t{ref_rev}\t{ref_ratio:f}\t' + '{alt_n}\t{alt_fwd}\t{alt_rev}\t{alt_ratio:f}\t' - + '{p_fisher:f}').format( + + '{p_fisher:e}').format( total=total, total_fwd=total_fwd, total_rev=total_rev, total_ratio=total_ratio, p_poisson=p_poisson(total_fwd, total_rev), ref_n=ref_n, ref_fwd=ref_fwd, ref_rev=ref_rev, ref_ratio=ref_ratio, From 881c6db09044bba42ddb752608d0937812b1183c Mon Sep 17 00:00:00 2001 From: cfncluster user Date: Sun, 15 Apr 2018 22:35:40 -0500 Subject: [PATCH 13/70] Change binomial test into one-sided (smaller). --- analysis_utils/somatic_vaf.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/analysis_utils/somatic_vaf.py b/analysis_utils/somatic_vaf.py index 974cf18..fdd9a7f 100755 --- a/analysis_utils/somatic_vaf.py +++ b/analysis_utils/somatic_vaf.py @@ -4,7 +4,7 @@ import re import subprocess import sys -from scipy.stats import binom_test +from statsmodels.stats.proportion import binom_test from library.utils import coroutine, printer from library.pileup import pileup, clean, count @@ -36,7 +36,7 @@ def vaf_info(target): vaf = 0 result = '{vaf:f}\t{depth}\t{ref_n}\t{alt_n}\t{p_binom:e}'.format( - vaf=vaf, depth=depth, ref_n=ref_n, alt_n=alt_n, p_binom=binom_test(alt_n, depth)) + vaf=vaf, depth=depth, ref_n=ref_n, alt_n=alt_n, p_binom=binom_test(alt_n, depth, alternative='smaller')) def main(): parser = argparse.ArgumentParser( From 757b309050812fd374db575033cb0cc657f76a9d Mon Sep 17 00:00:00 2001 From: Taejeong Bae Date: Sun, 17 Jun 2018 01:32:16 -0500 Subject: [PATCH 14/70] Move library location. 
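
With the library at the top level, pileup.py stops assuming samtools is on
PATH and resolves it from pipeline.conf instead. A sketch of that key=value
lookup relative to the library file, as in the patch below:

```python
import os

config = os.path.dirname(os.path.realpath(__file__)) + "/../pipeline.conf"

SAMTOOLS = None
with open(config) as f:
    for line in f:
        if line.startswith("SAMTOOLS="):   # e.g. SAMTOOLS=/shared/apps/samtools/1.7/bin/samtools
            SAMTOOLS = line.strip().split("=", 1)[1]
            break
```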
--- analysis_utils/nda_s3_path.py | 2 +- analysis_utils/somatic_vaf.py | 8 ++++--- analysis_utils/strand_bias.py | 8 ++++--- .../library => library}/__init__.py | 0 {analysis_utils/library => library}/pileup.py | 10 ++++++++- {analysis_utils/library => library}/utils.py | 0 pipeline.conf | 22 +++++++++++++++++++ 7 files changed, 42 insertions(+), 8 deletions(-) rename {analysis_utils/library => library}/__init__.py (100%) rename {analysis_utils/library => library}/pileup.py (88%) rename {analysis_utils/library => library}/utils.py (100%) create mode 100644 pipeline.conf diff --git a/analysis_utils/nda_s3_path.py b/analysis_utils/nda_s3_path.py index ddfa2f9..dfcabfb 100755 --- a/analysis_utils/nda_s3_path.py +++ b/analysis_utils/nda_s3_path.py @@ -1,4 +1,4 @@ -#!/shared/apps/pyenv/versions/3.6.2/bin/python +#!/usr/bin/env python3 import synapseclient import sys diff --git a/analysis_utils/somatic_vaf.py b/analysis_utils/somatic_vaf.py index fdd9a7f..077f4cb 100755 --- a/analysis_utils/somatic_vaf.py +++ b/analysis_utils/somatic_vaf.py @@ -1,10 +1,12 @@ -#!/shared/apps/pyenv/versions/3.6.2/bin/python +#!/usr/bin/env python3 import argparse -import re -import subprocess +import os import sys from statsmodels.stats.proportion import binom_test + +pipe_home = os.path.dirname(os.path.realpath(__file__)) + "/.." +sys.path.append(pipe_home) from library.utils import coroutine, printer from library.pileup import pileup, clean, count diff --git a/analysis_utils/strand_bias.py b/analysis_utils/strand_bias.py index c6978e8..461f860 100755 --- a/analysis_utils/strand_bias.py +++ b/analysis_utils/strand_bias.py @@ -1,12 +1,14 @@ -#!/shared/apps/pyenv/versions/3.6.2/bin/python +#!/usr/bin/env python3 import argparse -import re -import subprocess +import os import sys import math from rpy2.robjects import r from scipy.stats import fisher_exact + +pipe_home = os.path.dirname(os.path.realpath(__file__)) + "/.." 
+sys.path.append(pipe_home) from library.utils import coroutine, printer from library.pileup import pileup, clean, count diff --git a/analysis_utils/library/__init__.py b/library/__init__.py similarity index 100% rename from analysis_utils/library/__init__.py rename to library/__init__.py diff --git a/analysis_utils/library/pileup.py b/library/pileup.py similarity index 88% rename from analysis_utils/library/pileup.py rename to library/pileup.py index 9a9f1da..6a1a09e 100644 --- a/analysis_utils/library/pileup.py +++ b/library/pileup.py @@ -1,14 +1,22 @@ +import os import sys import re import subprocess from .utils import coroutine +config = os.path.dirname(os.path.realpath(__file__)) + "/../pipeline.conf" +with open(config) as f: + for line in f: + if line[:9] == "SAMTOOLS=": + SAMTOOLS=line.strip().split('=')[1] + break + @coroutine def pileup(bam, min_MQ, min_BQ, target): result = None while True: chrom, pos = (yield result) - cmd = ['samtools', 'mpileup', '-d', '8000', + cmd = [SAMTOOLS, 'mpileup', '-d', '8000', '-q', str(min_MQ), '-Q', str(min_BQ), '-r', '{}:{}-{}'.format(chrom, pos, pos), bam] diff --git a/analysis_utils/library/utils.py b/library/utils.py similarity index 100% rename from analysis_utils/library/utils.py rename to library/utils.py diff --git a/pipeline.conf b/pipeline.conf new file mode 100644 index 0000000..36aec7c --- /dev/null +++ b/pipeline.conf @@ -0,0 +1,22 @@ +#TOOLS +PYTHON3=/shared/apps/pyenv/versions/3.6.2/bin/python3 +SYNAPSE=/shared/apps/pyenv/versions/3.6.2/bin/synapse +JAVA=/shared/apps/java/jdk1.8.0_144/bin/java +BWA=/shared/apps/bwa/0.7.16a/bin/bwa +SAMTOOLS=/shared/apps/samtools/1.7/bin/samtools +SAMBAMBA=/shared/apps/sambamba/v0.6.7/bin/sambamba +GATK=/shared/apps/gatk/3.7-0/GenomeAnalysisTK.jar +PICARD=/shared/apps/picard/2.12.1/picard.jar +TABIX=/shared/apps/htslib/1.7/bin/tabix + +#RESOURCES +REF=/shared/refdata/bsmn/hs37d5.fa +DBSNP=/shared/refdata/gatk_bundle/b37/dbsnp_138.b37.vcf +MILLS=/shared/refdata/gatk_bundle/b37/Mills_and_1000G_gold_standard.indels.b37.vcf +INDEL1KG=/shared/refdata/gatk_bundle/b37/1000G_phase1.indels.b37.vcf +OMNI=/shared/refdata/gatk_bundle/b37/1000G_omni2.5.b37.vcf +HAPMAP=/shared/refdata/gatk_bundle/b37/hapmap_3.3.b37.vcf +SNP1KG=/shared/refdata/gatk_bundle/b37/1000G_phase1.snps.high_confidence.b37.vcf + +#SYNAPSE +PARENTID=syn11636015 From c46b413785d1d4f9f3d0324153abdd2fc497a157 Mon Sep 17 00:00:00 2001 From: Taejeong Bae Date: Sun, 17 Jun 2018 01:51:30 -0500 Subject: [PATCH 15/70] Add germline_filter.py --- analysis_utils/germline_filter.py | 48 +++++++++++++++++++++++++++++++ pipeline.conf | 1 + 2 files changed, 49 insertions(+) create mode 100755 analysis_utils/germline_filter.py diff --git a/analysis_utils/germline_filter.py b/analysis_utils/germline_filter.py new file mode 100755 index 0000000..5ae4c77 --- /dev/null +++ b/analysis_utils/germline_filter.py @@ -0,0 +1,48 @@ +#!/usr/bin/env python3 + +import os +import sys +import argparse +import subprocess +import functools + +config = os.path.dirname(os.path.realpath(__file__)) + "/../pipeline.conf" +with open(config) as f: + for line in f: + if line[:6] == "TABIX=": + TABIX=line.strip().split('=')[1] + break + +print = functools.partial(print, flush=True) + +parser = argparse.ArgumentParser(description='Germline DB filter') +parser.add_argument('infile', help='VCF file', nargs='?', + type=argparse.FileType('r'), default=sys.stdin) +parser.add_argument('--variant', '-V', help='gzipped variant file (format:chrom\\tpos\\t\\tref\\talt)', required=True) + +args = 
parser.parse_args() +in_vcf = args.infile + +def check_known_germ(chrom, pos, ref, alt, germ_file=args.variant): + germ_out = subprocess.run([TABIX, germ_file, '{0}:{1}-{1}'.format(chrom, pos)], + stdout=subprocess.PIPE).stdout.decode().split('\n') + for line in germ_out: + if line == '': + break + else: + germ_ref, germ_alt = line.split('\t')[2:4] + if ref == germ_ref and alt == germ_alt: + return True + return False + +for line in in_vcf: + if line[0] == '#': + print(line, end='') + continue + chrom, pos, _, ref, alts = line.split()[:5] + for alt in alts.split(','): + if check_known_germ(chrom, pos, ref, alt): + continue + else: + print(line, end='') + break diff --git a/pipeline.conf b/pipeline.conf index 36aec7c..4007bd2 100644 --- a/pipeline.conf +++ b/pipeline.conf @@ -17,6 +17,7 @@ INDEL1KG=/shared/refdata/gatk_bundle/b37/1000G_phase1.indels.b37.vcf OMNI=/shared/refdata/gatk_bundle/b37/1000G_omni2.5.b37.vcf HAPMAP=/shared/refdata/gatk_bundle/b37/hapmap_3.3.b37.vcf SNP1KG=/shared/refdata/gatk_bundle/b37/1000G_phase1.snps.high_confidence.b37.vcf +GERMDB=/shared/refdata/bsmn/gnomAD.1KG.ExAC.ESP6500.Kaviar.snps.txt.gz #SYNAPSE PARENTID=syn11636015 From 23e8f5c4d100e3dde63d726d9d48076077d7c191 Mon Sep 17 00:00:00 2001 From: Taejeong Bae Date: Fri, 29 Jun 2018 16:53:16 -0500 Subject: [PATCH 16/70] Change germline filter script --- analysis_utils/germline_filter.py | 36 ++++++++----------------------- 1 file changed, 9 insertions(+), 27 deletions(-) diff --git a/analysis_utils/germline_filter.py b/analysis_utils/germline_filter.py index 5ae4c77..7ea5e78 100755 --- a/analysis_utils/germline_filter.py +++ b/analysis_utils/germline_filter.py @@ -1,48 +1,30 @@ #!/usr/bin/env python3 -import os import sys import argparse -import subprocess import functools - -config = os.path.dirname(os.path.realpath(__file__)) + "/../pipeline.conf" -with open(config) as f: - for line in f: - if line[:6] == "TABIX=": - TABIX=line.strip().split('=')[1] - break +import gzip print = functools.partial(print, flush=True) -parser = argparse.ArgumentParser(description='Germline DB filter') +parser = argparse.ArgumentParser(description='Known germline variant filter') parser.add_argument('infile', help='VCF file', nargs='?', type=argparse.FileType('r'), default=sys.stdin) parser.add_argument('--variant', '-V', help='gzipped variant file (format:chrom\\tpos\\t\\tref\\talt)', required=True) args = parser.parse_args() -in_vcf = args.infile -def check_known_germ(chrom, pos, ref, alt, germ_file=args.variant): - germ_out = subprocess.run([TABIX, germ_file, '{0}:{1}-{1}'.format(chrom, pos)], - stdout=subprocess.PIPE).stdout.decode().split('\n') - for line in germ_out: - if line == '': - break - else: - germ_ref, germ_alt = line.split('\t')[2:4] - if ref == germ_ref and alt == germ_alt: - return True - return False +known_germ = set() +with gzip.open(args.variant, 'rt') as f: + for line in f: + known_germ.add(":".join(line.strip().split())) -for line in in_vcf: +for line in args.infile: if line[0] == '#': print(line, end='') continue chrom, pos, _, ref, alts = line.split()[:5] for alt in alts.split(','): - if check_known_germ(chrom, pos, ref, alt): - continue - else: + var = "{chrom}:{pos}:{ref}:{alt}".format(chrom=chrom, pos=pos, ref=ref, alt=alt) + if var not in known_germ: print(line, end='') - break From 5b9b60a3b2fafb7b36571ddaa63e7885ccf97e05 Mon Sep 17 00:00:00 2001 From: Taejeong Bae Date: Fri, 29 Jun 2018 22:17:58 -0500 Subject: [PATCH 17/70] change location of the library and conf file in genome mapping pipeline --- 
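Notes: after this change each entry script derives the shared locations from
its own path, so the job scripts stay under genome_mapping while library/ and
analysis_utils/ sit at the repository root. A sketch of the resolution logic
that run.py and submit_aln_jobs.py now share:

```python
import os
import sys

cmd_home = os.path.dirname(os.path.realpath(__file__))   # e.g. <repo>/genome_mapping
pipe_home = os.path.normpath(cmd_home + "/..")           # repository root
util_home = pipe_home + "/analysis_utils"
job_home = cmd_home + "/job_scripts"
sys.path.append(pipe_home)            # makes 'from library...' imports resolve
```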
.../job_scripts/aln_4.indel_realign.sh | 4 +- genome_mapping/job_scripts/aln_5.bqsr.sh | 2 +- .../job_scripts/pre_3.submit_aln_jobs.sh | 2 +- genome_mapping/library/__init__.py | 0 genome_mapping/library/job_queue.py | 99 ------------------- genome_mapping/pipeline.conf | 18 ---- genome_mapping/run.py | 35 ++++--- genome_mapping/submit_aln_jobs.py | 20 ++-- pipeline.conf | 7 +- 9 files changed, 43 insertions(+), 144 deletions(-) delete mode 100644 genome_mapping/library/__init__.py delete mode 100644 genome_mapping/library/job_queue.py delete mode 100644 genome_mapping/pipeline.conf diff --git a/genome_mapping/job_scripts/aln_4.indel_realign.sh b/genome_mapping/job_scripts/aln_4.indel_realign.sh index d0202bd..9a5367b 100755 --- a/genome_mapping/job_scripts/aln_4.indel_realign.sh +++ b/genome_mapping/job_scripts/aln_4.indel_realign.sh @@ -17,7 +17,7 @@ printf -- "[$(date)] Start RealignerTargetCreator.\n---\n" $JAVA -Xmx58G -Djava.io.tmpdir=tmp -jar $GATK \ -T RealignerTargetCreator -nt 36 \ - -R $REF -known $MILLS -known $ONEKG \ + -R $REF -known $MILLS -known $INDEL1KG \ -I $SM/bam/$SM.markduped.bam \ -o $SM/realigner.intervals @@ -26,7 +26,7 @@ printf -- "---\n[$(date)] Start IndelRealigner.\n---\n" $JAVA -Xmx58G -Djava.io.tmpdir=tmp -jar $GATK \ -T IndelRealigner \ - -R $REF -known $MILLS -known $ONEKG \ + -R $REF -known $MILLS -known $INDEL1KG \ -targetIntervals $SM/realigner.intervals \ -I $SM/bam/$SM.markduped.bam \ -o $SM/bam/$SM.realigned.bam diff --git a/genome_mapping/job_scripts/aln_5.bqsr.sh b/genome_mapping/job_scripts/aln_5.bqsr.sh index 6b7e0de..b116a47 100755 --- a/genome_mapping/job_scripts/aln_5.bqsr.sh +++ b/genome_mapping/job_scripts/aln_5.bqsr.sh @@ -17,7 +17,7 @@ printf -- "[$(date)] Start BQSR recal_table.\n---\n" $JAVA -Xmx58G -jar $GATK \ -T BaseRecalibrator -nct 36 \ - -R $REF -knownSites $DBSNP -knownSites $MILLS -knownSites $ONEKG \ + -R $REF -knownSites $DBSNP -knownSites $MILLS -knownSites $INDEL1KG \ -I $SM/bam/$SM.realigned.bam \ -o $SM/recal_data.table diff --git a/genome_mapping/job_scripts/pre_3.submit_aln_jobs.sh b/genome_mapping/job_scripts/pre_3.submit_aln_jobs.sh index 6695645..47bac35 100755 --- a/genome_mapping/job_scripts/pre_3.submit_aln_jobs.sh +++ b/genome_mapping/job_scripts/pre_3.submit_aln_jobs.sh @@ -17,6 +17,6 @@ SM=$2 printf -- "[$(date)] Start submit_aln_jobs.\n---\n" CWD=$(pwd) -ssh -o StrictHostKeyChecking=No $HOST "cd $CWD; $PYTHON3 $PIPE_HOME/submit_aln_jobs.py $SM" +ssh -o StrictHostKeyChecking=No $HOST "cd $CWD; $PYTHON3 $CMD_HOME/submit_aln_jobs.py $SM" printf -- "---\n[$(date)] Finish submit_aln_jobs.\n" diff --git a/genome_mapping/library/__init__.py b/genome_mapping/library/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/genome_mapping/library/job_queue.py b/genome_mapping/library/job_queue.py deleted file mode 100644 index 09e7131..0000000 --- a/genome_mapping/library/job_queue.py +++ /dev/null @@ -1,99 +0,0 @@ -import subprocess -import xml.etree.ElementTree as ET -import time -import re -from collections import defaultdict - - -class GridEngineQueue: - jstate = defaultdict(list) - is_1st_print = True - - def __init__(self, q_max=2000): - self.q_max = q_max - - @property - def j_total(self): - return len(self.__class__.jstate) - - @property - def j_in_q(self): - return sum(1 for state in self.__class__.jstate.values() - if state != ['done']) - - @property - def j_done(self): - return sum(1 for state in self.__class__.jstate.values() - if state == ['done']) - - @property - def q_total(self): - return 
sum(len(state) for state in self.qstate.values()) - - @property - def q_run(self): - return sum(state.count('r') for state in self.qstate.values()) - - @property - def q_wait(self): - return sum(state.count('qw') + state.count('hqw') - for state in self.qstate.values()) - - def _update(self): - xmlstr = subprocess.run(['qstat', '-xml'], - stdout=subprocess.PIPE, encoding='utf-8').stdout - root = ET.fromstring(xmlstr) - - self.qstate = defaultdict(list) - for job in root.findall('./*/job_list'): - jid = job.find('JB_job_number').text - state = job.find('state').text - self.qstate[jid].append(state) - - for jid in self.__class__.jstate: - self.__class__.jstate[jid] = self.qstate.get(jid, ['done']) - - def _print_summary(self): - qstat = 'queue status: {:>5} in total {:>5} in run {:>5} in wait'.format( - self.q_total, self.q_run, self.q_wait) - jstat = ' job status: {:>5} submitted {:>5} done {:>5} in queue'.format( - self.j_total, self.j_done, self.j_in_q) - if self.__class__.is_1st_print == True: - self.__class__.is_1st_print = False - else: - print('\x1b[2A', end='\r') - print('\x1b[2K', end='\r') - print('-' * 59) - print('\x1b[2K', end='\r') - print(qstat) - print('\x1b[2K', end='\r') - print(jstat, end='\r') - - def _wait(self): - while True: - self._update() - if self.q_total < self.q_max: - return - self._print_summary() - time.sleep(5) - - def submit(self, q_opt_str, cmd_str): - self._wait() - qsub_cmd_list = ["qsub"] + q_opt_str.split() + cmd_str.split() - stdout = subprocess.run(qsub_cmd_list, - stdout=subprocess.PIPE, stderr=subprocess.STDOUT, - encoding='utf-8').stdout - - m = re.search("Your job (\d+) \(\"(.+)\"\)", stdout, re.MULTILINE) - jid = m.group(1) - jname = m.group(2) - - print("Your job {jid} (\"{jname}\") has been submitted".format(jid=jid, jname=jname)) -# print('\x1b[2A', end='\r') -# print('\x1b[2K', end='\r') -# print('Submitted job: {jname}.\n\n'.format(jname=jname)) - - self.__class__.jstate[jid] = [] - self._update() -# self._print_summary() - return jid diff --git a/genome_mapping/pipeline.conf b/genome_mapping/pipeline.conf deleted file mode 100644 index df88b9e..0000000 --- a/genome_mapping/pipeline.conf +++ /dev/null @@ -1,18 +0,0 @@ -#TOOLS -PYTHON3=/shared/apps/pyenv/versions/3.6.2/bin/python3 -SYNAPSE=/shared/apps/pyenv/versions/3.6.2/bin/synapse -JAVA=/shared/apps/java/jdk1.8.0_144/bin/java -BWA=/shared/apps/bwa/0.7.16a/bin/bwa -SAMTOOLS=/shared/apps/samtools/1.6/bin/samtools -SAMBAMBA=/shared/apps/sambamba/v0.6.7/bin/sambamba -GATK=/shared/apps/gatk/3.7-0/GenomeAnalysisTK.jar -PICARD=/shared/apps/picard/2.12.1/picard.jar - -#RESOURCES -REF=/shared/refdata/bsmn/hs37d5.fa -DBSNP=/shared/refdata/gatk_bundle/b37/dbsnp_138.b37.vcf -MILLS=/shared/refdata/gatk_bundle/b37/Mills_and_1000G_gold_standard.indels.b37.vcf -ONEKG=/shared/refdata/gatk_bundle/b37/1000G_phase1.indels.b37.vcf - -#SYNAPSE -PARENTID=syn11636015 diff --git a/genome_mapping/run.py b/genome_mapping/run.py index 0c9f7e9..e0df2ac 100755 --- a/genome_mapping/run.py +++ b/genome_mapping/run.py @@ -9,7 +9,13 @@ from collections import defaultdict from library.job_queue import GridEngineQueue -pipe_home = os.path.dirname(os.path.realpath(__file__)) +cmd_home = os.path.dirname(os.path.realpath(__file__)) +pipe_home = os.path.normpath(cmd_home + "/..") +util_home = pipe_home + "/analysis_utils" +job_home = cmd_home + "/job_scripts" +sys.path.append(pipe_home) + +from library.job_queue import GridEngineQueue q = GridEngineQueue() def main(): @@ -43,38 +49,38 @@ def opt(sample, jid=None): def 
submit_pre_jobs_fastq(sample, fname, synid): jid = q.submit(opt(sample), - "{pipe_home}/job_scripts/pre_1.download.sh {sample} {fname} {synid}".format( - pipe_home=pipe_home, sample=sample, fname=fname, synid=synid)) + "{job_home}/pre_1.download.sh {sample} {fname} {synid}".format( + job_home=job_home, sample=sample, fname=fname, synid=synid)) jid = q.submit(opt(sample, jid), - "{pipe_home}/job_scripts/pre_2.split_fastq_by_RG.sh {sample}/downloads/{fname}".format( - pipe_home=pipe_home, sample=sample, fname=fname)) + "{job_home}/pre_2.split_fastq_by_RG.sh {sample}/downloads/{fname}".format( + job_home=job_home, sample=sample, fname=fname)) return jid def submit_pre_jobs_bam(sample, fname, synid): jid = q.submit(opt(sample), - "{pipe_home}/job_scripts/pre_1.download.sh {sample} {fname} {synid}".format( - pipe_home=pipe_home, sample=sample, fname=fname, synid=synid)) + "{job_home}/pre_1.download.sh {sample} {fname} {synid}".format( + job_home=job_home, sample=sample, fname=fname, synid=synid)) jid = q.submit(opt(sample, jid), - "{pipe_home}/job_scripts/pre_1b.bam2fastq.sh {sample} {fname}".format( - pipe_home=pipe_home, sample=sample, fname=fname)) + "{job_home}/pre_1b.bam2fastq.sh {sample} {fname}".format( + job_home=job_home, sample=sample, fname=fname)) jid_list = [] for read in ["R1", "R2"]: fastq = "{sample}/fastq/{sample}.{read}.fastq.gz".format(sample=sample, read=read) jid_list.append(q.submit(opt(sample, jid), - "{pipe_home}/job_scripts/pre_2.split_fastq_by_RG.sh {sample}/fastq/{sample}.{read}.fastq.gz".format( - pipe_home=pipe_home, sample=sample, read=read))) + "{job_home}/pre_2.split_fastq_by_RG.sh {sample}/fastq/{sample}.{read}.fastq.gz".format( + job_home=job_home, sample=sample, read=read))) jid = ",".join(jid_list) return jid def submit_aln_jobs(sample, jid): q.submit(opt(sample, jid), - "{pipe_home}/job_scripts/pre_3.submit_aln_jobs.sh {host} {sample}".format( - pipe_home=pipe_home, host=os.getenv("HOSTNAME"), sample=sample)) + "{job_home}/pre_3.submit_aln_jobs.sh {host} {sample}".format( + job_home=job_home, host=os.getenv("HOSTNAME"), sample=sample)) def parse_args(): parser = argparse.ArgumentParser(description='Genome Mapping Pipeline') @@ -105,7 +111,8 @@ def synapse_login(): def save_run_info(config): with open("run_info", "w") as run_file: - run_file.write("PIPE_HOME={pipe_home}\n\n".format(pipe_home=pipe_home)) + run_file.write("CMD_HOME={path}\n\n".format(path=cmd_home)) + run_file.write("UTIL_HOME={path}\n\n".format(path=util_home)) with open(config) as cfg_file: for line in cfg_file: run_file.write(line) diff --git a/genome_mapping/submit_aln_jobs.py b/genome_mapping/submit_aln_jobs.py index 0257f2f..5d5de7e 100755 --- a/genome_mapping/submit_aln_jobs.py +++ b/genome_mapping/submit_aln_jobs.py @@ -5,9 +5,13 @@ import glob import os import sys -from library.job_queue import GridEngineQueue -pipe_home = os.path.dirname(os.path.realpath(__file__)) +cmd_home = os.path.dirname(os.path.realpath(__file__)) +pipe_home = os.path.normpath(cmd_home + "/..") +job_home = cmd_home + "/job_scripts" +sys.path.append(pipe_home) + +from library.job_queue import GridEngineQueue def main(): args = parse_args() @@ -16,23 +20,23 @@ def main(): jid_list = [] for pu in [fastq.split(".")[1] for fastq in glob.glob("{sample}/fastq/{sample}.*.R1.fastq.gz".format(sample=args.sample))]: jid_list.append(q.submit(opt(args.sample), - "{pipe_home}/job_scripts/aln_1.align_sort.sh {sample} {pu}".format(pipe_home=pipe_home, sample=args.sample, pu=pu))) + "{job_home}/aln_1.align_sort.sh {sample} 
{pu}".format(job_home=job_home, sample=args.sample, pu=pu))) jid = ",".join(jid_list) jid = q.submit(opt(args.sample, jid), - "{pipe_home}/job_scripts/aln_2.merge_bam.sh {sample}".format(pipe_home=pipe_home, sample=args.sample)) + "{job_home}/aln_2.merge_bam.sh {sample}".format(job_home=job_home, sample=args.sample)) jid = q.submit(opt(args.sample, jid), - "{pipe_home}/job_scripts/aln_3.markdup.sh {sample}".format(pipe_home=pipe_home, sample=args.sample)) + "{job_home}/aln_3.markdup.sh {sample}".format(job_home=job_home, sample=args.sample)) jid = q.submit(opt(args.sample, jid), - "{pipe_home}/job_scripts/aln_4.indel_realign.sh {sample}".format(pipe_home=pipe_home, sample=args.sample)) + "{job_home}/aln_4.indel_realign.sh {sample}".format(job_home=job_home, sample=args.sample)) jid = q.submit(opt(args.sample, jid), - "{pipe_home}/job_scripts/aln_5.bqsr.sh {sample}".format(pipe_home=pipe_home, sample=args.sample)) + "{job_home}/aln_5.bqsr.sh {sample}".format(job_home=job_home, sample=args.sample)) q.submit(opt(args.sample, jid), - "{pipe_home}/job_scripts/aln_6.upload_bam.sh {sample}".format(pipe_home=pipe_home, sample=args.sample)) + "{job_home}/aln_6.upload_bam.sh {sample}".format(job_home=job_home, sample=args.sample)) def parse_args(): parser = argparse.ArgumentParser(description='Alignment job submitter') diff --git a/pipeline.conf b/pipeline.conf index 4007bd2..c00b61b 100644 --- a/pipeline.conf +++ b/pipeline.conf @@ -7,17 +7,22 @@ SAMTOOLS=/shared/apps/samtools/1.7/bin/samtools SAMBAMBA=/shared/apps/sambamba/v0.6.7/bin/sambamba GATK=/shared/apps/gatk/3.7-0/GenomeAnalysisTK.jar PICARD=/shared/apps/picard/2.12.1/picard.jar +BGZIP=/shared/apps/htslib/1.7/bin/bgzip TABIX=/shared/apps/htslib/1.7/bin/tabix +VT=/shared/apps/vt/20180617/bin/vt +BCFTOOLS=/shared/apps/bcftools/1.7/bin/bcftools #RESOURCES REF=/shared/refdata/bsmn/hs37d5.fa +REFDIR=/shared/refdata/bsmn DBSNP=/shared/refdata/gatk_bundle/b37/dbsnp_138.b37.vcf MILLS=/shared/refdata/gatk_bundle/b37/Mills_and_1000G_gold_standard.indels.b37.vcf INDEL1KG=/shared/refdata/gatk_bundle/b37/1000G_phase1.indels.b37.vcf OMNI=/shared/refdata/gatk_bundle/b37/1000G_omni2.5.b37.vcf HAPMAP=/shared/refdata/gatk_bundle/b37/hapmap_3.3.b37.vcf SNP1KG=/shared/refdata/gatk_bundle/b37/1000G_phase1.snps.high_confidence.b37.vcf -GERMDB=/shared/refdata/bsmn/gnomAD.1KG.ExAC.ESP6500.Kaviar.snps.txt.gz +KNOWN_GERM_SNP=/shared/refdata/bsmn/gnomAD.1KG.ExAC.ESP6500.Kaviar.snps.txt.gz +MASK1KG=/shared/refdata/bsmn/20141020.strict_mask.whole_genome.fasta.gz #SYNAPSE PARENTID=syn11636015 From 99443b2a1b9b7bebfc17f604edaed2fc4c167cc3 Mon Sep 17 00:00:00 2001 From: Taejeong Bae Date: Fri, 29 Jun 2018 22:18:54 -0500 Subject: [PATCH 18/70] change library/job_queue.py location --- library/job_queue.py | 94 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 94 insertions(+) create mode 100644 library/job_queue.py diff --git a/library/job_queue.py b/library/job_queue.py new file mode 100644 index 0000000..ec81441 --- /dev/null +++ b/library/job_queue.py @@ -0,0 +1,94 @@ +import subprocess +import xml.etree.ElementTree as ET +import time +import re +from collections import defaultdict + + +class GridEngineQueue: + jstate = defaultdict(list) + is_1st_print = True + + def __init__(self, q_max=2000): + self.q_max = q_max + + @property + def j_total(self): + return len(self.__class__.jstate) + + @property + def j_in_q(self): + return sum(1 for state in self.__class__.jstate.values() + if state != ['done']) + + @property + def j_done(self): + return sum(1 for state in 
self.__class__.jstate.values() + if state == ['done']) + + @property + def q_total(self): + return sum(len(state) for state in self.qstate.values()) + + @property + def q_run(self): + return sum(state.count('r') for state in self.qstate.values()) + + @property + def q_wait(self): + return sum(state.count('qw') + state.count('hqw') + for state in self.qstate.values()) + + def _update(self): + xmlstr = subprocess.run(['qstat', '-xml'], + stdout=subprocess.PIPE, encoding='utf-8').stdout + root = ET.fromstring(xmlstr) + + self.qstate = defaultdict(list) + for job in root.findall('./*/job_list'): + jid = job.find('JB_job_number').text + state = job.find('state').text + self.qstate[jid].append(state) + + for jid in self.__class__.jstate: + self.__class__.jstate[jid] = self.qstate.get(jid, ['done']) + + def _print_summary(self): + qstat = 'queue status: {:>5} in total {:>5} in run {:>5} in wait'.format( + self.q_total, self.q_run, self.q_wait) + jstat = ' job status: {:>5} submitted {:>5} done {:>5} in queue'.format( + self.j_total, self.j_done, self.j_in_q) + if self.__class__.is_1st_print == True: + self.__class__.is_1st_print = False + else: + print('\x1b[2A', end='\r') + print('\x1b[2K', end='\r') + print('-' * 59) + print('\x1b[2K', end='\r') + print(qstat) + print('\x1b[2K', end='\r') + print(jstat, end='\r') + + def _wait(self): + while True: + self._update() + if self.q_total < self.q_max: + return + self._print_summary() + time.sleep(5) + + def submit(self, q_opt_str, cmd_str): + self._wait() + qsub_cmd_list = ["qsub"] + q_opt_str.split() + cmd_str.split() + stdout = subprocess.run(qsub_cmd_list, + stdout=subprocess.PIPE, stderr=subprocess.STDOUT, + encoding='utf-8').stdout + m = re.search("Your job(?:|-array) (\d+)(?:|.+) \(\"(.+)\"\)", stdout, re.MULTILINE) + jid = m.group(1) + jname = m.group(2) + + print("Your job {jid} (\"{jname}\") has been submitted".format(jid=jid, jname=jname)) + + self.__class__.jstate[jid] = [] + self._update() + return jid From ab23d3cd1adfd8ce6f94bde3538efc3fe573168f Mon Sep 17 00:00:00 2001 From: Taejeong Bae Date: Wed, 11 Jul 2018 01:12:05 -0500 Subject: [PATCH 19/70] Add configurations for root and cnvnator. --- pipeline.conf | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pipeline.conf b/pipeline.conf index c00b61b..e4cd066 100644 --- a/pipeline.conf +++ b/pipeline.conf @@ -11,6 +11,8 @@ BGZIP=/shared/apps/htslib/1.7/bin/bgzip TABIX=/shared/apps/htslib/1.7/bin/tabix VT=/shared/apps/vt/20180617/bin/vt BCFTOOLS=/shared/apps/bcftools/1.7/bin/bcftools +ROOTSYS=/shared/apps/root/6.14.00 +CNVNATOR=/shared/apps/cnvnator/2018-07-02/bin/cnvnator #RESOURCES REF=/shared/refdata/bsmn/hs37d5.fa From 69a7b2b1cf6a774d72cff0ff990ed8fa96095915 Mon Sep 17 00:00:00 2001 From: Taejeong Bae Date: Wed, 11 Jul 2018 01:13:06 -0500 Subject: [PATCH 20/70] Add variant_calling pipeline. 
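
The filtering stage ends in a per-site bias summary; the awk block in
filter_3.bias_summary.sh below applies fixed cutoffs to the VAF and strand
statistics. The same decision rules restated in Python for readability, with
thresholds taken from the script and example counts made up:

```python
def classify(depth, ref_n, alt_n, alt_fwd, alt_rev, p_binom, p_poisson, p_fisher):
    return {
        'som_vs_germ':    'som' if p_binom < 1e-5 else 'germ',
        'strand_bias1':   'unbiased' if p_poisson >= 0.05 else 'bias1',
        'strand_bias2':   'unbiased' if p_fisher >= 0.05 else 'bias2',
        'enough_alt':     'enough_alt_reads' if alt_n >= 5 else 'not_enough_alt_reads',
        'strands_in_alt': ('both_strands_in_alt_reads' if alt_fwd >= 1 and alt_rev >= 1
                           else 'one_strand_in_alt_reads'),
        'allelic_locus':  'multiallelic_site' if depth > ref_n + alt_n else 'biallelic_site',
    }

print(classify(depth=60, ref_n=50, alt_n=10, alt_fwd=6, alt_rev=4,
               p_binom=2e-9, p_poisson=0.4, p_fisher=0.7))
```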
--- .../filter_1.known_germ_filtering.sh | 34 +++++ .../job_scripts/filter_2-1.vaf_info.sh | 28 ++++ .../job_scripts/filter_2-2.strand_info.sh | 28 ++++ .../job_scripts/filter_3.bias_summary.sh | 44 ++++++ variant_calling/job_scripts/gatk_1.hc_gvcf.sh | 49 ++++++ .../job_scripts/gatk_2.joint_gt.sh | 47 ++++++ .../job_scripts/gatk_3.concat_vcf.sh | 41 +++++ variant_calling/job_scripts/gatk_4.vqsr.sh | 99 +++++++++++++ .../job_scripts/post_1.candidates.sh | 57 +++++++ variant_calling/job_scripts/pre_1.download.sh | 22 +++ variant_calling/job_scripts/pre_2.cnvnator.sh | 32 ++++ variant_calling/run.py | 140 ++++++++++++++++++ 12 files changed, 621 insertions(+) create mode 100755 variant_calling/job_scripts/filter_1.known_germ_filtering.sh create mode 100755 variant_calling/job_scripts/filter_2-1.vaf_info.sh create mode 100755 variant_calling/job_scripts/filter_2-2.strand_info.sh create mode 100755 variant_calling/job_scripts/filter_3.bias_summary.sh create mode 100755 variant_calling/job_scripts/gatk_1.hc_gvcf.sh create mode 100755 variant_calling/job_scripts/gatk_2.joint_gt.sh create mode 100755 variant_calling/job_scripts/gatk_3.concat_vcf.sh create mode 100755 variant_calling/job_scripts/gatk_4.vqsr.sh create mode 100755 variant_calling/job_scripts/post_1.candidates.sh create mode 100755 variant_calling/job_scripts/pre_1.download.sh create mode 100755 variant_calling/job_scripts/pre_2.cnvnator.sh create mode 100755 variant_calling/run.py diff --git a/variant_calling/job_scripts/filter_1.known_germ_filtering.sh b/variant_calling/job_scripts/filter_1.known_germ_filtering.sh new file mode 100755 index 0000000..4e4fb72 --- /dev/null +++ b/variant_calling/job_scripts/filter_1.known_germ_filtering.sh @@ -0,0 +1,34 @@ +#!/bin/bash +#$ -cwd +#$ -pe threaded 8 + +if [[ $# -lt 2 ]]; then + echo "Usage: $(basename $0) [sample name] [ploidy]" + exit 1 +fi + +source $(pwd)/run_info + +set -eu -o pipefail + +SM=$1 +PL=$2 + +IN_VCF=$SM/recal_vcf/$SM.ploidy_$PL.vcf +OUT_VCF=$SM/recal_vcf/$SM.ploidy_$PL.known_germ_filtered.snvs.vcf.gz + +printf -- "---\n[$(date)] Start known_germ_filtered_snvs.\n" + +if [[ ! -f $OUT_VCF.tbi ]]; then + $VT decompose -s $IN_VCF \ + |$VT normalize -n -r $REF - \ + |$VT uniq - \ + |$BCFTOOLS view -v snps \ + |$PYTHON3 $UTIL_HOME/germline_filter.py -V $KNOWN_GERM_SNP \ + |$BGZIP > $OUT_VCF + $TABIX $OUT_VCF +else + echo "Skip this step." 
+fi
+
+printf -- "[$(date)] Finish known_germ_filtered_snvs.\n---\n"
diff --git a/variant_calling/job_scripts/filter_2-1.vaf_info.sh b/variant_calling/job_scripts/filter_2-1.vaf_info.sh
new file mode 100755
index 0000000..ef4280c
--- /dev/null
+++ b/variant_calling/job_scripts/filter_2-1.vaf_info.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+#$ -cwd
+#$ -pe threaded 1
+
+if [[ $# -lt 2 ]]; then
+    echo "Usage: $(basename $0) [sample name] [ploidy]"
+    exit 1
+fi
+
+source $(pwd)/run_info
+
+set -eu -o pipefail
+
+SM=$1
+PL=$2
+
+IN_VCF=$SM/recal_vcf/$SM.ploidy_$PL.known_germ_filtered.snvs.vcf.gz
+VAF=$SM/vaf/$SM.ploidy_$PL.known_germ_filtered.pass.snvs.txt
+BAM=$SM/bam/$SM.bam
+
+printf -- "---\n[$(date)] Start generate vaf info.\n"
+mkdir -p $SM/vaf
+
+$BCFTOOLS view -H -f PASS $IN_VCF \
+    |grep -v ^# |cut -f1,2,4,5 \
+    |$PYTHON3 $UTIL_HOME/somatic_vaf.py -b $BAM > $VAF
+
+printf -- "[$(date)] Finish generate vaf info.\n---\n"
diff --git a/variant_calling/job_scripts/filter_2-2.strand_info.sh b/variant_calling/job_scripts/filter_2-2.strand_info.sh
new file mode 100755
index 0000000..d77ebd3
--- /dev/null
+++ b/variant_calling/job_scripts/filter_2-2.strand_info.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+#$ -cwd
+#$ -pe threaded 1
+
+if [[ $# -lt 2 ]]; then
+    echo "Usage: $(basename $0) [sample name] [ploidy]"
+    exit 1
+fi
+
+source $(pwd)/run_info
+
+set -eu -o pipefail
+
+SM=$1
+PL=$2
+
+IN_VCF=$SM/recal_vcf/$SM.ploidy_$PL.known_germ_filtered.snvs.vcf.gz
+STR=$SM/strand/$SM.ploidy_$PL.known_germ_filtered.pass.snvs.txt
+BAM=$SM/bam/$SM.bam
+
+printf -- "---\n[$(date)] Start generate strand info.\n"
+mkdir -p $SM/strand
+
+$BCFTOOLS view -H -f PASS $IN_VCF \
+    |cut -f1,2,4,5 \
+    |$PYTHON3 $UTIL_HOME/strand_bias.py -b $BAM > $STR
+
+printf -- "[$(date)] Finish generate strand info.\n---\n"
diff --git a/variant_calling/job_scripts/filter_3.bias_summary.sh b/variant_calling/job_scripts/filter_3.bias_summary.sh
new file mode 100755
index 0000000..4e487ae
--- /dev/null
+++ b/variant_calling/job_scripts/filter_3.bias_summary.sh
@@ -0,0 +1,44 @@
+#!/bin/bash
+#$ -cwd
+#$ -pe threaded 1
+
+if [[ $# -lt 2 ]]; then
+    echo "Usage: $(basename $0) [sample name] [ploidy]"
+    exit 1
+fi
+
+source $(pwd)/run_info
+
+set -eu -o pipefail
+
+SM=$1
+PL=$2
+
+VAF=$SM/vaf/$SM.ploidy_$PL.known_germ_filtered.pass.snvs.txt
+STR=$SM/strand/$SM.ploidy_$PL.known_germ_filtered.pass.snvs.txt
+SUM=$SM/bias_summary/$SM.ploidy_$PL.known_germ_filtered.pass.snvs.txt
+
+printf -- "---\n[$(date)] Start generate bias summary.\n"
+mkdir -p $SM/bias_summary
+
+printf "#chr\tpos\tref\talt\tdepth\tref_n\talt_n\t" > $SUM
+printf "ref_fwd\tref_rev\talt_fwd\talt_rev\tp_binom\tp_poisson\tp_fisher\t" >> $SUM
+printf "som_vs_germ\tstrand_bias1\tstrand_bias2\t" >> $SUM
+printf "enough_alt\tstrands_in_alt\tallelic_locus\tmask\n" >> $SUM
+
+paste $VAF $STR \
+    |grep -v ^# \
+    |awk -v samtools=$SAMTOOLS -v mask=$MASK1KG '{
+cmd=samtools" faidx "mask" "$1":"$2"-"$2"|tail -n1";
+cmd|getline mask_base;
+close(cmd);
+if ($9 < 1e-5) a="som"; else a="germ";
+if ($18 >= 0.05) b="unbiased"; else b="bias1";
+if ($27 >= 0.05) c="unbiased"; else c="bias2";
+if ($8 >= 5) d="enough_alt_reads"; else d="not_enough_alt_reads";
+if ($24 >= 1 && $25 >=1) e="both_strands_in_alt_reads"; else e="one_strand_in_alt_reads";
+if ($6 > $7 + $8) f="multiallelic_site"; else f="biallelic_site";
+print $1"\t"$2"\t"$3"\t"$4"\t"$6"\t"$7"\t"$8"\t"$20"\t"$21"\t"$24"\t"$25"\t"$9"\t"$18"\t"$27"\t"a"\t"b"\t"c"\t"d"\t"e"\t"f"\t"mask_base
+}' >> $SUM
+
+printf -- "[$(date)] Finish generate bias
summary.\n---\n" diff --git a/variant_calling/job_scripts/gatk_1.hc_gvcf.sh b/variant_calling/job_scripts/gatk_1.hc_gvcf.sh new file mode 100755 index 0000000..16d2c87 --- /dev/null +++ b/variant_calling/job_scripts/gatk_1.hc_gvcf.sh @@ -0,0 +1,49 @@ +#!/bin/bash +#$ -cwd +#$ -pe threaded 16 + +if [[ $# -lt 2 ]]; then + echo "Usage: $(basename $0) [sample name] [ploidy]" + exit 1 +fi + +source $(pwd)/run_info + +set -eu -o pipefail + +SM=$1 +PL=$2 + +if [[ ${SGE_TASK_ID} -le 22 ]]; then + CHR=${SGE_TASK_ID} +elif [[ ${SGE_TASK_ID} -eq 23 ]]; then + CHR=X +elif [[ ${SGE_TASK_ID} -eq 24 ]]; then + CHR=Y +fi + +BAM=$SM/bam/$SM.bam +GVCF=$SM/gvcf/$SM.ploidy_$PL.$CHR.g.vcf +RVCF=$SM/raw_vcf/$SM.ploidy_$PL.$CHR.vcf +RVCF_ALL=$SM/raw_vcf/$SM.ploidy_$PL.vcf +CVCF_ALL=$SM/recal_vcf/$SM.ploidy_$PL.vcf + +printf -- "---\n[$(date)] Start HC_GVCF.\n" + +if [[ ! -f $GVCF.idx && ! -f $RVCF.idx && ! -f $RVCF_ALL.idx && ! -f $CVCF_ALL.idx ]]; then + mkdir -p $SM/gvcf + $JAVA -Xmx52G -Djava.io.tmpdir=tmp -XX:-UseParallelGC -jar $GATK \ + -T HaplotypeCaller \ + -R $REF \ + -I $BAM \ + --emitRefConfidence GVCF \ + -ploidy $PL \ + -nct $NSLOTS \ + -L $CHR \ + -A StrandAlleleCountsBySample \ + -o $GVCF +else + echo "Skip this step." +fi + +printf -- "[$(date)] Finish HC_GVCF.\n---\n" diff --git a/variant_calling/job_scripts/gatk_2.joint_gt.sh b/variant_calling/job_scripts/gatk_2.joint_gt.sh new file mode 100755 index 0000000..c74c59e --- /dev/null +++ b/variant_calling/job_scripts/gatk_2.joint_gt.sh @@ -0,0 +1,47 @@ +#!/bin/bash +#$ -cwd +#$ -pe threaded 16 + +if [[ $# -lt 2 ]]; then + echo "Usage: $(basename $0) [sample name] [ploidy]" + exit 1 +fi + +source $(pwd)/run_info + +set -eu -o pipefail + +SM=$1 +PL=$2 + +if [[ ${SGE_TASK_ID} -le 22 ]]; then + CHR=${SGE_TASK_ID} +elif [[ ${SGE_TASK_ID} -eq 23 ]]; then + CHR=X +elif [[ ${SGE_TASK_ID} -eq 24 ]]; then + CHR=Y +fi + +GVCF=$SM/gvcf/$SM.ploidy_$PL.$CHR.g.vcf +RVCF=$SM/raw_vcf/$SM.ploidy_$PL.$CHR.vcf +RVCF_ALL=$SM/raw_vcf/$SM.ploidy_$PL.vcf +CVCF_ALL=$SM/recal_vcf/$SM.ploidy_$PL.vcf + +printf -- "---\n[$(date)] Start Joint GT.\n" + +if [[ ! -f $RVCF.idx && ! -f $RVCF_ALL.idx && ! -f $CVCF_ALL.idx ]]; then + mkdir -p $SM/raw_vcf + $JAVA -Xmx52G -Djava.io.tmpdir=tmp -XX:-UseParallelGC -jar $GATK \ + -T GenotypeGVCFs \ + -R $REF \ + -ploidy $PL \ + -nt $NSLOTS \ + -L $CHR \ + -V $GVCF \ + -o $RVCF + rm $GVCF $GVCF.idx +else + echo "Skip this step." +fi + +printf -- "[$(date)] Finish Joint GT.\n---\n" diff --git a/variant_calling/job_scripts/gatk_3.concat_vcf.sh b/variant_calling/job_scripts/gatk_3.concat_vcf.sh new file mode 100755 index 0000000..5356eb2 --- /dev/null +++ b/variant_calling/job_scripts/gatk_3.concat_vcf.sh @@ -0,0 +1,41 @@ +#!/bin/bash +#$ -cwd +#$ -pe threaded 3 + +if [[ $# -lt 2 ]]; then + echo "Usage: $(basename $0) [sample name] [ploidy]" + exit 1 +fi + +source $(pwd)/run_info + +set -eu -o pipefail + +SM=$1 +PL=$2 + +RVCFS="" +for i in $(seq 1 22) X Y; do + RVCFS="$RVCFS -V $SM/raw_vcf/$SM.ploidy_$PL.$i.vcf" +done +RVCF_ALL=$SM/raw_vcf/$SM.ploidy_$PL.vcf +CVCF_ALL=$SM/recal_vcf/$SM.ploidy_$PL.vcf + +printf -- "---\n[$(date)] Start concat vcfs.\n" + +if [[ ! -f $RVCF_ALL.idx && ! -f $CVCF_ALL.idx ]]; then + $JAVA -Xmx6G -cp $GATK org.broadinstitute.gatk.tools.CatVariants \ + -R $REF \ + $RVCFS \ + -out $RVCF_ALL \ + -assumeSorted + + for i in $(seq 1 22) X Y; do + rm $SM/raw_vcf/$SM.ploidy_$PL.$i.vcf + rm $SM/raw_vcf/$SM.ploidy_$PL.$i.vcf.idx + done +else + echo "Skip this step." 
+fi
+
+printf -- "[$(date)] Finish concat vcfs.\n---\n"
diff --git a/variant_calling/job_scripts/gatk_4.vqsr.sh b/variant_calling/job_scripts/gatk_4.vqsr.sh
new file mode 100755
index 0000000..9bce6a9
--- /dev/null
+++ b/variant_calling/job_scripts/gatk_4.vqsr.sh
@@ -0,0 +1,99 @@
+#!/bin/bash
+#$ -cwd
+#$ -pe threaded 8
+
+if [[ $# -lt 2 ]]; then
+    echo "Usage: $(basename $0) [sample name] [ploidy]"
+    exit 1
+fi
+
+source $(pwd)/run_info
+
+set -eu -o pipefail
+
+SM=$1
+PL=$2
+
+RVCF=$SM/raw_vcf/$SM.ploidy_$PL.vcf
+CVCF_SNP=$SM/recal_vcf/$SM.ploidy_$PL.snps.vcf
+CVCF_ALL=$SM/recal_vcf/$SM.ploidy_$PL.vcf
+RECAL_SNP=$SM/vqsr/recalibrate_SNP.ploidy_$PL.recal
+RECAL_INDEL=$SM/vqsr/recalibrate_INDEL.ploidy_$PL.recal
+TRANCHES_SNP=$SM/vqsr/recalibrate_SNP.ploidy_$PL.tranches
+TRANCHES_INDEL=$SM/vqsr/recalibrate_INDEL.ploidy_$PL.tranches
+RSCRIPT_SNP=$SM/vqsr/recalibrate_SNP_plots.ploidy_$PL.R
+
+printf -- "---\n[$(date)] Start VQSR.\n"
+
+if [[ ! -f $CVCF_ALL.idx ]]; then
+    mkdir -p $SM/vqsr $SM/recal_vcf
+
+    $JAVA -Xmx24G -XX:-UseParallelGC -jar $GATK \
+        -T VariantRecalibrator \
+        -R $REF \
+        -input $RVCF \
+        -resource:hapmap,known=false,training=true,truth=true,prior=15.0 $HAPMAP \
+        -resource:omni,known=false,training=true,truth=true,prior=12.0 $OMNI \
+        -resource:1000G,known=false,training=true,truth=false,prior=10.0 $SNP1KG \
+        -resource:dbsnp,known=true,training=false,truth=false,prior=2.0 $DBSNP \
+        -an DP \
+        -an QD \
+        -an FS \
+        -an SOR \
+        -an MQ \
+        -an MQRankSum \
+        -an ReadPosRankSum \
+        -mode SNP \
+        -tranche 100.0 -tranche 99.9 -tranche 99.0 -tranche 90.0 \
+        -recalFile $RECAL_SNP \
+        -tranchesFile $TRANCHES_SNP \
+        -rscriptFile $RSCRIPT_SNP
+
+    $JAVA -Xmx24G -XX:-UseParallelGC -jar $GATK \
+        -T ApplyRecalibration \
+        -R $REF \
+        -input $RVCF \
+        -mode SNP \
+        --ts_filter_level 99.0 \
+        -recalFile $RECAL_SNP \
+        -tranchesFile $TRANCHES_SNP \
+        -o $CVCF_SNP
+
+    $JAVA -Xmx24G -XX:-UseParallelGC -jar $GATK \
+        -T VariantRecalibrator \
+        -R $REF \
+        -input $CVCF_SNP \
+        -resource:mills,known=false,training=true,truth=true,prior=12.0 $MILLS \
+        -resource:dbsnp,known=true,training=false,truth=false,prior=2.0 $DBSNP \
+        -an QD \
+        -an DP \
+        -an FS \
+        -an SOR \
+        -an MQRankSum \
+        -an ReadPosRankSum \
+        -mode INDEL \
+        -tranche 100.0 -tranche 99.9 -tranche 99.0 -tranche 90.0 \
+        --maxGaussians 4 \
+        -recalFile $RECAL_INDEL \
+        -tranchesFile $TRANCHES_INDEL \
+        -rscriptFile $RSCRIPT_INDEL
+
+    $JAVA -Xmx24G -XX:-UseParallelGC -jar $GATK \
+        -T ApplyRecalibration \
+        -R $REF \
+        -input $CVCF_SNP \
+        -mode INDEL \
+        --ts_filter_level 99.0 \
+        -recalFile $RECAL_INDEL \
+        -tranchesFile $TRANCHES_INDEL \
+        -o $CVCF_ALL
+
+    rm $RVCF $RVCF.idx \
+        $CVCF_SNP $CVCF_SNP.idx \
+        $RECAL_SNP $RECAL_SNP.idx \
+        $RECAL_INDEL $RECAL_INDEL.idx
+else
+    echo "Skip this step."
+fi
+
+printf -- "[$(date)] Finish VQSR.\n---\n"
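The candidate selection script that follows keeps only sites whose 1000
Genomes strict-mask base is "P" (the mask column that filter_3 appends
above). Per site, the lookup reduces to a samtools faidx query; a minimal
sketch of the same logic in Python (paths are illustrative and normally come
from run_info):

```python
import subprocess

def mask_base(samtools, mask_fasta, chrom, pos):
    # Return the strict-mask base at chrom:pos; "P" means passing.
    region = "{}:{}-{}".format(chrom, pos, pos)
    out = subprocess.run([samtools, "faidx", mask_fasta, region],
                         stdout=subprocess.PIPE,
                         universal_newlines=True).stdout
    return out.splitlines()[-1]
```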
diff --git a/variant_calling/job_scripts/post_1.candidates.sh b/variant_calling/job_scripts/post_1.candidates.sh
new file mode 100755
index 0000000..724940b
--- /dev/null
+++ b/variant_calling/job_scripts/post_1.candidates.sh
@@ -0,0 +1,57 @@
+#!/bin/bash
+#$ -cwd
+#$ -pe threaded 1
+
+if [[ $# -lt 1 ]]; then
+    echo "Usage: $(basename $0) [sample name]"
+    exit 1
+fi
+
+source $(pwd)/run_info
+source $ROOTSYS/bin/thisroot.sh
+
+set -eu -o pipefail
+
+SM=$1
+
+SUM_PREFIX=$SM/bias_summary/$SM.ploidy_
+CANDALL=$SM/candidates/$SM.mosaic_snvs.all.txt
+CANDCNV=$SM/candidates/$SM.mosaic_snvs.cnv_considered.txt
+GENOTYPE=$SM/candidates/$SM.genotype
+VCF=$SM/candidates/$SM.vcf
+HISROOT=$SM/cnv/$SM.his.root
+BAM=$SM/bam/$SM.bam
+
+printf -- "---\n[$(date)] Start select candidates.\n"
+#rm $BAM
+mkdir -p $SM/candidates
+
+cat ${SUM_PREFIX}{3,4,5,6,7,8,9,10}.*|grep biallelic > ${SUM_PREFIX}tmp
+sort -u ${SUM_PREFIX}2.* ${SUM_PREFIX}tmp \
+    |grep -v -e not -e germ -e bias1.bias2 -e one_strand \
+    |grep P$ \
+    |awk '$12 < 1e-5 && $14 >= 0.1' \
+    |sort -u -k1V -k2n -k3 > $CANDALL
+rm ${SUM_PREFIX}tmp
+
+awk '{print $1":"$2-1000"-"$2+1000} END {print "exit"}' $CANDALL \
+    |$CNVNATOR -root $HISROOT -genotype 100 \
+    |grep Genotype > $GENOTYPE
+
+paste $CANDALL $GENOTYPE \
+    |awk '$25 < 2.44' \
+    |cut -f-21 > $CANDCNV
+
+printf "##fileformat=VCFv4.2\n" > $VCF.tmp
+printf "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n" >> $VCF.tmp
+printf "##FORMAT=<ID=AD,Number=R,Type=Integer,Description=\"Allelic depths for the ref and alt alleles\">\n" >> $VCF.tmp
+printf "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t$SM\n" >> $VCF.tmp
+awk '{print $1"\t"$2"\t.\t"$3"\t"$4"\t.\t.\t.\tGT:AD\t0/1:"$6","$7}' $CANDCNV >> $VCF.tmp
+
+$JAVA -Xmx4g -jar $PICARD SortVcf \
+    CREATE_INDEX=false \
+    SD=${REF/fa/dict} \
+    I=$VCF.tmp O=$VCF
+rm $VCF.tmp
+
+printf -- "[$(date)] Finish select candidates.\n---\n"
diff --git a/variant_calling/job_scripts/pre_1.download.sh b/variant_calling/job_scripts/pre_1.download.sh
new file mode 100755
index 0000000..dd9455a
--- /dev/null
+++ b/variant_calling/job_scripts/pre_1.download.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+#$ -cwd
+#$ -pe threaded 1
+
+if [[ $# -lt 3 ]]; then
+    echo "Usage: $(basename $0) [sample name] [file name] [synapse id]"
+    exit 1
+fi
+
+source $(pwd)/run_info
+
+set -eu -o pipefail
+
+SM=$1
+FNAME=$2
+SINID=$3
+
+printf -- "---\n[$(date)] Start download: $FNAME\n"
+mkdir -p $SM/bam
+
+# $SYNAPSE get $SINID --downloadLocation $SM/bam/
+printf -- "[$(date)] Finish download: $FNAME\n---\n"
diff --git a/variant_calling/job_scripts/pre_2.cnvnator.sh b/variant_calling/job_scripts/pre_2.cnvnator.sh
new file mode 100755
index 0000000..ed45eb5
--- /dev/null
+++ b/variant_calling/job_scripts/pre_2.cnvnator.sh
@@ -0,0 +1,32 @@
+#!/bin/bash
+#$ -cwd
+#$ -pe threaded 4
+
+if [[ $# -lt 1 ]]; then
+    echo "Usage: $(basename $0) [sample name]"
+    exit 1
+fi
+
+source $(pwd)/run_info
+source $ROOTSYS/bin/thisroot.sh
+
+set -eu -o pipefail
+
+SM=$1
+BAM=$SM/bam/$SM.bam
+ROOT=$SM/cnv/$SM.root
+CNVCALL=$SM/cnv/$SM.cnvcall
+CHROM="$(seq 1 22) X Y"
+
+printf -- "---\n[$(date)] Start cnvnator.\n"
+mkdir -p $SM/cnv
+
+$CNVNATOR -root $ROOT -chrom $CHROM -tree $BAM -unique
+for BINSIZE in 100 1000; do
+    $CNVNATOR -root $ROOT -chrom $CHROM -his $BINSIZE -d $REFDIR
+    $CNVNATOR -root $ROOT -chrom $CHROM -stat $BINSIZE
+    $CNVNATOR -root $ROOT -chrom $CHROM -partition $BINSIZE
+    $CNVNATOR -root $ROOT -chrom $CHROM -call $BINSIZE > $CNVCALL.bin_$BINSIZE
+done
+
+printf -- "[$(date)] Finish cnvnator.\n---\n"
diff --git a/variant_calling/run.py b/variant_calling/run.py
new file
mode 100755 index 0000000..857c5a4 --- /dev/null +++ b/variant_calling/run.py @@ -0,0 +1,140 @@ +#!/usr/bin/env python3 + +import argparse +import pandas as pd +import synapseclient +import subprocess +import pathlib +import os +import sys +from collections import defaultdict + +cmd_home = os.path.dirname(os.path.realpath(__file__)) +pipe_home = os.path.normpath(cmd_home + "/..") +util_home = pipe_home + "/analysis_utils" +job_home = cmd_home + "/job_scripts" +sys.path.append(pipe_home) + +from library.job_queue import GridEngineQueue +q = GridEngineQueue() + +def main(): + args = parse_args() + samples = parse_sample_file(args.infile) + synapse_login() + save_run_info(args.config) + + for sample, sdata in samples.items(): + print(sample) + jid_pre = submit_pre_jobs(sample, sdata) + jid_list = [] + for ploidy in range(2,11): + jid = submit_gatk_jobs(sample, ploidy, jid_pre) + jid = submit_filter_jobs(sample, ploidy, jid) + jid_list.append(jid) + jid = ",".join(jid_list) + submit_post_jobs(sample, jid) + print() + +def opt(sample, jid=None): + opt = "-r y -j y -o {log_dir} -l h_vmem=4G".format(log_dir=log_dir(sample)) + if jid is not None: + opt = "-hold_jid {jid} {opt}".format(jid=jid, opt=opt) + return opt + +def submit_pre_jobs(sample, sdata): + jid_list = [] + for fname, synid in sdata: + jid_list.append( + q.submit(opt(sample), + "{job_home}/pre_1.download.sh {sample} {fname} {synid}".format( + job_home=job_home, sample=sample, fname=fname, synid=synid))) + jid = ",".join(jid_list) + q.submit(opt(sample, jid), + "{job_home}/pre_2.cnvnator.sh {sample}".format( + job_home=job_home, sample=sample)) + return jid + +def submit_gatk_jobs(sample, ploidy, jid): + jid = q.submit( + "-t 1-24 {opt}".format(opt=opt(sample, jid)), + "{job_home}/gatk_1.hc_gvcf.sh {sample} {ploidy}".format( + job_home=job_home, sample=sample, ploidy=ploidy)) + jid = q.submit( + "-t 1-24 {opt}".format(opt=opt(sample, jid)), + "{job_home}/gatk_2.joint_gt.sh {sample} {ploidy}".format( + job_home=job_home, sample=sample, ploidy=ploidy)) + jid = q.submit(opt(sample, jid), + "{job_home}/gatk_3.concat_vcf.sh {sample} {ploidy}".format( + job_home=job_home, sample=sample, ploidy=ploidy)) + jid = q.submit(opt(sample, jid), + "{job_home}/gatk_4.vqsr.sh {sample} {ploidy}".format( + job_home=job_home, sample=sample, ploidy=ploidy)) + return jid + +def submit_filter_jobs(sample, ploidy, jid): + jid = q.submit(opt(sample, jid), + "{job_home}/filter_1.known_germ_filtering.sh {sample} {ploidy}".format( + job_home=job_home, sample=sample, ploidy=ploidy)) + jid_vaf = q.submit(opt(sample, jid), + "{job_home}/filter_2-1.vaf_info.sh {sample} {ploidy}".format( + job_home=job_home, sample=sample, ploidy=ploidy)) + jid_str = q.submit(opt(sample, jid), + "{job_home}/filter_2-2.strand_info.sh {sample} {ploidy}".format( + job_home=job_home, sample=sample, ploidy=ploidy)) + jid = q.submit(opt(sample, jid_vaf+","+jid_str), + "{job_home}/filter_3.bias_summary.sh {sample} {ploidy}".format( + job_home=job_home, sample=sample, ploidy=ploidy)) + return jid + +def submit_post_jobs(sample, jid): + jid = q.submit(opt(sample, jid), + "{job_home}/post_1.candidates.sh {sample}".format( + job_home=job_home, sample=sample)) + return jid + +def parse_args(): + parser = argparse.ArgumentParser( + description='Variant Calling Pipeline') + + parser.add_argument( + 'infile', metavar='sample_list.txt', + help='Sample list file shoud have sample_id, synapse_id, and file_name.') + + parser.add_argument( + '-c', '--config', metavar='config file', + help='Default: 
[pipeline home]/pipeline.conf', + default="{pipe_home}/pipeline.conf".format(pipe_home=pipe_home)) + + return parser.parse_args() + +def parse_sample_file(sfile): + samples = defaultdict(list) + for sname, group in pd.read_table(sfile).groupby("sample_id"): + for idx, sdata in group.iterrows(): + key = sname + val = (sdata["file"], sdata["synapse_id"]) + samples[key].append(val) + return samples + +def synapse_login(): + try: + synapseclient.login() + except: + subprocess.run(['synapse', 'login', '--remember-me']) + +def save_run_info(config): + with open("run_info", "w") as run_file: + run_file.write("CMD_HOME={path}\n".format(path=cmd_home)) + run_file.write("UTIL_HOME={path}\n\n".format(path=util_home)) + with open(config) as cfg_file: + for line in cfg_file: + run_file.write(line) + +def log_dir(sample): + log_dir = sample+"/logs" + pathlib.Path(log_dir).mkdir(parents=True, exist_ok=True) + return log_dir + +if __name__ == "__main__": + main() From f100c00932361c54eba77888c7843de8888fba5b Mon Sep 17 00:00:00 2001 From: "Bae.Taejeong@mayo.edu" Date: Mon, 20 Aug 2018 21:21:42 -0500 Subject: [PATCH 21/70] find samtools path when config not working --- library/pileup.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/library/pileup.py b/library/pileup.py index 6a1a09e..9f1b949 100644 --- a/library/pileup.py +++ b/library/pileup.py @@ -1,6 +1,7 @@ import os import sys import re +import shutil import subprocess from .utils import coroutine @@ -10,6 +11,8 @@ if line[:9] == "SAMTOOLS=": SAMTOOLS=line.strip().split('=')[1] break +if not os.path.isfile(SAMTOOLS) or not os.access(SAMTOOLS, os.X_OK): + SAMTOOLS = shutil.which("samtools") @coroutine def pileup(bam, min_MQ, min_BQ, target): From fdcefcc0955f11a1b2162d96de77eb01d979692c Mon Sep 17 00:00:00 2001 From: Taejeong Bae Date: Mon, 1 Oct 2018 00:59:49 -0500 Subject: [PATCH 22/70] Add missing variable. --- variant_calling/job_scripts/gatk_4.vqsr.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/variant_calling/job_scripts/gatk_4.vqsr.sh b/variant_calling/job_scripts/gatk_4.vqsr.sh index 9bce6a9..0f59fa7 100755 --- a/variant_calling/job_scripts/gatk_4.vqsr.sh +++ b/variant_calling/job_scripts/gatk_4.vqsr.sh @@ -22,6 +22,7 @@ RECAL_INDEL=$SM/vqsr/recalibrate_INDEL.ploidy_$PL.recal TRANCHES_SNP=$SM/vqsr/recalibrate_SNP.ploidy_$PL.tranches TRANCHES_INDEL=$SM/vqsr/recalibrate_INDEL.ploidy_$PL.tranches RSCRIPT_SNP=$SM/vqsr/recalibrate_SNP_plots.ploidy_$PL.R +RSCRIPT_INDEL=$SM/vqsr/recalibrate_INDEL_plots.ploidy_$PL.R printf -- "---\n[$(date)] Start VQSR.\n" From bb5ac5c861dbbfb5c69bb545fea5eaaaf2db18c8 Mon Sep 17 00:00:00 2001 From: Kenneth Daily Date: Mon, 1 Oct 2018 09:32:35 -0700 Subject: [PATCH 23/70] Create requirements.txt For Python dependencies. 
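With the file in place, the dependencies can be installed into whichever
Python the pipeline uses; a minimal sketch, run with that interpreter:

```python
import subprocess
import sys

# Install the pinned Python dependencies listed in requirements.txt.
subprocess.run([sys.executable, "-m", "pip", "install",
                "-r", "requirements.txt"], check=True)
```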
---
 requirements.txt | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 requirements.txt

diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..ae57cb7
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1 @@
+statsmodels

From 77cca2c0b0c77bbc5926ad4406046def4d70ac09 Mon Sep 17 00:00:00 2001
From: Kenneth Daily
Date: Mon, 1 Oct 2018 09:34:30 -0700
Subject: [PATCH 24/70] Update requirements.txt

---
 requirements.txt | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/requirements.txt b/requirements.txt
index ae57cb7..3f629c7 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1 +1,3 @@
 statsmodels
+pandas
+synapseclient

From 56a1e3bbb34e72cda013f517ed2dd67593bc28da Mon Sep 17 00:00:00 2001
From: Kenneth Daily
Date: Mon, 1 Oct 2018 09:37:06 -0700
Subject: [PATCH 25/70] Update README.md

---
 README.md | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/README.md b/README.md
index 93443c7..2020400 100644
--- a/README.md
+++ b/README.md
@@ -14,3 +14,11 @@ sample_id file synapse_id
 5154_brain-BSMN_REF_brain-534-U01MH106876 bulk_sorted.bam syn10639574
 5154_fibroblast-BSMN_REF_fibroblasts-534-U01MH106876 fibroblasts_sorted.bam syn10639575
 ```
+
+# Contributing
+
+The `master` branch is protected. To introduce changes:
+
+1. Fork this repository
+2. Open a branch with your GitHub username and a short descriptive statement (like `kdaily-update-readme`). If there is an open issue on this repository, name your branch after the issue (like `kdaily-issue-7`).
+3. Open a pull request and request a review.

From f8c93c831553f82735bc55464e453229d73a374a Mon Sep 17 00:00:00 2001
From: Kenneth Daily
Date: Mon, 1 Oct 2018 09:55:40 -0700
Subject: [PATCH 26/70] Update README.md

---
 README.md | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/README.md b/README.md
index 93443c7..ec6191d 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,12 @@
 # bsmn_pipeline
 BSMN common data processing pipeline
 
+# Setup and installation
+
+## cfncluster
+
+To get the pipeline software installed on the cluster, a post-install script is run after the cluster starts. You can see this file as a GitHub Gist [here](https://gist.github.com/kdaily/1e0a2d1fcef1c6847f743f637301a3d5).
+
 # Usage
 ## genome_mapping
 ```bash

From b339d7ef0ab349abc1443475689655eb7e3ffb12 Mon Sep 17 00:00:00 2001
From: Kenneth Daily
Date: Mon, 1 Oct 2018 10:07:28 -0700
Subject: [PATCH 27/70] Update README.md

---
 README.md | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/README.md b/README.md
index ec6191d..aa664af 100644
--- a/README.md
+++ b/README.md
@@ -5,6 +5,18 @@ BSMN common data processing pipeline
 
 ## cfncluster
 
+This pipeline runs using [`cfncluster`](https://cfncluster.readthedocs.io).
+
+## Installing cfncluster
+
+It's recommended to use a Python virtual environment (https://virtualenv.pypa.io/en/stable/).
+
+To install `cfncluster`:
+
+```
+pip install cfncluster
+```
+
 To get the pipeline software installed on the cluster, a post-install script is run after the cluster starts. You can see this file as a GitHub Gist [here](https://gist.github.com/kdaily/1e0a2d1fcef1c6847f743f637301a3d5).

From 44e99f2e66c000c8f756f459326ead2171ba8148 Mon Sep 17 00:00:00 2001
From: Taejeong Bae
Date: Fri, 19 Oct 2018 14:03:51 -0500
Subject: [PATCH 28/70] Bug fix.
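The module-level `from library.job_queue import GridEngineQueue` ran before
pipe_home was appended to sys.path, and `import sys` itself was missing, so
run.py failed at startup. The working order is the one the runners use, as a
minimal sketch:

```python
import os
import sys

cmd_home = os.path.dirname(os.path.realpath(__file__))
pipe_home = os.path.normpath(cmd_home + "/..")
sys.path.append(pipe_home)

# Modules under the pipeline home resolve only after the append above.
from library.job_queue import GridEngineQueue
```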
---
 genome_mapping/run.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/genome_mapping/run.py b/genome_mapping/run.py
index e0df2ac..621f2b0 100755
--- a/genome_mapping/run.py
+++ b/genome_mapping/run.py
@@ -6,8 +6,8 @@
 import subprocess
 import pathlib
 import os
+import sys
 from collections import defaultdict
-from library.job_queue import GridEngineQueue
 
 cmd_home = os.path.dirname(os.path.realpath(__file__))
 pipe_home = os.path.normpath(cmd_home + "/..")
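The installer added in the next patch places every tool under tools/. A quick
sanity check that each KEY=path entry in the configuration actually exists
after installation (a minimal sketch against the KEY=path line format of
pipeline.conf; adjust the file name if the configuration moves):

```python
import os

with open("pipeline.conf") as conf:
    for line in conf:
        line = line.strip()
        if not line or line.startswith("#") or "=" not in line:
            continue
        key, path = line.split("=", 1)
        status = "OK" if os.path.exists(path) else "MISSING"
        print("{}\t{}\t{}".format(status, key, path))
```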
From 3eb07c9c3804ffd228da6f3c8491ce739eb205c6 Mon Sep 17 00:00:00 2001
From: Taejeong Bae
Date: Fri, 19 Oct 2018 14:39:36 -0500
Subject: [PATCH 29/70] Add an automatic tool installer.

---
 .gitignore       |   3 +
 install_tools.sh | 143 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 146 insertions(+)
 create mode 100755 install_tools.sh

diff --git a/.gitignore b/.gitignore
index 749ccda..89241ae 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,3 +2,6 @@
 __pycache__/
 *.py[cod]
 *$py.class
+
+# Installed tools directory
+/tools/
diff --git a/install_tools.sh b/install_tools.sh
new file mode 100755
index 0000000..4131478
--- /dev/null
+++ b/install_tools.sh
@@ -0,0 +1,143 @@
+#!/bin/bash
+
+WD=$(pwd)
+
+# Installing Python3
+mkdir -p tools/python/3.6.2/src
+cd tools/python/3.6.2/src
+wget -qO- https://www.python.org/ftp/python/3.6.2/Python-3.6.2.tgz \
+    |tar xvz --strip-components=1
+./configure --with-ensurepip=install --prefix=$WD/tools/python/3.6.2
+make
+make install
+cd $WD
+rm -r tools/python/3.6.2/src
+
+# Installing Python modules needed
+tools/python/3.6.2/bin/pip3 install --upgrade pip
+tools/python/3.6.2/bin/pip3 install pandas synapseclient statsmodels scipy rpy2
+
+# Installing Java8
+mkdir -p tools/java
+cd tools/java
+wget -qO- --no-check-certificate --header "Cookie: oraclelicense=a" \
+    http://download.oracle.com/otn-pub/java/jdk/8u191-b12/2787e4a523244c269598db4e85c51e0c/jdk-8u191-linux-x64.tar.gz \
+    |tar xvz
+cd $WD
+
+# Installing bwa
+mkdir -p tools/bwa/0.7.16a/bin
+mkdir -p tools/bwa/0.7.16a/share/man/man1
+mkdir -p tools/bwa/0.7.16a/src
+cd tools/bwa/0.7.16a/src
+wget -qO- https://github.com/lh3/bwa/releases/download/v0.7.16/bwa-0.7.16a.tar.bz2 \
+    |tar xvj --strip-components=1
+make
+mv bwa ../bin
+mv *.pl ../bin
+mv bwa.1 ../share/man/man1
+cd $WD
+rm -r tools/bwa/0.7.16a/src
+
+# Installing samtools
+mkdir -p tools/samtools/1.7/src
+cd tools/samtools/1.7/src
+wget -qO- https://github.com/samtools/samtools/releases/download/1.7/samtools-1.7.tar.bz2 \
+    |tar xvj --strip-components=1
+./configure --prefix=$WD/tools/samtools/1.7
+make
+make install
+cd $WD
+rm -r tools/samtools/1.7/src
+
+# Installing htslib
+mkdir -p tools/htslib/1.7/src
+cd tools/htslib/1.7/src
+wget -qO- https://github.com/samtools/htslib/releases/download/1.7/htslib-1.7.tar.bz2 \
+    |tar xvj --strip-components=1
+./configure --prefix=$WD/tools/htslib/1.7
+make
+make install
+cd $WD
+rm -r tools/htslib/1.7/src
+
+# Installing bcftools
+mkdir -p tools/bcftools/1.7/src
+cd tools/bcftools/1.7/src
+wget -qO- https://github.com/samtools/bcftools/releases/download/1.7/bcftools-1.7.tar.bz2 \
+    |tar xvj --strip-components=1
+./configure --prefix=$WD/tools/bcftools/1.7
+make
+make install
+cd $WD
+rm -r tools/bcftools/1.7/src
+
+# Installing sambamba
+mkdir -p tools/sambamba/v0.6.7/bin
+cd tools/sambamba/v0.6.7/bin
+wget -qO- https://github.com/biod/sambamba/releases/download/v0.6.7/sambamba_v0.6.7_linux.tar.bz2 \
+    |tar xvj
+cd $WD
+
+# Installing vt
+mkdir -p tools/vt/2018-06-07/bin
+mkdir -p tools/vt/2018-06-07/src
+cd tools/vt/2018-06-17/src
+wget -qO- http://github.com/atks/vt/archive/ee9f613.tar.gz \
+    |tar xvz --strip-components=1
+make
+mv vt ../bin
+cd $WD
+rm -r tools/vt/2018-06-17/src
+
+# Installing root
+# prerequisite cmake > 3.4.3
+mkdir -p tools/cmake/3.11.4
+wget -qO- https://cmake.org/files/v3.11/cmake-3.11.4-Linux-x86_64.tar.gz \
+    |tar xvz --strip-components=1 -C tools/cmake/3.11.4
+
+# build root
+mkdir -p tools/root/6.14.00/src
+cd tools/root/6.14.00/src
+wget -qO- https://root.cern.ch/download/root_v6.14.00.source.tar.gz \
+    |tar xvz
+$WD/tools/cmake/3.11.4/bin/cmake \
+    -Dbuiltin_pcre=ON -Dhttp=ON -Dgnuinstall=ON \
+    -DCMAKE_INSTALL_PREFIX=$WD/tools/root/6.14.00 \
+    root-6.14.00
+$WD/tools/cmake/3.11.4/bin/cmake --build . -- -j
+$WD/tools/cmake/3.11.4/bin/cmake --build . --target install
+cd $WD
+rm -r tools/root/6.14.00/src
+
+# Installing cnvnator
+source $WD/tools/root/6.14.00/bin/thisroot.sh
+mkdir -p tools/cnvnator/2018-07-09/bin
+mkdir -p tools/cnvnator/2018-07-09/src/samtools
+cd tools/cnvnator/2018-07-09/src
+wget -qO- https://github.com/abyzovlab/CNVnator/archive/de012f2.tar.gz \
+    |tar xvz --strip-components=1
+cd samtools
+wget -qO- https://github.com/samtools/samtools/releases/download/1.7/samtools-1.7.tar.bz2 \
+    |tar xvj --strip-components=1
+make
+cd ..
+sed -i '2s/$/ -lpthread/' Makefile
+make OMP=no
+mv cnvnator ../bin
+mv cnvnator2VCF.pl ../bin
+cd $WD
+rm -r tools/cnvnator/2018-07-09/src
+
+# Installing Picard
+mkdir -p tools/picard/2.12.1
+cd tools/picard/2.12.1
+wget -q https://github.com/broadinstitute/picard/releases/download/2.12.1/picard.jar
+cd $WD
+
+# Installing GATK
+mkdir -p tools/gatk/3.7-0
+echo "----"
+echo "Please manually download GATK 3.7-0 and put the jar file in tools/gatk/3.7-0."
+echo "URL: https://software.broadinstitute.org/gatk/download/auth?package=GATK-archive&version=3.7-0-gcfedb67" + From cb07f3636e4e49aee1eced76050b81549cc0b0b3 Mon Sep 17 00:00:00 2001 From: Taejeong Bae Date: Fri, 23 Nov 2018 19:06:25 -0600 Subject: [PATCH 30/70] Bug fix: install_tools.sh --- install_tools.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/install_tools.sh b/install_tools.sh index 4131478..6f0a3b7 100755 --- a/install_tools.sh +++ b/install_tools.sh @@ -82,13 +82,13 @@ cd $WD # Installing vt mkdir -p tools/vt/2018-06-07/bin mkdir -p tools/vt/2018-06-07/src -cd tools/vt/2018-06-17/src +cd tools/vt/2018-06-07/src wget -qO- http://github.com/atks/vt/archive/ee9f613.tar.gz \ |tar xvz --strip-components=1 make mv vt ../bin cd $WD -rm -r tools/vt/2018-06-17/src +rm -r tools/vt/2018-06-07/src # Installing root # prerequisite cmake > 3.4.3 From 844f81f8f32f095f6d962fbd9f092912deccf4ed Mon Sep 17 00:00:00 2001 From: Taejeong Bae Date: Sat, 24 Nov 2018 10:41:53 -0600 Subject: [PATCH 31/70] Add a script for resource download --- .gitignore | 3 ++- download_resources.sh | 29 +++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+), 1 deletion(-) create mode 100755 download_resources.sh diff --git a/.gitignore b/.gitignore index 89241ae..faab01f 100644 --- a/.gitignore +++ b/.gitignore @@ -3,5 +3,6 @@ __pycache__/ *.py[cod] *$py.class -# Installed tools directory +# tools / resources directories /tools/ +/resources/ diff --git a/download_resources.sh b/download_resources.sh new file mode 100755 index 0000000..49678e2 --- /dev/null +++ b/download_resources.sh @@ -0,0 +1,29 @@ +#!/bin/bash + +mkdir -p resources + +# Download and index the human ref genome +tools/python/3.6.2/bin/synapse get syn10347383 --downloadLocation resources/ +gunzip resources/hs37d5.fa.gz +tools/bwa/0.7.16a/bin/bwa index resources/hs37d5.fa +tools/samtools/1.7/bin/samtools faidx resources/hs37d5.fa +tools/java/jdk1.8.0_191/bin/java -jar tools/picard/2.12.1/picard.jar \ + CreateSequenceDictionary R=resources/hs37d5.fa O=resources/hs37d5.dict + +# Download mapping resources +tools/python/3.6.2/bin/synapse get syn17062535 -r --downloadLocation resources/ +gunzip resources/*vcf.gz resources/*vcf.idx.gz +rm resources/SYNAPSE_METADATA_MANIFEST.tsv + +# Split the ref genome by chromosome +awk '{ + r = match($1, "^>"); + if (r != 0) { + filename = "resources/chr"substr($1, 2, length($1))".fa"; + print $0 > filename; + } + else { + print $0 >> filename; + } +}' resources/hs37d5.fa +rm resources/chrGL* resources/chrhs37d5.fa resources/chrNC_007605.fa From a843671b7c4528b28e8af63240eb9d16ba10861d Mon Sep 17 00:00:00 2001 From: Taejeong Bae Date: Sun, 25 Nov 2018 15:29:34 -0600 Subject: [PATCH 32/70] Change configuration system --- config.ini | 27 +++++++++++++++++++++++++++ genome_mapping/run.py | 34 ++++++++++++++++++++++------------ pipeline.conf | 30 ------------------------------ variant_calling/run.py | 24 ++++++++++++------------ 4 files changed, 61 insertions(+), 54 deletions(-) create mode 100644 config.ini delete mode 100644 pipeline.conf diff --git a/config.ini b/config.ini new file mode 100644 index 0000000..03d8054 --- /dev/null +++ b/config.ini @@ -0,0 +1,27 @@ +[TOOLS] +PYTHON3 = tools/python/3.6.2/bin/python3 +SYNAPSE = tools/python/3.6.2/bin/synapse +JAVA = tools/java/jdk1.8.0_191/bin/java +BWA = tools/bwa/0.7.16a/bin/bwa +SAMTOOLS = tools/samtools/1.7/bin/samtools +SAMBAMBA = tools/sambamba/v0.6.7/bin/sambamba +GATK = tools/gatk/3.7-0/GenomeAnalysisTK.jar 
+PICARD = tools/picard/2.12.1/picard.jar +BGZIP = tools/htslib/1.7/bin/bgzip +TABIX = tools/htslib/1.7/bin/tabix +VT = tools/vt/2018-06-07/bin/vt +BCFTOOLS = tools/bcftools/1.7/bin/bcftools +ROOTSYS = tools/root/6.14.00 +CNVNATOR = tools/cnvnator/2018-07-09/bin/cnvnator + +[RESOURCES] +REFDIR = resources +REF = resources/hs37d5.fa +DBSNP = resources/dbsnp_138.b37.vcf +MILLS = resources/Mills_and_1000G_gold_standard.indels.b37.vcf +INDEL1KG = resources/1000G_phase1.indels.b37.vcf +OMNI = resources/1000G_omni2.5.b37.vcf +HAPMAP = resources/hapmap_3.3.b37.vcf +SNP1KG = resources/1000G_phase1.snps.high_confidence.b37.vcf +KNOWN_GERM_SNP = resources/gnomAD.1KG.ExAC.ESP6500.Kaviar.snps.txt.gz +MASK1KG = resources/20141020.strict_mask.whole_genome.fasta.gz diff --git a/genome_mapping/run.py b/genome_mapping/run.py index 621f2b0..4fe323f 100755 --- a/genome_mapping/run.py +++ b/genome_mapping/run.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 import argparse +import configparser import pandas as pd import synapseclient import subprocess @@ -21,10 +22,8 @@ def main(): args = parse_args() samples = parse_sample_file(args.infile) - synapse_login() - - save_run_info(args.config) + save_run_info(args.parentid) for key, val in samples.items(): sample, filetype = key @@ -86,9 +85,11 @@ def parse_args(): parser = argparse.ArgumentParser(description='Genome Mapping Pipeline') parser.add_argument('infile', metavar='sample_list.txt', help='Sample list file shoud have sample_id, synapse_id, and file_name.') - parser.add_argument('-c', '--config', metavar='config file', - help='Default: [pipeline home]/pipeline.conf', - default="{pipe_home}/pipeline.conf".format(pipe_home=pipe_home)) + parser.add_argument('--parentid', metavar='syn123', + help='''Synapse ID of project or folder where to upload result bam files. +If it is not set, the result bam files will be locally saved. 
+Default: None''', default=None) + return parser.parse_args() def filetype(fname): @@ -109,13 +110,22 @@ def synapse_login(): except: subprocess.run(['synapse', 'login', '--remember-me']) -def save_run_info(config): +def save_run_info(parentid): + config = configparser.ConfigParser() + config.read(pipe_home + "/config.ini") + with open("run_info", "w") as run_file: - run_file.write("CMD_HOME={path}\n\n".format(path=cmd_home)) - run_file.write("UTIL_HOME={path}\n\n".format(path=util_home)) - with open(config) as cfg_file: - for line in cfg_file: - run_file.write(line) + run_file.write("CMD_HOME={path}\n".format(path=cmd_home)) + run_file.write("UTIL_HOME={path}\n".format(path=util_home)) + + for section in config.sections(): + run_file.write("\n#{section}\n".format(section=section)) + for key in config[section]: + run_file.write("{key}={path}/{val}\n".format( + key=key.upper(), val=config[section][key], path=pipe_home)) + + run_file.write("\n#SYNAPSE\n") + run_file.write("PARENTID={id}\n".format(id=parentid)) def log_dir(sample): log_dir = sample+"/logs" diff --git a/pipeline.conf b/pipeline.conf deleted file mode 100644 index e4cd066..0000000 --- a/pipeline.conf +++ /dev/null @@ -1,30 +0,0 @@ -#TOOLS -PYTHON3=/shared/apps/pyenv/versions/3.6.2/bin/python3 -SYNAPSE=/shared/apps/pyenv/versions/3.6.2/bin/synapse -JAVA=/shared/apps/java/jdk1.8.0_144/bin/java -BWA=/shared/apps/bwa/0.7.16a/bin/bwa -SAMTOOLS=/shared/apps/samtools/1.7/bin/samtools -SAMBAMBA=/shared/apps/sambamba/v0.6.7/bin/sambamba -GATK=/shared/apps/gatk/3.7-0/GenomeAnalysisTK.jar -PICARD=/shared/apps/picard/2.12.1/picard.jar -BGZIP=/shared/apps/htslib/1.7/bin/bgzip -TABIX=/shared/apps/htslib/1.7/bin/tabix -VT=/shared/apps/vt/20180617/bin/vt -BCFTOOLS=/shared/apps/bcftools/1.7/bin/bcftools -ROOTSYS=/shared/apps/root/6.14.00 -CNVNATOR=/shared/apps/cnvnator/2018-07-02/bin/cnvnator - -#RESOURCES -REF=/shared/refdata/bsmn/hs37d5.fa -REFDIR=/shared/refdata/bsmn -DBSNP=/shared/refdata/gatk_bundle/b37/dbsnp_138.b37.vcf -MILLS=/shared/refdata/gatk_bundle/b37/Mills_and_1000G_gold_standard.indels.b37.vcf -INDEL1KG=/shared/refdata/gatk_bundle/b37/1000G_phase1.indels.b37.vcf -OMNI=/shared/refdata/gatk_bundle/b37/1000G_omni2.5.b37.vcf -HAPMAP=/shared/refdata/gatk_bundle/b37/hapmap_3.3.b37.vcf -SNP1KG=/shared/refdata/gatk_bundle/b37/1000G_phase1.snps.high_confidence.b37.vcf -KNOWN_GERM_SNP=/shared/refdata/bsmn/gnomAD.1KG.ExAC.ESP6500.Kaviar.snps.txt.gz -MASK1KG=/shared/refdata/bsmn/20141020.strict_mask.whole_genome.fasta.gz - -#SYNAPSE -PARENTID=syn11636015 diff --git a/variant_calling/run.py b/variant_calling/run.py index 857c5a4..54d39da 100755 --- a/variant_calling/run.py +++ b/variant_calling/run.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 import argparse +import configparser import pandas as pd import synapseclient import subprocess @@ -94,18 +95,11 @@ def submit_post_jobs(sample, jid): return jid def parse_args(): - parser = argparse.ArgumentParser( - description='Variant Calling Pipeline') - + parser = argparse.ArgumentParser(description='Variant Calling Pipeline') parser.add_argument( 'infile', metavar='sample_list.txt', help='Sample list file shoud have sample_id, synapse_id, and file_name.') - parser.add_argument( - '-c', '--config', metavar='config file', - help='Default: [pipeline home]/pipeline.conf', - default="{pipe_home}/pipeline.conf".format(pipe_home=pipe_home)) - return parser.parse_args() def parse_sample_file(sfile): @@ -124,12 +118,18 @@ def synapse_login(): subprocess.run(['synapse', 'login', '--remember-me']) def 
save_run_info(config):
+    config = configparser.ConfigParser()
+    config.read(pipe_home + "/config.ini")
+
     with open("run_info", "w") as run_file:
         run_file.write("CMD_HOME={path}\n".format(path=cmd_home))
-        run_file.write("UTIL_HOME={path}\n\n".format(path=util_home))
-        with open(config) as cfg_file:
-            for line in cfg_file:
-                run_file.write(line)
+        run_file.write("UTIL_HOME={path}\n".format(path=util_home))
+
+        for section in config.sections():
+            run_file.write("\n#{section}\n".format(section=section))
+            for key in config[section]:
+                run_file.write("{key}={path}/{val}\n".format(
+                    key=key.upper(), val=config[section][key], path=pipe_home))
 
 def log_dir(sample):
     log_dir = sample+"/logs"
     pathlib.Path(log_dir).mkdir(parents=True, exist_ok=True)
     return log_dir

From 73dfd02c2f722a89587acb4d85bdd432adaa9c65 Mon Sep 17 00:00:00 2001
From: Taejeong Bae
Date: Sun, 25 Nov 2018 16:35:41 -0600
Subject: [PATCH 33/70] pileup.py update as configuration system changes

---
 library/pileup.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/library/pileup.py b/library/pileup.py
index 9f1b949..5c22ef2 100644
--- a/library/pileup.py
+++ b/library/pileup.py
@@ -1,3 +1,4 @@
+import configparser
 import os
 import sys
 import re
@@ -5,12 +6,12 @@
 import subprocess
 from .utils import coroutine
 
-config = os.path.dirname(os.path.realpath(__file__)) + "/../pipeline.conf"
-with open(config) as f:
-    for line in f:
-        if line[:9] == "SAMTOOLS=":
-            SAMTOOLS=line.strip().split('=')[1]
-            break
+lib_home = os.path.dirname(os.path.realpath(__file__))
+pipe_home = os.path.normpath(lib_home + "/..")
+config = configparser.ConfigParser()
+config.read(pipe_home + "/config.ini")
+
+SAMTOOLS = pipe_home + "/" + config["TOOLS"]["SAMTOOLS"]
 
 if not os.path.isfile(SAMTOOLS) or not os.access(SAMTOOLS, os.X_OK):
     SAMTOOLS = shutil.which("samtools")

From d6b12792061f9e8f970fadfc7b276ae7d816af31 Mon Sep 17 00:00:00 2001
From: Taejeong Bae
Date: Sun, 25 Nov 2018 18:52:44 -0600
Subject: [PATCH 34/70] synapse_login refactoring

---
 genome_mapping/run.py  |  9 +--------
 library/login.py       | 18 ++++++++++++++++++
 variant_calling/run.py | 13 +++----------
 3 files changed, 22 insertions(+), 18 deletions(-)
 create mode 100644 library/login.py

diff --git a/genome_mapping/run.py b/genome_mapping/run.py
index 4fe323f..bb22cb4 100755
--- a/genome_mapping/run.py
+++ b/genome_mapping/run.py
@@ -3,8 +3,6 @@
 import argparse
 import configparser
 import pandas as pd
-import synapseclient
-import subprocess
 import pathlib
 import os
 import sys
@@ -16,6 +14,7 @@
 job_home = cmd_home + "/job_scripts"
 sys.path.append(pipe_home)
 
+from library.login import synapse_login
 from library.job_queue import GridEngineQueue
 q = GridEngineQueue()
 
@@ -104,12 +103,6 @@ def parse_sample_file(sfile):
             samples[key].append(val)
     return samples
 
-def synapse_login():
-    try:
-        synapseclient.login()
-    except:
-        subprocess.run(['synapse', 'login', '--remember-me'])
-
 def save_run_info(parentid):
     config = configparser.ConfigParser()
     config.read(pipe_home + "/config.ini")
diff --git a/library/login.py b/library/login.py
new file mode 100644
index 0000000..6f69e23
--- /dev/null
+++ b/library/login.py
@@ -0,0 +1,18 @@
+import configparser
+import os
+import synapseclient
+import subprocess
+
+lib_home = os.path.dirname(os.path.realpath(__file__))
+pipe_home = os.path.normpath(lib_home + "/..")
+config = configparser.ConfigParser()
+config.read(pipe_home + "/config.ini")
+
+SYNAPSE = pipe_home + "/" + config["TOOLS"]["SYNAPSE"]
+
+def synapse_login():
+    print("- Check synapse login")
+    try:
+        synapseclient.login()
+    except:
subprocess.run([SYNAPSE, 'login', '--remember-me']) diff --git a/variant_calling/run.py b/variant_calling/run.py index 54d39da..b19a6ba 100755 --- a/variant_calling/run.py +++ b/variant_calling/run.py @@ -3,8 +3,6 @@ import argparse import configparser import pandas as pd -import synapseclient -import subprocess import pathlib import os import sys @@ -16,6 +14,7 @@ job_home = cmd_home + "/job_scripts" sys.path.append(pipe_home) +from library.login import synapse_login from library.job_queue import GridEngineQueue q = GridEngineQueue() @@ -23,7 +22,7 @@ def main(): args = parse_args() samples = parse_sample_file(args.infile) synapse_login() - save_run_info(args.config) + save_run_info() for sample, sdata in samples.items(): print(sample) @@ -111,13 +110,7 @@ def parse_sample_file(sfile): samples[key].append(val) return samples -def synapse_login(): - try: - synapseclient.login() - except: - subprocess.run(['synapse', 'login', '--remember-me']) - -def save_run_info(config): +def save_run_info(): config = configparser.ConfigParser() config.read(pipe_home + "/config.ini") From f67e494218ec52d750de80f6a86c07ab08b02052 Mon Sep 17 00:00:00 2001 From: Taejeong Bae Date: Sun, 25 Nov 2018 23:15:30 -0600 Subject: [PATCH 35/70] Add nda_login --- analysis_utils/nda_aws_token.sh | 4 +--- genome_mapping/run.py | 3 ++- library/login.py | 23 ++++++++++++++++++++++- variant_calling/run.py | 3 ++- 4 files changed, 27 insertions(+), 6 deletions(-) diff --git a/analysis_utils/nda_aws_token.sh b/analysis_utils/nda_aws_token.sh index 23b012d..2e2edf7 100755 --- a/analysis_utils/nda_aws_token.sh +++ b/analysis_utils/nda_aws_token.sh @@ -44,13 +44,11 @@ done if [ -z $nda_cred_f ]; then read_credential elif [ ! -e $nda_cred_f ]; then - echo "$nda_cred_f file doesn't exist." >$2; exit 1 + echo "$nda_cred_f file doesn't exist." 
>&2; exit 1 else source $nda_cred_f fi -#username=baetj -#password=fb0dc634e9179ebc3460d7a2a8c05cc3905fa00f server="https://ndar.nih.gov/DataManager/dataManager" ############################################################################## diff --git a/genome_mapping/run.py b/genome_mapping/run.py index bb22cb4..dc48c5f 100755 --- a/genome_mapping/run.py +++ b/genome_mapping/run.py @@ -14,7 +14,7 @@ job_home = cmd_home + "/job_scripts" sys.path.append(pipe_home) -from library.login import synapse_login +from library.login import synapse_login, nda_login from library.job_queue import GridEngineQueue q = GridEngineQueue() @@ -22,6 +22,7 @@ def main(): args = parse_args() samples = parse_sample_file(args.infile) synapse_login() + nda_login() save_run_info(args.parentid) for key, val in samples.items(): diff --git a/library/login.py b/library/login.py index 6f69e23..0134657 100644 --- a/library/login.py +++ b/library/login.py @@ -9,10 +9,31 @@ config.read(pipe_home + "/config.ini") SYNAPSE = pipe_home + "/" + config["TOOLS"]["SYNAPSE"] +NDA_TOKEN = pipe_home + "/analysis_utils/nda_aws_token.sh" def synapse_login(): print("- Check synapse login") try: synapseclient.login() except: - subprocess.run([SYNAPSE, 'login', '--remember-me']) + while True: + subprocess.run([SYNAPSE, 'login', '--remember-me']) + try: + synapseclient.login(silent=True) + break + except: + pass + +def nda_login(): + print("- Check NDA login") + nda_cred_f = os.path.expanduser("~/.nda_credential") + if not os.path.isfile(nda_cred_f): + subprocess.run([NDA_TOKEN, '-s', nda_cred_f]) + while True: + run_token = subprocess.run([NDA_TOKEN, '-r', nda_cred_f], stdout=subprocess.PIPE, encoding="utf-8") + if run_token.returncode == 0: + print("Requesting token succeeded!\n") + break + else: + print(run_token.stdout) + subprocess.run([NDA_TOKEN, '-s', nda_cred_f]) diff --git a/variant_calling/run.py b/variant_calling/run.py index b19a6ba..d4cea44 100755 --- a/variant_calling/run.py +++ b/variant_calling/run.py @@ -14,7 +14,7 @@ job_home = cmd_home + "/job_scripts" sys.path.append(pipe_home) -from library.login import synapse_login +from library.login import synapse_login, nda_login() from library.job_queue import GridEngineQueue q = GridEngineQueue() @@ -22,6 +22,7 @@ def main(): args = parse_args() samples = parse_sample_file(args.infile) synapse_login() + nda_login() save_run_info() for sample, sdata in samples.items(): From 475d6f96416c100603b9851347f8d089658b82d8 Mon Sep 17 00:00:00 2001 From: Taejeong Bae Date: Mon, 26 Nov 2018 01:22:45 -0600 Subject: [PATCH 36/70] Refactoring: config, run_info --- .../job_scripts/pre_3.submit_aln_jobs.sh | 3 +- genome_mapping/run.py | 29 +++++------------- library/config.py | 30 +++++++++++++++++++ library/login.py | 12 +++----- library/pileup.py | 10 ++----- .../filter_1.known_germ_filtering.sh | 2 +- .../job_scripts/filter_2-1.vaf_info.sh | 2 +- .../job_scripts/filter_2-2.strand_info.sh | 2 +- variant_calling/run.py | 25 ++++------------ 9 files changed, 55 insertions(+), 60 deletions(-) create mode 100644 library/config.py diff --git a/genome_mapping/job_scripts/pre_3.submit_aln_jobs.sh b/genome_mapping/job_scripts/pre_3.submit_aln_jobs.sh index 47bac35..222b038 100755 --- a/genome_mapping/job_scripts/pre_3.submit_aln_jobs.sh +++ b/genome_mapping/job_scripts/pre_3.submit_aln_jobs.sh @@ -17,6 +17,7 @@ SM=$2 printf -- "[$(date)] Start submit_aln_jobs.\n---\n" CWD=$(pwd) -ssh -o StrictHostKeyChecking=No $HOST "cd $CWD; $PYTHON3 $CMD_HOME/submit_aln_jobs.py $SM" +ssh -o StrictHostKeyChecking=No 
$HOST \ + "cd $CWD; $PYTHON3 $PIPE_HOME/genome_mapping/submit_aln_jobs.py $SM" printf -- "---\n[$(date)] Finish submit_aln_jobs.\n" diff --git a/genome_mapping/run.py b/genome_mapping/run.py index dc48c5f..83e813e 100755 --- a/genome_mapping/run.py +++ b/genome_mapping/run.py @@ -1,7 +1,6 @@ #!/usr/bin/env python3 import argparse -import configparser import pandas as pd import pathlib import os @@ -10,21 +9,24 @@ cmd_home = os.path.dirname(os.path.realpath(__file__)) pipe_home = os.path.normpath(cmd_home + "/..") -util_home = pipe_home + "/analysis_utils" job_home = cmd_home + "/job_scripts" sys.path.append(pipe_home) +from library.config import run_info, run_info_append from library.login import synapse_login, nda_login from library.job_queue import GridEngineQueue q = GridEngineQueue() def main(): args = parse_args() - samples = parse_sample_file(args.infile) + synapse_login() nda_login() - save_run_info(args.parentid) - + + run_info() + run_info_append("\n#SYNAPSE\nPARENTID={}".format(args.parentid)) + + samples = parse_sample_file(args.infile) for key, val in samples.items(): sample, filetype = key print(sample) @@ -104,23 +106,6 @@ def parse_sample_file(sfile): samples[key].append(val) return samples -def save_run_info(parentid): - config = configparser.ConfigParser() - config.read(pipe_home + "/config.ini") - - with open("run_info", "w") as run_file: - run_file.write("CMD_HOME={path}\n".format(path=cmd_home)) - run_file.write("UTIL_HOME={path}\n".format(path=util_home)) - - for section in config.sections(): - run_file.write("\n#{section}\n".format(section=section)) - for key in config[section]: - run_file.write("{key}={path}/{val}\n".format( - key=key.upper(), val=config[section][key], path=pipe_home)) - - run_file.write("\n#SYNAPSE\n") - run_file.write("PARENTID={id}\n".format(id=parentid)) - def log_dir(sample): log_dir = sample+"/logs" pathlib.Path(log_dir).mkdir(parents=True, exist_ok=True) diff --git a/library/config.py b/library/config.py new file mode 100644 index 0000000..318079c --- /dev/null +++ b/library/config.py @@ -0,0 +1,30 @@ +import configparser +import os + +def read_config(): + lib_home = os.path.dirname(os.path.realpath(__file__)) + pipe_home = os.path.normpath(lib_home + "/..") + config = configparser.ConfigParser() + config["PATH"] = {"pipe_home": pipe_home} + + config.read(pipe_home + "/config.ini") + for section in ["TOOLS", "RESOURCES"]: + for key in config[section]: + config[section][key] = pipe_home + "/" + config[section][key] + + return config + +def run_info(): + config = read_config() + + with open("run_info", "w") as run_file: + run_file.write("#PATH\nPIPE_HOME={}\n".format(config["PATH"]["pipe_home"])) + for section in ["TOOLS", "RESOURCES"]: + run_file.write("\n#{section}\n".format(section=section)) + for key in config[section]: + run_file.write("{key}={val}\n".format( + key=key.upper(), val=config[section][key])) + +def run_info_append(line): + with open("run_info", "a") as run_file: + run_file.write(line + "\n") diff --git a/library/login.py b/library/login.py index 0134657..af228ce 100644 --- a/library/login.py +++ b/library/login.py @@ -1,15 +1,11 @@ -import configparser import os import synapseclient import subprocess +from .config import read_config -lib_home = os.path.dirname(os.path.realpath(__file__)) -pipe_home = os.path.normpath(lib_home + "/..") -config = configparser.ConfigParser() -config.read(pipe_home + "/config.ini") - -SYNAPSE = pipe_home + "/" + config["TOOLS"]["SYNAPSE"] -NDA_TOKEN = pipe_home + "/analysis_utils/nda_aws_token.sh" 
+config = read_config() +SYNAPSE = config["TOOLS"]["SYNAPSE"] +NDA_TOKEN = config["PATH"]["pipe_home"] + "/analysis_utils/nda_aws_token.sh" def synapse_login(): print("- Check synapse login") diff --git a/library/pileup.py b/library/pileup.py index 5c22ef2..bac453f 100644 --- a/library/pileup.py +++ b/library/pileup.py @@ -1,17 +1,13 @@ -import configparser import os import sys import re import shutil import subprocess +from .config import read_config from .utils import coroutine -lib_home = os.path.dirname(os.path.realpath(__file__)) -pipe_home = os.path.normpath(lib_home + "/..") -config = configparser.ConfigParser() -config.read(pipe_home + "/config.ini") - -SAMTOOLS = pipe_home + "/" + config["TOOLS"]["SAMTOOLS"] +config = read_config() +SAMTOOLS = config["TOOLS"]["SAMTOOLS"] if not os.path.isfile(SAMTOOLS) or not os.access(SAMTOOLS, os.X_OK): SAMTOOLS = shutil.which("samtools") diff --git a/variant_calling/job_scripts/filter_1.known_germ_filtering.sh b/variant_calling/job_scripts/filter_1.known_germ_filtering.sh index 4e4fb72..b10c86c 100755 --- a/variant_calling/job_scripts/filter_1.known_germ_filtering.sh +++ b/variant_calling/job_scripts/filter_1.known_germ_filtering.sh @@ -24,7 +24,7 @@ if [[ ! -f $OUT_VCF.tbi ]]; then |$VT normalize -n -r $REF - \ |$VT uniq - \ |$BCFTOOLS view -v snps \ - |$PYTHON3 $UTIL_HOME/germline_filter.py -V $KNOWN_GERM_SNP \ + |$PYTHON3 $PIPE_HOME/analysis_utils/germline_filter.py -V $KNOWN_GERM_SNP \ |$BGZIP > $OUT_VCF $TABIX $OUT_VCF else diff --git a/variant_calling/job_scripts/filter_2-1.vaf_info.sh b/variant_calling/job_scripts/filter_2-1.vaf_info.sh index ef4280c..d196a59 100755 --- a/variant_calling/job_scripts/filter_2-1.vaf_info.sh +++ b/variant_calling/job_scripts/filter_2-1.vaf_info.sh @@ -23,6 +23,6 @@ mkdir -p $SM/vaf $BCFTOOLS view -H -f PASS $IN_VCF \ |grep -v ^# |cut -f1,2,4,5 \ - |$PYTHON3 $UTIL_HOME/somatic_vaf.py -b $BAM > $VAF + |$PYTHON3 $PIPE_HOME/analysis_utils/somatic_vaf.py -b $BAM > $VAF printf -- "[$(date)] Finish generate vaf info.\n---\n" diff --git a/variant_calling/job_scripts/filter_2-2.strand_info.sh b/variant_calling/job_scripts/filter_2-2.strand_info.sh index d77ebd3..d99350a 100755 --- a/variant_calling/job_scripts/filter_2-2.strand_info.sh +++ b/variant_calling/job_scripts/filter_2-2.strand_info.sh @@ -23,6 +23,6 @@ mkdir -p $SM/strand $BCFTOOLS view -H -f PASS $IN_VCF \ |cut -f1,2,4,5 \ - |$PYTHON3 $UTIL_HOME/strand_bias.py -b $BAM > $STR + |$PYTHON3 $PIPE_HOME/analysis_utils/strand_bias.py -b $BAM > $STR printf -- "[$(date)] Finish generate strand info.\n---\n" diff --git a/variant_calling/run.py b/variant_calling/run.py index d4cea44..1adf540 100755 --- a/variant_calling/run.py +++ b/variant_calling/run.py @@ -1,7 +1,6 @@ #!/usr/bin/env python3 import argparse -import configparser import pandas as pd import pathlib import os @@ -10,21 +9,23 @@ cmd_home = os.path.dirname(os.path.realpath(__file__)) pipe_home = os.path.normpath(cmd_home + "/..") -util_home = pipe_home + "/analysis_utils" job_home = cmd_home + "/job_scripts" sys.path.append(pipe_home) -from library.login import synapse_login, nda_login() +from library.config import run_info +from library.login import synapse_login, nda_login from library.job_queue import GridEngineQueue q = GridEngineQueue() def main(): args = parse_args() - samples = parse_sample_file(args.infile) + synapse_login() nda_login() - save_run_info() + + run_info() + samples = parse_sample_file(args.infile) for sample, sdata in samples.items(): print(sample) jid_pre = submit_pre_jobs(sample, 
sdata) @@ -111,20 +112,6 @@ def parse_sample_file(sfile): samples[key].append(val) return samples -def save_run_info(): - config = configparser.ConfigParser() - config.read(pipe_home + "/config.ini") - - with open("run_info", "w") as run_file: - run_file.write("CMD_HOME={path}\n".format(path=cmd_home)) - run_file.write("UTIL_HOME={path}\n".format(path=util_home)) - - for section in config.sections(): - run_file.write("\n#{section}\n".format(section=section)) - for key in config[section]: - run_file.write("{key}={path}/{val}\n".format( - key=key.upper(), val=config[section][key], path=pipe_home)) - def log_dir(sample): log_dir = sample+"/logs" pathlib.Path(log_dir).mkdir(parents=True, exist_ok=True) From 6ae30d513b99a9b95c9fbfd3e0d6b1ced8d24923 Mon Sep 17 00:00:00 2001 From: Taejeong Bae Date: Mon, 26 Nov 2018 01:57:03 -0600 Subject: [PATCH 37/70] Library name change: utils -> misc --- analysis_utils/somatic_vaf.py | 2 +- analysis_utils/strand_bias.py | 2 +- library/{utils.py => misc.py} | 0 library/pileup.py | 2 +- 4 files changed, 3 insertions(+), 3 deletions(-) rename library/{utils.py => misc.py} (100%) diff --git a/analysis_utils/somatic_vaf.py b/analysis_utils/somatic_vaf.py index 077f4cb..3f665f1 100755 --- a/analysis_utils/somatic_vaf.py +++ b/analysis_utils/somatic_vaf.py @@ -7,7 +7,7 @@ pipe_home = os.path.dirname(os.path.realpath(__file__)) + "/.." sys.path.append(pipe_home) -from library.utils import coroutine, printer +from library.misc import coroutine, printer from library.pileup import pileup, clean, count def run(args): diff --git a/analysis_utils/strand_bias.py b/analysis_utils/strand_bias.py index 461f860..c242330 100755 --- a/analysis_utils/strand_bias.py +++ b/analysis_utils/strand_bias.py @@ -9,7 +9,7 @@ pipe_home = os.path.dirname(os.path.realpath(__file__)) + "/.." 
sys.path.append(pipe_home) -from library.utils import coroutine, printer +from library.misc import coroutine, printer from library.pileup import pileup, clean, count def run(args): diff --git a/library/utils.py b/library/misc.py similarity index 100% rename from library/utils.py rename to library/misc.py diff --git a/library/pileup.py b/library/pileup.py index bac453f..49a68a1 100644 --- a/library/pileup.py +++ b/library/pileup.py @@ -4,7 +4,7 @@ import shutil import subprocess from .config import read_config -from .utils import coroutine +from .misc import coroutine config = read_config() SAMTOOLS = config["TOOLS"]["SAMTOOLS"] From 7787b738630cc296fe90c974bd143684719f73a2 Mon Sep 17 00:00:00 2001 From: Taejeong Bae Date: Mon, 26 Nov 2018 03:12:03 -0600 Subject: [PATCH 38/70] Refactoring sample list parser & Fix sample list format --- genome_mapping/run.py | 27 +++++++++------------------ library/parser.py | 15 +++++++++++++++ variant_calling/run.py | 23 ++++++++--------------- 3 files changed, 32 insertions(+), 33 deletions(-) create mode 100644 library/parser.py diff --git a/genome_mapping/run.py b/genome_mapping/run.py index 83e813e..aa50f28 100755 --- a/genome_mapping/run.py +++ b/genome_mapping/run.py @@ -1,7 +1,6 @@ #!/usr/bin/env python3 import argparse -import pandas as pd import pathlib import os import sys @@ -14,6 +13,7 @@ from library.config import run_info, run_info_append from library.login import synapse_login, nda_login +from library.parser import sample_list from library.job_queue import GridEngineQueue q = GridEngineQueue() @@ -26,7 +26,7 @@ def main(): run_info() run_info_append("\n#SYNAPSE\nPARENTID={}".format(args.parentid)) - samples = parse_sample_file(args.infile) + samples = sample_list(args.infile) for key, val in samples.items(): sample, filetype = key print(sample) @@ -86,26 +86,17 @@ def submit_aln_jobs(sample, jid): def parse_args(): parser = argparse.ArgumentParser(description='Genome Mapping Pipeline') parser.add_argument('infile', metavar='sample_list.txt', - help='Sample list file shoud have sample_id, synapse_id, and file_name.') + help='''Sample list file. + Each line format is "sample_id\\tfile_name\\tlocation". + Header line should start with "#". Trailing columns will be ignored. + "location" is either a synapse_id, or a s3_location of the NDA. + For data download, synapse or aws clients will be used, respectively.''') parser.add_argument('--parentid', metavar='syn123', help='''Synapse ID of project or folder where to upload result bam files. -If it is not set, the result bam files will be locally saved. -Default: None''', default=None) - + If it is not set, the result bam files will be locally saved. 
+ [ Default: None ]''', default=None) return parser.parse_args() -def filetype(fname): - return "bam" if os.path.splitext(fname)[1] == ".bam" else "fastq" - -def parse_sample_file(sfile): - samples = defaultdict(list) - for sname, group in pd.read_table(sfile).groupby("sample_id"): - for idx, sdata in group.iterrows(): - key = (sname, filetype(sdata["file"])) - val = (sdata["file"], sdata["synapse_id"]) - samples[key].append(val) - return samples - def log_dir(sample): log_dir = sample+"/logs" pathlib.Path(log_dir).mkdir(parents=True, exist_ok=True) diff --git a/library/parser.py b/library/parser.py new file mode 100644 index 0000000..33a3ff0 --- /dev/null +++ b/library/parser.py @@ -0,0 +1,15 @@ +import os +from collections import defaultdict + +def filetype(fname): + return "bam" if os.path.splitext(fname)[1] == ".bam" else "fastq" + +def sample_list(fname): + samples = defaultdict(list) + with open(fname) as sfile: + for line in sfile: + if line[0] == "#": + continue + sample_id, file_name, location = line.strip().split()[:3] + samples[(sample_id, filetype(file_name))].append((file_name, location)) + return samples diff --git a/variant_calling/run.py b/variant_calling/run.py index 1adf540..48c9b38 100755 --- a/variant_calling/run.py +++ b/variant_calling/run.py @@ -1,7 +1,6 @@ #!/usr/bin/env python3 import argparse -import pandas as pd import pathlib import os import sys @@ -14,6 +13,7 @@ from library.config import run_info from library.login import synapse_login, nda_login +from library.parser import sample_list from library.job_queue import GridEngineQueue q = GridEngineQueue() @@ -25,7 +25,7 @@ def main(): run_info() - samples = parse_sample_file(args.infile) + samples = sample_list(args.infile) for sample, sdata in samples.items(): print(sample) jid_pre = submit_pre_jobs(sample, sdata) @@ -97,21 +97,14 @@ def submit_post_jobs(sample, jid): def parse_args(): parser = argparse.ArgumentParser(description='Variant Calling Pipeline') - parser.add_argument( - 'infile', metavar='sample_list.txt', - help='Sample list file shoud have sample_id, synapse_id, and file_name.') - + parser.add_argument('infile', metavar='sample_list.txt', + help='''Sample list file. + Each line format is "sample_id\\tfile_name\\tlocation". + Header line should start with "#". Trailing columns will be ignored. + "location" is either a synapse_id, or a s3_location of the NDA. 
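For orientation, a hypothetical sample_list.txt in the new format (sample names and locations invented for illustration):

```
#sample_id	file_name	location
S01	S01.bam	syn11111111
S02	S02.R1.fastq.gz	s3://nda-central/S02.R1.fastq.gz
S02	S02.R2.fastq.gz	s3://nda-central/S02.R2.fastq.gz
```

sample_list() reduces such a sheet to a dict keyed by (sample_id, filetype), each value a list of (file_name, location) pairs, so a BAM input is routed through the bam2fastq path while FASTQ inputs go straight to the read-group split.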
+ For data download, synapse or aws clients will be used, respectively.''') return parser.parse_args() -def parse_sample_file(sfile): - samples = defaultdict(list) - for sname, group in pd.read_table(sfile).groupby("sample_id"): - for idx, sdata in group.iterrows(): - key = sname - val = (sdata["file"], sdata["synapse_id"]) - samples[key].append(val) - return samples - def log_dir(sample): log_dir = sample+"/logs" pathlib.Path(log_dir).mkdir(parents=True, exist_ok=True) From 1a8c0174b727b0d77d6a89f53b9eb6daa43c0db6 Mon Sep 17 00:00:00 2001 From: Taejeong Bae Date: Mon, 26 Nov 2018 12:22:12 -0600 Subject: [PATCH 39/70] Add synapse login in the download_resources.sh --- download_resources.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/download_resources.sh b/download_resources.sh index 49678e2..7c233b5 100755 --- a/download_resources.sh +++ b/download_resources.sh @@ -2,6 +2,9 @@ mkdir -p resources +# Synapse login +tools/python/3.6.2/bin/synapse login --remember-me + # Download and index the human ref genome tools/python/3.6.2/bin/synapse get syn10347383 --downloadLocation resources/ gunzip resources/hs37d5.fa.gz From 31161f390dabf0e28af6570155adfcd4791ac96f Mon Sep 17 00:00:00 2001 From: Taejeong Bae Date: Mon, 26 Nov 2018 12:28:05 -0600 Subject: [PATCH 40/70] Add NDA download function --- config.ini | 1 + genome_mapping/job_scripts/pre_1.download.sh | 21 +++++++----- .../job_scripts/pre_1b.bam2fastq.sh | 11 ++++--- .../job_scripts/pre_2.split_fastq_by_RG.sh | 9 +++--- .../job_scripts/pre_3.submit_aln_jobs.sh | 8 ++--- genome_mapping/run.py | 32 +++++++++---------- install_tools.sh | 2 +- variant_calling/job_scripts/pre_1.download.sh | 12 +++++-- variant_calling/run.py | 6 ++-- 9 files changed, 57 insertions(+), 45 deletions(-) diff --git a/config.ini b/config.ini index 03d8054..d52b5ed 100644 --- a/config.ini +++ b/config.ini @@ -1,6 +1,7 @@ [TOOLS] PYTHON3 = tools/python/3.6.2/bin/python3 SYNAPSE = tools/python/3.6.2/bin/synapse +AWS = tools/python/3.6.2/bin/aws JAVA = tools/java/jdk1.8.0_191/bin/java BWA = tools/bwa/0.7.16a/bin/bwa SAMTOOLS = tools/samtools/1.7/bin/samtools diff --git a/genome_mapping/job_scripts/pre_1.download.sh b/genome_mapping/job_scripts/pre_1.download.sh index 5dd0938..c766016 100755 --- a/genome_mapping/job_scripts/pre_1.download.sh +++ b/genome_mapping/job_scripts/pre_1.download.sh @@ -2,22 +2,27 @@ #$ -cwd #$ -pe threaded 1 -set -eu -o pipefail - if [[ $# -lt 3 ]]; then - echo "Usage: $(basename $0) [sample name] [file name] [synapse id]" + echo "Usage: $(basename $0) [sample name] [file name] [location]" exit 1 fi source $(pwd)/run_info +set -eu -o pipefail + SM=$1 FNAME=$2 -SINID=$3 +LOC=$3 -printf -- "[$(date)] Start download: $FNAME\n---\n" +printf -- "---\n[$(date)] Start download: $FNAME\n" -mkdir -p $SM/downloads $SM/fastq -$SYNAPSE get $SINID --downloadLocation $SM/downloads/ +mkdir -p $SM/downloads +if [[ $LOC =~ ^syn[0-9]+ ]]; then + $SYNAPSE get $LOC --downloadLocation $SM/downloads/ +elif [[ $LOC =~ ^s3:.+ ]]; then + source <($PIPE_HOME/analysis_utils/nda_aws_token.sh -r ~/.nda_credential) + $AWS s3 cp $LOC $SM/downloads/ +fi -printf -- "---\n[$(date)] Finish downlaod: $FNAME\n" +printf -- "[$(date)] Finish downlaod: $FNAME\n---\n" diff --git a/genome_mapping/job_scripts/pre_1b.bam2fastq.sh b/genome_mapping/job_scripts/pre_1b.bam2fastq.sh index ed128fe..562d250 100755 --- a/genome_mapping/job_scripts/pre_1b.bam2fastq.sh +++ b/genome_mapping/job_scripts/pre_1b.bam2fastq.sh @@ -2,8 +2,6 @@ #$ -cwd #$ -pe threaded 4 -set -eu -o pipefail - if [[
$# -lt 2 ]]; then echo "Usage: $(basename $0) [sample name] [file name]" exit 1 @@ -11,10 +9,12 @@ fi source $(pwd)/run_info +set -eu -o pipefail + SM=$1 FNAME=$2 -printf -- "---\n[$(date)] Start bam2fastq: $FNAME\n---\n" +printf -- "---\n[$(date)] Start bam2fastq: $FNAME\n" FSIZE=$(stat -c%s $SM/downloads/$FNAME) @@ -27,8 +27,9 @@ else exit 1 fi +mkdir -p $SM/fastq $SAMTOOLS collate -uOn $TMP_N $SM/downloads/$FNAME $SM/tmp.collate \ - |$SAMTOOLS fastq -F 0x900 -1 $SM/fastq/$SM.R1.fastq.gz -2 $SM/fastq/$SM.R2.fastq.gz - + |$SAMTOOLS fastq -F 0x900 -1 $SM/fastq/$FNAME.R1.fastq.gz -2 $SM/fastq/$FNAME.R2.fastq.gz - rm $SM/downloads/$FNAME -printf -- "---\n[$(date)] Finish bam2fastq: $FNAME\n" +printf -- "[$(date)] Finish bam2fastq: $FNAME\n---\n" diff --git a/genome_mapping/job_scripts/pre_2.split_fastq_by_RG.sh b/genome_mapping/job_scripts/pre_2.split_fastq_by_RG.sh index 1f3a18c..638cbef 100755 --- a/genome_mapping/job_scripts/pre_2.split_fastq_by_RG.sh +++ b/genome_mapping/job_scripts/pre_2.split_fastq_by_RG.sh @@ -2,8 +2,6 @@ #$ -cwd #$ -pe threaded 3 -set -eu -o pipefail - if [[ $# -lt 1 ]]; then echo "Usage: $(basename $0) [fastq]" exit 1 @@ -11,6 +9,8 @@ fi source $(pwd)/run_info +set -eu -o pipefail + FQ=$1 SM=$(echo $FQ|cut -d"/" -f1) @@ -26,8 +26,9 @@ else RD=R2 fi -printf -- "[$(date)] Start split fastq: $FQ\n---\n" +printf -- "---\n[$(date)] Start split fastq: $FQ\n" +mkdir -p $SM/fastq $CAT $FQ |paste - - - - |awk -F"\t" -v SM=$SM -v RD=$RD '{ h=$1; sub(/^@/,"",h); @@ -40,4 +41,4 @@ $CAT $FQ |paste - - - - |awk -F"\t" -v SM=$SM -v RD=$RD '{ print "READ N: "NR}' rm $FQ -printf -- "---\n[$(date)] Finish split fastq: $FQ\n" +printf -- "[$(date)] Finish split fastq: $FQ\n---\n" diff --git a/genome_mapping/job_scripts/pre_3.submit_aln_jobs.sh b/genome_mapping/job_scripts/pre_3.submit_aln_jobs.sh index 222b038..100c477 100755 --- a/genome_mapping/job_scripts/pre_3.submit_aln_jobs.sh +++ b/genome_mapping/job_scripts/pre_3.submit_aln_jobs.sh @@ -2,8 +2,6 @@ #$ -cwd #$ -pe threaded 1 -set -eu -o pipefail - if [[ $# -lt 2 ]]; then echo "Usage: $(basename $0) [host] [sample]" exit 1 @@ -11,13 +9,15 @@ fi source $(pwd)/run_info +set -eu -o pipefail + HOST=$1 SM=$2 -printf -- "[$(date)] Start submit_aln_jobs.\n---\n" +printf -- "---\n[$(date)] Start submit_aln_jobs.\n" CWD=$(pwd) ssh -o StrictHostKeyChecking=No $HOST \ "cd $CWD; $PYTHON3 $PIPE_HOME/genome_mapping/submit_aln_jobs.py $SM" -printf -- "---\n[$(date)] Finish submit_aln_jobs.\n" +printf -- "[$(date)] Finish submit_aln_jobs.\n---\n" diff --git a/genome_mapping/run.py b/genome_mapping/run.py index aa50f28..33bf232 100755 --- a/genome_mapping/run.py +++ b/genome_mapping/run.py @@ -30,14 +30,13 @@ def main(): for key, val in samples.items(): sample, filetype = key print(sample) - if filetype == "bam": - fname, synid = val[0] - jid = submit_pre_jobs_bam(sample, fname, synid) - else: - jid_list = [] - for sdata in val: - fname, synid = sdata - jid_list.append(submit_pre_jobs_fastq(sample, fname, synid)) + jid_list = [] + for sdata in val: + fname, loc = sdata + if filetype == "bam": + jid_list.append(submit_pre_jobs_bam(sample, fname, loc)) + else: + jid_list.append(submit_pre_jobs_fastq(sample, fname, loc)) jid = ",".join(jid_list) submit_aln_jobs(sample, jid) print() @@ -48,10 +47,10 @@ def opt(sample, jid=None): opt = "-hold_jid {jid} {opt}".format(jid=jid, opt=opt) return opt -def submit_pre_jobs_fastq(sample, fname, synid): +def submit_pre_jobs_fastq(sample, fname, loc): jid = q.submit(opt(sample), - "{job_home}/pre_1.download.sh 
{sample} {fname} {synid}".format( - job_home=job_home, sample=sample, fname=fname, synid=synid)) + "{job_home}/pre_1.download.sh {sample} {fname} {loc}".format( + job_home=job_home, sample=sample, fname=fname, loc=loc)) jid = q.submit(opt(sample, jid), "{job_home}/pre_2.split_fastq_by_RG.sh {sample}/downloads/{fname}".format( @@ -59,10 +58,10 @@ def submit_pre_jobs_fastq(sample, fname, synid): return jid -def submit_pre_jobs_bam(sample, fname, synid): +def submit_pre_jobs_bam(sample, fname, loc): jid = q.submit(opt(sample), - "{job_home}/pre_1.download.sh {sample} {fname} {synid}".format( - job_home=job_home, sample=sample, fname=fname, synid=synid)) + "{job_home}/pre_1.download.sh {sample} {fname} {loc}".format( + job_home=job_home, sample=sample, fname=fname, loc=loc)) jid = q.submit(opt(sample, jid), "{job_home}/pre_1b.bam2fastq.sh {sample} {fname}".format( @@ -70,10 +69,9 @@ def submit_pre_jobs_bam(sample, fname, synid): jid_list = [] for read in ["R1", "R2"]: - fastq = "{sample}/fastq/{sample}.{read}.fastq.gz".format(sample=sample, read=read) jid_list.append(q.submit(opt(sample, jid), - "{job_home}/pre_2.split_fastq_by_RG.sh {sample}/fastq/{sample}.{read}.fastq.gz".format( - job_home=job_home, sample=sample, read=read))) + "{job_home}/pre_2.split_fastq_by_RG.sh {sample}/fastq/{fname}.{read}.fastq.gz".format( + job_home=job_home, sample=sample, fname=fname, read=read))) jid = ",".join(jid_list) return jid diff --git a/install_tools.sh b/install_tools.sh index 6f0a3b7..dafc537 100755 --- a/install_tools.sh +++ b/install_tools.sh @@ -15,7 +15,7 @@ rm -r tools/python/3.6.2/src # Installing Python modules needed tools/python/3.6.2/bin/pip3 install --upgrade pip -tools/python/3.6.2/bin/pip3 install pandas synapseclient statsmodels scipy rpy2 +tools/python/3.6.2/bin/pip3 install awscli synapseclient statsmodels scipy rpy2 # Installling Java8 mkdir -p tools/java diff --git a/variant_calling/job_scripts/pre_1.download.sh b/variant_calling/job_scripts/pre_1.download.sh index dd9455a..df6c31b 100755 --- a/variant_calling/job_scripts/pre_1.download.sh +++ b/variant_calling/job_scripts/pre_1.download.sh @@ -3,7 +3,7 @@ #$ -pe threaded 1 if [[ $# -lt 3 ]]; then - echo "Usage: $(basename $0) [sample name] [file name] [synapse id]" + echo "Usage: $(basename $0) [sample name] [file name] [location]" exit 1 fi @@ -13,10 +13,16 @@ set -eu -o pipefail SM=$1 FNAME=$2 -SINID=$3 +LOC=$3 printf -- "---\n[$(date)] Start download: $FNAME\n" + mkdir -p $SM/bam +if [[ $LOC =~ ^syn[0-9]+ ]]; then + $SYNAPSE get $LOC --downloadLocation $SM/bam/ +elif [[ $LOC =~ ^s3:.+ ]]; then + source <($PIPE_HOME/analysis_utils/nda_aws_token.sh -r ~/.nda_credential) + $AWS s3 cp $LOC $SM/bam/ +fi -# $SYNAPSE get $SINID --downloadLocation $SM/bam/ printf -- "[$(date)] Finish downlaod: $FNAME\n---\n" diff --git a/variant_calling/run.py b/variant_calling/run.py index 48c9b38..5f7fc0b 100755 --- a/variant_calling/run.py +++ b/variant_calling/run.py @@ -46,11 +46,11 @@ def opt(sample, jid=None): def submit_pre_jobs(sample, sdata): jid_list = [] - for fname, synid in sdata: + for fname, loc in sdata: jid_list.append( q.submit(opt(sample), - "{job_home}/pre_1.download.sh {sample} {fname} {synid}".format( - job_home=job_home, sample=sample, fname=fname, synid=synid))) + "{job_home}/pre_1.download.sh {sample} {fname} {loc}".format( + job_home=job_home, sample=sample, fname=fname, loc=loc))) jid = ",".join(jid_list) q.submit(opt(sample, jid), "{job_home}/pre_2.cnvnator.sh {sample}".format( From 213acebd373a08d46226a765bf3eede8413e07ad Mon 
Sep 17 00:00:00 2001 From: Taejeong Bae Date: Mon, 26 Nov 2018 12:28:46 -0600 Subject: [PATCH 41/70] Minor change in job scripts --- genome_mapping/job_scripts/aln_1.align_sort.sh | 8 ++++---- genome_mapping/job_scripts/aln_2.merge_bam.sh | 8 ++++---- genome_mapping/job_scripts/aln_3.markdup.sh | 8 ++++---- genome_mapping/job_scripts/aln_4.indel_realign.sh | 8 ++++---- genome_mapping/job_scripts/aln_5.bqsr.sh | 8 ++++---- genome_mapping/job_scripts/aln_6.upload_bam.sh | 8 ++++---- 6 files changed, 24 insertions(+), 24 deletions(-) diff --git a/genome_mapping/job_scripts/aln_1.align_sort.sh b/genome_mapping/job_scripts/aln_1.align_sort.sh index e16874e..c9af064 100755 --- a/genome_mapping/job_scripts/aln_1.align_sort.sh +++ b/genome_mapping/job_scripts/aln_1.align_sort.sh @@ -2,8 +2,6 @@ #$ -cwd #$ -pe threaded 36 -set -eu -o pipefail - if [[ $# -lt 2 ]]; then echo "Usage: $(basename $0) [sample name] [PU info]" exit 1 @@ -11,10 +9,12 @@ fi source $(pwd)/run_info +set -eu -o pipefail + SM=$1 PU=$2 -printf -- "[$(date)] Start align_sort.\n---\n" +printf -- "---\n[$(date)] Start align_sort.\n" mkdir -p $SM/bam $BWA mem -M -t 32 \ @@ -24,4 +24,4 @@ $BWA mem -M -t 32 \ |$SAMBAMBA sort -m 24GB -t 3 -o $SM/bam/$SM.$PU.sorted.bam --tmpdir=tmp /dev/stdin rm $SM/fastq/$SM.$PU.R{1,2}.fastq.gz -printf -- "---\n[$(date)] Finish align_sort.\n" +printf -- "[$(date)] Finish align_sort.\n---\n" diff --git a/genome_mapping/job_scripts/aln_2.merge_bam.sh b/genome_mapping/job_scripts/aln_2.merge_bam.sh index ff53fc5..11ecf95 100755 --- a/genome_mapping/job_scripts/aln_2.merge_bam.sh +++ b/genome_mapping/job_scripts/aln_2.merge_bam.sh @@ -2,8 +2,6 @@ #$ -cwd #$ -pe threaded 18 -set -eu -o pipefail - if [[ $# -lt 1 ]]; then echo "Usage: $(basename $0) [sample name]" exit 1 @@ -11,9 +9,11 @@ fi source $(pwd)/run_info +set -eu -o pipefail + SM=$1 -printf -- "[$(date)] Start merge_bam.\n---\n" +printf -- "---\n[$(date)] Start merge_bam.\n" if [[ $(ls $SM/bam/$SM.*.sorted.bam|wc -l) == 1 ]]; then mv $SM/bam/$SM.*.sorted.bam $SM/bam/$SM.merged.bam @@ -23,4 +23,4 @@ else rm $SM/bam/$SM.*.sorted.bam{,.bai} fi -printf -- "---\n[$(date)] Finish merge_bam.\n" +printf -- "[$(date)] Finish merge_bam.\n---\n" diff --git a/genome_mapping/job_scripts/aln_3.markdup.sh b/genome_mapping/job_scripts/aln_3.markdup.sh index d9ee088..5a75a69 100755 --- a/genome_mapping/job_scripts/aln_3.markdup.sh +++ b/genome_mapping/job_scripts/aln_3.markdup.sh @@ -2,8 +2,6 @@ #$ -cwd #$ -pe threaded 18 -set -eu -o pipefail - if [[ $# -lt 1 ]]; then echo "Usage: $(basename $0) [sample name]" exit 1 @@ -11,9 +9,11 @@ fi source $(pwd)/run_info +set -eu -o pipefail + SM=$1 -printf -- "[$(date)] Start markdup.\n---\n" +printf -- "---\n[$(date)] Start markdup.\n" $JAVA -Xmx26G -jar $PICARD MarkDuplicates \ I=$SM/bam/$SM.merged.bam \ @@ -25,4 +25,4 @@ $JAVA -Xmx26G -jar $PICARD MarkDuplicates \ rm $SM/bam/$SM.merged.bam{,.bai} -printf -- "---\n[$(date)] Finish markdup.\n" +printf -- "[$(date)] Finish markdup.\n---\n" diff --git a/genome_mapping/job_scripts/aln_4.indel_realign.sh b/genome_mapping/job_scripts/aln_4.indel_realign.sh index 9a5367b..722809e 100755 --- a/genome_mapping/job_scripts/aln_4.indel_realign.sh +++ b/genome_mapping/job_scripts/aln_4.indel_realign.sh @@ -2,8 +2,6 @@ #$ -cwd #$ -pe threaded 36 -set -eu -o pipefail - if [[ $# -lt 1 ]]; then echo "Usage: $(basename $0) [sample name]" exit 1 @@ -11,9 +9,11 @@ fi source $(pwd)/run_info +set -eu -o pipefail + SM=$1 -printf -- "[$(date)] Start RealignerTargetCreator.\n---\n" +printf -- 
"---\n[$(date)] Start RealignerTargetCreator.\n" $JAVA -Xmx58G -Djava.io.tmpdir=tmp -jar $GATK \ -T RealignerTargetCreator -nt 36 \ @@ -32,4 +32,4 @@ $JAVA -Xmx58G -Djava.io.tmpdir=tmp -jar $GATK \ -o $SM/bam/$SM.realigned.bam rm $SM/bam/$SM.markduped.{bam,bai} $SM/realigner.intervals -printf -- "---\n[$(date)] Finish IndelRealigner.\n" +printf -- "[$(date)] Finish IndelRealigner.\n---\n" diff --git a/genome_mapping/job_scripts/aln_5.bqsr.sh b/genome_mapping/job_scripts/aln_5.bqsr.sh index b116a47..7e54326 100755 --- a/genome_mapping/job_scripts/aln_5.bqsr.sh +++ b/genome_mapping/job_scripts/aln_5.bqsr.sh @@ -2,8 +2,6 @@ #$ -cwd #$ -pe threaded 36 -set -eu -o pipefail - if [[ $# -lt 1 ]]; then echo "Usage: $(basename $0) [sample name]" exit 1 @@ -11,9 +9,11 @@ fi source $(pwd)/run_info +set -eu -o pipefail + SM=$1 -printf -- "[$(date)] Start BQSR recal_table.\n---\n" +printf -- "---\n[$(date)] Start BQSR recal_table.\n" $JAVA -Xmx58G -jar $GATK \ -T BaseRecalibrator -nct 36 \ @@ -32,4 +32,4 @@ $JAVA -Xmx58G -jar $GATK \ -o $SM/bam/$SM.bam rm $SM/bam/$SM.realigned.{bam,bai} -printf -- "---\n[$(date)] Finish BQSR PrintReads.\n" +printf -- "[$(date)] Finish BQSR PrintReads.\n---\n" diff --git a/genome_mapping/job_scripts/aln_6.upload_bam.sh b/genome_mapping/job_scripts/aln_6.upload_bam.sh index 80f408f..8b2dd34 100755 --- a/genome_mapping/job_scripts/aln_6.upload_bam.sh +++ b/genome_mapping/job_scripts/aln_6.upload_bam.sh @@ -2,8 +2,6 @@ #$ -cwd #$ -pe threaded 1 -set -eu -o pipefail - if [[ $# -lt 1 ]]; then echo "Usage: $(basename $0) [sample name]" exit 1 @@ -11,9 +9,11 @@ fi source $(pwd)/run_info +set -eu -o pipefail + SM=$1 -printf -- "[$(date)] Start flagstat: $SM.bam \n---\n" +printf -- "---\n[$(date)] Start flagstat: $SM.bam\n" $SAMTOOLS flagstat $SM/bam/$SM.bam > $SM/flagstat.txt @@ -28,4 +28,4 @@ cd .. 
rmdir downloads fastq bam touch done -printf -- "---\n[$(date)] Finish upload: $SM.{bam,bai}\n" +printf -- "[$(date)] Finish upload: $SM.{bam,bai}\n---\n" From 3d2570179f203edaa03f52b9c3157be91119d1d1 Mon Sep 17 00:00:00 2001 From: Taejeong Bae Date: Mon, 26 Nov 2018 12:36:06 -0600 Subject: [PATCH 42/70] Refactoring: rename analysis_utils -> utils --- genome_mapping/job_scripts/pre_1.download.sh | 2 +- library/login.py | 2 +- {analysis_utils => utils}/germline_filter.py | 0 {analysis_utils => utils}/nda_aws_token.sh | 0 {analysis_utils => utils}/nda_s3_path.py | 0 {analysis_utils => utils}/somatic_vaf.py | 0 {analysis_utils => utils}/strand_bias.py | 0 variant_calling/job_scripts/filter_1.known_germ_filtering.sh | 2 +- variant_calling/job_scripts/filter_2-1.vaf_info.sh | 2 +- variant_calling/job_scripts/filter_2-2.strand_info.sh | 2 +- variant_calling/job_scripts/pre_1.download.sh | 2 +- 11 files changed, 6 insertions(+), 6 deletions(-) rename {analysis_utils => utils}/germline_filter.py (100%) rename {analysis_utils => utils}/nda_aws_token.sh (100%) rename {analysis_utils => utils}/nda_s3_path.py (100%) rename {analysis_utils => utils}/somatic_vaf.py (100%) rename {analysis_utils => utils}/strand_bias.py (100%) diff --git a/genome_mapping/job_scripts/pre_1.download.sh b/genome_mapping/job_scripts/pre_1.download.sh index c766016..9a9d0bf 100755 --- a/genome_mapping/job_scripts/pre_1.download.sh +++ b/genome_mapping/job_scripts/pre_1.download.sh @@ -21,7 +21,7 @@ mkdir -p $SM/downloads if [[ $LOC =~ ^syn[0-9]+ ]]; then $SYNAPSE get $LOC --downloadLocation $SM/downloads/ elif [[ $LOC =~ ^s3:.+ ]]; then - source <($PIPE_HOME/analysis_utils/nda_aws_token.sh -r ~/.nda_credential) + source <($PIPE_HOME/utils/nda_aws_token.sh -r ~/.nda_credential) $AWS s3 cp $LOC $SM/downloads/ fi diff --git a/library/login.py b/library/login.py index af228ce..43d3dd8 100644 --- a/library/login.py +++ b/library/login.py @@ -5,7 +5,7 @@ config = read_config() SYNAPSE = config["TOOLS"]["SYNAPSE"] -NDA_TOKEN = config["PATH"]["pipe_home"] + "/analysis_utils/nda_aws_token.sh" +NDA_TOKEN = config["PATH"]["pipe_home"] + "/utils/nda_aws_token.sh" def synapse_login(): print("- Check synapse login") diff --git a/analysis_utils/germline_filter.py b/utils/germline_filter.py similarity index 100% rename from analysis_utils/germline_filter.py rename to utils/germline_filter.py diff --git a/analysis_utils/nda_aws_token.sh b/utils/nda_aws_token.sh similarity index 100% rename from analysis_utils/nda_aws_token.sh rename to utils/nda_aws_token.sh diff --git a/analysis_utils/nda_s3_path.py b/utils/nda_s3_path.py similarity index 100% rename from analysis_utils/nda_s3_path.py rename to utils/nda_s3_path.py diff --git a/analysis_utils/somatic_vaf.py b/utils/somatic_vaf.py similarity index 100% rename from analysis_utils/somatic_vaf.py rename to utils/somatic_vaf.py diff --git a/analysis_utils/strand_bias.py b/utils/strand_bias.py similarity index 100% rename from analysis_utils/strand_bias.py rename to utils/strand_bias.py diff --git a/variant_calling/job_scripts/filter_1.known_germ_filtering.sh b/variant_calling/job_scripts/filter_1.known_germ_filtering.sh index b10c86c..99e2790 100755 --- a/variant_calling/job_scripts/filter_1.known_germ_filtering.sh +++ b/variant_calling/job_scripts/filter_1.known_germ_filtering.sh @@ -24,7 +24,7 @@ if [[ ! 
-f $OUT_VCF.tbi ]]; then |$VT normalize -n -r $REF - \ |$VT uniq - \ |$BCFTOOLS view -v snps \ - |$PYTHON3 $PIPE_HOME/analysis_utils/germline_filter.py -V $KNOWN_GERM_SNP \ + |$PYTHON3 $PIPE_HOME/utils/germline_filter.py -V $KNOWN_GERM_SNP \ |$BGZIP > $OUT_VCF $TABIX $OUT_VCF else diff --git a/variant_calling/job_scripts/filter_2-1.vaf_info.sh b/variant_calling/job_scripts/filter_2-1.vaf_info.sh index d196a59..e7b41aa 100755 --- a/variant_calling/job_scripts/filter_2-1.vaf_info.sh +++ b/variant_calling/job_scripts/filter_2-1.vaf_info.sh @@ -23,6 +23,6 @@ mkdir -p $SM/vaf $BCFTOOLS view -H -f PASS $IN_VCF \ |grep -v ^# |cut -f1,2,4,5 \ - |$PYTHON3 $PIPE_HOME/analysis_utils/somatic_vaf.py -b $BAM > $VAF + |$PYTHON3 $PIPE_HOME/utils/somatic_vaf.py -b $BAM > $VAF printf -- "[$(date)] Finish generate vaf info.\n---\n" diff --git a/variant_calling/job_scripts/filter_2-2.strand_info.sh b/variant_calling/job_scripts/filter_2-2.strand_info.sh index d99350a..a34266b 100755 --- a/variant_calling/job_scripts/filter_2-2.strand_info.sh +++ b/variant_calling/job_scripts/filter_2-2.strand_info.sh @@ -23,6 +23,6 @@ mkdir -p $SM/strand $BCFTOOLS view -H -f PASS $IN_VCF \ |cut -f1,2,4,5 \ - |$PYTHON3 $PIPE_HOME/analysis_utils/strand_bias.py -b $BAM > $STR + |$PYTHON3 $PIPE_HOME/utils/strand_bias.py -b $BAM > $STR printf -- "[$(date)] Finish generate strand info.\n---\n" diff --git a/variant_calling/job_scripts/pre_1.download.sh b/variant_calling/job_scripts/pre_1.download.sh index df6c31b..7fc8693 100755 --- a/variant_calling/job_scripts/pre_1.download.sh +++ b/variant_calling/job_scripts/pre_1.download.sh @@ -21,7 +21,7 @@ mkdir -p $SM/bam if [[ $LOC =~ ^syn[0-9]+ ]]; then $SYNAPSE get $LOC --downloadLocation $SM/bam/ elif [[ $LOC =~ ^s3:.+ ]]; then - source <($PIPE_HOME/analysis_utils/nda_aws_token.sh -r ~/.nda_credential) + source <($PIPE_HOME/utils/nda_aws_token.sh -r ~/.nda_credential) $AWS s3 cp $LOC $SM/bam/ fi From 4c1e75cede34a87fbbac81b7d398b58970253a46 Mon Sep 17 00:00:00 2001 From: Taejeong Bae Date: Tue, 27 Nov 2018 23:02:51 -0600 Subject: [PATCH 43/70] Add an option for turning off bam upload to synapse --- genome_mapping/submit_aln_jobs.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/genome_mapping/submit_aln_jobs.py b/genome_mapping/submit_aln_jobs.py index 5d5de7e..1090331 100755 --- a/genome_mapping/submit_aln_jobs.py +++ b/genome_mapping/submit_aln_jobs.py @@ -35,14 +35,21 @@ def main(): jid = q.submit(opt(args.sample, jid), "{job_home}/aln_5.bqsr.sh {sample}".format(job_home=job_home, sample=args.sample)) - q.submit(opt(args.sample, jid), - "{job_home}/aln_6.upload_bam.sh {sample}".format(job_home=job_home, sample=args.sample)) + if parentid() != "None": + q.submit(opt(args.sample, jid), + "{job_home}/aln_6.upload_bam.sh {sample}".format(job_home=job_home, sample=args.sample)) def parse_args(): parser = argparse.ArgumentParser(description='Alignment job submitter') parser.add_argument('sample', metavar='sample name') return parser.parse_args() +def parentid(): + with open('/efs/tmp/run_info') as run_info: + for line in run_info: + if line[:8] == "PARENTID": + return line.strip().split("=")[1] + def log_dir(sample): log_dir = sample+"/logs" pathlib.Path(log_dir).mkdir(parents=True, exist_ok=True) From aad38a2fe005d8b0c3e4023d73c9c52b381e7cbb Mon Sep 17 00:00:00 2001 From: Taejeong Bae Date: Tue, 27 Nov 2018 23:37:44 -0600 Subject: [PATCH 44/70] Add wrapper scripts for pipeline runners --- genome_mapping.sh | 6 ++++++ requirements.txt | 3 --- 
variant_calling.sh | 6 ++++++ 3 files changed, 12 insertions(+), 3 deletions(-) create mode 100755 genome_mapping.sh delete mode 100644 requirements.txt create mode 100755 variant_calling.sh diff --git a/genome_mapping.sh b/genome_mapping.sh new file mode 100755 index 0000000..aa135bc --- /dev/null +++ b/genome_mapping.sh @@ -0,0 +1,6 @@ +#!/bin/bash + +PIPE_HOME=$(dirname $(readlink -f ${BASH_SOURCE[0]})) +PYTHON3=$PIPE_HOME/$(grep PYTHON3 $PIPE_HOME/config.ini |cut -f2 -d=|sed 's/^ \+//') + +$PYTHON3 $PIPE_HOME/genome_mapping/run.py $@ diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 3f629c7..0000000 --- a/requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -statsmodels -pandas -synapseclient diff --git a/variant_calling.sh b/variant_calling.sh new file mode 100755 index 0000000..bd5a734 --- /dev/null +++ b/variant_calling.sh @@ -0,0 +1,6 @@ +#!/bin/bash + +PIPE_HOME=$(dirname $(readlink -f ${BASH_SOURCE[0]})) +PYTHON3=$PIPE_HOME/$(grep PYTHON3 $PIPE_HOME/config.ini |cut -f2 -d=|sed 's/^ \+//') + +$PYTHON3 $PIPE_HOME/variant_calling/run.py $@ From 2818e1c82067fcd98f00096c3feed620091dc1e2 Mon Sep 17 00:00:00 2001 From: Taejeong Bae Date: Wed, 28 Nov 2018 01:28:11 -0600 Subject: [PATCH 45/70] Bug_fix: NDA credential setting --- genome_mapping/job_scripts/pre_1.download.sh | 2 +- variant_calling/job_scripts/pre_1.download.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/genome_mapping/job_scripts/pre_1.download.sh b/genome_mapping/job_scripts/pre_1.download.sh index 9a9d0bf..6e6b7c3 100755 --- a/genome_mapping/job_scripts/pre_1.download.sh +++ b/genome_mapping/job_scripts/pre_1.download.sh @@ -21,7 +21,7 @@ mkdir -p $SM/downloads if [[ $LOC =~ ^syn[0-9]+ ]]; then $SYNAPSE get $LOC --downloadLocation $SM/downloads/ elif [[ $LOC =~ ^s3:.+ ]]; then - source <($PIPE_HOME/utils/nda_aws_token.sh -r ~/.nda_credential) + eval "$($PIPE_HOME/utils/nda_aws_token.sh -r ~/.nda_credential)" $AWS s3 cp $LOC $SM/downloads/ fi diff --git a/variant_calling/job_scripts/pre_1.download.sh b/variant_calling/job_scripts/pre_1.download.sh index 7fc8693..c81613d 100755 --- a/variant_calling/job_scripts/pre_1.download.sh +++ b/variant_calling/job_scripts/pre_1.download.sh @@ -21,7 +21,7 @@ mkdir -p $SM/bam if [[ $LOC =~ ^syn[0-9]+ ]]; then $SYNAPSE get $LOC --downloadLocation $SM/bam/ elif [[ $LOC =~ ^s3:.+ ]]; then - source <($PIPE_HOME/utils/nda_aws_token.sh -r ~/.nda_credential) + eval "$($PIPE_HOME/utils/nda_aws_token.sh -r ~/.nda_credential)" $AWS s3 cp $LOC $SM/bam/ fi From 9675dea733eff970d11abc34ae06ecd89fbd3edb Mon Sep 17 00:00:00 2001 From: Taejeong Bae Date: Wed, 28 Nov 2018 10:41:07 -0600 Subject: [PATCH 46/70] minor change of the argument help in runners --- genome_mapping/run.py | 2 +- variant_calling/run.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/genome_mapping/run.py b/genome_mapping/run.py index 33bf232..30e1716 100755 --- a/genome_mapping/run.py +++ b/genome_mapping/run.py @@ -87,7 +87,7 @@ def parse_args(): help='''Sample list file. Each line format is "sample_id\\tfile_name\\tlocation". Header line should start with "#". Trailing columns will be ignored. - "location" is either a synapse_id, or a s3_location of the NDA. + "location" is either a synapse_id, or a s3_uri of the NDA. For data download, synapse or aws clients will be used, respectively.''') parser.add_argument('--parentid', metavar='syn123', help='''Synapse ID of project or folder where to upload result bam files. 
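A usage sketch tying the new pieces together; the sample arguments are invented, and the token helper is assumed to print shell-evaluable assignments (AWS_ACCESS_KEY_ID and friends) on stdout, which is the contract both the old process substitution and the new eval rely on:

```bash
# Run the mapping pipeline through the wrapper; it resolves the bundled
# python3 from config.ini and forwards every argument to genome_mapping/run.py.
./genome_mapping.sh sample_list.txt --parentid syn123

# Inside a download job: evaluate the helper's stdout to set the temporary
# NDA credentials in the current shell, then copy from the NDA bucket.
eval "$($PIPE_HOME/utils/nda_aws_token.sh -r ~/.nda_credential)"
$AWS s3 cp s3://nda-central/S02.R1.fastq.gz S02/downloads/
```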
diff --git a/variant_calling/run.py b/variant_calling/run.py index 5f7fc0b..37fa973 100755 --- a/variant_calling/run.py +++ b/variant_calling/run.py @@ -101,7 +101,7 @@ def parse_args(): help='''Sample list file. Each line format is "sample_id\\tfile_name\\tlocation". Header line should start with "#". Trailing columns will be ignored. - "location" is either a synapse_id, or a s3_location of the NDA. + "location" is either a synapse_id, or a s3_uri of the NDA. For data download, synapse or aws clients will be used, respectively.''') return parser.parse_args() From 99d7a68dd60137146c2eab00acb497710aef3c98 Mon Sep 17 00:00:00 2001 From: Taejeong Bae Date: Wed, 28 Nov 2018 10:55:38 -0600 Subject: [PATCH 47/70] Simplify aws download log --- genome_mapping/job_scripts/pre_1.download.sh | 2 +- variant_calling/job_scripts/pre_1.download.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/genome_mapping/job_scripts/pre_1.download.sh b/genome_mapping/job_scripts/pre_1.download.sh index 6e6b7c3..307f2fb 100755 --- a/genome_mapping/job_scripts/pre_1.download.sh +++ b/genome_mapping/job_scripts/pre_1.download.sh @@ -22,7 +22,7 @@ if [[ $LOC =~ ^syn[0-9]+ ]]; then $SYNAPSE get $LOC --downloadLocation $SM/downloads/ elif [[ $LOC =~ ^s3:.+ ]]; then eval "$($PIPE_HOME/utils/nda_aws_token.sh -r ~/.nda_credential)" - $AWS s3 cp $LOC $SM/downloads/ + $AWS s3 cp --no-progress $LOC $SM/downloads/ fi printf -- "[$(date)] Finish downlaod: $FNAME\n---\n" diff --git a/variant_calling/job_scripts/pre_1.download.sh b/variant_calling/job_scripts/pre_1.download.sh index c81613d..fcf73f5 100755 --- a/variant_calling/job_scripts/pre_1.download.sh +++ b/variant_calling/job_scripts/pre_1.download.sh @@ -22,7 +22,7 @@ if [[ $LOC =~ ^syn[0-9]+ ]]; then $SYNAPSE get $LOC --downloadLocation $SM/bam/ elif [[ $LOC =~ ^s3:.+ ]]; then eval "$($PIPE_HOME/utils/nda_aws_token.sh -r ~/.nda_credential)" - $AWS s3 cp $LOC $SM/bam/ + $AWS s3 cp --no-progress $LOC $SM/bam/ fi printf -- "[$(date)] Finish downlaod: $FNAME\n---\n" From 936b31049cac0a854afff70103cbe32d6ce4bf73 Mon Sep 17 00:00:00 2001 From: Taejeong Bae Date: Wed, 28 Nov 2018 11:41:36 -0600 Subject: [PATCH 48/70] Removing a conflict of tmp collate file names when running bam2fq on multiple bams --- genome_mapping/job_scripts/pre_1b.bam2fastq.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/genome_mapping/job_scripts/pre_1b.bam2fastq.sh b/genome_mapping/job_scripts/pre_1b.bam2fastq.sh index 562d250..17e3190 100755 --- a/genome_mapping/job_scripts/pre_1b.bam2fastq.sh +++ b/genome_mapping/job_scripts/pre_1b.bam2fastq.sh @@ -28,7 +28,7 @@ else fi mkdir -p $SM/fastq -$SAMTOOLS collate -uOn $TMP_N $SM/downloads/$FNAME $SM/tmp.collate \ +$SAMTOOLS collate -uOn $TMP_N $SM/downloads/$FNAME $SM/$FNAME.collate \ |$SAMTOOLS fastq -F 0x900 -1 $SM/fastq/$FNAME.R1.fastq.gz -2 $SM/fastq/$FNAME.R2.fastq.gz - rm $SM/downloads/$FNAME From 7ea731be42c5fd4f47fdac490bec0a952a7bae37 Mon Sep 17 00:00:00 2001 From: Taejeong Bae Date: Mon, 3 Dec 2018 21:04:56 -0600 Subject: [PATCH 49/70] Refactoring: run_info, log_dir --- genome_mapping/run.py | 12 +++--------- genome_mapping/submit_aln_jobs.py | 9 ++------- library/config.py | 14 ++++++++++---- variant_calling/run.py | 10 ++-------- 4 files changed, 17 insertions(+), 28 deletions(-) diff --git a/genome_mapping/run.py b/genome_mapping/run.py index 30e1716..11f9a5e 100755 --- a/genome_mapping/run.py +++ b/genome_mapping/run.py @@ -1,7 +1,6 @@ #!/usr/bin/env python3 import argparse -import pathlib import os 
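The collate fix above matters when one sample has several input BAMs converted in parallel: keying the temporary prefix by input file name keeps concurrent jobs from clobbering each other's S01/tmp.collate.* files. With illustrative names:

```bash
# One of two parallel bam2fastq jobs for sample S01; the other converts
# b.bam and therefore writes to the distinct prefix S01/b.bam.collate.
samtools collate -uOn 128 S01/downloads/a.bam S01/a.bam.collate \
    | samtools fastq -F 0x900 -1 S01/fastq/a.bam.R1.fastq.gz -2 S01/fastq/a.bam.R2.fastq.gz -
```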
import sys from collections import defaultdict @@ -11,7 +10,7 @@ job_home = cmd_home + "/job_scripts" sys.path.append(pipe_home) -from library.config import run_info, run_info_append +from library.config import run_info, run_info_append, log_dir from library.login import synapse_login, nda_login from library.parser import sample_list from library.job_queue import GridEngineQueue @@ -23,8 +22,8 @@ def main(): synapse_login() nda_login() - run_info() - run_info_append("\n#SYNAPSE\nPARENTID={}".format(args.parentid)) + run_info("run_info") + run_info_append("run_info", "\n#SYNAPSE\nPARENTID={}".format(args.parentid)) samples = sample_list(args.infile) for key, val in samples.items(): @@ -95,10 +94,5 @@ def parse_args(): [ Default: None ]''', default=None) return parser.parse_args() -def log_dir(sample): - log_dir = sample+"/logs" - pathlib.Path(log_dir).mkdir(parents=True, exist_ok=True) - return log_dir - if __name__ == "__main__": main() diff --git a/genome_mapping/submit_aln_jobs.py b/genome_mapping/submit_aln_jobs.py index 1090331..a8a9923 100755 --- a/genome_mapping/submit_aln_jobs.py +++ b/genome_mapping/submit_aln_jobs.py @@ -1,7 +1,6 @@ #!/usr/bin/env python3 import argparse -import pathlib import glob import os import sys @@ -11,6 +10,7 @@ job_home = cmd_home + "/job_scripts" sys.path.append(pipe_home) +from library.config import log_dir from library.job_queue import GridEngineQueue def main(): @@ -45,16 +45,11 @@ def parse_args(): return parser.parse_args() def parentid(): - with open('/efs/tmp/run_info') as run_info: + with open('run_info') as run_info: for line in run_info: if line[:8] == "PARENTID": return line.strip().split("=")[1] -def log_dir(sample): - log_dir = sample+"/logs" - pathlib.Path(log_dir).mkdir(parents=True, exist_ok=True) - return log_dir - def opt(sample, jid=None): opt = "-j y -o {log_dir} -l h_vmem=2G".format(log_dir=log_dir(sample)) if jid is not None: diff --git a/library/config.py b/library/config.py index 318079c..c23522b 100644 --- a/library/config.py +++ b/library/config.py @@ -1,4 +1,5 @@ import configparser +import pathlib import os def read_config(): @@ -14,10 +15,10 @@ def read_config(): return config -def run_info(): +def run_info(fname): config = read_config() - with open("run_info", "w") as run_file: + with open(fname, "w") as run_file: run_file.write("#PATH\nPIPE_HOME={}\n".format(config["PATH"]["pipe_home"])) for section in ["TOOLS", "RESOURCES"]: run_file.write("\n#{section}\n".format(section=section)) @@ -25,6 +26,11 @@ def run_info(): run_file.write("{key}={val}\n".format( key=key.upper(), val=config[section][key])) -def run_info_append(line): - with open("run_info", "a") as run_file: +def run_info_append(fname, line): + with open(fname, "a") as run_file: run_file.write(line + "\n") + +def log_dir(sample): + log_dir = sample+"/logs" + pathlib.Path(log_dir).mkdir(parents=True, exist_ok=True) + return log_dir diff --git a/variant_calling/run.py b/variant_calling/run.py index 37fa973..1e636dd 100755 --- a/variant_calling/run.py +++ b/variant_calling/run.py @@ -1,7 +1,6 @@ #!/usr/bin/env python3 import argparse -import pathlib import os import sys from collections import defaultdict @@ -11,7 +10,7 @@ job_home = cmd_home + "/job_scripts" sys.path.append(pipe_home) -from library.config import run_info +from library.config import run_info, log_dir from library.login import synapse_login, nda_login from library.parser import sample_list from library.job_queue import GridEngineQueue @@ -23,7 +22,7 @@ def main(): synapse_login() nda_login() - 
run_info() + run_info("run_info") samples = sample_list(args.infile) for sample, sdata in samples.items(): @@ -105,10 +104,5 @@ def parse_args(): For data download, synapse or aws clients will be used, respectively.''') return parser.parse_args() -def log_dir(sample): - log_dir = sample+"/logs" - pathlib.Path(log_dir).mkdir(parents=True, exist_ok=True) - return log_dir - if __name__ == "__main__": main() From e64ab24642e9084515f2781eb69bb2ed03998b75 Mon Sep 17 00:00:00 2001 From: Taejeong Bae Date: Mon, 3 Dec 2018 22:17:39 -0600 Subject: [PATCH 50/70] Change location to save run_info --- genome_mapping/job_scripts/aln_1.align_sort.sh | 8 ++++---- genome_mapping/job_scripts/aln_2.merge_bam.sh | 6 +++--- genome_mapping/job_scripts/aln_3.markdup.sh | 6 +++--- genome_mapping/job_scripts/aln_4.indel_realign.sh | 6 +++--- genome_mapping/job_scripts/aln_5.bqsr.sh | 6 +++--- genome_mapping/job_scripts/aln_6.upload_bam.sh | 6 +++--- genome_mapping/job_scripts/pre_1.download.sh | 8 ++++---- genome_mapping/job_scripts/pre_1b.bam2fastq.sh | 8 ++++---- genome_mapping/job_scripts/pre_2.split_fastq_by_RG.sh | 8 ++++---- genome_mapping/job_scripts/pre_3.submit_aln_jobs.sh | 8 ++++---- genome_mapping/run.py | 8 +++++--- genome_mapping/submit_aln_jobs.py | 6 +++--- .../job_scripts/filter_1.known_germ_filtering.sh | 8 ++++---- variant_calling/job_scripts/filter_2-1.vaf_info.sh | 8 ++++---- variant_calling/job_scripts/filter_2-2.strand_info.sh | 8 ++++---- variant_calling/job_scripts/filter_3.bias_summary.sh | 8 ++++---- variant_calling/job_scripts/gatk_1.hc_gvcf.sh | 8 ++++---- variant_calling/job_scripts/gatk_2.joint_gt.sh | 8 ++++---- variant_calling/job_scripts/gatk_3.concat_vcf.sh | 8 ++++---- variant_calling/job_scripts/gatk_4.vqsr.sh | 8 ++++---- variant_calling/job_scripts/post_1.candidates.sh | 6 +++--- variant_calling/job_scripts/pre_1.download.sh | 8 ++++---- variant_calling/job_scripts/pre_2.cnvnator.sh | 5 +++-- variant_calling/run.py | 5 +++-- 24 files changed, 88 insertions(+), 84 deletions(-) diff --git a/genome_mapping/job_scripts/aln_1.align_sort.sh b/genome_mapping/job_scripts/aln_1.align_sort.sh index c9af064..d871d63 100755 --- a/genome_mapping/job_scripts/aln_1.align_sort.sh +++ b/genome_mapping/job_scripts/aln_1.align_sort.sh @@ -7,13 +7,13 @@ if [[ $# -lt 2 ]]; then exit 1 fi -source $(pwd)/run_info - -set -eu -o pipefail - SM=$1 PU=$2 +source $(pwd)/$SM/run_info + +set -eu -o pipefail + printf -- "---\n[$(date)] Start align_sort.\n" mkdir -p $SM/bam diff --git a/genome_mapping/job_scripts/aln_2.merge_bam.sh b/genome_mapping/job_scripts/aln_2.merge_bam.sh index 11ecf95..0e2c605 100755 --- a/genome_mapping/job_scripts/aln_2.merge_bam.sh +++ b/genome_mapping/job_scripts/aln_2.merge_bam.sh @@ -7,11 +7,11 @@ if [[ $# -lt 1 ]]; then exit 1 fi -source $(pwd)/run_info +SM=$1 -set -eu -o pipefail +source $(pwd)/$SM/run_info -SM=$1 +set -eu -o pipefail printf -- "---\n[$(date)] Start merge_bam.\n" diff --git a/genome_mapping/job_scripts/aln_3.markdup.sh b/genome_mapping/job_scripts/aln_3.markdup.sh index 5a75a69..0306a30 100755 --- a/genome_mapping/job_scripts/aln_3.markdup.sh +++ b/genome_mapping/job_scripts/aln_3.markdup.sh @@ -7,11 +7,11 @@ if [[ $# -lt 1 ]]; then exit 1 fi -source $(pwd)/run_info +SM=$1 -set -eu -o pipefail +source $(pwd)/$SM/run_info -SM=$1 +set -eu -o pipefail printf -- "---\n[$(date)] Start markdup.\n" diff --git a/genome_mapping/job_scripts/aln_4.indel_realign.sh b/genome_mapping/job_scripts/aln_4.indel_realign.sh index 722809e..8593f54 100755 --- 
a/genome_mapping/job_scripts/aln_4.indel_realign.sh +++ b/genome_mapping/job_scripts/aln_4.indel_realign.sh @@ -7,11 +7,11 @@ if [[ $# -lt 1 ]]; then exit 1 fi -source $(pwd)/run_info +SM=$1 -set -eu -o pipefail +source $(pwd)/$SM/run_info -SM=$1 +set -eu -o pipefail printf -- "---\n[$(date)] Start RealignerTargetCreator.\n" diff --git a/genome_mapping/job_scripts/aln_5.bqsr.sh b/genome_mapping/job_scripts/aln_5.bqsr.sh index 7e54326..9cb76ad 100755 --- a/genome_mapping/job_scripts/aln_5.bqsr.sh +++ b/genome_mapping/job_scripts/aln_5.bqsr.sh @@ -7,11 +7,11 @@ if [[ $# -lt 1 ]]; then exit 1 fi -source $(pwd)/run_info +SM=$1 -set -eu -o pipefail +source $(pwd)/$SM/run_info -SM=$1 +set -eu -o pipefail printf -- "---\n[$(date)] Start BQSR recal_table.\n" diff --git a/genome_mapping/job_scripts/aln_6.upload_bam.sh b/genome_mapping/job_scripts/aln_6.upload_bam.sh index 8b2dd34..f06f69d 100755 --- a/genome_mapping/job_scripts/aln_6.upload_bam.sh +++ b/genome_mapping/job_scripts/aln_6.upload_bam.sh @@ -7,11 +7,11 @@ if [[ $# -lt 1 ]]; then exit 1 fi -source $(pwd)/run_info +SM=$1 -set -eu -o pipefail +source $(pwd)/$SM/run_info -SM=$1 +set -eu -o pipefail printf -- "---\n[$(date)] Start flagstat: $SM.bam\n" diff --git a/genome_mapping/job_scripts/pre_1.download.sh b/genome_mapping/job_scripts/pre_1.download.sh index 307f2fb..9c5e9a4 100755 --- a/genome_mapping/job_scripts/pre_1.download.sh +++ b/genome_mapping/job_scripts/pre_1.download.sh @@ -7,14 +7,14 @@ if [[ $# -lt 3 ]]; then exit 1 fi -source $(pwd)/run_info - -set -eu -o pipefail - SM=$1 FNAME=$2 LOC=$3 +source $(pwd)/$SM/run_info + +set -eu -o pipefail + printf -- "---\n[$(date)] Start download: $FNAME\n" mkdir -p $SM/downloads diff --git a/genome_mapping/job_scripts/pre_1b.bam2fastq.sh b/genome_mapping/job_scripts/pre_1b.bam2fastq.sh index 17e3190..09ac5ce 100755 --- a/genome_mapping/job_scripts/pre_1b.bam2fastq.sh +++ b/genome_mapping/job_scripts/pre_1b.bam2fastq.sh @@ -7,13 +7,13 @@ if [[ $# -lt 2 ]]; then exit 1 fi -source $(pwd)/run_info - -set -eu -o pipefail - SM=$1 FNAME=$2 +source $(pwd)/$SM/run_info + +set -eu -o pipefail + printf -- "---\n[$(date)] Start bam2fastq: $FNAME\n" FSIZE=$(stat -c%s $SM/downloads/$FNAME) diff --git a/genome_mapping/job_scripts/pre_2.split_fastq_by_RG.sh b/genome_mapping/job_scripts/pre_2.split_fastq_by_RG.sh index 638cbef..fa59551 100755 --- a/genome_mapping/job_scripts/pre_2.split_fastq_by_RG.sh +++ b/genome_mapping/job_scripts/pre_2.split_fastq_by_RG.sh @@ -7,13 +7,13 @@ if [[ $# -lt 1 ]]; then exit 1 fi -source $(pwd)/run_info - -set -eu -o pipefail - FQ=$1 SM=$(echo $FQ|cut -d"/" -f1) +source $(pwd)/$SM/run_info + +set -eu -o pipefail + if [[ $FQ == *.gz ]]; then CAT=zcat else diff --git a/genome_mapping/job_scripts/pre_3.submit_aln_jobs.sh b/genome_mapping/job_scripts/pre_3.submit_aln_jobs.sh index 100c477..0c2a241 100755 --- a/genome_mapping/job_scripts/pre_3.submit_aln_jobs.sh +++ b/genome_mapping/job_scripts/pre_3.submit_aln_jobs.sh @@ -7,13 +7,13 @@ if [[ $# -lt 2 ]]; then exit 1 fi -source $(pwd)/run_info - -set -eu -o pipefail - HOST=$1 SM=$2 +source $(pwd)/$SM/run_info + +set -eu -o pipefail + printf -- "---\n[$(date)] Start submit_aln_jobs.\n" CWD=$(pwd) diff --git a/genome_mapping/run.py b/genome_mapping/run.py index 11f9a5e..c22c0c4 100755 --- a/genome_mapping/run.py +++ b/genome_mapping/run.py @@ -22,13 +22,15 @@ def main(): synapse_login() nda_login() - run_info("run_info") - run_info_append("run_info", "\n#SYNAPSE\nPARENTID={}".format(args.parentid)) - samples = 
sample_list(args.infile) for key, val in samples.items(): sample, filetype = key print(sample) + + f_run_info = sample + "/run_info" + run_info(f_run_info) + run_info_append(f_run_info, "\n#SYNAPSE\nPARENTID={}".format(args.parentid)) + jid_list = [] for sdata in val: fname, loc = sdata diff --git a/genome_mapping/submit_aln_jobs.py b/genome_mapping/submit_aln_jobs.py index a8a9923..eaedd47 100755 --- a/genome_mapping/submit_aln_jobs.py +++ b/genome_mapping/submit_aln_jobs.py @@ -35,7 +35,7 @@ def main(): jid = q.submit(opt(args.sample, jid), "{job_home}/aln_5.bqsr.sh {sample}".format(job_home=job_home, sample=args.sample)) - if parentid() != "None": + if parentid(args.sample) != "None": q.submit(opt(args.sample, jid), "{job_home}/aln_6.upload_bam.sh {sample}".format(job_home=job_home, sample=args.sample)) @@ -44,8 +44,8 @@ def parse_args(): parser.add_argument('sample', metavar='sample name') return parser.parse_args() -def parentid(): - with open('run_info') as run_info: +def parentid(sample): + with open(sample + "/run_info") as run_info: for line in run_info: if line[:8] == "PARENTID": return line.strip().split("=")[1] diff --git a/variant_calling/job_scripts/filter_1.known_germ_filtering.sh b/variant_calling/job_scripts/filter_1.known_germ_filtering.sh index 99e2790..335a85a 100755 --- a/variant_calling/job_scripts/filter_1.known_germ_filtering.sh +++ b/variant_calling/job_scripts/filter_1.known_germ_filtering.sh @@ -7,13 +7,13 @@ if [[ $# -lt 2 ]]; then exit 1 fi -source $(pwd)/run_info - -set -eu -o pipefail - SM=$1 PL=$2 +source $(pwd)/$SM/run_info + +set -eu -o pipefail + IN_VCF=$SM/recal_vcf/$SM.ploidy_$PL.vcf OUT_VCF=$SM/recal_vcf/$SM.ploidy_$PL.known_germ_filtered.snvs.vcf.gz diff --git a/variant_calling/job_scripts/filter_2-1.vaf_info.sh b/variant_calling/job_scripts/filter_2-1.vaf_info.sh index e7b41aa..51f0ff7 100755 --- a/variant_calling/job_scripts/filter_2-1.vaf_info.sh +++ b/variant_calling/job_scripts/filter_2-1.vaf_info.sh @@ -7,13 +7,13 @@ if [[ $# -lt 2 ]]; then exit 1 fi -source $(pwd)/run_info - -set -eu -o pipefail - SM=$1 PL=$2 +source $(pwd)/$SM/run_info + +set -eu -o pipefail + IN_VCF=$SM/recal_vcf/$SM.ploidy_$PL.known_germ_filtered.snvs.vcf.gz VAF=$SM/vaf/$SM.ploidy_$PL.known_germ_filtered.pass.snvs.txt BAM=$SM/bam/$SM.bam diff --git a/variant_calling/job_scripts/filter_2-2.strand_info.sh b/variant_calling/job_scripts/filter_2-2.strand_info.sh index a34266b..69979c8 100755 --- a/variant_calling/job_scripts/filter_2-2.strand_info.sh +++ b/variant_calling/job_scripts/filter_2-2.strand_info.sh @@ -7,13 +7,13 @@ if [[ $# -lt 2 ]]; then exit 1 fi -source $(pwd)/run_info - -set -eu -o pipefail - SM=$1 PL=$2 +source $(pwd)/$SM/run_info + +set -eu -o pipefail + IN_VCF=$SM/recal_vcf/$SM.ploidy_$PL.known_germ_filtered.snvs.vcf.gz STR=$SM/strand/$SM.ploidy_$PL.known_germ_filtered.pass.snvs.txt BAM=$SM/bam/$SM.bam diff --git a/variant_calling/job_scripts/filter_3.bias_summary.sh b/variant_calling/job_scripts/filter_3.bias_summary.sh index 4e487ae..1aa33f7 100755 --- a/variant_calling/job_scripts/filter_3.bias_summary.sh +++ b/variant_calling/job_scripts/filter_3.bias_summary.sh @@ -7,13 +7,13 @@ if [[ $# -lt 2 ]]; then exit 1 fi -source $(pwd)/run_info - -set -eu -o pipefail - SM=$1 PL=$2 +source $(pwd)/$SM/run_info + +set -eu -o pipefail + VAF=$SM/vaf/$SM.ploidy_$PL.known_germ_filtered.pass.snvs.txt STR=$SM/strand/$SM.ploidy_$PL.known_germ_filtered.pass.snvs.txt SUM=$SM/bias_summary/$SM.ploidy_$PL.known_germ_filtered.pass.snvs.txt diff --git 
a/variant_calling/job_scripts/gatk_1.hc_gvcf.sh b/variant_calling/job_scripts/gatk_1.hc_gvcf.sh index 16d2c87..1c614d8 100755 --- a/variant_calling/job_scripts/gatk_1.hc_gvcf.sh +++ b/variant_calling/job_scripts/gatk_1.hc_gvcf.sh @@ -7,13 +7,13 @@ if [[ $# -lt 2 ]]; then exit 1 fi -source $(pwd)/run_info - -set -eu -o pipefail - SM=$1 PL=$2 +source $(pwd)/$SM/run_info + +set -eu -o pipefail + if [[ ${SGE_TASK_ID} -le 22 ]]; then CHR=${SGE_TASK_ID} elif [[ ${SGE_TASK_ID} -eq 23 ]]; then diff --git a/variant_calling/job_scripts/gatk_2.joint_gt.sh b/variant_calling/job_scripts/gatk_2.joint_gt.sh index c74c59e..d2ec88c 100755 --- a/variant_calling/job_scripts/gatk_2.joint_gt.sh +++ b/variant_calling/job_scripts/gatk_2.joint_gt.sh @@ -7,13 +7,13 @@ if [[ $# -lt 2 ]]; then exit 1 fi -source $(pwd)/run_info - -set -eu -o pipefail - SM=$1 PL=$2 +source $(pwd)/$SM/run_info + +set -eu -o pipefail + if [[ ${SGE_TASK_ID} -le 22 ]]; then CHR=${SGE_TASK_ID} elif [[ ${SGE_TASK_ID} -eq 23 ]]; then diff --git a/variant_calling/job_scripts/gatk_3.concat_vcf.sh b/variant_calling/job_scripts/gatk_3.concat_vcf.sh index 5356eb2..acec34a 100755 --- a/variant_calling/job_scripts/gatk_3.concat_vcf.sh +++ b/variant_calling/job_scripts/gatk_3.concat_vcf.sh @@ -7,13 +7,13 @@ if [[ $# -lt 2 ]]; then exit 1 fi -source $(pwd)/run_info - -set -eu -o pipefail - SM=$1 PL=$2 +source $(pwd)/$SM/run_info + +set -eu -o pipefail + RVCFS="" for i in $(seq 1 22) X Y; do RVCFS="$RVCFS -V $SM/raw_vcf/$SM.ploidy_$PL.$i.vcf" diff --git a/variant_calling/job_scripts/gatk_4.vqsr.sh b/variant_calling/job_scripts/gatk_4.vqsr.sh index 0f59fa7..b977ea2 100755 --- a/variant_calling/job_scripts/gatk_4.vqsr.sh +++ b/variant_calling/job_scripts/gatk_4.vqsr.sh @@ -7,13 +7,13 @@ if [[ $# -lt 1 ]]; then exit 1 fi -source $(pwd)/run_info - -set -eu -o pipefail - SM=$1 PL=$2 +source $(pwd)/$SM/run_info + +set -eu -o pipefail + RVCF=$SM/raw_vcf/$SM.ploidy_$PL.vcf CVCF_SNP=$SM/recal_vcf/$SM.ploidy_$PL.snps.vcf CVCF_ALL=$SM/recal_vcf/$SM.ploidy_$PL.vcf diff --git a/variant_calling/job_scripts/post_1.candidates.sh b/variant_calling/job_scripts/post_1.candidates.sh index 724940b..0ba68ed 100755 --- a/variant_calling/job_scripts/post_1.candidates.sh +++ b/variant_calling/job_scripts/post_1.candidates.sh @@ -7,13 +7,13 @@ if [[ $# -lt 1 ]]; then exit 1 fi -source $(pwd)/run_info +SM=$1 + +source $(pwd)/$SM/run_info source $ROOTSYS/bin/thisroot.sh set -eu -o pipefail -SM=$1 - SUM_PREFIX=$SM/bias_summary/$SM.ploidy_ CANDALL=$SM/candidates/$SM.mosaic_snvs.all.txt CANDCNV=$SM/candidates/$SM.mosaic_snvs.cnv_considered.txt diff --git a/variant_calling/job_scripts/pre_1.download.sh b/variant_calling/job_scripts/pre_1.download.sh index fcf73f5..0f07090 100755 --- a/variant_calling/job_scripts/pre_1.download.sh +++ b/variant_calling/job_scripts/pre_1.download.sh @@ -7,14 +7,14 @@ if [[ $# -lt 3 ]]; then exit 1 fi -source $(pwd)/run_info - -set -eu -o pipefail - SM=$1 FNAME=$2 LOC=$3 +source $(pwd)/$SM/run_info + +set -eu -o pipefail + printf -- "---\n[$(date)] Start download: $FNAME\n" mkdir -p $SM/bam diff --git a/variant_calling/job_scripts/pre_2.cnvnator.sh b/variant_calling/job_scripts/pre_2.cnvnator.sh index ed45eb5..ded4718 100755 --- a/variant_calling/job_scripts/pre_2.cnvnator.sh +++ b/variant_calling/job_scripts/pre_2.cnvnator.sh @@ -7,12 +7,13 @@ if [[ $# -lt 1 ]]; then exit 1 fi -source $(pwd)/run_info +SM=$1 + +source $(pwd)/$SM/run_info source $ROOTSYS/bin/thisroot.sh set -eu -o pipefail -SM=$1 BAM=$SM/bam/$SM.bam ROOT=$SM/cnv/$SM.root 
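Each job script now sources a per-sample run_info written at submission time. A hypothetical excerpt (paths illustrative); the flat KEY=VALUE layout is what lets bash source it directly and lets submit_aln_jobs.py read PARENTID back out of it:

```bash
# S01/run_info
#PATH
PIPE_HOME=/shared/bsmn_pipeline

#TOOLS
PYTHON3=/shared/bsmn_pipeline/tools/python/3.6.2/bin/python3
SYNAPSE=/shared/bsmn_pipeline/tools/python/3.6.2/bin/synapse
AWS=/shared/bsmn_pipeline/tools/python/3.6.2/bin/aws

#RESOURCES
REF=/shared/bsmn_pipeline/resources/hs37d5.fa

#SYNAPSE
PARENTID=syn123
```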
CNVCALL=$SM/cnv/$SM.cnvcall diff --git a/variant_calling/run.py b/variant_calling/run.py index 1e636dd..4fdad65 100755 --- a/variant_calling/run.py +++ b/variant_calling/run.py @@ -22,11 +22,12 @@ def main(): synapse_login() nda_login() - run_info("run_info") - samples = sample_list(args.infile) for sample, sdata in samples.items(): print(sample) + + run_info(sample + "/run_info") + jid_pre = submit_pre_jobs(sample, sdata) jid_list = [] for ploidy in range(2,11): From 11f638f07b30da19ad4530013276d0d2d2d9ca48 Mon Sep 17 00:00:00 2001 From: Taejeong Bae Date: Tue, 4 Dec 2018 00:09:09 -0600 Subject: [PATCH 51/70] Improve error handling in job_scripts --- genome_mapping/job_scripts/aln_1.align_sort.sh | 7 +++++-- genome_mapping/job_scripts/aln_2.merge_bam.sh | 7 +++++-- genome_mapping/job_scripts/aln_3.markdup.sh | 7 +++++-- genome_mapping/job_scripts/aln_4.indel_realign.sh | 7 +++++-- genome_mapping/job_scripts/aln_5.bqsr.sh | 7 +++++-- genome_mapping/job_scripts/aln_6.upload_bam.sh | 7 +++++-- genome_mapping/job_scripts/pre_1.download.sh | 7 +++++-- genome_mapping/job_scripts/pre_1b.bam2fastq.sh | 7 +++++-- genome_mapping/job_scripts/pre_2.split_fastq_by_RG.sh | 7 +++++-- genome_mapping/job_scripts/pre_3.submit_aln_jobs.sh | 7 +++++-- .../job_scripts/filter_1.known_germ_filtering.sh | 7 +++++-- variant_calling/job_scripts/filter_2-1.vaf_info.sh | 7 +++++-- variant_calling/job_scripts/filter_2-2.strand_info.sh | 7 +++++-- variant_calling/job_scripts/filter_3.bias_summary.sh | 7 +++++-- variant_calling/job_scripts/gatk_1.hc_gvcf.sh | 7 +++++-- variant_calling/job_scripts/gatk_2.joint_gt.sh | 7 +++++-- variant_calling/job_scripts/gatk_3.concat_vcf.sh | 7 +++++-- variant_calling/job_scripts/gatk_4.vqsr.sh | 7 +++++-- variant_calling/job_scripts/post_1.candidates.sh | 7 +++++-- variant_calling/job_scripts/pre_1.download.sh | 7 +++++-- variant_calling/job_scripts/pre_2.cnvnator.sh | 7 +++++-- 21 files changed, 105 insertions(+), 42 deletions(-) diff --git a/genome_mapping/job_scripts/aln_1.align_sort.sh b/genome_mapping/job_scripts/aln_1.align_sort.sh index d871d63..8e028b7 100755 --- a/genome_mapping/job_scripts/aln_1.align_sort.sh +++ b/genome_mapping/job_scripts/aln_1.align_sort.sh @@ -2,9 +2,11 @@ #$ -cwd #$ -pe threaded 36 +trap "exit 100" ERR + if [[ $# -lt 2 ]]; then echo "Usage: $(basename $0) [sample name] [PU info]" - exit 1 + false fi SM=$1 @@ -12,7 +14,8 @@ PU=$2 source $(pwd)/$SM/run_info -set -eu -o pipefail +set -o nounset +set -o pipefail printf -- "---\n[$(date)] Start align_sort.\n" diff --git a/genome_mapping/job_scripts/aln_2.merge_bam.sh b/genome_mapping/job_scripts/aln_2.merge_bam.sh index 0e2c605..1c086b6 100755 --- a/genome_mapping/job_scripts/aln_2.merge_bam.sh +++ b/genome_mapping/job_scripts/aln_2.merge_bam.sh @@ -2,16 +2,19 @@ #$ -cwd #$ -pe threaded 18 +trap "exit 100" ERR + if [[ $# -lt 1 ]]; then echo "Usage: $(basename $0) [sample name]" - exit 1 + false fi SM=$1 source $(pwd)/$SM/run_info -set -eu -o pipefail +set -o nounset +set -o pipefail printf -- "---\n[$(date)] Start merge_bam.\n" diff --git a/genome_mapping/job_scripts/aln_3.markdup.sh b/genome_mapping/job_scripts/aln_3.markdup.sh index 0306a30..ab729dc 100755 --- a/genome_mapping/job_scripts/aln_3.markdup.sh +++ b/genome_mapping/job_scripts/aln_3.markdup.sh @@ -2,16 +2,19 @@ #$ -cwd #$ -pe threaded 18 +trap "exit 100" ERR + if [[ $# -lt 1 ]]; then echo "Usage: $(basename $0) [sample name]" - exit 1 + false fi SM=$1 source $(pwd)/$SM/run_info -set -eu -o pipefail +set -o nounset +set -o pipefail printf -- 
"---\n[$(date)] Start markdup.\n" diff --git a/genome_mapping/job_scripts/aln_4.indel_realign.sh b/genome_mapping/job_scripts/aln_4.indel_realign.sh index 8593f54..93d4f4f 100755 --- a/genome_mapping/job_scripts/aln_4.indel_realign.sh +++ b/genome_mapping/job_scripts/aln_4.indel_realign.sh @@ -2,16 +2,19 @@ #$ -cwd #$ -pe threaded 36 +trap "exit 100" ERR + if [[ $# -lt 1 ]]; then echo "Usage: $(basename $0) [sample name]" - exit 1 + false fi SM=$1 source $(pwd)/$SM/run_info -set -eu -o pipefail +set -o nounset +set -o pipefail printf -- "---\n[$(date)] Start RealignerTargetCreator.\n" diff --git a/genome_mapping/job_scripts/aln_5.bqsr.sh b/genome_mapping/job_scripts/aln_5.bqsr.sh index 9cb76ad..f363ba2 100755 --- a/genome_mapping/job_scripts/aln_5.bqsr.sh +++ b/genome_mapping/job_scripts/aln_5.bqsr.sh @@ -2,16 +2,19 @@ #$ -cwd #$ -pe threaded 36 +trap "exit 100" ERR + if [[ $# -lt 1 ]]; then echo "Usage: $(basename $0) [sample name]" - exit 1 + false fi SM=$1 source $(pwd)/$SM/run_info -set -eu -o pipefail +set -o nounset +set -o pipefail printf -- "---\n[$(date)] Start BQSR recal_table.\n" diff --git a/genome_mapping/job_scripts/aln_6.upload_bam.sh b/genome_mapping/job_scripts/aln_6.upload_bam.sh index f06f69d..4f517f4 100755 --- a/genome_mapping/job_scripts/aln_6.upload_bam.sh +++ b/genome_mapping/job_scripts/aln_6.upload_bam.sh @@ -2,16 +2,19 @@ #$ -cwd #$ -pe threaded 1 +trap "exit 100" ERR + if [[ $# -lt 1 ]]; then echo "Usage: $(basename $0) [sample name]" - exit 1 + false fi SM=$1 source $(pwd)/$SM/run_info -set -eu -o pipefail +set -o nounset +set -o pipefail printf -- "---\n[$(date)] Start flagstat: $SM.bam\n" diff --git a/genome_mapping/job_scripts/pre_1.download.sh b/genome_mapping/job_scripts/pre_1.download.sh index 9c5e9a4..594a79b 100755 --- a/genome_mapping/job_scripts/pre_1.download.sh +++ b/genome_mapping/job_scripts/pre_1.download.sh @@ -2,9 +2,11 @@ #$ -cwd #$ -pe threaded 1 +trap "exit 100" ERR + if [[ $# -lt 3 ]]; then echo "Usage: $(basename $0) [sample name] [file name] [location]" - exit 1 + false fi SM=$1 @@ -13,7 +15,8 @@ LOC=$3 source $(pwd)/$SM/run_info -set -eu -o pipefail +set -o nounset +set -o pipefail printf -- "---\n[$(date)] Start download: $FNAME\n" diff --git a/genome_mapping/job_scripts/pre_1b.bam2fastq.sh b/genome_mapping/job_scripts/pre_1b.bam2fastq.sh index 09ac5ce..203666e 100755 --- a/genome_mapping/job_scripts/pre_1b.bam2fastq.sh +++ b/genome_mapping/job_scripts/pre_1b.bam2fastq.sh @@ -2,9 +2,11 @@ #$ -cwd #$ -pe threaded 4 +trap "exit 100" ERR + if [[ $# -lt 2 ]]; then echo "Usage: $(basename $0) [sample name] [file name]" - exit 1 + false fi SM=$1 @@ -12,7 +14,8 @@ FNAME=$2 source $(pwd)/$SM/run_info -set -eu -o pipefail +set -o nounset +set -o pipefail printf -- "---\n[$(date)] Start bam2fastq: $FNAME\n" diff --git a/genome_mapping/job_scripts/pre_2.split_fastq_by_RG.sh b/genome_mapping/job_scripts/pre_2.split_fastq_by_RG.sh index fa59551..af8523d 100755 --- a/genome_mapping/job_scripts/pre_2.split_fastq_by_RG.sh +++ b/genome_mapping/job_scripts/pre_2.split_fastq_by_RG.sh @@ -2,9 +2,11 @@ #$ -cwd #$ -pe threaded 3 +trap "exit 100" ERR + if [[ $# -lt 1 ]]; then echo "Usage: $(basename $0) [fastq]" - exit 1 + false fi FQ=$1 @@ -12,7 +14,8 @@ SM=$(echo $FQ|cut -d"/" -f1) source $(pwd)/$SM/run_info -set -eu -o pipefail +set -o nounset +set -o pipefail if [[ $FQ == *.gz ]]; then CAT=zcat diff --git a/genome_mapping/job_scripts/pre_3.submit_aln_jobs.sh b/genome_mapping/job_scripts/pre_3.submit_aln_jobs.sh index 0c2a241..270411e 100755 --- 
a/genome_mapping/job_scripts/pre_3.submit_aln_jobs.sh +++ b/genome_mapping/job_scripts/pre_3.submit_aln_jobs.sh @@ -2,9 +2,11 @@ #$ -cwd #$ -pe threaded 1 +trap "exit 100" ERR + if [[ $# -lt 2 ]]; then echo "Usage: $(basename $0) [host] [sample]" - exit 1 + false fi HOST=$1 @@ -12,7 +14,8 @@ SM=$2 source $(pwd)/$SM/run_info -set -eu -o pipefail +set -o nounset +set -o pipefail printf -- "---\n[$(date)] Start submit_aln_jobs.\n" diff --git a/variant_calling/job_scripts/filter_1.known_germ_filtering.sh b/variant_calling/job_scripts/filter_1.known_germ_filtering.sh index 335a85a..3c6e61c 100755 --- a/variant_calling/job_scripts/filter_1.known_germ_filtering.sh +++ b/variant_calling/job_scripts/filter_1.known_germ_filtering.sh @@ -2,9 +2,11 @@ #$ -cwd #$ -pe threaded 8 +trap "exit 100" ERR + if [[ $# -lt 2 ]]; then echo "Usage: $(basename $0) [sample name] [ploidy]" - exit 1 + false fi SM=$1 @@ -12,7 +14,8 @@ PL=$2 source $(pwd)/$SM/run_info -set -eu -o pipefail +set -o nounset +set -o pipefail IN_VCF=$SM/recal_vcf/$SM.ploidy_$PL.vcf OUT_VCF=$SM/recal_vcf/$SM.ploidy_$PL.known_germ_filtered.snvs.vcf.gz diff --git a/variant_calling/job_scripts/filter_2-1.vaf_info.sh b/variant_calling/job_scripts/filter_2-1.vaf_info.sh index 51f0ff7..b11858c 100755 --- a/variant_calling/job_scripts/filter_2-1.vaf_info.sh +++ b/variant_calling/job_scripts/filter_2-1.vaf_info.sh @@ -2,9 +2,11 @@ #$ -cwd #$ -pe threaded 1 +trap "exit 100" ERR + if [[ $# -lt 2 ]]; then echo "Usage: $(basename $0) [sample name] [ploidy]" - exit 1 + false fi SM=$1 @@ -12,7 +14,8 @@ PL=$2 source $(pwd)/$SM/run_info -set -eu -o pipefail +set -o nounset +set -o pipefail IN_VCF=$SM/recal_vcf/$SM.ploidy_$PL.known_germ_filtered.snvs.vcf.gz VAF=$SM/vaf/$SM.ploidy_$PL.known_germ_filtered.pass.snvs.txt diff --git a/variant_calling/job_scripts/filter_2-2.strand_info.sh b/variant_calling/job_scripts/filter_2-2.strand_info.sh index 69979c8..7295ea2 100755 --- a/variant_calling/job_scripts/filter_2-2.strand_info.sh +++ b/variant_calling/job_scripts/filter_2-2.strand_info.sh @@ -2,9 +2,11 @@ #$ -cwd #$ -pe threaded 1 +trap "exit 100" ERR + if [[ $# -lt 2 ]]; then echo "Usage: $(basename $0) [sample name] [ploidy]" - exit 1 + false fi SM=$1 @@ -12,7 +14,8 @@ PL=$2 source $(pwd)/$SM/run_info -set -eu -o pipefail +set -o nounset +set -o pipefail IN_VCF=$SM/recal_vcf/$SM.ploidy_$PL.known_germ_filtered.snvs.vcf.gz STR=$SM/strand/$SM.ploidy_$PL.known_germ_filtered.pass.snvs.txt diff --git a/variant_calling/job_scripts/filter_3.bias_summary.sh b/variant_calling/job_scripts/filter_3.bias_summary.sh index 1aa33f7..b9c76d7 100755 --- a/variant_calling/job_scripts/filter_3.bias_summary.sh +++ b/variant_calling/job_scripts/filter_3.bias_summary.sh @@ -2,9 +2,11 @@ #$ -cwd #$ -pe threaded 1 +trap "exit 100" ERR + if [[ $# -lt 2 ]]; then echo "Usage: $(basename $0) [sample name] [ploidy]" - exit 1 + false fi SM=$1 @@ -12,7 +14,8 @@ PL=$2 source $(pwd)/$SM/run_info -set -eu -o pipefail +set -o nounset +set -o pipefail VAF=$SM/vaf/$SM.ploidy_$PL.known_germ_filtered.pass.snvs.txt STR=$SM/strand/$SM.ploidy_$PL.known_germ_filtered.pass.snvs.txt diff --git a/variant_calling/job_scripts/gatk_1.hc_gvcf.sh b/variant_calling/job_scripts/gatk_1.hc_gvcf.sh index 1c614d8..495ede5 100755 --- a/variant_calling/job_scripts/gatk_1.hc_gvcf.sh +++ b/variant_calling/job_scripts/gatk_1.hc_gvcf.sh @@ -2,9 +2,11 @@ #$ -cwd #$ -pe threaded 16 +trap "exit 100" ERR + if [[ $# -lt 2 ]]; then echo "Usage: $(basename $0) [sample name] [ploidy]" - exit 1 + false fi SM=$1 @@ -12,7 +14,8 @@ 
PL=$2 source $(pwd)/$SM/run_info -set -eu -o pipefail +set -o nounset +set -o pipefail if [[ ${SGE_TASK_ID} -le 22 ]]; then CHR=${SGE_TASK_ID} diff --git a/variant_calling/job_scripts/gatk_2.joint_gt.sh b/variant_calling/job_scripts/gatk_2.joint_gt.sh index d2ec88c..7f458e1 100755 --- a/variant_calling/job_scripts/gatk_2.joint_gt.sh +++ b/variant_calling/job_scripts/gatk_2.joint_gt.sh @@ -2,9 +2,11 @@ #$ -cwd #$ -pe threaded 16 +trap "exit 100" ERR + if [[ $# -lt 2 ]]; then echo "Usage: $(basename $0) [sample name] [ploidy]" - exit 1 + false fi SM=$1 @@ -12,7 +14,8 @@ PL=$2 source $(pwd)/$SM/run_info -set -eu -o pipefail +set -o nounset +set -o pipefail if [[ ${SGE_TASK_ID} -le 22 ]]; then CHR=${SGE_TASK_ID} diff --git a/variant_calling/job_scripts/gatk_3.concat_vcf.sh b/variant_calling/job_scripts/gatk_3.concat_vcf.sh index acec34a..4c6953a 100755 --- a/variant_calling/job_scripts/gatk_3.concat_vcf.sh +++ b/variant_calling/job_scripts/gatk_3.concat_vcf.sh @@ -2,9 +2,11 @@ #$ -cwd #$ -pe threaded 3 +trap "exit 100" ERR + if [[ $# -lt 2 ]]; then echo "Usage: $(basename $0) [sample name] [ploidy]" - exit 1 + false fi SM=$1 @@ -12,7 +14,8 @@ PL=$2 source $(pwd)/$SM/run_info -set -eu -o pipefail +set -o nounset +set -o pipefail RVCFS="" for i in $(seq 1 22) X Y; do diff --git a/variant_calling/job_scripts/gatk_4.vqsr.sh b/variant_calling/job_scripts/gatk_4.vqsr.sh index b977ea2..e697300 100755 --- a/variant_calling/job_scripts/gatk_4.vqsr.sh +++ b/variant_calling/job_scripts/gatk_4.vqsr.sh @@ -2,9 +2,11 @@ #$ -cwd #$ -pe threaded 8 +trap "exit 100" ERR + if [[ $# -lt 1 ]]; then echo "Usage: $(basename $0) [sample name] [ploidy]" - exit 1 + false fi SM=$1 @@ -12,7 +14,8 @@ PL=$2 source $(pwd)/$SM/run_info -set -eu -o pipefail +set -o nounset +set -o pipefail RVCF=$SM/raw_vcf/$SM.ploidy_$PL.vcf CVCF_SNP=$SM/recal_vcf/$SM.ploidy_$PL.snps.vcf diff --git a/variant_calling/job_scripts/post_1.candidates.sh b/variant_calling/job_scripts/post_1.candidates.sh index 0ba68ed..de261f4 100755 --- a/variant_calling/job_scripts/post_1.candidates.sh +++ b/variant_calling/job_scripts/post_1.candidates.sh @@ -2,9 +2,11 @@ #$ -cwd #$ -pe threaded 1 +trap "exit 100" ERR + if [[ $# -lt 1 ]]; then echo "Usage: $(basename $0) [sample name]" - exit 1 + false fi SM=$1 @@ -12,7 +14,8 @@ SM=$1 source $(pwd)/$SM/run_info source $ROOTSYS/bin/thisroot.sh -set -eu -o pipefail +set -o nounset +set -o pipefail SUM_PREFIX=$SM/bias_summary/$SM.ploidy_ CANDALL=$SM/candidates/$SM.mosaic_snvs.all.txt diff --git a/variant_calling/job_scripts/pre_1.download.sh b/variant_calling/job_scripts/pre_1.download.sh index 0f07090..75efa97 100755 --- a/variant_calling/job_scripts/pre_1.download.sh +++ b/variant_calling/job_scripts/pre_1.download.sh @@ -2,9 +2,11 @@ #$ -cwd #$ -pe threaded 1 +trap "exit 100" ERR + if [[ $# -lt 3 ]]; then echo "Usage: $(basename $0) [sample name] [file name] [location]" - exit 1 + false fi SM=$1 @@ -13,7 +15,8 @@ LOC=$3 source $(pwd)/$SM/run_info -set -eu -o pipefail +set -o nounset +set -o pipefail printf -- "---\n[$(date)] Start download: $FNAME\n" diff --git a/variant_calling/job_scripts/pre_2.cnvnator.sh b/variant_calling/job_scripts/pre_2.cnvnator.sh index ded4718..1ad79e5 100755 --- a/variant_calling/job_scripts/pre_2.cnvnator.sh +++ b/variant_calling/job_scripts/pre_2.cnvnator.sh @@ -2,9 +2,11 @@ #$ -cwd #$ -pe threaded 4 +trap "exit 100" ERR + if [[ $# -lt 1 ]]; then echo "Usage: $(basename $0) [sample name]" - exit 1 + false fi SM=$1 @@ -12,7 +14,8 @@ SM=$1 source $(pwd)/$SM/run_info source 
$ROOTSYS/bin/thisroot.sh -set -eu -o pipefail +set -o nounset +set -o pipefail BAM=$SM/bam/$SM.bam ROOT=$SM/cnv/$SM.root From 6c379fb2ecc18fde0b8a4732b840d5c1fd9e7ce1 Mon Sep 17 00:00:00 2001 From: Taejeong Bae Date: Tue, 4 Dec 2018 02:57:36 -0600 Subject: [PATCH 52/70] Add user's S3Uri and LocalPath options for data source; Multiple tries when downloading data --- genome_mapping/job_scripts/pre_1.download.sh | 26 +++++++++++++---- genome_mapping/run.py | 12 ++++---- variant_calling/job_scripts/pre_1.download.sh | 28 ++++++++++++++----- variant_calling/run.py | 10 ++++--- 4 files changed, 54 insertions(+), 22 deletions(-) diff --git a/genome_mapping/job_scripts/pre_1.download.sh b/genome_mapping/job_scripts/pre_1.download.sh index 594a79b..a2d053e 100755 --- a/genome_mapping/job_scripts/pre_1.download.sh +++ b/genome_mapping/job_scripts/pre_1.download.sh @@ -21,11 +21,25 @@ set -o pipefail printf -- "---\n[$(date)] Start download: $FNAME\n" mkdir -p $SM/downloads -if [[ $LOC =~ ^syn[0-9]+ ]]; then - $SYNAPSE get $LOC --downloadLocation $SM/downloads/ -elif [[ $LOC =~ ^s3:.+ ]]; then - eval "$($PIPE_HOME/utils/nda_aws_token.sh -r ~/.nda_credential)" - $AWS s3 cp --no-progress $LOC $SM/downloads/ -fi + +rc=0 +n=0 +until [[ $n -eq 5 ]]; do + if [[ $LOC =~ ^syn[0-9]+ ]]; then + $SYNAPSE get $LOC --downloadLocation $SM/downloads/ && break || rc=$? + elif [[ $LOC =~ ^s3:.+ ]]; then + $AWS s3 ls $LOC || { + printf "\nSet an NDA AWS token\n" + eval "$($PIPE_HOME/utils/nda_aws_token.sh -r ~/.nda_credential)" + } + $AWS s3 cp --no-progress $LOC $SM/downloads/ && break || rc=$? + else + ls -lh $LOC && ln -sf $(readlink -f $LOC) $SM/downloads/ || rc=$? + break + fi + n=$((n+1)) + printf "Download try $n failed.\n" +done +[[ $rc -eq 0 ]] || false printf -- "[$(date)] Finish download: $FNAME\n---\n" diff --git a/genome_mapping/run.py b/genome_mapping/run.py index c22c0c4..a38054f 100755 --- a/genome_mapping/run.py +++ b/genome_mapping/run.py @@ -84,12 +84,14 @@ def submit_aln_jobs(sample, jid): def parse_args(): parser = argparse.ArgumentParser(description='Genome Mapping Pipeline') - parser.add_argument('infile', metavar='sample_list.txt', - help='''Sample list file. + parser.add_argument('infile', metavar='sample_list.txt', + help='''Sample list file. Each line format is "sample_id\\tfile_name\\tlocation". - Header line should start with "#". Trailing columns will be ignored. - "location" is either a synapse_id, or a s3_uri of the NDA. - For data download, synapse or aws clients will be used, respectively.''') + Lines starting with "#" will be omitted. + Header line should also start with "#". + Trailing columns will be ignored. + "location" is a Synapse ID, an S3Uri of the NDA or a user, or a LocalPath. + For data download, the synapse or aws clients, or a symbolic link will be used, respectively.''') parser.add_argument('--parentid', metavar='syn123', help='''Synapse ID of project or folder where to upload result bam files. If it is not set, the result bam files will be locally saved.
diff --git a/variant_calling/job_scripts/pre_1.download.sh b/variant_calling/job_scripts/pre_1.download.sh index 75efa97..d6b14dd 100755 --- a/variant_calling/job_scripts/pre_1.download.sh +++ b/variant_calling/job_scripts/pre_1.download.sh @@ -20,12 +20,26 @@ set -o pipefail printf -- "---\n[$(date)] Start download: $FNAME\n" -mkdir -p $SM/bam -if [[ $LOC =~ ^syn[0-9]+ ]]; then - $SYNAPSE get $LOC --downloadLocation $SM/bam/ -elif [[ $LOC =~ ^s3:.+ ]]; then - eval "$($PIPE_HOME/utils/nda_aws_token.sh -r ~/.nda_credential)" - $AWS s3 cp --no-progress $LOC $SM/bam/ -fi +mkdir -p $SM/bam + +rc=0 +n=0 +until [[ $n -eq 5 ]]; do + if [[ $LOC =~ ^syn[0-9]+ ]]; then + $SYNAPSE get $LOC --downloadLocation $SM/bam/ && break || rc=$? + elif [[ $LOC =~ ^s3:.+ ]]; then + $AWS s3 ls $LOC || { + printf "\nSet an NDA AWS token\n" + eval "$($PIPE_HOME/utils/nda_aws_token.sh -r ~/.nda_credential)" + } + $AWS s3 cp --no-progress $LOC $SM/bam/ && break || rc=$? + else + ls -lh $LOC && ln -sf $(readlink -f $LOC) $SM/bam/ || rc=$? + break + fi + n=$((n+1)) + printf "Download try $n failed.\n" +done +[[ $rc -eq 0 ]] || false printf -- "[$(date)] Finish download: $FNAME\n---\n" diff --git a/variant_calling/run.py b/variant_calling/run.py index 4fdad65..35e6214 100755 --- a/variant_calling/run.py +++ b/variant_calling/run.py @@ -98,11 +98,13 @@ def submit_post_jobs(sample, jid): def parse_args(): parser = argparse.ArgumentParser(description='Variant Calling Pipeline') parser.add_argument('infile', metavar='sample_list.txt', - help='''Sample list file. + help='''Sample list file. Each line format is "sample_id\\tfile_name\\tlocation". - Header line should start with "#". Trailing columns will be ignored. - "location" is either a synapse_id, or a s3_uri of the NDA. - For data download, synapse or aws clients will be used, respectively.''') + Lines starting with "#" will be omitted. + Header line should also start with "#". + Trailing columns will be ignored. + "location" is a Synapse ID, an S3Uri of the NDA or a user, or a LocalPath. + For data download, the synapse or aws clients, or a symbolic link will be used, respectively.''') return parser.parse_args() if __name__ == "__main__": From 380ab5e29dcc13a444de50f26ca4f0f8c3bb1b45 Mon Sep 17 00:00:00 2001 From: Taejeong Bae Date: Tue, 4 Dec 2018 10:38:01 -0600 Subject: [PATCH 53/70] Bug fix: Not updating the return code of multiple download tries --- genome_mapping/job_scripts/pre_1.download.sh | 8 ++++---- variant_calling/job_scripts/pre_1.download.sh | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/genome_mapping/job_scripts/pre_1.download.sh b/genome_mapping/job_scripts/pre_1.download.sh index a2d053e..3863b3d 100755 --- a/genome_mapping/job_scripts/pre_1.download.sh +++ b/genome_mapping/job_scripts/pre_1.download.sh @@ -26,19 +26,19 @@ rc=0 n=0 until [[ $n -eq 5 ]]; do if [[ $LOC =~ ^syn[0-9]+ ]]; then - $SYNAPSE get $LOC --downloadLocation $SM/downloads/ && { rc=$?; break; } || rc=$? + $SYNAPSE get $LOC --downloadLocation $SM/downloads/ && { rc=$?; break; } || rc=$? elif [[ $LOC =~ ^s3:.+ ]]; then $AWS s3 ls $LOC || { - printf "\nSet an NDA AWS token\n" + printf "Set an NDA AWS token\n\n" eval "$($PIPE_HOME/utils/nda_aws_token.sh -r ~/.nda_credential)" } - $AWS s3 cp --no-progress $LOC $SM/downloads/ && break || rc=$? + $AWS s3 cp --no-progress $LOC $SM/downloads/ && { rc=$?; break; } || rc=$? else ls -lh $LOC && ln -sf $(readlink -f $LOC) $SM/downloads/ || rc=$?
break fi n=$((n+1)) - printf "Download try $n failed.\n" + printf "Download try $n failed.\n\n" done [[ $rc -eq 0 ]] || false diff --git a/variant_calling/job_scripts/pre_1.download.sh b/variant_calling/job_scripts/pre_1.download.sh index d6b14dd..66b7e19 100755 --- a/variant_calling/job_scripts/pre_1.download.sh +++ b/variant_calling/job_scripts/pre_1.download.sh @@ -26,19 +26,19 @@ rc=0 n=0 until [[ $n -eq 5 ]]; do if [[ $LOC =~ ^syn[0-9]+ ]]; then - $SYNAPSE get $LOC --downloadLocation $SM/bam/ && break || rc=$? + $SYNAPSE get $LOC --downloadLocation $SM/bam/ && { rc=$?; break; } || rc=$? elif [[ $LOC =~ ^s3:.+ ]]; then $AWS s3 ls $LOC || { - printf "\nSet an NDA AWS token\n" + printf "Set an NDA AWS token\n\n" eval "$($PIPE_HOME/utils/nda_aws_token.sh -r ~/.nda_credential)" } - $AWS s3 cp --no-progress $LOC $SM/bam/ && break || rc=$? + $AWS s3 cp --no-progress $LOC $SM/bam/ && { rc=$?; break; } || rc=$? else ls -lh $LOC && ln -sf $(readlink -f $LOC) $SM/bam/ || rc=$? break fi n=$((n+1)) - printf "Download try $n failed.\n" + printf "Download try $n failed.\n\n" done [[ $rc -eq 0 ]] || false From b449050fadaa9588b37ef26c8d0f852baa6d3bcc Mon Sep 17 00:00:00 2001 From: Taejeong Bae Date: Tue, 4 Dec 2018 15:46:00 -0600 Subject: [PATCH 54/70] Bug fix: Make sample dir before saving run_info --- library/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/library/config.py b/library/config.py index c23522b..59a4c08 100644 --- a/library/config.py +++ b/library/config.py @@ -17,7 +17,7 @@ def read_config(): def run_info(fname): config = read_config() - + pathlib.Path(os.path.dirname(fname)).mkdir(parents=True, exist_ok=True) with open(fname, "w") as run_file: run_file.write("#PATH\nPIPE_HOME={}\n".format(config["PATH"]["pipe_home"])) for section in ["TOOLS", "RESOURCES"]: From 1216c22adc2bb4a99f997bf639e884ddc72045db Mon Sep 17 00:00:00 2001 From: Taejeong Bae Date: Wed, 5 Dec 2018 00:06:30 -0600 Subject: [PATCH 55/70] Adjust threaded and java Xmx to match with m5.12xlarge --- genome_mapping/job_scripts/aln_1.align_sort.sh | 4 ++-- genome_mapping/job_scripts/aln_2.merge_bam.sh | 4 ++-- genome_mapping/job_scripts/aln_3.markdup.sh | 4 ++-- genome_mapping/job_scripts/aln_4.indel_realign.sh | 8 ++++---- genome_mapping/job_scripts/aln_5.bqsr.sh | 10 +++++----- genome_mapping/run.py | 2 +- genome_mapping/submit_aln_jobs.py | 2 +- 7 files changed, 17 insertions(+), 17 deletions(-) diff --git a/genome_mapping/job_scripts/aln_1.align_sort.sh b/genome_mapping/job_scripts/aln_1.align_sort.sh index 8e028b7..20b02df 100755 --- a/genome_mapping/job_scripts/aln_1.align_sort.sh +++ b/genome_mapping/job_scripts/aln_1.align_sort.sh @@ -1,6 +1,6 @@ #!/bin/bash #$ -cwd -#$ -pe threaded 36 +#$ -pe threaded 24 trap "exit 100" ERR @@ -20,7 +20,7 @@ set -o pipefail printf -- "---\n[$(date)] Start align_sort.\n" mkdir -p $SM/bam -$BWA mem -M -t 32 \ +$BWA mem -M -t $((NSLOTS - 4)) \ -R "@RG\tID:$SM.$PU\tSM:$SM\tPL:illumina\tLB:$SM\tPU:$PU" \ $REF $SM/fastq/$SM.$PU.R{1,2}.fastq.gz \ |$SAMBAMBA view -S -f bam -l 0 /dev/stdin \ diff --git a/genome_mapping/job_scripts/aln_2.merge_bam.sh b/genome_mapping/job_scripts/aln_2.merge_bam.sh index 1c086b6..e60fd5d 100755 --- a/genome_mapping/job_scripts/aln_2.merge_bam.sh +++ b/genome_mapping/job_scripts/aln_2.merge_bam.sh @@ -1,6 +1,6 @@ #!/bin/bash #$ -cwd -#$ -pe threaded 18 +#$ -pe threaded 24 trap "exit 100" ERR @@ -22,7 +22,7 @@ if [[ $(ls $SM/bam/$SM.*.sorted.bam|wc -l) == 1 ]]; then mv $SM/bam/$SM.*.sorted.bam $SM/bam/$SM.merged.bam rm 
$SM/bam/$SM.*.sorted.bam.bai else - $SAMBAMBA merge -t 18 $SM/bam/$SM.merged.bam $SM/bam/$SM.*.sorted.bam + $SAMBAMBA merge -t $NSLOTS $SM/bam/$SM.merged.bam $SM/bam/$SM.*.sorted.bam rm $SM/bam/$SM.*.sorted.bam{,.bai} fi diff --git a/genome_mapping/job_scripts/aln_3.markdup.sh b/genome_mapping/job_scripts/aln_3.markdup.sh index ab729dc..f274a8b 100755 --- a/genome_mapping/job_scripts/aln_3.markdup.sh +++ b/genome_mapping/job_scripts/aln_3.markdup.sh @@ -1,6 +1,6 @@ #!/bin/bash #$ -cwd -#$ -pe threaded 18 +#$ -pe threaded 12 trap "exit 100" ERR @@ -18,7 +18,7 @@ set -o pipefail printf -- "---\n[$(date)] Start markdup.\n" -$JAVA -Xmx26G -jar $PICARD MarkDuplicates \ +$JAVA -Xmx36G -jar $PICARD MarkDuplicates \ I=$SM/bam/$SM.merged.bam \ O=$SM/bam/$SM.markduped.bam \ METRICS_FILE=$SM/markduplicates_metrics.txt \ diff --git a/genome_mapping/job_scripts/aln_4.indel_realign.sh b/genome_mapping/job_scripts/aln_4.indel_realign.sh index 93d4f4f..60dc356 100755 --- a/genome_mapping/job_scripts/aln_4.indel_realign.sh +++ b/genome_mapping/job_scripts/aln_4.indel_realign.sh @@ -1,6 +1,6 @@ #!/bin/bash #$ -cwd -#$ -pe threaded 36 +#$ -pe threaded 24 trap "exit 100" ERR @@ -18,8 +18,8 @@ set -o pipefail printf -- "---\n[$(date)] Start RealignerTargetCreator.\n" -$JAVA -Xmx58G -Djava.io.tmpdir=tmp -jar $GATK \ - -T RealignerTargetCreator -nt 36 \ +$JAVA -Xmx72G -Djava.io.tmpdir=tmp -jar $GATK \ + -T RealignerTargetCreator -nt $NSLOTS \ -R $REF -known $MILLS -known $INDEL1KG \ -I $SM/bam/$SM.markduped.bam \ -o $SM/realigner.intervals @@ -27,7 +27,7 @@ $JAVA -Xmx58G -Djava.io.tmpdir=tmp -jar $GATK \ printf -- "---\n[$(date)] Finish RealignerTargetCreator.\n" printf -- "---\n[$(date)] Start IndelRealigner.\n---\n" -$JAVA -Xmx58G -Djava.io.tmpdir=tmp -jar $GATK \ +$JAVA -Xmx72G -Djava.io.tmpdir=tmp -jar $GATK \ -T IndelRealigner \ -R $REF -known $MILLS -known $INDEL1KG \ -targetIntervals $SM/realigner.intervals \ diff --git a/genome_mapping/job_scripts/aln_5.bqsr.sh b/genome_mapping/job_scripts/aln_5.bqsr.sh index f363ba2..2d3fa97 100755 --- a/genome_mapping/job_scripts/aln_5.bqsr.sh +++ b/genome_mapping/job_scripts/aln_5.bqsr.sh @@ -1,6 +1,6 @@ #!/bin/bash #$ -cwd -#$ -pe threaded 36 +#$ -pe threaded 24 trap "exit 100" ERR @@ -18,8 +18,8 @@ set -o pipefail printf -- "---\n[$(date)] Start BQSR recal_table.\n" -$JAVA -Xmx58G -jar $GATK \ - -T BaseRecalibrator -nct 36 \ +$JAVA -Xmx72G -jar $GATK \ + -T BaseRecalibrator -nct $NSLOTS \ -R $REF -knownSites $DBSNP -knownSites $MILLS -knownSites $INDEL1KG \ -I $SM/bam/$SM.realigned.bam \ -o $SM/recal_data.table @@ -27,8 +27,8 @@ $JAVA -Xmx58G -jar $GATK \ printf -- "---\n[$(date)] Start BQSR recal_table.\n" printf -- "---\n[$(date)] Start BQSR PrintReads.\n---\n" -$JAVA -Xmx58G -jar $GATK \ - -T PrintReads -nct 36 \ +$JAVA -Xmx72G -jar $GATK \ + -T PrintReads -nct $NSLOTS \ --emit_original_quals \ -R $REF -BQSR $SM/recal_data.table \ -I $SM/bam/$SM.realigned.bam \ diff --git a/genome_mapping/run.py b/genome_mapping/run.py index a38054f..7ef24bb 100755 --- a/genome_mapping/run.py +++ b/genome_mapping/run.py @@ -43,7 +43,7 @@ def main(): print() def opt(sample, jid=None): - opt = "-j y -o {log_dir} -l h_vmem=2G".format(log_dir=log_dir(sample)) + opt = "-j y -o {log_dir} -l h_vmem=4G".format(log_dir=log_dir(sample)) if jid is not None: opt = "-hold_jid {jid} {opt}".format(jid=jid, opt=opt) return opt diff --git a/genome_mapping/submit_aln_jobs.py b/genome_mapping/submit_aln_jobs.py index eaedd47..98c52a3 100755 --- a/genome_mapping/submit_aln_jobs.py +++ 
b/genome_mapping/submit_aln_jobs.py @@ -51,7 +51,7 @@ def parentid(sample): return line.strip().split("=")[1] def opt(sample, jid=None): - opt = "-j y -o {log_dir} -l h_vmem=2G".format(log_dir=log_dir(sample)) + opt = "-j y -o {log_dir} -l h_vmem=4G".format(log_dir=log_dir(sample)) if jid is not None: opt = "-hold_jid {jid} {opt}".format(jid=jid, opt=opt) return opt From 8f2cf6b643b799b4b3d28919863f0ddf1fc3098f Mon Sep 17 00:00:00 2001 From: Taejeong Bae Date: Sun, 9 Dec 2018 14:45:36 -0600 Subject: [PATCH 56/70] Java memory adjustment --- genome_mapping/job_scripts/aln_3.markdup.sh | 2 +- genome_mapping/job_scripts/aln_4.indel_realign.sh | 4 ++-- genome_mapping/job_scripts/aln_5.bqsr.sh | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/genome_mapping/job_scripts/aln_3.markdup.sh b/genome_mapping/job_scripts/aln_3.markdup.sh index f274a8b..9b68099 100755 --- a/genome_mapping/job_scripts/aln_3.markdup.sh +++ b/genome_mapping/job_scripts/aln_3.markdup.sh @@ -18,7 +18,7 @@ set -o pipefail printf -- "---\n[$(date)] Start markdup.\n" -$JAVA -Xmx36G -jar $PICARD MarkDuplicates \ +$JAVA -Xmx26G -jar $PICARD MarkDuplicates \ I=$SM/bam/$SM.merged.bam \ O=$SM/bam/$SM.markduped.bam \ METRICS_FILE=$SM/markduplicates_metrics.txt \ diff --git a/genome_mapping/job_scripts/aln_4.indel_realign.sh b/genome_mapping/job_scripts/aln_4.indel_realign.sh index 60dc356..85cfd25 100755 --- a/genome_mapping/job_scripts/aln_4.indel_realign.sh +++ b/genome_mapping/job_scripts/aln_4.indel_realign.sh @@ -18,7 +18,7 @@ set -o pipefail printf -- "---\n[$(date)] Start RealignerTargetCreator.\n" -$JAVA -Xmx72G -Djava.io.tmpdir=tmp -jar $GATK \ +$JAVA -Xmx58G -Djava.io.tmpdir=tmp -jar $GATK \ -T RealignerTargetCreator -nt $NSLOTS \ -R $REF -known $MILLS -known $INDEL1KG \ -I $SM/bam/$SM.markduped.bam \ @@ -27,7 +27,7 @@ $JAVA -Xmx72G -Djava.io.tmpdir=tmp -jar $GATK \ printf -- "---\n[$(date)] Finish RealignerTargetCreator.\n" printf -- "---\n[$(date)] Start IndelRealigner.\n---\n" -$JAVA -Xmx72G -Djava.io.tmpdir=tmp -jar $GATK \ +$JAVA -Xmx58G -Djava.io.tmpdir=tmp -jar $GATK \ -T IndelRealigner \ -R $REF -known $MILLS -known $INDEL1KG \ -targetIntervals $SM/realigner.intervals \ diff --git a/genome_mapping/job_scripts/aln_5.bqsr.sh b/genome_mapping/job_scripts/aln_5.bqsr.sh index 2d3fa97..d9b56f7 100755 --- a/genome_mapping/job_scripts/aln_5.bqsr.sh +++ b/genome_mapping/job_scripts/aln_5.bqsr.sh @@ -18,7 +18,7 @@ set -o pipefail printf -- "---\n[$(date)] Start BQSR recal_table.\n" -$JAVA -Xmx72G -jar $GATK \ +$JAVA -Xmx58G -jar $GATK \ -T BaseRecalibrator -nct $NSLOTS \ -R $REF -knownSites $DBSNP -knownSites $MILLS -knownSites $INDEL1KG \ -I $SM/bam/$SM.realigned.bam \ @@ -27,7 +27,7 @@ $JAVA -Xmx72G -jar $GATK \ printf -- "---\n[$(date)] Start BQSR recal_table.\n" printf -- "---\n[$(date)] Start BQSR PrintReads.\n---\n" -$JAVA -Xmx72G -jar $GATK \ +$JAVA -Xmx58G -jar $GATK \ -T PrintReads -nct $NSLOTS \ --emit_original_quals \ -R $REF -BQSR $SM/recal_data.table \ From 3b45165aca95457dbd596b20e8c25a28a71ba0c3 Mon Sep 17 00:00:00 2001 From: Taejeong Bae Date: Mon, 10 Dec 2018 12:16:22 -0600 Subject: [PATCH 57/70] Update the README file --- README.md | 60 ++++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 48 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index ea9e69b..dbecb43 100644 --- a/README.md +++ b/README.md @@ -2,36 +2,72 @@ BSMN common data processing pipeline # Setup and installation +This pipeline can be run in any cluster system using SGE job scheduler. 
I would recommend setting up your own cluster in AWS using AWS ParallelCluster. -## cfncluster +## AWS ParallelCluster +For installing and setting up ParallelCluster, please see the [`Getting Started Guide`](https://aws-parallelcluster.readthedocs.io/en/latest/getting_started.html) for AWS ParallelCluster. -This pipeline runs using [`cfncluster`](https://cfncluster.readthedocs.io). - -## Installing cfncluster +## Installing pipeline +Check out bsmn_pipeline where you want it installed in the AWS ParallelCluster you set up or the cluster system you are using. +``` +$ git clone https://github.com/bsmn/bsmn_pipeline +``` -It's recommended to use a Python virtual environment (https://virtualenv.pypa.io/en/stable/). +Install dependent tools running the following script. For GATK 3.7-0 that this pipeline uses, you should mannually download it and put into the appropriate location following the instruction that the install scipt gives. +``` +$ cd bsmn_pipeline +$ ./install_tools.sh +``` -To install `cfncluster`: +Download requred resource files including the reference sequence. This step require a synapse account that can access to the Syanpse page syn17062535. +``` +$ ./download_resources.sh +``` +## Extra setup for SGE +The pipeline requires a parallel environment named "threaded" in your SGE system. If your SGE system doesn't have this parallel environment, you should add it as follows. +``` +$ sudo su +# qconf -Ap << END +pe_name threaded +slots 99999 +user_lists NONE +xuser_lists NONE +start_proc_args NONE +stop_proc_args NONE +allocation_rule $pe_slots +control_slaves FALSE +job_is_first_task TRUE +urgency_slots min +accounting_summary TRUE +qsort_args NONE +END ``` -pip install cfncluster +``` ``` +# qconf -mattr queue pe_list threaded all.q ``` -To get the pipeline software installed on the cluster, a post-install script is run after the cluster starts. You can see this file as a GitHub Gist [here](https://gist.github.com/kdaily/1e0a2d1fcef1c6847f743f637301a3d5). - # Usage ## genome_mapping +Run the pipeline using a wrapper shell script. ```bash -genome_mapping/run.py sample_list.txt +genome_mapping.sh sample_list.txt ``` ### sample_list.txt format -The first line should be a header line. Eg. +Lines starting with # are treated as comments and ignored. The header line should start with # as well. Eg. ``` -sample_id file synapse_id +#sample_id file_name location 5154_brain-BSMN_REF_brain-534-U01MH106876 bulk_sorted.bam syn10639574 5154_fibroblast-BSMN_REF_fibroblasts-534-U01MH106876 fibroblasts_sorted.bam syn10639575 ``` +The "location" column can be a Synapse ID, an S3Uri of the NDA or a user, or a LocalPath. For data download, the synapse or aws clients, or a symbolic link will be used, respectively. + +### options +``` +--parentid syn123 +``` +With the --parentid option, you can specify the Synapse ID of a project or folder where the result bam files will be uploaded. If it is set, the result bam files will be uploaded into Synapse and deleted locally. Otherwise, they will be kept locally. # Contributing From 69347c542ff3480eab88a78df7a6a6072a5dce0e Mon Sep 17 00:00:00 2001 From: Taejeong Bae Date: Mon, 10 Dec 2018 12:25:31 -0600 Subject: [PATCH 58/70] Update the README file --- README.md | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index dbecb43..e9fdf64 100644 --- a/README.md +++ b/README.md @@ -57,9 +57,11 @@ genome_mapping.sh sample_list.txt ### sample_list.txt format Lines starting with # are treated as comments and ignored.
The header line should start with # as well. Eg. ``` -#sample_id file_name location -5154_brain-BSMN_REF_brain-534-U01MH106876 bulk_sorted.bam syn10639574 -5154_fibroblast-BSMN_REF_fibroblasts-534-U01MH106876 fibroblasts_sorted.bam syn10639575 +#sample_id file_name location +5154_brain-BSMN_REF_brain-534-U01MH106876 bulk_sorted.bam syn10639574 +5154_fibroblast-BSMN_REF_fibroblasts-534-U01MH106876 fibroblasts_sorted.bam syn10639575 +5154_NeuN_positive-BSMN_REF_NeuN+_E12-677-U01MH106876 E12_MDA_common_sorted.bam s3://nda-bsmn/abyzova_1497485007384/data/E12_MDA_common_sorted.bam +5154_NeuN_positive-BSMN_REF_NeuN+_C12-677-U01MH106876 C12_MDA_common_sorted.bam /efs/data/C12_MDA_common_sorted.bam ``` The "location" column can be a Synapse ID, an S3Uri of the NDA or a user, or a LocalPath. For data download, the synapse or aws clients, or a symbolic link will be used, respectively. From 2b92eb1a6abe01a06351fc41d02de5147f510305 Mon Sep 17 00:00:00 2001 From: Attila Gulyas-Kovacs Date: Tue, 15 Jan 2019 16:23:36 -0500 Subject: [PATCH 59/70] Small edits on README.md --- README.md | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index e9fdf64..7ce3b4a 100644 --- a/README.md +++ b/README.md @@ -13,10 +13,23 @@ Check out bsmn_pipeline where you want it installed in the AWS ParallelCluster y $ git clone https://github.com/bsmn/bsmn_pipeline ``` -Install dependent tools running the following script. For GATK 3.7-0 that this pipeline uses, you should mannually download it and put into the appropriate location following the instruction that the install scipt gives. +Install software dependencies into `bsmn_pipeline/tools` by running the following script. ``` $ cd bsmn_pipeline $ ./install_tools.sh +``` +The only dependency not installed by the script is GATK 3.7-0. Therefore GATK 3.7-0 should be manually downloaded into the location `bsmn_pipeline/tools/gatk/3.7-0` using for instance the following command: + +``` +# Currently install_tools.sh contains only vague hints on the download. +# Suggested commands to download and extract GATK 3.7-0; +# Couldn't these be included in install_tools.sh? +cd tools/gatk/3.7-0 +url='https://software.broadinstitute.org/gatk/download/auth?package=GATK-archive&version=3.7-0-gcfedb67' +wget -O GenomeAnalysisTK-3.7-0-gcfedb67.tar.bz2 "$url" +tar xjf GenomeAnalysisTK-3.7-0-gcfedb67.tar.bz2 +cd ../../.. + ``` Download requred resource files including the reference sequence. This step require a synapse account that can access to the Syanpse page syn17062535. From 512ad274a0de46bf95557f3c04f3b87d430cda55 Mon Sep 17 00:00:00 2001 From: Attila Gulyas-Kovacs Date: Tue, 15 Jan 2019 17:09:56 -0500 Subject: [PATCH 60/70] install_tools.sh now downloads and extracts GATK --- install_tools.sh | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/install_tools.sh b/install_tools.sh index dafc537..8db9edd 100755 --- a/install_tools.sh +++ b/install_tools.sh @@ -137,7 +137,8 @@ cd $WD # Installing GATK mkdir -p tools/gatk/3.7-0 -echo "----" -echo "Please manually download GATK 3.7-0 and put the jar file in tools/gatk/3.7-0."
-echo "URL: https://software.broadinstitute.org/gatk/download/auth?package=GATK-archive&version=3.7-0-gcfedb67" - +cd tools/gatk/3.7-0 +url='https://software.broadinstitute.org/gatk/download/auth?package=GATK-archive&version=3.7-0-gcfedb67' +wget -O GenomeAnalysisTK-3.7-0-gcfedb67.tar.bz2 "$url" +tar xjf GenomeAnalysisTK-3.7-0-gcfedb67.tar.bz2 && rm GenomeAnalysisTK-3.7-0-gcfedb67.tar.bz2 +cd $WD From daa03a7e924f685a2a39bc5bd0de49a579e47352 Mon Sep 17 00:00:00 2001 From: Attila Gulyas-Kovacs Date: Tue, 15 Jan 2019 17:13:56 -0500 Subject: [PATCH 61/70] Removed manual download of GATK from README.md --- .gitignore | 1 + README.md | 15 +-------------- 2 files changed, 2 insertions(+), 14 deletions(-) diff --git a/.gitignore b/.gitignore index faab01f..ad341b0 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,7 @@ __pycache__/ *.py[cod] *$py.class +*.swq # tools / resources directories /tools/ diff --git a/README.md b/README.md index 7ce3b4a..e1c13c1 100644 --- a/README.md +++ b/README.md @@ -18,21 +18,8 @@ Install software dependencies into `bsmn_pipeline/tools` running the following s $ cd bsmn_pipeline $ ./install_tools.sh ``` -The only dependency not installed by the script is GATK 3.7-0. Therefore GATK 3.7-0 should be manually downloaded into the location `bsmn_pipeline/tools/gatk/3.7-0` using for instance the following command: -``` -# Currently install_tools.sh contains only vague hints on the download. -# Suggested commands to download and extract GATK 3.7-0; -# Couldn't these be included in install_tools.sh? -cd tools/gatk/3.7-0 -url='https://software.broadinstitute.org/gatk/download/auth?package=GATK-archive&version=3.7-0-gcfedb67' -wget -O GenomeAnalysisTK-3.7-0-gcfedb67.tar.bz2 "$url" -tar xjf GenomeAnalysisTK-3.7-0-gcfedb67.tar.bz2 -cd ../../.. - -``` - -Download requred resource files including the reference sequence. This step require a synapse account that can access to the Syanpse page syn17062535. +Download required resource files including the reference sequence. This step require a synapse account that can access to the Synapse page syn17062535. 
``` $ ./download_resources.sh ``` From d301c366bf2bf6f6c786908bcbc1a90101a4be5a Mon Sep 17 00:00:00 2001 From: Taejeong Bae Date: Tue, 30 Apr 2019 10:27:09 -0500 Subject: [PATCH 62/70] Bug fix:NDA api url change --- utils/nda_aws_token.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/nda_aws_token.sh b/utils/nda_aws_token.sh index 2e2edf7..80badec 100755 --- a/utils/nda_aws_token.sh +++ b/utils/nda_aws_token.sh @@ -49,7 +49,7 @@ else source $nda_cred_f fi -server="https://ndar.nih.gov/DataManager/dataManager" +server="https://nda.nih.gov/DataManager/dataManager" ############################################################################## # Make Request From e19be4bde9a1d69e05b1c66436df315c7f94badb Mon Sep 17 00:00:00 2001 From: Taejeong Bae Date: Tue, 30 Apr 2019 10:39:38 -0500 Subject: [PATCH 63/70] Bug fix:sample list input parser change --- library/parser.py | 10 ++++++++++ variant_calling/run.py | 4 ++-- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/library/parser.py b/library/parser.py index 33a3ff0..daa9e89 100644 --- a/library/parser.py +++ b/library/parser.py @@ -13,3 +13,13 @@ def sample_list(fname): sample_id, file_name, location = line.strip().split()[:3] samples[(sample_id, filetype(file_name))].append((file_name, location)) return samples + +def sample_list2(fname): + samples = defaultdict(list) + with open(fname) as sfile: + for line in sfile: + if line[0] == "#": + continue + sample_id, file_name, location = line.strip().split()[:3] + samples[sample_id].append((file_name, location)) + return samples diff --git a/variant_calling/run.py b/variant_calling/run.py index 35e6214..a72c341 100755 --- a/variant_calling/run.py +++ b/variant_calling/run.py @@ -12,7 +12,7 @@ from library.config import run_info, log_dir from library.login import synapse_login, nda_login -from library.parser import sample_list +from library.parser import sample_list2 from library.job_queue import GridEngineQueue q = GridEngineQueue() @@ -22,7 +22,7 @@ def main(): synapse_login() nda_login() - samples = sample_list(args.infile) + samples = sample_list2(args.infile) for sample, sdata in samples.items(): print(sample) From 406314471dd35506bed7d7ba704be60fa0c75693 Mon Sep 17 00:00:00 2001 From: Taejeong Bae Date: Tue, 30 Apr 2019 11:41:34 -0500 Subject: [PATCH 64/70] Add templates for new job scripts --- variant_calling/job_scripts/filter.PON.sh | 23 ++++++++++++++++ variant_calling/job_scripts/mosaicforcast.sh | 23 ++++++++++++++++ .../job_scripts/mutect.paired_mode.sh | 27 +++++++++++++++++++ .../job_scripts/mutect.single_mode.sh | 25 +++++++++++++++++ .../job_scripts/strelka.paired_mode.sh | 27 +++++++++++++++++++ .../job_scripts/strelka.single_mode.sh | 25 +++++++++++++++++ 6 files changed, 150 insertions(+) create mode 100755 variant_calling/job_scripts/filter.PON.sh create mode 100755 variant_calling/job_scripts/mosaicforcast.sh create mode 100755 variant_calling/job_scripts/mutect.paired_mode.sh create mode 100755 variant_calling/job_scripts/mutect.single_mode.sh create mode 100755 variant_calling/job_scripts/strelka.paired_mode.sh create mode 100755 variant_calling/job_scripts/strelka.single_mode.sh diff --git a/variant_calling/job_scripts/filter.PON.sh b/variant_calling/job_scripts/filter.PON.sh new file mode 100755 index 0000000..d4f8a83 --- /dev/null +++ b/variant_calling/job_scripts/filter.PON.sh @@ -0,0 +1,23 @@ +#!/bin/bash +#$ -cwd +#$ -pe threaded 16 + +trap "exit 100" ERR + +if [[ $# -lt 1 ]]; then + echo "Usage: $(basename $0) [sample name]" + 
false +fi + +SM=$1 + +source $(pwd)/$SM/run_info + +set -o nounset +set -o pipefail + +printf -- "---\n[$(date)] Start MosaicForcast.\n" + +# Add command + +printf -- "[$(date)] Finish MosaicForcast.\n---\n" diff --git a/variant_calling/job_scripts/mosaicforcast.sh b/variant_calling/job_scripts/mosaicforcast.sh new file mode 100755 index 0000000..d4f8a83 --- /dev/null +++ b/variant_calling/job_scripts/mosaicforcast.sh @@ -0,0 +1,23 @@ +#!/bin/bash +#$ -cwd +#$ -pe threaded 16 + +trap "exit 100" ERR + +if [[ $# -lt 1 ]]; then + echo "Usage: $(basename $0) [sample name]" + false +fi + +SM=$1 + +source $(pwd)/$SM/run_info + +set -o nounset +set -o pipefail + +printf -- "---\n[$(date)] Start MosaicForcast.\n" + +# Add command + +printf -- "[$(date)] Finish MosaicForcast.\n---\n" diff --git a/variant_calling/job_scripts/mutect.paired_mode.sh b/variant_calling/job_scripts/mutect.paired_mode.sh new file mode 100755 index 0000000..4efb6ff --- /dev/null +++ b/variant_calling/job_scripts/mutect.paired_mode.sh @@ -0,0 +1,27 @@ +#!/bin/bash +#$ -cwd +#$ -pe threaded 16 + +trap "exit 100" ERR + +if [[ $# -lt 2 ]]; then + echo "Usage: $(basename $0) [sample name] [control sample name]" + false +fi + +SM=$1 +CTR=$2 + +source $(pwd)/$SM/run_info + +set -o nounset +set -o pipefail + +BAM1=$SM/bam/$SM.bam +BAM2=$SM/bam/$CTR.bam + +printf -- "---\n[$(date)] Start Mutect paired sample mode.\n" + +# Add command + +printf -- "[$(date)] Finish Mutect paired sample mode.\n---\n" diff --git a/variant_calling/job_scripts/mutect.single_mode.sh b/variant_calling/job_scripts/mutect.single_mode.sh new file mode 100755 index 0000000..ade6771 --- /dev/null +++ b/variant_calling/job_scripts/mutect.single_mode.sh @@ -0,0 +1,25 @@ +#!/bin/bash +#$ -cwd +#$ -pe threaded 16 + +trap "exit 100" ERR + +if [[ $# -lt 1 ]]; then + echo "Usage: $(basename $0) [sample name]" + false +fi + +SM=$1 + +source $(pwd)/$SM/run_info + +set -o nounset +set -o pipefail + +BAM=$SM/bam/$SM.bam + +printf -- "---\n[$(date)] Start Mutect single sample mode.\n" + +# Add command + +printf -- "[$(date)] Finish Mutect single sample mode.\n---\n" diff --git a/variant_calling/job_scripts/strelka.paired_mode.sh b/variant_calling/job_scripts/strelka.paired_mode.sh new file mode 100755 index 0000000..fd0d5aa --- /dev/null +++ b/variant_calling/job_scripts/strelka.paired_mode.sh @@ -0,0 +1,27 @@ +#!/bin/bash +#$ -cwd +#$ -pe threaded 16 + +trap "exit 100" ERR + +if [[ $# -lt 2 ]]; then + echo "Usage: $(basename $0) [sample name] [control sample name]" + false +fi + +SM=$1 +CTR=$2 + +source $(pwd)/$SM/run_info + +set -o nounset +set -o pipefail + +BAM1=$SM/bam/$SM.bam +BAM2=$SM/bam/$CTR.bam + +printf -- "---\n[$(date)] Start Strelka paired sample mode.\n" + +# Add command + +printf -- "[$(date)] Finish Strelka paired sample mode.\n---\n" diff --git a/variant_calling/job_scripts/strelka.single_mode.sh b/variant_calling/job_scripts/strelka.single_mode.sh new file mode 100755 index 0000000..07d193d --- /dev/null +++ b/variant_calling/job_scripts/strelka.single_mode.sh @@ -0,0 +1,25 @@ +#!/bin/bash +#$ -cwd +#$ -pe threaded 16 + +trap "exit 100" ERR + +if [[ $# -lt 1 ]]; then + echo "Usage: $(basename $0) [sample name]" + false +fi + +SM=$1 + +source $(pwd)/$SM/run_info + +set -o nounset +set -o pipefail + +BAM=$SM/bam/$SM.bam + +printf -- "---\n[$(date)] Start Strelka single sample mode.\n" + +# Add command + +printf -- "[$(date)] Finish Strelka single sample mode.\n---\n" From fad95f0881c49d6cb26bfedc9e40f89c98379143 Mon Sep 17 00:00:00 2001 From: Taejeong Bae 
Date: Tue, 30 Apr 2019 11:44:46 -0500 Subject: [PATCH 65/70] Small change PON filter --- variant_calling/job_scripts/filter.PON.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/variant_calling/job_scripts/filter.PON.sh b/variant_calling/job_scripts/filter.PON.sh index d4f8a83..182cbfd 100755 --- a/variant_calling/job_scripts/filter.PON.sh +++ b/variant_calling/job_scripts/filter.PON.sh @@ -16,8 +16,8 @@ source $(pwd)/$SM/run_info set -o nounset set -o pipefail -printf -- "---\n[$(date)] Start MosaicForcast.\n" +printf -- "---\n[$(date)] Start PON filter.\n" # Add command -printf -- "[$(date)] Finish MosaicForcast.\n---\n" +printf -- "[$(date)] Finish PON filter.\n---\n" From 580673d4ec1c6b569cec06dd6ccda17b71bf5a7a Mon Sep 17 00:00:00 2001 From: Sean Cho Date: Mon, 6 May 2019 18:01:36 -0400 Subject: [PATCH 66/70] Add mutect2 single sample workflow NOTE: This workflow does not use PON. Should be implemented in the future. Added variant calling scripts 1 and 2 for calling and filtering. Added GATK4 and gnomAD VCF to set up. Tested on AWS EC2 m5d.xlarge with Moran WES BAM 6:96076198-96096198. --- .../job_scripts/mutect-ss_1.call.sh | 40 +++++++ .../job_scripts/mutect-ss_2.filter.sh | 109 ++++++++++++++++++ 2 files changed, 149 insertions(+) create mode 100644 variant_calling/job_scripts/mutect-ss_1.call.sh create mode 100644 variant_calling/job_scripts/mutect-ss_2.filter.sh diff --git a/variant_calling/job_scripts/mutect-ss_1.call.sh b/variant_calling/job_scripts/mutect-ss_1.call.sh new file mode 100644 index 0000000..909c914 --- /dev/null +++ b/variant_calling/job_scripts/mutect-ss_1.call.sh @@ -0,0 +1,40 @@ +#!/usr/bin/env bash +#$ -cwd +#$ -pe threaded 16 + +if [[ $# -lt 1 ]]; then + echo "Usage: $(basename $0) [sample name] [targets.bed]" + exit 1 +fi + +SM=$1 + +if [ -z "$2" ]; then + mode='WGS' + TARGETS='' +else + mode='WES' + TARGETS="-L ${2} -ip 50" ## pad 50 bases +fi + +source $(pwd)/$SM/run_info + +set -euo pipefail + +BAM=$SM/bam/$SM.bam +VCF=$SM/vcf/$SM.mutect2-raw.vcf.gz + +printf -- "---\n[$(date)] Starting MuTect2 single sample calling ${mode}.\n" + +if [[ ! -f $VCF ]]; then + mkdir -p $SM/vcf + $JAVA -Xmx32G -Djava.io.tmpdir=tmp -XX:-UseParallelGC -jar $GATK4 \ + Mutect2 \ + -R $REF \ + -I $BAM \ + -tumor $SM \ + --germline-resource $GNOMAD \ + $TARGETS \ + -O $VCF \ + --disable-read-filter MateOnSameContigOrNoMappedMateReadFilter ## see https://github.com/broadinstitute/gatk/issues/3514 +fi diff --git a/variant_calling/job_scripts/mutect-ss_2.filter.sh b/variant_calling/job_scripts/mutect-ss_2.filter.sh new file mode 100644 index 0000000..0221b07 --- /dev/null +++ b/variant_calling/job_scripts/mutect-ss_2.filter.sh @@ -0,0 +1,109 @@ +#!/usr/bin/env bash +#$ -cwd +#$ -pe threaded 16 + +if [[ $# -lt 1 ]]; then + echo "Usage: $(basename $0) [sample name] [targets.bed]" + exit 1 +fi + +SM=$1 + +if [ -z "$2" ]; then + mode='WGS' + TARGETS='' +else + mode='WES' + TARGETS="-L ${2} -ip 50" ## pad 50 bases +fi + +source $(pwd)/$SM/run_info + +set -euo pipefail +mkdir -p $SM/tmp + +BAM="$SM/bam/${SM}.bam" +VCF_RAW="$SM/vcf/${SM}.mutect2-raw.vcf.gz" +PILEUP_SUMMARIES="$SM/tmp/${SM}.pileup_summaries.table" +CONTAMINATION="$SM/tmp/${SM}.contamination.table" +VCF_ONEPASS="$SM/tmp/${SM}.mutect2-onepass.vcf.gz" +ARTIFACT_PREFIX="$SM/tmp/${SM}.artifact" +ARTIFACTS="${ARTIFACT_PREFIX}.pre_adapter_detail_metrics.txt" +VCF_ALLFILTERS="$SM/vcf/${SM}.mutect2-allfilters.vcf.gz" +VCF_FINAL="$SM/vcf/${SM}.mutect2-filt.vcf.gz" + +if [[ -f $VCF_FINAL ]]; then + echo "$VCF_FINAL exists.
Nothing to be done." + exit 0 +fi + +printf -- "---\n[$(date)] ${SM}: Filtering Mutect2 calls for ${mode}.\n" + +## get pileup summaries +if [[ ! -f $PILEUP_SUMMARIES ]]; then + echo "Getting pileup summaries." + $JAVA -Xmx32G -Djava.io.tmpdir=tmp -XX:-UseParallelGC -jar $GATK4 \ + GetPileupSummaries \ + -I $BAM \ + -V $GNOMAD \ + $TARGETS \ + -O $PILEUP_SUMMARIES \ + --verbosity DEBUG +fi + +## calculate contamination +if [[ ! -f $CONTAMINATION ]]; then + echo "Calculating contamination." + $JAVA -Xmx32G -Djava.io.tmpdir=tmp -XX:-UseParallelGC -jar $GATK4 \ + CalculateContamination \ + -I $PILEUP_SUMMARIES \ + -O $CONTAMINATION \ + --verbosity DEBUG +fi + +## apply first pass filters +if [[ ! -f $VCF_ONEPASS ]]; then + echo "Filtering first pass." + $JAVA -Xmx32G -Djava.io.tmpdir=tmp -XX:-UseParallelGC -jar $GATK4 \ + FilterMutectCalls \ + -V $VCF_RAW \ + --reference $REF \ + --contamination-table $CONTAMINATION \ + -O $VCF_ONEPASS \ + --verbosity DEBUG +fi + +## calculate artifact metrics +if [[ ! -f $ARTIFACTS ]]; then + echo "Calculating sequencing artifact metrics." + $JAVA -Xmx32G -Djava.io.tmpdir=tmp -XX:-UseParallelGC -jar $GATK4 \ + CollectSequencingArtifactMetrics \ + -I $BAM \ + -O $ARTIFACT_PREFIX \ + --FILE_EXTENSION ".txt" \ + -R $REF +fi + +## filter orientation bias +if [[ ! -f $VCF_ALLFILTERS ]]; then + echo "Filtering for orientation bias." + $JAVA -Xmx32G -Djava.io.tmpdir=tmp -XX:-UseParallelGC -jar $GATK4 \ + FilterByOrientationBias \ + -AM G/T \ + -V $VCF_ONEPASS \ + -P $ARTIFACTS \ + -O $VCF_ALLFILTERS +fi + +## filter for PASS only +if [[ ! -f $VCF_FINAL ]]; then + echo "Filtering ${VCF_ALLFILTERS} for passed variants to ${VCF_FINAL}." + zcat $VCF_ALLFILTERS | grep '^#\|PASS' | $BGZIP -c > $VCF_FINAL + sleep 5 + $TABIX $VCF_FINAL +fi + +## clean up +if [ $? -eq 0 ]; then + rm $SM/tmp/* +fi From f603f9f5d9067bdde1c18221a6540ee9be50baa2 Mon Sep 17 00:00:00 2001 From: Sean Cho Date: Mon, 6 May 2019 18:09:44 -0400 Subject: [PATCH 67/70] Update resources Added GATK4 and gnomAD VCF to set up.
--- config.ini | 2 ++ download_resources.sh | 4 ++++ install_tools.sh | 13 +++++++++++-- 3 files changed, 17 insertions(+), 2 deletions(-) diff --git a/config.ini b/config.ini index d52b5ed..1377b7e 100644 --- a/config.ini +++ b/config.ini @@ -7,6 +7,7 @@ BWA = tools/bwa/0.7.16a/bin/bwa SAMTOOLS = tools/samtools/1.7/bin/samtools SAMBAMBA = tools/sambamba/v0.6.7/bin/sambamba GATK = tools/gatk/3.7-0/GenomeAnalysisTK.jar +GATK4 = tools/gatk/4.1-2/gatk-package-4.1.2.0-local.jar PICARD = tools/picard/2.12.1/picard.jar BGZIP = tools/htslib/1.7/bin/bgzip TABIX = tools/htslib/1.7/bin/tabix @@ -26,3 +27,4 @@ HAPMAP = resources/hapmap_3.3.b37.vcf SNP1KG = resources/1000G_phase1.snps.high_confidence.b37.vcf KNOWN_GERM_SNP = resources/gnomAD.1KG.ExAC.ESP6500.Kaviar.snps.txt.gz MASK1KG = resources/20141020.strict_mask.whole_genome.fasta.gz +GNOMAD = resources/af-only-gnomad.raw.sites.b37.vcf.gz diff --git a/download_resources.sh b/download_resources.sh index 7c233b5..338add7 100755 --- a/download_resources.sh +++ b/download_resources.sh @@ -18,6 +18,10 @@ tools/python/3.6.2/bin/synapse get syn17062535 -r --downloadLocation resources/ gunzip resources/*vcf.gz resources/*vcf.idx.gz rm resources/SYNAPSE_METADATA_MANIFEST.tsv +## Download GATK gnomAD +wget -P resources ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/Mutect2/af-only-gnomad.raw.sites.b37.vcf.gz +wget -P resources ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/Mutect2/af-only-gnomad.raw.sites.b37.vcf.gz.tbi + # Split the ref genome by chromosome awk '{ r = match($1, "^>"); diff --git a/install_tools.sh b/install_tools.sh index 8db9edd..8cc0c91 100755 --- a/install_tools.sh +++ b/install_tools.sh @@ -8,7 +8,7 @@ cd tools/python/3.6.2/src wget -qO- https://www.python.org/ftp/python/3.6.2/Python-3.6.2.tgz \ |tar xvz --strip-components=1 ./configure --with-ensurepip=install --prefix=$WD/tools/python/3.6.2 -make +make make install cd $WD rm -r tools/python/3.6.2/src @@ -105,7 +105,7 @@ $WD/tools/cmake/3.11.4/bin/cmake \ -Dbuiltin_pcre=ON -Dhttp=ON -Dgnuinstall=ON \ -DCMAKE_INSTALL_PREFIX=$WD/tools/root/6.14.00 \ root-6.14.00 -$WD/tools/cmake/3.11.4/bin/cmake --build . -- -j +$WD/tools/cmake/3.11.4/bin/cmake --build . -- -j $WD/tools/cmake/3.11.4/bin/cmake --build . 
--target install cd $WD rm -r tools/root/6.14.00/src @@ -142,3 +142,12 @@ url='https://software.broadinstitute.org/gatk/download/auth?package=GATK-archive wget -O GenomeAnalysisTK-3.7-0-gcfedb67.tar.bz2 "$url" tar xjf GenomeAnalysisTK-3.7-0-gcfedb67.tar.bz2 && rm GenomeAnalysisTK-3.7-0-gcfedb67.tar.bz2 cd $WD + +# Installing GATK4 +mkdir -p tools/gatk/ +cd tools/gatk/ +url='https://github.com/broadinstitute/gatk/releases/download/4.1.2.0/gatk-4.1.2.0.zip' +wget -O gatk-4.1.2.0.zip "$url" +unzip gatk-4.1.2.0.zip && rm gatk-4.1.2.0.zip +mv gatk-4.1.2.0 4.1-2 +cd $WD From 3303403aa5eda5a7c9b149bf8b81961f9ca4e0a1 Mon Sep 17 00:00:00 2001 From: Taejeong Bae Date: Tue, 14 May 2019 23:30:14 -0500 Subject: [PATCH 68/70] add alt_bq_sum.py with modification of pileup library --- library/pileup.py | 45 +++++++++++++++++----------- utils/alt_bq_sum.py | 70 ++++++++++++++++++++++++++++++++++++++++++++ utils/somatic_vaf.py | 4 +-- utils/strand_bias.py | 4 +-- 4 files changed, 102 insertions(+), 21 deletions(-) create mode 100755 utils/alt_bq_sum.py diff --git a/library/pileup.py b/library/pileup.py index 49a68a1..2aa0aeb 100644 --- a/library/pileup.py +++ b/library/pileup.py @@ -11,6 +11,12 @@ if not os.path.isfile(SAMTOOLS) or not os.access(SAMTOOLS, os.X_OK): SAMTOOLS = shutil.which("samtools") +def base_count(bam, min_MQ, min_BQ): + return pileup(bam, min_MQ, min_BQ, base_n()) + +def base_qual_tuple(bam, min_MQ, min_BQ): + return pileup(bam, min_MQ, min_BQ, base_qual()) + @coroutine def pileup(bam, min_MQ, min_BQ, target): result = None @@ -39,29 +45,26 @@ def pileup(bam, min_MQ, min_BQ, target): else: sys.exit("Failed in pileup.") try: - bases = cmd_out.stdout.split()[4] + bases, quals = cmd_out.stdout.split()[4:6] + bases = bases_clean(bases) except IndexError: - bases = '' - result = target.send(bases) + bases, quals = ('', '') + result = target.send((bases, quals)) -@coroutine -def clean(target): - result = None - while True: - bases = (yield result) - bases = re.sub('\^.', '', bases) - bases = re.sub('\$', '', bases) - for n in set(re.findall('-(\d+)', bases)): - bases = re.sub('-{0}[ACGTNacgtn]{{{0}}}'.format(n), '', bases) - for n in set(re.findall('\+(\d+)', bases)): - bases = re.sub('\+{0}[ACGTNacgtn]{{{0}}}'.format(n), '', bases) - result = target.send(bases) +def bases_clean(bases): + bases = re.sub('\^.', '', bases) + bases = re.sub('\$', '', bases) + for n in set(re.findall('-(\d+)', bases)): + bases = re.sub('-{0}[ACGTNacgtn]{{{0}}}'.format(n), '', bases) + for n in set(re.findall('\+(\d+)', bases)): + bases = re.sub('\+{0}[ACGTNacgtn]{{{0}}}'.format(n), '', bases) + return bases @coroutine -def count(): +def base_n(): result = None while True: - bases = (yield result) + bases, quals = (yield result) base_n = {} base_n['A'] = bases.count('A') base_n['C'] = bases.count('C') @@ -73,3 +76,11 @@ def count(): base_n['t'] = bases.count('t') base_n['dels'] = bases.count('*') result = base_n + +@coroutine +def base_qual(): + result = None + while True: + bases, quals = (yield result) + bases = re.sub('\*', '', bases) + result = list(map(lambda b, q: (b.upper(), ord(q)-33), bases, quals)) diff --git a/utils/alt_bq_sum.py b/utils/alt_bq_sum.py new file mode 100755 index 0000000..d242ebd --- /dev/null +++ b/utils/alt_bq_sum.py @@ -0,0 +1,70 @@ +#!/usr/bin/env python3 + +import argparse +import os +import sys +from statsmodels.stats.proportion import binom_test + +pipe_home = os.path.dirname(os.path.realpath(__file__)) + "/.." 
+sys.path.append(pipe_home) +from library.misc import coroutine, printer +from library.pileup import base_qual_tuple + +def run(args): + alt_BQ_info = alt_BQ_sum(base_qual_tuple(args.bam, args.min_MQ, args.min_BQ)) + header = '#chr\tpos\tref\talt\talt_n\talt_BQ_sum' + printer(header) + for snv in args.infile: + if snv[0] == '#': + continue + chrom, pos, ref, alt = snv.strip().split()[:4] + printer('{chrom}\t{pos}\t{ref}\t{alt}\t{alt_BQ_info}'.format( + chrom=chrom, pos=pos, ref=ref.upper(), alt=alt.upper(), + alt_BQ_info=alt_BQ_info.send((chrom, pos, alt)))) + +@coroutine +def alt_BQ_sum(target): + result = None + while True: + chrom, pos, alt = (yield result) + alt_BQ = [q for b, q in target.send((chrom, pos)) if b == alt.upper()] + result = '{alt_n}\t{alt_BQ_sum}'.format(alt_n=len(alt_BQ), alt_BQ_sum=sum(alt_BQ)) + +def main(): + parser = argparse.ArgumentParser( + description='Sum of base qualities of alt allele of each SNV') + + parser.add_argument( + '-b', '--bam', metavar='FILE', + help='bam file', + required=True) + + parser.add_argument( + '-q', '--min-MQ', metavar='INT', + help='mapQ cutoff value [20]', + type=int, default=20) + + parser.add_argument( + '-Q', '--min-BQ', metavar='INT', + help='baseQ/BAQ cutoff value [13]', + type=int, default=13) + + parser.add_argument( + 'infile', metavar='snv_list.txt', + help='''SNV list. + Each line format is "chr\\tpos\\tref\\talt". + Trailing columns will be ignored. [STDIN]''', + nargs='?', type=argparse.FileType('r'), + default=sys.stdin) + + parser.set_defaults(func=run) + + args = parser.parse_args() + + if(len(vars(args)) == 0): + parser.print_help() + else: + args.func(args) + +if __name__ == "__main__": + main() diff --git a/utils/somatic_vaf.py b/utils/somatic_vaf.py index 3f665f1..df079a4 100755 --- a/utils/somatic_vaf.py +++ b/utils/somatic_vaf.py @@ -8,10 +8,10 @@ pipe_home = os.path.dirname(os.path.realpath(__file__)) + "/.." sys.path.append(pipe_home) from library.misc import coroutine, printer -from library.pileup import pileup, clean, count +from library.pileup import base_count def run(args): - v_info = vaf_info(pileup(args.bam, args.min_MQ, args.min_BQ, clean(count()))) + v_info = vaf_info(base_count(args.bam, args.min_MQ, args.min_BQ)) header = ('#chr\tpos\tref\talt\tvaf\t' + 'depth\tref_n\talt_n\tp_binom') printer(header) diff --git a/utils/strand_bias.py b/utils/strand_bias.py index c242330..dd68e88 100755 --- a/utils/strand_bias.py +++ b/utils/strand_bias.py @@ -10,10 +10,10 @@ pipe_home = os.path.dirname(os.path.realpath(__file__)) + "/.." 
sys.path.append(pipe_home) from library.misc import coroutine, printer -from library.pileup import pileup, clean, count +from library.pileup import base_count def run(args): - s_info = strand_info(pileup(args.bam, args.min_MQ, args.min_BQ, clean(count()))) + s_info = strand_info(base_count(args.bam, args.min_MQ, args.min_BQ)) header = ('#chr\tpos\tref\talt\t' + 'total\ttotal_fwd\ttotal_rev\ttotal_ratio\t' + 'p_poisson\t' From b588012130b1cbc6fc87af72051e835b7debeff7 Mon Sep 17 00:00:00 2001 From: Taejeong Bae Date: Mon, 27 May 2019 21:13:51 -0500 Subject: [PATCH 69/70] Bug fix: pileup --- library/pileup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/library/pileup.py b/library/pileup.py index 2aa0aeb..e7976aa 100644 --- a/library/pileup.py +++ b/library/pileup.py @@ -47,7 +47,7 @@ def pileup(bam, min_MQ, min_BQ, target): try: bases, quals = cmd_out.stdout.split()[4:6] bases = bases_clean(bases) - except IndexError: + except ValueError: bases, quals = ('', '') result = target.send((bases, quals)) From 10b406b17d11da8efa3f7c87444b4f38023b9f7e Mon Sep 17 00:00:00 2001 From: Yifan Wang Date: Mon, 10 Jun 2019 16:36:12 -0400 Subject: [PATCH 70/70] Add strelka paired mode into variant_calling --- config.ini | 1 + install_tools.sh | 8 ++++++++ variant_calling/job_scripts/strelka.paired_mode.sh | 13 ++++++++++--- 3 files changed, 19 insertions(+), 3 deletions(-) diff --git a/config.ini b/config.ini index 1377b7e..1732e61 100644 --- a/config.ini +++ b/config.ini @@ -15,6 +15,7 @@ VT = tools/vt/2018-06-07/bin/vt BCFTOOLS = tools/bcftools/1.7/bin/bcftools ROOTSYS = tools/root/6.14.00 CNVNATOR = tools/cnvnator/2018-07-09/bin/cnvnator +STRELKA = tools/strelka/strelka-2.9.2.centos6_x86_64/bin/configureStrelkaSomaticWorkflow.py [RESOURCES] REFDIR = resources diff --git a/install_tools.sh b/install_tools.sh index 8cc0c91..f7e8e3c 100755 --- a/install_tools.sh +++ b/install_tools.sh @@ -151,3 +151,11 @@ wget -O gatk-4.1.2.0.zip "$url" unzip gatk-4.1.2.0.zip && rm gatk-4.1.2.0.zip mv gatk-4.1.2.0 4.1-2 cd $WD + +# Installing Strelka +mkdir -p tools/strelka +cd tools/strelka +url='https://github.com/Illumina/strelka/releases/download/v2.9.2/strelka-2.9.2.centos6_x86_64.tar.bz2' +wget -O strelka-2.9.2.centos6_x86_64.tar.bz2 "$url" +tar xjf strelka-2.9.2.centos6_x86_64.tar.bz2 && rm strelka-2.9.2.centos6_x86_64.tar.bz2 +cd $WD diff --git a/variant_calling/job_scripts/strelka.paired_mode.sh b/variant_calling/job_scripts/strelka.paired_mode.sh index fd0d5aa..676b5b6 100755 --- a/variant_calling/job_scripts/strelka.paired_mode.sh +++ b/variant_calling/job_scripts/strelka.paired_mode.sh @@ -19,9 +19,16 @@ set -o pipefail BAM1=$SM/bam/$SM.bam BAM2=$SM/bam/$CTR.bam +strelka_out=$SM/strelka printf -- "---\n[$(date)] Start Strelka paired sample mode.\n" - -# Add command - +if [[ ! -f $strelka_out/runWorkflow.py ]]; then + mkdir -p $strelka_out + $STRELKA \ + --normalBam $BAM2 \ + --tumorBam $BAM1 \ + --referenceFasta $REF \ + --runDir $strelka_out +fi +$strelka_out/runWorkflow.py -m local -j 8 printf -- "[$(date)] Finish Strelka paired sample mode.\n---\n"
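
Patches 51 through 53 above converge on a common job-script skeleton: an ERR trap that converts any uncaught failure into exit code 100 (the Grid Engine convention for putting a job into error state, so jobs chained with -hold_jid do not silently continue past a failed step), `false` in place of `exit 1`, nounset/pipefail enabled only after `run_info` is sourced, and a bounded retry loop that remembers the last return code. The sketch below distills that pattern in one place. It is a minimal sketch, not a script from this repository: `$MY_TOOL` and the `my_step` label are hypothetical placeholders, and only the control flow mirrors the job scripts in the patches above.

```bash
#!/bin/bash
#$ -cwd
#$ -pe threaded 4

# Any uncaught failure fires ERR; exiting 100 asks SGE to put the job
# into error state rather than letting held downstream jobs proceed.
trap "exit 100" ERR

if [[ $# -lt 1 ]]; then
    echo "Usage: $(basename $0) [sample name]"
    false  # plain failing command: triggers the ERR trap, job exits 100
fi

SM=$1

# run_info defines tool and resource variables ($MY_TOOL below stands in
# for one of them); source it before enabling nounset.
source $(pwd)/$SM/run_info

set -o nounset
set -o pipefail

printf -- "---\n[$(date)] Start my_step.\n"

# Retry up to 5 times. Failures inside a && / || list do not fire the ERR
# trap, so the loop can retry; rc remembers the last return code.
rc=0
n=0
until [[ $n -eq 5 ]]; do
    $MY_TOOL $SM/bam/$SM.bam > $SM/my_step.out && { rc=$?; break; } || rc=$?
    n=$((n+1))
    printf "Try $n failed.\n\n"
done
[[ $rc -eq 0 ]] || false  # report the final failure through the ERR trap

printf -- "[$(date)] Finish my_step.\n---\n"
```

Note the design choice in the last guard: `false` is the command after the final `||`, which is exactly the case where bash does run the ERR trap, so a script built this way either finishes cleanly or exits 100 after exhausting its retries.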