11#!/usr/bin/python3
2- # Copyright 2011-2013 Francisco Pina Martins <[email protected] > 2+ # Copyright 2011-2015 Francisco Pina Martins <[email protected] > 33# This file is part of 4Pipe4.
44# 4Pipe4 is free software: you can redistribute it and/or modify
55# it under the terms of the GNU General Public License as published by
2626import Reporter
2727import SSRfinder as ssr
2828import Metrics
29- import CAF_to_TCS
29+ import SAM_to_BAM
30+ import BAM_to_TCS
3031import argparse
32+ import sff_extractor
3133from argparse import RawTextHelpFormatter
3234
3335
34- #### # ARGUMENT LIST ##### #
36+ # # # # # ARGUMENT LIST # # # # # #
3537parser = argparse .ArgumentParser (description = "" ,
3638 epilog = "The idea here is that to resume an \
37- analysis that was interrupted for example \
38- after the assembling process you should \
39- issue -s '4,5,6,7,8,9' or -s '456789'. Note \
40- that some steps depend on the output of \
41- previous steps, so using some combinations \
42- can cause errors. The arguments can be given \
43- in any order." ,
39+ analysis that was interrupted for example after the assembling process you \
40+ should issue -s '4,5,6,7,8,9' or -s '456789'. Note that some steps depend on \
41+ the output of previous steps, so using some combinations can cause errors. \
42+ The arguments can be given in any order." ,
4443 prog = "4Pipe4" ,
4544 formatter_class = RawTextHelpFormatter )
4645parser .add_argument ("-i" , dest = "infile" , nargs = 1 , required = True ,
4746 help = "Provide the full path to your target sff file\n " ,
4847 metavar = "sff_file" )
4948parser .add_argument ("-o" , dest = "outfile" , nargs = 1 , required = True ,
5049 help = "Provide the full path to your results directory, \
51- plus the name you want to give your results\n " ,
50+ plus the name you want to give your results\n " ,
5251 metavar = "basefile" )
5352parser .add_argument ("-c" , dest = "configFile" , nargs = 1 ,
5453 help = "Provide the full path to your configuration file. \
55- If none is provided, the program will look in the current \
56- working directory and then in ~/.config/4Pipe4rc (in this\
57- order) for one. If none is found the program will stop\n " ,
58- metavar = "configfile" )
54+ If none is provided, the program will look in the current working directory \
55+ and then in ~/.config/4Pipe4rc (in this order) for one. If none is found the \
56+ program will stop\n " , metavar = "configfile" )
5957parser .add_argument ("-s" , dest = "run_list" , nargs = "?" ,
6058 default = "1 2 3 4 5 6 7 8 9" , help = "Specify the numbers \
61- corresponding to the pipeline steps that will be run. The \
62- string after -s must be given inside quotation marks, and \
63- numbers can be joined together or separated by any \
64- symbol. The numbers are the pipeline steps that should be \
65- run. This is an optional argument and it's omission will \
66- run all steps by default. The numbers, from 1 to 9 \
67- represent the following steps:\n \t 1 - SFF extraction\n \t 2 \
68- - SeqClean\n \t 3 - Mira\n \t 4 - DiscoveryTCS\n \t 5 - \
69- SNP grabber\n \t 6 - ORF finder\n \t 7 - Blast2go\n \t 8 - SSR \
70- finder\n \t 9 - 7zip the report" )
59+ corresponding to the pipeline steps that will be run. The string after -s \
60+ must be given inside quotation marks, and numbers can be joined together or \
61+ separated by any symbol. The numbers are the pipeline steps that should be \
62+ run. This is an optional argument and it's omission will run all steps by \
63+ default. The numbers, from 1 to 9 represent the following steps:\n \t 1 - SFF \
64+ extraction\n \t 2 - SeqClean\n \t 3 - Mira\n \t 4 - DiscoveryTCS\n \t 5 - \
65+ SNP grabber\n \t 6 - ORF finder\n \t 7 - Blast2go\n \t 8 - SSR finder\n \t 9 - 7zip \
66+ the report" )
7167arg = parser .parse_args ()
7268
7369
@@ -143,53 +139,44 @@ def RunProgram(cli, requires_output):
143139 time .sleep (5 )
144140
145141
146- def SffExtract (sff , clip ):
147- '''Function for using the sff_extract program. The function returns the
148- 'clip' value recommended by sff_extract. If run sequentially, the
149- recommendations should be added.'''
150- cli = [config .get ('Program paths' , 'sff_extract_path' ), '-c' ,
151- '--min_left_clip=' + str (clip ),
152- '--min_frequency=' + config .get ('Variables' , 'max_equality' ), '-o' ,
153- basefile , "-A" , sff ]
154- print ("\n Running sff_extract using the following command:" )
155- print (' ' .join (cli ))
156- sff_extract_stdout = RunProgram (cli , 1 )
157- if len (sff_extract_stdout ) == 20 :
158- for lines in sff_extract_stdout :
159- if "Probably" in str (lines ):
160- warning = str (lines )
161- number = ''
162- for letters in warning :
163- if letters .isdigit ():
164- number = number + letters
165- return number
166- else :
167- print ("The found value seems acceptable. If this message is displayed \
168- twice in a row you have found your min_left_clip.\n " )
169- return "OK"
170-
171-
172- def MinClip (basefile ):
173- '''Function for calling the SffExtract function and adding 1 to the 'clip'
174- value until the ideal value is found. Depends on SffExtract.'''
175- OK = 0
176- clip = 0
177- while OK < 2 :
178- turner = SffExtract (sff , clip )
179- if turner == "OK" :
180- OK = OK + 1
181- clip = clip + 1
182- if OK == 2 :
183- clip = clip - 2
142+ def SffExtraction (sff , basefile ):
143+ '''Function for using the sff_extractor module. It will look for an "ideal"
144+ clipping value using multiple runs before outputting the final files.'''
145+ clip_found = 0
146+
147+ # Sff_extractor parameters:
148+ sff_config = {}
149+ sff_config ["append" ] = False
150+ sff_config ["qual_fname" ] = basefile + ".fasta.qual"
151+ sff_config ["want_fastq" ] = False
152+ sff_config ["min_leftclip" ] = 0
153+ sff_config ["min_freq" ] = int (config .get ('Variables' , 'max_equality' ))
154+ sff_config ["xml_info" ] = None
155+ sff_config ["want_fr" ] = False
156+ sff_config ["pelinker_fname" ] = ""
157+ sff_config ["mix_case" ] = True
158+ sff_config ["clip" ] = True
159+ sff_config ["xml_fname" ] = basefile + ".xml"
160+ sff_config ["basename" ] = basefile
161+ sff_config ["seq_fname" ] = basefile + ".fasta"
162+
163+ while clip_found < 2 :
164+ extra_clip = sff_extractor .extract_reads_from_sff (sff_config , [sff ])
165+ sff_config ["min_leftclip" ] += extra_clip
166+ if extra_clip == 0 :
167+ clip_found += 1
184168 else :
185- OK = 0
186- clip = clip + int (turner )
187- print ("Sff_extract finished with a min_left_clip=" + str (clip ) + ".\n " )
169+ clip_found = 0
170+
171+ print ("Sff_extractor finished with a min_left_clip=" +
172+ str (sff_config ["min_leftclip" ]) + ".\n " )
173+
174+ return
188175
189176
190177def SeqClean (basefile ):
191178 '''Function for using seqclean and clean2qual.'''
192- #seqclean
179+ # seqclean
193180 cli = [config .get ('Program paths' , 'seqclean_path' ),
194181 basefile + '.fasta' , '-r' , basefile + '.clean.rpt' , '-l' ,
195182 config .get ('Variables' , 'min_len' ), '-o' ,
@@ -199,7 +186,7 @@ def SeqClean(basefile):
199186 print ("\n Running Seqclean using the following command:" )
200187 print (' ' .join (cli ))
201188 RunProgram (cli , 0 )
202- #cln2qual
189+ # cln2qual
203190 cli = [config .get ('Program paths' , 'cln2qual_path' ),
204191 basefile + '.clean.rpt' , basefile + '.fasta.qual' ]
205192 print ("\n Running cln2qual using the following command:" )
@@ -222,22 +209,34 @@ def MiraRun(basefile):
222209 manifest .write ("data = " + basename + ".clean.fasta\n " )
223210 manifest .close ()
224211
212+ # Run mira
225213 cli = [config .get ('Program paths' , 'mira_path' ), basefile + ".manifest" ]
226214
227215 print ("\n Running Mira using the following command:" )
228216 print (' ' .join (cli ))
229217 RunProgram (cli , 0 )
230218
219+ # Convert the MAF output to SAM output
220+ cli = [config .get ('Program paths' , 'mira_path' ) + "convert" , "-f" , "maf" ,
221+ "-t" , "sam" , basefile + '_assembly/' + miraproject + '_d_results/' +
222+ miraproject + '_out.maf' , basefile + ".sam" ]
223+
224+ print ("\n Converting MAF to SAM using miraconvert:" )
225+ print (' ' .join (cli ))
226+ RunProgram (cli , 0 )
227+
231228
232229def DiscoveryTCS (basefile ):
233230 '''Discovers SNPs in the TCS output file of Mira. Use only if trying to
234231 find SNPs. Output in TCS format.'''
235232 os .chdir (os .path .split (basefile )[0 ])
236233 print ("\n Running SNP Discovery tool module..." )
237- CAF_to_TCS .RunModule (basefile + '_assembly/' + miraproject
238- + '_d_results/' + miraproject + '_out.caf' )
239- TCS .RunModule (basefile + '_assembly/' + miraproject + '_d_results/'
240- + miraproject + '_out.tcs' , basefile + '_out.short.tcs' ,
234+ SAM_to_BAM .RunModule (basefile + '.sam' ,
235+ basefile + '.bam' )
236+ BAM_to_TCS .RunModule (basefile + '.bam' , basefile + '_assembly/' +
237+ miraproject + '_d_results/' + miraproject +
238+ '_out.padded.fasta' )
239+ TCS .RunModule (basefile + '.tcs' , basefile + '_out.short.tcs' ,
241240 int (config .get ('Variables' , 'minqual' )),
242241 int (config .get ('Variables' , 'mincov' )))
243242
@@ -264,10 +263,10 @@ def ORFliner(basefile):
264263 print ("\n Running EMBOSS 'getorf' using the following command:" )
265264 print (' ' .join (cli ))
266265 RunProgram (cli , 0 )
267- #After this we go to ORFmaker.py:
266+ # After this we go to ORFmaker.py:
268267 print ("\n Running ORFmaker module..." )
269268 ORFmaker .RunModule (basefile + '.allORFs.fasta' )
270- #Next we BLAST the resulting ORFs against the local 'nr' database:
269+ # Next we BLAST the resulting ORFs against the local 'nr' database:
271270 if config .get ('Program paths' , 'BLAST_path' ).endswith ('blast2' ):
272271 cli = [config .get ('Program paths' , 'BLAST_path' ), '-p' , 'blastx' , '-d' ,
273272 config .get ('Program paths' , 'BLASTdb_path' ), '-i' ,
@@ -283,7 +282,7 @@ def ORFliner(basefile):
283282 print ("\n Running NCBI 'blastx' using the following command:" )
284283 print (' ' .join (cli ))
285284 RunProgram (cli , 0 )
286- #Then we write the metrics report:
285+ # Then we write the metrics report:
287286 print ("\n Running the metrics calculator module..." )
288287 seqclean_log_path = "%s/seqcl_%s.fasta.log" % (os .path .split (basefile )[0 ],
289288 miraproject )
@@ -294,7 +293,7 @@ def ORFliner(basefile):
294293 + miraproject + '_info_assembly.txt' , basefile
295294 + '.SNPs.fasta' , basefile + '.BestORF.fasta' ,
296295 basefile + '.Metrics.html' )
297- #Finally we write down our report using the data gathered so far:
296+ # Finally we write down our report using the data gathered so far:
298297 print ("\n Running Reporter module..." )
299298 Reporter .RunModule (basefile + '.BestORF.fasta' , basefile + '.SNPs.fasta' ,
300299 basefile + '.ORFblast.html' , basefile + '.Report.html' ,
@@ -321,7 +320,7 @@ def B2G(basefile):
321320 print ("\n Running NCBI 'blastx' using the following command:" )
322321 print (' ' .join (cli ))
323322 RunProgram (cli , 0 )
324- #After 'blasting' we run b2g4pipe:
323+ # After 'blasting' we run b2g4pipe:
325324 if os .path .isfile (config .get ('Program paths' , 'Blast2go_path' )):
326325 cli = ['java' , '-jar' , config .get ('Program paths' , 'Blast2go_path' ),
327326 '-in' , basefile + '.shortlistblast.xml' , '-prop' ,
@@ -385,7 +384,7 @@ def TidyUP(basefile):
385384 print (basefile + '.Metrics.html does not exist' )
386385 shutil .copy (config .get ('Program paths' , 'Templates_path' ) +
387386 '/Report.html' , 'Report/Report.html' )
388- #7zip it
387+ # 7zip it
389388 cli = [config .get ('Program paths' , '7z_path' ), 'a' , '-y' , '-bd' , basefile
390389 + '.report.7z' , 'Report' ]
391390 print ("\n 7ziping the Report folder using the following command:" )
@@ -397,7 +396,7 @@ def RunMe(arguments):
397396 '''Function to parse which parts of 4Pipe4 will run.'''
398397 for option , number in zip (list (arguments ), range (len (arguments ))):
399398 if option == "1" :
400- MinClip ( basefile )
399+ SffExtraction ( sff , basefile )
401400 if option == "2" :
402401 SeqClean (basefile )
403402 if option == "3" :
0 commit comments