From 9782bec7cfdb3d20ab631d2d18fcf8732f184f02 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Thu, 1 Apr 2021 11:38:04 +0200 Subject: [PATCH 001/229] bug fix runSingle.py --- fdog/fDOGassembly.py | 4 +++- fdog/runSingle.py | 4 ++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index b802b26..f207516 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -597,6 +597,9 @@ def main(): if core_path == '': core_path = out + '/core_orthologs/' + print(assemblyDir) + + # user input has to be checked here before fDOGassembly continues @@ -725,7 +728,6 @@ def main(): return 1 ################## checking accepted genes for co-orthologs ########################## - print(reciprocal_sequences) reciprocal_sequences = coorthologs(reciprocal_sequences, tmp_path, candidatesOutFile, fasta_path, fdog_ref_species, msaTool, matrix) diff --git a/fdog/runSingle.py b/fdog/runSingle.py index 34d7fc1..a0ded09 100644 --- a/fdog/runSingle.py +++ b/fdog/runSingle.py @@ -437,8 +437,8 @@ def main(): assemblypath = cfg['assemblypath'] except: sys.exit('assemblypath not found in %s' % pathFile) - if assembly == True: - searchpath = assemblypath + if assembly == True: + searchpath = assemblypath ### check input arguments seqFile, hmmpath, blastpath, searchpath, weightpath = checkInput([fdogPath, seqFile, refspec, outpath, hmmpath, blastpath, searchpath, weightpath]) From e56d87ac8f9b7dc5240ea9e6e090ca303648fdd1 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Thu, 8 Apr 2021 10:04:04 +0200 Subject: [PATCH 002/229] cleaning output --- fdog/fDOGassembly.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index f207516..515ddfe 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -60,7 +60,7 @@ def parse_blast(line, blast_results): #print(line) line = line.replace("\n", "") line_info = line.split("\t") - #print(line_info) + print(line_info) evalue = 
float(line_info[3]) #cut off @@ -598,7 +598,7 @@ def main(): core_path = out + '/core_orthologs/' print(assemblyDir) - + # user input has to be checked here before fDOGassembly continues @@ -620,7 +620,7 @@ def main(): ###################### create tmp folder ################################### - os.system('mkdir ' + out + '/tmp') + os.system('mkdir ' + out + '/tmp' + '>/dev/null 2>&1') ######################## consensus sequence ################################ @@ -659,7 +659,7 @@ def main(): searchBool = True ################### path definitions ################################### - os.system('mkdir ' + out + '/tmp/' + asName) + os.system('mkdir ' + out + '/tmp/' + asName + '>/dev/null 2>&1') tmp_path = out + "/tmp/" + asName + "/" candidatesOutFile = tmp_path + group + ".candidates.fa" if searchTaxon != '': @@ -740,7 +740,7 @@ def main(): if searchTaxon != '' and fasoff == False: fas_seed_id = createFasInput(orthologsOutFile, mappingFile) # bug in calcFAS when using --tsv, have to wait till it's fixed before I can use the option - os.system('mkdir ' + tmp_path + 'anno_dir') + os.system('mkdir ' + tmp_path + 'anno_dir' + '>/dev/null 2>&1') os.system('calcFAS --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group + '_' + asName ) From 766c89d34b02723403bd4a03296f30785d6c4feb Mon Sep 17 00:00:00 2001 From: mueli94 Date: Thu, 8 Apr 2021 10:56:26 +0200 Subject: [PATCH 003/229] testing --- fdog/fDOGassembly.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 515ddfe..d06e2bc 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -31,6 +31,11 @@ def merge(blast_results, insert_length): i = 1 while i < size_list-1: + a = locations[j][0] + b = locations[i][0] + c = locations[j][1] + d = locations[j][5] + e = 
locations[i][5] if ((locations[j][0] < locations[i][0]) and (locations[j][1] > locations[i][0]) and (locations[j][5] == locations[i][5])): #merge overlapping regions locations[j][1] = max(locations[j][1], locations[i][1]) @@ -60,7 +65,7 @@ def parse_blast(line, blast_results): #print(line) line = line.replace("\n", "") line_info = line.split("\t") - print(line_info) + #print(line_info) evalue = float(line_info[3]) #cut off @@ -597,7 +602,7 @@ def main(): if core_path == '': core_path = out + '/core_orthologs/' - print(assemblyDir) + #print(assemblyDir) From 48e41540d6ba403d974219a54c1563436ac54661 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Thu, 8 Apr 2021 11:00:31 +0200 Subject: [PATCH 004/229] testing --- fdog/fDOGassembly.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index d06e2bc..c317d8a 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -27,7 +27,7 @@ def merge(blast_results, insert_length): j = 0 - while j < size_list-1: + while j < size_list-2: i = 1 while i < size_list-1: From 47f45d61f2875f61822e12f310e4b07d5eec20df Mon Sep 17 00:00:00 2001 From: mueli94 Date: Thu, 8 Apr 2021 11:19:14 +0200 Subject: [PATCH 005/229] testing --- fdog/fDOGassembly.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index c317d8a..be7edaf 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -599,6 +599,9 @@ def main(): assemblyDir = dataPath + '/assembly_dir/' if out == '': out = os.getcwd() + else: + if not os.path.exists(out + '/group'): + os.system('mkdir ' + out + '/group') if core_path == '': core_path = out + '/core_orthologs/' From fe44e0bf0458909febf5e5c9bec2fecd85c5f7ee Mon Sep 17 00:00:00 2001 From: mueli94 Date: Thu, 8 Apr 2021 11:21:15 +0200 Subject: [PATCH 006/229] testing --- fdog/fDOGassembly.py | 1 + 1 file changed, 1 insertion(+) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index be7edaf..98e6480 100644 --- 
a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -599,6 +599,7 @@ def main(): assemblyDir = dataPath + '/assembly_dir/' if out == '': out = os.getcwd() + os.system('mkdir ' + out + '/group') else: if not os.path.exists(out + '/group'): os.system('mkdir ' + out + '/group') From 34e87cac0ca8f4b4c24ef807223c2b7cecaa0dbc Mon Sep 17 00:00:00 2001 From: mueli94 Date: Thu, 8 Apr 2021 11:28:57 +0200 Subject: [PATCH 007/229] testing --- fdog/fDOGassembly.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 98e6480..e4434bc 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -599,10 +599,12 @@ def main(): assemblyDir = dataPath + '/assembly_dir/' if out == '': out = os.getcwd() - os.system('mkdir ' + out + '/group') + os.system('mkdir ' + out + '/' + group) + out = out + '/' + group else: - if not os.path.exists(out + '/group'): - os.system('mkdir ' + out + '/group') + if not os.path.exists(out + '/' + group): + os.system('mkdir ' + out + '/' + group) + out = out + '/' + group if core_path == '': core_path = out + '/core_orthologs/' From 32bce0eb9c9d9e1193ea2a668240fbab0f5be18d Mon Sep 17 00:00:00 2001 From: mueli94 Date: Thu, 8 Apr 2021 11:32:12 +0200 Subject: [PATCH 008/229] testing --- fdog/fDOGassembly.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index e4434bc..ae77ac3 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -601,10 +601,6 @@ def main(): out = os.getcwd() os.system('mkdir ' + out + '/' + group) out = out + '/' + group - else: - if not os.path.exists(out + '/' + group): - os.system('mkdir ' + out + '/' + group) - out = out + '/' + group if core_path == '': core_path = out + '/core_orthologs/' From a8362e35e18c6f298227cd658b4a33ed5d6b3e8a Mon Sep 17 00:00:00 2001 From: mueli94 Date: Thu, 8 Apr 2021 11:38:23 +0200 Subject: [PATCH 009/229] testing --- fdog/fDOGassembly.py | 21 +++++++++++---------- 1 file 
changed, 11 insertions(+), 10 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index ae77ac3..d476d7a 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -566,18 +566,8 @@ def main(): # print(out + "fdog.log \n") # sys.stdout = Logger(out) - try: - f = open(out + "fdog.log", "a+") - except FileNotFoundError: - f = open(out + "fdog.log", "w") - if silent == True: - sys.stderr = f - sys.stdout = f - else: - sys.stdout = Logger(f) - #checking paths if dataPath == '': @@ -605,6 +595,17 @@ def main(): core_path = out + '/core_orthologs/' #print(assemblyDir) + try: + f = open(out + "fdog.log", "a+") + except FileNotFoundError: + f = open(out + "fdog.log", "w") + + + if silent == True: + sys.stderr = f + sys.stdout = f + else: + sys.stdout = Logger(f) From 0458c252acb2c4077c551dcb3ddf361494617251 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Thu, 8 Apr 2021 11:42:57 +0200 Subject: [PATCH 010/229] testing --- fdog/fDOGassembly.py | 1 + 1 file changed, 1 insertion(+) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index d476d7a..f54a654 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -594,6 +594,7 @@ def main(): if core_path == '': core_path = out + '/core_orthologs/' + print(out) #print(assemblyDir) try: f = open(out + "fdog.log", "a+") From 1b07c9017814ece00a64adad8a97aac00e1ec89a Mon Sep 17 00:00:00 2001 From: mueli94 Date: Thu, 8 Apr 2021 11:54:09 +0200 Subject: [PATCH 011/229] testing --- fdog/fDOGassembly.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index f54a654..d0b8610 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -591,10 +591,12 @@ def main(): out = os.getcwd() os.system('mkdir ' + out + '/' + group) out = out + '/' + group + if core_path == '': core_path = out + '/core_orthologs/' print(out) + print("test " + group + "\n" ) #print(assemblyDir) try: f = open(out + "fdog.log", "a+") From 2d3f8dda146d10082186dc1dce395c87f0949505 Mon Sep 17 
00:00:00 2001 From: mueli94 Date: Thu, 8 Apr 2021 11:56:36 +0200 Subject: [PATCH 012/229] testing --- fdog/fDOGassembly.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index d0b8610..a00876c 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -599,9 +599,9 @@ def main(): print("test " + group + "\n" ) #print(assemblyDir) try: - f = open(out + "fdog.log", "a+") + f = open(out + "/fdog.log", "a+") except FileNotFoundError: - f = open(out + "fdog.log", "w") + f = open(out + "/fdog.log", "w") if silent == True: From afec218d459c6cc181dd80a7eebe7e41a74754a9 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Thu, 8 Apr 2021 11:59:34 +0200 Subject: [PATCH 013/229] testing --- fdog/fDOGassembly.py | 1 + 1 file changed, 1 insertion(+) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index a00876c..33def84 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -588,6 +588,7 @@ def main(): if assemblyDir == '': assemblyDir = dataPath + '/assembly_dir/' if out == '': + print('test out \n') out = os.getcwd() os.system('mkdir ' + out + '/' + group) out = out + '/' + group From 9983e001ab8704188bf74168faf56b0e74a76def Mon Sep 17 00:00:00 2001 From: mueli94 Date: Thu, 8 Apr 2021 12:05:45 +0200 Subject: [PATCH 014/229] testing --- fdog/fDOGassembly.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 33def84..87749bf 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -530,7 +530,7 @@ def main(): assemblyDir = args.assemblyPath dataPath = args.dataPath core_path = args.coregroupPath - out = args.out + "/" + out = args.out pathFile = args.pathFile #I/O tmp = args.tmp @@ -591,7 +591,7 @@ def main(): print('test out \n') out = os.getcwd() os.system('mkdir ' + out + '/' + group) - out = out + '/' + group + out = out + '/' + group + '/' if core_path == '': core_path = out + '/core_orthologs/' From 
4cca757f6fec7ffbe309996c5b6a8bc98a48a866 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Fri, 9 Apr 2021 13:18:56 +0200 Subject: [PATCH 015/229] bug fix if augustus can't identify a gene at a candidate region --- fdog/fDOGassembly.py | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 87749bf..03af975 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -146,16 +146,21 @@ def augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, aug cmd = "getAnnoFasta.pl --seqfile=" + tmp_path + key + ".fasta " + tmp_path + name + ".gff" result = subprocess.run(cmd, stderr = subprocess.PIPE, shell=True) - sequence_file = open(tmp_path + name + ".aa", "r") - lines = sequence_file.readlines() - for line in lines: - if line[0] == ">": - id = line.replace(">", "") - header = ">" + group + "|" + ass_name + "|" + name + "_" + id - output.write(header) - else: - output.write(line) - sequence_file.close() + try: + sequence_file = open(tmp_path + name + ".aa", "r") + lines = sequence_file.readlines() + for line in lines: + if line[0] == ">": + id = line.replace(">", "") + header = ">" + group + "|" + ass_name + "|" + name + "_" + id + output.write(header) + else: + output.write(line) + sequence_file.close() + except FileNotFoundError: + print("No gene found by ID:" + name +" , continuing with next region") + + output.close() From d9bb72dcd0e1e359417d36edbc69de201aa29da6 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Fri, 9 Apr 2021 14:07:44 +0200 Subject: [PATCH 016/229] testing --- fdog/fDOGassembly.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 03af975..8aa5f74 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -589,6 +589,8 @@ def main(): dataPath = cfg['dataPath'] except: dataPath = 'config' + if core_path == '': + core_path = out + '/core_orthologs/' if assemblyDir == '': 
assemblyDir = dataPath + '/assembly_dir/' @@ -598,8 +600,7 @@ def main(): os.system('mkdir ' + out + '/' + group) out = out + '/' + group + '/' - if core_path == '': - core_path = out + '/core_orthologs/' + print(out) print("test " + group + "\n" ) @@ -659,9 +660,11 @@ def main(): else: print("Building block profiles failed. Using prepareAlign to convert alignment\n") new_path = core_path + group +"/"+ group + "_new.aln" + print(cmd) cmd = 'prepareAlign < ' + msa_path + ' > ' + new_path result = subprocess.run(cmd, stderr = subprocess.PIPE, shell=True) cmd = 'msa2prfl.pl ' + new_path + ' --setname=' + group + ' >' + profile_path + print(cmd) result = subprocess.run(cmd, stderr = subprocess.PIPE, shell=True) print("block profile is finished \n") From ddec3f0909fb9695c90569b674084b4826a7aa9c Mon Sep 17 00:00:00 2001 From: mueli94 Date: Fri, 9 Apr 2021 14:13:53 +0200 Subject: [PATCH 017/229] bug fix --- fdog/fDOGassembly.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 8aa5f74..e309f33 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -659,7 +659,7 @@ def main(): print("block profile is finished \n") else: print("Building block profiles failed. 
Using prepareAlign to convert alignment\n") - new_path = core_path + group +"/"+ group + "_new.aln" + new_path = core_path + "/" + group +"/"+ group + "_new.aln" print(cmd) cmd = 'prepareAlign < ' + msa_path + ' > ' + new_path result = subprocess.run(cmd, stderr = subprocess.PIPE, shell=True) From 13aea2d3c2233ba9b32e857275d7e39d6574a2a0 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Fri, 9 Apr 2021 14:16:14 +0200 Subject: [PATCH 018/229] bug fix --- fdog/fDOGassembly.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index e309f33..1691ac9 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -591,6 +591,9 @@ def main(): dataPath = 'config' if core_path == '': core_path = out + '/core_orthologs/' + else: + if not core_path.endswith('/'): + core_path = core_path + '/' if assemblyDir == '': assemblyDir = dataPath + '/assembly_dir/' @@ -660,11 +663,11 @@ def main(): else: print("Building block profiles failed. Using prepareAlign to convert alignment\n") new_path = core_path + "/" + group +"/"+ group + "_new.aln" - print(cmd) + #print(cmd) cmd = 'prepareAlign < ' + msa_path + ' > ' + new_path result = subprocess.run(cmd, stderr = subprocess.PIPE, shell=True) cmd = 'msa2prfl.pl ' + new_path + ' --setname=' + group + ' >' + profile_path - print(cmd) + #print(cmd) result = subprocess.run(cmd, stderr = subprocess.PIPE, shell=True) print("block profile is finished \n") From 89a8843fd1c80f2fddb690f5f0505dbf6f8293ba Mon Sep 17 00:00:00 2001 From: mueli94 Date: Fri, 9 Apr 2021 14:17:13 +0200 Subject: [PATCH 019/229] cleaning up --- fdog/fDOGassembly.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 1691ac9..6ba8aa6 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -662,7 +662,7 @@ def main(): print("block profile is finished \n") else: print("Building block profiles failed. 
Using prepareAlign to convert alignment\n") - new_path = core_path + "/" + group +"/"+ group + "_new.aln" + new_path = core_path + group +"/"+ group + "_new.aln" #print(cmd) cmd = 'prepareAlign < ' + msa_path + ' > ' + new_path result = subprocess.run(cmd, stderr = subprocess.PIPE, shell=True) From 116acad39a7af8c56941b47a55fa96285ca7a132 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Mon, 12 Apr 2021 09:50:21 +0200 Subject: [PATCH 020/229] testing --- fdog/fDOGassembly.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 6ba8aa6..27dc85b 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -428,6 +428,7 @@ def checkOptions(): def coorthologs(candidate_names, tmp_path, candidatesFile, fasta, fdog_ref_species, msaTool, matrix): candidates = readFasta(candidatesFile) ref = readFasta(fasta) + print(candidate_name) out = tmp_path + '/checkCoorthologs.fa' f = open(out,"w") @@ -441,8 +442,11 @@ def coorthologs(candidate_names, tmp_path, candidatesFile, fasta, fdog_ref_speci f.write(str(record.seq) + "\n") break + for record in candidates: + print(record.id + "ID\n") for name in candidate_names: + print(name + "name\n") if name in record.id: f.write(">" + name + "\n") f.write(str(record.seq) + "\n") @@ -604,9 +608,6 @@ def main(): out = out + '/' + group + '/' - - print(out) - print("test " + group + "\n" ) #print(assemblyDir) try: f = open(out + "/fdog.log", "a+") From 0078ee440f5e933bb81c6fb4eb12b88e788b05e0 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Mon, 12 Apr 2021 10:03:56 +0200 Subject: [PATCH 021/229] testing --- fdog/fDOGassembly.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 27dc85b..3d7a243 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -428,7 +428,7 @@ def checkOptions(): def coorthologs(candidate_names, tmp_path, candidatesFile, fasta, fdog_ref_species, msaTool, matrix): candidates = 
readFasta(candidatesFile) ref = readFasta(fasta) - print(candidate_name) + print(candidate_names) out = tmp_path + '/checkCoorthologs.fa' f = open(out,"w") From c03e59dab4caf920874263fe4c6bc78ba4b36c25 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Mon, 12 Apr 2021 10:12:08 +0200 Subject: [PATCH 022/229] testing --- fdog/fDOGassembly.py | 1 + 1 file changed, 1 insertion(+) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 3d7a243..f4034b6 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -376,6 +376,7 @@ def backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, eva #print(orthologs) + orthologs = set(orthologs) return list(orthologs), seed def addSequences(sequenceIds, candidate_fasta, core_fasta, output, name, species_list, refBool, tmp_path): From 366a4ab858870057f7df27f4bfc2ad99134932eb Mon Sep 17 00:00:00 2001 From: mueli94 Date: Mon, 12 Apr 2021 12:18:15 +0200 Subject: [PATCH 023/229] testing --- fdog/fDOGassembly.py | 26 +++++++++----------------- 1 file changed, 9 insertions(+), 17 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index f4034b6..d751f53 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -22,7 +22,7 @@ def merge(blast_results, insert_length): locations = blast_results[key] locations = sorted(locations, key = lambda x: int(x[3])) #print("test") - #print(locations) + print(locations) size_list = len(locations) j = 0 @@ -59,23 +59,19 @@ def merge(blast_results, insert_length): #print(blast_results) return blast_results, number_regions -def parse_blast(line, blast_results): - # format blast line: - #fomrat dictionary: {node_name: [(,)]} - #print(line) +def parse_blast(line, blast_results, cutoff): + # format blast line: + #fomrat dictionary: {node_name: [(,,evalue, ,,)]} line = line.replace("\n", "") line_info = line.split("\t") - #print(line_info) evalue = float(line_info[3]) - #cut off - if evalue > 0.00001: + if evalue > cutoff: return blast_results, evalue #add 
region to dictionary else: node_name, sstart, send, qstart, qend = line_info[0], line_info[1], line_info[2], line_info[4], line_info[5] split = node_name.split("|") - # finding out on which strand tBLASTn founded a hit if sstart < send: strand = "+" @@ -83,7 +79,6 @@ def parse_blast(line, blast_results): sstart = line_info[2] send = line_info[1] strand = "-" - #creating a dictionary that inlcudes every tBLASTn that is better as the evalue cut-off of 0.00001 if len(split) > 1: node_name = split[1] @@ -96,7 +91,7 @@ def parse_blast(line, blast_results): return blast_results, evalue -def candidate_regions(intron_length, evalue, tmp_path): +def candidate_regions(intron_length, cutoff_evalue, tmp_path): ###################### extracting candidate regions ######################## # info about output blast http://www.metagenomics.wiki/tools/blast/blastn-output-format-6 blast_file = open(tmp_path + "/blast_results.out", "r") @@ -109,9 +104,9 @@ def candidate_regions(intron_length, evalue, tmp_path): if not line: break #parsing blast output - blast_results, evalue = parse_blast(line, blast_results) + blast_results, evalue = parse_blast(line, blast_results, cutoff_evalue) #evalue cut-off - if not evalue <= evalue: + if not evalue <= cutoff_evalue: break if blast_results == {}: return 0,0 @@ -429,7 +424,6 @@ def checkOptions(): def coorthologs(candidate_names, tmp_path, candidatesFile, fasta, fdog_ref_species, msaTool, matrix): candidates = readFasta(candidatesFile) ref = readFasta(fasta) - print(candidate_names) out = tmp_path + '/checkCoorthologs.fa' f = open(out,"w") @@ -445,9 +439,7 @@ def coorthologs(candidate_names, tmp_path, candidatesFile, fasta, fdog_ref_speci for record in candidates: - print(record.id + "ID\n") for name in candidate_names: - print(name + "name\n") if name in record.id: f.write(">" + name + "\n") f.write(str(record.seq) + "\n") @@ -603,7 +595,7 @@ def main(): if assemblyDir == '': assemblyDir = dataPath + '/assembly_dir/' if out == '': - print('test 
out \n') + #print('test out \n') out = os.getcwd() os.system('mkdir ' + out + '/' + group) out = out + '/' + group + '/' From 79f2b67802f76f5a3fbb003efbe9fd39f7db70df Mon Sep 17 00:00:00 2001 From: mueli94 Date: Mon, 12 Apr 2021 13:43:34 +0200 Subject: [PATCH 024/229] bug fix in merge function, regions in minus strand were not merged correctly --- fdog/fDOGassembly.py | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index d751f53..a3480a3 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -36,27 +36,42 @@ def merge(blast_results, insert_length): c = locations[j][1] d = locations[j][5] e = locations[i][5] - if ((locations[j][0] < locations[i][0]) and (locations[j][1] > locations[i][0]) and (locations[j][5] == locations[i][5])): + if ((locations[j][0] < locations[i][0]) and (locations[j][1] > locations[i][0]) and (locations[j][5] == locations[i][5]) and (locations[i][5] == '+')): #merge overlapping regions locations[j][1] = max(locations[j][1], locations[i][1]) locations[j][2] = min(locations[j][2], locations[i][2]) locations.pop(i) size_list -= 1 i -= 1 - elif ((locations[j][0] < locations[i][0]) and (locations[i][0] - locations[j][1] <= 2* insert_length) and (locations[j][5] == locations[i][5])): + elif ((locations[j][0] > locations[i][0]) and (locations[j][1] < locations[i][0]) and (locations[j][5] == locations[i][5]) and (locations[i][5] == '-')): + #merge overlapping regions + locations[j][1] = max(locations[j][1], locations[i][1]) + locations[j][2] = min(locations[j][2], locations[i][2]) + locations.pop(i) + size_list -= 1 + i -= 1 + elif ((locations[j][0] < locations[i][0]) and (locations[i][0] - locations[j][1] <= 2* insert_length) and (locations[j][5] == locations[i][5]) and (locations[i][5] == '+')): #print(j) locations[j][1] = max(locations[j][1], locations[i][1]) locations[j][2] = min(locations[j][2], locations[i][2]) locations.pop(i) size_list -= 1 i -=1 + 
elif ((locations[j][0] > locations[i][0]) and (locations[j][0] - locations[i][1] <= 2* insert_length) and (locations[j][5] == locations[i][5]) and (locations[i][5] == '-')): + #print(j) + locations[j][1] = max(locations[j][1], locations[i][1]) + locations[j][2] = min(locations[j][2], locations[i][2]) + locations.pop(i) + size_list -= 1 + i -=1 + i += 1 j += 1 number_regions += len(locations) blast_results[key] = locations - #print(blast_results) + print(blast_results) return blast_results, number_regions def parse_blast(line, blast_results, cutoff): From 5425cd138dc47933a0f019896e1fe5db337d5ad0 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Mon, 12 Apr 2021 14:09:51 +0200 Subject: [PATCH 025/229] testing --- fdog/fDOGassembly.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index a3480a3..9694c6d 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -45,7 +45,7 @@ def merge(blast_results, insert_length): i -= 1 elif ((locations[j][0] > locations[i][0]) and (locations[j][1] < locations[i][0]) and (locations[j][5] == locations[i][5]) and (locations[i][5] == '-')): #merge overlapping regions - locations[j][1] = max(locations[j][1], locations[i][1]) + locations[j][0] = min(locations[j][0], locations[i][0]) locations[j][2] = min(locations[j][2], locations[i][2]) locations.pop(i) size_list -= 1 From 174cc0c834c6ea1c9fb89b553dfed24e89570778 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Mon, 12 Apr 2021 14:10:11 +0200 Subject: [PATCH 026/229] testing --- fdog/fDOGassembly.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 9694c6d..be67237 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -59,7 +59,7 @@ def merge(blast_results, insert_length): i -=1 elif ((locations[j][0] > locations[i][0]) and (locations[j][0] - locations[i][1] <= 2* insert_length) and (locations[j][5] == locations[i][5]) and (locations[i][5] == '-')): 
#print(j) - locations[j][1] = max(locations[j][1], locations[i][1]) + locations[j][0] = min(locations[j][0], locations[i][0]) locations[j][2] = min(locations[j][2], locations[i][2]) locations.pop(i) size_list -= 1 From ccc3e4eb0d0aae6eedae7b61e0ab1761ebcf31a2 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Mon, 12 Apr 2021 14:28:53 +0200 Subject: [PATCH 027/229] testing --- fdog/fDOGassembly.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index be67237..7ae65c0 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -31,11 +31,6 @@ def merge(blast_results, insert_length): i = 1 while i < size_list-1: - a = locations[j][0] - b = locations[i][0] - c = locations[j][1] - d = locations[j][5] - e = locations[i][5] if ((locations[j][0] < locations[i][0]) and (locations[j][1] > locations[i][0]) and (locations[j][5] == locations[i][5]) and (locations[i][5] == '+')): #merge overlapping regions locations[j][1] = max(locations[j][1], locations[i][1]) @@ -79,6 +74,7 @@ def parse_blast(line, blast_results, cutoff): #fomrat dictionary: {node_name: [(,,evalue, ,,)]} line = line.replace("\n", "") line_info = line.split("\t") + print(line_info) evalue = float(line_info[3]) #cut off if evalue > cutoff: @@ -87,14 +83,14 @@ def parse_blast(line, blast_results, cutoff): else: node_name, sstart, send, qstart, qend = line_info[0], line_info[1], line_info[2], line_info[4], line_info[5] split = node_name.split("|") - # finding out on which strand tBLASTn founded a hit + # finding out on which strand tBLASTn found a hit if sstart < send: strand = "+" else: sstart = line_info[2] send = line_info[1] strand = "-" - #creating a dictionary that inlcudes every tBLASTn that is better as the evalue cut-off of 0.00001 + #creating a dictionary that inlcudes every tBLASTn that is better as the evalue cut-off if len(split) > 1: node_name = split[1] if node_name in blast_results: From e2cb392d31015b99f49cca38b68f7cfacb28e7a6 
Mon Sep 17 00:00:00 2001 From: mueli94 Date: Mon, 12 Apr 2021 14:35:26 +0200 Subject: [PATCH 028/229] testing --- fdog/fDOGassembly.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 7ae65c0..95fe32b 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -81,14 +81,14 @@ def parse_blast(line, blast_results, cutoff): return blast_results, evalue #add region to dictionary else: - node_name, sstart, send, qstart, qend = line_info[0], line_info[1], line_info[2], line_info[4], line_info[5] + node_name, sstart, send, qstart, qend = line_info[0], int(line_info[1]), int(line_info[2]), int(line_info[4]), int(line_info[5]) split = node_name.split("|") # finding out on which strand tBLASTn found a hit if sstart < send: strand = "+" else: - sstart = line_info[2] - send = line_info[1] + sstart = int(line_info[2]) + send = int(line_info[1]) strand = "-" #creating a dictionary that inlcudes every tBLASTn that is better as the evalue cut-off if len(split) > 1: From 6c9b25828e68d0a5dc79f7ed2dd28fcfb3d42aa4 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Mon, 12 Apr 2021 15:07:17 +0200 Subject: [PATCH 029/229] testing --- fdog/fDOGassembly.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 95fe32b..f8d6487 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -38,7 +38,7 @@ def merge(blast_results, insert_length): locations.pop(i) size_list -= 1 i -= 1 - elif ((locations[j][0] > locations[i][0]) and (locations[j][1] < locations[i][0]) and (locations[j][5] == locations[i][5]) and (locations[i][5] == '-')): + elif ((locations[j][1] > locations[i][1]) and (locations[j][0] < locations[i][1]) and (locations[j][5] == locations[i][5]) and (locations[i][5] == '-')): #merge overlapping regions locations[j][0] = min(locations[j][0], locations[i][0]) locations[j][2] = min(locations[j][2], locations[i][2]) @@ -52,7 +52,7 @@ def 
merge(blast_results, insert_length): locations.pop(i) size_list -= 1 i -=1 - elif ((locations[j][0] > locations[i][0]) and (locations[j][0] - locations[i][1] <= 2* insert_length) and (locations[j][5] == locations[i][5]) and (locations[i][5] == '-')): + elif ((locations[j][1] > locations[i][1]) and (locations[j][0] - locations[i][1] <= 2* insert_length) and (locations[j][5] == locations[i][5]) and (locations[i][5] == '-')): #print(j) locations[j][0] = min(locations[j][0], locations[i][0]) locations[j][2] = min(locations[j][2], locations[i][2]) @@ -74,7 +74,7 @@ def parse_blast(line, blast_results, cutoff): #fomrat dictionary: {node_name: [(,,evalue, ,,)]} line = line.replace("\n", "") line_info = line.split("\t") - print(line_info) + #print(line_info) evalue = float(line_info[3]) #cut off if evalue > cutoff: From b9c055ead8880df456dd1c5fc154bd79c0051f0b Mon Sep 17 00:00:00 2001 From: mueli94 Date: Mon, 12 Apr 2021 15:21:29 +0200 Subject: [PATCH 030/229] bug fix --- fdog/fDOGassembly.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index f8d6487..996bec6 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -27,7 +27,7 @@ def merge(blast_results, insert_length): j = 0 - while j < size_list-2: + while j < size_list-1: i = 1 while i < size_list-1: From 79df315ba23f40bf8205221880a062d81f48b8ed Mon Sep 17 00:00:00 2001 From: mueli94 Date: Mon, 12 Apr 2021 15:30:42 +0200 Subject: [PATCH 031/229] testing --- fdog/fDOGassembly.py | 1 + 1 file changed, 1 insertion(+) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 996bec6..f4da667 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -18,6 +18,7 @@ def load_config(config_file): def merge(blast_results, insert_length): number_regions = 0 + insert_length = int(insert_length) for key in blast_results: locations = blast_results[key] locations = sorted(locations, key = lambda x: int(x[3])) From 
0bc70a06235d836dd3c91ff98e2c16de16473364 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Mon, 12 Apr 2021 15:36:56 +0200 Subject: [PATCH 032/229] testing --- fdog/fDOGassembly.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index f4da667..02ff236 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -27,11 +27,9 @@ def merge(blast_results, insert_length): size_list = len(locations) j = 0 - while j < size_list-1: - i = 1 + i = j+1 while i < size_list-1: - if ((locations[j][0] < locations[i][0]) and (locations[j][1] > locations[i][0]) and (locations[j][5] == locations[i][5]) and (locations[i][5] == '+')): #merge overlapping regions locations[j][1] = max(locations[j][1], locations[i][1]) @@ -46,7 +44,7 @@ def merge(blast_results, insert_length): locations.pop(i) size_list -= 1 i -= 1 - elif ((locations[j][0] < locations[i][0]) and (locations[i][0] - locations[j][1] <= 2* insert_length) and (locations[j][5] == locations[i][5]) and (locations[i][5] == '+')): + elif ((locations[j][0] < locations[i][0]) and (locations[i][0] - locations[j][1] <= 2*insert_length) and (locations[j][5] == locations[i][5]) and (locations[i][5] == '+')): #print(j) locations[j][1] = max(locations[j][1], locations[i][1]) locations[j][2] = min(locations[j][2], locations[i][2]) From a31d5e9acf2fcac7d1d588af42edbc22e6219bdf Mon Sep 17 00:00:00 2001 From: mueli94 Date: Mon, 12 Apr 2021 15:49:20 +0200 Subject: [PATCH 033/229] testing --- fdog/fDOGassembly.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 02ff236..d4e0518 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -27,9 +27,12 @@ def merge(blast_results, insert_length): size_list = len(locations) j = 0 - while j < size_list-1: - i = j+1 + while j < size_list-2: + i = j + 1 while i < size_list-1: + print("Vergleich \n") + print(locations[j] + "\n") + print(locations[i] + "\n") 
if ((locations[j][0] < locations[i][0]) and (locations[j][1] > locations[i][0]) and (locations[j][5] == locations[i][5]) and (locations[i][5] == '+')): #merge overlapping regions locations[j][1] = max(locations[j][1], locations[i][1]) @@ -37,6 +40,7 @@ def merge(blast_results, insert_length): locations.pop(i) size_list -= 1 i -= 1 + print("M+") elif ((locations[j][1] > locations[i][1]) and (locations[j][0] < locations[i][1]) and (locations[j][5] == locations[i][5]) and (locations[i][5] == '-')): #merge overlapping regions locations[j][0] = min(locations[j][0], locations[i][0]) @@ -44,6 +48,7 @@ def merge(blast_results, insert_length): locations.pop(i) size_list -= 1 i -= 1 + print("M-") elif ((locations[j][0] < locations[i][0]) and (locations[i][0] - locations[j][1] <= 2*insert_length) and (locations[j][5] == locations[i][5]) and (locations[i][5] == '+')): #print(j) locations[j][1] = max(locations[j][1], locations[i][1]) @@ -51,6 +56,7 @@ def merge(blast_results, insert_length): locations.pop(i) size_list -= 1 i -=1 + print("Insert+") elif ((locations[j][1] > locations[i][1]) and (locations[j][0] - locations[i][1] <= 2* insert_length) and (locations[j][5] == locations[i][5]) and (locations[i][5] == '-')): #print(j) locations[j][0] = min(locations[j][0], locations[i][0]) @@ -58,6 +64,7 @@ def merge(blast_results, insert_length): locations.pop(i) size_list -= 1 i -=1 + print("Insert-") i += 1 j += 1 From 55137f49c3e4ba3986239084cbe002713257a888 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Mon, 12 Apr 2021 15:52:54 +0200 Subject: [PATCH 034/229] testing --- fdog/fDOGassembly.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index d4e0518..b3d78f9 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -31,8 +31,8 @@ def merge(blast_results, insert_length): i = j + 1 while i < size_list-1: print("Vergleich \n") - print(locations[j] + "\n") - print(locations[i] + "\n") + print(str(locations[j]) + 
"\n") + print(str(locations[i]) + "\n") if ((locations[j][0] < locations[i][0]) and (locations[j][1] > locations[i][0]) and (locations[j][5] == locations[i][5]) and (locations[i][5] == '+')): #merge overlapping regions locations[j][1] = max(locations[j][1], locations[i][1]) From ab85180e94e60515963a1190386c0c68ed39e771 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Mon, 12 Apr 2021 16:00:17 +0200 Subject: [PATCH 035/229] testing --- fdog/fDOGassembly.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index b3d78f9..f1e3771 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -27,9 +27,9 @@ def merge(blast_results, insert_length): size_list = len(locations) j = 0 - while j < size_list-2: + while j < size_list-1: i = j + 1 - while i < size_list-1: + while i < size_list: print("Vergleich \n") print(str(locations[j]) + "\n") print(str(locations[i]) + "\n") From f66f72c5638323cc7d22b6f73bea38ce20f6cf2b Mon Sep 17 00:00:00 2001 From: mueli94 Date: Tue, 13 Apr 2021 09:47:12 +0200 Subject: [PATCH 036/229] clean up --- fdog/fDOGassembly.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index f1e3771..3b499a1 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -23,16 +23,16 @@ def merge(blast_results, insert_length): locations = blast_results[key] locations = sorted(locations, key = lambda x: int(x[3])) #print("test") - print(locations) + #print(locations) size_list = len(locations) j = 0 while j < size_list-1: i = j + 1 while i < size_list: - print("Vergleich \n") - print(str(locations[j]) + "\n") - print(str(locations[i]) + "\n") + #print("Vergleich \n") + #print(str(locations[j]) + "\n") + #print(str(locations[i]) + "\n") if ((locations[j][0] < locations[i][0]) and (locations[j][1] > locations[i][0]) and (locations[j][5] == locations[i][5]) and (locations[i][5] == '+')): #merge overlapping regions 
locations[j][1] = max(locations[j][1], locations[i][1]) @@ -40,7 +40,7 @@ def merge(blast_results, insert_length): locations.pop(i) size_list -= 1 i -= 1 - print("M+") + #print("M+") elif ((locations[j][1] > locations[i][1]) and (locations[j][0] < locations[i][1]) and (locations[j][5] == locations[i][5]) and (locations[i][5] == '-')): #merge overlapping regions locations[j][0] = min(locations[j][0], locations[i][0]) @@ -48,7 +48,7 @@ def merge(blast_results, insert_length): locations.pop(i) size_list -= 1 i -= 1 - print("M-") + #print("M-") elif ((locations[j][0] < locations[i][0]) and (locations[i][0] - locations[j][1] <= 2*insert_length) and (locations[j][5] == locations[i][5]) and (locations[i][5] == '+')): #print(j) locations[j][1] = max(locations[j][1], locations[i][1]) @@ -56,7 +56,7 @@ def merge(blast_results, insert_length): locations.pop(i) size_list -= 1 i -=1 - print("Insert+") + #print("Insert+") elif ((locations[j][1] > locations[i][1]) and (locations[j][0] - locations[i][1] <= 2* insert_length) and (locations[j][5] == locations[i][5]) and (locations[i][5] == '-')): #print(j) locations[j][0] = min(locations[j][0], locations[i][0]) @@ -64,7 +64,7 @@ def merge(blast_results, insert_length): locations.pop(i) size_list -= 1 i -=1 - print("Insert-") + #print("Insert-") i += 1 j += 1 @@ -72,7 +72,7 @@ def merge(blast_results, insert_length): number_regions += len(locations) blast_results[key] = locations - print(blast_results) + #print(blast_results) return blast_results, number_regions def parse_blast(line, blast_results, cutoff): From f573dc4776fac4a9df2513191bcae389f365a9c1 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Thu, 15 Apr 2021 11:33:44 +0200 Subject: [PATCH 037/229] testing --- fdog/fDOGassembly.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 3b499a1..c64a244 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -77,7 +77,7 @@ def merge(blast_results, 
insert_length): def parse_blast(line, blast_results, cutoff): # format blast line: - #fomrat dictionary: {node_name: [(,,evalue, ,,)]} + # format dictionary: {node_name: [(,,evalue, ,,)]} line = line.replace("\n", "") line_info = line.split("\t") #print(line_info) @@ -123,7 +123,10 @@ def candidate_regions(intron_length, cutoff_evalue, tmp_path): #parsing blast output blast_results, evalue = parse_blast(line, blast_results, cutoff_evalue) #evalue cut-off + print(evalue + " evalue candidate region \n") + print(cutoff + " cutoff evalue \n") if not evalue <= cutoff_evalue: + print("break \n") break if blast_results == {}: return 0,0 From 4dad8869a6ee3d5d013d1bbe4586f161455f19c6 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Thu, 15 Apr 2021 11:41:24 +0200 Subject: [PATCH 038/229] testing --- fdog/fDOGassembly.py | 1 + 1 file changed, 1 insertion(+) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index c64a244..126decf 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -131,6 +131,7 @@ def candidate_regions(intron_length, cutoff_evalue, tmp_path): if blast_results == {}: return 0,0 else: + print(blast_results) candidate_regions, number_regions = merge(blast_results, intron_length) #candidate_regions, number_regions = merge_regions(blast_results, cut_off) #print(candidate_regions, number_regions) From ef9c17fda354bc4cd5b7954f6f93d4cadf360aba Mon Sep 17 00:00:00 2001 From: mueli94 Date: Thu, 15 Apr 2021 11:43:08 +0200 Subject: [PATCH 039/229] testing --- fdog/fDOGassembly.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 126decf..c69733a 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -123,8 +123,8 @@ def candidate_regions(intron_length, cutoff_evalue, tmp_path): #parsing blast output blast_results, evalue = parse_blast(line, blast_results, cutoff_evalue) #evalue cut-off - print(evalue + " evalue candidate region \n") - print(cutoff + " cutoff evalue \n") + 
print(str(evalue) + " evalue candidate region \n") + print(str(cutoff) + " cutoff evalue \n") if not evalue <= cutoff_evalue: print("break \n") break From e5b06e1d279195a08c6e94b79dafec192b0d82f4 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Thu, 15 Apr 2021 11:44:49 +0200 Subject: [PATCH 040/229] testing --- fdog/fDOGassembly.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index c69733a..1b1c5f7 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -124,7 +124,7 @@ def candidate_regions(intron_length, cutoff_evalue, tmp_path): blast_results, evalue = parse_blast(line, blast_results, cutoff_evalue) #evalue cut-off print(str(evalue) + " evalue candidate region \n") - print(str(cutoff) + " cutoff evalue \n") + print(str(cutoff_evalue) + " cutoff evalue \n") if not evalue <= cutoff_evalue: print("break \n") break From 7e0377db68470f2a2cdaefa308f1def70250fcbf Mon Sep 17 00:00:00 2001 From: mueli94 Date: Thu, 15 Apr 2021 12:44:50 +0200 Subject: [PATCH 041/229] bug fix --- fdog/fDOGassembly.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 1b1c5f7..0485db0 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -122,12 +122,7 @@ def candidate_regions(intron_length, cutoff_evalue, tmp_path): break #parsing blast output blast_results, evalue = parse_blast(line, blast_results, cutoff_evalue) - #evalue cut-off - print(str(evalue) + " evalue candidate region \n") - print(str(cutoff_evalue) + " cutoff evalue \n") - if not evalue <= cutoff_evalue: - print("break \n") - break + if blast_results == {}: return 0,0 else: @@ -731,7 +726,7 @@ def main(): #codon table argument [-db_gencode int_value], table available ftp://ftp.ncbi.nih.gov/entrez/misc/data/gc.prt print("tBLASTn search against data base") - os.system('tblastn -db ' + db_path + ' -query ' + consensus_path + ' -outfmt "6 sseqid sstart send evalue qstart qend " 
-out ' + tmp_path + '/blast_results.out') + os.system('tblastn -db ' + db_path + ' -query ' + consensus_path + ' -outfmt "6 sseqid sstart send evalue qstart qend " -evalue ' + evalue + ' -out ' + tmp_path + '/blast_results.out') print("tBLASTn search is finished") ################### search for candidate regions and extract seq ########### From 721cfffea9d3837bb49b1a52c91dc6d362f18474 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Thu, 15 Apr 2021 12:49:12 +0200 Subject: [PATCH 042/229] testing new tblastn call --- fdog/fDOGassembly.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 0485db0..8f47d98 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -726,7 +726,7 @@ def main(): #codon table argument [-db_gencode int_value], table available ftp://ftp.ncbi.nih.gov/entrez/misc/data/gc.prt print("tBLASTn search against data base") - os.system('tblastn -db ' + db_path + ' -query ' + consensus_path + ' -outfmt "6 sseqid sstart send evalue qstart qend " -evalue ' + evalue + ' -out ' + tmp_path + '/blast_results.out') + os.system('tblastn -db ' + db_path + ' -query ' + consensus_path + ' -outfmt "6 sseqid sstart send evalue qstart qend " -evalue ' + str(evalue) + ' -out ' + tmp_path + '/blast_results.out') print("tBLASTn search is finished") ################### search for candidate regions and extract seq ########### From 496bb1f8c1dd1b0b158d36c99b567faeae7e67ca Mon Sep 17 00:00:00 2001 From: mueli94 Date: Thu, 15 Apr 2021 12:55:54 +0200 Subject: [PATCH 043/229] testing --- fdog/fDOGassembly.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 8f47d98..b9ee3f4 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -126,9 +126,8 @@ def candidate_regions(intron_length, cutoff_evalue, tmp_path): if blast_results == {}: return 0,0 else: - print(blast_results) candidate_regions, number_regions = merge(blast_results, 
intron_length) - #candidate_regions, number_regions = merge_regions(blast_results, cut_off) + print(candidate_regions) #print(candidate_regions, number_regions) return candidate_regions, number_regions From 2cdc82d53fc5dab4a82ddd9e03fbccc0d003d399 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Fri, 16 Apr 2021 10:19:54 +0200 Subject: [PATCH 044/229] testing --- fdog/fDOGassembly.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index b9ee3f4..5d2f9e9 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -127,7 +127,7 @@ def candidate_regions(intron_length, cutoff_evalue, tmp_path): return 0,0 else: candidate_regions, number_regions = merge(blast_results, intron_length) - print(candidate_regions) + #print(candidate_regions) #print(candidate_regions, number_regions) return candidate_regions, number_regions @@ -750,7 +750,7 @@ def main(): ################# backward search to filter for orthologs################### reciprocal_sequences, taxa = backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, evalue, taxa, searchTool, checkCoorthologs, msaTool, matrix, dataPath, filter, tmp_path) - + print(reciprocal_sequences) if reciprocal_sequences == 0: print("No ortholog fulfilled the reciprocity criteria") if searchTaxon == '': @@ -761,7 +761,7 @@ def main(): ################## checking accepted genes for co-orthologs ########################## reciprocal_sequences = coorthologs(reciprocal_sequences, tmp_path, candidatesOutFile, fasta_path, fdog_ref_species, msaTool, matrix) - + print(reciprocal_sequences) ################ add sequences to extended.fa in the output folder########## From 3898d4ee8869332c76250593c2e2c391ad933c46 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Fri, 16 Apr 2021 10:27:00 +0200 Subject: [PATCH 045/229] testing --- fdog/fDOGassembly.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 5d2f9e9..842d67f 
100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -437,6 +437,9 @@ def checkOptions(): #muss ich unbedingt noch ergänzen wenn ich alle möglichen input Optionen implementiert habe!!! def coorthologs(candidate_names, tmp_path, candidatesFile, fasta, fdog_ref_species, msaTool, matrix): + if len(candidate_name) == 1: + return candidate_name + candidates = readFasta(candidatesFile) ref = readFasta(fasta) From e1fec1af78f1f59e43d4c4f1be83cbbfa67b661d Mon Sep 17 00:00:00 2001 From: mueli94 Date: Fri, 16 Apr 2021 10:28:40 +0200 Subject: [PATCH 046/229] testing --- fdog/fDOGassembly.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 842d67f..d31af58 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -437,8 +437,8 @@ def checkOptions(): #muss ich unbedingt noch ergänzen wenn ich alle möglichen input Optionen implementiert habe!!! def coorthologs(candidate_names, tmp_path, candidatesFile, fasta, fdog_ref_species, msaTool, matrix): - if len(candidate_name) == 1: - return candidate_name + if len(candidate_names) == 1: + return candidate_names candidates = readFasta(candidatesFile) ref = readFasta(fasta) From 65c1e1e0ae34b9bb948de5e2511cca1cc29f6781 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Sun, 18 Apr 2021 19:40:44 +0200 Subject: [PATCH 047/229] testing --- fdog/fDOGassembly.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index d31af58..80582bc 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -477,11 +477,11 @@ def coorthologs(candidate_names, tmp_path, candidatesFile, fasta, fdog_ref_speci for name in candidate_names: distance = distances[ref_id , name] - if distance < min_dist: + if distance <= min_dist: min_dist = distance min_name = name - checked = [] + for name in candidate_names: From 34b2ee591f76296e48dfe27bdd46d6e3d6e666fd Mon Sep 17 00:00:00 2001 From: mueli94 Date: Tue, 20 Apr 2021 
15:47:17 +0200 Subject: [PATCH 048/229] code clean up --- fdog/fDOGassembly.py | 120 ++++++++++++------------------------------- 1 file changed, 33 insertions(+), 87 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 80582bc..44e7607 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -17,6 +17,7 @@ def load_config(config_file): print(exc) def merge(blast_results, insert_length): + #merging overlapping and contigous candidate regions number_regions = 0 insert_length = int(insert_length) for key in blast_results: @@ -25,54 +26,44 @@ def merge(blast_results, insert_length): #print("test") #print(locations) size_list = len(locations) - j = 0 while j < size_list-1: i = j + 1 while i < size_list: - #print("Vergleich \n") - #print(str(locations[j]) + "\n") - #print(str(locations[i]) + "\n") if ((locations[j][0] < locations[i][0]) and (locations[j][1] > locations[i][0]) and (locations[j][5] == locations[i][5]) and (locations[i][5] == '+')): - #merge overlapping regions + #merge overlapping regions plus strand locations[j][1] = max(locations[j][1], locations[i][1]) locations[j][2] = min(locations[j][2], locations[i][2]) locations.pop(i) size_list -= 1 i -= 1 - #print("M+") elif ((locations[j][1] > locations[i][1]) and (locations[j][0] < locations[i][1]) and (locations[j][5] == locations[i][5]) and (locations[i][5] == '-')): - #merge overlapping regions + #merge overlapping regions minus strand locations[j][0] = min(locations[j][0], locations[i][0]) locations[j][2] = min(locations[j][2], locations[i][2]) locations.pop(i) size_list -= 1 i -= 1 - #print("M-") elif ((locations[j][0] < locations[i][0]) and (locations[i][0] - locations[j][1] <= 2*insert_length) and (locations[j][5] == locations[i][5]) and (locations[i][5] == '+')): - #print(j) + #merging consecutive regions, the distance between booth is not longer than a cutoff, plus strand locations[j][1] = max(locations[j][1], locations[i][1]) locations[j][2] = min(locations[j][2], 
locations[i][2]) locations.pop(i) size_list -= 1 i -=1 - #print("Insert+") elif ((locations[j][1] > locations[i][1]) and (locations[j][0] - locations[i][1] <= 2* insert_length) and (locations[j][5] == locations[i][5]) and (locations[i][5] == '-')): - #print(j) + #merging consecutive regions, the distance between booth is not longer than a cutoff, minus strand locations[j][0] = min(locations[j][0], locations[i][0]) locations[j][2] = min(locations[j][2], locations[i][2]) locations.pop(i) size_list -= 1 i -=1 - #print("Insert-") - i += 1 j += 1 number_regions += len(locations) blast_results[key] = locations - #print(blast_results) return blast_results, number_regions def parse_blast(line, blast_results, cutoff): @@ -80,7 +71,6 @@ def parse_blast(line, blast_results, cutoff): # format dictionary: {node_name: [(,,evalue, ,,)]} line = line.replace("\n", "") line_info = line.split("\t") - #print(line_info) evalue = float(line_info[3]) #cut off if evalue > cutoff: @@ -127,12 +117,11 @@ def candidate_regions(intron_length, cutoff_evalue, tmp_path): return 0,0 else: candidate_regions, number_regions = merge(blast_results, intron_length) - #print(candidate_regions) - #print(candidate_regions, number_regions) + return candidate_regions, number_regions def extract_seq(region_dic, path, tmp_path): - #print(region_dic) + for key in region_dic: #print("blastdbcmd -db " + path + " -dbtype 'nucl' -entry " + key + " -out tmp/" + key + ".fasta -outfmt %f") cmd = "blastdbcmd -db " + path + " -dbtype 'nucl' -entry " + key + " -out " + tmp_path + key + ".fasta -outfmt %f" @@ -145,17 +134,18 @@ def augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, aug locations = regions[key] counter = 0 for i in locations: + # some variables counter += 1 start = str(i[0] - length_extension) end = str(i[1] + length_extension) name = key + "_" + str(counter) - #print("augustus --proteinprofile=" + profile_path + " --predictionStart=" + start + " --predictionEnd=" + end + " 
--species=" + augustus_ref_species + " tmp/" + key + ".fasta > tmp/" + key + ".gff") - + # augutus call cmd = "augustus --protein=1 --proteinprofile=" + profile_path + " --predictionStart=" + start + " --predictionEnd=" + end + " --species=" + augustus_ref_species + " " + tmp_path + key + ".fasta > " + tmp_path + name + ".gff" - result = subprocess.run(cmd, stdout = subprocess.PIPE, shell=True) + result = subprocess.run(cmd, stdout = subprocess.PIPE, stderr = subprocess.PIPE, shell=True) + # transfer augustus output to as sequence cmd = "getAnnoFasta.pl --seqfile=" + tmp_path + key + ".fasta " + tmp_path + name + ".gff" result = subprocess.run(cmd, stderr = subprocess.PIPE, shell=True) - + # parsing header and sequences try: sequence_file = open(tmp_path + name + ".aa", "r") lines = sequence_file.readlines() @@ -168,20 +158,15 @@ def augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, aug output.write(line) sequence_file.close() except FileNotFoundError: - print("No gene found by ID:" + name +" , continuing with next region") - - - + print("No gene found in region with ID:" + name + " , continuing with next region") output.close() def searching_for_db(assembly_path): - #print("test: " + str(assembly_path) + "\n") + db_endings = ['.ndb', '.nhr', '.nin', '.nog', '.nos', '.not', '.nsq', '.ntf', '.nto'] check = True for end in db_endings: - #print(assembly_path + end + "\n") check = check and os.path.exists(assembly_path + end) - #print(check) return check def get_distance_biopython(file, matrix): @@ -240,8 +225,6 @@ def checkCoOrthologs(candidate_name, best_hit, ref, fdog_ref_species, candidates #print("mafft-linsi") os.system('mafft --maxiterate 1000 --localpair --anysymbol --quiet ' + output_file + ' > ' + aln_file) - #d_ref = get_distance(aln_file, best_hit, ref) - #d = get_distance(aln_file, best_hit, candidate_name) distances = get_distance_biopython(aln_file, matrix) distance_hit_query = distances[best_hit, candidate_name] @@ -390,9 +373,7 
@@ def backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, eva return list(orthologs), seed def addSequences(sequenceIds, candidate_fasta, core_fasta, output, name, species_list, refBool, tmp_path): - #print("addSequences") - #print(sequenceIds) - #print(species_list) + output_file = open(output, "a+") if refBool == False: seq_records_core = readFasta(core_fasta) @@ -406,8 +387,6 @@ def addSequences(sequenceIds, candidate_fasta, core_fasta, output, name, species seq_records_candidate = readFasta(candidate_fasta) seq_records_candidate = list(seq_records_candidate) for entry_candidate in seq_records_candidate: - #print(entry_candidate.id) - #print(sequenceIds) if entry_candidate.id in sequenceIds: output_file.write(">" + entry_candidate.id + "\n") output_file.write(str(entry_candidate.seq) + "\n") @@ -455,7 +434,6 @@ def coorthologs(candidate_names, tmp_path, candidatesFile, fasta, fdog_ref_speci f.write(str(record.seq) + "\n") break - for record in candidates: for name in candidate_names: if name in record.id: @@ -465,9 +443,7 @@ def coorthologs(candidate_names, tmp_path, candidatesFile, fasta, fdog_ref_speci if msaTool == "muscle": os.system("muscle -quiet -in " + out + " -out " + aln_file) - #print("muscle -quiet -in " + output_file + " -out " + aln_file) elif msaTool == "mafft-linsi": - #print("mafft-linsi") os.system('mafft --maxiterate 1000 --localpair --anysymbol --quiet ' + out + ' > ' + aln_file) distances = get_distance_biopython(aln_file, matrix) @@ -481,9 +457,6 @@ def coorthologs(candidate_names, tmp_path, candidatesFile, fasta, fdog_ref_speci min_dist = distance min_name = name - - - for name in candidate_names: if distances[min_name , name] < distances[min_name , ref_id]: checked.append(name) @@ -577,18 +550,6 @@ def main(): searchTaxon = args.searchTaxon silent = args.silent - ###################### How to handling std output ########################## - # if silent == True: - # print(out + "fdog.log \n") - # f = open(out + 
"fdog.log", "a+") - # sys.stdout = f - # else: - # print(out + "fdog.log \n") - # sys.stdout = Logger(out) - - - - #checking paths if dataPath == '': fdogPath = os.path.realpath(__file__).replace('/fDOGassembly.py','') @@ -618,31 +579,25 @@ def main(): os.system('mkdir ' + out + '/' + group) out = out + '/' + group + '/' - - #print(assemblyDir) try: f = open(out + "/fdog.log", "a+") except FileNotFoundError: f = open(out + "/fdog.log", "w") + ################## How to handle std output and std error ################## if silent == True: sys.stderr = f sys.stdout = f else: sys.stdout = Logger(f) - - - # user input has to be checked here before fDOGassembly continues - assembly_names = os.listdir(assemblyDir) - - ########################## some variables ################################## refBool = False # checks if sequences of reference species were already part of the extended.fa file + ########### paths ########### msa_path = core_path + "/" + group +"/"+ group + ".aln" @@ -658,18 +613,16 @@ def main(): ######################## consensus sequence ################################ #make a majority-rule consensus sequence with the tool hmmemit from hmmer - print("Building a consensus sequence \n") + print("Building a consensus sequence for gene " + group + " \n") os.system('hmmemit -c -o' + consensus_path + ' ' + hmm_path) print("consensus sequence is finished\n") ######################## block profile ##################################### - print("Building a block profile \n") + print("Building a block profile for gene " + group + " \n") cmd = 'msa2prfl.pl ' + msa_path + ' --setname=' + group + ' >' + profile_path - #os.system('msa2prfl.pl ' + msa_path + ' --setname=' + group + ' >' + profile_path) result = subprocess.run(cmd, stderr = subprocess.PIPE, shell=True) - #print(os.path.getsize(profile_path)) if int(os.path.getsize(profile_path)) > 0: print("block profile is finished \n") else: @@ -683,9 +636,9 @@ def main(): result = subprocess.run(cmd, stderr = 
subprocess.PIPE, shell=True) print("block profile is finished \n") - searchBool = False + #################### fDOG assembly computation for all species ############# for asName in assembly_names: if searchBool == True: break @@ -694,6 +647,7 @@ def main(): searchBool = True ################### path definitions ################################### + os.system('mkdir ' + out + '/tmp/' + asName + '>/dev/null 2>&1') tmp_path = out + "/tmp/" + asName + "/" candidatesOutFile = tmp_path + group + ".candidates.fa" @@ -706,16 +660,13 @@ def main(): fasOutFile = out + "/" + group mappingFile = out + "/tmp/" + group + ".mapping.txt" - print("Searching in species " + asName + "\n") assembly_path = assemblyDir + "/" + asName + "/" + asName + ".fa" db_path = assemblyDir + "/" + asName + "/blast_dir/" + asName + ".fa" - ######################## tBLASTn ########################################### - - #database anlegen + ######################## tBLASTn ########################################### + #checks if data base exists already db_check = searching_for_db(db_path) - #print(assembly_path) if db_check == 0: print("creating a blast data base \n") os.system('makeblastdb -in ' + assembly_path + ' -dbtype nucl -parse_seqids -out ' + db_path) @@ -723,16 +674,13 @@ def main(): else: print('blast data base exists already, continuing...') - - #make a tBLASTn search against the new database - #codon table argument [-db_gencode int_value], table available ftp://ftp.ncbi.nih.gov/entrez/misc/data/gc.prt - + #makes a tBLASTn search against the new database + #codon table argument [-db_gencode int_value], table available ftp://ftp.ncbi.nih.gov/entrez/misc/data/gc.prt print("tBLASTn search against data base") os.system('tblastn -db ' + db_path + ' -query ' + consensus_path + ' -outfmt "6 sseqid sstart send evalue qstart qend " -evalue ' + str(evalue) + ' -out ' + tmp_path + '/blast_results.out') print("tBLASTn search is finished") ################### search for candidate regions and extract 
seq ########### - # parse blast and filter for candiate regions regions, number_regions = candidate_regions(average_intron_length, evalue, tmp_path) @@ -740,20 +688,20 @@ def main(): #no candidat region are available, no ortholog can be found print("No candidate region found") continue - else: print(str(number_regions) + " candiate regions were found. Extracting sequences...") extract_seq(regions, db_path, tmp_path) ############### make Augustus PPX search ################################### + print("starting augustus ppx \n") augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, augustus_ref_species, asName, group, tmp_path) print("augustus is finished \n") ################# backward search to filter for orthologs################### + reciprocal_sequences, taxa = backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, evalue, taxa, searchTool, checkCoorthologs, msaTool, matrix, dataPath, filter, tmp_path) - print(reciprocal_sequences) if reciprocal_sequences == 0: print("No ortholog fulfilled the reciprocity criteria") if searchTaxon == '': @@ -762,35 +710,34 @@ def main(): cleanup(tmp, tmp_path) return 1 - ################## checking accepted genes for co-orthologs ########################## - reciprocal_sequences = coorthologs(reciprocal_sequences, tmp_path, candidatesOutFile, fasta_path, fdog_ref_species, msaTool, matrix) - print(reciprocal_sequences) + ################## checking accepted genes for co-orthologs ################ + reciprocal_sequences = coorthologs(reciprocal_sequences, tmp_path, candidatesOutFile, fasta_path, fdog_ref_species, msaTool, matrix) ################ add sequences to extended.fa in the output folder########## + addSequences(reciprocal_sequences, candidatesOutFile, fasta_path, orthologsOutFile, group, taxa, refBool, tmp_path) refBool = True ############### make Annotation with FAS ################################### + # if we want to search in only one Taxon if searchTaxon != '' and fasoff == False: 
fas_seed_id = createFasInput(orthologsOutFile, mappingFile) # bug in calcFAS when using --tsv, have to wait till it's fixed before I can use the option os.system('mkdir ' + tmp_path + 'anno_dir' + '>/dev/null 2>&1') os.system('calcFAS --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group + '_' + asName ) - - + #if we searched in more than one Taxon and no ortholog was found if refBool == False and searchTaxon == '': print("No orthologs found. Exciting ...") cleanup(tmp, tmp_path) return 1 - + #if we searched in more than one taxon if fasoff == False and searchTaxon == '': tmp_path = out + '/tmp/' fas_seed_id = createFasInput(orthologsOutFile, mappingFile) # bug in calcFAS when using --tsv, have to wait till it's fixed before I can use the option os.system('calcFAS --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group ) - ################# remove tmp folder ######################################## if searchTaxon != '': cleanup(tmp, tmp_path) @@ -799,6 +746,5 @@ def main(): f.close() - if __name__ == '__main__': main() From 6546b530bc1209d940d50916667ef3ae213a6595 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Wed, 21 Apr 2021 09:42:31 +0200 Subject: [PATCH 049/229] clean up code --- fdog/fDOGassembly.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 44e7607..bc8eb54 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -576,7 +576,7 @@ def main(): if out == '': #print('test out \n') out = os.getcwd() - os.system('mkdir ' + out + '/' + group) + os.system('mkdir ' + out + '/' + group + '>/dev/null 2>&1') out = out + '/' + group + '/' try: From 
583536554383b3222ce0a01eee343571d234cbec Mon Sep 17 00:00:00 2001 From: mueli94 Date: Wed, 21 Apr 2021 15:01:48 +0200 Subject: [PATCH 050/229] clean up --- .DS_Store | Bin 6148 -> 6148 bytes fdog/fDOGassembly.py | 4 ++-- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.DS_Store b/.DS_Store index fa2521e2436140a5f3689d5732ee4d25d777342f..ec261b8d3b9c0dfca3a952aa505e1a946aaf66ab 100644 GIT binary patch delta 140 zcmZoMXfc=|#>B!ku~2NHo+2a1#(>?7iv?Ji7&#{MFxfMnnC!#Uz#?82!uF&z$_^q@4UD1_lNJ1_q|| m%`cdrGHzz);O79^wV9FSJM(0I5l0S2pd3&M!{!K)HOv4JL?SK# delta 118 zcmZoMXfc=|#>B)qu~2NHo+2ar#(>?7jO>$nSnL^3P4;1FV3Vw_HZ(Aqe1Sz-aTWs@ zFfu}D27V|Fqh?PQVSTt+j6;BBGdl-A2hh~bf*jwOC-aLqa)8tT^|5S@5Lv?v0F#at Ad;kCd diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index bc8eb54..1c2f21a 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -576,7 +576,7 @@ def main(): if out == '': #print('test out \n') out = os.getcwd() - os.system('mkdir ' + out + '/' + group + '>/dev/null 2>&1') + os.system('mkdir ' + out + '/' + group + ' >/dev/null 2>&1') out = out + '/' + group + '/' try: @@ -608,7 +608,7 @@ def main(): ###################### create tmp folder ################################### - os.system('mkdir ' + out + '/tmp' + '>/dev/null 2>&1') + os.system('mkdir ' + out + '/tmp' + ' >/dev/null 2>&1') ######################## consensus sequence ################################ From 421580d7895fb76f32ed79820b9d652516af7bf3 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Wed, 21 Apr 2021 15:12:24 +0200 Subject: [PATCH 051/229] clean up --- fdog/fDOGassembly.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 1c2f21a..2c57503 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -125,7 +125,7 @@ def extract_seq(region_dic, path, tmp_path): for key in region_dic: #print("blastdbcmd -db " + path + " -dbtype 'nucl' -entry " + key + " -out tmp/" + key + ".fasta -outfmt %f") cmd = "blastdbcmd 
-db " + path + " -dbtype 'nucl' -entry " + key + " -out " + tmp_path + key + ".fasta -outfmt %f" - result = subprocess.run(cmd, stderr = subprocess.PIPE, shell=True) + result = subprocess.run(cmd, stderr = subprocess.PIPE, stdout = subprocess.PIPE, shell=True) def augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, augustus_ref_species, ass_name, group, tmp_path): output = open(candidatesOutFile, "w") @@ -457,8 +457,10 @@ def coorthologs(candidate_names, tmp_path, candidatesFile, fasta, fdog_ref_speci min_dist = distance min_name = name + checked = [] + for name in candidate_names: - if distances[min_name , name] < distances[min_name , ref_id]: + if distances[min_name , name] <= distances[min_name , ref_id]: checked.append(name) return checked From 89dfaf0290ada42714b958057d16e537570b5beb Mon Sep 17 00:00:00 2001 From: mueli94 Date: Wed, 21 Apr 2021 15:20:06 +0200 Subject: [PATCH 052/229] reduce output --- fdog/fDOGassembly.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 2c57503..b2d2afa 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -144,7 +144,7 @@ def augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, aug result = subprocess.run(cmd, stdout = subprocess.PIPE, stderr = subprocess.PIPE, shell=True) # transfer augustus output to as sequence cmd = "getAnnoFasta.pl --seqfile=" + tmp_path + key + ".fasta " + tmp_path + name + ".gff" - result = subprocess.run(cmd, stderr = subprocess.PIPE, shell=True) + result = subprocess.run(cmd, stderr = subprocess.PIPE, stdout = subprocess.PIPE, shell=True) # parsing header and sequences try: sequence_file = open(tmp_path + name + ".aa", "r") From ecf29edbc63829f9ee2cfedd872f2b5f4d857c67 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Thu, 22 Apr 2021 11:34:21 +0200 Subject: [PATCH 053/229] clean up code --- fdog/fDOGassembly.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git 
a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index b2d2afa..03f998a 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -724,10 +724,12 @@ def main(): ############### make Annotation with FAS ################################### # if we want to search in only one Taxon if searchTaxon != '' and fasoff == False: + print("Calculating FAS scores") fas_seed_id = createFasInput(orthologsOutFile, mappingFile) # bug in calcFAS when using --tsv, have to wait till it's fixed before I can use the option os.system('mkdir ' + tmp_path + 'anno_dir' + '>/dev/null 2>&1') - os.system('calcFAS --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group + '_' + asName ) + cmd = 'calcFAS --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group + '_' + asName + result = subprocess.run(cmd, stdout = subprocess.PIPE, stderr = subprocess.PIPE, shell=True) #if we searched in more than one Taxon and no ortholog was found if refBool == False and searchTaxon == '': print("No orthologs found. 
Exciting ...") @@ -735,10 +737,12 @@ def main(): return 1 #if we searched in more than one taxon if fasoff == False and searchTaxon == '': + print("Calculating FAS scores") tmp_path = out + '/tmp/' fas_seed_id = createFasInput(orthologsOutFile, mappingFile) # bug in calcFAS when using --tsv, have to wait till it's fixed before I can use the option - os.system('calcFAS --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group ) + cmd = 'calcFAS --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group + result = subprocess.run(cmd, stdout = subprocess.PIPE, stderr = subprocess.PIPE, shell=True) ################# remove tmp folder ######################################## if searchTaxon != '': From 55a9e6c2ce2fabc8d2872371e6bbe0dc7599407a Mon Sep 17 00:00:00 2001 From: mueli94 Date: Sat, 24 Apr 2021 11:05:32 +0200 Subject: [PATCH 054/229] check augustus --- fdog/fDOGassembly.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 03f998a..b028245 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -141,7 +141,8 @@ def augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, aug name = key + "_" + str(counter) # augutus call cmd = "augustus --protein=1 --proteinprofile=" + profile_path + " --predictionStart=" + start + " --predictionEnd=" + end + " --species=" + augustus_ref_species + " " + tmp_path + key + ".fasta > " + tmp_path + name + ".gff" - result = subprocess.run(cmd, stdout = subprocess.PIPE, stderr = subprocess.PIPE, shell=True) + #result = subprocess.run(cmd, stdout = subprocess.PIPE, stderr = subprocess.PIPE, shell=True) + result = 
subprocess.run(cmd, shell=True) # transfer augustus output to as sequence cmd = "getAnnoFasta.pl --seqfile=" + tmp_path + key + ".fasta " + tmp_path + name + ".gff" result = subprocess.run(cmd, stderr = subprocess.PIPE, stdout = subprocess.PIPE, shell=True) From d2492d036e66777104e1277f2035eebee6960f65 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Sat, 24 Apr 2021 11:12:34 +0200 Subject: [PATCH 055/229] testing --- fdog/fDOGassembly.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index b028245..5e85998 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -125,7 +125,8 @@ def extract_seq(region_dic, path, tmp_path): for key in region_dic: #print("blastdbcmd -db " + path + " -dbtype 'nucl' -entry " + key + " -out tmp/" + key + ".fasta -outfmt %f") cmd = "blastdbcmd -db " + path + " -dbtype 'nucl' -entry " + key + " -out " + tmp_path + key + ".fasta -outfmt %f" - result = subprocess.run(cmd, stderr = subprocess.PIPE, stdout = subprocess.PIPE, shell=True) + #result = subprocess.run(cmd, stderr = subprocess.PIPE, stdout = subprocess.PIPE, shell=True) + result = subprocess.run(cmd, shell=True) def augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, augustus_ref_species, ass_name, group, tmp_path): output = open(candidatesOutFile, "w") From 2c0d152f76f9d1540e273822417f2ef9c224abaa Mon Sep 17 00:00:00 2001 From: mueli94 Date: Sun, 25 Apr 2021 21:20:59 +0200 Subject: [PATCH 056/229] adding option to recognize if co-ortholog or not in header of the extended.fa --- fdog/fDOGassembly.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 5e85998..bc3a290 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -390,8 +390,12 @@ def addSequences(sequenceIds, candidate_fasta, core_fasta, output, name, species seq_records_candidate = list(seq_records_candidate) for entry_candidate in 
seq_records_candidate: if entry_candidate.id in sequenceIds: - output_file.write(">" + entry_candidate.id + "\n") - output_file.write(str(entry_candidate.seq) + "\n") + if entry_candidate == sequenceIds[0]: + output_file.write(">" + entry_candidate.id + "|1" + "\n") + output_file.write(str(entry_candidate.seq) + "\n") + else: + output_file.write(">" + entry_candidate.id + "|0" + "\n") + output_file.write(str(entry_candidate.seq) + "\n") output_file.close() return 0 @@ -459,10 +463,12 @@ def coorthologs(candidate_names, tmp_path, candidatesFile, fasta, fdog_ref_speci min_dist = distance min_name = name - checked = [] + checked = [min_name] for name in candidate_names: - if distances[min_name , name] <= distances[min_name , ref_id]: + if name == min_name: + pass + elif distances[min_name , name] <= distances[min_name , ref_id]: checked.append(name) return checked From 4b19832344ea880614875e6923f9f793b2202f87 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Sun, 25 Apr 2021 21:25:54 +0200 Subject: [PATCH 057/229] testing --- fdog/fDOGassembly.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index bc3a290..6d5059f 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -390,7 +390,8 @@ def addSequences(sequenceIds, candidate_fasta, core_fasta, output, name, species seq_records_candidate = list(seq_records_candidate) for entry_candidate in seq_records_candidate: if entry_candidate.id in sequenceIds: - if entry_candidate == sequenceIds[0]: + if entry_candidate.id == sequenceIds[0]: + print(entry_candidate.id) output_file.write(">" + entry_candidate.id + "|1" + "\n") output_file.write(str(entry_candidate.seq) + "\n") else: @@ -751,7 +752,7 @@ def main(): # bug in calcFAS when using --tsv, have to wait till it's fixed before I can use the option cmd = 'calcFAS --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --phyloprofile ' + 
mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group result = subprocess.run(cmd, stdout = subprocess.PIPE, stderr = subprocess.PIPE, shell=True) - + print(cmd) ################# remove tmp folder ######################################## if searchTaxon != '': cleanup(tmp, tmp_path) From db4c6a57fff0939bbae951a9c0ae3b1dc3c3384e Mon Sep 17 00:00:00 2001 From: mueli94 Date: Sun, 25 Apr 2021 21:34:36 +0200 Subject: [PATCH 058/229] testing --- fdog/fDOGassembly.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 6d5059f..2f780c5 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -383,7 +383,7 @@ def addSequences(sequenceIds, candidate_fasta, core_fasta, output, name, species for species in species_list: for entry_core in seq_records_core: if species in entry_core.id: - output_file.write(">" + entry_core.id + "\n") + output_file.write(">" + entry_core.id + "|1" + "\n") output_file.write(str(entry_core.seq) + "\n") seq_records_candidate = readFasta(candidate_fasta) @@ -403,6 +403,7 @@ def addSequences(sequenceIds, candidate_fasta, core_fasta, output, name, species def createFasInput(orthologsOutFile, mappingFile): with open(orthologsOutFile, "r") as f: fas_seed_id = (f.readline())[1:-1] + fas_seed_id = fas_seed_id.split("|")[0] mappingFile = open(mappingFile, "a+") From f4871452939fa6b9952f1293b46c2aa3b2376464 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Sun, 25 Apr 2021 21:54:12 +0200 Subject: [PATCH 059/229] testing --- fdog/fDOGassembly.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 2f780c5..9ea9837 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -752,7 +752,7 @@ def main(): fas_seed_id = createFasInput(orthologsOutFile, mappingFile) # bug in calcFAS when using --tsv, have to wait till it's fixed before I can use the option cmd = 'calcFAS --seed ' + fasta_path + ' 
--query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group - result = subprocess.run(cmd, stdout = subprocess.PIPE, stderr = subprocess.PIPE, shell=True) + result = subprocess.run(cmd, shell=True) print(cmd) ################# remove tmp folder ######################################## if searchTaxon != '': From 43b73b0a63bea0b3b72557ec19fc1fe9b7ed2574 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Sun, 25 Apr 2021 22:02:08 +0200 Subject: [PATCH 060/229] testing --- fdog/fDOGassembly.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 9ea9837..c549076 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -403,7 +403,7 @@ def addSequences(sequenceIds, candidate_fasta, core_fasta, output, name, species def createFasInput(orthologsOutFile, mappingFile): with open(orthologsOutFile, "r") as f: fas_seed_id = (f.readline())[1:-1] - fas_seed_id = fas_seed_id.split("|")[0] + #fas_seed_id = fas_seed_id.split("|")[0] mappingFile = open(mappingFile, "a+") From 620d5fa9cf37883ccd9e14556af6513e993559d5 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Sun, 25 Apr 2021 22:34:04 +0200 Subject: [PATCH 061/229] testing --- fdog/fDOGassembly.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index bc3a290..d13cbc8 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -383,14 +383,15 @@ def addSequences(sequenceIds, candidate_fasta, core_fasta, output, name, species for species in species_list: for entry_core in seq_records_core: if species in entry_core.id: - output_file.write(">" + entry_core.id + "\n") + output_file.write(">" + entry_core.id + "|1" + "\n") output_file.write(str(entry_core.seq) + "\n") seq_records_candidate = readFasta(candidate_fasta) seq_records_candidate = 
list(seq_records_candidate) for entry_candidate in seq_records_candidate: if entry_candidate.id in sequenceIds: - if entry_candidate == sequenceIds[0]: + if entry_candidate.id == sequenceIds[0]: + print(entry_candidate.id) output_file.write(">" + entry_candidate.id + "|1" + "\n") output_file.write(str(entry_candidate.seq) + "\n") else: @@ -750,8 +751,8 @@ def main(): fas_seed_id = createFasInput(orthologsOutFile, mappingFile) # bug in calcFAS when using --tsv, have to wait till it's fixed before I can use the option cmd = 'calcFAS --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group - result = subprocess.run(cmd, stdout = subprocess.PIPE, stderr = subprocess.PIPE, shell=True) - + result = subprocess.run(cmd, shell=True) + print(cmd) ################# remove tmp folder ######################################## if searchTaxon != '': cleanup(tmp, tmp_path) From ac3477362a0e7339dbc6de19460e79cc26d8ce58 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Sun, 25 Apr 2021 22:41:07 +0200 Subject: [PATCH 062/229] testing --- fdog/fDOGassembly.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index c549076..d20968e 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -383,7 +383,7 @@ def addSequences(sequenceIds, candidate_fasta, core_fasta, output, name, species for species in species_list: for entry_core in seq_records_core: if species in entry_core.id: - output_file.write(">" + entry_core.id + "|1" + "\n") + output_file.write(">" + entry_core.id + "\n") output_file.write(str(entry_core.seq) + "\n") seq_records_candidate = readFasta(candidate_fasta) From 86337fcb7b7884c0865bef1b56bd3f1daf26385a Mon Sep 17 00:00:00 2001 From: mueli94 Date: Sun, 25 Apr 2021 22:42:09 +0200 Subject: [PATCH 063/229] testing --- fdog/fDOGassembly.py | 2 +- 1 
file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index d20968e..e8100ec 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -403,7 +403,7 @@ def addSequences(sequenceIds, candidate_fasta, core_fasta, output, name, species def createFasInput(orthologsOutFile, mappingFile): with open(orthologsOutFile, "r") as f: fas_seed_id = (f.readline())[1:-1] - #fas_seed_id = fas_seed_id.split("|")[0] + fas_seed_id = fas_seed_id.split("|")[0] mappingFile = open(mappingFile, "a+") From 507238052124d6ea6e0c4f45594ff51d741a1614 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Sun, 25 Apr 2021 22:47:37 +0200 Subject: [PATCH 064/229] testing --- fdog/fDOGassembly.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index e8100ec..d20968e 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -403,7 +403,7 @@ def addSequences(sequenceIds, candidate_fasta, core_fasta, output, name, species def createFasInput(orthologsOutFile, mappingFile): with open(orthologsOutFile, "r") as f: fas_seed_id = (f.readline())[1:-1] - fas_seed_id = fas_seed_id.split("|")[0] + #fas_seed_id = fas_seed_id.split("|")[0] mappingFile = open(mappingFile, "a+") From df6d32467000ee0c350e313e68d118d2bbfcf90d Mon Sep 17 00:00:00 2001 From: mueli94 Date: Wed, 28 Apr 2021 10:54:45 +0200 Subject: [PATCH 065/229] added function starting_subprocess() to handle call of extern tools more easily --- fdog/fDOGassembly.py | 69 ++++++++++++++++++++++++++++++-------------- 1 file changed, 48 insertions(+), 21 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index d20968e..a7c24ed 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -16,6 +16,14 @@ def load_config(config_file): except yaml.YAMLError as exc: print(exc) +def starting_subprocess(cmd, mode): + if mode == 'debug': + result = subprocess.run(cmd, shell=True) + elif mode == 'silent': + result = subprocess.run(cmd, 
stdout = subprocess.PIPE, stderr = subprocess.PIPE, shell=True) + elif mode == 'normal': + result = subprocess.run(cmd, stderr = subprocess.PIPE, shell=True) + def merge(blast_results, insert_length): #merging overlapping and contigous candidate regions number_regions = 0 @@ -120,15 +128,14 @@ def candidate_regions(intron_length, cutoff_evalue, tmp_path): return candidate_regions, number_regions -def extract_seq(region_dic, path, tmp_path): +def extract_seq(region_dic, path, tmp_path, mode): for key in region_dic: #print("blastdbcmd -db " + path + " -dbtype 'nucl' -entry " + key + " -out tmp/" + key + ".fasta -outfmt %f") cmd = "blastdbcmd -db " + path + " -dbtype 'nucl' -entry " + key + " -out " + tmp_path + key + ".fasta -outfmt %f" - #result = subprocess.run(cmd, stderr = subprocess.PIPE, stdout = subprocess.PIPE, shell=True) - result = subprocess.run(cmd, shell=True) + starting_subprocess(cmd, mode) -def augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, augustus_ref_species, ass_name, group, tmp_path): +def augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, augustus_ref_species, ass_name, group, tmp_path, mode): output = open(candidatesOutFile, "w") for key in regions: @@ -143,10 +150,10 @@ def augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, aug # augutus call cmd = "augustus --protein=1 --proteinprofile=" + profile_path + " --predictionStart=" + start + " --predictionEnd=" + end + " --species=" + augustus_ref_species + " " + tmp_path + key + ".fasta > " + tmp_path + name + ".gff" #result = subprocess.run(cmd, stdout = subprocess.PIPE, stderr = subprocess.PIPE, shell=True) - result = subprocess.run(cmd, shell=True) + starting_subprocess(cmd, mode) # transfer augustus output to as sequence cmd = "getAnnoFasta.pl --seqfile=" + tmp_path + key + ".fasta " + tmp_path + name + ".gff" - result = subprocess.run(cmd, stderr = subprocess.PIPE, stdout = subprocess.PIPE, shell=True) + 
starting_subprocess(cmd, mode) # parsing header and sequences try: sequence_file = open(tmp_path + name + ".aa", "r") @@ -524,6 +531,8 @@ def main(): optional.add_argument('--pathFile', help='Config file contains paths to data folder (in yaml format)', action='store', default='') optional.add_argument('--searchTaxon', help='Search Taxon name', action='store', default='') optional.add_argument('--silent', help='Output will only be written into the log file', action='store_true', default=False) + optional.add_argument('--debug', help='Stdout and Stderr from fdog.assembly and every used tool will be printed', action='store_true', default=False) + args = parser.parse_args() @@ -561,6 +570,18 @@ def main(): fasoff = args.fasoff searchTaxon = args.searchTaxon silent = args.silent + debug = args.debug + + if debug == True and silent == True: + print("It's not possible to use booth modes, please restart and use --debug or --silent") + return 1 + else: + if debug == True: + mode = 'debug' + elif silent == True: + mode = 'silent' + else: + mode = 'normal' #checking paths if dataPath == '': @@ -598,11 +619,12 @@ def main(): ################## How to handle std output and std error ################## - if silent == True: + if mode == 'silent': sys.stderr = f sys.stdout = f else: sys.stdout = Logger(f) + # user input has to be checked here before fDOGassembly continues assembly_names = os.listdir(assemblyDir) @@ -620,20 +642,22 @@ def main(): ###################### create tmp folder ################################### - os.system('mkdir ' + out + '/tmp' + ' >/dev/null 2>&1') + cmd = 'mkdir ' + out + '/tmp' + starting_subprocess(cmd, 'silent') ######################## consensus sequence ################################ #make a majority-rule consensus sequence with the tool hmmemit from hmmer print("Building a consensus sequence for gene " + group + " \n") - os.system('hmmemit -c -o' + consensus_path + ' ' + hmm_path) + cmd = 'hmmemit -c -o' + consensus_path + ' ' + hmm_path + 
starting_subprocess(cmd, mode) print("consensus sequence is finished\n") ######################## block profile ##################################### print("Building a block profile for gene " + group + " \n") cmd = 'msa2prfl.pl ' + msa_path + ' --setname=' + group + ' >' + profile_path - result = subprocess.run(cmd, stderr = subprocess.PIPE, shell=True) + starting_subprocess(cmd, mode) if int(os.path.getsize(profile_path)) > 0: print("block profile is finished \n") @@ -642,10 +666,10 @@ def main(): new_path = core_path + group +"/"+ group + "_new.aln" #print(cmd) cmd = 'prepareAlign < ' + msa_path + ' > ' + new_path - result = subprocess.run(cmd, stderr = subprocess.PIPE, shell=True) + starting_subprocess(cmd, mode) cmd = 'msa2prfl.pl ' + new_path + ' --setname=' + group + ' >' + profile_path #print(cmd) - result = subprocess.run(cmd, stderr = subprocess.PIPE, shell=True) + starting_subprocess(cmd, mode) print("block profile is finished \n") searchBool = False @@ -660,7 +684,8 @@ def main(): ################### path definitions ################################### - os.system('mkdir ' + out + '/tmp/' + asName + '>/dev/null 2>&1') + cmd = 'mkdir ' + out + '/tmp/' + asName + starting_subprocess(cmd, 'silent') tmp_path = out + "/tmp/" + asName + "/" candidatesOutFile = tmp_path + group + ".candidates.fa" if searchTaxon != '': @@ -681,7 +706,8 @@ def main(): db_check = searching_for_db(db_path) if db_check == 0: print("creating a blast data base \n") - os.system('makeblastdb -in ' + assembly_path + ' -dbtype nucl -parse_seqids -out ' + db_path) + cmd = 'makeblastdb -in ' + assembly_path + ' -dbtype nucl -parse_seqids -out ' + db_path + starting_subprocess(cmd, mode) print("database is finished \n") else: print('blast data base exists already, continuing...') @@ -689,7 +715,8 @@ def main(): #makes a tBLASTn search against the new database #codon table argument [-db_gencode int_value], table available ftp://ftp.ncbi.nih.gov/entrez/misc/data/gc.prt print("tBLASTn search 
against data base") - os.system('tblastn -db ' + db_path + ' -query ' + consensus_path + ' -outfmt "6 sseqid sstart send evalue qstart qend " -evalue ' + str(evalue) + ' -out ' + tmp_path + '/blast_results.out') + cmd = 'tblastn -db ' + db_path + ' -query ' + consensus_path + ' -outfmt "6 sseqid sstart send evalue qstart qend " -evalue ' + str(evalue) + ' -out ' + tmp_path + '/blast_results.out' + starting_subprocess(cmd, mode) print("tBLASTn search is finished") ################### search for candidate regions and extract seq ########### @@ -702,12 +729,12 @@ def main(): continue else: print(str(number_regions) + " candiate regions were found. Extracting sequences...") - extract_seq(regions, db_path, tmp_path) + extract_seq(regions, db_path, tmp_path, mode) ############### make Augustus PPX search ################################### print("starting augustus ppx \n") - augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, augustus_ref_species, asName, group, tmp_path) + augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, augustus_ref_species, asName, group, tmp_path, mode) print("augustus is finished \n") ################# backward search to filter for orthologs################### @@ -737,9 +764,10 @@ def main(): print("Calculating FAS scores") fas_seed_id = createFasInput(orthologsOutFile, mappingFile) # bug in calcFAS when using --tsv, have to wait till it's fixed before I can use the option - os.system('mkdir ' + tmp_path + 'anno_dir' + '>/dev/null 2>&1') + cmd = 'mkdir ' + tmp_path + 'anno_dir' + starting_subprocess(cmd, 'silent') cmd = 'calcFAS --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group + '_' + asName - result = subprocess.run(cmd, stdout = subprocess.PIPE, stderr = subprocess.PIPE, shell=True) + starting_subprocess(cmd, mode) #if we 
searched in more than one Taxon and no ortholog was found if refBool == False and searchTaxon == '': print("No orthologs found. Exciting ...") @@ -752,8 +780,7 @@ def main(): fas_seed_id = createFasInput(orthologsOutFile, mappingFile) # bug in calcFAS when using --tsv, have to wait till it's fixed before I can use the option cmd = 'calcFAS --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group - result = subprocess.run(cmd, shell=True) - print(cmd) + starting_subprocess(cmd, mode) ################# remove tmp folder ######################################## if searchTaxon != '': cleanup(tmp, tmp_path) From 7187972986ee69a27b472104d981455498c208bb Mon Sep 17 00:00:00 2001 From: mueli94 Date: Wed, 28 Apr 2021 10:55:29 +0200 Subject: [PATCH 066/229] added augustus to dependencies --- .DS_Store | Bin 6148 -> 6148 bytes fdog/setup/setup_conda.sh | 3 +++ 2 files changed, 3 insertions(+) diff --git a/.DS_Store b/.DS_Store index ec261b8d3b9c0dfca3a952aa505e1a946aaf66ab..824f712743a6414728f27d69a840e656771a9cdf 100644 GIT binary patch delta 68 zcmZoMXffE}&BAzUvJXoGn`Cvhp@GTd3oOctvlzgDkr6^O@Iz@BHG8uN>qn-|>>Pjj E0r)Enz5oCK delta 40 wcmZoMXffE}&BAzMvJXoGi+FXl!Q@LU%8WB6e`AqnoVi(y^&`_}R*wJt02)~h&j0`b diff --git a/fdog/setup/setup_conda.sh b/fdog/setup/setup_conda.sh index fae81b7..ddc4e23 100755 --- a/fdog/setup/setup_conda.sh +++ b/fdog/setup/setup_conda.sh @@ -116,6 +116,7 @@ dependencies=( mafft # for linsi muscle fasta36 + augustus #for fdog.assembly ) for i in "${dependencies[@]}"; do @@ -134,6 +135,8 @@ for i in "${dependencies[@]}"; do fi elif [ "$tool" = "fasta36" ]; then conda install -y -c bioconda fasta3 + elif [ "$tool" = "augustus" ]; then + conda install -y -c bioconda augustus else conda install -y -c bioconda $i fi From 721bcdbaa9c0db7055c9bd3e4c0001cd613ea045 Mon Sep 17 00:00:00 2001 From: 
mueli94 Date: Wed, 28 Apr 2021 12:55:26 +0200 Subject: [PATCH 067/229] testing --- fdog/fDOGassembly.py | 43 +++++++++++++++++++++++++++---------------- 1 file changed, 27 insertions(+), 16 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index a7c24ed..bdaf93b 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -247,7 +247,7 @@ def checkCoOrthologs(candidate_name, best_hit, ref, fdog_ref_species, candidates #rejected return 0, distance_ref_hit, distance_hit_query -def backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, evalue_cut_off, taxa, searchTool, checkCo, msaTool, matrix, dataPath, filter, tmp_path): +def backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, evalue_cut_off, taxa, searchTool, checkCo, msaTool, matrix, dataPath, filter, tmp_path, mode): # the backward search uses the genes predicted from augustus and makes a blastp search #the blastp search is against all species that are part of the core_ortholog group if the option --strict was chosen or only against the ref taxa seedDic = getSeedInfo(fasta_path) @@ -263,7 +263,8 @@ def backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, eva print("The fDOG reference species isn't part of the core ortholog group, ... 
exciting") return 0, seed if searchTool == "blast": - os.system("blastp -db " + blast_dir_path + fdog_ref_species + "/" + fdog_ref_species + " -outfmt '6 sseqid qseqid evalue' -max_target_seqs 10 -out " + tmp_path + "blast_" + fdog_ref_species + " -evalue " + str(evalue_cut_off) + " -query " + candidatesOutFile) + cmd = "blastp -db " + blast_dir_path + fdog_ref_species + "/" + fdog_ref_species + " -outfmt '6 sseqid qseqid evalue' -max_target_seqs 10 -out " + tmp_path + "blast_" + fdog_ref_species + " -evalue " + str(evalue_cut_off) + " -query " + candidatesOutFile + starting_subprocess(cmd, mode) else: print("diamonds are the girls best friends") ##### diamond call @@ -348,7 +349,8 @@ def backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, eva print("The species " + species + " isn't part of the core ortholog group, ... exciting") return 0, seed - os.system("blastp -db " + blast_dir_path + species + "/" + species + " -outfmt '6 sseqid qseqid evalue' -max_target_seqs 10 -seg " + filter + " -out " + tmp_path + "/blast_" + species + " -evalue " + str(evalue_cut_off) + " -query " + candidatesOutFile) + cmd = "blastp -db " + blast_dir_path + species + "/" + species + " -outfmt '6 sseqid qseqid evalue' -max_target_seqs 10 -seg " + filter + " -out " + tmp_path + "/blast_" + species + " -evalue " + str(evalue_cut_off) + " -query " + candidatesOutFile + starting_subprocess(cmd, mode) alg_file = open(tmp_path + "/blast_" + species, "r") lines = alg_file.readlines() alg_file.close() @@ -393,17 +395,18 @@ def addSequences(sequenceIds, candidate_fasta, core_fasta, output, name, species output_file.write(">" + entry_core.id + "\n") output_file.write(str(entry_core.seq) + "\n") - seq_records_candidate = readFasta(candidate_fasta) - seq_records_candidate = list(seq_records_candidate) - for entry_candidate in seq_records_candidate: - if entry_candidate.id in sequenceIds: - if entry_candidate.id == sequenceIds[0]: - print(entry_candidate.id) - 
output_file.write(">" + entry_candidate.id + "|1" + "\n") - output_file.write(str(entry_candidate.seq) + "\n") - else: - output_file.write(">" + entry_candidate.id + "|0" + "\n") - output_file.write(str(entry_candidate.seq) + "\n") + if sequenceIds != 0: + seq_records_candidate = readFasta(candidate_fasta) + seq_records_candidate = list(seq_records_candidate) + for entry_candidate in seq_records_candidate: + if entry_candidate.id in sequenceIds: + if entry_candidate.id == sequenceIds[0]: + print(entry_candidate.id) + output_file.write(">" + entry_candidate.id + "|1" + "\n") + output_file.write(str(entry_candidate.seq) + "\n") + else: + output_file.write(">" + entry_candidate.id + "|0" + "\n") + output_file.write(str(entry_candidate.seq) + "\n") output_file.close() return 0 @@ -738,16 +741,24 @@ def main(): print("augustus is finished \n") ################# backward search to filter for orthologs################### + if int(os.path.getsize(candidatesOutFile)) > 0: + print("No genes found at candidate regions\n") + if searchTaxon == '': + continue + else: + addSequences(0, candidatesOutFile, fasta_path, orthologsOutFile, group, taxa, refBool, tmp_path) + return 0 - reciprocal_sequences, taxa = backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, evalue, taxa, searchTool, checkCoorthologs, msaTool, matrix, dataPath, filter, tmp_path) + reciprocal_sequences, taxa = backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, evalue, taxa, searchTool, checkCoorthologs, msaTool, matrix, dataPath, filter, tmp_path, mode) if reciprocal_sequences == 0: print("No ortholog fulfilled the reciprocity criteria") if searchTaxon == '': continue else: + addSequences(reciprocal_sequences, candidatesOutFile, fasta_path, orthologsOutFile, group, taxa, refBool, tmp_path) cleanup(tmp, tmp_path) - return 1 + return 0 ################## checking accepted genes for co-orthologs ################ From 9a2e4d00a97cff812e623b4bf219e581ae08922b Mon Sep 17 
00:00:00 2001 From: mueli94 Date: Wed, 28 Apr 2021 13:06:34 +0200 Subject: [PATCH 068/229] bug fix --- fdog/fDOGassembly.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index bdaf93b..c22e515 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -741,7 +741,7 @@ def main(): print("augustus is finished \n") ################# backward search to filter for orthologs################### - if int(os.path.getsize(candidatesOutFile)) > 0: + if int(os.path.getsize(candidatesOutFile)) <= 0: print("No genes found at candidate regions\n") if searchTaxon == '': continue From 1e5893b85c169899ed0ace275dcb3ff89ee5cdef Mon Sep 17 00:00:00 2001 From: mueli94 Date: Wed, 28 Apr 2021 13:30:51 +0200 Subject: [PATCH 069/229] testing --- fdog/fDOGassembly.py | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index c22e515..e52b8a4 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -22,7 +22,7 @@ def starting_subprocess(cmd, mode): elif mode == 'silent': result = subprocess.run(cmd, stdout = subprocess.PIPE, stderr = subprocess.PIPE, shell=True) elif mode == 'normal': - result = subprocess.run(cmd, stderr = subprocess.PIPE, shell=True) + result = subprocess.run(cmd, stdout = subprocess.PIPE, shell=True) def merge(blast_results, insert_length): #merging overlapping and contigous candidate regions @@ -485,6 +485,17 @@ def coorthologs(candidate_names, tmp_path, candidatesFile, fasta, fdog_ref_speci return checked +def changes_for_fas(file, header, mode): + #def replace_first_line( src_filename, target_filename, replacement_line): + f_in = open(file) + first_line, remainder = f.readline(), f.read() + line = first_line.split("|")[0] + f_in.close() + f_out = open(file + "s","w") + f_out.write(line + "\n") + f_out.write(remainder) + f_out.close() + class Logger(object): def __init__(self, file): self.file = file @@ 
-746,23 +757,20 @@ def main(): if searchTaxon == '': continue else: - addSequences(0, candidatesOutFile, fasta_path, orthologsOutFile, group, taxa, refBool, tmp_path) - return 0 + reciprocal_sequences = 0 + else: + reciprocal_sequences, taxa = backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, evalue, taxa, searchTool, checkCoorthologs, msaTool, matrix, dataPath, filter, tmp_path, mode) - reciprocal_sequences, taxa = backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, evalue, taxa, searchTool, checkCoorthologs, msaTool, matrix, dataPath, filter, tmp_path, mode) + ################## checking accepted genes for co-orthologs ################ if reciprocal_sequences == 0: print("No ortholog fulfilled the reciprocity criteria") if searchTaxon == '': continue else: - addSequences(reciprocal_sequences, candidatesOutFile, fasta_path, orthologsOutFile, group, taxa, refBool, tmp_path) - cleanup(tmp, tmp_path) - return 0 - - ################## checking accepted genes for co-orthologs ################ - - reciprocal_sequences = coorthologs(reciprocal_sequences, tmp_path, candidatesOutFile, fasta_path, fdog_ref_species, msaTool, matrix) + reciprocal_sequences = 0 + else: + reciprocal_sequences = coorthologs(reciprocal_sequences, tmp_path, candidatesOutFile, fasta_path, fdog_ref_species, msaTool, matrix) ################ add sequences to extended.fa in the output folder########## From e8440c86fcec447a0ff1d98ffd3d1940139a69bb Mon Sep 17 00:00:00 2001 From: mueli94 Date: Wed, 28 Apr 2021 13:42:49 +0200 Subject: [PATCH 070/229] testing --- fdog/fDOGassembly.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index e52b8a4..688a000 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -754,7 +754,7 @@ def main(): ################# backward search to filter for orthologs################### if int(os.path.getsize(candidatesOutFile)) <= 0: print("No genes found at 
candidate regions\n") - if searchTaxon == '': + if searchTaxon == '' and refBool == True: continue else: reciprocal_sequences = 0 @@ -765,7 +765,7 @@ def main(): ################## checking accepted genes for co-orthologs ################ if reciprocal_sequences == 0: print("No ortholog fulfilled the reciprocity criteria") - if searchTaxon == '': + if searchTaxon == '' and refBool == True: continue else: reciprocal_sequences = 0 @@ -788,6 +788,7 @@ def main(): cmd = 'calcFAS --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group + '_' + asName starting_subprocess(cmd, mode) #if we searched in more than one Taxon and no ortholog was found + if refBool == False and searchTaxon == '': print("No orthologs found. Exciting ...") cleanup(tmp, tmp_path) From 6362e47e45034fd026cfbc2e3319c3266a2c9d65 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Wed, 28 Apr 2021 13:52:41 +0200 Subject: [PATCH 071/229] testing --- fdog/fDOGassembly.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 688a000..08cdfaa 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -739,21 +739,23 @@ def main(): if regions == 0: #no candidat region are available, no ortholog can be found - print("No candidate region found") - continue + if refBool == True: + print("No candidate region found") + continue else: print(str(number_regions) + " candiate regions were found. 
Extracting sequences...") extract_seq(regions, db_path, tmp_path, mode) ############### make Augustus PPX search ################################### - print("starting augustus ppx \n") - augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, augustus_ref_species, asName, group, tmp_path, mode) - print("augustus is finished \n") + print("starting augustus ppx \n") + augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, augustus_ref_species, asName, group, tmp_path, mode) + print("augustus is finished \n") ################# backward search to filter for orthologs################### - if int(os.path.getsize(candidatesOutFile)) <= 0: - print("No genes found at candidate regions\n") + if int(os.path.getsize(candidatesOutFile)) <= 0 or regions == 0: + if regions != 0: + print("No genes found at candidate regions\n") if searchTaxon == '' and refBool == True: continue else: From 02ad76cd791f0c7d202f443ca2a0665a13271c3a Mon Sep 17 00:00:00 2001 From: mueli94 Date: Wed, 28 Apr 2021 13:58:03 +0200 Subject: [PATCH 072/229] testing --- fdog/fDOGassembly.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 08cdfaa..02f627f 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -753,15 +753,14 @@ def main(): print("augustus is finished \n") ################# backward search to filter for orthologs################### - if int(os.path.getsize(candidatesOutFile)) <= 0 or regions == 0: - if regions != 0: + if int(os.path.getsize(candidatesOutFile)) <= 0: print("No genes found at candidate regions\n") - if searchTaxon == '' and refBool == True: - continue + if searchTaxon == '' and refBool == True: + continue + else: + reciprocal_sequences = 0 else: - reciprocal_sequences = 0 - else: - reciprocal_sequences, taxa = backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, evalue, taxa, searchTool, checkCoorthologs, msaTool, matrix, dataPath, 
filter, tmp_path, mode) + reciprocal_sequences, taxa = backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, evalue, taxa, searchTool, checkCoorthologs, msaTool, matrix, dataPath, filter, tmp_path, mode) ################## checking accepted genes for co-orthologs ################ @@ -772,7 +771,10 @@ def main(): else: reciprocal_sequences = 0 else: - reciprocal_sequences = coorthologs(reciprocal_sequences, tmp_path, candidatesOutFile, fasta_path, fdog_ref_species, msaTool, matrix) + if regions != 0 + reciprocal_sequences = coorthologs(reciprocal_sequences, tmp_path, candidatesOutFile, fasta_path, fdog_ref_species, msaTool, matrix) + else: + reciprocal_sequences = 0 ################ add sequences to extended.fa in the output folder########## From ac929b7f87c55870f83cb2201d1bad8e4a2d56c2 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Wed, 28 Apr 2021 14:01:00 +0200 Subject: [PATCH 073/229] testing --- fdog/fDOGassembly.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 02f627f..c98f6a7 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -771,7 +771,7 @@ def main(): else: reciprocal_sequences = 0 else: - if regions != 0 + if regions != 0: reciprocal_sequences = coorthologs(reciprocal_sequences, tmp_path, candidatesOutFile, fasta_path, fdog_ref_species, msaTool, matrix) else: reciprocal_sequences = 0 From 060b4bb10297df20b627a6b71324c4926eef616a Mon Sep 17 00:00:00 2001 From: mueli94 Date: Wed, 28 Apr 2021 14:03:49 +0200 Subject: [PATCH 074/229] testing --- fdog/fDOGassembly.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index c98f6a7..524b83f 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -739,6 +739,7 @@ def main(): if regions == 0: #no candidat region are available, no ortholog can be found + reciprocal_sequences = 0 if refBool == True: print("No candidate region found") continue @@ 
-771,10 +772,7 @@ def main(): else: reciprocal_sequences = 0 else: - if regions != 0: - reciprocal_sequences = coorthologs(reciprocal_sequences, tmp_path, candidatesOutFile, fasta_path, fdog_ref_species, msaTool, matrix) - else: - reciprocal_sequences = 0 + reciprocal_sequences = coorthologs(reciprocal_sequences, tmp_path, candidatesOutFile, fasta_path, fdog_ref_species, msaTool, matrix) ################ add sequences to extended.fa in the output folder########## From c996ca6287c601856bce1ab849bcd4bdaf9f86bf Mon Sep 17 00:00:00 2001 From: mueli94 Date: Wed, 28 Apr 2021 14:10:07 +0200 Subject: [PATCH 075/229] testing --- fdog/fDOGassembly.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 524b83f..07dbe83 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -384,6 +384,10 @@ def backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, eva return list(orthologs), seed def addSequences(sequenceIds, candidate_fasta, core_fasta, output, name, species_list, refBool, tmp_path): + print(output) + print(refBool) + print(core_fasta) + print(species_list) output_file = open(output, "a+") if refBool == False: @@ -739,10 +743,11 @@ def main(): if regions == 0: #no candidat region are available, no ortholog can be found - reciprocal_sequences = 0 + print("No candidate region found") if refBool == True: - print("No candidate region found") continue + else: + reciprocal_sequences = 0 else: print(str(number_regions) + " candiate regions were found. 
Extracting sequences...") extract_seq(regions, db_path, tmp_path, mode) @@ -766,7 +771,8 @@ def main(): ################## checking accepted genes for co-orthologs ################ if reciprocal_sequences == 0: - print("No ortholog fulfilled the reciprocity criteria") + if regions != 0: + print("No ortholog fulfilled the reciprocity criteria") if searchTaxon == '' and refBool == True: continue else: From 3f46b83ad88816c741779f6a378e5f4ace1a6a11 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Wed, 28 Apr 2021 14:15:41 +0200 Subject: [PATCH 076/229] testing --- fdog/fDOGassembly.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 07dbe83..09ac05e 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -747,6 +747,7 @@ def main(): if refBool == True: continue else: + taxa = fdog_ref_species reciprocal_sequences = 0 else: print(str(number_regions) + " candiate regions were found. Extracting sequences...") @@ -765,6 +766,7 @@ def main(): continue else: reciprocal_sequences = 0 + taxa = fdog_ref_species else: reciprocal_sequences, taxa = backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, evalue, taxa, searchTool, checkCoorthologs, msaTool, matrix, dataPath, filter, tmp_path, mode) From b5924a81f6784730b6863c298025aafee79614ae Mon Sep 17 00:00:00 2001 From: mueli94 Date: Wed, 28 Apr 2021 14:21:39 +0200 Subject: [PATCH 077/229] testing --- fdog/fDOGassembly.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 09ac05e..ca89dd1 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -747,7 +747,7 @@ def main(): if refBool == True: continue else: - taxa = fdog_ref_species + taxa = [fdog_ref_species] reciprocal_sequences = 0 else: print(str(number_regions) + " candiate regions were found. 
Extracting sequences...") @@ -766,7 +766,7 @@ def main(): continue else: reciprocal_sequences = 0 - taxa = fdog_ref_species + taxa = [fdog_ref_species] else: reciprocal_sequences, taxa = backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, evalue, taxa, searchTool, checkCoorthologs, msaTool, matrix, dataPath, filter, tmp_path, mode) From 490f43cc42b3e8122441f12dcded7cb8f1a26a7b Mon Sep 17 00:00:00 2001 From: mueli94 Date: Wed, 28 Apr 2021 15:05:58 +0200 Subject: [PATCH 078/229] added function to clean up .domain files --- fdog/fDOGassembly.py | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index ca89dd1..3c837dd 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -384,10 +384,6 @@ def backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, eva return list(orthologs), seed def addSequences(sequenceIds, candidate_fasta, core_fasta, output, name, species_list, refBool, tmp_path): - print(output) - print(refBool) - print(core_fasta) - print(species_list) output_file = open(output, "a+") if refBool == False: @@ -489,16 +485,17 @@ def coorthologs(candidate_names, tmp_path, candidatesFile, fasta, fdog_ref_speci return checked -def changes_for_fas(file, header, mode): - #def replace_first_line( src_filename, target_filename, replacement_line): - f_in = open(file) - first_line, remainder = f.readline(), f.read() - line = first_line.split("|")[0] - f_in.close() - f_out = open(file + "s","w") - f_out.write(line + "\n") - f_out.write(remainder) - f_out.close() +def clean_fas(path): + file = open(path, "r") + lines = file.readlines() + file.close() + file.open(path,"w") + + for line in lines: + long_id, remain = line.split("#") + id = long_id.split("|")[0] + new_line = id + "#" + remain + file.write(new_line) class Logger(object): def __init__(self, file): @@ -811,6 +808,8 @@ def main(): # bug in calcFAS when using --tsv, have to wait till 
it's fixed before I can use the option cmd = 'calcFAS --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group starting_subprocess(cmd, mode) + clean_fas(group + "_forward.domains") + clean_fas(group + "_reverse.domains") ################# remove tmp folder ######################################## if searchTaxon != '': cleanup(tmp, tmp_path) From 07c693d795385bfd0d1941271e8228aa6c71c240 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Wed, 28 Apr 2021 15:15:11 +0200 Subject: [PATCH 079/229] testing --- fdog/fDOGassembly.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 3c837dd..d50bfe8 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -808,8 +808,8 @@ def main(): # bug in calcFAS when using --tsv, have to wait till it's fixed before I can use the option cmd = 'calcFAS --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group starting_subprocess(cmd, mode) - clean_fas(group + "_forward.domains") - clean_fas(group + "_reverse.domains") + clean_fas(out + group + "_forward.domains") + clean_fas(out + group + "_reverse.domains") ################# remove tmp folder ######################################## if searchTaxon != '': cleanup(tmp, tmp_path) From 3d804229698eb08161c2edd537dec774f6470a70 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Wed, 28 Apr 2021 15:25:05 +0200 Subject: [PATCH 080/229] testing --- fdog/fDOGassembly.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index d50bfe8..75e10f1 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -489,7 +489,7 @@ def clean_fas(path): 
file = open(path, "r") lines = file.readlines() file.close() - file.open(path,"w") + file = open(path,"w") for line in lines: long_id, remain = line.split("#") From acdb6fe068a7d221d780d651660d6da6c45a830c Mon Sep 17 00:00:00 2001 From: mueli94 Date: Wed, 28 Apr 2021 15:47:07 +0200 Subject: [PATCH 081/229] testing --- fdog/fDOGassembly.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 75e10f1..23359d3 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -401,7 +401,6 @@ def addSequences(sequenceIds, candidate_fasta, core_fasta, output, name, species for entry_candidate in seq_records_candidate: if entry_candidate.id in sequenceIds: if entry_candidate.id == sequenceIds[0]: - print(entry_candidate.id) output_file.write(">" + entry_candidate.id + "|1" + "\n") output_file.write(str(entry_candidate.seq) + "\n") else: @@ -485,16 +484,22 @@ def coorthologs(candidate_names, tmp_path, candidatesFile, fasta, fdog_ref_speci return checked -def clean_fas(path): +def clean_fas(path, file_type): file = open(path, "r") lines = file.readlines() file.close() file = open(path,"w") for line in lines: - long_id, remain = line.split("#") - id = long_id.split("|")[0] - new_line = id + "#" + remain + if file_type == 'domains': + long_id, remain = line.split("#") + id = long_id.split("|")[0] + new_line = id + "#" + remain + else: + long_id, remain = line.split("\t") + id = long_id.split("|")[0] + new_line = id + "\t" + remain + file.write(new_line) class Logger(object): @@ -808,8 +813,9 @@ def main(): # bug in calcFAS when using --tsv, have to wait till it's fixed before I can use the option cmd = 'calcFAS --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group starting_subprocess(cmd, mode) - clean_fas(out + group + 
"_forward.domains") - clean_fas(out + group + "_reverse.domains") + clean_fas(out + group + "_forward.domains", 'domains') + clean_fas(out + group + "_reverse.domains", 'domains') + clean_fas(out + group + ".phyloprofile", 'phyloprofile') ################# remove tmp folder ######################################## if searchTaxon != '': cleanup(tmp, tmp_path) From 38aca29591e1a54430a7e395bad343657a13aef8 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Wed, 28 Apr 2021 15:58:34 +0200 Subject: [PATCH 082/229] testing --- fdog/fDOGassembly.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 23359d3..a021483 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -496,7 +496,7 @@ def clean_fas(path, file_type): id = long_id.split("|")[0] new_line = id + "#" + remain else: - long_id, remain = line.split("\t") + long_id, remain = line.split("\t", 1) id = long_id.split("|")[0] new_line = id + "\t" + remain From f46cdc0e65616bf95a13f8e69268092584399419 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Tue, 11 May 2021 15:59:56 +0200 Subject: [PATCH 083/229] improve user output --- fdog/fDOGassembly.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index a021483..d5184b2 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -149,8 +149,8 @@ def augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, aug name = key + "_" + str(counter) # augutus call cmd = "augustus --protein=1 --proteinprofile=" + profile_path + " --predictionStart=" + start + " --predictionEnd=" + end + " --species=" + augustus_ref_species + " " + tmp_path + key + ".fasta > " + tmp_path + name + ".gff" - #result = subprocess.run(cmd, stdout = subprocess.PIPE, stderr = subprocess.PIPE, shell=True) - starting_subprocess(cmd, mode) + #print(cmd) + starting_subprocess(cmd, 'silent') # transfer augustus output to as sequence cmd = 
"getAnnoFasta.pl --seqfile=" + tmp_path + key + ".fasta " + tmp_path + name + ".gff" starting_subprocess(cmd, mode) @@ -396,6 +396,7 @@ def addSequences(sequenceIds, candidate_fasta, core_fasta, output, name, species output_file.write(str(entry_core.seq) + "\n") if sequenceIds != 0: + #print(sequenceIds) seq_records_candidate = readFasta(candidate_fasta) seq_records_candidate = list(seq_records_candidate) for entry_candidate in seq_records_candidate: @@ -677,7 +678,7 @@ def main(): print("Building a block profile for gene " + group + " \n") cmd = 'msa2prfl.pl ' + msa_path + ' --setname=' + group + ' >' + profile_path - starting_subprocess(cmd, mode) + starting_subprocess(cmd, 'silent') if int(os.path.getsize(profile_path)) > 0: print("block profile is finished \n") @@ -689,7 +690,7 @@ def main(): starting_subprocess(cmd, mode) cmd = 'msa2prfl.pl ' + new_path + ' --setname=' + group + ' >' + profile_path #print(cmd) - starting_subprocess(cmd, mode) + starting_subprocess(cmd, 'silent') print("block profile is finished \n") searchBool = False @@ -798,7 +799,7 @@ def main(): cmd = 'mkdir ' + tmp_path + 'anno_dir' starting_subprocess(cmd, 'silent') cmd = 'calcFAS --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group + '_' + asName - starting_subprocess(cmd, mode) + starting_subprocess(cmd, 'silent') #if we searched in more than one Taxon and no ortholog was found if refBool == False and searchTaxon == '': @@ -812,7 +813,7 @@ def main(): fas_seed_id = createFasInput(orthologsOutFile, mappingFile) # bug in calcFAS when using --tsv, have to wait till it's fixed before I can use the option cmd = 'calcFAS --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + 
' --out_name ' + group - starting_subprocess(cmd, mode) + starting_subprocess(cmd, 'silent') clean_fas(out + group + "_forward.domains", 'domains') clean_fas(out + group + "_reverse.domains", 'domains') clean_fas(out + group + ".phyloprofile", 'phyloprofile') From b662346b1a96358729427f630685957e60058ad5 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Mon, 31 May 2021 13:20:19 +0200 Subject: [PATCH 084/229] fdog.assembly started with fDOG is always silent --- .DS_Store | Bin 6148 -> 6148 bytes fdog/.DS_Store | Bin 8196 -> 8196 bytes fdog/bin/oneSeq.pl | 2 +- 3 files changed, 1 insertion(+), 1 deletion(-) diff --git a/.DS_Store b/.DS_Store index 824f712743a6414728f27d69a840e656771a9cdf..bcbd073c8626ea73a8116c4f66a9c94aeb88f9c8 100644 GIT binary patch delta 34 pcmZoMXffDuo<-Ei&`3wY(8#P-N1@u#$lOpz!PLTh^Ai>YVF0V92}l3{ delta 34 ncmZoMXffDuo<-Eqz(hyE(Acn6N1@u#2*fcrG1>ftML`$)ct%kpLn9ppLnE_V9ffK`BXdI?1yc+2%?*M}8727{ z(imJA{2AOC5*dntFqI*fA&DV}p>ncXx^l*k4q0K|bT3IG5A delta 49 zcmV-10M7q}K!iY$PXQCLP`eKS6SE8uUjdWD5=yhR5pV&suM=1Vk${S`2N?DQvxgOc H0+E1+rmhjk diff --git a/fdog/bin/oneSeq.pl b/fdog/bin/oneSeq.pl index 7139af7..7e8a248 100755 --- a/fdog/bin/oneSeq.pl +++ b/fdog/bin/oneSeq.pl @@ -701,7 +701,7 @@ if ($assembly){ $eval_blast = sprintf("%f", $eval_blast); if ($seqFile ne "") { - my @assembly_cmd = ("fdog.assembly", "--gene " . $seqName, "--augustusRefSpec ". $augustusRefSpec, "--refSpec " . $refSpec, "--dataPath " . $dataPath); + my @assembly_cmd = ("fdog.assembly", "--gene " . $seqName, "--augustusRefSpec ". $augustusRefSpec, "--refSpec " . $refSpec, "--dataPath " . 
$dataPath, "--silent"); if (defined $assemblyPath){ push(@assembly_cmd, "--assemblyPath $assemblyPath") From a751205c0bdc4832cb26a8955b3a04e05f332046 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Mon, 31 May 2021 13:41:04 +0200 Subject: [PATCH 085/229] testing --- fdog/bin/oneSeq.pl | 2 +- fdog/fDOGassembly.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fdog/bin/oneSeq.pl b/fdog/bin/oneSeq.pl index 7e8a248..7139af7 100755 --- a/fdog/bin/oneSeq.pl +++ b/fdog/bin/oneSeq.pl @@ -701,7 +701,7 @@ if ($assembly){ $eval_blast = sprintf("%f", $eval_blast); if ($seqFile ne "") { - my @assembly_cmd = ("fdog.assembly", "--gene " . $seqName, "--augustusRefSpec ". $augustusRefSpec, "--refSpec " . $refSpec, "--dataPath " . $dataPath, "--silent"); + my @assembly_cmd = ("fdog.assembly", "--gene " . $seqName, "--augustusRefSpec ". $augustusRefSpec, "--refSpec " . $refSpec, "--dataPath " . $dataPath); if (defined $assemblyPath){ push(@assembly_cmd, "--assemblyPath $assemblyPath") diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index d5184b2..8884fba 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -396,7 +396,7 @@ def addSequences(sequenceIds, candidate_fasta, core_fasta, output, name, species output_file.write(str(entry_core.seq) + "\n") if sequenceIds != 0: - #print(sequenceIds) + print(sequenceIds) seq_records_candidate = readFasta(candidate_fasta) seq_records_candidate = list(seq_records_candidate) for entry_candidate in seq_records_candidate: From eb9f585088bad8b476c02add5fbd8a78bead8c84 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Mon, 31 May 2021 13:54:32 +0200 Subject: [PATCH 086/229] testing output --- fdog/fDOGassembly.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 8884fba..4e9e6be 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -396,7 +396,6 @@ def addSequences(sequenceIds, candidate_fasta, core_fasta, output, name, species 
output_file.write(str(entry_core.seq) + "\n") if sequenceIds != 0: - print(sequenceIds) seq_records_candidate = readFasta(candidate_fasta) seq_records_candidate = list(seq_records_candidate) for entry_candidate in seq_records_candidate: @@ -800,6 +799,11 @@ def main(): starting_subprocess(cmd, 'silent') cmd = 'calcFAS --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group + '_' + asName starting_subprocess(cmd, 'silent') + clean_fas(out + group + "_forward.domains", 'domains') + clean_fas(out + group + "_reverse.domains", 'domains') + clean_fas(out + group + ".phyloprofile", 'phyloprofile') + + #if we searched in more than one Taxon and no ortholog was found if refBool == False and searchTaxon == '': From bb3c148b46b874865e67314a88b07b443c9dcfeb Mon Sep 17 00:00:00 2001 From: mueli94 Date: Mon, 31 May 2021 14:00:32 +0200 Subject: [PATCH 087/229] testing --- fdog/fDOGassembly.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 4e9e6be..1b84a1e 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -631,6 +631,10 @@ def main(): out = os.getcwd() os.system('mkdir ' + out + '/' + group + ' >/dev/null 2>&1') out = out + '/' + group + '/' + else: + if out[-1] != "/": + out = out + "/" + try: f = open(out + "/fdog.log", "a+") From be2b9d4b3b1ea5a5e8ba214ff0c5d5754a4f82e8 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Mon, 31 May 2021 14:01:43 +0200 Subject: [PATCH 088/229] testing --- fdog/mergeAssemblyOutput.py | 1 + 1 file changed, 1 insertion(+) diff --git a/fdog/mergeAssemblyOutput.py b/fdog/mergeAssemblyOutput.py index ea6e084..11d5c36 100644 --- a/fdog/mergeAssemblyOutput.py +++ b/fdog/mergeAssemblyOutput.py @@ -107,6 +107,7 @@ def main(): set_fasta = header if cleanup == True: os.remove(directory + '/' +infile) + os.system("rm *.tsv") 
if phyloprofile: phyloprofile.close() From 6fbd5aadcc9ee3151ddfd1fb75a8e882b83bf1b2 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Mon, 31 May 2021 14:06:19 +0200 Subject: [PATCH 089/229] testing --- fdog/fDOGassembly.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 1b84a1e..de9f343 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -803,9 +803,9 @@ def main(): starting_subprocess(cmd, 'silent') cmd = 'calcFAS --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group + '_' + asName starting_subprocess(cmd, 'silent') - clean_fas(out + group + "_forward.domains", 'domains') - clean_fas(out + group + "_reverse.domains", 'domains') - clean_fas(out + group + ".phyloprofile", 'phyloprofile') + clean_fas(fasOutFile + "_forward.domains", 'domains') + clean_fas(fasOutFile + "_reverse.domains", 'domains') + clean_fas(fasOutFile + ".phyloprofile", 'phyloprofile') #if we searched in more than one Taxon and no ortholog was found From 34d683c8aaa9529344b070a0fdccaebce77a10f3 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Mon, 31 May 2021 14:07:48 +0200 Subject: [PATCH 090/229] testing --- fdog/mergeAssemblyOutput.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdog/mergeAssemblyOutput.py b/fdog/mergeAssemblyOutput.py index 11d5c36..79a1306 100644 --- a/fdog/mergeAssemblyOutput.py +++ b/fdog/mergeAssemblyOutput.py @@ -107,7 +107,7 @@ def main(): set_fasta = header if cleanup == True: os.remove(directory + '/' +infile) - os.system("rm *.tsv") + os.system("rm " + directory + "/*.tsv") if phyloprofile: phyloprofile.close() From f9504745c247595c867669695d8b302fd30571a7 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Mon, 31 May 2021 14:14:29 +0200 Subject: [PATCH 091/229] testing --- fdog/bin/oneSeq.pl | 2 +- 
fdog/mergeAssemblyOutput.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fdog/bin/oneSeq.pl b/fdog/bin/oneSeq.pl index 7139af7..7e8a248 100755 --- a/fdog/bin/oneSeq.pl +++ b/fdog/bin/oneSeq.pl @@ -701,7 +701,7 @@ if ($assembly){ $eval_blast = sprintf("%f", $eval_blast); if ($seqFile ne "") { - my @assembly_cmd = ("fdog.assembly", "--gene " . $seqName, "--augustusRefSpec ". $augustusRefSpec, "--refSpec " . $refSpec, "--dataPath " . $dataPath); + my @assembly_cmd = ("fdog.assembly", "--gene " . $seqName, "--augustusRefSpec ". $augustusRefSpec, "--refSpec " . $refSpec, "--dataPath " . $dataPath, "--silent"); if (defined $assemblyPath){ push(@assembly_cmd, "--assemblyPath $assemblyPath") diff --git a/fdog/mergeAssemblyOutput.py b/fdog/mergeAssemblyOutput.py index 79a1306..6c865a1 100644 --- a/fdog/mergeAssemblyOutput.py +++ b/fdog/mergeAssemblyOutput.py @@ -107,7 +107,7 @@ def main(): set_fasta = header if cleanup == True: os.remove(directory + '/' +infile) - os.system("rm " + directory + "/*.tsv") + os.system("rm " + directory + "/'*.tsv'") if phyloprofile: phyloprofile.close() From 0b129a293cd1fcf30770883e1796bd830f8e4dee Mon Sep 17 00:00:00 2001 From: mueli94 Date: Mon, 31 May 2021 14:28:05 +0200 Subject: [PATCH 092/229] removing automatically .tsv files if existing --- fdog/mergeAssemblyOutput.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fdog/mergeAssemblyOutput.py b/fdog/mergeAssemblyOutput.py index 6c865a1..1606b1d 100644 --- a/fdog/mergeAssemblyOutput.py +++ b/fdog/mergeAssemblyOutput.py @@ -107,7 +107,8 @@ def main(): set_fasta = header if cleanup == True: os.remove(directory + '/' +infile) - os.system("rm " + directory + "/'*.tsv'") + elif infile.endswith('.tsv'): + os.remove(directory + '/' + infile) if phyloprofile: phyloprofile.close() From 6c6b1258f1376b0cff530e1492c7a40200946915 Mon Sep 17 00:00:00 2001 From: mueli94 <47216555+mueli94@users.noreply.github.com> Date: Mon, 31 May 2021 15:35:20 +0200 Subject: 
[PATCH 093/229] Fdog goes assembly (#8) * testing * shorten long header for addTaxon, check for long headers in oneseq and checkData * testing * testing * testing * changed path in hamstr.pl to current directory * changed path in hamstr.pl to current directory * testing * testing * testing * testing * testing * testing * bug fix * bug fix * fixed error mapping ID file not found * testing * testing * testing * test * test * testing * testing * testing * testing * fDOGassembly is working on complete assembly_dir * bug fix * bug fix * enabled option -filter for blastp search * bug fix fasoff * testing --strict option * bug fix in --strict option, output is corrected * bug fix in --checkCoorthologsRef * bug fix * clean up * bug fix * adapted handling of variable dataPath * testing * testing * testing * testing * test * test * test * test * test * test * testing * bug fix assemblyDir * testing * testing * testing search taxa * test * enable --searchTaxa option in fdog.assembly * bug fix * testing * testing --searchTaxa adaption * testing * test * test * write debug files to output dir * skip fa.mapping while checking genome_dir * testing * bug fix * testing * bug fix * bug fix * path fix in augustus_ppx * bug fix * bug fix * bug fix * bug fix * bug fix * bug fix * bug fix * bug fix * bug fix * bug fix * bug fix * bug fix * bug fix * bug fix * bug fix * bug fix * bug fix * bug fix * bug fix * testing * testing * added new python script to merge Assembly output from the same Gene but different searchTaxa * added option to merge Assembly output after fDOG calls fdog.assembly multiple times with different searchTaxa * bug fix * corrected fdog.mergeAssembly call * testing * testing * testing * test * moved fdog.mergeAssembly call to another place * testing * testing * testing * testing * testing * testing * corrected fdog.mergeAssembly call * testing * testing * testing * testing * test * disable weight_dir check if option --assembly is used * adapted fdog.assembly call * 
adapted calcFAS call to deactivate .tsv output * testing * testing * bug fix in function backward search used with option --strict * testing new added option --silent * added more checks to fdogs.run * bug fix * testing * testing * testing * bug fix * bug fix * testing * testing silent mode * testing --silent * symlinks for fasta36 input; improved fdogs.run according to #5 * testing * testing * testing * testing * tetsing * testing * testing * testing * testing * testing * testing * testing * test * test * testing * testing new function to identify coorthologs * testing * testing * testing * testing * testing * testing * testing * testing * testing * finished function coorthologs * bug fix runSingle.py * cleaning output * testing * testing * testing * testing * testing * testing * testing * testing * testing * testing * testing * testing * bug fix if augutus can't idetify a gene at a candidate region * testing * bug fix * bug fix * cleaning up * testing * testing * testing * testing * bug fix in merge function, regions in minus strand were not merged correctly * testing * testing * testing * testing * testing * bug fix * testing * testing * testing * testing * testing * clean up * testing * testing * testing * testing * bug fix * testing new tblastn call * testing * testing * testing * testing * testing * code clean up * clean up code * clean up * clean up * reduce output * clean up code * check augustus * testing * adding option to recognize if co-ortholog or not in header of the extended.fa * testing * testing * testing * testing * testing * testing * testing * testing * added function starting_subprocess() to handle call of extern tools more easily * added augustus to dependencies * testing * bug fix * testing * testing * testing * testing * testing * testing * testing * testing * testing * added function to clean up .domain files * testing * testing * testing * testing * improve user output * fdog.assembly started with fDOG is always silent * testing * testing 
output * testing * testing * testing * testing * testing * removing automatically .tsv files if existing Co-authored-by: trvinh --- .DS_Store | Bin 6148 -> 6148 bytes .github/workflows/github_build.yml | 51 ++ .gitignore | 3 +- .travis.yml | 23 +- README.md | 3 +- fdog/.DS_Store | Bin 8196 -> 8196 bytes fdog/addTaxa.py | 3 +- fdog/addTaxon.py | 16 +- fdog/bin/hamstr.pl | 103 +-- fdog/bin/oneSeq.pl | 340 ++++++---- fdog/checkData.py | 11 +- fdog/data/.DS_Store | Bin 8196 -> 6148 bytes fdog/fDOGassembly.py | 837 ++++++++++++++++++++++++ fdog/fdog_goes_assembly/.DS_Store | Bin 6148 -> 0 bytes fdog/fdog_goes_assembly/fDOGassembly.py | 209 ------ fdog/mergeAssemblyOutput.py | 124 ++++ fdog/runMulti.py | 207 ++++-- fdog/runSingle.py | 149 ++++- fdog/setup/setup.sh | 3 +- fdog/setup/setup_conda.sh | 6 +- fdog/setupfDog.py | 6 +- setup.py | 10 +- 22 files changed, 1619 insertions(+), 485 deletions(-) create mode 100644 .github/workflows/github_build.yml create mode 100644 fdog/fDOGassembly.py delete mode 100644 fdog/fdog_goes_assembly/.DS_Store delete mode 100644 fdog/fdog_goes_assembly/fDOGassembly.py create mode 100644 fdog/mergeAssemblyOutput.py diff --git a/.DS_Store b/.DS_Store index c84405d9d29ae54bb91cc188eb50403196c8adc3..bcbd073c8626ea73a8116c4f66a9c94aeb88f9c8 100644 GIT binary patch delta 171 zcmZoMXfc=|#>B!ku~2NHo+2a1#(>?7i&&T#IVSTk*)yJ+?8DTcC0Si1vqhM-bUR%q_A+Bm@>zR;SSyf$ATQ_U+S0;JJ*^@t{E_)P^JIPz RM-E1y98d|v<_M8B%m6dnE297a delta 121 zcmZoMXfc=|#>B)qu~2NHo+2ar#(>?7jO>$nSnL^3PWEAG(3GgIHZ(BNQ7|+%tkqGd zwlp%(Q7|*KtgYqb5LY#{^-RdEtg5c5t(!Ud8;d;StjS`m4>yZ(h_YvOFr1p~CD@=PSzT>pXr!ZH zXk=Eaqfl*VWNxUVU}|ArTg%BIu4-uOnUGsqRb5kCH*2zwu)O$ephjkfB!*0eyt3e; zyqx^Jbf5_1k( y8Ck>T1;T=i+^k?@OBhNS3MShL-VjIPBRgjEI*|aT&Fm82SSFtlT8u}PFB1SkZfHaR delta 203 zcmZp1XmOa}&nUDpU^hRb&}1Hglao&imQ21Za&|JWu!OP@LmER0Lq0<~Lp%`YGo&&U z16hecvKYwDWk_PknS4=H#E4I})m1V(2c{%xc=|GbiH?I*E cWZb+<)R$>v11IBVc8PB+o7uz|fjl-w0N=DXc>n+a diff --git a/fdog/addTaxa.py b/fdog/addTaxa.py index 1c83bb5..d392c8c 100644 
--- a/fdog/addTaxa.py +++ b/fdog/addTaxa.py @@ -95,7 +95,7 @@ def runAddTaxon(args): sys.exit('Problem running\n%s' % (cmd)) def main(): - version = '0.0.1' + version = '0.0.5' parser = argparse.ArgumentParser(description='You are running fdog.addTaxa version ' + str(version) + '.') required = parser.add_argument_group('required arguments') optional = parser.add_argument_group('optional arguments') @@ -125,6 +125,7 @@ def main(): sys.exit('No pathconfig.txt found. Please run fdog.setup (https://github.com/BIONF/fDOG/wiki/Installation#setup-fdog).') with open(pathconfigFile) as f: outPath = f.readline().strip() + outPath = os.path.abspath(outPath) noAnno = args.noAnno coreTaxa = args.coreTaxa oldFAS = args.oldFAS diff --git a/fdog/addTaxon.py b/fdog/addTaxon.py index e09f1e4..fe0a810 100755 --- a/fdog/addTaxon.py +++ b/fdog/addTaxon.py @@ -77,13 +77,13 @@ def runBlast(args): subprocess.call([blastCmd], shell = True) except: sys.exit('Problem with running %s' % blastCmd) - fileInGenome = "%s/genome_dir/%s/%s.fa" % (outPath, specName, specName) + fileInGenome = "../../genome_dir/%s/%s.fa" % (specName, specName) fileInBlast = "%s/blast_dir/%s/%s.fa" % (outPath, specName, specName) if not Path(fileInBlast).exists(): os.symlink(fileInGenome, fileInBlast) def main(): - version = '0.0.2' + version = '0.0.5' parser = argparse.ArgumentParser(description='You are running fdog.addTaxon version ' + str(version) + '.') required = parser.add_argument_group('required arguments') optional = parser.add_argument_group('optional arguments') @@ -115,6 +115,7 @@ def main(): sys.exit('No pathconfig.txt found. 
Please run fdog.setup (https://github.com/BIONF/fDOG/wiki/Installation#setup-fdog).') with open(pathconfigFile) as f: outPath = f.readline().strip() + outPath = os.path.abspath(outPath) noAnno = args.noAnno coreTaxa = args.coreTaxa ver = str(args.verProt) @@ -152,10 +153,13 @@ def main(): seq = str(inSeq[id].seq) # check ID id = re.sub('\|', '_', id) - if len(id) > 80: - # modIdIndex = modIdIndex + 1 - # id = specName + "_" + str(modIdIndex) + oriId = id + if len(id) > 30: + modIdIndex = modIdIndex + 1 + id = specName + "_" + str(modIdIndex) longId = 'yes' + with open(specFile + '.mapping', 'a') as mappingFile: + mappingFile.write('%s\t%s\n' % (id, oriId)) if not id in tmpDict: tmpDict[id] = 1 else: @@ -184,7 +188,7 @@ def main(): cf.close() # warning about long header if longId == 'yes': - print('\033[91mWARNING: Headers are longer than 80 characters. It could cause some troubles!\033[0m') + print('\033[91mWARNING: Some headers longer than 80 characters have been automatically shortened. 
PLease check the %s.mapping file for details!\033[0m' % specFile) else: print(genomePath + '/' + specName + '.fa already exists!') diff --git a/fdog/bin/hamstr.pl b/fdog/bin/hamstr.pl index 37ae73a..7ff125e 100755 --- a/fdog/bin/hamstr.pl +++ b/fdog/bin/hamstr.pl @@ -193,9 +193,11 @@ ## 13.07.2020 (v13.3.0 - vinh) solved problem when gene ID contains PIPE ## 22.07.2020 (v13.4.0 - vinh) moved tmp blast files to output folder and delete them when finished ## 01.12.2020 (v13.4.1 - vinh) add silent option to muscle for checkCoOrthologsRef +## 21.01.2021 (v13.4.2 - vinh) fiexed bug when refspec has "dot" in its name +## 19.03.2021 (v13.4.3 - vinh) changed $path to current directory ######################## start main ########################################### -my $version = "HaMStR v.13.4.1"; +my $version = "HaMStR v.13.4.4"; ######################## checking whether the configure script has been run ### my $configure = 0; if ($configure == 0){ @@ -214,8 +216,9 @@ my $filter = 'F'; # low complexity filter switch. Default 'on'. Set of 'F' to turn off permanently. 
my $eval_blast = 10; # default evalue cutoff for the blast search ########## EDIT THE FOLLOWING LINES TO MODIFY DEFAULT PATHS ################### -my $path = abs_path(dirname(__FILE__)); -$path =~ s/\/bin//; +# my $path = abs_path(dirname(__FILE__)); +# $path =~ s/\/bin//; +my $path = getcwd; my $hmmpath = "$path/core_orthologs"; #path where the hmms are located my $blastpath = "$path/blast_dir"; #path to the blast-dbs my $outpath = '.'; @@ -223,10 +226,10 @@ my $hmm_dir = 'hmm_dir'; my $fa_dir = 'fa_dir'; ############################## -my $termios = new POSIX::Termios; $termios->getattr; -my $ospeed = $termios->getospeed; -my $t = Tgetent Term::Cap { TERM => undef, OSPEED => $ospeed }; -my ($norm, $under, $bold) = map { $t->Tputs($_,1) } qw/me md us/; +# my $termios = new POSIX::Termios; $termios->getattr; +# my $ospeed = $termios->getospeed; +# my $t = Tgetent Term::Cap { TERM => undef, OSPEED => $ospeed }; +# my ($norm, $under, $bold) = map { $t->Tputs($_,1) } qw/me md us/; ############################## Variables ############## my $fileobj; @@ -322,16 +325,16 @@ } ## help message my $helpmessage = " -${bold}YOU ARE RUNNING $version on $hostname$norm +YOU ARE RUNNING $version on $hostname This program is freely distributed under a GPL. Copyright (c) GRL limited: portions of the code are from separate copyrights -\n${bold}USAGE:${norm} hamstr -sequence_file=<> -hmmset=<> -taxon=<> -refspec=<> [OPTIONS] +\nUSAGE: hamstr -sequence_file=<> -hmmset=<> -taxon=<> -refspec=<> [OPTIONS] -${bold}OPTIONS:$norm +OPTIONS: -${bold}REQUIRED$norm +REQUIRED -sequence_file=<> path and name of the file containing the sequences hmmer is run against. -hmmset=<> @@ -359,7 +362,7 @@ set this flag if you are searching in protein sequences. Note, if neither the -est nor the -protein flag is set, HaMStR will guess the sequence type. -${bold}USING NON-DEFAULT PATHS$norm +USING NON-DEFAULT PATHS -blastpath=<> Lets you specify the absolute or relative path to the blast databases. 
DEFAULT: $blastpath @@ -368,7 +371,7 @@ -outpath=<> You can determine the path to the HaMStR output. Default: current directory. -${bold}ADDITIONAL OPTIONS$norm +ADDITIONAL OPTIONS -append set this flag if the output should be appended to the files *.out and *_cds.out. This becomes relevant when running @@ -412,7 +415,7 @@ -hmm Option to provide only a single hmm to be used for the search. Note, this file has to end with .hmm --intron=<${bold}k${norm}eep|${bold}m${norm}ask|${bold}r${norm}emove> +-intron= Specify how to deal with introns that may occur in transcript sequences. Default: keep - Introns will be retained in the transcript but will be identified by lower case letters. -longhead @@ -512,7 +515,7 @@ ## 1) check if all information is available to run HaMStR ($check, @log) = &checkInput(); if ($check == 0) { - print "\n\n${bold}There was an error running $version$norm\n\n"; + print "\n\nThere was an error running $version\n\n"; print join "\n", @log; exit; } @@ -783,11 +786,11 @@ sub checkInput { my @coresets = (`ls $hmmpath`); chomp @coresets; if (scalar(@coresets > 0)){ - print "\n${bold}THE FOLLOWING CORE ORTHOLOG SETS ARE AVAILABLE IN $hmmpath:${norm}\n\n"; + print "\nTHE FOLLOWING CORE ORTHOLOG SETS ARE AVAILABLE IN $hmmpath:\n\n"; for (my $i = 0; $i < @coresets; $i++){ my @available = qw(); my @unavailable = qw(); - print "\n${bold}$coresets[$i]${norm}\n\n"; + print "\n$coresets[$i]\n\n"; my @refspec = `head -n 20 $hmmpath/$coresets[$i]/$coresets[$i].fa |$grepprog '>' |cut -d '|' -f 2 |sort |uniq`; chomp @refspec; for (my $j = 0; $j < @refspec; $j++){ @@ -807,7 +810,7 @@ sub checkInput { } } else { - print "\n${bold}NO CORE ORTHOLOG SETS ARE AVAILABLE! CHECK $hmmpath!${norm}\n\n"; + print "\nNO CORE ORTHOLOG SETS ARE AVAILABLE! 
CHECK $hmmpath!\n\n"; } print "\n\n"; exit; @@ -873,12 +876,17 @@ sub checkInput { } $dbfile =~ s/.*\///; - $dbfile_short = $dbfile; - $dbfile_short =~ s/\..*//; + # $dbfile_short = $dbfile; + # $dbfile_short =~ s/\..*//; + my @dbfileTMP = split(/\./, $dbfile); pop @dbfileTMP; + $dbfile_short = join(".", @dbfileTMP); if ($central) { $dboutpath = $dbpath; # print "setting dboutpath to $dboutpath"; } + + # print "HERERERERERERERERER $dbfile #################\n"; + # print "THENNNNNNNNNNNNNNNN $dbfile_short #################\n"; ## ## 0) Check for presence of the file with the sequences that should be hamstered if (-e "$dbpath/$dbfile") { @@ -886,7 +894,7 @@ sub checkInput { } else { #the provided infile does not exist: - push @log, "${bold}FATAL:${norm} The specified infile $dbpath/$dbfile does not exist. PLEASE PROVIDE A VALID INFILE!\n"; + push @log, "FATAL: The specified infile $dbpath/$dbfile does not exist. PLEASE PROVIDE A VALID INFILE!\n"; $check = 0; return ($check, @log); } @@ -952,7 +960,7 @@ sub checkInput { push @log, "Translated file already exists, using this one"; } if (! -e "$dboutpath/$dbfile") { - push @log, "${bold}FATAL:${norm} The translation of $dbfile_base failed. Check the script translate.pl"; + push @log, "FATAL: The translation of $dbfile_base failed. Check the script translate.pl"; print "failed\n"; $check = 0; } @@ -965,7 +973,7 @@ sub checkInput { push @log, "\nCHECKING FOR PROGRAMS\n"; printOUT("checking for the blast program:\t"); if (`which $blast_prog` =~ / no /) { - push @log, "${bold}FATAL:${norm} could not execute $blast_prog. Please check if this program is installed and executable"; + push @log, "FATAL: could not execute $blast_prog. Please check if this program is installed and executable"; print "failed\n"; $check = 0; } @@ -979,12 +987,12 @@ sub checkInput { printOUT("checking for hmmsearch:\t"); my $hmmcheck = `$prog -h |$grepprog -c 'HMMER 3'`; if (! `$prog -h`) { - push @log, "${bold}FATAL:${norm} could not execute $prog. 
Please check if this program is installed and executable"; + push @log, "FATAL: could not execute $prog. Please check if this program is installed and executable"; print "failed: $prog is not installed or not executable\n"; $check = 0; } elsif ($hmmcheck != 1) { - push @log, "${bold}FATAL:${norm} It seems that $prog is not from the HMMER 3 package. Please check!"; + push @log, "FATAL: It seems that $prog is not from the HMMER 3 package. Please check!"; print "failed: $prog is not from the HMMER 3 package\n"; $check = 0; } @@ -996,14 +1004,14 @@ sub checkInput { if ($check_genewise) { printOUT("checking for genewise:\t"); if (! `genewise -help`) { - push @log, "${bold}FATAL:${norm} Could not execute genewise. Please check if this program is installed and executable"; + push @log, "FATAL: Could not execute genewise. Please check if this program is installed and executable"; print "failed: genewise is not executable\n"; $check = 0; } else { my $gwcheck = `echo \$WISECONFIGDIR`; if (length($gwcheck) < 1) { - push @log, "${bold}FATAL:${norm} The environmental variable WISECONFIGDIR has not been set. I am expecting troubles when invoking genewise. + push @log, "FATAL: The environmental variable WISECONFIGDIR has not been set. I am expecting troubles when invoking genewise. Please consult the installation manual for genewise and set this variable"; print "failed: the environmental variable WISECONFIGDIR has not been set.\n"; $check = 0; @@ -1014,14 +1022,14 @@ sub checkInput { } } else { - push @log, "${bold}GENEWISE-CHECK skipped:${norm} The hamstr-script has been configured with the option --protein_only. To override this setting set reconfigure the script or set the variable $check_genewise to 1"; + push @log, "GENEWISE-CHECK skipped: The hamstr-script has been configured with the option --protein_only. 
To override this setting set reconfigure the script or set the variable $check_genewise to 1"; } ## 4) Check for presence of the directory structure push @log, "\nCHECKING FOR HMMs\n"; printOUT("checking for presence of the hmm files:\t"); if ( ! defined $hmmset or ! -e "$hmmpath/$hmmset") { - push @log, "${bold}FATAL:${norm} You need to specify a valid core ortholog set. Make also sure that you provide the path to this set if it is not in the default location $hmmpath. You can check available core ortholog sets using the option -show_hmmsets."; + push @log, "FATAL: You need to specify a valid core ortholog set. Make also sure that you provide the path to this set if it is not in the default location $hmmpath. You can check available core ortholog sets using the option -show_hmmsets."; print "failed\n"; $check = 0; } @@ -1033,7 +1041,7 @@ sub checkInput { ## 4b) check for the presence of the hmm-files and the fasta-file if (!(-e "$hmm_dir")) { - push @log, "${bold}FATAL:${norm} Could not find $hmm_dir"; + push @log, "FATAL: Could not find $hmm_dir"; print "failed\n"; $check = 0; } else { @@ -1043,7 +1051,7 @@ sub checkInput { ### check for the presence of all hmms for (my $k = 0; $k < @hmms; $k++) { if (! -e "$hmm_dir/$hmms[$k]") { - push @log, "${bold}FATAL:${norm} $hmms[$k] has been defined but could not be found in $hmm_dir/$hmms[$k]"; + push @log, "FATAL: $hmms[$k] has been defined but could not be found in $hmm_dir/$hmms[$k]"; $check = 0; last; } else { @@ -1073,7 +1081,7 @@ sub checkInput { } } else { - push @log, "${bold}FATAL:${norm} Please provide path and name of fasta file containing the core-ortholog sequences"; + push @log, "FATAL: Please provide path and name of fasta file containing the core-ortholog sequences"; $check = 0; print "failed\n"; } @@ -1086,7 +1094,7 @@ sub checkInput { $taxon_check = 2; } else { - push @log, "${bold}FATAL:${norm} No taxon_file found. 
Please provide a global taxon name using the option -taxon"; + push @log, "FATAL: No taxon_file found. Please provide a global taxon name using the option -taxon"; print "failed\n"; $check = 0; } @@ -1094,7 +1102,7 @@ sub checkInput { push @log, "\nCHECKING FOR REFERENCE TAXON\n"; printOUT("checking for reference species and blast-dbs:\t"); if (!(defined $refspec_string) and (! defined $strict and ! defined $relaxed)) { - push @log, "${bold}FATAL:${norm} Please provide a reference species for the reblast!"; + push @log, "FATAL: Please provide a reference species for the reblast!"; print "failed\n"; $check = 0; } @@ -1146,7 +1154,7 @@ sub checkInput { printOUT("succeeded\n"); } else { - push @log, "${bold}FATAL:${norm} please edit the blastpath. Could not find $blastpathtmp or blast database blastpathtmp.pin does not exist."; + push @log, "FATAL: please edit the blastpath. Could not find $blastpathtmp or blast database blastpathtmp.pin does not exist."; print "$blastpathtmp failed\n"; $check = 0; } @@ -1174,7 +1182,7 @@ sub checkInput { push @log, "\tinfile ready"; } else { #the provided reference fasta file does not exist or link to file does not exist: - push @log, "${bold}FATAL:${norm} FASTA file for the specified reference $refspec[$i] does not exist. PLEASE PROVIDE A VALID REFERENCE SPECIES!\n"; + push @log, "FATAL: FASTA file for the specified reference $refspec[$i] does not exist. PLEASE PROVIDE A VALID REFERENCE SPECIES!\n"; $check = 0; return ($check, @log); } @@ -1241,7 +1249,7 @@ sub checkInput { printOUT("checking for low complexity filter setting:\t"); $filter =~ tr/ft/FT/; if ($filter ne 'T' and $filter ne 'F') { - push @log, "${bold}FATAL:${norm} Filter is set to $filter. Please set the low complexity filter either to F or T."; + push @log, "FATAL: Filter is set to $filter. 
Please set the low complexity filter either to F or T."; print "low complexity filter check failed\n"; $check = 0; } @@ -1283,12 +1291,10 @@ sub checkInput { `rm -rf "$fa_dir_neu"`; `mkdir "$fa_dir_neu"`; } - if (!(-d "$tmpdir")) { - `mkdir "$tmpdir"`; - } - elsif (-d "$tmpdir" and $cleartmp) { + mkdir "$tmpdir" unless -d "$tmpdir"; + if (-d "$tmpdir" and $cleartmp) { `rm -rf "$tmpdir"`; - `mkdir "$tmpdir"`; + mkdir "$tmpdir" unless -d "$tmpdir"; } } ## 14) determin whether or not the -representative flag has been set @@ -1401,23 +1407,23 @@ sub check4reciprocity { my $suc = 0; # keeps track of success for a single taxon if ($checkCoRef == 0) { ## the user does not want to check further in case that id of best blast hit and of reference species differ - printOUT("core_orthologs: ", join "\t", @original_ids , "\n"); + printOUT("core_orthologs: @original_ids\n"); ## now loop through the best hits with the same score and check whether ## among these I find the same seq as in $original my $i = 0; while ($suc == 0 and $i <@$hits) { - printOUT("blast-hit: $hits->[$i]->{name}"); + printOUT("blast-hit: $hits->[$i]->{name}\n"); ## now loop through all the refspec-sequences in the hmm file; this is the case when co-orthologs have been determine in the core-ortholog my $j = 0; while ($suc == 0 and $j < @original_ids) { if ($original_ids[$j] eq $hits->[$i]->{name}) { - printOUT("\thitting\n"); + printOUT("hitting $original_ids[$j]\n"); $refspec_final->[$k]->{hit} = $j; $suc = 1; $relaxed_suc = 1; } else { - printOUT("\nnot hitting $original_ids[$j]\n"); + printOUT("not hitting $original_ids[$j]\n"); $j ++; } if ($suc == 1) { @@ -1468,7 +1474,7 @@ sub check4reciprocity { } ## print distances (debug mode) if ($debug){ - my $distDebugFile = $path . "/output/" . $taxon_global . ".debug.dist"; + my $distDebugFile = $outpath . "/" . $taxon_global . ".debug.dist"; #$path . "/output/" . $taxon_global . 
".debug.dist"; unless (-e $distDebugFile){ open (my $DISTDEBUG, ">>$distDebugFile") or die "Error, could not create file: ". "$distDebugFile"; print $DISTDEBUG "hmmset\trefid\tbestid\tqueryid\tqhdist\trhdist\n"; @@ -2024,9 +2030,8 @@ sub determineRefspecFinal { my $ac = 0; for (my $i = 0; $i < @refspec; $i++) { $fafile =~ s/\|/\\\|/g; - @original = `$grepprog -A 1 "^>$query_name|$refspec[$i]" $fafile |$sedprog -e "s/.*$refspec[$i]\|//"`; + @original = `$grepprog -A 1 "^>$query_name|$refspec[$i]" $fafile | grep -v "^\-\-\$" |$sedprog -e "s/.*$refspec[$i]\|//"`; chomp @original; - if (@original > 0) { $refspec_final->[$ac]->{refspec} = $refspec[$i]; $refspec_final->[$ac]->{searchdb} = "$blastpath/$refspec[$i]/$refspec[$i]" . $blastapp; diff --git a/fdog/bin/oneSeq.pl b/fdog/bin/oneSeq.pl index 61cae86..7e8a248 100755 --- a/fdog/bin/oneSeq.pl +++ b/fdog/bin/oneSeq.pl @@ -121,9 +121,15 @@ ## Modified 22. Sep 2020 v2.2.1 (Vinh) - make sure that seed sequence always at the beginning of extended.fa output ## Modified 23. Sep 2020 v2.2.3 (Vinh) - use full taxonomy name instead of abbr taxon name for LOG ## Modified 01. Dec 2020 v2.2.4 (Vinh) - fixed bug while creating final extended.fa (and replaced grep and sed by bioperl) +## Modified 16. Feb 2021 v2.2.5 (Vinh) - core compilation works with fasoff +## Modified 18. Feb 2021 v2.2.6 (Vinh) - fixed searchTaxa and coreTaxa options +## Modified 19. March 2021 v2.2.7 (Vinh) - check for long sequence ID +## Modified 24. March 2021 v2.2.8 (Vinh) - skip fa.mapping while checking genome_dir +## Modified 29. 
March 2021 v2.2.9 (Vinh) - check for zero $maxAlnScore +## - solved problem with long input path for fasta36 tools ############ General settings -my $version = 'oneSeq v.2.2.4'; +my $version = 'oneSeq v.2.2.9'; ##### configure for checking if the setup.sh script already run my $configure = 0; if ($configure == 0){ @@ -133,10 +139,10 @@ my $hostname = `hostname`; chomp $hostname; ############# -my $termios = new POSIX::Termios; $termios->getattr; -my $ospeed = $termios->getospeed; -my $t = Tgetent Term::Cap { TERM => undef, OSPEED => $ospeed }; -my ($norm, $under, $bold) = map { $t->Tputs($_,1) } qw/me md us/; +# my $termios = new POSIX::Termios; $termios->getattr; +# my $ospeed = $termios->getospeed; +# my $t = Tgetent Term::Cap { TERM => undef, OSPEED => $ospeed }; +# my ($norm, $under, $bold) = map { $t->Tputs($_,1) } qw/me md us/; #### Paths my $path = abs_path(dirname(__FILE__)); $path =~ s/\/bin//; @@ -166,7 +172,7 @@ my $blast_prog = 'blastp'; my $outputfmt = 'blastxml'; my $eval_blast_query = 0.0001; -my $filter = 'T'; +my $filter = 'F'; # default for blastp my $annotation_prog = "annoFAS"; my $fas_prog = "calcFAS"; my $fdogFAS_prog = "fdogFAS"; @@ -197,6 +203,7 @@ my $idx_dir = "$path/taxonomy/"; my $dataDir = $path . 
'/data'; my $weightPath = "$path/weight_dir/"; +my $assembly_dir = "$path/assembly_dir/"; my @defaultRanks = ( 'superkingdom', 'kingdom', @@ -300,6 +307,15 @@ my %hashTree; my $aln = 'muscle'; my $searchTaxa; +#variables for fdog_goes_assembly +my $assembly; +my $augustusRefSpec; +my $avIntron; +my $lengthExtension; +my $assemblyPath; +my $searchTool = 'blast'; +my $matrix = 'blosum62'; +my $dataPath = ''; ################# Command line options GetOptions ( "h" => \$help, @@ -361,7 +377,15 @@ "distDeviation=s" => \$distDeviation, "aligner=s" => \$aln, "hyperthread" => \$hyperthread, - "searchTaxa=s" => \$searchTaxa + "searchTaxa=s" => \$searchTaxa, + "assembly" => \$assembly, + "assemblypath=s" => \$assemblyPath, + "augustusRefSpec=s" => \$augustusRefSpec, + "avIntron=s" => \$avIntron, + "lengthExtension=s" => \$lengthExtension, + "searchTool=s" => \$searchTool, + "scoringmatrix=s" => \$matrix, + "dataPath=s" => \$dataPath ); $outputPath = abs_path($outputPath); @@ -373,6 +397,8 @@ $weightPath = abs_path($weightPath)."/"; $genome_dir = abs_path($genome_dir)."/"; $taxaPath = $genome_dir; +$dataPath = abs_path($dataPath)."/"; +$assembly_dir = abs_path($assemblyPath)."/"; ############# do initial check if (!defined $help && !defined $getversion) { #} && !defined $showTaxa) { @@ -381,7 +407,7 @@ initialCheck($seqFile, $seqName, $blastPath, $taxaPath, $weightPath, $fasoff); print "Check finished in " . roundtime(gettime() - $checkStTime). 
" sec!\n"; - if (!defined $coreex) { + if (!defined $coreex && !defined $assembly) { if (!grep(/$minDist/, @defaultRanks)) { die "ERROR: minDist $minDist invalid!\n"; } @@ -464,7 +490,7 @@ # create weight_dir in oneseq's home dir (used for annotations,weighting,feature extraction) # get annotations for seed sequence if fas support is on -if ($fas_support){ +if ($fas_support && !$assembly){ if (!$weightPath) { createWeightFolder(); } @@ -473,7 +499,7 @@ my $coreStTime = gettime(); #time; #core-ortholog search -if (!$coreex) { +if (!$coreex && !$assembly) { print "\nCore compiling...\n"; $coremode = 1; $taxaPath = $blastPath; @@ -562,11 +588,14 @@ } } printDebug("The maximum alignmentscore is: $maxAlnScore"); + if ($maxAlnScore == 0) { + die("Maximum alignment score is Zero! Something went wrong with fasta36 functions!\n") + } clearTmpFiles(); my $addedTaxon = getBestOrtholog(); my $addedTaxonName = getTaxonName($addedTaxon); - print "Added TAXON: $addedTaxon\_$addedTaxonName\n"; + print "Added TAXON: $addedTaxon\t$addedTaxonName\n"; #if a new core ortholog was found if($addedTaxon ne "") { $hamstrSpecies = $hamstrSpecies . "," . 
$addedTaxon; @@ -608,12 +637,17 @@ my $final_eval_blast = $eval_blast*$eval_relaxfac; my $final_eval_hmmer = $eval_hmmer*$eval_relaxfac; - $taxaPath = $genome_dir; + if (!$assembly){ + $taxaPath = $genome_dir; + } + else{ + $taxaPath = $assembly_dir; + } my @searchTaxa; - unless($groupNode) { - @searchTaxa = keys %taxa; - } else { - unless ($searchTaxa) { + unless ($searchTaxa) { + unless($groupNode) { + @searchTaxa = keys %taxa; + } else { # %taxa = getTaxa(); # print "GET TAXA TIME: ", roundtime(gettime() - $startTmp),"\n"; my $tree = getTree(); @@ -629,11 +663,11 @@ foreach (get_leaves($tree)) { push(@searchTaxa, @{$_->name('supplied')}[0]); } - } else { - open(SEARCH, $searchTaxa) || die "Cannot open $searchTaxa file!\n"; - @searchTaxa = ; - close (SEARCH); } + } else { + open(SEARCH, $searchTaxa) || die "Cannot open $searchTaxa file!\n"; + @searchTaxa = ; + close (SEARCH); } # print "PREPARE TIME: ", roundtime(gettime() - $startTmp),"\n"; @@ -645,15 +679,82 @@ foreach (sort @searchTaxa) { chomp(my $searchTaxon = $_); my $pid = $pm->start and next; + if ($coreex) { + $db = Bio::DB::Taxonomy->new(-source => 'flatfile', + -nodesfile => $idx_dir . 'nodes.dmp', + -namesfile => $idx_dir . 'names.dmp', + -directory => $idx_dir); + $db_bkp = $db; + } my $searchTaxonName = getTaxonName($searchTaxon); if (defined($searchTaxonName)) { unless ($silent) { print $searchTaxon, "\t", $searchTaxonName, "\n"; } else { - print $searchTaxonName, "\n"; + unless ($searchTaxonName eq "Unk") { + print $searchTaxonName, "\n"; + } else { + print $searchTaxon, "\n"; + } + } + } + if ($assembly){ + $eval_blast = sprintf("%f", $eval_blast); + if ($seqFile ne "") { + my @assembly_cmd = ("fdog.assembly", "--gene " . $seqName, "--augustusRefSpec ". $augustusRefSpec, "--refSpec " . $refSpec, "--dataPath " . 
$dataPath, "--silent"); + + if (defined $assemblyPath){ + push(@assembly_cmd, "--assemblyPath $assemblyPath") + } + if (defined $avIntron){ + push(@assembly_cmd, "--avIntron $avIntron "); + } + if (defined $lengthExtension){ + push(@assembly_cmd, "--lengthExtension $lengthExtension "); + } + if (!$autoclean){ + push(@assembly_cmd, "--tmp "); + } + if ($outputPath){ + push(@assembly_cmd, "--out $outputPath "); + } + if (defined $strict){ + push(@assembly_cmd, "--strict"); + } + if ($eval_blast){ + push(@assembly_cmd, "--evalBlast $eval_blast "); + } + if ($searchTool){ + push(@assembly_cmd, "--msaTool $aln "); + } + if (defined $checkcoorthologsref){ + push(@assembly_cmd, "--checkCoorthologsRef"); + } + if ($searchTool){ + push(@assembly_cmd, "--searchTool $searchTool"); + } + if ($matrix){ + push(@assembly_cmd, "--scoringmatrix $matrix"); + } + if ($coreOrthologsPath){ + push(@assembly_cmd, "--coregroupPath $coreOrthologsPath"); + } + if ($fasoff){ + push(@assembly_cmd, "--fasoff"); + } + if ($searchTaxon){ + push(@assembly_cmd, "--searchTaxon $searchTaxon"); + } + if ($filter){ + push(@assembly_cmd, "--filter $filter"); + } + printDebug(@assembly_cmd); + system(join(' ', @assembly_cmd)) == 0 or die "Error: fDOGassembly failed \n"; } } + else{ runHamstr($searchTaxon, $seqName, $finalOutput, $refSpec, $hitlimit, $representative, $strict, $coremode, $final_eval_blast, $final_eval_hmmer, $aln); + } $pm->finish; } $pm->wait_all_children; @@ -661,8 +762,8 @@ push @logOUT, "Ortholog search completed in ". roundtime(gettime() - $orthoStTime) ." sec!"; print "==> Ortholog search completed in ". roundtime(gettime() - $orthoStTime) ." 
sec!\n"; -## Evaluation of all orthologs that are predicted by the final run -if(!$coreOnly){ + +if(!$coreOnly && !$assembly){ my $fasStTime = gettime(); my $processID = $$; @@ -671,10 +772,10 @@ die "ERROR: Could not find $finalOutput\n"; } # check and add seed to final extended.fa if needed - addSeedSeq($seqId, $seqName, $coreOrthologsPath, $refSpec, $finalOutput); # BLABLABLABLA + addSeedSeq($seqId, $seqName, $coreOrthologsPath, $refSpec, $finalOutput); # calculate FAS scores for final extended.fa - if ($fas_support) { + if ($fas_support && !$assembly) { print "Starting the feature architecture similarity score computation...\n"; my $fdogFAScmd = "$fdogFAS_prog -i $finalOutput -w $weightPath -t $tmpdir -o $outputPath --cores $cpu"; unless ($countercheck) { @@ -687,12 +788,21 @@ } push @logOUT, "FAS calculation completed in " . roundtime(gettime() - $fasStTime). " sec!\n"; print "==> FAS calculation completed in " . roundtime(gettime() - $fasStTime). " sec!\n"; + if($autoclean){ print "Cleaning up...\n"; runAutoCleanUp($processID); } } +if ($assembly){ + my $file_assembly_out; + $file_assembly_out = $outputPath . '/' . $seqName; + my $cmd_merge; + $cmd_merge = "fdog.mergeAssembly --in $outputPath --out $file_assembly_out --cleanup"; + printDebug($cmd_merge); + system($cmd_merge); +} ## Delete tmp folder unless ($debug) { my $delTmp = "rm -rf $tmpdir"; @@ -721,8 +831,12 @@ sub clearTmpFiles { } #clear all alignment files - my @files = glob("*.scorefile"); - foreach my $file (@files) { + my @scorefiles = glob("*.scorefile"); + foreach my $file (@scorefiles) { + unlink($file); + } + my @fastaInfiles = glob("*_fasta36.fa"); + foreach my $file (@fastaInfiles) { unlink($file); } } @@ -761,21 +875,19 @@ sub getCandicontent{ sub getCumulativeAlnScores{ chdir($coreOrthologsPath . $seqName); my $candidatesFile = $outputFa . ".extended"; - my $scorefile = $$ . ".scorefile"; + my $fileId = $$; + my $scorefile = $fileId . ".scorefile"; + my $fasta36file1 = $fileId . 
".1_fasta36.fa"; + my $fasta36file2 = $fileId . ".2_fasta36.fa"; my %scores; + ######################## ## step: 1 - ## setup - ## set alignment command (glocal, local, or global) - #local local:local ssearch36 Smith-Waterman - #glocal global:local glsearch36 Needleman-Wunsch - #global global:global ggsearch36 Needleman-Wunsch - my $loclocCommand = "$localaligner \"" . $outputFa . "\" \"" . $candidatesFile . "\" -s " . $alignmentscoreMatrix . " -m 9 -d 0 -z -1 -E 100" . " > " . $scorefile; - my $globlocCommand = "$glocalaligner \"" . $outputFa . "\" \"" . $candidatesFile . "\" -s " . $alignmentscoreMatrix . " -m 9 -d 0 -z -1 -E 100" . " > " . $scorefile; - my $globglobCommand = "$globalaligner \"" . $outputFa . "\" \"" . $candidatesFile . "\" -s " . $alignmentscoreMatrix . " -m 9 -d 0 -z -1 -E 100" . " > " . $scorefile; + ## set alignment parameters for fasta36 + my $fasta36cmd = $fasta36file1 . "\" \"" . $fasta36file2 . "\" -s " . $alignmentscoreMatrix . " -m 9 -d 0 -z -1 -E 100" . " > " . $scorefile; + ######################## ## step: 2 - ## setup ## candidates to hash ## %candicontent keeps info about all candidates (header and sequence) my %candicontent = getCandicontent(); @@ -784,11 +896,25 @@ sub getCumulativeAlnScores{ ## step: 3 ## get alignment scores chdir($coreOrthologsPath . $seqName); + symlink($outputFa, $fasta36file1); + symlink($candidatesFile, $fasta36file2); if ($glocal){ + #glocal global:local glsearch36 Needleman-Wunsch + my $globlocCommand = "$glocalaligner \"" . $fasta36cmd; + printDebug($globlocCommand); + # print $globlocCommand,"\n";<>; system($globlocCommand); }elsif ($global){ + #global global:global ggsearch36 Needleman-Wunsch + my $globglobCommand = "$globalaligner \"" . $fasta36cmd; + printDebug($globglobCommand); + # print $globglobCommand,"\n";<>; system($globglobCommand); }elsif ($local){ + #local local:local ssearch36 Smith-Waterman + my $loclocCommand = "$localaligner \"" . 
$fasta36cmd; + printDebug($loclocCommand); + # print $loclocCommand,"\n";<>; system($loclocCommand); } ######################## @@ -806,49 +932,7 @@ sub getCumulativeAlnScores{ ## Get the alinment scores for the current candidate file sub getAlnScores{ chdir($coreOrthologsPath . $seqName); - my $candidatesFile = $outputFa . ".extended"; - my $scorefile = $$ . ".scorefile"; - my %scores; - - ######################## - ## step: 1 - ## setup - ## set alignment command (glocal, local, or global) - #local local:local ssearch36 Smith-Waterman - #glocal global:local glsearch36 Needleman-Wunsch - #global global:global ggsearch36 Needleman-Wunsch - my $loclocCommand = "$localaligner " . $outputFa . " " . $candidatesFile . " -s " . $alignmentscoreMatrix . " -m 9 -d 0 -z -1 -E 100" . " > " . $scorefile; - my $globlocCommand = "$glocalaligner " . $outputFa . " " . $candidatesFile . " -s " . $alignmentscoreMatrix . " -m 9 -d 0 -z -1 -E 100" . " > " . $scorefile; - my $globglobCommand = "$globalaligner " . $outputFa . " " . $candidatesFile . " -s " . $alignmentscoreMatrix . " -m 9 -d 0 -z -1 -E 100" . " > " . $scorefile; - - ######################## - ## step: 2 - ## setup - ## candidates to hash - ## %candicontent keeps info about all candidates (header and sequence) - my %candicontent = getCandicontent(); - - ######################## - ## step: 3 - ## get alignment scores - chdir($coreOrthologsPath . 
$seqName); - if ($glocal){ - system($globlocCommand); - }elsif ($global){ - system($globglobCommand); - }elsif ($local){ - system($loclocCommand); - } - - ######################## - ## step: 4 - ## collect alignment score - ## keep track about min and max for each query/coreortholog vs candidate set - my $max = -10000000; - my $min = 10000000; - - %scores = cumulativeAlnScore($scorefile, \%candicontent); - + my %scores = getCumulativeAlnScores(); ## Normalize Alignment scores (unity-based) printDebug("Normalize alignment scores:\n"); foreach my $key (keys %scores){ @@ -885,8 +969,8 @@ sub getFasScore{ ## step: 2 ## get FAS score ## fas support: on/off + my @candidateIds = keys(%candicontent); if ($fas_support){ - my @candidateIds = keys(%candicontent); my ($name,$gene_set,$gene_id,$rep_id) = split(/\|/, $candidateIds[0]); unless (-e "$weightPath/$gene_set.json") { print "ERROR: $weightPath/$gene_set.json not found! FAS Score will be set as zero.\n"; @@ -898,6 +982,8 @@ sub getFasScore{ my @fasOutTmp = split(/\t/,$fasOutTmp); $fas_box{$candidateIds[0]} = $fasOutTmp[1]; } + } else { + $fas_box{$candidateIds[0]} = 1; } return %fas_box; } @@ -1123,10 +1209,10 @@ sub checkOptions { if ($force == 1 and $append ==1) { $force = 0; } - ### check the presence of the pre-computed core set - if ($coreex) { + ### check the presence of the pre-computed core set if options reuseCore or assembly is used + if ($coreex || $assembly) { if (! 
-e "$coreOrthologsPath/$seqName/$seqName.fa") { - print "You selected the option -reuseCore, but the core ortholog group $coreOrthologsPath/$seqName/hmm_dir/$seqName.hmm does not exist\n"; + print "You selected the option -reuseCore or -assembly, but the core ortholog group $coreOrthologsPath/$seqName/hmm_dir/$seqName.hmm does not exist\n"; exit; } } @@ -1155,7 +1241,7 @@ sub checkOptions { ### end move up ### adding new routine to generate the input sequence if -reuseCore has been set if ($coreex) { - my @refseq=`$grepprog -A 1 ">$seqName|$refSpec" $coreOrthologsPath/$seqName/$seqName.fa`; + my @refseq=`$grepprog -A 1 ">$seqName|$refSpec" $coreOrthologsPath/$seqName/$seqName.fa | grep -v "^\-\-\$"`; chomp @refseq; unless ($silent) { print "$refseq[0]\n"; @@ -1197,7 +1283,7 @@ sub checkOptions { ### checking the number of core orthologs. Omit this check if the option -reuseCore has been selected $optbreaker = 0; - while(!$minCoreOrthologs and !$coreex) { + while(!$minCoreOrthologs and (!$coreex and !$assembly)) { if ($optbreaker >= 3){ print "No proper number given ... exiting.\n"; exit; @@ -1212,10 +1298,12 @@ sub checkOptions { $filter = 'no' if $filter eq 'F'; } - $inputSeq = fetchSequence($seqFile, $dataDir); + if (!$assembly){ + $inputSeq = fetchSequence($seqFile, $dataDir); + } ## the user has not provided a sequence id, however, the refspec is determined. - if($seqId eq '') { + if($seqId eq '' && !$assembly) { my $besthit; if (!$blast){ ## a refspec has been determined @@ -1230,6 +1318,9 @@ sub checkOptions { $refSpec = $besthit->{species}; my $details = "Evalue: " . $besthit->{evalue}; printOut("Seq id has been determined as $seqId in $refSpec with $details", 2); + if(length("$seqName|$refSpec|$seqId") > 60) { + die "Output file will have header longer than 60 characters ($seqName|$refSpec|$seqId). Please consider shorten the sequence IDs! 
More at https://github.com/BIONF/fDOG/wiki/Check-data-validity\n"; + } if($seqId eq '') { print "There was no significant hit for your sequence in " . $refSpec . ".\nPlease specify a sequence id on your own.\n"; exit; @@ -1241,13 +1332,13 @@ sub checkOptions { print "Please specify a valid file with taxa for the core orthologs search\n"; exit; } - my @userTaxa = parseTaxaFile(); + my @userTaxa = parseTaxaFile($coreTaxa); my %newTaxa = (); foreach (@userTaxa) { $newTaxa{$_} = $taxa{$_}; } $newTaxa{$refSpec} = $refTaxa{$refSpec}; - %taxa = %newTaxa; + %refTaxa = %newTaxa; } if($group) { @@ -1334,14 +1425,14 @@ sub checkOptions { } } - my $node; - $node = $db->get_taxon(-taxonid => $refTaxa{$refSpec}); - $node->name('supplied', $refSpec); - #### checking for the min and max distance for the core set compilation #### omit this check, if the option reuseCore has been selected (added 2019-02-04) $optbreaker = 0; - if (!$coreex) { + if (!$coreex and !$assembly) { + my $node; + #print "Testing coreex assembly\n"; + $node = $db->get_taxon(-taxonid => $refTaxa{$refSpec}); + $node->name('supplied', $refSpec); if (lc($maxDist) eq "root"){ $maxDist = 'no rank'; } @@ -1357,9 +1448,6 @@ sub checkOptions { $maxDist = parseInput($node, $in); print "You selected ". $maxDist . " as maximum rank\n\n"; } - } - $optbreaker = 0; - if (!$coreex){ while (!$minDist or (checkRank($minDist, $node) == 0)) { if ($optbreaker >= 3){ print "No proper minDist given ... exiting.\n"; @@ -1373,6 +1461,7 @@ sub checkOptions { print "You selected " . $minDist . 
" as minimum rank\n\n"; } } + $optbreaker = 0; #### checking in fas options if($fasoff){ @@ -1596,8 +1685,9 @@ sub getBestOrtholog { ## candidates alnScore is high enought, that it would be better with a fasScore of one ## -> evaluate if ($alnScores{$candiKey} > $rankScore * (1 + $distDeviation) - 1){ + %fas_box = getFasScore(); if (!$gotFasScore and $fas_support){ - %fas_box = getFasScore(); + # %fas_box = getFasScore(); $gotFasScore = 1; } ## get rankscore @@ -1622,8 +1712,9 @@ sub getBestOrtholog { } ## candidate has the same distance, as the last one and could be better, with a fasScore of one elsif (defined $hashTree{$newNoRankDistNode}{$key->id} and $alnScores{$candiKey} > $rankScore - 1){ + %fas_box = getFasScore(); if (!$gotFasScore and $fas_support){ - %fas_box = getFasScore(); + # %fas_box = getFasScore(); $gotFasScore = 1; } ## get rankscore @@ -1909,7 +2000,7 @@ sub getTaxonName { if (defined($taxon)) { return($taxon->scientific_name); } else { - return("Unk NCBI taxon for $taxAbbr"); + return("Unk"); } } @@ -2008,6 +2099,7 @@ sub runHamstr { print EXTENDEDFA ">$tmpId[0]\|$tmpId[-3]\|$tmpId[-2]\|$tmpId[-1]\n",$resultSeq->seq,"\n"; } } + # addSeedSeq($seqId, $seqName, $coreOrthologsPath, $refSpec, $outputFa); } else { # add seed sequence to output extended.fa if no ortholog was found in refSpec if ($taxon eq $refSpec) { @@ -2054,11 +2146,13 @@ sub addSeedSeq { # get seed sequence and add it to the beginning of the fasta output open(TEMP, ">$outputFa.temp") or die "Cannot create $outputFa.temp!\n"; my $seqio = Bio::SeqIO->new(-file => "$coreOrthologsPath/$seqName/$seqName.fa", '-format' => 'Fasta'); + my %idTmp; # used to check which seq has already been written to output while(my $seq = $seqio->next_seq) { my $id = $seq->id; if ($id =~ /$refSpec/) { + $idTmp{"$id|1"} = 1; print TEMP ">$id|1\n", $seq->seq, "\n"; - last; + #last; } } # then write other sequences @@ -2066,7 +2160,9 @@ sub addSeedSeq { while(my $seq = $seqio2->next_seq) { my $id = $seq->id; 
unless ($id =~ /$refSpec\|$seqId/) { # /$refSpec/) { - print TEMP ">$id\n", $seq->seq, "\n"; + unless ($idTmp{$id}) { + print TEMP ">$id\n", $seq->seq, "\n"; + } } } close(TEMP); @@ -2096,17 +2192,19 @@ sub parseInput { } ########################## sub parseTaxaFile { - open (INPUT, "<$coreTaxa") or die print "Error opening file with taxa for core orthologs search\n"; + my $coreTaxaFile = $_[0]; + open (INPUT, "<$coreTaxaFile") or die print "Error opening file with taxa for core orthologs search\n"; my @userTaxa; while() { my $line = $_; chomp($line); - if(!$taxa{$line}) { - print "You specified " . $line . " in your core orthologs file but the taxon is not in the database!\n"; - exit; - } - else { - push(@userTaxa, $line); + if (length($line) > 0) { + if(!$taxa{$line}) { + print "You specified " . $line . " in your core orthologs file but the taxon is not in the database!\n"; + exit; + } else { + push(@userTaxa, $line); + } } } close INPUT; @@ -2592,7 +2690,7 @@ sub initialCheck { } } # check weight_dir - if ($fasoff != 1) { + if ($fasoff != 1 && !$assembly) { my %seen; my @allTaxa = grep( !$seen{$_}++, @genomeDir, @blastDir); chomp(my $allAnno = `ls $weightDir | $sedprog \'s/\\.json//\'`); @@ -2607,7 +2705,7 @@ sub initialCheck { sub getGenomeFile { my ($folder, $filename) = @_; - chomp(my $faFile = `ls $folder/$filename.fa* | $grepprog -v \"\\.checked\\|\\.mod\\|\\.tmp\"`); + chomp(my $faFile = `ls $folder/$filename.fa* | $grepprog -v \"\\.checked\\|\\.mod\\|\\.mapping\\|\\.tmp\"`); my $out = $faFile; chomp(my $link = `$readlinkprog -f $faFile`); if ($link ne "") { @@ -2641,23 +2739,23 @@ sub checkValidFolderName { ########################### sub helpMessage { my $helpmessage = " -${bold}YOU ARE RUNNING $version on $hostname$norm +YOU ARE RUNNING $version on $hostname This program is freely distributed under a GPL. 
Copyright (c) GRL limited: portions of the code are from separate copyrights -\n${bold}USAGE:${norm} oneSeq.pl -seqFile=<> -seqId=<> -seqName=<> -refSpec=<> -minDist=<> -maxDist=<> [OPTIONS] +\nUSAGE: oneSeq.pl -seqFile=<> -seqId=<> -seqName=<> -refSpec=<> -minDist=<> -maxDist=<> [OPTIONS] -${bold}OPTIONS:$norm +OPTIONS: -${bold}GENERAL$norm +GENERAL -h Invoke this help method -version Print the program version -${bold}REQUIRED$norm +REQUIRED -seqFile=<> Specifies the file containing the seed sequence (protein only) in fasta format. @@ -2677,7 +2775,7 @@ sub helpMessage { -coreOrth=<> Specify the number of orthologs added to the core set. -${bold}USING NON-DEFAULT PATHS$norm +USING NON-DEFAULT PATHS -outpath=<> Specifies the path for the output directory. Default is $outputPath; @@ -2690,7 +2788,7 @@ sub helpMessage { -weightpath=<> Specifies the path for the pre-calculated feature annotion directory. Default is $weightPath; -${bold}ADDITIONAL OPTIONS$norm +ADDITIONAL OPTIONS -append Set this flag to append the output to existing output files @@ -2777,7 +2875,7 @@ sub helpMessage { Set the alignment strategy during core ortholog compilation to glocal. -searchTaxa Input file containing list of search taxa. -${bold}SPECIFYING FAS SUPPORT OPTIONS$norm +SPECIFYING FAS SUPPORT OPTIONS -fasoff Turn OFF FAS support. Default is ON. @@ -2790,7 +2888,7 @@ sub helpMessage { -countercheck Set this flag to counter-check your final profile. The FAS score will be computed in two ways (seed vs. hit and hit vs. seed). 
-${bold}SPECIFYING EXTENT OF OUTPUT TO SCREEN$norm +SPECIFYING EXTENT OF OUTPUT TO SCREEN -debug Set this flag to obtain more detailed information about the programs actions diff --git a/fdog/checkData.py b/fdog/checkData.py index 59256bc..84310ac 100644 --- a/fdog/checkData.py +++ b/fdog/checkData.py @@ -70,6 +70,12 @@ def checkValidFasta(file): fasta = SeqIO.parse(f, 'fasta') if not any(fasta): return('notFasta') + else: + # check for long header + inSeq = SeqIO.to_dict((SeqIO.parse(open(file), 'fasta'))) + for id in inSeq: + if len(id) > 30: + return('longHeader') # check space or tab if any(s in f.read() for s in spaceChr): return('space') @@ -90,6 +96,7 @@ def checkValidSeqs(faFile): faSeq = SeqIO.parse(open(faFile),'fasta') for fa in faSeq: id, seq = fa.description, str(fa.seq) + c = '' if any(e in id for e in spaceChr): sys.exit('*** ERROR: Invalid character found in \">%s\" in %s' % (id, faFile)) if any(c for c in seq if not c.isalpha()): @@ -131,6 +138,8 @@ def checkDataFolder(checkDir, replace, delete, concat): checkFaFile = checkValidFasta(faFile) if checkFaFile == 'notFasta': sys.exit('*** ERROR: %s does not look like a fasta file!' % faFile) + elif checkFaFile == 'longHeader': + sys.exit('*** ERROR: %s contains long headers!' % faFile) elif checkFaFile == 'space': sys.exit('*** ERROR: %s contains spaces/tabs!' % faFile) elif checkFaFile == 'multiLine': @@ -184,7 +193,7 @@ def checkMissingNcbiID(namesDmp, taxaList): return(missingTaxa.keys(), dupTaxa) def main(): - version = '0.0.2' + version = '0.0.3' parser = argparse.ArgumentParser(description='You are running fdog.checkData version ' + str(version) + '.') parser.add_argument('-g', '--genomeDir', help='Path to search taxa directory (e.g. fdog_dataPath/genome_dir)', action='store', default='') parser.add_argument('-b', '--blastDir', help='Path to blastDB directory (e.g. 
fdog_dataPath/blast_dir)', action='store', default='') diff --git a/fdog/data/.DS_Store b/fdog/data/.DS_Store index fde072a6aebc6f6618808f5bbd3cd63c202098d1..bf1ded6ef3f07fae44d0ee29b918a7b6e62c579b 100644 GIT binary patch delta 166 zcmZp1XfcprU|?W$DortDU=RQ@Ie-{Mvv5r;6q~50$jGrVU^g=($7CLXsf?#4TM0L4 zNmf@I8kp!P7#bVa>L^qj8i6?GCMLDDoE+k+hPIvwxs_GbHMMoKCQlHOXPiAbTTpxR zRiQq{iKUMgvvY6=G6T&A0s(Fy;R>>9W8rt^$^0^&Ad4B8AdUdJi(zv-&m3j|>H;Ig delta 201 zcmZoMXmOBWU|?W$DortDU;r^WfEYvza8E20o2aMA$h|ROH}hr%jz7$c**Q2SHn1>q zPv&8nI{7#2*~$5=8m^oSNenp*i44UIB@FQlDGZqmMV>kN$w@i+Ng!i@hW`Z8djG)y z$YNl?qGR$0)*IYN;+q>-mojcn=2^|mCBY5U;tF!mW locations[i][0]) and (locations[j][5] == locations[i][5]) and (locations[i][5] == '+')): + #merge overlapping regions plus strand + locations[j][1] = max(locations[j][1], locations[i][1]) + locations[j][2] = min(locations[j][2], locations[i][2]) + locations.pop(i) + size_list -= 1 + i -= 1 + elif ((locations[j][1] > locations[i][1]) and (locations[j][0] < locations[i][1]) and (locations[j][5] == locations[i][5]) and (locations[i][5] == '-')): + #merge overlapping regions minus strand + locations[j][0] = min(locations[j][0], locations[i][0]) + locations[j][2] = min(locations[j][2], locations[i][2]) + locations.pop(i) + size_list -= 1 + i -= 1 + elif ((locations[j][0] < locations[i][0]) and (locations[i][0] - locations[j][1] <= 2*insert_length) and (locations[j][5] == locations[i][5]) and (locations[i][5] == '+')): + #merging consecutive regions, the distance between booth is not longer than a cutoff, plus strand + locations[j][1] = max(locations[j][1], locations[i][1]) + locations[j][2] = min(locations[j][2], locations[i][2]) + locations.pop(i) + size_list -= 1 + i -=1 + elif ((locations[j][1] > locations[i][1]) and (locations[j][0] - locations[i][1] <= 2* insert_length) and (locations[j][5] == locations[i][5]) and (locations[i][5] == '-')): + #merging consecutive regions, the distance between booth is not longer than a cutoff, minus strand + 
locations[j][0] = min(locations[j][0], locations[i][0]) + locations[j][2] = min(locations[j][2], locations[i][2]) + locations.pop(i) + size_list -= 1 + i -=1 + i += 1 + j += 1 + + number_regions += len(locations) + blast_results[key] = locations + + return blast_results, number_regions + +def parse_blast(line, blast_results, cutoff): + # format blast line: + # format dictionary: {node_name: [(,,evalue, ,,)]} + line = line.replace("\n", "") + line_info = line.split("\t") + evalue = float(line_info[3]) + #cut off + if evalue > cutoff: + return blast_results, evalue + #add region to dictionary + else: + node_name, sstart, send, qstart, qend = line_info[0], int(line_info[1]), int(line_info[2]), int(line_info[4]), int(line_info[5]) + split = node_name.split("|") + # finding out on which strand tBLASTn found a hit + if sstart < send: + strand = "+" + else: + sstart = int(line_info[2]) + send = int(line_info[1]) + strand = "-" + #creating a dictionary that inlcudes every tBLASTn that is better as the evalue cut-off + if len(split) > 1: + node_name = split[1] + if node_name in blast_results: + list = blast_results[node_name] + list.append([int(sstart),int(send), evalue, int(qstart), int(qend), strand]) + blast_results[node_name] = list + else: + blast_results[node_name] = [[int(sstart),int(send), evalue, int(qstart), int(qend), strand]] + + return blast_results, evalue + +def candidate_regions(intron_length, cutoff_evalue, tmp_path): + ###################### extracting candidate regions ######################## + # info about output blast http://www.metagenomics.wiki/tools/blast/blastn-output-format-6 + blast_file = open(tmp_path + "/blast_results.out", "r") + evalue = 0 + blast_results = {} + #parsing blast output + while True: + line = blast_file.readline() + #end of file is reached + if not line: + break + #parsing blast output + blast_results, evalue = parse_blast(line, blast_results, cutoff_evalue) + + if blast_results == {}: + return 0,0 + else: + candidate_regions, 
number_regions = merge(blast_results, intron_length) + + return candidate_regions, number_regions + +def extract_seq(region_dic, path, tmp_path, mode): + + for key in region_dic: + #print("blastdbcmd -db " + path + " -dbtype 'nucl' -entry " + key + " -out tmp/" + key + ".fasta -outfmt %f") + cmd = "blastdbcmd -db " + path + " -dbtype 'nucl' -entry " + key + " -out " + tmp_path + key + ".fasta -outfmt %f" + starting_subprocess(cmd, mode) + +def augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, augustus_ref_species, ass_name, group, tmp_path, mode): + output = open(candidatesOutFile, "w") + + for key in regions: + locations = regions[key] + counter = 0 + for i in locations: + # some variables + counter += 1 + start = str(i[0] - length_extension) + end = str(i[1] + length_extension) + name = key + "_" + str(counter) + # augutus call + cmd = "augustus --protein=1 --proteinprofile=" + profile_path + " --predictionStart=" + start + " --predictionEnd=" + end + " --species=" + augustus_ref_species + " " + tmp_path + key + ".fasta > " + tmp_path + name + ".gff" + #print(cmd) + starting_subprocess(cmd, 'silent') + # transfer augustus output to as sequence + cmd = "getAnnoFasta.pl --seqfile=" + tmp_path + key + ".fasta " + tmp_path + name + ".gff" + starting_subprocess(cmd, mode) + # parsing header and sequences + try: + sequence_file = open(tmp_path + name + ".aa", "r") + lines = sequence_file.readlines() + for line in lines: + if line[0] == ">": + id = line.replace(">", "") + header = ">" + group + "|" + ass_name + "|" + name + "_" + id + output.write(header) + else: + output.write(line) + sequence_file.close() + except FileNotFoundError: + print("No gene found in region with ID:" + name + " , continuing with next region") + output.close() + +def searching_for_db(assembly_path): + + db_endings = ['.ndb', '.nhr', '.nin', '.nog', '.nos', '.not', '.nsq', '.ntf', '.nto'] + check = True + for end in db_endings: + check = check and 
os.path.exists(assembly_path + end) + return check + +def get_distance_biopython(file, matrix): + aln = AlignIO.read(open(file), 'fasta') + calculator = DistanceCalculator(matrix) + dm = calculator.get_distance(aln) + return dm + +def readFasta(candidatesOutFile): + seq_records = SeqIO.parse(candidatesOutFile, "fasta") + return seq_records + +def getSeedInfo(path): + dic = {} + seq_records = readFasta(path) + for entry in seq_records: + species = entry.id.split("|")[1] + geneID = entry.id.split("|")[2] + + try: + dic[species].append(geneID) + except KeyError: + dic[species] = [geneID] + + del seq_records + return dic + +def checkCoOrthologs(candidate_name, best_hit, ref, fdog_ref_species, candidatesOutFile, msaTool, matrix, dataPath, tmp_path): + ###########getting sequences and write all in one file to make msa ######### + name_file = candidate_name + ".co" + output_file = tmp_path + name_file + '.fasta' + aln_file = tmp_path + name_file + '.aln' + genome_dir_path = dataPath + '/genome_dir/%s/%s.fa'%(fdog_ref_species, fdog_ref_species) + #print(searchTool) + + out = open(output_file, "w") + inSeq = SeqIO.to_dict((SeqIO.parse(open(genome_dir_path), 'fasta'))) + out.write(">" + best_hit + "\n") + out.write(str(inSeq[best_hit].seq) + "\n") + out.write(">" + ref + "\n") + out.write(str(inSeq[ref].seq )+ "\n") + + candidates = readFasta(candidatesOutFile) + for record in candidates: + if candidate_name in record.id: + out.write(">" + candidate_name + "\n") + out.write(str(record.seq) + "\n") + break + + out.close() + + if msaTool == "muscle": + os.system("muscle -quiet -in " + output_file + " -out " + aln_file) + #print("muscle -quiet -in " + output_file + " -out " + aln_file) + elif msaTool == "mafft-linsi": + #print("mafft-linsi") + os.system('mafft --maxiterate 1000 --localpair --anysymbol --quiet ' + output_file + ' > ' + aln_file) + + distances = get_distance_biopython(aln_file, matrix) + + distance_hit_query = distances[best_hit, candidate_name] + 
distance_ref_hit = distances[best_hit, ref] + + if distance_ref_hit < distance_hit_query: + #accepted + return 1, distance_ref_hit, distance_hit_query + + else: + #rejected + return 0, distance_ref_hit, distance_hit_query + +def backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, evalue_cut_off, taxa, searchTool, checkCo, msaTool, matrix, dataPath, filter, tmp_path, mode): + # the backward search uses the genes predicted from augustus and makes a blastp search + #the blastp search is against all species that are part of the core_ortholog group if the option --strict was chosen or only against the ref taxa + seedDic = getSeedInfo(fasta_path) + #print(fasta_path) + orthologs = [] + #print(seedDic) + blast_dir_path = dataPath + "/blast_dir/" + if strict != True: + seed = [fdog_ref_species] + try: + id_ref = seedDic[fdog_ref_species] + except KeyError: + print("The fDOG reference species isn't part of the core ortholog group, ... exciting") + return 0, seed + if searchTool == "blast": + cmd = "blastp -db " + blast_dir_path + fdog_ref_species + "/" + fdog_ref_species + " -outfmt '6 sseqid qseqid evalue' -max_target_seqs 10 -out " + tmp_path + "blast_" + fdog_ref_species + " -evalue " + str(evalue_cut_off) + " -query " + candidatesOutFile + starting_subprocess(cmd, mode) + else: + print("diamonds are the girls best friends") + ##### diamond call + + alg_file = open(tmp_path + "blast_" + fdog_ref_species, "r") + lines = alg_file.readlines() + alg_file.close() + old_name = None + min = 10 + for line in lines: + id, gene, evalue = (line.replace("\n", "")).split("\t") + gene_name = gene.split("|")[2] + if gene_name != old_name: + print("candidate:%s"%(gene_name)) + print("blast-hit:%s"%(id)) + min = float(evalue) + if id in id_ref: + orthologs.append(gene) + print("\thitting\n") + else: + if checkCo == True: + for i in id_ref: + print("Best hit %s differs from reference sequence %s! 
Doing further checks\n"%(id, i)) + co_orthologs_result, distance_ref_hit, distance_hit_query = checkCoOrthologs(gene_name, id, i, fdog_ref_species, candidatesOutFile, msaTool, matrix, dataPath, tmp_path) + if co_orthologs_result == 1: + print("\t Distance query - blast hit: %6.4f, Distance blast hit - reference: %6.4f\tAccepting\n"%(distance_hit_query, distance_ref_hit)) + orthologs.append(gene) + elif co_orthologs_result == 0: + print("\t Distance query - blast hit: %6.4f, Distance blast hit - reference: %6.4f\tRejecting\n"%(distance_hit_query, distance_ref_hit)) + else: + print("\tnothitting\n") + elif (gene_name == old_name) and float(evalue) == min and gene_name not in orthologs: + if id in id_ref: + orthologs.append(gene) + print("\thitting\n") + else: + if checkCo == True: + for i in id_ref: + print("Best hit %s differs from reference sequence %s! Doing further checks\n"%(id, i)) + co_orthologs_result, distance_ref_hit, distance_hit_query = checkCoOrthologs(gene_name, id, i, fdog_ref_species, candidatesOutFile, msaTool, matrix, dataPath, tmp_path) + if co_orthologs_result == 1: + print("\t Distance query - blast hit: %6.4f, Distance blast hit - reference: %6.4f\tAccepting\n"%(distance_hit_query, distance_ref_hit)) + orthologs.append(gene) + elif co_orthologs_result == 0: + print("\t Distance query - blast hit: %6.4f, Distance blast hit - reference: %6.4f\tRejecting\n"%(distance_hit_query, distance_ref_hit)) + else: + print("\tnot hitting\n") + old_name = gene_name + + + if orthologs == []: + print("No hit in the backward search, ...exciting") + return 0, seed + + else: + if taxa != []: + seed = taxa + try: + i = seed.index(fdog_ref_species) + seed.insert(0,seed.pop(i)) + except ValueError: + seed.insert(0,fdog_ref_species) + #print(seed) + #print("with taxa list from user input") + + else: + seed = [] + for key in seedDic: + if key == fdog_ref_species: + seed.insert(0,key) + else: + seed.append(key) + + orthologs = set({}) + + for species in seed: + 
print("backward search in species " + species + "\n") + orthologs_new = set({}) + try: + id_ref = seedDic[species] + except KeyError: + print("The species " + species + " isn't part of the core ortholog group, ... exciting") + return 0, seed + + cmd = "blastp -db " + blast_dir_path + species + "/" + species + " -outfmt '6 sseqid qseqid evalue' -max_target_seqs 10 -seg " + filter + " -out " + tmp_path + "/blast_" + species + " -evalue " + str(evalue_cut_off) + " -query " + candidatesOutFile + starting_subprocess(cmd, mode) + alg_file = open(tmp_path + "/blast_" + species, "r") + lines = alg_file.readlines() + alg_file.close() + old_name = None + min = 10 + for line in lines: + id, gene_name, evalue = (line.replace("\n", "")).split("\t") + if gene_name != old_name: + min = float(evalue) + if id in id_ref: + orthologs_new.add(gene_name) + + elif (gene_name == old_name) and float(evalue) == min: + if id in id_ref: + orthologs_new.add(gene_name) + + #print(species) + #print(orthologs_new) + if species == fdog_ref_species: + orthologs = orthologs_new + else: + orthologs = orthologs & orthologs_new + if orthologs == {}: + print("No ortholog was found with option --strict") + return 0, seed + + + + #print(orthologs) + orthologs = set(orthologs) + return list(orthologs), seed + +def addSequences(sequenceIds, candidate_fasta, core_fasta, output, name, species_list, refBool, tmp_path): + + output_file = open(output, "a+") + if refBool == False: + seq_records_core = readFasta(core_fasta) + seq_records_core = list(seq_records_core) + for species in species_list: + for entry_core in seq_records_core: + if species in entry_core.id: + output_file.write(">" + entry_core.id + "\n") + output_file.write(str(entry_core.seq) + "\n") + + if sequenceIds != 0: + seq_records_candidate = readFasta(candidate_fasta) + seq_records_candidate = list(seq_records_candidate) + for entry_candidate in seq_records_candidate: + if entry_candidate.id in sequenceIds: + if entry_candidate.id == 
sequenceIds[0]: + output_file.write(">" + entry_candidate.id + "|1" + "\n") + output_file.write(str(entry_candidate.seq) + "\n") + else: + output_file.write(">" + entry_candidate.id + "|0" + "\n") + output_file.write(str(entry_candidate.seq) + "\n") + output_file.close() + return 0 + +def createFasInput(orthologsOutFile, mappingFile): + with open(orthologsOutFile, "r") as f: + fas_seed_id = (f.readline())[1:-1] + #fas_seed_id = fas_seed_id.split("|")[0] + + mappingFile = open(mappingFile, "a+") + + seq_records = readFasta(orthologsOutFile) + for seq in seq_records: + ncbi_id = (seq.id.split("@"))[1] + mappingFile.write(seq.id + "\t" + "ncbi" + ncbi_id + "\n") + + + return fas_seed_id + +def cleanup(tmp, tmp_path): + if tmp == False: + os.system('rm -r ' + tmp_path) + +def checkOptions(): + pass + #muss ich unbedingt noch ergänzen wenn ich alle möglichen input Optionen implementiert habe!!! + +def coorthologs(candidate_names, tmp_path, candidatesFile, fasta, fdog_ref_species, msaTool, matrix): + if len(candidate_names) == 1: + return candidate_names + + candidates = readFasta(candidatesFile) + ref = readFasta(fasta) + + out = tmp_path + '/checkCoorthologs.fa' + f = open(out,"w") + + aln_file = tmp_path + '/checkCoorthologs.aln' + + for record in ref: + if fdog_ref_species in record.id: + ref_id = record.id + f.write(">" + record.id + "\n") + f.write(str(record.seq) + "\n") + break + + for record in candidates: + for name in candidate_names: + if name in record.id: + f.write(">" + name + "\n") + f.write(str(record.seq) + "\n") + f.close() + + if msaTool == "muscle": + os.system("muscle -quiet -in " + out + " -out " + aln_file) + elif msaTool == "mafft-linsi": + os.system('mafft --maxiterate 1000 --localpair --anysymbol --quiet ' + out + ' > ' + aln_file) + + distances = get_distance_biopython(aln_file, matrix) + + min_dist = 10 + min_name = None + + for name in candidate_names: + distance = distances[ref_id , name] + if distance <= min_dist: + min_dist = distance + 
min_name = name + + checked = [min_name] + + for name in candidate_names: + if name == min_name: + pass + elif distances[min_name , name] <= distances[min_name , ref_id]: + checked.append(name) + + return checked + +def clean_fas(path, file_type): + file = open(path, "r") + lines = file.readlines() + file.close() + file = open(path,"w") + + for line in lines: + if file_type == 'domains': + long_id, remain = line.split("#") + id = long_id.split("|")[0] + new_line = id + "#" + remain + else: + long_id, remain = line.split("\t", 1) + id = long_id.split("|")[0] + new_line = id + "\t" + remain + + file.write(new_line) + +class Logger(object): + def __init__(self, file): + self.file = file + self.terminal = sys.stdout + self.log = self.file + + def write(self, message): + self.terminal.write(message) + self.log.write(message) + + def flush(self): + pass + + +def main(): + + #################### handle user input ######################################## + + version = '0.0.1' + + parser = argparse.ArgumentParser(description='You are running fdog.assembly version ' + str(version) + '.') + parser.add_argument('--version', action='version', version=str(version)) + + required = parser.add_argument_group('Required arguments') + required.add_argument('--gene', help='Core_ortholog group name. 
Folder inlcuding the fasta file, hmm file and aln file has to be located in core_orthologs/', + action='store', default='', required=True) + required.add_argument('--augustusRefSpec', help='augustus reference species', action='store', default='', required=True) + required.add_argument('--refSpec', help='Reference taxon for fDOG.', action='store', default='', required=True) + + optional = parser.add_argument_group('Optional arguments') + optional.add_argument('--avIntron', help='average intron length of the assembly species in bp (default: 5000)',action='store', default=5000, type=int) + optional.add_argument('--lengthExtension', help='length extension of the candidate regions in bp (default:5000)', action='store', default=5000, type=int) + optional.add_argument('--assemblyPath', help='Path for the assembly directory', action='store', default='') + optional.add_argument('--tmp', help='tmp files will not be deleted', action='store_true', default = False) + optional.add_argument('--out', help='Output directory', action='store', default='') + optional.add_argument('--dataPath', help='data directory', action='store', default='') + optional.add_argument('--coregroupPath', help='core_ortholog directory', action='store', default='') + optional.add_argument('--searchTool', help='Choose between blast and diamond as alignemnt search tool(default:blast)', action='store', choices=['blast', 'diamond'], default='blast') + optional.add_argument('--evalBlast', help='E-value cut-off for the Blast search. (default: 0.00001)', action='store', default=0.00001, type=float) + optional.add_argument('--strict', help='An ortholog is only then accepted when the reciprocity is fulfilled for each sequence in the core set', action='store_true', default=False) + optional.add_argument('--msaTool', help='Choose between mafft-linsi or muscle for the multiple sequence alignment. 
DEFAULT: muscle', choices=['mafft-linsi', 'muscle'], action='store', default='muscle') + optional.add_argument('--checkCoorthologsRef', help='During the final ortholog search, accept an ortholog also when its best hit in the reverse search is not the core ortholog itself, but a co-ortholog of it', action='store_true', default=False) + optional.add_argument('--scoringmatrix', help='Choose a scoring matrix for the distance criteria used by the option --checkCoorthologsRef. DEFAULT: blosum62', choices=['identity', 'blastn', 'trans', 'benner6', 'benner22', 'benner74', 'blosum100', 'blosum30', 'blosum35', 'blosum40', 'blosum45', 'blosum50', 'blosum55', 'blosum60', 'blosum62', 'blosum65', 'blosum70', 'blosum75', 'blosum80', 'blosum85', 'blosum90', 'blosum95', 'feng', 'fitch', 'genetic', 'gonnet', 'grant', 'ident', 'johnson', 'levin', 'mclach', 'miyata', 'nwsgappep', 'pam120', 'pam180', 'pam250', 'pam30', 'pam300', 'pam60', 'pam90', 'rao', 'risler', 'structure'], action='store', default='blosum62') + optional.add_argument('--coreTaxa', help='List of core taxa used during --strict', action='store', default='') + optional.add_argument('--filter', help='Switch the low complexity filter for the blast search on.', action='store', default='no') + optional.add_argument('--fasoff', help='Turn OFF FAS support', action='store_true', default=False) + optional.add_argument('--pathFile', help='Config file contains paths to data folder (in yaml format)', action='store', default='') + optional.add_argument('--searchTaxon', help='Search Taxon name', action='store', default='') + optional.add_argument('--silent', help='Output will only be written into the log file', action='store_true', default=False) + optional.add_argument('--debug', help='Stdout and Stderr from fdog.assembly and every used tool will be printed', action='store_true', default=False) + + + args = parser.parse_args() + + # required + group = args.gene + augustus_ref_species = args.augustusRefSpec + fdog_ref_species = 
args.refSpec + #paths user input + assemblyDir = args.assemblyPath + dataPath = args.dataPath + core_path = args.coregroupPath + out = args.out + pathFile = args.pathFile + #I/O + tmp = args.tmp + strict = args.strict + checkCoorthologs = args.checkCoorthologsRef + filter = args.filter + if filter == True or filter == 'yes': + filter = 'yes' + else: + filter = 'no' + #others + average_intron_length = args.avIntron + length_extension = args.lengthExtension + searchTool = args.searchTool + evalue = args.evalBlast + msaTool = args.msaTool + matrix = args.scoringmatrix + taxa = args.coreTaxa + if taxa == '': + taxa =[] + else: + taxa = taxa.split(",") + fasoff = args.fasoff + searchTaxon = args.searchTaxon + silent = args.silent + debug = args.debug + + if debug == True and silent == True: + print("It's not possible to use booth modes, please restart and use --debug or --silent") + return 1 + else: + if debug == True: + mode = 'debug' + elif silent == True: + mode = 'silent' + else: + mode = 'normal' + + #checking paths + if dataPath == '': + fdogPath = os.path.realpath(__file__).replace('/fDOGassembly.py','') + configFile = fdogPath + '/bin/pathconfig.txt' + if not os.path.exists(configFile): + sys.exit('No pathconfig.txt found. 
Please run fdog.setup (https://github.com/BIONF/fDOG/wiki/Installation#setup-fdog) or give a dataPath') + if pathFile == '': + with open(configFile) as f: + dataPath = f.readline().strip() + else: + cfg = load_config(pathFile) + try: + dataPath = cfg['dataPath'] + except: + dataPath = 'config' + if core_path == '': + core_path = out + '/core_orthologs/' + else: + if not core_path.endswith('/'): + core_path = core_path + '/' + + if assemblyDir == '': + assemblyDir = dataPath + '/assembly_dir/' + if out == '': + #print('test out \n') + out = os.getcwd() + os.system('mkdir ' + out + '/' + group + ' >/dev/null 2>&1') + out = out + '/' + group + '/' + else: + if out[-1] != "/": + out = out + "/" + + + try: + f = open(out + "/fdog.log", "a+") + except FileNotFoundError: + f = open(out + "/fdog.log", "w") + + ################## How to handle std output and std error ################## + + if mode == 'silent': + sys.stderr = f + sys.stdout = f + else: + sys.stdout = Logger(f) + + # user input has to be checked here before fDOGassembly continues + assembly_names = os.listdir(assemblyDir) + + ########################## some variables ################################## + + refBool = False # checks if sequences of reference species were already part of the extended.fa file + + ########### paths ########### + + msa_path = core_path + "/" + group +"/"+ group + ".aln" + hmm_path = core_path + "/" + group +"/hmm_dir/"+ group + ".hmm" + fasta_path = core_path + "/" + group +"/"+ group + ".fa" + consensus_path = out + "/tmp/" + group + ".con" + profile_path = out + "/tmp/" + group + ".prfl" + + ###################### create tmp folder ################################### + + cmd = 'mkdir ' + out + '/tmp' + starting_subprocess(cmd, 'silent') + + ######################## consensus sequence ################################ + + #make a majority-rule consensus sequence with the tool hmmemit from hmmer + print("Building a consensus sequence for gene " + group + " \n") + cmd = 'hmmemit -c 
-o' + consensus_path + ' ' + hmm_path + starting_subprocess(cmd, mode) + print("consensus sequence is finished\n") + + ######################## block profile ##################################### + + print("Building a block profile for gene " + group + " \n") + cmd = 'msa2prfl.pl ' + msa_path + ' --setname=' + group + ' >' + profile_path + starting_subprocess(cmd, 'silent') + + if int(os.path.getsize(profile_path)) > 0: + print("block profile is finished \n") + else: + print("Building block profiles failed. Using prepareAlign to convert alignment\n") + new_path = core_path + group +"/"+ group + "_new.aln" + #print(cmd) + cmd = 'prepareAlign < ' + msa_path + ' > ' + new_path + starting_subprocess(cmd, mode) + cmd = 'msa2prfl.pl ' + new_path + ' --setname=' + group + ' >' + profile_path + #print(cmd) + starting_subprocess(cmd, 'silent') + print("block profile is finished \n") + + searchBool = False + + #################### fDOG assembly computation for all species ############# + for asName in assembly_names: + if searchBool == True: + break + if searchTaxon != '' and searchBool == False: + asName = searchTaxon + searchBool = True + + ################### path definitions ################################### + + cmd = 'mkdir ' + out + '/tmp/' + asName + starting_subprocess(cmd, 'silent') + tmp_path = out + "/tmp/" + asName + "/" + candidatesOutFile = tmp_path + group + ".candidates.fa" + if searchTaxon != '': + orthologsOutFile = out + "/" + group + "_" + asName + ".extended.fa" + fasOutFile = out + "/" + group + "_" + asName + mappingFile = tmp_path + group + "_" + asName + ".mapping.txt" + else: + orthologsOutFile = out + "/" + group + ".extended.fa" + fasOutFile = out + "/" + group + mappingFile = out + "/tmp/" + group + ".mapping.txt" + + print("Searching in species " + asName + "\n") + assembly_path = assemblyDir + "/" + asName + "/" + asName + ".fa" + db_path = assemblyDir + "/" + asName + "/blast_dir/" + asName + ".fa" + + ######################## tBLASTn 
########################################### + #checks if data base exists already + db_check = searching_for_db(db_path) + if db_check == 0: + print("creating a blast data base \n") + cmd = 'makeblastdb -in ' + assembly_path + ' -dbtype nucl -parse_seqids -out ' + db_path + starting_subprocess(cmd, mode) + print("database is finished \n") + else: + print('blast data base exists already, continuing...') + + #makes a tBLASTn search against the new database + #codon table argument [-db_gencode int_value], table available ftp://ftp.ncbi.nih.gov/entrez/misc/data/gc.prt + print("tBLASTn search against data base") + cmd = 'tblastn -db ' + db_path + ' -query ' + consensus_path + ' -outfmt "6 sseqid sstart send evalue qstart qend " -evalue ' + str(evalue) + ' -out ' + tmp_path + '/blast_results.out' + starting_subprocess(cmd, mode) + print("tBLASTn search is finished") + + ################### search for candidate regions and extract seq ########### + # parse blast and filter for candiate regions + regions, number_regions = candidate_regions(average_intron_length, evalue, tmp_path) + + if regions == 0: + #no candidat region are available, no ortholog can be found + print("No candidate region found") + if refBool == True: + continue + else: + taxa = [fdog_ref_species] + reciprocal_sequences = 0 + else: + print(str(number_regions) + " candiate regions were found. 
Extracting sequences...") + extract_seq(regions, db_path, tmp_path, mode) + + ############### make Augustus PPX search ################################### + + print("starting augustus ppx \n") + augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, augustus_ref_species, asName, group, tmp_path, mode) + print("augustus is finished \n") + + ################# backward search to filter for orthologs################### + if int(os.path.getsize(candidatesOutFile)) <= 0: + print("No genes found at candidate regions\n") + if searchTaxon == '' and refBool == True: + continue + else: + reciprocal_sequences = 0 + taxa = [fdog_ref_species] + else: + reciprocal_sequences, taxa = backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, evalue, taxa, searchTool, checkCoorthologs, msaTool, matrix, dataPath, filter, tmp_path, mode) + + + ################## checking accepted genes for co-orthologs ################ + if reciprocal_sequences == 0: + if regions != 0: + print("No ortholog fulfilled the reciprocity criteria") + if searchTaxon == '' and refBool == True: + continue + else: + reciprocal_sequences = 0 + else: + reciprocal_sequences = coorthologs(reciprocal_sequences, tmp_path, candidatesOutFile, fasta_path, fdog_ref_species, msaTool, matrix) + + ################ add sequences to extended.fa in the output folder########## + + addSequences(reciprocal_sequences, candidatesOutFile, fasta_path, orthologsOutFile, group, taxa, refBool, tmp_path) + refBool = True + + ############### make Annotation with FAS ################################### + # if we want to search in only one Taxon + if searchTaxon != '' and fasoff == False: + print("Calculating FAS scores") + fas_seed_id = createFasInput(orthologsOutFile, mappingFile) + # bug in calcFAS when using --tsv, have to wait till it's fixed before I can use the option + cmd = 'mkdir ' + tmp_path + 'anno_dir' + starting_subprocess(cmd, 'silent') + cmd = 'calcFAS --seed ' + fasta_path + ' --query ' + 
orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group + '_' + asName + starting_subprocess(cmd, 'silent') + clean_fas(fasOutFile + "_forward.domains", 'domains') + clean_fas(fasOutFile + "_reverse.domains", 'domains') + clean_fas(fasOutFile + ".phyloprofile", 'phyloprofile') + + + #if we searched in more than one Taxon and no ortholog was found + + if refBool == False and searchTaxon == '': + print("No orthologs found. Exciting ...") + cleanup(tmp, tmp_path) + return 1 + #if we searched in more than one taxon + if fasoff == False and searchTaxon == '': + print("Calculating FAS scores") + tmp_path = out + '/tmp/' + fas_seed_id = createFasInput(orthologsOutFile, mappingFile) + # bug in calcFAS when using --tsv, have to wait till it's fixed before I can use the option + cmd = 'calcFAS --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group + starting_subprocess(cmd, 'silent') + clean_fas(out + group + "_forward.domains", 'domains') + clean_fas(out + group + "_reverse.domains", 'domains') + clean_fas(out + group + ".phyloprofile", 'phyloprofile') + ################# remove tmp folder ######################################## + if searchTaxon != '': + cleanup(tmp, tmp_path) + else: + cleanup(tmp, out + "/tmp/") + + f.close() + +if __name__ == '__main__': + main() diff --git a/fdog/fdog_goes_assembly/.DS_Store b/fdog/fdog_goes_assembly/.DS_Store deleted file mode 100644 index e0e9ff1be0aa35d6ef237330e7d7dd1ba746d1ec..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeHK%}(1u5S~p^*rck+0SS(K;TEB|ttxRqG9d_Y0?7ytv;wSiB?DPC6qc=cynH z(?=VMz9V&80{tcXOw7~QTk1`P}iS~paoEd 
z9Ot|DN7ViMQPD8#xxOBvUJ_?{v-v1W<;sVJMJa2t_Nn$GIM>4<5A#9R3i~HiI|%dh z$k~TlCogsw9mU<@X?=NHr+FBs-M&eVJ6#O+piEGRNGw@$AAS&&k-NH|Dd+W@{(O#=i&rnGyuCVx$f`-|O gF_yOCRa7JBS7adi4hxIuLE(ddp@9o#;GZ(^412?A3IG5A diff --git a/fdog/fdog_goes_assembly/fDOGassembly.py b/fdog/fdog_goes_assembly/fDOGassembly.py deleted file mode 100644 index ad4c362..0000000 --- a/fdog/fdog_goes_assembly/fDOGassembly.py +++ /dev/null @@ -1,209 +0,0 @@ -############################ imports ########################################### -import os -########################### functions ########################################## - - -def merge_regions(blast_results, cut_off): - number_regions = 0 - for key in blast_results: - locations = blast_results[key] - size_list = len(locations) - i = 0 - j = 1 - old_size = 0 - while size_list != old_size and i < size_list: - old_size = size_list - start = locations[i][0] - end = locations[i][1] - - #print(locations) - while j < size_list: - - # breakup point? or we have to skip this j - if (i == j) and (j + 1 < size_list): - j+=1 - elif (i == j): - break - - if (locations[i][0] < locations[j][0]) and (locations[i][1] > locations[j][0]): - # start is between start and end -> merge - locations[i][1] = max(locations[j][1], locations[i][1]) - locations[i][2] = min(locations[j][2], locations[i][2]) - locations.pop(j) - j -= 1 - elif (locations[i][0] < locations[j][1]) and (locations[i][1] > locations[j][1]): - #end is between start and end -> merge - locations[i][0] = min(locations[j][0], locations[i][0]) - locations[i][2] = min(locations[j][2], locations[i][2]) - locations.pop(j) - j -= 1 - elif (locations[i][0] > locations[j][1]) and (locations[i][0] - locations[j][1] <= cut_off): - # end is not more than cut-off distanced - locations[i][0] = locations[j][0] - locations[i][2] = min(locations[j][2], locations[i][2]) - locations.pop(j) - j -= 1 - elif (locations[i][1] < locations[j][0] and locations[j][0] - locations[i][1] <= cut_off): - # start is not more than cut-off 
distanced - locations[i][0] = locations[j][0] - locations[i][2] = min(locations[j][2], locations[i][2]) - locations.pop(j) - j -= 1 - j += 1 - size_list = len(locations) - - i += 1 - j = 0 - number_regions += size_list - - return blast_results, number_regions - - -def parse_blast(line, blast_results): - # format blast line: - #fomrat dictionary: {node_name: [(,)]} - #print(line) - line = line.replace("\n", "") - line_info = line.split("\t") - #print(line_info) - evalue = float(line_info[3]) - - #cut off - if evalue > 0.0001: - return blast_results, evalue - #add region to dictionary - else: - node_name, start, end = line_info[0], line_info[1], line_info[2] - if node_name in blast_results: - list = blast_results[node_name] - list.append([int(start),int(end), evalue]) - blast_results[node_name] = list - else: - blast_results[node_name] = [[int(start),int(end), evalue]] - - return blast_results, evalue - - -def candidate_regions(cut_off): - ###################### extracting candidate regions ######################## - # info about output blast http://www.metagenomics.wiki/tools/blast/blastn-output-format-6 - blast_file = open("tmp/blast_results.out", "r") - - evalue = 0 - blast_results = {} - #parsing blast output - while True: - line = blast_file.readline() - #end of file is reached - if not line: - break - #parsing blast output - blast_results, evalue = parse_blast(line, blast_results) - #evalue cut-off - if not evalue <= 0.00001: - break - if blast_results == {}: - return 1 - else: - candidate_regions, number_regions = merge_regions(blast_results, cut_off) - #print(candidate_regions, number_regions) - return candidate_regions, number_regions - - -def extract_seq(region_dic, path): - #print(region_dic) - for key in region_dic: - os.system("blastdbcmd -db " + path + " -dbtype 'nucl' -entry " + key + " -out tmp/" + key + ".fasta -outfmt %f") - - -def main(): - - ########################### handle user input ############################## - - #user input core_ortholog 
group - #have to add an input option - - #core-ortholog group name - group = "778452" - - #species name assemblie (folder name in assemby folder) - species_name = "L.pustulata" - - #assembly species_name - assembly_name = "contigs.fa" - - augustus_ref_species = "saccharomyces_cerevisiae_S288C" - - cut_off_merging_candidates = 500 - - - ########################## paths ########################################### - - #open core_ortholog group - msa_path = "../data/core_orthologs/" + group +"/"+ group + ".aln" - hmm_path = "../data/core_orthologs/" + group +"/hmm_dir/"+ group + ".hmm" - consensus_path = "tmp/" + group + ".con" - profile_path = "tmp/" + group + ".prfl" - path_assembly = "../data/assembly_dir/" + species_name + "/" + assembly_name - - os.system('mkdir tmp') - - - ######################## consensus sequence ################################ - - #make a majority-rule consensus seqeunce with the tool hmmemit from hmmer - print("Building a consensus sequence \n") - os.system('hmmemit -c -o' + consensus_path + ' ' + hmm_path) - print("consensus seqeunce is finished\n") - - ######################## block profile ##################################### - print("Building a block profile \n") - - os.system('msa2prfl.pl ' + msa_path + ' --setname=' + group + ' >' + profile_path) - print("block profile is finished \n") - ######################## tBLASTn ########################################### - - #database anlegen - print("creating a blast database \n") - os.system('makeblastdb -in ' + path_assembly + ' -dbtype nucl -parse_seqids -out ' + path_assembly) - print("database is finished \n") - - #make a tBLASTn search against the new database - - os.system('tblastn -db ' + path_assembly + ' -query ' + consensus_path + ' -outfmt "6 sseqid sstart send evalue bitscore" -out tmp/blast_results.out') - - ################### search for candidate regions and extract seq ########### - - # parse blast and filter for candiate regions - regions, number_regions = 
candidate_regions(cut_off_merging_candidates) - - if regions == 1: - #no candidat region are available, no ortholog can be found - print("No candidate region found") - os.system('rm -r tmp/') - return 1 - - else: - print(str(number_regions) + " candiate regions were found. Extracting sequences.") - extract_seq(regions, path_assembly) - - ############### make Augustus PPX search #################################### - for key in regions: - locations = regions[key] - counter = 0 - for i in locations: - counter += 1 - start = str(i[0]) - end = str(i[1]) - if start < end: - #print("augustus --proteinprofile=" + profile_path + " --predictionStart=" + start + " --predictionEnd=" + end + " --species=" + augustus_ref_species + " tmp/" + key + ".fasta > tmp/" + key + ".gff") - os.system("augustus --proteinprofile=" + profile_path + " --predictionStart=" + start + " --predictionEnd=" + end + " --species=" + augustus_ref_species + " tmp/" + key + ".fasta > tmp/" + key + "_" + str(counter) + ".gff") - else: - os.system("augustus --proteinprofile=" + profile_path + " --predictionStart=" + end + " --predictionEnd=" + start + " --species=" + augustus_ref_species + " tmp/" + key + ".fasta > tmp/" + key + "_" + str(counter) + ".gff") - - ################# remove tmp folder ######################################## - - #have to be added after program ist finished, maybe use parametere so that the user can turn it off - -if __name__ == '__main__': - main() diff --git a/fdog/mergeAssemblyOutput.py b/fdog/mergeAssemblyOutput.py new file mode 100644 index 0000000..1606b1d --- /dev/null +++ b/fdog/mergeAssemblyOutput.py @@ -0,0 +1,124 @@ +# -*- coding: utf-8 -*- + +####################################################################### +# Copyright (C) 2020 Vinh Tran +# +# This script is used to merge all output files (.extended.fa, .phyloprofile, +# _forward.domains, _reverse.domains) in a given directory into one file each. 
+# +# This script is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for +# more details +# +# Contact: hannah.muelbaier@stud.uni-frankfurt.de +# +####################################################################### + +import sys +import os +from os import listdir as ldir +import argparse +from pathlib import Path + +def main(): + version = '0.0.1' + parser = argparse.ArgumentParser(description='You are running fdog.mergeAssemblyOutput version ' + str(version) + '.') + parser.add_argument('-i','--input', help='Input directory, where all single output (.extended.fa, .phyloprofile, _forward.domains, _reverse.domains) can be found', + action='store', default='', required=True) + parser.add_argument('-o','--output', help='Output name', action='store', default='', required=True) + parser.add_argument('-c', '--cleanup', help='Deletes the merged output files from fDOG', action='store_true', default=False) + args = parser.parse_args() + + directory = args.input + out = args.output + cleanup = args.cleanup + if not os.path.exists(os.path.abspath(directory)): + sys.exit('%s not found' % directory) + else: + directory = os.path.abspath(directory) + + phyloprofile = None + set_phylo = set() + domains_0 = None + set_domains_f = set() + domains_1 = None + set_domains_r = set() + ex_fasta = None + set_fasta = set() + header_bool = False + for infile in ldir(directory): + if infile.endswith('.phyloprofile') and not infile == out + '.phyloprofile': + if not phyloprofile: + phyloprofile = open(out + '.phyloprofile', 'w') + phyloprofile.write('geneID\tncbiID\torthoID\tFAS_F\tFAS_B\n') + with open(directory + '/' + infile, 'r') as reader: + lines = reader.readlines() + for line in lines: + if line != 'geneID\tncbiID\torthoID\tFAS_F\tFAS_B\n' and line not in set_phylo: + phyloprofile.write(line) + if len(lines) > 1: + 
set_phylo = set(lines) + if cleanup == True: + os.remove(directory + '/' + infile) + elif infile.endswith('_forward.domains') and not infile == out + '_forward.domains': + if not domains_0: + domains_0 = open(out + '_forward.domains', 'w') + with open(directory + '/' + infile, 'r') as reader: + lines = reader.readlines() + for line in lines: + if line not in set_domains_f: + domains_0.write(line) + if len(lines) > 1: + set_domains_f = set(lines) + if cleanup == True: + os.remove(directory + '/' + infile) + elif infile.endswith('_reverse.domains') and not infile == out + '_reverse.domains': + if not domains_1: + domains_1 = open(out + '_reverse.domains', 'w') + with open(directory + '/' + infile, 'r') as reader: + lines = reader.readlines() + for line in lines: + if line not in set_domains_r: + domains_1.write(line) + if len(lines) > 1: + set_domains_r = set(lines) + if cleanup == True: + os.remove(directory + '/' + infile) + elif infile.endswith('.extended.fa') and not infile == out + '.extended.fa': + if not ex_fasta: + ex_fasta = open(out + '.extended.fa', 'w') + with open(directory + '/' + infile, 'r') as reader: + lines = reader.readlines() + header = set() + #print(set_fasta) + for line in lines: + if line[0] == ">": + header.add(line) + if line not in set_fasta: + ex_fasta.write(line) + header_bool = True + else: + header_bool = False + else: + if header_bool == True: + ex_fasta.write(line) + set_fasta = header + if cleanup == True: + os.remove(directory + '/' +infile) + elif infile.endswith('.tsv'): + os.remove(directory + '/' + infile) + + if phyloprofile: + phyloprofile.close() + if domains_0: + domains_0.close() + if domains_1: + domains_1.close() + if ex_fasta: + ex_fasta.close() + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/fdog/runMulti.py b/fdog/runMulti.py index 65335d5..a696495 100644 --- a/fdog/runMulti.py +++ b/fdog/runMulti.py @@ -28,6 +28,7 @@ from tqdm import tqdm import fdog.runSingle as fdogFn import shutil +import yaml def 
getSortedFiles(directory): list = os.listdir(directory) @@ -46,8 +47,8 @@ def prepare(args, step): outpath, hmmpath, blastpath, searchpath, weightpath, coreOnly, reuseCore, coreTaxa, coreStrict, CorecheckCoorthologsRef, coreRep, coreHitLimit, distDeviation, fasoff, countercheck, coreFilter, minScore, - strict, checkCoorthologsRef, rbh, rep, ignoreDistance, lowComplexityFilterOff, evalBlast, evalHmmer, evalRelaxfac, hitLimit, autoLimit, scoreThreshold, scoreCutoff, aligner, local, glocal, searchTaxa, - cpu, hyperthread, debug, silent) = args + strict, checkCoorthologsRef, rbh, rep, ignoreDistance, lowComplexityFilter, evalBlast, evalHmmer, evalRelaxfac, hitLimit, autoLimit, scoreThreshold, scoreCutoff, aligner, local, glocal, searchTaxa, + cpu, hyperthread, debug, silent, assembly, assemblyFile, augustusRefSpec, avIntron, lengthExtension, searchTool, matrix) = args mute = False if step == 'core': @@ -67,9 +68,10 @@ def prepare(args, step): pathArgs = [outpath, hmmpath, blastpath, searchpath, weightpath] coreArgs = [coreOnly, reuseCore, coreTaxa, coreStrict, CorecheckCoorthologsRef, coreRep, coreHitLimit, distDeviation] fasArgs = [fasoff, countercheck, coreFilter, minScore] - orthoArgs = [strict, checkCoorthologsRef, rbh, rep, ignoreDistance, lowComplexityFilterOff, evalBlast, evalHmmer, evalRelaxfac, hitLimit, autoLimit, scoreThreshold, scoreCutoff, aligner, local, glocal, searchTaxa] + orthoArgs = [strict, checkCoorthologsRef, rbh, rep, ignoreDistance, lowComplexityFilter, evalBlast, evalHmmer, evalRelaxfac, hitLimit, autoLimit, scoreThreshold, scoreCutoff, aligner, local, glocal, searchTaxa] otherArgs = [cpu, hyperthread, debug, True] - return(basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, mute) + assemblyArgs = [assembly, assemblyFile, augustusRefSpec, avIntron, lengthExtension, searchTool, matrix] + return(basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, assemblyArgs, mute) def getSeedName(seedFile): seqName = 
seedFile.split('.')[0] @@ -104,17 +106,20 @@ def compileCore(options, seeds, inFol, cpu, outpath): for seed in seeds: seqFile = [inFol + '/' + seed] seqName = getSeedName(seed) - (basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, mute) = prepare(seqFile + [seqName] + options, 'core') - coreCompilationJobs.append([basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, mute]) - pool = mp.Pool(cpu) - coreOut = [] - for _ in tqdm(pool.imap_unordered(fdogFn.runSingle, coreCompilationJobs), total=len(coreCompilationJobs)): - coreOut.append(_) - pool.close() - pool.join() + + if not os.path.exists('%s/core_orthologs/%s/hmm_dir/%s.hmm' % (outpath, seqName, seqName)): + (basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, mute) = prepare(seqFile + [seqName] + options, 'core') + coreCompilationJobs.append([basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, assemblyArgs, mute]) + if len(coreCompilationJobs) > 0: + pool = mp.Pool(cpu) + coreOut = [] + for _ in tqdm(pool.imap_unordered(fdogFn.runSingle, coreCompilationJobs), total=len(coreCompilationJobs)): + coreOut.append(_) + pool.close() + pool.join() + # read logs file to get runtime for individual seeds + getIndividualRuntime('core', outpath, seeds) end = time.time() - # read logs file to get runtime for individual seeds - getIndividualRuntime('core', outpath, seeds) multiCoreTime = '{:5.3f}'.format(end-start) print('==> Core compiling finished in %s sec' % multiCoreTime) #'{:5.3f}s'.format(end-start)) return(multiCoreTime) @@ -126,7 +131,7 @@ def searchOrtho(options, seeds, inFol, cpu, outpath): for seed in seeds: seqFile = [inFol + '/' + seed] seqName = getSeedName(seed) - (basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, mute) = prepare(seqFile + [seqName] + options, 'ortholog') + (basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, assemblyArgs, mute) = prepare(seqFile + [seqName] + options, 'ortholog') if 
mute == True: print(seed) else: @@ -139,7 +144,7 @@ def searchOrtho(options, seeds, inFol, cpu, outpath): print('==> Ortholog search finished in %s sec' % multiOrthoTime) return(multiOrthoTime) -def joinOutputs(outpath, jobName, seeds, keep): +def joinOutputs(outpath, jobName, seeds, keep, silent): print('Joining single outputs...') finalFa = '%s/%s.extended.fa' % (outpath, jobName) Path(outpath+'/singleOutput').mkdir(parents=True, exist_ok=True) @@ -147,14 +152,20 @@ def joinOutputs(outpath, jobName, seeds, keep): for seed in seeds: seqName = getSeedName(seed) resultFile = '%s/%s/%s.extended.fa' % (outpath, seqName, seqName) + if silent == False: + print(resultFile) if os.path.exists(resultFile): with open(resultFile,'rb') as fd: shutil.copyfileobj(fd, wfd) shutil.move(outpath + '/' + seqName, outpath + '/singleOutput') else: Path(outpath+'/missingOutput').mkdir(parents=True, exist_ok=True) - shutil.move(outpath + '/' + seqName, outpath + '/missingOutput') + if not os.path.exists(outpath + '/missingOutput/' + seqName): + shutil.move(outpath + '/' + seqName, outpath + '/missingOutput') + if os.path.exists(outpath + '/' + seqName + '.fa'): os.remove(outpath + '/' + seqName + '.fa') + if os.path.exists(os.getcwd() + '/' + seqName + '.fa'): + os.remove(os.getcwd() + '/' + seqName + '.fa') if keep == True: try: print('Compressing single outputs...') @@ -180,7 +191,7 @@ def calcFAS (outpath, extendedFa, weightpath, cpu): sys.exit('Problem running\n%s' % (fasCmd)) def main(): - version = '0.0.13' + version = '0.0.33' parser = argparse.ArgumentParser(description='You are running fdogs.run version ' + str(version) + '.') parser.add_argument('--version', action='version', version=str(version)) required = parser.add_argument_group('Required arguments') @@ -197,10 +208,12 @@ def main(): optional_paths.add_argument('--blastpath', help='Path for the blastDB directory', action='store', default='') optional_paths.add_argument('--searchpath', help='Path for the search taxa 
directory', action='store', default='') optional_paths.add_argument('--weightpath', help='Path for the pre-calculated feature annotion directory', action='store', default='') + optional_paths.add_argument('--pathFile', help='Config file contains paths to data folder (in yaml format)', action='store', default='') addtionalIO = parser.add_argument_group('Other I/O options') addtionalIO.add_argument('--append', help='Append the output to existing output files', action='store_true', default=False) addtionalIO.add_argument('--force', help='Overwrite existing output files', action='store_true', default=False) + addtionalIO.add_argument('--forceComplete', help='Overwrite existing core orthologs and all output files', action='store_true', default=False) addtionalIO.add_argument('--cleanup', help='Temporary output will be deleted. Default: True', action='store_true', default=True) addtionalIO.add_argument('--keep', help='Keep output of individual seed sequence. Default: False', action='store_true', default=False) addtionalIO.add_argument('--group', help='Allows to limit the search to a certain systematic group', action='store', default='') @@ -229,8 +242,15 @@ def main(): action='store', default=3, type=int) core_options.add_argument('--distDeviation', help='The deviation in score in percent (0 = 0 percent, 1 = 100 percent) allowed for two taxa to be considered similar. Default: 0.05', action='store', default=0.05, type=float) + core_options.add_argument('--ignoreDistance', help='Ignore the distance between Taxa and to choose orthologs only based on score', + action='store_true', default=False) + core_options.add_argument('--local', help='Specify the alignment strategy during core ortholog compilation. Default: True', + action='store_true', default=True) + core_options.add_argument('--glocal', help='Specify the alignment strategy during core ortholog compilation. 
Default: False', + action='store_true', default=False) ortho_options = parser.add_argument_group('Search strategy options') + ortho_options.add_argument('--searchTaxa', help='Specify list of search taxa', action='store', default='') ortho_options.add_argument('--strict', help='An ortholog is only then accepted when the reciprocity is fulfilled for each sequence in the core set', action='store_true', default=False) ortho_options.add_argument('--checkCoorthologsRef', help='During the final ortholog search, accept an ortholog also when its best hit in the reverse search is not the core ortholog itself, but a co-ortholog of it', @@ -239,9 +259,7 @@ def main(): action='store_true', default=False) ortho_options.add_argument('--rep', help='Obtain only the sequence being most similar to the corresponding sequence in the core set rather than all putative co-orthologs', action='store_true', default=False) - ortho_options.add_argument('--ignoreDistance', help='Ignore the distance between Taxa and to choose orthologs only based on score', - action='store_true', default=False) - ortho_options.add_argument('--lowComplexityFilterOff', help='Switch on or off the low complexity filter for the blast search. Default: False', + ortho_options.add_argument('--lowComplexityFilter', help='Switch the low complexity filter for the blast search on. Default: False', action='store_true', default=False) ortho_options.add_argument('--evalBlast', help='E-value cut-off for the Blast search. Default: 0.00005', action='store', default=0.00005, type=float) @@ -257,13 +275,6 @@ def main(): action='store_true', default=False) ortho_options.add_argument('--scoreCutoff', help='In combination with -scoreThreshold you can define the percent range of the hmms core of the best hit up to which a candidate of the hmmsearch will be subjected for further evaluation. 
Default: 10', action='store', default=10, type=int) - ortho_options.add_argument('--aligner', help='Choose between mafft-linsi or muscle for the multiple sequence alignment. DEFAULT: muscle', - choices=['mafft-linsi', 'muscle'], action='store', default='muscle') - ortho_options.add_argument('--local', help='Specify the alignment strategy during core ortholog compilation. Default: True', - action='store_true', default=True) - ortho_options.add_argument('--glocal', help='Specify the alignment strategy during core ortholog compilation. Default: False', - action='store_true', default=False) - ortho_options.add_argument('--searchTaxa', help='Specify list of search taxa', action='store', default='') fas_options = parser.add_argument_group('FAS options') fas_options.add_argument('--fasoff', help='Turn OFF FAS support', action='store_true', default=False) @@ -274,11 +285,21 @@ def main(): fas_options.add_argument('--minScore', help='Specify the threshold for coreFilter. Default: 0.75', action='store', default=0.75, type=float) optional = parser.add_argument_group('Other options') + optional.add_argument('--aligner', help='Choose between mafft-linsi or muscle for the multiple sequence alignment. DEFAULT: muscle', + choices=['mafft-linsi', 'muscle'], action='store', default='muscle') optional.add_argument('--cpu', help='Determine the number of threads to be run in parallel. Default: 4', action='store', default=4, type=int) optional.add_argument('--hyperthread', help='Set this flag to use hyper threading. 
Default: False', action='store_true', default=False) optional.add_argument('--debug', help='Set this flag to obtain more detailed information about the programs actions', action='store_true', default=False) optional.add_argument('--silentOff', help='Show more output to terminal', action='store_true', default=False) + assembly_options = parser.add_argument_group('Assembly options') + assembly_options.add_argument('--assembly', help='Turn on support of assembly input files',action='store_true', default=False) + assembly_options.add_argument('--assemblyFile', help='Input file containing the assembly seqeunce', action='store', default='') + assembly_options.add_argument('--augustusRefSpec', help='augustus reference species', action='store', default='') + assembly_options.add_argument('--avIntron', help='average Intron length of the assembly species', action='store', default=5000, type=int) + assembly_options.add_argument('--lengthExtension', help='length extension of the candidate region', action='store', default=5000, type=int) + assembly_options.add_argument('--searchTool', help='Choose between BLAST or Diamond as a alignemnt search tool. DEFAULT: BLAST', choices=['blast', 'diamond'], action='store', default='blast') + assembly_options.add_argument('--scoringmatrix', help ='Choose a scoring matrix for the distance criteria used by the option --checkCoorthologsRef. 
DEFAULT: blosum62', choices=['identity', 'blastn', 'trans', 'benner6', 'benner22', 'benner74', 'blosum100', 'blosum30', 'blosum35', 'blosum40', 'blosum45', 'blosum50', 'blosum55', 'blosum60', 'blosum62', 'blosum65', 'blosum70', 'blosum75', 'blosum80', 'blosum85', 'blosum90', 'blosum95', 'feng', 'fitch', 'genetic', 'gonnet', 'grant', 'ident', 'johnson', 'levin', 'mclach', 'miyata', 'nwsgappep', 'pam120', 'pam180', 'pam250', 'pam30', 'pam300', 'pam60', 'pam90', 'rao', 'risler', 'structure'], action='store', default='blosum62') ### get arguments args = parser.parse_args() @@ -297,10 +318,12 @@ def main(): blastpath = args.blastpath searchpath = args.searchpath weightpath = args.weightpath + pathFile = args.pathFile # other I/O arguments append = args.append force = args.force + forceComplete = args.forceComplete cleanup = args.cleanup keep = args.keep group = args.group @@ -323,7 +346,7 @@ def main(): rbh = args.rbh rep = args.rep ignoreDistance = args.ignoreDistance - lowComplexityFilterOff = args.lowComplexityFilterOff + lowComplexityFilter = args.lowComplexityFilter evalBlast = args.evalBlast evalHmmer = args.evalHmmer evalRelaxfac = args.evalRelaxfac @@ -351,22 +374,89 @@ def main(): silent = False else: silent = True + + #fdog_goes_assembly arguments + assembly = args.assembly + assemblyFile = args.assemblyFile + augustusRefSpec = args.augustusRefSpec + avIntron = args.avIntron + lengthExtension = args.lengthExtension + searchTool = args.searchTool + matrix = args.scoringmatrix + + ### check fas + if not fasoff: + try: + fasVersion = subprocess.run(['calcFAS --version'], shell = True, capture_output = True, check = True) + except: + sys.exit('Problem with calcFAS! 
Please check https://github.com/BIONF/FAS or turn it off if not needed!') + + ### delete output folder and files if needed + if forceComplete: + if os.path.exists(outpath): + print("Removing existing output directory %s" % outpath) + shutil.rmtree(outpath) + Path(outpath).mkdir(parents=True, exist_ok=True) + if force: + if os.path.exists(outpath): + print("Removing existing files %s in %s*" % (jobName, outpath)) + outfiles = os.listdir(outpath) + for item in outfiles: + if item.startswith(jobName): + os.remove(os.path.join(outpath, item)) + if item.startswith("runtime"): + os.remove(os.path.join(outpath, item)) + if os.path.exists(outpath + '/missing.txt'): + os.remove(outpath + '/missing.txt') ### get fdog and data path + dataPath = '' fdogPath = os.path.realpath(__file__).replace('/runMulti.py','') pathconfigFile = fdogPath + '/bin/pathconfig.txt' if not os.path.exists(pathconfigFile): sys.exit('No pathconfig.txt found. Please run fdog.setup (https://github.com/BIONF/fDOG/wiki/Installation#setup-fdog).') - with open(pathconfigFile) as f: - dataPath = f.readline().strip() + if pathFile == '': + with open(pathconfigFile) as f: + dataPath = f.readline().strip() + else: + cfg = fdogFn.load_config(pathFile) + try: + dataPath = cfg['dataPath'] + except: + dataPath = 'config' + if hmmpath == '': - hmmpath = dataPath + '/core_orthologs' + hmmpath = outpath + '/core_orthologs' + # hmmpath = dataPath + '/core_orthologs' + # if dataPath == 'config': + # try: + # hmmpath = cfg['hmmpath'] + # except: + # sys.exit('hmmpath not found in %s. Please check https://github.com/BIONF/fDOG/wiki/Input-and-Output-Files#data-structure' % pathFile) + else: + hmmpath = os.path.abspath(hmmpath) if blastpath == '': blastpath = dataPath + '/blast_dir' + if dataPath == 'config': + try: + blastpath = cfg['blastpath'] + except: + sys.exit('blastpath not found in %s. 
Please check https://github.com/BIONF/fDOG/wiki/Input-and-Output-Files#data-structure' % pathFile) if searchpath == '': searchpath = dataPath + '/genome_dir' + if dataPath == 'config': + try: + searchpath = cfg['searchpath'] + except: + sys.exit('searchpath not found in %s. Please check https://github.com/BIONF/fDOG/wiki/Input-and-Output-Files#data-structure' % pathFile) if weightpath == '': weightpath = dataPath + '/weight_dir' + if dataPath == 'config': + try: + weightpath = cfg['weightpath'] + except: + sys.exit('weightpath not found in %s. Please check https://github.com/BIONF/fDOG/wiki/Input-and-Output-Files#data-structure' % pathFile) + ### join options options = [fdogPath, refspec, minDist, maxDist, coreOrth, @@ -374,10 +464,11 @@ def main(): outpath, hmmpath, blastpath, searchpath, weightpath, coreOnly, reuseCore, coreTaxa, coreStrict, CorecheckCoorthologsRef, coreRep, coreHitLimit, distDeviation, fasoff, countercheck, coreFilter, minScore, - strict, checkCoorthologsRef, rbh, rep, ignoreDistance, lowComplexityFilterOff, evalBlast, evalHmmer, evalRelaxfac, hitLimit, autoLimit, scoreThreshold, scoreCutoff, aligner, local, glocal, searchTaxa, - cpu, hyperthread, debug, silent] + strict, checkCoorthologsRef, rbh, rep, ignoreDistance, lowComplexityFilter, evalBlast, evalHmmer, evalRelaxfac, hitLimit, autoLimit, scoreThreshold, scoreCutoff, aligner, local, glocal, searchTaxa, + cpu, hyperthread, debug, silent, assembly, assemblyFile, augustusRefSpec, avIntron, lengthExtension, searchTool, matrix] ### START + Path(outpath).mkdir(parents=True, exist_ok=True) multiLog = open(outpath + '/' + jobName + '_log.txt', "w") fdogStart = time.time() seeds = getSortedFiles(inFol) @@ -388,30 +479,40 @@ def main(): if reuseCore == False: multiCoreTime = compileCore(options, seeds, inFol, cpu, outpath) multiLog.write('==> Core compilation finished in %s sec\n' % multiCoreTime) + else: + if not os.path.exists(hmmpath): + sys.exit('--reuseCore was set, but no core orthologs found 
in %s! You could use --hmmpath to manually specify the core ortholog directory.' % outpath) ### do ortholog search if coreOnly == False: - ### create list of search taxa - searchTaxa = '' - searchGroup = 'all' - if not group == '': - print('Creating list for search taxa...') - searchTaxa = '%s/searchTaxa.txt' % (outpath) - searchGroup = group - cmd = 'perl %s/bin/getSearchTaxa.pl -i %s -b %s -h %s -r %s -n %s -t %s/taxonomy -o %s' % (fdogPath, searchpath, evalBlast, evalHmmer, evalRelaxfac, searchGroup, fdogPath, searchTaxa) - try: - subprocess.call([cmd], shell = True) - except: - sys.exit('Problem running\n%s' % (cmd)) - ### run ortholog search - multiOrthoTime = searchOrtho(options, seeds, inFol, cpu, outpath) - multiLog.write('==> Ortholog search finished in %s sec\n' % multiOrthoTime) - ### join output - finalFa = joinOutputs(outpath, jobName, seeds, keep) + if not os.path.exists('%s/%s.extended.fa' % (outpath, jobName)): + ### create list of search taxa + searchTaxa = '' + searchGroup = 'all' + if not group == '': + print('Creating list for search taxa...') + searchTaxa = '%s/searchTaxa.txt' % (outpath) + searchGroup = group + cmd = 'perl %s/bin/getSearchTaxa.pl -i %s -b %s -h %s -r %s -n %s -t %s/taxonomy -o %s' % (fdogPath, searchpath, evalBlast, evalHmmer, evalRelaxfac, searchGroup, fdogPath, searchTaxa) + try: + subprocess.call([cmd], shell = True) + except: + sys.exit('Problem running\n%s' % (cmd)) + ### run ortholog search + multiOrthoTime = searchOrtho(options, seeds, inFol, cpu, outpath) + multiLog.write('==> Ortholog search finished in %s sec\n' % multiOrthoTime) + ### join output + finalFa = joinOutputs(outpath, jobName, seeds, keep, silent) + else: + print("%s.extended.fa found in %s! If you want to re-run the ortholog search, please use --force option." 
% (jobName, outpath)) ### calculate FAS scores if fasoff == False: - fasTime = calcFAS(outpath, finalFa, weightpath, cpu) - multiLog.write('==> FAS calculation finished in %s sec\n' % fasTime) + if not os.path.exists('%s/%s.phyloprofile' % (outpath, jobName)): + if os.path.exists(finalFa) and os.path.getsize(finalFa) > 0: + fasTime = calcFAS(outpath, finalFa, weightpath, cpu) + multiLog.write('==> FAS calculation finished in %s sec\n' % fasTime) + else: + print("Final fasta file %s not exists or empty!" % finalFa) fdogEnd = time.time() print('==> fdogs.run finished in ' + '{:5.3f}s'.format(fdogEnd-fdogStart)) diff --git a/fdog/runSingle.py b/fdog/runSingle.py index f235ff8..a0ded09 100644 --- a/fdog/runSingle.py +++ b/fdog/runSingle.py @@ -20,13 +20,24 @@ import argparse import subprocess from pathlib import Path +import yaml def checkFileExist(file): if not os.path.exists(os.path.abspath(file)): sys.exit('%s not found' % file) +def load_config(config_file): + with open(config_file, 'r') as stream: + try: + return yaml.safe_load(stream) + except yaml.YAMLError as exc: + print(exc) + def checkInput(args): (fdogPath, seqFile, refspec, outpath, hmmpath, blastpath, searchpath, weightpath) = args + # create output directory + Path(outpath).mkdir(parents=True, exist_ok=True) + Path(hmmpath).mkdir(parents=True, exist_ok=True) # check path existing for path in [hmmpath, blastpath, searchpath, weightpath]: checkFileExist(path) @@ -38,8 +49,6 @@ def checkInput(args): seqFile = fdogPath + '/data/' + seqFile else: seqFile = os.path.abspath(seqFile) - # create output directory - Path(outpath).mkdir(parents=True, exist_ok=True) # check refspec if not os.path.exists(os.path.abspath(blastpath+'/'+refspec)): exit('Reference taxon %s not found in %s' % (refspec, blastpath)) @@ -56,13 +65,13 @@ def getfdogInfo(fdogPath, infoType): exit('%s not found' % (fdogPath + '/bin/oneSeq.pl')) def runSingle(args): - (basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, mute) = 
args + (basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, assemblyArgs, mute) = args # basic command (fdogPath, seqFile, seqName, refspec, minDist, maxDist, coreOrth) = basicArgs cmd = 'perl %s/bin/oneSeq.pl -seqFile=%s -seqName=%s -refspec=%s' % (fdogPath, seqFile, seqName, refspec) # add paths - (outpath, hmmpath, blastpath, searchpath, weightpath) = pathArgs - cmd = cmd + ' -outpath=%s -hmmpath=%s -blastpath=%s -searchpath=%s -weightpath=%s' % (outpath, hmmpath, blastpath, searchpath, weightpath) + (outpath, hmmpath, blastpath, searchpath, weightpath, assemblypath) = pathArgs + cmd = cmd + ' -outpath=%s -hmmpath=%s -blastpath=%s -searchpath=%s -weightpath=%s -assemblypath=%s' % (outpath, hmmpath, blastpath, searchpath, weightpath, assemblypath) # add other I/O options (append, force, noCleanup, group, blast, db) = ioArgs if append == True: @@ -98,7 +107,7 @@ def runSingle(args): if not distDeviation == 0.05: cmd = cmd + ' -distDeviation=%s' % distDeviation # add ortholo search options - (strict, checkCoorthologsRef, rbh, rep, ignoreDistance, lowComplexityFilterOff, evalBlast, evalHmmer, evalRelaxfac, hitLimit, autoLimit, scoreThreshold, scoreCutoff, aligner, local, glocal, searchTaxa) = orthoArgs + (strict, checkCoorthologsRef, rbh, rep, ignoreDistance, lowComplexityFilter, evalBlast, evalHmmer, evalRelaxfac, hitLimit, autoLimit, scoreThreshold, scoreCutoff, aligner, local, glocal, searchTaxa) = orthoArgs if strict == True: cmd = cmd + ' -strict' if checkCoorthologsRef == True: @@ -109,8 +118,8 @@ def runSingle(args): cmd = cmd + ' -rep' if ignoreDistance == True: cmd = cmd + ' -ignoreDistance' - if lowComplexityFilterOff == True: - cmd = cmd + ' -filter=F' + if lowComplexityFilter == True: + cmd = cmd + ' -filter=T' if not evalBlast == 0.00005: cmd = cmd + ' -evalBlast=%s' % evalBlast if not evalHmmer == 0.00005: @@ -152,7 +161,28 @@ def runSingle(args): cmd = cmd + ' -debug' if silent == True: cmd = cmd + ' -silent' - # print(cmd) + # add 
assembly options + (assembly, assemblyFile, augustusRefSpec, avIntron, lengthExtension, searchTool, matrix, dataPath) = assemblyArgs + if assembly == True: + cmd = cmd + ' -assembly' + cmd = cmd + ' -reuseCore' + if not augustusRefSpec == '': + cmd = cmd + ' -augustusRefSpec=%s' % augustusRefSpec + else: + sys.exit('An augutus reference species is requiered by using the option --assembly') + if not avIntron == '': + cmd = cmd + ' -avIntron=%s' % avIntron + if not lengthExtension == '': + cmd = cmd + ' -lengthExtension=%s' % lengthExtension + if not assemblyFile == '': + cmd = cmd + ' -assemblyFile=%s' % assemblyFile + if not searchTool == '': + cmd = cmd + ' -searchTool=%s' % searchTool + if not matrix == '': + cmd = cmd + ' -scoringmatrix=%s' % matrix + if not dataPath == '': + cmd = cmd + ' -dataPath=%s' % dataPath + #print(cmd) if mute == True: cmd = cmd + ' > /dev/null 2>&1' try: @@ -161,7 +191,7 @@ def runSingle(args): sys.exit('Problem running\n%s' % (cmd)) def main(): - version = '0.0.13' + version = '0.0.33' parser = argparse.ArgumentParser(description='You are running fdog.run version ' + str(version) + '.') parser.add_argument('--version', action='version', version=str(version)) required = parser.add_argument_group('Required arguments') @@ -178,6 +208,9 @@ def main(): optional_paths.add_argument('--blastpath', help='Path for the blastDB directory', action='store', default='') optional_paths.add_argument('--searchpath', help='Path for the search taxa directory', action='store', default='') optional_paths.add_argument('--weightpath', help='Path for the pre-calculated feature annotion directory', action='store', default='') + optional_paths.add_argument('--pathFile', help='Config file contains paths to data folder (in yaml format)', action='store', default='') + optional_paths.add_argument('--assemblypath', help='Path for the assembly directory', action='store', default='') + addtionalIO = parser.add_argument_group('Other I/O options') 
addtionalIO.add_argument('--append', help='Append the output to existing output files', action='store_true', default=False) @@ -209,8 +242,15 @@ def main(): action='store', default=3, type=int) core_options.add_argument('--distDeviation', help='The deviation in score in percent (0 = 0 percent, 1 = 100 percent) allowed for two taxa to be considered similar. Default: 0.05', action='store', default=0.05, type=float) + core_options.add_argument('--ignoreDistance', help='Ignore the distance between Taxa and to choose orthologs only based on score', + action='store_true', default=False) + core_options.add_argument('--local', help='Specify the alignment strategy during core ortholog compilation. Default: True', + action='store_true', default=True) + core_options.add_argument('--glocal', help='Specify the alignment strategy during core ortholog compilation. Default: False', + action='store_true', default=False) ortho_options = parser.add_argument_group('Ortholog search strategy options') + ortho_options.add_argument('--searchTaxa', help='Specify file contains list of search taxa', action='store', default='') ortho_options.add_argument('--strict', help='An ortholog is only then accepted when the reciprocity is fulfilled for each sequence in the core set', action='store_true', default=False) ortho_options.add_argument('--checkCoorthologsRef', help='During the final ortholog search, accept an ortholog also when its best hit in the reverse search is not the core ortholog itself, but a co-ortholog of it', @@ -219,9 +259,7 @@ def main(): action='store_true', default=False) ortho_options.add_argument('--rep', help='Obtain only the sequence being most similar to the corresponding sequence in the core set rather than all putative co-orthologs', action='store_true', default=False) - ortho_options.add_argument('--ignoreDistance', help='Ignore the distance between Taxa and to choose orthologs only based on score', - action='store_true', default=False) - 
ortho_options.add_argument('--lowComplexityFilterOff', help='Switch on or off the low complexity filter for the blast search. Default: False', + ortho_options.add_argument('--lowComplexityFilter', help='Switch the low complexity filter for the blast search on. Default: False', action='store_true', default=False) ortho_options.add_argument('--evalBlast', help='E-value cut-off for the Blast search. Default: 0.00005', action='store', default=0.00005, type=float) @@ -237,13 +275,6 @@ def main(): action='store_true', default=False) ortho_options.add_argument('--scoreCutoff', help='In combination with -scoreThreshold you can define the percent range of the hmms core of the best hit up to which a candidate of the hmmsearch will be subjected for further evaluation. Default: 10', action='store', default=10, type=int) - ortho_options.add_argument('--aligner', help='Choose between mafft-linsi or muscle for the multiple sequence alignment. DEFAULT: muscle', - choices=['mafft-linsi', 'muscle'], action='store', default='muscle') - ortho_options.add_argument('--local', help='Specify the alignment strategy during core ortholog compilation. Default: True', - action='store_true', default=True) - ortho_options.add_argument('--glocal', help='Specify the alignment strategy during core ortholog compilation. Default: False', - action='store_true', default=False) - ortho_options.add_argument('--searchTaxa', help='Specify list of search taxa', action='store', default='') fas_options = parser.add_argument_group('FAS options') fas_options.add_argument('--fasoff', help='Turn OFF FAS support', action='store_true', default=False) @@ -254,11 +285,21 @@ def main(): fas_options.add_argument('--minScore', help='Specify the threshold for coreFilter. Default: 0.75', action='store', default=0.75, type=float) optional = parser.add_argument_group('Other options') + optional.add_argument('--aligner', help='Choose between mafft-linsi or muscle for the multiple sequence alignment. 
DEFAULT: muscle', + choices=['mafft-linsi', 'muscle'], action='store', default='muscle') optional.add_argument('--cpu', help='Determine the number of threads to be run in parallel. Default: 4', action='store', default=4, type=int) optional.add_argument('--hyperthread', help='Set this flag to use hyper threading. Default: False', action='store_true', default=False) optional.add_argument('--debug', help='Set this flag to obtain more detailed information about the programs actions', action='store_true', default=False) optional.add_argument('--silentOff', help='Show more output to terminal', action='store_true', default=False) + assembly_options = parser.add_argument_group('Assembly options') + assembly_options.add_argument('--assembly', help='Turn on support of assembly input files',action='store_true', default=False) + assembly_options.add_argument('--assemblyFile', help='Input file containing the assembly seqeunce', action='store', default='') + assembly_options.add_argument('--augustusRefSpec', help='augustus reference species', action='store', default='') + assembly_options.add_argument('--avIntron', help='average Intron length of the assembly species', action='store', default=5000, type=int) + assembly_options.add_argument('--lengthExtension', help='length extension of the candidate region', action='store', default=5000, type=int) + assembly_options.add_argument('--searchTool', help='Choose between BLAST or Diamond as a alignemnt search tool. DEFAULT: BLAST', choices=['blast', 'diamond'], action='store', default='blast') + assembly_options.add_argument('--scoringmatrix', help ='Choose a scoring matrix for the distance criteria used by the option --checkCoorthologsRef. 
DEFAULT: blosum62', choices=['identity', 'blastn', 'trans', 'benner6', 'benner22', 'benner74', 'blosum100', 'blosum30', 'blosum35', 'blosum40', 'blosum45', 'blosum50', 'blosum55', 'blosum60', 'blosum62', 'blosum65', 'blosum70', 'blosum75', 'blosum80', 'blosum85', 'blosum90', 'blosum95', 'feng', 'fitch', 'genetic', 'gonnet', 'grant', 'ident', 'johnson', 'levin', 'mclach', 'miyata', 'nwsgappep', 'pam120', 'pam180', 'pam250', 'pam30', 'pam300', 'pam60', 'pam90', 'rao', 'risler', 'structure'], action='store', default='blosum62') ### get arguments args = parser.parse_args() @@ -277,6 +318,8 @@ def main(): blastpath = args.blastpath searchpath = args.searchpath weightpath = args.weightpath + pathFile = args.pathFile + assemblypath = args.assemblypath # other I/O arguments append = args.append @@ -302,7 +345,7 @@ def main(): rbh = args.rbh rep = args.rep ignoreDistance = args.ignoreDistance - lowComplexityFilterOff = args.lowComplexityFilterOff + lowComplexityFilter = args.lowComplexityFilter evalBlast = args.evalBlast evalHmmer = args.evalHmmer evalRelaxfac = args.evalRelaxfac @@ -331,36 +374,86 @@ def main(): else: silent = True + #fdog_goes_assembly arguments + assembly = args.assembly + assemblyFile = args.assemblyFile + augustusRefSpec = args.augustusRefSpec + avIntron = args.avIntron + lengthExtension = args.lengthExtension + searchTool = args.searchTool + matrix = args.scoringmatrix + ### get fdog and data path + dataPath = '' fdogPath = os.path.realpath(__file__).replace('/runSingle.py','') pathconfigFile = fdogPath + '/bin/pathconfig.txt' if not os.path.exists(pathconfigFile): sys.exit('No pathconfig.txt found. 
Please run fdog.setup (https://github.com/BIONF/fDOG/wiki/Installation#setup-fdog).') - with open(pathconfigFile) as f: - dataPath = f.readline().strip() + if pathFile == '': + with open(pathconfigFile) as f: + dataPath = f.readline().strip() + else: + cfg = load_config(pathFile) + try: + dataPath = cfg['dataPath'] + except: + dataPath = 'config' + if hmmpath == '': - hmmpath = dataPath + '/core_orthologs' + hmmpath = outpath + '/core_orthologs' + # hmmpath = dataPath + '/core_orthologs' + # if dataPath == 'config': + # try: + # hmmpath = cfg['hmmpath'] + # except: + # sys.exit('hmmpath not found in %s' % pathFile) + if blastpath == '': blastpath = dataPath + '/blast_dir' + if dataPath == 'config': + try: + blastpath = cfg['blastpath'] + except: + sys.exit('blastpath not found in %s' % pathFile) if searchpath == '': searchpath = dataPath + '/genome_dir' + if dataPath == 'config': + try: + searchpath = cfg['searchpath'] + except: + sys.exit('searchpath not found in %s' % pathFile) if weightpath == '': weightpath = dataPath + '/weight_dir' + if dataPath == 'config': + try: + weightpath = cfg['weightpath'] + except: + sys.exit('weightpath not found in %s' % pathFile) + + if assemblypath == '': + assemblypath = dataPath + '/assembly_dir' + if dataPath == 'config': + try: + assemblypath = cfg['assemblypath'] + except: + sys.exit('assemblypath not found in %s' % pathFile) + if assembly == True: + searchpath = assemblypath ### check input arguments seqFile, hmmpath, blastpath, searchpath, weightpath = checkInput([fdogPath, seqFile, refspec, outpath, hmmpath, blastpath, searchpath, weightpath]) - # group arguments basicArgs = [fdogPath, seqFile, seqName, refspec, minDist, maxDist, coreOrth] ioArgs = [append, force, noCleanup, group, blast, db] - pathArgs = [outpath, hmmpath, blastpath, searchpath, weightpath] + pathArgs = [outpath, hmmpath, blastpath, searchpath, weightpath, assemblypath] coreArgs = [coreOnly, reuseCore, coreTaxa, coreStrict, CorecheckCoorthologsRef, 
coreRep, coreHitLimit, distDeviation] fasArgs = [fasoff, countercheck, coreFilter, minScore] - orthoArgs = [strict, checkCoorthologsRef, rbh, rep, ignoreDistance, lowComplexityFilterOff, evalBlast, evalHmmer, evalRelaxfac, hitLimit, autoLimit, scoreThreshold, scoreCutoff, aligner, local, glocal, searchTaxa] + orthoArgs = [strict, checkCoorthologsRef, rbh, rep, ignoreDistance, lowComplexityFilter, evalBlast, evalHmmer, evalRelaxfac, hitLimit, autoLimit, scoreThreshold, scoreCutoff, aligner, local, glocal, searchTaxa] otherArgs = [cpu, hyperthread, debug, silent] + assemblyArgs = [assembly, assemblyFile, augustusRefSpec, avIntron, lengthExtension, searchTool, matrix, dataPath] ### run fdog - runSingle([basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, False]) + runSingle([basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, assemblyArgs, False]) if __name__ == '__main__': main() diff --git a/fdog/setup/setup.sh b/fdog/setup/setup.sh index 2894e6c..3c561e7 100755 --- a/fdog/setup/setup.sh +++ b/fdog/setup/setup.sh @@ -199,7 +199,8 @@ fi data_fdog_file="data_HaMStR-2019c.tar.gz" checkSumData="1748371655 621731824 $data_fdog_file" cd $outDir -if [ ! -d "$outDir/core_orthologs" ]; then mkdir "$outDir/core_orthologs"; fi +if [ ! -d "$outDir/genome_dir" ]; then mkdir "$outDir/genome_dir"; fi +if [ ! -d "$outDir/assembly_dir" ]; then mkdir "$outDir/assembly_dir"; fi if ! 
[ "$(ls -A $outDir/genome_dir)" ]; then echo "-------------------------------------" diff --git a/fdog/setup/setup_conda.sh b/fdog/setup/setup_conda.sh index b8c90e6..ddc4e23 100755 --- a/fdog/setup/setup_conda.sh +++ b/fdog/setup/setup_conda.sh @@ -116,6 +116,7 @@ dependencies=( mafft # for linsi muscle fasta36 + augustus #for fdog.assembly ) for i in "${dependencies[@]}"; do @@ -134,6 +135,8 @@ for i in "${dependencies[@]}"; do fi elif [ "$tool" = "fasta36" ]; then conda install -y -c bioconda fasta3 + elif [ "$tool" = "augustus" ]; then + conda install -y -c bioconda augustus else conda install -y -c bioconda $i fi @@ -258,7 +261,8 @@ echo "done!" data_fdog_file="data_HaMStR-2019c.tar.gz" checkSumData="1748371655 621731824 $data_fdog_file" cd $outDir -if [ ! -d "$outDir/core_orthologs" ]; then mkdir "$outDir/core_orthologs"; fi +if [ ! -d "$outDir/genome_dir" ]; then mkdir "$outDir/genome_dir"; fi +if [ ! -d "$outDir/assembly_dir" ]; then mkdir "$outDir/assembly_dir"; fi if ! [ "$(ls -A $outDir/genome_dir)" ]; then echo "-------------------------------------" diff --git a/fdog/setupfDog.py b/fdog/setupfDog.py index 18c5368..b6a67d6 100644 --- a/fdog/setupfDog.py +++ b/fdog/setupfDog.py @@ -20,6 +20,7 @@ import os import argparse import subprocess +from ete3 import NCBITaxa from pathlib import Path def checkOptConflict(lib, conda): @@ -28,7 +29,7 @@ def checkOptConflict(lib, conda): sys.exit('*** ERROR: --lib and --conda cannot be used at the same time!') def main(): - version = '0.0.2' + version = '0.0.3' parser = argparse.ArgumentParser(description='You are running fdog.setup version ' + str(version) + '.') required = parser.add_argument_group('required arguments') optional = parser.add_argument_group('optional arguments') @@ -60,6 +61,9 @@ def main(): dataPath = f.readline().strip() print(dataPath) sys.exit() + ### get ncbi taxonomy database for ete3 + print('Creating local NCBI taxonomy database...') + ncbi = NCBITaxa() ### run setup if conda: setupFile = 
'%s/setup/setup_conda.sh -o %s' % (fdogPath, outPath) diff --git a/setup.py b/setup.py index ad7a1b7..75573c1 100644 --- a/setup.py +++ b/setup.py @@ -26,7 +26,8 @@ setup( name="fdog", - version="0.0.13", + version="0.0.33", + python_requires='>=3.7.0', description="Feature-aware Directed OrtholoG search tool", long_description=long_description, @@ -41,7 +42,8 @@ 'tqdm', 'ete3', 'six', - 'greedyFAS>=1.4.0' + 'PyYAML', + 'greedyFAS>=1.5.0' ], entry_points={ 'console_scripts': ["fdog.run = fdog.runSingle:main", @@ -52,7 +54,9 @@ "fdog.addTaxa = fdog.addTaxa:main", "fdog.showTaxa = fdog.showTaxa:main", "fdog.mergeOutput = fdog.mergeOutput:main", - "fdog.remove = fdog.removefDog:main"], + "fdog.remove = fdog.removefDog:main", + "fdog.assembly = fdog.fDOGassembly:main", + "fdog.mergeAssembly = fdog.mergeAssemblyOutput:main"], }, license="GPL-3.0", classifiers=[ From f8ccac590b46677d81bcc25516666b626a261f2b Mon Sep 17 00:00:00 2001 From: mueli94 Date: Wed, 30 Jun 2021 15:09:08 +0200 Subject: [PATCH 094/229] measure computational time --- fdog/fDOGassembly.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index de9f343..3b34a8d 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -8,6 +8,7 @@ import argparse import yaml import subprocess +import time ########################### functions ########################################## def load_config(config_file): with open(config_file, 'r') as stream: @@ -428,10 +429,6 @@ def cleanup(tmp, tmp_path): if tmp == False: os.system('rm -r ' + tmp_path) -def checkOptions(): - pass - #muss ich unbedingt noch ergänzen wenn ich alle möglichen input Optionen implementiert habe!!! 
- def coorthologs(candidate_names, tmp_path, candidatesFile, fasta, fdog_ref_species, msaTool, matrix): if len(candidate_names) == 1: return candidate_names @@ -520,6 +517,8 @@ def main(): #################### handle user input ######################################## + start = time.clock() + version = '0.0.1' parser = argparse.ArgumentParser(description='You are running fdog.assembly version ' + str(version) + '.') @@ -796,6 +795,7 @@ def main(): ############### make Annotation with FAS ################################### # if we want to search in only one Taxon if searchTaxon != '' and fasoff == False: + fas = time.clock() print("Calculating FAS scores") fas_seed_id = createFasInput(orthologsOutFile, mappingFile) # bug in calcFAS when using --tsv, have to wait till it's fixed before I can use the option @@ -816,6 +816,7 @@ def main(): return 1 #if we searched in more than one taxon if fasoff == False and searchTaxon == '': + fas = time.clock() print("Calculating FAS scores") tmp_path = out + '/tmp/' fas_seed_id = createFasInput(orthologsOutFile, mappingFile) @@ -833,5 +834,10 @@ def main(): f.close() + end = time.clock() + + print("Time w/o FAS: " + str(end-fas)) + print("Time complete: " + str(end-start)) + if __name__ == '__main__': main() From 1cf64f1f03dc07357e576744ba3751261b59a77b Mon Sep 17 00:00:00 2001 From: mueli94 Date: Wed, 30 Jun 2021 16:25:17 +0200 Subject: [PATCH 095/229] measure computational time --- fdog/fDOGassembly.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 3b34a8d..229a546 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -517,7 +517,7 @@ def main(): #################### handle user input ######################################## - start = time.clock() + start = time.time() version = '0.0.1' @@ -795,7 +795,7 @@ def main(): ############### make Annotation with FAS ################################### # if we want to search in only one Taxon if 
searchTaxon != '' and fasoff == False: - fas = time.clock() + fas = time.time() print("Calculating FAS scores") fas_seed_id = createFasInput(orthologsOutFile, mappingFile) # bug in calcFAS when using --tsv, have to wait till it's fixed before I can use the option @@ -816,7 +816,7 @@ def main(): return 1 #if we searched in more than one taxon if fasoff == False and searchTaxon == '': - fas = time.clock() + fas = time.time() print("Calculating FAS scores") tmp_path = out + '/tmp/' fas_seed_id = createFasInput(orthologsOutFile, mappingFile) @@ -834,7 +834,7 @@ def main(): f.close() - end = time.clock() + end = time.time() print("Time w/o FAS: " + str(end-fas)) print("Time complete: " + str(end-start)) From 6e163ba531b1816eb5faa8f4b315e6e1e5c448ff Mon Sep 17 00:00:00 2001 From: mueli94 Date: Wed, 30 Jun 2021 16:32:40 +0200 Subject: [PATCH 096/229] bug fix --- fdog/fDOGassembly.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 229a546..a6a3bb8 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -832,12 +832,14 @@ def main(): else: cleanup(tmp, out + "/tmp/") - f.close() + end = time.time() print("Time w/o FAS: " + str(end-fas)) print("Time complete: " + str(end-start)) + f.close() + if __name__ == '__main__': main() From 1d1c47a572015d2cef9e121705993d29090fceee Mon Sep 17 00:00:00 2001 From: mueli94 Date: Thu, 1 Jul 2021 09:46:57 +0200 Subject: [PATCH 097/229] testing --- fdog/fDOGassembly.py | 1 + 1 file changed, 1 insertion(+) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index a6a3bb8..5ff5cb1 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -836,6 +836,7 @@ def main(): end = time.time() + sys.stdout = sys.__stdout__ print("Time w/o FAS: " + str(end-fas)) print("Time complete: " + str(end-start)) From 6e0ce726ffd6425574bbdb73901285463b3af5a4 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Thu, 1 Jul 2021 10:12:06 +0200 Subject: [PATCH 098/229] computational time 
output --- fdog/fDOGassembly.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 5ff5cb1..c8a096b 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -837,8 +837,7 @@ def main(): end = time.time() sys.stdout = sys.__stdout__ - print("Time w/o FAS: " + str(end-fas)) - print("Time complete: " + str(end-start)) + print(group + "\t" + str(end-start) + "\t" + str(end-start)) f.close() From a1cb75d31205dec99f6fe8ef4a6f164395086af3 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Thu, 1 Jul 2021 10:18:54 +0200 Subject: [PATCH 099/229] corrected computational time output --- fdog/fDOGassembly.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index c8a096b..a3ac854 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -837,7 +837,7 @@ def main(): end = time.time() sys.stdout = sys.__stdout__ - print(group + "\t" + str(end-start) + "\t" + str(end-start)) + print(group + "\t" + str(end-fas) + "\t" + str(end-start)) f.close() From 328f26dda0e5e1eaaf22dfd37658f5af795c802d Mon Sep 17 00:00:00 2001 From: mueli94 Date: Tue, 20 Jul 2021 15:21:28 +0200 Subject: [PATCH 100/229] automatic augustus installation during setup --- fdog/fDOGassembly.py | 20 +++++++++++++++++++- fdog/setup/install_lib.sh | 9 ++++++++- fdog/setup/setup.sh | 1 + 3 files changed, 28 insertions(+), 2 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index a3ac854..2575b05 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -1,3 +1,21 @@ +# -*- coding: utf-8 -*- + +####################################################################### +# Copyright (C) 2020 Hannah Muelbaier +# +# This script is used to run fDOG-Assembly which performs targeted ortholog +# searches on genome assemblies +# +# This script is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# 
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for +# more details +# +# Contact: hannah.muelbaier@gmail.com +# +####################################################################### + ############################ imports ########################################### import os import os.path @@ -519,7 +537,7 @@ def main(): start = time.time() - version = '0.0.1' + version = '0.1.1' parser = argparse.ArgumentParser(description='You are running fdog.assembly version ' + str(version) + '.') parser.add_argument('--version', action='version', version=str(version)) diff --git a/fdog/setup/install_lib.sh b/fdog/setup/install_lib.sh index ff81e88..2e8ff02 100755 --- a/fdog/setup/install_lib.sh +++ b/fdog/setup/install_lib.sh @@ -85,6 +85,7 @@ dependenciesUbuntu=( perl-doc locales lib32z1 + augustus ) dependenciesMac=( @@ -94,6 +95,7 @@ dependenciesMac=( mafft brewsci/bio/muscle blast + augustus ) if [ "$sys" == "Darwin" ]; then @@ -108,7 +110,11 @@ else sudo apt-get update -y for i in "${dependenciesUbuntu[@]}"; do echo $i - sudo apt-get install -y -qq $i > /dev/null + if ["$i" == "augustus"]; then + sudo apt install augustus > /dev/null + else + sudo apt-get install -y -qq $i > /dev/null + fi done fi @@ -119,6 +125,7 @@ dependencies=( mafft muscle blastn + augustus ) for i in "${dependencies[@]}"; do diff --git a/fdog/setup/setup.sh b/fdog/setup/setup.sh index 3c561e7..d9e0077 100755 --- a/fdog/setup/setup.sh +++ b/fdog/setup/setup.sh @@ -309,6 +309,7 @@ mafft muscle clustalw blastp +augustus ) for i in "${dependencies[@]}"; do From 594715da279c625b4b9ff03fca153c7bcfde4695 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Wed, 21 Jul 2021 15:43:59 +0200 Subject: [PATCH 101/229] added tblastn version check --- fdog/setup/setup.sh | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fdog/setup/setup.sh b/fdog/setup/setup.sh index d9e0077..96ac1c1 100755 --- a/fdog/setup/setup.sh +++ b/fdog/setup/setup.sh @@ -310,6 +310,7 @@ muscle 
clustalw blastp augustus +tblastn ) for i in "${dependencies[@]}"; do @@ -319,6 +320,13 @@ for i in "${dependencies[@]}"; do tool="clustalw2" fi fi + if [ $tool == tblastn]; then + requiredver="2.9.0" + currentver="$(tblastn -version | head -n1 | cut -d" " -f2 | sed 's/+//g')" + if [ "$(printf '%s\n' "$requiredver" "$currentver" | sort -V | head -n1)" = "$currentver" ]; then + echo -e "\t\e[31mWARNING BLAST+ needs an update to at least version ${requiredver}!\e[0m" + fi + fi if [ -z "$(which $tool)" ]; then echo -e "\t\e[31mWARNING $tool not found!\e[0m" flag=1 From be91b3b6d3577b91bf69c73bec3a2dec4c316d5c Mon Sep 17 00:00:00 2001 From: mueli94 Date: Wed, 21 Jul 2021 15:48:43 +0200 Subject: [PATCH 102/229] bug fix --- fdog/setup/setup.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fdog/setup/setup.sh b/fdog/setup/setup.sh index 96ac1c1..e562ca8 100755 --- a/fdog/setup/setup.sh +++ b/fdog/setup/setup.sh @@ -320,10 +320,10 @@ for i in "${dependencies[@]}"; do tool="clustalw2" fi fi - if [ $tool == tblastn]; then + if [ $tool == "tblastn"]; then requiredver="2.9.0" currentver="$(tblastn -version | head -n1 | cut -d" " -f2 | sed 's/+//g')" - if [ "$(printf '%s\n' "$requiredver" "$currentver" | sort -V | head -n1)" = "$currentver" ]; then + if ["$(printf '%s\n' "$requiredver" "$currentver" | sort -V | head -n1)" = "$currentver" ]; then echo -e "\t\e[31mWARNING BLAST+ needs an update to at least version ${requiredver}!\e[0m" fi fi From 4b5fb49a019ab87560a383fa05e0f40e2143b501 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Wed, 21 Jul 2021 15:55:12 +0200 Subject: [PATCH 103/229] bug fix --- fdog/setup/setup.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fdog/setup/setup.sh b/fdog/setup/setup.sh index e562ca8..d5d740b 100755 --- a/fdog/setup/setup.sh +++ b/fdog/setup/setup.sh @@ -320,12 +320,12 @@ for i in "${dependencies[@]}"; do tool="clustalw2" fi fi - if [ $tool == "tblastn"]; then + if [ $tool == "tblastn" ]; then 
requiredver="2.9.0" currentver="$(tblastn -version | head -n1 | cut -d" " -f2 | sed 's/+//g')" - if ["$(printf '%s\n' "$requiredver" "$currentver" | sort -V | head -n1)" = "$currentver" ]; then - echo -e "\t\e[31mWARNING BLAST+ needs an update to at least version ${requiredver}!\e[0m" - fi + # if ["$(printf '%s\n' "$requiredver" "$currentver" | sort -V | head -n1)" = "$currentver" ]; then + # echo -e "\t\e[31mWARNING BLAST+ needs an update to at least version ${requiredver}!\e[0m" + # fi fi if [ -z "$(which $tool)" ]; then echo -e "\t\e[31mWARNING $tool not found!\e[0m" From c630d75f8ce7710924482f03bdf3e19796d471ac Mon Sep 17 00:00:00 2001 From: mueli94 Date: Wed, 21 Jul 2021 16:15:41 +0200 Subject: [PATCH 104/229] testing BLAST version check --- fdog/setup/setup.sh | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fdog/setup/setup.sh b/fdog/setup/setup.sh index d5d740b..1f74552 100755 --- a/fdog/setup/setup.sh +++ b/fdog/setup/setup.sh @@ -323,9 +323,10 @@ for i in "${dependencies[@]}"; do if [ $tool == "tblastn" ]; then requiredver="2.9.0" currentver="$(tblastn -version | head -n1 | cut -d" " -f2 | sed 's/+//g')" - # if ["$(printf '%s\n' "$requiredver" "$currentver" | sort -V | head -n1)" = "$currentver" ]; then - # echo -e "\t\e[31mWARNING BLAST+ needs an update to at least version ${requiredver}!\e[0m" - # fi + t=$(printf '%s\n' $requiredver $currentver | sort -V | head -n1) + if [ $t == $currentver ]; then + echo -e "\t\e[31mWARNING BLAST+ needs an update to at least version ${requiredver}!\e[0m" + fi fi if [ -z "$(which $tool)" ]; then echo -e "\t\e[31mWARNING $tool not found!\e[0m" From f31cebf94a9c5161023182b307630f0f6d9e1e50 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Wed, 21 Jul 2021 16:22:54 +0200 Subject: [PATCH 105/229] tblastn version check during fdog.setup --conda --- fdog/setup/setup_conda.sh | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/fdog/setup/setup_conda.sh b/fdog/setup/setup_conda.sh index 
ddc4e23..7b4bd08 100755 --- a/fdog/setup/setup_conda.sh +++ b/fdog/setup/setup_conda.sh @@ -369,6 +369,8 @@ clustalw mafft muscle fasta3 +augustus +tblastn ) for i in "${condaPkgs[@]}"; do if [[ -z $(conda list | $grepprog "$i ") ]]; then @@ -381,6 +383,13 @@ for i in "${condaPkgs[@]}"; do progname="hmmsearch" elif [ "$i" == "fasta3" ]; then progname="fasta36" + elif [ "$i" == "tblastn" ]; then + requiredver="2.9.0" + currentver="$(tblastn -version | head -n1 | cut -d" " -f2 | sed 's/+//g')" + t=$(printf '%s\n' $requiredver $currentver | sort -V | head -n1) + if [ $t == $currentver ]; then + echo -e "\t\e[31mWARNING BLAST+ needs an update to at least version ${requiredver}!\e[0m" + fi fi if [ -z "$(which $progname)" ]; then echo -e "\t\e[31m$i could not be installed\e[0m" From 6edf7a01486af4cde7da9a2a028936d5f7710d86 Mon Sep 17 00:00:00 2001 From: mueli94 <47216555+mueli94@users.noreply.github.com> Date: Mon, 2 Aug 2021 13:36:20 +0200 Subject: [PATCH 106/229] Fdog goes assembly (#10) * bug fix * bug fix * fixed error mapping ID file not found * testing * testing * testing * test * test * testing * testing * testing * testing * fDOGassembly is working on complete assembly_dir * bug fix * bug fix * enabled option -filter for blastp search * bug fix fasoff * testing --strict option * bug fix in --strict option, output is corrected * bug fix in --checkCoorthologsRef * bug fix * clean up * bug fix * adapted handling of variable dataPath * testing * testing * testing * testing * test * test * test * test * test * test * testing * bug fix assemblyDir * testing * testing * testing search taxa * test * enable --searchTaxa option in fdog.assembly * bug fix * testing * testing --searchTaxa adaption * testing * test * test * write debug files to output dir * skip fa.mapping while checking genome_dir * testing * bug fix * testing * bug fix * bug fix * path fix in augustus_ppx * bug fix * bug fix * bug fix * bug fix * bug fix * bug fix * bug fix * bug fix * bug fix * bug fix * 
bug fix * bug fix * bug fix * bug fix * bug fix * bug fix * bug fix * bug fix * bug fix * testing * testing * added new python script to merge Assembly output from the same Gene but different searchTaxa * added option to merge Assembly output after fDOG calls fdog.assembly multiple times with different searchTaxa * bug fix * corrected fdog.mergeAssembly call * testing * testing * testing * test * moved fdog.mergeAssembly call to another place * testing * testing * testing * testing * testing * testing * corrected fdog.mergeAssembly call * testing * testing * testing * testing * test * disable weight_dir check if option --assembly is used * adapted fdog.assembly call * adapted calcFAS call to deactivate .tsv output * testing * testing * bug fix in function backward search used with option --strict * testing new added option --silent * added more checks to fdogs.run * bug fix * testing * testing * testing * bug fix * bug fix * testing * testing silent mode * testing --silent * symlinks for fasta36 input; improved fdogs.run according to #5 * testing * testing * testing * testing * tetsing * testing * testing * testing * testing * testing * testing * testing * test * test * testing * testing new function to identify coorthologs * testing * testing * testing * testing * testing * testing * testing * testing * testing * finished function coorthologs * bug fix runSingle.py * cleaning output * testing * testing * testing * testing * testing * testing * testing * testing * testing * testing * testing * testing * bug fix if augutus can't idetify a gene at a candidate region * testing * bug fix * bug fix * cleaning up * testing * testing * testing * testing * bug fix in merge function, regions in minus strand were not merged correctly * testing * testing * testing * testing * testing * bug fix * testing * testing * testing * testing * testing * clean up * testing * testing * testing * testing * bug fix * testing new tblastn call * testing * testing * testing * testing * 
testing * code clean up * clean up code * clean up * clean up * reduce output * clean up code * check augustus * testing * adding option to recognize if co-ortholog or not in header of the extended.fa * testing * testing * testing * testing * testing * testing * testing * testing * added function starting_subprocess() to handle call of extern tools more easily * added augustus to dependencies * testing * bug fix * testing * testing * testing * testing * testing * testing * testing * testing * testing * added function to clean up .domain files * testing * testing * testing * testing * improve user output * fdog.assembly started with fDOG is always silent * testing * testing output * testing * testing * testing * testing * testing * removing automatically .tsv files if existing * measure computational time * measure computational time * bug fix * testing * computational time output * corrected computational time output * automatic augustus installation during setup * added tblastn version check * bug fix * bug fix * testing BLAST version check * tblastn version check during fdog.setup --conda Co-authored-by: trvinh --- fdog/fDOGassembly.py | 34 +++++++++++++++++++++++++++++----- fdog/setup/install_lib.sh | 9 ++++++++- fdog/setup/setup.sh | 10 ++++++++++ fdog/setup/setup_conda.sh | 9 +++++++++ 4 files changed, 56 insertions(+), 6 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index de9f343..46f83c0 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -1,3 +1,21 @@ +# -*- coding: utf-8 -*- + +####################################################################### +# Copyright (C) 2020 Hannah Muelbaier +# +# This script is used to run fDOG-Assembly which performs targeted ortholog +# searches on genome assemblies +# +# This script is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the +# GNU General Public License for +# more details +# +# Contact: hannah.muelbaier@gmail.com +# +####################################################################### + ############################ imports ########################################### import os import os.path @@ -8,6 +26,8 @@ import argparse import yaml import subprocess +import time +======= ########################### functions ########################################## def load_config(config_file): with open(config_file, 'r') as stream: @@ -428,10 +448,6 @@ def cleanup(tmp, tmp_path): if tmp == False: os.system('rm -r ' + tmp_path) -def checkOptions(): - pass - #muss ich unbedingt noch ergänzen wenn ich alle möglichen input Optionen implementiert habe!!! - def coorthologs(candidate_names, tmp_path, candidatesFile, fasta, fdog_ref_species, msaTool, matrix): if len(candidate_names) == 1: return candidate_names @@ -520,7 +536,10 @@ def main(): #################### handle user input ######################################## - version = '0.0.1' + start = time.time() + + version = '0.1.1' + parser = argparse.ArgumentParser(description='You are running fdog.assembly version ' + str(version) + '.') parser.add_argument('--version', action='version', version=str(version)) @@ -796,6 +815,7 @@ def main(): ############### make Annotation with FAS ################################### # if we want to search in only one Taxon if searchTaxon != '' and fasoff == False: + fas = time.time() print("Calculating FAS scores") fas_seed_id = createFasInput(orthologsOutFile, mappingFile) # bug in calcFAS when using --tsv, have to wait till it's fixed before I can use the option @@ -816,6 +836,7 @@ def main(): return 1 #if we searched in more than one taxon if fasoff == False and searchTaxon == '': + fas = time.time() print("Calculating FAS scores") tmp_path = out + '/tmp/' fas_seed_id = createFasInput(orthologsOutFile, mappingFile) @@ -831,6 +852,9 @@ def main(): else: cleanup(tmp, out + "/tmp/") + end = time.time() 
+ sys.stdout = sys.__stdout__ + #print(group + "\t" + str(end-fas) + "\t" + str(end-start)) f.close() if __name__ == '__main__': diff --git a/fdog/setup/install_lib.sh b/fdog/setup/install_lib.sh index ff81e88..2e8ff02 100755 --- a/fdog/setup/install_lib.sh +++ b/fdog/setup/install_lib.sh @@ -85,6 +85,7 @@ dependenciesUbuntu=( perl-doc locales lib32z1 + augustus ) dependenciesMac=( @@ -94,6 +95,7 @@ dependenciesMac=( mafft brewsci/bio/muscle blast + augustus ) if [ "$sys" == "Darwin" ]; then @@ -108,7 +110,11 @@ else sudo apt-get update -y for i in "${dependenciesUbuntu[@]}"; do echo $i - sudo apt-get install -y -qq $i > /dev/null + if ["$i" == "augustus"]; then + sudo apt install augustus > /dev/null + else + sudo apt-get install -y -qq $i > /dev/null + fi done fi @@ -119,6 +125,7 @@ dependencies=( mafft muscle blastn + augustus ) for i in "${dependencies[@]}"; do diff --git a/fdog/setup/setup.sh b/fdog/setup/setup.sh index 3c561e7..1f74552 100755 --- a/fdog/setup/setup.sh +++ b/fdog/setup/setup.sh @@ -309,6 +309,8 @@ mafft muscle clustalw blastp +augustus +tblastn ) for i in "${dependencies[@]}"; do @@ -318,6 +320,14 @@ for i in "${dependencies[@]}"; do tool="clustalw2" fi fi + if [ $tool == "tblastn" ]; then + requiredver="2.9.0" + currentver="$(tblastn -version | head -n1 | cut -d" " -f2 | sed 's/+//g')" + t=$(printf '%s\n' $requiredver $currentver | sort -V | head -n1) + if [ $t == $currentver ]; then + echo -e "\t\e[31mWARNING BLAST+ needs an update to at least version ${requiredver}!\e[0m" + fi + fi if [ -z "$(which $tool)" ]; then echo -e "\t\e[31mWARNING $tool not found!\e[0m" flag=1 diff --git a/fdog/setup/setup_conda.sh b/fdog/setup/setup_conda.sh index ddc4e23..7b4bd08 100755 --- a/fdog/setup/setup_conda.sh +++ b/fdog/setup/setup_conda.sh @@ -369,6 +369,8 @@ clustalw mafft muscle fasta3 +augustus +tblastn ) for i in "${condaPkgs[@]}"; do if [[ -z $(conda list | $grepprog "$i ") ]]; then @@ -381,6 +383,13 @@ for i in "${condaPkgs[@]}"; do 
progname="hmmsearch" elif [ "$i" == "fasta3" ]; then progname="fasta36" + elif [ "$i" == "tblastn" ]; then + requiredver="2.9.0" + currentver="$(tblastn -version | head -n1 | cut -d" " -f2 | sed 's/+//g')" + t=$(printf '%s\n' $requiredver $currentver | sort -V | head -n1) + if [ $t == $currentver ]; then + echo -e "\t\e[31mWARNING BLAST+ needs an update to at least version ${requiredver}!\e[0m" + fi fi if [ -z "$(which $progname)" ]; then echo -e "\t\e[31m$i could not be installed\e[0m" From 1b4232e6cd214650007e5c24055f3c8618fe01ae Mon Sep 17 00:00:00 2001 From: mueli94 <47216555+mueli94@users.noreply.github.com> Date: Mon, 2 Aug 2021 13:41:02 +0200 Subject: [PATCH 107/229] Added link to fDOG-Assembly poster for QfO 6.5 --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 52f11e2..9343943 100644 --- a/README.md +++ b/README.md @@ -4,6 +4,8 @@ [![Build Status](https://travis-ci.com/BIONF/fDOG.svg?branch=master)](https://travis-ci.com/BIONF/fDOG) ![Github Build](https://github.com/BIONF/fDOG/workflows/build/badge.svg) +# Poster fDOG - Assembly +(https://github.com/BIONF/fDOG/blob/gh-pages/www/Poster_fDOG_Assembly.pdf) # Table of Contents * [How to install](#how-to-install) * [Install the fDOG package](#install-the-fdog-package) From 4798b8fea54782a68d60935bb157ad28cfeaaadb Mon Sep 17 00:00:00 2001 From: mueli94 <47216555+mueli94@users.noreply.github.com> Date: Mon, 2 Aug 2021 13:41:22 +0200 Subject: [PATCH 108/229] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 9343943..8db83ce 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ ![Github Build](https://github.com/BIONF/fDOG/workflows/build/badge.svg) # Poster fDOG - Assembly -(https://github.com/BIONF/fDOG/blob/gh-pages/www/Poster_fDOG_Assembly.pdf) +https://github.com/BIONF/fDOG/blob/gh-pages/www/Poster_fDOG_Assembly.pdf # Table of Contents * [How to install](#how-to-install) * [Install the fDOG 
package](#install-the-fdog-package) From d64177c3cb0a6afd8a89687a4ee8196f2f85fc7d Mon Sep 17 00:00:00 2001 From: mueli94 Date: Fri, 10 Sep 2021 10:22:47 +0200 Subject: [PATCH 109/229] added option checkOff --- fdog/fDOGassembly.py | 8 +++++--- fdog/runMulti.py | 8 ++++---- fdog/runSingle.py | 2 +- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 46f83c0..424b6e3 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- ####################################################################### -# Copyright (C) 2020 Hannah Muelbaier +# Copyright (C) 2021 Hannah Muelbaier # # This script is used to run fDOG-Assembly which performs targeted ortholog # searches on genome assemblies @@ -538,7 +538,7 @@ def main(): start = time.time() - version = '0.1.1' + version = '0.1.2' parser = argparse.ArgumentParser(description='You are running fdog.assembly version ' + str(version) + '.') @@ -668,7 +668,6 @@ def main(): else: sys.stdout = Logger(f) - # user input has to be checked here before fDOGassembly continues assembly_names = os.listdir(assemblyDir) ########################## some variables ################################## @@ -683,6 +682,9 @@ def main(): consensus_path = out + "/tmp/" + group + ".con" profile_path = out + "/tmp/" + group + ".prfl" + ##################### need a check to see if reference species is part of the core group !########## + + ###################### create tmp folder ################################### cmd = 'mkdir ' + out + '/tmp' diff --git a/fdog/runMulti.py b/fdog/runMulti.py index 6862f6d..c19b598 100644 --- a/fdog/runMulti.py +++ b/fdog/runMulti.py @@ -48,7 +48,7 @@ def prepare(args, step): coreOnly, reuseCore, coreTaxa, coreStrict, CorecheckCoorthologsRef, coreRep, coreHitLimit, distDeviation, fasoff, countercheck, coreFilter, minScore, strict, checkCoorthologsRef, rbh, rep, ignoreDistance, lowComplexityFilter, evalBlast, evalHmmer, 
evalRelaxfac, hitLimit, autoLimit, scoreThreshold, scoreCutoff, aligner, local, glocal, searchTaxa, - cpu, hyperthread, debug, silent, assembly, assemblyFile, augustusRefSpec, avIntron, lengthExtension, searchTool, matrix) = args + cpu, hyperthread, checkOff, debug, silent, assembly, assemblyFile, augustusRefSpec, avIntron, lengthExtension, searchTool, matrix) = args mute = False @@ -70,7 +70,7 @@ def prepare(args, step): coreArgs = [coreOnly, reuseCore, coreTaxa, coreStrict, CorecheckCoorthologsRef, coreRep, coreHitLimit, distDeviation] fasArgs = [fasoff, countercheck, coreFilter, minScore] orthoArgs = [strict, checkCoorthologsRef, rbh, rep, ignoreDistance, lowComplexityFilter, evalBlast, evalHmmer, evalRelaxfac, hitLimit, autoLimit, scoreThreshold, scoreCutoff, aligner, local, glocal, searchTaxa] - otherArgs = [cpu, hyperthread, debug, True] + otherArgs = [cpu, hyperthread, checkOff, debug, True] assemblyArgs = [assembly, assemblyFile, augustusRefSpec, avIntron, lengthExtension, searchTool, matrix] return(basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, assemblyArgs, mute) @@ -378,7 +378,7 @@ def main(): silent = False else: silent = True - + #fdog_goes_assembly arguments assembly = args.assembly assemblyFile = args.assemblyFile @@ -472,7 +472,7 @@ def main(): coreOnly, reuseCore, coreTaxa, coreStrict, CorecheckCoorthologsRef, coreRep, coreHitLimit, distDeviation, fasoff, countercheck, coreFilter, minScore, strict, checkCoorthologsRef, rbh, rep, ignoreDistance, lowComplexityFilter, evalBlast, evalHmmer, evalRelaxfac, hitLimit, autoLimit, scoreThreshold, scoreCutoff, aligner, local, glocal, searchTaxa, - cpu, hyperthread, debug, silent, assembly, assemblyFile, augustusRefSpec, avIntron, lengthExtension, searchTool, matrix] + cpu, hyperthread, checkOff, debug, silent, assembly, assemblyFile, augustusRefSpec, avIntron, lengthExtension, searchTool, matrix] ### START Path(outpath).mkdir(parents=True, exist_ok=True) diff --git a/fdog/runSingle.py 
b/fdog/runSingle.py index 1b8a943..c65300f 100644 --- a/fdog/runSingle.py +++ b/fdog/runSingle.py @@ -453,7 +453,7 @@ def main(): coreArgs = [coreOnly, reuseCore, coreTaxa, coreStrict, CorecheckCoorthologsRef, coreRep, coreHitLimit, distDeviation] fasArgs = [fasoff, countercheck, coreFilter, minScore] orthoArgs = [strict, checkCoorthologsRef, rbh, rep, ignoreDistance, lowComplexityFilter, evalBlast, evalHmmer, evalRelaxfac, hitLimit, autoLimit, scoreThreshold, scoreCutoff, aligner, local, glocal, searchTaxa] - otherArgs = [cpu, hyperthread, debug, silent] + otherArgs = [cpu, hyperthread, checkOff, debug, silent] assemblyArgs = [assembly, assemblyFile, augustusRefSpec, avIntron, lengthExtension, searchTool, matrix, dataPath] ### run fdog From ef6b0dc6903837130cbac00ea9d6f499e1330373 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Fri, 10 Sep 2021 11:17:54 +0200 Subject: [PATCH 110/229] bug fix --- fdog/fDOGassembly.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 424b6e3..1800a0a 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -27,7 +27,7 @@ import yaml import subprocess import time -======= + ########################### functions ########################################## def load_config(config_file): with open(config_file, 'r') as stream: From d4bf11fb965dcd512790f3ec164c237deaa3a9d4 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Fri, 10 Sep 2021 12:06:06 +0200 Subject: [PATCH 111/229] bug fix --- fdog/fDOGassembly.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 1800a0a..bf272d7 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -837,7 +837,7 @@ def main(): cleanup(tmp, tmp_path) return 1 #if we searched in more than one taxon - if fasoff == False and searchTaxon == '': + if fasoff == False and searchTaxon == '' and len(assembly_names) > 1: fas = time.time() print("Calculating FAS scores") tmp_path = 
out + '/tmp/' From 62badce99d56fb4e634335e20db8cafebcfd89a3 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Fri, 10 Sep 2021 12:10:35 +0200 Subject: [PATCH 112/229] testing --- fdog/fDOGassembly.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index bf272d7..2b5eaf7 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -838,6 +838,8 @@ def main(): return 1 #if we searched in more than one taxon if fasoff == False and searchTaxon == '' and len(assembly_names) > 1: + print(len(assembly_names)) + print(assembly_names) fas = time.time() print("Calculating FAS scores") tmp_path = out + '/tmp/' From a51b8f4a0c0b60a33d47f4908efa8630bd67dfca Mon Sep 17 00:00:00 2001 From: mueli94 Date: Fri, 10 Sep 2021 12:11:58 +0200 Subject: [PATCH 113/229] testing --- fdog/fDOGassembly.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index bf272d7..2b5eaf7 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -838,6 +838,8 @@ def main(): return 1 #if we searched in more than one taxon if fasoff == False and searchTaxon == '' and len(assembly_names) > 1: + print(len(assembly_names)) + print(assembly_names) fas = time.time() print("Calculating FAS scores") tmp_path = out + '/tmp/' From 147bbc9df5d5bf36382ddd222b9c081d061a3797 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Fri, 10 Sep 2021 12:21:26 +0200 Subject: [PATCH 114/229] fixed --- fdog/fDOGassembly.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 2b5eaf7..bf272d7 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -838,8 +838,6 @@ def main(): return 1 #if we searched in more than one taxon if fasoff == False and searchTaxon == '' and len(assembly_names) > 1: - print(len(assembly_names)) - print(assembly_names) fas = time.time() print("Calculating FAS scores") tmp_path = out + '/tmp/' From a992e322ca4fd459de9d0d99d867622548dc1af7 Mon Sep 17 00:00:00 2001 
From: mueli94 Date: Fri, 10 Sep 2021 14:49:14 +0200 Subject: [PATCH 115/229] fixed FAS call --- fdog/fDOGassembly.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index bf272d7..9c12e9a 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -831,8 +831,7 @@ def main(): #if we searched in more than one Taxon and no ortholog was found - - if refBool == False and searchTaxon == '': + if refBool == False and searchTaxon == '' and len(assembly_names) > 1: print("No orthologs found. Exciting ...") cleanup(tmp, tmp_path) return 1 @@ -843,7 +842,7 @@ def main(): tmp_path = out + '/tmp/' fas_seed_id = createFasInput(orthologsOutFile, mappingFile) # bug in calcFAS when using --tsv, have to wait till it's fixed before I can use the option - cmd = 'calcFAS --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group + cmd = 'fas.run --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group starting_subprocess(cmd, 'silent') clean_fas(out + group + "_forward.domains", 'domains') clean_fas(out + group + "_reverse.domains", 'domains') From abea0980ae0ba82e1565d45cabffdb455e85cdce Mon Sep 17 00:00:00 2001 From: mueli94 Date: Fri, 10 Sep 2021 15:29:06 +0200 Subject: [PATCH 116/229] changed FAS call --- fdog/fDOGassembly.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 9c12e9a..950aef3 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -823,7 +823,7 @@ def main(): # bug in calcFAS when using --tsv, have to wait till it's fixed before I can use the option cmd = 'mkdir ' + tmp_path + 
'anno_dir' starting_subprocess(cmd, 'silent') - cmd = 'calcFAS --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group + '_' + asName + cmd = 'fas.run --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group + '_' + asName starting_subprocess(cmd, 'silent') clean_fas(fasOutFile + "_forward.domains", 'domains') clean_fas(fasOutFile + "_reverse.domains", 'domains') @@ -831,12 +831,12 @@ def main(): #if we searched in more than one Taxon and no ortholog was found - if refBool == False and searchTaxon == '' and len(assembly_names) > 1: + if refBool == False and searchTaxon == '': print("No orthologs found. Exciting ...") cleanup(tmp, tmp_path) return 1 #if we searched in more than one taxon - if fasoff == False and searchTaxon == '' and len(assembly_names) > 1: + if fasoff == False and searchTaxon == '': fas = time.time() print("Calculating FAS scores") tmp_path = out + '/tmp/' From d56b83e9cd76ce678b756de5856572d86b31a563 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Tue, 14 Sep 2021 16:29:15 +0200 Subject: [PATCH 117/229] new function that checks if input path exist and new function that check if reference species is part of core_group, multiple reference species were accepted, improved output --- fdog/fDOGassembly.py | 78 ++++++++++++++++++++++++++++++-------------- 1 file changed, 54 insertions(+), 24 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 950aef3..b27fcbe 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -29,6 +29,26 @@ import time ########################### functions ########################################## +def check_path(path): + if not os.path.exists(path): + print(path + " 
does not exist. Exciting ...") + sys.exit() + +def check_ref_sepc(species_list, fasta_file): + file = open(fasta_file, "r") + lines = file.readlines() + species_file = [] + + for line in lines: + if line[0] == ">": + species = line.split("|")[1] + species_file.append(species) + for species in species_list: + if species in species_file: + return species + print("Reference species is not part of the ortholog group. Exciting ...") + sys.exit() + def load_config(config_file): with open(config_file, 'r') as stream: try: @@ -298,40 +318,40 @@ def backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, eva id, gene, evalue = (line.replace("\n", "")).split("\t") gene_name = gene.split("|")[2] if gene_name != old_name: - print("candidate:%s"%(gene_name)) - print("blast-hit:%s"%(id)) + print("candidate:%s"%(gene_name)) if mode == "debug" else "" + print("blast-hit:%s"%(id)) if mode == "debug" else "" min = float(evalue) if id in id_ref: orthologs.append(gene) - print("\thitting\n") + print("\thitting\n") if mode == "debug" else "" else: if checkCo == True: for i in id_ref: - print("Best hit %s differs from reference sequence %s! Doing further checks\n"%(id, i)) + print("Best hit %s differs from reference sequence %s! 
Doing further checks\n"%(id, i)) if mode == "debug" else "" co_orthologs_result, distance_ref_hit, distance_hit_query = checkCoOrthologs(gene_name, id, i, fdog_ref_species, candidatesOutFile, msaTool, matrix, dataPath, tmp_path) if co_orthologs_result == 1: - print("\t Distance query - blast hit: %6.4f, Distance blast hit - reference: %6.4f\tAccepting\n"%(distance_hit_query, distance_ref_hit)) + print("\t Distance query - blast hit: %6.4f, Distance blast hit - reference: %6.4f\tAccepting\n"%(distance_hit_query, distance_ref_hit)) if mode == "debug" else "" orthologs.append(gene) elif co_orthologs_result == 0: - print("\t Distance query - blast hit: %6.4f, Distance blast hit - reference: %6.4f\tRejecting\n"%(distance_hit_query, distance_ref_hit)) + print("\t Distance query - blast hit: %6.4f, Distance blast hit - reference: %6.4f\tRejecting\n"%(distance_hit_query, distance_ref_hit)) if mode == "debug" else "" else: - print("\tnothitting\n") + print("\tnothitting\n") if mode == "debug" else "" elif (gene_name == old_name) and float(evalue) == min and gene_name not in orthologs: if id in id_ref: orthologs.append(gene) - print("\thitting\n") + print("\thitting\n") if mode == "debug" else "" else: if checkCo == True: for i in id_ref: - print("Best hit %s differs from reference sequence %s! Doing further checks\n"%(id, i)) + print("Best hit %s differs from reference sequence %s! 
Doing further checks\n"%(id, i)) if mode == "debug" else "" co_orthologs_result, distance_ref_hit, distance_hit_query = checkCoOrthologs(gene_name, id, i, fdog_ref_species, candidatesOutFile, msaTool, matrix, dataPath, tmp_path) if co_orthologs_result == 1: - print("\t Distance query - blast hit: %6.4f, Distance blast hit - reference: %6.4f\tAccepting\n"%(distance_hit_query, distance_ref_hit)) + print("\t Distance query - blast hit: %6.4f, Distance blast hit - reference: %6.4f\tAccepting\n"%(distance_hit_query, distance_ref_hit)) if mode == "debug" else "" orthologs.append(gene) elif co_orthologs_result == 0: - print("\t Distance query - blast hit: %6.4f, Distance blast hit - reference: %6.4f\tRejecting\n"%(distance_hit_query, distance_ref_hit)) + print("\t Distance query - blast hit: %6.4f, Distance blast hit - reference: %6.4f\tRejecting\n"%(distance_hit_query, distance_ref_hit)) if mode == "debug" else "" else: - print("\tnot hitting\n") + print("\tnot hitting\n") if mode == "debug" else "" old_name = gene_name @@ -548,7 +568,7 @@ def main(): required.add_argument('--gene', help='Core_ortholog group name. 
Folder inlcuding the fasta file, hmm file and aln file has to be located in core_orthologs/', action='store', default='', required=True) required.add_argument('--augustusRefSpec', help='augustus reference species', action='store', default='', required=True) - required.add_argument('--refSpec', help='Reference taxon for fDOG.', action='store', default='', required=True) + required.add_argument('--refSpec', help='Reference taxon for fDOG.', action='store', nargs="+", default='', required=True) optional = parser.add_argument_group('Optional arguments') optional.add_argument('--avIntron', help='average intron length of the assembly species in bp (default: 5000)',action='store', default=5000, type=int) @@ -611,6 +631,7 @@ def main(): silent = args.silent debug = args.debug + # output modes if debug == True and silent == True: print("It's not possible to use booth modes, please restart and use --debug or --silent") return 1 @@ -637,22 +658,27 @@ def main(): dataPath = cfg['dataPath'] except: dataPath = 'config' - if core_path == '': - core_path = out + '/core_orthologs/' - else: - if not core_path.endswith('/'): - core_path = core_path + '/' - if assemblyDir == '': - assemblyDir = dataPath + '/assembly_dir/' + if out == '': - #print('test out \n') out = os.getcwd() os.system('mkdir ' + out + '/' + group + ' >/dev/null 2>&1') out = out + '/' + group + '/' else: if out[-1] != "/": out = out + "/" + check_path(out) + + if core_path == '': + core_path = out + '/core_orthologs/' + else: + if not core_path.endswith('/'): + core_path = core_path + '/' + check_path(core_path) + + if assemblyDir == '': + assemblyDir = dataPath + '/assembly_dir/' + check_path(assemblyDir) try: @@ -674,16 +700,20 @@ def main(): refBool = False # checks if sequences of reference species were already part of the extended.fa file - ########### paths ########### + ################################# paths #################################### msa_path = core_path + "/" + group +"/"+ group + ".aln" + 
check_path(msa_path) hmm_path = core_path + "/" + group +"/hmm_dir/"+ group + ".hmm" + check_path(hmm_path) fasta_path = core_path + "/" + group +"/"+ group + ".fa" + check_path(fasta_path) consensus_path = out + "/tmp/" + group + ".con" profile_path = out + "/tmp/" + group + ".prfl" - ##################### need a check to see if reference species is part of the core group !########## + ############## is fDOG reference species part of ortholog group? ########### + fdog_ref_species = check_ref_sepc(fdog_ref_species, fasta_path) ###################### create tmp folder ################################### @@ -842,7 +872,7 @@ def main(): tmp_path = out + '/tmp/' fas_seed_id = createFasInput(orthologsOutFile, mappingFile) # bug in calcFAS when using --tsv, have to wait till it's fixed before I can use the option - cmd = 'fas.run --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group + cmd = 'fas.run --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --tsv --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group starting_subprocess(cmd, 'silent') clean_fas(out + group + "_forward.domains", 'domains') clean_fas(out + group + "_reverse.domains", 'domains') From 343199263b697131ca6fcac375aa59b3e10b7458 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Wed, 15 Sep 2021 15:11:25 +0200 Subject: [PATCH 118/229] improved user output --- fdog/fDOGassembly.py | 59 ++++++++++++++++++++++---------------------- 1 file changed, 30 insertions(+), 29 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index b27fcbe..232090d 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -554,22 +554,21 @@ def flush(self): def main(): - #################### handle user input 
######################################## + #################### handle user input ##################################### start = time.time() version = '0.1.2' - - + ################### initialize parser ###################################### parser = argparse.ArgumentParser(description='You are running fdog.assembly version ' + str(version) + '.') parser.add_argument('--version', action='version', version=str(version)) - + ################## required arguments ###################################### required = parser.add_argument_group('Required arguments') required.add_argument('--gene', help='Core_ortholog group name. Folder inlcuding the fasta file, hmm file and aln file has to be located in core_orthologs/', action='store', default='', required=True) required.add_argument('--augustusRefSpec', help='augustus reference species', action='store', default='', required=True) required.add_argument('--refSpec', help='Reference taxon for fDOG.', action='store', nargs="+", default='', required=True) - + ################## optional arguments ###################################### optional = parser.add_argument_group('Optional arguments') optional.add_argument('--avIntron', help='average intron length of the assembly species in bp (default: 5000)',action='store', default=5000, type=int) optional.add_argument('--lengthExtension', help='length extension of the candidate regions in bp (default:5000)', action='store', default=5000, type=int) @@ -592,7 +591,6 @@ def main(): optional.add_argument('--silent', help='Output will only be written into the log file', action='store_true', default=False) optional.add_argument('--debug', help='Stdout and Stderr from fdog.assembly and every used tool will be printed', action='store_true', default=False) - args = parser.parse_args() # required @@ -711,7 +709,7 @@ def main(): consensus_path = out + "/tmp/" + group + ".con" profile_path = out + "/tmp/" + group + ".prfl" - ############## is fDOG reference species part of ortholog group? 
########### + ########### is/are fDOG reference species part of ortholog group? ########## fdog_ref_species = check_ref_sepc(fdog_ref_species, fasta_path) @@ -720,32 +718,33 @@ def main(): cmd = 'mkdir ' + out + '/tmp' starting_subprocess(cmd, 'silent') + print("Gene: " + group) + print("fDOG reference species: " + fdog_ref_species + " \n") + ######################## consensus sequence ################################ #make a majority-rule consensus sequence with the tool hmmemit from hmmer - print("Building a consensus sequence for gene " + group + " \n") + print("Building a consensus sequence") cmd = 'hmmemit -c -o' + consensus_path + ' ' + hmm_path starting_subprocess(cmd, mode) - print("consensus sequence is finished\n") + print("\t ...finished\n") ######################## block profile ##################################### - print("Building a block profile for gene " + group + " \n") + print("Building a block profile ...") cmd = 'msa2prfl.pl ' + msa_path + ' --setname=' + group + ' >' + profile_path starting_subprocess(cmd, 'silent') if int(os.path.getsize(profile_path)) > 0: - print("block profile is finished \n") + print("\t ...finished \n") else: print("Building block profiles failed. 
Using prepareAlign to convert alignment\n") new_path = core_path + group +"/"+ group + "_new.aln" - #print(cmd) cmd = 'prepareAlign < ' + msa_path + ' > ' + new_path starting_subprocess(cmd, mode) cmd = 'msa2prfl.pl ' + new_path + ' --setname=' + group + ' >' + profile_path - #print(cmd) starting_subprocess(cmd, 'silent') - print("block profile is finished \n") + print(" \t ...finished \n") searchBool = False @@ -780,19 +779,17 @@ def main(): #checks if data base exists already db_check = searching_for_db(db_path) if db_check == 0: - print("creating a blast data base \n") + print("Creating a blast data base...") cmd = 'makeblastdb -in ' + assembly_path + ' -dbtype nucl -parse_seqids -out ' + db_path starting_subprocess(cmd, mode) - print("database is finished \n") - else: - print('blast data base exists already, continuing...') + print("\t ...finished \n") - #makes a tBLASTn search against the new database + #makes a tBLASTn search against database #codon table argument [-db_gencode int_value], table available ftp://ftp.ncbi.nih.gov/entrez/misc/data/gc.prt - print("tBLASTn search against data base") + print("Starting tBLASTn search...") cmd = 'tblastn -db ' + db_path + ' -query ' + consensus_path + ' -outfmt "6 sseqid sstart send evalue qstart qend " -evalue ' + str(evalue) + ' -out ' + tmp_path + '/blast_results.out' starting_subprocess(cmd, mode) - print("tBLASTn search is finished") + print("\t ...finished") ################### search for candidate regions and extract seq ########### # parse blast and filter for candiate regions @@ -800,25 +797,25 @@ def main(): if regions == 0: #no candidat region are available, no ortholog can be found - print("No candidate region found") + print("No candidate region found!\n") if refBool == True: continue else: taxa = [fdog_ref_species] reciprocal_sequences = 0 else: - print(str(number_regions) + " candiate regions were found. 
Extracting sequences...") + print(str(number_regions) + " candiate regions were found.\n") extract_seq(regions, db_path, tmp_path, mode) ############### make Augustus PPX search ################################### - print("starting augustus ppx \n") + print("Starting augustus ppx ...") augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, augustus_ref_species, asName, group, tmp_path, mode) - print("augustus is finished \n") + print("\t ...finished \n") ################# backward search to filter for orthologs################### if int(os.path.getsize(candidatesOutFile)) <= 0: - print("No genes found at candidate regions\n") + print("No genes found at candidate region\n") if searchTaxon == '' and refBool == True: continue else: @@ -831,7 +828,7 @@ def main(): ################## checking accepted genes for co-orthologs ################ if reciprocal_sequences == 0: if regions != 0: - print("No ortholog fulfilled the reciprocity criteria") + print("No ortholog fulfilled the reciprocity criteria \n") if searchTaxon == '' and refBool == True: continue else: @@ -848,7 +845,7 @@ def main(): # if we want to search in only one Taxon if searchTaxon != '' and fasoff == False: fas = time.time() - print("Calculating FAS scores") + print("Calculating FAS scores ...") fas_seed_id = createFasInput(orthologsOutFile, mappingFile) # bug in calcFAS when using --tsv, have to wait till it's fixed before I can use the option cmd = 'mkdir ' + tmp_path + 'anno_dir' @@ -858,6 +855,7 @@ def main(): clean_fas(fasOutFile + "_forward.domains", 'domains') clean_fas(fasOutFile + "_reverse.domains", 'domains') clean_fas(fasOutFile + ".phyloprofile", 'phyloprofile') + print("\t ...finished \n") #if we searched in more than one Taxon and no ortholog was found @@ -868,7 +866,7 @@ def main(): #if we searched in more than one taxon if fasoff == False and searchTaxon == '': fas = time.time() - print("Calculating FAS scores") + print("Calculating FAS scores ...") tmp_path = out + 
'/tmp/' fas_seed_id = createFasInput(orthologsOutFile, mappingFile) # bug in calcFAS when using --tsv, have to wait till it's fixed before I can use the option @@ -877,6 +875,7 @@ def main(): clean_fas(out + group + "_forward.domains", 'domains') clean_fas(out + group + "_reverse.domains", 'domains') clean_fas(out + group + ".phyloprofile", 'phyloprofile') + print("\t ...finished \n") ################# remove tmp folder ######################################## if searchTaxon != '': cleanup(tmp, tmp_path) @@ -886,7 +885,9 @@ def main(): end = time.time() sys.stdout = sys.__stdout__ #print(group + "\t" + str(end-fas) + "\t" + str(end-start)) + print("fDOG-Assembly finished complete in " + str(end-start) + "seconds.") f.close() + if __name__ == '__main__': main() From a843bfeec60a534776ec3f1e7c036c880a7b2e74 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Fri, 17 Sep 2021 10:52:37 +0200 Subject: [PATCH 119/229] added timeout for tblastn search, fixed bug during delition of tmp folder, --- fdog/fDOGassembly.py | 56 ++++++++++++++++++++++++++++++-------------- 1 file changed, 39 insertions(+), 17 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 232090d..c54590c 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -27,6 +27,7 @@ import yaml import subprocess import time +import shutil ########################### functions ########################################## def check_path(path): @@ -56,13 +57,17 @@ def load_config(config_file): except yaml.YAMLError as exc: print(exc) -def starting_subprocess(cmd, mode): - if mode == 'debug': - result = subprocess.run(cmd, shell=True) - elif mode == 'silent': - result = subprocess.run(cmd, stdout = subprocess.PIPE, stderr = subprocess.PIPE, shell=True) - elif mode == 'normal': - result = subprocess.run(cmd, stdout = subprocess.PIPE, shell=True) +def starting_subprocess(cmd, mode, time_out = None): + + try: + if mode == 'debug': + result = subprocess.run(cmd, shell=True, timeout = time_out) + elif 
mode == 'silent': + result = subprocess.run(cmd, stdout = subprocess.PIPE, stderr = subprocess.PIPE, shell=True, timeout = time_out) + elif mode == 'normal': + result = subprocess.run(cmd, stdout = subprocess.PIPE, shell=True, timeout = time_out) + except subprocess.TimeoutExpired: + return 1 def merge(blast_results, insert_length): #merging overlapping and contigous candidate regions @@ -162,10 +167,11 @@ def candidate_regions(intron_length, cutoff_evalue, tmp_path): blast_results, evalue = parse_blast(line, blast_results, cutoff_evalue) if blast_results == {}: + blast_file.close() return 0,0 else: candidate_regions, number_regions = merge(blast_results, intron_length) - + blast_file.close() return candidate_regions, number_regions def extract_seq(region_dic, path, tmp_path, mode): @@ -270,6 +276,10 @@ def checkCoOrthologs(candidate_name, best_hit, ref, fdog_ref_species, candidates if msaTool == "muscle": os.system("muscle -quiet -in " + output_file + " -out " + aln_file) #print("muscle -quiet -in " + output_file + " -out " + aln_file) + if not os.path.exists(aln_file): + print("Muscle failed for " + candidate_name + ". 
Making MSA with Mafft-linsi.") + os.system('mafft --maxiterate 1000 --localpair --anysymbol --quiet ' + output_file + ' > ' + aln_file) + elif msaTool == "mafft-linsi": #print("mafft-linsi") os.system('mafft --maxiterate 1000 --localpair --anysymbol --quiet ' + output_file + ' > ' + aln_file) @@ -461,12 +471,13 @@ def createFasInput(orthologsOutFile, mappingFile): ncbi_id = (seq.id.split("@"))[1] mappingFile.write(seq.id + "\t" + "ncbi" + ncbi_id + "\n") - + mappingFile.close() return fas_seed_id def cleanup(tmp, tmp_path): if tmp == False: - os.system('rm -r ' + tmp_path) + while os.path.exists(tmp_path): + shutil.rmtree(tmp_path, ignore_errors=True) def coorthologs(candidate_names, tmp_path, candidatesFile, fasta, fdog_ref_species, msaTool, matrix): if len(candidate_names) == 1: @@ -537,6 +548,7 @@ def clean_fas(path, file_type): new_line = id + "\t" + remain file.write(new_line) + file.close() class Logger(object): def __init__(self, file): @@ -708,6 +720,7 @@ def main(): check_path(fasta_path) consensus_path = out + "/tmp/" + group + ".con" profile_path = out + "/tmp/" + group + ".prfl" + tmp_folder = out + "/tmp" ########### is/are fDOG reference species part of ortholog group? 
########## @@ -760,7 +773,7 @@ def main(): cmd = 'mkdir ' + out + '/tmp/' + asName starting_subprocess(cmd, 'silent') - tmp_path = out + "/tmp/" + asName + "/" + tmp_path = out + "tmp/" + asName + "/" candidatesOutFile = tmp_path + group + ".candidates.fa" if searchTaxon != '': orthologsOutFile = out + "/" + group + "_" + asName + ".extended.fa" @@ -788,8 +801,14 @@ def main(): #codon table argument [-db_gencode int_value], table available ftp://ftp.ncbi.nih.gov/entrez/misc/data/gc.prt print("Starting tBLASTn search...") cmd = 'tblastn -db ' + db_path + ' -query ' + consensus_path + ' -outfmt "6 sseqid sstart send evalue qstart qend " -evalue ' + str(evalue) + ' -out ' + tmp_path + '/blast_results.out' - starting_subprocess(cmd, mode) - print("\t ...finished") + exit_code = starting_subprocess(cmd, mode, 3600) + if exit_code == 1: + print("The tblastn search takes too long. Exciting ...") + f.close() + cleanup(tmp, tmp_folder) + sys.exit() + else: + print("\t ...finished") ################### search for candidate regions and extract seq ########### # parse blast and filter for candiate regions @@ -861,7 +880,8 @@ def main(): #if we searched in more than one Taxon and no ortholog was found if refBool == False and searchTaxon == '': print("No orthologs found. 
Exciting ...") - cleanup(tmp, tmp_path) + f.close() + cleanup(tmp, tmp_folder) return 1 #if we searched in more than one taxon if fasoff == False and searchTaxon == '': @@ -878,14 +898,16 @@ def main(): print("\t ...finished \n") ################# remove tmp folder ######################################## if searchTaxon != '': - cleanup(tmp, tmp_path) + f.close() + cleanup(tmp, tmp_folder) else: - cleanup(tmp, out + "/tmp/") + f.close() + cleanup(tmp, tmp_folder) end = time.time() sys.stdout = sys.__stdout__ #print(group + "\t" + str(end-fas) + "\t" + str(end-start)) - print("fDOG-Assembly finished complete in " + str(end-start) + "seconds.") + print("fDOG-Assembly finished completely in " + str(end-start) + "seconds.") f.close() From 36fc207095c5f865547ad6a5b152632ebb71f575 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Mon, 20 Sep 2021 16:45:31 +0200 Subject: [PATCH 120/229] added options force and append --- fdog/fDOGassembly.py | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index c54590c..09795e4 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -28,6 +28,7 @@ import subprocess import time import shutil +import multiprocessing as mp ########################### functions ########################################## def check_path(path): @@ -602,6 +603,8 @@ def main(): optional.add_argument('--searchTaxon', help='Search Taxon name', action='store', default='') optional.add_argument('--silent', help='Output will only be written into the log file', action='store_true', default=False) optional.add_argument('--debug', help='Stdout and Stderr from fdog.assembly and every used tool will be printed', action='store_true', default=False) + optional.add_argument('--force', help='Overwrite existing output files', action='store_true', default=False) + optional.add_argument('--append', help='Append the output to existing output files', action='store_true', default=False) args 
= parser.parse_args() @@ -640,6 +643,8 @@ def main(): searchTaxon = args.searchTaxon silent = args.silent debug = args.debug + force = args.force + append = args.append # output modes if debug == True and silent == True: @@ -672,13 +677,25 @@ def main(): if out == '': out = os.getcwd() - os.system('mkdir ' + out + '/' + group + ' >/dev/null 2>&1') - out = out + '/' + group + '/' else: if out[-1] != "/": out = out + "/" check_path(out) + if os.path.exists(out + '/' + group): + if append != True and force != True: + print("Output folder for group " + group + " exists already. Please choose --force or --append.") + sys.exit() + elif force == True: + shutil.rmtree(out + '/' + group, ignore_errors=True) + elif append == True: + refBool = True # checks if sequences of reference species were already part of the extended.fa file + else: + refBool = False # checks if sequences of reference species were already part of the extended.fa file + else: + os.system('mkdir ' + out + '/' + group + ' >/dev/null 2>&1') + out = out + '/' + group + '/' + if core_path == '': core_path = out + '/core_orthologs/' else: @@ -704,11 +721,9 @@ def main(): else: sys.stdout = Logger(f) - assembly_names = os.listdir(assemblyDir) + ########################### other variables ################################ - ########################## some variables ################################## - - refBool = False # checks if sequences of reference species were already part of the extended.fa file + assembly_names = os.listdir(assemblyDir) ################################# paths #################################### From 2e17db197f2e3e70f0c372a56314fc4722647770 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Thu, 23 Sep 2021 13:59:58 +0200 Subject: [PATCH 121/229] tested --force and --append, only the 10 best candidate regions (regarding score) will be evaluated --- fdog/fDOGassembly.py | 65 +++++++++++++++++++++++++++++++++++--------- 1 file changed, 52 insertions(+), 13 deletions(-) diff --git 
a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 09795e4..ae29b29 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -72,13 +72,13 @@ def starting_subprocess(cmd, mode, time_out = None): def merge(blast_results, insert_length): #merging overlapping and contigous candidate regions + #format dictionary: {node_name: [(,,evalue, ,,, )]} number_regions = 0 insert_length = int(insert_length) + score_list = [] for key in blast_results: locations = blast_results[key] locations = sorted(locations, key = lambda x: int(x[3])) - #print("test") - #print(locations) size_list = len(locations) j = 0 while j < size_list-1: @@ -88,6 +88,8 @@ def merge(blast_results, insert_length): #merge overlapping regions plus strand locations[j][1] = max(locations[j][1], locations[i][1]) locations[j][2] = min(locations[j][2], locations[i][2]) + locations[j][4] = max(locations[j][4], locations[i][4]) + locations[j][6] = max(locations[j][6], locations[i][6]) locations.pop(i) size_list -= 1 i -= 1 @@ -95,6 +97,8 @@ def merge(blast_results, insert_length): #merge overlapping regions minus strand locations[j][0] = min(locations[j][0], locations[i][0]) locations[j][2] = min(locations[j][2], locations[i][2]) + locations[j][4] = max(locations[j][4], locations[i][4]) + locations[j][6] = max(locations[j][6], locations[i][6]) locations.pop(i) size_list -= 1 i -= 1 @@ -102,6 +106,8 @@ def merge(blast_results, insert_length): #merging consecutive regions, the distance between booth is not longer than a cutoff, plus strand locations[j][1] = max(locations[j][1], locations[i][1]) locations[j][2] = min(locations[j][2], locations[i][2]) + locations[j][4] = max(locations[j][4], locations[i][4]) + locations[j][6] = max(locations[j][6], locations[i][6]) locations.pop(i) size_list -= 1 i -=1 @@ -109,20 +115,24 @@ def merge(blast_results, insert_length): #merging consecutive regions, the distance between booth is not longer than a cutoff, minus strand locations[j][0] = min(locations[j][0], 
locations[i][0]) locations[j][2] = min(locations[j][2], locations[i][2]) + locations[j][4] = max(locations[j][4], locations[i][4]) + locations[j][6] = max(locations[j][6], locations[i][6]) locations.pop(i) size_list -= 1 i -=1 i += 1 j += 1 + for entry in locations: + score_list.append(entry[6]) number_regions += len(locations) blast_results[key] = locations - return blast_results, number_regions + return blast_results, number_regions, score_list def parse_blast(line, blast_results, cutoff): - # format blast line: - # format dictionary: {node_name: [(,,evalue, ,,)]} + # format blast line: + # format dictionary: {node_name: [(,,evalue, ,,, )]} line = line.replace("\n", "") line_info = line.split("\t") evalue = float(line_info[3]) @@ -131,7 +141,7 @@ def parse_blast(line, blast_results, cutoff): return blast_results, evalue #add region to dictionary else: - node_name, sstart, send, qstart, qend = line_info[0], int(line_info[1]), int(line_info[2]), int(line_info[4]), int(line_info[5]) + node_name, sstart, send, qstart, qend, score = line_info[0], int(line_info[1]), int(line_info[2]), int(line_info[4]), int(line_info[5]), int(line_info[6]) split = node_name.split("|") # finding out on which strand tBLASTn found a hit if sstart < send: @@ -145,14 +155,32 @@ def parse_blast(line, blast_results, cutoff): node_name = split[1] if node_name in blast_results: list = blast_results[node_name] - list.append([int(sstart),int(send), evalue, int(qstart), int(qend), strand]) + list.append([int(sstart),int(send), evalue, int(qstart), int(qend), strand, score]) blast_results[node_name] = list else: - blast_results[node_name] = [[int(sstart),int(send), evalue, int(qstart), int(qend), strand]] + blast_results[node_name] = [[int(sstart),int(send), evalue, int(qstart), int(qend), strand, score]] return blast_results, evalue -def candidate_regions(intron_length, cutoff_evalue, tmp_path): +def get_x_results(blast_dic, x, score_list): + + new_dic = {} + score_list.sort(reverse=True) + min = 
score_list[x - 1] + number_regions = 0 + + for key in blast_dic: + key_list = [] + entries = blast_dic[key] + for i in entries: + if i[6] >= min: + key_list.append(i) + if key_list != []: + new_dic[key] = key_list + number_regions += len(key_list) + return new_dic, number_regions + +def candidate_regions(intron_length, cutoff_evalue, tmp_path, x = 10): ###################### extracting candidate regions ######################## # info about output blast http://www.metagenomics.wiki/tools/blast/blastn-output-format-6 blast_file = open(tmp_path + "/blast_results.out", "r") @@ -171,8 +199,10 @@ def candidate_regions(intron_length, cutoff_evalue, tmp_path): blast_file.close() return 0,0 else: - candidate_regions, number_regions = merge(blast_results, intron_length) + candidate_regions, number_regions, score_list = merge(blast_results, intron_length) blast_file.close() + if number_regions > x: + candidate_regions, number_regions = get_x_results(candidate_regions, x, score_list) return candidate_regions, number_regions def extract_seq(region_dic, path, tmp_path, mode): @@ -551,6 +581,10 @@ def clean_fas(path, file_type): file.write(new_line) file.close() +def ortholog_search(): + + pass + class Logger(object): def __init__(self, file): self.file = file @@ -583,7 +617,7 @@ def main(): required.add_argument('--refSpec', help='Reference taxon for fDOG.', action='store', nargs="+", default='', required=True) ################## optional arguments ###################################### optional = parser.add_argument_group('Optional arguments') - optional.add_argument('--avIntron', help='average intron length of the assembly species in bp (default: 5000)',action='store', default=5000, type=int) + optional.add_argument('--avIntron', help='average intron length of the assembly species in bp (default: 50000)',action='store', default=50000, type=int) optional.add_argument('--lengthExtension', help='length extension of the candidate regions in bp (default:5000)', action='store', 
default=5000, type=int) optional.add_argument('--assemblyPath', help='Path for the assembly directory', action='store', default='') optional.add_argument('--tmp', help='tmp files will not be deleted', action='store_true', default = False) @@ -688,13 +722,18 @@ def main(): sys.exit() elif force == True: shutil.rmtree(out + '/' + group, ignore_errors=True) + refBool = False + os.system('mkdir ' + out + '/' + group + ' >/dev/null 2>&1') + out = out + '/' + group + '/' elif append == True: - refBool = True # checks if sequences of reference species were already part of the extended.fa file + out = out + '/' + group + '/' + refBool = True else: refBool = False # checks if sequences of reference species were already part of the extended.fa file else: os.system('mkdir ' + out + '/' + group + ' >/dev/null 2>&1') out = out + '/' + group + '/' + refBool = False if core_path == '': core_path = out + '/core_orthologs/' @@ -815,7 +854,7 @@ def main(): #makes a tBLASTn search against database #codon table argument [-db_gencode int_value], table available ftp://ftp.ncbi.nih.gov/entrez/misc/data/gc.prt print("Starting tBLASTn search...") - cmd = 'tblastn -db ' + db_path + ' -query ' + consensus_path + ' -outfmt "6 sseqid sstart send evalue qstart qend " -evalue ' + str(evalue) + ' -out ' + tmp_path + '/blast_results.out' + cmd = 'tblastn -db ' + db_path + ' -query ' + consensus_path + ' -outfmt "6 sseqid sstart send evalue qstart qend score " -evalue ' + str(evalue) + ' -out ' + tmp_path + '/blast_results.out' exit_code = starting_subprocess(cmd, mode, 3600) if exit_code == 1: print("The tblastn search takes too long. 
Exciting ...") From 80562870c5d6395f3aa9cb256281dea1c157104a Mon Sep 17 00:00:00 2001 From: mueli94 Date: Tue, 28 Sep 2021 16:07:06 +0200 Subject: [PATCH 122/229] create a function that performs the ortholog search and returns the headers of the found sequences and the corresponding tmp file in which the sequence is located --- fdog/fDOGassembly.py | 315 +++++++++++++++++++++++++++++-------------- 1 file changed, 214 insertions(+), 101 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index ae29b29..37b7095 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -464,6 +464,38 @@ def backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, eva orthologs = set(orthologs) return list(orthologs), seed +def addRef(output, core_fasta, species_list): + print(species_list) + output_file = open(output, "a+") + seq_records_core = readFasta(core_fasta) + seq_records_core = list(seq_records_core) + for species in species_list: + for entry_core in seq_records_core: + if species in entry_core.id: + output_file.write(">" + entry_core.id + "\n") + output_file.write(str(entry_core.seq) + "\n") + output_file.close() + +def addSeq(output, seq_list): + output_file = open(output, "a+") + + for item in seq_list: + candidate_fasta = item[0] + sequenceIds = item[1] + if sequenceIds == 0 or sequenceIds == []: + pass + seq_records_candidate = readFasta(candidate_fasta) + seq_records_candidate = list(seq_records_candidate) + for entry_candidate in seq_records_candidate: + if entry_candidate.id in sequenceIds: + if entry_candidate.id == sequenceIds[0]: + output_file.write(">" + entry_candidate.id + "|1" + "\n") + output_file.write(str(entry_candidate.seq) + "\n") + else: + output_file.write(">" + entry_candidate.id + "|0" + "\n") + output_file.write(str(entry_candidate.seq) + "\n") + output_file.close() + def addSequences(sequenceIds, candidate_fasta, core_fasta, output, name, species_list, refBool, tmp_path): output_file = open(output, "a+") @@ 
-581,9 +613,69 @@ def clean_fas(path, file_type): file.write(new_line) file.close() -def ortholog_search(): - - pass +def ortholog_search(asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs): + cmd = 'mkdir ' + out + '/tmp/' + asName + starting_subprocess(cmd, 'silent') + tmp_path = out + "tmp/" + asName + "/" + candidatesOutFile = tmp_path + group + ".candidates.fa" + #orthologsOutFile = out + "/" + group + ".extended.fa" + fasOutFile = out + "/" + group + #mappingFile = out + "/tmp/" + group + ".mapping.txt" + + print("Searching in species " + asName + "\n") + assembly_path = assemblyDir + "/" + asName + "/" + asName + ".fa" + db_path = assemblyDir + "/" + asName + "/blast_dir/" + asName + ".fa" + db_check = searching_for_db(db_path) + + if db_check == 0: + print("Creating a blast data base...") + cmd = 'makeblastdb -in ' + assembly_path + ' -dbtype nucl -parse_seqids -out ' + db_path + starting_subprocess(cmd, mode) + print("\t ...finished \n") + + #makes a tBLASTn search against database + #codon table argument [-db_gencode int_value], table available ftp://ftp.ncbi.nih.gov/entrez/misc/data/gc.prt + print("Starting tBLASTn search...") + cmd = 'tblastn -db ' + db_path + ' -query ' + consensus_path + ' -outfmt "6 sseqid sstart send evalue qstart qend score " -evalue ' + str(evalue) + ' -out ' + tmp_path + '/blast_results.out' + exit_code = starting_subprocess(cmd, mode, 3600) + if exit_code == 1: + print("The tblastn search takes too long. 
Exciting ...") + f.close() + cleanup(tmp, tmp_folder) + sys.exit() + else: + print("\t ...finished") + + regions, number_regions = candidate_regions(average_intron_length, evalue, tmp_path) + if regions == 0: + #no candidat region are available, no ortholog can be found + print("No candidate region found for species %s!\n" % asName) + return [], candidatesOutFile + + else: + print(str(number_regions) + " candiate regions were found for species %s.\n" % asName) + extract_seq(regions, db_path, tmp_path, mode) + + ############### make Augustus PPX search ################################### + print("Starting augustus ppx ...") + augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, augustus_ref_species, asName, group, tmp_path, mode) + print("\t ...finished \n") + + ################# backward search to filter for orthologs################### + if int(os.path.getsize(candidatesOutFile)) <= 0: + print("No genes found at candidate regions\n") + return [], candidatesOutFile + + reciprocal_sequences, taxa = backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, evalue, taxa, searchTool, checkCoorthologs, msaTool, matrix, dataPath, filter, tmp_path, mode) + + if reciprocal_sequences == 0: + if regions != 0: + print("No ortholog fulfilled the reciprocity criteria for species %s.\n" % asName) + return [], candidatesOutFile + else: + reciprocal_sequences = coorthologs(reciprocal_sequences, tmp_path, candidatesOutFile, fasta_path, fdog_ref_species, msaTool, matrix) + + return reciprocal_sequences, candidatesOutFile class Logger(object): def __init__(self, file): @@ -639,7 +731,7 @@ def main(): optional.add_argument('--debug', help='Stdout and Stderr from fdog.assembly and every used tool will be printed', action='store_true', default=False) optional.add_argument('--force', help='Overwrite existing output files', action='store_true', default=False) optional.add_argument('--append', help='Append the output to existing output files', 
action='store_true', default=False) - + optional.add_argument('--parallel', help= 'The ortholog search of multiple species will be done in parallel', action='store_true', default=False) args = parser.parse_args() # required @@ -679,6 +771,7 @@ def main(): debug = args.debug force = args.force append = args.append + parallel = args.parallel # output modes if debug == True and silent == True: @@ -815,120 +908,140 @@ def main(): searchBool = False - #################### fDOG assembly computation for all species ############# - for asName in assembly_names: - if searchBool == True: - break - if searchTaxon != '' and searchBool == False: - asName = searchTaxon - searchBool = True + if searchTaxon == '': + ortholog_sequences = [] + cpus = mp.cpu_count() + print(cpus) + #pool = mp.Pool(cpus) + for asName in assembly_names: + reciprocal_sequences, candidatesOutFile = ortholog_search(asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs) + ortholog_sequences.append([candidatesOutFile, reciprocal_sequences]) + + orthologsOutFile = out + "/" + group + ".extended.fa" + + if taxa == []: + taxa = [fdog_ref_species] + addRef(orthologsOutFile, fasta_path, taxa) + addSeq(orthologsOutFile, ortholog_sequences) + refBool = True + mappingFile = out + "/tmp/" + group + ".mapping.txt" - ################### path definitions ################################### - cmd = 'mkdir ' + out + '/tmp/' + asName - starting_subprocess(cmd, 'silent') - tmp_path = out + "tmp/" + asName + "/" - candidatesOutFile = tmp_path + group + ".candidates.fa" - if searchTaxon != '': - orthologsOutFile = out + "/" + group + "_" + asName + ".extended.fa" - fasOutFile = out + "/" + group + "_" + asName - mappingFile = tmp_path + group + "_" + asName + ".mapping.txt" - else: - orthologsOutFile = out + "/" + group + 
".extended.fa" - fasOutFile = out + "/" + group - mappingFile = out + "/tmp/" + group + ".mapping.txt" - - print("Searching in species " + asName + "\n") - assembly_path = assemblyDir + "/" + asName + "/" + asName + ".fa" - db_path = assemblyDir + "/" + asName + "/blast_dir/" + asName + ".fa" - - ######################## tBLASTn ########################################### - #checks if data base exists already - db_check = searching_for_db(db_path) - if db_check == 0: - print("Creating a blast data base...") - cmd = 'makeblastdb -in ' + assembly_path + ' -dbtype nucl -parse_seqids -out ' + db_path - starting_subprocess(cmd, mode) - print("\t ...finished \n") - - #makes a tBLASTn search against database - #codon table argument [-db_gencode int_value], table available ftp://ftp.ncbi.nih.gov/entrez/misc/data/gc.prt - print("Starting tBLASTn search...") - cmd = 'tblastn -db ' + db_path + ' -query ' + consensus_path + ' -outfmt "6 sseqid sstart send evalue qstart qend score " -evalue ' + str(evalue) + ' -out ' + tmp_path + '/blast_results.out' - exit_code = starting_subprocess(cmd, mode, 3600) - if exit_code == 1: - print("The tblastn search takes too long. 
Exciting ...") - f.close() - cleanup(tmp, tmp_folder) - sys.exit() - else: - print("\t ...finished") + else: + #################### fDOG assembly computation for all species ############# + for asName in assembly_names: + if searchBool == True: + break + if searchTaxon != '' and searchBool == False: + asName = searchTaxon + searchBool = True - ################### search for candidate regions and extract seq ########### - # parse blast and filter for candiate regions - regions, number_regions = candidate_regions(average_intron_length, evalue, tmp_path) + ################### path definitions ################################### - if regions == 0: - #no candidat region are available, no ortholog can be found - print("No candidate region found!\n") - if refBool == True: - continue + cmd = 'mkdir ' + out + '/tmp/' + asName + starting_subprocess(cmd, 'silent') + tmp_path = out + "tmp/" + asName + "/" + candidatesOutFile = tmp_path + group + ".candidates.fa" + if searchTaxon != '': + orthologsOutFile = out + "/" + group + "_" + asName + ".extended.fa" + fasOutFile = out + "/" + group + "_" + asName + mappingFile = tmp_path + group + "_" + asName + ".mapping.txt" else: - taxa = [fdog_ref_species] - reciprocal_sequences = 0 - else: - print(str(number_regions) + " candiate regions were found.\n") - extract_seq(regions, db_path, tmp_path, mode) - - ############### make Augustus PPX search ################################### + orthologsOutFile = out + "/" + group + ".extended.fa" + fasOutFile = out + "/" + group + mappingFile = out + "/tmp/" + group + ".mapping.txt" + + print("Searching in species " + asName + "\n") + assembly_path = assemblyDir + "/" + asName + "/" + asName + ".fa" + db_path = assemblyDir + "/" + asName + "/blast_dir/" + asName + ".fa" + + ######################## tBLASTn ########################################### + #checks if data base exists already + db_check = searching_for_db(db_path) + if db_check == 0: + print("Creating a blast data base...") + cmd = 
'makeblastdb -in ' + assembly_path + ' -dbtype nucl -parse_seqids -out ' + db_path + starting_subprocess(cmd, mode) + print("\t ...finished \n") + + #makes a tBLASTn search against database + #codon table argument [-db_gencode int_value], table available ftp://ftp.ncbi.nih.gov/entrez/misc/data/gc.prt + print("Starting tBLASTn search...") + cmd = 'tblastn -db ' + db_path + ' -query ' + consensus_path + ' -outfmt "6 sseqid sstart send evalue qstart qend score " -evalue ' + str(evalue) + ' -out ' + tmp_path + '/blast_results.out' + exit_code = starting_subprocess(cmd, mode, 3600) + if exit_code == 1: + print("The tblastn search takes too long. Exciting ...") + f.close() + cleanup(tmp, tmp_folder) + sys.exit() + else: + print("\t ...finished") - print("Starting augustus ppx ...") - augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, augustus_ref_species, asName, group, tmp_path, mode) - print("\t ...finished \n") + ################### search for candidate regions and extract seq ########### + # parse blast and filter for candiate regions + regions, number_regions = candidate_regions(average_intron_length, evalue, tmp_path) - ################# backward search to filter for orthologs################### - if int(os.path.getsize(candidatesOutFile)) <= 0: - print("No genes found at candidate region\n") - if searchTaxon == '' and refBool == True: + if regions == 0: + #no candidat region are available, no ortholog can be found + print("No candidate region found!\n") + if refBool == True: continue else: - reciprocal_sequences = 0 taxa = [fdog_ref_species] + reciprocal_sequences = 0 else: - reciprocal_sequences, taxa = backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, evalue, taxa, searchTool, checkCoorthologs, msaTool, matrix, dataPath, filter, tmp_path, mode) + print(str(number_regions) + " candiate regions were found.\n") + extract_seq(regions, db_path, tmp_path, mode) + + ############### make Augustus PPX search 
################################### + + print("Starting augustus ppx ...") + augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, augustus_ref_species, asName, group, tmp_path, mode) + print("\t ...finished \n") + + ################# backward search to filter for orthologs################### + if int(os.path.getsize(candidatesOutFile)) <= 0: + print("No genes found at candidate region\n") + if searchTaxon == '' and refBool == True: + continue + else: + reciprocal_sequences = 0 + taxa = [fdog_ref_species] + else: + reciprocal_sequences, taxa = backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, evalue, taxa, searchTool, checkCoorthologs, msaTool, matrix, dataPath, filter, tmp_path, mode) - ################## checking accepted genes for co-orthologs ################ - if reciprocal_sequences == 0: - if regions != 0: - print("No ortholog fulfilled the reciprocity criteria \n") - if searchTaxon == '' and refBool == True: - continue + ################## checking accepted genes for co-orthologs ################ + if reciprocal_sequences == 0: + if regions != 0: + print("No ortholog fulfilled the reciprocity criteria \n") + if searchTaxon == '' and refBool == True: + continue + else: + reciprocal_sequences = 0 else: - reciprocal_sequences = 0 - else: - reciprocal_sequences = coorthologs(reciprocal_sequences, tmp_path, candidatesOutFile, fasta_path, fdog_ref_species, msaTool, matrix) + reciprocal_sequences = coorthologs(reciprocal_sequences, tmp_path, candidatesOutFile, fasta_path, fdog_ref_species, msaTool, matrix) - ################ add sequences to extended.fa in the output folder########## + ################ add sequences to extended.fa in the output folder########## - addSequences(reciprocal_sequences, candidatesOutFile, fasta_path, orthologsOutFile, group, taxa, refBool, tmp_path) - refBool = True + addSequences(reciprocal_sequences, candidatesOutFile, fasta_path, orthologsOutFile, group, taxa, refBool, tmp_path) + refBool = 
True - ############### make Annotation with FAS ################################### - # if we want to search in only one Taxon - if searchTaxon != '' and fasoff == False: - fas = time.time() - print("Calculating FAS scores ...") - fas_seed_id = createFasInput(orthologsOutFile, mappingFile) - # bug in calcFAS when using --tsv, have to wait till it's fixed before I can use the option - cmd = 'mkdir ' + tmp_path + 'anno_dir' - starting_subprocess(cmd, 'silent') - cmd = 'fas.run --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group + '_' + asName - starting_subprocess(cmd, 'silent') - clean_fas(fasOutFile + "_forward.domains", 'domains') - clean_fas(fasOutFile + "_reverse.domains", 'domains') - clean_fas(fasOutFile + ".phyloprofile", 'phyloprofile') - print("\t ...finished \n") + ############### make Annotation with FAS ################################### + # if we want to search in only one Taxon + if searchTaxon != '' and fasoff == False: + fas = time.time() + print("Calculating FAS scores ...") + fas_seed_id = createFasInput(orthologsOutFile, mappingFile) + # bug in calcFAS when using --tsv, have to wait till it's fixed before I can use the option + cmd = 'mkdir ' + tmp_path + 'anno_dir' + starting_subprocess(cmd, 'silent') + cmd = 'fas.run --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group + '_' + asName + starting_subprocess(cmd, 'silent') + clean_fas(fasOutFile + "_forward.domains", 'domains') + clean_fas(fasOutFile + "_reverse.domains", 'domains') + clean_fas(fasOutFile + ".phyloprofile", 'phyloprofile') + print("\t ...finished \n") #if we searched in more than one Taxon and no ortholog was found From 
ee3636413a9a826d523229d21c9d4e5b88113fe3 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Wed, 29 Sep 2021 16:10:44 +0200 Subject: [PATCH 123/229] added parallelization with lib multiprocessing --- fdog/fDOGassembly.py | 43 +++++++++++++++++++++++++------------------ 1 file changed, 25 insertions(+), 18 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 37b7095..aadb3f0 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -480,8 +480,9 @@ def addSeq(output, seq_list): output_file = open(output, "a+") for item in seq_list: - candidate_fasta = item[0] - sequenceIds = item[1] + print(item) + candidate_fasta = item[1] + sequenceIds = item[0] if sequenceIds == 0 or sequenceIds == []: pass seq_records_candidate = readFasta(candidate_fasta) @@ -613,7 +614,8 @@ def clean_fas(path, file_type): file.write(new_line) file.close() -def ortholog_search(): - - pass +def ortholog_search(args): + (asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs) = args cmd = 'mkdir ' + out + '/tmp/' + asName starting_subprocess(cmd, 'silent') tmp_path = out + "tmp/" + asName + "/" @@ -628,23 +630,23 @@ def ortholog_search(asName, out, assemblyDir, consensus_path, augustus_ref_speci db_check = searching_for_db(db_path) if db_check == 0: - print("Creating a blast data base...") + #print("Creating a blast data base...") cmd = 'makeblastdb -in ' + assembly_path + ' -dbtype nucl -parse_seqids -out ' + db_path starting_subprocess(cmd, mode) - print("\t ...finished \n") + #print("\t ...finished \n") #makes a tBLASTn search against database 
#codon table argument [-db_gencode int_value], table available ftp://ftp.ncbi.nih.gov/entrez/misc/data/gc.prt - print("Starting tBLASTn search...") + #print("Starting tBLASTn search...") cmd = 'tblastn -db ' + db_path + ' -query ' + consensus_path + ' -outfmt "6 sseqid sstart send evalue qstart qend score " -evalue ' + str(evalue) + ' -out ' + tmp_path + '/blast_results.out' exit_code = starting_subprocess(cmd, mode, 3600) if exit_code == 1: - print("The tblastn search takes too long. Exciting ...") + print("The tblastn search takes too long for species %s. Exciting ..." % asName) f.close() cleanup(tmp, tmp_folder) sys.exit() - else: - print("\t ...finished") + #else: + #print("\t ...finished") regions, number_regions = candidate_regions(average_intron_length, evalue, tmp_path) if regions == 0: @@ -657,13 +659,13 @@ def ortholog_search(asName, out, assemblyDir, consensus_path, augustus_ref_speci extract_seq(regions, db_path, tmp_path, mode) ############### make Augustus PPX search ################################### - print("Starting augustus ppx ...") + #print("Starting augustus ppx ...") augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, augustus_ref_species, asName, group, tmp_path, mode) - print("\t ...finished \n") + #print("\t ...finished \n") ################# backward search to filter for orthologs################### if int(os.path.getsize(candidatesOutFile)) <= 0: - print("No genes found at candidate regions\n") + #print("No genes found at candidate regions\n") return [], candidatesOutFile reciprocal_sequences, taxa = backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, evalue, taxa, searchTool, checkCoorthologs, msaTool, matrix, dataPath, filter, tmp_path, mode) @@ -910,15 +912,20 @@ def main(): if searchTaxon == '': ortholog_sequences = [] + calls = [] cpus = mp.cpu_count() - print(cpus) - #pool = mp.Pool(cpus) + pool = mp.Pool(cpus) for asName in assembly_names: - reciprocal_sequences, candidatesOutFile = 
ortholog_search(asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs) - ortholog_sequences.append([candidatesOutFile, reciprocal_sequences]) - + calls.append([asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs]) + #for asName in assembly_names: + #reciprocal_sequences, candidatesOutFile = ortholog_search(asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs) + #ortholog_sequences.append([candidatesOutFile, reciprocal_sequences]) + results = (pool.imap_unordered(ortholog_search, calls)) + pool.close() + pool.join() orthologsOutFile = out + "/" + group + ".extended.fa" - + for i in results: + ortholog_sequences.append(i) if taxa == []: taxa = [fdog_ref_species] addRef(orthologsOutFile, fasta_path, taxa) From da8cdcc67d7ae8306c73deb4421e0d2b9078689a Mon Sep 17 00:00:00 2001 From: mueli94 Date: Fri, 1 Oct 2021 10:57:19 +0200 Subject: [PATCH 124/229] added output for computational time --- fdog/fDOGassembly.py | 81 +++++++++++++++++++++++++++++--------------- 1 file changed, 54 insertions(+), 27 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index aadb3f0..97ec269 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -341,7 +341,7 @@ def backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, eva try: id_ref = seedDic[fdog_ref_species] except KeyError: - print("The fDOG reference species isn't part of the core ortholog group, ... 
exciting") + #print("The fDOG reference species isn't part of the core ortholog group, ... exciting") return 0, seed if searchTool == "blast": cmd = "blastp -db " + blast_dir_path + fdog_ref_species + "/" + fdog_ref_species + " -outfmt '6 sseqid qseqid evalue' -max_target_seqs 10 -out " + tmp_path + "blast_" + fdog_ref_species + " -evalue " + str(evalue_cut_off) + " -query " + candidatesOutFile @@ -397,7 +397,7 @@ def backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, eva if orthologs == []: - print("No hit in the backward search, ...exciting") + #print("No hit in the backward search, ...exciting") return 0, seed else: @@ -422,12 +422,12 @@ def backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, eva orthologs = set({}) for species in seed: - print("backward search in species " + species + "\n") + print("backward search in species %s\n" %species) orthologs_new = set({}) try: id_ref = seedDic[species] except KeyError: - print("The species " + species + " isn't part of the core ortholog group, ... exciting") + #print("The species " + species + " isn't part of the core ortholog group, ... 
exciting") return 0, seed cmd = "blastp -db " + blast_dir_path + species + "/" + species + " -outfmt '6 sseqid qseqid evalue' -max_target_seqs 10 -seg " + filter + " -out " + tmp_path + "/blast_" + species + " -evalue " + str(evalue_cut_off) + " -query " + candidatesOutFile @@ -450,12 +450,13 @@ def backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, eva #print(species) #print(orthologs_new) + #print(orthologs) if species == fdog_ref_species: orthologs = orthologs_new else: orthologs = orthologs & orthologs_new - if orthologs == {}: - print("No ortholog was found with option --strict") + if len(orthologs) == 0: + #print("No ortholog was found with option --strict") return 0, seed @@ -465,7 +466,7 @@ def backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, eva return list(orthologs), seed def addRef(output, core_fasta, species_list): - print(species_list) + #print(species_list) output_file = open(output, "a+") seq_records_core = readFasta(core_fasta) seq_records_core = list(seq_records_core) @@ -480,7 +481,7 @@ def addSeq(output, seq_list): output_file = open(output, "a+") for item in seq_list: - print(item) + #print(item) candidate_fasta = item[1] sequenceIds = item[0] if sequenceIds == 0 or sequenceIds == []: @@ -540,8 +541,12 @@ def createFasInput(orthologsOutFile, mappingFile): def cleanup(tmp, tmp_path): if tmp == False: + timeout = time.time() + 60*1 while os.path.exists(tmp_path): shutil.rmtree(tmp_path, ignore_errors=True) + if time.time() > timeout: + print("tmp folder could not be removed!") + break def coorthologs(candidate_names, tmp_path, candidatesFile, fasta, fdog_ref_species, msaTool, matrix): if len(candidate_names) == 1: @@ -639,7 +644,10 @@ def ortholog_search(args): #codon table argument [-db_gencode int_value], table available ftp://ftp.ncbi.nih.gov/entrez/misc/data/gc.prt #print("Starting tBLASTn search...") cmd = 'tblastn -db ' + db_path + ' -query ' + consensus_path + ' -outfmt "6 sseqid sstart send 
evalue qstart qend score " -evalue ' + str(evalue) + ' -out ' + tmp_path + '/blast_results.out' + time_tblastn_start = time.time() exit_code = starting_subprocess(cmd, mode, 3600) + time_tblastn_end = time.time() + time_tblastn = time_tblastn_end - time_tblastn_start if exit_code == 1: print("The tblastn search takes too long for species %s. Exciting ..." % asName) f.close() @@ -647,6 +655,7 @@ def ortholog_search(args): sys.exit() #else: #print("\t ...finished") + print("Time tblastn %s in species %s" % (str(time_tblastn), asName)) regions, number_regions = candidate_regions(average_intron_length, evalue, tmp_path) if regions == 0: @@ -655,13 +664,17 @@ def ortholog_search(args): return [], candidatesOutFile else: - print(str(number_regions) + " candiate regions were found for species %s.\n" % asName) + print(str(number_regions) + " candiate region(s) were found for species %s.\n" % asName) extract_seq(regions, db_path, tmp_path, mode) ############### make Augustus PPX search ################################### #print("Starting augustus ppx ...") + time_augustus_start = time.time() augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, augustus_ref_species, asName, group, tmp_path, mode) #print("\t ...finished \n") + time_augustus_end = time.time() + time_augustus = time_augustus_end - time_augustus_start + print("Time augustus: %s species %s \n" % (str(time_augustus), asName)) ################# backward search to filter for orthologs################### if int(os.path.getsize(candidatesOutFile)) <= 0: @@ -884,7 +897,7 @@ def main(): print("fDOG reference species: " + fdog_ref_species + " \n") ######################## consensus sequence ################################ - + group_computation_time_start = time.time() #make a majority-rule consensus sequence with the tool hmmemit from hmmer print("Building a consensus sequence") cmd = 'hmmemit -c -o' + consensus_path + ' ' + hmm_path @@ -908,24 +921,35 @@ def main(): starting_subprocess(cmd, 
'silent') print(" \t ...finished \n") + group_computation_time_end = time.time() + time_group = group_computation_time_end - group_computation_time_start + searchBool = False if searchTaxon == '': ortholog_sequences = [] - calls = [] - cpus = mp.cpu_count() - pool = mp.Pool(cpus) - for asName in assembly_names: - calls.append([asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs]) - #for asName in assembly_names: - #reciprocal_sequences, candidatesOutFile = ortholog_search(asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs) - #ortholog_sequences.append([candidatesOutFile, reciprocal_sequences]) - results = (pool.imap_unordered(ortholog_search, calls)) - pool.close() - pool.join() + time_ortholog_start = time.time() + if parallel == True: + calls = [] + cpus = mp.cpu_count() + pool = mp.Pool(cpus) + for asName in assembly_names: + calls.append([asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs]) + + results = (pool.imap_unordered(ortholog_search, calls)) + pool.close() + pool.join() + for i in results: + ortholog_sequences.append(i) + else: + for asName in assembly_names: + args = [asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs] + reciprocal_sequences, candidatesOutFile = 
ortholog_search(args) + ortholog_sequences.append([reciprocal_sequences, candidatesOutFile]) + orthologsOutFile = out + "/" + group + ".extended.fa" - for i in results: - ortholog_sequences.append(i) + time_ortholog_end = time.time() + time_ortholog = time_ortholog_end - time_ortholog_start if taxa == []: taxa = [fdog_ref_species] addRef(orthologsOutFile, fasta_path, taxa) @@ -1071,6 +1095,11 @@ def main(): clean_fas(out + group + ".phyloprofile", 'phyloprofile') print("\t ...finished \n") ################# remove tmp folder ######################################## + end = time.time() + time_fas = end - fas + print("fDOG-Assembly finished completely in " + str(end-start) + "seconds.") + print("Group preparation: %s \t Ortholog search: %s \t Fas: %s \n" % (str(time_group), str(time_ortholog), str(time_fas))) + sys.stdout = sys.__stdout__ if searchTaxon != '': f.close() cleanup(tmp, tmp_folder) @@ -1078,11 +1107,9 @@ def main(): f.close() cleanup(tmp, tmp_folder) - end = time.time() - sys.stdout = sys.__stdout__ + #print(group + "\t" + str(end-fas) + "\t" + str(end-start)) - print("fDOG-Assembly finished completely in " + str(end-start) + "seconds.") - f.close() + if __name__ == '__main__': From ba752aa04f5ccf706982b3647499396c0064137d Mon Sep 17 00:00:00 2001 From: mueli94 Date: Mon, 11 Oct 2021 13:29:27 +0200 Subject: [PATCH 125/229] updated fDOG-Assembly structure. 
fDOG-Assembly is now a separate script and can only be started with the command fdog.assembly --- fdog/bin/oneSeq.pl | 125 +++----------------- fdog/fDOGassembly.py | 223 ++++++++---------------------------- fdog/mergeAssemblyOutput.py | 124 -------------------- fdog/runMulti.py | 31 +---- fdog/runSingle.py | 64 +---------- 5 files changed, 74 insertions(+), 493 deletions(-) delete mode 100644 fdog/mergeAssemblyOutput.py diff --git a/fdog/bin/oneSeq.pl b/fdog/bin/oneSeq.pl index 1b0839f..a99e1e6 100755 --- a/fdog/bin/oneSeq.pl +++ b/fdog/bin/oneSeq.pl @@ -207,7 +207,6 @@ my $idx_dir = "$path/taxonomy/"; my $dataDir = $path . '/data'; my $weightPath = "$path/weight_dir/"; -my $assembly_dir = "$path/assembly_dir/"; my @defaultRanks = ( 'superkingdom', 'kingdom', @@ -312,15 +311,6 @@ my %hashTree; my $aln = 'muscle'; my $searchTaxa; -#variables for fdog_goes_assembly -my $assembly; -my $augustusRefSpec; -my $avIntron; -my $lengthExtension; -my $assemblyPath; -my $searchTool = 'blast'; -my $matrix = 'blosum62'; -my $dataPath = ''; ################# Command line options GetOptions ( "h" => \$help, @@ -383,15 +373,7 @@ "distDeviation=s" => \$distDeviation, "aligner=s" => \$aln, "hyperthread" => \$hyperthread, - "searchTaxa=s" => \$searchTaxa, - "assembly" => \$assembly, - "assemblypath=s" => \$assemblyPath, - "augustusRefSpec=s" => \$augustusRefSpec, - "avIntron=s" => \$avIntron, - "lengthExtension=s" => \$lengthExtension, - "searchTool=s" => \$searchTool, - "scoringmatrix=s" => \$matrix, - "dataPath=s" => \$dataPath + "searchTaxa=s" => \$searchTaxa ); $outputPath = abs_path($outputPath); @@ -403,8 +385,6 @@ $weightPath = abs_path($weightPath)."/"; $genome_dir = abs_path($genome_dir)."/"; $taxaPath = $genome_dir; -$dataPath = abs_path($dataPath)."/"; -$assembly_dir = abs_path($assemblyPath)."/"; ############# do initial check if (!defined $help && !defined $getversion) { #} && !defined $showTaxa) { @@ -414,7 +394,7 @@ initialCheck($seqFile, $seqName, $blastPath, 
$taxaPath, $weightPath, $fasoff); } - if (!defined $coreex && !defined $assembly) { + if (!defined $coreex) { if (!grep(/$minDist/, @defaultRanks)) { die "ERROR: minDist $minDist invalid!\n"; } @@ -498,7 +478,7 @@ # create weight_dir in oneseq's home dir (used for annotations,weighting,feature extraction) # get annotations for seed sequence if fas support is on -if ($fas_support && !$assembly){ +if ($fas_support){ if (!$weightPath) { createWeightFolder(); } @@ -507,7 +487,7 @@ my $coreStTime = gettime(); #time; #core-ortholog search -if (!$coreex && !$assembly) { +if (!$coreex) { print "\nCore compiling...\n"; $coremode = 1; $taxaPath = $blastPath; @@ -645,12 +625,7 @@ my $final_eval_blast = $eval_blast*$eval_relaxfac; my $final_eval_hmmer = $eval_hmmer*$eval_relaxfac; - if (!$assembly){ - $taxaPath = $genome_dir; - } - else{ - $taxaPath = $assembly_dir; - } + $taxaPath = $genome_dir; my @searchTaxa; unless ($searchTaxa) { unless($groupNode) { @@ -706,63 +681,7 @@ } } } - if ($assembly){ - $eval_blast = sprintf("%f", $eval_blast); - if ($seqFile ne "") { - my @assembly_cmd = ("fdog.assembly", "--gene " . $seqName, "--augustusRefSpec ". $augustusRefSpec, "--refSpec " . $refSpec, "--dataPath " . 
$dataPath, "--silent"); - - if (defined $assemblyPath){ - push(@assembly_cmd, "--assemblyPath $assemblyPath") - } - if (defined $avIntron){ - push(@assembly_cmd, "--avIntron $avIntron "); - } - if (defined $lengthExtension){ - push(@assembly_cmd, "--lengthExtension $lengthExtension "); - } - if (!$autoclean){ - push(@assembly_cmd, "--tmp "); - } - if ($outputPath){ - push(@assembly_cmd, "--out $outputPath "); - } - if (defined $strict){ - push(@assembly_cmd, "--strict"); - } - if ($eval_blast){ - push(@assembly_cmd, "--evalBlast $eval_blast "); - } - if ($searchTool){ - push(@assembly_cmd, "--msaTool $aln "); - } - if (defined $checkcoorthologsref){ - push(@assembly_cmd, "--checkCoorthologsRef"); - } - if ($searchTool){ - push(@assembly_cmd, "--searchTool $searchTool"); - } - if ($matrix){ - push(@assembly_cmd, "--scoringmatrix $matrix"); - } - if ($coreOrthologsPath){ - push(@assembly_cmd, "--coregroupPath $coreOrthologsPath"); - } - if ($fasoff){ - push(@assembly_cmd, "--fasoff"); - } - if ($searchTaxon){ - push(@assembly_cmd, "--searchTaxon $searchTaxon"); - } - if ($filter){ - push(@assembly_cmd, "--filter $filter"); - } - printDebug(@assembly_cmd); - system(join(' ', @assembly_cmd)) == 0 or die "Error: fDOGassembly failed \n"; - } - } - else{ runHamstr($searchTaxon, $seqName, $finalOutput, $refSpec, $hitlimit, $representative, $strict, $coremode, $final_eval_blast, $final_eval_hmmer, $aln); - } $pm->finish; } $pm->wait_all_children; @@ -774,8 +693,8 @@ push @logOUT, "Ortholog search completed in ". roundtime(gettime() - $orthoStTime) ." sec!"; print "==> Ortholog search completed in ". roundtime(gettime() - $orthoStTime) ." 
sec!\n"; - -if(!$coreOnly && !$assembly){ +## Evaluation of all orthologs that are predicted by the final run +if(!$coreOnly){ my $fasStTime = gettime(); my $processID = $$; @@ -787,7 +706,7 @@ addSeedSeq($seqId, $seqName, $coreOrthologsPath, $refSpec, $finalOutput); # calculate FAS scores for final extended.fa - if ($fas_support && !$assembly) { + if ($fas_support) { print "Starting the feature architecture similarity score computation...\n"; my $fdogFAScmd = "$fdogFAS_prog -i $finalOutput -w $weightPath -t $tmpdir -o $outputPath --cores $cpu --redo_anno"; unless ($countercheck) { @@ -800,21 +719,12 @@ } push @logOUT, "FAS calculation completed in " . roundtime(gettime() - $fasStTime). " sec!\n"; print "==> FAS calculation completed in " . roundtime(gettime() - $fasStTime). " sec!\n"; - if($autoclean){ print "Cleaning up...\n"; runAutoCleanUp($processID); } } -if ($assembly){ - my $file_assembly_out; - $file_assembly_out = $outputPath . '/' . $seqName; - my $cmd_merge; - $cmd_merge = "fdog.mergeAssembly --in $outputPath --out $file_assembly_out --cleanup"; - printDebug($cmd_merge); - system($cmd_merge); -} ## Delete tmp folder unless ($debug) { my $delTmp = "rm -rf $tmpdir"; @@ -1224,10 +1134,10 @@ sub checkOptions { if ($force == 1 and $append ==1) { $force = 0; } - ### check the presence of the pre-computed core set if options reuseCore or assembly is used - if ($coreex || $assembly) { + ### check the presence of the pre-computed core set + if ($coreex) { if (! -e "$coreOrthologsPath/$seqName/$seqName.fa") { - print "You selected the option -reuseCore or -assembly, but the core ortholog group $coreOrthologsPath/$seqName/hmm_dir/$seqName.hmm does not exist\n"; + print "You selected the option -reuseCore, but the core ortholog group $coreOrthologsPath/$seqName/hmm_dir/$seqName.hmm does not exist\n"; exit; } } @@ -1298,7 +1208,7 @@ sub checkOptions { ### checking the number of core orthologs. 
Omit this check if the option -reuseCore has been selected $optbreaker = 0; - while(!$minCoreOrthologs and (!$coreex and !$assembly)) { + while(!$minCoreOrthologs and !$coreex) { if ($optbreaker >= 3){ print "No proper number given ... exiting.\n"; exit; @@ -1313,12 +1223,10 @@ sub checkOptions { $filter = 'no' if $filter eq 'F'; } - if (!$assembly){ - $inputSeq = fetchSequence($seqFile, $dataDir); - } + $inputSeq = fetchSequence($seqFile, $dataDir); ## the user has not provided a sequence id, however, the refspec is determined. - if($seqId eq '' && !$assembly) { + if($seqId eq '') { my $besthit; if (!$blast){ ## a refspec has been determined @@ -1445,9 +1353,8 @@ sub checkOptions { #### checking for the min and max distance for the core set compilation #### omit this check, if the option reuseCore has been selected (added 2019-02-04) $optbreaker = 0; - if (!$coreex and !$assembly) { + if (!$coreex) { my $node; - #print "Testing coreex assembly\n"; $node = $db->get_taxon(-taxonid => $refTaxa{$refSpec}); $node->name('supplied', $refSpec); if (lc($maxDist) eq "root"){ @@ -2709,7 +2616,7 @@ sub initialCheck { } } # check weight_dir - if ($fasoff != 1 && !$assembly) { + if ($fasoff != 1) { my %seen; my @allTaxa = grep( !$seen{$_}++, @genomeDir, @blastDir); my @notFolder; diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 97ec269..eb9dc41 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -737,11 +737,11 @@ def main(): optional.add_argument('--msaTool', help='Choose between mafft-linsi or muscle for the multiple sequence alignment. 
DEFAULT: muscle', choices=['mafft-linsi', 'muscle'], action='store', default='muscle') optional.add_argument('--checkCoorthologsRef', help='During the final ortholog search, accept an ortholog also when its best hit in the reverse search is not the core ortholog itself, but a co-ortholog of it', action='store_true', default=False) optional.add_argument('--scoringmatrix', help='Choose a scoring matrix for the distance criteria used by the option --checkCoorthologsRef. DEFAULT: blosum62', choices=['identity', 'blastn', 'trans', 'benner6', 'benner22', 'benner74', 'blosum100', 'blosum30', 'blosum35', 'blosum40', 'blosum45', 'blosum50', 'blosum55', 'blosum60', 'blosum62', 'blosum65', 'blosum70', 'blosum75', 'blosum80', 'blosum85', 'blosum90', 'blosum95', 'feng', 'fitch', 'genetic', 'gonnet', 'grant', 'ident', 'johnson', 'levin', 'mclach', 'miyata', 'nwsgappep', 'pam120', 'pam180', 'pam250', 'pam30', 'pam300', 'pam60', 'pam90', 'rao', 'risler', 'structure'], action='store', default='blosum62') - optional.add_argument('--coreTaxa', help='List of core taxa used during --strict', action='store', default='') + optional.add_argument('--coreTaxa', help='List of core taxa used during --strict', action='store', nargs="+", default=[]) optional.add_argument('--filter', help='Switch the low complexity filter for the blast search on.', action='store', default='no') optional.add_argument('--fasoff', help='Turn OFF FAS support', action='store_true', default=False) optional.add_argument('--pathFile', help='Config file contains paths to data folder (in yaml format)', action='store', default='') - optional.add_argument('--searchTaxon', help='Search Taxon name', action='store', default='') + optional.add_argument('--searchTaxa', help='Search Taxon name', action='store', nargs="+", default=[]) optional.add_argument('--silent', help='Output will only be written into the log file', action='store_true', default=False) optional.add_argument('--debug', help='Stdout and Stderr from fdog.assembly 
and every used tool will be printed', action='store_true', default=False) optional.add_argument('--force', help='Overwrite existing output files', action='store_true', default=False) @@ -776,12 +776,8 @@ def main(): msaTool = args.msaTool matrix = args.scoringmatrix taxa = args.coreTaxa - if taxa == '': - taxa =[] - else: - taxa = taxa.split(",") fasoff = args.fasoff - searchTaxon = args.searchTaxon + searchTaxa = args.searchTaxa silent = args.silent debug = args.debug force = args.force @@ -816,7 +812,6 @@ def main(): except: dataPath = 'config' - if out == '': out = os.getcwd() else: @@ -854,7 +849,6 @@ def main(): assemblyDir = dataPath + '/assembly_dir/' check_path(assemblyDir) - try: f = open(out + "/fdog.log", "a+") except FileNotFoundError: @@ -869,8 +863,15 @@ def main(): sys.stdout = Logger(f) ########################### other variables ################################ - - assembly_names = os.listdir(assemblyDir) + if searchTaxa == []: + assembly_names = os.listdir(assemblyDir) + else: + assembly_names = os.listdir(assemblyDir) + for Taxon in searchTaxa: + if Taxon not in assembly_names: + print("Taxon %s is not in the assembly_dir" % Taxon) + sys.exit() + assembly_names = searchTaxa ################################# paths #################################### @@ -924,170 +925,48 @@ def main(): group_computation_time_end = time.time() time_group = group_computation_time_end - group_computation_time_start - searchBool = False - - if searchTaxon == '': - ortholog_sequences = [] - time_ortholog_start = time.time() - if parallel == True: - calls = [] - cpus = mp.cpu_count() - pool = mp.Pool(cpus) - for asName in assembly_names: - calls.append([asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs]) - - results = (pool.imap_unordered(ortholog_search, calls)) - 
pool.close() - pool.join() - for i in results: - ortholog_sequences.append(i) - else: - for asName in assembly_names: - args = [asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs] - reciprocal_sequences, candidatesOutFile = ortholog_search(args) - ortholog_sequences.append([reciprocal_sequences, candidatesOutFile]) - - orthologsOutFile = out + "/" + group + ".extended.fa" - time_ortholog_end = time.time() - time_ortholog = time_ortholog_end - time_ortholog_start - if taxa == []: - taxa = [fdog_ref_species] - addRef(orthologsOutFile, fasta_path, taxa) - addSeq(orthologsOutFile, ortholog_sequences) - refBool = True - mappingFile = out + "/tmp/" + group + ".mapping.txt" + ###################### ortholog search ##################################### + ortholog_sequences = [] + time_ortholog_start = time.time() + if parallel == True: + ##################### parallel compuataion ############################# + calls = [] + cpus = mp.cpu_count() + pool = mp.Pool(cpus) + for asName in assembly_names: + calls.append([asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs]) + results = (pool.imap_unordered(ortholog_search, calls)) + pool.close() + pool.join() + for i in results: + ortholog_sequences.append(i) else: - #################### fDOG assembly computation for all species ############# + ###################### computation species per species ################ for asName in assembly_names: - if searchBool == True: - break - if searchTaxon != '' and searchBool == False: - asName = searchTaxon - searchBool = True - - ################### path definitions ################################### - 
- cmd = 'mkdir ' + out + '/tmp/' + asName - starting_subprocess(cmd, 'silent') - tmp_path = out + "tmp/" + asName + "/" - candidatesOutFile = tmp_path + group + ".candidates.fa" - if searchTaxon != '': - orthologsOutFile = out + "/" + group + "_" + asName + ".extended.fa" - fasOutFile = out + "/" + group + "_" + asName - mappingFile = tmp_path + group + "_" + asName + ".mapping.txt" - else: - orthologsOutFile = out + "/" + group + ".extended.fa" - fasOutFile = out + "/" + group - mappingFile = out + "/tmp/" + group + ".mapping.txt" - - print("Searching in species " + asName + "\n") - assembly_path = assemblyDir + "/" + asName + "/" + asName + ".fa" - db_path = assemblyDir + "/" + asName + "/blast_dir/" + asName + ".fa" - - ######################## tBLASTn ########################################### - #checks if data base exists already - db_check = searching_for_db(db_path) - if db_check == 0: - print("Creating a blast data base...") - cmd = 'makeblastdb -in ' + assembly_path + ' -dbtype nucl -parse_seqids -out ' + db_path - starting_subprocess(cmd, mode) - print("\t ...finished \n") - - #makes a tBLASTn search against database - #codon table argument [-db_gencode int_value], table available ftp://ftp.ncbi.nih.gov/entrez/misc/data/gc.prt - print("Starting tBLASTn search...") - cmd = 'tblastn -db ' + db_path + ' -query ' + consensus_path + ' -outfmt "6 sseqid sstart send evalue qstart qend score " -evalue ' + str(evalue) + ' -out ' + tmp_path + '/blast_results.out' - exit_code = starting_subprocess(cmd, mode, 3600) - if exit_code == 1: - print("The tblastn search takes too long. 
Exciting ...") - f.close() - cleanup(tmp, tmp_folder) - sys.exit() - else: - print("\t ...finished") - - ################### search for candidate regions and extract seq ########### - # parse blast and filter for candiate regions - regions, number_regions = candidate_regions(average_intron_length, evalue, tmp_path) - - if regions == 0: - #no candidat region are available, no ortholog can be found - print("No candidate region found!\n") - if refBool == True: - continue - else: - taxa = [fdog_ref_species] - reciprocal_sequences = 0 - else: - print(str(number_regions) + " candiate regions were found.\n") - extract_seq(regions, db_path, tmp_path, mode) - - ############### make Augustus PPX search ################################### - - print("Starting augustus ppx ...") - augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, augustus_ref_species, asName, group, tmp_path, mode) - print("\t ...finished \n") - - ################# backward search to filter for orthologs################### - if int(os.path.getsize(candidatesOutFile)) <= 0: - print("No genes found at candidate region\n") - if searchTaxon == '' and refBool == True: - continue - else: - reciprocal_sequences = 0 - taxa = [fdog_ref_species] - else: - reciprocal_sequences, taxa = backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, evalue, taxa, searchTool, checkCoorthologs, msaTool, matrix, dataPath, filter, tmp_path, mode) - - - ################## checking accepted genes for co-orthologs ################ - if reciprocal_sequences == 0: - if regions != 0: - print("No ortholog fulfilled the reciprocity criteria \n") - if searchTaxon == '' and refBool == True: - continue - else: - reciprocal_sequences = 0 - else: - reciprocal_sequences = coorthologs(reciprocal_sequences, tmp_path, candidatesOutFile, fasta_path, fdog_ref_species, msaTool, matrix) - - ################ add sequences to extended.fa in the output folder########## - - addSequences(reciprocal_sequences, 
candidatesOutFile, fasta_path, orthologsOutFile, group, taxa, refBool, tmp_path) - refBool = True + args = [asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs] + reciprocal_sequences, candidatesOutFile = ortholog_search(args) + ortholog_sequences.append([reciprocal_sequences, candidatesOutFile]) + + ################## preparing output ######################################## + orthologsOutFile = out + "/" + group + ".extended.fa" + time_ortholog_end = time.time() + time_ortholog = time_ortholog_end - time_ortholog_start + if taxa == []: + taxa = [fdog_ref_species] + if append == True: + addSeq(orthologsOutFile, ortholog_sequences) + else: + addRef(orthologsOutFile, fasta_path, taxa) + addSeq(orthologsOutFile, ortholog_sequences) + mappingFile = out + "/tmp/" + group + ".mapping.txt" - ############### make Annotation with FAS ################################### - # if we want to search in only one Taxon - if searchTaxon != '' and fasoff == False: - fas = time.time() - print("Calculating FAS scores ...") - fas_seed_id = createFasInput(orthologsOutFile, mappingFile) - # bug in calcFAS when using --tsv, have to wait till it's fixed before I can use the option - cmd = 'mkdir ' + tmp_path + 'anno_dir' - starting_subprocess(cmd, 'silent') - cmd = 'fas.run --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group + '_' + asName - starting_subprocess(cmd, 'silent') - clean_fas(fasOutFile + "_forward.domains", 'domains') - clean_fas(fasOutFile + "_reverse.domains", 'domains') - clean_fas(fasOutFile + ".phyloprofile", 'phyloprofile') - print("\t ...finished \n") - - - #if we searched in more than one Taxon and no 
ortholog was found - if refBool == False and searchTaxon == '': - print("No orthologs found. Exciting ...") - f.close() - cleanup(tmp, tmp_folder) - return 1 - #if we searched in more than one taxon - if fasoff == False and searchTaxon == '': + if fasoff == False: fas = time.time() print("Calculating FAS scores ...") tmp_path = out + '/tmp/' fas_seed_id = createFasInput(orthologsOutFile, mappingFile) - # bug in calcFAS when using --tsv, have to wait till it's fixed before I can use the option cmd = 'fas.run --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --tsv --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group starting_subprocess(cmd, 'silent') clean_fas(out + group + "_forward.domains", 'domains') @@ -1100,17 +979,9 @@ def main(): print("fDOG-Assembly finished completely in " + str(end-start) + "seconds.") print("Group preparation: %s \t Ortholog search: %s \t Fas: %s \n" % (str(time_group), str(time_ortholog), str(time_fas))) sys.stdout = sys.__stdout__ - if searchTaxon != '': - f.close() - cleanup(tmp, tmp_folder) - else: - f.close() - cleanup(tmp, tmp_folder) - - - #print(group + "\t" + str(end-fas) + "\t" + str(end-start)) - + f.close() + cleanup(tmp, tmp_folder) if __name__ == '__main__': main() diff --git a/fdog/mergeAssemblyOutput.py b/fdog/mergeAssemblyOutput.py deleted file mode 100644 index 1606b1d..0000000 --- a/fdog/mergeAssemblyOutput.py +++ /dev/null @@ -1,124 +0,0 @@ -# -*- coding: utf-8 -*- - -####################################################################### -# Copyright (C) 2020 Vinh Tran -# -# This script is used to merge all output files (.extended.fa, .phyloprofile, -# _forward.domains, _reverse.domains) in a given directory into one file each. 
-# -# This script is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for -# more details -# -# Contact: hannah.muelbaier@stud.uni-frankfurt.de -# -####################################################################### - -import sys -import os -from os import listdir as ldir -import argparse -from pathlib import Path - -def main(): - version = '0.0.1' - parser = argparse.ArgumentParser(description='You are running fdog.mergeAssemblyOutput version ' + str(version) + '.') - parser.add_argument('-i','--input', help='Input directory, where all single output (.extended.fa, .phyloprofile, _forward.domains, _reverse.domains) can be found', - action='store', default='', required=True) - parser.add_argument('-o','--output', help='Output name', action='store', default='', required=True) - parser.add_argument('-c', '--cleanup', help='Deletes the merged output files from fDOG', action='store_true', default=False) - args = parser.parse_args() - - directory = args.input - out = args.output - cleanup = args.cleanup - if not os.path.exists(os.path.abspath(directory)): - sys.exit('%s not found' % directory) - else: - directory = os.path.abspath(directory) - - phyloprofile = None - set_phylo = set() - domains_0 = None - set_domains_f = set() - domains_1 = None - set_domains_r = set() - ex_fasta = None - set_fasta = set() - header_bool = False - for infile in ldir(directory): - if infile.endswith('.phyloprofile') and not infile == out + '.phyloprofile': - if not phyloprofile: - phyloprofile = open(out + '.phyloprofile', 'w') - phyloprofile.write('geneID\tncbiID\torthoID\tFAS_F\tFAS_B\n') - with open(directory + '/' + infile, 'r') as reader: - lines = reader.readlines() - for line in lines: - if line != 'geneID\tncbiID\torthoID\tFAS_F\tFAS_B\n' and line not in set_phylo: - phyloprofile.write(line) - if len(lines) > 1: - 
set_phylo = set(lines) - if cleanup == True: - os.remove(directory + '/' + infile) - elif infile.endswith('_forward.domains') and not infile == out + '_forward.domains': - if not domains_0: - domains_0 = open(out + '_forward.domains', 'w') - with open(directory + '/' + infile, 'r') as reader: - lines = reader.readlines() - for line in lines: - if line not in set_domains_f: - domains_0.write(line) - if len(lines) > 1: - set_domains_f = set(lines) - if cleanup == True: - os.remove(directory + '/' + infile) - elif infile.endswith('_reverse.domains') and not infile == out + '_reverse.domains': - if not domains_1: - domains_1 = open(out + '_reverse.domains', 'w') - with open(directory + '/' + infile, 'r') as reader: - lines = reader.readlines() - for line in lines: - if line not in set_domains_r: - domains_1.write(line) - if len(lines) > 1: - set_domains_r = set(lines) - if cleanup == True: - os.remove(directory + '/' + infile) - elif infile.endswith('.extended.fa') and not infile == out + '.extended.fa': - if not ex_fasta: - ex_fasta = open(out + '.extended.fa', 'w') - with open(directory + '/' + infile, 'r') as reader: - lines = reader.readlines() - header = set() - #print(set_fasta) - for line in lines: - if line[0] == ">": - header.add(line) - if line not in set_fasta: - ex_fasta.write(line) - header_bool = True - else: - header_bool = False - else: - if header_bool == True: - ex_fasta.write(line) - set_fasta = header - if cleanup == True: - os.remove(directory + '/' +infile) - elif infile.endswith('.tsv'): - os.remove(directory + '/' + infile) - - if phyloprofile: - phyloprofile.close() - if domains_0: - domains_0.close() - if domains_1: - domains_1.close() - if ex_fasta: - ex_fasta.close() - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/fdog/runMulti.py b/fdog/runMulti.py index c19b598..c19b0ff 100644 --- a/fdog/runMulti.py +++ b/fdog/runMulti.py @@ -48,8 +48,7 @@ def prepare(args, step): coreOnly, reuseCore, coreTaxa, coreStrict, 
CorecheckCoorthologsRef, coreRep, coreHitLimit, distDeviation, fasoff, countercheck, coreFilter, minScore, strict, checkCoorthologsRef, rbh, rep, ignoreDistance, lowComplexityFilter, evalBlast, evalHmmer, evalRelaxfac, hitLimit, autoLimit, scoreThreshold, scoreCutoff, aligner, local, glocal, searchTaxa, - cpu, hyperthread, checkOff, debug, silent, assembly, assemblyFile, augustusRefSpec, avIntron, lengthExtension, searchTool, matrix) = args - + cpu, hyperthread, checkOff, debug, silent) = args mute = False if step == 'core': @@ -71,9 +70,7 @@ def prepare(args, step): fasArgs = [fasoff, countercheck, coreFilter, minScore] orthoArgs = [strict, checkCoorthologsRef, rbh, rep, ignoreDistance, lowComplexityFilter, evalBlast, evalHmmer, evalRelaxfac, hitLimit, autoLimit, scoreThreshold, scoreCutoff, aligner, local, glocal, searchTaxa] otherArgs = [cpu, hyperthread, checkOff, debug, True] - assemblyArgs = [assembly, assemblyFile, augustusRefSpec, avIntron, lengthExtension, searchTool, matrix] - return(basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, assemblyArgs, mute) - + return(basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, mute) def getSeedName(seedFile): seqName = seedFile.split('.')[0] @@ -108,10 +105,9 @@ def compileCore(options, seeds, inFol, cpu, outpath): for seed in seeds: seqFile = [inFol + '/' + seed] seqName = getSeedName(seed) - if not os.path.exists('%s/core_orthologs/%s/hmm_dir/%s.hmm' % (outpath, seqName, seqName)): (basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, mute) = prepare(seqFile + [seqName] + options, 'core') - coreCompilationJobs.append([basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, assemblyArgs, mute]) + coreCompilationJobs.append([basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, mute]) if len(coreCompilationJobs) > 0: pool = mp.Pool(cpu) coreOut = [] @@ -133,7 +129,7 @@ def searchOrtho(options, seeds, inFol, cpu, outpath): for seed in 
seeds: seqFile = [inFol + '/' + seed] seqName = getSeedName(seed) - (basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, assemblyArgs, mute) = prepare(seqFile + [seqName] + options, 'ortholog') + (basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, mute) = prepare(seqFile + [seqName] + options, 'ortholog') if mute == True: print(seed) else: @@ -295,14 +291,6 @@ def main(): optional.add_argument('--debug', help='Set this flag to obtain more detailed information about the programs actions', action='store_true', default=False) optional.add_argument('--silentOff', help='Show more output to terminal', action='store_true', default=False) - assembly_options = parser.add_argument_group('Assembly options') - assembly_options.add_argument('--assembly', help='Turn on support of assembly input files',action='store_true', default=False) - assembly_options.add_argument('--assemblyFile', help='Input file containing the assembly seqeunce', action='store', default='') - assembly_options.add_argument('--augustusRefSpec', help='augustus reference species', action='store', default='') - assembly_options.add_argument('--avIntron', help='average Intron length of the assembly species', action='store', default=5000, type=int) - assembly_options.add_argument('--lengthExtension', help='length extension of the candidate region', action='store', default=5000, type=int) - assembly_options.add_argument('--searchTool', help='Choose between BLAST or Diamond as a alignemnt search tool. DEFAULT: BLAST', choices=['blast', 'diamond'], action='store', default='blast') - assembly_options.add_argument('--scoringmatrix', help ='Choose a scoring matrix for the distance criteria used by the option --checkCoorthologsRef. 
DEFAULT: blosum62', choices=['identity', 'blastn', 'trans', 'benner6', 'benner22', 'benner74', 'blosum100', 'blosum30', 'blosum35', 'blosum40', 'blosum45', 'blosum50', 'blosum55', 'blosum60', 'blosum62', 'blosum65', 'blosum70', 'blosum75', 'blosum80', 'blosum85', 'blosum90', 'blosum95', 'feng', 'fitch', 'genetic', 'gonnet', 'grant', 'ident', 'johnson', 'levin', 'mclach', 'miyata', 'nwsgappep', 'pam120', 'pam180', 'pam250', 'pam30', 'pam300', 'pam60', 'pam90', 'rao', 'risler', 'structure'], action='store', default='blosum62') ### get arguments args = parser.parse_args() @@ -379,15 +367,6 @@ def main(): else: silent = True - #fdog_goes_assembly arguments - assembly = args.assembly - assemblyFile = args.assemblyFile - augustusRefSpec = args.augustusRefSpec - avIntron = args.avIntron - lengthExtension = args.lengthExtension - searchTool = args.searchTool - matrix = args.scoringmatrix - ### check fas if not fasoff: try: @@ -472,7 +451,7 @@ def main(): coreOnly, reuseCore, coreTaxa, coreStrict, CorecheckCoorthologsRef, coreRep, coreHitLimit, distDeviation, fasoff, countercheck, coreFilter, minScore, strict, checkCoorthologsRef, rbh, rep, ignoreDistance, lowComplexityFilter, evalBlast, evalHmmer, evalRelaxfac, hitLimit, autoLimit, scoreThreshold, scoreCutoff, aligner, local, glocal, searchTaxa, - cpu, hyperthread, checkOff, debug, silent, assembly, assemblyFile, augustusRefSpec, avIntron, lengthExtension, searchTool, matrix] + cpu, hyperthread, checkOff, debug, silent] ### START Path(outpath).mkdir(parents=True, exist_ok=True) diff --git a/fdog/runSingle.py b/fdog/runSingle.py index c65300f..c4abb82 100644 --- a/fdog/runSingle.py +++ b/fdog/runSingle.py @@ -65,13 +65,13 @@ def getfdogInfo(fdogPath, infoType): exit('%s not found' % (fdogPath + '/bin/oneSeq.pl')) def runSingle(args): - (basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, assemblyArgs, mute) = args + (basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, mute) = args # basic 
command (fdogPath, seqFile, seqName, refspec, minDist, maxDist, coreOrth) = basicArgs cmd = 'perl %s/bin/oneSeq.pl -seqFile=%s -seqName=%s -refspec=%s' % (fdogPath, seqFile, seqName, refspec) # add paths - (outpath, hmmpath, blastpath, searchpath, weightpath, assemblypath) = pathArgs - cmd = cmd + ' -outpath=%s -hmmpath=%s -blastpath=%s -searchpath=%s -weightpath=%s -assemblypath=%s' % (outpath, hmmpath, blastpath, searchpath, weightpath, assemblypath) + (outpath, hmmpath, blastpath, searchpath, weightpath) = pathArgs + cmd = cmd + ' -outpath=%s -hmmpath=%s -blastpath=%s -searchpath=%s -weightpath=%s' % (outpath, hmmpath, blastpath, searchpath, weightpath) # add other I/O options (append, force, noCleanup, group, blast, db) = ioArgs if append == True: @@ -163,28 +163,7 @@ def runSingle(args): cmd = cmd + ' -debug' if silent == True: cmd = cmd + ' -silent' - # add assembly options - (assembly, assemblyFile, augustusRefSpec, avIntron, lengthExtension, searchTool, matrix, dataPath) = assemblyArgs - if assembly == True: - cmd = cmd + ' -assembly' - cmd = cmd + ' -reuseCore' - if not augustusRefSpec == '': - cmd = cmd + ' -augustusRefSpec=%s' % augustusRefSpec - else: - sys.exit('An augutus reference species is requiered by using the option --assembly') - if not avIntron == '': - cmd = cmd + ' -avIntron=%s' % avIntron - if not lengthExtension == '': - cmd = cmd + ' -lengthExtension=%s' % lengthExtension - if not assemblyFile == '': - cmd = cmd + ' -assemblyFile=%s' % assemblyFile - if not searchTool == '': - cmd = cmd + ' -searchTool=%s' % searchTool - if not matrix == '': - cmd = cmd + ' -scoringmatrix=%s' % matrix - if not dataPath == '': - cmd = cmd + ' -dataPath=%s' % dataPath - #print(cmd) + # print(cmd) if mute == True: cmd = cmd + ' > /dev/null 2>&1' try: @@ -211,8 +190,6 @@ def main(): optional_paths.add_argument('--searchpath', help='Path for the search taxa directory', action='store', default='') optional_paths.add_argument('--weightpath', help='Path for the 
pre-calculated feature annotion directory', action='store', default='') optional_paths.add_argument('--pathFile', help='Config file contains paths to data folder (in yaml format)', action='store', default='') - optional_paths.add_argument('--assemblypath', help='Path for the assembly directory', action='store', default='') - addtionalIO = parser.add_argument_group('Other I/O options') addtionalIO.add_argument('--append', help='Append the output to existing output files', action='store_true', default=False) @@ -295,14 +272,6 @@ def main(): optional.add_argument('--debug', help='Set this flag to obtain more detailed information about the programs actions', action='store_true', default=False) optional.add_argument('--silentOff', help='Show more output to terminal', action='store_true', default=False) - assembly_options = parser.add_argument_group('Assembly options') - assembly_options.add_argument('--assembly', help='Turn on support of assembly input files',action='store_true', default=False) - assembly_options.add_argument('--assemblyFile', help='Input file containing the assembly seqeunce', action='store', default='') - assembly_options.add_argument('--augustusRefSpec', help='augustus reference species', action='store', default='') - assembly_options.add_argument('--avIntron', help='average Intron length of the assembly species', action='store', default=5000, type=int) - assembly_options.add_argument('--lengthExtension', help='length extension of the candidate region', action='store', default=5000, type=int) - assembly_options.add_argument('--searchTool', help='Choose between BLAST or Diamond as a alignemnt search tool. DEFAULT: BLAST', choices=['blast', 'diamond'], action='store', default='blast') - assembly_options.add_argument('--scoringmatrix', help ='Choose a scoring matrix for the distance criteria used by the option --checkCoorthologsRef. 
DEFAULT: blosum62', choices=['identity', 'blastn', 'trans', 'benner6', 'benner22', 'benner74', 'blosum100', 'blosum30', 'blosum35', 'blosum40', 'blosum45', 'blosum50', 'blosum55', 'blosum60', 'blosum62', 'blosum65', 'blosum70', 'blosum75', 'blosum80', 'blosum85', 'blosum90', 'blosum95', 'feng', 'fitch', 'genetic', 'gonnet', 'grant', 'ident', 'johnson', 'levin', 'mclach', 'miyata', 'nwsgappep', 'pam120', 'pam180', 'pam250', 'pam30', 'pam300', 'pam60', 'pam90', 'rao', 'risler', 'structure'], action='store', default='blosum62') ### get arguments args = parser.parse_args() @@ -322,7 +291,6 @@ def main(): searchpath = args.searchpath weightpath = args.weightpath pathFile = args.pathFile - assemblypath = args.assemblypath # other I/O arguments append = args.append @@ -378,15 +346,6 @@ def main(): else: silent = True - #fdog_goes_assembly arguments - assembly = args.assembly - assemblyFile = args.assemblyFile - augustusRefSpec = args.augustusRefSpec - avIntron = args.avIntron - lengthExtension = args.lengthExtension - searchTool = args.searchTool - matrix = args.scoringmatrix - ### get fdog and data path dataPath = '' fdogPath = os.path.realpath(__file__).replace('/runSingle.py','') @@ -434,30 +393,19 @@ def main(): except: sys.exit('weightpath not found in %s' % pathFile) - if assemblypath == '': - assemblypath = dataPath + '/assembly_dir' - if dataPath == 'config': - try: - assemblypath = cfg['assemblypath'] - except: - sys.exit('assemblypath not found in %s' % pathFile) - if assembly == True: - searchpath = assemblypath - ### check input arguments seqFile, hmmpath, blastpath, searchpath, weightpath = checkInput([fdogPath, seqFile, refspec, outpath, hmmpath, blastpath, searchpath, weightpath]) # group arguments basicArgs = [fdogPath, seqFile, seqName, refspec, minDist, maxDist, coreOrth] ioArgs = [append, force, noCleanup, group, blast, db] - pathArgs = [outpath, hmmpath, blastpath, searchpath, weightpath, assemblypath] + pathArgs = [outpath, hmmpath, blastpath, 
searchpath, weightpath] coreArgs = [coreOnly, reuseCore, coreTaxa, coreStrict, CorecheckCoorthologsRef, coreRep, coreHitLimit, distDeviation] fasArgs = [fasoff, countercheck, coreFilter, minScore] orthoArgs = [strict, checkCoorthologsRef, rbh, rep, ignoreDistance, lowComplexityFilter, evalBlast, evalHmmer, evalRelaxfac, hitLimit, autoLimit, scoreThreshold, scoreCutoff, aligner, local, glocal, searchTaxa] otherArgs = [cpu, hyperthread, checkOff, debug, silent] - assemblyArgs = [assembly, assemblyFile, augustusRefSpec, avIntron, lengthExtension, searchTool, matrix, dataPath] ### run fdog - runSingle([basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, assemblyArgs, False]) + runSingle([basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, False]) if __name__ == '__main__': main() From 49a430b913970276efa3c69af3ca9007f7b5e3c9 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Mon, 11 Oct 2021 15:56:40 +0200 Subject: [PATCH 126/229] testing addSeq function --- fdog/fDOGassembly.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index eb9dc41..3940f04 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -484,6 +484,8 @@ def addSeq(output, seq_list): #print(item) candidate_fasta = item[1] sequenceIds = item[0] + print(sequenceIds) + print(type(sequenceIds)) if sequenceIds == 0 or sequenceIds == []: pass seq_records_candidate = readFasta(candidate_fasta) From e18872b31a0c5b013a2c7bdece18674c3b8c5974 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Mon, 11 Oct 2021 16:02:13 +0200 Subject: [PATCH 127/229] bug fix in addSeq function --- fdog/fDOGassembly.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 3940f04..71beafc 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -484,10 +484,8 @@ def addSeq(output, seq_list): #print(item) candidate_fasta = item[1] sequenceIds = item[0] - print(sequenceIds) - 
print(type(sequenceIds)) if sequenceIds == 0 or sequenceIds == []: - pass + continue seq_records_candidate = readFasta(candidate_fasta) seq_records_candidate = list(seq_records_candidate) for entry_candidate in seq_records_candidate: From b4d1e0c3f8fb09ca0214c45ff40789e3b56d64b1 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Sat, 16 Oct 2021 14:40:39 +0200 Subject: [PATCH 128/229] bug fix in ortholog search function --- fdog/fDOGassembly.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 71beafc..7c45233 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -650,8 +650,7 @@ def ortholog_search(args): time_tblastn = time_tblastn_end - time_tblastn_start if exit_code == 1: print("The tblastn search takes too long for species %s. Exciting ..." % asName) - f.close() - cleanup(tmp, tmp_folder) + #cleanup(tmp, tmp_folder) sys.exit() #else: #print("\t ...finished") From e85fd1c561df192ebdcd15ddd0c84336baad327f Mon Sep 17 00:00:00 2001 From: mueli94 Date: Sun, 17 Oct 2021 12:24:31 +0200 Subject: [PATCH 129/229] bug fix in ortholog search if tblastn takes to long --- fdog/fDOGassembly.py | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 7c45233..0a9df8f 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -244,7 +244,7 @@ def augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, aug output.write(line) sequence_file.close() except FileNotFoundError: - print("No gene found in region with ID:" + name + " , continuing with next region") + print("No gene found in region with ID" + name + " in species " + ass_name + " , continuing with next region") output.close() def searching_for_db(assembly_path): @@ -315,7 +315,12 @@ def checkCoOrthologs(candidate_name, best_hit, ref, fdog_ref_species, candidates #print("mafft-linsi") os.system('mafft --maxiterate 1000 --localpair --anysymbol 
--quiet ' + output_file + ' > ' + aln_file) - distances = get_distance_biopython(aln_file, matrix) + try: + distances = get_distance_biopython(aln_file, matrix) + except ValueError: + print("Failure in distance computation, Candidate %s will be rejected" % candidate_name) + return 0, "NaN", "NaN" + distance_hit_query = distances[best_hit, candidate_name] distance_ref_hit = distances[best_hit, ref] @@ -374,7 +379,8 @@ def backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, eva print("\t Distance query - blast hit: %6.4f, Distance blast hit - reference: %6.4f\tAccepting\n"%(distance_hit_query, distance_ref_hit)) if mode == "debug" else "" orthologs.append(gene) elif co_orthologs_result == 0: - print("\t Distance query - blast hit: %6.4f, Distance blast hit - reference: %6.4f\tRejecting\n"%(distance_hit_query, distance_ref_hit)) if mode == "debug" else "" + if distance_ref_hit != "NaN": + print("\t Distance query - blast hit: %6.4f, Distance blast hit - reference: %6.4f\tRejecting\n"%(distance_hit_query, distance_ref_hit)) if mode == "debug" else "" else: print("\tnothitting\n") if mode == "debug" else "" elif (gene_name == old_name) and float(evalue) == min and gene_name not in orthologs: @@ -629,7 +635,7 @@ def ortholog_search(args): fasOutFile = out + "/" + group #mappingFile = out + "/tmp/" + group + ".mapping.txt" - print("Searching in species " + asName + "\n") + sys.stdout.write("Searching in species " + asName + "\n") assembly_path = assemblyDir + "/" + asName + "/" + asName + ".fa" db_path = assemblyDir + "/" + asName + "/blast_dir/" + asName + ".fa" db_check = searching_for_db(db_path) @@ -649,9 +655,10 @@ def ortholog_search(args): time_tblastn_end = time.time() time_tblastn = time_tblastn_end - time_tblastn_start if exit_code == 1: - print("The tblastn search takes too long for species %s. Exciting ..." % asName) + sys.stdout.write("The tblastn search takes too long for species %s. Exciting ..." 
% asName) #cleanup(tmp, tmp_folder) - sys.exit() + #sys.exit() + return [], candidatesOutFile #else: #print("\t ...finished") print("Time tblastn %s in species %s" % (str(time_tblastn), asName)) @@ -659,7 +666,7 @@ def ortholog_search(args): regions, number_regions = candidate_regions(average_intron_length, evalue, tmp_path) if regions == 0: #no candidat region are available, no ortholog can be found - print("No candidate region found for species %s!\n" % asName) + sys.stdout.write("No candidate region found for species %s!\n" % asName) return [], candidatesOutFile else: @@ -684,7 +691,7 @@ def ortholog_search(args): if reciprocal_sequences == 0: if regions != 0: - print("No ortholog fulfilled the reciprocity criteria for species %s.\n" % asName) + sys.stdout.write("No ortholog fulfilled the reciprocity criteria for species %s.\n" % asName) return [], candidatesOutFile else: reciprocal_sequences = coorthologs(reciprocal_sequences, tmp_path, candidatesOutFile, fasta_path, fdog_ref_species, msaTool, matrix) @@ -976,7 +983,7 @@ def main(): end = time.time() time_fas = end - fas print("fDOG-Assembly finished completely in " + str(end-start) + "seconds.") - print("Group preparation: %s \t Ortholog search: %s \t Fas: %s \n" % (str(time_group), str(time_ortholog), str(time_fas))) + print("Group preparation: %s \t Ortholog search: %s \t FAS: %s \n" % (str(time_group), str(time_ortholog), str(time_fas))) sys.stdout = sys.__stdout__ f.close() From 1f9f736325253c08f33d60bc787136c10f6ef303 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Mon, 18 Oct 2021 10:23:16 +0200 Subject: [PATCH 130/229] updated input options --- fdog/fDOGassembly.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 0a9df8f..e40701b 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -727,7 +727,7 @@ def main(): required.add_argument('--gene', help='Core_ortholog group name. 
Folder inlcuding the fasta file, hmm file and aln file has to be located in core_orthologs/', action='store', default='', required=True) required.add_argument('--augustusRefSpec', help='augustus reference species', action='store', default='', required=True) - required.add_argument('--refSpec', help='Reference taxon for fDOG.', action='store', nargs="+", default='', required=True) + required.add_argument('--refSpec', help='Reference taxon/taxa for fDOG.', action='store', nargs="+", default='', required=True) ################## optional arguments ###################################### optional = parser.add_argument_group('Optional arguments') optional.add_argument('--avIntron', help='average intron length of the assembly species in bp (default: 50000)',action='store', default=50000, type=int) @@ -744,10 +744,10 @@ def main(): optional.add_argument('--checkCoorthologsRef', help='During the final ortholog search, accept an ortholog also when its best hit in the reverse search is not the core ortholog itself, but a co-ortholog of it', action='store_true', default=False) optional.add_argument('--scoringmatrix', help='Choose a scoring matrix for the distance criteria used by the option --checkCoorthologsRef. 
DEFAULT: blosum62', choices=['identity', 'blastn', 'trans', 'benner6', 'benner22', 'benner74', 'blosum100', 'blosum30', 'blosum35', 'blosum40', 'blosum45', 'blosum50', 'blosum55', 'blosum60', 'blosum62', 'blosum65', 'blosum70', 'blosum75', 'blosum80', 'blosum85', 'blosum90', 'blosum95', 'feng', 'fitch', 'genetic', 'gonnet', 'grant', 'ident', 'johnson', 'levin', 'mclach', 'miyata', 'nwsgappep', 'pam120', 'pam180', 'pam250', 'pam30', 'pam300', 'pam60', 'pam90', 'rao', 'risler', 'structure'], action='store', default='blosum62') optional.add_argument('--coreTaxa', help='List of core taxa used during --strict', action='store', nargs="+", default=[]) - optional.add_argument('--filter', help='Switch the low complexity filter for the blast search on.', action='store', default='no') + #optional.add_argument('--filter', help='Switch the low complexity filter for the blast search on.', action='store', default='no') optional.add_argument('--fasoff', help='Turn OFF FAS support', action='store_true', default=False) optional.add_argument('--pathFile', help='Config file contains paths to data folder (in yaml format)', action='store', default='') - optional.add_argument('--searchTaxa', help='Search Taxon name', action='store', nargs="+", default=[]) + optional.add_argument('--searchTaxa', help='List of Taxa to search in', action='store', nargs="+", default=[]) optional.add_argument('--silent', help='Output will only be written into the log file', action='store_true', default=False) optional.add_argument('--debug', help='Stdout and Stderr from fdog.assembly and every used tool will be printed', action='store_true', default=False) optional.add_argument('--force', help='Overwrite existing output files', action='store_true', default=False) From 42e4ba122504f24a25f748c087d14aa0d199a419 Mon Sep 17 00:00:00 2001 From: Hannah Muelbaier <47216555+mueli94@users.noreply.github.com> Date: Tue, 19 Oct 2021 10:51:15 +0200 Subject: [PATCH 131/229] Update fDOG goes assembly to version 0.1.2 (#12) 
--- fdog/addTaxa.py | 15 +- fdog/addTaxon.py | 62 ++-- fdog/bin/hamstr.pl | 141 ++++----- fdog/bin/oneSeq.pl | 204 +++++-------- fdog/checkData.py | 69 +++-- fdog/fDOGassembly.py | 563 ++++++++++++++++++++++-------------- fdog/mergeAssemblyOutput.py | 124 -------- fdog/mergeOutput.py | 7 +- fdog/removefDog.py | 4 +- fdog/runMulti.py | 46 +-- fdog/runSingle.py | 74 +---- fdog/setup/install_lib.sh | 3 - fdog/setup/setup.sh | 43 +-- fdog/setup/setup_conda.sh | 25 +- setup.py | 4 +- 15 files changed, 648 insertions(+), 736 deletions(-) delete mode 100644 fdog/mergeAssemblyOutput.py diff --git a/fdog/addTaxa.py b/fdog/addTaxa.py index d392c8c..fa4a3a1 100644 --- a/fdog/addTaxa.py +++ b/fdog/addTaxa.py @@ -37,6 +37,7 @@ import re import shutil from tqdm import tqdm +from datetime import datetime def checkFileExist(file): if not os.path.exists(os.path.abspath(file)): @@ -68,20 +69,18 @@ def parseMapFile(mappingFile): try: ver = tmp[3].strip() except: - ver = 1 + ver = datetime.today().strftime('%y%m%d') #1 # print(taxName+"@"+str(taxId)+"@"+str(ver)) nameDict[fileName] = (taxName, str(taxId), str(ver)) return(nameDict) def runAddTaxon(args): - (f,n,i,o,c,v,a,cpus,replace,delete,oldFAS) = args + (f,n,i,o,c,v,a,cpus,replace,delete) = args cmd = 'fdog.addTaxon -f %s -n %s -i %s -o %s -v %s --cpus %s' % (f,n,i,o,v,cpus) if c == True: cmd = cmd + ' -c' if a == True: cmd = cmd + ' -a' - if oldFAS == True: - cmd = cmd + ' --oldFAS' if replace == True: cmd = cmd + ' --replace' if delete == True: @@ -95,7 +94,7 @@ def runAddTaxon(args): sys.exit('Problem running\n%s' % (cmd)) def main(): - version = '0.0.5' + version = '0.0.9' parser = argparse.ArgumentParser(description='You are running fdog.addTaxa version ' + str(version) + '.') required = parser.add_argument_group('required arguments') optional = parser.add_argument_group('optional arguments') @@ -105,8 +104,7 @@ def main(): action='store', default='', required=True) optional.add_argument('-o', '--outPath', help='Path to 
output directory', action='store', default='') optional.add_argument('-c', '--coreTaxa', help='Include these taxa to core taxa (i.e. taxa in blast_dir folder)', action='store_true', default=False) - optional.add_argument('-a', '--noAnno', help='Do NOT annotate these taxa using annoFAS', action='store_true', default=False) - optional.add_argument('--oldFAS', help='Use old verion of FAS (annoFAS ≤ 1.2.0)', action='store_true', default=False) + optional.add_argument('-a', '--noAnno', help='Do NOT annotate these taxa using fas.doAnno', action='store_true', default=False) optional.add_argument('--cpus', help='Number of CPUs used for annotation. Default = available cores - 1', action='store', default=0, type=int) optional.add_argument('--replace', help='Replace special characters in sequences by "X"', action='store_true', default=False) optional.add_argument('--delete', help='Delete special characters in sequences', action='store_true', default=False) @@ -128,7 +126,6 @@ def main(): outPath = os.path.abspath(outPath) noAnno = args.noAnno coreTaxa = args.coreTaxa - oldFAS = args.oldFAS cpus = args.cpus if cpus == 0: cpus = mp.cpu_count()-2 @@ -171,7 +168,7 @@ def main(): verProt = nameDict[f][2] jobs.append([ folIn + '/' + f, nameDict[f][0], nameDict[f][1], - outPath, coreTaxa, nameDict[f][2], noAnno, cpus, replace, delete, oldFAS + outPath, coreTaxa, nameDict[f][2], noAnno, cpus, replace, delete ]) if len(dupList) > 0: diff --git a/fdog/addTaxon.py b/fdog/addTaxon.py index fe0a810..f962cba 100755 --- a/fdog/addTaxon.py +++ b/fdog/addTaxon.py @@ -32,6 +32,7 @@ import multiprocessing as mp from ete3 import NCBITaxa import re +import shutil from datetime import datetime def checkFileExist(file): @@ -83,7 +84,7 @@ def runBlast(args): os.symlink(fileInGenome, fileInBlast) def main(): - version = '0.0.5' + version = '0.0.10' parser = argparse.ArgumentParser(description='You are running fdog.addTaxon version ' + str(version) + '.') required = parser.add_argument_group('required 
arguments') optional = parser.add_argument_group('optional arguments') @@ -91,10 +92,9 @@ def main(): required.add_argument('-i', '--taxid', help='Taxonomy ID of input taxon', action='store', default='', required=True, type=int) optional.add_argument('-o', '--outPath', help='Path to output directory', action='store', default='') optional.add_argument('-n', '--name', help='Acronym name of input taxon', action='store', default='', type=str) - optional.add_argument('-v', '--verProt', help='Proteome version', action='store', default=1, type=str) + optional.add_argument('-v', '--verProt', help='Proteome version', action='store', default='', type=str) optional.add_argument('-c', '--coreTaxa', help='Include this taxon to core taxa (i.e. taxa in blast_dir folder)', action='store_true', default=False) - optional.add_argument('-a', '--noAnno', help='Do NOT annotate this taxon using annoFAS', action='store_true', default=False) - optional.add_argument('--oldFAS', help='Use old verion of FAS (annoFAS ≤ 1.2.0)', action='store_true', default=False) + optional.add_argument('-a', '--noAnno', help='Do NOT annotate this taxon using fas.doAnno', action='store_true', default=False) optional.add_argument('--cpus', help='Number of CPUs used for annotation. 
Default = available cores - 1', action='store', default=0, type=int) optional.add_argument('--replace', help='Replace special characters in sequences by "X"', action='store_true', default=False) optional.add_argument('--delete', help='Delete special characters in sequences', action='store_true', default=False) @@ -119,7 +119,8 @@ def main(): noAnno = args.noAnno coreTaxa = args.coreTaxa ver = str(args.verProt) - oldFAS = args.oldFAS + if ver == '': + ver = datetime.today().strftime('%y%m%d') cpus = args.cpus if cpus == 0: cpus = mp.cpu_count()-2 @@ -135,6 +136,13 @@ def main(): specName = name+'@'+taxId+'@'+ver print('Species name\t%s' % specName) + ### remove old folder if force is set + if force: + if os.path.exists(outPath + '/genome_dir/' + specName): + shutil.rmtree(outPath + '/genome_dir/' + specName) + if os.path.exists(outPath + '/blast_dir/' + specName): + shutil.rmtree(outPath + '/blast_dir/' + specName) + ### create file in genome_dir print('Parsing FASTA file...') Path(outPath + '/genome_dir').mkdir(parents = True, exist_ok = True) @@ -147,25 +155,30 @@ def main(): f = open(specFile, 'w') index = 0 modIdIndex = 0 - longId = 'no' + # longId = 'no' tmpDict = {} + # with open(specFile + '.mapping', 'a') as mappingFile: for id in inSeq: seq = str(inSeq[id].seq) # check ID - id = re.sub('\|', '_', id) - oriId = id - if len(id) > 30: - modIdIndex = modIdIndex + 1 - id = specName + "_" + str(modIdIndex) - longId = 'yes' - with open(specFile + '.mapping', 'a') as mappingFile: - mappingFile.write('%s\t%s\n' % (id, oriId)) - if not id in tmpDict: - tmpDict[id] = 1 + # oriId = id + if ' ' in id: + sys.exit('\033[91mERROR: Sequence IDs (e.g. %s) must not contain space(s)!\033[0m' % id) else: - index = index + 1 - id = str(id) + '_' + str(index) - tmpDict[id] = 1 + if '\|' in id: + print('\033[91mWARNING: Sequence IDs contain pipe(s). 
They will be replaced by "_"!\033[0m') + id = re.sub('\|', '_', id) + # if len(id) > 20: + # modIdIndex = modIdIndex + 1 + # id = modIdIndex + # longId = 'yes' + # if not id in tmpDict: + # tmpDict[id] = 1 + # else: + # index = index + 1 + # id = str(index) + # tmpDict[id] = 1 + # mappingFile.write('%s\t%s\n' % (id, oriId)) # check seq if seq[-1] == '*': seq = seq[:-1] @@ -187,8 +200,8 @@ def main(): cf.write(str(datetime.now())) cf.close() # warning about long header - if longId == 'yes': - print('\033[91mWARNING: Some headers longer than 80 characters have been automatically shortened. PLease check the %s.mapping file for details!\033[0m' % specFile) + # if longId == 'yes': + # print('\033[91mWARNING: Some headers longer than 80 characters have been automatically shortened. PLease check the %s.mapping file for details!\033[0m' % specFile) else: print(genomePath + '/' + specName + '.fa already exists!') @@ -207,16 +220,13 @@ def main(): ### create annotation if not noAnno: Path(outPath + '/weight_dir').mkdir(parents = True, exist_ok = True) - annoCmd = 'annoFAS -i %s/%s.fa -o %s --cpus %s' % (genomePath, specName, outPath+'/weight_dir', cpus) + annoCmd = 'fas.doAnno -i %s/%s.fa -o %s --cpus %s' % (genomePath, specName, outPath+'/weight_dir', cpus) if force: annoCmd = annoCmd + " --force" - if oldFAS: - print("running old version of FAS...") - annoCmd = 'annoFAS -i %s/%s.fa -o %s -n %s --cores %s' % (genomePath, specName, outPath+'/weight_dir', specName, cpus) try: subprocess.call([annoCmd], shell = True) except: - print('\033[91mProblem with running annoFAS. You can check it with this command:\n%s\033[0m' % annoCmd) + print('\033[91mProblem with running fas.doAnno. 
You can check it with this command:\n%s\033[0m' % annoCmd) print('Output for %s can be found in %s within genome_dir [and blast_dir, weight_dir] folder[s]' % (specName, outPath)) diff --git a/fdog/bin/hamstr.pl b/fdog/bin/hamstr.pl index 7ff125e..3feb01e 100755 --- a/fdog/bin/hamstr.pl +++ b/fdog/bin/hamstr.pl @@ -195,9 +195,10 @@ ## 01.12.2020 (v13.4.1 - vinh) add silent option to muscle for checkCoOrthologsRef ## 21.01.2021 (v13.4.2 - vinh) fiexed bug when refspec has "dot" in its name ## 19.03.2021 (v13.4.3 - vinh) changed $path to current directory +## 19.03.2021 (v13.4.5 - vinh) do not replace space by @ for hmm output in parseHmmer4pm ######################## start main ########################################### -my $version = "HaMStR v.13.4.4"; +my $version = "HaMStR v.13.4.5"; ######################## checking whether the configure script has been run ### my $configure = 0; if ($configure == 0){ @@ -315,7 +316,7 @@ my $ublast = 0; my $accel = 0.8; #####determine the hostname####### -push @log, "VERSION:\t$version\n"; +# push @log, "VERSION:\t$version\n"; my $hostname = `hostname`; chomp $hostname; push @log, "HOSTNAME\t$hostname\n"; @@ -520,7 +521,7 @@ exit; } else { - open (OUT, ">$outpath/hamstrsearch.log") or die "could not open logfile\n"; + open (OUT, ">$outpath/fdog.log") or die "could not open logfile\n"; print OUT join "\n", @log; close OUT; } @@ -1059,7 +1060,7 @@ sub checkInput { } } } else { - push @log, "\trunning HaMStR with all hmms in $hmm_dir"; + push @log, "\trunning fDOG with all hmms in $hmm_dir"; my $hmm_dir_tmp = $hmm_dir; $hmm_dir_tmp =~ s/\|/\\\|/g; @hmms = `ls $hmm_dir_tmp`; } @@ -1299,10 +1300,10 @@ sub checkInput { } ## 14) determin whether or not the -representative flag has been set if (defined $rep) { - push @log, "\tHaMStR will run with the -representative option"; + push @log, "\tfDOG will run with the -representative option"; } else { - push @log, "\tHaMStR was called without the -representative option. 
More than one ortholog may be identified per core-ortholog group!"; + push @log, "\tfDOG was called without the -representative option. More than one ortholog may be identified per core-ortholog group!"; } ## check further options @@ -1854,68 +1855,68 @@ sub revComp { return($seq); } ############################## -sub parseHmmer3pm { - my ($file, $path) = @_; - my $hits; - my $query; - my %tmphash; - if (!defined $path){ - $path = '.'; - } - $file = $path . '/' . $file; - my $in = Bio::SearchIO->new( - -format => 'hmmer', - -file => $file - ); - while( my $result = $in->next_result ) { - # this is a Bio::Search::Result::HMMERResult object - if (!defined $query){ - $query = $result->query_name(); - printOUT("query is $query\n"); - } - my $hitcount = 0; - while( my $hit = $result->next_hit ) { - my $tmp = $hit->name(); - my $tmpscore = $hit->score(); - $tmp =~ s/_RF.*//; - if (!defined $tmphash{$tmp}){ - $hits->[$hitcount]->{id} = $tmp; - $hits->[$hitcount]->{hmmscore} = $tmpscore; - $hitcount++; - $tmphash{$tmp}=1; - if (defined $bhh){ - last; - } - } - } - - if (defined $hits->[0]) { - ####### a quick hack to obtain the lagPhase value - my $criticalValue; # takes the value used for candidate discrimination - my $hitLimitLoc = $hitlimit; - if (defined $autoLimit) { - printDebug("Entering getLag Routine\n"); - ## the user has invoked the autmated inference of a hit limit - ($hitLimitLoc, $criticalValue) = getLag($hits, $hitcount); - if (!defined $criticalValue) { - ## there was a problem in the computatation of the lagPhase - print "Computation of lagPhase did not succeed, switching to score threshold using a default cutoff of $scoreCutoff\n"; - ($hitLimitLoc, $criticalValue) = getHitLimit($hits, $hitcount); - } - } - elsif (defined $scoreThreshold) { - printDebug("entering the scoreThreshold routine"); - ($hitLimitLoc, $criticalValue) = getHitLimit($hits, $hitcount); - printDebug("hitlimitloc is now $hitLimitLoc"); - } - - return ($query, $hits, $hitLimitLoc, 
$criticalValue); - } - else { - return ($query); - } - } -} +# sub parseHmmer3pm { +# my ($file, $path) = @_; +# my $hits; +# my $query; +# my %tmphash; +# if (!defined $path){ +# $path = '.'; +# } +# $file = $path . '/' . $file; +# my $in = Bio::SearchIO->new( +# -format => 'hmmer', +# -file => $file +# ); +# while( my $result = $in->next_result ) { +# # this is a Bio::Search::Result::HMMERResult object +# if (!defined $query){ +# $query = $result->query_name(); +# printOUT("query is $query\n"); +# } +# my $hitcount = 0; +# while( my $hit = $result->next_hit ) { +# my $tmp = $hit->name(); +# my $tmpscore = $hit->score(); +# $tmp =~ s/_RF.*//; +# if (!defined $tmphash{$tmp}){ +# $hits->[$hitcount]->{id} = $tmp; +# $hits->[$hitcount]->{hmmscore} = $tmpscore; +# $hitcount++; +# $tmphash{$tmp}=1; +# if (defined $bhh){ +# last; +# } +# } +# } +# +# if (defined $hits->[0]) { +# ####### a quick hack to obtain the lagPhase value +# my $criticalValue; # takes the value used for candidate discrimination +# my $hitLimitLoc = $hitlimit; +# if (defined $autoLimit) { +# printDebug("Entering getLag Routine\n"); +# ## the user has invoked the autmated inference of a hit limit +# ($hitLimitLoc, $criticalValue) = getLag($hits, $hitcount); +# if (!defined $criticalValue) { +# ## there was a problem in the computatation of the lagPhase +# print "Computation of lagPhase did not succeed, switching to score threshold using a default cutoff of $scoreCutoff\n"; +# ($hitLimitLoc, $criticalValue) = getHitLimit($hits, $hitcount); +# } +# } +# elsif (defined $scoreThreshold) { +# printDebug("entering the scoreThreshold routine"); +# ($hitLimitLoc, $criticalValue) = getHitLimit($hits, $hitcount); +# printDebug("hitlimitloc is now $hitLimitLoc"); +# } +# +# return ($query, $hits, $hitLimitLoc, $criticalValue); +# } +# else { +# return ($query); +# } +# } +# } ############################## sub parseHmmer4pm { my ($file, $path) = @_; @@ -1931,9 +1932,9 @@ sub parseHmmer4pm { $file = $path . 
'/' . $file; $file =~ s/\|/\\\|/g; - my @hmmout = `$grepprog -v '#' $file |sort -rnk 9 |sed -e 's/ /@/g'`; + my @hmmout = `$grepprog -v '#' $file |sort -rnk 9`; for (my $i = 0; $i < @hmmout; $i++) { - ($hmmhits->[$i]->{target_name}, $hmmhits->[$i]->{target_accession}, $hmmhits->[$i]->{query_name}, $hmmhits->[$i]->{query_accession}, $hmmhits->[$i]->{total_evalue}, $hmmhits->[$i]->{total_score}, $hmmhits->[$i]->{total_bias}, $hmmhits->[$i]->{domain_evalue}, $hmmhits->[$i]->{domain_score}, $hmmhits->[$i]->{domain_bias}, @rest) = split(/@+/, $hmmout[$i]); + ($hmmhits->[$i]->{target_name}, $hmmhits->[$i]->{target_accession}, $hmmhits->[$i]->{query_name}, $hmmhits->[$i]->{query_accession}, $hmmhits->[$i]->{total_evalue}, $hmmhits->[$i]->{total_score}, $hmmhits->[$i]->{total_bias}, $hmmhits->[$i]->{domain_evalue}, $hmmhits->[$i]->{domain_score}, $hmmhits->[$i]->{domain_bias}, @rest) = split(/\s+/, $hmmout[$i]); if (!defined $query){ $query = $hmmhits->[$i]->{query_name}; diff --git a/fdog/bin/oneSeq.pl b/fdog/bin/oneSeq.pl index 7e8a248..a99e1e6 100755 --- a/fdog/bin/oneSeq.pl +++ b/fdog/bin/oneSeq.pl @@ -127,13 +127,17 @@ ## Modified 24. March 2021 v2.2.8 (Vinh) - skip fa.mapping while checking genome_dir ## Modified 29. March 2021 v2.2.9 (Vinh) - check for zero $maxAlnScore ## - solved problem with long input path for fasta36 tools +## Modified 23. April 2021 v2.3.0 (Vinh) - parse fasta36 output for long IDs (longer than 60 chars) +## Modified 31. May 2021 v2.3.1 (Vinh) - added auto annotation for fdogFas +## Modified 11. June 2021 v2.3.2 (Vinh) - fixed --append option +## Modified 16. 
June 2021 v2.4.0 (Vinh) - add checkOff option ############ General settings -my $version = 'oneSeq v.2.2.9'; +my $version = 'oneSeq v.2.4.0'; ##### configure for checking if the setup.sh script already run my $configure = 0; if ($configure == 0){ - die "\n\n$version\n\nPLEASE RUN fdog.setup BEFORE USING fdog\n\n"; + die "\n\nPLEASE RUN fdog.setup BEFORE USING fdog\n\n"; } ##### hostname my $hostname = `hostname`; @@ -173,9 +177,9 @@ my $outputfmt = 'blastxml'; my $eval_blast_query = 0.0001; my $filter = 'F'; # default for blastp -my $annotation_prog = "annoFAS"; -my $fas_prog = "calcFAS"; -my $fdogFAS_prog = "fdogFAS"; +my $annotation_prog = "fas.doAnno"; +my $fas_prog = "fas.run"; +my $fdogFAS_prog = "fas.runFdogFas"; ##### ublast Baustelle: not implemented yet my $runublast = 0; @@ -203,7 +207,6 @@ my $idx_dir = "$path/taxonomy/"; my $dataDir = $path . '/data'; my $weightPath = "$path/weight_dir/"; -my $assembly_dir = "$path/assembly_dir/"; my @defaultRanks = ( 'superkingdom', 'kingdom', @@ -260,6 +263,7 @@ my $blastNode; my $representative; my $core_rep; +my $checkOff; my $debug; my $corestrict; my $inputSeq = ""; @@ -307,15 +311,6 @@ my %hashTree; my $aln = 'muscle'; my $searchTaxa; -#variables for fdog_goes_assembly -my $assembly; -my $augustusRefSpec; -my $avIntron; -my $lengthExtension; -my $assemblyPath; -my $searchTool = 'blast'; -my $matrix = 'blosum62'; -my $dataPath = ''; ################# Command line options GetOptions ( "h" => \$help, @@ -365,6 +360,7 @@ "blastpath=s" => \$blastPath, "searchpath=s" => \$genome_dir, "weightpath=s" => \$weightPath, + "checkOff" => \$checkOff, "debug" => \$debug, "coreHitlimit=s" => \$core_hitlimit, "hitlimit=s" => \$hitlimit, @@ -377,15 +373,7 @@ "distDeviation=s" => \$distDeviation, "aligner=s" => \$aln, "hyperthread" => \$hyperthread, - "searchTaxa=s" => \$searchTaxa, - "assembly" => \$assembly, - "assemblypath=s" => \$assemblyPath, - "augustusRefSpec=s" => \$augustusRefSpec, - "avIntron=s" => \$avIntron, - 
"lengthExtension=s" => \$lengthExtension, - "searchTool=s" => \$searchTool, - "scoringmatrix=s" => \$matrix, - "dataPath=s" => \$dataPath + "searchTaxa=s" => \$searchTaxa ); $outputPath = abs_path($outputPath); @@ -397,17 +385,16 @@ $weightPath = abs_path($weightPath)."/"; $genome_dir = abs_path($genome_dir)."/"; $taxaPath = $genome_dir; -$dataPath = abs_path($dataPath)."/"; -$assembly_dir = abs_path($assemblyPath)."/"; ############# do initial check if (!defined $help && !defined $getversion) { #} && !defined $showTaxa) { print "Validity checking....\n"; my $checkStTime = gettime(); - initialCheck($seqFile, $seqName, $blastPath, $taxaPath, $weightPath, $fasoff); - print "Check finished in " . roundtime(gettime() - $checkStTime). " sec!\n"; + unless($checkOff) { + initialCheck($seqFile, $seqName, $blastPath, $taxaPath, $weightPath, $fasoff); + } - if (!defined $coreex && !defined $assembly) { + if (!defined $coreex) { if (!grep(/$minDist/, @defaultRanks)) { die "ERROR: minDist $minDist invalid!\n"; } @@ -420,6 +407,7 @@ die "ERROR: coreOrth not defined (must be integer)!"; } } + print "Check finished in " . roundtime(gettime() - $checkStTime). 
" sec!\n"; } ############# show version @@ -490,7 +478,7 @@ # create weight_dir in oneseq's home dir (used for annotations,weighting,feature extraction) # get annotations for seed sequence if fas support is on -if ($fas_support && !$assembly){ +if ($fas_support){ if (!$weightPath) { createWeightFolder(); } @@ -499,7 +487,7 @@ my $coreStTime = gettime(); #time; #core-ortholog search -if (!$coreex && !$assembly) { +if (!$coreex) { print "\nCore compiling...\n"; $coremode = 1; $taxaPath = $blastPath; @@ -637,12 +625,7 @@ my $final_eval_blast = $eval_blast*$eval_relaxfac; my $final_eval_hmmer = $eval_hmmer*$eval_relaxfac; - if (!$assembly){ - $taxaPath = $genome_dir; - } - else{ - $taxaPath = $assembly_dir; - } + $taxaPath = $genome_dir; my @searchTaxa; unless ($searchTaxa) { unless($groupNode) { @@ -698,72 +681,20 @@ } } } - if ($assembly){ - $eval_blast = sprintf("%f", $eval_blast); - if ($seqFile ne "") { - my @assembly_cmd = ("fdog.assembly", "--gene " . $seqName, "--augustusRefSpec ". $augustusRefSpec, "--refSpec " . $refSpec, "--dataPath " . 
$dataPath, "--silent"); - - if (defined $assemblyPath){ - push(@assembly_cmd, "--assemblyPath $assemblyPath") - } - if (defined $avIntron){ - push(@assembly_cmd, "--avIntron $avIntron "); - } - if (defined $lengthExtension){ - push(@assembly_cmd, "--lengthExtension $lengthExtension "); - } - if (!$autoclean){ - push(@assembly_cmd, "--tmp "); - } - if ($outputPath){ - push(@assembly_cmd, "--out $outputPath "); - } - if (defined $strict){ - push(@assembly_cmd, "--strict"); - } - if ($eval_blast){ - push(@assembly_cmd, "--evalBlast $eval_blast "); - } - if ($searchTool){ - push(@assembly_cmd, "--msaTool $aln "); - } - if (defined $checkcoorthologsref){ - push(@assembly_cmd, "--checkCoorthologsRef"); - } - if ($searchTool){ - push(@assembly_cmd, "--searchTool $searchTool"); - } - if ($matrix){ - push(@assembly_cmd, "--scoringmatrix $matrix"); - } - if ($coreOrthologsPath){ - push(@assembly_cmd, "--coregroupPath $coreOrthologsPath"); - } - if ($fasoff){ - push(@assembly_cmd, "--fasoff"); - } - if ($searchTaxon){ - push(@assembly_cmd, "--searchTaxon $searchTaxon"); - } - if ($filter){ - push(@assembly_cmd, "--filter $filter"); - } - printDebug(@assembly_cmd); - system(join(' ', @assembly_cmd)) == 0 or die "Error: fDOGassembly failed \n"; - } - } - else{ runHamstr($searchTaxon, $seqName, $finalOutput, $refSpec, $hitlimit, $representative, $strict, $coremode, $final_eval_blast, $final_eval_hmmer, $aln); - } $pm->finish; } $pm->wait_all_children; } +### remove duplicated seq in extended.fa +if (-e $finalOutput) { + addSeedSeq($seqId, $seqName, $coreOrthologsPath, $refSpec, $finalOutput); +} push @logOUT, "Ortholog search completed in ". roundtime(gettime() - $orthoStTime) ." sec!"; print "==> Ortholog search completed in ". roundtime(gettime() - $orthoStTime) ." 
sec!\n"; - -if(!$coreOnly && !$assembly){ +## Evaluation of all orthologs that are predicted by the final run +if(!$coreOnly){ my $fasStTime = gettime(); my $processID = $$; @@ -775,9 +706,9 @@ addSeedSeq($seqId, $seqName, $coreOrthologsPath, $refSpec, $finalOutput); # calculate FAS scores for final extended.fa - if ($fas_support && !$assembly) { + if ($fas_support) { print "Starting the feature architecture similarity score computation...\n"; - my $fdogFAScmd = "$fdogFAS_prog -i $finalOutput -w $weightPath -t $tmpdir -o $outputPath --cores $cpu"; + my $fdogFAScmd = "$fdogFAS_prog -i $finalOutput -w $weightPath -t $tmpdir -o $outputPath --cores $cpu --redo_anno"; unless ($countercheck) { $fdogFAScmd .= " --bidirectional" } @@ -788,21 +719,12 @@ } push @logOUT, "FAS calculation completed in " . roundtime(gettime() - $fasStTime). " sec!\n"; print "==> FAS calculation completed in " . roundtime(gettime() - $fasStTime). " sec!\n"; - if($autoclean){ print "Cleaning up...\n"; runAutoCleanUp($processID); } } -if ($assembly){ - my $file_assembly_out; - $file_assembly_out = $outputPath . '/' . $seqName; - my $cmd_merge; - $cmd_merge = "fdog.mergeAssembly --in $outputPath --out $file_assembly_out --cleanup"; - printDebug($cmd_merge); - system($cmd_merge); -} ## Delete tmp folder unless ($debug) { my $delTmp = "rm -rf $tmpdir"; @@ -814,7 +736,10 @@ push @logOUT, "fdog finished after " . roundtime(gettime() - $startTime) . 
" sec!\n"; #### writing the log -open (LOGOUT, ">$outputPath/fdog.log") or warn "Failed to open fdog.log for writing"; +open (LOGOUT, ">>$outputPath/fdog.log") or die "Could not open $outputPath/fdog.log for writing\n"; +print LOGOUT "\n\n"; +my $fdogVersion = `fdog.run --version`; +print LOGOUT "fDOG v$fdogVersion\n"; print LOGOUT join "\n", @logOUT; close LOGOUT; exit; @@ -1209,10 +1134,10 @@ sub checkOptions { if ($force == 1 and $append ==1) { $force = 0; } - ### check the presence of the pre-computed core set if options reuseCore or assembly is used - if ($coreex || $assembly) { + ### check the presence of the pre-computed core set + if ($coreex) { if (! -e "$coreOrthologsPath/$seqName/$seqName.fa") { - print "You selected the option -reuseCore or -assembly, but the core ortholog group $coreOrthologsPath/$seqName/hmm_dir/$seqName.hmm does not exist\n"; + print "You selected the option -reuseCore, but the core ortholog group $coreOrthologsPath/$seqName/hmm_dir/$seqName.hmm does not exist\n"; exit; } } @@ -1283,7 +1208,7 @@ sub checkOptions { ### checking the number of core orthologs. Omit this check if the option -reuseCore has been selected $optbreaker = 0; - while(!$minCoreOrthologs and (!$coreex and !$assembly)) { + while(!$minCoreOrthologs and !$coreex) { if ($optbreaker >= 3){ print "No proper number given ... exiting.\n"; exit; @@ -1298,12 +1223,10 @@ sub checkOptions { $filter = 'no' if $filter eq 'F'; } - if (!$assembly){ - $inputSeq = fetchSequence($seqFile, $dataDir); - } + $inputSeq = fetchSequence($seqFile, $dataDir); ## the user has not provided a sequence id, however, the refspec is determined. - if($seqId eq '' && !$assembly) { + if($seqId eq '') { my $besthit; if (!$blast){ ## a refspec has been determined @@ -1318,9 +1241,9 @@ sub checkOptions { $refSpec = $besthit->{species}; my $details = "Evalue: " . 
$besthit->{evalue}; printOut("Seq id has been determined as $seqId in $refSpec with $details", 2); - if(length("$seqName|$refSpec|$seqId") > 60) { - die "Output file will have header longer than 60 characters ($seqName|$refSpec|$seqId). Please consider shorten the sequence IDs! More at https://github.com/BIONF/fDOG/wiki/Check-data-validity\n"; - } + # if(length("$seqName|$refSpec|$seqId") > 60) { + # die "Output file will have header longer than 60 characters ($seqName|$refSpec|$seqId). Please consider shorten the sequence IDs! More at https://github.com/BIONF/fDOG/wiki/Check-data-validity\n"; + # } if($seqId eq '') { print "There was no significant hit for your sequence in " . $refSpec . ".\nPlease specify a sequence id on your own.\n"; exit; @@ -1398,22 +1321,24 @@ sub checkOptions { mkdir $outputPath or die "could not re-create the output directory $outputPath\n"; } elsif ($append) { - printOut("Appending output to $finalOutput\n", 1); - if (-e "$outputPath/$seqName.extended.profile") { + if (-e "$outputPath/$seqName.extended.fa") { ## read in the content for latter appending - printOut("Appending output to $outputPath/$seqName.extended.profile", 1); - open (IN, "<$outputPath/$seqName.extended.profile") or die "failed to open $outputPath/$seqName.extended.profile after selection of option -append\n"; + printOut("Appending output to $outputPath/$seqName.extended.fa", 1); + open (IN, "<$outputPath/$seqName.extended.fa") or die "failed to open $outputPath/$seqName.extended.fa after selection of option -append\n"; while () { - chomp $_; - my @keys = split '\|', $_; - $profile{$keys[1]} = 1; + my $line = $_; + if ($line =~ /\|/) { + chomp $line; + my @keys = split '\|', $line; + $profile{$keys[1]} = 1; + } } } elsif ($fasoff) { ## no extended.profile file exists but not necessary, because user switched off FAS support -> do nothing } else { - printOut("Option -append was selected, but the existing output was incomplete. 
Please restart with the -force option to overwrite the output"); + printOut("Option -append was selected, but the existing output was incomplete. Please restart with the -force option to overwrite the output", 1); exit; } } @@ -1428,9 +1353,8 @@ sub checkOptions { #### checking for the min and max distance for the core set compilation #### omit this check, if the option reuseCore has been selected (added 2019-02-04) $optbreaker = 0; - if (!$coreex and !$assembly) { + if (!$coreex) { my $node; - #print "Testing coreex assembly\n"; $node = $db->get_taxon(-taxonid => $refTaxa{$refSpec}); $node->name('supplied', $refSpec); if (lc($maxDist) eq "root"){ @@ -1790,8 +1714,9 @@ sub cumulativeAlnScore{ my $line = $_; $line =~ s/[\(\)]//g; my @line = split('\s+',$line); - - if($line[0] && ($line[0] eq $key)){ + my $shortedId = substr($key, 0, 60); + # if($line[0] && ($line[0] eq $key)){ + if($line[0] && ($line[0] eq $shortedId)){ if(exists $cumscores{$key}) { $gotScore = 1; $cumscores{$key} = $cumscores{$key} + $line[2]; @@ -2146,7 +2071,7 @@ sub addSeedSeq { # get seed sequence and add it to the beginning of the fasta output open(TEMP, ">$outputFa.temp") or die "Cannot create $outputFa.temp!\n"; my $seqio = Bio::SeqIO->new(-file => "$coreOrthologsPath/$seqName/$seqName.fa", '-format' => 'Fasta'); - my %idTmp; # used to check which seq has already been written to output + my %idTmp = (); # used to check which seq has already been written to output while(my $seq = $seqio->next_seq) { my $id = $seq->id; if ($id =~ /$refSpec/) { @@ -2162,6 +2087,7 @@ sub addSeedSeq { unless ($id =~ /$refSpec\|$seqId/) { # /$refSpec/) { unless ($idTmp{$id}) { print TEMP ">$id\n", $seq->seq, "\n"; + $idTmp{$id} = 1; } } } @@ -2643,9 +2569,9 @@ sub initialCheck { } # check executable FAS - my $fasCheckMsg = `setupFAS -t ./ -c 2>&1`; + my $fasCheckMsg = `fas.setup -t ./ -c 2>&1`; if ($fasoff != 1 && $fasCheckMsg =~ /ERROR/) { - die "ERROR: greedyFAS not ready to use! 
Please check https://github.com/BIONF/FAS/wiki/prepareFAS\n"; + die "ERROR: FAS not ready to use! Please check https://github.com/BIONF/FAS/wiki/setup\n"; } # check seed fasta file @@ -2690,9 +2616,19 @@ sub initialCheck { } } # check weight_dir - if ($fasoff != 1 && !$assembly) { + if ($fasoff != 1) { my %seen; my @allTaxa = grep( !$seen{$_}++, @genomeDir, @blastDir); + my @notFolder; + for (my $i = 0;$i < scalar(@allTaxa); $i++){ + if (-f "$blastDir/$allTaxa[$i]" || -f "$genomeDir/$allTaxa[$i]") { + push(@notFolder, $allTaxa[$i]); + splice(@allTaxa, $i, 1); + } + } + if (scalar(@notFolder) > 0) { + print "*** WARNING: Found files in $genomeDir or $blastDir:\t@notFolder\n"; + } chomp(my $allAnno = `ls $weightDir | $sedprog \'s/\\.json//\'`); my @allAnno = split(/\n/, $allAnno); my @missingAnno = array_minus(@allTaxa, @allAnno); diff --git a/fdog/checkData.py b/fdog/checkData.py index 84310ac..3aafe44 100644 --- a/fdog/checkData.py +++ b/fdog/checkData.py @@ -133,28 +133,29 @@ def checkDataFolder(checkDir, replace, delete, concat): if os.path.islink(faFile): faFile = os.path.realpath(faFile) checkFileExist(faFile) - if not '.checked' in faFile: - if not os.path.exists(faFile+".checked"): - checkFaFile = checkValidFasta(faFile) - if checkFaFile == 'notFasta': - sys.exit('*** ERROR: %s does not look like a fasta file!' % faFile) - elif checkFaFile == 'longHeader': - sys.exit('*** ERROR: %s contains long headers!' % faFile) - elif checkFaFile == 'space': - sys.exit('*** ERROR: %s contains spaces/tabs!' % faFile) - elif checkFaFile == 'multiLine': - if not concat: - print('*** ERROR: %s contains multiple-line sequences!' 
% faFile) - sys.exit('Please use "--concat" with "--replace" or "--delete" to join them into single lines') - else: - rewriteSeqs(faFile, replace, delete) - elif checkFaFile == 'ok': - if not (delete or replace): - checkValidSeqs(faFile) - else: - rewriteSeqs(faFile, replace, delete) - writeCheckedFile(faFile) - print(fd) + if not '.mapping' in faFile: + if not '.checked' in faFile: + if not os.path.exists(faFile+".checked"): + checkFaFile = checkValidFasta(faFile) + if checkFaFile == 'notFasta': + sys.exit('*** ERROR: %s does not look like a fasta file!' % faFile) + elif checkFaFile == 'longHeader': + sys.exit('*** ERROR: %s contains long headers!' % faFile) + elif checkFaFile == 'space': + sys.exit('*** ERROR: %s contains spaces/tabs!' % faFile) + elif checkFaFile == 'multiLine': + if not concat: + print('*** ERROR: %s contains multiple-line sequences!' % faFile) + sys.exit('Please use "--concat" with "--replace" or "--delete" to join them into single lines') + else: + rewriteSeqs(faFile, replace, delete) + elif checkFaFile == 'ok': + if not (delete or replace): + checkValidSeqs(faFile) + else: + rewriteSeqs(faFile, replace, delete) + writeCheckedFile(faFile) + print(fd) taxaList.append(fd) except subprocess.CalledProcessError as e: print('*** ERROR: Problem while searching for fasta file') @@ -162,13 +163,28 @@ def checkDataFolder(checkDir, replace, delete, concat): sys.exit() return(taxaList) -def checkCompleteAnno(weightDir, taxaList): +def checkMissingJson(weightDir, taxaList): allAnno = [f for f in listdir(weightDir) if isfile(join(weightDir, f))] taxaAnno = [s + '.json' for s in taxaList] s = set(allAnno) missingAnno = [x for x in taxaAnno if x not in s] return(missingAnno) +def checkCompleteAnno(weightDir, genomeDir): + allAnno = [f for f in listdir(weightDir) if isfile(join(weightDir, f))] + for f in allAnno: + tax = f.replace('.json', '') + print('...check annotations for %s' % tax) + jf = '%s/%s.json' % (weightDir, tax) + gf = '%s/%s/%s.fa' % 
(genomeDir, tax, tax) + cmd = 'fas.checkAnno -s %s -a %s -o %s' % (gf, jf, weightDir) + try: + subprocess.call([cmd], shell = True) + except subprocess.CalledProcessError as e: + print('*** ERROR: Problem while checking annotation file using fas.checkAnno!') + print(e.output.decode(sys.stdout.encoding)) + sys.exit() + def checkMissingNcbiID(namesDmp, taxaList): ncbiId = {} with open(namesDmp, 'r') as f: @@ -193,7 +209,7 @@ def checkMissingNcbiID(namesDmp, taxaList): return(missingTaxa.keys(), dupTaxa) def main(): - version = '0.0.3' + version = '0.0.6' parser = argparse.ArgumentParser(description='You are running fdog.checkData version ' + str(version) + '.') parser.add_argument('-g', '--genomeDir', help='Path to search taxa directory (e.g. fdog_dataPath/genome_dir)', action='store', default='') parser.add_argument('-b', '--blastDir', help='Path to blastDB directory (e.g. fdog_dataPath/blast_dir)', action='store', default='') @@ -237,12 +253,13 @@ def main(): ### check weightDir print('=> Checking %s...' 
% weightDir) - missingAnno = checkCompleteAnno(weightDir, join2Lists(genomeTaxa, blastTaxa)) + missingAnno = checkMissingJson(weightDir, join2Lists(genomeTaxa, blastTaxa)) if len(missingAnno) > 0: - print('\033[92m*** WARNING: Annotations not found for:\033[0m') + print('\033[92m*** WARNING: Annotation files not found for:\033[0m') print(*missingAnno, sep = "\n") print('NOTE: You still can run fdog without FAS using the option "-fasoff"') caution = 1 + checkCompleteAnno(weightDir, genomeDir) ### check ncbi IDs print('=> Checking NCBI taxonomy IDs...') diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 46f83c0..12fcf6f 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -1,7 +1,8 @@ # -*- coding: utf-8 -*- ####################################################################### -# Copyright (C) 2020 Hannah Muelbaier + +# Copyright (C) 2021 Hannah Muelbaier # # This script is used to run fDOG-Assembly which performs targeted ortholog # searches on genome assemblies @@ -27,8 +28,30 @@ import yaml import subprocess import time -======= +import shutil +import multiprocessing as mp + ########################### functions ########################################## +def check_path(path): + if not os.path.exists(path): + print(path + " does not exist. Exciting ...") + sys.exit() + +def check_ref_sepc(species_list, fasta_file): + file = open(fasta_file, "r") + lines = file.readlines() + species_file = [] + + for line in lines: + if line[0] == ">": + species = line.split("|")[1] + species_file.append(species) + for species in species_list: + if species in species_file: + return species + print("Reference species is not part of the ortholog group. 
Exciting ...") + sys.exit() + def load_config(config_file): with open(config_file, 'r') as stream: try: @@ -36,23 +59,27 @@ def load_config(config_file): except yaml.YAMLError as exc: print(exc) -def starting_subprocess(cmd, mode): - if mode == 'debug': - result = subprocess.run(cmd, shell=True) - elif mode == 'silent': - result = subprocess.run(cmd, stdout = subprocess.PIPE, stderr = subprocess.PIPE, shell=True) - elif mode == 'normal': - result = subprocess.run(cmd, stdout = subprocess.PIPE, shell=True) +def starting_subprocess(cmd, mode, time_out = None): + + try: + if mode == 'debug': + result = subprocess.run(cmd, shell=True, timeout = time_out) + elif mode == 'silent': + result = subprocess.run(cmd, stdout = subprocess.PIPE, stderr = subprocess.PIPE, shell=True, timeout = time_out) + elif mode == 'normal': + result = subprocess.run(cmd, stdout = subprocess.PIPE, shell=True, timeout = time_out) + except subprocess.TimeoutExpired: + return 1 def merge(blast_results, insert_length): #merging overlapping and contigous candidate regions + #format dictionary: {node_name: [(,,evalue, ,,, )]} number_regions = 0 insert_length = int(insert_length) + score_list = [] for key in blast_results: locations = blast_results[key] locations = sorted(locations, key = lambda x: int(x[3])) - #print("test") - #print(locations) size_list = len(locations) j = 0 while j < size_list-1: @@ -62,6 +89,8 @@ def merge(blast_results, insert_length): #merge overlapping regions plus strand locations[j][1] = max(locations[j][1], locations[i][1]) locations[j][2] = min(locations[j][2], locations[i][2]) + locations[j][4] = max(locations[j][4], locations[i][4]) + locations[j][6] = max(locations[j][6], locations[i][6]) locations.pop(i) size_list -= 1 i -= 1 @@ -69,6 +98,8 @@ def merge(blast_results, insert_length): #merge overlapping regions minus strand locations[j][0] = min(locations[j][0], locations[i][0]) locations[j][2] = min(locations[j][2], locations[i][2]) + locations[j][4] = 
max(locations[j][4], locations[i][4]) + locations[j][6] = max(locations[j][6], locations[i][6]) locations.pop(i) size_list -= 1 i -= 1 @@ -76,6 +107,8 @@ def merge(blast_results, insert_length): #merging consecutive regions, the distance between booth is not longer than a cutoff, plus strand locations[j][1] = max(locations[j][1], locations[i][1]) locations[j][2] = min(locations[j][2], locations[i][2]) + locations[j][4] = max(locations[j][4], locations[i][4]) + locations[j][6] = max(locations[j][6], locations[i][6]) locations.pop(i) size_list -= 1 i -=1 @@ -83,20 +116,24 @@ def merge(blast_results, insert_length): #merging consecutive regions, the distance between booth is not longer than a cutoff, minus strand locations[j][0] = min(locations[j][0], locations[i][0]) locations[j][2] = min(locations[j][2], locations[i][2]) + locations[j][4] = max(locations[j][4], locations[i][4]) + locations[j][6] = max(locations[j][6], locations[i][6]) locations.pop(i) size_list -= 1 i -=1 i += 1 j += 1 + for entry in locations: + score_list.append(entry[6]) number_regions += len(locations) blast_results[key] = locations - return blast_results, number_regions + return blast_results, number_regions, score_list def parse_blast(line, blast_results, cutoff): - # format blast line: - # format dictionary: {node_name: [(,,evalue, ,,)]} + # format blast line: + # format dictionary: {node_name: [(,,evalue, ,,, )]} line = line.replace("\n", "") line_info = line.split("\t") evalue = float(line_info[3]) @@ -105,7 +142,7 @@ def parse_blast(line, blast_results, cutoff): return blast_results, evalue #add region to dictionary else: - node_name, sstart, send, qstart, qend = line_info[0], int(line_info[1]), int(line_info[2]), int(line_info[4]), int(line_info[5]) + node_name, sstart, send, qstart, qend, score = line_info[0], int(line_info[1]), int(line_info[2]), int(line_info[4]), int(line_info[5]), int(line_info[6]) split = node_name.split("|") # finding out on which strand tBLASTn found a hit if 
sstart < send: @@ -119,14 +156,32 @@ def parse_blast(line, blast_results, cutoff): node_name = split[1] if node_name in blast_results: list = blast_results[node_name] - list.append([int(sstart),int(send), evalue, int(qstart), int(qend), strand]) + list.append([int(sstart),int(send), evalue, int(qstart), int(qend), strand, score]) blast_results[node_name] = list else: - blast_results[node_name] = [[int(sstart),int(send), evalue, int(qstart), int(qend), strand]] + blast_results[node_name] = [[int(sstart),int(send), evalue, int(qstart), int(qend), strand, score]] return blast_results, evalue -def candidate_regions(intron_length, cutoff_evalue, tmp_path): +def get_x_results(blast_dic, x, score_list): + + new_dic = {} + score_list.sort(reverse=True) + min = score_list[x - 1] + number_regions = 0 + + for key in blast_dic: + key_list = [] + entries = blast_dic[key] + for i in entries: + if i[6] >= min: + key_list.append(i) + if key_list != []: + new_dic[key] = key_list + number_regions += len(key_list) + return new_dic, number_regions + +def candidate_regions(intron_length, cutoff_evalue, tmp_path, x = 10): ###################### extracting candidate regions ######################## # info about output blast http://www.metagenomics.wiki/tools/blast/blastn-output-format-6 blast_file = open(tmp_path + "/blast_results.out", "r") @@ -142,10 +197,13 @@ def candidate_regions(intron_length, cutoff_evalue, tmp_path): blast_results, evalue = parse_blast(line, blast_results, cutoff_evalue) if blast_results == {}: + blast_file.close() return 0,0 else: - candidate_regions, number_regions = merge(blast_results, intron_length) - + candidate_regions, number_regions, score_list = merge(blast_results, intron_length) + blast_file.close() + if number_regions > x: + candidate_regions, number_regions = get_x_results(candidate_regions, x, score_list) return candidate_regions, number_regions def extract_seq(region_dic, path, tmp_path, mode): @@ -187,7 +245,7 @@ def augustus_ppx(regions, 
candidatesOutFile, length_extension, profile_path, aug output.write(line) sequence_file.close() except FileNotFoundError: - print("No gene found in region with ID:" + name + " , continuing with next region") + print("No gene found in region with ID" + name + " in species " + ass_name + " , continuing with next region") output.close() def searching_for_db(assembly_path): @@ -250,11 +308,20 @@ def checkCoOrthologs(candidate_name, best_hit, ref, fdog_ref_species, candidates if msaTool == "muscle": os.system("muscle -quiet -in " + output_file + " -out " + aln_file) #print("muscle -quiet -in " + output_file + " -out " + aln_file) + if not os.path.exists(aln_file): + print("Muscle failed for " + candidate_name + ". Making MSA with Mafft-linsi.") + os.system('mafft --maxiterate 1000 --localpair --anysymbol --quiet ' + output_file + ' > ' + aln_file) + elif msaTool == "mafft-linsi": #print("mafft-linsi") os.system('mafft --maxiterate 1000 --localpair --anysymbol --quiet ' + output_file + ' > ' + aln_file) - distances = get_distance_biopython(aln_file, matrix) + try: + distances = get_distance_biopython(aln_file, matrix) + except ValueError: + print("Failure in distance computation, Candidate %s will be rejected" % candidate_name) + return 0, "NaN", "NaN" + distance_hit_query = distances[best_hit, candidate_name] distance_ref_hit = distances[best_hit, ref] @@ -280,7 +347,7 @@ def backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, eva try: id_ref = seedDic[fdog_ref_species] except KeyError: - print("The fDOG reference species isn't part of the core ortholog group, ... exciting") + #print("The fDOG reference species isn't part of the core ortholog group, ... 
exciting") return 0, seed if searchTool == "blast": cmd = "blastp -db " + blast_dir_path + fdog_ref_species + "/" + fdog_ref_species + " -outfmt '6 sseqid qseqid evalue' -max_target_seqs 10 -out " + tmp_path + "blast_" + fdog_ref_species + " -evalue " + str(evalue_cut_off) + " -query " + candidatesOutFile @@ -298,45 +365,46 @@ def backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, eva id, gene, evalue = (line.replace("\n", "")).split("\t") gene_name = gene.split("|")[2] if gene_name != old_name: - print("candidate:%s"%(gene_name)) - print("blast-hit:%s"%(id)) + print("candidate:%s"%(gene_name)) if mode == "debug" else "" + print("blast-hit:%s"%(id)) if mode == "debug" else "" min = float(evalue) if id in id_ref: orthologs.append(gene) - print("\thitting\n") + print("\thitting\n") if mode == "debug" else "" else: if checkCo == True: for i in id_ref: - print("Best hit %s differs from reference sequence %s! Doing further checks\n"%(id, i)) + print("Best hit %s differs from reference sequence %s! 
Doing further checks\n"%(id, i)) if mode == "debug" else "" co_orthologs_result, distance_ref_hit, distance_hit_query = checkCoOrthologs(gene_name, id, i, fdog_ref_species, candidatesOutFile, msaTool, matrix, dataPath, tmp_path) if co_orthologs_result == 1: - print("\t Distance query - blast hit: %6.4f, Distance blast hit - reference: %6.4f\tAccepting\n"%(distance_hit_query, distance_ref_hit)) + print("\t Distance query - blast hit: %6.4f, Distance blast hit - reference: %6.4f\tAccepting\n"%(distance_hit_query, distance_ref_hit)) if mode == "debug" else "" orthologs.append(gene) elif co_orthologs_result == 0: - print("\t Distance query - blast hit: %6.4f, Distance blast hit - reference: %6.4f\tRejecting\n"%(distance_hit_query, distance_ref_hit)) + if distance_ref_hit != "NaN": + print("\t Distance query - blast hit: %6.4f, Distance blast hit - reference: %6.4f\tRejecting\n"%(distance_hit_query, distance_ref_hit)) if mode == "debug" else "" else: - print("\tnothitting\n") + print("\tnothitting\n") if mode == "debug" else "" elif (gene_name == old_name) and float(evalue) == min and gene_name not in orthologs: if id in id_ref: orthologs.append(gene) - print("\thitting\n") + print("\thitting\n") if mode == "debug" else "" else: if checkCo == True: for i in id_ref: - print("Best hit %s differs from reference sequence %s! Doing further checks\n"%(id, i)) + print("Best hit %s differs from reference sequence %s! 
Doing further checks\n"%(id, i)) if mode == "debug" else "" co_orthologs_result, distance_ref_hit, distance_hit_query = checkCoOrthologs(gene_name, id, i, fdog_ref_species, candidatesOutFile, msaTool, matrix, dataPath, tmp_path) if co_orthologs_result == 1: - print("\t Distance query - blast hit: %6.4f, Distance blast hit - reference: %6.4f\tAccepting\n"%(distance_hit_query, distance_ref_hit)) + print("\t Distance query - blast hit: %6.4f, Distance blast hit - reference: %6.4f\tAccepting\n"%(distance_hit_query, distance_ref_hit)) if mode == "debug" else "" orthologs.append(gene) elif co_orthologs_result == 0: - print("\t Distance query - blast hit: %6.4f, Distance blast hit - reference: %6.4f\tRejecting\n"%(distance_hit_query, distance_ref_hit)) + print("\t Distance query - blast hit: %6.4f, Distance blast hit - reference: %6.4f\tRejecting\n"%(distance_hit_query, distance_ref_hit)) if mode == "debug" else "" else: - print("\tnot hitting\n") + print("\tnot hitting\n") if mode == "debug" else "" old_name = gene_name if orthologs == []: - print("No hit in the backward search, ...exciting") + #print("No hit in the backward search, ...exciting") return 0, seed else: @@ -361,12 +429,12 @@ def backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, eva orthologs = set({}) for species in seed: - print("backward search in species " + species + "\n") + print("backward search in species %s\n" %species) orthologs_new = set({}) try: id_ref = seedDic[species] except KeyError: - print("The species " + species + " isn't part of the core ortholog group, ... exciting") + #print("The species " + species + " isn't part of the core ortholog group, ... 
exciting") return 0, seed cmd = "blastp -db " + blast_dir_path + species + "/" + species + " -outfmt '6 sseqid qseqid evalue' -max_target_seqs 10 -seg " + filter + " -out " + tmp_path + "/blast_" + species + " -evalue " + str(evalue_cut_off) + " -query " + candidatesOutFile @@ -389,12 +457,13 @@ def backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, eva #print(species) #print(orthologs_new) + #print(orthologs) if species == fdog_ref_species: orthologs = orthologs_new else: orthologs = orthologs & orthologs_new - if orthologs == {}: - print("No ortholog was found with option --strict") + if len(orthologs) == 0: + #print("No ortholog was found with option --strict") return 0, seed @@ -403,6 +472,39 @@ def backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, eva orthologs = set(orthologs) return list(orthologs), seed +def addRef(output, core_fasta, species_list): + #print(species_list) + output_file = open(output, "a+") + seq_records_core = readFasta(core_fasta) + seq_records_core = list(seq_records_core) + for species in species_list: + for entry_core in seq_records_core: + if species in entry_core.id: + output_file.write(">" + entry_core.id + "\n") + output_file.write(str(entry_core.seq) + "\n") + output_file.close() + +def addSeq(output, seq_list): + output_file = open(output, "a+") + + for item in seq_list: + #print(item) + candidate_fasta = item[1] + sequenceIds = item[0] + if sequenceIds == 0 or sequenceIds == []: + continue + seq_records_candidate = readFasta(candidate_fasta) + seq_records_candidate = list(seq_records_candidate) + for entry_candidate in seq_records_candidate: + if entry_candidate.id in sequenceIds: + if entry_candidate.id == sequenceIds[0]: + output_file.write(">" + entry_candidate.id + "|1" + "\n") + output_file.write(str(entry_candidate.seq) + "\n") + else: + output_file.write(">" + entry_candidate.id + "|0" + "\n") + output_file.write(str(entry_candidate.seq) + "\n") + output_file.close() + def 
addSequences(sequenceIds, candidate_fasta, core_fasta, output, name, species_list, refBool, tmp_path): output_file = open(output, "a+") @@ -441,12 +543,18 @@ def createFasInput(orthologsOutFile, mappingFile): ncbi_id = (seq.id.split("@"))[1] mappingFile.write(seq.id + "\t" + "ncbi" + ncbi_id + "\n") - + mappingFile.close() return fas_seed_id def cleanup(tmp, tmp_path): if tmp == False: - os.system('rm -r ' + tmp_path) + timeout = time.time() + 60*1 + while os.path.exists(tmp_path): + shutil.rmtree(tmp_path, ignore_errors=True) + if time.time() > timeout: + print("tmp folder could not be removed!") + break + def coorthologs(candidate_names, tmp_path, candidatesFile, fasta, fdog_ref_species, msaTool, matrix): if len(candidate_names) == 1: @@ -517,6 +625,80 @@ def clean_fas(path, file_type): new_line = id + "\t" + remain file.write(new_line) + file.close() + +def ortholog_search(args): + (asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs) = args + cmd = 'mkdir ' + out + '/tmp/' + asName + starting_subprocess(cmd, 'silent') + tmp_path = out + "tmp/" + asName + "/" + candidatesOutFile = tmp_path + group + ".candidates.fa" + #orthologsOutFile = out + "/" + group + ".extended.fa" + fasOutFile = out + "/" + group + #mappingFile = out + "/tmp/" + group + ".mapping.txt" + + sys.stdout.write("Searching in species " + asName + "\n") + assembly_path = assemblyDir + "/" + asName + "/" + asName + ".fa" + db_path = assemblyDir + "/" + asName + "/blast_dir/" + asName + ".fa" + db_check = searching_for_db(db_path) + + if db_check == 0: + #print("Creating a blast data base...") + cmd = 'makeblastdb -in ' + assembly_path + ' -dbtype nucl -parse_seqids -out ' + db_path + starting_subprocess(cmd, mode) + #print("\t ...finished \n") + + #makes a tBLASTn search against database + #codon table 
argument [-db_gencode int_value], table available ftp://ftp.ncbi.nih.gov/entrez/misc/data/gc.prt + #print("Starting tBLASTn search...") + cmd = 'tblastn -db ' + db_path + ' -query ' + consensus_path + ' -outfmt "6 sseqid sstart send evalue qstart qend score " -evalue ' + str(evalue) + ' -out ' + tmp_path + '/blast_results.out' + time_tblastn_start = time.time() + exit_code = starting_subprocess(cmd, mode, 3600) + time_tblastn_end = time.time() + time_tblastn = time_tblastn_end - time_tblastn_start + if exit_code == 1: + sys.stdout.write("The tblastn search takes too long for species %s. Exciting ..." % asName) + #cleanup(tmp, tmp_folder) + #sys.exit() + return [], candidatesOutFile + #else: + #print("\t ...finished") + print("Time tblastn %s in species %s" % (str(time_tblastn), asName)) + + regions, number_regions = candidate_regions(average_intron_length, evalue, tmp_path) + if regions == 0: + #no candidat region are available, no ortholog can be found + sys.stdout.write("No candidate region found for species %s!\n" % asName) + return [], candidatesOutFile + + else: + print(str(number_regions) + " candiate region(s) were found for species %s.\n" % asName) + extract_seq(regions, db_path, tmp_path, mode) + + ############### make Augustus PPX search ################################### + #print("Starting augustus ppx ...") + time_augustus_start = time.time() + augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, augustus_ref_species, asName, group, tmp_path, mode) + #print("\t ...finished \n") + time_augustus_end = time.time() + time_augustus = time_augustus_end - time_augustus_start + print("Time augustus: %s species %s \n" % (str(time_augustus), asName)) + + ################# backward search to filter for orthologs################### + if int(os.path.getsize(candidatesOutFile)) <= 0: + #print("No genes found at candidate regions\n") + return [], candidatesOutFile + + reciprocal_sequences, taxa = backward_search(candidatesOutFile, fasta_path, 
strict, fdog_ref_species, evalue, taxa, searchTool, checkCoorthologs, msaTool, matrix, dataPath, filter, tmp_path, mode) + + if reciprocal_sequences == 0: + if regions != 0: + sys.stdout.write("No ortholog fulfilled the reciprocity criteria for species %s.\n" % asName) + return [], candidatesOutFile + else: + reciprocal_sequences = coorthologs(reciprocal_sequences, tmp_path, candidatesOutFile, fasta_path, fdog_ref_species, msaTool, matrix) + + return reciprocal_sequences, candidatesOutFile class Logger(object): def __init__(self, file): @@ -534,24 +716,22 @@ def flush(self): def main(): - #################### handle user input ######################################## + #################### handle user input ##################################### start = time.time() - - version = '0.1.1' - - + version = '0.1.2' + ################### initialize parser ###################################### parser = argparse.ArgumentParser(description='You are running fdog.assembly version ' + str(version) + '.') parser.add_argument('--version', action='version', version=str(version)) - + ################## required arguments ###################################### required = parser.add_argument_group('Required arguments') required.add_argument('--gene', help='Core_ortholog group name. 
Folder inlcuding the fasta file, hmm file and aln file has to be located in core_orthologs/', action='store', default='', required=True) required.add_argument('--augustusRefSpec', help='augustus reference species', action='store', default='', required=True) - required.add_argument('--refSpec', help='Reference taxon for fDOG.', action='store', default='', required=True) - + required.add_argument('--refSpec', help='Reference taxon/taxa for fDOG.', action='store', nargs="+", default='', required=True) + ################## optional arguments ###################################### optional = parser.add_argument_group('Optional arguments') - optional.add_argument('--avIntron', help='average intron length of the assembly species in bp (default: 5000)',action='store', default=5000, type=int) + optional.add_argument('--avIntron', help='average intron length of the assembly species in bp (default: 50000)',action='store', default=50000, type=int) optional.add_argument('--lengthExtension', help='length extension of the candidate regions in bp (default:5000)', action='store', default=5000, type=int) optional.add_argument('--assemblyPath', help='Path for the assembly directory', action='store', default='') optional.add_argument('--tmp', help='tmp files will not be deleted', action='store_true', default = False) @@ -564,15 +744,16 @@ def main(): optional.add_argument('--msaTool', help='Choose between mafft-linsi or muscle for the multiple sequence alignment. DEFAULT: muscle', choices=['mafft-linsi', 'muscle'], action='store', default='muscle') optional.add_argument('--checkCoorthologsRef', help='During the final ortholog search, accept an ortholog also when its best hit in the reverse search is not the core ortholog itself, but a co-ortholog of it', action='store_true', default=False) optional.add_argument('--scoringmatrix', help='Choose a scoring matrix for the distance criteria used by the option --checkCoorthologsRef. 
DEFAULT: blosum62', choices=['identity', 'blastn', 'trans', 'benner6', 'benner22', 'benner74', 'blosum100', 'blosum30', 'blosum35', 'blosum40', 'blosum45', 'blosum50', 'blosum55', 'blosum60', 'blosum62', 'blosum65', 'blosum70', 'blosum75', 'blosum80', 'blosum85', 'blosum90', 'blosum95', 'feng', 'fitch', 'genetic', 'gonnet', 'grant', 'ident', 'johnson', 'levin', 'mclach', 'miyata', 'nwsgappep', 'pam120', 'pam180', 'pam250', 'pam30', 'pam300', 'pam60', 'pam90', 'rao', 'risler', 'structure'], action='store', default='blosum62') - optional.add_argument('--coreTaxa', help='List of core taxa used during --strict', action='store', default='') - optional.add_argument('--filter', help='Switch the low complexity filter for the blast search on.', action='store', default='no') + optional.add_argument('--coreTaxa', help='List of core taxa used during --strict', action='store', nargs="+", default=[]) + #optional.add_argument('--filter', help='Switch the low complexity filter for the blast search on.', action='store', default='no') optional.add_argument('--fasoff', help='Turn OFF FAS support', action='store_true', default=False) optional.add_argument('--pathFile', help='Config file contains paths to data folder (in yaml format)', action='store', default='') - optional.add_argument('--searchTaxon', help='Search Taxon name', action='store', default='') + optional.add_argument('--searchTaxa', help='List of Taxa to search in', action='store', nargs="+", default=[]) optional.add_argument('--silent', help='Output will only be written into the log file', action='store_true', default=False) optional.add_argument('--debug', help='Stdout and Stderr from fdog.assembly and every used tool will be printed', action='store_true', default=False) - - + optional.add_argument('--force', help='Overwrite existing output files', action='store_true', default=False) + optional.add_argument('--append', help='Append the output to existing output files', action='store_true', default=False) + 
optional.add_argument('--parallel', help= 'The ortholog search of multiple species will be done in parallel', action='store_true', default=False) args = parser.parse_args() # required @@ -602,15 +783,15 @@ def main(): msaTool = args.msaTool matrix = args.scoringmatrix taxa = args.coreTaxa - if taxa == '': - taxa =[] - else: - taxa = taxa.split(",") fasoff = args.fasoff - searchTaxon = args.searchTaxon + searchTaxa = args.searchTaxa silent = args.silent debug = args.debug + force = args.force + append = args.append + parallel = args.parallel + # output modes if debug == True and silent == True: print("It's not possible to use booth modes, please restart and use --debug or --silent") return 1 @@ -637,23 +818,43 @@ def main(): dataPath = cfg['dataPath'] except: dataPath = 'config' + + if out == '': + out = os.getcwd() + else: + if out[-1] != "/": + out = out + "/" + check_path(out) + + if os.path.exists(out + '/' + group): + if append != True and force != True: + print("Output folder for group " + group + " exists already. 
Please choose --force or --append.") + sys.exit() + elif force == True: + shutil.rmtree(out + '/' + group, ignore_errors=True) + refBool = False + os.system('mkdir ' + out + '/' + group + ' >/dev/null 2>&1') + out = out + '/' + group + '/' + elif append == True: + out = out + '/' + group + '/' + refBool = True + else: + refBool = False # checks if sequences of reference species were already part of the extended.fa file + else: + os.system('mkdir ' + out + '/' + group + ' >/dev/null 2>&1') + out = out + '/' + group + '/' + refBool = False + if core_path == '': core_path = out + '/core_orthologs/' else: if not core_path.endswith('/'): core_path = core_path + '/' + check_path(core_path) if assemblyDir == '': assemblyDir = dataPath + '/assembly_dir/' - if out == '': - #print('test out \n') - out = os.getcwd() - os.system('mkdir ' + out + '/' + group + ' >/dev/null 2>&1') - out = out + '/' + group + '/' - else: - if out[-1] != "/": - out = out + "/" - + check_path(assemblyDir) try: f = open(out + "/fdog.log", "a+") @@ -668,194 +869,130 @@ def main(): else: sys.stdout = Logger(f) - # user input has to be checked here before fDOGassembly continues - assembly_names = os.listdir(assemblyDir) - - ########################## some variables ################################## - - refBool = False # checks if sequences of reference species were already part of the extended.fa file + ########################### other variables ################################ + if searchTaxa == []: + assembly_names = os.listdir(assemblyDir) + else: + assembly_names = os.listdir(assemblyDir) + for Taxon in searchTaxa: + if Taxon not in assembly_names: + print("Taxon %s is not in the assembly_dir" % Taxon) + sys.exit() + assembly_names = searchTaxa - ########### paths ########### + ################################# paths #################################### msa_path = core_path + "/" + group +"/"+ group + ".aln" + check_path(msa_path) hmm_path = core_path + "/" + group +"/hmm_dir/"+ group + ".hmm" + 
check_path(hmm_path) fasta_path = core_path + "/" + group +"/"+ group + ".fa" + check_path(fasta_path) consensus_path = out + "/tmp/" + group + ".con" profile_path = out + "/tmp/" + group + ".prfl" + tmp_folder = out + "/tmp" + + ########### is/are fDOG reference species part of ortholog group? ########## + + fdog_ref_species = check_ref_sepc(fdog_ref_species, fasta_path) ###################### create tmp folder ################################### cmd = 'mkdir ' + out + '/tmp' starting_subprocess(cmd, 'silent') - ######################## consensus sequence ################################ + print("Gene: " + group) + print("fDOG reference species: " + fdog_ref_species + " \n") + ######################## consensus sequence ################################ + group_computation_time_start = time.time() #make a majority-rule consensus sequence with the tool hmmemit from hmmer - print("Building a consensus sequence for gene " + group + " \n") + print("Building a consensus sequence") cmd = 'hmmemit -c -o' + consensus_path + ' ' + hmm_path starting_subprocess(cmd, mode) - print("consensus sequence is finished\n") + print("\t ...finished\n") ######################## block profile ##################################### - print("Building a block profile for gene " + group + " \n") + print("Building a block profile ...") cmd = 'msa2prfl.pl ' + msa_path + ' --setname=' + group + ' >' + profile_path starting_subprocess(cmd, 'silent') if int(os.path.getsize(profile_path)) > 0: - print("block profile is finished \n") + print("\t ...finished \n") else: print("Building block profiles failed. 
Using prepareAlign to convert alignment\n") new_path = core_path + group +"/"+ group + "_new.aln" - #print(cmd) cmd = 'prepareAlign < ' + msa_path + ' > ' + new_path starting_subprocess(cmd, mode) cmd = 'msa2prfl.pl ' + new_path + ' --setname=' + group + ' >' + profile_path - #print(cmd) starting_subprocess(cmd, 'silent') - print("block profile is finished \n") - - searchBool = False - - #################### fDOG assembly computation for all species ############# - for asName in assembly_names: - if searchBool == True: - break - if searchTaxon != '' and searchBool == False: - asName = searchTaxon - searchBool = True - - ################### path definitions ################################### - - cmd = 'mkdir ' + out + '/tmp/' + asName - starting_subprocess(cmd, 'silent') - tmp_path = out + "/tmp/" + asName + "/" - candidatesOutFile = tmp_path + group + ".candidates.fa" - if searchTaxon != '': - orthologsOutFile = out + "/" + group + "_" + asName + ".extended.fa" - fasOutFile = out + "/" + group + "_" + asName - mappingFile = tmp_path + group + "_" + asName + ".mapping.txt" - else: - orthologsOutFile = out + "/" + group + ".extended.fa" - fasOutFile = out + "/" + group - mappingFile = out + "/tmp/" + group + ".mapping.txt" - - print("Searching in species " + asName + "\n") - assembly_path = assemblyDir + "/" + asName + "/" + asName + ".fa" - db_path = assemblyDir + "/" + asName + "/blast_dir/" + asName + ".fa" - - ######################## tBLASTn ########################################### - #checks if data base exists already - db_check = searching_for_db(db_path) - if db_check == 0: - print("creating a blast data base \n") - cmd = 'makeblastdb -in ' + assembly_path + ' -dbtype nucl -parse_seqids -out ' + db_path - starting_subprocess(cmd, mode) - print("database is finished \n") - else: - print('blast data base exists already, continuing...') - - #makes a tBLASTn search against the new database - #codon table argument [-db_gencode int_value], table available 
ftp://ftp.ncbi.nih.gov/entrez/misc/data/gc.prt - print("tBLASTn search against data base") - cmd = 'tblastn -db ' + db_path + ' -query ' + consensus_path + ' -outfmt "6 sseqid sstart send evalue qstart qend " -evalue ' + str(evalue) + ' -out ' + tmp_path + '/blast_results.out' - starting_subprocess(cmd, mode) - print("tBLASTn search is finished") - - ################### search for candidate regions and extract seq ########### - # parse blast and filter for candiate regions - regions, number_regions = candidate_regions(average_intron_length, evalue, tmp_path) - - if regions == 0: - #no candidat region are available, no ortholog can be found - print("No candidate region found") - if refBool == True: - continue - else: - taxa = [fdog_ref_species] - reciprocal_sequences = 0 - else: - print(str(number_regions) + " candiate regions were found. Extracting sequences...") - extract_seq(regions, db_path, tmp_path, mode) - - ############### make Augustus PPX search ################################### - - print("starting augustus ppx \n") - augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, augustus_ref_species, asName, group, tmp_path, mode) - print("augustus is finished \n") - - ################# backward search to filter for orthologs################### - if int(os.path.getsize(candidatesOutFile)) <= 0: - print("No genes found at candidate regions\n") - if searchTaxon == '' and refBool == True: - continue - else: - reciprocal_sequences = 0 - taxa = [fdog_ref_species] - else: - reciprocal_sequences, taxa = backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, evalue, taxa, searchTool, checkCoorthologs, msaTool, matrix, dataPath, filter, tmp_path, mode) - - - ################## checking accepted genes for co-orthologs ################ - if reciprocal_sequences == 0: - if regions != 0: - print("No ortholog fulfilled the reciprocity criteria") - if searchTaxon == '' and refBool == True: - continue - else: - reciprocal_sequences = 0 - 
else: - reciprocal_sequences = coorthologs(reciprocal_sequences, tmp_path, candidatesOutFile, fasta_path, fdog_ref_species, msaTool, matrix) - - ################ add sequences to extended.fa in the output folder########## - - addSequences(reciprocal_sequences, candidatesOutFile, fasta_path, orthologsOutFile, group, taxa, refBool, tmp_path) - refBool = True - - ############### make Annotation with FAS ################################### - # if we want to search in only one Taxon - if searchTaxon != '' and fasoff == False: - fas = time.time() - print("Calculating FAS scores") - fas_seed_id = createFasInput(orthologsOutFile, mappingFile) - # bug in calcFAS when using --tsv, have to wait till it's fixed before I can use the option - cmd = 'mkdir ' + tmp_path + 'anno_dir' - starting_subprocess(cmd, 'silent') - cmd = 'calcFAS --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group + '_' + asName - starting_subprocess(cmd, 'silent') - clean_fas(fasOutFile + "_forward.domains", 'domains') - clean_fas(fasOutFile + "_reverse.domains", 'domains') - clean_fas(fasOutFile + ".phyloprofile", 'phyloprofile') - - - #if we searched in more than one Taxon and no ortholog was found + print(" \t ...finished \n") + + group_computation_time_end = time.time() + time_group = group_computation_time_end - group_computation_time_start + + ###################### ortholog search ##################################### + + ortholog_sequences = [] + time_ortholog_start = time.time() + if parallel == True: + ##################### parallel compuataion ############################# + calls = [] + cpus = mp.cpu_count() + pool = mp.Pool(cpus) + for asName in assembly_names: + calls.append([asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, 
msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs]) + + results = (pool.imap_unordered(ortholog_search, calls)) + pool.close() + pool.join() + for i in results: + ortholog_sequences.append(i) + else: + ###################### computation species per species ################ + for asName in assembly_names: + args = [asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs] + reciprocal_sequences, candidatesOutFile = ortholog_search(args) + ortholog_sequences.append([reciprocal_sequences, candidatesOutFile]) + + ################## preparing output ######################################## + orthologsOutFile = out + "/" + group + ".extended.fa" + time_ortholog_end = time.time() + time_ortholog = time_ortholog_end - time_ortholog_start + if taxa == []: + taxa = [fdog_ref_species] + if append == True: + addSeq(orthologsOutFile, ortholog_sequences) + else: + addRef(orthologsOutFile, fasta_path, taxa) + addSeq(orthologsOutFile, ortholog_sequences) + mappingFile = out + "/tmp/" + group + ".mapping.txt" - if refBool == False and searchTaxon == '': - print("No orthologs found. 
Exciting ...") - cleanup(tmp, tmp_path) - return 1 - #if we searched in more than one taxon - if fasoff == False and searchTaxon == '': + if fasoff == False: fas = time.time() - print("Calculating FAS scores") + print("Calculating FAS scores ...") + tmp_path = out + '/tmp/' fas_seed_id = createFasInput(orthologsOutFile, mappingFile) - # bug in calcFAS when using --tsv, have to wait till it's fixed before I can use the option - cmd = 'calcFAS --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group + cmd = 'fas.run --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --tsv --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group starting_subprocess(cmd, 'silent') clean_fas(out + group + "_forward.domains", 'domains') clean_fas(out + group + "_reverse.domains", 'domains') clean_fas(out + group + ".phyloprofile", 'phyloprofile') + print("\t ...finished \n") ################# remove tmp folder ######################################## - if searchTaxon != '': - cleanup(tmp, tmp_path) - else: - cleanup(tmp, out + "/tmp/") + end = time.time() + time_fas = end - fas + print("fDOG-Assembly finished completely in " + str(end-start) + "seconds.") + print("Group preparation: %s \t Ortholog search: %s \t FAS: %s \n" % (str(time_group), str(time_ortholog), str(time_fas))) + sys.stdout = sys.__stdout__ end = time.time() sys.stdout = sys.__stdout__ #print(group + "\t" + str(end-fas) + "\t" + str(end-start)) f.close() + cleanup(tmp, tmp_folder) if __name__ == '__main__': main() diff --git a/fdog/mergeAssemblyOutput.py b/fdog/mergeAssemblyOutput.py deleted file mode 100644 index 1606b1d..0000000 --- a/fdog/mergeAssemblyOutput.py +++ /dev/null @@ -1,124 +0,0 @@ -# -*- coding: utf-8 -*- - 
-####################################################################### -# Copyright (C) 2020 Vinh Tran -# -# This script is used to merge all output files (.extended.fa, .phyloprofile, -# _forward.domains, _reverse.domains) in a given directory into one file each. -# -# This script is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for -# more details -# -# Contact: hannah.muelbaier@stud.uni-frankfurt.de -# -####################################################################### - -import sys -import os -from os import listdir as ldir -import argparse -from pathlib import Path - -def main(): - version = '0.0.1' - parser = argparse.ArgumentParser(description='You are running fdog.mergeAssemblyOutput version ' + str(version) + '.') - parser.add_argument('-i','--input', help='Input directory, where all single output (.extended.fa, .phyloprofile, _forward.domains, _reverse.domains) can be found', - action='store', default='', required=True) - parser.add_argument('-o','--output', help='Output name', action='store', default='', required=True) - parser.add_argument('-c', '--cleanup', help='Deletes the merged output files from fDOG', action='store_true', default=False) - args = parser.parse_args() - - directory = args.input - out = args.output - cleanup = args.cleanup - if not os.path.exists(os.path.abspath(directory)): - sys.exit('%s not found' % directory) - else: - directory = os.path.abspath(directory) - - phyloprofile = None - set_phylo = set() - domains_0 = None - set_domains_f = set() - domains_1 = None - set_domains_r = set() - ex_fasta = None - set_fasta = set() - header_bool = False - for infile in ldir(directory): - if infile.endswith('.phyloprofile') and not infile == out + '.phyloprofile': - if not phyloprofile: - phyloprofile = open(out + '.phyloprofile', 'w') - 
phyloprofile.write('geneID\tncbiID\torthoID\tFAS_F\tFAS_B\n') - with open(directory + '/' + infile, 'r') as reader: - lines = reader.readlines() - for line in lines: - if line != 'geneID\tncbiID\torthoID\tFAS_F\tFAS_B\n' and line not in set_phylo: - phyloprofile.write(line) - if len(lines) > 1: - set_phylo = set(lines) - if cleanup == True: - os.remove(directory + '/' + infile) - elif infile.endswith('_forward.domains') and not infile == out + '_forward.domains': - if not domains_0: - domains_0 = open(out + '_forward.domains', 'w') - with open(directory + '/' + infile, 'r') as reader: - lines = reader.readlines() - for line in lines: - if line not in set_domains_f: - domains_0.write(line) - if len(lines) > 1: - set_domains_f = set(lines) - if cleanup == True: - os.remove(directory + '/' + infile) - elif infile.endswith('_reverse.domains') and not infile == out + '_reverse.domains': - if not domains_1: - domains_1 = open(out + '_reverse.domains', 'w') - with open(directory + '/' + infile, 'r') as reader: - lines = reader.readlines() - for line in lines: - if line not in set_domains_r: - domains_1.write(line) - if len(lines) > 1: - set_domains_r = set(lines) - if cleanup == True: - os.remove(directory + '/' + infile) - elif infile.endswith('.extended.fa') and not infile == out + '.extended.fa': - if not ex_fasta: - ex_fasta = open(out + '.extended.fa', 'w') - with open(directory + '/' + infile, 'r') as reader: - lines = reader.readlines() - header = set() - #print(set_fasta) - for line in lines: - if line[0] == ">": - header.add(line) - if line not in set_fasta: - ex_fasta.write(line) - header_bool = True - else: - header_bool = False - else: - if header_bool == True: - ex_fasta.write(line) - set_fasta = header - if cleanup == True: - os.remove(directory + '/' +infile) - elif infile.endswith('.tsv'): - os.remove(directory + '/' + infile) - - if phyloprofile: - phyloprofile.close() - if domains_0: - domains_0.close() - if domains_1: - domains_1.close() - if ex_fasta: 
- ex_fasta.close() - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/fdog/mergeOutput.py b/fdog/mergeOutput.py index 2628280..a6c13c2 100644 --- a/fdog/mergeOutput.py +++ b/fdog/mergeOutput.py @@ -20,14 +20,15 @@ import os from os import listdir as ldir import argparse -from pathlib import Path + def main(): version = '0.0.1' parser = argparse.ArgumentParser(description='You are running fdog.mergeOutput version ' + str(version) + '.') - parser.add_argument('-i','--input', help='Input directory, where all single output (.extended.fa, .phyloprofile, _forward.domains, _reverse.domains) can be found', + parser.add_argument('-i', '--input', + help='Input directory, where all single output (.extended.fa, .phyloprofile, _forward.domains, _reverse.domains) can be found', action='store', default='', required=True) - parser.add_argument('-o','--output', help='Output name', action='store', default='', required=True) + parser.add_argument('-o', '--output', help='Output name', action='store', default='', required=True) args = parser.parse_args() directory = args.input diff --git a/fdog/removefDog.py b/fdog/removefDog.py index 0ea27eb..7b705ea 100644 --- a/fdog/removefDog.py +++ b/fdog/removefDog.py @@ -19,9 +19,9 @@ import os import argparse import subprocess -from pathlib import Path import shutil + def query_yes_no(question, default='yes'): valid = {'yes': True, 'y': True, 'ye': True, 'no': False, 'n': False} @@ -44,6 +44,7 @@ def query_yes_no(question, default='yes'): sys.stdout.write('Please respond with "yes" or "no" ' '(or "y" or "n").\n') + def main(): version = '0.0.1' parser = argparse.ArgumentParser(description='You are running fdog.remove version ' + str(version) + '.') @@ -81,5 +82,6 @@ def main(): print('NOTE: fdog genome data are still available at %s.' 
% dataPath) + if __name__ == '__main__': main() diff --git a/fdog/runMulti.py b/fdog/runMulti.py index a696495..c19b0ff 100644 --- a/fdog/runMulti.py +++ b/fdog/runMulti.py @@ -48,7 +48,7 @@ def prepare(args, step): coreOnly, reuseCore, coreTaxa, coreStrict, CorecheckCoorthologsRef, coreRep, coreHitLimit, distDeviation, fasoff, countercheck, coreFilter, minScore, strict, checkCoorthologsRef, rbh, rep, ignoreDistance, lowComplexityFilter, evalBlast, evalHmmer, evalRelaxfac, hitLimit, autoLimit, scoreThreshold, scoreCutoff, aligner, local, glocal, searchTaxa, - cpu, hyperthread, debug, silent, assembly, assemblyFile, augustusRefSpec, avIntron, lengthExtension, searchTool, matrix) = args + cpu, hyperthread, checkOff, debug, silent) = args mute = False if step == 'core': @@ -69,9 +69,8 @@ def prepare(args, step): coreArgs = [coreOnly, reuseCore, coreTaxa, coreStrict, CorecheckCoorthologsRef, coreRep, coreHitLimit, distDeviation] fasArgs = [fasoff, countercheck, coreFilter, minScore] orthoArgs = [strict, checkCoorthologsRef, rbh, rep, ignoreDistance, lowComplexityFilter, evalBlast, evalHmmer, evalRelaxfac, hitLimit, autoLimit, scoreThreshold, scoreCutoff, aligner, local, glocal, searchTaxa] - otherArgs = [cpu, hyperthread, debug, True] - assemblyArgs = [assembly, assemblyFile, augustusRefSpec, avIntron, lengthExtension, searchTool, matrix] - return(basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, assemblyArgs, mute) + otherArgs = [cpu, hyperthread, checkOff, debug, True] + return(basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, mute) def getSeedName(seedFile): seqName = seedFile.split('.')[0] @@ -106,10 +105,9 @@ def compileCore(options, seeds, inFol, cpu, outpath): for seed in seeds: seqFile = [inFol + '/' + seed] seqName = getSeedName(seed) - if not os.path.exists('%s/core_orthologs/%s/hmm_dir/%s.hmm' % (outpath, seqName, seqName)): (basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, mute) = prepare(seqFile 
+ [seqName] + options, 'core') - coreCompilationJobs.append([basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, assemblyArgs, mute]) + coreCompilationJobs.append([basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, mute]) if len(coreCompilationJobs) > 0: pool = mp.Pool(cpu) coreOut = [] @@ -131,7 +129,7 @@ def searchOrtho(options, seeds, inFol, cpu, outpath): for seed in seeds: seqFile = [inFol + '/' + seed] seqName = getSeedName(seed) - (basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, assemblyArgs, mute) = prepare(seqFile + [seqName] + options, 'ortholog') + (basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, mute) = prepare(seqFile + [seqName] + options, 'ortholog') if mute == True: print(seed) else: @@ -178,7 +176,7 @@ def joinOutputs(outpath, jobName, seeds, keep, silent): def calcFAS (outpath, extendedFa, weightpath, cpu): print('Starting calculating FAS scores...') start = time.time() - fasCmd = 'fdogFAS -i %s -w %s --cores %s' % (extendedFa, weightpath, cpu) + fasCmd = 'fas.runFdogFas -i %s -w %s --cores %s --redo_anno' % (extendedFa, weightpath, cpu) try: subprocess.call([fasCmd], shell = True) end = time.time() @@ -191,7 +189,7 @@ def calcFAS (outpath, extendedFa, weightpath, cpu): sys.exit('Problem running\n%s' % (fasCmd)) def main(): - version = '0.0.33' + version = '0.0.45' parser = argparse.ArgumentParser(description='You are running fdogs.run version ' + str(version) + '.') parser.add_argument('--version', action='version', version=str(version)) required = parser.add_argument_group('Required arguments') @@ -289,17 +287,10 @@ def main(): choices=['mafft-linsi', 'muscle'], action='store', default='muscle') optional.add_argument('--cpu', help='Determine the number of threads to be run in parallel. Default: 4', action='store', default=4, type=int) optional.add_argument('--hyperthread', help='Set this flag to use hyper threading. 
Default: False', action='store_true', default=False) + optional.add_argument('--checkOff', help='Set this flag to turn of the initial checks. Default: False', action='store_true', default=False) optional.add_argument('--debug', help='Set this flag to obtain more detailed information about the programs actions', action='store_true', default=False) optional.add_argument('--silentOff', help='Show more output to terminal', action='store_true', default=False) - assembly_options = parser.add_argument_group('Assembly options') - assembly_options.add_argument('--assembly', help='Turn on support of assembly input files',action='store_true', default=False) - assembly_options.add_argument('--assemblyFile', help='Input file containing the assembly seqeunce', action='store', default='') - assembly_options.add_argument('--augustusRefSpec', help='augustus reference species', action='store', default='') - assembly_options.add_argument('--avIntron', help='average Intron length of the assembly species', action='store', default=5000, type=int) - assembly_options.add_argument('--lengthExtension', help='length extension of the candidate region', action='store', default=5000, type=int) - assembly_options.add_argument('--searchTool', help='Choose between BLAST or Diamond as a alignemnt search tool. DEFAULT: BLAST', choices=['blast', 'diamond'], action='store', default='blast') - assembly_options.add_argument('--scoringmatrix', help ='Choose a scoring matrix for the distance criteria used by the option --checkCoorthologsRef. 
DEFAULT: blosum62', choices=['identity', 'blastn', 'trans', 'benner6', 'benner22', 'benner74', 'blosum100', 'blosum30', 'blosum35', 'blosum40', 'blosum45', 'blosum50', 'blosum55', 'blosum60', 'blosum62', 'blosum65', 'blosum70', 'blosum75', 'blosum80', 'blosum85', 'blosum90', 'blosum95', 'feng', 'fitch', 'genetic', 'gonnet', 'grant', 'ident', 'johnson', 'levin', 'mclach', 'miyata', 'nwsgappep', 'pam120', 'pam180', 'pam250', 'pam30', 'pam300', 'pam60', 'pam90', 'rao', 'risler', 'structure'], action='store', default='blosum62') ### get arguments args = parser.parse_args() @@ -368,28 +359,20 @@ def main(): # others cpu = args.cpu hyperthread = args.hyperthread + checkOff = args.checkOff debug = args.debug silentOff = args.silentOff if silentOff == True: silent = False else: silent = True - - #fdog_goes_assembly arguments - assembly = args.assembly - assemblyFile = args.assemblyFile - augustusRefSpec = args.augustusRefSpec - avIntron = args.avIntron - lengthExtension = args.lengthExtension - searchTool = args.searchTool - matrix = args.scoringmatrix ### check fas if not fasoff: try: - fasVersion = subprocess.run(['calcFAS --version'], shell = True, capture_output = True, check = True) + fasVersion = subprocess.run(['fas.run --version'], shell = True, capture_output = True, check = True) except: - sys.exit('Problem with calcFAS! Please check https://github.com/BIONF/FAS or turn it off if not needed!') + sys.exit('Problem with FAS! 
Please check https://github.com/BIONF/FAS or turn it off if not needed!') ### delete output folder and files if needed if forceComplete: @@ -403,7 +386,10 @@ def main(): outfiles = os.listdir(outpath) for item in outfiles: if item.startswith(jobName): - os.remove(os.path.join(outpath, item)) + try: + os.remove(os.path.join(outpath, item)) + except: + shutil.rmtree(outpath+'/'+item) if item.startswith("runtime"): os.remove(os.path.join(outpath, item)) if os.path.exists(outpath + '/missing.txt'): @@ -465,7 +451,7 @@ def main(): coreOnly, reuseCore, coreTaxa, coreStrict, CorecheckCoorthologsRef, coreRep, coreHitLimit, distDeviation, fasoff, countercheck, coreFilter, minScore, strict, checkCoorthologsRef, rbh, rep, ignoreDistance, lowComplexityFilter, evalBlast, evalHmmer, evalRelaxfac, hitLimit, autoLimit, scoreThreshold, scoreCutoff, aligner, local, glocal, searchTaxa, - cpu, hyperthread, debug, silent, assembly, assemblyFile, augustusRefSpec, avIntron, lengthExtension, searchTool, matrix] + cpu, hyperthread, checkOff, debug, silent] ### START Path(outpath).mkdir(parents=True, exist_ok=True) diff --git a/fdog/runSingle.py b/fdog/runSingle.py index a0ded09..c4abb82 100644 --- a/fdog/runSingle.py +++ b/fdog/runSingle.py @@ -65,13 +65,13 @@ def getfdogInfo(fdogPath, infoType): exit('%s not found' % (fdogPath + '/bin/oneSeq.pl')) def runSingle(args): - (basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, assemblyArgs, mute) = args + (basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, mute) = args # basic command (fdogPath, seqFile, seqName, refspec, minDist, maxDist, coreOrth) = basicArgs cmd = 'perl %s/bin/oneSeq.pl -seqFile=%s -seqName=%s -refspec=%s' % (fdogPath, seqFile, seqName, refspec) # add paths - (outpath, hmmpath, blastpath, searchpath, weightpath, assemblypath) = pathArgs - cmd = cmd + ' -outpath=%s -hmmpath=%s -blastpath=%s -searchpath=%s -weightpath=%s -assemblypath=%s' % (outpath, hmmpath, blastpath, searchpath, 
weightpath, assemblypath) + (outpath, hmmpath, blastpath, searchpath, weightpath) = pathArgs + cmd = cmd + ' -outpath=%s -hmmpath=%s -blastpath=%s -searchpath=%s -weightpath=%s' % (outpath, hmmpath, blastpath, searchpath, weightpath) # add other I/O options (append, force, noCleanup, group, blast, db) = ioArgs if append == True: @@ -153,36 +153,17 @@ def runSingle(args): if minScore > 0: cmd = cmd + ' -coreFilter=%s -minScore=%s' % (coreFilter, minScore) # add other options - (cpu, hyperthread, debug, silent) = otherArgs + (cpu, hyperthread, checkOff, debug, silent) = otherArgs cmd = cmd + ' -cpu=%s' % cpu if hyperthread == True: cmd = cmd + ' -hyperthread' + if checkOff == True: + cmd = cmd + ' -checkOff' if debug == True: cmd = cmd + ' -debug' if silent == True: cmd = cmd + ' -silent' - # add assembly options - (assembly, assemblyFile, augustusRefSpec, avIntron, lengthExtension, searchTool, matrix, dataPath) = assemblyArgs - if assembly == True: - cmd = cmd + ' -assembly' - cmd = cmd + ' -reuseCore' - if not augustusRefSpec == '': - cmd = cmd + ' -augustusRefSpec=%s' % augustusRefSpec - else: - sys.exit('An augutus reference species is requiered by using the option --assembly') - if not avIntron == '': - cmd = cmd + ' -avIntron=%s' % avIntron - if not lengthExtension == '': - cmd = cmd + ' -lengthExtension=%s' % lengthExtension - if not assemblyFile == '': - cmd = cmd + ' -assemblyFile=%s' % assemblyFile - if not searchTool == '': - cmd = cmd + ' -searchTool=%s' % searchTool - if not matrix == '': - cmd = cmd + ' -scoringmatrix=%s' % matrix - if not dataPath == '': - cmd = cmd + ' -dataPath=%s' % dataPath - #print(cmd) + # print(cmd) if mute == True: cmd = cmd + ' > /dev/null 2>&1' try: @@ -191,7 +172,7 @@ def runSingle(args): sys.exit('Problem running\n%s' % (cmd)) def main(): - version = '0.0.33' + version = '0.0.45' parser = argparse.ArgumentParser(description='You are running fdog.run version ' + str(version) + '.') parser.add_argument('--version', 
action='version', version=str(version)) required = parser.add_argument_group('Required arguments') @@ -209,8 +190,6 @@ def main(): optional_paths.add_argument('--searchpath', help='Path for the search taxa directory', action='store', default='') optional_paths.add_argument('--weightpath', help='Path for the pre-calculated feature annotion directory', action='store', default='') optional_paths.add_argument('--pathFile', help='Config file contains paths to data folder (in yaml format)', action='store', default='') - optional_paths.add_argument('--assemblypath', help='Path for the assembly directory', action='store', default='') - addtionalIO = parser.add_argument_group('Other I/O options') addtionalIO.add_argument('--append', help='Append the output to existing output files', action='store_true', default=False) @@ -289,17 +268,10 @@ def main(): choices=['mafft-linsi', 'muscle'], action='store', default='muscle') optional.add_argument('--cpu', help='Determine the number of threads to be run in parallel. Default: 4', action='store', default=4, type=int) optional.add_argument('--hyperthread', help='Set this flag to use hyper threading. Default: False', action='store_true', default=False) + optional.add_argument('--checkOff', help='Set this flag to turn of the initial checks. 
Default: False', action='store_true', default=False) optional.add_argument('--debug', help='Set this flag to obtain more detailed information about the programs actions', action='store_true', default=False) optional.add_argument('--silentOff', help='Show more output to terminal', action='store_true', default=False) - assembly_options = parser.add_argument_group('Assembly options') - assembly_options.add_argument('--assembly', help='Turn on support of assembly input files',action='store_true', default=False) - assembly_options.add_argument('--assemblyFile', help='Input file containing the assembly seqeunce', action='store', default='') - assembly_options.add_argument('--augustusRefSpec', help='augustus reference species', action='store', default='') - assembly_options.add_argument('--avIntron', help='average Intron length of the assembly species', action='store', default=5000, type=int) - assembly_options.add_argument('--lengthExtension', help='length extension of the candidate region', action='store', default=5000, type=int) - assembly_options.add_argument('--searchTool', help='Choose between BLAST or Diamond as a alignemnt search tool. DEFAULT: BLAST', choices=['blast', 'diamond'], action='store', default='blast') - assembly_options.add_argument('--scoringmatrix', help ='Choose a scoring matrix for the distance criteria used by the option --checkCoorthologsRef. 
DEFAULT: blosum62', choices=['identity', 'blastn', 'trans', 'benner6', 'benner22', 'benner74', 'blosum100', 'blosum30', 'blosum35', 'blosum40', 'blosum45', 'blosum50', 'blosum55', 'blosum60', 'blosum62', 'blosum65', 'blosum70', 'blosum75', 'blosum80', 'blosum85', 'blosum90', 'blosum95', 'feng', 'fitch', 'genetic', 'gonnet', 'grant', 'ident', 'johnson', 'levin', 'mclach', 'miyata', 'nwsgappep', 'pam120', 'pam180', 'pam250', 'pam30', 'pam300', 'pam60', 'pam90', 'rao', 'risler', 'structure'], action='store', default='blosum62') ### get arguments args = parser.parse_args() @@ -319,7 +291,6 @@ def main(): searchpath = args.searchpath weightpath = args.weightpath pathFile = args.pathFile - assemblypath = args.assemblypath # other I/O arguments append = args.append @@ -367,6 +338,7 @@ def main(): # others cpu = args.cpu hyperthread = args.hyperthread + checkOff = args.checkOff debug = args.debug silentOff = args.silentOff if silentOff == True: @@ -374,15 +346,6 @@ def main(): else: silent = True - #fdog_goes_assembly arguments - assembly = args.assembly - assemblyFile = args.assemblyFile - augustusRefSpec = args.augustusRefSpec - avIntron = args.avIntron - lengthExtension = args.lengthExtension - searchTool = args.searchTool - matrix = args.scoringmatrix - ### get fdog and data path dataPath = '' fdogPath = os.path.realpath(__file__).replace('/runSingle.py','') @@ -430,30 +393,19 @@ def main(): except: sys.exit('weightpath not found in %s' % pathFile) - if assemblypath == '': - assemblypath = dataPath + '/assembly_dir' - if dataPath == 'config': - try: - assemblypath = cfg['assemblypath'] - except: - sys.exit('assemblypath not found in %s' % pathFile) - if assembly == True: - searchpath = assemblypath - ### check input arguments seqFile, hmmpath, blastpath, searchpath, weightpath = checkInput([fdogPath, seqFile, refspec, outpath, hmmpath, blastpath, searchpath, weightpath]) # group arguments basicArgs = [fdogPath, seqFile, seqName, refspec, minDist, maxDist, coreOrth] 
ioArgs = [append, force, noCleanup, group, blast, db] - pathArgs = [outpath, hmmpath, blastpath, searchpath, weightpath, assemblypath] + pathArgs = [outpath, hmmpath, blastpath, searchpath, weightpath] coreArgs = [coreOnly, reuseCore, coreTaxa, coreStrict, CorecheckCoorthologsRef, coreRep, coreHitLimit, distDeviation] fasArgs = [fasoff, countercheck, coreFilter, minScore] orthoArgs = [strict, checkCoorthologsRef, rbh, rep, ignoreDistance, lowComplexityFilter, evalBlast, evalHmmer, evalRelaxfac, hitLimit, autoLimit, scoreThreshold, scoreCutoff, aligner, local, glocal, searchTaxa] - otherArgs = [cpu, hyperthread, debug, silent] - assemblyArgs = [assembly, assemblyFile, augustusRefSpec, avIntron, lengthExtension, searchTool, matrix, dataPath] + otherArgs = [cpu, hyperthread, checkOff, debug, silent] ### run fdog - runSingle([basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, assemblyArgs, False]) + runSingle([basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, False]) if __name__ == '__main__': main() diff --git a/fdog/setup/install_lib.sh b/fdog/setup/install_lib.sh index 2e8ff02..1eaf176 100755 --- a/fdog/setup/install_lib.sh +++ b/fdog/setup/install_lib.sh @@ -154,9 +154,6 @@ perlModules=( List::Util Parallel::ForkManager POSIX - XML::SAX - XML::NamespaceSupport - XML::Parser Getopt::Long IO::Handle IPC::Run diff --git a/fdog/setup/setup.sh b/fdog/setup/setup.sh index 1f74552..28eb851 100755 --- a/fdog/setup/setup.sh +++ b/fdog/setup/setup.sh @@ -114,14 +114,20 @@ echo "Downloading and installing annotation tools/databases:" fasta36="yes" if [ -z "$(which fasta36)" ]; then fasta36="no" - fasta36v="fasta-36.3.8h" + # fasta36v="fasta-36.3.8h" + fasta36v="36.3.8h_04-May-2020" if ! 
[ -f "bin/aligner/bin/fasta36" ]; then - echo "fasta-36" - wget "http://faculty.virginia.edu/wrpearson/fasta/fasta36/${fasta36v}.tar.gz" - tar xf $fasta36v.tar.gz - rm "${fasta36v}.tar.gz" - mv $fasta36v/* $CURRENT/bin/aligner/ - rm -rf $fasta36v + echo "fasta36" + # wget "http://faculty.virginia.edu/wrpearson/fasta/fasta36/${fasta36v}.tar.gz" + # tar xf $fasta36v.tar.gz + # rm "${fasta36v}.tar.gz" + # mv $fasta36v/* $CURRENT/bin/aligner/ + # rm -rf $fasta36v + wget "https://github.com/wrpearson/fasta36/archive/refs/tags/v${fasta36v}.tar.gz" + tar xf "v${fasta36v}.tar.gz" + rm "v${fasta36v}.tar.gz" + mv fasta36-${fasta36v}/* $CURRENT/bin/aligner/ + rm -rf "fasta36-${fasta36v}" cd "$CURRENT/bin/aligner/src" if [ $sys=="Linux" ]; then make -f ../make/Makefile.linux64_sse2 all @@ -162,10 +168,10 @@ if ! [ -f "$CURRENT/taxonomy/nodes" ]; then exit fi -fasPrepare=0 +setupFAS=0 if [ $fas == 1 ]; then cd "$CURRENT/bin" - if [ -z "$(which annoFAS)" ]; then + if [ -z "$(which fas.doAnno)" ]; then echo "FAS" pip install --user greedyFAS if [ -z "$($grepprog \$HOME/.local/bin:\$PATH ~/$bashFile)" ]; then @@ -174,22 +180,22 @@ if [ $fas == 1 ]; then if [ -z "$($grepprog $homedir/.local/bin ~/$rprofile)" ]; then echo "Sys.setenv(PATH = paste(\"$homedir/.local/bin\", Sys.getenv(\"PATH\"), sep=\":\"))" >> ~/$rprofile fi - fasPrepare=1 + setupFAS=1 else - if ! [ -z "$(prepareFAS -t ./ --check 2>&1 | grep ERROR)" ]; then - fasPrepare=1 + if ! [ -z "$(fas.setup -t ./ --check 2>&1 | grep ERROR)" ]; then + setupFAS=1 fi fi cd $CURRENT source ~/$bashFile - if [ -z "$(which annoFAS)" ]; then + if [ -z "$(which fas.doAnno)" ]; then echo -e "Installation of FAS failed! Please try again or install FAS by yourself using \e[91mpip install greedyFAS\e[0m!" echo -e "For more info, please check FAS website at \e[91mhttps://github.com/BIONF/FAS\e[0m" exit else - if ! [ -z "$(prepareFAS -t ./ --check 2>&1 | grep ERROR)" ]; then - fasPrepare=1 + if ! 
[ -z "$(fas.setup -t ./ --check 2>&1 | grep ERROR)" ]; then + setupFAS=1 fi fi echo "done!" @@ -346,9 +352,6 @@ perlModules=( List::Util Parallel::ForkManager POSIX - XML::SAX - XML::NamespaceSupport - XML::Parser Getopt::Long IO::Handle IPC::Run @@ -409,9 +412,9 @@ else echo "-------------------------------------" $sedprog -i -e 's/my $configure = .*/my $configure = 1;/' $CURRENT/bin/hamstr.pl $sedprog -i -e 's/my $configure = .*/my $configure = 1;/' $CURRENT/bin/oneSeq.pl - if [ "$fasPrepare" == 1 ]; then + if [ "$setupFAS" == 1 ]; then echo "All tests succeeded." - echo -e "\e[91mPLEASE RUN\e[0m \e[96mprepareFAS\e[0m \e[91mTO CONFIGURE FAS BEFORE USING fdog!\e[0m" + echo -e "\e[91mPLEASE RUN\e[0m \e[96mfas.setup\e[0m \e[91mTO CONFIGURE FAS BEFORE USING fdog!\e[0m" echo "Then you can test fdog with:" else echo "All tests succeeded, fdog should be ready to run. You can test it with:" diff --git a/fdog/setup/setup_conda.sh b/fdog/setup/setup_conda.sh index 7b4bd08..73b8573 100755 --- a/fdog/setup/setup_conda.sh +++ b/fdog/setup/setup_conda.sh @@ -163,9 +163,6 @@ perlModules=( List::Util Parallel::ForkManager POSIX - XML::SAX - XML::NamespaceSupport - XML::Parser Getopt::Long IO::Handle IPC::Run @@ -230,28 +227,28 @@ if ! [ -f "$CURRENT/taxonomy/nodes" ]; then fi cd "$CURRENT/bin" -fasPrepare=0 -if [ -z "$(which annoFAS)" ]; then +setupFAS=0 +if [ -z "$(which fas.doAnno)" ]; then echo "FAS" conda install -y -c BIONF fas - if [ -z "$(which annoFAS)" ]; then + if [ -z "$(which fas.doAnno)" ]; then echo -e "\e[31mInstallation of FAS failed! Please try again!\e[0m" exit fi - fasPrepare=1 + setupFAS=1 else - if ! [ -z "$(prepareFAS -t ./ --check 2>&1 | grep ERROR)" ]; then - fasPrepare=1 + if ! [ -z "$(fas.setup -t ./ --check 2>&1 | grep ERROR)" ]; then + setupFAS=1 fi fi -if [ -z "$(which annoFAS)" ]; then +if [ -z "$(which fas.doAnno)" ]; then echo -e "Installation of FAS failed! 
Please try again or install FAS by yourself using \e[91mconda install -c BIONF fas\e[0m or \e[91mpip install greedyFAS\e[0m" echo -e "For more info, please check FAS website at \e[91mhttps://github.com/BIONF/FAS\e[0m" exit else - if ! [ -z "$(prepareFAS -t ./ --check 2>&1 | grep ERROR)" ]; then - fasPrepare=1 + if ! [ -z "$(fas.setup -t ./ --check 2>&1 | grep ERROR)" ]; then + setupFAS=1 fi fi cd $CURRENT @@ -435,9 +432,9 @@ else echo "-------------------------------------" $sedprog -i -e 's/my $configure = .*/my $configure = 1;/' $CURRENT/bin/hamstr.pl $sedprog -i -e 's/my $configure = .*/my $configure = 1;/' $CURRENT/bin/oneSeq.pl - if [ "$fasPrepare" == 1 ]; then + if [ "$setupFAS" == 1 ]; then echo "All tests succeeded." - echo -e "\e[91mPLEASE RUN\e[0m \e[96msetupFAS\e[0m \e[91mTO CONFIGURE FAS BEFORE USING fdog!\e[0m" + echo -e "\e[91mPLEASE RUN\e[0m \e[96mfas.setup\e[0m \e[91mTO CONFIGURE FAS BEFORE USING fdog!\e[0m" echo "Then you can test fdog with:" else echo "All tests succeeded, fdog should be ready to run. 
You can test it with:" diff --git a/setup.py b/setup.py index 75573c1..b61e66b 100644 --- a/setup.py +++ b/setup.py @@ -26,7 +26,7 @@ setup( name="fdog", - version="0.0.33", + version="0.0.45", python_requires='>=3.7.0', description="Feature-aware Directed OrtholoG search tool", @@ -43,7 +43,7 @@ 'ete3', 'six', 'PyYAML', - 'greedyFAS>=1.5.0' + 'greedyFAS>=1.11.2' ], entry_points={ 'console_scripts': ["fdog.run = fdog.runSingle:main", From 6d7df01742ec284f9df85a4f38b5ae06a4bb1a89 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Tue, 19 Oct 2021 11:34:59 +0200 Subject: [PATCH 132/229] updated help function --- fdog/fDOGassembly.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index e40701b..4733b4b 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -769,7 +769,7 @@ def main(): tmp = args.tmp strict = args.strict checkCoorthologs = args.checkCoorthologsRef - filter = args.filter + # if filter == True or filter == 'yes': filter = 'yes' else: From ac2652b3162e8fc6d7af94ed6bb0ccea0b10053d Mon Sep 17 00:00:00 2001 From: mueli94 Date: Tue, 19 Oct 2021 11:40:22 +0200 Subject: [PATCH 133/229] updated help function --- fdog/fDOGassembly.py | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 207b50f..27a36c2 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -1,7 +1,6 @@ # -*- coding: utf-8 -*- ####################################################################### - # Copyright (C) 2021 Hannah Muelbaier # # This script is used to run fDOG-Assembly which performs targeted ortholog @@ -555,7 +554,6 @@ def cleanup(tmp, tmp_path): print("tmp folder could not be removed!") break - def coorthologs(candidate_names, tmp_path, candidatesFile, fasta, fdog_ref_species, msaTool, matrix): if len(candidate_names) == 1: return candidate_names @@ -719,6 +717,7 @@ def main(): #################### handle user input 
##################################### start = time.time() + version = '0.1.2' ################### initialize parser ###################################### parser = argparse.ArgumentParser(description='You are running fdog.assembly version ' + str(version) + '.') @@ -770,11 +769,11 @@ def main(): tmp = args.tmp strict = args.strict checkCoorthologs = args.checkCoorthologsRef - # - if filter == True or filter == 'yes': - filter = 'yes' - else: - filter = 'no' + #filter = args.filter + #if filter == True or filter == 'yes': + #filter = 'yes' + #else: + #filter = 'no' #others average_intron_length = args.avIntron length_extension = args.lengthExtension @@ -972,7 +971,6 @@ def main(): if fasoff == False: fas = time.time() print("Calculating FAS scores ...") - tmp_path = out + '/tmp/' fas_seed_id = createFasInput(orthologsOutFile, mappingFile) cmd = 'fas.run --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --tsv --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group @@ -988,9 +986,6 @@ def main(): print("Group preparation: %s \t Ortholog search: %s \t FAS: %s \n" % (str(time_group), str(time_ortholog), str(time_fas))) sys.stdout = sys.__stdout__ - end = time.time() - sys.stdout = sys.__stdout__ - #print(group + "\t" + str(end-fas) + "\t" + str(end-start)) f.close() cleanup(tmp, tmp_folder) From 688b21e79318679690e1d88bc0e242c169be4da6 Mon Sep 17 00:00:00 2001 From: Hannah Muelbaier <47216555+mueli94@users.noreply.github.com> Date: Tue, 19 Oct 2021 11:52:46 +0200 Subject: [PATCH 134/229] rm filter option --- fdog/fDOGassembly.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 12fcf6f..f7f5e05 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -770,11 +770,11 @@ def main(): tmp = args.tmp strict = args.strict checkCoorthologs = args.checkCoorthologsRef - filter 
= args.filter - if filter == True or filter == 'yes': - filter = 'yes' - else: - filter = 'no' + #filter = args.filter + #if filter == True or filter == 'yes': + #filter = 'yes' + #else: + #filter = 'no' #others average_intron_length = args.avIntron length_extension = args.lengthExtension From 075616852382405d6c922fde0677fdc210ca37fc Mon Sep 17 00:00:00 2001 From: mueli94 Date: Tue, 19 Oct 2021 14:32:47 +0200 Subject: [PATCH 135/229] error handling of ValueError in function get_distance_biopython --- fdog/fDOGassembly.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 27a36c2..d216048 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -317,7 +317,7 @@ def checkCoOrthologs(candidate_name, best_hit, ref, fdog_ref_species, candidates try: distances = get_distance_biopython(aln_file, matrix) - except ValueError: + except get_distance_biopython.ValueError: print("Failure in distance computation, Candidate %s will be rejected" % candidate_name) return 0, "NaN", "NaN" From f9d4623faa9817bb3f56672c29cf40df47110bce Mon Sep 17 00:00:00 2001 From: mueli94 Date: Tue, 19 Oct 2021 15:36:51 +0200 Subject: [PATCH 136/229] test --- fdog/fDOGassembly.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index d216048..adc48b2 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -317,6 +317,7 @@ def checkCoOrthologs(candidate_name, best_hit, ref, fdog_ref_species, candidates try: distances = get_distance_biopython(aln_file, matrix) + print(distances) except get_distance_biopython.ValueError: print("Failure in distance computation, Candidate %s will be rejected" % candidate_name) return 0, "NaN", "NaN" @@ -658,6 +659,7 @@ def ortholog_search(args): sys.stdout.write("The tblastn search takes too long for species %s. Exciting ..." 
% asName) #cleanup(tmp, tmp_folder) #sys.exit() + sys.stdout.flush() return [], candidatesOutFile #else: #print("\t ...finished") @@ -667,6 +669,7 @@ def ortholog_search(args): if regions == 0: #no candidat region are available, no ortholog can be found sys.stdout.write("No candidate region found for species %s!\n" % asName) + sys.stdout.flush() return [], candidatesOutFile else: @@ -685,6 +688,7 @@ def ortholog_search(args): ################# backward search to filter for orthologs################### if int(os.path.getsize(candidatesOutFile)) <= 0: #print("No genes found at candidate regions\n") + sys.stdout.flush() return [], candidatesOutFile reciprocal_sequences, taxa = backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, evalue, taxa, searchTool, checkCoorthologs, msaTool, matrix, dataPath, filter, tmp_path, mode) @@ -692,10 +696,12 @@ def ortholog_search(args): if reciprocal_sequences == 0: if regions != 0: sys.stdout.write("No ortholog fulfilled the reciprocity criteria for species %s.\n" % asName) + sys.stdout.flush() return [], candidatesOutFile else: reciprocal_sequences = coorthologs(reciprocal_sequences, tmp_path, candidatesOutFile, fasta_path, fdog_ref_species, msaTool, matrix) + sys.stdout.flush() return reciprocal_sequences, candidatesOutFile class Logger(object): From 134f94d830803c708b989d339201501ecad8ab39 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Tue, 19 Oct 2021 15:46:02 +0200 Subject: [PATCH 137/229] test --- fdog/fDOGassembly.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index adc48b2..d6877e2 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -317,14 +317,16 @@ def checkCoOrthologs(candidate_name, best_hit, ref, fdog_ref_species, candidates try: distances = get_distance_biopython(aln_file, matrix) - print(distances) + distance_hit_query = distances[best_hit, candidate_name] + distance_ref_hit = distances[best_hit, ref] + 
#print(distances) except get_distance_biopython.ValueError: print("Failure in distance computation, Candidate %s will be rejected" % candidate_name) return 0, "NaN", "NaN" - distance_hit_query = distances[best_hit, candidate_name] - distance_ref_hit = distances[best_hit, ref] + #distance_hit_query = distances[best_hit, candidate_name] + #distance_ref_hit = distances[best_hit, ref] if distance_ref_hit < distance_hit_query: #accepted From 81af9add957ca8ec3eb0257a8d9d0b2e452ab2e9 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Tue, 19 Oct 2021 15:50:21 +0200 Subject: [PATCH 138/229] test --- fdog/fDOGassembly.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index d6877e2..111baf7 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -320,7 +320,7 @@ def checkCoOrthologs(candidate_name, best_hit, ref, fdog_ref_species, candidates distance_hit_query = distances[best_hit, candidate_name] distance_ref_hit = distances[best_hit, ref] #print(distances) - except get_distance_biopython.ValueError: + except ValueError: print("Failure in distance computation, Candidate %s will be rejected" % candidate_name) return 0, "NaN", "NaN" From 1c54841813a862987790ef7940d40dccbc8a9642 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Tue, 19 Oct 2021 15:51:25 +0200 Subject: [PATCH 139/229] test --- fdog/fDOGassembly.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 111baf7..4bd9938 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -317,6 +317,9 @@ def checkCoOrthologs(candidate_name, best_hit, ref, fdog_ref_species, candidates try: distances = get_distance_biopython(aln_file, matrix) + print(distances) + print(best_hit) + print(candidate_name) distance_hit_query = distances[best_hit, candidate_name] distance_ref_hit = distances[best_hit, ref] #print(distances) From 8eb12a52ca85a97a1174028ba0c9018a70459dba Mon Sep 17 00:00:00 2001 From: mueli94 Date: Tue, 
19 Oct 2021 15:58:14 +0200 Subject: [PATCH 140/229] fixed item not found error in distance function --- fdog/fDOGassembly.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 4bd9938..111baf7 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -317,9 +317,6 @@ def checkCoOrthologs(candidate_name, best_hit, ref, fdog_ref_species, candidates try: distances = get_distance_biopython(aln_file, matrix) - print(distances) - print(best_hit) - print(candidate_name) distance_hit_query = distances[best_hit, candidate_name] distance_ref_hit = distances[best_hit, ref] #print(distances) From 326ff4259b578d479f980914d1be0bc95d8290b7 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Wed, 20 Oct 2021 10:25:11 +0200 Subject: [PATCH 141/229] cleaning up output --- fdog/fDOGassembly.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 111baf7..36db8a3 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -244,7 +244,8 @@ def augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, aug output.write(line) sequence_file.close() except FileNotFoundError: - print("No gene found in region with ID" + name + " in species " + ass_name + " , continuing with next region") + pass + #print("No gene found in region with ID" + name + " in species " + ass_name + " , continuing with next region") output.close() def searching_for_db(assembly_path): @@ -321,7 +322,7 @@ def checkCoOrthologs(candidate_name, best_hit, ref, fdog_ref_species, candidates distance_ref_hit = distances[best_hit, ref] #print(distances) except ValueError: - print("Failure in distance computation, Candidate %s will be rejected" % candidate_name) + #print("Failure in distance computation, Candidate %s will be rejected" % candidate_name) return 0, "NaN", "NaN" From 97750b6f1fd010dc5998a7e1636a0663f7bfdcd8 Mon Sep 17 00:00:00 2001 From: Hannah Muelbaier 
<47216555+mueli94@users.noreply.github.com> Date: Wed, 20 Oct 2021 11:39:36 +0200 Subject: [PATCH 142/229] Fdog goes assembly (#13) --- fdog/fDOGassembly.py | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index f7f5e05..8aeec9b 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -2,6 +2,7 @@ ####################################################################### + # Copyright (C) 2021 Hannah Muelbaier # # This script is used to run fDOG-Assembly which performs targeted ortholog @@ -245,7 +246,9 @@ def augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, aug output.write(line) sequence_file.close() except FileNotFoundError: - print("No gene found in region with ID" + name + " in species " + ass_name + " , continuing with next region") + pass + #print("No gene found in region with ID" + name + " in species " + ass_name + " , continuing with next region") + output.close() def searching_for_db(assembly_path): @@ -318,13 +321,16 @@ def checkCoOrthologs(candidate_name, best_hit, ref, fdog_ref_species, candidates try: distances = get_distance_biopython(aln_file, matrix) + distance_hit_query = distances[best_hit, candidate_name] + distance_ref_hit = distances[best_hit, ref] + #print(distances) except ValueError: - print("Failure in distance computation, Candidate %s will be rejected" % candidate_name) - return 0, "NaN", "NaN" + #print("Failure in distance computation, Candidate %s will be rejected" % candidate_name) - distance_hit_query = distances[best_hit, candidate_name] - distance_ref_hit = distances[best_hit, ref] + + #distance_hit_query = distances[best_hit, candidate_name] + #distance_ref_hit = distances[best_hit, ref] if distance_ref_hit < distance_hit_query: #accepted @@ -660,6 +666,8 @@ def ortholog_search(args): sys.stdout.write("The tblastn search takes too long for species %s. Exciting ..." 
% asName) #cleanup(tmp, tmp_folder) #sys.exit() + sys.stdout.flush() + return [], candidatesOutFile #else: #print("\t ...finished") @@ -669,6 +677,8 @@ def ortholog_search(args): if regions == 0: #no candidat region are available, no ortholog can be found sys.stdout.write("No candidate region found for species %s!\n" % asName) + sys.stdout.flush() + return [], candidatesOutFile else: @@ -687,6 +697,7 @@ def ortholog_search(args): ################# backward search to filter for orthologs################### if int(os.path.getsize(candidatesOutFile)) <= 0: #print("No genes found at candidate regions\n") + sys.stdout.flush() return [], candidatesOutFile reciprocal_sequences, taxa = backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, evalue, taxa, searchTool, checkCoorthologs, msaTool, matrix, dataPath, filter, tmp_path, mode) @@ -694,10 +705,12 @@ def ortholog_search(args): if reciprocal_sequences == 0: if regions != 0: sys.stdout.write("No ortholog fulfilled the reciprocity criteria for species %s.\n" % asName) + sys.stdout.flush() return [], candidatesOutFile else: reciprocal_sequences = coorthologs(reciprocal_sequences, tmp_path, candidatesOutFile, fasta_path, fdog_ref_species, msaTool, matrix) + sys.stdout.flush() return reciprocal_sequences, candidatesOutFile class Logger(object): @@ -988,9 +1001,6 @@ def main(): print("Group preparation: %s \t Ortholog search: %s \t FAS: %s \n" % (str(time_group), str(time_ortholog), str(time_fas))) sys.stdout = sys.__stdout__ - end = time.time() - sys.stdout = sys.__stdout__ - #print(group + "\t" + str(end-fas) + "\t" + str(end-start)) f.close() cleanup(tmp, tmp_folder) From a7f9e19097922f3c69921c4ed17199ae1ba83bc8 Mon Sep 17 00:00:00 2001 From: Hannah Muelbaier <47216555+mueli94@users.noreply.github.com> Date: Wed, 20 Oct 2021 12:04:27 +0200 Subject: [PATCH 143/229] bug fix in function checkCoOrthologs --- fdog/fDOGassembly.py | 1 + 1 file changed, 1 insertion(+) diff --git a/fdog/fDOGassembly.py 
b/fdog/fDOGassembly.py index 8aeec9b..9b745db 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -325,6 +325,7 @@ def checkCoOrthologs(candidate_name, best_hit, ref, fdog_ref_species, candidates distance_ref_hit = distances[best_hit, ref] #print(distances) except ValueError: + pass #print("Failure in distance computation, Candidate %s will be rejected" % candidate_name) From 7b8745b8d1da86606a51d779580a68009927f91c Mon Sep 17 00:00:00 2001 From: Hannah Muelbaier <47216555+mueli94@users.noreply.github.com> Date: Wed, 20 Oct 2021 12:20:45 +0200 Subject: [PATCH 144/229] bug fix --- fdog/fDOGassembly.py | 1 + 1 file changed, 1 insertion(+) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 9b745db..10f7aeb 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -327,6 +327,7 @@ def checkCoOrthologs(candidate_name, best_hit, ref, fdog_ref_species, candidates except ValueError: pass #print("Failure in distance computation, Candidate %s will be rejected" % candidate_name) + return 0, "NaN", "NaN" From c21a3f5b6ffe29c5beeb21b6a992dea15a4d02f7 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Fri, 22 Oct 2021 11:28:00 +0200 Subject: [PATCH 145/229] enabled output during parallel computation --- fdog/fDOGassembly.py | 38 +++++++++++++++++--------------------- 1 file changed, 17 insertions(+), 21 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 36db8a3..760e6d0 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -631,6 +631,7 @@ def clean_fas(path, file_type): def ortholog_search(args): (asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs) = args + output = [] cmd = 'mkdir ' + out + '/tmp/' + asName starting_subprocess(cmd, 'silent') tmp_path = out + "tmp/" + asName + "/" @@ -639,7 +640,7 @@ def ortholog_search(args): 
fasOutFile = out + "/" + group #mappingFile = out + "/tmp/" + group + ".mapping.txt" - sys.stdout.write("Searching in species " + asName + "\n") + output.append("Searching in species " + asName + "\n") assembly_path = assemblyDir + "/" + asName + "/" + asName + ".fa" db_path = assemblyDir + "/" + asName + "/blast_dir/" + asName + ".fa" db_check = searching_for_db(db_path) @@ -659,24 +660,20 @@ def ortholog_search(args): time_tblastn_end = time.time() time_tblastn = time_tblastn_end - time_tblastn_start if exit_code == 1: - sys.stdout.write("The tblastn search takes too long for species %s. Exciting ..." % asName) - #cleanup(tmp, tmp_folder) - #sys.exit() - sys.stdout.flush() - return [], candidatesOutFile + output.append("The tblastn search takes too long for species %s. Skipping species ..." % asName) + return [], candidatesOutFile, output #else: #print("\t ...finished") - print("Time tblastn %s in species %s" % (str(time_tblastn), asName)) + output.append("Time tblastn %s in species %s" % (str(time_tblastn), asName)) regions, number_regions = candidate_regions(average_intron_length, evalue, tmp_path) if regions == 0: #no candidat region are available, no ortholog can be found - sys.stdout.write("No candidate region found for species %s!\n" % asName) - sys.stdout.flush() - return [], candidatesOutFile + output.append("No candidate region found for species %s!\n" % asName) + return [], candidatesOutFile, output else: - print(str(number_regions) + " candiate region(s) were found for species %s.\n" % asName) + output.append(str(number_regions) + " candiate region(s) were found for species %s.\n" % asName) extract_seq(regions, db_path, tmp_path, mode) ############### make Augustus PPX search ################################### @@ -686,26 +683,23 @@ def ortholog_search(args): #print("\t ...finished \n") time_augustus_end = time.time() time_augustus = time_augustus_end - time_augustus_start - print("Time augustus: %s species %s \n" % (str(time_augustus), asName)) + 
output.append("Time augustus: %s species %s \n" % (str(time_augustus), asName)) ################# backward search to filter for orthologs################### if int(os.path.getsize(candidatesOutFile)) <= 0: #print("No genes found at candidate regions\n") - sys.stdout.flush() - return [], candidatesOutFile + return [], candidatesOutFile, output reciprocal_sequences, taxa = backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, evalue, taxa, searchTool, checkCoorthologs, msaTool, matrix, dataPath, filter, tmp_path, mode) if reciprocal_sequences == 0: if regions != 0: - sys.stdout.write("No ortholog fulfilled the reciprocity criteria for species %s.\n" % asName) - sys.stdout.flush() - return [], candidatesOutFile + output.append("No ortholog fulfilled the reciprocity criteria for species %s.\n" % asName) + return [], candidatesOutFile, output else: reciprocal_sequences = coorthologs(reciprocal_sequences, tmp_path, candidatesOutFile, fasta_path, fdog_ref_species, msaTool, matrix) - sys.stdout.flush() - return reciprocal_sequences, candidatesOutFile + return reciprocal_sequences, candidatesOutFile, output class Logger(object): def __init__(self, file): @@ -956,12 +950,14 @@ def main(): pool.close() pool.join() for i in results: - ortholog_sequences.append(i) + print(i[2]) + ortholog_sequences.append(i[0], i[1]) else: ###################### computation species per species ################ for asName in assembly_names: args = [asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs] - reciprocal_sequences, candidatesOutFile = ortholog_search(args) + reciprocal_sequences, candidatesOutFile, output_ortholog_search = ortholog_search(args) + print(output_ortholog_search) ortholog_sequences.append([reciprocal_sequences, candidatesOutFile]) ################## preparing 
output ######################################## From d4374231dd228c97dd42f771f6d9b462faf2eb47 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Fri, 22 Oct 2021 11:30:49 +0200 Subject: [PATCH 146/229] enabled output during parallel computation --- fdog/fDOGassembly.py | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 08de346..42ddf69 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -664,17 +664,8 @@ def ortholog_search(args): time_tblastn_end = time.time() time_tblastn = time_tblastn_end - time_tblastn_start if exit_code == 1: -<<<<<<< HEAD output.append("The tblastn search takes too long for species %s. Skipping species ..." % asName) return [], candidatesOutFile, output -======= - sys.stdout.write("The tblastn search takes too long for species %s. Exciting ..." % asName) - #cleanup(tmp, tmp_folder) - #sys.exit() - sys.stdout.flush() - - return [], candidatesOutFile ->>>>>>> 0016fa5fd0081814b3d2457b7f6b3d5ac4b987a1 #else: #print("\t ...finished") output.append("Time tblastn %s in species %s" % (str(time_tblastn), asName)) @@ -682,15 +673,8 @@ def ortholog_search(args): regions, number_regions = candidate_regions(average_intron_length, evalue, tmp_path) if regions == 0: #no candidat region are available, no ortholog can be found -<<<<<<< HEAD output.append("No candidate region found for species %s!\n" % asName) return [], candidatesOutFile, output -======= - sys.stdout.write("No candidate region found for species %s!\n" % asName) - sys.stdout.flush() - - return [], candidatesOutFile ->>>>>>> 0016fa5fd0081814b3d2457b7f6b3d5ac4b987a1 else: output.append(str(number_regions) + " candiate region(s) were found for species %s.\n" % asName) From 7a37abc0a5109147779704517eddef55135a10ba Mon Sep 17 00:00:00 2001 From: mueli94 Date: Fri, 22 Oct 2021 11:44:35 +0200 Subject: [PATCH 147/229] bug fix --- fdog/fDOGassembly.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fdog/fDOGassembly.py 
b/fdog/fDOGassembly.py index 42ddf69..6464384 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -325,7 +325,9 @@ def checkCoOrthologs(candidate_name, best_hit, ref, fdog_ref_species, candidates distance_ref_hit = distances[best_hit, ref] #print(distances) except ValueError: + pass #print("Failure in distance computation, Candidate %s will be rejected" % candidate_name) + return 0, "NaN", "NaN" From 02f004671375ebf02c9bc0a607723f6409a9150f Mon Sep 17 00:00:00 2001 From: mueli94 Date: Fri, 22 Oct 2021 11:56:50 +0200 Subject: [PATCH 148/229] improved output --- fdog/fDOGassembly.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 6464384..56de5f1 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -955,15 +955,17 @@ def main(): pool.close() pool.join() for i in results: - print(i[2]) ortholog_sequences.append(i[0], i[1]) + for k in i[2]: + print(k) else: ###################### computation species per species ################ for asName in assembly_names: args = [asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs] reciprocal_sequences, candidatesOutFile, output_ortholog_search = ortholog_search(args) - print(output_ortholog_search) ortholog_sequences.append([reciprocal_sequences, candidatesOutFile]) + for k in output_ortholog_search: + print(k) ################## preparing output ######################################## orthologsOutFile = out + "/" + group + ".extended.fa" From 52feba3fdc5d50a9d2f14953297fad5381091531 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Fri, 22 Oct 2021 12:09:58 +0200 Subject: [PATCH 149/229] improved output --- fdog/fDOGassembly.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 
56de5f1..ad10cc8 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -955,7 +955,7 @@ def main(): pool.close() pool.join() for i in results: - ortholog_sequences.append(i[0], i[1]) + ortholog_sequences.append([i[0], i[1]]) for k in i[2]: print(k) else: From 254034d6d16b1757c41fc5973da6f267bc19c5fe Mon Sep 17 00:00:00 2001 From: Hannah Muelbaier <47216555+mueli94@users.noreply.github.com> Date: Mon, 25 Oct 2021 10:11:37 +0200 Subject: [PATCH 150/229] Fdog goes assembly (#14) --- fdog/fDOGassembly.py | 41 +++++++++++++++++++---------------------- 1 file changed, 19 insertions(+), 22 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 10f7aeb..9ed4e88 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -637,6 +637,7 @@ def clean_fas(path, file_type): def ortholog_search(args): (asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs) = args + output = [] cmd = 'mkdir ' + out + '/tmp/' + asName starting_subprocess(cmd, 'silent') tmp_path = out + "tmp/" + asName + "/" @@ -645,7 +646,7 @@ def ortholog_search(args): fasOutFile = out + "/" + group #mappingFile = out + "/tmp/" + group + ".mapping.txt" - sys.stdout.write("Searching in species " + asName + "\n") + output.append("Searching in species " + asName + "\n") assembly_path = assemblyDir + "/" + asName + "/" + asName + ".fa" db_path = assemblyDir + "/" + asName + "/blast_dir/" + asName + ".fa" db_check = searching_for_db(db_path) @@ -665,26 +666,21 @@ def ortholog_search(args): time_tblastn_end = time.time() time_tblastn = time_tblastn_end - time_tblastn_start if exit_code == 1: - sys.stdout.write("The tblastn search takes too long for species %s. Exciting ..." 
% asName) - #cleanup(tmp, tmp_folder) - #sys.exit() - sys.stdout.flush() + output.append("The tblastn search takes too long for species %s. Skipping species ..." % asName) + return [], candidatesOutFile, output - return [], candidatesOutFile #else: #print("\t ...finished") - print("Time tblastn %s in species %s" % (str(time_tblastn), asName)) + output.append("Time tblastn %s in species %s" % (str(time_tblastn), asName)) regions, number_regions = candidate_regions(average_intron_length, evalue, tmp_path) if regions == 0: #no candidat region are available, no ortholog can be found - sys.stdout.write("No candidate region found for species %s!\n" % asName) - sys.stdout.flush() - - return [], candidatesOutFile + output.append("No candidate region found for species %s!\n" % asName) + return [], candidatesOutFile, output else: - print(str(number_regions) + " candiate region(s) were found for species %s.\n" % asName) + output.append(str(number_regions) + " candiate region(s) were found for species %s.\n" % asName) extract_seq(regions, db_path, tmp_path, mode) ############### make Augustus PPX search ################################### @@ -694,26 +690,23 @@ def ortholog_search(args): #print("\t ...finished \n") time_augustus_end = time.time() time_augustus = time_augustus_end - time_augustus_start - print("Time augustus: %s species %s \n" % (str(time_augustus), asName)) + output.append("Time augustus: %s species %s \n" % (str(time_augustus), asName)) ################# backward search to filter for orthologs################### if int(os.path.getsize(candidatesOutFile)) <= 0: #print("No genes found at candidate regions\n") - sys.stdout.flush() - return [], candidatesOutFile + return [], candidatesOutFile, output reciprocal_sequences, taxa = backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, evalue, taxa, searchTool, checkCoorthologs, msaTool, matrix, dataPath, filter, tmp_path, mode) if reciprocal_sequences == 0: if regions != 0: - sys.stdout.write("No 
ortholog fulfilled the reciprocity criteria for species %s.\n" % asName) - sys.stdout.flush() - return [], candidatesOutFile + output.append("No ortholog fulfilled the reciprocity criteria for species %s.\n" % asName) + return [], candidatesOutFile, output else: reciprocal_sequences = coorthologs(reciprocal_sequences, tmp_path, candidatesOutFile, fasta_path, fdog_ref_species, msaTool, matrix) - sys.stdout.flush() - return reciprocal_sequences, candidatesOutFile + return reciprocal_sequences, candidatesOutFile, output class Logger(object): def __init__(self, file): @@ -963,13 +956,17 @@ def main(): pool.close() pool.join() for i in results: - ortholog_sequences.append(i) + ortholog_sequences.append([i[0], i[1]]) + for k in i[2]: + print(k) else: ###################### computation species per species ################ for asName in assembly_names: args = [asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs] - reciprocal_sequences, candidatesOutFile = ortholog_search(args) + reciprocal_sequences, candidatesOutFile, output_ortholog_search = ortholog_search(args) ortholog_sequences.append([reciprocal_sequences, candidatesOutFile]) + for k in output_ortholog_search: + print(k) ################## preparing output ######################################## orthologsOutFile = out + "/" + group + ".extended.fa" From 9c228b2865d1682f2040250f5f4107f11c8d11c4 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Mon, 25 Oct 2021 13:23:43 +0200 Subject: [PATCH 151/229] a file can be used as input for --searchTaxa --- fdog/fDOGassembly.py | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index ad10cc8..dbd49e0 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -880,12 +880,24 @@ def main(): if 
searchTaxa == []: assembly_names = os.listdir(assemblyDir) else: - assembly_names = os.listdir(assemblyDir) - for Taxon in searchTaxa: - if Taxon not in assembly_names: - print("Taxon %s is not in the assembly_dir" % Taxon) - sys.exit() - assembly_names = searchTaxa + if len(searchTaxa) > 1: + assembly_names = os.listdir(assemblyDir) + for Taxon in searchTaxa: + if Taxon not in assembly_names: + print("Taxon %s is not in the assembly_dir" % Taxon) + sys.exit() + assembly_names = searchTaxa + else: + if searchTaxa[0] in assembly_names: + assembly_names = searchTaxa + elif os.path.isfile(searchTaxa[0]): + with open(searchTaxa[0]) as file: + lines = file.readlines() + assembly_names = [line.rstrip() for line in lines] + else: + print("Input %s for search Taxa is not in the assembly_dir or an existing file" % searchTaxa[0]) + + ################################# paths #################################### From fdb30730476e611d74e0ed8d527ef8711821a7d9 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Mon, 25 Oct 2021 13:39:19 +0200 Subject: [PATCH 152/229] bug fix --- fdog/fDOGassembly.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index dee4ab4..fc510c4 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -889,7 +889,7 @@ def main(): sys.exit() assembly_names = searchTaxa else: - if searchTaxa[0] in assembly_names: + if searchTaxa[0] in os.listdir(assemblyDir): assembly_names = searchTaxa elif os.path.isfile(searchTaxa[0]): with open(searchTaxa[0]) as file: From 42f7def51ea85f12f7fecd57830c5101a3439b9c Mon Sep 17 00:00:00 2001 From: Hannah Muelbaier <47216555+mueli94@users.noreply.github.com> Date: Mon, 25 Oct 2021 14:00:56 +0200 Subject: [PATCH 153/229] Fdog goes assembly (#15) --- fdog/fDOGassembly.py | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 9ed4e88..fc510c4 100644 --- a/fdog/fDOGassembly.py +++ 
b/fdog/fDOGassembly.py @@ -881,12 +881,24 @@ def main(): if searchTaxa == []: assembly_names = os.listdir(assemblyDir) else: - assembly_names = os.listdir(assemblyDir) - for Taxon in searchTaxa: - if Taxon not in assembly_names: - print("Taxon %s is not in the assembly_dir" % Taxon) - sys.exit() - assembly_names = searchTaxa + if len(searchTaxa) > 1: + assembly_names = os.listdir(assemblyDir) + for Taxon in searchTaxa: + if Taxon not in assembly_names: + print("Taxon %s is not in the assembly_dir" % Taxon) + sys.exit() + assembly_names = searchTaxa + else: + if searchTaxa[0] in os.listdir(assemblyDir): + assembly_names = searchTaxa + elif os.path.isfile(searchTaxa[0]): + with open(searchTaxa[0]) as file: + lines = file.readlines() + assembly_names = [line.rstrip() for line in lines] + else: + print("Input %s for search Taxa is not in the assembly_dir or an existing file" % searchTaxa[0]) + + ################################# paths #################################### From f43820e9fc66ec930e89e50ffeba679d5b9f43cd Mon Sep 17 00:00:00 2001 From: mueli94 Date: Tue, 26 Oct 2021 16:55:41 +0200 Subject: [PATCH 154/229] fixed bug in searching_for_db --- fdog/fDOGassembly.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index fc510c4..1b44ea9 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -257,6 +257,11 @@ def searching_for_db(assembly_path): check = True for end in db_endings: check = check and os.path.exists(assembly_path + end) + + if check == False: + check = True + for end in db_endings: + check = check and os.path.exists(assembly_path + '.00.' 
+ end) return check def get_distance_biopython(file, matrix): @@ -563,7 +568,6 @@ def cleanup(tmp, tmp_path): print("tmp folder could not be removed!") break - def coorthologs(candidate_names, tmp_path, candidatesFile, fasta, fdog_ref_species, msaTool, matrix): if len(candidate_names) == 1: return candidate_names From 7d12ffa28c25f2115ad6005e9d4bf7071508023c Mon Sep 17 00:00:00 2001 From: mueli94 Date: Tue, 26 Oct 2021 17:10:06 +0200 Subject: [PATCH 155/229] fixed bug in function searching_for_db --- fdog/fDOGassembly.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 1b44ea9..fdb90fa 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -261,7 +261,7 @@ def searching_for_db(assembly_path): if check == False: check = True for end in db_endings: - check = check and os.path.exists(assembly_path + '.00.' + end) + check = check and os.path.exists(assembly_path + '.00' + end) return check def get_distance_biopython(file, matrix): From 110073f4e00da8795dd43cbec28ac12b9d90b4f4 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Tue, 26 Oct 2021 17:22:32 +0200 Subject: [PATCH 156/229] bug fix searching_for_db function --- fdog/fDOGassembly.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index fdb90fa..2b9e6fb 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -256,12 +256,8 @@ def searching_for_db(assembly_path): db_endings = ['.ndb', '.nhr', '.nin', '.nog', '.nos', '.not', '.nsq', '.ntf', '.nto'] check = True for end in db_endings: - check = check and os.path.exists(assembly_path + end) - - if check == False: - check = True - for end in db_endings: - check = check and os.path.exists(assembly_path + '.00' + end) + if not any(File.endswith(end) for File in os.listdir(assembly_path)): + check = False return check def get_distance_biopython(file, matrix): From afd28c60bf071f9d2943b6ebcb18ee2c4dcd0c09 Mon Sep 17 00:00:00 
2001 From: mueli94 Date: Tue, 26 Oct 2021 17:28:05 +0200 Subject: [PATCH 157/229] testing --- fdog/fDOGassembly.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 2b9e6fb..b92cefc 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -649,7 +649,8 @@ def ortholog_search(args): output.append("Searching in species " + asName + "\n") assembly_path = assemblyDir + "/" + asName + "/" + asName + ".fa" db_path = assemblyDir + "/" + asName + "/blast_dir/" + asName + ".fa" - db_check = searching_for_db(db_path) + blast_dir_path = assemblyDir + "/" + asName + "/blast_dir/" + asName + db_check = searching_for_db(blast_dir_path) if db_check == 0: #print("Creating a blast data base...") From 6076c5da9bf5f4abfca1a724dc142b763c46e674 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Tue, 26 Oct 2021 17:29:55 +0200 Subject: [PATCH 158/229] testing --- fdog/fDOGassembly.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index b92cefc..d220039 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -649,7 +649,7 @@ def ortholog_search(args): output.append("Searching in species " + asName + "\n") assembly_path = assemblyDir + "/" + asName + "/" + asName + ".fa" db_path = assemblyDir + "/" + asName + "/blast_dir/" + asName + ".fa" - blast_dir_path = assemblyDir + "/" + asName + "/blast_dir/" + asName + blast_dir_path = assemblyDir + "/" + asName + "/blast_dir/" db_check = searching_for_db(blast_dir_path) if db_check == 0: From 2f38455330bef1e8b63a2d8ab0c1aed375c7c479 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Wed, 9 Feb 2022 14:38:49 +0100 Subject: [PATCH 159/229] reorganization of code to enable the use of metaeuk as an alternative to Augustus --- fdog/.DS_Store | Bin 8196 -> 8196 bytes fdog/fDOGassembly.py | 145 ++++++++++++++++++++++++++----------------- 2 files changed, 89 insertions(+), 56 deletions(-) diff --git a/fdog/.DS_Store 
b/fdog/.DS_Store index 34e42555d35fd3e0f289e49c57c3fa62ffc1f870..a99a01c231b8aab3b888fe9e4dacf4b66808b3f0 100644 GIT binary patch delta 40 wcmZp1XmOa}&nU7nU^hRb$YvgaaOTbHg(FxdHu!92m-xoA*;8~M)5Hc(01vhes{jB1 delta 69 zcmZp1XmOa}&nUVvU^hRb=w=>)aAs*ShFpeJh9ZV^AnC|Z41}pbktBv3hRVr#!U{~x V&YMGo(^)pNOMGM5yitUm8312<5m*2K diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index d220039..8a9af97 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -1,8 +1,6 @@ # -*- coding: utf-8 -*- ####################################################################### - - # Copyright (C) 2021 Hannah Muelbaier # # This script is used to run fDOG-Assembly which performs targeted ortholog @@ -635,8 +633,8 @@ def clean_fas(path, file_type): file.write(new_line) file.close() -def ortholog_search(args): - (asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs) = args +def ortholog_search_tblastn(args): + (asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs, gene_prediction) = args output = [] cmd = 'mkdir ' + out + '/tmp/' + asName starting_subprocess(cmd, 'silent') @@ -670,8 +668,6 @@ def ortholog_search(args): output.append("The tblastn search takes too long for species %s. Skipping species ..." 
% asName) return [], candidatesOutFile, output - #else: - #print("\t ...finished") output.append("Time tblastn %s in species %s" % (str(time_tblastn), asName)) regions, number_regions = candidate_regions(average_intron_length, evalue, tmp_path) @@ -684,14 +680,18 @@ def ortholog_search(args): output.append(str(number_regions) + " candiate region(s) were found for species %s.\n" % asName) extract_seq(regions, db_path, tmp_path, mode) - ############### make Augustus PPX search ################################### - #print("Starting augustus ppx ...") - time_augustus_start = time.time() - augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, augustus_ref_species, asName, group, tmp_path, mode) - #print("\t ...finished \n") - time_augustus_end = time.time() - time_augustus = time_augustus_end - time_augustus_start - output.append("Time augustus: %s species %s \n" % (str(time_augustus), asName)) + + if gene_prediction == "augustus": + ############### make Augustus PPX search ################################### + #print("Starting augustus ppx ...") + time_augustus_start = time.time() + augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, augustus_ref_species, asName, group, tmp_path, mode) + #print("\t ...finished \n") + time_augustus_end = time.time() + time_augustus = time_augustus_end - time_augustus_start + output.append("Time augustus: %s species %s \n" % (str(time_augustus), asName)) + else: + print("test") ################# backward search to filter for orthologs################### if int(os.path.getsize(candidatesOutFile)) <= 0: @@ -709,6 +709,48 @@ def ortholog_search(args): return reciprocal_sequences, candidatesOutFile, output +def blockProfiles(core_path, group, mode): + + ######################## paths ################################ + msa_path = core_path + "/" + group +"/"+ group + ".aln" + check_path(msa_path) + profile_path = out + "/tmp/" + group + ".prfl" + + ######################## block profile 
##################################### + + print("Building a block profile ...") + cmd = 'msa2prfl.pl ' + msa_path + ' --setname=' + group + ' >' + profile_path + starting_subprocess(cmd, 'silent') + + if int(os.path.getsize(profile_path)) > 0: + print("\t ...finished \n") + else: + print("Building block profiles failed. Using prepareAlign to convert alignment\n") + new_path = core_path + group +"/"+ group + "_new.aln" + cmd = 'prepareAlign < ' + msa_path + ' > ' + new_path + starting_subprocess(cmd, mode) + cmd = 'msa2prfl.pl ' + new_path + ' --setname=' + group + ' >' + profile_path + starting_subprocess(cmd, 'silent') + print(" \t ...finished \n") + + return profile_path + +def consensusSequence(core_path, group, mode): + + ######################## paths ################################ + hmm_path = core_path + "/" + group +"/hmm_dir/"+ group + ".hmm" + check_path(hmm_path) + consensus_path = out + "/tmp/" + group + ".con" + + ######################## consensus sequence ################################ + #make a majority-rule consensus sequence with the tool hmmemit from hmmer + print("Building a consensus sequence") + cmd = 'hmmemit -c -o' + consensus_path + ' ' + hmm_path + starting_subprocess(cmd, mode) + print("\t ...finished\n") + + return consensus_path + class Logger(object): def __init__(self, file): self.file = file @@ -722,7 +764,6 @@ def write(self, message): def flush(self): pass - def main(): #################### handle user input ##################################### @@ -736,7 +777,6 @@ def main(): required = parser.add_argument_group('Required arguments') required.add_argument('--gene', help='Core_ortholog group name. 
Folder inlcuding the fasta file, hmm file and aln file has to be located in core_orthologs/', action='store', default='', required=True) - required.add_argument('--augustusRefSpec', help='augustus reference species', action='store', default='', required=True) required.add_argument('--refSpec', help='Reference taxon/taxa for fDOG.', action='store', nargs="+", default='', required=True) ################## optional arguments ###################################### optional = parser.add_argument_group('Optional arguments') @@ -763,11 +803,12 @@ def main(): optional.add_argument('--force', help='Overwrite existing output files', action='store_true', default=False) optional.add_argument('--append', help='Append the output to existing output files', action='store_true', default=False) optional.add_argument('--parallel', help= 'The ortholog search of multiple species will be done in parallel', action='store_true', default=False) + optional.add_argument('--augustus', help= 'Gene prediction is done by using the tool Augustus PPX', action='store_true', default=False) + optional.add_argument('--augustusRefSpec', help='augustus reference species', action='store', default='') args = parser.parse_args() # required group = args.gene - augustus_ref_species = args.augustusRefSpec fdog_ref_species = args.refSpec #paths user input assemblyDir = args.assemblyPath @@ -800,6 +841,18 @@ def main(): append = args.append parallel = args.parallel + #gene prediction tool + augustus = args.augustus + if augutus == True: + augustus_ref_species = args.augustusRefSpec + if augustus_ref_species == '': + print("Augustus reference species is required when using Augustus as gene prediction tool") + return 1 + gene_prediction = "augustus" + else: + gene_prediction = "metaeuk" + + # output modes if debug == True and silent == True: print("It's not possible to use booth modes, please restart and use --debug or --silent") @@ -903,14 +956,8 @@ def main(): ################################# paths 
#################################### - msa_path = core_path + "/" + group +"/"+ group + ".aln" - check_path(msa_path) - hmm_path = core_path + "/" + group +"/hmm_dir/"+ group + ".hmm" - check_path(hmm_path) fasta_path = core_path + "/" + group +"/"+ group + ".fa" check_path(fasta_path) - consensus_path = out + "/tmp/" + group + ".con" - profile_path = out + "/tmp/" + group + ".prfl" tmp_folder = out + "/tmp" ########### is/are fDOG reference species part of ortholog group? ########## @@ -925,47 +972,30 @@ def main(): print("Gene: " + group) print("fDOG reference species: " + fdog_ref_species + " \n") - ######################## consensus sequence ################################ - group_computation_time_start = time.time() - #make a majority-rule consensus sequence with the tool hmmemit from hmmer - print("Building a consensus sequence") - cmd = 'hmmemit -c -o' + consensus_path + ' ' + hmm_path - starting_subprocess(cmd, mode) - print("\t ...finished\n") + ###################### preparations ######################################## - ######################## block profile ##################################### + if augustus == True: + group_computation_time_start = time.time() + consensus_path = consensusSequence(core_path, group, mode) + profile_path = blockProfiles(core_path, group, mode) + group_computation_time_end = time.time() + time_group = group_computation_time_end - group_computation_time_start - print("Building a block profile ...") - cmd = 'msa2prfl.pl ' + msa_path + ' --setname=' + group + ' >' + profile_path - starting_subprocess(cmd, 'silent') - - if int(os.path.getsize(profile_path)) > 0: - print("\t ...finished \n") - else: - print("Building block profiles failed. 
Using prepareAlign to convert alignment\n") - new_path = core_path + group +"/"+ group + "_new.aln" - cmd = 'prepareAlign < ' + msa_path + ' > ' + new_path - starting_subprocess(cmd, mode) - cmd = 'msa2prfl.pl ' + new_path + ' --setname=' + group + ' >' + profile_path - starting_subprocess(cmd, 'silent') - print(" \t ...finished \n") - - group_computation_time_end = time.time() - time_group = group_computation_time_end - group_computation_time_start ###################### ortholog search ##################################### ortholog_sequences = [] time_ortholog_start = time.time() + if parallel == True: - ##################### parallel compuataion ############################# + ##################### parallel computation ############################# calls = [] cpus = mp.cpu_count() pool = mp.Pool(cpus) for asName in assembly_names: - calls.append([asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs]) + calls.append([asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs, gene_prediction]) - results = (pool.imap_unordered(ortholog_search, calls)) + results = (pool.imap_unordered(ortholog_search_tblastn, calls)) pool.close() pool.join() for i in results: @@ -973,18 +1003,20 @@ def main(): for k in i[2]: print(k) else: - ###################### computation species per species ################ + ###################### computation species wise ################ for asName in assembly_names: - args = [asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, 
fasta_path, profile_path, taxa, searchTool, checkCoorthologs] - reciprocal_sequences, candidatesOutFile, output_ortholog_search = ortholog_search(args) + args = [asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs, gene_prediction] + reciprocal_sequences, candidatesOutFile, output_ortholog_search = ortholog_search_tblastn(args) ortholog_sequences.append([reciprocal_sequences, candidatesOutFile]) for k in output_ortholog_search: print(k) - ################## preparing output ######################################## - orthologsOutFile = out + "/" + group + ".extended.fa" time_ortholog_end = time.time() time_ortholog = time_ortholog_end - time_ortholog_start + + ################## preparing output ######################################## + orthologsOutFile = out + "/" + group + ".extended.fa" + if taxa == []: taxa = [fdog_ref_species] if append == True: @@ -1006,6 +1038,7 @@ def main(): clean_fas(out + group + "_reverse.domains", 'domains') clean_fas(out + group + ".phyloprofile", 'phyloprofile') print("\t ...finished \n") + ################# remove tmp folder ######################################## end = time.time() time_fas = end - fas From e088dff0ac04bd6fd5aa27aedf38af1502eda834 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Wed, 9 Feb 2022 16:05:30 +0100 Subject: [PATCH 160/229] included metaeuk --- fdog/fDOGassembly.py | 74 ++++++++++++++++++++++++++++++++++---------- 1 file changed, 57 insertions(+), 17 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 8a9af97..f12e9cc 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -212,6 +212,20 @@ def extract_seq(region_dic, path, tmp_path, mode): cmd = "blastdbcmd -db " + path + " -dbtype 'nucl' -entry " + key + " -out " + tmp_path + key + ".fasta -outfmt %f" starting_subprocess(cmd, mode) 
+def extract_sequence_from_to(name, file, start, end): + out = name + ".fasta" + if start < 0: + start = 0 + with open(out,"w") as f: + for seq_record in SeqIO.parse(file, "fasta"): + f.write(str(seq_record.id) + "\n") + sequence_length = len(seq_record.seq) + if end > sequence_length: + end = sequence_length + f.write(str(seq_record.seq[start:end]) + "\n") + + return out, start, end + def augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, augustus_ref_species, ass_name, group, tmp_path, mode): output = open(candidatesOutFile, "w") @@ -246,9 +260,43 @@ def augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, aug except FileNotFoundError: pass #print("No gene found in region with ID" + name + " in species " + ass_name + " , continuing with next region") - output.close() +def metaeuk_single(regions, candidatesOutFile, length_extension, ass_name, group, tmp_path, mode, core_group): + output = open(candidatesOutFile, "w") + + for key in regions: + locations = regions[key] + counter = 0 + for i in locations: + #some variables + counter += 1 + start = str(i[0] - length_extension) + end = str(i[1] + length_extension) + name = key + "_" + str(counter) + file, start, end = extract_sequence_from_to(tmp_path + name, tmp_path + key + ".fasta", start, end) + #metaeuk call + cmd = "metaeuk easy-predict " + file + " " + core_group + " " + tmp_path + name + " " + tmp_path + "/metaeuk" + print(cmd) + starting_subprocess(cmd, mode) + # parsing header and sequences + try: + sequence_file = open(tmp_path + name + ".fas", "r") + lines = sequence_file.readlines() + id = 0 + for line in lines: + if line[0] == ">": + id += 1 + header = ">" + group + "|" + ass_name + "|" + name + "_" + id + output.write(header) + else: + output.write(line) + sequence_file.close() + except FileNotFoundError: + pass + + output.close() + def searching_for_db(assembly_path): db_endings = ['.ndb', '.nhr', '.nin', '.nog', '.nos', '.not', '.nsq', '.ntf', '.nto'] @@ -473,8 
+521,6 @@ def backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, eva #print("No ortholog was found with option --strict") return 0, seed - - #print(orthologs) orthologs = set(orthologs) return list(orthologs), seed @@ -651,14 +697,11 @@ def ortholog_search_tblastn(args): db_check = searching_for_db(blast_dir_path) if db_check == 0: - #print("Creating a blast data base...") cmd = 'makeblastdb -in ' + assembly_path + ' -dbtype nucl -parse_seqids -out ' + db_path starting_subprocess(cmd, mode) - #print("\t ...finished \n") #makes a tBLASTn search against database #codon table argument [-db_gencode int_value], table available ftp://ftp.ncbi.nih.gov/entrez/misc/data/gc.prt - #print("Starting tBLASTn search...") cmd = 'tblastn -db ' + db_path + ' -query ' + consensus_path + ' -outfmt "6 sseqid sstart send evalue qstart qend score " -evalue ' + str(evalue) + ' -out ' + tmp_path + '/blast_results.out' time_tblastn_start = time.time() exit_code = starting_subprocess(cmd, mode, 3600) @@ -683,15 +726,17 @@ def ortholog_search_tblastn(args): if gene_prediction == "augustus": ############### make Augustus PPX search ################################### - #print("Starting augustus ppx ...") time_augustus_start = time.time() augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, augustus_ref_species, asName, group, tmp_path, mode) - #print("\t ...finished \n") time_augustus_end = time.time() time_augustus = time_augustus_end - time_augustus_start output.append("Time augustus: %s species %s \n" % (str(time_augustus), asName)) else: - print("test") + time_metaeuk_start = time.time() + metaeuk(regions, candidatesOutFile, length_extension, asName, group, tmp_path, mode, fasta_path) + time_metaeuk_end = time.time() + time_metaeuk = time_metaeuk_end - time_metaeuk_start + output.append("Time metaeuk: %s species %s \n" % (str(time_metaeuk), asName))") ################# backward search to filter for orthologs################### if 
int(os.path.getsize(candidatesOutFile)) <= 0: @@ -820,11 +865,6 @@ def main(): tmp = args.tmp strict = args.strict checkCoorthologs = args.checkCoorthologsRef - #filter = args.filter - #if filter == True or filter == 'yes': - #filter = 'yes' - #else: - #filter = 'no' #others average_intron_length = args.avIntron length_extension = args.lengthExtension @@ -852,7 +892,6 @@ def main(): else: gene_prediction = "metaeuk" - # output modes if debug == True and silent == True: print("It's not possible to use booth modes, please restart and use --debug or --silent") @@ -952,8 +991,6 @@ def main(): else: print("Input %s for search Taxa is not in the assembly_dir or an existing file" % searchTaxa[0]) - - ################################# paths #################################### fasta_path = core_path + "/" + group +"/"+ group + ".fa" @@ -980,6 +1017,9 @@ def main(): profile_path = blockProfiles(core_path, group, mode) group_computation_time_end = time.time() time_group = group_computation_time_end - group_computation_time_start + else: + print("test") + #concatinade core_group sequences if metaeuk should be run without tblastn ###################### ortholog search ##################################### From 5cb0f2bba80f33ae3b35861b7891a14ff6ae34ce Mon Sep 17 00:00:00 2001 From: mueli94 Date: Thu, 10 Feb 2022 11:26:37 +0100 Subject: [PATCH 161/229] bug fix --- fdog/fDOGassembly.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index f12e9cc..f891b47 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -736,7 +736,7 @@ def ortholog_search_tblastn(args): metaeuk(regions, candidatesOutFile, length_extension, asName, group, tmp_path, mode, fasta_path) time_metaeuk_end = time.time() time_metaeuk = time_metaeuk_end - time_metaeuk_start - output.append("Time metaeuk: %s species %s \n" % (str(time_metaeuk), asName))") + output.append("Time metaeuk: %s species %s \n" % (str(time_metaeuk), asName)) 
################# backward search to filter for orthologs################### if int(os.path.getsize(candidatesOutFile)) <= 0: From cb085c71af0bda7eb2f7907f0c6a01fa4719f00d Mon Sep 17 00:00:00 2001 From: mueli94 Date: Thu, 10 Feb 2022 11:31:14 +0100 Subject: [PATCH 162/229] bug fix --- fdog/fDOGassembly.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index f891b47..64192b1 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -814,7 +814,7 @@ def main(): #################### handle user input ##################################### start = time.time() - version = '0.1.2' + version = '0.1.3' ################### initialize parser ###################################### parser = argparse.ArgumentParser(description='You are running fdog.assembly version ' + str(version) + '.') parser.add_argument('--version', action='version', version=str(version)) @@ -883,7 +883,7 @@ def main(): #gene prediction tool augustus = args.augustus - if augutus == True: + if augustus == True: augustus_ref_species = args.augustusRefSpec if augustus_ref_species == '': print("Augustus reference species is required when using Augustus as gene prediction tool") From 0d2d26db84d471960cf9e61e18d7721befce253c Mon Sep 17 00:00:00 2001 From: mueli94 Date: Thu, 10 Feb 2022 11:36:11 +0100 Subject: [PATCH 163/229] bug fix --- fdog/fDOGassembly.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 64192b1..f68c3aa 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -754,7 +754,7 @@ def ortholog_search_tblastn(args): return reciprocal_sequences, candidatesOutFile, output -def blockProfiles(core_path, group, mode): +def blockProfiles(core_path, group, mode, out): ######################## paths ################################ msa_path = core_path + "/" + group +"/"+ group + ".aln" @@ -780,7 +780,7 @@ def blockProfiles(core_path, group, mode): return 
profile_path -def consensusSequence(core_path, group, mode): +def consensusSequence(core_path, group, mode, out): ######################## paths ################################ hmm_path = core_path + "/" + group +"/hmm_dir/"+ group + ".hmm" @@ -1013,8 +1013,8 @@ def main(): if augustus == True: group_computation_time_start = time.time() - consensus_path = consensusSequence(core_path, group, mode) - profile_path = blockProfiles(core_path, group, mode) + consensus_path = consensusSequence(core_path, group, mode, out) + profile_path = blockProfiles(core_path, group, mode, out) group_computation_time_end = time.time() time_group = group_computation_time_end - group_computation_time_start else: From 8d9ce6015e2b3a395d546b1f0033e918f0e3e1d2 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Thu, 10 Feb 2022 11:43:01 +0100 Subject: [PATCH 164/229] added preparation steps for metaeuk (tblastn search preparation) --- fdog/fDOGassembly.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index f68c3aa..aa037e8 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -1019,7 +1019,11 @@ def main(): time_group = group_computation_time_end - group_computation_time_start else: print("test") + group_computation_time_start = time.time() + consensus_path = consensusSequence(core_path, group, mode, out) #concatinade core_group sequences if metaeuk should be run without tblastn + group_computation_time_end = time.time() + time_group = group_computation_time_end - group_computation_time_start ###################### ortholog search ##################################### From 65c8835fd080a227dc19f0f51dad39668e114130 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Thu, 10 Feb 2022 11:45:28 +0100 Subject: [PATCH 165/229] bug fix --- fdog/fDOGassembly.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index aa037e8..c82e8fb 100644 --- a/fdog/fDOGassembly.py +++ 
b/fdog/fDOGassembly.py @@ -880,11 +880,12 @@ def main(): force = args.force append = args.append parallel = args.parallel + augustus_ref_species = args.augustusRefSpec #gene prediction tool augustus = args.augustus if augustus == True: - augustus_ref_species = args.augustusRefSpec + if augustus_ref_species == '': print("Augustus reference species is required when using Augustus as gene prediction tool") return 1 From 83275925f7e71b0d8b0609b79b89216a46b3084d Mon Sep 17 00:00:00 2001 From: mueli94 Date: Thu, 10 Feb 2022 11:47:26 +0100 Subject: [PATCH 166/229] bug fix --- fdog/fDOGassembly.py | 1 + 1 file changed, 1 insertion(+) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index c82e8fb..8bbfeba 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -1020,6 +1020,7 @@ def main(): time_group = group_computation_time_end - group_computation_time_start else: print("test") + profile_path = "" group_computation_time_start = time.time() consensus_path = consensusSequence(core_path, group, mode, out) #concatinade core_group sequences if metaeuk should be run without tblastn From fb62700935cb87d4d03b32ca0ecc36346ee02037 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Thu, 10 Feb 2022 11:48:47 +0100 Subject: [PATCH 167/229] bug fix --- fdog/fDOGassembly.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 8bbfeba..11a8504 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -733,7 +733,7 @@ def ortholog_search_tblastn(args): output.append("Time augustus: %s species %s \n" % (str(time_augustus), asName)) else: time_metaeuk_start = time.time() - metaeuk(regions, candidatesOutFile, length_extension, asName, group, tmp_path, mode, fasta_path) + metaeuk_single(regions, candidatesOutFile, length_extension, asName, group, tmp_path, mode, fasta_path) time_metaeuk_end = time.time() time_metaeuk = time_metaeuk_end - time_metaeuk_start output.append("Time metaeuk: %s species %s \n" % 
(str(time_metaeuk), asName)) From f5e25dbc5fb65596bd65312a3e6d6feb83529653 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Thu, 10 Feb 2022 11:51:55 +0100 Subject: [PATCH 168/229] bug fix --- fdog/fDOGassembly.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 11a8504..54294d4 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -214,15 +214,15 @@ def extract_seq(region_dic, path, tmp_path, mode): def extract_sequence_from_to(name, file, start, end): out = name + ".fasta" - if start < 0: + if int(start) < 0: start = 0 with open(out,"w") as f: for seq_record in SeqIO.parse(file, "fasta"): f.write(str(seq_record.id) + "\n") sequence_length = len(seq_record.seq) - if end > sequence_length: + if int(end) > sequence_length: end = sequence_length - f.write(str(seq_record.seq[start:end]) + "\n") + f.write(str(seq_record.seq[int(start):int(end)]) + "\n") return out, start, end From e59ae539a7e4a679058c1d2535aa53809b9ccb5e Mon Sep 17 00:00:00 2001 From: mueli94 Date: Thu, 10 Feb 2022 11:54:08 +0100 Subject: [PATCH 169/229] bug fix --- fdog/fDOGassembly.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 54294d4..990bbd0 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -213,6 +213,7 @@ def extract_seq(region_dic, path, tmp_path, mode): starting_subprocess(cmd, mode) def extract_sequence_from_to(name, file, start, end): + print(name) out = name + ".fasta" if int(start) < 0: start = 0 @@ -222,6 +223,8 @@ def extract_sequence_from_to(name, file, start, end): sequence_length = len(seq_record.seq) if int(end) > sequence_length: end = sequence_length + print(start) + print(end) f.write(str(seq_record.seq[int(start):int(end)]) + "\n") return out, start, end From 188ae4b8a54866978b984335042e74b0d0b9ecc3 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Thu, 10 Feb 2022 11:58:05 +0100 Subject: [PATCH 170/229] bug fix --- 
fdog/fDOGassembly.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 990bbd0..233d8f5 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -219,7 +219,7 @@ def extract_sequence_from_to(name, file, start, end): start = 0 with open(out,"w") as f: for seq_record in SeqIO.parse(file, "fasta"): - f.write(str(seq_record.id) + "\n") + f.write(">" + str(seq_record.id) + "\n") sequence_length = len(seq_record.seq) if int(end) > sequence_length: end = sequence_length From 93e79fea116a8387aa8d5df5b08b7b143ada2078 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Thu, 10 Feb 2022 12:03:07 +0100 Subject: [PATCH 171/229] bug fix --- fdog/fDOGassembly.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 233d8f5..11091da 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -213,7 +213,7 @@ def extract_seq(region_dic, path, tmp_path, mode): starting_subprocess(cmd, mode) def extract_sequence_from_to(name, file, start, end): - print(name) + #print(name) out = name + ".fasta" if int(start) < 0: start = 0 @@ -223,8 +223,8 @@ def extract_sequence_from_to(name, file, start, end): sequence_length = len(seq_record.seq) if int(end) > sequence_length: end = sequence_length - print(start) - print(end) + #print(start) + #print(end) f.write(str(seq_record.seq[int(start):int(end)]) + "\n") return out, start, end @@ -280,7 +280,7 @@ def metaeuk_single(regions, candidatesOutFile, length_extension, ass_name, group file, start, end = extract_sequence_from_to(tmp_path + name, tmp_path + key + ".fasta", start, end) #metaeuk call cmd = "metaeuk easy-predict " + file + " " + core_group + " " + tmp_path + name + " " + tmp_path + "/metaeuk" - print(cmd) + #print(cmd) starting_subprocess(cmd, mode) # parsing header and sequences try: @@ -290,7 +290,7 @@ def metaeuk_single(regions, candidatesOutFile, length_extension, ass_name, group for line in 
lines: if line[0] == ">": id += 1 - header = ">" + group + "|" + ass_name + "|" + name + "_" + id + header = ">" + group + "|" + ass_name + "|" + name + "_" + str(id) output.write(header) else: output.write(line) From 90eb408d967041e1d3f1960c8ebfe2745853d1ed Mon Sep 17 00:00:00 2001 From: mueli94 Date: Thu, 10 Feb 2022 13:04:42 +0100 Subject: [PATCH 172/229] testing other paramteres for metaeuk --- fdog/fDOGassembly.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 11091da..25b4a6c 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -279,7 +279,7 @@ def metaeuk_single(regions, candidatesOutFile, length_extension, ass_name, group name = key + "_" + str(counter) file, start, end = extract_sequence_from_to(tmp_path + name, tmp_path + key + ".fasta", start, end) #metaeuk call - cmd = "metaeuk easy-predict " + file + " " + core_group + " " + tmp_path + name + " " + tmp_path + "/metaeuk" + cmd = "metaeuk easy-predict " + file + " " + core_group + " " + tmp_path + name + " " + tmp_path + "/metaeuk --max-intron 130000 --max-seq-len 160000 --min-exon-aa 5 --max-overlap 5 --min-intron 1 --overlap 1" #print(cmd) starting_subprocess(cmd, mode) # parsing header and sequences From ca4133aa4ab7389d8c4827d8ebc6702988609e26 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Thu, 10 Feb 2022 13:20:27 +0100 Subject: [PATCH 173/229] testing new parameters --- fdog/fDOGassembly.py | 1 + 1 file changed, 1 insertion(+) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 25b4a6c..f35c80c 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -286,6 +286,7 @@ def metaeuk_single(regions, candidatesOutFile, length_extension, ass_name, group try: sequence_file = open(tmp_path + name + ".fas", "r") lines = sequence_file.readlines() + print(lines) id = 0 for line in lines: if line[0] == ">": From 6be72527e89676e3f1a89ffb8db492771d198307 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Thu, 10 Feb 2022 
13:30:19 +0100 Subject: [PATCH 174/229] testing --- fdog/fDOGassembly.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index f35c80c..a8995fa 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -299,7 +299,7 @@ def metaeuk_single(regions, candidatesOutFile, length_extension, ass_name, group except FileNotFoundError: pass - output.close() + output.close() def searching_for_db(assembly_path): From 926963f369aeebc3bfeb5160574961061da90777 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Thu, 10 Feb 2022 13:44:33 +0100 Subject: [PATCH 175/229] testing --- fdog/fDOGassembly.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index a8995fa..0836198 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -291,7 +291,7 @@ def metaeuk_single(regions, candidatesOutFile, length_extension, ass_name, group for line in lines: if line[0] == ">": id += 1 - header = ">" + group + "|" + ass_name + "|" + name + "_" + str(id) + header = ">" + group + "|" + ass_name + "|" + name + "_" + str(id) + "\n" output.write(header) else: output.write(line) From 062eefcc7fc94bba111c1c1e977d2fd8a3f4caec Mon Sep 17 00:00:00 2001 From: mueli94 Date: Thu, 10 Feb 2022 13:44:38 +0100 Subject: [PATCH 176/229] testing --- fdog/fDOGassembly.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 0836198..48a6f85 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -280,7 +280,7 @@ def metaeuk_single(regions, candidatesOutFile, length_extension, ass_name, group file, start, end = extract_sequence_from_to(tmp_path + name, tmp_path + key + ".fasta", start, end) #metaeuk call cmd = "metaeuk easy-predict " + file + " " + core_group + " " + tmp_path + name + " " + tmp_path + "/metaeuk --max-intron 130000 --max-seq-len 160000 --min-exon-aa 5 --max-overlap 5 --min-intron 1 --overlap 1" - #print(cmd) + 
print(cmd) starting_subprocess(cmd, mode) # parsing header and sequences try: From 49c080e1b76bb65e89268ba46a52dc86d06e4ffc Mon Sep 17 00:00:00 2001 From: mueli94 Date: Thu, 10 Feb 2022 13:54:56 +0100 Subject: [PATCH 177/229] testing --- fdog/fDOGassembly.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 48a6f85..ebca99e 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -223,8 +223,9 @@ def extract_sequence_from_to(name, file, start, end): sequence_length = len(seq_record.seq) if int(end) > sequence_length: end = sequence_length - #print(start) - #print(end) + #for testing only + start = 0 + end = len(seq_record.seq) f.write(str(seq_record.seq[int(start):int(end)]) + "\n") return out, start, end From fb8e97aff28edb0d827ccef10890a8997e9ec1b0 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Thu, 10 Feb 2022 14:06:25 +0100 Subject: [PATCH 178/229] testing --- fdog/fDOGassembly.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index ebca99e..d22b281 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -224,8 +224,8 @@ def extract_sequence_from_to(name, file, start, end): if int(end) > sequence_length: end = sequence_length #for testing only - start = 0 - end = len(seq_record.seq) + #start = 0 + #end = len(seq_record.seq) f.write(str(seq_record.seq[int(start):int(end)]) + "\n") return out, start, end @@ -281,13 +281,13 @@ def metaeuk_single(regions, candidatesOutFile, length_extension, ass_name, group file, start, end = extract_sequence_from_to(tmp_path + name, tmp_path + key + ".fasta", start, end) #metaeuk call cmd = "metaeuk easy-predict " + file + " " + core_group + " " + tmp_path + name + " " + tmp_path + "/metaeuk --max-intron 130000 --max-seq-len 160000 --min-exon-aa 5 --max-overlap 5 --min-intron 1 --overlap 1" - print(cmd) + #print(cmd) starting_subprocess(cmd, mode) # parsing header and sequences 
try: sequence_file = open(tmp_path + name + ".fas", "r") lines = sequence_file.readlines() - print(lines) + #print(lines) id = 0 for line in lines: if line[0] == ">": From be1b56a32c98610b5f8360fd20f1f777e8875b1f Mon Sep 17 00:00:00 2001 From: mueli94 Date: Thu, 10 Feb 2022 14:42:37 +0100 Subject: [PATCH 179/229] metaeuk is incldued and running in fdog_assembly workflow --- fdog/fDOGassembly.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index d22b281..40c63f8 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -280,8 +280,9 @@ def metaeuk_single(regions, candidatesOutFile, length_extension, ass_name, group name = key + "_" + str(counter) file, start, end = extract_sequence_from_to(tmp_path + name, tmp_path + key + ".fasta", start, end) #metaeuk call - cmd = "metaeuk easy-predict " + file + " " + core_group + " " + tmp_path + name + " " + tmp_path + "/metaeuk --max-intron 130000 --max-seq-len 160000 --min-exon-aa 5 --max-overlap 5 --min-intron 1 --overlap 1" + cmd = "metaeuk easy-predict " + file + " " + core_group + " " + tmp_path + name + " " + tmp_path + "/metaeuk" #print(cmd) + # other parameteres used by BUSCO with metazoa set--max-intron 130000 --max-seq-len 160000 --min-exon-aa 5 --max-overlap 5 --min-intron 1 --overlap 1 starting_subprocess(cmd, mode) # parsing header and sequences try: @@ -1024,7 +1025,7 @@ def main(): group_computation_time_end = time.time() time_group = group_computation_time_end - group_computation_time_start else: - print("test") + #print("test") profile_path = "" group_computation_time_start = time.time() consensus_path = consensusSequence(core_path, group, mode, out) From cb9a5fd6c0e23f6907dd8a056bc2fe1dc2736d96 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Thu, 10 Feb 2022 15:07:16 +0100 Subject: [PATCH 180/229] testing other metaeuk parameters --- fdog/fDOGassembly.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 40c63f8..edaaffe 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -280,7 +280,7 @@ def metaeuk_single(regions, candidatesOutFile, length_extension, ass_name, group name = key + "_" + str(counter) file, start, end = extract_sequence_from_to(tmp_path + name, tmp_path + key + ".fasta", start, end) #metaeuk call - cmd = "metaeuk easy-predict " + file + " " + core_group + " " + tmp_path + name + " " + tmp_path + "/metaeuk" + cmd = "metaeuk easy-predict " + file + " " + core_group + " " + tmp_path + name + " " + tmp_path + "/metaeuk --min-exon-aa 5 --max-overlap 5 --min-intron 1 --overlap 1" #print(cmd) # other parameteres used by BUSCO with metazoa set--max-intron 130000 --max-seq-len 160000 --min-exon-aa 5 --max-overlap 5 --min-intron 1 --overlap 1 starting_subprocess(cmd, mode) From 79791e8f52c95ea2e2e62d228081225508eca07f Mon Sep 17 00:00:00 2001 From: mueli94 Date: Thu, 10 Feb 2022 15:20:37 +0100 Subject: [PATCH 181/229] using complete contigs for metaeuk --- fdog/fDOGassembly.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index edaaffe..c837c33 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -224,8 +224,8 @@ def extract_sequence_from_to(name, file, start, end): if int(end) > sequence_length: end = sequence_length #for testing only - #start = 0 - #end = len(seq_record.seq) + start = 0 + end = len(seq_record.seq) f.write(str(seq_record.seq[int(start):int(end)]) + "\n") return out, start, end From f6f72f7e0a5b3628045449afc9a350a542e1c339 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Thu, 10 Feb 2022 15:34:36 +0100 Subject: [PATCH 182/229] testing --- fdog/fDOGassembly.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index c837c33..edaaffe 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -224,8 +224,8 @@ def extract_sequence_from_to(name, 
file, start, end): if int(end) > sequence_length: end = sequence_length #for testing only - start = 0 - end = len(seq_record.seq) + #start = 0 + #end = len(seq_record.seq) f.write(str(seq_record.seq[int(start):int(end)]) + "\n") return out, start, end From 61a1ee54036074d2d3079766dae26a1bd1a2b300 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Fri, 11 Feb 2022 11:41:27 +0100 Subject: [PATCH 183/229] added parameter for own metaeuk db --- fdog/fDOGassembly.py | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index edaaffe..20b74e3 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -266,7 +266,7 @@ def augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, aug #print("No gene found in region with ID" + name + " in species " + ass_name + " , continuing with next region") output.close() -def metaeuk_single(regions, candidatesOutFile, length_extension, ass_name, group, tmp_path, mode, core_group): +def metaeuk_single(regions, candidatesOutFile, length_extension, ass_name, group, tmp_path, mode, db): output = open(candidatesOutFile, "w") for key in regions: @@ -280,7 +280,7 @@ def metaeuk_single(regions, candidatesOutFile, length_extension, ass_name, group name = key + "_" + str(counter) file, start, end = extract_sequence_from_to(tmp_path + name, tmp_path + key + ".fasta", start, end) #metaeuk call - cmd = "metaeuk easy-predict " + file + " " + core_group + " " + tmp_path + name + " " + tmp_path + "/metaeuk --min-exon-aa 5 --max-overlap 5 --min-intron 1 --overlap 1" + cmd = "metaeuk easy-predict " + file + " " + db + " " + tmp_path + name + " " + tmp_path + "/metaeuk --min-exon-aa 5 --max-overlap 5 --min-intron 1 --overlap 1" #print(cmd) # other parameteres used by BUSCO with metazoa set--max-intron 130000 --max-seq-len 160000 --min-exon-aa 5 --max-overlap 5 --min-intron 1 --overlap 1 starting_subprocess(cmd, mode) @@ -686,7 +686,7 @@ def clean_fas(path, 
file_type): file.close() def ortholog_search_tblastn(args): - (asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs, gene_prediction) = args + (asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs, gene_prediction, metaeuk_db) = args output = [] cmd = 'mkdir ' + out + '/tmp/' + asName starting_subprocess(cmd, 'silent') @@ -739,7 +739,11 @@ def ortholog_search_tblastn(args): output.append("Time augustus: %s species %s \n" % (str(time_augustus), asName)) else: time_metaeuk_start = time.time() - metaeuk_single(regions, candidatesOutFile, length_extension, asName, group, tmp_path, mode, fasta_path) + if metaeuk_db == '': + db = fasta_path + else: + db = metaeuk_db + metaeuk_single(regions, candidatesOutFile, length_extension, asName, group, tmp_path, mode, db) time_metaeuk_end = time.time() time_metaeuk = time_metaeuk_end - time_metaeuk_start output.append("Time metaeuk: %s species %s \n" % (str(time_metaeuk), asName)) @@ -856,6 +860,7 @@ def main(): optional.add_argument('--parallel', help= 'The ortholog search of multiple species will be done in parallel', action='store_true', default=False) optional.add_argument('--augustus', help= 'Gene prediction is done by using the tool Augustus PPX', action='store_true', default=False) optional.add_argument('--augustusRefSpec', help='augustus reference species', action='store', default='') + optional.add_argument('--metaeukDb', help='path to metaeuk reference database', action='store', default='') args = parser.parse_args() # required @@ -887,6 +892,7 @@ def main(): append = args.append parallel = args.parallel augustus_ref_species = 
args.augustusRefSpec + metaeuk_db = args.metaeukDb #gene prediction tool augustus = args.augustus @@ -964,6 +970,12 @@ def main(): assemblyDir = dataPath + '/assembly_dir/' check_path(assemblyDir) + if metaeuk_db != '': + if not metaeuk_db.endswith('/'): + metaeuk_db = metaeuk_db + '/' + check_path(metaeuk_db) + + try: f = open(out + "/fdog.log", "a+") except FileNotFoundError: @@ -1045,7 +1057,7 @@ def main(): cpus = mp.cpu_count() pool = mp.Pool(cpus) for asName in assembly_names: - calls.append([asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs, gene_prediction]) + calls.append([asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs, gene_prediction, metaeuk_db]) results = (pool.imap_unordered(ortholog_search_tblastn, calls)) pool.close() @@ -1057,7 +1069,7 @@ def main(): else: ###################### computation species wise ################ for asName in assembly_names: - args = [asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs, gene_prediction] + args = [asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs, gene_prediction, metaeuk_db] reciprocal_sequences, candidatesOutFile, output_ortholog_search = ortholog_search_tblastn(args) ortholog_sequences.append([reciprocal_sequences, 
candidatesOutFile]) for k in output_ortholog_search: From 81ec9a562d52b9546fd4c7161e89725b9e23783a Mon Sep 17 00:00:00 2001 From: mueli94 Date: Fri, 11 Feb 2022 11:47:31 +0100 Subject: [PATCH 184/229] bugfix --- fdog/fDOGassembly.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 20b74e3..daf8bff 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -971,8 +971,6 @@ def main(): check_path(assemblyDir) if metaeuk_db != '': - if not metaeuk_db.endswith('/'): - metaeuk_db = metaeuk_db + '/' check_path(metaeuk_db) From 17a546a155cf5efa09f7c8e16c888a10a9d65615 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Mon, 14 Feb 2022 14:40:39 +0100 Subject: [PATCH 185/229] for debugging function get_distance_biopython --- fdog/fDOGassembly.py | 1 + 1 file changed, 1 insertion(+) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index daf8bff..4a05627 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -313,6 +313,7 @@ def searching_for_db(assembly_path): return check def get_distance_biopython(file, matrix): + print(file) aln = AlignIO.read(open(file), 'fasta') calculator = DistanceCalculator(matrix) dm = calculator.get_distance(aln) From c260ce4b1fceabf421dbf2c2b459ee2ea92978f7 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Mon, 14 Feb 2022 15:19:06 +0100 Subject: [PATCH 186/229] testing --- fdog/fDOGassembly.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 4a05627..664e429 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -313,7 +313,7 @@ def searching_for_db(assembly_path): return check def get_distance_biopython(file, matrix): - print(file) + #print(file) aln = AlignIO.read(open(file), 'fasta') calculator = DistanceCalculator(matrix) dm = calculator.get_distance(aln) @@ -637,7 +637,7 @@ def coorthologs(candidate_names, tmp_path, candidatesFile, fasta, fdog_ref_speci for record in candidates: for name in 
candidate_names: if name in record.id: - f.write(">" + name + "\n") + f.write(">" + record.id + "\n") f.write(str(record.seq) + "\n") f.close() From 0ec76787dffb4a5aa6b8ab0304992775f382335d Mon Sep 17 00:00:00 2001 From: mueli94 Date: Wed, 23 Feb 2022 10:47:17 +0100 Subject: [PATCH 187/229] bug fix, testing --- fdog/fDOGassembly.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 664e429..ec41ec2 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -636,7 +636,7 @@ def coorthologs(candidate_names, tmp_path, candidatesFile, fasta, fdog_ref_speci for record in candidates: for name in candidate_names: - if name in record.id: + if name == record.id: f.write(">" + record.id + "\n") f.write(str(record.seq) + "\n") f.close() From 76e503819d7376a59a0a71b8fe9a3c548ad6ecf5 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Thu, 24 Feb 2022 11:04:24 +0100 Subject: [PATCH 188/229] bug fix --- fdog/fDOGassembly.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index ec41ec2..0aead0e 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -634,11 +634,14 @@ def coorthologs(candidate_names, tmp_path, candidatesFile, fasta, fdog_ref_speci f.write(str(record.seq) + "\n") break + already_written = [] for record in candidates: for name in candidate_names: if name == record.id: - f.write(">" + record.id + "\n") - f.write(str(record.seq) + "\n") + if name not in already_written: + f.write(">" + record.id + "\n") + f.write(str(record.seq) + "\n") + already_written.append(name) f.close() if msaTool == "muscle": From ad12f0aaa68e331847b1e4379cb62cae56c2f729 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Tue, 1 Mar 2022 11:22:44 +0100 Subject: [PATCH 189/229] gff file positions were corrected during fDOG-Assembly run --- fdog/fDOGassembly.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/fdog/fDOGassembly.py 
b/fdog/fDOGassembly.py index 0aead0e..d7a8e37 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -268,6 +268,8 @@ def augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, aug def metaeuk_single(regions, candidatesOutFile, length_extension, ass_name, group, tmp_path, mode, db): output = open(candidatesOutFile, "w") + region = open(candidatesOutFile.replace(".candidates.fa", ".regions.txt"), "w") + region.write("Conting/scaffold" + "\t" + "start" + "\t" + "end" + "\n") for key in regions: locations = regions[key] @@ -279,6 +281,7 @@ def metaeuk_single(regions, candidatesOutFile, length_extension, ass_name, group end = str(i[1] + length_extension) name = key + "_" + str(counter) file, start, end = extract_sequence_from_to(tmp_path + name, tmp_path + key + ".fasta", start, end) + region.write(file + "\t" + str(start) + "\t" + str(end)) #metaeuk call cmd = "metaeuk easy-predict " + file + " " + db + " " + tmp_path + name + " " + tmp_path + "/metaeuk --min-exon-aa 5 --max-overlap 5 --min-intron 1 --overlap 1" #print(cmd) @@ -298,6 +301,15 @@ def metaeuk_single(regions, candidatesOutFile, length_extension, ass_name, group else: output.write(line) sequence_file.close() + + gff_file = open(tmp_path + name + ".gff", "r") + lines = gff_file.readlines() + for line in lines: + values = line.split("\t") + values[3] = int(values[3]) + int(start) + values[4] = int(values[4]) + int(start) + gff_file.write("\t".join(values)) + gff_file.close() except FileNotFoundError: pass From 6b15f26c04e30b3516d2b560527498c255474e74 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Tue, 1 Mar 2022 13:56:48 +0100 Subject: [PATCH 190/229] bug fix --- fdog/fDOGassembly.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index d7a8e37..e8ed0ee 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -303,12 +303,12 @@ def metaeuk_single(regions, candidatesOutFile, length_extension, ass_name, 
group sequence_file.close() gff_file = open(tmp_path + name + ".gff", "r") - lines = gff_file.readlines() - for line in lines: - values = line.split("\t") - values[3] = int(values[3]) + int(start) - values[4] = int(values[4]) + int(start) - gff_file.write("\t".join(values)) + lines = gff_file.readlines() + for line in lines: + values = line.split("\t") + values[3] = int(values[3]) + int(start) + values[4] = int(values[4]) + int(start) + gff_file.write("\t".join(values)) gff_file.close() except FileNotFoundError: pass From 7d7504f1f76e01a4cd27cad5a371ef3c6cc7bcf4 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Tue, 1 Mar 2022 14:18:39 +0100 Subject: [PATCH 191/229] bug fix --- fdog/fDOGassembly.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index e8ed0ee..051f331 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -306,8 +306,8 @@ def metaeuk_single(regions, candidatesOutFile, length_extension, ass_name, group lines = gff_file.readlines() for line in lines: values = line.split("\t") - values[3] = int(values[3]) + int(start) - values[4] = int(values[4]) + int(start) + values[3] = str(int(values[3]) + int(start)) + values[4] = str(int(values[4]) + int(start)) gff_file.write("\t".join(values)) gff_file.close() except FileNotFoundError: From 826d676f3846cfa16a6fbba5cdba0d066e158023 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Tue, 1 Mar 2022 14:40:52 +0100 Subject: [PATCH 192/229] bug fix --- fdog/fDOGassembly.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 051f331..3770e9b 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -306,8 +306,10 @@ def metaeuk_single(regions, candidatesOutFile, length_extension, ass_name, group lines = gff_file.readlines() for line in lines: values = line.split("\t") - values[3] = str(int(values[3]) + int(start)) - values[4] = str(int(values[4]) + int(start)) + new_start = 
int(values[3]) + int(start) + values[3] = str(new_start) + new_end = int(values[4]) + int(start) + values[4] = str(new_end) gff_file.write("\t".join(values)) gff_file.close() except FileNotFoundError: From 8a832fc1c67161e9361a94bc29f32d9863e284a0 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Tue, 1 Mar 2022 15:00:56 +0100 Subject: [PATCH 193/229] bug fix --- fdog/fDOGassembly.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 3770e9b..2168b5d 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -302,7 +302,7 @@ def metaeuk_single(regions, candidatesOutFile, length_extension, ass_name, group output.write(line) sequence_file.close() - gff_file = open(tmp_path + name + ".gff", "r") + gff_file = open(tmp_path + name + ".gff", "r+") lines = gff_file.readlines() for line in lines: values = line.split("\t") From 14c852c8ed8b53d5f2007820406084ac72908dea Mon Sep 17 00:00:00 2001 From: mueli94 Date: Tue, 1 Mar 2022 15:34:59 +0100 Subject: [PATCH 194/229] bug fix --- fdog/fDOGassembly.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 2168b5d..7027236 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -302,15 +302,18 @@ def metaeuk_single(regions, candidatesOutFile, length_extension, ass_name, group output.write(line) sequence_file.close() - gff_file = open(tmp_path + name + ".gff", "r+") + gff_file = open(tmp_path + name + ".gff", "r") lines = gff_file.readlines() + new_lines = [] for line in lines: values = line.split("\t") - new_start = int(values[3]) + int(start) - values[3] = str(new_start) - new_end = int(values[4]) + int(start) - values[4] = str(new_end) - gff_file.write("\t".join(values)) + values[3] = str(int(values[3]) + int(start)) + values[4] = str(int(values[4]) + int(start)) + new_lines.append("\t".join(values)) + gff_file.close() + gff_file = open(tmp_path + name + ".gff", "w") + for line 
in new_lines: + gff_file.write(line) gff_file.close() except FileNotFoundError: pass From b1a25aa450d31cc0a05776f5eaec554d15cfa686 Mon Sep 17 00:00:00 2001 From: Hannah Muelbaier <47216555+mueli94@users.noreply.github.com> Date: Wed, 15 Jun 2022 16:08:44 +0200 Subject: [PATCH 195/229] bug fix blast DB computation (#23) --- .DS_Store | Bin 6148 -> 6148 bytes .gitignore | 7 +++ fdog/.gitignore | 143 +++++++++++++++++++++++++++++++++++++++++++ fdog/fDOGassembly.py | 3 + fdog/runMulti.py | 12 ++-- fdog/runSingle.py | 2 +- setup.py | 2 +- 7 files changed, 161 insertions(+), 8 deletions(-) create mode 100644 fdog/.gitignore diff --git a/.DS_Store b/.DS_Store index bcbd073c8626ea73a8116c4f66a9c94aeb88f9c8..592bcd75f056c0282a98197e05ceaaca71ada43e 100644 GIT binary patch delta 136 zcmZoMXffE}&7!m;sURn_xWvHV8Y2@k3o9Et2RjEhM{ICLetB?7Vo7PS)8uLvZy!z$ z&UgWd>S`l1104lp6XRMPg=$M9Alt;)thSbuLsVJcIw(FnCpRy@ivbK686h+SFO-H+ T-J8W&KQc{h5ZcVn@s}R}{Z}39 delta 137 zcmZoMXffE}%_4CssURn_xWvHVIwKP^3o9Et2L~4i7cbZ3L>6JzfW(rFq{+D~-T@q( z9Gvk2lGW8lhDJIHhDK(!IttZ>M&^b(3Z@q3wY8ia;;M$Wo(Z{?Rn;}Mb+Z`2fRPbG VGw?%c7&Ut{3+qRw&Fmb1`2mpQ9GCzA diff --git a/.gitignore b/.gitignore index 38cf321..90963b6 100644 --- a/.gitignore +++ b/.gitignore @@ -128,6 +128,13 @@ dmypy.json # Pyre type checker .pyre/ +# DS_store +**/.DS_Store +/fdog/.DS_Store +/fdog/data/.DS_Store +/fdog/bin/.DS_Store +/fdog/setup/.DS_Store + #Hannah /fdog/data/core_orthologs/ /fdog/data/assembly_dir/ diff --git a/fdog/.gitignore b/fdog/.gitignore new file mode 100644 index 0000000..1912743 --- /dev/null +++ b/fdog/.gitignore @@ -0,0 +1,143 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script 
from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# DS_store +**/.DS_Store +/fdog/.DS_Store +/fdog/data/.DS_Store +/fdog/bin/.DS_Store +/fdog/setup/.DS_Store + +#Hannah +/fdog/data/core_orthologs/ +/fdog/data/assembly_dir/ +/fdog/fdog_goes_assembly/tmp/ +taxdump* +/fdog/fDOGassembly.py diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 9fc840d..9c0dc6b 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -721,6 +721,9 @@ def ortholog_search_tblastn(args): assembly_path = assemblyDir + "/" + asName + "/" + asName + ".fa" db_path = assemblyDir + "/" + asName + "/blast_dir/" + asName + ".fa" blast_dir_path = assemblyDir + "/" + asName + "/blast_dir/" + if not os.path.exists(blast_dir_path): + cmd = 'mkdir ' + blast_dir_path + starting_subprocess(cmd, 'silent') db_check = searching_for_db(blast_dir_path) if db_check == 0: diff --git a/fdog/runMulti.py b/fdog/runMulti.py index be552a7..ca8a058 100644 --- a/fdog/runMulti.py +++ b/fdog/runMulti.py @@ -75,7 +75,7 @@ def prepare(args, step): return(basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, mute) def getSeedName(seedFile): - seqName = seedFile.split('.')[0] + seqName = seedFile.rsplit('.', 1)[0] seqName = re.sub('[\|\.]', '_', seqName) return(seqName) @@ -217,14 +217,11 @@ def createConfigPP(outpath, jobName, refspec): settings['rank'] = 'species' settings['refspec'] = refspec settings['clusterProfile'] = 'TRUE' - print("HERER") - print(settings) - print('%s/%s.config.yml' % (outpath, jobName)) with open('%s/%s.config.yml' % (outpath, jobName), 'w') as configfile: yaml.dump(settings, configfile, 
default_flow_style = False) def main(): - version = '0.0.51' + version = '0.0.52' parser = argparse.ArgumentParser(description='You are running fdogs.run version ' + str(version) + '.') parser.add_argument('--version', action='version', version=str(version)) required = parser.add_argument_group('Required arguments') @@ -535,7 +532,10 @@ def main(): ### join output finalFa = joinOutputs(outpath, jobName, seeds, keep, silent) else: - print("%s.extended.fa found in %s! If you want to re-run the ortholog search, please use --force option." % (jobName, outpath)) + if append == True: + sys.exit("Currently the append option is not available. Please use fdog.run if you need this option!") + else: + sys.exit("%s.extended.fa found in %s! If you want to re-run the ortholog search, please use --force or --append option." % (jobName, outpath)) ### calculate FAS scores if fasoff == False: if os.path.exists('%s/%s.phyloprofile' % (outpath, jobName)): diff --git a/fdog/runSingle.py b/fdog/runSingle.py index f239f90..df22bd2 100644 --- a/fdog/runSingle.py +++ b/fdog/runSingle.py @@ -199,7 +199,7 @@ def getTaxName(taxId): return(name) def main(): - version = '0.0.51' + version = '0.0.52' parser = argparse.ArgumentParser(description='You are running fdog.run version ' + str(version) + '.') parser.add_argument('--version', action='version', version=str(version)) required = parser.add_argument_group('Required arguments') diff --git a/setup.py b/setup.py index 86d521d..63bc4cb 100644 --- a/setup.py +++ b/setup.py @@ -26,7 +26,7 @@ setup( name="fdog", - version="0.0.51", + version="0.0.52", python_requires='>=3.7.0', description="Feature-aware Directed OrtholoG search tool", From 7f666b7f73aede70d5f09622e0ec11f4d30f129b Mon Sep 17 00:00:00 2001 From: Hannah <47216555+mueli94@users.noreply.github.com> Date: Tue, 24 Oct 2023 14:05:37 +0200 Subject: [PATCH 196/229] Squashed commit of the following: commit 3f2929b64ef28051f2f848914be50f024cdc64ef Merge: 7fb439f 959e0a0 Author: Hannah 
Muelbaier <47216555+mueli94@users.noreply.github.com> Date: Mon Oct 23 15:06:29 2023 +0200 Merge pull request #2 from BIONF/master deactivate fas config output commit 7fb439fa0e5e0e53d5db2f7e54550ec55885ed2f Author: mueli94 Date: Fri Oct 20 10:05:26 2023 +0200 Revert "Merge branch 'fdog-assembly' of https://github.com/mueli94/fDOG-Assembly into fdog-assembly" This reverts commit d688bbb6f9498d3e8e02a20a8321a91b9215a23d, reversing changes made to 5068b7acf249209c0b4d951b23fa7c2d79af32ab. commit d0d359f0afc52ca9f3a4b8782315b6ba56f00094 Merge: 5068b7a d688bbb Author: mueli94 Date: Fri Oct 20 10:05:01 2023 +0200 Merge branch 'fdog-assembly' of https://github.com/mueli94/fDOG-Assembly into fdog-assembly commit d688bbb6f9498d3e8e02a20a8321a91b9215a23d Merge: 5068b7a 3b62945 Author: mueli94 Date: Thu Oct 19 16:28:55 2023 +0200 Merge branch 'fdog-assembly' of https://github.com/mueli94/fDOG-Assembly into fdog-assembly commit 5068b7acf249209c0b4d951b23fa7c2d79af32ab Author: mueli94 Date: Thu Oct 19 16:20:18 2023 +0200 contig fasta files where delited from the tmp folder to save memory commit 3b629458e9f12d7f60089385dd04d5a2a0284d9f Author: Hannah <47216555+mueli94@users.noreply.github.com> Date: Thu Sep 21 16:31:17 2023 +0200 further updates to adapt fDOG-Assembly to the new fDOG version. Additionally bugfix in co-ortholog detection if MSA crashed due to too many sequences. 
commit 41660e6d7e3d3ade28230d281bf3701fdaac7615 Author: Hannah <47216555+mueli94@users.noreply.github.com> Date: Wed Sep 20 10:34:56 2023 +0200 fixed augustus version commit 5d77522b91c854cdb0e38502806b083ed8c8c403 Author: mueli94 Date: Tue Sep 19 15:02:57 2023 +0200 Bugfix muscle v5 command in fdog.addCoreGroup commit 959e0a0fb1544654d99931686e09b69f70d6facc Author: trvinh Date: Tue Sep 19 14:49:48 2023 +0200 deactivate fas config output commit ea40a0cd682f3b88392e7e7e92ba2e0df5735002 Merge: 3140fc4 3cc809c Author: Hannah <47216555+mueli94@users.noreply.github.com> Date: Mon Sep 18 17:27:04 2023 +0200 Merge branch 'fdog-assembly' of https://github.com/mueli94/fDOG-Assembly into fdog-assembly commit 3140fc40dad5b8d56ec4313608b90c42cad47835 Author: Hannah <47216555+mueli94@users.noreply.github.com> Date: Mon Sep 18 17:25:39 2023 +0200 script to produce msa and hmm in the format fDOG-Assembly requires from a fasta file commit 3cc809c49bfd29c391d59e2eea7619d307c1358c Author: Hannah <47216555+mueli94@users.noreply.github.com> Date: Mon Sep 18 17:21:07 2023 +0200 script to produce msa and hmm in the format fDOG-Assembly requires from a fasta file commit ef5acb725a00fa6fc811a18aa5360ec137aacc2c Author: Hannah <47216555+mueli94@users.noreply.github.com> Date: Mon Sep 18 17:20:09 2023 +0200 further changes to adapt to new fDOG version commit a832824f2db3e23866463aceaa0f03a968bbb130 Author: Hannah <47216555+mueli94@users.noreply.github.com> Date: Mon Sep 18 13:17:11 2023 +0200 adjustments to new muscle version and fDOG version commit 8df8a737fe06d7b2fc6739453594ce89eafbfa0b Author: Hannah <47216555+mueli94@users.noreply.github.com> Date: Mon Sep 18 11:38:06 2023 +0200 added fDOG-Assembly dependencies commit fc0ea453574efbff8cf8be31d18a68711709a346 Author: Hannah <47216555+mueli94@users.noreply.github.com> Date: Mon Sep 18 10:58:18 2023 +0200 added fDOG-Assembly workflow commit 5ccf8242c5714fb17d6d73cd2d41f10496a17cb8 Author: trvinh Date: Wed Aug 16 13:33:40 2023 +0200 check 
valid rank for refspec only commit deeda8add38064e096368b3dd0d4f80369373203 Author: Vinh Tran Date: Wed Aug 16 13:19:31 2023 +0200 Update CHANGELOG.md commit eb902a63d94ae59dd065688e58cc5db6ef7eb3fd Author: trvinh Date: Wed Aug 16 12:03:36 2023 +0200 option to not added all taxa commit e6ce3d19bbb152a0906296e7ae0e5f4755cfbc76 Author: trvinh Date: Wed Aug 16 11:50:10 2023 +0200 changed orthID NA to fdogMA for manually added taxa commit 5cb0946e6c4dccd54f8dd5eda46e7228e3114564 Author: trvinh Date: Wed Aug 16 09:57:26 2023 +0200 added all searchTaxa to phyloprofile commit 3e0859b3753c497f9394b570f7d70c87708dc1c7 Author: trvinh Date: Tue Aug 15 15:39:54 2023 +0200 added check for invalid min-/maxDist; works with fasta36.3.8g commit b7ec6fa437ece98f9aa25ce4f2001a12641b4d24 Author: trvinh Date: Tue Aug 15 15:19:36 2023 +0200 fixed worrking with old fasta36 commit 553cdaa442c3468ae86e503b45aa14cf46bf4f52 Author: trvinh Date: Tue Aug 15 12:44:18 2023 +0200 added check for invalid min-/maxDist; works with fasta36.3.8g commit c756ab03eeea5f71f6685588a3c575f0c6415447 Author: trvinh Date: Tue Aug 15 11:48:59 2023 +0200 added check for invalid min-/maxDist; works with fasta36.3.8g commit a746c8fa045d7fed7012f53ea7ab83bcac3d154c Author: trvinh Date: Tue Aug 15 11:32:45 2023 +0200 added check for invalid min-/maxDist; works with fasta36.3.8g commit 48372cf2ee93c21063a8138d27946b4152ac133a Author: trvinh Date: Tue Jul 18 11:11:37 2023 +0200 added warning for no hmm hit commit a8541f89e3b2317c29ac79f45767afb1aaffc35a Author: trvinh Date: Thu Jun 15 16:24:24 2023 +0200 set seqlen limit for muscle5 commit b06b3a12eb640a8442320781cf483523331f9e34 Author: trvinh Date: Mon Jun 12 14:09:09 2023 +0200 fixed mapping file for addTaxon commit c5ca07cf56935b5c0a5c3ba73f1bf2648aac38c2 Author: trvinh Date: Thu May 25 14:13:45 2023 +0200 fixed output filename for muscle5 commit b014157c38bab799b08cffe92fb12385db5fb378 Author: trvinh Date: Thu May 11 13:13:19 2023 +0200 check empty tmp dir before 
deleting in addTaxon commit 052cf704d5406cc8ba1e79b989a49355b2580f4b Author: trvinh Date: Fri Mar 10 15:42:15 2023 +0100 muscle5,checkBlast,ignoreAnnoCheck commit 7e6bfbc5a1f54e610c9f8d28b44175f1f97e3a99 Author: trvinh Date: Wed Feb 22 09:57:23 2023 +0100 adapt evalue commit 56995b147ce7a60de635e910f74f87a93c39a87c Author: trvinh Date: Wed Feb 22 09:56:40 2023 +0100 adapt evalue commit 7798d15db810c9b06f13f56aea072c2af44679a5 Author: trvinh Date: Mon Feb 20 13:50:31 2023 +0100 rename log fdogs.run commit 03c9038595a18503724041b801b0122b74927fe0 Author: trvinh Date: Fri Feb 17 14:29:03 2023 +0100 fix number of hmmhits core compilation commit f7c4cf830b8ef2c69cdbee4314aaf623eddad107 Author: trvinh Date: Tue Feb 14 12:47:09 2023 +0100 fix check for pathconfig file commit 57859800b224271805a6f5731fb26671661b7630 Author: trvinh Date: Tue Feb 14 11:14:23 2023 +0100 fix sorting hmm hits commit bb185a3aaca3128a58e865a5ed60a43a655fb804 Author: trvinh Date: Mon Feb 6 14:14:52 2023 +0100 add several options to work with data paths; resolves: #28 commit 8bd2359c366276fa9b1b811b1a7605d57367a485 Author: trvinh Date: Mon Feb 6 14:04:01 2023 +0100 add several options to work with data paths; resolves: #28 commit 7c3191a75f4a762a91ea52f10cc53a7135a987e2 Author: trvinh Date: Mon Feb 6 13:59:31 2023 +0100 add several options to work with data paths commit ab5491b6e0c180722f367089b2d175b49b5fc741 Author: trvinh Date: Fri Feb 3 15:46:46 2023 +0100 option to use different names for data folders commit 198393dc31c89ef31df391b55933a905df605962 Author: trvinh Date: Fri Feb 3 15:07:49 2023 +0100 add option to update json file to checkData commit 4339bad93a84eefd090e579b679121b8422909ce Author: trvinh Date: Thu Feb 2 17:17:34 2023 +0100 accept old folder names commit bfbc324aff95289d03663324d1a2f082ccf44854 Author: trvinh Date: Thu Feb 2 16:39:34 2023 +0100 added fn desc commit ad8741d5df55a49af642a49914238291139c0301 Author: trvinh Date: Thu Feb 2 10:17:29 2023 +0100 v0.1.5 version bump 
commit ad2cbbdac62a1525e0d99b8e74b92a2a2cf969db Author: trvinh Date: Thu Feb 2 10:06:53 2023 +0100 sort hmm hits using domain scores; correct tree walking; setup force not delete data; correct profile output with fasOff commit 206def02f2cec7ad3aa5a3297001cc50d5f6a95c Author: trvinh Date: Thu Feb 2 10:04:18 2023 +0100 sort hmm hits using domain scores; correct tree walking; setup force not delete data; correct profile output with fasOff commit 1edad42cbfb50150e7918f5bfac5c6cf39325671 Author: trvinh Date: Fri Jan 27 09:33:58 2023 +0100 version bump commit 8855db4d5a77c4f3085e8f9f66089f0b22e5de9b Author: trvinh Date: Fri Jan 27 09:33:38 2023 +0100 reduce evalue for identify seed ID commit 83326905754032fae0e52640404656baeca8862c Author: trvinh Date: Thu Jan 26 14:48:27 2023 +0100 fix bug identify seed ID with reuseCore commit 9edaf9ebc84f4f886b014b90de60f077953339d0 Author: trvinh Date: Thu Jan 26 10:47:14 2023 +0100 speed up preparation in fdogs.run commit 2405698b9b9037b3b8beedb9ef69d9fe020810a0 Author: trvinh Date: Thu Jan 26 10:24:23 2023 +0100 save core jobs for fdogs.run to file commit e72825223e8fb634a1277fb8e77fdcee337f24c5 Author: trvinh Date: Thu Jan 26 09:51:00 2023 +0100 fix path to refspec fa; add amino to hmmbuild commit f984b2665ad4dfc45d65fcdd5173a6d7f12c04b9 Author: trvinh Date: Wed Jan 25 14:41:09 2023 +0100 added preparation runtime to fdogs.run commit 8624f7dca706a3de6a0c9c7a5f2b77cbf245be5f Author: trvinh Date: Tue Jan 24 14:23:03 2023 +0100 fixed kimura distance devide by 0 commit dd9a45cab14c12386204c64ade04c5b225cb84f8 Author: trvinh Date: Tue Jan 24 10:57:38 2023 +0100 updated README commit 055051fcf4d2700a70e79dc35f5ce9e8ae76f302 Author: Vinh Tran Date: Tue Jan 24 10:21:08 2023 +0100 0.1.0 (#26) * create v0.1.0 * simplfying hamstr.pl * first python conversion * modified addTaxon and addTaxa * added install/check dependencies * rename data folders * added runtime for core complilation commit d8eea1fe68efd6a497865aafb41ef2f82a0f2ede Author: 
trvinh Date: Wed Oct 12 09:10:56 2022 +0200 fixed addTaxon replacing pipe commit 20aa839eb84cb6b5161140b443bc9850f1cf9287 Author: trvinh Date: Mon Jun 13 10:19:40 2022 +0200 removed fdog assembly files commit a0747e0f7388d4d26e848a764774976e050ba0c7 Author: trvinh Date: Tue Mar 15 14:19:32 2022 +0100 fdogs accepts dots in seed filename --- .DS_Store | Bin 6148 -> 0 bytes .github/workflows/github_build.yml | 25 +- CHANGELOG.md | 38 + README.md | 22 +- fdog/.DS_Store | Bin 8196 -> 0 bytes fdog/addTaxa.py | 225 +-- fdog/addTaxon.py | 221 +-- fdog/bin/Filehandler.pm | 45 - fdog/bin/getSearchTaxa.pl | 145 -- fdog/bin/hamstr.pl | 2358 ----------------------- fdog/bin/oneSeq.pl | 2860 ---------------------------- fdog/bin/run_genewise_hamstr.pm | 260 --- fdog/bin/translate.pl | 197 -- fdog/checkData.py | 539 ++++-- fdog/data/conda_requirements.yml | 8 + fdog/data/dependencies.txt | 9 + fdog/fDOGassembly.py | 119 +- fdog/{bin => libs}/__init__.py | 0 fdog/libs/addtaxon.py | 202 ++ fdog/libs/alignment.py | 185 ++ fdog/libs/blast.py | 95 + fdog/libs/corecompile.py | 420 ++++ fdog/libs/fas.py | 120 ++ fdog/libs/fasta.py | 77 + fdog/libs/hmm.py | 108 ++ fdog/libs/orthosearch.py | 274 +++ fdog/libs/output.py | 114 ++ fdog/libs/preparation.py | 202 ++ fdog/libs/tree.py | 141 ++ fdog/libs/zzz.py | 196 ++ fdog/makeCoreGroupFromFasta.py | 99 + fdog/mergeOutput.py | 8 +- fdog/removefDog.py | 29 +- fdog/runMulti.py | 688 +++---- fdog/runSingle.py | 519 ++--- fdog/setPaths.py | 86 + fdog/setup/__init__.py | 0 fdog/setup/indexTaxonomy.pl | 21 - fdog/setup/install_lib.sh | 189 -- fdog/setup/setup.sh | 427 ----- fdog/setup/setup_conda.sh | 447 ----- fdog/setupfDog.py | 280 ++- fdog/showTaxa.py | 40 +- setup.py | 12 +- 44 files changed, 3779 insertions(+), 8271 deletions(-) delete mode 100644 .DS_Store create mode 100644 CHANGELOG.md delete mode 100644 fdog/.DS_Store delete mode 100755 fdog/bin/Filehandler.pm delete mode 100644 fdog/bin/getSearchTaxa.pl delete mode 100755 
fdog/bin/hamstr.pl delete mode 100755 fdog/bin/oneSeq.pl delete mode 100755 fdog/bin/run_genewise_hamstr.pm delete mode 100755 fdog/bin/translate.pl create mode 100644 fdog/data/conda_requirements.yml create mode 100644 fdog/data/dependencies.txt rename fdog/{bin => libs}/__init__.py (100%) create mode 100644 fdog/libs/addtaxon.py create mode 100644 fdog/libs/alignment.py create mode 100644 fdog/libs/blast.py create mode 100644 fdog/libs/corecompile.py create mode 100644 fdog/libs/fas.py create mode 100644 fdog/libs/fasta.py create mode 100644 fdog/libs/hmm.py create mode 100644 fdog/libs/orthosearch.py create mode 100644 fdog/libs/output.py create mode 100644 fdog/libs/preparation.py create mode 100644 fdog/libs/tree.py create mode 100644 fdog/libs/zzz.py create mode 100644 fdog/makeCoreGroupFromFasta.py create mode 100644 fdog/setPaths.py delete mode 100644 fdog/setup/__init__.py delete mode 100644 fdog/setup/indexTaxonomy.pl delete mode 100755 fdog/setup/install_lib.sh delete mode 100755 fdog/setup/setup.sh delete mode 100755 fdog/setup/setup_conda.sh diff --git a/.DS_Store b/.DS_Store deleted file mode 100644 index 592bcd75f056c0282a98197e05ceaaca71ada43e..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeHKL5mYH6n^QBc4|?2P}qww;B{H+c3BWF({3vi)WgoA2bJ1!Y8{-Nl+KJ*OJUCX zW4!uH{4bvLdr5X_XZ7Gkq~t;JC3){l`o5B6LPVlhME8hVMC76{I@eJB#@Ns0lGV%z zI|a`%Oyuh@ksbYBK<+vD(QJXwrdH;-kSkK=SQSA;m4 zAmqifIF00@CyO-76gSWgx8XKMt=;AF;HbUlbq+hLJ#Tsc0PVxh(Q4IjZ{4~3;AHeJ zIZNedExW+Er0lxEbNGVBMLzF?d78*{iZPC_(<2(vF%9Sm1vIFr@)Z&5sx;{_brCtF zhhPMBO8XR3y2%o^E35{cAi7I^um@nAQB+~@FI->8rHm$&Q$aJt&*)<%{zoPL24ctb z0sm})*_p%rIdDp!Hn}y hm.fa + fdog.addTaxon -f hm.fa -i 9606 -o ./ -c -a + ls - name: Deploy if: startsWith(github.event.ref, 'refs/tags') uses: casperdcl/deploy-pypi@v2 diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..add2833 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,38 @@ +# Changelog + +## [Dev version] + +### 
Added +- +### Changed +- +### Fixed +- + +## [0.1.23] - 2023.08.16 + +### Added +- Option to NOT adding all search taxa (`--notAddingTaxa`); OFF by default, +i.e. all search taxa will be present in phyloprofile output +- Check invalid min-/max rank for referece species (specified by --minDist and --maxDist). +If the specified ranks (by default, `--minDist genus` `--maxDist kingdom`) are not available, +the next valid ranks will be suggested (or automatically applied if default ranks are used) +- Check if seed sequence cannot be retrieved by blast. Return with the blast command + +### Changed +- +### Fixed +- Fixed issue with long directory path for FASTA36 v36.3.8g Dec 2017 + +## [0.1.12] - 2023.03.10 + +### Added +- Option to not check annotations for fdog.checkData (option `--ignoreAnno`) +- Check compatibility between blastp and blast DBs + +### Changed +- Work with MUSCLE v5.1 +- Replace MuscleCommandline and MafftCommandline by subprocess.run + +### Fixed +- diff --git a/README.md b/README.md index 8db83ce..082a46d 100644 --- a/README.md +++ b/README.md @@ -44,30 +44,24 @@ export PATH=$HOME/.local/bin:$PATH After installing *fdog*, you need to setup *fdog* to get its dependencies and pre-calculated data. -**NOTE**: in case you haven't installed [greedyFAS](https://github.com/BIONF/FAS) before, it will be installed automatically within *fDOG* setup. However, you need to run [setupFAS](https://github.com/BIONF/FAS/wiki/setupFAS) after *fDOG* setup finished before actually using *fDOG*! +**NOTE**: in case you haven't installed [greedyFAS](https://github.com/BIONF/FAS), it will be installed automatically within *fDOG* setup. However, you need to run [setupFAS](https://github.com/BIONF/FAS/wiki/setupFAS) after *fDOG* setup finished before actually using *fDOG*! 
You can setup fDOG by running this command ``` -fdog.setup -o /output/path/for/fdog/data +fdog.setup -d /output/path/for/fdog/data ``` -or, in case you are using Anaconda -``` -fdog.setup -o /output/path/for/fdog/data --conda -``` - -*You should have the sudo password ready, otherwise some missing dependencies cannot be installed. See [dependency list](#dependencies) for more info. If you do not have root privileges, ask your admin to install those dependencies using `fdog.setup --lib` command.* [Pre-calculated data set](https://github.com/BIONF/fDOG/wiki/Input-and-Output-Files#data-structure) of fdog will be saved in `/output/path/for/fdog/data`. After the setup run successfully, you can start using *fdog*. **Please make sure to check if you need to run [setupFAS](https://github.com/BIONF/FAS/wiki/setupFAS) first.** You will get a warning if any of the dependencies are not ready to use, please solve those issues and rerun `fdog.setup`. -*For debugging the setup, please create a log file by running the setup as e.g. `fdog.setup | tee log.txt` for Linux/MacOS or `fdog.setup --conda | tee log.txt` for Anaconda and send us that log file, so that we can trouble shoot the issues. Most of the problems can be solved by just re-running the setup.* +*For debugging the setup, please create a log file by running the setup as e.g. `fdog.setup | tee log.txt` and send us that log file, so that we can trouble shoot the issues. Most of the problems can be solved by just re-running the setup.* # Usage *fdog* will run smoothly with the provided sample input file 'infile.fa' if everything is set correctly. ``` -fdog.run --seqFile infile.fa --seqName test --refspec HUMAN@9606@3 +fdog.run --seqFile infile.fa --jobName test --refspec HUMAN@9606@3 ``` The output files with the prefix `test` will be saved at your current working directory. 
You can have an overview about all available options with the command @@ -81,9 +75,9 @@ Please find more information in [our wiki](https://github.com/BIONF/fDOG/wiki) t Within the data package we provide a set of 78 reference taxa. They can be automatically downloaded during the setup. This data comes "ready to use" with the *fdog* framework. Species data must be present in the three directories listed below: -* genome_dir (Contains sub-directories for proteome fasta files for each species) -* blast_dir (Contains sub-directories for BLAST databases made with `makeblastdb` out of your proteomes) -* weight_dir (Contains feature annotation files for each proteome) +* searchTaxa_dir (Contains sub-directories for proteome fasta files for each species) +* coreTaxa_dir (Contains sub-directories for BLAST databases made with `makeblastdb` out of your proteomes) +* annotation_dir (Contains feature annotation files for each proteome) For each species/taxon there is a sub-directory named in accordance to the naming schema ([Species acronym]@[NCBI ID]@[Proteome version]) @@ -95,7 +89,7 @@ For adding **one gene set**, please use the `fdog.addTaxon` function: fdog.addTaxon -f newTaxon.fa -i tax_id [-o /output/directory] [-n abbr_tax_name] [-c] [-v protein_version] [-a] ``` -in which, the first 3 arguments are required including `newTaxon.fa` is the gene set that need to be added, `tax_id` is its NCBI taxonomy ID, `/output/directory` is where the sub-directories can be found (*genome_dir*, *blast_dir* and *weight_dir*). If not given, new taxon will be added into the same directory of pre-calculated data. 
Other arguments are optional, which are `-n` for specify your own taxon name (if not given, an abbriviate name will be suggested based on the NCBI taxon name of the input `tax_id`), `-c` for calculating the BLAST DB (only needed if you need to include your new taxon into the list of taxa for compilating the core set), `-v` for identifying the genome/proteome version (default will be 1), and `-a` for turning off the annotation step (*not recommended*). +in which, the first 3 arguments are required including `newTaxon.fa` is the gene set that need to be added, `tax_id` is its NCBI taxonomy ID, `/output/directory` is where the sub-directories can be found (*genome_dir*, *blast_dir* and *weight_dir*). If not given, new taxon will be added into the same directory of pre-calculated data. Other arguments are optional, which are `-n` for specify your own taxon name (if not given, an abbriviate name will be suggested based on the NCBI taxon name of the input `tax_id`), `-c` for calculating the BLAST DB (only needed if you need to include your new taxon into the list of taxa for compilating the core set), `-v` for identifying the genome/proteome version (default will be the current date ), and `-a` for turning off the annotation step (*not recommended*). 
## Adding a list of gene sets into fDOG For adding **more than one gene set**, please use the `fdog.addTaxa` script: diff --git a/fdog/.DS_Store b/fdog/.DS_Store deleted file mode 100644 index a99a01c231b8aab3b888fe9e4dacf4b66808b3f0..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 8196 zcmeHM!EVz)5S>j^aFdqyP>FIt@`YPe#i6wHfD}Rz>H!rg6&wJyaT6K~*NzgWp@g8^ z`2s-V7x)giBff~1NVZbn87%&VN2F?NlcxQ95ro8vnT_X(xhJka* z0PhbrD(kY=L#5@`0U=WW=qxVFf;RF1iLtdVYdut2LeZzn9)yMponioZX{fEPp@Q_QM0$4_#PrP~Apd z{VoeyL^-Na9mg*83slbelbDYQOnwX#V#E}bxSBxCkc*iG{X*$)a_Ji?ZbsP)|@rBw{)f7FFB=x?UdFB1-tK*Hk^We zHgh4$p!j^yWs3>U<>Mqor9muC@a|3=Y{Gb)F8WNe#&7 zv;}sf^pz0bU>je(ELKihkF%QLepBtv7_Uxo4ZL1=6t8@;v9|U{ypEViFuO?4z~~O$ z15X_s4LX7!`gD@!vzadm-Y&tqL2WvO+@UVkr6pyGzY|Bvhgf7S0LP<+bPUb22+>ap zQbtM~nFq3qsJaj7V@xj<#gP+7&1+PZqYNYY7&UD)_Mtd#RbU(#e%)=Y)-h*TRb?*)bQl+jggUXr1@Vo{cbUNgB6!zZjcAAiKXwIw#AO2 zvKR-W{Pr|vjjk=G@h3$q) uO9+Ha$Dw694n6wA5N#W#j45k9R2o-M{`wCA{Nb*@&&~T^E`br}kbxhuD9V}u diff --git a/fdog/addTaxa.py b/fdog/addTaxa.py index fa4a3a1..c03a9a8 100644 --- a/fdog/addTaxa.py +++ b/fdog/addTaxa.py @@ -1,17 +1,13 @@ # -*- coding: utf-8 -*- ####################################################################### -# Copyright (C) 2020 Vinh Tran +# Copyright (C) 2022 Vinh Tran # # This script is used to prepare data for fdog. -# For each given genome FASTA file, It will create a folder within genome_dir +# For each given genome FASTA file, It will create a folder within searchTaxa_dir # with the naming scheme of fdog ([Species acronym]@[NCBI ID]@[Proteome version] -# e.g HUMAN@9606@3), a annotation file in JSON format in weight_dir and -# a blast DB in blast_dir folder (optional). -# For a long header of original FASTA sequence, only the first word -# will be taken as the ID of new fasta file, everything after the -# first whitespace will be removed. If this first word is not unique, -# an automatically increasing index will be added. 
+# e.g HUMAN@9606@3), a annotation file in JSON format in annotation_dir and +# a blast DB in coreTaxa_dir folder (optional). # # This script is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of @@ -26,104 +22,78 @@ import sys import os import argparse -from os import listdir -from os.path import isfile, join from pathlib import Path -import subprocess +from Bio import SeqIO import multiprocessing as mp +from tqdm import tqdm from ete3 import NCBITaxa -import csv -from io import StringIO import re import shutil -from tqdm import tqdm from datetime import datetime +import time +from pkg_resources import get_distribution +from collections import OrderedDict -def checkFileExist(file): - if not os.path.exists(os.path.abspath(file)): - sys.exit('%s not found' % file) +import fdog.libs.zzz as general_fn +import fdog.libs.tree as tree_fn +import fdog.libs.addtaxon as add_taxon_fn -def getTaxName(taxId): - ncbi = NCBITaxa() - try: - ncbiName = ncbi.get_taxid_translator([taxId])[int(taxId)] - ncbiName = re.sub('[^a-zA-Z1-9\s]+', '', ncbiName) - taxName = ncbiName.split() - name = taxName[0][:3].upper()+taxName[1][:2].upper() - except: - name = "UNK" + taxId - return(name) -def parseMapFile(mappingFile): - nameDict = {} - with open(mappingFile) as f: +def parse_map_file(mapping_file, folIn): + """ Create spec name from mapping file + And also check if given input files in mapping file exist + """ + name_dict = {} + with open(mapping_file) as f: for line in f: if not '#' in line: tmp = line.split('\t') - fileName = tmp[0] - taxId = tmp[1].strip() + file_name = tmp[0] + file_in = '%s/%s' % (folIn, file_name) + general_fn.check_file_exist(file_in) + tax_id = tmp[1].strip() try: - taxName = tmp[2].strip() + tax_name = tmp[2].strip() except: - taxName = getTaxName(taxId) + tax_name = '' try: ver = tmp[3].strip() except: - ver = datetime.today().strftime('%y%m%d') #1 - # print(taxName+"@"+str(taxId)+"@"+str(ver)) - 
nameDict[fileName] = (taxName, str(taxId), str(ver)) - return(nameDict) + ver = datetime.today().strftime('%y%m%d') + spec_name = add_taxon_fn.generate_spec_name(tax_id, tax_name, ver) + name_dict[file_in] = spec_name + return(name_dict) -def runAddTaxon(args): - (f,n,i,o,c,v,a,cpus,replace,delete) = args - cmd = 'fdog.addTaxon -f %s -n %s -i %s -o %s -v %s --cpus %s' % (f,n,i,o,v,cpus) - if c == True: - cmd = cmd + ' -c' - if a == True: - cmd = cmd + ' -a' - if replace == True: - cmd = cmd + ' --replace' - if delete == True: - cmd = cmd + ' --delete' - # print(cmd) - logFile = o + '/addTaxa2fDog.log' - cmd = cmd + ' >> ' + logFile - try: - subprocess.call([cmd], shell = True) - except: - sys.exit('Problem running\n%s' % (cmd)) def main(): - version = '0.0.9' - parser = argparse.ArgumentParser(description='You are running fdog.addTaxa version ' + str(version) + '.') + version = get_distribution('fdog').version + parser = argparse.ArgumentParser(description='You are running fDOG version ' + str(version) + '.') required = parser.add_argument_group('required arguments') optional = parser.add_argument_group('optional arguments') required.add_argument('-i', '--input', help='Path to input folder', action='store', default='', required=True) required.add_argument('-m', '--mapping', - help='Tab-delimited text file containing tabtabtab. The last 2 columns are optional.', + help='Tab-delimited text file containing tabtabtab. The last 2 columns are optional.', action='store', default='', required=True) optional.add_argument('-o', '--outPath', help='Path to output directory', action='store', default='') - optional.add_argument('-c', '--coreTaxa', help='Include these taxa to core taxa (i.e. taxa in blast_dir folder)', action='store_true', default=False) - optional.add_argument('-a', '--noAnno', help='Do NOT annotate these taxa using fas.doAnno', action='store_true', default=False) + optional.add_argument('--searchpath', help='Path to search taxa folder (e.g. 
fdog_data/searchTaxa_dir)', action='store', default='') + optional.add_argument('--corepath', help='Path to core taxa folder (e.g. fdog_data/coreTaxa_dir)', action='store', default='') + optional.add_argument('--annopath', help='Path to annotation folder (e.g. fdog_data/annotation_dir)', action='store', default='') + optional.add_argument('-c', '--coreTaxa', help='Include this taxon to core taxa (i.e. taxa in coreTaxa_dir folder)', action='store_true', default=False) + optional.add_argument('-a', '--noAnno', help='Do NOT annotate this taxon using fas.doAnno', action='store_true', default=False) optional.add_argument('--cpus', help='Number of CPUs used for annotation. Default = available cores - 1', action='store', default=0, type=int) optional.add_argument('--replace', help='Replace special characters in sequences by "X"', action='store_true', default=False) optional.add_argument('--delete', help='Delete special characters in sequences', action='store_true', default=False) - optional.add_argument('-f', '--force', help='Force overwrite existing data', action='store_true', default=False) + optional.add_argument('--force', help='Force overwrite existing data', action='store_true', default=False) - ### get arguments args = parser.parse_args() folIn = args.input + folIn = os.path.abspath(folIn) mapping = args.mapping - checkFileExist(mapping) + general_fn.check_file_exist(mapping) outPath = args.outPath - if outPath == '': - fdogPath = os.path.realpath(__file__).replace('/addTaxa.py','') - pathconfigFile = fdogPath + '/bin/pathconfig.txt' - if not os.path.exists(pathconfigFile): - sys.exit('No pathconfig.txt found. 
Please run fdog.setup (https://github.com/BIONF/fDOG/wiki/Installation#setup-fdog).') - with open(pathconfigFile) as f: - outPath = f.readline().strip() - outPath = os.path.abspath(outPath) + searchpath = args.searchpath + corepath = args.corepath + annopath = args.annopath noAnno = args.noAnno coreTaxa = args.coreTaxa cpus = args.cpus @@ -131,61 +101,66 @@ def main(): cpus = mp.cpu_count()-2 replace = args.replace delete = args.delete + add_taxon_fn.check_conflict_opts(replace, delete) force = args.force - - ### get existing genomes - Path(outPath + "/genome_dir").mkdir(parents = True, exist_ok = True) - Path(outPath + "/weight_dir").mkdir(parents = True, exist_ok = True) - genomeFiles = listdir(outPath + "/genome_dir") - - ### generate taxon names from mapping file - nameDict = parseMapFile(mapping) - - ### read all input fasta files and create addTaxon jobs - jobs = [] - dupList = {} - faFiles = [f for f in listdir(folIn) if isfile(join(folIn, f))] - for f in faFiles: - # tmp = f.split('.') - if f in nameDict: - # check duplicated taxon name in existing data - taxName = '@'.join(nameDict[f]) - flag = 1 - if taxName in genomeFiles: - if force: - shutil.rmtree(outPath + "/genome_dir/" + taxName) - if not noAnno: - shutil.rmtree(outPath + "/weight_dir/" + taxName) - else: - flag = 0 - dupList[f] = taxName - - if flag == 1: - fasta = folIn + '/' + f - name = nameDict[f][0] - taxid = nameDict[f][1] - verProt = nameDict[f][2] - jobs.append([ - folIn + '/' + f, nameDict[f][0], nameDict[f][1], - outPath, coreTaxa, nameDict[f][2], noAnno, cpus, replace, delete - ]) - - if len(dupList) > 0: - print("These taxa are probably already present in %s:" % (outPath + "/genome_dir")) - for f in dupList: - print('\t'+f+'\t'+dupList[f]) - if force: - print('They will be deleted and re-compiled!') - else: - sys.exit("Please remove them from the mapping file or use different Name/ID/Version!") - - print('Parsing...') - for job in tqdm(jobs): - # print('@'.join([job[1],job[2],job[5]]) 
+ '\t' + job[0]) - runAddTaxon(job) - - print('Output can be found in %s' % outPath) + start = time.time() + ### parse mapping file + name_dict = parse_map_file(mapping, folIn) + + ### initiate paths + fdogPath = os.path.realpath(__file__).replace('/addTaxa.py','') + (outPath, searchpath, corepath, annopath) = add_taxon_fn.get_paths(outPath, fdogPath, searchpath, corepath, annopath) + Path(searchpath).mkdir(parents = True, exist_ok = True) + + ### create file in searchTaxa_dir [and coreTaxa_dir] + genome_jobs = [] + blast_jobs = [] + for f in name_dict: + spec_name = name_dict[f] + ## remove old folder if force is set + if force == True: + if os.path.exists('%s/%s' % (searchpath, spec_name)): + shutil.rmtree('%s/%s' % (searchpath, spec_name)) + if os.path.exists('%s/%s' % (corepath, spec_name)): + shutil.rmtree('%s/%s' % (corepath, spec_name)) + ## create jobs + genome_path = '%s/%s' % (searchpath, spec_name) + Path(genome_path).mkdir(parents = True, exist_ok = True) + genome_jobs.append([f, genome_path, spec_name, force, replace, delete]) + if coreTaxa: + genome_file = '%s/%s.fa' % (genome_path, spec_name) + blast_jobs.append([searchpath, corepath, outPath, spec_name, genome_file, force, True]) + pool = mp.Pool(cpus) + + print('Parsing genome for %s species...' % len(genome_jobs)) + genome_out = [] + for _ in tqdm(pool.imap_unordered(add_taxon_fn.create_genome, genome_jobs), + total=len(genome_jobs)): + genome_out.append(_) + out_msg = 'Output for %s can be found in %s' % (spec_name, searchpath) + if len(blast_jobs) > 0: + print('\nCreating Blast DB for %s species...' 
% len(blast_jobs)) + blast_out = [] + for _ in tqdm(pool.imap_unordered(add_taxon_fn.create_blastdb, blast_jobs), + total=len(blast_jobs)): + blast_out.append(_) + out_msg = '%s, %s' % (out_msg, corepath) + + ### create annotation + if not noAnno: + Path(annopath).mkdir(parents = True, exist_ok = True) + for f in name_dict: + genome_file = '%s/%s/%s.fa' % (searchpath, name_dict[f], name_dict[f]) + add_taxon_fn.create_annoFile(annopath, genome_file, cpus, force) + if os.path.exists('%s/tmp' % annopath): + if not os.listdir('%s/tmp' % annopath): + shutil.rmtree('%s/tmp' % annopath) + out_msg = '%s, %s' % (out_msg, annopath) + + end = time.time() + print('==> Adding %s taxa finished in %s' % (len(name_dict), '{:5.3f}s'.format(end - start))) + print('==> %s' % out_msg) if __name__ == '__main__': main() diff --git a/fdog/addTaxon.py b/fdog/addTaxon.py index f962cba..bd17fe0 100755 --- a/fdog/addTaxon.py +++ b/fdog/addTaxon.py @@ -1,17 +1,13 @@ # -*- coding: utf-8 -*- ####################################################################### -# Copyright (C) 2020 Vinh Tran +# Copyright (C) 2022 Vinh Tran # # This script is used to prepare data for fdog. -# It will create a folder within genome_dir with the naming scheme of -# fdog ([Species acronym]@[NCBI ID]@[Proteome version], e.g -# HUMAN@9606@3) and a annotation file in JSON format in weight_dir -# (optional). -# For a long header of original FASTA sequence, only the first word -# will be taken as the ID of new fasta file, everything after the -# first whitespace will be removed. If this first word is not unique, -# an automatically increasing index will be added. +# For each given genome FASTA file, It will create a folder within searchTaxa_dir +# with the naming scheme of fdog ([Species acronym]@[NCBI ID]@[Proteome version] +# e.g HUMAN@9606@3), a annotation file in JSON format in annotation_dir and +# a blast DB in coreTaxa_dir folder (optional). 
# # This script is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of @@ -27,73 +23,30 @@ import os import argparse from pathlib import Path -from Bio import SeqIO -import subprocess -import multiprocessing as mp -from ete3 import NCBITaxa -import re import shutil +import multiprocessing as mp from datetime import datetime +from pkg_resources import get_distribution + +import fdog.libs.zzz as general_fn +import fdog.libs.tree as tree_fn +import fdog.libs.addtaxon as add_taxon_fn -def checkFileExist(file): - if not os.path.exists(os.path.abspath(file)): - sys.exit('%s not found' % file) - -def checkOptConflict(replace, delete): - if delete: - if replace: - sys.exit('*** ERROR: only one option can be choose between "--replace" and "--delete"') - if replace: - if delete: - sys.exit('*** ERROR: only one option can be choose between "--replace" and "--delete"') - -def checkTaxId(taxId): - ncbi = NCBITaxa() - tmp = ncbi.get_rank([taxId]) - try: - tmp = ncbi.get_rank([taxId]) - rank = tmp[int(taxId)] - if not rank == 'species': - print('\033[92mWARNING: rank of %s is not SPECIES (%s)\033[0m' % (taxId, rank)) - else: - print('\033[92mNCBI taxon info: %s %s\033[0m' % (taxId, ncbi.get_taxid_translator([taxId])[int(taxId)])) - except: - print('\033[92mWARNING: %s not found in NCBI taxonomy database!\033[0m' % taxId) - -def getTaxName(taxId): - ncbi = NCBITaxa() - try: - ncbiName = ncbi.get_taxid_translator([taxId])[int(taxId)] - ncbiName = re.sub('[^a-zA-Z1-9\s]+', '', ncbiName) - taxName = ncbiName.split() - name = taxName[0][:3].upper()+taxName[1][:2].upper() - except: - name = "UNK" + taxId - return(name) - -def runBlast(args): - (specName, specFile, outPath) = args - blastCmd = 'makeblastdb -dbtype prot -in %s -out %s/blast_dir/%s/%s' % (specFile, outPath, specName, specName) - try: - subprocess.call([blastCmd], shell = True) - except: - sys.exit('Problem with running %s' % blastCmd) - fileInGenome = 
"../../genome_dir/%s/%s.fa" % (specName, specName) - fileInBlast = "%s/blast_dir/%s/%s.fa" % (outPath, specName, specName) - if not Path(fileInBlast).exists(): - os.symlink(fileInGenome, fileInBlast) def main(): - version = '0.0.10' - parser = argparse.ArgumentParser(description='You are running fdog.addTaxon version ' + str(version) + '.') + version = get_distribution('fdog').version + parser = argparse.ArgumentParser(description='You are running fDOG version ' + str(version) + '.') required = parser.add_argument_group('required arguments') optional = parser.add_argument_group('optional arguments') required.add_argument('-f', '--fasta', help='FASTA file of input taxon', action='store', default='', required=True) required.add_argument('-i', '--taxid', help='Taxonomy ID of input taxon', action='store', default='', required=True, type=int) optional.add_argument('-o', '--outPath', help='Path to output directory', action='store', default='') + optional.add_argument('--searchpath', help='Path to search taxa folder (e.g. fdog_data/searchTaxa_dir)', action='store', default='') + optional.add_argument('--corepath', help='Path to core taxa folder (e.g. fdog_data/coreTaxa_dir)', action='store', default='') + optional.add_argument('--annopath', help='Path to annotation folder (e.g. fdog_data/annotation_dir)', action='store', default='') optional.add_argument('-n', '--name', help='Acronym name of input taxon', action='store', default='', type=str) optional.add_argument('-v', '--verProt', help='Proteome version', action='store', default='', type=str) - optional.add_argument('-c', '--coreTaxa', help='Include this taxon to core taxa (i.e. taxa in blast_dir folder)', action='store_true', default=False) + optional.add_argument('-c', '--coreTaxa', help='Include this taxon to core taxa (i.e. 
taxa in coreTaxa_dir folder)', action='store_true', default=False) optional.add_argument('-a', '--noAnno', help='Do NOT annotate this taxon using fas.doAnno', action='store_true', default=False) optional.add_argument('--cpus', help='Number of CPUs used for annotation. Default = available cores - 1', action='store', default=0, type=int) optional.add_argument('--replace', help='Replace special characters in sequences by "X"', action='store_true', default=False) @@ -102,133 +55,65 @@ def main(): args = parser.parse_args() - checkFileExist(args.fasta) + general_fn.check_file_exist(args.fasta) faIn = args.fasta - name = args.name.upper() taxId = str(args.taxid) - # outPath = str(Path(args.outPath).resolve()) - outPath = args.outPath #str(Path(args.outPath).resolve()) - if outPath == '': - fdogPath = os.path.realpath(__file__).replace('/addTaxon.py','') - pathconfigFile = fdogPath + '/bin/pathconfig.txt' - if not os.path.exists(pathconfigFile): - sys.exit('No pathconfig.txt found. Please run fdog.setup (https://github.com/BIONF/fDOG/wiki/Installation#setup-fdog).') - with open(pathconfigFile) as f: - outPath = f.readline().strip() - outPath = os.path.abspath(outPath) - noAnno = args.noAnno - coreTaxa = args.coreTaxa + outPath = args.outPath + searchpath = args.searchpath + corepath = args.corepath + annopath = args.annopath + name = args.name.upper() ver = str(args.verProt) if ver == '': ver = datetime.today().strftime('%y%m%d') + noAnno = args.noAnno + coreTaxa = args.coreTaxa cpus = args.cpus if cpus == 0: cpus = mp.cpu_count()-2 replace = args.replace delete = args.delete - checkOptConflict(replace, delete) + add_taxon_fn.check_conflict_opts(replace, delete) force = args.force ### species name after fdog naming scheme - checkTaxId(taxId) - if name == "": - name = getTaxName(taxId) - specName = name+'@'+taxId+'@'+ver - print('Species name\t%s' % specName) + spec_name = add_taxon_fn.generate_spec_name(taxId, name, ver) + print('Species name\t%s' % spec_name) + + ### get 
paths + fdogPath = os.path.realpath(__file__).replace('/addTaxon.py','') + (outPath, searchpath, corepath, annopath) = add_taxon_fn.get_paths(outPath, fdogPath, searchpath, corepath, annopath) ### remove old folder if force is set - if force: - if os.path.exists(outPath + '/genome_dir/' + specName): - shutil.rmtree(outPath + '/genome_dir/' + specName) - if os.path.exists(outPath + '/blast_dir/' + specName): - shutil.rmtree(outPath + '/blast_dir/' + specName) + if force == True: + if os.path.exists('%s/%s' % (searchpath, spec_name)): + shutil.rmtree('%s/%s' % (searchpath, spec_name)) + if os.path.exists('%s/%s' % (corepath, spec_name)): + shutil.rmtree('%s/%s' % (corepath, spec_name)) + + ### initiate paths + genome_path = add_taxon_fn.create_folders(searchpath, corepath, annopath, spec_name, coreTaxa, noAnno) - ### create file in genome_dir + ### create file in searchTaxa_dir print('Parsing FASTA file...') - Path(outPath + '/genome_dir').mkdir(parents = True, exist_ok = True) - genomePath = outPath + '/genome_dir/' + specName - Path(genomePath).mkdir(parents = True, exist_ok = True) - # load fasta seq - inSeq = SeqIO.to_dict((SeqIO.parse(open(faIn), 'fasta'))) - specFile = genomePath + '/' + specName + '.fa' - if (not os.path.exists(os.path.abspath(specFile))) or (os.stat(specFile).st_size == 0) or force: - f = open(specFile, 'w') - index = 0 - modIdIndex = 0 - # longId = 'no' - tmpDict = {} - # with open(specFile + '.mapping', 'a') as mappingFile: - for id in inSeq: - seq = str(inSeq[id].seq) - # check ID - # oriId = id - if ' ' in id: - sys.exit('\033[91mERROR: Sequence IDs (e.g. %s) must not contain space(s)!\033[0m' % id) - else: - if '\|' in id: - print('\033[91mWARNING: Sequence IDs contain pipe(s). 
They will be replaced by "_"!\033[0m') - id = re.sub('\|', '_', id) - # if len(id) > 20: - # modIdIndex = modIdIndex + 1 - # id = modIdIndex - # longId = 'yes' - # if not id in tmpDict: - # tmpDict[id] = 1 - # else: - # index = index + 1 - # id = str(index) - # tmpDict[id] = 1 - # mappingFile.write('%s\t%s\n' % (id, oriId)) - # check seq - if seq[-1] == '*': - seq = seq[:-1] - specialChr = 'no' - if any(c for c in seq if not c.isalpha()): - specialChr = 'yes' - if specialChr == 'yes': - if replace or delete: - if replace: - seq = re.sub('[^a-zA-Z]', 'X', seq) - if delete: - seq = re.sub('[^a-zA-Z]', '', seq) - else: - sys.exit('\033[91mERROR: %s sequence contains special character!\033[0m\nYou can use --replace or --delete to solve it.' % (id)) - f.write('>%s\n%s\n' % (id, seq)) - f.close() - # write .checked file - cf = open(specFile+'.checked', 'w') - cf.write(str(datetime.now())) - cf.close() - # warning about long header - # if longId == 'yes': - # print('\033[91mWARNING: Some headers longer than 80 characters have been automatically shortened. 
PLease check the %s.mapping file for details!\033[0m' % specFile) - else: - print(genomePath + '/' + specName + '.fa already exists!') + genome_file = add_taxon_fn.create_genome([faIn, genome_path, spec_name, force, replace, delete]) + out_msg = 'Output for %s can be found in %s' % (spec_name, searchpath) ### create blast db if coreTaxa: - print('Creating Blast DB...') - Path(outPath + '/blast_dir').mkdir(parents = True, exist_ok = True) - if (not os.path.exists(os.path.abspath(outPath + '/blast_dir/' + specName + '/' + specName + '.phr'))) or force: - try: - runBlast([specName, specFile, outPath]) - except: - print('\033[91mProblem with creating BlastDB.\033[0m') - else: - print('Blast DB already exists!') + print('\nCreating Blast DB...') + add_taxon_fn.create_blastdb([searchpath, corepath, outPath, spec_name, genome_file, force, False]) + out_msg = '%s, %s' % (out_msg, corepath) ### create annotation if not noAnno: - Path(outPath + '/weight_dir').mkdir(parents = True, exist_ok = True) - annoCmd = 'fas.doAnno -i %s/%s.fa -o %s --cpus %s' % (genomePath, specName, outPath+'/weight_dir', cpus) - if force: - annoCmd = annoCmd + " --force" - try: - subprocess.call([annoCmd], shell = True) - except: - print('\033[91mProblem with running fas.doAnno. 
You can check it with this command:\n%s\033[0m' % annoCmd) - - print('Output for %s can be found in %s within genome_dir [and blast_dir, weight_dir] folder[s]' % (specName, outPath)) + add_taxon_fn.create_annoFile(annopath, genome_file, cpus, force) + if os.path.exists('%s/tmp' % annopath): + if not os.listdir('%s/tmp' % annopath): + shutil.rmtree('%s/tmp' % annopath) + out_msg = '%s, %s' % (out_msg, annopath) + + print('\n==> %s' % out_msg) if __name__ == '__main__': main() diff --git a/fdog/bin/Filehandler.pm b/fdog/bin/Filehandler.pm deleted file mode 100755 index 5f1e08e..0000000 --- a/fdog/bin/Filehandler.pm +++ /dev/null @@ -1,45 +0,0 @@ -package Filehandler; -use strict; -# PROGRAM NAME: Filehandler.pm - -# AUTHOR: INGO EBERSBERGER, ingo.ebersberger@univie.ac.at - -# PROGRAM DESCRIPTION: A module that retrieves a filename, path and -# input separator. It opens a file and hands back an object where via -# the command 'next' the next line of the file is fetched. - -# DATE: 19.08.2003 - - -# DATE LAST MODIFIED: - - -##################### start subroutine ####################### -## blessing the variable: -## constructor that returns a file handle -sub TIEHANDLE { - my $class = shift; - my $name = shift; - my $path = shift; - $/ = shift; - $path =~ s/\/$//; - my $self; - open ($self, "$path/$name") or die "could not open $path/$name\n"; - bless($self, $class); - return ($self); -} - -sub READLINE { - my ($self) = shift; - return <$self>; -} -sub CLOSE { - my $self = shift; - close ($self) or die "could not close filehandle\n"; -} - -sub PRINT { - my $self = shift; - print $self @_; -} -1; diff --git a/fdog/bin/getSearchTaxa.pl b/fdog/bin/getSearchTaxa.pl deleted file mode 100644 index 33b7b30..0000000 --- a/fdog/bin/getSearchTaxa.pl +++ /dev/null @@ -1,145 +0,0 @@ -#!/usr/bin/perl -use strict; -use warnings; -use Bio::DB::Taxonomy; -use Bio::Tree::Tree; -use Bio::TreeIO; -use Getopt::Std; -use Cwd 'abs_path'; - -sub usage { - my $msg = shift; - print "example: 
perl getSearchTaxa.pl -i genome_dir -b 0.00005 -h 0.00005 -r 10 -n mammalia -t taxonomy -o searchList.txt\n"; - print "-i\tFolder contains all search species (e.g. genome_dir)\n"; - die $msg."\n"; -} - -# global variables -our($opt_i,$opt_b,$opt_h,$opt_r,$opt_n,$opt_t,$opt_o); -getopts('i:b:h:r:n:t:o:'); - -# sanity checks; -my $genome_dir = ($opt_i) ? $opt_i : usage("ERROR: No input folder given\n"); -my $eval_blast = ($opt_b) ? $opt_b : usage("ERROR: No eval_blast given\n"); -my $eval_hmmer = ($opt_h) ? $opt_h : usage("ERROR: No eval_hmmer given\n"); -my $eval_relaxfac = ($opt_r) ? $opt_r : usage("ERROR: No eval_relaxfac given\n"); -my $group = ($opt_n) ? $opt_n : usage("ERROR: No group given\n"); -my $idx_dir = ($opt_t) ? $opt_t : usage("ERROR: No taxonomy dir given\n"); -my $output = ($opt_o) ? $opt_o : usage("ERROR: No output given\n"); - -open(OUT, ">$output") || die "Cannot create $output\n"; -my $groupNode; -my %taxa; -my $db; - -if($group ne "all") { - $db = Bio::DB::Taxonomy->new(-source => 'flatfile', - -nodesfile => $idx_dir . 'nodes.dmp', - -namesfile => $idx_dir . 'names.dmp', - -directory => $idx_dir); - checkGroup($group); - # get tree - %taxa = getTaxa($genome_dir); - my $tree = getTree(); - my $final_eval_blast = $eval_blast*$eval_relaxfac; - my $final_eval_hmmer = $eval_hmmer*$eval_relaxfac; - if($groupNode) { - foreach($tree->get_nodes()) { - if($_->id == $groupNode->id) { - $groupNode = $_; - } - } - $tree->set_root_node($groupNode); - } - foreach (get_leaves($tree)) { - my $tmp = @{$_->name('supplied')}[0]; - print OUT $tmp,"\n"; - } -} else { - %taxa = getTaxa($genome_dir); - foreach my $tax (keys %taxa) { - print OUT $tax,"\n"; - } -} -exit; - -sub checkGroup { - my ($group) = $_[0]; - my $node = $db->get_taxon(-name => $group); - if($node) { - $groupNode = $node; - } else { - print "Your selected group " . $group . " was not found in the taxonomic tree... 
TERMINATING\n"; - exit; - } -} - -sub getTaxa { - my ($genome_dir) = $_[0]; - ## removal of misplaced files in genome_dir - if (-e "$genome_dir/query.sql"){ - unlink("$genome_dir/query.sql"); - } - if (-e "$genome_dir/@@.fa"){ - unlink("$genome_dir/@@.fa"); - } - my @taxonlist = `ls $genome_dir`; - chomp @taxonlist; - for (my $i = 0; $i < @taxonlist; $i++) { - my ($taxon_name, $ncbi_id, $src_id) = split /@/, $taxonlist[$i]; - if (!$src_id) { - $src_id = ''; - } - $taxon_name = $taxonlist[$i]; - $taxa{$taxon_name} = $ncbi_id; - } - my $hashcount = keys(%taxa); - return(%taxa); -} - -sub getTree { - # the full lineages of the species are merged into a single tree - my $tree; - foreach my $key (sort {lc $a cmp lc $b} keys %taxa) { - my $node = $db->get_taxon(-taxonid => $taxa{$key}); - if (!defined $node){ - print "ISSUE in sub getTree. No correspodence found in taxonomy file for $key and taxid $taxa{$key}. Skipping...\n"; - next; - } - else { - $node->name('supplied', $key); - if($tree) { - $tree->merge_lineage($node); - } - else { - $tree = Bio::Tree::Tree->new(-verbose => $db->verbose, -node => $node); - } - } - } - return $tree; -} - -sub get_leaves { - my $tree = $_[0]; - my $delFlag = 0; - if(defined($_[1])){ - $delFlag = $_[1]; - } - - my $node = $tree->get_root_node; - my @leaves; - my @children = ($node); - for (@children) { - push @children, $_->each_Descendent(); - } - for (@children) { - push @leaves, $_ if defined($_->name('supplied')); - } - # if the tree is set to be deleted - if ($delFlag){ - @leaves = qw(); - return @leaves; - }else{ - return @leaves; - } -} diff --git a/fdog/bin/hamstr.pl b/fdog/bin/hamstr.pl deleted file mode 100755 index 09ec8ec..0000000 --- a/fdog/bin/hamstr.pl +++ /dev/null @@ -1,2358 +0,0 @@ -#!/usr/bin/perl -use strict; -use Getopt::Long; -use Parallel::ForkManager; -use Bio::SearchIO; -use Bio::Search::Hit::BlastHit; -use Bio::SeqIO; -use Bio::Align::ProteinStatistics; -use Bio::AlignIO; -use Term::Cap; -use POSIX; -use Cwd; 
-use Cwd 'abs_path'; -use Statistics::R; -use File::Basename; -use lib dirname(__FILE__); -use run_genewise_hamstr; - -# PROGRAMNAME: hamstr.pl - -# Copyright (C) 2009 INGO EBERSBERGER, ingo.ebersberger@univie.ac.at -# This program is free software; you can redistribute it and/or modify it -# under the terms of the GNU General Public License as published -# by the Free Software Foundation; either version 3 of the License -# or any later version. - -# This program is distributed in the hope that it will be useful -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -# General Public License for more details. -# You should have received a copy of the GNU General Public License -# along with this program; If not, see http://www.gnu.org/licenses - -# PROGRAM DESCRIPTION: HaMStR is a program for targeted ortholog search in both EST/RNAseq -# and protein sequence data. - -# DATE: Wed Dec 19 10:41:09 CEST 2007 - -# PROGRAM HISTORY -##23. 07. 2010: found a bug in the extraction of the -## hmm hit sequence from the sequnence_file. A end-of-line char was missing. - -##09.08.2010: added the option to choose the new blastp program from ncbi. Just comment -##out line 45 in the script and uncomment line 46. Note, in order to make this work I have -##to slightly modify the blast output since otherwise it will not be parsed by the Bioperl -##Blast parser. Currently this is a pretty dirty $sedprog hack. It will also take care of removin -##the string lcl| that is added in some instances to the hit id in the blast output. - -## I added the option that one can now provide a comma-separated string of phmm names as an -## argument for the option -hmm - -## 08.03.2011: -## 1) BUG-FIX: Hamstr will now remove automatically newlines from the input sequence file -## 2) BUG-FIX: The sequence header remains now the same whether or not the flag -representative -## has been chosen. 
- -## 10.04.2011 -## 1) added some information to the log file. - -## 20.05.2011 -## 1) BUG-FIX: The grep for the EST sequence in the sub-routine predictORF received also a hit when -## the search pattern was only a substring of the EST sequence identifier. In some cases the wrong EST -## was then used to predict the ORF. This has been fixed. - -## 30.05.2011 -## 1) Extension: a command line option -longhead has been added. The user can now specify that the -## full sequence id including whitespaces will considered throughout the hamstr search. Note, the -## whitespaces will be replaced by the string specified in the variabel $idsep. -## 2) Modification from the bug fix from 20.05.2011. In the grep for the original EST it is no longer -## necessary that the search string and the EST id are identical over their entire length. Instead the -## search string may be a prefix of the EST id ending with a whitespace. - -## 27.06.2011 -## 1) Extension: I added the option to run a true reciprocal best hit search. Only the best hit from the -## hmmer search is used to check for reciprocity. - -## 06.12.2011 -## 1) Extension: I added the option -hit_limit to set the number of hmmsearch hits that HaMStR uses for -## the re-blast. - -## 10.02.2012 -## 1) Extension: I added checks for the appropriate hmmsearch version (HMMER 3) and for genewise and -## its environmental variable WISECONFIGDIR. -## 2) Bug fix in the -rbh option. - -## 11.09.2012 -## 1) Bug fix: -hitlimit, even if not set explicitely has been invoked resulting in a more stringent -## behaviour of HaMStR. This has been fixed resulting in longer run-times. -## 18.12.2012 -## 1) Bug fix: There was a bug in the CDS extraction for reverse complemented -## sequences. A new line was moved to the beginning of the sequence -## leading to index errors. 
- -## 18.12.2013 -## 1) Bug fix: I have now adapted the script such that it no longer requires the default directory structure -## 2) Extension: Hamstr is now capable of identifying co-orthologs (sub routine IdentifyCoorthologsProt) -## 3) The re-blast for EST sequences is now a BlastX solving the problem of duplicated output for contigs with a pHMM -## hit in more than one frame. - -## 08.01.2014 -## Extension: check for co-orthology between ref-protein and best blast hit in cases both are not identical. -## Bug fix: option -rbh was disfunctional due to a missing function in new sub-routine parseHmmer3pm -## Bug fix: sortRef actually did not sort anything as it was accessing the unsorted file - -## 09.01.2014 -## Extension: Add the possibility to sort the hits by the hmmersearch Score rather than an alignment score -## This will make the best hmmersearch hit surviving the re-blast automatically the representative - -## 10.01.2014 -## Bug fix (minor): modified option -outpath to accept non-default value -## modification of the translate_tc call. - -## 17.01.2014 -## Bug fix (minor): added the option --anysymbol to the mafft-linsi command to avoid crash when protein sequences -## contain non-standard amino acids (U = selenocystein) - -## 14.02.2014 -## Extension: added the option to use ublast rather than blast for the reciprocity check - -## 25.02.2014 -## Extension: added the option -reuse. By default, old results will now be deleted unless flag -reuse has been set -## Extension: added syntax required for running fact implemented into Hamstr2.0. Option -fact does not occur in help -## as this works only in context with Hamstr2.0 -## Extension: Hamstr now outputs a results summary at the end of the run. -## Extension: added the option -cleartmp to faciliate automatic removal of the tmp-dir prior to the hamstr run - -## 05.03.2014 -## Bug fix (minor): Variable $grepprog was not used throughout the script. In some routines 'grep' was hard coded. 
-## On MAC OS this could lead to unwanted call of BSD grep resulting in an error during re-blast. - -## 16.03.2014 -## Bug fix (major): There was a problem in translating ESTs in the correct frame. This has been fixed. -## Modification (minor): The alignment positions together with the score are no longer sorted externally. - -## 02.04.2014 -## Bug fix (minor): Flag $runublast was not changed to 1 when configuring hamstr -## with the ublast option. - -## 05.08.2014 -## Exentsion: Update to version 13.2.3. New features include the option to run hamstr in silent mode and the option -## to parallelize the hamstr search for individual core orthologs using the Parallel::ForkManager package - -## 14.08.2014 -## Bug fix (minor): corrected typo in sub routine call 'printOUT' - -## 31.07.2015 -## Extension: Update to version 13.2.4. New feaure provides the option to entirely remove intron sequences and incomplete codons -## from transcripts - -## 03.07.2015 -## Minor extension: Selected behavior with respect to introns in transcripts will be printed to hamstrsearch.log - -## 05.07.2015 -## Minor bug fix: A no-blast hit was not reported properly in the sub routine check4reciprocity resulting in rare -## cases in the acceptance of a spurious ortholog. - -## 14.08.2015 -## Change of output file naming. Upon selection of the strict option the reference species is no longer appended -## to the output file name. - -## 30.01.2016 -## Minor changes including the better integration of the onseq.pl script. Among others the blast files are no longer -## expected to have the '_prot' appendix. - -## 12.02.2016 -## Minor bug fix: In some instances the representative protein was not chosen correctly due to a bug in the subroutine -## sortRef. Analyses of transcript data are not affected at all. - -## 19.12.2017 -## Extension: HaMStR can now automatically determine the hit limit up to which candidates from the intial -## hmm search are evaluated as potential orthologs. 
Two options are available, either an hmm score driven -## cutoff determination, or alternatively, a lagPhase-based estimator. - -## 02.02.2018 -## Bug fix (solved): using grep within the checkcoorthologsref routine could cause an incomplete alignment of the reference gene, -## the candidate ortholog and the best blast hit. The resulting distance (kimura) calculation may caused an overoptimistic -## acceptence of co-ortholgy relations. The bug onyl occured while using the option checkCoOrthologsRef. -## HaMStR keeps original gene sets in FASTA format and *.fa.mod will link to the original FASTA file (no linebreaks within a sequence). - -## 28.02.2018 -## Minor Bug fix (solved): HaMStR is not longer asking infinite times for the replacement of already existing output files. -## Minor Bug fix (solved): Backward compatibility extended. Naming of reference fasta files: (*.fa and *_prot.fa) - -## 20.07.2019 -## fixed the issue of long proteins with best total hmm bit score but very poor domain scores. Allow now the option to sort -## hmmsearch output according to the best domain bit score. 
The current routine assumes that neither query nor -## hit has whitespaces in their names - -## 14.04.2020 (Vinh) -## Bug fix (solved): existing symbolic link cannot recognized while checking the reference fasta file - -## 10.07.2020 (v13.2.12 - vinh) solved problem when gene ID contains PIPE -## 13.07.2020 (v13.3.0 - vinh) solved problem when gene ID contains PIPE -## 22.07.2020 (v13.4.0 - vinh) moved tmp blast files to output folder and delete them when finished -## 01.12.2020 (v13.4.1 - vinh) add silent option to muscle for checkCoOrthologsRef -## 21.01.2021 (v13.4.2 - vinh) fiexed bug when refspec has "dot" in its name -## 19.03.2021 (v13.4.3 - vinh) changed $path to current directory -## 19.03.2021 (v13.4.5 - vinh) do not replace space by @ for hmm output in parseHmmer4pm -## 12.01.2022 (v13.4.6 - vinh) change aligner from MUSCLE to MAFFT if the sequence is longer than 12,000 aa - -######################## start main ########################################### -my $version = "HaMStR v.13.4.6"; - -######################## checking whether the configure script has been run ### -my $configure = 0; -if ($configure == 0){ - die "\n\n$version\n\nPLEASE RUN setup1s BEFORE USING HAMSTR\n\n"; -} -########## EDIT THE FOLLOWING LINES TO CUSTOMIZE YOUR SCRIPT ################## -my $prog = 'hmmsearch'; #program for the hmm search -my $eval = 1; # default evalue cutoff for the hmm search -my $sedprog = 'sed'; -my $grepprog = 'grep'; -my $readlinkprog = 'readlink'; -my $alignmentprog = 'clustalw'; -my $alignmentprog_co = 'muscle'; -########## EDIT THE FOLLOWING TWO LINES TO CHOOSE YOUR BLAST PROGRAM ########## -my $blast_prog = 'blastp'; -my $filter = 'F'; # low complexity filter switch. Default 'on'. Set of 'F' to turn off permanently. 
-my $eval_blast = 10; # default evalue cutoff for the blast search -########## EDIT THE FOLLOWING LINES TO MODIFY DEFAULT PATHS ################### -# my $path = abs_path(dirname(__FILE__)); -# $path =~ s/\/bin//; -my $path = getcwd; -my $hmmpath = "$path/core_orthologs"; #path where the hmms are located -my $blastpath = "$path/blast_dir"; #path to the blast-dbs -my $outpath = '.'; -my $idsep = '__'; #character used to replace whitespaces in the sequence header with (flag -longhead) -my $hmm_dir = 'hmm_dir'; -my $fa_dir = 'fa_dir'; -############################## -# my $termios = new POSIX::Termios; $termios->getattr; -# my $ospeed = $termios->getospeed; -# my $t = Tgetent Term::Cap { TERM => undef, OSPEED => $ospeed }; -# my ($norm, $under, $bold) = map { $t->Tputs($_,1) } qw/me md us/; - -############################## Variables ############## -my $fileobj; -## The main variable storing most of the results; -## $fileobj->{$taxon}->{prot}->[$hitcounter] -## $fileobj->{$taxon}->{ids}->[$hitcounter] -## $fileobj->{$taxon}->{cds}->[$hitcounter] -## $fileobj->{$taxon}->{hmmscore}->[$hitcounter] -####################################################### -my $pid = $$; -my $help; -my $debug; -my $seq2store_file=''; -my $cds2store_file=''; -my $hmm; -my @hmms; -my $fa; -my $fafile; -my @seqs2store; -my @cds2store; -my $dbpath; -my $dboutpath; -my $ep2eg; -my $dbfile_base; -my $aln; -my $idfile; -my $taxon_check = 0; -my $hmmset; -my $show_coreortholog_sets; -my $hmmsearch_dir; -my $dbfile; # the file hmmsearch is run against -my $dbfile_short; -my $taxon_file; -my $refspec_string; -my @refspec = qw(); -my @primer_taxa; -my $refspec_name = ''; -my $taxon_global; -my $fa_dir_neu = ''; -my $gwrefprot; -my $seqtype; -my $align; -my $rep; -my $estflag; -my $proteinflag; -my $refseq; -my $strict; -my $relaxed; -my $refspec_final = ''; -my $central; -my $concat; -my $seqs2store_file; -my $append; -my $longhead; -my $check = 1; -my @log = qw(); -my $bhh; -my $hitlimit; -my 
$autoLimit; -my $scoreThreshold; -my $scoreCutoff = 10; -my $nonoverlappingCO; -my $algorithm = 'blastp'; -my $frame; -my $checkCoRef; -my $sortalign; -my $check_genewise = 1; -my $outputfmt = 'blastxml'; -my $fact; -my $runFACTparameter; -my $hmmcount; -my $reuse; -my $cleartmp; -my $ver; -my $silent; -my $cpu = 1; -my $force; -my $keepintron = 'k'; -my $blastapp = ''; -my $blastdbend = '.pin'; -######### ublast options ######### -my $runublast = 1; -my $ublast = 0; -my $accel = 0.8; -#####determine the hostname####### -# push @log, "VERSION:\t$version\n"; -my $hostname = `hostname`; -chomp $hostname; -push @log, "HOSTNAME\t$hostname\n"; -################################# -if (@ARGV==0) { - $help = 1; -} -## help message -my $helpmessage = " -YOU ARE RUNNING $version on $hostname - -This program is freely distributed under a GPL. -Copyright (c) GRL limited: portions of the code are from separate copyrights - -\nUSAGE: hamstr -sequence_file=<> -hmmset=<> -taxon=<> -refspec=<> [OPTIONS] - -OPTIONS: - -REQUIRED --sequence_file=<> - path and name of the file containing the sequences hmmer is run against. --hmmset=<> - specifies the name of the core-ortholog set. - The program will look for the files in the default directory 'core-orthologs' unless you specify - a different path via the option -hmmpath. --refspec=<> - sets the reference species. Note, it has to be a species that contributed sequences - to the hmms you are using. NO DEFAULT IS SET! For a list of possible reference - taxa you can have a look at the speclist.txt file in the default core-ortholog sets - that come with this distribution. Please use the abreviations in this list. If you choose - to use core-orthologs where not every taxon is represented in all core-orthologs, you - can provide a comma-separated list with the preferred refspec first. 
The lower-ranking - reference species will only be used if a certain gene is not present in the preferred - refspecies due to alternative paths in the transitive closure to define the core-orthologs. - CURRENTLY NO CHECK IS IMPLEMENTED! - NOTE: A BLAST-DB FOR THE REFERENCE SPECIES IS REQUIRED! --taxon - You need to specify a default taxon name from which your ESTs or protein sequences are derived. --est - set this flag if you are searching in ESTs. Note, if neither the -est nor the -protein flag is set, HaMStR will - guess the sequence type. If you select this flag, make sure to specify how to deal with introns retained in the - ESTs. Check option -intron! --protein - set this flag if you are searching in protein sequences. Note, if neither the -est nor the -protein flag is set, HaMStR will - guess the sequence type. - -USING NON-DEFAULT PATHS - --blastpath=<> - Lets you specify the absolute or relative path to the blast databases. DEFAULT: $blastpath --hmmpath=<> - Lets you specify the absolute or relative path to the core ortholog set. DEFAULT: $hmmpath --outpath=<> - You can determine the path to the HaMStR output. Default: current directory. - -ADDITIONAL OPTIONS - --append - set this flag if the output should be appended to the files *.out and *_cds.out. This becomes relevant when running - hamstrsearch with individual hmms and you want to combine the results. --central - set this flag to store the modified infile in the same directory as the infile rather than in the output dir. --checkCoorthologsRef - If the re-blast does not identify the original reference protein sequence as best hit, HaMStR will check whether the best blast - hit is likely a co-ortholog of the reference protein relative to the search taxon. NOTE: Setting this flag will substantially increase - the sensitivity of HaMStR but most likely affect also the specificity, especially when the search taxon is evolutionarily only very - distantly related to the reference taxon. 
--cleartmp - set this flag to remove existing tmp dir in the HaMStR output directory. --concat - set this flag if you want hamstr to concatenate sequences that align to non-overlapping parts of the reference protein. - If you choose this flag, no co-orthologs will be predicted. --cpu - You can specify the number of parallel jobs in the HaMStR search. HaMStR uses the Parallel::ForkManager module for this purpose. --eval_blast=<> - This option allows to set the e-value cut-off for the Blast search. Default: 10 --eval_hmmer=<> - This options allows to set the e-value cut-off for the HMM search.Default: 1 --filter= - Set this flag to F if the re-blast should be performed without low-complexity filtering. Default is T. --force - Setting this flag forces hamstr to overwrite existing output files (files ending with .out) without further asking. --hit_limit=<> - By default, HaMStR will re-blast all hmmsearch hits against the reference proteome. Reduce the number - of hits for reblast with this option. --autoLimit - Setting this flag will invoke a lagPhase analysis on the score distribution from the hmmer search. This will determine automatically - a hit_limit for each query. --scoreThreshold - Instead of setting an automatic hit limit, you can specify with this flag that only candidates with an hmm score no less - than x percent of the hmm score of the best hit are further evaluated. Default is x = 10. - You can change this cutoff with the option -scoreCutoff. Note, when setting this lag, it will be effective for - both the core ortholog compilation and the final ortholog search. --scoreCutoff=<> - In combination with -scoreThreshold you can define the percent range of the hmms core of the best hit up to which a - candidate of the hmmsearch will be subjected for further evaluation. Default: 10%. --hmm - Option to provide only a single hmm to be used for the search. 
- Note, this file has to end with .hmm --intron= - Specify how to deal with introns that may occur in transcript sequences. Default: keep - Introns will be retained in the transcript - but will be identified by lower case letters. --longhead - Set this flag in the case your sequence identifier contain whitespaces and you whish to keep - the entire sequence identifier throughout your analysis. HaMStR will then replace the whitespaces with - a '__'. If this flag is not set, HaMStR will truncate the sequence - Identifier at the first whitespace, however if and only if the sequence identifier then remain unique. - NOTE: too long sequence headers (~ > 30 chars) will cause trouble in the hmmsearch as the program will truncate - the output! --nonoverlapping_cos - If you set this flag, non-overlapping co-orthologs will be reported as well. NOTE: this flag is still experimental --rbh - set this flag if you want to use a reciprocal best hit criterion. Only the highest scoring - hit from the hmmer search will be used for re-blast. --relaxed - set this flag if the reciprocity criterion is fulfilled when the re-blast against - any of the primer taxa was successfull. Note that setting this flag will substantially decrease the - stringency of the ortholog assignment with the consequence of an increased number of false positives. --representative - From all sequences that fulfill the reciprocity criterion the one showing the highest similarity to the - core ortholog sequence in the reference species is identified and selected as representative. --reuse - Set this flag if you want to prevent HaMStR from overwriting previous results. --show_hmmsets - setting this flag will list all available core ortholog sets in the specified path. Can be combined with -hmmpath. --silent - Supresses (almost) all print statements to the screen. --debug - Get some additional meta information as print out to the screen. 
--sort_global_align - Setting this flag will tell hamstr to sort ortholog candidates according to their global alignment score to the reference - sequence rather than according to the score they have achieved in the hmmer search (local). NOTE: In the case of searching - EST data this flag is automatically set. --strict - Set this flag if the reciprocity criterion is only fulfilled when the re-blast against - all primer taxa was successfull --aligner - Choose between muscle or mafft-linsi for the alignment of multiple sequences. DEFAULT: muscle - \n\n"; - -GetOptions ( - "append" => \$append, - "autoLimit" => \$autoLimit, - "aligner=s" => \$alignmentprog_co, - "blastpath=s" => \$blastpath, - "checkCoorthologsRef" => \$checkCoRef, - "concat" => \$concat, - "cpu=s" => \$cpu, - "central" => \$central, - "debug" => \$debug, - "est" => \$estflag, - "eval_blast=s" => \$eval_blast, - "eval_hmmer=s" => \$eval, - "fasta_file=s" => \$fafile, - "filter=s" => \$filter, - "force" => \$force, - "h" => \$help, - "hit_limit=s" => \$hitlimit, - "hmm=s" => \$hmm, - "hmmset=s" => \$hmmset, - "hmmpath=s" => \$hmmpath, - "intron=s" => \$keepintron, - "longhead" => \$longhead, - "nonoverlapping_cos" => \$nonoverlappingCO, - "outpath=s" => \$outpath, - "protein"=> \$proteinflag, - "rbh" => \$bhh, - "refspec=s" => \$refspec_string, - "relaxed" => \$relaxed, - "representative" => \$rep, - "reuse" => \$reuse, - "sequence_file=s" => \$dbfile, - "scoreCutoff=s" => \$scoreCutoff, - "scoreThreshold" => \$scoreThreshold, - "show_hmmsets" => \$show_coreortholog_sets, - "silent" => \$silent, - "sort_global_align" => \$sortalign, - "strict" => \$strict, - "taxon_file=s" => \$taxon_file, - "taxon=s" => \$taxon_global, - "ublast" => \$ublast, - "v" => \$ver, - "accel=s" => \$accel, - "fact" => \$fact, - "cleartmp" => \$cleartmp -); - -if ($help) { - print $helpmessage; - exit; -} -elsif($ver){ - print "$version\n"; - exit; -} - -## 1) check if all information is available to run HaMStR -($check, @log) 
= &checkInput(); -if ($check == 0) { - print "\n\nThere was an error running $version\n\n"; - print join "\n", @log; - exit; -} -else { - open (OUT, ">$outpath/fdog.log") or die "could not open logfile\n"; - print OUT join "\n", @log; - close OUT; -} -my $tmpdir = "$outpath/tmp"; - -### read in of the core-ortholog sequences -my $co_seqs = parseSeqfile("$fafile"); - -## initialize the forking procedure -my $pm = new Parallel::ForkManager($cpu); - -## collect all the entries of the final output file -#my ($spid, $exit_code, $ident, $exit_signal, $core_dump, $data); -#$pm->run_on_finish(sub { -# ($spid, $exit_code, $ident, $exit_signal, $core_dump, $data) = @_; -# $core_dump = undef; -# if ($seqderef){ -# push @seqs2store, @$seqderef; -# if ($estflag) { -# my $estderef = $data->[1]; -# push @cds2store, @$estderef; -# } -# } -#}); - -## 2) loop through the hmms -## process each hmm file separately -$hmmcount = scalar(@hmms); - -for (my $i = 0; $i < @hmms; $i++) { - my $pid = $pm->start and next; - my $localid = $$; - $frame = undef; - $fileobj = undef; - my @seqs = qw(); - my @newseqs = qw();## var to contain the sequences to be added to the orthologous cluster - my @newcds = qw(); - my $hmm = $hmms[$i]; - printOUT("Processing $hmm\n"); - my $hmmout = $hmm; - $hmmout =~ s/\.hmm/\.out/; - ## 3) run the hmm search - if (!(-e "$hmmsearch_dir/$hmmout")) { - printOUT("\n\nnow running $prog using $hmm\n"); - my $hmmOutFile = "$hmmsearch_dir/$hmmout"; - my $hmmModel = "$hmm_dir/$hmm"; - my $hmmInfile = "$dboutpath/$dbfile"; - `$prog --noali --tblout \"$hmmOutFile\" -E $eval \"$hmmModel\" \"$hmmInfile\"` or die "Problem running hmmsearch as $prog --noali --tblout \"$hmmOutFile\" -E $eval \"$hmmModel\" \"$hmmInfile\". No output $hmmsearch_dir/$hmmout\n"; - } - else { - printOUT("an hmmresult $hmmout already exists. 
Using this one!\n"); - } - - ## 4) process the hmm search result - my $hitcount = 0; - ## 4a) loop through the individual results - ## now the modified version for hmmer3 comes - my $hitlimit_local = $hitlimit; - my ($query_name, $results, $hitlimit_local, $criticalValue) = parseHmmer4pm($hmmout, $hmmsearch_dir); - if (! $results) { - printOUT("no hit found for $query_name\n"); - $pm->finish; - next; - } - ## Automatic hit limit information - if (defined $autoLimit) { - printDebug("Automatic cutoff estimation via a lag Phase analysis was selected. Estimated lag point is $criticalValue. Limiting the number of hits for the evaluation from " . scalar(@$results) . " to $hitlimit_local"); - } - elsif (defined $scoreThreshold) { - printDebug("Automatic cutoff estimation via a minimal score was selected. Cutoff: $scoreCutoff percent of the best hmm score. Hits with an hmm score below $criticalValue are not considered. Limiting the number of hits for the evaluation from " . scalar(@$results) . " to $hitlimit_local"); - } - ## - printOUT("Results for $query_name\n"); - my ($check, $refspec_final) = &determineRefspecFinal($query_name, @refspec); - if ($check == 0) { - die "error in retrieving refspec data\n"; - } - if (!defined $hitlimit_local or $hitlimit_local > scalar(@$results)) { - $hitlimit_local = scalar(@$results); - } - for (my $k = 0; $k < $hitlimit_local; $k++) { - my $hitname = $results->[$k]->{id}; - my $hithmmscore = $results->[$k]->{hmmscore}; - printOUT("$hitname\n"); - my $keep = 0; - my $hitseq = ''; - $refseq = ''; - ## 4b) test for the reciprocity criterion fulfilled - ($keep, $hitseq, $frame) = &check4reciprocity($localid, $query_name, $hitname, $refspec_final, @refspec); - if ($keep == 1) { - ## blast search with the hmm hit identifies the core-ortholog sequence of the reference species - my $taxon = $taxon_global; - ## put the info about the hits into an object for later post-processing - ### HERE COMES THE NEW STUFF THAT DEALS WITH THE DIFFERENT 
POSSIBILITIES: STRICT, RELAXED OR WHATEVER... - $fileobj = &determineReferences ($localid, $fileobj, $taxon, $refspec_final, $hitname, $hithmmscore, $hitseq, $hitcount); - $hitcount++; - } - else { - printOUT("Reciprocity not fulfilled!\n\n"); - } - } - ## 5) do the rest only if at least one hit was obtained - if (defined $fileobj) { - ## 5a) if the hits are derived from ESTs, get the best ORF - if ($estflag) { - $fileobj = &predictORF($frame); - } - &processHits($localid, $fileobj); - if (!$rep and !$concat) { - ## identify co-orothologs only for protein sequences. This adds a key 'coorthologs' to the $fileobj->{$taxon} that - ## holds the index values for the sequences in the $fileobj->{$taxon}->{ids} and the corresponding {prot} array ref - ## that made it into the co-ortholog field - &identifyCoorthologsProt($localid, $taxon_global); - } - ## 6) prepare the output - my @taxa = keys(%$fileobj); - for (my $i = 0; $i< @taxa; $i++) { - push @newseqs, ">$query_name|$fileobj->{$taxa[$i]}->{refspec_final}|$taxa[$i]|$fileobj->{$taxa[$i]}->{refid}|1"; - push @newseqs, $fileobj->{$taxa[$i]}->{refprot}; - if ($estflag) { - push @newcds, ">$query_name|$fileobj->{$taxa[$i]}->{refspec_final}|$taxa[$i]|$fileobj->{$taxa[$i]}->{refid}|1"; - push @newcds, $fileobj->{$taxa[$i]}->{refcds}; - } - if (!$rep and !$concat){ - ## print the remaining sequences only when the -representative option has not been chosen. 
- my $coorthologsobj = $fileobj->{$taxa[$i]}->{coorthologs}; - my $idobj = $fileobj->{$taxa[$i]}->{ids}; - my $protobj = $fileobj->{$taxa[$i]}->{prot}; - my $cdsobj = $fileobj->{$taxa[$i]}->{cds}; - my $refspecobj = $fileobj->{$taxa[$i]}->{refspec}; - for (my $j = 0; $j < @$coorthologsobj; $j++) { - my $index = $coorthologsobj->[$j]; - push @newseqs, ">$query_name|$refspecobj->[$index]|$taxa[$i]|$idobj->[$index]|0"; - push @newseqs, $protobj->[$index]; - if ($estflag) { - push @newcds, ">$query_name|$refspecobj->[$index]|$taxa[$i]|$idobj->[$index]|0"; - push @newcds, $cdsobj->[$index]; - } - } - } - my $refs = $co_seqs->{$query_name}; - for (keys %$refs) { - my $line = ">$query_name|$_|" . $refs->{$_}->{seqid} . "\n" . $refs->{$_}->{seq}; - push @seqs, $line; - } - chomp @seqs; - printOUT("\n"); - @seqs = (@seqs, @newseqs); - open (OUT, ">$fa_dir_neu/$query_name.fa"); - print OUT join "\n", @seqs; - print OUT "\n"; - close OUT; - if ($estflag) { - open (OUT, ">$fa_dir_neu/$query_name.cds.fa"); - print OUT join "\n", @newcds; - close OUT; - } - open (OUT, ">>$seqs2store_file") or die "failed to open output file\n"; - if ($estflag){ - open (OUT2, ">>$cds2store_file") or die "failed to open output file for cds\n"; - } - for (my $i = 0; $i < @newseqs; $i+= 2) { - my $line = $newseqs[$i] . "|" . $newseqs[$i+1]; - $line =~ s/>//; - - print OUT $line; - print OUT "\n"; - # push @seqs2store, $line; - if ($estflag) { - my $cdsline = $newcds[$i] . "|" . $newcds[$i+1]; - $cdsline =~ s/>//; - print OUT2 $cdsline; - print OUT2 "\n"; - push @cds2store, $cdsline; - } - } - close OUT; - close OUT2; - } - } - if (@seqs2store > 0) { - my $seqref = \@seqs2store; - my $estref = \@cds2store; - $pm->finish; - } - else { - $pm->finish; - } -} - -$pm->wait_all_children; - -### The following bit of code has been out-commented as shared memory between forked child -### processes does not exist. 
The handing back of return values from the child to the parent -### does work, however leads to memory problems. -### all HaMStR searches have been completed and all children have finished. Do the output - -#if (@seqs2store > 0) { -# if ($append) { -# open (OUT, ">>$seqs2store_file") or die "failed to open output file\n"; -# } -# else { -# open (OUT, ">$seqs2store_file") or die "failed to open output file\n"; -# } -# print OUT join "\n", @seqs2store; -# print OUT "\n"; -# close OUT; -# if ($estflag) { -# if ($append) { -# open (OUT, ">>$cds2store_file") or die "failed to open output file\n"; -# } -# else { -# open (OUT, ">$cds2store_file") or die "failed to open output file\n"; -# } -# print OUT join "\n", @cds2store; -# print OUT "\n"; -# close OUT; -# } -#} -########################################################################################### - -my $orthologs = 0; -if (-e $seqs2store_file) { - $orthologs = `less $seqs2store_file |wc -l`; - if ($fact){ - ## starting funFACT.pl - system("perl runFact.pl $runFACTparameter $outpath $blastpath $taxon_global $refspec_string"); - } -} -else { - printOUT("no hits found\n\n"); -} -### WRAP UP ##### -my $fa_dir_neu_tmp = $fa_dir_neu; $fa_dir_neu_tmp =~ s/\|/\\\|/g; -my $ortholog_groups = `ls $fa_dir_neu_tmp |$grepprog -v 'cds.fa' |wc -l`; -my $hmmsearch_dir_tmp = $hmmsearch_dir; $hmmsearch_dir_tmp =~ s/\|/\\\|/g; -my $hmmsearched = `ls $hmmsearch_dir_tmp |wc -l`; -chomp ($ortholog_groups, $hmmsearched, $orthologs); - -if (!defined $silent) { - print "\n\n -####HaMStR completed!######### -Results of HaMStR search in $taxon_global -Number of core_orthologs searched: $hmmcount -Number of core_orthologs with hmmsearch output: $hmmsearched -Number of ortholog_groups extended: $ortholog_groups -Number of orthologous sequences: $orthologs -##############################\n\n"; -} else { - # print "$taxon_global done\n"; -} -exit; - - -##################### start sub ############### - -####### checkInput performs a number of 
checks whether sufficient information -### and all data are available to run HaMStR -sub checkInput { - ######### check a number of flags that only serve for providing the user with some information - if (defined $show_coreortholog_sets) { - ## Do nothing but just list all available core ortholog sets in $hmmpath - my @coresets = (`ls $hmmpath`); - chomp @coresets; - if (scalar(@coresets > 0)){ - print "\nTHE FOLLOWING CORE ORTHOLOG SETS ARE AVAILABLE IN $hmmpath:\n\n"; - for (my $i = 0; $i < @coresets; $i++){ - my @available = qw(); - my @unavailable = qw(); - print "\n$coresets[$i]\n\n"; - my @refspec = `head -n 20 $hmmpath/$coresets[$i]/$coresets[$i].fa |$grepprog '>' |cut -d '|' -f 2 |sort |uniq`; - chomp @refspec; - for (my $j = 0; $j < @refspec; $j++){ - if (-e "$blastpath/$refspec[$j]"){ - push @available, "\t$refspec[$j]"; - } - else { - push @unavailable, "\t$refspec[$j]"; - } - } - print "\tAvailable reference taxa:\n"; - print join "\n", @available; - if (@unavailable > 0){ - print "\n\n\tUnvailable reference taxa (no Blast db at $blastpath)\n"; - print join "\n", @unavailable; - } - } - } - else { - print "\nNO CORE ORTHOLOG SETS ARE AVAILABLE! CHECK $hmmpath!\n\n"; - } - print "\n\n"; - exit; - } - ######### push all user defined variables into the log file ################ - push @log, "\nUSER DEFINED PARAMTERS (inc. 
default values)\n"; - my %parameters = (append => $append, - blastpath => $blastpath, - checkCoorthologsRef => $checkCoRef, - cleartmp => $cleartmp, - concat => $concat, - est => $estflag, - eval_blast => $eval_blast, - eval_hmmer => $eval, - filter => $filter, - hit_limit => $hitlimit, - hmm => $hmm, - hmmset => $hmmset, - hmmpath => $hmmpath, - intron => $keepintron, - longhead => $longhead, - nonoverlapping_cos => $nonoverlappingCO, - outpath => $outpath, - protein => $proteinflag, - rbh => $bhh, - refspec => $refspec_string, - relaxed => $relaxed, - representative => $rep, - reuse => $reuse, - sequence_file => $dbfile, - show_hmmsets => $show_coreortholog_sets, - sort_global_align => $sortalign, - strict => $strict, - taxon => $taxon_global, - ublast => $ublast); - - foreach ( sort keys %parameters) { - if (defined $parameters{$_}) { - push @log, "\t -$_:\t$parameters{$_}"; - } - else { - push @log, "\t -$_:\tnot set"; - } - } - - ############################################################################# - my $check = 1; - - if (!defined $dbfile) { - push @log, "You need to specify a valid infile with the option -sequence_file\n\n"; - $check = 0; - return($check, @log); - } - ### for FACT use the unmodified value of $dbfile - $runFACTparameter = $dbfile; - ## extract the path from the dbpath if available and prune of trailing '/' - if ($dbfile =~ /(.*\/)/) { - $dbpath = $1; - $dbpath =~ s/\/$//; - } - else { - $dbpath = '.'; - - } - $dbfile =~ s/.*\///; - # $dbfile_short = $dbfile; - # $dbfile_short =~ s/\..*//; - my @dbfileTMP = split(/\./, $dbfile); pop @dbfileTMP; - $dbfile_short = join(".", @dbfileTMP); - if ($central) { - $dboutpath = $dbpath; - # print "setting dboutpath to $dboutpath"; - } - - # print "HERERERERERERERERER $dbfile #################\n"; - # print "THENNNNNNNNNNNNNNNN $dbfile_short #################\n"; - ## - ## 0) Check for presence of the file with the sequences that should be hamstered - if (-e "$dbpath/$dbfile") { - push @log, 
"\t$dbfile ready"; - } - else { - #the provided infile does not exist: - push @log, "FATAL: The specified infile $dbpath/$dbfile does not exist. PLEASE PROVIDE A VALID INFILE!\n"; - $check = 0; - return ($check, @log); - } - ## 1) check for filetype - printOUT("checking for sequence type:\n"); - if (!defined $estflag and !defined $proteinflag) { - push @log, "\nCHECKING SEQUENCE TYPE\n"; - push @log, "\tNo file sequence type was determined. HaMStR will guess whether EST or protein sequences are analyzed"; - my $seq = `head -n 2 $dboutpath/$dbfile |tail -n 1`; - my $orilength = length($seq); - $seq =~ s/[AGCTN]//ig; - if (length($seq) / $orilength >0.1) { - $proteinflag = 1; - printOUT("Guessing sequence type: Protein\n"); - push @log, "\tMore than 10% of the first sequence in the file are non-AGCTN. Guessing sequence type: Protein"; - } - else { - $estflag = 1; - printOUT("Guessing sequence type: DNA\n"); - push @log, "\tLess than 10% of the first sequence in the file are non-AGCTN. Guessing sequence type: DNA\n"; - } - $check = 1; - } - if ($estflag and !$check_genewise) { - push @log, "\n\nHaMStR has been configured with the flag --protein_only and will not accept DNA sequences as input. I am stopping tests here! If you really want to analyse DNA sequence data please reconfigure.\n"; - $check = 0; - return ($check, @log); - } - ## $dbfile_base hat den originalen file namen, egal ob est oder protein - $dbfile_base = $dbfile; - - if ($ublast){ - if ($runublast){ - $blast_prog = 'usearch'; - $algorithm = 'ublast'; - $outputfmt = 'blasttable'; - $blastdbend = '.udb'; - } - else { - push @log, "\n\nHaMStR has been configured with the --noublast option. 
Either re-start without the -ublast flag or reconfigure\n"; - $check = 0; - return($check, @log); - } - } - if ($estflag) { - $dbfile = "$dbfile.tc"; - $algorithm = 'blastx'; - if ($blast_prog eq 'blastp'){ - $blast_prog = 'blastx'; - } - $sortalign = 1; - push @log, "HaMStR will run on the ESTs in $dbfile_base"; - push @log, "\nTRANSLATING ESTs\n"; - if (!(-e "$dboutpath/$dbfile")) { - printOUT("translating $dbfile_base, this may take a while\n"); - `$path/bin/translate.pl -infile=$dboutpath/$dbfile_base -outfile=$dbfile -outpath=$dboutpath`; - open (LOG, "$outpath/hamstrsearch.log"); - my @info = ; - @log = (@log, @info); - close LOG; - } - else { - push @log, "Translated file already exists, using this one"; - } - if (! -e "$dboutpath/$dbfile") { - push @log, "FATAL: The translation of $dbfile_base failed. Check the script translate.pl"; - print "failed\n"; - $check = 0; - } - else { - ## file type is protein - printOUT("succeeded\n"); - } - } - ## 2) Check for presence of the blast program - push @log, "\nCHECKING FOR PROGRAMS\n"; - printOUT("checking for the blast program:\t"); - if (`which $blast_prog` =~ / no /) { - push @log, "FATAL: could not execute $blast_prog. Please check if this program is installed and executable"; - print "failed\n"; - $check = 0; - } - else { - push @log, "\tcheck for $blast_prog succeeded"; - unless ($silent) { - print "succeeded\n"; - } - } - ## 3) Check for presence of hmmsearch - printOUT("checking for hmmsearch:\t"); - my $hmmcheck = `$prog -h |$grepprog -c 'HMMER 3'`; - if (! `$prog -h`) { - push @log, "FATAL: could not execute $prog. Please check if this program is installed and executable"; - print "failed: $prog is not installed or not executable\n"; - $check = 0; - } - elsif ($hmmcheck != 1) { - push @log, "FATAL: It seems that $prog is not from the HMMER 3 package. 
Please check!"; - print "failed: $prog is not from the HMMER 3 package\n"; - $check = 0; - } - else { - push @log, "\tcheck for $prog succeeded"; - printOUT("succeeded\n"); - } - ## 3b) Check for genewise - if ($check_genewise) { - printOUT("checking for genewise:\t"); - if (! `genewise -help`) { - push @log, "FATAL: Could not execute genewise. Please check if this program is installed and executable"; - print "failed: genewise is not executable\n"; - $check = 0; - } - else { - my $gwcheck = `echo \$WISECONFIGDIR`; - if (length($gwcheck) < 1) { - push @log, "FATAL: The environmental variable WISECONFIGDIR has not been set. I am expecting troubles when invoking genewise. - Please consult the installation manual for genewise and set this variable"; - print "failed: the environmental variable WISECONFIGDIR has not been set.\n"; - $check = 0; - } - else { - printOUT("\tsucceeded\n"); - } - } - } - else { - push @log, "GENEWISE-CHECK skipped: The hamstr-script has been configured with the option --protein_only. To override this setting set reconfigure the script or set the variable $check_genewise to 1"; - } - ## 4) Check for presence of the directory structure - - push @log, "\nCHECKING FOR HMMs\n"; - printOUT("checking for presence of the hmm files:\t"); - if ( ! defined $hmmset or ! -e "$hmmpath/$hmmset") { - push @log, "FATAL: You need to specify a valid core ortholog set. Make also sure that you provide the path to this set if it is not in the default location $hmmpath. You can check available core ortholog sets using the option -show_hmmsets."; - print "failed\n"; - $check = 0; - } - else { - $hmmpath = "$hmmpath/$hmmset"; - $fafile = "$hmmpath/$hmmset" . '.fa'; - $hmm_dir = "$hmmpath/$hmm_dir"; - $hmmsearch_dir = $outpath .'/hmm_search_' . $dbfile_short . '_' . 
$hmmset; - - ## 4b) check for the presence of the hmm-files and the fasta-file - if (!(-e "$hmm_dir")) { - push @log, "FATAL: Could not find $hmm_dir"; - print "failed\n"; - $check = 0; - } else { - if (defined $hmm) { - @hmms = split ',', $hmm; - chomp @hmms; - ### check for the presence of all hmms - for (my $k = 0; $k < @hmms; $k++) { - if (! -e "$hmm_dir/$hmms[$k]") { - push @log, "FATAL: $hmms[$k] has been defined but could not be found in $hmm_dir/$hmms[$k]"; - $check = 0; - last; - } else { - push @log, "\t$hmms[$k] has been found"; - } - } - } else { - push @log, "\trunning fDOG with all hmms in $hmm_dir"; - my $hmm_dir_tmp = $hmm_dir; $hmm_dir_tmp =~ s/\|/\\\|/g; - @hmms = `ls $hmm_dir_tmp`; - } - chomp @hmms; - printOUT("\tsucceeded\n"); - } - } - ## 6) Test for presence of the fasta file containing the sequences of the core-ortholog cluster - printOUT("checking for presence of the core-ortholog file:\t"); - if (defined $fafile) { - if (! -e "$fafile") { - push @log, "Fatal: Could not find the file $fafile"; - print "failed\n"; - $check = 0; - } - else { - push @log, "\tcheck for $fafile succeeded"; - printOUT("\tsucceeded\n"); - } - } - else { - push @log, "FATAL: Please provide path and name of fasta file containing the core-ortholog sequences"; - $check = 0; - print "failed\n"; - } - ## 7) Checks for the taxon_file - push @log, "\nCHECKING TAXON NAME\n"; - printOUT("testing whether the taxon has been determined:\t"); - if (defined $taxon_global) { - push @log, "\tusing default taxon $taxon_global for all sequences"; - printOUT("succeeded\n"); - $taxon_check = 2; - } - else { - push @log, "FATAL: No taxon_file found. Please provide a global taxon name using the option -taxon"; - print "failed\n"; - $check = 0; - } - ## 8) Check for reference taxon - push @log, "\nCHECKING FOR REFERENCE TAXON\n"; - printOUT("checking for reference species and blast-dbs:\t"); - if (!(defined $refspec_string) and (! defined $strict and ! 
defined $relaxed)) { - push @log, "FATAL: Please provide a reference species for the reblast!"; - print "failed\n"; - $check = 0; - } - elsif (defined $strict or defined $relaxed) { - if (! defined $refspec_string) { - ## The user has not provided a string of reference taxa. Chose all from the fasta file containing - ## the core orthologs. - @refspec = `$grepprog '>' $fafile |cut -d '|' -f 2 |sort |uniq`; - chomp @refspec; - $refspec_string = join ',', @refspec; - } - else { - @refspec = split (/,/, $refspec_string); - } - if ($strict) { - push @log, "\tStrict flag has been set. Reference species for the reblast: All of $refspec_string"; - } - else { - push @log, "\tRelaxed flag has been set. Reference species for the reblast: Any of $refspec_string"; - } - if (@refspec == 0) { - print "failed\n"; - $check = 0; - } - else { - printOUT("succeeded\n"); - } - } - else { - push @log, "\t Reference species for the re-blast: $refspec_string"; - @refspec = split(/,/, $refspec_string); - $refspec_name = $refspec[0]; - printOUT("succeeded\n"); - } - ## 9) Check for presence of the required blast dbs - printOUT("checking for blast-dbs:\t"); - push @log, "\nCHECKING FOR BLAST DATABASES\n"; - for (my $i = 0; $i < @refspec; $i++) { - my $blastpathtmp = "$blastpath/$refspec[$i]/$refspec[$i]"; - if (-e $blastpathtmp . $blastdbend) { - push @log, "\tcheck for $blastpathtmp succeeded"; - printOUT("succeeded\n"); - } - elsif (-e $blastpathtmp . '_prot' . $blastdbend){ - ## the check for the file naming '_prot' is only to maintain backward compatibility - $blastapp = '_prot'; - $blastpathtmp = $blastpathtmp . $blastapp; - push @log, "\tcheck for $blastpathtmp succeeded"; - printOUT("succeeded\n"); - } - else { - push @log, "FATAL: please edit the blastpath. 
Could not find $blastpathtmp or blast database blastpathtmp.pin does not exist."; - print "$blastpathtmp failed\n"; - $check = 0; - } - } - ## 9.1) Check for presence of the required FASTA file of reference species - printOUT("checking for reference fasta files:\t"); - push @log, "\nCHECKING FOR REFERENCE FASTA FILES\n"; - for (my $i = 0; $i < @refspec; $i++) { - my $referencedb = "$blastpath/$refspec[$i]/$refspec[$i]".".fa"; - my $referencedb_prot = "$blastpath/$refspec[$i]/$refspec[$i]"."_prot.fa"; # backward compatibility - my $ref_dir = "$blastpath/$refspec[$i]"; - my $link = $referencedb; - unless (-e $referencedb) { - $link = `$readlinkprog $referencedb`; - unless ($link =~ /^\./ || $link =~ /^\//) { - my $cwd = cwd(); - die "Linked source for $referencedb not found in $cwd!"; - } - } - # my $ref_location = $referencedb; # not used anywhere else - chomp($link); - if (-e $referencedb || -e $link) { - push @log, "\tinfile ready"; - } elsif (-e "$referencedb_prot"){ - push @log, "\tinfile ready"; - } else { - #the provided reference fasta file does not exist or link to file does not exist: - push @log, "FATAL: FASTA file for the specified reference $refspec[$i] does not exist. PLEASE PROVIDE A VALID REFERENCE SPECIES!\n"; - $check = 0; - return ($check, @log); - } - } - - ## 10) Set the file where the matched seqs are found - my $strictstring = ''; - if (defined $strict) { - $strictstring = '.strict'; - } - $seqs2store_file = $outpath . '/hamstrsearch_' . $dbfile_short . '_' . $hmmset . $strictstring . '.out'; - $cds2store_file = $outpath . '/hamstrsearch_' . $dbfile_short . '_' . $hmmset . '_cds' . $strictstring . '.out'; - - if (! $append){ - if (-e "$seqs2store_file") { - my $answer = 'Y'; - my $breaker = 0; - if (!$force){ - print "A HaMStR outfile $seqs2store_file already exists and option -force has not been chosen! 
Shall I overwrite this file [Y|N]: "; - $answer = ; - chomp $answer; - while ($answer !~ /[YN]/i and ($breaker < 4)) { - $breaker ++; - print "Please answer with 'Y' or 'N':\t"; - $answer = ; - chomp $answer; - if (($breaker > 3) and ($answer !~ /[YN]/i)){ - print "No proper answer is given: exiting.\nPlease re-start HaMStR with the -append option, or alternatively remove the file manually, or force the replacement of exsiting files with option -force.\n"; - exit; - } - } - } - if ($answer =~ /Y/i) { - open (OUT, ">$seqs2store_file") or die "failed to open $seqs2store_file\n"; - print OUT ''; - close OUT; - if ($estflag){ - open (OUT, ">$cds2store_file") or die "failed to open $cds2store_file\n"; - print OUT ''; - close OUT; - } - } - else { - print "You chose to not overwrite the existing output files. Please re-start HaMStR with the -append option, or alternatively remove the file manually.\n"; - exit; - } - } - } - ## 11) apply the evalue-cut-off to the hmmsearch program - push @log, "\nPROGRAM OPTIONS\n"; - push @log, "\thmmsearch will run with an e-value limit of $eval"; - - ## 11b) hit limit for the re-blast - if ($hitlimit) { - push @log, "\tre-blast hit_limit: $hitlimit"; - } - else { - push @log, "\tre-blast hit_limit: none applied"; - } - ## 11c) The blast evalue limit - push @log, "\tBlast will run with an evalue limit of $eval_blast\n"; - - ## 12) check for filter setting for BLAST - printOUT("checking for low complexity filter setting:\t"); - $filter =~ tr/ft/FT/; - if ($filter ne 'T' and $filter ne 'F') { - push @log, "FATAL: Filter is set to $filter. Please set the low complexity filter either to F or T."; - print "low complexity filter check failed\n"; - $check = 0; - } - else { - push @log, "\tcheck for low complexity filter setting succeeded. 
Chosen value is $filter"; - if ($blast_prog ne 'blastall'){ - $filter = 'yes' if $filter eq 'T'; - $filter = 'no' if $filter eq 'F'; - } - printOUT("succeeded\n"); - } - - ## 13) setting up the directories where the output files will be put into. - $fa_dir_neu = $outpath . '/fa_dir_' . $dbfile_short . '_' . $hmmset . '_' . $refspec[0]; - $tmpdir = $outpath . '/tmp'; # . $tmpdir; - if (!$strict) { - $fa_dir_neu = $outpath . '/fa_dir_' . $dbfile_short . '_' . $hmmset . '_' . $refspec[0]; - } - if ($strict) { - $fa_dir_neu = $outpath . '/fa_dir_' . $dbfile_short . '_' . $hmmset; - $fa_dir_neu .= '_strict'; - } - - if ($relaxed) { - $fa_dir_neu .= '_relaxed'; - } - if ($check == 1) { - if (!(-e "$hmmsearch_dir")) { - `mkdir "$hmmsearch_dir"`; - } - elsif (-e "$hmmsearch_dir" and ! $reuse) { - `rm -rf "$hmmsearch_dir"`; - `mkdir "$hmmsearch_dir"`; - } - if (!(-e "$fa_dir_neu")) { - `mkdir "$fa_dir_neu"`; - } - elsif (-e "$fa_dir_neu" and ! $reuse) { - `rm -rf "$fa_dir_neu"`; - `mkdir "$fa_dir_neu"`; - } - mkdir "$tmpdir" unless -d "$tmpdir"; - if (-d "$tmpdir" and $cleartmp) { - `rm -rf "$tmpdir"`; - mkdir "$tmpdir" unless -d "$tmpdir"; - } - } - ## 14) determin whether or not the -representative flag has been set - if (defined $rep) { - push @log, "\tfDOG will run with the -representative option"; - } - else { - push @log, "\tfDOG was called without the -representative option. More than one ortholog may be identified per core-ortholog group!"; - } - - ## check further options - if (defined $nonoverlappingCO){ - push @log, "\tThe flag -nonoverlapping_cos has been set. HaMStR will output co-orthologs even when they align to non-overlapping parts of the reference sequence"; - } - if (defined $checkCoRef){ - push @log, "\tThe flag -CheckCoorthologsRef has been set."; - } - if (defined $bhh){ - push @log, "\tThe flag -rbh has been set. HaMStR will run with the reciprocal best hit option."; - } - if ($sortalign){ - push @log, "\tThe flag -sort_global_align has been set. 
HaMStR will sort hits according to the global alignment score against the reference sequence. (Default for EST data)." - } - - ## check how hamstr should deal with possible introns in transcripts: - if ($estflag) { - my $breaker = 0; - while ($keepintron !~ /^[kmr]/i and ($breaker < 4)){ - $breaker ++; - print "option intron was set to $keepintron: Please answer either with 'k(eep)', 'm(ask)', or 'r(emove)':\t"; - $keepintron = ; - chomp $keepintron; - if (($breaker > 3) and ($keepintron !~ /^[kmr]/i)){ - print "No proper answer is given: exiting.\nPlease re-start HaMStR with the option -intron=[kmr].\nOptions are 'k(eep)', 'm(ask)', or 'r(emove)'. Default is 'k(eep)' introns.\n"; - exit; - } - } - if ($keepintron =~ /^k/i) { - push @log, "\tKeep introns (Default) has been chosen. HaMStR will keep any introns in lower case in the reported CDS. Thus, CDS cannot be directly translated into the aa sequence."; - } - elsif ($keepintron =~ /^m/i) { - push @log, "\tMask introns has been chosen. HaMStR will keep any introns but masks them as 'N' in the reported CDS. Thus, CDS cannot be directly translated into the aa sequence." - } - elsif ($keepintron =~ /^r/i) { - push @log, "\tRemove introns has been chosen. HaMStR will remove any position that genewise could not align to the reference protein rendering the CDS consistent with the amino acid sequence"; - } - - } - - return ($check, @log); -} -################# -## check4reciprocity is the second major part of the program. It checks -## whether the protein sequence that has been identified by the hmmsearch -## identifies in turn the protein from the reference taxon that was used to -## build the hmm. 
-sub check4reciprocity { - my $frame; - my ($localid, $query_name, $hitname, $refspec_final, @refspec) = @_; - my $searchdb; - my $strict_suc = -1; # keeps track of success for all taxa - my $relaxed_suc = 0; # keeps track of success for at least one taxon - ## get the sequence that was identified as hit in the pHMM search from the db_file - my $hitseq = `$grepprog -m 1 -A 1 ">$hitname\$" $dboutpath/$dbfile_base | tail -n 1`; - if (!defined $hitseq) { - print "could not retrieve a sequence for $hitname. Skipping...\n"; - return(0, '', '', ''); - } - ## continue with the blast - chomp $hitseq; - ## now run the blast - open (OUT, ">$tmpdir/$$.fa") or die "could not open out for writing\n"; - print OUT ">$hitname\n$hitseq"; - close OUT; - ## now comes the new part that does one to many blast searches. We need to iterate through all - ## entries in the file $refspec_final and perform the Blast against each reftaxon. Note, unless - ## $strict or $relaxed flags are set, there will be only a single reftaxon. If $relaxed is chosen - ## then we can stop the blast searches as soon as the reciprocity is fulfilled. 
- for (my $k = 0; $k < @$refspec_final; $k++) { - my $orthocount = $refspec_final->[$k]->{orthocount}; - ## 1) Perform the blast search with the k-th reftaxon - printOUT("Reftaxon: $refspec_final->[$k]->{refspec}\n"); - $tmpdir =~ s/\|/\\\|/g; - if ($blast_prog =~ /blast[px]/) { - !`$blast_prog -db $refspec_final->[$k]->{searchdb} -seg '$filter' -max_target_seqs 10 -evalue $eval_blast -outfmt 5 -query $tmpdir/$$.fa -out $tmpdir/$$.blast` or die "Problem running $blast_prog\n"; - ### postprocess the outfile - } - elsif ($blast_prog =~ /blastall/) { - !`blastall -p $algorithm -d $refspec_final->[$k]->{searchdb} -F $filter -e $eval_blast -m7 -i $tmpdir/$$.fa -o $tmpdir/$$.blast` or die "Problem running $blast_prog\n" - } - else { - if ($estflag){ - `$blast_prog -ublast $tmpdir/$$.fa -db $refspec_final->[$k]->{searchdb}.udb -strand both -accel $accel -evalue $eval_blast -blast6out $tmpdir/$$.blast` or die "Problem running $blast_prog\n;" - } - else { - `$blast_prog -ublast $tmpdir/$$.fa -db $refspec_final->[$k]->{searchdb}.udb -accel $accel -evalue $eval_blast -blast6out $tmpdir/$$.blast` or die "Problem running $blast_prog\n;" - } - ## sort the output as ublast does not do it (at least not for ESTs) - `sort -n -r -k 12 $tmpdir/$$.blast >$tmpdir/blastsort.tmp`; - `mv $tmpdir/blastsort.tmp $tmpdir/$$.blast`; - #################### - } - ## 2) now parse the best blast hit - my $hits = &getBestBlasthit("$tmpdir/$$.blast"); - if (defined $hits and @$hits > 0) { - ## at least one blast hit - $frame = $hits->[0]->{frame}; - my $idsref = $refspec_final->[$k]->{refid}; - my @original_ids = @$idsref; - my $suc = 0; # keeps track of success for a single taxon - if ($checkCoRef == 0) { - ## the user does not want to check further in case that id of best blast hit and of reference species differ - printOUT("core_orthologs: @original_ids\n"); - ## now loop through the best hits with the same score and check whether - ## among these I find the same seq as in $original - my $i = 0; - 
while ($suc == 0 and $i <@$hits) { - printOUT("blast-hit: $hits->[$i]->{name}\n"); - ## now loop through all the refspec-sequences in the hmm file; this is the case when co-orthologs have been determine in the core-ortholog - my $j = 0; - while ($suc == 0 and $j < @original_ids) { - if ($original_ids[$j] eq $hits->[$i]->{name}) { - printOUT("hitting $original_ids[$j]\n"); - $refspec_final->[$k]->{hit} = $j; - $suc = 1; - $relaxed_suc = 1; - } - else { - printOUT("not hitting $original_ids[$j]\n"); - $j ++; - } - if ($suc == 1) { - $relaxed_suc = 1; - if ($strict_suc == -1) { - $strict_suc = 1; - } - } - } - $i++; - } - if ($suc == 0) { - # none of the blast hits matched against the the reftaxon seq - $strict_suc = 0; - } - } - - else { - ## The user has chosen to search more sensitive, asking whether the best blast hit might be a co-ortholog to the reference sequence - my $qhdistance; - my $rhdistance; - printOUT("core_orthologs: $original_ids[0]\n"); - ## we will check only the best blast hit and impose a distance criterion - ## in case of an EST, we will have to predict the reading frame and translate it... - my $bestid = $hits->[0]->{name}; - my $refid = $original_ids[0]; - ## get the sequences from the blast db. Currently, I'm using a simple grep - my $bestseq = `$grepprog -m 1 -A 1 ">$bestid" $refspec_final->[$k]->{searchdb}.fa |tail -n 1` or die "Could not retrieve original sequence for besthit\n"; - my $refseq = `$grepprog -m 1 -A 1 ">$refid" $refspec_final->[$k]->{searchdb}.fa |tail -n 1` or die "Could not retrieve original sequence for refseq\n"; - chomp ($bestseq, $refseq); - printOUT("blast-hit: $bestid"); - my $queryseq = $hitseq; - if ($bestid eq $refid) { - printOUT("\thitting\n"); - $refspec_final->[$k]->{hit} = 0; - $suc = 1; - $relaxed_suc = 1; - } - else { - printOUT("\nBest hit $bestid differs from reference sequence $refid! 
Doing further checks\n"); - if ($estflag){ - printOUT("Frame is $hits->[0]->{frame} or $frame\n"); - my ($hitseqtr) = &findORF($hitseq, $bestseq, $frame); - ($suc, $qhdistance, $rhdistance) = &checkCoorthologRef($localid, $hitseqtr, $bestseq, $refseq); - } - else { - ($suc, $qhdistance, $rhdistance) = &checkCoorthologRef($localid, $hitseq, $bestseq, $refseq); - } - ## print distances (debug mode) - if ($debug){ - my $distDebugFile = $outpath . "/" . $taxon_global . ".debug.dist"; #$path . "/output/" . $taxon_global . ".debug.dist"; - unless (-e $distDebugFile){ - open (my $DISTDEBUG, ">>$distDebugFile") or die "Error, could not create file: ". "$distDebugFile"; - print $DISTDEBUG "hmmset\trefid\tbestid\tqueryid\tqhdist\trhdist\n"; - close $DISTDEBUG; - } - if (-e $distDebugFile){ - open (my $DISTDEBUG, ">>$distDebugFile") or die "Error, could not create file: ". "$distDebugFile"; - print $DISTDEBUG "$query_name\t$refid\t$bestid\t$hitname\t$qhdistance\t$rhdistance\n"; - close $DISTDEBUG; - } - } - - if ($suc == 1) { - printOUT("\t Distance query - blast hit: $qhdistance, Distance blast hit - reference: $rhdistance\tAccepting\n"); - $refspec_final->[$k]->{hit} = 0; - } - else { - printOUT("\t Distance query - blast hit: $qhdistance; Distance blast hit - reference: $rhdistance Rejecting\n"); - } - } - if ($suc == 1){ - $relaxed_suc = 1; - if ($strict_suc == -1) { - $strict_suc = 1; - } - } - else { - $strict_suc = 0; - } - } - } - else { - printOUT("no hit obtained\n"); - $strict_suc = 0; - } - ## when the user has chosen the strict flag, there is no reason to continue when $suc - ## has remained 0 (reciprocity criterion not fulfilled). Thus, return to main. 
- if ($strict and $strict_suc == 0) { - return (0, $hitseq); - } - } - if ($relaxed_suc == 1) { - if ($estflag and $frame eq '-') { - ## reverse sequence - $hitseq = &revComp($hitseq); - } - return (1, $hitseq, $frame); - } - else { - return (0, $hitseq); - } -} - -############# -sub getBestBlasthit { - my $hits; - my $count = 0; - my ($file) = @_; - $file =~ s/\\//g; - my $searchio = Bio::SearchIO->new( - -file => "$file", - -format => $outputfmt) - or die "parse failed"; - while(my $result = $searchio->next_result){ - my $sig; - my $sig_old; - while( my $hit = $result->next_hit) { - my $frameval = $hit->strand('query'); - if ($frameval >0){ - $frame = '+'; - } - elsif ($frameval <0 ) { - $frame = '-'; - } - elsif (!defined $frameval and $estflag) { - die "error in obtaining frame in sub getBestBlasthit\n"; - } - else { - $frame = 'na'; - } - - ## now I enter all top hits having the same evalue into the result - $sig = $hit->score; - if (!defined $sig_old) { - $sig_old = $sig; - } - if ($sig == $sig_old) { - if ($estflag){ - printOUT("frame is $frame\n"); - $hits->[$count]->{frame} = $frame; - } - $hits->[$count]->{name} = $hit->name; - $count ++; - } - else { - ## there is no lower ranking hit with the same score as the best hit. End the loop. - last; - } - } - } - return($hits); -} -################## -sub getTaxon { - my ($hitname) = @_; - if ($hitname =~ /\D/) { - $hitname =~ s/_.*//; - } - my $taxon = `$grepprog -m 1 "^$hitname," $taxon_file | $sedprog -e 's/^.*,//'`; - chomp $taxon; - $taxon =~ s/^[0-9]+,//; - $taxon =~ s/\s*$//; - $taxon =~ s/\s/_/g; - if ($taxon) { - return ($taxon); - } - else { - return(); - } -} -############### -sub determineReferences { - my ($localid, $fileobj, $taxon, $refspec_final, $hitname, $hithmmscore, $hitseq, $hitcount) = @_; - my $refseq = ''; - my $refspec; - ## now we have to distinguish between three cases: - ## 1) hamstr is running in normal mode and one refspec has been determined. 
In this case, $refspec_final - ## contains data only from a single species. - ## 2) hamstr is running in normal mode and alternative refspecs have been determined by the user. - ## $refspec_final may contain results from more than one species, but we need to consider only the first - ## entry. - ## 3) hamstr is running in the strict mode. In this case $refspec_final contains data from several taxa and we need - ## to select the taxon and sequence that is most similar to the hamstered sequence. - ## 4) hamstr is running in the relaxed mode. In this case $refspec_final may contain data from several taxa and - ## we need to select the taxon and the sequence that is most similar to the hamstered sequence. - if (defined $strict or defined $relaxed) { - ## more than one refspec. Now find the one that fits best - my $max_score = 0; - for (my $i = 0; $i < @$refspec_final; $i++) { - ## first, check whether the reciprocity criterion has been fulfilled - if (defined $refspec_final->[$i]->{hit}) { - my $rcn = $refspec_final->[$i]->{hit}; - my $refseq_cand = $refspec_final->[$i]->{sequence}->[$rcn]; - my $refspec_cand_id = $refspec_final->[$i]->{refid}->[$rcn]; - my $refspec_cand = $refspec_final->[$i]->{refspec}; - my $score = &getAlignmentScore($localid, $refseq_cand, $hitseq); - if ($score > $max_score) { - $refspec = $refspec_cand; - $refseq = $refseq_cand; - $max_score = $score; - } - } - } - } - else { ## no choice, just one refspec - my $rcn = $refspec_final->[0]->{hit}; - $refseq = $refspec_final->[0]->{sequence}->[$rcn]; - $refspec = $refspec_final->[0]->{refspec}; -} -$fileobj->{$taxon}->{prot}->[$hitcount] = $hitseq; -$fileobj->{$taxon}->{ids}->[$hitcount] = $hitname; -$fileobj->{$taxon}->{hmmscore}->[$hitcount] = $hithmmscore; -$fileobj->{$taxon}->{refseq}->[$hitcount]= $refseq; -$fileobj->{$taxon}->{refspec}->[$hitcount] = $refspec; -return($fileobj); -} -############### -sub processHits { - my ($localid, $fileobj) = @_; - ## 1) align all hit sequences for a taxon 
against the reference species - my @taxa = keys(%$fileobj); - for (my $i = 0; $i < @taxa; $i++) { - &orfRanking($localid, $taxa[$i]); - } -} - - -################ -sub predictORF { - my $fileobj_new; - my @taxa = keys(%$fileobj); - for (my $i = 0; $i < @taxa; $i++) { - my $protobj = $fileobj->{$taxa[$i]}->{prot}; - my $idobj = $fileobj->{$taxa[$i]}->{ids}; - my $refseqobj = $fileobj->{$taxa[$i]}->{refseq}; - my $refspecobj = $fileobj->{$taxa[$i]}->{refspec}; - my @ids = @$idobj; - for (my $j = 0; $j < @ids; $j++) { - my $refseq = $refseqobj->[$j]; - my $refspec = $refspecobj->[$j]; - my $est = $protobj->[$j]; - if (! $est) { - die "error in retrieval of est sequence for $ids[$j] in subroutine processHits\n"; - } - ### debuggin IUB code - if ($est =~ /[^AGCT]/i) { - $est =~ s/[^AGCTagct]/n/g; - } - printOUT("running genewise using frame $frame\n"); - my $gw = run_genewise_hamstr->new($est, $refseq, $tmpdir, $keepintron); - my $translation = $gw->translation; - my $cds = $gw->codons; - $translation =~ s/[-!]//g; - $fileobj_new->{$taxa[$i]}->{ids}->[$j] = $ids[$j]; - $fileobj_new->{$taxa[$i]}->{prot}->[$j] = $translation; - $fileobj_new->{$taxa[$i]}->{cds}->[$j] = $cds; - $fileobj_new->{$taxa[$i]}->{refseq}->[$j] = $refseq; - $fileobj_new->{$taxa[$i]}->{refspec}->[$j] = $refspec; - } - } - return($fileobj_new); -} -############################ -sub orfRanking { - my ($localid, $spec) = @_; - my $result; - my $refprot; - my $refcds; - my @toalign; - my $protobj = $fileobj->{$spec}->{prot}; - my $idobj = $fileobj->{$spec}->{ids}; - my $refcluster; ## variables to take the cluster and its id for later analysis - my $refid; - if (@$protobj == 1) { - ## nothing to chose from - $refprot = $protobj->[0]; - $refcds = $fileobj->{$spec}->{cds}->[0]; - my $length = length($refprot); - $refid = $idobj->[0]; - } - else { - ## more than one cluster - ## note, I set the refseq fix to the first entry. This is to avoid that in this routine - ## sequences from different taxa are used. 
- push @toalign, ">$fileobj->{$spec}->{refspec}->[0]"; - push @toalign, $fileobj->{$spec}->{refseq}->[0]; - ## now walk through all the contigs - for (my $i = 0; $i < @$protobj; $i++) { - my @testseq = (">$idobj->[$i]", $protobj->[$i]); - @testseq = (@testseq, @toalign); - open (OUT, ">$tmpdir/$localid.ref.fa") or die "could not open file for writing refseqs\n"; - print OUT join "\n", @testseq; - close OUT; - ## run clustalw - !(`$alignmentprog -infile=$tmpdir/$localid.ref.fa -output=fasta -outfile=$tmpdir/$localid.ref.aln 2>&1 >$tmpdir/$localid.ref.log`) or die "error running clustalw\n"; - ## get the alignment score - $result->[$i]->{score} = `$grepprog "Alignment Score" $tmpdir/$localid.ref.log |$sedprog -e 's/[^0-9]//g'`; - if (!$result->[$i]->{score}) { - die "error in determining alignment score\n"; - } - chomp $result->[$i]->{score}; - ## get the aligned sequence - open (ALN, "$tmpdir/$localid.ref.aln") or die "failed to open alignment file\n"; - my @aln = ; - close ALN; - my $aseq = extractSeq($idobj->[$i], @aln); - ## remove the terminal gaps - $aseq =~ s/-*$//; - $result->[$i]->{aend} = length $aseq; - my ($head) = $aseq =~ /^(-*).*/; - ($result->[$i]->{astart}) = length($head)+1; - ## add the hmmscore to $result - $result->[$i]->{hmmscore} = $fileobj->{$spec}->{hmmscore}->[$i]; - } - ### the results for all seqs has been gathered, now order them according to alignment start in the refseq - $result = &sortRef($result); - ($refprot, $refcds, $refid) = &determineRef($result,$spec); - } - $fileobj->{$spec}->{refprot} = $refprot; - $fileobj->{$spec}->{refcds} = $refcds; - $fileobj->{$spec}->{refid} = $refid; - $fileobj->{$spec}->{refspec_final} = $fileobj->{$spec}->{refspec}->[0]; - return(); -} -########################### -sub sortRef { - my $result = shift; - my @sortref; - for (my $i = 0; $i < @$result; $i++) { - $sortref[$i]->{index} = $i; - $sortref[$i]->{astart} = $result->[$i]->{astart}; - $sortref[$i]->{aend} = $result->[$i]->{aend}; - 
$sortref[$i]->{score} = $result->[$i]->{score}; - $sortref[$i]->{hmmscore} = $result->[$i]->{hmmscore}; - } - @sortref = sort { $a->{astart} <=> $b->{astart} } @sortref; - for (my $i = 0; $i < @sortref; $i++) { - ($result->[$i]->{id}, $result->[$i]->{start}, $result->[$i]->{end}, $result->[$i]->{score}, $result->[$i]->{hmmscore}) = ($sortref[$i]->{index}, $sortref[$i]->{astart}, $sortref[$i]->{aend}, $sortref[$i]->{score}, $sortref[$i]->{hmmscore}); - } - return($result); -} -######################## -sub determineRef { - my ($result, $spec) = @_; - my $lastend = 0; - my $lastscore = 0; - my $final; - my $count = 0; - my $id = ''; - my $scorekey = 'hmmscore'; - if ($sortalign){ - $scorekey = 'score'; - } - for (my $i = 0; $i < @$result; $i++) { - if ($result->[$i]->{start} < $lastend or $lastend == 0) { - if ($result->[$i]->{$scorekey} > $lastscore) { - $lastend = $result->[$i]->{end}; - $lastscore = $result->[$i]->{$scorekey}; - $id = $result->[$i]->{id}; - printOUT("ref is $id with score $lastscore\n"); - } - } - elsif ($result->[$i]->{start} > $lastend) { - ## a new part of the alignment is covered. Fix the results obtained so far - $final->[$count]->{id} = $id; - $lastend = $result->[$i]->{end}; - $id = $result->[$i]->{id}; - $count++; - } - } - $final->[$count]->{id} = $id; - ## now concatenate the results - my $refprot = ''; - my $refid = ''; - my $refcds = ''; - - ## now comes a dirty hack. The user has the chance to maximize phylogentic information by concatenating - ## orthologous sequences that do no align to the same part of the reference protein (option -concat). If so, - ## the co-ortholog-detection at a later step will not work and will be disabled. 
- my $looplimit = 1; - if ($concat) { - $looplimit = scalar(@$final); - } - for (my $i = 0; $i < $looplimit; $i++) { - my $seq = $fileobj->{$spec}->{prot}->[$final->[$i]->{id}]; - my $cdsseq = $fileobj->{$spec}->{cds}->[$final->[$i]->{id}]; - my $length = length($seq); - if ($concat){ - $refid .= "$fileobj->{$spec}->{ids}->[$final->[$i]->{id}]-$length" . "PP"; - } - else { - $refid .= "$fileobj->{$spec}->{ids}->[$final->[$i]->{id}]"; - } - $refprot .= $seq; - if ($estflag) { - $refcds .= $cdsseq; - } - } - $refid =~ s/PP$//; - return($refprot, $refcds, $refid); -} -############################# -sub extractSeq { - my ($id, @aln) = @_; - my $seq = ''; - my $start = 0; - for (my $i = 0; $i < @aln; $i++) { - if ($aln[$i] =~ $id) { - $start = 1; - } - elsif ($aln[$i] =~ />/ and $start == 1) { - last; - } - elsif ($start == 1) { - $seq .= $aln[$i]; - } - } - $seq =~ s/\s//g; - return ($seq); -} -############################## -sub revComp { - my ($seq) = @_; - chomp($seq); - $seq =~ tr/AGCTYRKMWSagct/TCGARYMKWSTCGA/; - $seq = reverse($seq); - return($seq); -} -############################## -# sub parseHmmer3pm { -# my ($file, $path) = @_; -# my $hits; -# my $query; -# my %tmphash; -# if (!defined $path){ -# $path = '.'; -# } -# $file = $path . '/' . 
$file; -# my $in = Bio::SearchIO->new( -# -format => 'hmmer', -# -file => $file -# ); -# while( my $result = $in->next_result ) { -# # this is a Bio::Search::Result::HMMERResult object -# if (!defined $query){ -# $query = $result->query_name(); -# printOUT("query is $query\n"); -# } -# my $hitcount = 0; -# while( my $hit = $result->next_hit ) { -# my $tmp = $hit->name(); -# my $tmpscore = $hit->score(); -# $tmp =~ s/_RF.*//; -# if (!defined $tmphash{$tmp}){ -# $hits->[$hitcount]->{id} = $tmp; -# $hits->[$hitcount]->{hmmscore} = $tmpscore; -# $hitcount++; -# $tmphash{$tmp}=1; -# if (defined $bhh){ -# last; -# } -# } -# } -# -# if (defined $hits->[0]) { -# ####### a quick hack to obtain the lagPhase value -# my $criticalValue; # takes the value used for candidate discrimination -# my $hitLimitLoc = $hitlimit; -# if (defined $autoLimit) { -# printDebug("Entering getLag Routine\n"); -# ## the user has invoked the autmated inference of a hit limit -# ($hitLimitLoc, $criticalValue) = getLag($hits, $hitcount); -# if (!defined $criticalValue) { -# ## there was a problem in the computatation of the lagPhase -# print "Computation of lagPhase did not succeed, switching to score threshold using a default cutoff of $scoreCutoff\n"; -# ($hitLimitLoc, $criticalValue) = getHitLimit($hits, $hitcount); -# } -# } -# elsif (defined $scoreThreshold) { -# printDebug("entering the scoreThreshold routine"); -# ($hitLimitLoc, $criticalValue) = getHitLimit($hits, $hitcount); -# printDebug("hitlimitloc is now $hitLimitLoc"); -# } -# -# return ($query, $hits, $hitLimitLoc, $criticalValue); -# } -# else { -# return ($query); -# } -# } -# } -############################## -sub parseHmmer4pm { - my ($file, $path) = @_; - my $hmmhits; - my $hits; - my $query; - my @rest; - my %tmphash; - my $hitcount = 0; - if (!defined $path){ - $path = '.'; - } - $file = $path . '/' . 
$file; - - $file =~ s/\|/\\\|/g; - my @hmmout = `$grepprog -v '#' $file |sort -rnk 9`; - for (my $i = 0; $i < @hmmout; $i++) { - ($hmmhits->[$i]->{target_name}, $hmmhits->[$i]->{target_accession}, $hmmhits->[$i]->{query_name}, $hmmhits->[$i]->{query_accession}, $hmmhits->[$i]->{total_evalue}, $hmmhits->[$i]->{total_score}, $hmmhits->[$i]->{total_bias}, $hmmhits->[$i]->{domain_evalue}, $hmmhits->[$i]->{domain_score}, $hmmhits->[$i]->{domain_bias}, @rest) = split(/\s+/, $hmmout[$i]); - - if (!defined $query){ - $query = $hmmhits->[$i]->{query_name}; - printOUT("query is $query\n"); - } - my $tmp = $hmmhits->[$i]->{target_name}; - my $tmpscore = $hmmhits->[$i]->{domain_score}; - $tmp =~ s/_RF.*//; - if (!defined $tmphash{$tmp}){ - $hits->[$hitcount]->{id} = $tmp; - $hits->[$hitcount]->{hmmscore} = $tmpscore; - $hitcount++; - $tmphash{$tmp}=1; - if (defined $bhh){ - last; - } - } - - } - if (defined $hits->[0]) { - ####### limit the list of hmm hits - my $criticalValue; # takes the value used for candidate discrimination - my $hitLimitLoc = $hitlimit; - if (defined $scoreThreshold) { - printDebug("entering the scoreThreshold routine"); - ($hitLimitLoc, $criticalValue) = getHitLimit($hits, $hitcount); - printDebug("hitlimitloc is now $hitLimitLoc"); - } - - return ($query, $hits, $hitLimitLoc, $criticalValue); - } - else { - return ($query); - } - -} -############################## -sub parseSeqfile { - my $seqref; - my $id; - my $spec; - my $seqid; - my $seq; - my $file = shift; - open (IN, "$file") or die "failed to open $file\n"; - my @seqs = ; - close IN; - chomp @seqs; - for (my $i = 0; $i < @seqs; $i++) { - if ($seqs[$i] =~ />/) { - $seqs[$i] =~ s/>//; - if (defined $id and defined $seq) { - $seqref->{$id}->{$spec}->{seqid} = $seqid; - $seqref->{$id}->{$spec}->{seq} = $seq; - $seq = undef; - } - ($id, $spec, $seqid) = split (/\|/, $seqs[$i]); - } - else { - $seq .= $seqs[$i]; - } - } - if (defined $id and defined $seq) { - $seqref->{$id}->{$spec}->{seqid} = 
$seqid; - $seqref->{$id}->{$spec}->{seq} = $seq; - $seq = undef; - } - return ($seqref); -} -################## -sub getAlignmentScore{ - my ($localid, $refseq_cand, $hitseq) = @_; - my @testseq = ('>hitseq', $hitseq, '>refseq', $refseq_cand); - open (OUT, ">$tmpdir/$localid.ref.fa") or die "could not open file for writing refseqs\n"; - print OUT join "\n", @testseq; - close OUT; - ## run clustalw - !(`$alignmentprog -infile=$tmpdir/$localid.ref.fa -output=fasta -outfile=$tmpdir/$localid.ref.aln 2>&1 >$tmpdir/$localid.ref.log`) or die "error running clustalw\n"; - ## get the alignment score - my $score = `$grepprog "Alignment Score" $tmpdir/$localid.ref.log |$sedprog -e 's/[^0-9]//g'`; - if (!$score) { - die "error in determining alignment score! Problem with ClustalW\n"; - } - chomp $score; - return ($score); -} -######################3 -sub determineRefspecFinal { - my ($query_name, @refspec) = @_; - my $refspec_final; - ## now get the id and the sequence used for building the hmm. Note, the latter will be - ## needed at a later step to determine the best hit - my @original; - my $ac = 0; - for (my $i = 0; $i < @refspec; $i++) { - $fafile =~ s/\|/\\\|/g; - @original = `$grepprog -A 1 "^>$query_name|$refspec[$i]" $fafile | grep -v "^\-\-\$" |$sedprog -e "s/.*$refspec[$i]\|//"`; - chomp @original; - if (@original > 0) { - $refspec_final->[$ac]->{refspec} = $refspec[$i]; - $refspec_final->[$ac]->{searchdb} = "$blastpath/$refspec[$i]/$refspec[$i]" . 
$blastapp; - ## now allow for more than one sequence per core-ortholog cluster and species - $refspec_final->[$ac]->{orthocount} = 0; - for (my $j = 0; $j < @original; $j+= 2) { - $refspec_final->[$ac]->{refid}->[$refspec_final->[$ac]->{orthocount}] = $original[$j]; - $refspec_final->[$ac]->{sequence}->[$refspec_final->[$ac]->{orthocount}] = $original[$j+1]; - $refspec_final->[$ac]->{orthocount} += 1; - } - $ac++; - @original = qw(); - if (!defined $strict and !defined $relaxed) { - ## one reftaxon is enough - last; - } - } - else { - printOUT("original sequence not be found with grepping for ^>$query_name|$refspec[$i]. Proceeding with next refspec\n"); - } - } - if (! defined $refspec_final->[0]->{refid}) { - print "original sequence not found\n"; - return (0, $refspec_final); - } - ## now print some wordy information... - if (!defined $strict and !defined $relaxed) { - printOUT("REFSPEC is $refspec_final->[0]->{refspec}\n"); - } - return(1, $refspec_final); -} - -############## co-ortholog prediction using a alignment score criterion as in InParanoid. -sub identifyCoorthologsProt{ - my ($localid, $spec) = @_; - my @get; - my @infofile; - my $protobject = $fileobj->{$spec}->{prot}; #this is an array ref - my $idobject = $fileobj->{$spec}->{ids}; - my @genes2check = @$idobject; - my $refseq = $fileobj->{$spec}->{refprot}; - my $refid = $fileobj->{$spec}->{refid}; - if ($estflag) { - $refid =~ s/-[0-9]+$//; - } - my $refspec_final = $fileobj->{$spec}->{refspec_final}; - ## initialize the array with the sequences to be aligned with the reference sequence - my @out = qw(); - my @hitids = qw(); - push @out, ">$refspec_final"; - push @out, $fileobj->{$spec}->{refseq}->[0]; - for (my $i = 0; $i < @genes2check; $i++) { - my $seq = $protobject->[$i]; - chomp $seq; - push @out, ">" . $spec .'|' . 
$genes2check[$i]; - push @out, $seq; - } - ## writing sequences to file - my $tmpdirTmp = $tmpdir; $tmpdirTmp =~ s/\\//g; - open (OUT, ">$tmpdirTmp/$localid.orth.fa") or die "failed to open $localid.orth.fa\n"; - print OUT join "\n", @out; - close OUT; - - ## check sequence length - my $alignmentprog_co_tmp = $alignmentprog_co; - my $tooLong = checkSeqLen("$tmpdir/$localid.orth.fa"); - if ($tooLong == 1) { - $alignmentprog_co_tmp = "mafft-linsi"; - } - printOUT("Aligner: $alignmentprog_co_tmp\n"); - - ## aligning sequences - if ($alignmentprog_co_tmp eq 'mafft-linsi'){ - `mafft --maxiterate 1000 --localpair --anysymbol --quiet $tmpdir/$localid.orth.fa > "$tmpdirTmp/$localid.orth.aln"`; - } - elsif ($alignmentprog_co_tmp eq 'muscle') { - `muscle -quiet -in $tmpdir/$localid.orth.fa -out "$tmpdirTmp/$localid.orth.aln"`; - } - else { - die "$alignmentprog_co_tmp is neither mafft-linsi nor muscle\n"; - } - if (! -e "$tmpdirTmp/$localid.orth.aln") { - die "something wrong running $alignmentprog_co_tmp\n"; - } - ## do the matrix caluclation - my $in = Bio::AlignIO->new(-format => 'fasta', - -file => "$tmpdirTmp/$localid.orth.aln"); - my $aln = $in->next_aln; - my $pepstats = Bio::Align::ProteinStatistics->new(); - my $kimura = $pepstats->distance(-align => $aln, - -method => 'Kimura'); - ## do the evaluation - ### get the pairwise distances to the yeast sequences - #### get the represenative id - my $smallestdist = $kimura->get_entry("$refspec_final", "$spec|$refid"); - push @get, $spec.'|'.$refid; - push @infofile, $spec.'|'.$refid.'|'.$smallestdist.'|'.1; - printOUT("smalles dist is $smallestdist, besthit is $refid from $spec\n"); - ## now get any other hit protein that is closer to besthit than the representative seq - ## is to the refspec - my $count = 0; #this counter keeps track of the entries in the coorthologs field of $fileobj->{$spec} - for (my $i = 0; $i < @genes2check; $i++) { - if ($genes2check[$i] ne $refid) { - my $dist = $kimura->get_entry("$spec|$refid", 
"$spec|$genes2check[$i]"); - if ($dist <= $smallestdist or ($dist =~ /NaN/ and defined $nonoverlappingCO)) { - printOUT("co-ortholog detected: $genes2check[$i] with distance $dist compared to $smallestdist of $refid\n"); - $fileobj->{$spec}->{coorthologs}->[$count] = $i; - $count++; - push @infofile, $spec.'|'.$genes2check[$i].'|'.$dist.'|'.0; - } - else { - printOUT("co-ortholog rejected: $genes2check[$i] with distance $dist compared to $smallestdist of $refid\n"); - } - } - } - my $counter = 0; - while (defined $fileobj->{$spec}->{coorthologs}->[$counter]){ - my $index = $fileobj->{$spec}->{coorthologs}->[$counter]; - $counter ++; - } - printOUT(join "\n", @infofile); -} - -######## sub check sequence length. If a sequence is longer than 12.000 aa, -######## change MUSCLE to MAFFT-LINSI (due to Segmentation fault issue of MUSCLE) -sub checkSeqLen { - my $file =$_[0]; - my $out = `awk '/^>/ {if (seqlen){print seqlen}; print ;seqlen=0;next; } { seqlen += length(\$0)}END{print seqlen}' "$file"`; - my @out = split("\n", $out); - foreach my $line (@out) { - if ($line !~ />/ & $line > 12000) { - return(1) - } - } - return(0) -} - -######## sub checkCoorthologRef -sub checkCoorthologRef { - ## relevant steps are - ## 1) get query sequence and query id, - ## 2) get refseq - ## 3) get seq for best blast hit - ## compute the distance query - best blast hit and best blast hit - reference seq - ## return '1' if d(q,b)>d(r,b), else return '0'; - my ($localid, $query, $best, $ref) = @_; - open (OUT, ">$tmpdir/$localid.co.fa") or die "failed to open $localid.co.fa\n"; - print OUT ">query\n$query\n>best\n$best\n>ref\n$ref\n"; - close OUT; - - ## check sequence length - my $alignmentprog_co_tmp = $alignmentprog_co; - my $tooLong = checkSeqLen("$tmpdir/$localid.co.fa"); - if ($tooLong == 1) { - $alignmentprog_co_tmp = "mafft-linsi"; - } - printOUT("Aligner: $alignmentprog_co_tmp\n"); - - ## aligning sequences - if ($alignmentprog_co_tmp eq 'mafft-linsi'){ - `mafft --maxiterate 
1000 --localpair --anysymbol --quiet $tmpdir/$localid.co.fa > "$tmpdir/$localid.co.aln"`; - } - elsif ($alignmentprog_co_tmp eq 'muscle') { - `muscle -quiet -in $tmpdir/$localid.co.fa -out "$tmpdir/$localid.co.aln"`; - } - else { - die "$alignmentprog_co_tmp is neither mafft-linsi nor muscle\n"; - } - if (! -e "$tmpdir/$localid.co.aln") { - die "something wrong running $alignmentprog_co_tmp in routine checkCoorthologRef\n"; - } - ## do the matrix caluclation - my $in = Bio::AlignIO->new(-format => 'fasta', - -file => "$tmpdir/$localid.co.aln"); - my $aln = $in->next_aln; - my $pepstats = Bio::Align::ProteinStatistics->new(); - my $kimura = $pepstats->distance(-align => $aln, - -method => 'Kimura'); - ## do the evaluation - ### get the pairwise distances to the yeast sequences - #### get the represenative id - my $querydist = $kimura->get_entry('query', 'best'); - my $refdist = $kimura->get_entry('best','ref'); - if (($querydist > $refdist) or ($querydist == 0 and $refdist == 0)){ - return(1, $querydist, $refdist); - } - else { - return(0, $querydist, $refdist); - } -} -####### sub findORF -sub findORF{ - my ($est, $prot, $frame) = @_; - if ($frame eq '-') { - $est = revComp($est); - } - ### debuggin IUB code - if ($est =~ /[^AGCT]/i) { - $est =~ s/[^AGCTagct]/n/g; - } - printOUT("\trunning genewise using frame $frame\n"); - my $gw = run_genewise_hamstr->new($est, $prot, "$tmpdir"); - my $translation = $gw->translation; - return ($translation, $est); -} -####### sub printOUT -sub printOUT { - my $message = shift; - if (!defined $silent) { - print $message; - } - return(); -} -###### sub getLag -sub getLag { - print "\nInside getlag\n"; - my ($hits, $hitcount) = @_; - my $minScore = $hits->[$hitcount-1]->{hmmscore}; - my $maxScore = $hits->[0]->{hmmscore}; - if ($minScore == $maxScore) { - ## there is nothing to do, since there is either only one hit, or all hits have the same - ## hmmscore. Return the value of $hitcount. 
- return($hitcount, 1); - } - ## debug - else { - print "hitcount is $hitcount, max is $maxScore, min is $minScore\n"; - my @yData = qw(); - my @xData = qw(); - my @xDataLog = qw(); - ## now we generate a reversed list of the normalized bitscores - for (my $i = 0; $i < $hitcount; $i++) { - push(@yData, 1 - ($hits->[$i]->{hmmscore} - $minScore)/($maxScore - $minScore)); - push(@xDataLog, log(0.1*($i+1))); - push(@xData, (0.1*($i+1))); - } - ## The module requires a sufficient amount of trailing 1 to measure the lag point, - ## so we just append them - for (my $i = $hitcount; $i < ($hitcount+20); $i++) { - push(@yData, 1); - push(@xData, 0.1*($i)); - push(@xDataLog, 0.1*($i)); - } - ### calculate end point of lag phase - my $R = Statistics::R->new(); - # set variables for R - my $lagPoint = computeLagPoint($R, \@xDataLog, \@yData); - if ($lagPoint eq 'NA'){ - print "Least square fit to data failed! Trying log-transformed data.\n"; - my $lagPoint = computeLagPoint($R, \@xDataLog, \@yData); - } - ### compute the cutoff - if ($lagPoint eq 'NA') { - return(); - } - else { - my $hitLimitGetLag; - print "limit is $lagPoint. Abs is " . abs($lagPoint) . 
"\n"; - for (my $i = 0; $i < @xData; $i++) { - if ($xData[$i] > abs($lagPoint)) { - $hitLimitGetLag = $i + 1; - print "Setting hl to $hitLimitGetLag\n"; - last; - } - } - print "hitlimit in getLag is $hitLimitGetLag\n"; - return ($hitLimitGetLag, $lagPoint); - } - } -} -########################## -sub getHitLimit { - my ($hits, $hitcount) = @_; - my $hitLimitLoc = 0; - my $maxScore = $hits->[0]->{hmmscore}; - my $limit = $maxScore / 100 * (100 - $scoreCutoff); - for (my $i = 0; $i < $hitcount; $i++) { - if ($hits->[$i]->{hmmscore} >= $limit) { - $hitLimitLoc++; - } - else { - last; - } - } - return ($hitLimitLoc, $limit); -} -## debug -########################## -sub printDebug{ - my @message = @_; - if ($debug){ - print join "\n", @message; - print "\n"; - } -} -########################## -sub computeLagPoint { - my ($R, $xdata, $ydata) = @_; - $R->set( 'x', \@$xdata); - $R->set( 'y', \@$ydata ); - # define function - $R->run( q`func = function(t,params){ 1/(1 + exp(4 * params[1] * (params[2] - x) + 2)) }`); - # do Nonlinear Least Squares Fitting - $R->run(q`try <- try(nls(y ~ 1/(1 + exp(4 * mean * (lamda - x) + 2)), - start = list(mean=1.4, lamda=0.5), - control = list(maxiter=500)), TRUE)`); - $R->run(q`if(class(try) != "try-error"){ - f = nls(y ~ 1/(1 + exp(4 * mean * (lamda - x) + 2)), - start = list(mean=1.4, lamda=0.5), - control = list(maxiter=500)) - p = coef(f) - lagPoint = p[2] - } else { - lagPoint = "NA" - }`); - - - ### return lag point - my $lagPoint = $R->get('lagPoint'); - return($lagPoint); -} diff --git a/fdog/bin/oneSeq.pl b/fdog/bin/oneSeq.pl deleted file mode 100755 index 2fe8333..0000000 --- a/fdog/bin/oneSeq.pl +++ /dev/null @@ -1,2860 +0,0 @@ -#!/usr/bin/perl -use strict; -use warnings; -use File::Copy; -use File::Copy qw(move); -use File::Basename; -use File::Path; -use File::Path qw/make_path/; -use File::Path 'rmtree'; -use File::Which; -use lib dirname(__FILE__); -use Parallel::ForkManager; -use IO::Handle; -use Getopt::Long; -use 
Bio::DB::Taxonomy; -use Bio::Tree::Tree; -use Bio::TreeIO; -use Bio::Tools::Run::StandAloneBlast; -use Bio::Seq; -use Bio::SeqIO; -use Bio::SearchIO; -use Term::Cap; -use POSIX; - -use Capture::Tiny qw/capture/; -use IPC::Run qw( run timeout ); -use Time::HiRes; -use List::Util qw(shuffle); -use Cwd; -use Cwd 'abs_path'; -use Array::Utils qw(:all); -use Try::Tiny; - -my $startTime = gettime(); - -# Copyright (C) 2009 INGO EBERSBERGER, ebersberger@bio.uni-frankfurt.de -# This program is free software; you can redistribute it and/or modify it -# under the terms of the GNU General Public License as published -# by the Free Software Foundation; either version 3 of the License -# or any later version. - -# This program is distributed in the hope that it will be useful -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -# General Public License for more details. -# You should have received a copy of the GNU General Public License -# along with this program; If not, see http://www.gnu.org/licenses - -# PROGRAM DESCRIPTION: oneSeq.pl is a program for targeted ortholog search in protein sequence data. - -# PROGRAM HISTORY -## This script is based on a perl script authored by Peter Schmitzberger in the course -## of his Master's project at the CIBIV, MFPL, Vienna, Austria - -## MODIFIED: 13. Aug. 2015 - solved path issues. Script will now work together with -## HaMStR 13.2.5 - -## Modified 14. Aug. 2015 - added the options -outpath and -hmmpath and a more refined -## clean up after the search, in cases where a custom outpath has been chosen. - -## Modified: 01. Feb. 2016: restructured major parts - -## Modified: 04. Feb. 
2016: - Additions - feature architecture similarity (fas) score support -## - alternations in the program flow -## - global/local option for alignments -## - additional options -## - autocleanup -## - ENV SWATDIR for alignment support (local copy required or use SWATDIR=/home/holger/appz/phredphrap) -## - if you run oneSeq.pl in DB mode, please adapt /bin/run-query.sh to your username and passwort -## - - -## Modified 07. Aug. 2017: - Changes: - change of alignment program, swat replaced by -## ssearch (local:local) and glsearch (global:local) and ggsearch (global:global) -## - selection of best fitting ortholog candidate modified -## - coreFilter: strict, relaxed and none -## - -## Modified 19. Jan. 2018: - Additions - added option to prioritize closer taxon if two taxa have a similar score -## - after a taxon has been choosen every taxa closer gets discarded in the next cycles -## - added commandline parameter to choose the deviation allowed for two taxa to be considered similar -## - -## Modified 09. Feb. 2018: - Changes - Now the HaMStR candidate search climbs the tree and evaluates only one taxon at a time -## - The FAS score for a candidate will now only be calculated, if the alignment score is high enough, -## to top the current best candidate -## - If a candidate reaches the maximum score the search stops and a new round starts -## - If a candidate is within deviation range of the maximum score only the taxa, which are on the same tree branch, -## will get evaluated and then the search gets canceled and a new round starts - -## Modified 24. Nov. 2018: Release - release oneSeq v1.3.1 -## - Not included feature/feature-updated-fas-util - -## Modified 19. July 2019: - Changes - added option to run muscle instead of mafft - -## Modified 22. July 2019: - invoked priority mode for the fas score computation if t = 30 - -## Modified 2. Dec. 
2019 -## Bug Fix: Check for taxa with invalid NCBI Taxonomy Id runs now properly and crashes are avoided -## Implemented cleanup of the core ortholog directory to avoid accumulation of feature annotations - -## Modified 05. Feb. 2020 (Vinh): - added option to set number of CPUs for FAS annotation -## - input faste file must not be present in data folder or working directory -## - output files will be stored either in user defined directory set via -outpath option, or in working directory by default - -## Bug fix 14. April 2020 (Ingo): - fixed bug that inactivated the -append option - -## Modified 14. April 2020 (Vinh): - added option for using user-defined blast_dir, genome_dir and weight_dir -## - reference species (and taxa for core-set compilation) specified from blast_dir - -## Modified 16. Juni 2020 (Vinh): major change in FAS score calculation (v1.7.0) -## - no need for profile_prog, architecture_prog and visualsPath -## - final FAS score calculation is done using hamstrFAS - -## Modified 16. Juni 2020 v1.7.1 (Vinh) - replace greedyFAS by calcFAS -## Modified 07. July 2020 v1.7.2 (Vinh) - check if FAS executable -## Modified 10. July 2020 v1.7.3 (Vinh) - solved problem when gene ID contains PIPE -## Modified 13. July 2020 v1.8.0 (Vinh) - added initial check, no longer use .mod files -## Modified 22. July 2020 v1.9.0 (Vinh) - moved tmp blast files to output folder and delete them when finished -## Modified 27. Aug 2020 v2.1.0 (Vinh) - option to input newick tree for search taxa -## Modified 07. Sep 2020 v2.2.0 (Vinh) - append seed sequence to output extended.fa if no ortholog was found in refspec -## Modified 22. Sep 2020 v2.2.1 (Vinh) - make sure that seed sequence always at the beginning of extended.fa output -## Modified 23. Sep 2020 v2.2.3 (Vinh) - use full taxonomy name instead of abbr taxon name for LOG -## Modified 01. Dec 2020 v2.2.4 (Vinh) - fixed bug while creating final extended.fa (and replaced grep and sed by bioperl) -## Modified 16. 
Feb 2021 v2.2.5 (Vinh) - core compilation works with fasoff -## Modified 18. Feb 2021 v2.2.6 (Vinh) - fixed searchTaxa and coreTaxa options -## Modified 19. March 2021 v2.2.7 (Vinh) - check for long sequence ID -## Modified 24. March 2021 v2.2.8 (Vinh) - skip fa.mapping while checking genome_dir -## Modified 29. March 2021 v2.2.9 (Vinh) - check for zero $maxAlnScore -## - solved problem with long input path for fasta36 tools -## Modified 23. April 2021 v2.3.0 (Vinh) - parse fasta36 output for long IDs (longer than 60 chars) -## Modified 31. May 2021 v2.3.1 (Vinh) - added auto annotation for fdogFas -## Modified 11. June 2021 v2.3.2 (Vinh) - fixed --append option -## Modified 16. June 2021 v2.4.0 (Vinh) - add checkOff option -## Modified 10. Mar 2022 v2.4.1 (Vinh) - fixed bug missing results in multiprocessing - -############ General settings -my $version = 'oneSeq v.2.4.1'; - -##### configure for checking if the setup.sh script already run -my $configure = 0; -if ($configure == 0){ - die "\n\nPLEASE RUN fdog.setup BEFORE USING fdog\n\n"; -} -##### hostname -my $hostname = `hostname`; -chomp $hostname; -############# -# my $termios = new POSIX::Termios; $termios->getattr; -# my $ospeed = $termios->getospeed; -# my $t = Tgetent Term::Cap { TERM => undef, OSPEED => $ospeed }; -# my ($norm, $under, $bold) = map { $t->Tputs($_,1) } qw/me md us/; -#### Paths -my $path = abs_path(dirname(__FILE__)); -$path =~ s/\/bin//; -$path =~ s/\/$//; -printDebug("Path is $path"); - -#### Programs and output -my $sedprog = 'sed'; -my $grepprog = 'grep'; -my $readlinkprog = 'readlink'; - -my $globalaligner = 'ggsearch36'; -my $glocalaligner = 'glsearch36'; -my $localaligner = 'ssearch36'; -my $fasta36Path = which('fasta36'); -if ( !(defined $fasta36Path) || $fasta36Path eq "") { - $globalaligner = $path.'/bin/aligner/bin/'.'ggsearch36'; - $glocalaligner = $path.'/bin/aligner/bin/'.'glsearch36'; - $localaligner = $path.'/bin/aligner/bin/'.'ssearch36'; - unless (-e $globalaligner) { - 
print "fasta36 not found! Please install it before using fdog!\n"; - exit(); - } -} - -my $algorithm = "blastp"; -my $blast_prog = 'blastp'; -my $outputfmt = 'blastxml'; -my $eval_blast_query = 0.0001; -my $filter = 'F'; # default for blastp -my $annotation_prog = "fas.doAnno"; -my $fas_prog = "fas.run"; -my $fdogFAS_prog = "fas.runFdogFas"; - -##### ublast Baustelle: not implemented yet -my $runublast = 0; -my $ublast = 0; -my $accel = 0.8; - -############ database connection details -my $dbname=""; -my $username=""; -my $pw=""; -my $database = "DBI:mysql:database=dbdmpng;host=$dbname"; -my $getThemAll = 0; -my $updateBlast_dir = 0; - -############ directory paths -my $currDir = getcwd; -my $coreOrthologsPath = "$path/core_orthologs/"; -my $outputPath = $currDir; #"$path/output"; ## DEFAULT OUTPUT PATH -my $hamstrPath = "$path/bin/hamstr"; -my $homeDir = $path; -my $alignmentscoreMatrix = "BP62"; ## opt given by ssearch and glsearch [codaa.mat idnaa.mat P250 P120 BL50 MD40 MD20 MD10 BL62 BL80 BP62 VT160 OPT5] -my $genome_dir = "$path/genome_dir"; -my $taxaPath = "$genome_dir/"; -my $blastPath = "$path/blast_dir/"; -my $idx_dir = "$path/taxonomy/"; -my $dataDir = $path . '/data'; -my $weightPath = "$path/weight_dir/"; - -my @defaultRanks = ( - 'superkingdom', 'kingdom', - 'superphylum', 'phylum', 'subphylum', - 'superclass', 'class', 'subclass', 'infraclass', - 'superorder', 'order', 'suborder', 'parvorder', 'infraorder', - 'superfamily', 'family', 'subfamily', - 'tribe', 'subtribe', - 'genus', 'subgenus', - 'species group', 'species subgroup', 'species' -); - -################## some variables -my $finalOutput; -my $dbHandle; -my $core_hitlimit = 3; # number of hmm hits to consider for reblast during core set generation -# number of hmm hits to consider for reblast during final ortholog search. -# Note, this limits the number of co-orthologs that can be found. -my $hitlimit = 10; -## lagPhase test. 
Setting the autolimit option to decide from the score distribution how many hits to evaluate. -my $autoLimit; -my $scoreThreshold = 1; # evaluate only hmmsearch hits whose score is within the 10% margin of the best hmmsearch hit -my $scoreCutoff = 10; #value in percent of the hmmscore of the best hit -# Setup for FAS score support (FAS support is used by default) -# Note, fas_t is set to 0.75 by default. Changes will influence sensitivity and selectivity -my $fas_support = 1; -my $countercheck = 0; -my $fasoff = 0; -my $fasstrict = 0; -my $fas_T = 0.75; -my $priThreshold = '-t 30'; -my %profile = (); -my %fas_score_keeper = (); -my $eval_filter = 0.001; -my $inst_eval_filter = 0.01; - -my $help; -my @profile = qw(); -# my $showTaxa; -my $refSpec; -my $seqFile = ''; -my $seqId= ''; -my $seqName; -my $minDist; -my $maxDist; -my $minCoreOrthologs; -my $coreTaxa; -my $strict; -my $force = 0; -my $group; -my $groupNode; -my $blast; -my $batch; -my $blastNode; -my $representative; -my $core_rep; -my $checkOff; -my $debug; -my $corestrict; -my $inputSeq = ""; -my $rbh; -my $append = 0; -# Note, the evalue defaults ($eval_blast, $eval_hmmer) will be relaxed for final ortholog run by $eval_relaxfac -my $eval_blast = 0.00001; #1E-5 -my $eval_hmmer = 0.00001; #1E-5 -my $eval_relaxfac = 10; #checked in checkInput -my $coreOnly; -my $cpu = 1; #sets number of forks for final ortholog search (can be set via option -cpu=<>) -my $corecpu = 1; #sets number of forks for core-ortholog assembly (MUST BE 1, due to directed search process through the tree) -my $hyperthread; -my $silent; -my $checkcoorthologsref; -my $cccr; -my $tree; -my $wholeTree; -my $treeDelFlag; -my $currentNoRankDistNode; -my $currentChildsToIgnoreNode; -my $currentDistNode; -my @logOUT = qw(); -### Details about the alignment strategy -### Note, the alignment strategy can be local, glocal, or global -### Default: local -my $local; -my $global; -my $glocal; -my $core_filter_mode; -my $dbmode = 0; ## default run in 
dbmode. consider setting this in the configure step -my $vlevel = 2; ## verbosity level -my @taxonlist = qw(); -my @refTaxonlist = qw(); -my $seqio_object; -my %taxa; -my %refTaxa; -my $autoclean; -my $getversion; -my $coreex; ## flag to set when the core set already exists -my $addenv; -my $ignoreDistance = 0; ## flag to normalise the score by the distance in the tree -my $distDeviation = 0.05; ## Span in which a score is consideren similar -my $breakAfter = 5; ## Number of Significantly bad candidates after which the current run cancels -my %hashTree; -my $aln = 'muscle'; -my $searchTaxa; -################# Command line options -GetOptions ( - "h" => \$help, - "append" => \$append, - # "showTaxa" => \$showTaxa, - "refSpec=s" => \$refSpec, - "db" => \$dbmode, - "filter=s" => \$filter, - "seqFile=s" => \$seqFile, - "seqId=s" => \$seqId, - "seqName=s" => \$seqName, - "silent" => \$silent, - "minDist=s" => \$minDist, - "maxDist=s" => \$maxDist, - "coreOrth=i" => \$minCoreOrthologs, - "coreTaxa=s" => \$coreTaxa, - "strict" => \$strict, - "rbh" => \$rbh, - "evalBlast=s" => \$eval_blast, - "evalHmmer=s" => \$eval_hmmer, - "evalRelaxfac=s" => \$eval_relaxfac, - "checkCoorthologsRef" => \$checkcoorthologsref, - "coreCheckCoorthologsRef" => \$cccr, - "hitlimitHamstr=s" => \$hitlimit, - "coreHitlimitHamstr=s" => \$core_hitlimit, - "autoLimitHamstr" => \$autoLimit, - "scoreCutoff=s" => \$scoreCutoff, - "scoreThreshold" => \$scoreThreshold, - "coreRep" => \$core_rep, - "coreStrict" => \$corestrict, - "coreOnly" => \$coreOnly, - "group=s" => \$group, - "blast" => \$blast, - "batch=s" => \$batch, - "fas" => \$fas_support, - "countercheck" => \$countercheck, - "fasoff" => \$fasoff, - "coreFilter=s" => \$core_filter_mode, - "minScore=s" => \$fas_T, - "local" => \$local, - "global" => \$global, - "glocal" => \$glocal, - "rep" => \$representative, - "cpu=s" => \$cpu, - "outpath=s" => \$outputPath, - "hmmpath=s" => \$coreOrthologsPath, - "blastpath=s" => \$blastPath, - 
"searchpath=s" => \$genome_dir, - "weightpath=s" => \$weightPath, - "checkOff" => \$checkOff, - "debug" => \$debug, - "coreHitlimit=s" => \$core_hitlimit, - "hitlimit=s" => \$hitlimit, - "force" => \$force, - "cleanup" => \$autoclean, - "addenv=s" => \$addenv, - "version" => \$getversion, - "reuseCore" => \$coreex, - "ignoreDistance" => \$ignoreDistance, - "distDeviation=s" => \$distDeviation, - "aligner=s" => \$aln, - "hyperthread" => \$hyperthread, - "searchTaxa=s" => \$searchTaxa -); - -$outputPath = abs_path($outputPath); -unless (-d $coreOrthologsPath) { - make_path($coreOrthologsPath); -} -$coreOrthologsPath = abs_path($coreOrthologsPath)."/"; -$blastPath = abs_path($blastPath)."/"; -$weightPath = abs_path($weightPath)."/"; -$genome_dir = abs_path($genome_dir)."/"; -$taxaPath = $genome_dir; - -############# do initial check -if (!defined $help && !defined $getversion) { #} && !defined $showTaxa) { - print "Validity checking....\n"; - my $checkStTime = gettime(); - unless($checkOff) { - initialCheck($seqFile, $seqName, $blastPath, $taxaPath, $weightPath, $fasoff); - } - - if (!defined $coreex) { - if (!grep(/$minDist/, @defaultRanks)) { - die "ERROR: minDist $minDist invalid!\n"; - } - - if (!grep(/$maxDist/, @defaultRanks)) { - die "ERROR: maxDist $maxDist invalid!\n"; - } - - if (!defined $minCoreOrthologs) { - die "ERROR: coreOrth not defined (must be integer)!"; - } - } - print "Check finished in " . roundtime(gettime() - $checkStTime). 
" sec!\n"; -} - -############# show version -if ($getversion){ - print "You are running $version\n"; - print "This version supports FAS comparison.\n"; - exit; -} - -############# show help -if($help) { - my $helpmessage = helpMessage(); - print $helpmessage; - exit; -} - -############# connect to the database -if ($dbmode) { - $dbHandle = DBI->connect($database, $username, $pw) - or die "Can not open the database!"; -} - -############# show all taxa -# if ($showTaxa) { -# #get all taxa from database -# #hash example: sacce_2336 -> NCBI ID for sacce_2336 -# printTaxa(); -# exit; -# } - -#switched from online version to flatfile because it is much faster -#taxon files can be downloaded from: ftp://ftp.ncbi.nih.gov/pub/taxonomy/ -my $indexStart = gettime(); -print "Please wait while the taxonomy database is indexing...\n"; -my $db = Bio::DB::Taxonomy->new(-source => 'flatfile', - -nodesfile => $idx_dir . 'nodes.dmp', - -namesfile => $idx_dir . 'names.dmp', - -directory => $idx_dir); -my $indexTime = gettime() - $indexStart; -my $db_bkp = $db; -print "Indexing done in ",roundtime($indexTime)," sec!\n"; - -%taxa = getTaxa(); -%refTaxa = getRefTaxa(); -## debugging message -my $taxcount = keys(%taxa); -printDebug("receiving hash of taxa with $taxcount elements from sub getTaxa"); -### -for (keys %taxa){ - printDebug("value of $_ is $taxa{$_}"); -} - -my $outputFa = $coreOrthologsPath . $seqName . "/" . $seqName . ".fa"; -my $outputAln = $coreOrthologsPath . $seqName . "/" . $seqName . ".aln"; -my $tmpdir = $outputPath . '/' . $seqName . 
'/tmp'; -make_path($tmpdir); -checkOptions(); -createFoldersAndFiles($outputFa, $seqName, $inputSeq, $refSpec); - -my $curCoreOrthologs = 0; -my $hamstrSpecies = $refSpec; -my $addedTaxon = $refSpec; -my $noMoreOrthologs = 0; -my $coremode; -my %finalcontent; -my %candicontent; -my $maxAlnScore = 0; - -# create weight_dir in oneseq's home dir (used for annotations,weighting,feature extraction) -# get annotations for seed sequence if fas support is on -if ($fas_support){ - if (!$weightPath) { - createWeightFolder(); - } - getAnnotation($outputFa); -} - -my $coreStTime = gettime(); #time; -#core-ortholog search -if (!$coreex) { - print "\nCore compiling...\n"; - $coremode = 1; - $taxaPath = $blastPath; - #### moved from above - my $taxBuildSt = gettime(); - unless ($silent) { - print "Building up the taxonomy tree...\n"; - } - push @logOUT, "Building up the taxonomy tree...\n"; - $tree = getRefTree(); - $treeDelFlag = 0; - if($group) { - foreach($tree->get_nodes()) { - if($_->id == $groupNode->id) { - $groupNode = $_; - } - } - $tree->set_root_node($groupNode); - } - unless ($silent) { - print "Finished building the taxonomy tree in ". roundtime(gettime() - $taxBuildSt) ." sec\n"; - } - push @logOUT, "Finished building the taxonomy tree in ". roundtime(gettime() - $taxBuildSt) ." 
sec\n"; - ## Tree without deletions - $wholeTree = getRefTree(); - if($group) { - foreach($wholeTree->get_nodes()) { - if($_->id == $groupNode->id) { - $groupNode = $_; - } - } - $wholeTree->set_root_node($groupNode); - } - ## initialise control nodes - $currentDistNode = $wholeTree->find_node(-ncbi_taxid => $refTaxa{$refSpec}); - $currentNoRankDistNode = $currentDistNode->ancestor; ## the node from which the distance to other species will be calculated - $currentChildsToIgnoreNode = $currentDistNode; ## the node containing all child species which will not be included in the candidates file - - %hashTree = buildHashTree(); - removeMaxDist(); - printDebug("Subroutine call removeMinDist\nRefspec is $refSpec\nTaxon is $refTaxa{$refSpec}\n"); - $treeDelFlag = removeMinDist($refTaxa{$refSpec}); - #### end moved from above - - if ($ignoreDistance){ - $distDeviation = 0; - $breakAfter = -1; - } - - ## some variables used later - my $firstRun = 1; - - while (get_leaves($tree, $treeDelFlag) > 0 && $curCoreOrthologs < $minCoreOrthologs && $noMoreOrthologs == 0) { - - # checking the tree which determines the taxa that are going to be searched for hits - # printDebug("Subroutine call from core-ortholog compilation\nNumber of leaves is ".get_leaves($tree)."\nCurrent core-orthologs: $curCoreOrthologs\nVar \$noMoreOrthologs is set to $noMoreOrthologs\n"); - if ($debug){ - print "\nTaxonomic Tree as text:\n"; - my $tree_as_string = $tree->as_text("tabtree"); - print $tree_as_string; - print "\n"; - } - - #generate new aln - if($curCoreOrthologs > 0) { - createAlnMsf(); - } - - unless ($silent) { - print "In round $curCoreOrthologs running hmmbuild on $outputAln\n"; - } - hmmbuild($coreOrthologsPath.$seqName."/hmm_dir/".$seqName.".hmm", $outputAln); - - ## get the max alignment score per ortholog - printDebug("Discovering maximum alignmentscore"); - - ## Align every current core ortholog against all curretn core orthologs - ## the maximum found in this alignment is the maximun any 
other sequence can reach - copy($outputFa, $outputFa . ".extended") or die "Error, could not copy to file: ". "$outputFa" . ".extended\n"; - - ## get the max alnscore - my %maxAlnScores = getCumulativeAlnScores(); - foreach my $score (values %maxAlnScores){ - if ($score > $maxAlnScore){ - $maxAlnScore = $score; - } - } - printDebug("The maximum alignmentscore is: $maxAlnScore"); - if ($maxAlnScore == 0) { - die("Maximum alignment score is Zero! Something went wrong with fasta36 functions!\n") - } - clearTmpFiles(); - - my $addedTaxon = getBestOrtholog(); - my $addedTaxonName = getTaxonName($addedTaxon); - print "Added TAXON: $addedTaxon\t$addedTaxonName\n"; - #if a new core ortholog was found - if($addedTaxon ne "") { - $hamstrSpecies = $hamstrSpecies . "," . $addedTaxon; - - clearTmpFiles(); - - ++$curCoreOrthologs; - printDebug("Subroutine call from core-ortholog compilation\nTaxon is $addedTaxon\nNCBI Id is $refTaxa{$addedTaxon}\n"); - $treeDelFlag = removeMinDist($refTaxa{$addedTaxon}); - } - else { - #there are no more core orthologs - $noMoreOrthologs = 1; - print "\nThe desired number of core orthologs could not be reached.\n"; - } - } - - ## This is now the final round of alignment and profile hidden Markov model building - ## It concludes the core ortholog set compilation - if ($curCoreOrthologs < $minCoreOrthologs ){ - print "\nWARNING: The desired number of core orthologs could not be reached. Training with only $curCoreOrthologs sequences\n"; - } - createAlnMsf(); - hmmbuild($coreOrthologsPath.$seqName."/hmm_dir/".$seqName.".hmm", $outputAln); -} -print "==> Core set compilation finished in " . roundtime(gettime() - $coreStTime). " sec!\n"; -push @logOUT, "Core set compilation finished in " . roundtime(gettime() - $coreStTime). " sec!"; - -#after having calculated the core orthologous set, -#start fdog to find all orthologs -# my $finalOutput = $outputPath . '/' . $seqName . 
'.extended.fa'; -my $orthoStTime = gettime(); -if (!$coreOnly) { - $coremode = 0; - push @logOUT, "Performing the final ortholog search..."; - print "\nPerforming the final ortholog search...\n"; - my $startTmp = gettime(); - #using $eval_relaxfac to relax the evalues for final search - my $final_eval_blast = $eval_blast*$eval_relaxfac; - my $final_eval_hmmer = $eval_hmmer*$eval_relaxfac; - - $taxaPath = $genome_dir; - my @searchTaxa; - unless ($searchTaxa) { - unless($groupNode) { - @searchTaxa = keys %taxa; - } else { - # %taxa = getTaxa(); - # print "GET TAXA TIME: ", roundtime(gettime() - $startTmp),"\n"; - my $tree = getTree(); - # print "GET TREE TIME: ", roundtime(gettime() - $startTmp),"\n"; - if($groupNode) { - foreach($tree->get_nodes()) { - if($_->id == $groupNode->id) { - $groupNode = $_; - } - } - $tree->set_root_node($groupNode); - } - foreach (get_leaves($tree)) { - push(@searchTaxa, @{$_->name('supplied')}[0]); - } - } - } else { - open(SEARCH, $searchTaxa) || die "Cannot open $searchTaxa file!\n"; - @searchTaxa = ; - close (SEARCH); - } - # print "PREPARE TIME: ", roundtime(gettime() - $startTmp),"\n"; - - my $pm = new Parallel::ForkManager($cpu); - if ($hyperthread) { - $pm = new Parallel::ForkManager($cpu*2); - } - - foreach (sort @searchTaxa) { - chomp(my $searchTaxon = $_); - my $pid = $pm->start and next; - if ($coreex) { - $db = Bio::DB::Taxonomy->new(-source => 'flatfile', - -nodesfile => $idx_dir . 'nodes.dmp', - -namesfile => $idx_dir . 
'names.dmp', - -directory => $idx_dir); - $db_bkp = $db; - } - my $searchTaxonName = getTaxonName($searchTaxon); - if (defined($searchTaxonName)) { - unless ($silent) { - print $searchTaxon, "\t", $searchTaxonName, "\n"; - } else { - unless ($searchTaxonName eq "Unk") { - print $searchTaxonName, "\n"; - } else { - print $searchTaxon, "\n"; - } - } - } - runHamstr($searchTaxon, $seqName, $finalOutput, $refSpec, $hitlimit, $representative, $strict, $coremode, $final_eval_blast, $final_eval_hmmer, $aln, 0); - $pm->finish; - } - $pm->wait_all_children; - - ### join result files - unless (-e $finalOutput) { - open(EXTENDEDFA, ">$finalOutput") or die "Cannot create $finalOutput\n"; - } else { - open(EXTENDEDFA, ">>$finalOutput") or die "Cannot create $finalOutput\n"; - } - opendir(my $dh, $outputPath) || die "Cannot open $outputPath: $!"; - while (readdir $dh) { - if ($_ =~ /hamstrsearch_(.)+_$seqName(\.strict)*\.out$/) { - open(RESULT, "<$outputPath/$_") or warn "Cannot open $outputPath/$_!"; - while (my $line = ) { - chomp $line; - my @tmp = split(/\|/, $line); - my $seq = pop(@tmp); - splice(@tmp, 1, 1); - my $id = join("|", @tmp); - print EXTENDEDFA ">$id\n$seq\n"; - } - close(RESULT); - unlink("$outputPath/$_") or warn "Cannot delete $outputPath/$_!" - } - } - closedir $dh; - close(EXTENDEDFA); -} -### remove duplicated seq in extended.fa -if (-e $finalOutput) { - addSeedSeq($seqId, $seqName, $coreOrthologsPath, $refSpec, $finalOutput); -} -### remove duplicated seq in extended.fa -if (-e $finalOutput) { - addSeedSeq($seqId, $seqName, $coreOrthologsPath, $refSpec, $finalOutput); -} -push @logOUT, "Ortholog search completed in ". roundtime(gettime() - $orthoStTime) ." sec!"; -print "==> Ortholog search completed in ". roundtime(gettime() - $orthoStTime) ." 
sec!\n"; - -## Evaluation of all orthologs that are predicted by the final run -if(!$coreOnly){ - my $fasStTime = gettime(); - my $processID = $$; - - # check if final extended.fa exists - unless (-e $finalOutput) { - die "ERROR: Could not find $finalOutput\n"; - } - # check and add seed to final extended.fa if needed - addSeedSeq($seqId, $seqName, $coreOrthologsPath, $refSpec, $finalOutput); - - # calculate FAS scores for final extended.fa - if ($fas_support) { - print "Starting the feature architecture similarity score computation...\n"; - my $fdogFAScmd = "$fdogFAS_prog -i $finalOutput -w $weightPath -t $tmpdir -o $outputPath --cores $cpu --redo_anno"; - unless ($countercheck) { - $fdogFAScmd .= " --bidirectional" - } - system($fdogFAScmd) - # print $fdogFAScmd,"\n"; - } else { - fasta2profile($finalOutput, $seqName) - } - push @logOUT, "FAS calculation completed in " . roundtime(gettime() - $fasStTime). " sec!\n"; - print "==> FAS calculation completed in " . roundtime(gettime() - $fasStTime). " sec!\n"; - if($autoclean){ - print "Cleaning up...\n"; - runAutoCleanUp($processID); - } -} - -## Delete tmp folder -unless ($debug) { - my $delTmp = "rm -rf $tmpdir"; - system ($delTmp) == 0 or die "Error deleting tmp files in $tmpdir\n"; - my $delcommandTmp = "rm -rf $outputPath/tmp"; - system ($delcommandTmp) == 0 or die "Error deleting tmp files in $outputPath/tmp\n"; -} -print "==> fdog finished after " . roundtime(gettime() - $startTime) . " sec!\n"; -push @logOUT, "fdog finished after " . roundtime(gettime() - $startTime) . 
" sec!\n"; - -#### writing the log -open (LOGOUT, ">>$outputPath/fdog.log") or die "Could not open $outputPath/fdog.log for writing\n"; -print LOGOUT "\n\n"; -my $fdogVersion = `fdog.run --version`; -print LOGOUT "fDOG v$fdogVersion\n"; -print LOGOUT join "\n", @logOUT; -close LOGOUT; -exit; - - -######################## SUBROUTINES ######################## - -################################# -## Clears Temporary files -sub clearTmpFiles { - #clear temporary result file - if(-e $outputFa.".extended") { - unlink($outputFa.".extended"); - } - - #clear all alignment files - my @scorefiles = glob("*.scorefile"); - foreach my $file (@scorefiles) { - unlink($file); - } - my @fastaInfiles = glob("*_fasta36.fa"); - foreach my $file (@fastaInfiles) { - unlink($file); - } -} - -sub getCandicontent{ - my %candicontent; - my $candidatesFile = $outputFa . ".extended"; - if (-e $candidatesFile) { - - ######################## - ## step: 2 - ## setup - ## candidates to hash - ## %candicontent keeps info about all candidates (header and sequence) - open (CANDI, "<".$candidatesFile) or die "Error: Could not find $candidatesFile\n"; - my $head; - %candicontent = (); - while(){ - my $line = $_; - chomp($line); - if ($line =~ m/^>/){ - $line =~ s/>//; # clip '>' character - $head = $line; - }else{ - $candicontent{$head} = $line; - } - } - close (CANDI); - } - return %candicontent; -} - -################################# -## Get the alinment score for the current candidate file -## only works for files holding only one candidate -sub getCumulativeAlnScores{ - chdir($coreOrthologsPath . $seqName); - my $candidatesFile = $outputFa . ".extended"; - my $fileId = $$; - my $scorefile = $fileId . ".scorefile"; - my $fasta36file1 = $fileId . ".1_fasta36.fa"; - my $fasta36file2 = $fileId . ".2_fasta36.fa"; - my %scores; - - ######################## - ## step: 1 - ## set alignment parameters for fasta36 - my $fasta36cmd = $fasta36file1 . "\" \"" . $fasta36file2 . "\" -s " . 
$alignmentscoreMatrix . " -m 9 -d 0 -z -1 -E 100" . " > " . $scorefile; - - ######################## - ## step: 2 - ## candidates to hash - ## %candicontent keeps info about all candidates (header and sequence) - my %candicontent = getCandicontent(); - - ######################## - ## step: 3 - ## get alignment scores - chdir($coreOrthologsPath . $seqName); - symlink($outputFa, $fasta36file1); - symlink($candidatesFile, $fasta36file2); - if ($glocal){ - #glocal global:local glsearch36 Needleman-Wunsch - my $globlocCommand = "$glocalaligner \"" . $fasta36cmd; - printDebug($globlocCommand); - # print $globlocCommand,"\n";<>; - system($globlocCommand); - }elsif ($global){ - #global global:global ggsearch36 Needleman-Wunsch - my $globglobCommand = "$globalaligner \"" . $fasta36cmd; - printDebug($globglobCommand); - # print $globglobCommand,"\n";<>; - system($globglobCommand); - }elsif ($local){ - #local local:local ssearch36 Smith-Waterman - my $loclocCommand = "$localaligner \"" . $fasta36cmd; - printDebug($loclocCommand); - # print $loclocCommand,"\n";<>; - system($loclocCommand); - } - ######################## - ## step: 4 - ## collect alignment score - ## keep track about min and max for each query/coreortholog vs candidate set - my $max = -10000000; - my $min = 10000000; - - %scores = cumulativeAlnScore($scorefile, \%candicontent); - return %scores; -} - -################################# -## Get the alinment scores for the current candidate file -sub getAlnScores{ - chdir($coreOrthologsPath . 
$seqName); - my %scores = getCumulativeAlnScores(); - ## Normalize Alignment scores (unity-based) - printDebug("Normalize alignment scores:\n"); - foreach my $key (keys %scores){ - my $score = $scores{$key}; - unless ($silent) { - print "Cumulative alignmentscore is: $score\n"; - } - $scores{$key} = $scores{$key} / $maxAlnScore; - $score = $scores{$key}; - unless ($silent) { - print "Normalised alignmentscore is: $score\n"; - } - } - return %scores; -} - -################################# -## Get the fas scores for the current candidate file -sub getFasScore{ - printDebug("Changing to $coreOrthologsPath$seqName", "Candidate file is $outputFa".'.extended'); - chdir($coreOrthologsPath . $seqName); - my %fas_box; - my $scorefile = $$ . ".scorefile"; - my $rankscore; - - ######################## - ## step: 1 - ## setup - ## candidates to hash - ## %candicontent keeps info about all candidates (header and sequence) - my %candicontent = getCandicontent(); - - ######################## - ## step: 2 - ## get FAS score - ## fas support: on/off - my @candidateIds = keys(%candicontent); - if ($fas_support){ - my ($name,$gene_set,$gene_id,$rep_id) = split(/\|/, $candidateIds[0]); - unless (-e "$weightPath/$gene_set.json") { - print "ERROR: $weightPath/$gene_set.json not found! 
FAS Score will be set as zero.\n"; - $fas_box{$candidateIds[0]} = 0.0; - } else { - my $lnCmd = "ln -fs $weightPath/$gene_set.json \"$coreOrthologsPath$seqName/fas_dir/annotation_dir/\""; - system($lnCmd); - my $fasOutTmp = `$fas_prog -s \"$coreOrthologsPath$seqName/$seqName.fa\" -q $blastPath/$gene_set/$gene_set.fa --query_id \"$gene_id\" -a \"$coreOrthologsPath$seqName/fas_dir/annotation_dir/\" -o \"$coreOrthologsPath$seqName/fas_dir/annotation_dir/\" --raw --tsv --domain --cpus 1 | grep "#" | cut -f 3,4`; - my @fasOutTmp = split(/\t/,$fasOutTmp); - $fas_box{$candidateIds[0]} = $fasOutTmp[1]; - } - } else { - $fas_box{$candidateIds[0]} = 1; - } - return %fas_box; -} - -################################# -## Add fas and alignment score together while using specified filters (coreFilter option) -sub getFilteredRankScore{ - my $alnScore = $_[0]; - my $fasScore = $_[1]; - my $rankscore = 0; - # $rankscore: keeps alignment and fas score, decider about $bestTaxon - if ($core_filter_mode){ - if ($core_filter_mode eq "strict"){ - # case 1: filter - if ($fasScore < $fas_T){ - #eliminate candidate $key - print "Deleting candidate from list due to insufficient FAS score.\n"; - $rankscore = 0; - }else{ - #keep - if ($alnScore){ - $rankscore = $fasScore + $alnScore; - }else{ - $rankscore = $fasScore; - } - } - }elsif ($core_filter_mode eq "relaxed"){ - # case 2: disadvantage - if ($fasScore < $fas_T){ - # ignore FAS score for rankscore - printDebug("Candidate will be disadvantaged.\n"); - if ($alnScore){ - $rankscore = $alnScore; - }else{ - $rankscore = 0; - } - } - else{ - #keep - if ($alnScore){ - $rankscore = $fasScore + $alnScore; - }else{ - $rankscore = $fasScore; - } - } - } - }else{ - # case 3: no filter - if($fasScore) { - if ($alnScore){ - $rankscore = $fasScore + $alnScore; - }else{ - $rankscore = $fasScore; - } - } - } - return $rankscore; -} - -sub getHeaderSeq{ - my $bestTaxon = $_[0]; - open (EXTFA, $outputFa.".extended"); - my $sequenceLine = 0; - my 
$bestSequence = ""; - - ######################## - ## step: 7 - ## get best sequence from candidate file - ## (will be added to the model) - while() { - my $line = $_; - chomp($line); - if($sequenceLine == 1) { - $bestSequence = $line; - $sequenceLine = -1; - } - - if($line eq $bestTaxon) { - $sequenceLine = 1; - } - } - close EXTFA; - my @best = split '\|', $bestTaxon; - my $header = $best[0] . '|' . $best[1] . '|' . $best[2]; - return ($header, $bestSequence); -} - -## create profile from extended.fa -sub fasta2profile{ - my ($file, $out) = ($_[0], $_[1]); - my ($fO_base, $fO_path, $fO_suffix) = fileparse( $file, qr/\.[^.]*/ ); - my $outFile = $fO_path.$out.".phyloprofile"; - open(FA, $file); - open(PPOUT, ">$outFile"); - print PPOUT "geneID\tncbiID\torthoID\n"; - foreach my $line() { - if ($line =~ /^>/) { - chomp($line); # test|ANOGA@7165@1|Q7Q3C2|1 - $line =~ s/>//; - my @lineTMP = split(/\|/, $line); - my $geneID = $lineTMP[0]; - my @orthoTMP = split(/@/, $lineTMP[1]); - my $ncbiID = "ncbi".$orthoTMP[1]; - print PPOUT $geneID, "\t", $ncbiID, "\t", $line,"\n"; - } - } - close(FA); - close(PPOUT); -} - -## auto clean up can be invoked via the "-cleanup" option -# $processID: given process ID -sub runAutoCleanUp { - my $processID = $_[0]; - unless ($silent) { - print "Deleting $outputPath/tmp\n"; - } - my $delCommandTmp = "rm -rf \"$outputPath/tmp\""; - system ($delCommandTmp) == 0 or die "Error deleting result files\n"; - my $seedName = $seqName . '_seed'; - my $annopath = $coreOrthologsPath.$seqName."/fas_dir/annotation_dir"; - if( -l "$currDir/$seqFile" ) { - my $delLnSeedFile = "rm $currDir/$seqFile"; - system ($delLnSeedFile); - } - unless ($silent) { - print "Deleting $annopath\n"; - } - if (!$fasoff) { - opendir(ANNODIR, $annopath) or warn "Could not open $annopath in sub runAutoCleanup\n"; - my @annodirs = grep (!/$seedName/, readdir(ANNODIR)); - unless ($silent) { - print scalar(@annodirs) . 
" content of $annopath\n"; - } - for (my $anno = 0; $anno < @annodirs; $anno++){ - if ($annodirs[$anno] ne '..' and $annodirs[$anno] ne '.' and $annodirs[$anno] ne $seqName.".json") { - unless ($silent) { - print "Deleting $annopath/$annodirs[$anno]\n"; - } - rmtree ($annopath."/".$annodirs[$anno]); - } - } - closedir (ANNODIR); - } -} - -## starting annotation_prog for given seed sequence file -# $seedseqFile: fasta file with seed sequence -sub getAnnotation { - my ($seedseqFile) = ($_[0]); - my $inputAnno = $seedseqFile; - $inputAnno =~ s/\|/\\\|/g; - my $outputAnno = $coreOrthologsPath . $seqName . "/fas_dir/annotation_dir"; - $outputAnno =~ s/\|/\\\|/g; - my $annotationCommand = "$annotation_prog" . " -i $inputAnno" . " -o $outputAnno --cpus 1" . " --name \"$seqName\""; #" --name " . $seqName . "_seed" . " --cpus 1"; - system($annotationCommand); -} - -## determines the reference species and/or the sequence id of the input sequence. -sub determineRef { - my ($infile, @refspec) = @_; - #run blast for all available proteomes if the given sequence is not in the database - unless ($silent) { - print "One moment please!\nLooking for the most similar sequence to your input sequence.\n"; - } - my $bestHit->{score} = 1; - $bestHit->{evalue} = 10; - my $outname = $$; - ## Baustelle: Currently, we need to loop through all possible taxa to id the best matching one - for (my $i = 0; $i < scalar(@refspec); $i++) { - my $curTaxon = $refspec[$i]; - ## run the blast search - printDebug("running blast on $curTaxon"); - my $resultFile = runBlast($seqFile, $dataDir, $outname, $tmpdir, "$blastPath/$curTaxon/$curTaxon"); - my $hits = &getBestBlasthit($tmpdir, $resultFile); - if (defined $hits and @$hits > 0) { - #only use the best hit with the index [0]. Note, $hits is an array ref of hashrefs. 
- if($hits->[0]->{score} > $bestHit->{score}) { - $bestHit->{name} = $hits->[0]->{name}; - $bestHit->{score} = $hits->[0]->{score}; - $bestHit->{evalue} = $hits->[0]->{evalue}; - $bestHit->{species} = $curTaxon; - } - } - } - return($bestHit); -} - -sub checkGroup { - my $group = shift; - my $node = $db->get_taxon(-name => $group); - if($node) { - $groupNode = $node; - } - else { - print "Your selected group " . $group . " was not found in the taxonomic tree... TERMINATING\n"; - exit; - } -} - -################################# -sub checkOptions { - if($eval_relaxfac < 1){ - # rethink - if($eval_relaxfac <= 0){ - printOut("\nThe specified factor for evalue relaxation is <= 0. Please see the help text for option -eval_relaxfac. We recommend a factor > 1. Default is 10.\n",1); - my $answer = ''; - my $breaker = 0; - while (($answer !~ /[0-9]/i) and ($breaker < 4)) { - $breaker++; - $answer = getInput("Please choose a new factor (Integer) for evalue relaxation. [1,100]"); - if (($breaker > 3) and ($answer !~ /[0-9]/i)){ - print "No proper factor given ... exiting.\n"; - exit; - } - } - if ($answer =~ /[0-9]/i) { - $eval_relaxfac = $answer; - } - } - } - ### check for colision of force and append. Change in favor of append - if ($force == 1 and $append ==1) { - $force = 0; - } - ### check the presence of the pre-computed core set - if ($coreex) { - if (! -e "$coreOrthologsPath/$seqName/$seqName.fa") { - print "You selected the option -reuseCore, but the core ortholog group $coreOrthologsPath/$seqName/hmm_dir/$seqName.hmm does not exist\n"; - exit; - } - } - ### begin move up - ### checking reference species - my $optbreaker = 0; - while ((!$refSpec or !$refTaxa{$refSpec}) && !$blast) { - if ($optbreaker >= 3){ - print "No proper refspec given ... exiting.\n"; - exit; - } - my $output = ''; - for (my $i = 0; $i < @refTaxonlist; $i++) { - $output = $output . "[$i]" . "\t" . $refTaxonlist[$i] . "\n"; - } - ### for debug? 
- # for (keys %taxa){ - # print "value of $_ is \'$taxa{$_}\'"; - # } - # printDebug("taxa contains $taxa{$refSpec}"); # cannot print this if $taxa{$refSpec} not exists! - my $refSpecIdx = getInput("\n" . $output . "\n" . "You did not provide a valid reference species ($refSpec). Please choose the number for the reference species your input sequence is derived from", 1); - $optbreaker++; - $refSpec = $refTaxonlist[$refSpecIdx]; - checkBlastDb($refSpec, $refSpec); - } - ### end move up - ### adding new routine to generate the input sequence if -reuseCore has been set - if ($coreex) { - my @refseq=`$grepprog -A 1 ">$seqName|$refSpec" $coreOrthologsPath/$seqName/$seqName.fa | grep -v "^\-\-\$"`; - chomp @refseq; - unless ($silent) { - print "$refseq[0]\n"; - } - (my $tmp1, my $tmp2, $seqId) = split '\|', $refseq[0]; - if (length($seqId) == 0){ - die "error in retrieving sequence while using -reuseCore\n"; - } - print "overruling the provided seed sequence since you used the option -reuseCore. Setting seed id to $seqId\n"; - open OUT, (">$currDir/$seqName.fa") or die "could not open $currDir/$seqFile for writing in retrieve refseq\n"; - print OUT join "\n", @refseq; - close OUT; - $seqFile = "$seqName.fa"; - } - ### end mod - ### check input file - $optbreaker = 0; - while ((length $seqFile == 0) or ((! -e "$currDir/$seqFile") and (! -e "$dataDir/$seqFile"))) { - if ($optbreaker >= 3){ - print "No proper file given ... exiting.\n"; - exit; - } - if (length $seqFile > 0){ - if (-e $seqFile) { - my @seqFileTMP = split(/\//, $seqFile); - unless (-e "$currDir/$seqFileTMP[@seqFileTMP-1]") { - system("ln -fs \"$seqFile\" \"$currDir/$seqFileTMP[@seqFileTMP-1]\""); - } - $seqFile = $seqFileTMP[@seqFileTMP-1]; - } else { - printOut("\nThe specified file $seqFile does not exist!\n",1); - } - } - } - if (-e "$currDir/$seqFile"){ - $dataDir = $currDir; - printDebug("Setting datadir to $currDir in sub checkOptions"); - } - - ### checking the number of core orthologs. 
Omit this check if the option -reuseCore has been selected - $optbreaker = 0; - while(!$minCoreOrthologs and !$coreex) { - if ($optbreaker >= 3){ - print "No proper number given ... exiting.\n"; - exit; - } - $minCoreOrthologs = getInput("Please specify the desired number of core orthologs!", 1); - $minCoreOrthologs = checkInt($minCoreOrthologs); - $optbreaker++; - } - ## check for blast filter - if ($blast_prog ne 'blastall'){ - $filter = 'yes' if $filter eq 'T'; - $filter = 'no' if $filter eq 'F'; - } - - $inputSeq = fetchSequence($seqFile, $dataDir); - - ## the user has not provided a sequence id, however, the refspec is determined. - if($seqId eq '') { - my $besthit; - if (!$blast){ - ## a refspec has been determined - #use blast to search in the proteome of the specified reference species for the input sequence - #in order to obtain a valid sequence id - $besthit = determineRef($seqFile, ($refSpec)); - } - else { - $besthit = determineRef($seqFile, @refTaxonlist); - } - $seqId = $besthit->{name}; - $refSpec = $besthit->{species}; - my $details = "Evalue: " . $besthit->{evalue}; - printOut("Seq id has been determined as $seqId in $refSpec with $details", 2); - # if(length("$seqName|$refSpec|$seqId") > 60) { - # die "Output file will have header longer than 60 characters ($seqName|$refSpec|$seqId). Please consider shorten the sequence IDs! More at https://github.com/BIONF/fDOG/wiki/Check-data-validity\n"; - # } - if($seqId eq '') { - print "There was no significant hit for your sequence in " . $refSpec . ".\nPlease specify a sequence id on your own.\n"; - exit; - } - } - - if($coreTaxa) { - if(! 
-e $coreTaxa) { - print "Please specify a valid file with taxa for the core orthologs search\n"; - exit; - } - my @userTaxa = parseTaxaFile($coreTaxa); - my %newTaxa = (); - foreach (@userTaxa) { - $newTaxa{$_} = $taxa{$_}; - } - $newTaxa{$refSpec} = $refTaxa{$refSpec}; - %refTaxa = %newTaxa; - } - - if($group) { - checkGroup($group); - } - - if(!$seqName) { - my $i = 0; - while($i < 7) { - my $j = chr(int(rand(127))); - if($j =~ /[a-zA-Z]/) { - $seqName .=$j; - $i++; - } - } - print "Your sequence was named: " . $seqName . "\n\n"; - } - $outputPath = $outputPath . "/$seqName"; - if (! -d "$outputPath"){ - mkdir "$outputPath", 0777 or die "could not create the output directory $outputPath"; - } - ## check whether a result file already exists: - $finalOutput = $outputPath . '/' . $seqName . '.extended.fa'; - if ($outputPath && -e "$finalOutput"){ - ## an ouput file is already existing - if (!$force && !$append){ - ## The user was not aware of an existing output file. Let's ask him - my $input = ''; - my $breaker = 0; - - while (($input !~ /^[aor]/i) and ($breaker < 4)) { - $breaker++; - die "\nAn outputfile $finalOutput already exists. 
Please consider option --force for overwriting it or option --append for appending to it.\n" - } - } - if ($force){ - ## the user wants to overwrite - printOut("Removing existing output directory $outputPath", 1); - rmtree ([ "$outputPath" ]) or die "could not remove existing output directory $outputPath\n"; - mkdir $outputPath or die "could not re-create the output directory $outputPath\n"; - } - elsif ($append) { - if (-e "$outputPath/$seqName.extended.fa") { - ## read in the content for latter appending - printOut("Appending output to $outputPath/$seqName.extended.fa", 1); - open (IN, "<$outputPath/$seqName.extended.fa") or die "failed to open $outputPath/$seqName.extended.fa after selection of option -append\n"; - while () { - my $line = $_; - if ($line =~ /\|/) { - chomp $line; - my @keys = split '\|', $line; - $profile{$keys[1]} = 1; - } - } - } - elsif ($fasoff) { - ## no extended.profile file exists but not necessary, because user switched off FAS support -> do nothing - } - else { - printOut("Option --append was selected, but the existing output was incomplete. Please restart with the --force option to overwrite the output", 1); - exit; - } - } - else { - printOut("Renaming existing output file to $finalOutput.old", 2); - my $bu_dir = $outputPath.'_bkp'; - !`mv $outputPath $bu_dir` or die "Could not rename existing output file $outputPath to $bu_dir\n"; - mkdir $outputPath or die "could not recreate $outputPath after renaming the old output\n" - } - } - - #### checking for the min and max distance for the core set compilation - #### omit this check, if the option reuseCore has been selected (added 2019-02-04) - $optbreaker = 0; - if (!$coreex) { - my $node; - $node = $db->get_taxon(-taxonid => $refTaxa{$refSpec}); - $node->name('supplied', $refSpec); - if (lc($maxDist) eq "root"){ - $maxDist = 'no rank'; - } - while (!$maxDist or (checkRank($maxDist, $node) == 0)) { - if ($optbreaker >= 3){ - print "No proper maxDist given ... 
exiting.\n"; - exit; - } - print "You have not defined a valid maximum distance rank!\n"; - printTaxonomy($node); - my $in = getInput('Please choose a rank by giving the number in square brackets', 1); - $optbreaker++; - $maxDist = parseInput($node, $in); - print "You selected ". $maxDist . " as maximum rank\n\n"; - } - while (!$minDist or (checkRank($minDist, $node) == 0)) { - if ($optbreaker >= 3){ - print "No proper minDist given ... exiting.\n"; - exit; - } - print "You have not defined a minimum distant rank!\n"; - printTaxonomy($node); - my $in = getInput('Please choose a rank by giving the number in square brackets', 1); - $optbreaker++; - $minDist = parseInput($node, $in); - print "You selected " . $minDist . " as minimum rank\n\n"; - } - } - $optbreaker = 0; - - #### checking in fas options - if($fasoff){ - print "You have turned FAS support off. Candidate orthologs will not be evaluated by their FAS.\n"; - # turn FAS support off - $fas_support = 0; - } - ## check if user defined fas_T is off limits - if ($fas_T < 0 or $fas_T > 1){ - print "You chose an odd FAS score filter (-minScore), default is 0.75.\n"; - my $answer = ''; - $optbreaker = 0; - while ($answer < 0 or $answer > 1) { - if ($optbreaker >= 3){ - print "No proper fas filter given ... exiting.\n"; - exit; - } - $answer = getInput("Please choose a FAS score filter [0,1] between 0 (relaxed) and 1 (stringent):"); - $optbreaker++; - } - if ($answer > 0 and $answer < 1) { - $fas_T = $answer; - } - } - ### rather strict fas filter for core orthologs: OFF - if(!$core_filter_mode){ - unless ($silent) { - print "No FAS filter for core-orthologs set.\n"; - } - }elsif($core_filter_mode eq "relaxed"){ - #core ortholog candidates with a FAS score below the threshold will be disadvantaged - }elsif($core_filter_mode eq "strict"){ - #core ortholog candidates with a FAS score below the threshold will not be considered any more - }else{ - print "No known filter mode for core-orthologs specified. 
Continuing with default settings\n"; - $core_filter_mode = 0; - } - - ### check alignment strategy - if (($local && $global) or ($local && $glocal) or ($global && $glocal)){ - print "Please specify only one alignment strategy!\n"; - print "Possible options are: -glocal, -local, or -global\n"; - print "... exiting.\n"; - exit; - }elsif(!$local && !$global && !$glocal){ - unless ($silent) { - print "No specific alignment strategy set. Continuing with local alignments (Smith-Waterman-Algorithm).\n"; - } - $local = 1; - } -} - -####################### sub check the systematic rank -sub checkRank { - my $rank = $_[0]; - my $node = $_[1]; - my $rankExists = 0; - while($node->ancestor && $rankExists == 0) { - if($node->rank eq $rank) { - $rankExists = 1; - } - $node = $node->ancestor; - } - - if($node->rank eq $rank) { - $rankExists = 1; - } - - return $rankExists; -} - -############ -## modified by Ingo - Added Option to run Muscle -sub createAlnMsf { - my $linsiCommand = ''; - if (!defined $aln or $aln eq 'mafft-linsi') { - my $linsiCommand = "mafft --maxiterate 1000 --localpair --anysymbol --quiet \"" . $outputFa . "\" > \"" . $outputAln . "\""; - } - elsif ($aln eq 'muscle') { - $linsiCommand = "muscle -quiet -in \"" . $outputFa . "\" -out \"" .$outputAln. "\""; - } - else { - die "issues with the msa. You need to select either mafft or muscle\n"; - } - system($linsiCommand) == 0 or die "Could not run alignment\n$linsiCommand\n"; -} - -################ creating folders for fas support usage -sub createWeightFolder{ - #create weight_dir in hamstr1seq home dir - my $weightdir = $path."/"."weight_dir"; - mkdir "$weightdir", 0777 unless -d "$weightdir"; -} - -################ -sub createFoldersAndFiles { - my ($outputFa, $seqName, $inputSeq, $refSpec) = (@_); - #create core orthologs directory - my $dir = $coreOrthologsPath . $seqName; - if (!$coreex){ - mkdir "$dir", 0755 unless -d "$dir"; - my $header = $seqName . "|" . $refSpec . "|" . 
$seqId; - - #create FA file - open (OUTPUT, ">$outputFa") or die "Error creating fa file $outputFa\n"; - print OUTPUT ">" . $header . "\n"; - print OUTPUT $inputSeq; - close OUTPUT; - - #create the Aln file initially only with a single species in there - open (OUTPUT, ">$outputAln") or die "Error creating fa file $outputAln\n"; - print OUTPUT ">" . $header . "\n"; - print OUTPUT $inputSeq; - close OUTPUT; - - #create the folder for the hmm output - my $hmmdir = $dir . "/hmm_dir"; - mkdir "$hmmdir", 0755 unless -d "$hmmdir"; - } - #create the fas_dir for core orthologs if fas support is ON - if ($fas_support){ - my $fasdir = $dir. "/fas_dir"; - mkdir "$fasdir", 0777 unless -d "$fasdir"; - - my $annodir = $fasdir."/annotation_dir"; - mkdir "$annodir", 0777 unless -d "$annodir"; - } -} -################# -sub fetchSequence { - my ($file, $filepath) = @_; - if (! defined $filepath){ - $filepath = '.'; - } - my $seq = ""; - open (INPUT, "<$filepath/$file") or die print "Error opening seq file\n"; - while() { - my $line = $_; - chomp($line); - unless($line =~ /^\>.*/) { - $seq = $seq . 
$line; - } - } - close INPUT; - $seq =~ s/\s*//g; - unless ($silent) { - printOut($seq, 2); - } - return $seq; -} -################################# -## choose the ortholog which reaches the highest score -sub getBestOrtholog { - ## max possible score is either one or two - my $maxScore = 1; - if ($fas_support){ - $maxScore += 1; - } - - ## get leavs to evaluate - my @leaves = get_leaves($tree, $treeDelFlag); - ## sort by distance in taxonomy tree - if (!$ignoreDistance){ - @leaves = sort_leaves(@leaves); - } - ## don't sort by distance - else{ - my @unsortedLeaves = @leaves; - @leaves = qw(); - push @leaves, \@unsortedLeaves; - } - - ## create needed variables - my $bestTaxon = ''; - my $rankScore = 0; - my $header = ''; - my $seq = ''; - my $newNoRankDistNode; ## this will be the new Distance node, after a new candidate has been choosen - my $newChildsToIgnoreNode; ## all leaves under this node will be ignored in future runs, after a new candidate has been choosen - my $sufficientlyClose = 0; ## flag to break outer loop - my $candidatesFile = $outputFa . ".extended"; - - ## iterate over each array with leaves of same distance - foreach my $array (@leaves) { - ## break loop if a candidate was close to the max score and no more candidates remain with the same distance - if ($sufficientlyClose){ - unless ($silent) { - print "Best Taxon is sufficiently close to max score and no more candidates with same distance remain.\nStopping evaluation.\n"; - } - last; - } - ## iterate over each leaf with the same distance - foreach my $key (@$array){ - my $keyName = @{$key->name('supplied')}[0]; - my $nodeId = $wholeTree->find_node(-ncbi_taxid => $refTaxa{$keyName})->id; - unless ($silent) { - print "fdog species: " . $key->scientific_name . " - " . @{$key->name('supplied')}[0] . 
"\n"; - } - my $coreTaxon = @{$key->name('supplied')}[0]; - my $coreTaxonName = getTaxonName($coreTaxon); - if (defined($coreTaxonName)) { - unless ($silent) { - print $coreTaxon, "\t", $coreTaxonName, "\n"; - } else { - print $coreTaxonName, "\n"; - } - } - runHamstr($coreTaxon, $seqName, $outputFa, $refSpec, $core_hitlimit, $core_rep, $corestrict, $coremode, $eval_blast, $eval_hmmer, $aln, 1); - ## check weather a candidate was found in the searched taxon - if(-e $candidatesFile) { - - ## get found candidates for one taxon in hash to iterate over - my %candicontent = getCandicontent(); - - ## get scores in hashes because there might be more than one candidate sequence per taxon - my %alnScores = getAlnScores(); - my %fas_box; - my $gotFasScore = 0; - ## iterate over found candidates - foreach my $candiKey (keys %candicontent){ - ## candidates alnScore is high enought, that it would be better with a fasScore of one - ## -> evaluate - if ($alnScores{$candiKey} > $rankScore * (1 + $distDeviation) - 1){ - %fas_box = getFasScore(); - if (!$gotFasScore and $fas_support){ - # %fas_box = getFasScore(); - $gotFasScore = 1; - } - ## get rankscore - my $newRankScore = getFilteredRankScore($alnScores{$candiKey}, $fas_box{$candiKey}); - ## candidate is significantly better, than the last one - if ($newRankScore > $rankScore * (1 + $distDeviation)){ #uninit - $bestTaxon = ">" . 
$candiKey; - $rankScore = $newRankScore; - ($header, $seq) = getHeaderSeq($bestTaxon); - $newNoRankDistNode = $currentNoRankDistNode; - $newChildsToIgnoreNode = $currentChildsToIgnoreNode; - my $newNodeId = $key->id; - ## set new distance nodes, which will replace the old ones given, that this candidate will remain the best - while (!defined $hashTree{$newNoRankDistNode}{$newNodeId}){ - $newNoRankDistNode = $newNoRankDistNode->ancestor; - $newChildsToIgnoreNode = $newChildsToIgnoreNode->ancestor; - } - unless ($silent) { - print "New Best Taxon: $bestTaxon\n"; - } - } - } - ## candidate has the same distance, as the last one and could be better, with a fasScore of one - elsif (defined $hashTree{$newNoRankDistNode}{$key->id} and $alnScores{$candiKey} > $rankScore - 1){ - %fas_box = getFasScore(); - if (!$gotFasScore and $fas_support){ - # %fas_box = getFasScore(); - $gotFasScore = 1; - } - ## get rankscore - my $newRankScore = getFilteredRankScore($alnScores{$candiKey}, $fas_box{$candiKey}); - ## candidate is better, than the last one - if ($newRankScore > $rankScore){ - $bestTaxon = ">" . $candiKey; - $rankScore = $newRankScore; - ($header, $seq) = getHeaderSeq($bestTaxon); - printDebug("New Taxon has the same distance, choosing the one with higher score"); - unless ($silent) { - print "New Best Taxon: $bestTaxon\n"; - } - } - } - } - ## candidate reached the maximum score, no need to evaluate further - if ($rankScore >= $maxScore){ - $sufficientlyClose = 1; - printDebug("Rankscore is at maximum. Breaking loop..."); - last; - } - ## rankscore got sufficiently close to the maximum, only evaluate candidates with the same distance now - elsif ($rankScore >= $maxScore * (1 - $distDeviation) and !$ignoreDistance){ - printDebug("Sufficiently close to max score. 
Only evaluating leafs with same distance now."); - unless ($silent) { - print "MaxScore: $maxScore\n"; - print "RankScore: $rankScore\n"; - } - $sufficientlyClose = 1; - } - clearTmpFiles(); - } - ## no candidate file was created -> so no candidate was found - else{ - unless ($silent) { - print "No Candidate was found for $keyName\n"; - } - } - } -} - -my @best = (split '\|', $bestTaxon); -$currentNoRankDistNode = $newNoRankDistNode; -$currentChildsToIgnoreNode = $newChildsToIgnoreNode; -clearTmpFiles(); - -if ($bestTaxon ne ''){ - open (COREORTHOLOGS, ">>$outputFa") or die "Error: Could not open file: " . $outputFa . "\n"; - print COREORTHOLOGS "\n" . $header . "\n" . $seq; - close COREORTHOLOGS; - return $best[1]; -}else{ - return ''; -} -} - -###################### -## param: %candicontent - hashed information about candidates (id-> sequence) -## param: $scorefile - filename with alignment tool output -## cumulative alignment scores -## candidates vs sofar core ortholog set -## return: hash of scores (id->score) -sub cumulativeAlnScore{ - my $file = $_[0]; - my %content = %{$_[1]}; - - my %cumscores; - foreach my $key(keys%content) { - my $gotScore = 0; - open (RESULT, $file) or die "Error: Could not open file with candidate taxa\n"; - while() { - my $line = $_; - $line =~ s/[\(\)]//g; - my @line = split('\s+',$line); - my $shortedId = substr($key, 0, 60); - # if($line[0] && ($line[0] eq $key)){ - if($line[0] && ($line[0] eq $shortedId)){ - if(exists $cumscores{$key}) { - $gotScore = 1; - $cumscores{$key} = $cumscores{$key} + $line[2]; - }else{ - $gotScore = 1; - $cumscores{$key} = $line[2]; - } - } - } - close RESULT; - if ($gotScore == 0){ - $cumscores{$key} = 0; - } - } - return %cumscores; -} - -###################### -sub get_leaves { - my $tree = $_[0]; - my $delFlag = 0; - if(defined($_[1])){ - $delFlag = $_[1]; - } - - my $node = $tree->get_root_node; - my @leaves; - my @children = ($node); - for (@children) { - push @children, $_->each_Descendent(); - } 
- for (@children) { - push @leaves, $_ if defined($_->name('supplied')); - } - # if the tree is set to be deleted - if ($delFlag){ - @leaves = qw(); - return @leaves; - }else{ - return @leaves; - } -} - -################################# -## sorts given leaves by distance -## and delets all leaves to close to the current core -sub sort_leaves { - my @leaves = @_; - my $distNode = $currentChildsToIgnoreNode; - my @candiLeaves; - my @finalLeaves; - - for (@leaves) { - if (!defined $hashTree{$distNode}{$_->id}){ - push @candiLeaves, $_ if defined($_->name('supplied')); - } - } - while ($distNode->id != $tree->get_root_node->id and scalar @candiLeaves != 0){ - $distNode = $distNode->ancestor; - my @nextCandiLeaves; - my @sameDistLeaves; - for (@candiLeaves){ - if (defined $hashTree{$distNode}{$_->id}){ - push @sameDistLeaves, $_ if defined($_->name('supplied')); - } - else{ - push @nextCandiLeaves, $_ if defined($_->name('supplied')); - } - } - @sameDistLeaves = shuffle @sameDistLeaves; - if (scalar @sameDistLeaves != 0){ - push @finalLeaves, \@sameDistLeaves; - } - @candiLeaves = @nextCandiLeaves; - } - return @finalLeaves; -} -####### get all taxa from the database (or the $genome_dir) where a genome is available -sub getTaxa { - if ($dbmode) { - my ($sql) = "select l.taxon_id, l.taxon_db, l.max_source_id, t.ncbi_id from cproteome_list.list l, taxon t where t.taxon_id = l.taxon_id and t.ncbi_id != 0"; - my ($query) = $dbHandle->prepare($sql); - $query->execute(); - while(my @result = $query->fetchrow_array) { - ## modified by ingo: make sure to capture the max_source_id - my $tax_src = $result[1] . '@' . $result[3] . '@' . 
$result[2]; - push @taxonlist, $tax_src; - $taxa{$tax_src} = $result[3]; - printDebug("ncbiid of $tax_src is $taxa{$tax_src}"); - if ($getThemAll){ - getProteome($tax_src); - } - } - } - else { - ## removal of misplaced files in genome_dir - if (-e "$genome_dir/query.sql"){ - unlink("$genome_dir/query.sql"); - } - if (-e "$genome_dir/@@.fa"){ - unlink("$genome_dir/@@.fa"); - } - @taxonlist = `ls $genome_dir`; - chomp @taxonlist; - for (my $i = 0; $i < @taxonlist; $i++) { - my ($taxon_name, $ncbi_id, $src_id) = split /@/, $taxonlist[$i]; - if (!$src_id) { - $src_id = ''; - } - $taxon_name = $taxonlist[$i]; - $taxa{$taxon_name} = $ncbi_id; - } - } - ### if the blast option is chosen, we will need blast databases for all taxa - ### Baustelle: have one database including all taxa to run just a single instead of n blast searches - if ($blast or $updateBlast_dir){ - for (my $i = 0; $i < @taxonlist; $i++){ - checkBlastDb($taxonlist[$i], $taxonlist[$i]); - } - if ($updateBlast_dir){ - print "\nMissing blast databases updated. Exiting.\n"; - exit; - } - } - my $hashcount = keys(%taxa); - printDebug("Returning $hashcount taxa from subroutine getTaxa"); - return(%taxa); -} -####### get all available reference taxa -sub getRefTaxa { - @refTaxonlist = `ls $blastPath`; - chomp @refTaxonlist; - for (my $i = 0; $i < @refTaxonlist; $i++) { - my ($taxon_name, $ncbi_id, $src_id) = split /@/, $refTaxonlist[$i]; - if (!$src_id) { - $src_id = ''; - } - $taxon_name = $refTaxonlist[$i]; - $refTaxa{$taxon_name} = $ncbi_id; - } - return(%refTaxa); -} -#################### -sub getTree { - # the full lineages of the species are merged into a single tree - my $tree; - foreach my $key (sort {lc $a cmp lc $b} keys %taxa) { - my $node = $db->get_taxon(-taxonid => $taxa{$key}); - printDebug("\$key in sub getTree is $key and taxid is $taxa{$key}\n"); - if (!defined $node){ - print "ISSUE in sub getTree. No correspodence found in taxonomy file for $key and taxid $taxa{$key}. 
Skipping...\n"; - next; - } - else { - $node->name('supplied', $key); - if($tree) { - $tree->merge_lineage($node); - } - else { - $tree = Bio::Tree::Tree->new(-verbose => $db->verbose, -node => $node); - } - } - } - if ($debug){ - print "\nTaxonomic Tree as text:\n"; - my $tree_as_string = $tree->as_text("tabtree"); - print $tree_as_string; - print "\n"; - } - return $tree; -} - -sub getRefTree { - # the full lineages of the species are merged into a single tree - my $tree; - foreach my $key (sort {lc $a cmp lc $b} keys %refTaxa) { - my $node = $db->get_taxon(-taxonid => $refTaxa{$key}); - printDebug("\$key in sub getRefTree is $key and taxid is $refTaxa{$key}\n"); - if (!defined $node){ - print "ISSUE in sub getRefTree. No correspodence found in taxonomy file for $key and taxid $refTaxa{$key}. Skipping...\n"; - next; - } - else { - $node->name('supplied', $key); - if($tree) { - $tree->merge_lineage($node); - } - else { - $tree = Bio::Tree::Tree->new(-verbose => $db->verbose, -node => $node); - } - } - } - if ($debug){ - print "\nTaxonomic Tree as text:\n"; - my $tree_as_string = $tree->as_text("tabtree"); - print $tree_as_string; - print "\n"; - } - return $tree; -} - -sub getTaxonName { - my $taxAbbr = $_[0]; - my @tmp = split(/@/,$taxAbbr); - my $taxon = $db_bkp->get_taxon($tmp[1]); - if (defined($taxon)) { - return($taxon->scientific_name); - } else { - return("Unk"); - } -} - -##################### perform the search for orthologs -# using the core-orthologs found in the previous steps -sub runHamstr { - my ($taxon, $seqName, $outputFa, $refSpec, $hitlimit, $rep, $sub_strict, $subcoremode, $ev_blst, $ev_hmm, $aln, $core) = (@_); - my $taxaDir = $taxaPath . $taxon; - printDebug("Running fdog: $taxon\t$seqName\t$outputFa\t$refSpec\t$taxaDir"); - if (! -e $taxaDir) { - ## backward compatibility. I used to name the dirs with the ending .dir - if (-e "$taxaDir.dir"){ - $taxaDir = $taxaDir . '.dir'; - } - } - $taxaDir =~ s/\s*//g; - if(! 
-e $taxaDir and $dbmode) { - getProteome($taxon); - } - if (-e $taxaDir) { - unless ($silent) { - print "fdog for taxon: " . $taxon . "\n"; - } - chdir($taxaDir) or die "Error: Directory for " . $taxon . " does not exist!\n"; - my $seqfile = $taxon . ".fa"; - - if(! -e $seqfile) { - printOut("Could not find $seqfile. Check naming conventions of the files. Exiting..."); - exit; - } - - if($seqFile ne "") { - my $taxon_id = substr($taxon, 6, length($taxon)); - my @hamstr = ($hamstrPath, "-sequence_file=".$seqfile, "-fasta_file=".$outputFa, "-hmmpath=".$coreOrthologsPath , "-outpath=".$outputPath, - "-blastpath=".$blastPath , "-protein", "-hmmset=".$seqName, "-taxon=".$taxon, "-force", - "-eval_blast=".$ev_blst, "-eval_hmmer=".$ev_hmm, "-central", "-aligner=".$aln); - - my $resultFile; - if (defined $autoLimit) { - push(@hamstr, "-autoLimit"); - } - elsif (defined $scoreThreshold) { - push(@hamstr, "-scoreThreshold"); - push(@hamstr, "-scoreCutoff=$scoreCutoff"); - } - elsif (defined $hitlimit) { - push(@hamstr, "-hit_limit=$hitlimit"); - } - if($sub_strict) { - push(@hamstr, "-strict"); - $resultFile = $outputPath . "/fa_dir_" . $taxon . '_' . $seqName . "_strict/" . $seqName . ".fa"; - } - else { - push(@hamstr, "-refspec=".$refSpec); - $resultFile = $outputPath . "/fa_dir_" . $taxon . '_' . $seqName . "_" . $refSpec . "/" . $seqName . ".fa"; - } - if($rep) { - push(@hamstr, "-representative"); - } - if ($checkcoorthologsref and $subcoremode==0){ - push @hamstr, '-checkCoorthologsRef'; - } - if ($cccr and $subcoremode==1){ - push @hamstr, '-checkCoorthologsRef'; - } - if ($rbh) { - push @hamstr, "-rbh"; - } - ## added 2019-11-19 - if ($append) { - push @hamstr, "-append"; - } - ## - if ($silent) { - push @hamstr, "-silent"; - } - if ($debug) { - push @hamstr, "-debug"; - } - printDebug(@hamstr); - system(@hamstr) == 0 or die "Error: fdog failed for " . $taxon . 
"\n"; - - if ($core == 1) { - if ($outputFa !~ /extended/){ - $outputFa .= '.extended'; - } - if(-e $resultFile) { - unless (-e $outputFa) { - open(EXTENDEDFA, ">$outputFa") or die "Cannot create $outputFa\n"; - } else { - open(EXTENDEDFA, ">>$outputFa") or die "Cannot create $outputFa\n"; - } - my $resultFa = Bio::SeqIO->new(-file => $resultFile, '-format' => 'Fasta'); - while(my $resultSeq = $resultFa->next_seq) { - if ($resultSeq->id =~ /$taxon\|(.)+\|[01]$/) { - my @tmpId = split("\\|", $resultSeq->id); - print EXTENDEDFA ">$tmpId[0]\|$tmpId[-3]\|$tmpId[-2]\|$tmpId[-1]\n",$resultSeq->seq,"\n"; - } - } - # addSeedSeq($seqId, $seqName, $coreOrthologsPath, $refSpec, $outputFa); - } else { - # add seed sequence to output extended.fa if no ortholog was found in refSpec - if ($taxon eq $refSpec) { - addSeedSeq($seqId, $seqName, $coreOrthologsPath, $refSpec, $outputFa); - } - printDebug("$resultFile not found"); - } - } - } - #remove the created folders and files - #delete fa_dir - my $delCommandFa; - my $delCommandHmm; - my $delCommandHam; - # my $outputPathTmp = $outputPath; $outputPathTmp =~ s/\|/\\\|/g; - # my $taxonTmp = $taxon; $taxonTmp =~ s/\|/\\\|/g; - # my $seqNameTmp = $seqName; $seqNameTmp =~ s/\|/\\\|/g; - if (!$strict) { - $delCommandFa = "rm -rf \"" . $outputPath . "/fa_dir_" . $taxon . "_" . $seqName . "_" . $refSpec . "\""; - $delCommandHmm = "rm -rf \"" . $outputPath . "/hmm_search_" . $taxon . "_" . $seqName . "\""; - if ($core == 1) { - $delCommandHam = "rm -f \"" . $outputPath . "/hamstrsearch_" . $taxon . "_" . $seqName . ".out" . "\""; - } - } else { - $delCommandFa = "rm -rf \"" . $outputPath . "/fa_dir_" . $taxon . "_" . $seqName . "_strict" . "\""; - $delCommandHmm = "rm -rf \"" . $outputPath . "/hmm_search_" . $taxon . "_" . $seqName . "\""; - if ($core == 1) { - $delCommandHam = "rm -f \"" . $outputPath . "/hamstrsearch_" . $taxon . "_" . $seqName . ".strict.out" . 
"\""; - } - } - printDebug("executing $delCommandFa", "executing $delCommandHmm"); - if ($core == 1) { - printDebug("executing $delCommandHam"); - } - if ($autoclean) { - system ($delCommandFa) == 0 or die "Error deleting result files\n"; - system ($delCommandHmm) == 0 or die "Error deleting result files\n"; - if ($core == 1) { - system ($delCommandHam) == 0 or die "Error deleting result files\n"; - } - } - } - else { - print "No protein set available for $taxon. Failed to fetch it from database and nothing at $taxaDir. Skipping!\n"; - } -} - -# add seed sequence to output file if not exists -sub addSeedSeq { - my ($seqId, $seqName, $coreOrthologsPath, $refSpec, $outputFa) = @_; - unless (-e $outputFa) { - system("touch $outputFa"); - } - # get seed sequence and add it to the beginning of the fasta output - open(TEMP, ">$outputFa.temp") or die "Cannot create $outputFa.temp!\n"; - my $seqio = Bio::SeqIO->new(-file => "$coreOrthologsPath/$seqName/$seqName.fa", '-format' => 'Fasta'); - my %idTmp = (); # used to check which seq has already been written to output - while(my $seq = $seqio->next_seq) { - my $id = $seq->id; - if ($id =~ /$refSpec/) { - $idTmp{"$id|1"} = 1; - print TEMP ">$id|1\n", $seq->seq, "\n"; - #last; - } - } - # then write other sequences - my $seqio2 = Bio::SeqIO->new(-file => "$outputFa", '-format' => 'Fasta'); - while(my $seq = $seqio2->next_seq) { - my $id = $seq->id; - unless ($id =~ /$refSpec\|$seqId/) { # /$refSpec/) { - unless ($idTmp{$id}) { - print TEMP ">$id\n", $seq->seq, "\n"; - $idTmp{$id} = 1; - } - } - } - close(TEMP); - system("mv $outputFa.temp $outputFa") -} - -########################## -sub hmmbuild { - # my @hmmbuild = ("hmmbuild", $_[0], $_[1]); - # system(@hmmbuild) == 0 or die "hmmbuild failed"; - my $hmmbuild = `hmmbuild $_[0] $_[1] > /dev/null 2>&1`; -} - -sub parseInput { - my $node = $_[0]; - my $level = $_[1]; - my $rank = $node->rank; - printDebug("\nLEVEL:".$level."\n"); - printDebug("\nRANK:".$rank."\n"); - 
while($level > 0) { - $node = $node->ancestor; - $rank = $node->rank; - --$level; - } - print "\nRETURN RANK: ".$rank."\n"; - return $rank; -} -########################## -sub parseTaxaFile { - my $coreTaxaFile = $_[0]; - open (INPUT, "<$coreTaxaFile") or die print "Error opening file with taxa for core orthologs search\n"; - my @userTaxa; - while() { - my $line = $_; - chomp($line); - if (length($line) > 0) { - if(!$taxa{$line}) { - print "You specified " . $line . " in your core orthologs file but the taxon is not in the database!\n"; - exit; - } else { - push(@userTaxa, $line); - } - } - } - close INPUT; - return @userTaxa; -} -########################## -# sub printTaxa { -# my @result = qw(); -# if ($dbmode) { -# print "taxon_schema\tsource_id\ttaxon name\n"; -# print "------------\t---------\t----------\n"; -# my ($sql) = "select t.name, c.taxon_db, c.max_source_id from taxon t, cproteome_list.list c where t.taxon_id=c.taxon_id"; -# my ($query) = $dbHandle->prepare($sql); -# $query->execute(); -# @result = $query->fetchrow_array; -# while(my @result = $query->fetchrow_array) { -# print $result[1] . " \t" . $result[2] . "\t" . $result[0] . "\n"; -# } -# } -# else { -# print "Taxon_Name\tNCBI_ID\n"; -# print "-------------\t------------\n"; -# my $taxacall= "ls $genome_dir |$sedprog -e 's/@/\t/'"; -# @result = `$taxacall`; -# chomp @result; -# print join "\n", @result; -# print "\n"; -# } -# } -########################### -sub printTaxonomy { - my $node = $_[0]; - my $i = 0; - if($node->rank eq "species") { - print "[" . $i . "]: " . $node->rank . " (" . $node->scientific_name . ")\n"; - while($node->ancestor) { - $node = $node->ancestor; - ++$i; - print "[" . $i . "]: " . $node->rank . " (" . $node->scientific_name . 
")\n"; - } - } -} -############################ -sub remove_branch { - my $node = $_[0]; - my $delFlag = 0; - printDebug("Subroutine remove_branch\nNode is $node\nRank of node: ".$node->rank."\nNumber of leaves before removing branch ".get_leaves($tree)."\n\n"); - - # undef the tree if there is only one leave left which must be removed - if (get_leaves($tree) == 1){ - $delFlag = 1; - }else{ - while (defined $node->ancestor) { - last if $node->ancestor->each_Descendent > 1; - $node = $node->ancestor; - } - $node->remove_all_Descendents; - if(defined $node->ancestor) { - $node->ancestor->remove_Descendent($node); - } - } - printDebug("Subroutine remove_branch\nNode is $node\nRank of node: ".$node->rank."\nNumber of leaves after removing branch ".get_leaves($tree, $delFlag)."\n\n"); - return $delFlag; -} -############################ -sub removeMaxDist { - my $node = $tree->find_node(-ncbi_taxid => $refTaxa{$refSpec}); - my $root = $tree->get_root_node(); - - if ($maxDist eq "no rank"){ - $tree->set_root_node($root); - }else{ - while($node->rank ne $maxDist && $node != $root) { - $node = $node->ancestor; - } - $tree->set_root_node($node); - } -} -############################ -# node determines the node in the tree in accordance to the given ncbi taxon id -sub removeMinDist { - my $ncbiId = $_[0]; - my $node = $tree->find_node(-ncbi_taxid => $ncbiId); - my $root = $tree->get_root_node(); - my $delFlag; - - printDebug("Subroutine removeMinDist\nncbiID is $ncbiId\nNode is $node\nRank of node is ".$node->rank."\nroot is $root\nMinimal distance is $minDist\n"); - - # increasing the rank of the node - while($node->rank ne $minDist && $node != $root && defined($node->ancestor)) { - if ($debug){ - print "Increasig the rank\nRank: ".$node->rank."\nNode: ".$node."\n\n"; - } - - $node = $node->ancestor; - } - - #if the species has the same ranks as the references species - if($node == $root) { - my @toCompare = (); - my $i = @defaultRanks - 1; - while($i >= 0 && 
$defaultRanks[$i] ne $minDist) { - push(@toCompare, $defaultRanks[$i]); - --$i; - } - $node = $tree->find_node(-ncbi_taxid => $ncbiId); - my $lastToCompare = $toCompare[$#toCompare]; - foreach(@toCompare) { - while($node->rank eq "no rank") { - $node = $node->ancestor; - } - if($node->rank ne $lastToCompare && $node->rank eq $_) { - $node = $node->ancestor; - } - } - } - $delFlag = remove_branch($node); - return $delFlag; -} - -############################ -## builds a 2 dimensional hash in which you can check for a node, -## wheather there is a path down the tree to a given species -sub buildHashTree { - unless ($silent) { - print "Building hash tree\n"; - } - - printDebug("Creating variables..."); - my %hashTree; - my %nextNodes; - my %processed; - my @ancestors; - my $rootNode = $wholeTree->get_root_node(); - - unless ($silent ){ - print "Processing leafs...\n"; - } - ## create entry for leafes - foreach my $leaf (get_leaves($wholeTree)){ - my $key = $leaf->id; - my %leafHash; - $leafHash{$key} = "exists"; - $hashTree{$leaf}{$key} = "exists"; - my $nextNode = $leaf->ancestor; - my $nextNodeKey = $nextNode->id; - my $test = $hashTree{$leaf}{$key}; - my $nodeTest = $nextNodeKey; - printDebug("Leaf $key set to $test"); - ## queue ancestor node for processing, if it hasn't been queued already - if (!$nextNodes{$nextNodeKey}){ - $nextNodes{$nextNodeKey} = $nextNode; - push @ancestors, $nextNode; - printDebug("Queuing ancestor $nextNodeKey for processing...\n"); - } - $processed{$leaf} = 1; - } - unless ($silent) { - print "Finished leafs\n"; - } - - ## create entries for all other nodes - unless ($silent) { - print "Processing ancestor nodes\n"; - } - foreach my $node (@ancestors){ - my $test = $node->id; - printDebug("Processing node: $test\n"); - my $bool = 1; - ## check, weather all childs have already been processed - foreach my $child ($node->each_Descendent()){ - if (!defined $processed{$child}){ - $bool = 0; - } - } - ## if all childs have been processed, 
process this node - if ($bool == 1){ - printDebug("All children processed for node: $test"); - ## node is not root - if ($node != $rootNode){ - printDebug("Node $test is not root"); - foreach my $child ($node->each_Descendent()){ - while (my ($key, $value) = each %{$hashTree{$child}}){ - $hashTree{$node}{$key} = $value; - printDebug("Node $key $value in node $test"); - } - } - my $nextNode = $node->ancestor; - my $nextNodeKey = $nextNode->id; - ## queue ancestor node for processing, if it hasn't been queued already - if (!$nextNodes{$nextNodeKey}){ - $nextNodes{$nextNodeKey} = $nextNode; - push @ancestors, $nextNode; - printDebug("Queuing ancestor $nextNodeKey for processing..."); - } - } - ## node is root - else{ - printDebug("Node $test is root"); - foreach my $child ($node->each_Descendent()){ - while (my ($key, $value) = each %{$hashTree{$child}}){ - $hashTree{$node}{$key} = $value; - printDebug("Node $key $value in node $test"); - } - } - } - ## mark node as processed - $processed{$node} = 1; - printDebug("Node $test has been processed\n\n"); - } - ## not all childs have been processed - ## queue node again - else{ - push @ancestors, $node; - printDebug("Not all children processed for node: $test"); - printDebug("Queuing $test again...\n\n"); - } - } - unless ($silent) { - print "Finished processing ancestor nodes\n"; - print "Finished building hash tree\n"; - print "Returning hash tree...\n"; - } - return %hashTree; -} - -########################## -sub getProteome { - my $taxstring = shift; - my $outdir = $taxstring; - my $outfile = $taxstring; - my @outfile; - $taxstring =~ /(.*)@([0-9]+)@([0-9]+)/; - my ($schema, $ncbi_id, $src_id) = ($1, $2, $3); - print "\n\nAttempting to fetch information for $schema using source id $src_id\n\n"; - - ## create the relevant directory - if (!-e "$taxaPath/$outdir"){ - print "creating directory $taxaPath/$outdir\n"; - mkpath($taxaPath."/".$outdir); - if (-e "$taxaPath/$outdir") { - print "succeeded\n"; - } - else { - print 
"create directory failed\n"; - } - } - - ### This is the sql statement required for fetching the sequence information from the database - ## Using Here Documents ####### - my $sql = <<"________END_OF_STATEMENT"; - use $schema; - select concat('>',i.id, '\\n', p.seq) from ids i, protein p - where - i.protein_id = p.id and - i.representative = 1 and - i.src_id = $src_id and - length(p.seq) > 30; -________END_OF_STATEMENT - ## the previous line must be exactly like it is to match the end note (Here Doc) - - printDebug("$sql\n"); - open (OUTQUERY, ">$taxaPath/$outdir/query.sql") or die "Could neither find nor create query.sql in $taxaPath/$outdir"; - print OUTQUERY $sql; - close OUTQUERY; - print "attempting to enter $taxaPath/$outdir\n"; - chdir("$taxaPath/$outdir") or die "could not enter $taxaPath/$outdir"; - `$homeDir/bin/run-query.sh $schema $ncbi_id $src_id`; -} -############ -#Baustelle: run generation of BlastDb in a sub routine -## now create the relevant blast directories if necessary -sub checkBlastDb { - my ($taxstring, $filename) = @_; - ## $taxstring identifies the species directory, $filename identifies the name of the file containing the protein set - while (! -e "$taxaPath/$taxstring/$filename.fa"){ - my $count = 0; ## avoid endless loop - printDebug("could not find $taxaPath/$taxstring/$filename.fa\n"); - getProteome($taxstring); - if ($count == 5){ - die "could not find $taxaPath/$taxstring/$filename.fa and could not retrieve this information from the database.\nTerminating...\n\n"; - } - } - if (! -e "$blastPath/$taxstring" or $updateBlast_dir){ - `mkdir $blastPath/$taxstring`; - } - if (! -e "$blastPath/$taxstring/$filename.fa" or $updateBlast_dir){ - `ln -s $taxaPath/$taxstring/$filename.fa $blastPath/$taxstring/$filename.fa`; - } - if (! 
-e "$blastPath/$taxstring/$filename.pin" or $updateBlast_dir){ - chdir("$blastPath/$taxstring") or die "failed to change to dir\n"; - if ($blast_prog eq 'blastall'){ - `formatdb -i $filename.fa -t $filename -n $filename`; - } - elsif ($blast_prog eq 'blastp'){ - printOut("attempting to run makeblastdb", 2); - `makeblastdb -in $filename.fa -dbtype prot -title $filename -out $filename`; - } - } -} -################# -sub printDebug{ - my @message = @_; - if ($debug){ - print join "\n", @message; - print "\n"; - } -} -sub printVariableDebug{ - my @values = @_; - print "\n\nDEBUG\n"; - foreach (@values){ - print $_."\n"; - } - print "\nEND OF DEBUG\n\n"; -} -################# -sub getInput { - my ($message, $dieopt) = @_; - if ($dieopt){ - $message .= ', or type \'q\' to quit'; - } - print ("\n" . $message . ": "); - my $input = ; - chomp $input; - if ($input =~ /^q$/i and $dieopt) { - die "Quitting!\n"; - } - else { - return ($input); - } -} -################# -sub runBlast { - my ($query, $inpath, $outname, $outpath, $blastdb) = @_; - printDebug("running $blast_prog on database $blastdb using input $inpath/$query and writing to $outpath/$outname.blast"); - - if ($blast_prog =~ /blast[px]/) { - !`$blast_prog -db $blastdb -seg $filter -max_target_seqs 10 -evalue $eval_blast_query -outfmt 5 -query $inpath/$query -out $outpath/$outname.blast` or die "Problem running $blast_prog\n"; - } - elsif ($blast_prog =~ /blastall/) { - !`$blast_prog -p $algorithm -d $blastdb -F $filter -e $eval_blast_query -m7 -i $inpath/$query -o $outpath/$outname.blast` or die "Problem running $blast_prog\n" - } - else { - `$blast_prog -ublast $inpath/$query -db $blastdb -accel $accel -evalue $eval_blast_query -blast6out $outpath/$outname.blast` or die "Problem running $blast_prog\n"; - - ## sort the output as ublast does not do it (at least not for ESTs) - `sort -n -r -k 12 $outpath/$outname.blast >$outpath/blastsort.tmp`; - `mv $outpath/blastsort.tmp $outpath/$outname.blast`; - } - 
printDebug("returning $outname.blast for subroutine runBlast\n"); - return("$outname.blast"); -} -############# -sub getBestBlasthit { - my $hits; - my $frame; - my $count = 0; - my ($inpath, $resultfile) = @_; - printDebug("Sub getBestBlasthit running on $inpath/$resultfile"); - my $searchio = Bio::SearchIO->new( - -file => "$inpath/$resultfile", - -format => $outputfmt) - or die "parse failed"; - while(my $result = $searchio->next_result){ - my $sig; - my $sig_old; - while( my $hit = $result->next_hit) { - my $frameval = $hit->strand('query'); - if ($frameval >0){ - $frame = '+'; - } - elsif ($frameval <0 ) { - $frame = '-'; - } - else { - $frame = 'na'; - } - ## now I enter all top hits having the same score into the result - $sig = $hit->score; - if (!defined $sig_old) { - $sig_old = $sig; - } - if ($sig == $sig_old) { - $hits->[$count]->{name} = $hit->name; - $hits->[$count]->{score} = $sig; - $hits->[$count]->{evalue} = $hit->significance; - $count ++; - } - else { - ## there is no lower ranking hit with the same score as the best hit. End the loop. - last; - } - } - } - return($hits); -} -########################## -sub printOut { - my ($message, $mlevel) = @_; - if ($mlevel <= $vlevel){ - print "$message\n"; - } - ###################### - sub checkInt { - my $number = shift; - if ($number =~ /[^0-9]/){ - return(); - } - else{ - return($number); - } - } -} - -########################### -sub initialCheck { - my ($seed, $ogName, $blastDir, $genomeDir, $weightDir, $fasoff) = @_; - # check tools exist - my @tools = ("hmmsearch", "muscle", "mafft", $globalaligner, $localaligner, $glocalaligner); - if ($^O eq "darwin") { - push(@tools, "clustalw2") - } else { - push(@tools, "clustalw") - } - my $flag = 1; - foreach my $tool (@tools) { - my $check = `which $tool`; - if (length($check) < 1) { - print "$tool not found\n"; - $flag = 0; - } - } - if ($flag < 1) { - die "ERROR: Some required tools not found! 
Please install fdog again!\n"; - } - - # check executable FAS - my $fasCheckMsg = `fas.setup -t ./ -c 2>&1`; - if ($fasoff != 1 && $fasCheckMsg =~ /ERROR/) { - die "ERROR: FAS not ready to use! Please check https://github.com/BIONF/FAS/wiki/setup\n"; - } - - # check seed fasta file - unless (-e $seed) { - $seed = "$dataDir/$seed"; - } - my $seqio = Bio::SeqIO->new(-file => $seed, '-format' => 'Fasta'); - while(my $seq = $seqio->next_seq) { - my $string = $seq->seq; - if ($string =~ /[^a-zA-Z]/) { - die "ERROR: $seed contains special characters!\n"; - } - } - - # check ortholog group name - if (!defined $ogName) { - die "ERROR: Ortholog group name (-seqName) invalid!\n"; - } else { - if ($ogName =~ /[\|\s+\"\'\`\´\!\^]/) { - die "ERROR: Ortholog group name (-seqName) cannot contain PIPE|space or \" \' \` \´ \! \^\n"; - } - } - - # check genome_dir - my @genomeDir = checkValidFolderName($genomeDir); - foreach my $genomeFd (@genomeDir) { - unless ($genomeFd =~ /^\./) { - my $genome = getGenomeFile("$genomeDir/$genomeFd", $genomeFd); - unless (-e "$genome.checked") { - die "ERROR: $genome.checked not found!\nPlease run fdog.checkData before running fdog!\n"; - } - } - } - # check blast_dir - my @blastDir = checkValidFolderName($blastDir); - foreach my $blastFd (@blastDir) { - unless ($blastFd =~ /^\./) { - my $genome = getGenomeFile("$blastDir/$blastFd", $blastFd); - unless (-e "$genome.checked") { - die "ERROR: $genome.checked not found!\nPlease run fdog.checkData before running fdog!"; - } - } - } - # check weight_dir - if ($fasoff != 1) { - my %seen; - my @allTaxa = grep( !$seen{$_}++, @genomeDir, @blastDir); - my @notFolder; - for (my $i = 0;$i < scalar(@allTaxa); $i++){ - if (-f "$blastDir/$allTaxa[$i]" || -f "$genomeDir/$allTaxa[$i]") { - push(@notFolder, $allTaxa[$i]); - splice(@allTaxa, $i, 1); - } - } - if (scalar(@notFolder) > 0) { - print "*** WARNING: Found files in $genomeDir or $blastDir:\t@notFolder\n"; - } - chomp(my $allAnno = `ls $weightDir | $sedprog 
\'s/\\.json//\'`); - my @allAnno = split(/\n/, $allAnno); - my @missingAnno = array_minus(@allTaxa, @allAnno); - if (scalar @missingAnno > 0) { - my $missingAnno = join("\n", @missingAnno); - die "ERROR: Some taxa do not have annotation! Please turn off FAS calculation (with -fasoff), or annotate their genomes before continue.\n$missingAnno\n"; - } - } -} - -sub getGenomeFile { - my ($folder, $filename) = @_; - chomp(my $faFile = `ls $folder/$filename.fa* | $grepprog -v \"\\.checked\\|\\.mod\\|\\.mapping\\|\\.tmp\"`); - my $out = $faFile; - chomp(my $link = `$readlinkprog -f $faFile`); - if ($link ne "") { - $out = $link; - } - return($out); -} - -sub checkValidFolderName { - my $folder = $_[0]; - # check if folder and its subfolders contain illegal character (e.g. pipe) - opendir(my $dh, $folder) || die "Can't open $folder: $!"; - if ($folder =~ /[\|\s+]/) { - die "ERROR: $folder contains illegal character (e.g. PIPE or space)!\n"; - } - my @folders = readdir($dh); - foreach my $fd (@folders) { - next if ($fd eq "." or $fd eq ".."); - if ($fd =~ /[\|\s+]/) { - die "ERROR: $folder/$fd contains illegal character (e.g. PIPE or space)!\n"; - } - } - closedir $dh; - my @notFd = (".", ".."); - return(array_minus(@folders, @notFd)); -} - -sub gettime { sprintf"%d.%03d",Time::HiRes::gettimeofday } -sub roundtime { sprintf("%.2f", $_[0]); } - -########################### -sub helpMessage { - my $helpmessage = " -YOU ARE RUNNING $version on $hostname - -This program is freely distributed under a GPL. -Copyright (c) GRL limited: portions of the code are from separate copyrights - -\nUSAGE: oneSeq.pl -seqFile=<> -seqId=<> -seqName=<> -refSpec=<> -minDist=<> -maxDist=<> [OPTIONS] - -OPTIONS: - -GENERAL - --h - Invoke this help method --version - Print the program version - -REQUIRED - --seqFile=<> - Specifies the file containing the seed sequence (protein only) in fasta format. - If not provided the program will ask for it. 
--seqId=<> - Specifies the sequence identifier of the seed sequence in the reference protein set. - If not provided, the program will attempt to determin it automatically. --refSpec=<> - Determines the reference species for the ortholog search. It should be the species the seed sequence was derived from. - If not provided, the program will ask for it. --minDist=<> - specify the minimum systematic distance of primer taxa for the core set compilation. - If not provided, the program will ask for it. --maxDist=<> - specify the maximum systematic distance of primer taxa to be considered for core set compilation. - If not provided, the program will ask for it. --coreOrth=<> - Specify the number of orthologs added to the core set. - -USING NON-DEFAULT PATHS - --outpath=<> - Specifies the path for the output directory. Default is $outputPath; --hmmpath=<> - Specifies the path for the core ortholog directory. Default is $coreOrthologsPath; --blastpath=<> - Specifies the path for the blastDB directory. Default is $blastPath; --searchpath=<> - Specifies the path for the search taxa directory. Default is $genome_dir; --weightpath=<> - Specifies the path for the pre-calculated feature annotion directory. Default is $weightPath; - -ADDITIONAL OPTIONS - --append - Set this flag to append the output to existing output files --seqName=<> - Specifies a name for the search. If not set a random name will be set. --db - Run in database mode. Requires a mySql database. Only for internatl use. --filter=[T|F] - Switch on or off the low complexity filter for the blast search. Default: T --silent - Surpress output to the command line --coreTaxa=<> - You can provide a list of primer taxa that should exclusively be used for the compilation - of the core ortholog set --strict - Run the final ortholog search in 'strict mode'. An ortholog is only then accepted when the reciprocity is fulfilled - for each sequence in the core set. --force - Force the final ortholog search to create output file. 
Existing files will be overwritten. --coreStrict - Run the compilation of the core set in strict mode. --checkCoorthologsRef - During the final ortholog search, accept an ortholog also when its best hit in the reverse search is not the - core ortholog itself, but a co-ortholog of it. --CorecheckCoorthologsRef - Invokes the 'checkCoorthologsRef' behavior in the course of the core set compilation. --rbh - Requires a reciprocal best hit during the ortholog search to accept a new ortholog. --evalBlast=<> - This option allows to set the e-value cut-off for the Blast search. Default: 1E-5 --evalHmmer=<> - This options allows to set the e-value cut-off for the HMM search. Default: 1E-5 --evalRelaxfac=<> - This options allows to set the factor to relax the e-value cut-off (Blast search and HMM search) for the final ortholog run. Default: 10 --hitLimit=<> - Provide an integer specifying the number of hits of the initial pHMM based search that should be evaluated - via a reverse search. Default: 10 --coreHitLimit=<> - Provide an integer specifying the number of hits of the initial pHMM based search that should be evaluated - via a reverse search. Default: 3 --autoLimit - Setting this flag will invoke a lagPhase analysis on the score distribution from the hmmer search. This will determine automatically - a hit limit for each query. Note, when setting this flag, it will be effective for both the core ortholog compilation - and the final ortholog search. --scoreThreshold - Instead of setting an automatic hit limit, you can specify with this flag that only candidates with an hmm score no less - than x percent of the hmm score of the best hit are further evaluated. Default is x = 10. - You can change this cutoff with the option -scoreCutoff. Note, when setting this flag, it will be effective for - both the core ortholog compilation and the final ortholog search. 
--scoreCutoff=<> - In combination with -scoreThreshold you can define the percent range of the hmms core of the best hit up to which a - candidate of the hmmsearch will be subjected for further evaluation. Default: 10%. --coreOnly - Set this flag to compile only the core orthologs. These sets can later be used for a stand alone ortholog search. --reuseCore - Set this flag if the core set for your sequence is already existing. No check currently implemented. --ignoreDistance - Set this flag to ignore the distance between Taxa and to choose orthologs only based on score --distDeviation=<> - Specify the deviation in score in percent (1=100%, 0=0%) allowed for two taxa to be considered similar --blast - Set this flag to determine sequence id and refspec automatically. Note, the chosen sequence id and reference species - does not necessarily reflect the species the sequence was derived from. --rep - Set this flag to obtain only the sequence being most similar to the corresponding sequence in the core set rather - than all putative co-orthologs. --coreRep - Set this flag to invoke the '-rep' behaviour for the core ortholog compilation. --cpu - Determine the number of threads to be run in parallel --hyperthread - Set this flag to use hyper threading --batch=<> - Currently has NO functionality. --group=<> - Allows to limit the search to a certain systematic group --cleanup - Temporary output will be deleted. --aligner - Choose between mafft-linsi or muscle for the multiple sequence alignment. DEFAULT: muscle --local - Specify the alignment strategy during core ortholog compilation. Default is local. --glocal - Set the alignment strategy during core ortholog compilation to glocal. --searchTaxa - Input file containing list of search taxa. -SPECIFYING FAS SUPPORT OPTIONS - --fasoff - Turn OFF FAS support. Default is ON. --coreFilter=[relaxed|strict] - Specifiy mode for filtering core orthologs by FAS score. 
In 'relaxed' mode candidates with insufficient FAS score will be disadvantaged. - In 'strict' mode candidates with insufficient FAS score will be deleted from the candidates list. Default is None. - The option '-minScore=<>' specifies the cut-off of the FAS score. --minScore=<> - Specify the threshold for coreFilter. Default is 0.75. --countercheck - Set this flag to counter-check your final profile. The FAS score will be computed in two ways (seed vs. hit and hit vs. seed). - -SPECIFYING EXTENT OF OUTPUT TO SCREEN - --debug - Set this flag to obtain more detailed information about the programs actions --silent - Surpress output to screen as much as possbile -\n\n"; - return($helpmessage); -} diff --git a/fdog/bin/run_genewise_hamstr.pm b/fdog/bin/run_genewise_hamstr.pm deleted file mode 100755 index b631e64..0000000 --- a/fdog/bin/run_genewise_hamstr.pm +++ /dev/null @@ -1,260 +0,0 @@ -package run_genewise_hamstr; -use strict; -#$ENV{'WISECONFIGDIR'} = "/usr/local/src/wise2.2.0/wisecf/"; -# this module runs genewise on a DNA sequence and a protein sequence -# and then allows to parse this result. -# the constructor creates an object containing a reference to an array -# containing the file content - -# Modified 11.01.2010 renamed the file names for the genewise run to avoid overwriting of files when multipe runs are performed in parallel on the same sequence file -# LAST Modified: 31.07.2015. Added the option to keep, mask or remove partial codons and introns from -# the transcript. - -1; -sub new { - my $self_tmp = []; - my $self; - my ($class, $dna, $prot, $path, $keepintron) = @_; - if (!defined $path) { - $path = '/tmp'; - } - if (!defined $keepintron) { - $keepintron = 2; - } - my $pid=$$; - # the file names - my $protname = $pid.'_protein'; - my $dnaname = $pid . 
'_dna'; - ## print the two sequences to default path /tmp/ - open (DNA, ">$path/$dnaname") or die "could not open $path/$dnaname for writing\n"; - print DNA ">$dnaname\n$dna"; - close DNA; - open (PROTEIN, ">$path/$protname") or die "could not open $path/$protname for writing\n"; - print PROTEIN ">$protname\n$prot"; - close PROTEIN; - - ## run genewise on the two sequences - `echo \$WISECONFIGDIR`; - - $self_tmp = [`genewise -trans -cdna -pep -sum $path/$protname $path/$dnaname`]; - for (my $i = 0; $i < @$self_tmp; $i++) { - $self_tmp->[$i] =~ s/\s{1,}$//; - } - $self->{gw} = $self_tmp; - $self->{nt_seq} = $dna; - $self->{prot_seq} = $prot; - $self->{protname} = $protname; - $self->{dnaname} = $dnaname; - $self->{gw_count} = @$self_tmp; - - if ($keepintron =~ /^k/i ) { - $self->{get_indel} = 2; ## per default the indel-part is recovererd in lower case letters rather than masked or removed. See code for details - } - elsif ($keepintron =~ /^m/i) { - $self->{get_indel} = 1; ## The indel-part is masked. See code for details; - } - else { - $self->{get_indel} = 0; ## the indel-part is removed making the cDNA consistent with the translaton. 
See code for details; - } - print "intron is $self->{get_indel}\n"; - - $self->{indels} = _GetIndels($self_tmp); - bless ($self, $class); - return $self;} -################# -## sub score extract the score for the alignment -sub score { - my $self = shift; - my $score; - for (my $i = 0; $i < $self->{gw_count}; $i ++) { - if ($self->{gw}->[$i] =~ /^(\d{1,}\.{0,1}\d{0,}).*/) { - $score = $1; - last; - } - } - return ($score); -} -################## -sub protein { - my $self = shift; - my $gw = $self->{gw}; - my $prot = ''; - for (my $i = 0; $i < @$gw; $i++) { - if ($gw->[$i] =~ />.*\.pep/) { #the protein seq starts - my $count = 1; - while ($gw->[$i+$count] ne '//') { - my $protpart = $gw->[$i+$count]; - chomp $protpart; - $prot .= $protpart; - $count ++; - } - } - elsif (length $prot > 0) { - last; - } - } - return($prot); - } -################## -sub translation { - my $self = shift; - my $finish = 0; - my $translated_seq = ''; - my @transtmp; - - ## step 1: extract the relevant info from the genewise output - for (my $i = 0; $i < $self->{gw_count}; $i++) { - if ($self->{gw}->[$i] =~ />.*.tr/) {# a translated bit starts - while ($self->{gw}->[$i] !~ '//') { - push @transtmp, $self->{gw}->[$i]; - $i++; - } - last; # end the for loop since nothing left to be done - } - } - - ## step two: get the sequences - my $count = -1; - my $trans; - for (my $i = 0; $i < @transtmp; $i++) { - if ($transtmp[$i] =~ />/) { - $count++; - $trans->[$count]->{seq} = ''; # initialize - if ($transtmp[$i] =~ /.*\[([0-9]{1,}):([0-9]{1,})\].*/) { - $trans->[$count]->{start} = $1; - $trans->[$count]->{end} = $2; - } - } - else { - $trans->[$count]->{seq} .= $transtmp[$i]; - } - } - - ## step 3: connect the fragments - if (@$trans == 1) { - $translated_seq = $trans->[0]->{seq}; - } - else { - for (my $i = 0; $i < @$trans; $i++) { - $translated_seq .= $trans->[$i]->{seq}; - if ($i < (@$trans - 1)) { - my $missing = $trans->[$i+1]->{start} - $trans->[$i]->{end} -1; - if ($self->{get_indel} > 0) { 
- $translated_seq .= 'X'; - } - } - } - } - return($translated_seq); - } - -################## -sub codons { - my $self = shift; - my $finish = 0; - my $codon_seq = ''; - my @transtmp; - - ## step 1: extract the relevant info from the genewise output - for (my $i = 0; $i < $self->{gw_count}; $i++) { - if ($self->{gw}->[$i] =~ />.*sp$/) {# the codons set starts - while ($self->{gw}->[$i] !~ '//') { - push @transtmp, $self->{gw}->[$i]; - $i++; - } - last; # end the for loop since nothing left to be done - } - } - - ## step two: get the sequences - my $count = -1; - my $trans; - for (my $i = 0; $i < @transtmp; $i++) { - if ($transtmp[$i] =~ />/) { - $count++; - $trans->[$count]->{seq} = ''; # initialize - if ($transtmp[$i] =~ /.*\[([0-9]{1,}):([0-9]{1,})\].*/) { - $trans->[$count]->{start} = $1; - $trans->[$count]->{end} = $2; - } - } - else { - $transtmp[$i] =~ tr/a-z/A-Z/; - $trans->[$count]->{seq} .= $transtmp[$i]; - } - } - - ## step 3: connect the fragments - if (@$trans == 1) { - $codon_seq = $trans->[0]->{seq}; - } - else { - for (my $i = 0; $i < @$trans; $i++) { - $codon_seq .= $trans->[$i]->{seq}; - if ($i < (@$trans - 1)) { - my $indel = ''; - my $missing = $trans->[$i+1]->{start} - $trans->[$i]->{end} -1; - ## now decide whether the nts that did not got translated are masked by - ## 'N' or whether they will be represented as lower case letters - if ($self->{get_indel} == 2) { - $indel = substr($self->{nt_seq}, $trans->[$i]->{end}, $missing); - $indel =~ tr/A-Z/a-z/; - } - elsif ($self->{get_indel} == 1) { - $indel = 'N' x $missing; - } - else { - $indel = ''; - } - ## now append gap characters until the frame is recovered. Note that the gap - ## characters are added to the end of the indel-part. Thus, the codons are - ## not considered. 
- while (length($indel)%3 != 0) { - $indel .= '-'; - } - - $codon_seq .= $indel; - } - } - } - return ($codon_seq); - } -########################### -sub protein_borders { - my $self = shift; - my $gw = $self->{gw}; - for (my $i = 0; $i < @$gw; $i++) { - if ($gw->[$i] =~ /Bits.*introns$/) { - my ($start, $end) = $gw->[$i+1] =~ /.*$self->{protname}\s{1,}([0-9]{1,})\s{1,}([0-9]{1,}).*/; - return($start, $end); - } - else { - die "no protein-start and end could not be determnined. Check genewise command\n"; - } - } -} -########################## -sub cdna_borders { - my $self = shift; - my $gw = $self->{gw}; - for (my $i = 0; $i < @$gw; $i++) { - if ($gw->[$i] =~ /Bits.*introns$/) { - my ($start, $end) = $gw->[$i+1] =~ /.*$self->{dnaname}\s{1,}([0-9]{1,})\s{1,}([0-9]{1,}).*/; - return($start, $end); - } - else { - die "no cdna-start and end could not be determnined. Check genewise command\n"; - } - } -} -########################## -sub _GetIndels { - my $gw = shift; - my $indel; - for (my $i = 0; $i < @$gw; $i++) { - if ($gw->[$i] =~ /Bits/) { - $indel = $gw->[$i+1] =~ /.*([0-9]{1,})/; - return($indel); - } - } -} diff --git a/fdog/bin/translate.pl b/fdog/bin/translate.pl deleted file mode 100755 index 68ee444..0000000 --- a/fdog/bin/translate.pl +++ /dev/null @@ -1,197 +0,0 @@ -#!/usr/bin/perl -use strict; -use File::Basename; -use lib dirname(__FILE__); -use Getopt::Long; -use Bio::Perl; -use File::Copy; - -# PROGRAMNAME: translate.pl - -# AUTHOR: INGO EBERSBERGER, ingo.ebersberger@univie.ac.at - -# PROGRAM DESCRIPTION: - -# DATE: Tue May 12 14:03:34 CEST 2009 - - -# DATE LAST MODIFIED: 03.11.2010: Bug fix suggested by Todd Oakley. 
-# BUG -- BIOPERL GUESSES PROTEIN FILE FORMAT WHEN AMBIGUITY CODES ARE PRESENT -# CAUSING AN ERROR IN THE TRANLATE_6 FRAMES, WHICH INTERRUPTS ALL TRANSLATION -- THO - -## Last modified: 10.01.2014 -## added option -outpath -######################## start main ############################# -my $help; -my @out; -my @estout; -my $infile; -my $trunc = 1; -my $outfile = "translate_tc.out"; -my $outpath = '.'; -my $limit = 20; ## this sets the maximum length for the sequence identifier. If sequence identifier are -## too long, then one can run into troubles with the parsing of the hmmsearch results. -######### -my $usage = "Name:\n\ttranslate.pl\n -Synopsis:\n\ttranslate_tc5.pl [-infile=FILE] [options] [-outfile=FILE]\n -Description:\n\tThis program takes a batch fasta-file with DNA -\tsequences as an input and translates the individual DNA sequences in -\tall six reading frames. -\t-infile: provide the relative or absolute path of the infile\n -\t-outfile: provide the relative or absolute path of the outfile -\tDefault is: translate_tc.out\n -\t-outpath: provide the path to the -\toutfile. Default is '.'\n -\ttrunc: set -trunc=0 to prevent truncation of the sequence header (see below). -\t-h: prints this help-message\n -NOTE: if the seq-id (everything up to the first [[:space:]]) contains a '|' everything between the '>' and the '|' will be taken as seq-id. Otherwise, the entire seq-id will be used. You can change this behavior by setting -trunc=0\n -NOTE: the script as an automated routine to check for unique sequence names in the input file. This may lead to cases where the $trunc value is overruled and additionally part of the sequence description may be included."; -########## - -GetOptions ( - "h" => \$help, - "infile=s" => \$infile, - "outfile=s" => \$outfile, - "outpath=s" => \$outpath, - "trunc=s" => \$trunc); -if ($help) { - print "$usage"; - exit; -} -if (-e "$outfile") { - print LOG "an outfile $outfile already exists. 
Renaming to $outfile.old\n\n"; - my $newname = "$outfile.old"; - rename($outfile, $newname); -} -#my @seq_object = read_all_sequences($infile, 'fasta'); - -open (LOG, ">>$outpath/hamstrsearch.log") or warn "could not open logfile for writing\n"; -print LOG "\n### TRANSLATE.PL: \n"; - -### changes suggested by Todd Oakley -my $tempseqio; -$tempseqio = Bio::SeqIO->new( '-file' => $infile, '-format' => 'fasta'); -my @seq_object; - -while( my $seq = $tempseqio->next_seq() ) { - $seq->alphabet('dna'); - push(@seq_object,$seq); -} -### End changes Todd Oakley - -## determine whether the seq-ids are unique given the chosen value for $trunc -my ($message, $cont, $check) = &checkIds(); -if ($cont == 1) { - ## the check for unique identifiers has failed and the programm is exiting - print LOG "$message\n"; - close LOG; - exit; -} -else { - print LOG "All sequence identifier are unique!\n"; - if ($check == 2) { - my $newname = "$infile.original"; - rename($infile, $newname); - print LOG "Sequence description was needed to make seq-id unique. 
The original version of the infile was stored in $infile.original\n"; - } - for (my $j = 0; $j < @seq_object; $j++) { - my $finalid = $seq_object[$j]->{finalid}; - my $estseq = $seq_object[$j]->seq; - my $inid = $seq_object[$j]->display_id; - my @all_trans = Bio::SeqUtils->translate_6frames($seq_object[$j]); - for (my $i = 0; $i < @all_trans; $i++) { - my $count = $i+1; - my $id = $all_trans[$i]->display_id; - my $seq = $all_trans[$i]->seq; - $id =~ s/$inid/$finalid/; - $id =~ s/-[0-9][RF]/_RF$count.0/; - push @out, ">$id\n$seq"; - } - push @estout, ">$finalid\n$estseq"; - if ($j%100 == 0) { - print "$j Sequences processed\n"; - open (OUT, ">>$outpath/$outfile") or die "failed to open outfile\n"; - print OUT join "\n", @out; - print OUT "\n"; - @out = qw(); - close OUT; - if ($check == 2) { - ## part of the description was added to the seq-id - open (OUT, ">>$infile"); - print OUT join "\n", @estout; - print OUT "\n"; - @estout = qw(); - } - } - } - open (OUT, ">>$outpath/$outfile") or die "failed to open outfile\n"; - print OUT join "\n", @out; - print OUT "\n"; - @out = qw(); - close OUT; - if ($check == 2) { - ## part of the description was added to the seq-id - open (OUT, ">>$infile"); - print OUT join "\n", @estout; - print OUT "\n"; - close OUT; - @estout = qw(); - } -} -close LOG; -exit; -########################## start sub ################ -sub checkIds { - my $message; - my $check = 1; - my $cont = 1; - my $counter; - ## Everything up to the first whitespace - ## in the fasta header will be taken as sequence id by bioperl. If this - ## id contains a '|' and $trunc is set to 1 (default), the ids may no longer - ## be unique. 
This will be checked and if necessary the id will not be truncated - ## for $check == 0, the truncated version of the id will be checked (only if $trunc == 1) - ## for $check == 1, the complete id will be checked - ## for $check == 2, the first 20 characters of the concatenated id and description - ## will be checked - if ($trunc == 1) { - $check = 0; - } - - while ($check < 3 and $cont == 1) { - $cont = 0; - for (my $i=0; $i < @seq_object; $i++) { - my $id = $seq_object[$i]->display_id; - $id =~ s/(.{0,$limit}).*/$1/; - if ($check == 0) { - $id =~ s/|.*//; - } - elsif ($check == 2) { - $id = $id . '_' . $seq_object[$i]->desc; - $id =~ s/(.{0,$limit}).*/$1/; - } - if (defined $counter->{$id}) { - if ($check == 0) { - $message = "trying next without truncating the id"; - } - elsif ($check == 1) { - $message = 'trying next to include sequence description'; - } - else { - $message = "Sequence identifier are not unique, using the first 20 characters. Aborting..."; - } - print LOG "sequence ids are not unique in the file $infile, $message. The offending identfier is $id\n\n"; - $check ++; - $cont = 1; - $counter = undef; - last; - } - else { - $counter->{$id} = 1; - $seq_object[$i]->{finalid} = $id; - } - } - } - ## return the value of $cont. If this is 1, then the sequence id check has failed. 
- return($message, $cont, $check); -} diff --git a/fdog/checkData.py b/fdog/checkData.py index 3aafe44..74b3b93 100644 --- a/fdog/checkData.py +++ b/fdog/checkData.py @@ -1,10 +1,10 @@ # -*- coding: utf-8 -*- ####################################################################### -# Copyright (C) 2020 Vinh Tran +# Copyright (C) 2022 Vinh Tran # # This script is used to check fdog data which are present in -# genome_dir, blast_dir and weight_dir +# searchTaxa_dir, coreTaxa_dir and annotation_dir # # This script is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of @@ -18,38 +18,27 @@ import sys import os +import errno import argparse from os import listdir from os.path import isfile, join from pathlib import Path import subprocess +import shutil from Bio import SeqIO +from ete3 import NCBITaxa import re from datetime import datetime -import csv +import multiprocessing as mp +from tqdm import tqdm +from pkg_resources import get_distribution +from Bio.Blast.Applications import NcbiblastpCommandline -def checkFileExist(file): - if not os.path.exists(os.path.abspath(file)): - sys.exit('%s not found' % file) -def countLine(file,pattern,contain): - nline = 0 - with open(file, 'r') as f: - for line in f: - if contain: - if pattern in line: - nline = nline + 1 - else: - if not pattern in line: - nline = nline + 1 - return(nline) +import fdog.libs.zzz as general_fn +import fdog.libs.blast as blast_fn +import fdog.libs.fasta as fasta_fn -def join2Lists(first_list, second_list): - in_first = set(first_list) - in_second = set(second_list) - in_second_but_not_in_first = in_second - in_first - out = first_list + list(in_second_but_not_in_first) - return(out) def checkOptConflict(concat, replace, delete): if concat: @@ -62,51 +51,65 @@ def checkOptConflict(concat, replace, delete): if delete: sys.exit('*** ERROR: only one option can be choose between "--replace" and "--delete"') -def checkValidFasta(file): + +def 
check_valid_fasta(file): + """ Check if valid fasta file + Check if: + (1) Input file is a fasta file + (2) If headers are longer than 30 characters + (3) If headers and sequences contain any space/tab + (4) If sequences are written in a single line + """ spaceChr = (' ', '\t') with open(file, 'r') as f: f_bkp = f - # check if input file a FASTA file + # check if input file is a FASTA file fasta = SeqIO.parse(f, 'fasta') if not any(fasta): - return('notFasta') + return({'notFasta': 1}) else: # check for long header inSeq = SeqIO.to_dict((SeqIO.parse(open(file), 'fasta'))) for id in inSeq: if len(id) > 30: - return('longHeader') + return({'longHeader': id}) # check space or tab if any(s in f.read() for s in spaceChr): return('space') # check single line - nHeader = countLine(file, '>', True) - nSeq = countLine(file, '>', False) + nHeader = general_fn.count_line(file, '>', True) + nSeq = general_fn.count_line(file, '>', False) if not nHeader == nSeq: - return('multiLine') - return('ok') + return({'multiLine': 1}) + return({'ok': 1}) + -def checkValidFolderName(folder): +def check_valid_folder_name(folder): + """ Check if folder name contains any special characters """ invalidChr = (' ','|','\t','\'','"','`','´','^','!','$','%','&') if any(e in folder for e in invalidChr): sys.exit('*** ERROR: Invalid character found in %s' % folder) -def checkValidSeqs(faFile): + +def check_valid_seqs(fa_file): + """ Check if any sequence contains space/tab or special characters """ spaceChr = (' ', '\t') - faSeq = SeqIO.parse(open(faFile),'fasta') + faSeq = SeqIO.parse(open(fa_file),'fasta') for fa in faSeq: id, seq = fa.description, str(fa.seq) c = '' if any(e in id for e in spaceChr): - sys.exit('*** ERROR: Invalid character found in \">%s\" in %s' % (id, faFile)) + sys.exit('*** ERROR: Invalid character found in \">%s\" in %s' % (id, fa_file)) if any(c for c in seq if not c.isalpha()): - print('*** ERROR: Invalid character "%s" found in the sequence of gene \"%s\" in %s' % (c, 
id, faFile)) + print('*** ERROR: Invalid character "%s" found in the sequence of gene \"%s\" in %s' % (c, id, fa_file)) sys.exit('You can use "--replace" or "--delete" to solve this issue!') -def rewriteSeqs(faFile, replace, delete): + +def rewrite_seqs(fa_file, replace, delete): + """ Rewrite fasta sequence by replacing or deleting special characters """ spaceChr = (' ', '\t') - faSeq = SeqIO.parse(open(faFile),'fasta') - with open(faFile + '.mod', 'w') as tmpOut: + faSeq = SeqIO.parse(open(fa_file),'fasta') + with open(fa_file + '.mod', 'w') as tmpOut: for fa in faSeq: id, seq = fa.description, str(fa.seq) if replace: @@ -114,174 +117,338 @@ def rewriteSeqs(faFile, replace, delete): if delete: seq = re.sub('[^a-zA-Z]', '', seq) tmpOut.write('>%s\n%s\n' % (id, seq)) - os.replace(faFile + '.mod', faFile) + os.replace(fa_file + '.mod', fa_file) -def writeCheckedFile(faFile): - with open(faFile+'.checked', 'w') as f: + +def write_faChecked(fa_file): + """ Add fa.checked file in searchTaxa_dir """ + with open(fa_file+'.checked', 'w') as f: f.write(str(datetime.now())) -def checkDataFolder(checkDir, replace, delete, concat): - taxaList = [] - for fd in listdir(checkDir): - if not fd.startswith('.'): - taxon = fd - checkValidFolderName(checkDir+'/'+taxon) - getFaCmd = 'ls %s/%s/%s.fa*' % (checkDir, taxon, taxon) - try: - faFiles = subprocess.check_output([getFaCmd], shell=True).decode(sys.stdout.encoding).strip().split('\n') - for faFile in faFiles: - if os.path.islink(faFile): - faFile = os.path.realpath(faFile) - checkFileExist(faFile) - if not '.mapping' in faFile: - if not '.checked' in faFile: - if not os.path.exists(faFile+".checked"): - checkFaFile = checkValidFasta(faFile) - if checkFaFile == 'notFasta': - sys.exit('*** ERROR: %s does not look like a fasta file!' % faFile) - elif checkFaFile == 'longHeader': - sys.exit('*** ERROR: %s contains long headers!' % faFile) - elif checkFaFile == 'space': - sys.exit('*** ERROR: %s contains spaces/tabs!' 
% faFile) - elif checkFaFile == 'multiLine': - if not concat: - print('*** ERROR: %s contains multiple-line sequences!' % faFile) - sys.exit('Please use "--concat" with "--replace" or "--delete" to join them into single lines') - else: - rewriteSeqs(faFile, replace, delete) - elif checkFaFile == 'ok': - if not (delete or replace): - checkValidSeqs(faFile) - else: - rewriteSeqs(faFile, replace, delete) - writeCheckedFile(faFile) - print(fd) - taxaList.append(fd) - except subprocess.CalledProcessError as e: - print('*** ERROR: Problem while searching for fasta file') - print(e.output.decode(sys.stdout.encoding)) - sys.exit() - return(taxaList) - -def checkMissingJson(weightDir, taxaList): - allAnno = [f for f in listdir(weightDir) if isfile(join(weightDir, f))] - taxaAnno = [s + '.json' for s in taxaList] - s = set(allAnno) - missingAnno = [x for x in taxaAnno if x not in s] - return(missingAnno) - -def checkCompleteAnno(weightDir, genomeDir): - allAnno = [f for f in listdir(weightDir) if isfile(join(weightDir, f))] - for f in allAnno: + +def check_fasta(args): + """ Check fasta file in searchTaxa_dir and coreTaxa_dir """ + (taxon, file, checkDir, replace, delete, concat) = args + fa_file = '%s/%s/%s' % (checkDir, taxon, file) + if os.path.islink(fa_file): + fa_file = os.path.realpath(fa_file) + general_fn.check_file_exist(fa_file) + checkfa_file = check_valid_fasta(fa_file) + if not os.path.exists('%s.checked' % fa_file): + if list(checkfa_file.keys())[0] == 'notFasta': + sys.exit('*** ERROR: %s does not look like a fasta file!' % fa_file) + elif list(checkfa_file.keys())[0] == 'longHeader': + sys.exit('*** ERROR: %s contains long headers! E.g. %s' % (fa_file, list(checkfa_file.values())[0])) + elif list(checkfa_file.keys())[0] == 'space': + sys.exit('*** ERROR: %s contains spaces/tabs!' % fa_file) + elif list(checkfa_file.keys())[0] == 'multiLine': + if not concat: + print('*** ERROR: %s contains multiple-line sequences!' 
% fa_file) + sys.exit('Please use "--concat" with "--replace" or "--delete" to join them into single lines') + else: + rewrite_seqs(fa_file, replace, delete) + elif list(checkfa_file.keys())[0] == 'ok': + if not (delete or replace): + check_valid_seqs(fa_file) + else: + rewrite_seqs(fa_file, replace, delete) + write_faChecked(fa_file) + if not os.path.exists('%s.fai' % fa_file): + fasta_fn.read_fasta(fa_file) + return(taxon) + + +def run_check_fasta(checkDir, replace, delete, concat): + """ Run check_fasta fn """ + jobs = [] + for taxon in general_fn.read_dir(checkDir): + check_valid_folder_name('%s/%s' % (checkDir, taxon)) + for file in listdir('%s/%s' % (checkDir, taxon)): + if file.endswith('.fa'): + jobs.append([taxon, file, checkDir, replace, delete, concat]) + cpus = mp.cpu_count()-1 + pool = mp.Pool(cpus) + taxon_list = [] + for _ in tqdm(pool.imap_unordered(check_fasta, jobs), total=len(jobs)): + taxon_list.append(_) + return(taxon_list) + + +def check_blastdb(args): + """ Check for outdated blastdb """ + (query, taxon, coreTaxa_dir, searchTaxa_dir) = args + blast_db = '%s/%s/%s' % (coreTaxa_dir, taxon, taxon) + try: + blastp_cline = NcbiblastpCommandline(query = query, db = blast_db) + stdout, stderr = blastp_cline() + except: + return([query, blast_db]) + fai_in_genome = "%s/%s/%s.fa.fai" % (searchTaxa_dir, taxon, taxon) + fai_in_blast = "%s/%s/%s.fa.fai" % (coreTaxa_dir, taxon, taxon) + # check if fai_in_blast is a valid symlink + if os.path.islink(fai_in_blast): + if not os.path.exists(os.readlink(fai_in_blast)): + if os.path.exists(fai_in_genome): + try: + os.remove('%s/%s/%s.fa.fai' % (coreTaxa_dir, taxon, taxon)) + except OSError as e: + if e.errno != errno.ENOENT: + raise + os.symlink(fai_in_genome, fai_in_blast) + # or that file doesn't exist + else: + if not os.path.exists(fai_in_blast): + if os.path.exists(fai_in_genome): + os.symlink(fai_in_genome, fai_in_blast) + + +def run_check_blastdb(coreTaxa_dir, searchTaxa_dir, fdogPath): + """ Run 
check_blastdb fn """ + query = '%s/data/infile.fa' % fdogPath + jobs = [] + for fd in general_fn.read_dir(coreTaxa_dir): + jobs.append([query, fd, coreTaxa_dir, searchTaxa_dir]) + cpus = mp.cpu_count()-1 + pool = mp.Pool(cpus) + out = [] + for _ in tqdm(pool.imap_unordered(check_blastdb, jobs), total=len(jobs)): + out.append(_) + return([1]) + + +def create_blastdb(args): + """ Redo (or update) blastdb """ + (taxon, coreTaxa_dir, searchTaxa_dir, outPath) = args + fa_file = '%s/%s/%s.fa' % (coreTaxa_dir, taxon, taxon) + if os.path.islink(fa_file): + fa_file = os.path.realpath(fa_file) + if not os.path.exists(fa_file): + fa_file = '%s/%s/%s.fa' % (searchTaxa_dir, taxon, taxon) + if os.path.exists(fa_file): + ### remove old files + blast_path = '%s/%s' % (coreTaxa_dir, taxon) + shutil.rmtree(blast_path) + ### Redo blastdb + Path(blast_path).mkdir(parents = True, exist_ok = True) + blast_fn.make_blastdb([taxon, fa_file, outPath, coreTaxa_dir, searchTaxa_dir, True]) + ### make symlink to fasta files + fa_in_genome = "%s/%s/%s.fa" % (searchTaxa_dir, taxon, taxon) + fai_in_genome = "%s/%s/%s.fa.fai" % (searchTaxa_dir, taxon, taxon) + fa_in_blast = "%s/%s.fa" % (blast_path, taxon) + fai_in_blast = "%s/%s.fa.fai" % (blast_path, taxon) + if not os.path.exists(fa_in_blast): + os.symlink(fa_in_genome, fa_in_blast) + if not os.path.exists(fai_in_blast): + os.symlink(fai_in_genome, fai_in_blast) + return None + else: + return(taxon) + + +def run_create_blastdb(coreTaxa_dir, searchTaxa_dir): + """ Run create_blastdb fn """ + outPath = '/'.join(coreTaxa_dir.split('/')[0:-1]) + jobs = [] + for fd in general_fn.read_dir(coreTaxa_dir): + jobs.append([fd, coreTaxa_dir, searchTaxa_dir, outPath]) + cpus = mp.cpu_count()-1 + pool = mp.Pool(cpus) + out = [] + for _ in tqdm(pool.imap_unordered(create_blastdb, jobs), total=len(jobs)): + out.append(_) + return([i for i in out if i is not None]) + + +def check_missing_json(annotation_dir, taxon_list): + """ Check missing annotation for any 
taxa in coreTaxa_dir and searchTaxa_dir """ + all_anno = [f for f in listdir(annotation_dir) if isfile(join(annotation_dir, f))] + taxa_anno = [s + '.json' for s in taxon_list] + s = set(all_anno) + missing_anno = [x for x in taxa_anno if x not in s] + return(missing_anno) + + +def check_complete_anno(args): + """ Check if an annotation is complete + I.e. if it contains annotation for all proteins of a species + """ + (gf,jf, annotation_dir, updateJson) = args + cmd = 'fas.checkAnno -s %s -a %s -o %s --noAnno' % (gf, jf, annotation_dir) + if updateJson: + cmd = '%s --update' % cmd + try: + subprocess.call([cmd], shell = True, stdout=subprocess.DEVNULL) + except subprocess.CalledProcessError as e: + print('*** ERROR: Problem while checking annotation file using fas.checkAnno!') + print(e.output.decode(sys.stdout.encoding)) + sys.exit() + + +def run_check_complete_anno(annotation_dir, searchTaxa_dir, coreTaxa_dir, updateJson): + """ Run check_complete_anno fn """ + all_anno = [f for f in listdir(annotation_dir) if isfile(join(annotation_dir, f))] + jobs = [] + for f in all_anno: tax = f.replace('.json', '') - print('...check annotations for %s' % tax) - jf = '%s/%s.json' % (weightDir, tax) - gf = '%s/%s/%s.fa' % (genomeDir, tax, tax) - cmd = 'fas.checkAnno -s %s -a %s -o %s' % (gf, jf, weightDir) + # print('...check annotations for %s' % tax) + jf = '%s/%s.json' % (annotation_dir, tax) + gf = '%s/%s/%s.fa' % (searchTaxa_dir, tax, tax) + if not os.path.exists(gf): + gf = '%s/%s/%s.fa' % (coreTaxa_dir, tax, tax) + jobs.append([gf,jf, annotation_dir, updateJson]) + cpus = mp.cpu_count()-1 + pool = mp.Pool(cpus) + out = [] + for i in jobs: + check_complete_anno(i) + # for _ in tqdm(pool.imap_unordered(check_complete_anno, jobs), total=len(jobs)): + # out.append(_) + return None + + +def check_missing_ncbiID(taxon_list): + """ Check all taxa in searchTaxa_dir and coreTaxa_dir + if they are have valid NCBI taxonomy IDs + """ + ncbi = NCBITaxa() + missing_taxa = {} + 
present_taxa = {} + dup_taxa = [] + for t in taxon_list: + tax_id = t.split('@')[1] + try: + taxid2name = ncbi.get_taxid_translator([tax_id]) + if len(taxid2name) < 1: + if not t+'\t'+str(tax_id) in missing_taxa: + missing_taxa[t+'\t'+str(tax_id)] = 1 + except: + if not t+'\t'+str(tax_id) in missing_taxa: + missing_taxa[t+'\t'+str(tax_id)] = 1 + if not tax_id in present_taxa: + present_taxa[tax_id] = t + else: + dup_taxa.append('%s\t%s' % (t, present_taxa[tax_id])) + return(missing_taxa.keys(), dup_taxa) + + +def run_check(args): + (searchTaxa_dir, coreTaxa_dir, annotation_dir, replace, delete, concat, reblast, updateJson, ignoreAnno) = args + checkOptConflict(concat, replace, delete) + caution = 0 + + ### get fdog dir and assign searchTaxa_dir, coreTaxa_dir, annotation_dir if not given + fdogPath = os.path.realpath(__file__).replace('/checkData.py','') + if not searchTaxa_dir or not coreTaxa_dir or not annotation_dir: + pathconfigFile = fdogPath + '/bin/pathconfig.yml' + if not os.path.exists(pathconfigFile): + sys.exit('No pathconfig.yml found. 
Please run fdog.setup (https://github.com/BIONF/fDOG/wiki/Installation#setup-fdog).') + cfg = general_fn.load_config(pathconfigFile) try: - subprocess.call([cmd], shell = True) - except subprocess.CalledProcessError as e: - print('*** ERROR: Problem while checking annotation file using fas.checkAnno!') - print(e.output.decode(sys.stdout.encoding)) - sys.exit() - -def checkMissingNcbiID(namesDmp, taxaList): - ncbiId = {} - with open(namesDmp, 'r') as f: - lines = f.readlines() - for x in lines: - taxId = x.split('\t')[0] - if not taxId in ncbiId: - ncbiId[taxId] = 1 - f.close() - missingTaxa = {} - presentTaxa = {} - dupTaxa = [] - for t in taxaList: - taxId = t.split('@')[1] - if not taxId in ncbiId: - if not t+'\t'+str(taxId) in missingTaxa: - missingTaxa[t+'\t'+str(taxId)] = 1 - if not taxId in presentTaxa: - presentTaxa[taxId] = t + dataPath = cfg['dataPath'] + except: + dataPath = os.getcwd() + + if not searchTaxa_dir: + try: + searchTaxa_dir = cfg['searchpath'] + except: + searchTaxa_dir = dataPath + '/searchTaxa_dir' + if not coreTaxa_dir: + try: + coreTaxa_dir = cfg['corepath'] + except: + coreTaxa_dir = dataPath + "/coreTaxa_dir" + if not annotation_dir: + try: + annotation_dir = cfg['annopath'] + except: + annotation_dir = dataPath + "/annotation_dir" + + searchTaxa_dir = os.path.abspath(searchTaxa_dir) + coreTaxa_dir = os.path.abspath(coreTaxa_dir) + annotation_dir = os.path.abspath(annotation_dir) + + ### check searchTaxa_dir + print('=> Checking %s...' 
% searchTaxa_dir) + search_taxa = run_check_fasta(searchTaxa_dir, replace, delete, concat) + + ### check coreTaxa_dir + if reblast: + print('=> (Re-)Creating blastDBs...') + failed_blast = run_create_blastdb(coreTaxa_dir, searchTaxa_dir) + if len(failed_blast) > 0: + print('*** WARNING: Some BlastDBs cannot be created:\n%s' % ', '.join(failed_blast)) else: - dupTaxa.append('%s\t%s' % (t, presentTaxa[taxId])) - return(missingTaxa.keys(), dupTaxa) + print('All old BlastDBs have been updated!') + print('=> Checking %s...' % coreTaxa_dir) + core_taxa = run_check_fasta(coreTaxa_dir, replace, delete, concat) + check_blast = run_check_blastdb(coreTaxa_dir, searchTaxa_dir, fdogPath) + + if not check_blast[0] == 1: + print('*** ERROR: Version incompatible between BlastDB and BLAST program!') + print('For checking, run: blastp -query %s -db %s' % (check_blast[0], check_blast[1])) + print('Consider using --reblast option to update old BlastDBs!') + sys.exit() + + ### check annotation_dir + if not ignoreAnno: + print('=> Checking %s...' 
% annotation_dir) + missing_anno = check_missing_json(annotation_dir, general_fn.join_2lists(search_taxa, core_taxa)) + if len(missing_anno) > 0: + print('\033[92m*** WARNING: Annotation files not found for:\033[0m') + print(*missing_anno, sep = "\n") + print('NOTE: You still can run fdog without FAS using the option "-fasoff"') + caution = 1 + run_check_complete_anno(annotation_dir, searchTaxa_dir, coreTaxa_dir, updateJson) + + ### check ncbi IDs + print('=> Checking NCBI taxonomy IDs...') + missing_taxa, dup_taxa = check_missing_ncbiID(general_fn.join_2lists(search_taxa, core_taxa)) + if (len(missing_taxa) > 0): + print('\033[92m*** WARNING: Taxa not found in current local NCBI taxonomy database:\033[0m') + print(*missing_taxa, sep = "\n") + print('==> NOTE: You still can run fDOG with those taxa, but they will not be included in the core set compilation!') + caution = 1 + if (len(dup_taxa) > 0): + print('\033[92m*** WARNING: These taxa have the same NCBI taxonomy IDs:\033[0m') + print(*dup_taxa, sep = "\n") + print('==> NOTE: This could lead to some conflicts!') + caution = 1 + print('---------------------------------') + return(caution) def main(): - version = '0.0.6' - parser = argparse.ArgumentParser(description='You are running fdog.checkData version ' + str(version) + '.') - parser.add_argument('-g', '--genomeDir', help='Path to search taxa directory (e.g. fdog_dataPath/genome_dir)', action='store', default='') - parser.add_argument('-b', '--blastDir', help='Path to blastDB directory (e.g. fdog_dataPath/blast_dir)', action='store', default='') - parser.add_argument('-w', '--weightDir', help='Path to feature annotation directory (e.g. fdog_dataPath/weight_dir)', action='store', default='') + version = get_distribution('fdog').version + parser = argparse.ArgumentParser(description='You are running fDOG version ' + str(version) + '.') + parser.add_argument('-s', '--searchTaxa_dir', help='Path to search taxa directory (e.g. 
fdog_dataPath/searchTaxa_dir)', action='store', default='') + parser.add_argument('-c', '--coreTaxa_dir', help='Path to blastDB directory (e.g. fdog_dataPath/coreTaxa_dir)', action='store', default='') + parser.add_argument('-a', '--annotation_dir', help='Path to feature annotation directory (e.g. fdog_dataPath/annotation_dir)', action='store', default='') parser.add_argument('--replace', help='Replace special characters in sequences by "X"', action='store_true', default=False) parser.add_argument('--delete', help='Delete special characters in sequences', action='store_true', default=False) parser.add_argument('--concat', help='Concatenate multiple-line sequences into single-line', action='store_true', default=False) + parser.add_argument('--reblast', help='Re-create blast databases', action='store_true', default=False) + parser.add_argument('--updateJson', help='Update annotation json file to FAS >=1.16', action='store_true', default=False) + parser.add_argument('--ignoreAnno', help='Do not check annotations', action='store_true', default=False) ### get arguments args = parser.parse_args() - genomeDir = args.genomeDir - blastDir = args.blastDir - weightDir = args.weightDir + searchTaxa_dir = args.searchTaxa_dir + coreTaxa_dir = args.coreTaxa_dir + annotation_dir = args.annotation_dir replace = args.replace delete = args.delete concat = args.concat + reblast = args.reblast + updateJson = args.updateJson + ignoreAnno = args.ignoreAnno - checkOptConflict(concat, replace, delete) - caution = 0 - - ### get fdog dir and assign genomeDir, blastDir, weightDir if not given - fdogPath = os.path.realpath(__file__).replace('/checkData.py','') - pathconfigFile = fdogPath + '/bin/pathconfig.txt' - if not os.path.exists(pathconfigFile): - sys.exit('No pathconfig.txt found. 
Please run fdog.setup (https://github.com/BIONF/fDOG/wiki/Installation#setup-fdog).') - with open(pathconfigFile) as f: - dataPath = f.readline().strip() - if not genomeDir: - genomeDir = dataPath + "/genome_dir" - if not blastDir: - blastDir = dataPath + "/blast_dir" - if not weightDir: - weightDir = dataPath + "/weight_dir" - - ### check genomeDir and blastDir - print('=> Checking %s...' % genomeDir) - genomeTaxa = checkDataFolder(os.path.abspath(genomeDir), replace, delete, concat) - print('=> Checking %s...' % blastDir) - blastTaxa = checkDataFolder(os.path.abspath(blastDir), replace, delete, concat) - - ### check weightDir - print('=> Checking %s...' % weightDir) - missingAnno = checkMissingJson(weightDir, join2Lists(genomeTaxa, blastTaxa)) - if len(missingAnno) > 0: - print('\033[92m*** WARNING: Annotation files not found for:\033[0m') - print(*missingAnno, sep = "\n") - print('NOTE: You still can run fdog without FAS using the option "-fasoff"') - caution = 1 - checkCompleteAnno(weightDir, genomeDir) - - ### check ncbi IDs - print('=> Checking NCBI taxonomy IDs...') - namesDmp = fdogPath + '/taxonomy/names.dmp' - checkFileExist(namesDmp) - missingTaxa, dupTaxa = checkMissingNcbiID(namesDmp, join2Lists(genomeTaxa, blastTaxa)) - if (len(missingTaxa) > 0): - print('\033[92m*** WARNING: Taxa not found in current fdog\'s NCBI taxonomy database:\033[0m') - print(*missingTaxa, sep = "\n") - print('NOTE: You still can run fdog, but they will not be included in the core set compilation!') - caution = 1 - if (len(dupTaxa) > 0): - print('\033[92m*** WARNING: These taxa have the same NCBI taxonomy IDs:\033[0m') - print(*dupTaxa, sep = "\n") - print('NOTE: This could lead to some conflicts!') - caution = 1 - - print('---------------------------------') + caution = run_check([searchTaxa_dir, coreTaxa_dir, annotation_dir, replace, delete, concat, reblast, updateJson, ignoreAnno]) if caution == 1: - print('Done! Data are ready to use with caution!') + print('==> Done! 
Data are ready to use WITH CAUTION!') else: - print('Done! Data are ready to use!') + print('==> Done! Data are ready to use!') + if __name__ == '__main__': main() diff --git a/fdog/data/conda_requirements.yml b/fdog/data/conda_requirements.yml new file mode 100644 index 0000000..d5deb2b --- /dev/null +++ b/fdog/data/conda_requirements.yml @@ -0,0 +1,8 @@ +blast +hmmer +fasta3=36.3.8i +clustalw +mafft +muscle=5.1 +augustus=3.5.0 +metaeuk diff --git a/fdog/data/dependencies.txt b/fdog/data/dependencies.txt new file mode 100644 index 0000000..28d26a4 --- /dev/null +++ b/fdog/data/dependencies.txt @@ -0,0 +1,9 @@ +ncbi-blast+ +hmmer +clustalw +mafft +muscle +augustus +metaeuk +hmmemit +tblastn diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 9c0dc6b..d1a6f3a 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -29,6 +29,8 @@ import time import shutil import multiprocessing as mp +import fdog.libs.alignment as align_fn +from tqdm import tqdm ########################### functions ########################################## def check_path(path): @@ -269,7 +271,7 @@ def augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, aug def metaeuk_single(regions, candidatesOutFile, length_extension, ass_name, group, tmp_path, mode, db): output = open(candidatesOutFile, "w") region = open(candidatesOutFile.replace(".candidates.fa", ".regions.txt"), "w") - region.write("Conting/scaffold" + "\t" + "start" + "\t" + "end" + "\n") + region.write("Contig/scaffold" + "\t" + "start" + "\t" + "end" + "\n") for key in regions: locations = regions[key] @@ -281,9 +283,9 @@ def metaeuk_single(regions, candidatesOutFile, length_extension, ass_name, group end = str(i[1] + length_extension) name = key + "_" + str(counter) file, start, end = extract_sequence_from_to(tmp_path + name, tmp_path + key + ".fasta", start, end) - region.write(file + "\t" + str(start) + "\t" + str(end)) + region.write(file + "\t" + str(start) + "\t" + str(end) + "\n") #metaeuk 
call - cmd = "metaeuk easy-predict " + file + " " + db + " " + tmp_path + name + " " + tmp_path + "/metaeuk --min-exon-aa 5 --max-overlap 5 --min-intron 1 --overlap 1" + cmd = "metaeuk easy-predict " + file + " " + db + " " + tmp_path + name + " " + tmp_path + "/metaeuk --min-exon-aa 5 --max-overlap 5 --min-intron 1 --overlap 1" #print(cmd) # other parameteres used by BUSCO with metazoa set--max-intron 130000 --max-seq-len 160000 --min-exon-aa 5 --max-overlap 5 --min-intron 1 --overlap 1 starting_subprocess(cmd, mode) @@ -355,12 +357,14 @@ def getSeedInfo(path): del seq_records return dic -def checkCoOrthologs(candidate_name, best_hit, ref, fdog_ref_species, candidatesOutFile, msaTool, matrix, dataPath, tmp_path): +def checkCoOrthologs(candidate_name, best_hit, ref, fdog_ref_species, candidatesOutFile, msaTool, matrix, dataPath, tmp_path, mode='silent'): ###########getting sequences and write all in one file to make msa ######### name_file = candidate_name + ".co" - output_file = tmp_path + name_file + '.fasta' + output_file = tmp_path + name_file + '.fa' aln_file = tmp_path + name_file + '.aln' - genome_dir_path = dataPath + '/genome_dir/%s/%s.fa'%(fdog_ref_species, fdog_ref_species) + genome_dir_path = dataPath + '/searchTaxa_dir/%s/%s.fa'%(fdog_ref_species, fdog_ref_species) + if not os.path.exists(genome_dir_path): + genome_dir_path = dataPath + '/genome_dir/%s/%s.fa'%(fdog_ref_species, fdog_ref_species) #print(searchTool) out = open(output_file, "w") @@ -380,15 +384,20 @@ def checkCoOrthologs(candidate_name, best_hit, ref, fdog_ref_species, candidates out.close() if msaTool == "muscle": - os.system("muscle -quiet -in " + output_file + " -out " + aln_file) - #print("muscle -quiet -in " + output_file + " -out " + aln_file) + if align_fn.get_muscle_version(msaTool) == 'v3': + cmd = "muscle -quiet -in " + output_file + " -out " + aln_file + else: + cmd = "muscle -align " + output_file + " -output " + aln_file + starting_subprocess(cmd, mode) if not 
os.path.exists(aln_file): - print("Muscle failed for " + candidate_name + ". Making MSA with Mafft-linsi.") - os.system('mafft --maxiterate 1000 --localpair --anysymbol --quiet ' + output_file + ' > ' + aln_file) + print("Muscle failed for %s. Making MSA with Mafft-linsi." % (candidate_name)) + cmd = 'mafft --maxiterate 1000 --localpair --anysymbol --quiet ' + output_file + ' > ' + aln_file + starting_subprocess(cmd, mode) elif msaTool == "mafft-linsi": #print("mafft-linsi") - os.system('mafft --maxiterate 1000 --localpair --anysymbol --quiet ' + output_file + ' > ' + aln_file) + cmd = 'mafft --maxiterate 1000 --localpair --anysymbol --quiet ' + output_file + ' > ' + aln_file + starting_subprocess(cmd, mode) try: distances = get_distance_biopython(aln_file, matrix) @@ -400,8 +409,6 @@ def checkCoOrthologs(candidate_name, best_hit, ref, fdog_ref_species, candidates #print("Failure in distance computation, Candidate %s will be rejected" % candidate_name) return 0, "NaN", "NaN" - - #distance_hit_query = distances[best_hit, candidate_name] #distance_ref_hit = distances[best_hit, ref] @@ -420,7 +427,11 @@ def backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, eva #print(fasta_path) orthologs = [] #print(seedDic) - blast_dir_path = dataPath + "/blast_dir/" + blast_dir_path = dataPath + "/coreTaxa_dir/" + #print(blast_dir_path) + if not os.path.exists(blast_dir_path): + blast_dir_path = dataPath + "/blast_dir/" + #print(blast_dir_path) if strict != True: seed = [fdog_ref_species] try: @@ -631,8 +642,14 @@ def cleanup(tmp, tmp_path): if time.time() > timeout: print("tmp folder could not be removed!") break + else: + # clean up whole contigs + for root, dirs, files in os.walk(tmp_path): + for file in files: + if file.endswith(".fasta"): + os.remove(os.path.join(root, file)) -def coorthologs(candidate_names, tmp_path, candidatesFile, fasta, fdog_ref_species, msaTool, matrix): +def coorthologs(candidate_names, tmp_path, candidatesFile, fasta, 
fdog_ref_species, msaTool, matrix, mode='silent'): if len(candidate_names) == 1: return candidate_names @@ -662,9 +679,19 @@ def coorthologs(candidate_names, tmp_path, candidatesFile, fasta, fdog_ref_speci f.close() if msaTool == "muscle": - os.system("muscle -quiet -in " + out + " -out " + aln_file) + if align_fn.get_muscle_version(msaTool) == 'v3': + cmd = "muscle -quiet -in %s -out %s" % (out, aln_file) + #print("muscle -quiet -in " + output_file + " -out " + aln_file) + else: + cmd = "muscle -align %s -output %s" % (out, aln_file) + starting_subprocess(cmd, mode) + if not os.path.exists(aln_file): + print("Muscle failed for %s. Making MSA with Mafft-linsi." % (aln_file)) + cmd = 'mafft --maxiterate 1000 --localpair --anysymbol --quiet ' + out + ' > ' + aln_file + starting_subprocess(cmd, mode) elif msaTool == "mafft-linsi": - os.system('mafft --maxiterate 1000 --localpair --anysymbol --quiet ' + out + ' > ' + aln_file) + cmd = 'mafft --maxiterate 1000 --localpair --anysymbol --quiet %s > %s'% (out, aln_file) + starting_subprocess(cmd, mode) distances = get_distance_biopython(aln_file, matrix) @@ -791,12 +818,24 @@ def blockProfiles(core_path, group, mode, out): ######################## paths ################################ msa_path = core_path + "/" + group +"/"+ group + ".aln" - check_path(msa_path) + if not os.path.exists(msa_path): + fasta_path = core_path + "/" + group +"/"+ group + ".fa" + check_path(fasta_path) + if msaTool == "muscle": + if align_fn.get_muscle_version(msaTool) == 'v3': + cmd= "muscle -quiet -in " + fasta_path + " -out " + msa_path + #print("muscle -quiet -in " + output_file + " -out " + aln_file) + else: + cmd = "muscle -quiet -align" + fasta_path + " -out " + msa_path + elif msaTool == "mafft-linsi": + cmd = 'mafft --maxiterate 1000 --localpair --anysymbol --quiet ' + fasta_path + ' > ' + msa_path + starting_subprocess(cmd, mode) + profile_path = out + "/tmp/" + group + ".prfl" ######################## block profile 
##################################### - print("Building a block profile ...") + print("Building a block profile ...", flush=True) cmd = 'msa2prfl.pl ' + msa_path + ' --setname=' + group + ' >' + profile_path starting_subprocess(cmd, 'silent') @@ -809,7 +848,7 @@ def blockProfiles(core_path, group, mode, out): starting_subprocess(cmd, mode) cmd = 'msa2prfl.pl ' + new_path + ' --setname=' + group + ' >' + profile_path starting_subprocess(cmd, 'silent') - print(" \t ...finished \n") + print(" \t ...finished \n", flush=True) return profile_path @@ -847,7 +886,7 @@ def main(): #################### handle user input ##################################### start = time.time() - version = '0.1.3' + version = '0.1.4' ################### initialize parser ###################################### parser = argparse.ArgumentParser(description='You are running fdog.assembly version ' + str(version) + '.') parser.add_argument('--version', action='version', version=str(version)) @@ -969,18 +1008,14 @@ def main(): sys.exit() elif force == True: shutil.rmtree(out + '/' + group, ignore_errors=True) - refBool = False os.system('mkdir ' + out + '/' + group + ' >/dev/null 2>&1') out = out + '/' + group + '/' elif append == True: out = out + '/' + group + '/' - refBool = True - else: - refBool = False # checks if sequences of reference species were already part of the extended.fa file + else: os.system('mkdir ' + out + '/' + group + ' >/dev/null 2>&1') out = out + '/' + group + '/' - refBool = False if core_path == '': core_path = out + '/core_orthologs/' @@ -1008,7 +1043,8 @@ def main(): sys.stderr = f sys.stdout = f else: - sys.stdout = Logger(f) + pass + #sys.stdout = Logger(f) ########################### other variables ################################ if searchTaxa == []: @@ -1046,8 +1082,8 @@ def main(): cmd = 'mkdir ' + out + '/tmp' starting_subprocess(cmd, 'silent') - print("Gene: " + group) - print("fDOG reference species: " + fdog_ref_species + " \n") + print("Gene: " + group, 
flush=True) + print("fDOG reference species: " + fdog_ref_species + " \n",flush=True) ###################### preparations ######################################## @@ -1096,6 +1132,30 @@ def main(): for k in output_ortholog_search: print(k) + #results = (pool.imap_unordered(ortholog_search_tblastn, calls)) + #pool.close() + #pool.join() + print("Searching for orthologs ...", flush=True) + for i in tqdm(pool.imap_unordered(ortholog_search_tblastn, calls),total=len(calls)): + ortholog_sequences.append([i[0], i[1]]) + if mode == 'debug': + for k in i[2]: + print(k) + #for i in results: + #ortholog_sequences.append([i[0], i[1]]) + #for k in i[2]: + #print(k) + print("\t ...finished \n", flush=True) + else: + ###################### computation species wise ################ + for asName in tqdm(assembly_names): + args = [asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs, gene_prediction, metaeuk_db] + reciprocal_sequences, candidatesOutFile, output_ortholog_search = ortholog_search_tblastn(args) + ortholog_sequences.append([reciprocal_sequences, candidatesOutFile]) + if mode == 'debug': + for k in output_ortholog_search: + print(k) + time_ortholog_end = time.time() time_ortholog = time_ortholog_end - time_ortholog_start @@ -1118,6 +1178,7 @@ def main(): tmp_path = out + '/tmp/' fas_seed_id = createFasInput(orthologsOutFile, mappingFile) cmd = 'fas.run --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --tsv --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group + #print(cmd) starting_subprocess(cmd, 'silent') clean_fas(out + group + "_forward.domains", 'domains') clean_fas(out + group + "_reverse.domains", 'domains') diff --git a/fdog/bin/__init__.py 
b/fdog/libs/__init__.py similarity index 100% rename from fdog/bin/__init__.py rename to fdog/libs/__init__.py diff --git a/fdog/libs/addtaxon.py b/fdog/libs/addtaxon.py new file mode 100644 index 0000000..995eeb1 --- /dev/null +++ b/fdog/libs/addtaxon.py @@ -0,0 +1,202 @@ +# -*- coding: utf-8 -*- + +####################################################################### +# Copyright (C) 2022 Vinh Tran +# +# This file is part of fDOG tool https://github.com/BIONF/fDOG +# +# This script is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for +# more details +# +# Contact: tran@bio.uni-frankfurt.de +# +####################################################################### + +import sys +import os +from pathlib import Path +from Bio import SeqIO +import subprocess +from ete3 import NCBITaxa +import re +from datetime import datetime +from collections import OrderedDict + +import fdog.libs.zzz as general_fn +import fdog.libs.blast as blast_fn +import fdog.libs.fasta as fasta_fn +import fdog.libs.tree as tree_fn + +##### FUNCTIONS RELATED TO ADDING NEW TAXON TO FDOG DATABASE ##### + +def check_conflict_opts(replace, delete): + """ Check if both replace and delete option are specified """ + if delete: + if replace: + sys.exit('*** ERROR: only one option can be choose between "--replace" and "--delete"') + if replace: + if delete: + sys.exit('*** ERROR: only one option can be choose between "--replace" and "--delete"') + + +def get_paths(outPath, fdogPath, searchpath, corepath, annopath): + """ Get path to searchTaxa_dir, coreTaxa_dir and annotation_dir """ + if outPath == '': + pathconfigFile = fdogPath + '/bin/pathconfig.yml' + if not os.path.exists(pathconfigFile): + sys.exit('No pathconfig.yml found. 
Please run fdog.setup (https://github.com/BIONF/fDOG/wiki/Installation#setup-fdog).') + cfg = general_fn.load_config(pathconfigFile) + try: + outPath = cfg['dataPath'] + except: + try: + corepath = cfg['corepath'] + except: + pass + try: + searchpath = cfg['searchpath'] + except: + pass + try: + annopath = cfg['annopath'] + except: + pass + + outPath = os.path.abspath(outPath) + if not searchpath: + searchpath = outPath + '/searchTaxa_dir/' + searchpath = os.path.abspath(searchpath) + if not corepath: + corepath = outPath + '/coreTaxa_dir/' + corepath = os.path.abspath(corepath) + if not annopath: + annopath = outPath + '/annotation_dir/' + annopath = os.path.abspath(annopath) + return(outPath, searchpath, corepath, annopath) + +def create_folders(searchpath, corepath, annopath, spec_name, coreTaxa, noAnno): + """ Create searchTaxa_dir, coreTaxa_dir and annotation_dir in output folder """ + Path(searchpath).mkdir(parents = True, exist_ok = True) + genome_path = '%s/%s' % (searchpath, spec_name) + Path(genome_path).mkdir(parents = True, exist_ok = True) + if coreTaxa: + Path(corepath).mkdir(parents = True, exist_ok = True) + if not noAnno: + Path(annopath).mkdir(parents = True, exist_ok = True) + return(genome_path) + + +def generate_spec_name(tax_id, name, ver): + """ Create species name with the format @@ """ + if name == "": + ncbi_name = tree_fn.check_tax_id(tax_id) + name = tree_fn.abbr_ncbi_name(ncbi_name) + return(name+'@'+tax_id+'@'+ver) + + +def create_genome(args): + """ Create fa and fai in searchTaxa_dir """ + (faIn, genome_path, spec_name, force, replace, delete) = args + ### load fasta seq + in_seq = SeqIO.to_dict((SeqIO.parse(open(faIn), 'fasta'))) + if not os.path.exists(genome_path): + Path(genome_path).mkdir(parents = True, exist_ok = True) + genome_file = '%s/%s.fa' % (genome_path, spec_name) + if (not os.path.exists(os.path.abspath(genome_file))) or (os.stat(genome_file).st_size == 0) or force: + f = open(genome_file, 'w') + pipe = 0 + long_id = 
0 + mod_id_index = 0 + id_dict = {} # id_dict[ori_id] = mod_id + for id in in_seq: + ori_id = id + seq = str(in_seq[id].seq) + ### check if ID contains empty char or pipe + if ' ' in id: + sys.exit('\033[91mERROR: Sequence IDs (e.g. %s) must not contain space(s)!\033[0m' % id) + else: + if '|' in id: + tmp = re.split('[_|]', id) + tmp = list(OrderedDict.fromkeys(tmp)) + pipe = 1 + id = '_'.join(tmp) + if not ori_id in id_dict: + id_dict[ori_id] = id + ### check if id longer than 20 character + if len(id) > 20: + long_id = 1 + mod_id_index = mod_id_index + 1 + id = '%s_%s' % (spec_name.split('@')[1], mod_id_index) + id_dict[ori_id] = id + ### check if seq contains special characters + if seq[-1] == '*': + seq = seq[:-1] + specialChr = 'no' + if any(c for c in seq if not c.isalpha()): + specialChr = 'yes' + if specialChr == 'yes': + if replace or delete: + if replace: + seq = re.sub('[^a-zA-Z]', 'X', seq) + if delete: + seq = re.sub('[^a-zA-Z]', '', seq) + else: + sys.exit('\033[91mERROR: %s sequence contains special character!\033[0m\nYou can use --replace or --delete to solve it.' % (id)) + f.write('>%s\n%s\n' % (id, seq)) + f.close() + ### create index file + fasta_fn.read_fasta(genome_file) + ### write .checked file + cf = open(genome_file+'.checked', 'w') + cf.write(str(datetime.now())) + cf.close() + ### write ID mapping file and give warning if ID changed + if len(id_dict) > 0: + mapping_file = '%s.mapping' % genome_file + with open(mapping_file, 'w') as mp: + for id in in_seq: + if id in id_dict: + mp.write(f'{id}\t{id_dict[id]}\n') + else: + mp.write(f'{id}\t{id}\n') + if pipe == 1: + print('\033[94mWARNING: Sequence IDs contain pipe(s). 
They will be replaced by "_"!\033[0m') + if long_id == 'yes': + print('\033[94mWARNING: Some headers longer than 80 characters have been automatically shortened.\033[0m') + print('\033[94mPlease check the %s file for details!\033[0m' % mapping_file) + else: + print(genome_path + '/' + spec_name + '.fa already exists!') + return(genome_file) + + +def create_blastdb(args): + """ Create blastdb for a given fasta genome_file """ + (searchpath, corepath, outPath, spec_name, genome_file, force, silent) = args + blast_path = '%s/%s' % (corepath, spec_name) + if (not os.path.exists(os.path.abspath('%s/%s.phr' % (blast_path, spec_name)))) or force: + blast_fn.make_blastdb([spec_name, genome_file, outPath, corepath, searchpath, silent]) + ### make symlink to fasta files + fa_in_genome = "%s/%s/%s.fa" % (searchpath, spec_name, spec_name) + fai_in_genome = "%s/%s/%s.fa.fai" % (searchpath, spec_name, spec_name) + fa_in_blast = "%s/%s.fa" % (blast_path, spec_name) + fai_in_blast = "%s/%s.fa.fai" % (blast_path, spec_name) + if not os.path.exists(fa_in_blast): + os.symlink(fa_in_genome, fa_in_blast) + if not os.path.exists(fai_in_blast): + os.symlink(fai_in_genome, fai_in_blast) + else: + print('Blast DB already exists!') + + +def create_annoFile(annopath, genome_file, cpus, force): + """ Create annotation json for a given genome_file """ + annoCmd = 'fas.doAnno -i %s -o %s --cpus %s' % (genome_file, annopath, cpus) + if force: + annoCmd = annoCmd + " --force" + try: + subprocess.call([annoCmd], shell = True) + except: + print('\033[91mERROR: Problem with running fas.doAnno. 
You can check it with this command:\n%s\033[0m' % annoCmd) diff --git a/fdog/libs/alignment.py b/fdog/libs/alignment.py new file mode 100644 index 0000000..0704bbf --- /dev/null +++ b/fdog/libs/alignment.py @@ -0,0 +1,185 @@ +# -*- coding: utf-8 -*- + +####################################################################### +# Copyright (C) 2022 Vinh Tran +# +# This file is part of fDOG tool https://github.com/BIONF/fDOG +# +# This script is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for +# more details +# +# Contact: tran@bio.uni-frankfurt.de +# +####################################################################### + +import sys +import os +import subprocess +import math +import re +from Bio import SeqIO +from io import StringIO +import random + +import fdog.libs.fasta as fasta_fn +import fdog.libs.output as output_fn + +##### FUNCTIONS RELATED TO SEQ ALIGNMENT ##### + +def check_fasta36_executable(fdogPath): + """ Check if FASTA36 installed in fdogPath """ + try: + fasta36_cmd = '%s/bin/aligner/bin/ggsearch36' % fdogPath + subprocess.check_output(fasta36_cmd, shell = True, stderr = subprocess.STDOUT) + return('%s/bin/aligner/bin/' % fdogPath) + except: + try: + which_fasta36 = subprocess.run( + 'which fasta36', shell = True, capture_output = True, check = True) + return(which_fasta36.stdout.decode().strip().replace('fasta36','')) + except subprocess.CalledProcessError as e: + sys.exit('\033[91mERROR: FASTA36 not found!\033[0m') + + +def get_muscle_version(aligner): + """ Check muscle version (3.8 or 5.1) + Return v3 for v3.8, otherwise v5 + """ + cmd = 'muscle -version' + try: + out = subprocess.run(cmd, shell = True, capture_output = True, check = True) + if 'v3.8' in out.stdout.decode(): + return('v3') + else: + return('v5') + except subprocess.CalledProcessError as e: + raise RuntimeError("Command 
'{}' return with error (code {}): {}".format(e.cmd, e.returncode, e.output)) + sys.exit('\033[91mERROR: Error running command\n%s\033[0m' % cmd) + + +def do_align(aligner, fa_file): + """ Do alignment using MUSCLE or MAFFT for a multiple fasta file + Return a dictionary (SeqIO object) containing seq IDs and aligned sequences + Note: if any input seq is longer than 12.000 aa/nt, only MAFFT can be used + """ + input_fa = SeqIO.to_dict((SeqIO.parse(open(fa_file), 'fasta'))) + if len(input_fa) == 1: + return(input_fa) + # parse output file name (otherwise cause troubles for muscle_v5) + out_file = fa_file.split('/')[-1].replace('@', '_') + # check muscle version + if aligner == 'muscle': + if get_muscle_version(aligner) == 'v3': + if fasta_fn.check_long_seq(fa_file, 12000) == 1: + aligner = 'mafft-linsi' + else: + aligner = 'muscle_v3' + else: + if fasta_fn.check_long_seq(fa_file, 15000) == 1: + aligner = 'mafft-linsi' + else: + aligner = 'muscle_v5' + # create alignment command and run + align_cline = '' + if aligner == 'muscle_v3': + align_cline = 'muscle -in %s' % fa_file + elif aligner == 'muscle_v5': + align_cline = 'muscle -align %s -output %s.muscle.out' % (fa_file, out_file) + else: + align_cline = 'mafft --localpair --maxiterate 1000 %s' % fa_file + try: + aln_out = subprocess.run([align_cline], shell = True, capture_output = True, check = True) + except subprocess.CalledProcessError as e: + raise RuntimeError("Command '{}' return with error (code {}): {}".format(e.cmd, e.returncode, e.output)) + sys.exit( + 'ERROR: Error doing alignment with %s for %s' % (aligner, fa_file)) + + if aligner == 'muscle_v5': + aln_seq = SeqIO.to_dict((SeqIO.parse(open('%s.muscle.out' % out_file), 'fasta'))) + os.remove('%s.muscle.out' % out_file) + else: + aln_io = StringIO(aln_out.stdout.decode().strip()) + aln_seq = SeqIO.to_dict((SeqIO.parse(aln_io,'fasta'))) + return(aln_seq) + + +def calc_Kimura_dist(aln_dict, id_1, id_2, debug): + """ Calculate Kimura distance for a pair of 
sequences + Input is a dictionary of MSA (see do_align function). + The Kimura distance is calculated based on perl module + https://metacpan.org/pod/Bio::Align::ProteinStatistics#D-distance-methods + """ + matches = 0 + total = 0 + if id_1 in aln_dict and id_2 in aln_dict: + for a, b in zip(aln_dict[id_1].seq, aln_dict[id_2].seq): + if a != '-' and b != '-': + if a == b: + matches +=1 + total += 1 + if not total == 0: + D = 1 - (matches/total) + else: + D = 1 + output_fn.print_debug( + debug, 'Kimura distance', + 'kimura = round(- (math.log( 1 - %s - (0.2 * (%s ** 2)))), 5)' % (D, D)) + try: + kimura = round(- (math.log( 1 - D - (0.2 * (D ** 2)))), 5) + except: + kimura = 999 + return(kimura) + else: + sys.exit('ERROR: %s or %s not found in %s!' % (id_1, id_2, aln_dict)) + + +def calc_aln_score(fa1, fa2, aln_strategy = 'local', debugCore = False): + """ Calculate alignment score for genes in fa2 vs other genes in fa1 + Return dictionary {gene_id:aln_score} + """ + fdog_path = os.path.realpath(__file__).replace('/libs/alignment.py','') + fa1_filename = fa1.split("/")[-1] + fa2_filename = fa2.split("/")[-1] + os.symlink(fa1, fa1_filename) + if not fa2_filename == fa1_filename: + os.symlink(fa2, fa2_filename) + fasta36_options = f'{fa1_filename} {fa2_filename} -s BP62 -m 9 -d 0 -z -1 -E 100' + fdog_path = os.path.realpath(__file__).replace('/libs/alignment.py','') + fasta36_bin = check_fasta36_executable(fdog_path) + if aln_strategy == 'global': + fasta36_cmd = '%s/ggsearch36 %s' \ + % (fasta36_bin, fasta36_options) + elif aln_strategy == 'glocal': + fasta36_cmd = '%s/glsearch36 %s' \ + % (fasta36_bin, fasta36_options) + else: + fasta36_cmd = '%s/ssearch36 %s' \ + % (fasta36_bin, fasta36_options) + output_fn.print_debug( + debugCore, 'ALN SCORE', + 'Calculate aln score using FASTA36: %s' % fasta36_cmd) + try: + fasta36_out = subprocess.run( + [fasta36_cmd], shell = True, capture_output = True, check = True) + except: + sys.exit('ERROR: Error running FASTA36\n%s' % 
fasta36_cmd) + # returns score for genes in fa2 + aln_score = {} + cand_dict = SeqIO.to_dict((SeqIO.parse(open(fa2), 'fasta'))) + for id in list(cand_dict.keys()): + aln_score[id[0:60]] = 0 + results = fasta36_out.stdout.decode().split('\n') + for l in results: + if len(l) > 1: + gene_id = l.split()[0] + if gene_id in aln_score: + if re.search('\(\s+\d+\)', l): + l = re.sub(r'\(\s+','(', l) + aln_score[gene_id] = aln_score[gene_id] + int(l.split()[2]) + os.remove(fa1_filename) + if not fa2_filename == fa1_filename: + os.remove(fa2_filename) + return(aln_score) diff --git a/fdog/libs/blast.py b/fdog/libs/blast.py new file mode 100644 index 0000000..2cb4609 --- /dev/null +++ b/fdog/libs/blast.py @@ -0,0 +1,95 @@ +# -*- coding: utf-8 -*- + +####################################################################### +# Copyright (C) 2022 Vinh Tran +# +# This file is part of fDOG tool https://github.com/BIONF/fDOG +# +# This script is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the +# GNU General Public License for +# more details +# +# Contact: tran@bio.uni-frankfurt.de +# +####################################################################### + +import os +import sys +from Bio.Blast.Applications import NcbiblastpCommandline +import xml.etree.ElementTree as ET +import subprocess + + +##### FUNCTIONS RELATED TO BLAST ##### + +def do_blastsearch( + query, blast_db, evalBlast = 0.00001, lowComplexityFilter = False): + """ Perform blastp search for a query fasta file + Return an XML string contains blast result + """ + filter = 'no' + if lowComplexityFilter == True: + filter = 'yes' + try: + blastp_cline = NcbiblastpCommandline( + query = query, db = blast_db, evalue = evalBlast, seg = filter, + max_target_seqs = 10, outfmt = 5) + stdout, stderr = blastp_cline() + return(stdout) + except: + sys.exit( + 'ERROR: Error running blastp search for %s against %s\n%s' + % (query, blast_db, NcbiblastpCommandline( + query = query, db = blast_db, evalue = evalBlast, seg = filter, + max_target_seqs = 10, outfmt = 5))) + + +def parse_blast_xml(blast_output): + """ Parse Blast XML output from a string variable + Return a dictionary containing query ID, query length, together with a list + of hits and their bit score, evalue, align len + """ + blast_dict = {} + root = ET.fromstring(blast_output) + blast_dict['query'] = root[8][0][2].text + blast_dict['query_len'] = root[8][0][3].text + blast_dict['hits'] = {} + for type_tag in root.findall( + 'BlastOutput_iterations/Iteration/Iteration_hits/Hit'): + value = type_tag.findall('*') + hit_id = 'NA' + for i in type_tag.findall('*'): + if i.tag == 'Hit_def': + if not i.text in blast_dict['hits']: + hit_id = i.text + blast_dict['hits'][hit_id] = {} + if i.tag == 'Hit_hsps': + if hit_id in blast_dict['hits']: + blast_dict['hits'][hit_id]['bit_score'] = i[0][1].text + blast_dict['hits'][hit_id]['evalue'] = i[0][3].text + blast_dict['hits'][hit_id]['align_len'] = i[0][13].text + return(blast_dict) + + +def 
make_blastdb(args): + """ Make blastDB in coreTaxa_dir + for fdog.addTaxon, fdog.addTaxa and fdog.checkData + """ + (specName, specFile, outPath, coreTaxa_dir, searchTaxa_dir, silent) = args + if not coreTaxa_dir: + coreTaxa_dir = '%s/coreTaxa_dir' % outPath + if not searchTaxa_dir: + searchTaxa_dir = '%s/searchTaxa_dir' % outPath + blastCmd = 'makeblastdb -dbtype prot -in %s -out %s/%s/%s' % (specFile, coreTaxa_dir, specName, specName) + if silent == True: + blastCmd = blastCmd + '> /dev/null 2>&1' + try: + subprocess.call([blastCmd], shell = True) + except: + sys.exit('Problem with running %s' % blastCmd) + fileInGenome = "%s/%s/%s.fa" % (searchTaxa_dir, specName, specName) + fileInBlast = "%s/%s/%s.fa" % (coreTaxa_dir, specName, specName) + if not os.path.exists(fileInBlast): + os.symlink(fileInGenome, fileInBlast) diff --git a/fdog/libs/corecompile.py b/fdog/libs/corecompile.py new file mode 100644 index 0000000..d16a5a1 --- /dev/null +++ b/fdog/libs/corecompile.py @@ -0,0 +1,420 @@ +# -*- coding: utf-8 -*- + +####################################################################### +# Copyright (C) 2022 Vinh Tran +# +# This file is part of fDOG tool https://github.com/BIONF/fDOG +# +# This script is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the +# GNU General Public License for +# more details +# +# Contact: tran@bio.uni-frankfurt.de +# +####################################################################### + +import sys +import os +import shutil +from pathlib import Path +from ete3 import NCBITaxa +from Bio import SeqIO +import time + +import fdog.libs.zzz as general_fn +import fdog.libs.fasta as fasta_fn +import fdog.libs.hmm as hmm_fn +import fdog.libs.alignment as align_fn +import fdog.libs.tree as tree_fn +import fdog.libs.fas as fas_fn +import fdog.libs.output as output_fn +import fdog.libs.orthosearch as ortho_fn + + +##### FUNCTIONS RELATED TO CORE COMPILATION ##### + +def get_core_taxa_ids(coreTaxa, corepath): + """ Get taxonomy IDs for core taxa + Either from coreTaxa_dir, or from user input list (--coreTaxa) + Return dictionary {taxID:@@Ver} + """ + tax_ids = {} + if not coreTaxa == '': + ignored_taxa = [] + if os.path.exists(os.path.abspath(coreTaxa)): + core_taxa = general_fn.read_file(coreTaxa) + else: + core_taxa = coreTaxa.split(',') + + for core_taxon in core_taxa: + if not os.path.exists( + os.path.abspath( + '%s/%s/%s.phr' % (corepath,core_taxon,core_taxon))): + ignored_taxa.append(core_taxon) + else: + id = core_taxon.split('@')[1] + if not id in tax_ids: + tax_ids[id] = core_taxon + if len(ignored_taxa) > 0: + print( + 'WARNING: %s taxa cannot be found at %s\n%s' + % (len(ignored_taxa), corepath, ignored_taxa)) + else: + tax_ids = general_fn.get_ids_from_folder(corepath, 'coreTaxa_dir') + return(tax_ids) + + +def initiate_core_files( + seqFile, seqName, refspec, seed_id, hmmpath, annopath, aligner, fasOff): + hmm_dir = '%s/%s/hmm_dir' % (hmmpath, seqName) + Path(hmm_dir).mkdir(parents = True, exist_ok = True) + aln_file = '%s/%s/hmm_dir/%s.aln' % (hmmpath, seqName, seqName) + aln_seed = align_fn.do_align(aligner, seqFile) + fasta_fn.write_fasta(aln_seed, aln_file) + hmm_file = '%s/%s/hmm_dir/%s.hmm' % (hmmpath, seqName, seqName) + hmm_seed = hmm_fn.create_hmm(aln_file, 
hmm_file) + + fa_file = '%s/%s/%s.fa' % (hmmpath, seqName, seqName) + seed_id_mod = '%s|%s|%s' % (seqName, refspec, seed_id) + input_seed = SeqIO.parse(seqFile,'fasta') + with open(fa_file, 'w') as initial_core_fa: + for fa in input_seed: + initial_core_fa.write('>%s\n%s\n' % (seed_id_mod, str(fa.seq))) + + seed_json = '' + if not fasOff == True: + seed_json = fas_fn.get_anno_fas( + seqName, refspec, seed_id, str(fa.seq), hmmpath, annopath) + return(aln_file, fa_file, hmm_file, seed_json) + + +def store_cand_reults(args): + """ Save intermediate results for a candidate ortholog + Including: + 1) Candidate joined score of fas & normalised aln score + in dictionary {taxID:score} + 2) Candidate fasta sequence in dictionary {taxID:fasta_seq} + 3) Update current (best) candidate, normalised aln score and joined score + """ + (cand_taxid, cand_score, cand_seq, + curr_cand, curr_aln_score, aln_score_normalized, + fas_score, fas_dict, ortho_id, ortho_seq) = args + cand_score[cand_taxid] = float(fas_score) + float(aln_score_normalized) + cand_seq[cand_taxid] = {ortho_id:ortho_seq} + curr_aln_score = aln_score_normalized + curr_candi_score = float(fas_score) + float(aln_score_normalized) + curr_cand = cand_taxid + return( + cand_score, cand_seq, curr_cand, curr_aln_score, + curr_candi_score, fas_dict) + + +def validate_candidate(args): + """ Validate candidate based on its normalised aln score and fas score """ + (aln_score_normalized, cand_args, calc_fas_args, variable_args, debugCore, distDeviation) = args + (cand_score, cand_seq, curr_cand, curr_aln_score, + curr_candi_score, fas_dict) = variable_args + (cand_taxid, ortho_id, ortho_seq, next_node, first_cand) = cand_args + (fasOff, seqName, seed_json, spec, seq_id, seq, hmmpath, annopath) = calc_fas_args + + if first_cand == True: + threshold = 0 + else: + if next_node == True: + threshold = curr_candi_score * (1 + distDeviation) + type = ' (STRICT) ' + else: + threshold = curr_candi_score + type = '' + + if 
aln_score_normalized > threshold - 1: + if not '%s_%s' % (spec, seq_id) in fas_dict: + fas_score = fas_fn.calc_fas_cand(calc_fas_args) + fas_dict['%s_%s' % (spec, seq_id)] = fas_score + output_fn.print_debug( + debugCore, '', '-FAS: %s' % fas_score) + else: + fas_score = fas_dict['%s_%s' % (spec, seq_id)] + if float(fas_score) + float(aln_score_normalized) > threshold: + variable_args = store_cand_reults( + [cand_taxid, + cand_score, cand_seq, + curr_cand, curr_aln_score, + aln_score_normalized, fas_score, fas_dict, + ortho_id, ortho_seq]) + else: + output_fn.print_debug( + debugCore, '', + '-Joined score not higher than the prev%s! Skip...' % type) + else: + output_fn.print_debug( + debugCore, '', + '-Aln score %s not higher%s! Skip...' % (aln_score_normalized, type)) + return(variable_args) + + +def compile_core(args): + """ Core compilation """ + (seqFile, seqName, refspec, seed_id, coreArgs, pathArgs, orthoArgs, otherArgs, debug) = args + (minDist, maxDist, coreSize, coreTaxa, distDeviation, alnStrategy, fasOff) = coreArgs + (outpath, hmmpath, corepath, searchpath, annopath) = pathArgs + (cpus, debugCore, silentOff, noCleanup, force, append) = otherArgs + aligner = orthoArgs[-1] + otherArgs.insert(0, 'NA') + + ncbi = NCBITaxa() + ### get taxonomy lineage of refspec + refspec_id = refspec.split('@')[1] + refspec_lineage = ncbi.get_lineage(refspec_id) + + ### get rank ID and its index in the refspec lineage + (min_rank, max_rank) = tree_fn.get_rank_range(refspec_lineage, minDist, maxDist, ncbi) + output_fn.print_debug(debugCore, 'Min & Max-rank', '%s\t%s' % (min_rank, max_rank)) + + ### create taxonomy tree from list of core tax + tax_ids = get_core_taxa_ids(coreTaxa, corepath) + tree = ncbi.get_topology(tax_ids.keys(), intermediate_nodes = True) + if debugCore: + print(tree) + + ### INITIATE FA, ALN, HMM [and anno FAS] FILE FOR SEED + (aln_file, fa_file, hmm_file, seed_json) = initiate_core_files( + seqFile, seqName, refspec, seed_id, hmmpath, annopath, aligner, 
fasOff) + + ### get list of taxa within min and max rank of refspec + node_dict = tree_fn.get_leaves_dict( + refspec_lineage, tree, + list(min_rank.values())[0], list(max_rank.values())[0]) + output_fn.print_debug(debugCore, 'Node dictionary', node_dict) + + ### traverse the core taxa tree + added_taxa = {} + ignored_taxa = [] + fas_dict = {} + previous_added_taxon = refspec_id + for round in range(coreSize - 1): + output_fn.print_stdout(silentOff, '---------- ROUND %s ----------' % round) + output_fn.print_debug( + debugCore, 'CORE COMPILATION', + '---------- ROUND %s ----------' % round) + aln_scores = align_fn.calc_aln_score(fa_file, fa_file, alnStrategy, debugCore) + max_aln_score = max(aln_scores.values()) #0 + if max_aln_score == 0: + exit('ERROR: Something went wrong with FASTA36. Please run debugCore to investigate!') + flag_round = 0 # use to stop current round if an ortholog was added + output_fn.print_debug(debugCore, '', 'ADDED TAXA: %s' % added_taxa.keys()) + cand_seq = {} + cand_score = {} + curr_cand = '' + curr_aln_score = 0 + curr_candi_score = 0 + next_node = False + for node_id, leaves in node_dict.items(): + if flag_round == 1: + break + output_fn.print_debug( + debugCore, '', + 'NODE %s - %s' % (node_id, ncbi.get_rank([node_id]))) + output_fn.print_debug( + debugCore, '', + '-MAX ALN SCORE: %s' % max_aln_score) + output_fn.print_debug( + debugCore, '', + '-PREVIOUS ADDED: %s' % previous_added_taxon) + leaves.reverse() + flag_node = 0 + for leaf in leaves: + if flag_node == 1: + break + if not leaf == refspec_id and \ + not leaf in added_taxa and \ + not leaf in ignored_taxa: + output_fn.print_debug(debugCore, '','') + output_fn.print_debug( + debugCore, '', + 'Leaf %s - %s' % (leaf, tax_ids[leaf])) + if len(curr_cand) > 0 and \ + not curr_cand in node_dict[node_id]: + next_node = True + output_fn.print_debug( + debugCore, '', + '-Current_candidate from different node: %s' \ + % curr_cand) + if curr_candi_score * (1 + distDeviation) > 2: + 
output_fn.print_debug( + debugCore, '', + '# Current score cannot be defeater! Stop this node!') + break + else: + next_node = False + output_fn.print_debug( + debugCore, '', + '-Current_candidate: %s' % curr_cand) + output_fn.print_debug( + debugCore, '', + '-Current_aln_score: %s' % curr_aln_score) + output_fn.print_debug( + debugCore, '', + '-Current_candi_score: %s' % curr_candi_score) + ### compare taxonomy rank with previous added taxon + ### ignore if this leaf closer to the refspec than to + ### previous added taxon + ancestor_to_ref = tree_fn.get_ancestor(refspec_id, leaf, ncbi) + check_ancestor_to_ref = tree_fn.check_common_ancestor( + previous_added_taxon, list(ancestor_to_ref.keys())[0], + minDist, maxDist, ncbi) + if check_ancestor_to_ref == 0: + ignored_taxa.append(leaf) + output_fn.print_debug( + debugCore, '', + '-Closer to refspec than previous added taxon!') + continue + ### continue process only if this leaf is within min and max rank + ### of the previous added taxon + ancestor = tree_fn.get_ancestor(previous_added_taxon, leaf, ncbi) + check_ancestor = tree_fn.check_common_ancestor( + previous_added_taxon, list(ancestor.keys())[0], + minDist, maxDist, ncbi) + if check_ancestor == 1: + output_fn.print_debug( + debugCore, '', + '-Ancestor %s with %s accepted' \ + % (ancestor, previous_added_taxon)) + ### run ortholog search + otherArgs[2] = debug + otherArgs[0] = tax_ids[leaf] + hamstr_out = ortho_fn.run_hamstr([seqName, refspec, pathArgs, + orthoArgs, otherArgs]) + if len(hamstr_out) > 1: + ### calculate alignment score + ortho = list(hamstr_out.items())[-1] + tmp_fa = '%s/%s/%s_%s.fa' \ + % (hmmpath, seqName, seqName, leaf) + with open(tmp_fa, 'w') as tmp_fa_out: + tmp_fa_out.write('>%s\n%s\n' \ + % (ortho[0][0:len(ortho[0])-2], ortho[1])) + aln_score = align_fn.calc_aln_score(fa_file, tmp_fa, alnStrategy, debugCore) + output_fn.print_debug( + debugCore, '', + '-Max: %s - Aln: %s' % (max_aln_score, aln_score)) + aln_score_normalized = \ + 
list(aln_score.values())[0] / max_aln_score + output_fn.print_debug( + debugCore, '', + '-Normalized_aln_score: %s' % aln_score_normalized) + os.remove(tmp_fa) + + ### validate candidate + if len(cand_score) == 0 \ + and len(curr_cand) == 0: + first_cand = True + else: + first_cand = False + calc_fas_args = (fasOff, seqName, seed_json, + tax_ids[leaf], ortho[0].split('|')[-2], + ortho[1] ,hmmpath, annopath) + cand_args = (leaf, ortho[0][0:len(ortho[0])-2], + ortho[1], next_node, first_cand) + variable_args = (cand_score, cand_seq, + curr_cand, curr_aln_score, + curr_candi_score, fas_dict) + (cand_score, cand_seq, curr_cand, + curr_aln_score, curr_candi_score, + fas_dict) = validate_candidate([ + aln_score_normalized, cand_args, + calc_fas_args, variable_args, debugCore, + distDeviation]) + if curr_candi_score == 2: + flag_node = 1 + output_fn.print_debug( + debugCore, '', + '-Max score achieved! Stop this node!') + elif not len(hamstr_out) > 1 and len(cand_seq) == 0: + ignored_taxa.append(leaf) + output_fn.print_debug( + debugCore, '', + '-No ortholog found!') + else: + ignored_taxa.append(leaf) + output_fn.print_debug( + debugCore, '', + '-Not considered due to ancestor %s with %s\n' \ + % (ancestor, previous_added_taxon)) + else: + output_fn.print_debug( + debugCore, '', + '%s - %s skipped' % (leaf, tax_ids[leaf])) + output_fn.print_debug( + debugCore, '', 'Node candidates: %s' % cand_score) + if len(cand_score) > 0 \ + and cand_score[curr_cand] == 2: + output_fn.print_debug( + debugCore, '', + '# MAX SCORE ACCHIEVED! Stop this round!') + flag_round = 1 + if next_node == True \ + and cand_score[curr_cand] * (1 + distDeviation) > 2: + output_fn.print_debug( + debugCore, '', + '# CURRENT SCORE CANNOT BE DEFEATED! 
Stop this round!') + flag_round = 1 + + if len(cand_seq) > 0: + output_fn.print_debug( + debugCore, '', + '# ADD THIS TAXON TO CORE GROUP\t%s - %s\n' \ + % (curr_cand, tax_ids[curr_cand])) + previous_added_taxon = curr_cand + added_taxa[curr_cand] = {tax_ids[curr_cand]:cand_score[curr_cand]} + ### update seqName.fa and hmm_dir/seqName.hmm + fasta_fn.append_to_fasta_file(fa_file, cand_seq[curr_cand]) + aln_seed = align_fn.do_align(aligner, fa_file) + fasta_fn.write_fasta(aln_seed, aln_file) + hmm_seed = hmm_fn.create_hmm(aln_file, hmm_file) + os.remove(aln_file) + ### remove temp json files + for file in os.listdir('%s/%s' % (hmmpath, seqName)): + if file.endswith('.json'): + os.remove('%s/%s/%s' % (hmmpath, seqName, file)) + output_fn.print_debug( + debugCore, 'CORE COMPILATION', + 'All added taxa %s' % added_taxa) + if len(added_taxa) < coreSize - 1: + output_fn.print_stdout( + silentOff, + 'WARNING: Only %s/%s orthologs in the core group' \ + % (len(added_taxa) + 1, coreSize)) + + +def run_compile_core(args): + (seqFile, seqName, refspec, seed_id, reuseCore, forceCore, coreArgs, + pathArgs, orthoCoreArgs, otherCoreArgs, debug) = args + (outpath, hmmpath, corepath, searchpath, annopath) = pathArgs + (cpus, debugCore, silentOff, noCleanup, force, append) = otherCoreArgs[-6:] + begin = time.time() + fdogPath = os.path.realpath(__file__).replace('/libs/corecompile.py','') + align_fn.check_fasta36_executable(fdogPath) + + coreHmmfile = '%s/%s/hmm_dir/%s.hmm' % (hmmpath, seqName, seqName) + coreHmmfile = os.path.abspath(coreHmmfile) + compile_core_check = 1 + ncbi = '' + if reuseCore == True: + general_fn.check_file_exist(coreHmmfile) + compile_core_check = 0 + else: + if os.path.exists(coreHmmfile): + if forceCore == True: + print('WARNING: Existing %s core group will be deleted!' % seqName) + shutil.rmtree('%s/%s' % (hmmpath, seqName)) + else: + sys.exit( + 'WARNING: Core group %s exists in %s! 
' % (seqName, hmmpath) + + 'You still can run with --forceCore or --reuseCore option') + if compile_core_check == 1: + compile_core([seqFile, seqName, refspec, seed_id, coreArgs, pathArgs, + orthoCoreArgs, otherCoreArgs[-6:], debug]) + end = time.time() + return([seqName, '{:5.3f}s'.format(end - begin)]) diff --git a/fdog/libs/fas.py b/fdog/libs/fas.py new file mode 100644 index 0000000..12741d2 --- /dev/null +++ b/fdog/libs/fas.py @@ -0,0 +1,120 @@ +# -*- coding: utf-8 -*- + +####################################################################### +# Copyright (C) 2022 Vinh Tran +# +# This file is part of fDOG tool https://github.com/BIONF/fDOG +# +# This script is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for +# more details +# +# Contact: tran@bio.uni-frankfurt.de +# +####################################################################### + +import sys +import os +import subprocess +import shutil +import greedyFAS.annoFAS.annoModules as annoFas + +import fdog.libs.zzz as general_fn + + +##### FUNCTIONS RELATED TO FAS ##### + +def check_fas_executable(): + try: + subprocess.check_output(['fas.setup -t ./ --check'], shell = True, stderr = subprocess.STDOUT) + except subprocess.CalledProcessError as e: + print('\033[96m%s\033[0m' % e.output.decode(sys.stdout.encoding).strip()) + print('FAS installed but fas.setup still need to be run if you want to use it!') + return(0) + return(1) + + +def get_tool_fas_path(): + """ Get path to FAS annotation tools """ + cmd = 'fas.setup -t ~/ -c' + try: + out = subprocess.run( + [cmd], shell = True, capture_output = True, check = True) + tool_path = out.stdout.decode().split('\n')[0].split()[6].replace('.','') + return(tool_path) + except: + sys.exit('ERROR: fas.setup cannot be called!') + + +def get_anno_fas(seqName, spec, seq_id, seq, hmmpath, annopath): + """ Get 
annotation for a seq_id from existing json file in annopath """ + out_json = '%s/%s/%s_%s.json' % (hmmpath, seqName, spec, seq_id) + if not os.path.exists(out_json): + tmp_seed_fa = '%s/%s/%s_%s.fa' % (hmmpath, seqName, seqName, spec) + with open(tmp_seed_fa, 'w') as tmp_seed_fa_out: + tmp_seed_fa_out.write('>%s\n%s\n' % (seq_id, seq)) + spec_anno = '%s/%s.json' % (annopath, spec) + try: + anno_dict = annoFas.extractAnno(tmp_seed_fa, spec_anno) + anno_dict['clan'] = annoFas.getClans( + get_tool_fas_path(), anno_dict['feature']) + annoFas.save2json( + anno_dict, '%s_%s' % (spec, seq_id), '%s/%s' % (hmmpath, seqName)) + os.remove(tmp_seed_fa) + except: + sys.exit( + 'ERROR: Annotation for %s cannot be found in %s' + % (seq_id, spec_anno)) + return(out_json) + + +def calc_pairwise_fas(seed_json, query_json, seqName, hmmpath): + """ Calculate FAS score for a pair seed and query protein + Input are two anno json files for seed and query + Return a value between 0 and 1 + """ + general_fn.check_file_exist(seed_json) + general_fn.check_file_exist(query_json) + + fas_cmd = 'fas.run -s %s -q %s --no_config' % (seed_json, query_json) + fas_cmd = '%s -a %s/%s --raw --tsv --domain --cpus 1 -o %s/%s' \ + % (fas_cmd, hmmpath, seqName, hmmpath, seqName) + try: + fas_out = subprocess.run( + [fas_cmd], shell = True, capture_output = True, check = True) + except: + sys.exit('ERROR: Error running FAS\n%s' % fas_cmd) + results = fas_out.stdout.decode().split('\n') + for l in results: + if l.startswith('#') and len(l.split('\t')) > 1: + return(l.split('\t')[-1]) + return('') + + +def calc_fas_cand(args): + """ Calculate FAS score for a ortholog candidate against seed + Ortholog candidate defined by spec, seq_id and seq + """ + (fasOff, seqName, seed_json, spec, seq_id, seq, hmmpath, annopath) = args + if not fasOff == True: + query_json = get_anno_fas(seqName, spec, seq_id, seq, hmmpath, annopath) + fas_score = calc_pairwise_fas(seed_json, query_json, seqName, hmmpath) + else: + 
fas_score = 1 + return(fas_score) + + +def calc_fas_multi (input_fa, outpath, annopath, cpus): + """ Calculate pairwise FAS scores for all orthologs vs seed protein + input_fa is the default .extended.fa output file of fDOG + Output will be _forward. + """ + fasCmd = 'fas.runFdogFas -i %s -w %s --cores %s --redo_anno' % (input_fa, annopath, cpus) + try: + subprocess.call([fasCmd], shell = True) + if os.path.exists(outpath + '/tmp'): + shutil.rmtree(outpath + '/tmp') + except: + sys.exit('Problem running\n%s' % (fasCmd)) diff --git a/fdog/libs/fasta.py b/fdog/libs/fasta.py new file mode 100644 index 0000000..c786912 --- /dev/null +++ b/fdog/libs/fasta.py @@ -0,0 +1,77 @@ +# -*- coding: utf-8 -*- + +####################################################################### +# Copyright (C) 2022 Vinh Tran +# +# This file is part of fDOG tool https://github.com/BIONF/fDOG +# +# This script is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the +# GNU General Public License for +# more details +# +# Contact: tran@bio.uni-frankfurt.de +# +####################################################################### + +from pysam import FastaFile +from Bio import SeqIO + +import fdog.libs.zzz as general_fn + + +##### FUNCTIONS RELATED TO FASTA SEQ ##### + +def add_seq_to_dict(dict, id, seq): + """ Add fasta sequence to a dictionary """ + if not id in dict: + dict[id] = seq + return(dict) + + +def read_fasta(fa_file): + """ Read LARGE fasta file and return fasta object + Sequence can be get using fasta_object.fetch(seq_id) + """ + fasta_object = FastaFile(fa_file) + return(fasta_object) + + +def write_fasta(fa_dict, out_file): + """ Write sequences in SeqIO dict into output file """ + with open(out_file, 'w') as out: + for seq in fa_dict: + out.write('>%s\n%s\n' % (seq, fa_dict[seq].seq)) + + +def append_to_fasta_file(fa_file, new_fa_dict): + """ Append a dict of fasta seq to an existing fasta file """ + general_fn.check_file_exist(fa_file) + existing_seq = SeqIO.to_dict(SeqIO.parse(open(fa_file),'fasta')) + with open(fa_file, 'a') as fa_out: + for id, seq in new_fa_dict.items(): + if not id in existing_seq: + fa_out.write('>%s\n%s\n' % (id, seq)) + + +def check_long_seq(fa_file, max_len): + """ Check if any sequence longer than max_len + (12.000 aa/nt for muscle v3; 20.000 for muscle v5)""" + fa_seq = SeqIO.parse(open(fa_file),'fasta') + for fa in fa_seq: + if len(fa.seq) > max_len: + return(1) + return(0) + + +def remove_dup(fa_file): + """ Remove duplicated sequences (filter by headers) """ + tmp = {} + fa_seq = SeqIO.parse(open(fa_file),'fasta') + for fa in fa_seq: + if not fa.id in tmp: + tmp[fa.id] = fa.seq + with open(fa_file, 'w') as out: + for id, seq in tmp.items(): + out.write('>%s\n%s\n' % (id, seq)) diff --git a/fdog/libs/hmm.py b/fdog/libs/hmm.py new file mode 100644 index 0000000..2b18d3d --- /dev/null +++ b/fdog/libs/hmm.py @@ -0,0 +1,108 @@ +# -*- coding: utf-8 -*- + 
+####################################################################### +# Copyright (C) 2022 Vinh Tran +# +# This file is part of fDOG tool https://github.com/BIONF/fDOG +# +# This script is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for +# more details +# +# Contact: tran@bio.uni-frankfurt.de +# +####################################################################### + +import sys +import os +import subprocess +import pyhmmer + +import fdog.libs.output as output_fn + +##### FUNCTIONS RELATED TO HMM ##### + +def create_hmm(aln_file, out_file): + """ Create hmm file for an alinment file """ + hmmbuild_cmd = 'hmmbuild --amino %s %s' % (out_file, aln_file) + try: + subprocess.run( + [hmmbuild_cmd], shell = True, + stdout = open(os.devnull, 'wb'), check = True) + except: + sys.exit('ERROR: Error running hmmbuild %s' % hmmbuild_cmd) + + +def sort_hmm_hits(hmm_hits, hitLimit = 10, scoreCutoff = 10, debug = False): + """ Sort HMM hits + Keep only n hits (n =< hitLimit), and hits that are not less than + best_hit_domain_score * (100 - scoreCutoff) / 100 + Input hmm_hits is a pyhmmer.plan7.topHits object + """ + best_score = -9999 # some "best" domains still have negative score! 
+ cutoff = '' + score_dict = {} + ori_hits = {} + for hit in hmm_hits: + ori_hits[hit.name.decode('ASCII')] = len(hit.domains) + best_domain_score = -9999 #hit.domains[0].score + best_domain_hit = '' + if len(hit.domains) > 0: + # get domain with best score for this hit + for i in hit.domains: + if i.score > best_domain_score: + best_domain_score = i.score + best_domain_hit = i.hit.name.decode('ASCII') + # add hit to score_dict with increasing domain score + if best_domain_score > best_score: + best_score = best_domain_score + cutoff = best_score/100*(100-scoreCutoff) + if best_score < 0: + cutoff = best_score/100*(100+scoreCutoff) + if best_domain_score >= cutoff: + if best_domain_score not in score_dict: + score_dict[best_domain_score] = [best_domain_hit] + else: + score_dict[best_domain_score].append(best_domain_hit) + output_fn.print_debug(debug, 'All HMM hits', ori_hits) + hmm_cand = {} + n = 0 + score_dict = { + key:val for key, val in score_dict.items() \ + if key >= cutoff + } + output_fn.print_debug(debug, 'Candidate HMM hits', score_dict) + for score in sorted(score_dict, reverse = True): + if n < hitLimit: + for id in score_dict[score]: + hmm_cand[id] = score + n += 1 + return(hmm_cand) + + +def do_hmmsearch( + hmm_file, search_fa, evalHmmer = 0.00001, scoreCutoff = 10, + hitLimit = 10, cpus = os.cpu_count(), debug = False): + """ Perform hmmsearch for a hmm file vs a multiple fasta file + Return a dictionary of hits and their e-value and bit-score + Only "top" hits are returned. 
The cutoff is defined by + max_score / 100 * (100 - scoreCutoff) + By default, only hits that have at least 90% of the best bit score + are considers + """ + hmm_hits = {} + with pyhmmer.easel.SequenceFile(search_fa, digital = True, alphabet = pyhmmer.easel.Alphabet.amino()) as seq_file: + sequences = list(seq_file) + with pyhmmer.plan7.HMMFile(hmm_file) as hmm_file: + try: + for hits in pyhmmer.hmmsearch( + hmm_file, sequences, E = evalHmmer, cpus = cpus): + if len(hits) > 0: + hmm_hits = sort_hmm_hits(hits, hitLimit, scoreCutoff, debug) + except : + sys.exit( + 'ERROR: Error running hmmsearch for %s agains %s' + % (hmm_file, search_fa)) + return(hmm_hits) diff --git a/fdog/libs/orthosearch.py b/fdog/libs/orthosearch.py new file mode 100644 index 0000000..51a2288 --- /dev/null +++ b/fdog/libs/orthosearch.py @@ -0,0 +1,274 @@ +# -*- coding: utf-8 -*- + +####################################################################### +# Copyright (C) 2022 Vinh Tran +# +# This file is part of fDOG tool https://github.com/BIONF/fDOG +# +# This script is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the +# GNU General Public License for +# more details +# +# Contact: tran@bio.uni-frankfurt.de +# +####################################################################### + +import os +from Bio import SeqIO +import multiprocessing as mp +from tqdm import tqdm +import time + +import fdog.libs.zzz as general_fn +import fdog.libs.fasta as fasta_fn +import fdog.libs.blast as blast_fn +import fdog.libs.hmm as hmm_fn +import fdog.libs.alignment as align_fn +import fdog.libs.preparation as prepare_fn +import fdog.libs.output as output_fn + + +##### FUNCTION FOR HMM-BASED ORTHOLOG SEARCH (HaMStR) ##### +def hamstr(args): + (seqName, hmmpath, corepath, searchpath, outpath, + refspec, seed_id, search_taxon, + evalHmmer, hitLimit, scoreCutoff, + evalBlast, lowComplexityFilter, + checkCoorthologsRefOff, rbh, rep, + aligner, cpus, debug, silentOff, noCleanup) = args + """ Ortholog search algorithm for a hmm core group agains a search taxon + Implemented based on HaMStR https://doi.org/10.1186/1471-2148-9-157 + """ + ### (0) Dict for storing candidate and final orthologs (key=id, value=seq) + ortho_candi = {} + ortho_final = {} + ### (00) Parse input files + hmm_file = '%s/%s/hmm_dir/%s.hmm' % (hmmpath, seqName, seqName) + refspec_db = '%s/%s/%s' % (corepath, refspec, refspec) + refspec_fa = '%s/%s/%s.fa' % (corepath, refspec, refspec) + search_fa = '%s/%s/%s.fa' % (searchpath, search_taxon, search_taxon) + ### (000) Adapt parameters + if rbh == True: + checkCoorthologsRefOff = True + rep = True + + ### PRINT JOB PARAMETERS + output_fn.print_stdout( + silentOff, + '\n### Ortholog search ###' + + '\nSeed: %s\nRefspec: %s\n' % (seqName, refspec) + + 'Ref_seqID: %s\n' % seed_id + + 'Search taxon: %s' % search_taxon) + output_fn.print_debug( + debug, 'Parameters', + 'HMM evalue cutoff: %s\nHMM hit limit: %s\n' % (evalHmmer, hitLimit) + + 'HMM hit score cutoff: %s\n' % scoreCutoff + + 'BLAST evalue cutoff: %s\n' % evalBlast + + 'Blast low complexity filter: %s\n' % 
lowComplexityFilter + + 'Turn off check for co-orthologs ref: %s\n' % checkCoorthologsRefOff + + 'Aligner: %s' % aligner) + + ### (1) Do hmmsearch for query hmm against search taxon fasta + hmm_hits = hmm_fn.do_hmmsearch( + hmm_file, search_fa, evalHmmer, scoreCutoff, hitLimit, cpus, debug) + output_fn.print_debug(debug, 'Sorted HMM hits', hmm_hits) + ### (2) Read fasta file of refspec and search taxon + refspec_seqs = fasta_fn.read_fasta(refspec_fa) + search_seqs = fasta_fn.read_fasta(search_fa) + ### (3) Do re-blast search for each hmm hit against refspec + if len(hmm_hits) == 0: + output_fn.print_stdout( + silentOff, 'WARNING: No HMM hit found!') + else: + for hmm_hit in hmm_hits: + if not hmm_hit == seed_id: # only if search taxon == refspec + hmm_hit_fa = '%s/hmm_%s_%s_%s.fa' % ( + outpath, seqName, search_taxon, hmm_hit) + with open(hmm_hit_fa, 'w') as hmm_fa_out: + hmm_fa_out.write('>%s\n%s' % (hmm_hit, search_seqs.fetch(hmm_hit))) + blast_xml = blast_fn.do_blastsearch( + hmm_hit_fa, refspec_db, evalBlast = evalBlast, lowComplexityFilter = lowComplexityFilter) + blast_out = blast_fn.parse_blast_xml(blast_xml) + output_fn.print_debug(debug, 'BLAST hits', blast_out) + if noCleanup == False: + os.remove(hmm_hit_fa) + ### (4) check reciprocity + ### (4a) if refspec_seq_id == best blast hit + if len(blast_out['hits'].keys()) > 0: + best_blast_hit = list(blast_out['hits'].keys())[0] + if best_blast_hit == hmm_hit and len(blast_out['hits'].keys()) > 1: + best_blast_hit = list(blast_out['hits'].keys())[1] + if seed_id == best_blast_hit: + output_fn.print_stdout( + silentOff, + '%s accepted (best blast hit is ref)' % (blast_out['query'])) + ortho_candi[hmm_hit] = search_seqs.fetch(hmm_hit) + continue + else: + ### (4b) else, check for co-ortholog ref + if checkCoorthologsRefOff == False: + aln_fa = '%s/blast_%s_%s_%s_%s_%s.fa' % ( + outpath, seqName, seed_id, search_taxon, + hmm_hit, best_blast_hit) + with open(aln_fa, 'w') as aln_fa_out: + aln_fa_out.write( + 
'>%s\n%s\n>%s\n%s\n>%s\n%s' % ( + seed_id, refspec_seqs.fetch(seed_id), + hmm_hit, search_seqs.fetch(hmm_hit), + best_blast_hit, refspec_seqs.fetch(best_blast_hit) + ) + ) + fasta_fn.remove_dup(aln_fa) + aln_seq = align_fn.do_align(aligner, aln_fa) + output_fn.print_debug( + debug, 'Alignment for checking co-ortholog ref', aln_seq) + br_dist = align_fn.calc_Kimura_dist(aln_seq, best_blast_hit, seed_id, debug) + bh_dist = align_fn.calc_Kimura_dist(aln_seq, best_blast_hit, hmm_hit, debug) + output_fn.print_debug( + debug, 'Check if distance blast_vs_ref < blast_vs_hmm', + 'd_br = %s; d_bh = %s' % (br_dist, bh_dist)) + if noCleanup == False: + os.remove(aln_fa) + if br_dist == bh_dist == 0 or br_dist < bh_dist: + output_fn.print_stdout( + silentOff, + '%s accepted (best blast hit is co-ortholog to ref)' + % (blast_out['query']) + ) + ortho_candi[hmm_hit] = search_seqs.fetch(hmm_hit) + continue + ### (5) check co-ortholog if more than 1 HMM hits are accepted + if len(ortho_candi) == 0: + output_fn.print_stdout( + silentOff, 'WARNING: Reciprocity not fulfulled! 
No ortholog found!') + else: + best_ortho = list(ortho_candi.keys())[0] + if not best_ortho == seed_id: + ortho_final = fasta_fn.add_seq_to_dict( + ortho_final, '%s|%s|%s|1' % (seqName, search_taxon, best_ortho), + ortho_candi[best_ortho]) + if rep == False: + if len(ortho_candi) > 1: + aln_co_fa = '%s/coortho_%s_%s.fa' % ( + outpath, seqName, search_taxon) + with open(aln_co_fa, 'w') as aln_co_fa_out: + aln_co_fa_out.write(('>%s\n%s\n') % + (seed_id, refspec_seqs.fetch(seed_id))) + for cand in ortho_candi: + aln_co_fa_out.write(('>%s\n%s\n') % + (cand, ortho_candi[cand])) + aln_co_seq = align_fn.do_align(aligner, aln_co_fa) + output_fn.print_debug( + debug, 'Alignment for checking co-orthologs', aln_co_seq) + if noCleanup == False: + os.remove(aln_co_fa) + best_dist = align_fn.calc_Kimura_dist( + aln_co_seq, seed_id, best_ortho, debug) + for cand in ortho_candi: + if not cand == best_ortho: + candi_dist = align_fn.calc_Kimura_dist( + aln_co_seq, best_ortho, cand, debug) + output_fn.print_debug( + debug, + 'Check if distance bestHmm_vs_ref > ' + + 'other_vs_bestHmm', + 'd_best = %s; d_other = %s' + % (best_dist, candi_dist)) + if candi_dist < best_dist: + if not cand == seed_id: + ortho_final = fasta_fn.add_seq_to_dict( + ortho_final, + '%s|%s|%s|0' \ + % (seqName, search_taxon, cand), + ortho_candi[cand]) + output_fn.print_stdout( + silentOff, + '=> %s orthologs found: %s' + % (len(ortho_final), list(ortho_final.keys()))) + return(ortho_final) + + +def run_hamstr(args): + """ Perform ortholog search based on hamstr approach """ + + (seqName, refspec, pathArgs, orthoArgs, otherArgs) = args + (outpath, hmmpath, corepath, searchpath, annopath) = pathArgs + (checkCoorthologsRefOff, rbh, rep, evalBlast, lowComplexityFilter, + evalHmmer, hitLimit, scoreCutoff, aligner) = orthoArgs + (searchTaxa, cpus, debug, silentOff, noCleanup, force, append) = otherArgs + + hamstr_jobs = [] + ### get ref seqID + core_fa = '%s/%s/%s.fa' % (hmmpath, seqName, seqName) + seed_id = 
prepare_fn.get_seed_id_from_fa(core_fa, refspec) + + ### get search taxa from user defined list (as a file or directly a list) + if not searchTaxa == '': + ignored_taxa = [] + if os.path.exists(os.path.abspath(searchTaxa)): + search_taxa = general_fn.read_file(searchTaxa) + else: + search_taxa = searchTaxa.split(',') + + for search_taxon in search_taxa: + if os.path.exists( + os.path.abspath( + '%s/%s/%s.fa' % (searchpath,search_taxon,search_taxon))): + hamstr_jobs.append([ + seqName, hmmpath, corepath, searchpath, outpath, + refspec, seed_id, search_taxon, + evalHmmer, hitLimit, scoreCutoff, + evalBlast, lowComplexityFilter, + checkCoorthologsRefOff, rbh, rep, + aligner, cpus, debug, silentOff, noCleanup + ]) + else: + ignored_taxa.append(search_taxon) + if len(ignored_taxa) > 0: + print( + 'WARNING: %s taxa cannot be found at %s\n%s' + % (len(ignored_taxa), searchpath, ignored_taxa)) + ### get search taxa from searchpath (searchTaxa_dir) + else: + for search_taxon in general_fn.read_dir(searchpath): + if os.path.exists( + os.path.abspath( + '%s/%s/%s.fa' % (searchpath,search_taxon,search_taxon))): + hamstr_jobs.append([ + seqName, hmmpath, corepath, searchpath, outpath, + refspec, seed_id, search_taxon, + evalHmmer, hitLimit, scoreCutoff, + evalBlast, lowComplexityFilter, + checkCoorthologsRefOff, rbh, rep, + aligner, cpus, debug, silentOff, noCleanup + ]) + + ### do ortholog search + hamstr_out = {} + if len(hamstr_jobs) > 0: + if cpus > len(hamstr_jobs): + cpus = len(hamstr_jobs) + output_fn.print_stdout( + silentOff, 'Ortholog search for %s taxa...' 
% len(hamstr_jobs)) + if debug == True or silentOff == True or len(hamstr_jobs) == 1: + for job in hamstr_jobs: + tmp_out = hamstr(job) + hamstr_out = {**hamstr_out, **tmp_out} + else: + pool = mp.Pool(cpus) + for _ in tqdm( + pool.imap_unordered(hamstr, hamstr_jobs), + total=len(hamstr_jobs)): + if len(_) > 0: + hamstr_out = {**hamstr_out, **_} + + ### Get seed seq + refspec_fa = '%s/%s/%s.fa' % (corepath, refspec, refspec) + refspec_seqs = fasta_fn.read_fasta(refspec_fa) + seed_id_mod = '%s|%s|%s|1' % (seqName, refspec, seed_id) + seed_seq = refspec_seqs.fetch(seed_id) + + ### return + return({**{seed_id_mod:seed_seq}, **hamstr_out}) diff --git a/fdog/libs/output.py b/fdog/libs/output.py new file mode 100644 index 0000000..1cf9bef --- /dev/null +++ b/fdog/libs/output.py @@ -0,0 +1,114 @@ +# -*- coding: utf-8 -*- + +####################################################################### +# Copyright (C) 2022 Vinh Tran +# +# This file is part of fDOG tool https://github.com/BIONF/fDOG +# +# This script is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the +# GNU General Public License for +# more details +# +# Contact: tran@bio.uni-frankfurt.de +# +####################################################################### + +import sys +import os +from Bio import SeqIO + +import fdog.libs.zzz as general_fn + + +##### FUNCTIONS FOR OUTPUT ##### + +def print_debug(debug, cat, msg): + """ Print msg of a category in debug mode """ + if debug == True: + if cat == '': + print('#DEBUG#\t%s' % msg) + else: + print('#DEBUG#\t%s\n#DEBUG#\t%s' % (cat, msg)) + + +def print_stdout(silentOff, msg): + """ Print stdout """ + if silentOff == True: + print(msg) + + +def check_output_exist(outfile, force, append): + """ Check if outfile exists + And decide depends on the choice of force or append option + """ + if os.path.exists(outfile): + if force == True: + print('WARNING: %s will be deleted!' % outfile) + os.remove(outfile) + elif append == True: + general_fn.check_file_exist(outfile) + print('Result will be appended to %s!' % outfile) + else: + sys.exit( + 'WARNING: %s exists! 
' % outfile + + 'You still can run with --force or --append option') + + +def write_hamstr(hamstr_result, outpath, seqName, force, append): + """ Write result of ortholog search into seqName.extended.fa """ + outfile = '%s/%s.extended.fa' % (outpath, seqName) + outfile = os.path.abspath(outfile) + check_output_exist(outfile, force, append) + + ### Write to output.extended.fa + ortho_count = len(hamstr_result) - 1 + if append == True: + if os.path.exists(outfile): + old_result_tmp = SeqIO.to_dict((SeqIO.parse(open(outfile),'fasta'))) + old_result = {} + for old_id in old_result_tmp: + if not old_id in hamstr_result: + old_result[old_id] = str(old_result_tmp[old_id].seq) + hamstr_result = {**hamstr_result, **old_result} + + with open(outfile, 'w') as out_file: + for id, seq in hamstr_result.items(): + out_file.write('>%s\n%s\n' % (id, seq)) + return( + 'Found %s ortholog(s)!\nOutput file: %s' % (ortho_count, outfile)) + + +def hamstr_2_profile(fa_file): + """ Convert extended.fa file into phyloprofile file """ + if os.path.exists(fa_file): + pp_file = fa_file.replace('.extended.fa', '.phyloprofile') + fa = SeqIO.to_dict((SeqIO.parse(open(fa_file),'fasta'))) + with open(pp_file, 'w') as pp: + pp.write('geneID\tncbiID\torthoID\n') + for id in list(fa.keys()): + tmp = id.split('|') + pp.write('%s\tncbi%s\t%s\n' % (tmp[0], tmp[1].split('@')[1], id)) + + +def add_all_taxa(pp_file, searchTaxa): + """ Add all "missing" search taxa into phyloprofile file """ + missing_taxa = [] # missing_taxa = [ncbi_id] + for taxon in searchTaxa.split(','): + flag = general_fn.search_string_in_file(pp_file, taxon) + if flag == 0: + missing_taxa.append(taxon.split('@')[1]) + first_gene = '' + if os.path.exists(pp_file): + with open(pp_file, 'a') as pp: + for line in general_fn.read_file(pp_file): + if not line.startswith('geneID'): + if not first_gene: + first_gene = line.split('\t')[0] + for i in missing_taxa: + if len(line.split('\t')) == 5: + 
pp.write(f'{first_gene}\tncbi{i}\tfdogMA\tNA\tNA\n') + else: + pp.write(f'{first_gene}\tncbi{i}\tfdogMA\n') + break diff --git a/fdog/libs/preparation.py b/fdog/libs/preparation.py new file mode 100644 index 0000000..bb46719 --- /dev/null +++ b/fdog/libs/preparation.py @@ -0,0 +1,202 @@ +# -*- coding: utf-8 -*- + +####################################################################### +# Copyright (C) 2022 Vinh Tran +# +# This file is part of fDOG tool https://github.com/BIONF/fDOG +# +# This script is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for +# more details +# +# Contact: tran@bio.uni-frankfurt.de +# +####################################################################### + +import sys +import os +from pathlib import Path +from Bio import SeqIO +from Bio.Blast.Applications import NcbiblastpCommandline +from ete3 import NCBITaxa + +import fdog.libs.zzz as general_fn +import fdog.libs.fasta as fasta_fn +import fdog.libs.blast as blast_fn +import fdog.libs.output as output_fn +import fdog.libs.tree as tree_fn + + +##### FUNCTIONS FOR DATA/INPUT PREPARATION ##### + +def parsing_paths(args): + """ Getting path to hmm core set, coreTaxa_dir, searchTaxa_dir and annotation_dir""" + (pathFile, outpath, hmmpath, corepath, searchpath, annopath) = args + ### get fdog and data path + data_path = '' + fdog_path = os.path.realpath(__file__).replace('/libs/preparation.py','') + pathconfigFile = fdog_path + '/bin/pathconfig.yml' + if not os.path.exists(pathconfigFile): + sys.exit( + f'No pathconfig.txt found at {pathconfigFile}. 
Please run fdog.setup ' + + '(https://github.com/BIONF/fDOG/wiki/Installation#setup-fdog).') + + if pathFile: + pathconfigFile = os.path.abspath(pathFile) + + cfg = general_fn.load_config(pathconfigFile) + try: + data_path = cfg['dataPath'] + except: + data_path = os.getcwd() + + if hmmpath == '': + hmmpath = outpath + '/core_orthologs' + Path(hmmpath).mkdir(parents = True, exist_ok = True) + + if corepath == '': + try: + corepath = cfg['corepath'] + except: + corepath = data_path + '/coreTaxa_dir' + general_fn.check_file_exist(corepath) + if searchpath == '': + try: + searchpath = cfg['searchpath'] + except: + searchpath = data_path + '/searchTaxa_dir' + general_fn.check_file_exist(searchpath) + if annopath == '': + try: + annopath = cfg['annopath'] + except: + annopath = data_path + '/annotation_dir' + general_fn.check_file_exist(annopath) + return(hmmpath, corepath, searchpath, annopath) + + +def check_input(args): + (seqFile, refspec, outpath, hmmpath, corepath, + searchpath, annopath, pathFile) = args + fdog_path = os.path.realpath(__file__).replace('/libs/preparation.py','') + # create output directory + Path(outpath).mkdir(parents = True, exist_ok = True) + Path(hmmpath).mkdir(parents = True, exist_ok = True) + # check path existing + hmmpath, corepath, searchpath, annopath = parsing_paths( + [pathFile, outpath, hmmpath, corepath, searchpath, annopath]) + for path in [hmmpath, corepath, searchpath, annopath]: + general_fn.check_file_exist(path) + # check for seqFile + if not os.path.exists(os.path.abspath(seqFile)): + if not os.path.exists(fdog_path + '/data/' + seqFile): + sys.exit( + 'ERROR: %s not found in %s or %s' + % (seqFile, os.getcwd(), fdog_path + '/data/')) + else: + seqFile = fdog_path + '/data/' + seqFile + else: + seqFile = os.path.abspath(seqFile) + # check refspec + if not os.path.exists(os.path.abspath(corepath+'/'+refspec)): + exit('ERROR: Reference taxon %s not found in %s' % (refspec, corepath)) + return (seqFile, hmmpath, corepath, 
searchpath, annopath) + + +def check_blast_version(corepath, refspec): + """ Check if blast DBs in corepath is compatible with blastp version """ + fdog_path = os.path.realpath(__file__).replace('/libs/preparation.py','') + query = fdog_path + '/data/infile.fa' + blast_db = '%s/%s/%s' % (corepath, refspec, refspec) + try: + blastp_cline = NcbiblastpCommandline( + query = query, db = blast_db) + stdout, stderr = blastp_cline() + except: + sys.exit( + 'ERROR: Error running blast (probably conflict with BLAST DBs versions)\n%s' + % (NcbiblastpCommandline(query = query, db = blast_db))) + +def check_ranks_core_taxa(corepath, refspec, minDist, maxDist): + """ Check if refspec (or all core taxa) have a valid minDist and maxDist tax ID + Return 2 dictionaries of taxa for invalid minDist and maxDist, where + keys is taxon name and value is the next valid rank + """ + invalid_minDist = [] + invalid_maxDist = [] + ncbi = NCBITaxa() + rank_list = ['species', 'genus', 'family', 'order', 'class', 'phylum', 'kingdom', 'superkingdom'] + suggest_minIndex = rank_list.index(minDist) + suggest_maxIndex = rank_list.index(maxDist) + for f in os.listdir(corepath): + if f == refspec: # remove this if need to check for all core taxa + if os.path.isdir(f'{corepath}/{f}'): + id = f.split('@')[1] + lineage = ncbi.get_lineage(id) + ranks = ncbi.get_rank(lineage) + if len(general_fn.matching_elements(ranks, minDist)) < 1: + invalid_minDist.append(f) + index_minDist = rank_list.index(minDist) + 1 + while index_minDist < len(rank_list): + if len(general_fn.matching_elements(ranks, rank_list[index_minDist])) > 0: + if index_minDist > suggest_minIndex: + suggest_minIndex = index_minDist + break + index_minDist += 1 + if len(general_fn.matching_elements(ranks, maxDist)) < 1: + invalid_maxDist.append(f) + index_maxDist = rank_list.index(maxDist) + 1 + while index_maxDist < len(rank_list): + if len(general_fn.matching_elements(ranks, rank_list[index_maxDist])) > 0: + if index_maxDist > 
suggest_maxIndex: + suggest_maxIndex = index_maxDist + break + index_maxDist += 1 + return(invalid_minDist, invalid_maxDist, rank_list[suggest_minIndex], rank_list[suggest_maxIndex]) + + +def get_seed_id_from_fa(core_fa, refspec): + """ Get seed ID from core ortholog fasta file + (used if --reuseCore option is specified) + """ + core_seqs = SeqIO.to_dict((SeqIO.parse(open(core_fa), 'fasta'))) + core_ids = core_seqs.keys() + seed_id = [s for s in core_ids if refspec in s][0].split('|')[-1] + return(seed_id) + + +def identify_seed_id(seqFile, refspec, corepath, debug, silentOff): + """ Identify seed ID in reference protein set using BLAST + If the header of the seed fasta seq if found in the refspec proteome, then + if can be directly used. Otherwise do blast search. If the return blast hit + is longer/shorter than the seed sequence by 10 amino acids, fDOG will stop + """ + refspec_db = '%s/%s/%s' % (corepath, refspec, refspec) + # first check if input seed ID existiert in refspec genome + refspec_fa = fasta_fn.read_fasta('%s.fa' % refspec_db) + seed_fa = SeqIO.parse(open(seqFile),'fasta') + for seed in seed_fa: + try: + if len(refspec_fa.fetch(seed.id)) == len(seed.seq): + return(seed.id) + except: + output_fn.print_debug(debug, 'Identify seed ID', 'Input seed ID not found!') + # otherwise, perform blast search + blast_xml = blast_fn.do_blastsearch(seqFile, refspec_db, evalBlast = 0.001) + blast_out = blast_fn.parse_blast_xml(blast_xml) + if len(blast_out['hits']) < 1: + print(f'ERROR: Cannot find seed sequence {blast_out["query"]} in genome of reference species!') + print(f'You can check it by running:\nblastp -query {seqFile} -db {corepath}/{refspec}/{refspec} -evalue 0.001 -outfmt 7') + sys.exit() + for hit in blast_out['hits']: + if blast_out['hits'][hit]['align_len'] == blast_out['query_len']: + return(hit) + elif abs(int(blast_out['hits'][hit]['align_len']) - int(blast_out['query_len'])) < 10: + output_fn.print_stdout(silentOff, 'WARNING: Found seed sequence 
shorter/longer than input!') + return(hit) + else: + print(f'ERROR: Cannot find seed sequence {blast_out["query"]} in genome of reference species!') + print(f'You can check it by running:\nblastp -query {seqFile} -db {corepath}/{refspec}/{refspec} -evalue 0.001 -outfmt 7') + sys.exit() diff --git a/fdog/libs/tree.py b/fdog/libs/tree.py new file mode 100644 index 0000000..b2de19f --- /dev/null +++ b/fdog/libs/tree.py @@ -0,0 +1,141 @@ +# -*- coding: utf-8 -*- + +####################################################################### +# Copyright (C) 2022 Vinh Tran +# +# This file is part of fDOG tool https://github.com/BIONF/fDOG +# +# This script is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for +# more details +# +# Contact: tran@bio.uni-frankfurt.de +# +####################################################################### + +import re +from ete3 import NCBITaxa + +import fdog.libs.zzz as general_fn + + +##### FUNCTIONS RELATED TO TAXONOMY TREE ##### + +def get_rank_index(lineage, rank_name, ncbi): + """ Get ID and index in the species lineage for a given rank + Return {rank_id:rank_index} + """ + ranks = ncbi.get_rank(lineage) + rank_id = list(general_fn.matching_elements(ranks, rank_name).keys())[0] + rank_index = len(ranks) - lineage.index(rank_id) - 1 + return({rank_id:rank_index}) + + +def get_rank_range(lineage, minDist, maxDist, ncbi): + """ Get rank ID and its index in a given species lineage + for a pair of min and max rank. 
See get_rank_index() + Return a list of 2 dictionary for min and max rank as + [{min_rank_id:min_rank_index}, {max_rank_id:max_rank_index}] + """ + return( + get_rank_index(lineage, minDist, ncbi), + get_rank_index(lineage, maxDist, ncbi)) + + +def check_taxon_group(group_id, tax_id, ncbi): + """ Check if a taxon (tax_id) belongs to a taxonomy group (group_id)""" + lineage = ncbi.get_lineage(tax_id) + if group_id in lineage: + return(True) + return(False) + + +def get_ancestor(id1, id2, ncbi): + """ Get common ancestor ID and rank for 2 taxon IDs + Return dictionary {ancestor_id: ancestor_rank} + """ + tree = ncbi.get_topology([id1, id2], intermediate_nodes = False) + ancestor = tree.get_common_ancestor(id1, id2).name + return(ncbi.get_rank([ancestor])) + + +def check_common_ancestor(ref_id, ancestor, minDist, maxDist, ncbi): + """ Check if ancestor ID lies within the range between min and max rank + of reference species + Return 1 if true + """ + ref_lineage = ncbi.get_lineage(ref_id) + (min_ref, max_ref) = get_rank_range(ref_lineage, minDist, maxDist, ncbi) + if not ancestor in ref_lineage: + return(0) + ancestor_index = len(ref_lineage) - ref_lineage.index(ancestor) - 1 + if list(min_ref.values())[0] <= ancestor_index <= list(max_ref.values())[0]: + return(1) + return(0) + + +def remove_clade(tree, node_id): + """ Remove a clade from a tree """ + removed_clade = tree.search_nodes(name = str(node_id))[0] + removed_node = removed_clade.detach() + return(tree) + + +def get_leaves_dict(spec_lineage, tree, min_index, max_index): + """ Given a tree and a lineage string of a species + Return a dictionary where keys are the internal nodes defined by the + ranks between min rank (e.g. genus, specified by min_index in the species + lineage) and max rank (e.g. phylum). 
Values are all leaves in the tree + that belong to the corresponding internal node (rank) + """ + node_dict = {} + already_added = [] + spec_lineage.reverse() + for i in range(len(spec_lineage)): + if i >= min_index and i <= max_index: + curr_node = spec_lineage[i] + node = tree.search_nodes(name = str(curr_node)) + if len(node) > 0: + for leaf in node: + node_dict[spec_lineage[i]] = [] + for t in leaf.traverse(): + if t.is_leaf(): + if not t.name in already_added: + already_added.append(t.name) + node_dict[spec_lineage[i]].append(t.name) + return(general_fn.remove_dup_in_dict(node_dict)) + + +def check_tax_id(tax_id): + """ Check valid taxon ID + Return taxon name (UNK if ID not found in ncbi db) + """ + ncbi = NCBITaxa() + tmp = ncbi.get_rank([tax_id]) + try: + tmp = ncbi.get_rank([tax_id]) + rank = tmp[int(tax_id)] + if not rank == 'species': + print('\033[92mWARNING: rank of %s is not SPECIES (%s)\033[0m' % (tax_id, rank)) + else: + ncbi_name = ncbi.get_taxid_translator([tax_id])[int(tax_id)] + print('\033[92mNCBI taxon info: %s %s\033[0m' % (tax_id, ncbi_name)) + return(ncbi_name) + except: + print('\033[92mWARNING: %s not found in NCBI taxonomy database!\033[0m' % tax_id) + return('UNK%s' % tax_id) + + +def abbr_ncbi_name(ncbi_name): + """ Parse ncbi taxon name into abbr name + E.g. 
"Homo sapiens" -> "HOMSA" + """ + if not ncbi_name.startswith('UNK'): + ncbi_name = re.sub('[^a-zA-Z1-9\s]+', '', ncbi_name) + tax_name = ncbi_name.split() + name = tax_name[0][:3].upper()+tax_name[1][:2].upper() + else: + name = ncbi_name + return(name) diff --git a/fdog/libs/zzz.py b/fdog/libs/zzz.py new file mode 100644 index 0000000..5900574 --- /dev/null +++ b/fdog/libs/zzz.py @@ -0,0 +1,196 @@ +# -*- coding: utf-8 -*- + +####################################################################### +# Copyright (C) 2022 Vinh Tran +# +# This file is part of fDOG tool https://github.com/BIONF/fDOG +# +# This script is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for +# more details +# +# Contact: tran@bio.uni-frankfurt.de +# +####################################################################### + +import sys +import os +from pathlib import Path +import ssl +import urllib.request +import yaml +import time +import pickle + + +##### GENERAL FUNCTIONS FOR FILES, FOLDERS AND GENERAL VARIABLES ##### + +def check_file_exist(file): + """ Exit if a file does not exist""" + if not os.path.exists(os.path.abspath(file)): + sys.exit('%s not found' % file) + + +def read_file(file): + """ Read a file and return list of lines""" + if os.path.exists(file): + with open(file, 'r') as f: + lines = f.read().splitlines() + f.close() + return(lines) + else: + sys.exit('%s not found' % file) + + +def read_dir(dir): + """ Return list of directories from a given path """ + check_file_exist(dir) + out_dirs = [] + p = os.listdir(dir) + for i in p: + if os.path.isdir('%s/%s' % (dir, i)): + out_dirs.append(i) + return(out_dirs) + + +def load_config(config_file): + """ Load a YAML file and return as a dictionary """ + with open(config_file, 'r') as stream: + try: + return yaml.safe_load(stream) + except yaml.YAMLError as exc: + 
print(exc) + + +def download_progress(count, block_size, total_size): + global start_time + if count == 0: + start_time = time.time() + return + duration = time.time() - start_time + progress_size = int(count * block_size) + speed = int(progress_size / (1024 * duration)) + percent = int(count * block_size * 100 / total_size) + if percent > 100: + percent = 100 + sys.stdout.write("\r...%d%%, %d MB, %d KB/s, %d seconds passed" % + (percent, progress_size / (1024 * 1024), speed, duration)) + sys.stdout.flush() + + +def download_file(url, file): + ctx = ssl.create_default_context() + ctx.check_hostname = False + ctx.verify_mode = ssl.CERT_NONE + download_file = urllib.request.URLopener(context=ctx) + print('Downloading %s' % (url + '/' + file)) + urllib.request.urlretrieve(url + '/' + file, file, download_progress) + print(' ... done!') + + +def count_line(file, pattern, contain): + """ Count lines in file that contain (or not) a pattern """ + nline = 0 + with open(file, 'r') as f: + for line in f: + if contain: + if pattern in line: + nline = nline + 1 + else: + if not pattern in line: + nline = nline + 1 + return(nline) + + +def get_ids_from_folder(folder, type): + """ Get taxonomy IDs for from coreTaxa_dir, searchTaxa_dir or annotation_dir + Return dictionary {taxID:@@Ver} + """ + tax_ids = {} + + for name in read_dir(folder): + if type == 'annotation_dir': + if not name.endswith('.json'): + continue + else: + name = name.replace('.json','') + else: + if not os.path.isdir('%s/%s' % (folder, name)): + continue + id = name.split('@')[1] + if not id in tax_ids: + tax_ids[id] = name + return(tax_ids) + + +def matching_elements(dictionary, search_string): + """ Search for a string in dictionary's values + Return {key:val} where string was found in val + """ + return {key:val for key,val in dictionary.items() if search_string == val} + + +def remove_dup_in_dict(dictionary): + """ Find and remove duplicated or empty values of a dictionary """ + tmp_dict = {'_'.join(val) : 
key for key, val in dictionary.items()} + res = {val : key.split('_') for key, val in tmp_dict.items()} + res = {key : val for key, val in res.items() if len(val[0]) > 0} + return(res) + + +def join_2lists(first_list, second_list): + """ Join 2 lists """ + in_first = set(first_list) + in_second = set(second_list) + in_second_but_not_in_first = in_second - in_first + out = first_list + list(in_second_but_not_in_first) + return(out) + + +def save_pyobj(obj, out_file): + """ Save a python object to out_file """ + with open(out_file, 'wb') as obj_out: + pickle.dump(obj, obj_out) + + +def read_pyobj_file(in_file): + """ Read a python object from an in_file """ + with open(in_file, 'rb') as obj_file: + return(pickle.load(obj_file)) + + +def query_yes_no(question, default='yes'): + valid = {'yes': True, 'y': True, 'ye': True, + 'no': False, 'n': False} + if default is None: + prompt = ' [y/n] ' + elif default == 'yes': + prompt = ' [Y/n] ' + elif default == 'no': + prompt = ' [y/N] ' + else: + raise ValueError('invalid default answer: "%s"' % default) + while True: + choice = sys.stdin.readline().rstrip().lower() + if default is not None and choice == '': + return valid[default] + elif choice in valid: + return valid[choice] + else: + sys.stdout.write('Please respond with "yes" or "no" ' + '(or "y" or "n").\n') + + +def search_string_in_file(file, string): + """ Search for a string in file + Return 0 if not found, 1 if found + """ + flag = 0 + with open(file, 'r') as fp: + for l_no, line in enumerate(fp): + if string in line: + flag = 1 + break + return(flag) diff --git a/fdog/makeCoreGroupFromFasta.py b/fdog/makeCoreGroupFromFasta.py new file mode 100644 index 0000000..b81e872 --- /dev/null +++ b/fdog/makeCoreGroupFromFasta.py @@ -0,0 +1,99 @@ +# -*- coding: utf-8 -*- + +####################################################################### +# Copyright (C) 2021 Hannah Muelbaier +# +# This script is used to prepare the core group used as input for fDOG-Assembly from a 
fasta file of an ortholog group. +# +# This script is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for +# more details +# +# Contact: hannah.muelbaier@gmail.com +# +####################################################################### + +############################ imports ################################## +import os +import os.path +import sys +import argparse +import fdog.libs.alignment as align_fn +import fdog.libs.zzz as general_fn + +def check_fasta(file): + nHeader = general_fn.count_line(file, '>', True) + nSeq = general_fn.count_line(file, '>', False) + if not nHeader == nSeq: + return(1) + return(0) + +def make_single_line_fasta(input, gene, out_folder): + output = out_folder + gene + ".fa" + with open(input, 'r') as f_input, open(output, 'w') as f_output: + block = [] + for line in f_input: + if line.startswith('>'): + if block: + f_output.write(''.join(block) + '\n') + block = [] + f_output.write(line) + else: + block.append(line.strip()) + + if block: + f_output.write(''.join(block) + '\n') + return (output) + +def makeMSA(out_folder, gene, fasta_file): + aln_file = out_folder + gene + '.aln' + if align_fn.get_muscle_version('muscle') == 'v3': + os.system('muscle -quiet -in %s -out %s' % (fasta_file, aln_file)) + else: + os.system('muscle -align %s -output %s' % (fasta_file, aln_file)) + return aln_file + +def makeHMM(out_folder, gene, aln_file): + hmm_dir = out_folder + 'hmm_dir' + os.system('mkdir %s >/dev/null 2>&1' % (hmm_dir)) + out_file = '%s/%s.hmm' % (hmm_dir, gene) + hmmbuild_cmd = 'hmmbuild --amino %s %s' % (out_file, aln_file) + os.system(hmmbuild_cmd) + return out_file + + +def main(): + + #################### handle user input ##################################### + version = '0.0.1' + ################### initialize parser ###################################### + parser = 
argparse.ArgumentParser(description='You are running fdog.addCoreGroup version ' + str(version) + '.') + ################## required arguments ###################################### + required = parser.add_argument_group('Required arguments') + required.add_argument('--fasta', help='Path to fasta file of ortholog group.', action='store', default='', required=True) + required.add_argument('--out', help='Path to output folder.', action='store', default='', required=True) + required.add_argument('--geneName', help='Core group name', action='store', default='', required=True) + args = parser.parse_args() + + fasta_file_input = args.fasta + out_folder = args.out + gene = args.geneName + + + out_folder = out_folder + '/' + gene + '/' + os.system('mkdir %s >/dev/null 2>&1' % (out_folder)) + + if check_fasta(fasta_file_input) == 1: + fasta_file = make_single_line_fasta(fasta_file_input, gene, out_folder) + else: + fasta_file = out_folder + gene + '.fa' + os.system('cp ' + fasta_file_input + ' ' + fasta_file) + + aln_file = makeMSA(out_folder, gene, fasta_file) + hmm_file = makeHMM(out_folder, gene, aln_file) + + print('Core group located at %s. Fasta file: %s; MSA: %s; HMM: %s' % (out_folder, fasta_file, aln_file, hmm_file)) + +main() diff --git a/fdog/mergeOutput.py b/fdog/mergeOutput.py index c710ee7..2d5a276 100644 --- a/fdog/mergeOutput.py +++ b/fdog/mergeOutput.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- ####################################################################### -# Copyright (C) 2020 Vinh Tran +# Copyright (C) 2022 Vinh Tran # # This script is used to merge all output files (.extended.fa, .phyloprofile, # _forward.domains, _reverse.domains) in a given directory into one file each. 
@@ -21,6 +21,7 @@ from os import listdir as ldir import argparse import yaml +from pkg_resources import get_distribution def createConfigPP(phyloprofile, domains_0, ex_fasta, directory, out): settings = dict( @@ -33,9 +34,10 @@ def createConfigPP(phyloprofile, domains_0, ex_fasta, directory, out): with open('%s.config.yml' % (out), 'w') as outfile: yaml.dump(settings, outfile, default_flow_style = False) + def main(): - version = '0.1.0' - parser = argparse.ArgumentParser(description='You are running fdog.mergeOutput version ' + str(version) + '.') + version = get_distribution('fdog').version + parser = argparse.ArgumentParser(description='You are running fDOG version ' + str(version) + '.') parser.add_argument('-i', '--input', help='Input directory, where all single output (.extended.fa, .phyloprofile, _forward.domains, _reverse.domains) can be found', action='store', default='', required=True) diff --git a/fdog/removefDog.py b/fdog/removefDog.py index 7b705ea..9a28b41 100644 --- a/fdog/removefDog.py +++ b/fdog/removefDog.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- ####################################################################### -# Copyright (C) 2020 Vinh Tran +# Copyright (C) 2022 Vinh Tran # # This script is used to uninstall fdog and its data # @@ -20,7 +20,9 @@ import argparse import subprocess import shutil +from pkg_resources import get_distribution +import fdog.setupfDog as setupfDog_fn def query_yes_no(question, default='yes'): valid = {'yes': True, 'y': True, 'ye': True, @@ -46,18 +48,14 @@ def query_yes_no(question, default='yes'): def main(): - version = '0.0.1' - parser = argparse.ArgumentParser(description='You are running fdog.remove version ' + str(version) + '.') - parser.add_argument('--data', help='Remove fdog together with all files/data within the installed fdog directory', action='store_true', default=False) + version = get_distribution('fdog').version + parser = argparse.ArgumentParser(description='You are running fDOG version ' + 
str(version) + '.') + parser.add_argument('--all', help='Remove fdog together with all files/data within the installed fdog directory', action='store_true', default=False) args = parser.parse_args() - data = args.data + data = args.all fdogPath = os.path.realpath(__file__).replace('/removefDog.py','') - pathconfigFile = fdogPath + '/bin/pathconfig.txt' - if not os.path.exists(pathconfigFile): - sys.exit('No pathconfig.txt found. Please run fdog.setup (https://github.com/BIONF/fDOG/wiki/Installation#setup-fdog).') - with open(pathconfigFile) as f: - dataPath = f.readline().strip() + dataPath = setupfDog_fn.get_data_path(fdogPath) if data: print('All files and folders in %s will be removed! Enter to continue' % fdogPath) @@ -65,7 +63,7 @@ def main(): print('fdog will be uninstalled. Some files/data still can be found in %s! Enter to continue' % fdogPath) if query_yes_no('Are you sure?'): if data: - folders = ['bin', 'core_orthologs', 'taxonomy', 'data'] + folders = ['bin', 'data'] for f in folders: dirPath = fdogPath+'/'+f if os.path.exists(os.path.abspath(dirPath)): @@ -75,12 +73,11 @@ def main(): try: subprocess.call([uninstallCmd], shell = True) except: - print('Error by uninstalling fdog. Please manually uninstall it using pip uninstall fdog') - if data: - if os.path.exists(os.path.abspath(fdogPath)): - shutil.rmtree(fdogPath) + print('Error by uninstalling fdog. Please manually uninstall it using ') + if os.path.exists(os.path.abspath(fdogPath)): + shutil.rmtree(fdogPath) - print('NOTE: fdog genome data are still available at %s.' % dataPath) + print('NOTE: fdog data are still available at\n%s.' 
% dataPath) if __name__ == '__main__': diff --git a/fdog/runMulti.py b/fdog/runMulti.py index ca8a058..68a072c 100644 --- a/fdog/runMulti.py +++ b/fdog/runMulti.py @@ -1,9 +1,9 @@ # -*- coding: utf-8 -*- ####################################################################### -# Copyright (C) 2020 Vinh Tran +# Copyright (C) 2022 Vinh Tran # -# This script is used to run fdog with multiple seed sequences. +# This file is part of fDOG tool https://github.com/BIONF/fDOG # # This script is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of @@ -17,22 +17,29 @@ import sys import os -from os import listdir +from pathlib import Path from os.path import isfile, join -import time import argparse import subprocess -from pathlib import Path -import multiprocessing as mp import re -from tqdm import tqdm -import fdog.runSingle as fdogFn import shutil -import yaml +import multiprocessing as mp +from tqdm import tqdm from ete3 import NCBITaxa +from pkg_resources import get_distribution +import time + +import fdog.libs.zzz as general_fn +import fdog.libs.preparation as prepare_fn +import fdog.libs.orthosearch as ortho_fn +import fdog.libs.corecompile as core_fn +import fdog.libs.fas as fas_fn +import fdog.libs.tree as tree_fn +import fdog.libs.output as output_fn + -def getSortedFiles(directory): +def get_sorted_files(directory): list = os.listdir(directory) pairs = [] for file in list: @@ -43,129 +50,103 @@ def getSortedFiles(directory): pairs.sort(key=lambda s: s[0], reverse=True) return([x[1] for x in pairs]) -def prepare(args, step): - (seqFile, seqName, fdogPath, refspec, minDist, maxDist, coreOrth, - append, force, noCleanup, group, blast, db, - outpath, hmmpath, blastpath, searchpath, weightpath, - coreOnly, reuseCore, coreTaxa, coreStrict, CorecheckCoorthologsRef, coreRep, coreHitLimit, distDeviation, - fasoff, countercheck, coreFilter, minScore, - strict, checkCoorthologsRef, rbh, rep, ignoreDistance, 
lowComplexityFilter, evalBlast, evalHmmer, evalRelaxfac, hitLimit, autoLimit, scoreThreshold, scoreCutoff, aligner, local, glocal, searchTaxa, - cpu, hyperthread, checkOff, debug, silent) = args - - mute = False - if step == 'core': - coreOnly = True - silent = True - mute = True - else: - reuseCore = True - fasoff = True - if silent == True: - mute = True - ### check input arguments - seqFile, hmmpath, blastpath, searchpath, weightpath = fdogFn.checkInput([fdogPath, seqFile, refspec, outpath, hmmpath, blastpath, searchpath, weightpath]) - # group arguments - basicArgs = [fdogPath, seqFile, seqName, refspec, minDist, maxDist, coreOrth] - ioArgs = [append, force, noCleanup, group, blast, db] - pathArgs = [outpath, hmmpath, blastpath, searchpath, weightpath] - coreArgs = [coreOnly, reuseCore, coreTaxa, coreStrict, CorecheckCoorthologsRef, coreRep, coreHitLimit, distDeviation] - fasArgs = [fasoff, countercheck, coreFilter, minScore] - orthoArgs = [strict, checkCoorthologsRef, rbh, rep, ignoreDistance, lowComplexityFilter, evalBlast, evalHmmer, evalRelaxfac, hitLimit, autoLimit, scoreThreshold, scoreCutoff, aligner, local, glocal, searchTaxa] - otherArgs = [cpu, hyperthread, checkOff, debug, True] - return(basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, mute) - -def getSeedName(seedFile): + +def get_seed_name(seedFile): seqName = seedFile.rsplit('.', 1)[0] seqName = re.sub('[\|\.]', '_', seqName) return(seqName) -def getIndividualRuntime(step, outpath, seeds): - logFile = outpath + '/runtime_core.txt' - searchTerm = 'Core set compilation finished in' - if step == 'ortho': - logFile = outpath + '/runtime_ortho.txt' - searchTerm = 'Ortholog search completed in' - log = open(logFile, "w") - for seed in seeds: - seqName = getSeedName(seed) - logFile = outpath + '/' + seqName + '/fdog.log' - if os.path.exists(logFile): - with open(logFile, 'r') as f: - for line in f: - if searchTerm in line: - runtime = line.split()[-2] - log.write('%s\t%s\n' % 
(seqName, runtime)) + +def create_core_jobs(args): + (seed, core_options, other_options, inFol, outpath, silentOff) = args + (coreArgs, orthoCoreArgs, otherCoreArgs) = core_options + (refspec, reuseCore, forceCore, pathArgs, debug) = other_options + (outpath, hmmpath, corepath, searchpath, annopath) = pathArgs + seqFile = ('%s/%s' % (inFol, seed)) + seqName = get_seed_name(seed) + if not os.path.exists('%s/core_orthologs/%s/hmm_dir/%s.hmm' % (outpath, seqName, seqName)) or forceCore == True: + seed_id = prepare_fn.identify_seed_id(seqFile, refspec, corepath, debug, silentOff) + if not seed_id == 'None': + return([seqFile, seqName, refspec, seed_id, + reuseCore, forceCore, coreArgs, pathArgs, orthoCoreArgs, + otherCoreArgs, debug]) else: - missing = open(outpath + '/missing.txt', 'a+') - missing.write(step + '\t' + seqName + '\n') - log.close() - -def compileCore(options, seeds, inFol, cpu, outpath): - print('Starting compiling core orthologs...') - start = time.time() - coreCompilationJobs = [] - for seed in seeds: - seqFile = [inFol + '/' + seed] - seqName = getSeedName(seed) - if not os.path.exists('%s/core_orthologs/%s/hmm_dir/%s.hmm' % (outpath, seqName, seqName)): - (basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, mute) = prepare(seqFile + [seqName] + options, 'core') - coreCompilationJobs.append([basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, mute]) - if len(coreCompilationJobs) > 0: - pool = mp.Pool(cpu) - coreOut = [] - for _ in tqdm(pool.imap_unordered(fdogFn.runSingle, coreCompilationJobs), total=len(coreCompilationJobs)): - coreOut.append(_) + print(f'WARNING: Cannot identify seed ID for {seqFile}!') + + +def compile_core(core_options, other_options, seeds, inFol, cpus, outpath, silentOff, jobName): + core_compilation_jobs = [] + (coreArgs, orthoCoreArgs, otherCoreArgs) = core_options + (cpus, debugCore, silentOff, noCleanup, force, append) = otherCoreArgs + (refspec, reuseCore, forceCore, pathArgs, debug) = 
other_options + (outpath, hmmpath, corepath, searchpath, annopath) = pathArgs + pool = mp.Pool(cpus) + begin = time.time() + print('Preparing core compilation jobs...') + core_job_file = '%s/%s_core_jobs.list' % (outpath, jobName) + if os.path.exists(core_job_file) and os.stat(core_job_file).st_size > 0: + print('... file contains jobs found (%s)' % core_job_file) + core_compilation_jobs = general_fn.read_pyobj_file(core_job_file) + else: + prepare_jobs = [] + for seed in seeds: + prepare_jobs.append([seed, core_options, other_options, inFol, outpath, silentOff]) + for _ in tqdm(pool.imap_unordered(create_core_jobs, prepare_jobs), total=len(prepare_jobs)): + core_compilation_jobs.append(_) + general_fn.save_pyobj(core_compilation_jobs, core_job_file) + end = time.time() + print('==> %s jobs will be run. Preparing finished in %s' % (len(core_compilation_jobs), '{:5.3f}s'.format(end - begin))) + if len(core_compilation_jobs) > 0: + core_runtime = [] + if debugCore == True or silentOff == True or len(core_compilation_jobs) == 1: + for job in core_compilation_jobs: + tmp_out = core_fn.run_compile_core(job) + core_runtime.append(tmp_out) + else: + for _ in tqdm(pool.imap_unordered(core_fn.run_compile_core, core_compilation_jobs), total=len(core_compilation_jobs)): + core_runtime.append(_) pool.close() pool.join() - # read logs file to get runtime for individual seeds - getIndividualRuntime('core', outpath, seeds) - end = time.time() - multiCoreTime = '{:5.3f}'.format(end-start) - print('==> Core compiling finished in %s sec' % multiCoreTime) #'{:5.3f}s'.format(end-start)) - return(multiCoreTime) - -def searchOrtho(options, seeds, inFol, cpu, outpath): - print('Searching orthologs for...') - start = time.time() - coreCompilationJobs = [] + out = [] + for r in core_runtime: + out.append('\t'.join(r)) + return(out) + + +def search_ortholog(options, seeds, inFol, outpath): + (orthoArgs, otherArgs, pathArgs, refspec) = options + (searchTaxa, cpus, debug, silentOff, 
noCleanup, force, append) = otherArgs + ortho_runtime = [] + n = 1 for seed in seeds: + begin = time.time() seqFile = [inFol + '/' + seed] - seqName = getSeedName(seed) - (basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, mute) = prepare(seqFile + [seqName] + options, 'ortholog') - if mute == True: - print(seed) - else: - print('\n##### ' + seed) - fdogFn.runSingle([basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, mute]) - end = time.time() - # read logs file to get runtime for individual seeds - getIndividualRuntime('ortho', outpath, seeds) - multiOrthoTime = '{:5.3f}'.format(end-start) - print('==> Ortholog search finished in %s sec' % multiOrthoTime) - return(multiOrthoTime) - -def joinOutputs(outpath, jobName, seeds, keep, silent): - print('Joining single outputs...') + seqName = get_seed_name(seed) + print('... %s (%s / %s)' % (seqName, n, len(seeds))) + if not os.path.exists('%s/%s.extended.fa' % (outpath, seqName)) or force == True: + hamstr_out = ortho_fn.run_hamstr([seqName, refspec, pathArgs, orthoArgs, otherArgs]) + output_fn.write_hamstr(hamstr_out, outpath, seqName, force, append) + end = time.time() + ortho_runtime.append('%s\t%s' % (seqName, '{:5.3f}s'.format(end - begin))) + n += 1 + return(ortho_runtime) + + +def join_outputs(outpath, jobName, seeds, keep, silentOff): finalFa = '%s/%s.extended.fa' % (outpath, jobName) - finalPP = open('%s/%s.phyloprofile' % (outpath, jobName), 'wb') - Path(outpath+'/singleOutput').mkdir(parents=True, exist_ok=True) + single_output_fol = '%s/%s_singleOutput' % (outpath, jobName) + Path('%s/%s_singleOutput' % (outpath, jobName)).mkdir(parents=True, exist_ok=True) with open(finalFa,'wb') as wfd: for seed in seeds: - seqName = getSeedName(seed) - resultFile = '%s/%s/%s.extended.fa' % (outpath, seqName, seqName) - resultPP ='%s/%s/%s.phyloprofile' % (outpath, seqName, seqName) - if silent == False: + seqName = get_seed_name(seed) + resultFile = '%s/%s.extended.fa' % (outpath, 
seqName) + if silentOff == True: print(resultFile) if os.path.exists(resultFile): with open(resultFile,'rb') as fd: shutil.copyfileobj(fd, wfd) - with open(resultPP,'rb') as pp: - shutil.copyfileobj(pp, finalPP) - shutil.move(outpath + '/' + seqName, outpath + '/singleOutput') - else: - Path(outpath+'/missingOutput').mkdir(parents=True, exist_ok=True) - if not os.path.exists(outpath + '/missingOutput/' + seqName): - shutil.move(outpath + '/' + seqName, outpath + '/missingOutput') + if not os.path.exists('%s/%s.extended.fa' % (single_output_fol, seqName)): + shutil.move(resultFile, single_output_fol) if os.path.exists(outpath + '/' + seqName + '.fa'): os.remove(outpath + '/' + seqName + '.fa') if os.path.exists(os.getcwd() + '/' + seqName + '.fa'): @@ -173,59 +154,19 @@ def joinOutputs(outpath, jobName, seeds, keep, silent): if keep == True: try: print('Compressing single outputs...') - shutil.make_archive(outpath + '/' + jobName + '_singleOutput', 'gztar', outpath+'/singleOutput') + shutil.make_archive(single_output_fol, 'gztar', single_output_fol) except: - shutil.make_archive(outpath + '/' + jobName + '_singleOutput', 'tar', outpath+'/singleOutput') - shutil.rmtree(outpath + '/singleOutput') - return(finalFa) - -def removeDupLines (infilename, outfilename): - lines_seen = set() # holds lines already seen - outfile = open(outfilename, "w") - for line in open(infilename, "r"): - if line not in lines_seen: # not a duplicate - outfile.write(line) - lines_seen.add(line) - outfile.close() - -def calcFAS (outpath, extendedFa, weightpath, cpu): - print('Starting calculating FAS scores...') - start = time.time() - fasCmd = 'fas.runFdogFas -i %s -w %s --cores %s --redo_anno' % (extendedFa, weightpath, cpu) - try: - subprocess.call([fasCmd], shell = True) - end = time.time() - if os.path.exists(outpath + '/tmp'): - shutil.rmtree(outpath + '/tmp') - fasTime = '{:5.3f}s'.format(end-start) - print('==> FAS calculation finished in %s sec' % fasTime) - return(fasTime) - except: 
- sys.exit('Problem running\n%s' % (fasCmd)) - -def createConfigPP(outpath, jobName, refspec): - settings = dict( - mainInput = '%s/%s.phyloprofile' % (outpath, jobName), - fastaInput = '%s/%s.extended.fa' % (outpath, jobName), - ) - domainFile = '%s/%s_forward.domains' % (outpath, jobName) - if os.path.exists(os.path.abspath(domainFile)): - settings['domainInput'] = domainFile - taxId = refspec.split('@')[1] - refspec = fdogFn.getTaxName(taxId) - if not refspec == 'UNK': - settings['rank'] = 'species' - settings['refspec'] = refspec - settings['clusterProfile'] = 'TRUE' - with open('%s/%s.config.yml' % (outpath, jobName), 'w') as configfile: - yaml.dump(settings, configfile, default_flow_style = False) + shutil.make_archive(single_output_fol, 'tar', single_output_fol) + shutil.rmtree(single_output_fol) + def main(): - version = '0.0.52' - parser = argparse.ArgumentParser(description='You are running fdogs.run version ' + str(version) + '.') - parser.add_argument('--version', action='version', version=str(version)) + version = get_distribution('fdog').version + parser = argparse.ArgumentParser(description='You are running fDOG version ' + str(version) + '.', + epilog="For more information on certain options, please refer to the wiki pages " + "on github: https://github.com/BIONF/fDOG/wiki") required = parser.add_argument_group('Required arguments') - required.add_argument('--input', help='Input folder containing the seed sequences (protein only) in fasta format', + required.add_argument('--seqFolder', help='Input folder containing the seed sequences (protein only) in fasta format', action='store', default='', required=True) required.add_argument('--jobName', help='Job name. 
This will also be file name for the output', action='store', default='', required=True) @@ -235,22 +176,11 @@ def main(): optional_paths = parser.add_argument_group('Non-default directory options') optional_paths.add_argument('--outpath', help='Output directory', action='store', default='') optional_paths.add_argument('--hmmpath', help='Path for the core ortholog directory', action='store', default='') - optional_paths.add_argument('--blastpath', help='Path for the blastDB directory', action='store', default='') + optional_paths.add_argument('--corepath', help='Path for the core taxa directory', action='store', default='') optional_paths.add_argument('--searchpath', help='Path for the search taxa directory', action='store', default='') - optional_paths.add_argument('--weightpath', help='Path for the pre-calculated feature annotion directory', action='store', default='') + optional_paths.add_argument('--annopath', help='Path for the pre-calculated feature annotion directory', action='store', default='') optional_paths.add_argument('--pathFile', help='Config file contains paths to data folder (in yaml format)', action='store', default='') - addtionalIO = parser.add_argument_group('Other I/O options') - addtionalIO.add_argument('--append', help='Append the output to existing output files', action='store_true', default=False) - addtionalIO.add_argument('--force', help='Overwrite existing output files', action='store_true', default=False) - addtionalIO.add_argument('--forceComplete', help='Overwrite existing core orthologs and all output files', action='store_true', default=False) - addtionalIO.add_argument('--noCleanup', help='Temporary output will NOT be deleted. Default: False', action='store_true', default=False) - addtionalIO.add_argument('--keep', help='Keep output of individual seed sequence. 
Default: False', action='store_true', default=False) - addtionalIO.add_argument('--group', help='Allows to limit the search to a certain systematic group', action='store', default='') - addtionalIO.add_argument('--blast', help='Determine sequence id and refspec automatically. Note, the chosen sequence id and reference species does not necessarily reflect the species the sequence was derived from.', - action='store_true', default=False) - addtionalIO.add_argument('--db', help='Run fdog in database mode. Requires a mySql database. Only for internal use.', action='store_true', default=False) - core_options = parser.add_argument_group('Core compilation options') core_options.add_argument('--coreOnly', help='Compile only the core orthologs', action='store_true', default=False) core_options.add_argument('--reuseCore', help='Reuse existing core set of your sequence', action='store_true', default=False) @@ -260,12 +190,8 @@ def main(): core_options.add_argument('--maxDist', help='Maximum systematic distance of primer taxa for the core set compilation. Default: kingdom', choices=['species', 'genus', 'family', 'order', 'class', 'phylum', 'kingdom', 'superkingdom'], action='store', default='kingdom') - core_options.add_argument('--coreOrth', help='Number of orthologs added to the core set. Default: 5', action='store', default=5, type=int) + core_options.add_argument('--coreSize', help='Maximul number of orthologs in core set. 
Default: 6', action='store', default=6, type=int) core_options.add_argument('--coreTaxa', help='List of primer taxa that should exclusively be used for the core set compilation', action='store', default='') - core_options.add_argument('--coreStrict', help='An ortholog is only then accepted when the reciprocity is fulfilled for each sequence in the core set', - action='store_true', default=False) - core_options.add_argument('--CorecheckCoorthologsRef', help='During the core compilation, an ortholog also be accepted when its best hit in the reverse search is not the core ortholog itself, but a co-ortholog of it', - action='store_true', default=True) core_options.add_argument('--CorecheckCoorthologsOff', help='Turn off checking for co-ortholog of the reverse search during the core compilation', action='store_true', default=False) core_options.add_argument('--coreRep', help='Obtain only the sequence being most similar to the corresponding sequence in the core set rather than all putative co-orthologs', @@ -274,20 +200,14 @@ def main(): action='store', default=3, type=int) core_options.add_argument('--distDeviation', help='The deviation in score in percent (0 = 0 percent, 1 = 100 percent) allowed for two taxa to be considered similar. Default: 0.05', action='store', default=0.05, type=float) - core_options.add_argument('--ignoreDistance', help='Ignore the distance between Taxa and to choose orthologs only based on score', - action='store_true', default=False) - core_options.add_argument('--local', help='Specify the alignment strategy during core ortholog compilation. Default: True', - action='store_true', default=True) - core_options.add_argument('--glocal', help='Specify the alignment strategy during core ortholog compilation. 
Default: False', - action='store_true', default=False) - - ortho_options = parser.add_argument_group('Search strategy options') - ortho_options.add_argument('--searchTaxa', help='Specify list of search taxa', action='store', default='') - ortho_options.add_argument('--strict', help='An ortholog is only then accepted when the reciprocity is fulfilled for each sequence in the core set', - action='store_true', default=False) - ortho_options.add_argument('--checkCoorthologsRef', help='During the final ortholog search, accept an ortholog also when its best hit in the reverse search is not the core ortholog itself, but a co-ortholog of it', - action='store_true', default=True) - ortho_options.add_argument('--checkCoorthologsOff', help='Turn off checking for co-ortholog of the reverse search during the final ortholog search', + core_options.add_argument('--alnStrategy', help='Specify the alignment strategy during core ortholog compilation. Default: local', + choices=['local', 'glocal', 'global'], + action='store', default='local') + + ortho_options = parser.add_argument_group('Ortholog search strategy options') + ortho_options.add_argument('--searchTaxa', help='Specify file contains list of search taxa', action='store', default='') + ortho_options.add_argument('--group', help='Allows to limit the search to a certain systematic group', action='store', default='') + ortho_options.add_argument('--checkCoorthologsRefOff', help='Turn off checking for co-ortholog of the reverse search during the final ortholog search', action='store_true', default=False) ortho_options.add_argument('--rbh', help='Requires a reciprocal best hit during the ortholog search to accept a new ortholog', action='store_true', default=False) @@ -295,269 +215,249 @@ def main(): action='store_true', default=False) ortho_options.add_argument('--lowComplexityFilter', help='Switch the low complexity filter for the blast search on. 
Default: False', action='store_true', default=False) - ortho_options.add_argument('--evalBlast', help='E-value cut-off for the Blast search. Default: 0.00001', - action='store', default=0.00005, type=float) - ortho_options.add_argument('--evalHmmer', help='E-value cut-off for the HMM search. Default: 0.00001', - action='store', default=0.00005, type=float) - ortho_options.add_argument('--evalRelaxfac', help='The factor to relax the e-value cut-off (Blast search and HMM search). Default: 10', - action='store', default=10, type=int) + ortho_options.add_argument('--evalBlast', help='E-value cut-off for the Blast search. Default: 0.0001', + action='store', default=0.0001, type=float) + ortho_options.add_argument('--evalHmmer', help='E-value cut-off for the HMM search. Default: 0.0001', + action='store', default=0.0001, type=float) ortho_options.add_argument('--hitLimit', help='number of hits of the initial pHMM based search that should be evaluated via a reverse search. Default: 10', action='store', default=10, type=int) - ortho_options.add_argument('--autoLimit', help='Invoke a lagPhase analysis on the score distribution from the hmmer search. This will determine automatically a hit limit for each query. Note, it will be effective for both the core compilation and the final ortholog search', - action='store_true', default=False) - ortho_options.add_argument('--scoreThreshold', help='Instead of setting an automatic hit limit, you can specify with this flag that only candidates with an hmm score no less than x percent of the hmm score of the best hit are further evaluated. Default: x = 10. You can change this cutoff with the option -scoreCutoff. 
Note, it will be effective for both the core compilation and the final ortholog search', - action='store_true', default=False) - ortho_options.add_argument('--scoreCutoff', help='In combination with -scoreThreshold you can define the percent range of the hmms core of the best hit up to which a candidate of the hmmsearch will be subjected for further evaluation. Default: 10', + ortho_options.add_argument('--scoreCutoff', help='Define the percent range of the hmms core of the best hit up to which a candidate of the hmmsearch will be subjected for further evaluation. Default: 10', action='store', default=10, type=int) fas_options = parser.add_argument_group('FAS options') - fas_options.add_argument('--fasoff', help='Turn OFF FAS support', action='store_true', default=False) - fas_options.add_argument('--countercheck', help='The FAS score will be computed in two ways', action='store_true', default=True) fas_options.add_argument('--coreFilter', help='Specifiy mode for filtering core orthologs by FAS score. In \'relaxed\' mode candidates with insufficient FAS score will be disadvantaged. In \'strict\' mode candidates with insufficient FAS score will be deleted from the candidates list. The option \'--minScore\' specifies the cut-off of the FAS score.', choices=['relaxed', 'strict'], action='store', default='') fas_options.add_argument('--minScore', help='Specify the threshold for coreFilter. 
Default: 0.75', action='store', default=0.75, type=float) + addtionalIO = parser.add_argument_group('Other I/O options') + addtionalIO.add_argument('--append', help='Append the output to existing output files', action='store_true', default=False) + addtionalIO.add_argument('--force', help='Overwrite existing ortholog search output files', action='store_true', default=False) + addtionalIO.add_argument('--forceCore', help='Overwrite existing core set of your sequence', action='store_true', default=False) + addtionalIO.add_argument('--notAddingTaxa', help='Do not add all search taxa to phyloprofile output', action='store_true', default=False) + addtionalIO.add_argument('--noCleanup', help='Temporary output will NOT be deleted. Default: False', action='store_true', default=False) + addtionalIO.add_argument('--keep', help='Keep output of individual seed sequence. Default: False', action='store_true', default=False) + addtionalIO.add_argument('--debug', help='Set this flag to obtain more detailed information about the ortholog search progress', action='store_true', default=False) + addtionalIO.add_argument('--debugCore', help='Set this flag to obtain more detailed information about the core compilation actions', action='store_true', default=False) + addtionalIO.add_argument('--silentOff', help='Show more output to terminal', action='store_true', default=False) + optional = parser.add_argument_group('Other options') + optional.add_argument('--fasOff', help='Turn OFF FAS support', action='store_true', default=False) optional.add_argument('--aligner', help='Choose between mafft-linsi or muscle for the multiple sequence alignment. DEFAULT: muscle', choices=['mafft-linsi', 'muscle'], action='store', default='muscle') - optional.add_argument('--cpu', help='Determine the number of threads to be run in parallel. Default: 4', action='store', default=4, type=int) - optional.add_argument('--hyperthread', help='Set this flag to use hyper threading. 
Default: False', action='store_true', default=False) - optional.add_argument('--checkOff', help='Set this flag to turn of the initial checks. Default: False', action='store_true', default=False) - optional.add_argument('--debug', help='Set this flag to obtain more detailed information about the programs actions', action='store_true', default=False) - optional.add_argument('--silentOff', help='Show more output to terminal', action='store_true', default=False) + optional.add_argument('--cpus', help='Determine the number of threads to be run in parallel. Default: 4', action='store', default=4, type=int) ### get arguments args = parser.parse_args() # required arguments - inFol = os.path.abspath(args.input) + inFol = os.path.abspath(args.seqFolder) jobName = args.jobName refspec = args.refspec - minDist = args.minDist - maxDist = args.maxDist - coreOrth = args.coreOrth - # path arguments outpath = os.path.abspath(args.outpath) hmmpath = args.hmmpath - blastpath = args.blastpath + corepath = args.corepath searchpath = args.searchpath - weightpath = args.weightpath + annopath = args.annopath pathFile = args.pathFile - # other I/O arguments - append = args.append - force = args.force - forceComplete = args.forceComplete - noCleanup = args.noCleanup - keep = args.keep - group = args.group - blast = args.blast - db = args.db - # core compilation arguments coreOnly = args.coreOnly reuseCore = args.reuseCore + minDist = args.minDist + maxDist = args.maxDist + coreSize = args.coreSize coreTaxa = args.coreTaxa - coreStrict = args.coreStrict - CorecheckCoorthologsRef = args.CorecheckCoorthologsRef + if not coreTaxa == '': + if os.path.exists(os.path.abspath(coreTaxa)): + coreTaxa = os.path.abspath(coreTaxa) CorecheckCoorthologsOff = args.CorecheckCoorthologsOff - if CorecheckCoorthologsOff == True: - CorecheckCoorthologsRef = False coreRep = args.coreRep coreHitLimit = args.coreHitLimit distDeviation = args.distDeviation + alnStrategy = args.alnStrategy # ortholog search 
arguments - strict = args.strict - checkCoorthologsRef = args.checkCoorthologsRef - checkCoorthologsOff = args.checkCoorthologsOff - if checkCoorthologsOff == True: - checkCoorthologsRef = False + searchTaxa = args.searchTaxa + if not searchTaxa == '': + if os.path.exists(os.path.abspath(searchTaxa)): + searchTaxa = os.path.abspath(searchTaxa) + group = args.group + if not group == '' and not searchTaxa == '': + print('WARNING: Both --group and --searchTaxa are specified. Search taxa will be obtained only from %s!' % searchTaxa) + group = '' + checkCoorthologsRefOff = args.checkCoorthologsRefOff rbh = args.rbh rep = args.rep - ignoreDistance = args.ignoreDistance lowComplexityFilter = args.lowComplexityFilter evalBlast = args.evalBlast evalHmmer = args.evalHmmer - evalRelaxfac = args.evalRelaxfac hitLimit = args.hitLimit - autoLimit = args.autoLimit - scoreThreshold = args.scoreThreshold scoreCutoff = args.scoreCutoff - aligner = args.aligner - local = args.local - glocal = args.glocal - searchTaxa = args.searchTaxa # fas arguments - fasoff = args.fasoff - countercheck = args.countercheck + fasOff = args.fasOff coreFilter = args.coreFilter minScore = args.minScore - # others - cpu = args.cpu - hyperthread = args.hyperthread - checkOff = args.checkOff + # other I/O arguments + append = args.append + force = args.force + forceCore = args.forceCore + noCleanup = args.noCleanup + keep = args.keep debug = args.debug + debugCore = args.debugCore silentOff = args.silentOff - if silentOff == True: - silent = False - else: - silent = True - - ### check fas - if not fasoff: - try: - fasVersion = subprocess.run(['fas.run --version'], shell = True, capture_output = True, check = True) - except: - sys.exit('Problem with FAS! 
Please check https://github.com/BIONF/FAS or turn it off if not needed!') - - ### delete output folder and files if needed - if forceComplete: - if os.path.exists(outpath): - print("Removing existing output directory %s" % outpath) - shutil.rmtree(outpath) - Path(outpath).mkdir(parents=True, exist_ok=True) - if force: - if os.path.exists(outpath): - print("Removing existing files %s in %s*" % (jobName, outpath)) - outfiles = os.listdir(outpath) - for item in outfiles: - if item.startswith(jobName): - try: - os.remove(os.path.join(outpath, item)) - except: - shutil.rmtree(outpath+'/'+item) - if item.startswith("runtime"): - os.remove(os.path.join(outpath, item)) - if os.path.exists(outpath + '/missing.txt'): - os.remove(outpath + '/missing.txt') - - ### get fdog and data path - dataPath = '' - fdogPath = os.path.realpath(__file__).replace('/runMulti.py','') - pathconfigFile = fdogPath + '/bin/pathconfig.txt' - if not os.path.exists(pathconfigFile): - sys.exit('No pathconfig.txt found. Please run fdog.setup (https://github.com/BIONF/fDOG/wiki/Installation#setup-fdog).') - if pathFile == '': - with open(pathconfigFile) as f: - dataPath = f.readline().strip() - else: - cfg = fdogFn.load_config(pathFile) - try: - dataPath = cfg['dataPath'] - except: - dataPath = 'config' - - if hmmpath == '': - hmmpath = outpath + '/core_orthologs' - # hmmpath = dataPath + '/core_orthologs' - # if dataPath == 'config': - # try: - # hmmpath = cfg['hmmpath'] - # except: - # sys.exit('hmmpath not found in %s. Please check https://github.com/BIONF/fDOG/wiki/Input-and-Output-Files#data-structure' % pathFile) - else: - hmmpath = os.path.abspath(hmmpath) - if blastpath == '': - blastpath = dataPath + '/blast_dir' - if dataPath == 'config': - try: - blastpath = cfg['blastpath'] - except: - sys.exit('blastpath not found in %s. 
Please check https://github.com/BIONF/fDOG/wiki/Input-and-Output-Files#data-structure' % pathFile) - if searchpath == '': - searchpath = dataPath + '/genome_dir' - if dataPath == 'config': - try: - searchpath = cfg['searchpath'] - except: - sys.exit('searchpath not found in %s. Please check https://github.com/BIONF/fDOG/wiki/Input-and-Output-Files#data-structure' % pathFile) - if weightpath == '': - weightpath = dataPath + '/weight_dir' - if dataPath == 'config': - try: - weightpath = cfg['weightpath'] - except: - sys.exit('weightpath not found in %s. Please check https://github.com/BIONF/fDOG/wiki/Input-and-Output-Files#data-structure' % pathFile) + notAddingTaxa = args.notAddingTaxa + # others + aligner = args.aligner + cpus = args.cpus + if cpus > os.cpu_count(): + cpus = os.cpu_count() + + + begin = time.time() + ##### Check and group parameters + print('##### PREPARING & CHECKING #####') + (inFol, hmmpath, corepath, searchpath, annopath) = prepare_fn.check_input( + [inFol, refspec, outpath, hmmpath, + corepath, searchpath, annopath, pathFile]) + pathArgs = [outpath, hmmpath, corepath, searchpath, annopath] + prepare_fn.check_blast_version(corepath, refspec) + + (invalid_minDist, invalid_maxDist, suggested_minRank, suggested_maxRank) = prepare_fn.check_ranks_core_taxa(corepath, refspec, minDist, maxDist) + if len(invalid_minDist) > 0 or len(invalid_maxDist) > 0: + if len(invalid_minDist) > 0: + print(f'Invalid {minDist} (--minDist) for:\t{", ".join(invalid_minDist)}') + if len(invalid_maxDist) > 0: + print(f'Invalid {maxDist} (--maxDist) for:\t{", ".join(invalid_maxDist)}') + if not minDist == "genus" and not maxDist == "kingdom": + print(f'Please consider setting --minDist and --maxDist with these valid ranks:\n--minDist {suggested_minRank} --maxDist {suggested_maxRank}') + sys.exit() + else: + print(f'WARNING: --minDist and --maxDist will be automatically changed to {suggested_minRank} and {suggested_maxRank}') + minDist = suggested_minRank + maxDist = 
suggested_maxRank - ### join options - options = [fdogPath, refspec, minDist, maxDist, coreOrth, - append, force, noCleanup, group, blast, db, - outpath, hmmpath, blastpath, searchpath, weightpath, - coreOnly, reuseCore, coreTaxa, coreStrict, CorecheckCoorthologsRef, coreRep, coreHitLimit, distDeviation, - fasoff, countercheck, coreFilter, minScore, - strict, checkCoorthologsRef, rbh, rep, ignoreDistance, lowComplexityFilter, evalBlast, evalHmmer, evalRelaxfac, hitLimit, autoLimit, scoreThreshold, scoreCutoff, aligner, local, glocal, searchTaxa, - cpu, hyperthread, checkOff, debug, silent] + if not fasOff: + check_fas = fas_fn.check_fas_executable() + if check_fas == 0: + sys.exit('ERROR: FAS is not executable! You still can use fDOG with --fasOff!') ### START Path(outpath).mkdir(parents=True, exist_ok=True) - multiLog = open(outpath + '/' + jobName + '_log.txt', "w") - fdogStart = time.time() - seeds = getSortedFiles(inFol) - print('PID ' + str(os.getpid())) - multiLog.write('PID ' + str(os.getpid()) + '\n') + multiLog = open(outpath + '/' + jobName + '_runtime.txt', "w") - ### run core compilation + print('PID %s - Jobname %s'% (str(os.getpid()), jobName)) + multiLog.write('PID %s - Jobname %s\n'% (str(os.getpid()), jobName)) + seeds = get_sorted_files(inFol) + end = time.time() + print('==> Sort seed files finished in ' + '{:5.3f}s'.format(end - begin)) + + ##### DO CORE COMPILATION if reuseCore == False: - multiCoreTime = compileCore(options, seeds, inFol, cpu, outpath) - multiLog.write('==> Core compilation finished in %s sec\n' % multiCoreTime) + print('##### COMPILING CORE ORTHOLOG GROUPS #####') + start = time.time() + coreArgs = [minDist, maxDist, coreSize, coreTaxa, distDeviation, + alnStrategy, fasOff] + orthoCoreArgs = [CorecheckCoorthologsOff, rbh, True, evalBlast/10, + lowComplexityFilter, evalHmmer/10, coreHitLimit, + scoreCutoff, aligner] # rep = True; e-value cutoff is 10x more stringent than from ortho search + otherCoreArgs = [cpus, debugCore, 
silentOff, noCleanup, force, append] + core_options = [coreArgs, orthoCoreArgs, otherCoreArgs] + other_options = [refspec, reuseCore, forceCore, pathArgs, debug] + core_runtime = compile_core(core_options, other_options, seeds, inFol, cpus, outpath, silentOff, jobName) + end = time.time() + multi_core_time = '{:5.3f}'.format(end-start) + print('==> Core compilation finished in %ss\n' % multi_core_time) + if len(core_runtime) > 1: + multiLog.write('==> Core compilation finished in %ss\n%s\n' % (multi_core_time, '\n'.join(core_runtime))) + else: + multiLog.write('==> Core compilation finished in %ss\n' % multi_core_time) else: if not os.path.exists(hmmpath): sys.exit('--reuseCore was set, but no core orthologs found in %s! You could use --hmmpath to manually specify the core ortholog directory.' % outpath) - ### do ortholog search - if coreOnly == False: - if not os.path.exists('%s/%s.extended.fa' % (outpath, jobName)): - ### create list of search taxa - searchTaxa = '' - searchGroup = 'all' + + ##### DO ORTHOLOG SEARCH USING HMM (HAMSTR) + finalFa = '%s/%s.extended.fa' % (outpath, jobName) + if not coreOnly: + print('##### SEARCHING ORTHOLOGS #####') + start = time.time() + if not os.path.exists(finalFa) or force == True: + ### get list of search taxa if not group == '': - print('Creating list for search taxa...') - searchTaxa = '%s/searchTaxa.txt' % (outpath) - searchGroup = group - cmd = 'perl %s/bin/getSearchTaxa.pl -i %s -b %s -h %s -r %s -n %s -t %s/taxonomy -o %s' % (fdogPath, searchpath, evalBlast, evalHmmer, evalRelaxfac, searchGroup, fdogPath, searchTaxa) - try: - subprocess.call([cmd], shell = True) - except: - sys.exit('Problem running\n%s' % (cmd)) - ### run ortholog search - multiOrthoTime = searchOrtho(options, seeds, inFol, cpu, outpath) - multiLog.write('==> Ortholog search finished in %s sec\n' % multiOrthoTime) - ### join output - finalFa = joinOutputs(outpath, jobName, seeds, keep, silent) - else: - if append == True: - sys.exit("Currently the 
append option is not available. Please use fdog.run if you need this option!") - else: - sys.exit("%s.extended.fa found in %s! If you want to re-run the ortholog search, please use --force or --append option." % (jobName, outpath)) - ### calculate FAS scores - if fasoff == False: - if os.path.exists('%s/%s.phyloprofile' % (outpath, jobName)): - os.remove('%s/%s.phyloprofile' % (outpath, jobName)) - if not os.path.exists('%s/%s.phyloprofile' % (outpath, jobName)): - if os.path.exists(finalFa) and os.path.getsize(finalFa) > 0: - fasTime = calcFAS(outpath, finalFa, weightpath, cpu) - multiLog.write('==> FAS calculation finished in %s sec\n' % fasTime) + ### Check valid taxonomy group + ncbi = NCBITaxa() + group_id = ncbi.get_name_translator([group]) + if len(group_id) == 0: + exit('ERROR: Taxon group "%s" invalid!' % group) + ### create taxonomy tree from list of search taxa + searchTaxa = [] + tax_ids = core_fn.get_core_taxa_ids(coreTaxa, corepath) + + for tax_id in tax_ids.keys(): + check = tree_fn.check_taxon_group(group_id[group][0], tax_id, ncbi) + if check == True: + searchTaxa.append(tax_ids[tax_id]) + if debugCore: + print(searchTaxa) + if len(searchTaxa) == 0: + exit('ERROR: No taxon found within %s taxonomy group!' % group) else: - print("Final fasta file %s not exists or empty!" 
% finalFa) + searchTaxa = ','.join(searchTaxa) + + if len(searchTaxa) == '': + searchTaxa = general_fn.read_dir(searchpath) + searchTaxa = ','.join(searchTaxa) + + ### do ortholog search + orthoArgs = [checkCoorthologsRefOff, rbh, rep, evalBlast, + lowComplexityFilter, evalHmmer, hitLimit, scoreCutoff, aligner] + otherArgs = [searchTaxa, cpus, debug, silentOff, noCleanup, force, append] + ortho_options = [orthoArgs, otherArgs, pathArgs, refspec] + ortho_runtime = search_ortholog(ortho_options, seeds, inFol, outpath) + end = time.time() + multi_ortho_time = '{:5.3f}'.format(end-start) + print('==> Ortholog search finished in %ss\n' % multi_ortho_time) + multiLog.write('==> Ortholog search finished in %ss\n%s\n' % (multi_ortho_time, '\n'.join(ortho_runtime))) + ### join output + print('Joining single outputs...') + start = time.time() + join_outputs(outpath, jobName, seeds, keep, silentOff) + end = time.time() + print('==> Joining outputs finished in %ss\n' % '{:5.3f}'.format(end-start)) + + ##### DO FINAL FAS CALCULATION + if not fasOff: + print('##### CALCULATING FAS SCORES #####') + try: + fasVersion = subprocess.run(['fas.run --version'], shell = True, capture_output = True, check = True) + except: + sys.exit('Problem with FAS! 
Please check https://github.com/BIONF/FAS or turn it off if not needed!') + if os.path.exists(finalFa): + start = time.time() + fas_fn.calc_fas_multi(finalFa, outpath, annopath, cpus) + end = time.time() + print('==> FAS calculation finished in ' + '{:5.3f}s'.format(end - start)) + multiLog.write('==> FAS calculation finished in ' + '{:5.3f}s'.format(end - start)) else: - shutil.move('%s/%s.phyloprofile' % (outpath, jobName), '%s/%s.phyloprofile.tmp' % (outpath, jobName)) - removeDupLines ('%s/%s.phyloprofile.tmp' % (outpath, jobName), '%s/%s.phyloprofile' % (outpath, jobName)) - os.remove('%s/%s.phyloprofile.tmp' % (outpath, jobName)) + output_fn.hamstr_2_profile(finalFa) - ### create PhyloProfile config file - createConfigPP(outpath, jobName, refspec) + ##### ADD ALL SEARCH TAXA INTO PhyloProfile OUTPUT + if not notAddingTaxa: + pp_file = f'{outpath}/{jobName}.phyloprofile' + if not searchTaxa: + tmp = general_fn.read_dir(searchpath) + searchTaxa = ','.join(tmp) + output_fn.add_all_taxa(pp_file, searchTaxa) - fdogEnd = time.time() - print('==> fdogs.run finished in ' + '{:5.3f}s'.format(fdogEnd-fdogStart)) - multiLog.write('==> fdogs.run finished in ' + '{:5.3f}s'.format(fdogEnd-fdogStart)) - multiLog.close() + end = time.time() + print('==> fdogs.run finished in ' + '{:5.3f}s'.format(end - begin)) if __name__ == '__main__': main() diff --git a/fdog/runSingle.py b/fdog/runSingle.py index df22bd2..8f0de85 100644 --- a/fdog/runSingle.py +++ b/fdog/runSingle.py @@ -1,9 +1,9 @@ # -*- coding: utf-8 -*- ####################################################################### -# Copyright (C) 2020 Vinh Tran +# Copyright (C) 2022 Vinh Tran # -# This script is used to run fdog for one seed sequence. 
+# This file is part of fDOG tool https://github.com/BIONF/fDOG # # This script is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of @@ -19,193 +19,28 @@ import os import argparse import subprocess -from pathlib import Path -import yaml from ete3 import NCBITaxa +from pkg_resources import get_distribution +import time +import fdog.libs.zzz as general_fn +import fdog.libs.preparation as prepare_fn +import fdog.libs.orthosearch as ortho_fn +import fdog.libs.corecompile as core_fn +import fdog.libs.fas as fas_fn +import fdog.libs.tree as tree_fn +import fdog.libs.output as output_fn -def checkFileExist(file): - if not os.path.exists(os.path.abspath(file)): - sys.exit('%s not found' % file) - -def load_config(config_file): - with open(config_file, 'r') as stream: - try: - return yaml.safe_load(stream) - except yaml.YAMLError as exc: - print(exc) - -def checkInput(args): - (fdogPath, seqFile, refspec, outpath, hmmpath, blastpath, searchpath, weightpath) = args - # create output directory - Path(outpath).mkdir(parents=True, exist_ok=True) - Path(hmmpath).mkdir(parents=True, exist_ok=True) - # check path existing - for path in [hmmpath, blastpath, searchpath, weightpath]: - checkFileExist(path) - # check for seqFile - if not os.path.exists(os.path.abspath(seqFile)): - if not os.path.exists(fdogPath + '/data/' + seqFile): - sys.exit('%s not found in %s or %s' % (seqFile, os.getcwd(), fdogPath + '/data/')) - else: - seqFile = fdogPath + '/data/' + seqFile - else: - seqFile = os.path.abspath(seqFile) - # check refspec - if not os.path.exists(os.path.abspath(blastpath+'/'+refspec)): - exit('Reference taxon %s not found in %s' % (refspec, blastpath)) - return (seqFile, hmmpath, blastpath, searchpath, weightpath) - -def getfdogInfo(fdogPath, infoType): - if os.path.exists(fdogPath + '/bin/oneSeq.pl'): - cmd = subprocess.Popen([fdogPath + '/bin/oneSeq.pl', infoType], stdin=subprocess.PIPE, stdout=subprocess.PIPE, 
stderr=subprocess.PIPE) - msg, err = cmd.communicate() - print(msg.decode('UTF-8').strip()) - print(err.decode('UTF-8').strip()) - exit() - else: - exit('%s not found' % (fdogPath + '/bin/oneSeq.pl')) - -def runSingle(args): - (basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, mute) = args - # basic command - (fdogPath, seqFile, seqName, refspec, minDist, maxDist, coreOrth) = basicArgs - cmd = 'perl %s/bin/oneSeq.pl -seqFile=%s -seqName=%s -refspec=%s' % (fdogPath, seqFile, seqName, refspec) - # add paths - (outpath, hmmpath, blastpath, searchpath, weightpath) = pathArgs - cmd = cmd + ' -outpath=%s -hmmpath=%s -blastpath=%s -searchpath=%s -weightpath=%s' % (outpath, hmmpath, blastpath, searchpath, weightpath) - # add other I/O options - (append, force, noCleanup, group, blast, db) = ioArgs - if append == True: - cmd = cmd + ' -append' - if force == True: - cmd = cmd + ' -force' - if noCleanup == False: - cmd = cmd + ' -cleanup' - if blast == True: - cmd = cmd + ' -blast' - if db == True: - cmd = cmd + ' -db' - if not group == '': - cmd = cmd + ' -group=%s' % group - # add core compilation options - (coreOnly, reuseCore, coreTaxa, coreStrict, CorecheckCoorthologsRef, coreRep, coreHitLimit, distDeviation) = coreArgs - if coreOnly == True: - cmd = cmd + ' -coreOnly' - if reuseCore == True: - cmd = cmd + ' -reuseCore' - else: - cmd = cmd + ' -minDist=%s -maxDist=%s -coreOrth=%s' % (minDist, maxDist, coreOrth) - if not coreTaxa == '': - cmd = cmd + ' -coreTaxa=%s' % coreTaxa - if coreStrict == True: - cmd = cmd + ' -coreStrict' - if CorecheckCoorthologsRef == True: - cmd = cmd + ' -CorecheckCoorthologsRef' - if coreRep == True: - cmd = cmd + ' -coreRep' - if not coreHitLimit == 3: - cmd = cmd + ' -coreHitLimit=%s' % coreHitLimit - if not distDeviation == 0.05: - cmd = cmd + ' -distDeviation=%s' % distDeviation - # add ortholo search options - (strict, checkCoorthologsRef, rbh, rep, ignoreDistance, lowComplexityFilter, evalBlast, evalHmmer, 
evalRelaxfac, hitLimit, autoLimit, scoreThreshold, scoreCutoff, aligner, local, glocal, searchTaxa) = orthoArgs - if strict == True: - cmd = cmd + ' -strict' - if checkCoorthologsRef == True: - cmd = cmd + ' -checkCoorthologsRef' - if rbh == True: - cmd = cmd + ' -rbh' - if rep == True: - cmd = cmd + ' -rep' - if ignoreDistance == True: - cmd = cmd + ' -ignoreDistance' - if lowComplexityFilter == True: - cmd = cmd + ' -filter=T' - if not evalBlast == 0.00005: - cmd = cmd + ' -evalBlast=%s' % evalBlast - if not evalHmmer == 0.00005: - cmd = cmd + ' -evalHmmer=%s' % evalHmmer - if not evalRelaxfac == 10: - cmd = cmd + ' -evalRelaxfac=%s' % evalRelaxfac - if not hitLimit == 10: - cmd = cmd + ' -hitLimit=%s' % hitLimit - if autoLimit == True: - cmd = cmd + ' -autoLimit' - if scoreThreshold: - cmd = cmd + ' -scoreThreshold' - if not scoreCutoff == 10: - cmd = cmd + ' -scoreCutoff=%s' % scoreCutoff - if not aligner == 'muscle': - cmd = cmd + ' -aligner=%s' % aligner - if glocal == True: - cmd = cmd + ' -glocal' - if not searchTaxa == '': - checkFileExist(searchTaxa) - searchTaxa = os.path.abspath(searchTaxa) - cmd = cmd + ' -searchTaxa=%s' % searchTaxa - # add fas options - (fasoff, countercheck, coreFilter, minScore) = fasArgs - if fasoff == True: - cmd = cmd + ' -fasoff' - else: - if countercheck == True: - cmd = cmd + ' -countercheck' - if not coreFilter == '': - if minScore > 0: - cmd = cmd + ' -coreFilter=%s -minScore=%s' % (coreFilter, minScore) - # add other options - (cpu, hyperthread, checkOff, debug, silent) = otherArgs - cmd = cmd + ' -cpu=%s' % cpu - if hyperthread == True: - cmd = cmd + ' -hyperthread' - if checkOff == True: - cmd = cmd + ' -checkOff' - if debug == True: - cmd = cmd + ' -debug' - if silent == True: - cmd = cmd + ' -silent' - # print(cmd) - if mute == True: - cmd = cmd + ' > /dev/null 2>&1' - try: - subprocess.call([cmd], shell = True) - except: - sys.exit('Problem running\n%s' % (cmd)) - -def createConfigPP(outpath, seqName, refspec): - 
settings = dict( - mainInput = '%s/%s/%s.phyloprofile' % (outpath, seqName, seqName), - fastaInput = '%s/%s/%s.extended.fa' % (outpath, seqName, seqName), - ) - domainFile = '%s/%s/%s_forward.domains' % (outpath, seqName, seqName) - if os.path.exists(os.path.abspath(domainFile)): - settings['domainInput'] = domainFile - taxId = refspec.split('@')[1] - refspec = getTaxName(taxId) - if not refspec == 'UNK': - settings['rank'] = 'species' - settings['refspec'] = refspec - settings['clusterProfile'] = 'FALSE' - with open('%s/%s/%s.config.yml' % (outpath, seqName, seqName), 'w') as outfile: - yaml.dump(settings, outfile, default_flow_style = False) - -def getTaxName(taxId): - ncbi = NCBITaxa() - try: - name = ncbi.get_taxid_translator([taxId])[int(taxId)] - except: - name = 'UNK' - return(name) def main(): - version = '0.0.52' - parser = argparse.ArgumentParser(description='You are running fdog.run version ' + str(version) + '.') - parser.add_argument('--version', action='version', version=str(version)) + version = get_distribution('fdog').version + parser = argparse.ArgumentParser(description='You are running fDOG version ' + str(version) + '.', + epilog="For more information on certain options, please refer to the wiki pages " + "on github: https://github.com/BIONF/fDOG/wiki") required = parser.add_argument_group('Required arguments') required.add_argument('--seqFile', help='Input file containing the seed sequence (protein only) in fasta format', action='store', default='', required=True) - required.add_argument('--seqName', help='Job name. This will also be file name for the output', + required.add_argument('--jobName', help='Job name. This will also be file name for the output', action='store', default='', required=True) required.add_argument('--refspec', help='Reference taxon. 
It should be the species the seed sequence was derived from', action='store', default='', required=True) @@ -213,20 +48,11 @@ def main(): optional_paths = parser.add_argument_group('Non-default directory options') optional_paths.add_argument('--outpath', help='Output directory', action='store', default='') optional_paths.add_argument('--hmmpath', help='Path for the core ortholog directory', action='store', default='') - optional_paths.add_argument('--blastpath', help='Path for the blastDB directory', action='store', default='') + optional_paths.add_argument('--corepath', help='Path for the core taxa directory', action='store', default='') optional_paths.add_argument('--searchpath', help='Path for the search taxa directory', action='store', default='') - optional_paths.add_argument('--weightpath', help='Path for the pre-calculated feature annotion directory', action='store', default='') + optional_paths.add_argument('--annopath', help='Path for the pre-calculated feature annotion directory', action='store', default='') optional_paths.add_argument('--pathFile', help='Config file contains paths to data folder (in yaml format)', action='store', default='') - addtionalIO = parser.add_argument_group('Other I/O options') - addtionalIO.add_argument('--append', help='Append the output to existing output files', action='store_true', default=False) - addtionalIO.add_argument('--force', help='Overwrite existing output files', action='store_true', default=False) - addtionalIO.add_argument('--noCleanup', help='Temporary output will NOT be deleted. Default: False', action='store_true', default=False) - addtionalIO.add_argument('--group', help='Allows to limit the search to a certain systematic group', action='store', default='') - addtionalIO.add_argument('--blast', help='Determine sequence id and refspec automatically. 
Note, the chosen sequence id and reference species does not necessarily reflect the species the sequence was derived from.', - action='store_true', default=False) - addtionalIO.add_argument('--db', help='Run fdog in database mode. Requires a mySql database. Only for internal use.', action='store_true', default=False) - core_options = parser.add_argument_group('Core compilation options') core_options.add_argument('--coreOnly', help='Compile only the core orthologs', action='store_true', default=False) core_options.add_argument('--reuseCore', help='Reuse existing core set of your sequence', action='store_true', default=False) @@ -236,12 +62,8 @@ def main(): core_options.add_argument('--maxDist', help='Maximum systematic distance of primer taxa for the core set compilation. Default: kingdom', choices=['species', 'genus', 'family', 'order', 'class', 'phylum', 'kingdom', 'superkingdom'], action='store', default='kingdom') - core_options.add_argument('--coreOrth', help='Number of orthologs added to the core set. Default: 5', action='store', default=5, type=int) + core_options.add_argument('--coreSize', help='Maximul number of orthologs in core set. 
Default: 6', action='store', default=6, type=int) core_options.add_argument('--coreTaxa', help='List of primer taxa that should exclusively be used for the core set compilation', action='store', default='') - core_options.add_argument('--coreStrict', help='An ortholog is only then accepted when the reciprocity is fulfilled for each sequence in the core set', - action='store_true', default=False) - core_options.add_argument('--CorecheckCoorthologsRef', help='During the core compilation, an ortholog also be accepted when its best hit in the reverse search is not the core ortholog itself, but a co-ortholog of it', - action='store_true', default=True) core_options.add_argument('--CorecheckCoorthologsOff', help='Turn off checking for co-ortholog of the reverse search during the core compilation', action='store_true', default=False) core_options.add_argument('--coreRep', help='Obtain only the sequence being most similar to the corresponding sequence in the core set rather than all putative co-orthologs', @@ -250,20 +72,14 @@ def main(): action='store', default=3, type=int) core_options.add_argument('--distDeviation', help='The deviation in score in percent (0 = 0 percent, 1 = 100 percent) allowed for two taxa to be considered similar. Default: 0.05', action='store', default=0.05, type=float) - core_options.add_argument('--ignoreDistance', help='Ignore the distance between Taxa and to choose orthologs only based on score', - action='store_true', default=False) - core_options.add_argument('--local', help='Specify the alignment strategy during core ortholog compilation. Default: True', - action='store_true', default=True) - core_options.add_argument('--glocal', help='Specify the alignment strategy during core ortholog compilation. Default: False', - action='store_true', default=False) + core_options.add_argument('--alnStrategy', help='Specify the alignment strategy during core ortholog compilation. 
Default: local', + choices=['local', 'glocal', 'global'], + action='store', default='local') ortho_options = parser.add_argument_group('Ortholog search strategy options') ortho_options.add_argument('--searchTaxa', help='Specify file contains list of search taxa', action='store', default='') - ortho_options.add_argument('--strict', help='An ortholog is only then accepted when the reciprocity is fulfilled for each sequence in the core set', - action='store_true', default=False) - ortho_options.add_argument('--checkCoorthologsRef', help='During the final ortholog search, accept an ortholog also when its best hit in the reverse search is not the core ortholog itself, but a co-ortholog of it', - action='store_true', default=True) - ortho_options.add_argument('--checkCoorthologsOff', help='Turn off checking for co-ortholog of the reverse search during the final ortholog search', + ortho_options.add_argument('--group', help='Allows to limit the search to a certain systematic group', action='store', default='') + ortho_options.add_argument('--checkCoorthologsRefOff', help='Turn off checking for co-ortholog of the reverse search during the final ortholog search', action='store_true', default=False) ortho_options.add_argument('--rbh', help='Requires a reciprocal best hit during the ortholog search to accept a new ortholog', action='store_true', default=False) @@ -271,181 +87,226 @@ def main(): action='store_true', default=False) ortho_options.add_argument('--lowComplexityFilter', help='Switch the low complexity filter for the blast search on. Default: False', action='store_true', default=False) - ortho_options.add_argument('--evalBlast', help='E-value cut-off for the Blast search. Default: 0.00001', - action='store', default=0.00005, type=float) - ortho_options.add_argument('--evalHmmer', help='E-value cut-off for the HMM search. 
Default: 0.00001', - action='store', default=0.00005, type=float) - ortho_options.add_argument('--evalRelaxfac', help='The factor to relax the e-value cut-off (Blast search and HMM search). Default: 10', - action='store', default=10, type=int) + ortho_options.add_argument('--evalBlast', help='E-value cut-off for the Blast search. Default: 0.0001', + action='store', default=0.0001, type=float) + ortho_options.add_argument('--evalHmmer', help='E-value cut-off for the HMM search. Default: 0.0001', + action='store', default=0.0001, type=float) ortho_options.add_argument('--hitLimit', help='number of hits of the initial pHMM based search that should be evaluated via a reverse search. Default: 10', action='store', default=10, type=int) - ortho_options.add_argument('--autoLimit', help='Invoke a lagPhase analysis on the score distribution from the hmmer search. This will determine automatically a hit limit for each query. Note, it will be effective for both the core compilation and the final ortholog search', - action='store_true', default=False) - ortho_options.add_argument('--scoreThreshold', help='Instead of setting an automatic hit limit, you can specify with this flag that only candidates with an hmm score no less than x percent of the hmm score of the best hit are further evaluated. Default: x = 10. You can change this cutoff with the option -scoreCutoff. Note, it will be effective for both the core compilation and the final ortholog search', - action='store_true', default=False) - ortho_options.add_argument('--scoreCutoff', help='In combination with -scoreThreshold you can define the percent range of the hmms core of the best hit up to which a candidate of the hmmsearch will be subjected for further evaluation. Default: 10', + ortho_options.add_argument('--scoreCutoff', help='Define the percent range of the hmms core of the best hit up to which a candidate of the hmmsearch will be subjected for further evaluation. 
Default: 10', action='store', default=10, type=int) fas_options = parser.add_argument_group('FAS options') - fas_options.add_argument('--fasoff', help='Turn OFF FAS support', action='store_true', default=False) - fas_options.add_argument('--countercheck', help='The FAS score will be computed in two ways', action='store_true', default=True) fas_options.add_argument('--coreFilter', help='Specifiy mode for filtering core orthologs by FAS score. In \'relaxed\' mode candidates with insufficient FAS score will be disadvantaged. In \'strict\' mode candidates with insufficient FAS score will be deleted from the candidates list. The option \'--minScore\' specifies the cut-off of the FAS score.', choices=['relaxed', 'strict'], action='store', default='') fas_options.add_argument('--minScore', help='Specify the threshold for coreFilter. Default: 0.75', action='store', default=0.75, type=float) + addtionalIO = parser.add_argument_group('Other I/O options') + addtionalIO.add_argument('--append', help='Append the output to existing output files', action='store_true', default=False) + addtionalIO.add_argument('--force', help='Overwrite existing ortholog search output files', action='store_true', default=False) + addtionalIO.add_argument('--forceCore', help='Overwrite existing core set of your sequence', action='store_true', default=False) + addtionalIO.add_argument('--notAddingTaxa', help='Do not add all search taxa to phyloprofile output', action='store_true', default=False) + addtionalIO.add_argument('--noCleanup', help='Temporary output will NOT be deleted. 
Default: False', action='store_true', default=False) + addtionalIO.add_argument('--debug', help='Set this flag to obtain more detailed information about the ortholog search progress', action='store_true', default=False) + addtionalIO.add_argument('--debugCore', help='Set this flag to obtain more detailed information about the core compilation actions', action='store_true', default=False) + addtionalIO.add_argument('--silentOff', help='Show more output to terminal', action='store_true', default=False) + optional = parser.add_argument_group('Other options') + optional.add_argument('--fasOff', help='Turn OFF FAS support', action='store_true', default=False) optional.add_argument('--aligner', help='Choose between mafft-linsi or muscle for the multiple sequence alignment. DEFAULT: muscle', choices=['mafft-linsi', 'muscle'], action='store', default='muscle') - optional.add_argument('--cpu', help='Determine the number of threads to be run in parallel. Default: 4', action='store', default=4, type=int) - optional.add_argument('--hyperthread', help='Set this flag to use hyper threading. Default: False', action='store_true', default=False) - optional.add_argument('--checkOff', help='Set this flag to turn of the initial checks. Default: False', action='store_true', default=False) - optional.add_argument('--debug', help='Set this flag to obtain more detailed information about the programs actions', action='store_true', default=False) - optional.add_argument('--silentOff', help='Show more output to terminal', action='store_true', default=False) + optional.add_argument('--cpus', help='Determine the number of threads to be run in parallel. 
Default: 4', action='store', default=4, type=int) ### get arguments args = parser.parse_args() # required arguments seqFile = args.seqFile - seqName = args.seqName + seqName = args.jobName refspec = args.refspec - minDist = args.minDist - maxDist = args.maxDist - coreOrth = args.coreOrth - # path arguments outpath = os.path.abspath(args.outpath) hmmpath = args.hmmpath - blastpath = args.blastpath + corepath = args.corepath searchpath = args.searchpath - weightpath = args.weightpath + annopath = args.annopath pathFile = args.pathFile - # other I/O arguments - append = args.append - force = args.force - noCleanup = args.noCleanup - group = args.group - blast = args.blast - db = args.db - # core compilation arguments coreOnly = args.coreOnly reuseCore = args.reuseCore + minDist = args.minDist + maxDist = args.maxDist + coreSize = args.coreSize coreTaxa = args.coreTaxa - coreStrict = args.coreStrict - CorecheckCoorthologsRef = True #args.CorecheckCoorthologsRef + if not coreTaxa == '': + if os.path.exists(os.path.abspath(coreTaxa)): + coreTaxa = os.path.abspath(coreTaxa) CorecheckCoorthologsOff = args.CorecheckCoorthologsOff - if CorecheckCoorthologsOff == True: - CorecheckCoorthologsRef = False coreRep = args.coreRep coreHitLimit = args.coreHitLimit distDeviation = args.distDeviation + alnStrategy = args.alnStrategy # ortholog search arguments - strict = args.strict - checkCoorthologsRef = args.checkCoorthologsRef - checkCoorthologsOff = args.checkCoorthologsOff - if checkCoorthologsOff == True: - checkCoorthologsRef = False + searchTaxa = args.searchTaxa + if not searchTaxa == '': + if os.path.exists(os.path.abspath(searchTaxa)): + searchTaxa = os.path.abspath(searchTaxa) + group = args.group + if not group == '' and not searchTaxa == '': + print('WARNING: Both --group and --searchTaxa are specified. Search taxa will be obtained only from %s!' 
% searchTaxa) + group = '' + checkCoorthologsRefOff = args.checkCoorthologsRefOff rbh = args.rbh rep = args.rep - ignoreDistance = args.ignoreDistance lowComplexityFilter = args.lowComplexityFilter evalBlast = args.evalBlast evalHmmer = args.evalHmmer - evalRelaxfac = args.evalRelaxfac hitLimit = args.hitLimit - autoLimit = args.autoLimit - scoreThreshold = args.scoreThreshold scoreCutoff = args.scoreCutoff - aligner = args.aligner - local = args.local - glocal = args.glocal - searchTaxa = args.searchTaxa # fas arguments - fasoff = args.fasoff - countercheck = args.countercheck + fasOff = args.fasOff coreFilter = args.coreFilter minScore = args.minScore - # others - cpu = args.cpu - hyperthread = args.hyperthread - checkOff = args.checkOff + # other I/O arguments + append = args.append + force = args.force + forceCore = args.forceCore + noCleanup = args.noCleanup debug = args.debug + debugCore = args.debugCore silentOff = args.silentOff - if silentOff == True: - silent = False - else: - silent = True - - ### get fdog and data path - dataPath = '' - fdogPath = os.path.realpath(__file__).replace('/runSingle.py','') - pathconfigFile = fdogPath + '/bin/pathconfig.txt' - if not os.path.exists(pathconfigFile): - sys.exit('No pathconfig.txt found. 
Please run fdog.setup (https://github.com/BIONF/fDOG/wiki/Installation#setup-fdog).') - if pathFile == '': - with open(pathconfigFile) as f: - dataPath = f.readline().strip() + notAddingTaxa = args.notAddingTaxa + + # others + aligner = args.aligner + cpus = args.cpus + if cpus > os.cpu_count(): + cpus = os.cpu_count() + + + begin = time.time() + ##### Check and group parameters + print('##### PREPARING & CHECKING #####') + if seqFile == 'infile.fa': + fdogPath = os.path.realpath(__file__).replace('/runSingle.py','') + seqFile = '%s/data/infile.fa' % fdogPath + + (seqFile, hmmpath, corepath, searchpath, annopath) = prepare_fn.check_input( + [seqFile, refspec, outpath, hmmpath, + corepath, searchpath, annopath, pathFile]) + pathArgs = [outpath, hmmpath, corepath, searchpath, annopath] + + prepare_fn.check_blast_version(corepath, refspec) + + (invalid_minDist, invalid_maxDist, suggested_minRank, suggested_maxRank) = prepare_fn.check_ranks_core_taxa(corepath, refspec, minDist, maxDist) + if len(invalid_minDist) > 0 or len(invalid_maxDist) > 0: + if len(invalid_minDist) > 0: + print(f'Invalid {minDist} (--minDist) for:\t{", ".join(invalid_minDist)}') + if len(invalid_maxDist) > 0: + print(f'Invalid {maxDist} (--maxDist) for:\t{", ".join(invalid_maxDist)}') + if not minDist == "genus" and not maxDist == "kingdom": + print(f'Please consider setting --minDist and --maxDist with these valid ranks:\n--minDist {suggested_minRank} --maxDist {suggested_maxRank}') + sys.exit() + else: + print(f'WARNING: --minDist and --maxDist will be automatically changed to {suggested_minRank} and {suggested_maxRank}') + minDist = suggested_minRank + maxDist = suggested_maxRank + + if not fasOff: + check_fas = fas_fn.check_fas_executable() + if check_fas == 0: + sys.exit('ERROR: FAS is not executable! 
You still can use fDOG with --fasOff!') + + ##### Identify seed ID from refspec genome + if reuseCore: + core_fa = '%s/%s/%s.fa' % (hmmpath, seqName, seqName) + seed_id = prepare_fn.get_seed_id_from_fa(core_fa, refspec) else: - cfg = load_config(pathFile) - try: - dataPath = cfg['dataPath'] - except: - dataPath = 'config' - - if hmmpath == '': - hmmpath = outpath + '/core_orthologs' - # hmmpath = dataPath + '/core_orthologs' - # if dataPath == 'config': - # try: - # hmmpath = cfg['hmmpath'] - # except: - # sys.exit('hmmpath not found in %s' % pathFile) - - if blastpath == '': - blastpath = dataPath + '/blast_dir' - if dataPath == 'config': - try: - blastpath = cfg['blastpath'] - except: - sys.exit('blastpath not found in %s' % pathFile) - if searchpath == '': - searchpath = dataPath + '/genome_dir' - if dataPath == 'config': + seed_id = prepare_fn.identify_seed_id(seqFile, refspec, corepath, debug, silentOff) + print('==> Identified seed ID: %s' % seed_id) + + ##### DO CORE COMPILATION + # start = time.time() + coreArgs = [minDist, maxDist, coreSize, coreTaxa, distDeviation, + alnStrategy, fasOff] + orthoCoreArgs = [CorecheckCoorthologsOff, rbh, True, evalBlast/10, + lowComplexityFilter, evalHmmer/10, coreHitLimit, + scoreCutoff, aligner] # rep = True; e-value cutoff is 10x more stringent than from ortho search + otherCoreArgs = [cpus, debugCore, silentOff, noCleanup, force, append] + print('##### COMPILING CORE SET FOR %s #####' % seqName) + core_runtime = core_fn.run_compile_core([seqFile, seqName, refspec, seed_id, reuseCore, + forceCore, coreArgs, pathArgs, orthoCoreArgs, otherCoreArgs, debug]) + print('==> Core compilation finished in %s' % core_runtime[1]) + + + ##### DO ORTHOLOG SEARCH USING CORE HMM (HAMSTR) + if not coreOnly: + start = time.time() + print('##### SEARCHING ORTHOLOGS #####') + # check existing output + finalOutfile = '%s/%s.extended.fa' % (outpath, seqName) + finalOutfile = os.path.abspath(finalOutfile) + 
output_fn.check_output_exist(finalOutfile, force, append) + # get list of search taxa + if not group == '': + ### Check valid taxonomy group + ncbi = NCBITaxa() + group_id = ncbi.get_name_translator([group]) + if len(group_id) == 0: + exit('ERROR: Taxon group "%s" invalid!' % group) + ### create taxonomy tree from list of search taxa + searchTaxa = [] + tax_ids = core_fn.get_core_taxa_ids(coreTaxa, corepath) + + for tax_id in tax_ids.keys(): + check = tree_fn.check_taxon_group(group_id[group][0], tax_id, ncbi) + if check == True: + searchTaxa.append(tax_ids[tax_id]) + output_fn.print_debug(debugCore, 'Search taxa', searchTaxa) + if len(searchTaxa) == 0: + exit('ERROR: No taxon found within %s taxonomy group!' % group) + else: + searchTaxa = ','.join(searchTaxa) + # do ortholog search + orthoArgs = [checkCoorthologsRefOff, rbh, rep, evalBlast, + lowComplexityFilter, evalHmmer, hitLimit, scoreCutoff, aligner] + otherArgs = [searchTaxa, cpus, debug, silentOff, noCleanup, force, append] + hamstr_out = ortho_fn.run_hamstr([seqName, refspec, pathArgs, orthoArgs, otherArgs]) + output_fn.write_hamstr(hamstr_out, outpath, seqName, force, append) + end = time.time() + print('==> Ortholog search finished in ' + '{:5.3f}s'.format(end - start)) + + ##### DO FINAL FAS CALCULATION + if not fasOff: + print('##### CALCULATING FAS SCORES #####') try: - searchpath = cfg['searchpath'] + fasVersion = subprocess.run(['fas.run --version'], shell = True, capture_output = True, check = True) except: - sys.exit('searchpath not found in %s' % pathFile) - if weightpath == '': - weightpath = dataPath + '/weight_dir' - if dataPath == 'config': - try: - weightpath = cfg['weightpath'] - except: - sys.exit('weightpath not found in %s' % pathFile) - - ### check input arguments - seqFile, hmmpath, blastpath, searchpath, weightpath = checkInput([fdogPath, seqFile, refspec, outpath, hmmpath, blastpath, searchpath, weightpath]) - # group arguments - basicArgs = [fdogPath, seqFile, seqName, refspec, 
minDist, maxDist, coreOrth] - ioArgs = [append, force, noCleanup, group, blast, db] - pathArgs = [outpath, hmmpath, blastpath, searchpath, weightpath] - coreArgs = [coreOnly, reuseCore, coreTaxa, coreStrict, CorecheckCoorthologsRef, coreRep, coreHitLimit, distDeviation] - fasArgs = [fasoff, countercheck, coreFilter, minScore] - orthoArgs = [strict, checkCoorthologsRef, rbh, rep, ignoreDistance, lowComplexityFilter, evalBlast, evalHmmer, evalRelaxfac, hitLimit, autoLimit, scoreThreshold, scoreCutoff, aligner, local, glocal, searchTaxa] - otherArgs = [cpu, hyperthread, checkOff, debug, silent] - - ### run fdog - runSingle([basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, False]) - - ### create PhyloProfile config file - createConfigPP(outpath, seqName, refspec) + sys.exit('Problem with FAS! Please check https://github.com/BIONF/FAS or turn it off if not needed!') + if os.path.exists(finalOutfile): + start = time.time() + fas_fn.calc_fas_multi(finalOutfile, outpath, annopath, cpus) + end = time.time() + print('==> FAS calculation finished in ' + '{:5.3f}s'.format(end - start)) + else: + output_fn.hamstr_2_profile(finalOutfile) + + ##### ADD ALL SEARCH TAXA INTO PhyloProfile OUTPUT + if not notAddingTaxa: + pp_file = f'{outpath}/{seqName}.phyloprofile' + if not searchTaxa: + tmp = general_fn.read_dir(searchpath) + searchTaxa = ','.join(tmp) + output_fn.add_all_taxa(pp_file, searchTaxa) + + end = time.time() + print('==> fdog.run finished in ' + '{:5.3f}s'.format(end - begin)) if __name__ == '__main__': main() diff --git a/fdog/setPaths.py b/fdog/setPaths.py new file mode 100644 index 0000000..3b6e501 --- /dev/null +++ b/fdog/setPaths.py @@ -0,0 +1,86 @@ +# -*- coding: utf-8 -*- + +####################################################################### +# Copyright (C) 2022 Vinh Tran +# +# This script is used to set default data directories for fdog. 
+# These include the path to the core taxa (coreTaxa_dir), +# search taxa (searchTaxa_dir) and FAS annotation json files (annotation_dir). +# +# This script is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for +# more details +# +# Contact: tran@bio.uni-frankfurt.de +# +####################################################################### + +import os +import argparse +from pkg_resources import get_distribution + +import fdog.libs.zzz as general_fn +import fdog.checkData as check_data_fn + + +def set_data_path(searchpath, corepath, annopath, checkOff): + """ Set default fDOG data path to pathconfig.yml file """ + fdogPath = os.path.realpath(__file__).replace('/setPaths.py','') + pathconfigFile = fdogPath + '/bin/pathconfig.yml' + flag = 0 + if os.path.exists(pathconfigFile): + print('Default fDOG data paths in %s will be overwritten! Enter to continue.' % pathconfigFile) + if general_fn.query_yes_no(''): + flag = 1 + else: + flag = 0 + + if not checkOff: + caution = check_data(searchpath, corepath, annopath) + if caution == 1: + print('Check done! Data are ready to use WITH CAUTION! Are you sure to add these paths as default? (Y/N)') + if general_fn.query_yes_no('', default='no'): + flag = 1 + else: + flag = 0 + else: + print('WARNING: Data will not be checked! Run fdog.checkData if you encounter any problems!') + + if flag == 1: + with open(pathconfigFile, 'w') as cf: + cf.write('corepath: \'%s\'\n' % corepath) + cf.write('searchpath: \'%s\'\n' % searchpath) + cf.write('annopath: \'%s\'\n' % annopath) + print('Finished! 
New data paths have been saved in %s' % pathconfigFile) + else: + print(f'{pathconfigFile} remains unchanged!') + + +def check_data(searchpath, corepath, annopath): + """ Perform data check """ + caution = check_data_fn.run_check([searchpath, corepath, annopath, False, False, False, False, False]) + return(caution) + + +def main(): + version = get_distribution('fdog').version + parser = argparse.ArgumentParser(description='You are running fDOG version ' + str(version) + '.') + required = parser.add_argument_group('required arguments') + optional = parser.add_argument_group('optional arguments') + required.add_argument('--searchpath', help='Path to search taxa folder (e.g. fdog_data/searchTaxa_dir)', action='store', default='', required=True) + required.add_argument('--corepath', help='Path to core taxa folder (e.g. fdog_data/coreTaxa_dir)', action='store', default='', required=True) + required.add_argument('--annopath', help='Path to annotation folder (e.g. fdog_data/annotation_dir)', action='store', default='', required=True) + optional.add_argument('--checkOff', help='Turn off checking for valid data', action='store_true', default=False) + + args = parser.parse_args() + searchpath = os.path.abspath(args.searchpath) + corepath = os.path.abspath(args.corepath) + annopath = os.path.abspath(args.annopath) + checkOff = args.checkOff + + set_data_path(searchpath, corepath, annopath, checkOff) + +if __name__ == '__main__': + main() diff --git a/fdog/setup/__init__.py b/fdog/setup/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/fdog/setup/indexTaxonomy.pl b/fdog/setup/indexTaxonomy.pl deleted file mode 100644 index 967f3ef..0000000 --- a/fdog/setup/indexTaxonomy.pl +++ /dev/null @@ -1,21 +0,0 @@ -#!/usr/bin/perl -use Bio::DB::Taxonomy; - -my $idx_dir = $ARGV[0]; -# taxon files can be downloaded from: ftp://ftp.ncbi.nih.gov/pub/taxonomy/ -my $db = Bio::DB::Taxonomy->new(-source => 'flatfile', - -nodesfile => $idx_dir . 
'/nodes.dmp', - -namesfile => $idx_dir . '/names.dmp', - -directory => $idx_dir); -# test -my $taxonid = 9606; -my $taxon = $db->get_taxon(-taxonid => $taxonid); -my $name = $taxon->scientific_name; - -if ($name eq "Homo sapiens") { - print "Index files for taxonomy database were successfully generated!\n"; -} else { - print "Something wrong happened while indexing taxonomy. Please try again!\n"; -} - -exit; diff --git a/fdog/setup/install_lib.sh b/fdog/setup/install_lib.sh deleted file mode 100755 index 1eaf176..0000000 --- a/fdog/setup/install_lib.sh +++ /dev/null @@ -1,189 +0,0 @@ -#!/bin/bash - -sys="$(uname)" # Linux for Linux or Darwin for MacOS - -flag=0 - -### update GPG key (Google signature key for signing and authenticating packages) -if ! [ "$sys" == "Darwin" ]; then - wget -q -O - https://dl.google.com/linux/linux_signing_key.pub | sudo apt-key add - -fi - -### check grep, sed and wget availability -grepprog='grep' -sedprog='sed' -readlinkprog='readlink' -wgetprog='wget' -bashFile='.bashrc' -if [ "$sys" == "Darwin" ]; then - if [ -z "$(which brew)" ]; then - echo "Please install homebrew to install dependencies tools and libraries!" - echo "Check https://brew.sh" - exit - fi - sedprog='gsed' - grepprog='ggrep' - readlinkprog='greadlink' - shell=$(echo $SHELL) - if [ $shell == "/bin/zsh" ]; then - bashFile='.zshrc' - else - bashFile='.bash_profile' - fi -else - if [ "$EUID" -ne 0 ]; then - echo "You must run this setup as a root user!" - exit - fi -fi - -if [ -z "$(which $readlinkprog)" ]; then - if [ "$sys" == "Darwin" ]; then - brew install coreutils - fi -fi - -if [ -z "$(which $sedprog)" ]; then - if [ "$sys" == "Darwin" ]; then - brew install gnu-sed - fi -fi - -if [ -z "$(which $grepprog)" ]; then - if [ "$sys" == "Darwin" ]; then - brew install grep - fi -fi - -if [ -z "$(which $wgetprog)" ]; then - if [ "$sys" == "Darwin" ]; then - brew install wget - fi -fi - -if ! 
[ -f ~/$bashFile ]; then - touch ~/$bashFile -fi - -### check dependencies -echo "-------------------------------------" -echo "Installing dependencies..." - -dependenciesUbuntu=( - build-essential # for make - curl - r-base # for Statistics::R - wise - hmmer # hmmer (for both hmmsearch and hmmbuild) - clustalw - mafft - muscle - blast2 # blast - ncbi-blast+ - libdbi-perl - libipc-run-perl - perl-doc - locales - lib32z1 - augustus -) - -dependenciesMac=( - brewsci/bio/genewise - hmmer # hmmer (for both hmmsearch and hmmbuild) - brewsci/bio/clustal-w - mafft - brewsci/bio/muscle - blast - augustus -) - -if [ "$sys" == "Darwin" ]; then - for i in "${dependenciesMac[@]}"; do - echo $i - brew install $i - done - if [ -z "$(grep clustalw ~/$bashFile)" ]; then - echo "alias clustalw='clustalw2'" >> ~/$bashFile - fi -else - sudo apt-get update -y - for i in "${dependenciesUbuntu[@]}"; do - echo $i - if ["$i" == "augustus"]; then - sudo apt install augustus > /dev/null - else - sudo apt-get install -y -qq $i > /dev/null - fi - done -fi - -dependencies=( - genewise - hmmsearch - hmmbuild - mafft - muscle - blastn - augustus -) - -for i in "${dependencies[@]}"; do - if [ -z "$(which $i)" ]; then - echo "$i not found / cannot be automatically installed. Please install it manually and run this setup again!" - flag=1 - fi -done -if [ "$flag" == 1 ]; then exit 1; fi - -wisePath=$(which "genewise") -if [ -z "$(grep WISECONFIGDIR=$wisePath ~/$bashFile)" ]; then - echo "export WISECONFIGDIR=${wisePath}" >> ~/$bashFile -fi - -echo "Installing Perl modules..." 
-perlModules=( - Array::Utils - Capture::Tiny - DBI - DB_File - File::Copy - File::Path - File::Basename - File::Which - List::Util - Parallel::ForkManager - POSIX - Getopt::Long - IO::Handle - IPC::Run - Statistics::R - Term::Cap - Time::HiRes - Bio::AlignIO - Bio::Align::ProteinStatistics - Bio::DB::Taxonomy - Bio::SearchIO - Bio::SearchIO::blastxml - Bio::Search::Hit::BlastHit - Bio::Seq - Bio::SeqIO - Bio::SeqUtils - Bio::Tree::Tree - Bio::Tools::Run::StandAloneBlast -) - -if [ -z "$(which cpanm)" ]; then - curl -L http://cpanmin.us | perl - --sudo App::cpanminus -fi -for i in "${perlModules[@]}"; do - msg=$((perldoc -l $i) 2>&1) - if [[ "$(echo $msg)" == *"No documentation"* ]]; then - sudo cpanm ${i} --quiet --force - fi -done -echo "done!" - -echo "-------------------------------------" -CURRENT="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" -echo "Please run fdog.setup without --lib option to continue setup fdog!" diff --git a/fdog/setup/setup.sh b/fdog/setup/setup.sh deleted file mode 100755 index 28eb851..0000000 --- a/fdog/setup/setup.sh +++ /dev/null @@ -1,427 +0,0 @@ -#!/bin/bash - -sys="$(uname)" # Linux for Linux or Darwin for MacOS -echo "Current OS system: $sys" - -CURRENT="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" -CURRENT="${CURRENT/\/setup/}" -BIN="$CURRENT/bin" - -flag=0 -fas=1 -installLib=0 -homedir="$(echo $HOME)" -outDir=$CURRENT - -while getopts lfo: opt; do - case ${opt} in - o ) - echo "Data output path: $OPTARG" - outDir=$OPTARG - ;; - l ) - echo "INSTALL LIB" - installLib=1 - ;; - f ) - echo "NO FAS!" - fas=0 - ;; - \? ) - echo "Usage: setup.sh [-l] [-f]" - exit 1 - ;; - esac -done - -### install dependencies -if [ $installLib == 1 ]; then - if [ "$sys" == "Darwin" ]; then - $CURRENT/setup/install_lib.sh - else - echo "Enter sudo password to install required libraries..." 
- sudo $CURRENT/setup/install_lib.sh - fi - exit -fi - -### check grep, sed, readlink and wget availability -echo "-------------------------------------" -echo "Checking .bash_profile/.bashrc, grep, sed/gsed and wget availability..." -grepprog='grep' -sedprog='sed' -readlinkprog='readlink' -wgetprog='wget' -bashFile='.bashrc' -rprofile='.Rprofile' - -if [ "$sys" == "Darwin" ]; then - sedprog='gsed' - grepprog='ggrep' - readlinkprog='greadlink' - shell=$(echo $SHELL) - if [ $shell == "/bin/zsh" ]; then - bashFile='.zshrc' - else - bashFile='.bash_profile' - fi -fi - -if [ -z "$(which $sedprog)" ]; then - echo -e "\e[31m$sedprog not found!\e[0m" - echo "Please run fdog.setup with --lib first!" - exit -fi - -if [ -z "$(which $grepprog)" ]; then - echo -e "\e[31m$grepprog not found!\e[0m" - echo "Please run fdog.setup with --lib first!" - exit -fi - -if [ -z "$(which $wgetprog)" ]; then - echo -e "\e[31m$wgetprog not found!\e[0m" - echo "Please run fdog.setup with --lib first!" - exit -fi - -if [ -z "$(which $readlinkprog)" ]; then - echo -e "\e[31m$readlinkprog not found!\e[0m" - echo "Please run fdog.setup with --lib first!" - exit -fi - -if ! [ -f ~/$bashFile ]; then - touch ~/$bashFile -fi -if ! [ -f ~/$rprofile ]; then - touch ~/$rprofile -fi -echo "done!" - -### prepare folders -echo "-------------------------------------" -echo "Preparing folders..." -if [ ! -d "$CURRENT/taxonomy" ]; then mkdir "$CURRENT/taxonomy"; fi -if [ ! -d "$CURRENT/bin" ]; then mkdir "$CURRENT/bin"; fi -if [ ! -d "$CURRENT/bin/aligner" ]; then mkdir "$CURRENT/bin/aligner"; fi -echo "done!" - -### download tools -echo "-------------------------------------" -echo "Downloading and installing annotation tools/databases:" - -fasta36="yes" -if [ -z "$(which fasta36)" ]; then - fasta36="no" - # fasta36v="fasta-36.3.8h" - fasta36v="36.3.8h_04-May-2020" - if ! 
[ -f "bin/aligner/bin/fasta36" ]; then - echo "fasta36" - # wget "http://faculty.virginia.edu/wrpearson/fasta/fasta36/${fasta36v}.tar.gz" - # tar xf $fasta36v.tar.gz - # rm "${fasta36v}.tar.gz" - # mv $fasta36v/* $CURRENT/bin/aligner/ - # rm -rf $fasta36v - wget "https://github.com/wrpearson/fasta36/archive/refs/tags/v${fasta36v}.tar.gz" - tar xf "v${fasta36v}.tar.gz" - rm "v${fasta36v}.tar.gz" - mv fasta36-${fasta36v}/* $CURRENT/bin/aligner/ - rm -rf "fasta36-${fasta36v}" - cd "$CURRENT/bin/aligner/src" - if [ $sys=="Linux" ]; then - make -f ../make/Makefile.linux64_sse2 all - elif [ $sys=="Darwin" ]; then - make -f ../make/Makefile.os_x86_64 all - fi - fi - if [ -z "$($grepprog PATH=$CURRENT/bin/aligner/bin ~/$bashFile)" ]; then - echo "export PATH=$CURRENT/bin/aligner/bin:\$PATH" >> ~/$bashFile - fi -fi -cd $CURRENT -if [ -z "$(which fasta36)" ]; then - if ! [ -f "$CURRENT/bin/aligner/bin/fasta36" ]; then - echo -e "\e[31mfasta36 tool could not be found in $CURRENT/bin/aligner/. Please check again!\e[0m" - exit - fi -fi - -cd "$CURRENT/taxonomy" -if ! [ -f "nodes" ]; then - wget "ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz" - tar xf taxdump.tar.gz - rm taxdump.tar.gz - echo "Taxonomy database indexing. It can take a while, please wait..." - perl $CURRENT/setup/indexTaxonomy.pl $CURRENT/taxonomy - rm citations.dmp - rm delnodes.dmp - rm division.dmp - rm gencode.dmp - rm merged.dmp - rm gc.prt - rm readme.txt -fi -cd $CURRENT -if ! [ -f "$CURRENT/taxonomy/nodes" ]; then - echo -e "\e[31mError while indexing NCBI taxonomy database! 
Please check $CURRENT/taxonomy/ folder and run this setup again!\e[0m" - exit -fi - -setupFAS=0 -if [ $fas == 1 ]; then - cd "$CURRENT/bin" - if [ -z "$(which fas.doAnno)" ]; then - echo "FAS" - pip install --user greedyFAS - if [ -z "$($grepprog \$HOME/.local/bin:\$PATH ~/$bashFile)" ]; then - echo "export PATH=\$HOME/.local/bin:\$PATH" >> ~/$bashFile - fi - if [ -z "$($grepprog $homedir/.local/bin ~/$rprofile)" ]; then - echo "Sys.setenv(PATH = paste(\"$homedir/.local/bin\", Sys.getenv(\"PATH\"), sep=\":\"))" >> ~/$rprofile - fi - setupFAS=1 - else - if ! [ -z "$(fas.setup -t ./ --check 2>&1 | grep ERROR)" ]; then - setupFAS=1 - fi - fi - - cd $CURRENT - source ~/$bashFile - if [ -z "$(which fas.doAnno)" ]; then - echo -e "Installation of FAS failed! Please try again or install FAS by yourself using \e[91mpip install greedyFAS\e[0m!" - echo -e "For more info, please check FAS website at \e[91mhttps://github.com/BIONF/FAS\e[0m" - exit - else - if ! [ -z "$(fas.setup -t ./ --check 2>&1 | grep ERROR)" ]; then - setupFAS=1 - fi - fi - echo "done!" -fi - -### download data -data_fdog_file="data_HaMStR-2019c.tar.gz" -checkSumData="1748371655 621731824 $data_fdog_file" -cd $outDir -if [ ! -d "$outDir/genome_dir" ]; then mkdir "$outDir/genome_dir"; fi -if [ ! -d "$outDir/assembly_dir" ]; then mkdir "$outDir/assembly_dir"; fi - -if ! [ "$(ls -A $outDir/genome_dir)" ]; then - echo "-------------------------------------" - echo "Getting pre-calculated data" - - echo "Processing $outDir ..." - if [ ! -f $outDir/$data_fdog_file ]; then - echo "Downloading data from https://applbio.biologie.uni-frankfurt.de/download/hamstr_qfo/$data_fdog_file" - wget --no-check-certificate https://applbio.biologie.uni-frankfurt.de/download/hamstr_qfo/$data_fdog_file - else - CHECKSUM=$(cksum $data_fdog_file) - echo "Checksum: $CHECKSUM" - if ! 
[ "$CHECKSUM" == "$checkSumData" ]; then - rm $outDir/$data_fdog_file - echo "Downloading data from https://applbio.biologie.uni-frankfurt.de/download/hamstr_qfo/$data_fdog_file" - wget --no-check-certificate https://applbio.biologie.uni-frankfurt.de/download/hamstr_qfo/$data_fdog_file - fi - fi - - if [ ! -f $outDir/$data_fdog_file ]; then - echo "File $data_fdog_file not found! Please try to download again from" - echo "https://applbio.biologie.uni-frankfurt.de/download/hamstr_qfo/$data_fdog_file" - exit - fi - - CHECKSUM=$(cksum $data_fdog_file) - if [ "$CHECKSUM" == "$checkSumData" ]; then - echo "Extracting archive $data_fdog_file..." - tar xf $outDir/$data_fdog_file - rm $outDir/$data_fdog_file - if [ -d "$outDir/genome_dir" ]; then - for i in $(ls "$outDir/genome_dir"); do rm -f "$outDir/genome_dir/$i/$i.fa.mod"; done - fi - - if [ "$(ls -A $outDir/blast_dir)" ]; then - echo "Data should be in place to run fdog." - else - echo -e "\e[31mSomething went wrong with the download. Data folders are empty.\e[0m" - echo "Please try to download again from" - echo "https://applbio.biologie.uni-frankfurt.de/download/hamstr_qfo/$data_fdog_file" - echo "Or contact us if you think this is our issue!" - exit - fi - else - echo -e "\e[31mSomething went wrong with the download. Checksum does not match.\e[0m" - echo "Please try to download again from" - echo "https://applbio.biologie.uni-frankfurt.de/download/hamstr_qfo/$data_fdog_file" - echo "Please put it into $outDir folder and run this setup again!" 
- exit - fi -fi -# write data path to pathConfig file -if [ -f $BIN/pathconfig.txt ]; then - rm $BIN/pathconfig.txt -fi -touch $BIN/pathconfig.txt -echo $outDir >> $BIN/pathconfig.txt - -### add paths to bash profile file -echo "-------------------------------------" -echo "Adding WISECONFIGDIR to ~/$bashFile" - -wisePath=$(which "genewise") -if [ -z "$($grepprog WISECONFIGDIR=$wisePath ~/$bashFile)" ]; then - echo "export WISECONFIGDIR=${wisePath}" >> ~/$bashFile -fi - -# echo "Adding paths to ~/$rprofile" -# if [ -z "$($grepprog $CURRENT/bin ~/$rprofile)" ]; then -# echo "Sys.setenv(PATH = paste(\"$CURRENT/bin\", Sys.getenv(\"PATH\"), sep=\":\"))" >> ~/$rprofile -# fi -echo "done!" - -### adapt paths in fdog scripts -echo "-------------------------------------" -echo "Adapting paths in fdog scripts" -# update the sed and grep commands -$sedprog -i -e "s/\(my \$sedprog = '\).*/\1$sedprog';/" $CURRENT/bin/hamstr.pl -$sedprog -i -e "s/\(my \$grepprog = '\).*/\1$grepprog';/" $CURRENT/bin/hamstr.pl -$sedprog -i -e "s/\(my \$readlinkprog = '\).*/\1$readlinkprog';/" $CURRENT/bin/hamstr.pl -$sedprog -i -e "s/\(my \$sedprog = '\).*/\1$sedprog';/" $CURRENT/bin/oneSeq.pl -$sedprog -i -e "s/\(my \$grepprog = '\).*/\1$grepprog';/" $CURRENT/bin/oneSeq.pl -$sedprog -i -e "s/\(my \$readlinkprog = '\).*/\1$readlinkprog';/" $CURRENT/bin/oneSeq.pl - -# localize the perl installation -path2perl=`which perl` -echo "path to perl: $path2perl" -$sedprog -i -e "s|\#\!.*|\#\!$path2perl|g" $CURRENT/bin/hamstr.pl -$sedprog -i -e "s|\#\!.*|\#\!$path2perl|g" $CURRENT/bin/translate.pl -$sedprog -i -e "s|\#\!.*|\#\!$path2perl|g" $CURRENT/bin/oneSeq.pl - -echo "done!" - -### final check -echo "-------------------------------------" -echo "Final check..." 
-flag=0 - -echo "Tools" -dependencies=( -genewise -hmmsearch -hmmscan -hmmbuild -mafft -muscle -clustalw -blastp -augustus -tblastn -) - -for i in "${dependencies[@]}"; do - tool=$i - if [ $tool == "clustalw" ]; then - if [ "$sys" == "Darwin" ]; then - tool="clustalw2" - fi - fi - if [ $tool == "tblastn" ]; then - requiredver="2.9.0" - currentver="$(tblastn -version | head -n1 | cut -d" " -f2 | sed 's/+//g')" - t=$(printf '%s\n' $requiredver $currentver | sort -V | head -n1) - if [ $t == $currentver ]; then - echo -e "\t\e[31mWARNING BLAST+ needs an update to at least version ${requiredver}!\e[0m" - fi - fi - if [ -z "$(which $tool)" ]; then - echo -e "\t\e[31mWARNING $tool not found!\e[0m" - flag=1 - fi -done - -perlModules=( - Array::Utils - Capture::Tiny - DBI - DB_File - File::Copy - File::Path - File::Basename - File::Which - List::Util - Parallel::ForkManager - POSIX - Getopt::Long - IO::Handle - IPC::Run - Statistics::R - Term::Cap - Time::HiRes - Bio::AlignIO - Bio::Align::ProteinStatistics - Bio::DB::Taxonomy - Bio::SearchIO - Bio::SearchIO::blastxml - Bio::Search::Hit::BlastHit - Bio::Seq - Bio::SeqIO - Bio::SeqUtils - Bio::Tree::Tree - Bio::Tools::Run::StandAloneBlast -) - -echo "Perl modules" -for i in "${perlModules[@]}"; do - msg=$((perl -e "use $i") 2>&1) - if ! [[ -z ${msg} ]]; then - echo -e "\t\e[31mWARNING $i could not be installed\e[0m" - flag=1 - fi -done - -echo "Environment paths" -envPaths=( -WISECONFIGDIR -) -for i in "${envPaths[@]}"; do - if [ -z "$($grepprog $i ~/$bashFile)" ]; then - echo -e "\t\e[31mWARNING $i was not added into ~/$bashFile\e[0m" - flag=1 - fi -done -if [ "$fasta36" == "no" ]; then - if [ -z "$($grepprog PATH=$CURRENT/bin/aligner/bin ~/$bashFile)" ]; then - echo -e "\t\e[31mWARNING $CURRENT/bin/aligner/bin was not added into ~/$bashFile\e[0m" - flag=1 - fi -fi -echo "done!" - -if [ "$flag" == 1 ]; then - echo "Some tools/libraries counld not installed correctly or paths were not added into ~/$bashFile." 
- echo "Please manually install the missing dependencies using fdog.setup with --lib option (ask your admin if you don't have root privileges)." - echo "Then run this setup again to try one more time!" - exit -else - echo "Generating symbolic links" - ln -s -f $CURRENT/bin/hamstr.pl $CURRENT/bin/hamstr - ln -s -f $CURRENT/bin/oneSeq.pl $CURRENT/bin/oneSeq - echo "Sourcing bash profile file" - source ~/$bashFile - echo "-------------------------------------" - $sedprog -i -e 's/my $configure = .*/my $configure = 1;/' $CURRENT/bin/hamstr.pl - $sedprog -i -e 's/my $configure = .*/my $configure = 1;/' $CURRENT/bin/oneSeq.pl - if [ "$setupFAS" == 1 ]; then - echo "All tests succeeded." - echo -e "\e[91mPLEASE RUN\e[0m \e[96mfas.setup\e[0m \e[91mTO CONFIGURE FAS BEFORE USING fdog!\e[0m" - echo "Then you can test fdog with:" - else - echo "All tests succeeded, fdog should be ready to run. You can test it with:" - fi - echo -e "\e[96mfdog.run --seqFile infile.fa --seqName test --refspec HUMAN@9606@3\e[0m" - echo "Output files with prefix \"test\" will be found at your current working directory!" - echo -e "For more details, use \e[96mfdog.run -h\e[0m or visit https://github.com/BIONF/fDOG/wiki" - echo "Happy running fdog! ;-)" -fi -exit 1 diff --git a/fdog/setup/setup_conda.sh b/fdog/setup/setup_conda.sh deleted file mode 100755 index 73b8573..0000000 --- a/fdog/setup/setup_conda.sh +++ /dev/null @@ -1,447 +0,0 @@ -#!/bin/bash - -sys="$(uname)" # Linux for Linux or Darwin for MacOS -echo "Current OS system: $sys" - -CURRENT="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" -CURRENT="${CURRENT/\/setup/}" -BIN="$CURRENT/bin" - -flag=0 -outDir=$CURRENT - -while getopts o: opt; do - case ${opt} in - o ) - echo "Data output path: $OPTARG" - outDir=$OPTARG - ;; - \? ) - echo "Usage: setup.sh [-l] [-f]" - exit 1 - ;; - esac -done -if [ ! 
-d "$outDir" ]; then mkdir "$outDir"; fi - -### check grep, sed and wget availability -echo "-------------------------------------" -echo "Checking .bash_profile/.bashrc, grep, sed/gsed and wget availability..." -grepprog='grep' -sedprog='sed' -readlinkprog='readlink' -wgetprog='wget' -bashFile='.bashrc' -rprofile='.Rprofile' - -if [ "$sys" == "Darwin" ]; then - sedprog='gsed' - grepprog='ggrep' - readlinkprog='greadlink' - shell=$(echo $SHELL) - if [ $shell == "/bin/zsh" ]; then - bashFile='.zshrc' - else - bashFile='.bash_profile' - fi -fi - -# NOTE: install only available for Linux! -if [ -z "$(which $sedprog)" ]; then - if [ "$sys" == "Darwin" ]; then - echo -e "\e[31m$sedprog not found. Please install it first (e.g. using brew)!\e[0m" - flag=1 - fi - conda install -c conda-forge sed -fi - -if [ -z "$(which $grepprog)" ]; then - if [ "$sys" == "Darwin" ]; then - echo -e "\e[31m$grepprog not found. Please install it first (e.g. using brew)!\e[0m" - flag=1 - fi - conda install -c bioconda grep -fi - -if [ -z "$(which $wgetprog)" ]; then - if [ "$sys" == "Darwin" ]; then - echo -e "\e[31m$wgetprog not found. Please install it first (e.g. using brew)!\e[0m" - flag=1 - fi - conda install -c anaconda wget -fi - -if [ -z "$(which $readlinkprog)" ]; then - if [ "$sys" == "Darwin" ]; then - echo -e "\e[31m$readlinkprog not found. Please install it first (e.g. using brew)!\e[0m" - flag=1 - fi -fi - -if ! [ -f ~/$bashFile ]; then - touch ~/$bashFile -fi -if ! [ -f ~/$rprofile ]; then - touch ~/$rprofile -fi -if [ "$flag" == 1 ]; then exit 1; fi -echo "done!" - -### check dependencies -echo "-------------------------------------" -echo "Installing dependencies..." 
- -if [ -z "$(which R)" ]; then - echo "R" - conda install -y r -fi - -if [[ -z $(conda list | $grepprog "pkg-config ") ]]; then - echo "pkg-config" - conda install -y pkg-config -fi - -if [[ -z $(conda list | $grepprog "perl-bioperl ") ]]; then - echo "perl-bioperl" - conda install -y -c bioconda perl-bioperl - conda install -y -c bioconda perl-bioperl-core - conda install -y -c bioconda perl-bioperl-run -fi - -dependencies=( - blastp # blast - genewise # wise2 - hmmsearch # hmmer (for both hmmsearch and hmmbuild) - clustalw - mafft # for linsi - muscle - fasta36 - augustus #for fdog.assembly -) - -for i in "${dependencies[@]}"; do - if [ -z "$(which $i)" ]; then - echo $i - tool=$i - if [ "$tool" = "blastp" ]; then - conda install -y -c bioconda blast - elif [ "$tool" = "hmmsearch" ]; then - conda install -y -c bioconda hmmer - elif [ "$tool" = "genewise" ]; then - conda install -y -c bioconda wise2 - wisePath=$(which "genewise") - if [ -z "$($grepprog WISECONFIGDIR=$wisePath ~/$bashFile)" ]; then - echo "export WISECONFIGDIR=${wisePath}" >> ~/$bashFile - fi - elif [ "$tool" = "fasta36" ]; then - conda install -y -c bioconda fasta3 - elif [ "$tool" = "augustus" ]; then - conda install -y -c bioconda augustus - else - conda install -y -c bioconda $i - fi - fi -done - -for i in "${dependencies[@]}"; do - if [ -z "$(which $i)" ]; then - echo -e "\e[31m$i not found. 
Please install it to use fdog!\e[0m" - flag=1 - fi -done -if [ "$flag" == 1 ]; then exit 1; fi - -perlModules=( - Array::Utils - Capture::Tiny - DBI - DB_File - File::Copy - File::Path - File::Basename - File::Which - List::Util - Parallel::ForkManager - POSIX - Getopt::Long - IO::Handle - IPC::Run - Statistics::R - Term::Cap - Time::HiRes - Bio::AlignIO - Bio::Align::ProteinStatistics - Bio::DB::Taxonomy - Bio::SearchIO - Bio::SearchIO::blastxml - Bio::Search::Hit::BlastHit - Bio::Seq - Bio::SeqIO - Bio::SeqUtils - Bio::Tree::Tree - Bio::Tools::Run::StandAloneBlast -) - -for i in "${perlModules[@]}"; do - msg=$((perldoc -l $i) 2>&1) - if [[ "$(echo $msg)" == *"No documentation"* ]]; then - cpanm ${i} --quiet --force - fi -done - -echo "done!" - -### prepare folders -echo "-------------------------------------" -echo "Preparing folders..." - -# create required folders -if [ ! -d "$CURRENT/taxonomy" ]; then mkdir "$CURRENT/taxonomy"; fi -if [ ! -d "$CURRENT/bin" ]; then mkdir "$CURRENT/bin"; fi -if [ ! -d "$CURRENT/bin/aligner" ]; then mkdir "$CURRENT/bin/aligner"; fi -echo "done!" - -### download tools -echo "-------------------------------------" -echo "Downloading and installing annotation tools/databases:" - -cd "$CURRENT/taxonomy" -if ! [ -f "nodes" ]; then - wget "ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz" - tar xfv taxdump.tar.gz - rm taxdump.tar.gz - echo "Taxonomy database indexing. It can take a while, please wait..." - perl $CURRENT/setup/indexTaxonomy.pl $CURRENT/taxonomy - rm citations.dmp - rm delnodes.dmp - rm division.dmp - rm gencode.dmp - rm merged.dmp - rm gc.prt - rm readme.txt -fi -cd $CURRENT -if ! [ -f "$CURRENT/taxonomy/nodes" ]; then - echo -e "\e[31mError while indexing NCBI taxonomy database! 
Please check $CURRENT/taxonomy/ folder and run this setup again!\e[0m" - exit -fi - -cd "$CURRENT/bin" -setupFAS=0 -if [ -z "$(which fas.doAnno)" ]; then - echo "FAS" - conda install -y -c BIONF fas - if [ -z "$(which fas.doAnno)" ]; then - echo -e "\e[31mInstallation of FAS failed! Please try again!\e[0m" - exit - fi - setupFAS=1 -else - if ! [ -z "$(fas.setup -t ./ --check 2>&1 | grep ERROR)" ]; then - setupFAS=1 - fi -fi - -if [ -z "$(which fas.doAnno)" ]; then - echo -e "Installation of FAS failed! Please try again or install FAS by yourself using \e[91mconda install -c BIONF fas\e[0m or \e[91mpip install greedyFAS\e[0m" - echo -e "For more info, please check FAS website at \e[91mhttps://github.com/BIONF/FAS\e[0m" - exit -else - if ! [ -z "$(fas.setup -t ./ --check 2>&1 | grep ERROR)" ]; then - setupFAS=1 - fi -fi -cd $CURRENT -echo "done!" - -### download data -data_fdog_file="data_HaMStR-2019c.tar.gz" -checkSumData="1748371655 621731824 $data_fdog_file" -cd $outDir -if [ ! -d "$outDir/genome_dir" ]; then mkdir "$outDir/genome_dir"; fi -if [ ! -d "$outDir/assembly_dir" ]; then mkdir "$outDir/assembly_dir"; fi - -if ! [ "$(ls -A $outDir/genome_dir)" ]; then - echo "-------------------------------------" - echo "Getting pre-calculated data" - - echo "Processing $outDir ..." - if [ ! -f $outDir/$data_fdog_file ]; then - echo "Downloading data from https://applbio.biologie.uni-frankfurt.de/download/hamstr_qfo/$data_fdog_file" - wget --no-check-certificate https://applbio.biologie.uni-frankfurt.de/download/hamstr_qfo/$data_fdog_file - else - CHECKSUM=$(cksum $data_fdog_file) - echo "Checksum: $CHECKSUM" - if ! [ "$CHECKSUM" == "$checkSumData" ]; then - rm $outDir/$data_fdog_file - echo "Downloading data from https://applbio.biologie.uni-frankfurt.de/download/hamstr_qfo/$data_fdog_file" - wget --no-check-certificate https://applbio.biologie.uni-frankfurt.de/download/hamstr_qfo/$data_fdog_file - fi - fi - - if [ ! 
-f $outDir/$data_fdog_file ]; then - echo "File $data_fdog_file not found! Please try to download again from" - echo "https://applbio.biologie.uni-frankfurt.de/download/hamstr_qfo/$data_fdog_file" - exit - fi - - CHECKSUM=$(cksum $data_fdog_file) - if [ "$CHECKSUM" == "$checkSumData" ]; then - echo "Extracting archive $data_fdog_file..." - tar xf $outDir/$data_fdog_file - rm $outDir/$data_fdog_file - for i in $(ls "$outDir/genome_dir"); do rm -f "$outDir/genome_dir/$i/$i.fa.mod"; done - - if [ "$(ls -A $outDir/blast_dir)" ]; then - echo "Data should be in place to run fdog.\n" - else - echo -e "\e[31mSomething went wrong with the download. Data folders are empty.\e[0m" - echo "Please try to download again from" - echo "https://applbio.biologie.uni-frankfurt.de/download/hamstr_qfo/$data_fdog_file" - echo "Or contact us if you think this is our issue!" - exit - fi - else - echo -e "\e[31mSomething went wrong with the download. Checksum does not match.\e[0m" - echo "Please try to download again from" - echo "https://applbio.biologie.uni-frankfurt.de/download/hamstr_qfo/$data_fdog_file" - echo "Please put it into $outDir folder and run this setup again!" - exit - fi -fi -# write data path to pathConfig file -if [ -f $BIN/pathconfig.txt ]; then - rm $BIN/pathconfig.txt -fi -touch $BIN/pathconfig.txt -echo $outDir >> $BIN/pathconfig.txt - -### add paths to bash profile file -echo "-------------------------------------" -echo "Adding WISECONFIGDIR to ~/$bashFile" - -wisePath=$(which "genewise") -if [ -z "$($grepprog WISECONFIGDIR=$wisePath ~/$bashFile)" ]; then - echo "export WISECONFIGDIR=${wisePath}" >> ~/$bashFile -fi - -# echo "Adding paths to ~/$rprofile" -# if [ -z "$($grepprog $CURRENT/bin ~/$rprofile)" ]; then -# echo "Sys.setenv(PATH = paste(\"$CURRENT/bin\", Sys.getenv(\"PATH\"), sep=\":\"))" >> ~/$rprofile -# fi - -echo "done!" 
- -### adapt paths in fdog scripts -echo "-------------------------------------" -echo "Adapting paths in fdog scripts" -# update the sed and grep commands -$sedprog -i -e "s/\(my \$sedprog = '\).*/\1$sedprog';/" $CURRENT/bin/hamstr.pl -$sedprog -i -e "s/\(my \$grepprog = '\).*/\1$grepprog';/" $CURRENT/bin/hamstr.pl -$sedprog -i -e "s/\(my \$readlinkprog = '\).*/\1$readlinkprog';/" $CURRENT/bin/hamstr.pl -$sedprog -i -e "s/\(my \$sedprog = '\).*/\1$sedprog';/" $CURRENT/bin/oneSeq.pl -$sedprog -i -e "s/\(my \$grepprog = '\).*/\1$grepprog';/" $CURRENT/bin/oneSeq.pl -$sedprog -i -e "s/\(my \$readlinkprog = '\).*/\1$readlinkprog';/" $CURRENT/bin/oneSeq.pl - -# localize the perl installation -path2perl=`which perl` -echo "path to perl: $path2perl" -$sedprog -i -e "s|\#\!.*|\#\!$path2perl|g" $CURRENT/bin/hamstr.pl -$sedprog -i -e "s|\#\!.*|\#\!$path2perl|g" $CURRENT/bin/translate.pl -$sedprog -i -e "s|\#\!.*|\#\!$path2perl|g" $CURRENT/bin/oneSeq.pl - -### final check -echo "-------------------------------------" -echo "Final check..." -flag=0 - -echo "Conda packages" -condaPkgs=( -perl-bioperl -perl-bioperl-core -blast -hmmer -wise2 -clustalw -mafft -muscle -fasta3 -augustus -tblastn -) -for i in "${condaPkgs[@]}"; do - if [[ -z $(conda list | $grepprog "$i ") ]]; then - progname=$i - if [ "$i" == "blast" ]; then - progname="blastp" - elif [ "$i" == "wise2" ]; then - progname="genewise" - elif [ "$i" == "hmmer" ]; then - progname="hmmsearch" - elif [ "$i" == "fasta3" ]; then - progname="fasta36" - elif [ "$i" == "tblastn" ]; then - requiredver="2.9.0" - currentver="$(tblastn -version | head -n1 | cut -d" " -f2 | sed 's/+//g')" - t=$(printf '%s\n' $requiredver $currentver | sort -V | head -n1) - if [ $t == $currentver ]; then - echo -e "\t\e[31mWARNING BLAST+ needs an update to at least version ${requiredver}!\e[0m" - fi - fi - if [ -z "$(which $progname)" ]; then - echo -e "\t\e[31m$i could not be installed\e[0m" - flag=1 - fi - fi -done -echo "done!" 
- -echo "Perl modules" -for i in "${perlModules[@]}"; do - msg=$((perl -e "use $i") 2>&1) - if ! [[ -z ${msg} ]]; then - echo -e "\t\e[31m$i could not be installed\e[0m" - flag=1 - fi -done -echo "done!" - -echo "Environment paths" -envPaths=( -WISECONFIGDIR -) -for i in "${envPaths[@]}"; do - if [ -z "$($grepprog $i ~/$bashFile)" ]; then - echo -e "\t\e[31m$i was not added into ~/$bashFile\e[0m" - flag=1 - fi -done -echo "done!" - -if [ "$flag" == 1 ]; then - echo "Some tools/libraries counld not installed correctly or paths were not added into ~/$bashFile." - echo "Please manually install the missing dependencies using using fdog.setup with --lib option (ask your admin if you don't have root privileges)." - echo "Then run this setup again to try one more time!" - exit -else - echo "Generating symbolic links" - ln -s -f $CURRENT/bin/hamstr.pl $CURRENT/bin/hamstr - ln -s -f $CURRENT/bin/oneSeq.pl $CURRENT/bin/oneSeq - echo "Sourcing bash profile file" - source ~/$bashFile - echo "-------------------------------------" - $sedprog -i -e 's/my $configure = .*/my $configure = 1;/' $CURRENT/bin/hamstr.pl - $sedprog -i -e 's/my $configure = .*/my $configure = 1;/' $CURRENT/bin/oneSeq.pl - if [ "$setupFAS" == 1 ]; then - echo "All tests succeeded." - echo -e "\e[91mPLEASE RUN\e[0m \e[96mfas.setup\e[0m \e[91mTO CONFIGURE FAS BEFORE USING fdog!\e[0m" - echo "Then you can test fdog with:" - else - echo "All tests succeeded, fdog should be ready to run. You can test it with:" - fi - echo -e "\e[96mfdog.run --seqFile infile.fa --seqName test --refspec HUMAN@9606@3\e[0m" - echo "Output files with prefix \"test\" will be found at your current working directory!" - echo -e "For more details, use \e[96mfdog.run -h\e[0m or visit https://github.com/BIONF/fDOG/wiki" - echo "Happy using fdog! 
;-)" -fi -exit 1 diff --git a/fdog/setupfDog.py b/fdog/setupfDog.py index b6a67d6..679598f 100644 --- a/fdog/setupfDog.py +++ b/fdog/setupfDog.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- ####################################################################### -# Copyright (C) 2020 Vinh Tran +# Copyright (C) 2022 Vinh Tran # # This script is used to setup fdog: install dependencies and # download pre-computed data @@ -18,62 +18,272 @@ import sys import os +import platform import argparse import subprocess +import shutil from ete3 import NCBITaxa from pathlib import Path +from pkg_resources import get_distribution + +import fdog.libs.zzz as general_fn +import fdog.libs.fas as fas_fn +import fdog.libs.alignment as align_fn + + +def check_conda_env(): + """ Return if a conda env is currently using """ + if 'CONDA_DEFAULT_ENV' in os.environ: + if not os.environ['CONDA_DEFAULT_ENV'] == 'base': + return(True) + return(False) + + +def get_source_path(): + """ Get path of installed fDOG library """ + fdogPath = os.path.realpath(__file__).replace('/setupfDog.py','') + return(fdogPath) + + +def get_data_path(fdogPath): + """ Get path of fDOG data """ + pathconfigFile = fdogPath + '/bin/pathconfig.yml' + if not os.path.exists(pathconfigFile): + sys.exit('No pathconfig.yml found. 
Please run fdog.setup (https://github.com/BIONF/fDOG/wiki/Installation#setup-fdog).') + else: + cfg = general_fn.load_config(pathconfigFile) + try: + dataPath = cfg['datapath'] + except: + try: + corepath = cfg['corepath'] + except: + pass + try: + searchpath = cfg['searchpath'] + except: + pass + try: + annopath = cfg['annopath'] + except: + pass + dataPath = 'Core taxa: %s\nSearch taxa: %s\nAnnotations: %s' % (corepath, searchpath, annopath) + return(dataPath) + + +def install_fas(woFAS): + """ Install greedyFAS """ + if not woFAS: + ### check if fas already installed + try: + fasVersion = subprocess.run(['fas.run --version'], shell = True, capture_output = True, check = True) + except: + print('=> greedyFAS (https://github.com/BIONF/FAS)') + install_fas_cmd = 'pip install greedyFAS' + try: + subprocess.check_output([install_fas_cmd], shell = True, stderr = subprocess.STDOUT) + except subprocess.CalledProcessError as e: + sys.exit('\033[91mERROR: Problem with installing FAS! Please do it manually. 
See: https://github.com/BIONF/FAS!\033[0m') + ### check if fas installed but not yet configured + check_fas = fas_fn.check_fas_executable() + + +def install_fasta36(fdogPath, cwd): + """ Install FASTA36 from source """ + try: + subprocess.check_output(['which fasta36'], shell = True, stderr = subprocess.STDOUT) + except subprocess.CalledProcessError as e: + print('=> FASTA36 (https://github.com/wrpearson/fasta36)') + fasta36v = '36.3.8h_04-May-2020' + fasta36url = 'https://github.com/wrpearson/fasta36/archive/refs/tags' + fasta36file = 'v%s.tar.gz' % fasta36v + if not os.path.exists('%s/bin/aligner/bin/fasta36' % fdogPath): + if os.path.exists('%s/bin/aligner' % fdogPath): + shutil.rmtree('%s/bin/aligner' % fdogPath) + general_fn.download_file(fasta36url, fasta36file) + shutil.unpack_archive(fasta36file, '%s/bin/' % fdogPath, 'gztar') + os.remove(fasta36file) + shutil.move('%s/bin/fasta36-%s' % (fdogPath, fasta36v), '%s/bin/aligner' % fdogPath) + if 'Darwin' in platform.uname(): + make_cmd = 'make -f %s/bin/aligner/make/Makefile.os_x86_64 all' % fdogPath + elif 'Linux' in platform.uname(): + make_cmd = 'make -f %s/bin/aligner/make/Makefile.linux64_sse2 all' % fdogPath + else: + sys.exit('\033[91mERROR: Cannot identify type of system (neither Linux nor Darwin/MacOS)\033[0m') + try: + print('Compiling fasta36. Please wait...') + os.chdir('%s/bin/aligner/src' % fdogPath) + subprocess.run(make_cmd, shell = True, check = True) + except: + sys.exit('\033[91mERROR: Cannot install FASTA36!\033[0m') + os.chdir(cwd) + if not os.path.exists('%s/bin/aligner/bin/fasta36' % fdogPath): + sys.exit('\033[91mERROR: fasta36 not found! 
Please install it manually!\033[0m') + else: + print('FASTA36 installed at %s/bin/aligner/' % fdogPath) + else: + fasta36_path = align_fn.check_fasta36_executable(fdogPath) + print('FASTA36 found at %s' % fasta36_path) + + +def check_dependencies(fdogPath): + """ Check for missing dependencies + Dependencies are specified in fdog/data/dependencies.txt file + """ + missing = [] + dependencies = '%s/data/dependencies.txt' % fdogPath + for tool in general_fn.read_file(dependencies): + function = tool + if tool == 'hmmer': + function = 'hmmsearch' + if tool == 'ncbi-blast+': + function = 'blastp' + try: + subprocess.check_output(['which %s' % function], shell = True, stderr = subprocess.STDOUT) + except subprocess.CalledProcessError as e: + missing.append(tool) + return(missing) + + +def download_data(dataPath, resetData): + """ Downloade pre-calculated fDOG data """ + data_fdog_file = "data_HaMStR-2019c.tar.gz" + checksum_data = "1748371655 621731824 $data_fdog_file" + + genome_path = '%s/searchTaxa_dir' % dataPath + Path(genome_path).mkdir(parents = True, exist_ok = True) + + if len(general_fn.read_dir(genome_path)) < 1 or resetData: + data_url = 'https://applbio.biologie.uni-frankfurt.de/download/hamstr_qfo' + if os.path.exists(data_fdog_file) and resetData: + os.remove(data_fdog_file) + general_fn.download_file(data_url, data_fdog_file) + try: + print('Extracting %s...' 
% data_fdog_file) + shutil.unpack_archive(data_fdog_file, dataPath, 'gztar') + except: + sys.exit('\033[91mERROR: Cannot extract %s to %s!\033[0m' % (data_fdog_file, dataPath)) + if 'genome_dir' in general_fn.read_dir(dataPath): + os.rename('%s/genome_dir' % dataPath, '%s/searchTaxa_dir' % dataPath) + os.rename('%s/blast_dir' % dataPath, '%s/coreTaxa_dir' % dataPath) + os.rename('%s/weight_dir' % dataPath, '%s/annotation_dir' % dataPath) + check_cmd = 'fdog.checkData -s %s/searchTaxa_dir -c %s/coreTaxa_dir -a %s/annotation_dir --reblast' % (dataPath, dataPath, dataPath) + try: + print('Checking downloaded data...') + subprocess.run([check_cmd], stdout = subprocess.DEVNULL, check = True, shell = True) + except: + print('\033[96mWARNING: Problem with validating downloaded data. Please run fdog.checkData manually!\033[0m') + os.remove(data_fdog_file) + print('fDOG data downloaded and saved at %s' % dataPath) + else: + print('fDOG data found at %s' % dataPath) + + +def write_pathconfig(fdogPath, dataPath): + """ Write data directories to pathconfig file """ + Path('%s/bin' % fdogPath).mkdir(parents = True, exist_ok = True) + pathconfigFile = '%s/bin/pathconfig.yml' % fdogPath + if os.path.exists(pathconfigFile): + os.remove(pathconfigFile) + with open(pathconfigFile, 'w') as cf: + cf.write('datapath: \'%s\'\n' % dataPath) + cf.write('corepath: \'%s/coreTaxa_dir\'\n' % dataPath) + cf.write('searchpath: \'%s/searchTaxa_dir\'\n' % dataPath) + cf.write('annopath: \'%s/annotation_dir\'\n' % dataPath) -def checkOptConflict(lib, conda): - if lib: - if (conda): - sys.exit('*** ERROR: --lib and --conda cannot be used at the same time!') def main(): - version = '0.0.3' - parser = argparse.ArgumentParser(description='You are running fdog.setup version ' + str(version) + '.') + version = get_distribution('fdog').version + parser = argparse.ArgumentParser(description='You are running fDOG version ' + str(version) + '.') required = parser.add_argument_group('required arguments') 
optional = parser.add_argument_group('optional arguments') - required.add_argument('-o', '--outPath', help='Output path for fdog data', action='store', default='', required=True) - optional.add_argument('--conda', help='Setup fdog within a conda env', action='store_true', default=False) - optional.add_argument('--lib', help='Install fdog libraries only', action='store_true', default=False) - optional.add_argument('--getSourcepath', help='Get path to installed fdog', action='store_true', default=False) - optional.add_argument('--getDatapath', help='Get fdog default data path', action='store_true', default=False) + required.add_argument('-d', '--dataPath', help='Output path for fDOG data', action='store', default='', required=True) + optional.add_argument('--getSourcepath', help='Get path to installed fdog package', action='store_true', default=False) + optional.add_argument('--getDatapath', help='Get fDOG default data path', action='store_true', default=False) + optional.add_argument('--woFAS', help='Do not install FAS (https://github.com/BIONF/FAS)', action='store_true', default=False) + optional.add_argument('--force', help='Force installing', action='store_true', default=False) + optional.add_argument('--resetData', help='Re-download precalculated fDOG data', action='store_true', default=False) - ### get arguments + ### parse arguments args = parser.parse_args() - conda = args.conda - lib = args.lib - checkOptConflict(lib, conda) - outPath = args.outPath - Path(outPath).mkdir(parents = True, exist_ok = True) - fdogPath = os.path.realpath(__file__).replace('/setupfDog.py','') + dataPath = os.path.abspath(args.dataPath) + woFAS = args.woFAS + force = args.force + resetData = args.resetData + + ### get install path + fdogPath = get_source_path() if args.getSourcepath: print(fdogPath) sys.exit() + ### get data path if args.getDatapath: - pathconfigFile = fdogPath + '/bin/pathconfig.txt' - if not os.path.exists(pathconfigFile): - sys.exit('No pathconfig.txt found. 
Please run fdog.setup (https://github.com/BIONF/fDOG/wiki/Installation#setup-fdog).') + dataPath = get_data_path(fdogPath) + print(dataPath) + sys.exit() + + ### check if pathconfig file exists + pathconfigFile = '%s/bin/pathconfig.yml' % fdogPath + demo_cmd = 'fdog.run --seqFile infile.fa --jobName test --refspec HUMAN@9606@3' + if os.path.exists(pathconfigFile) and not force: + check_fas = 1 + if not woFAS: + check_fas = fas_fn.check_fas_executable() + if check_fas == 1: + print('fDOG seems to be ready to use!') + print('You can test fDOG using the following command:\n%s' % demo_cmd) else: - with open(pathconfigFile) as f: - dataPath = f.readline().strip() - print(dataPath) + print('fDOG seems to be ready to use without FAS!') + print('You can test fDOG using the following command:\n%s --fasOff' % demo_cmd) sys.exit() + ### get ncbi taxonomy database for ete3 - print('Creating local NCBI taxonomy database...') + print('*** Creating local NCBI taxonomy database...') ncbi = NCBITaxa() - ### run setup - if conda: - setupFile = '%s/setup/setup_conda.sh -o %s' % (fdogPath, outPath) - subprocess.call([setupFile], shell = True) - else: - if lib: - setupFile = '%s/setup/setup.sh -l' % (fdogPath) + + ### install dependencies + print('*** Installing dependencies...') + ## FAS + if not woFAS: + install_fas(woFAS) + ## hmmer, blast+, clustalw, mafft, muscle, augustus, metaeuk + missing_tools = check_dependencies(fdogPath) + if len(missing_tools) > 0: + if check_conda_env() == True: + req_file = '%s/data/conda_requirements.yml' % fdogPath + print('=> Dependencies in %s' % req_file) + conda_install_cmd = 'conda install -c bioconda --file %s -y' % (req_file) + try: + subprocess.call([conda_install_cmd], shell = True) + except: + sys.exit('\033[91mERROR: Cannot install conda packages in %s!\033[0m' % req_file) else: - setupFile = '%s/setup/setup.sh -o %s' % (fdogPath, outPath) - subprocess.call([setupFile], shell = True) + install_cmd = 'sudo apt-get install -y -qq ' + 
sys.exit('\033[91mERROR: Please install these tools manually:\n%s\nusing the command: %s!\033[0m' % (', '.join(missing_tools), install_cmd)) + else: + print('=> Dependencies in %s/data/dependencies.txt already installed!' % fdogPath) + ## fasta36 + install_fasta36(fdogPath, os.getcwd()) + + ### download pre-calculated data + print('*** Downloading precalculated data...') + ### Remove data if resetData is used + if resetData: + if os.path.exists(dataPath): + print('fDOG data found in %s will be deleted! Enter to continue.' % dataPath) + if general_fn.query_yes_no(''): + shutil.rmtree(dataPath) + + Path(dataPath).mkdir(parents = True, exist_ok = True) + download_data(dataPath, resetData) + + ### create pathconfig file + write_pathconfig(fdogPath, dataPath) + + print('\033[96m==> FINISHED! fDOG data can be found at %s\033[0m' % dataPath) + print('You can test fDOG using the following command:\n%s' % demo_cmd) if __name__ == '__main__': main() diff --git a/fdog/showTaxa.py b/fdog/showTaxa.py index a29bf57..7bb27c4 100644 --- a/fdog/showTaxa.py +++ b/fdog/showTaxa.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- ####################################################################### -# Copyright (C) 2020 Vinh Tran +# Copyright (C) 2022 Vinh Tran # # This script is used to list all available taxa of the installed fdog # @@ -19,9 +19,7 @@ import os from ete3 import NCBITaxa -def checkFileExist(file): - if not os.path.exists(os.path.abspath(file)): - sys.exit('%s not found' % file) +import fdog.libs.zzz as general_fn def getNcbiName(taxonName): ncbi = NCBITaxa() @@ -32,28 +30,44 @@ def getNcbiName(taxonName): name = taxonName return(name) + def getTaxa(): # get data path fdogPath = os.path.realpath(__file__).replace('/showTaxa.py','') - pathconfigFile = fdogPath + '/bin/pathconfig.txt' + pathconfigFile = fdogPath + '/bin/pathconfig.yml' if not os.path.exists(pathconfigFile): - sys.exit('No pathconfig.txt found. 
Please run fdog.setup (https://github.com/BIONF/fDOG/wiki/Installation#setup-fdog).') + sys.exit('No pathconfig.yml found. Please run fdog.setup (https://github.com/BIONF/fDOG/wiki/Installation#setup-fdog).') with open(pathconfigFile) as f: - dataPath = f.readline().strip() + cfg = general_fn.load_config(pathconfigFile) + try: + dataPath = cfg['datapath'] + except: + dataPath = 'several places!' + try: + corepath = cfg['corepath'] + except: + corepath = dataPath + '/coreTaxa_dir' + general_fn.check_file_exist(corepath) + try: + searchpath = cfg['searchpath'] + except: + searchpath = dataPath + '/searchTaxa_dir' + general_fn.check_file_exist(searchpath) - # print taxa in blast_dir + # print taxa in coreTaxa_dir print('##### Data found at %s' % dataPath) print('\n##### Taxa in the core sets, which can be used as reference species #####\n') - for taxon in sorted(os.listdir(dataPath + '/blast_dir/')): - if os.path.isdir(dataPath + '/blast_dir/' + taxon): + for taxon in sorted(os.listdir(corepath)): + if os.path.isdir(f'{corepath}/{taxon}'): print('%s\t%s' % (taxon, getNcbiName(taxon))) - # print taxa in genome_dir + # print taxa in searchTaxa_dir print('\n##### Search taxa. in which you can search orthologs #####\n') - for taxon in sorted(os.listdir(dataPath + '/genome_dir/')): - if os.path.isdir(dataPath + '/genome_dir/' + taxon): + for taxon in sorted(os.listdir(searchpath)): + if os.path.isdir(f'{searchpath}/{taxon}'): print('%s\t%s' % (taxon, getNcbiName(taxon))) + def main(): getTaxa() diff --git a/setup.py b/setup.py index 63bc4cb..79c7325 100644 --- a/setup.py +++ b/setup.py @@ -9,7 +9,7 @@ # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # -# hamstr1s is distributed in the hope that it will be useful, +# fdog is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the # GNU General Public License for more details. @@ -26,8 +26,7 @@ setup( name="fdog", - version="0.0.52", - + version="0.1.24", python_requires='>=3.7.0', description="Feature-aware Directed OrtholoG search tool", long_description=long_description, @@ -43,6 +42,8 @@ 'ete3', 'six', 'PyYAML', + 'pyhmmer', + 'pysam', 'greedyFAS>=1.11.2' ], entry_points={ @@ -53,10 +54,11 @@ "fdog.addTaxon = fdog.addTaxon:main", "fdog.addTaxa = fdog.addTaxa:main", "fdog.showTaxa = fdog.showTaxa:main", + "fdog.setPaths = fdog.setPaths:main", "fdog.mergeOutput = fdog.mergeOutput:main", - "fdog.remove = fdog.removefDog:main", + "fdog.uninstall = fdog.removefDog:main", "fdog.assembly = fdog.fDOGassembly:main", - "fdog.mergeAssembly = fdog.mergeAssemblyOutput:main"], + "fdog.addCoreGroup = fdog.makeCoreGroupFromFasta:main"], }, license="GPL-3.0", classifiers=[ From 87c66f1bc4ba83146ed88df87593553fa0dc7477 Mon Sep 17 00:00:00 2001 From: Hannah Muelbaier <47216555+mueli94@users.noreply.github.com> Date: Tue, 24 Oct 2023 14:10:04 +0200 Subject: [PATCH 197/229] Fdog assembly v0.1.4 (#35) fDOG Assembly update to version v0.1.4 A few bug fixes, cleaned up the output and added a script to compute MSA and pHMM of an ortholog group that can be used as input for fDOG-Assembly. 
From d429ec9afb996a04d1f205099242474cb9402d71 Mon Sep 17 00:00:00 2001 From: Hannah Muelbaier <47216555+mueli94@users.noreply.github.com> Date: Fri, 17 Nov 2023 17:31:35 +0100 Subject: [PATCH 198/229] bug fix parallel computation --- fdog/fDOGassembly.py | 26 ++------------------------ 1 file changed, 2 insertions(+), 24 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index d1a6f3a..2c042e7 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -1107,7 +1107,7 @@ def main(): ortholog_sequences = [] time_ortholog_start = time.time() - + if parallel == True: ##################### parallel computation ############################# calls = [] @@ -1116,35 +1116,13 @@ def main(): for asName in assembly_names: calls.append([asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs, gene_prediction, metaeuk_db]) - results = (pool.imap_unordered(ortholog_search_tblastn, calls)) - pool.close() - pool.join() - for i in results: - ortholog_sequences.append([i[0], i[1]]) - for k in i[2]: - print(k) - else: - ###################### computation species wise ################ - for asName in assembly_names: - args = [asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs, gene_prediction, metaeuk_db] - reciprocal_sequences, candidatesOutFile, output_ortholog_search = ortholog_search_tblastn(args) - ortholog_sequences.append([reciprocal_sequences, candidatesOutFile]) - for k in output_ortholog_search: - print(k) - - #results = (pool.imap_unordered(ortholog_search_tblastn, calls)) - #pool.close() - #pool.join() print("Searching for orthologs ...", flush=True) for i in 
tqdm(pool.imap_unordered(ortholog_search_tblastn, calls),total=len(calls)): ortholog_sequences.append([i[0], i[1]]) if mode == 'debug': for k in i[2]: print(k) - #for i in results: - #ortholog_sequences.append([i[0], i[1]]) - #for k in i[2]: - #print(k) + print("\t ...finished \n", flush=True) else: ###################### computation species wise ################ From 8b1a58f61775235461b2c80724ee6869abe1e227 Mon Sep 17 00:00:00 2001 From: Hannah Muelbaier <47216555+mueli94@users.noreply.github.com> Date: Mon, 20 Nov 2023 14:22:12 +0100 Subject: [PATCH 199/229] Fdog assembly (#37) --- fdog/fDOGassembly.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 2c042e7..5f7f968 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -1116,14 +1116,22 @@ def main(): for asName in assembly_names: calls.append([asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs, gene_prediction, metaeuk_db]) + #results = (pool.imap_unordered(ortholog_search_tblastn, calls)) + #pool.close() + #pool.join() + print("Searching for orthologs ...", flush=True) for i in tqdm(pool.imap_unordered(ortholog_search_tblastn, calls),total=len(calls)): ortholog_sequences.append([i[0], i[1]]) if mode == 'debug': for k in i[2]: print(k) - + #for i in results: + #ortholog_sequences.append([i[0], i[1]]) + #for k in i[2]: + #print(k) print("\t ...finished \n", flush=True) + else: ###################### computation species wise ################ for asName in tqdm(assembly_names): From 64a0022c89fa524cba3190139ce0ede2acc553c3 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Mon, 20 Nov 2023 16:20:24 +0100 Subject: [PATCH 200/229] renamed ortholog group fasta file because of signalp bug --- fdog/fDOGassembly.py | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 5f7f968..a9f963e 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -1146,7 +1146,7 @@ def main(): time_ortholog = time_ortholog_end - time_ortholog_start ################## preparing output ######################################## - orthologsOutFile = out + "/" + group + ".extended.fa" + orthologsOutFile = out + "/" + group + "_og.fa" if taxa == []: taxa = [fdog_ref_species] From 90a4f20156c5b29d1a0cb1242fd7e305c1235ef6 Mon Sep 17 00:00:00 2001 From: Hannah Muelbaier <47216555+mueli94@users.noreply.github.com> Date: Mon, 20 Nov 2023 16:25:43 +0100 Subject: [PATCH 201/229] renamed .extended.fa to _og.fa --- fdog/fDOGassembly.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 5f7f968..a9f963e 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -1146,7 +1146,7 @@ def main(): time_ortholog = time_ortholog_end - time_ortholog_start ################## preparing output ######################################## - orthologsOutFile = out + "/" + group + ".extended.fa" + orthologsOutFile = out + "/" + group + "_og.fa" if taxa == []: taxa = [fdog_ref_species] From 172ee24fb0343dc4c45b4b4287030bc0edf91837 Mon Sep 17 00:00:00 2001 From: mueli94 Date: Mon, 11 Dec 2023 15:35:04 +0100 Subject: [PATCH 202/229] bug fix run time error with --fasoff --- fdog/fDOGassembly.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index a9f963e..a97d783 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -1170,10 +1170,14 @@ def main(): clean_fas(out + group + "_reverse.domains", 'domains') clean_fas(out + group + ".phyloprofile", 'phyloprofile') print("\t ...finished \n") + end = time.time() + time_fas = end - fas + else: + end = time.time() + time_fas = 0 ################# remove tmp folder ######################################## - 
end = time.time() - time_fas = end - fas + print("fDOG-Assembly finished completely in " + str(end-start) + "seconds.") print("Group preparation: %s \t Ortholog search: %s \t FAS: %s \n" % (str(time_group), str(time_ortholog), str(time_fas))) sys.stdout = sys.__stdout__ From 46b8c0541834f6f3b9af8d1ab3222c14d4dae1f2 Mon Sep 17 00:00:00 2001 From: Hannah Muelbaier <47216555+mueli94@users.noreply.github.com> Date: Mon, 5 Feb 2024 12:53:12 +0100 Subject: [PATCH 203/229] bugfix block profile computation --- fdog/fDOGassembly.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index a97d783..7ac3b86 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -814,7 +814,7 @@ def ortholog_search_tblastn(args): return reciprocal_sequences, candidatesOutFile, output -def blockProfiles(core_path, group, mode, out): +def blockProfiles(core_path, group, mode, out, msaTool): ######################## paths ################################ msa_path = core_path + "/" + group +"/"+ group + ".aln" @@ -1090,7 +1090,7 @@ def main(): if augustus == True: group_computation_time_start = time.time() consensus_path = consensusSequence(core_path, group, mode, out) - profile_path = blockProfiles(core_path, group, mode, out) + profile_path = blockProfiles(core_path, group, mode, out, msaTool) group_computation_time_end = time.time() time_group = group_computation_time_end - group_computation_time_start else: From 9983e30b4218b792d76cf5ca56b7c558c652388d Mon Sep 17 00:00:00 2001 From: Hannah Muelbaier <47216555+mueli94@users.noreply.github.com> Date: Tue, 6 Feb 2024 12:00:14 +0100 Subject: [PATCH 204/229] update block profile function --- fdog/fDOGassembly.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 7ac3b86..7b917c5 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -826,7 +826,7 @@ def blockProfiles(core_path, group, mode, out, msaTool): 
cmd= "muscle -quiet -in " + fasta_path + " -out " + msa_path #print("muscle -quiet -in " + output_file + " -out " + aln_file) else: - cmd = "muscle -quiet -align" + fasta_path + " -out " + msa_path + cmd = "muscle -quiet -align " + fasta_path + " -output " + msa_path elif msaTool == "mafft-linsi": cmd = 'mafft --maxiterate 1000 --localpair --anysymbol --quiet ' + fasta_path + ' > ' + msa_path starting_subprocess(cmd, mode) From 844c02d38afaf812bc247c34d741837dd668aa54 Mon Sep 17 00:00:00 2001 From: Hannah Muelbaier <47216555+mueli94@users.noreply.github.com> Date: Fri, 9 Feb 2024 16:38:37 +0100 Subject: [PATCH 205/229] Fdog assembly v0.1.5 (#39) * added fDOG-Assembly workflow * added fDOG-Assembly dependencies * adjustments to new muscle version and fDOG version * further changes to adapt to new fDOG version * script to produce msa and hmm in the format fDOG-Assembly requires from a fasta file * deactivate fas config output * Bugfix muscle v5 command in fdog.addCoreGroup * fixed augustus version * further updates to adapt fDOG-Assembly to the new fDOG version. Additionally bugfix in co-ortholog detection if MSA crashed due to too many sequences. 
* contig fasta files where delited from the tmp folder to save memory * add function to output only best isoform * added function to only output one isoform * new gff function, fixed problem that muscle can not work with path containing @ sysmbols, adapted mergeOutput that merges files with _og ending * Fixed bug where fDOG-Assembly does not terminate if FAS raises an error * bugfix createGff function * bugfix createGff function --------- Co-authored-by: trvinh --- .github/workflows/github_build.yml | 2 +- .gitignore | 1 + fdog/addAssembly.py | 115 ++++++++++ fdog/fDOGassembly.py | 323 +++++++++++++++++++++-------- fdog/libs/corecompile.py | 2 + fdog/libs/hmm.py | 61 ++++-- fdog/libs/orthosearch.py | 132 ++++++------ fdog/libs/preparation.py | 4 + fdog/mergeOutput.py | 14 +- fdog/runMulti.py | 7 +- fdog/runSingle.py | 7 +- fdog/setupfDog.py | 2 +- setup.py | 4 +- 13 files changed, 501 insertions(+), 173 deletions(-) create mode 100644 fdog/addAssembly.py diff --git a/.github/workflows/github_build.yml b/.github/workflows/github_build.yml index 7bd6208..10127d8 100644 --- a/.github/workflows/github_build.yml +++ b/.github/workflows/github_build.yml @@ -52,7 +52,7 @@ jobs: mkdir seeds path=$(fdog.setup -d ./ --getSourcepath); a="1 2 3"; for i in ${a[@]}; do cp $path/data/infile.fa seeds/$i.fa; done echo "TEST fdogs.run" - fdogs.run --seqFolder seeds --jobName test_multi --refspec HUMAN@9606@3 --fasOff --searchTaxa PARTE@5888@3,THAPS@35128@3 + fdogs.run --seqFolder seeds --jobName test_multi --refspec HUMAN@9606@3 --fasOff --searchTaxa PARTE@5888@3,THAPS@35128@3 --hmmScoreType sequence echo "TEST fdog.addTaxon" head /home/runner/work/fDOG/fDOG/dt/searchTaxa_dir/HUMAN@9606@3/HUMAN@9606@3.fa > hm.fa fdog.addTaxon -f hm.fa -i 9606 -o ./ -c -a diff --git a/.gitignore b/.gitignore index 90963b6..d1b7979 100644 --- a/.gitignore +++ b/.gitignore @@ -140,3 +140,4 @@ dmypy.json /fdog/data/assembly_dir/ /fdog/fdog_goes_assembly/tmp/ taxdump* +.DS_Store diff --git 
a/fdog/addAssembly.py b/fdog/addAssembly.py new file mode 100644 index 0000000..b0e901d --- /dev/null +++ b/fdog/addAssembly.py @@ -0,0 +1,115 @@ +import fdog.libs.addtaxon as addTaxon_fn +import fdog.libs.tree as tree_fn +import fdog.libs.zzz as general_fn +import sys +import os +import argparse + +def check_fasta(file): + nHeader = general_fn.count_line(file, '>', True) + nSeq = general_fn.count_line(file, '>', False) + if not nHeader == nSeq: + return(1) + nPipe = general_fn.count_line(file, '|', True) + if nPipe > 0: + return(1) + return(0) + +def check_path(path): + if not os.path.exists(path): + return False + else: + if os.path.isfile(path): + return "File" + else: + return "Path" + +def parse_file(path): + file = open(path, "r") + lines = file.readlines() + id_dict = {} + for line in lines: + line = line.rstrip() + ncbi, name = line.split("\t") + id_dict[ncbi] = name + + return id_dict + + +def main(): + + #################### handle user input ##################################### + version = '0.0.1' + ################### initialize parser ###################################### + parser = argparse.ArgumentParser(description='You are running fdog.addAssembly version ' + str(version) + '.') + ################## required arguments ###################################### + required = parser.add_argument_group('Required arguments') + required.add_argument('--fasta', help='Path to fasta file or folder', action='store', default='', required=True) + required.add_argument('--out', help='Path to output folder.', action='store', default='', required=True) + required.add_argument('--ncbi', help='NCBI number of species or mapping file', action='store', default='', required=True) + required.add_argument('--ver', help='Version', action='store', default='', required=True) + optional = parser.add_argument_group('Optional arguments') + optional.add_argument('--link', help='link files not copy', action='store_true', default = True) + + args = parser.parse_args() + + fasta = 
args.fasta + if check_path(fasta) == False: + print("%s does not exists. Exiting ..."%(fasta)) + sys.exit() + else: + format = check_path(fasta) + out_folder = args.out + out_folder = os.path.abspath(out_folder) + '/' + os.system('mkdir %s >/dev/null 2>&1' % (out_folder)) + ncbi = args.ncbi + ver = args.ver + ln = args.link + id_dict = {} + + if check_path(ncbi) == False: + if isdigit(ncbi) and format == "File": + id_dict[ncbi] = fasta_file + else: + print("%s is no file or digit. Exiting ..."%(ncbi)) + sys.exit() + elif check_path(ncbi) == "File": + id_dict = parse_file(ncbi) + else: + print("%s is no file or digit. Exiting ..."%(ncbi)) + sys.exit() + + if format == "File": + fa = id_dict[id] + if check_fasta(fa): + name = addTaxon_fn.generate_spec_name(id, "", ver) + if ln == False: + assembly_folder = out_folder + name + os.system('mkdir %s >/dev/null 2>&1' % (assembly_folder)) + os.system("cp %fa %s/%s.fa" %(fa, assembly_folder, name)) + else: + assembly_folder = out_folder + name + os.system('mkdir %s >/dev/null 2>&1' % (assembly_folder)) + os.system("ln %fa %s/%s.fa" %(fa, assembly_folder, name)) + else: + print("%s Fasta format not valid or header includes |"%(fa)) + + for id in id_dict: + fa = id_dict[id] + fasta = os.path.abspath(fasta) + '/' + if check_fasta(fasta + fa): + name = addTaxon_fn.generate_spec_name(id, "", ver) + if ln == False: + assembly_folder = out_folder + name + os.system('mkdir %s >/dev/null 2>&1' % (assembly_folder)) + os.system("cp %s/%fa %s/%s.fa" %(fasta, fa, assembly_folder, name)) + else: + assembly_folder = out_folder + name + os.system('mkdir %s >/dev/null 2>&1' % (assembly_folder)) + os.system("ln -s %s/%s %s/%s.fa" %(fasta, fa, assembly_folder, name)) + else: + print("%s Fasta format not valid or header includes |"%(fa)) + + print("DONE, files can be found: %s"%(out_folder)) + +main() diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 7b917c5..0766df9 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ 
-31,12 +31,18 @@ import multiprocessing as mp import fdog.libs.alignment as align_fn from tqdm import tqdm +from pathlib import Path +import pandas as pd ########################### functions ########################################## -def check_path(path): - if not os.path.exists(path): +def check_path(path, exit=True): + if not os.path.exists(path) and exit == True: print(path + " does not exist. Exciting ...") sys.exit() + elif not os.path.exists(path) and exit == False: + return 1 + else: + return 0 def check_ref_sepc(species_list, fasta_file): file = open(fasta_file, "r") @@ -233,8 +239,8 @@ def extract_sequence_from_to(name, file, start, end): return out, start, end def augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, augustus_ref_species, ass_name, group, tmp_path, mode): + """Gene prediction with software Augustus for all candidate regions. The resulting AS sequences will be written in a tmp file.""" output = open(candidatesOutFile, "w") - for key in regions: locations = regions[key] counter = 0 @@ -248,7 +254,7 @@ def augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, aug cmd = "augustus --protein=1 --proteinprofile=" + profile_path + " --predictionStart=" + start + " --predictionEnd=" + end + " --species=" + augustus_ref_species + " " + tmp_path + key + ".fasta > " + tmp_path + name + ".gff" #print(cmd) starting_subprocess(cmd, 'silent') - # transfer augustus output to as sequence + # transfer augustus output to AS sequence cmd = "getAnnoFasta.pl --seqfile=" + tmp_path + key + ".fasta " + tmp_path + name + ".gff" starting_subprocess(cmd, mode) # parsing header and sequences @@ -338,8 +344,12 @@ def get_distance_biopython(file, matrix): dm = calculator.get_distance(aln) return dm -def readFasta(candidatesOutFile): - seq_records = SeqIO.parse(candidatesOutFile, "fasta") +def readFasta(fasta): + path = Path(fasta) + if path.exists() == False: + print(str(path) + ' does not exists.') + sys.exit() + seq_records = 
SeqIO.parse(path, "fasta") return seq_records def getSeedInfo(path): @@ -373,7 +383,7 @@ def checkCoOrthologs(candidate_name, best_hit, ref, fdog_ref_species, candidates out.write(str(inSeq[best_hit].seq) + "\n") out.write(">" + ref + "\n") out.write(str(inSeq[ref].seq )+ "\n") - + #print(candidatesOutFile) candidates = readFasta(candidatesOutFile) for record in candidates: if candidate_name in record.id: @@ -385,12 +395,13 @@ def checkCoOrthologs(candidate_name, best_hit, ref, fdog_ref_species, candidates if msaTool == "muscle": if align_fn.get_muscle_version(msaTool) == 'v3': - cmd = "muscle -quiet -in " + output_file + " -out " + aln_file + cmd = "muscle -quiet -in " + output_file + "-out " + aln_file else: - cmd = "muscle -align " + output_file + " -output " + aln_file + cmd = "muscle -align " + output_file + " -output " + aln_file starting_subprocess(cmd, mode) if not os.path.exists(aln_file): - print("Muscle failed for %s. Making MSA with Mafft-linsi." % (candidate_name)) + print('Muscle failed with command: %s'%(cmd)) + print("Muscle failed for file %s. Making MSA with Mafft-linsi." 
% (candidate_name)) cmd = 'mafft --maxiterate 1000 --localpair --anysymbol --quiet ' + output_file + ' > ' + aln_file starting_subprocess(cmd, mode) @@ -649,7 +660,65 @@ def cleanup(tmp, tmp_path): if file.endswith(".fasta"): os.remove(os.path.join(root, file)) -def coorthologs(candidate_names, tmp_path, candidatesFile, fasta, fdog_ref_species, msaTool, matrix, mode='silent'): +def getLocationFromGff(gff_file, name, tool): + #print(name) + if tool == 'metaeuk': + gene_count = int(name.split('_')[-1:][0]) + else: + gene_count = int(name.split('.')[-2].replace('g', '').split('_')[-1:][0]) + counter = 0 + with open(gff_file,'r') as gff: + for line in gff: + if line.startswith('#'): + pass + else: + contig, source, type, start, end, score, strand, phase, att = line.split('\t') + if type == 'gene': + counter += 1 + if counter == gene_count: + position = [contig, int(start), int(end), strand] + #print(position) + return position + +def checkOverlap(position, n=30): + pairs = set() + overlapping = set() + keys = list(position.keys()) + index = 0 + for x in keys: + index +=1 + for i in range(index,len(keys)): + y = keys[i] + if x != y: + if position[y][0] == position[x][0]: + if position[y][3] == position[x][3]: + if position[x][1] < position[y][1] and position[y][1] <= position[x][2]: + len_overlap = position[x][2] - position[y][1] + if len_overlap >= n: + pairs.add((y,x)) + overlapping.add(y) + overlapping.add(x) + elif position[x][1] == position[y][1]: + len_overlap = min(position[x][2],position[y][2]) - position[x][1] + if len_overlap >= n: + pairs.add((y,x)) + overlapping.add(y) + overlapping.add(x) + elif position[x][2] == position[y][2]: + len_overlap = position[x][2] - max((position[x][2],position[y][2])) + if len_overlap >= n: + pairs.add((y,x)) + overlapping.add(y) + overlapping.add(x) + elif position[y][1] < position[x][1] and position[x][1] <= position[y][2]: + len_overlap = position[y][2] - position[x][1] + if len_overlap >= n: + pairs.add((y,x)) + 
overlapping.add(y) + overlapping.add(x) + return pairs, overlapping + +def coorthologs(candidate_names, tmp_path, candidatesFile, fasta, fdog_ref_species, msaTool, matrix, isoforms, gene_prediction, mode='silent'): if len(candidate_names) == 1: return candidate_names @@ -681,12 +750,12 @@ def coorthologs(candidate_names, tmp_path, candidatesFile, fasta, fdog_ref_speci if msaTool == "muscle": if align_fn.get_muscle_version(msaTool) == 'v3': cmd = "muscle -quiet -in %s -out %s" % (out, aln_file) - #print("muscle -quiet -in " + output_file + " -out " + aln_file) else: cmd = "muscle -align %s -output %s" % (out, aln_file) starting_subprocess(cmd, mode) if not os.path.exists(aln_file): - print("Muscle failed for %s. Making MSA with Mafft-linsi." % (aln_file)) + print('Muscle failed with command: %s' % (cmd)) + print("Muscle failed for file %s. Making MSA with Mafft-linsi." % (aln_file)) cmd = 'mafft --maxiterate 1000 --localpair --anysymbol --quiet ' + out + ' > ' + aln_file starting_subprocess(cmd, mode) elif msaTool == "mafft-linsi": @@ -697,21 +766,48 @@ def coorthologs(candidate_names, tmp_path, candidatesFile, fasta, fdog_ref_speci min_dist = 10 min_name = None - + position = {} for name in candidate_names: distance = distances[ref_id , name] + id = name.split('|')[2] + if isoforms == False: + gff_file = tmp_path + '/' + '_'.join(id.split('_')[0:-1]) + '.gff' + position[name] = getLocationFromGff(gff_file, id, gene_prediction) if distance <= min_dist: min_dist = distance min_name = name checked = [min_name] - + pairs, overlapping = checkOverlap(position) + #print(pairs, overlapping) + tested = set() for name in candidate_names: if name == min_name: pass elif distances[min_name , name] <= distances[min_name , ref_id]: - checked.append(name) - + if isoforms == False and name in overlapping and name not in tested: + for pair in pairs: + min_dist = 10 + to_add = '' + if name in pair: + x,y = pair + tested.add(x) + tested.add(y) + distx = distances[x,ref_id] + disty = 
distances[y, ref_id] + if distx <= disty and distx < min_dist: + to_add = x + min_dist = distx + elif disty <= distx and disty < min_dist: + to_add = y + min_dist = disty + if to_add != min_name: + checked.append(to_add) + elif name in tested and isoforms == False: + pass + else: + checked.append(name) + #print(checked) return checked def clean_fas(path, file_type): @@ -733,16 +829,29 @@ def clean_fas(path, file_type): file.write(new_line) file.close() +def run_fas(cmd): + #print(cmd) + process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + while process.poll() is None: + output = process.stdout.readline().decode().split('\n') + error = process.stderr.readline().decode().split('\n') + if error: + for line in error: + line.strip() + if 'error' in line or 'Error' in line: + print ("Error running FAS with %s"%(' '.join(cmd))) + process.terminate() + sys.exit() + return output + def ortholog_search_tblastn(args): - (asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs, gene_prediction, metaeuk_db) = args + (asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs, gene_prediction, metaeuk_db, isoforms) = args output = [] - cmd = 'mkdir ' + out + '/tmp/' + asName + asNamePath = asName.replace('@', '_') + cmd = 'mkdir ' + out + '/tmp/' + asNamePath starting_subprocess(cmd, 'silent') - tmp_path = out + "tmp/" + asName + "/" + tmp_path = out + "tmp/" + asNamePath + "/" candidatesOutFile = tmp_path + group + ".candidates.fa" - #orthologsOutFile = out + "/" + group + ".extended.fa" - fasOutFile = out + "/" + group - #mappingFile = out + "/tmp/" + group + ".mapping.txt" 
output.append("Searching in species " + asName + "\n") assembly_path = assemblyDir + "/" + asName + "/" + asName + ".fa" @@ -810,7 +919,7 @@ def ortholog_search_tblastn(args): output.append("No ortholog fulfilled the reciprocity criteria for species %s.\n" % asName) return [], candidatesOutFile, output else: - reciprocal_sequences = coorthologs(reciprocal_sequences, tmp_path, candidatesOutFile, fasta_path, fdog_ref_species, msaTool, matrix) + reciprocal_sequences = coorthologs(reciprocal_sequences, tmp_path, candidatesOutFile, fasta_path, fdog_ref_species, msaTool, matrix, isoforms, gene_prediction) return reciprocal_sequences, candidatesOutFile, output @@ -823,8 +932,7 @@ def blockProfiles(core_path, group, mode, out, msaTool): check_path(fasta_path) if msaTool == "muscle": if align_fn.get_muscle_version(msaTool) == 'v3': - cmd= "muscle -quiet -in " + fasta_path + " -out " + msa_path - #print("muscle -quiet -in " + output_file + " -out " + aln_file) + print("muscle -quiet -in " + output_file + " -out " + aln_file) else: cmd = "muscle -quiet -align " + fasta_path + " -output " + msa_path elif msaTool == "mafft-linsi": @@ -868,25 +976,68 @@ def consensusSequence(core_path, group, mode, out): return consensus_path -class Logger(object): - def __init__(self, file): - self.file = file - self.terminal = sys.stdout - self.log = self.file +def createGff(ortholog_sequences, out_folder, tool): + #print(ortholog_sequences) + #print(out_folder) + gff_folder = out_folder + "/gff/" + os.system('mkdir %s >/dev/null 2>&1' %(gff_folder)) + for s in ortholog_sequences: + genes = s[0] + #print(genes) + data = [] + if genes != []: + gff_file_sp = gff_folder + '/' + genes[0].split('|')[1] + '.gff' + for gene in genes: + group, species, gene = gene.split('|') + #print(group, species, gene) + region = '_'.join(gene.split('_')[0:-1]) + if tool == 'metaeuk': + gene_count = int(gene.split('_')[-1:][0]) + else: + gene_count = int(gene.split('.')[-2].replace('g', '').split('_')[-1:][0]) + 
#print(region, gene_count) + gff_file_gene = "%s/tmp/%s/%s.gff" %(out_folder, species.replace('@', '_'), region) + #print(gff_file_gene) + with open(gff_file_gene, 'r') as gff: + counter = 0 + for line in gff: + if line.startswith('#'): + pass + else: + line=line.rstrip() + contig, source, type, start, end, score, strand, phase, att = line.split('\t') + if type == 'gene': + counter += 1 + if counter == gene_count: + if source == 'AUGUSTUS': + att = att.replace('g' + str(gene_count), '_'.join(gene.split('.')[:-1])) + att = att.replace('"', '') + elif source == 'MetaEuk': + att = 'gene_id ' + gene + '; ' + att + data.append([contig, source, type, int(start), int(end), score, strand, phase, att]) + else: + continue - def write(self, message): - self.terminal.write(message) - self.log.write(message) + df = pd.DataFrame(data, columns=['contig', 'source', 'type', 'start', 'end', 'score', 'starnd', 'phase', 'att']) + #print(df) + df.sort_values(by=['contig', 'start']) + df.to_csv(gff_file_sp,sep='\t' , index=False, header=None) - def flush(self): - pass +def getAugustusRefSpec(mapping_augustus): + dict = {} + with open(mapping_augustus,'r') as file: + for line in file: + line = line.rstrip() + assembly, id = line.split('\t') + dict[assembly] = id + return dict def main(): #################### handle user input ##################################### start = time.time() - version = '0.1.4' + version = '0.1.5' ################### initialize parser ###################################### parser = argparse.ArgumentParser(description='You are running fdog.assembly version ' + str(version) + '.') parser.add_argument('--version', action='version', version=str(version)) @@ -898,31 +1049,33 @@ def main(): ################## optional arguments ###################################### optional = parser.add_argument_group('Optional arguments') optional.add_argument('--avIntron', help='average intron length of the assembly species in bp (default: 50000)',action='store', default=50000, 
type=int) - optional.add_argument('--lengthExtension', help='length extension of the candidate regions in bp (default:5000)', action='store', default=5000, type=int) - optional.add_argument('--assemblyPath', help='Path for the assembly directory', action='store', default='') + optional.add_argument('--lengthExtension', help='length extension of the candidate regions in bp (default:20000)', action='store', default=20000, type=int) + optional.add_argument('--assemblyPath', help='Path for the assembly directory, (default dataPath)', action='store', default='') optional.add_argument('--tmp', help='tmp files will not be deleted', action='store_true', default = False) optional.add_argument('--out', help='Output directory', action='store', default='') - optional.add_argument('--dataPath', help='data directory', action='store', default='') - optional.add_argument('--coregroupPath', help='core_ortholog directory', action='store', default='') - optional.add_argument('--searchTool', help='Choose between blast and diamond as alignemnt search tool(default:blast)', action='store', choices=['blast', 'diamond'], default='blast') + optional.add_argument('--dataPath', help='fDOG data directory containing searchTaxa_dir, coreTaxa_dir and annotation_dir', action='store', default='') + optional.add_argument('--coregroupPath', help='core_ortholog directory containing ortholog groups of gene of interest', action='store', default='') + #optional.add_argument('--searchTool', help='Choose between blast and diamond as alignment search tool(default:blast)', action='store', choices=['blast', 'diamond'], default='blast') optional.add_argument('--evalBlast', help='E-value cut-off for the Blast search. 
(default: 0.00001)', action='store', default=0.00001, type=float) optional.add_argument('--strict', help='An ortholog is only then accepted when the reciprocity is fulfilled for each sequence in the core set', action='store_true', default=False) - optional.add_argument('--msaTool', help='Choose between mafft-linsi or muscle for the multiple sequence alignment. DEFAULT: muscle', choices=['mafft-linsi', 'muscle'], action='store', default='muscle') + optional.add_argument('--msaTool', help='Choose between mafft-linsi or muscle for the multiple sequence alignment. (default:muscle)', choices=['mafft-linsi', 'muscle'], action='store', default='muscle') optional.add_argument('--checkCoorthologsRef', help='During the final ortholog search, accept an ortholog also when its best hit in the reverse search is not the core ortholog itself, but a co-ortholog of it', action='store_true', default=False) - optional.add_argument('--scoringmatrix', help='Choose a scoring matrix for the distance criteria used by the option --checkCoorthologsRef. DEFAULT: blosum62', choices=['identity', 'blastn', 'trans', 'benner6', 'benner22', 'benner74', 'blosum100', 'blosum30', 'blosum35', 'blosum40', 'blosum45', 'blosum50', 'blosum55', 'blosum60', 'blosum62', 'blosum65', 'blosum70', 'blosum75', 'blosum80', 'blosum85', 'blosum90', 'blosum95', 'feng', 'fitch', 'genetic', 'gonnet', 'grant', 'ident', 'johnson', 'levin', 'mclach', 'miyata', 'nwsgappep', 'pam120', 'pam180', 'pam250', 'pam30', 'pam300', 'pam60', 'pam90', 'rao', 'risler', 'structure'], action='store', default='blosum62') + optional.add_argument('--scoringmatrix', help='Choose a scoring matrix for the distance criteria used by the option --checkCoorthologsRef. 
(default: blosum62)', choices=['identity', 'blastn', 'trans', 'benner6', 'benner22', 'benner74', 'blosum100', 'blosum30', 'blosum35', 'blosum40', 'blosum45', 'blosum50', 'blosum55', 'blosum60', 'blosum62', 'blosum65', 'blosum70', 'blosum75', 'blosum80', 'blosum85', 'blosum90', 'blosum95', 'feng', 'fitch', 'genetic', 'gonnet', 'grant', 'ident', 'johnson', 'levin', 'mclach', 'miyata', 'nwsgappep', 'pam120', 'pam180', 'pam250', 'pam30', 'pam300', 'pam60', 'pam90', 'rao', 'risler', 'structure'], action='store', default='blosum62') optional.add_argument('--coreTaxa', help='List of core taxa used during --strict', action='store', nargs="+", default=[]) #optional.add_argument('--filter', help='Switch the low complexity filter for the blast search on.', action='store', default='no') - optional.add_argument('--fasoff', help='Turn OFF FAS support', action='store_true', default=False) + optional.add_argument('--fasoff', help='Turn off FAS support', action='store_true', default=False) optional.add_argument('--pathFile', help='Config file contains paths to data folder (in yaml format)', action='store', default='') - optional.add_argument('--searchTaxa', help='List of Taxa to search in', action='store', nargs="+", default=[]) - optional.add_argument('--silent', help='Output will only be written into the log file', action='store_true', default=False) - optional.add_argument('--debug', help='Stdout and Stderr from fdog.assembly and every used tool will be printed', action='store_true', default=False) + optional.add_argument('--searchTaxa', help='List of Taxa to search in, (default: all species located in assembly_dir)', action='store', nargs="+", default=[]) + optional.add_argument('--debug', help='Stdout and Stderr from fdog.assembly and every used tool will be printed, caution: using --parallel can result in messy output', action='store_true', default=False) optional.add_argument('--force', help='Overwrite existing output files', action='store_true', default=False) - 
optional.add_argument('--append', help='Append the output to existing output files', action='store_true', default=False) + optional.add_argument('--append', help='Append the output to existing output files, caution: reference species must be identical', action='store_true', default=False) optional.add_argument('--parallel', help= 'The ortholog search of multiple species will be done in parallel', action='store_true', default=False) optional.add_argument('--augustus', help= 'Gene prediction is done by using the tool Augustus PPX', action='store_true', default=False) - optional.add_argument('--augustusRefSpec', help='augustus reference species', action='store', default='') - optional.add_argument('--metaeukDb', help='path to metaeuk reference database', action='store', default='') + optional.add_argument('--augustusRefSpec', help='Augustus reference species identifier (use command: augustus --species=help to get precomputed augustus gene models)', action='store', default='') + optional.add_argument('--augustusRefSpecFile', help='Mapping file tab seperated containing Assembly Names and augustus reference species that should be used', action='store', default='') + optional.add_argument('--metaeukDb', help='Path to MetaEuk reference database', action='store', default='') + optional.add_argument('--isoforms', help='All Isoforms of a gene passing the ortholog verification will be included in the output', action='store_true', default=False) + optional.add_argument('--gff', help='GFF files will be included in output', action='store_true', default=False) args = parser.parse_args() # required @@ -941,31 +1094,39 @@ def main(): #others average_intron_length = args.avIntron length_extension = args.lengthExtension - searchTool = args.searchTool + #searchTool = args.searchTool + searchTool = 'blast' evalue = args.evalBlast msaTool = args.msaTool matrix = args.scoringmatrix taxa = args.coreTaxa fasoff = args.fasoff searchTaxa = args.searchTaxa - silent = args.silent debug = 
args.debug force = args.force append = args.append parallel = args.parallel augustus_ref_species = args.augustusRefSpec + mapping_augustus = args.augustusRefSpecFile metaeuk_db = args.metaeukDb + isoforms = args.isoforms + gff = args.gff #gene prediction tool augustus = args.augustus if augustus == True: - - if augustus_ref_species == '': + if augustus_ref_species == '' and mapping_augustus == '': print("Augustus reference species is required when using Augustus as gene prediction tool") return 1 gene_prediction = "augustus" + if mapping_augustus != '': + check_path(mapping_augustus) + aug_ref_dict = getAugustusRefSpec(mapping_augustus) else: gene_prediction = "metaeuk" + if metaeuk_db == '': + print("MetaEuk DB is required when using MetaEuk as gene prediction tool") + return 1 # output modes if debug == True and silent == True: @@ -974,8 +1135,6 @@ def main(): else: if debug == True: mode = 'debug' - elif silent == True: - mode = 'silent' else: mode = 'normal' @@ -1019,6 +1178,8 @@ def main(): if core_path == '': core_path = out + '/core_orthologs/' + if check_path(core_path, False) == 1: + core_path = dataPath + '/core_orthologs/' else: if not core_path.endswith('/'): core_path = core_path + '/' @@ -1031,12 +1192,6 @@ def main(): if metaeuk_db != '': check_path(metaeuk_db) - - try: - f = open(out + "/fdog.log", "a+") - except FileNotFoundError: - f = open(out + "/fdog.log", "w") - ################## How to handle std output and std error ################## if mode == 'silent': @@ -1044,7 +1199,6 @@ def main(): sys.stdout = f else: pass - #sys.stdout = Logger(f) ########################### other variables ################################ if searchTaxa == []: @@ -1107,35 +1261,39 @@ def main(): ortholog_sequences = [] time_ortholog_start = time.time() - + if parallel == True: ##################### parallel computation ############################# calls = [] cpus = mp.cpu_count() pool = mp.Pool(cpus) for asName in assembly_names: - calls.append([asName, out, 
assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs, gene_prediction, metaeuk_db]) + if mapping_augustus == '': + calls.append([asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs, gene_prediction, metaeuk_db, isoforms]) + else: + try: + calls.append([asName, out, assemblyDir, consensus_path, aug_ref_dict[asName], group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs, gene_prediction, metaeuk_db, isoforms]) + except KeyError: + print("%s is not included in Augustus reference species mapping file. 
%s will be skipped" %(asName, asName)) - #results = (pool.imap_unordered(ortholog_search_tblastn, calls)) - #pool.close() - #pool.join() - print("Searching for orthologs ...", flush=True) for i in tqdm(pool.imap_unordered(ortholog_search_tblastn, calls),total=len(calls)): ortholog_sequences.append([i[0], i[1]]) if mode == 'debug': for k in i[2]: print(k) - #for i in results: - #ortholog_sequences.append([i[0], i[1]]) - #for k in i[2]: - #print(k) print("\t ...finished \n", flush=True) - + else: ###################### computation species wise ################ for asName in tqdm(assembly_names): - args = [asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs, gene_prediction, metaeuk_db] + if mapping_augustus == '': + args = [asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs, gene_prediction, metaeuk_db, isoforms] + else: + try: + args = [asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs, gene_prediction, metaeuk_db, isoforms] + except KeyError: + print("%s is not included in Augustus reference species mapping file. 
%s will be skipped" % (asName, asName)) reciprocal_sequences, candidatesOutFile, output_ortholog_search = ortholog_search_tblastn(args) ortholog_sequences.append([reciprocal_sequences, candidatesOutFile]) if mode == 'debug': @@ -1155,21 +1313,24 @@ def main(): else: addRef(orthologsOutFile, fasta_path, taxa) addSeq(orthologsOutFile, ortholog_sequences) + + if gff == True: + createGff(ortholog_sequences, out, gene_prediction) mappingFile = out + "/tmp/" + group + ".mapping.txt" if fasoff == False: fas = time.time() - print("Calculating FAS scores ...") + print("Calculating FAS scores ...", flush=True) tmp_path = out + '/tmp/' fas_seed_id = createFasInput(orthologsOutFile, mappingFile) - cmd = 'fas.run --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --tsv --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group + cmd = ['fas.run', '--seed', fasta_path , '--query' , orthologsOutFile , '--annotation_dir' , tmp_path + 'anno_dir' ,'--bidirectional', '--tsv', '--phyloprofile', mappingFile, '--seed_id', fas_seed_id, '--out_dir', out, '--out_name', group] #print(cmd) - starting_subprocess(cmd, 'silent') + fas_out = run_fas(cmd) clean_fas(out + group + "_forward.domains", 'domains') clean_fas(out + group + "_reverse.domains", 'domains') clean_fas(out + group + ".phyloprofile", 'phyloprofile') - print("\t ...finished \n") + print("\t ...finished \n", flush=True) end = time.time() time_fas = end - fas else: @@ -1181,8 +1342,6 @@ def main(): print("fDOG-Assembly finished completely in " + str(end-start) + "seconds.") print("Group preparation: %s \t Ortholog search: %s \t FAS: %s \n" % (str(time_group), str(time_ortholog), str(time_fas))) sys.stdout = sys.__stdout__ - - f.close() cleanup(tmp, tmp_folder) if __name__ == '__main__': diff --git a/fdog/libs/corecompile.py b/fdog/libs/corecompile.py index d16a5a1..bd57e07 100644 --- a/fdog/libs/corecompile.py +++ 
b/fdog/libs/corecompile.py @@ -226,6 +226,8 @@ def compile_core(args): leaves.reverse() flag_node = 0 for leaf in leaves: + if not leaf in tax_ids: + continue if flag_node == 1: break if not leaf == refspec_id and \ diff --git a/fdog/libs/hmm.py b/fdog/libs/hmm.py index 2b18d3d..88ea070 100644 --- a/fdog/libs/hmm.py +++ b/fdog/libs/hmm.py @@ -35,7 +35,7 @@ def create_hmm(aln_file, out_file): sys.exit('ERROR: Error running hmmbuild %s' % hmmbuild_cmd) -def sort_hmm_hits(hmm_hits, hitLimit = 10, scoreCutoff = 10, debug = False): +def sort_hmm_hits(hmm_hits, hmm_score_type = 'domain', hitLimit = 10, scoreCutoff = 10, debug = False): """ Sort HMM hits Keep only n hits (n =< hitLimit), and hits that are not less than best_hit_domain_score * (100 - scoreCutoff) / 100 @@ -45,27 +45,48 @@ def sort_hmm_hits(hmm_hits, hitLimit = 10, scoreCutoff = 10, debug = False): cutoff = '' score_dict = {} ori_hits = {} - for hit in hmm_hits: - ori_hits[hit.name.decode('ASCII')] = len(hit.domains) - best_domain_score = -9999 #hit.domains[0].score - best_domain_hit = '' - if len(hit.domains) > 0: - # get domain with best score for this hit - for i in hit.domains: - if i.score > best_domain_score: - best_domain_score = i.score - best_domain_hit = i.hit.name.decode('ASCII') - # add hit to score_dict with increasing domain score - if best_domain_score > best_score: - best_score = best_domain_score + best_hit_score = -9999 + if hmm_score_type == 'domain': + for hit in hmm_hits: + ori_hits[hit.name.decode('ASCII')] = len(hit.domains) + best_domain_score = -9999 #hit.domains[0].score + best_domain_hit = '' + if len(hit.domains) > 0: + # get domain with best score for this hit + for i in hit.domains: + if i.score > best_domain_score: + best_domain_score = i.score + best_domain_hit = i.hit.name.decode('ASCII') + # add hit to score_dict with increasing domain score + if best_domain_score > best_score: + best_score = best_domain_score + cutoff = best_score/100*(100-scoreCutoff) + if best_score < 0: 
+ cutoff = best_score/100*(100+scoreCutoff) + if best_domain_score >= cutoff: + if best_domain_score not in score_dict: + score_dict[best_domain_score] = [best_domain_hit] + else: + score_dict[best_domain_score].append(best_domain_hit) + else: + for hit in hmm_hits: + ori_hits[hit.name.decode('ASCII')] = hit.score + if hit.score > best_hit_score: + # get hit with best score + best_hit_score = hit.score + best_hit = hit.name.decode('ASCII') + # add to score_dict + if best_hit_score > best_score: + best_score = best_hit_score cutoff = best_score/100*(100-scoreCutoff) if best_score < 0: cutoff = best_score/100*(100+scoreCutoff) - if best_domain_score >= cutoff: - if best_domain_score not in score_dict: - score_dict[best_domain_score] = [best_domain_hit] + if hit.score >= cutoff: + if hit.score not in score_dict: + score_dict[hit.score] = [hit.name.decode('ASCII')] else: - score_dict[best_domain_score].append(best_domain_hit) + score_dict[hit.score].append(hit.name.decode('ASCII')) + output_fn.print_debug(debug, 'All HMM hits', ori_hits) hmm_cand = {} n = 0 @@ -84,7 +105,7 @@ def sort_hmm_hits(hmm_hits, hitLimit = 10, scoreCutoff = 10, debug = False): def do_hmmsearch( hmm_file, search_fa, evalHmmer = 0.00001, scoreCutoff = 10, - hitLimit = 10, cpus = os.cpu_count(), debug = False): + hitLimit = 10, hmm_score_type = 'domain', cpus = os.cpu_count(), debug = False): """ Perform hmmsearch for a hmm file vs a multiple fasta file Return a dictionary of hits and their e-value and bit-score Only "top" hits are returned. 
The cutoff is defined by @@ -100,7 +121,7 @@ def do_hmmsearch( for hits in pyhmmer.hmmsearch( hmm_file, sequences, E = evalHmmer, cpus = cpus): if len(hits) > 0: - hmm_hits = sort_hmm_hits(hits, hitLimit, scoreCutoff, debug) + hmm_hits = sort_hmm_hits(hits, hmm_score_type, hitLimit, scoreCutoff, debug) except : sys.exit( 'ERROR: Error running hmmsearch for %s agains %s' diff --git a/fdog/libs/orthosearch.py b/fdog/libs/orthosearch.py index 51a2288..4c06cf4 100644 --- a/fdog/libs/orthosearch.py +++ b/fdog/libs/orthosearch.py @@ -34,7 +34,7 @@ def hamstr(args): (seqName, hmmpath, corepath, searchpath, outpath, refspec, seed_id, search_taxon, - evalHmmer, hitLimit, scoreCutoff, + evalHmmer, hitLimit, hmmScoreType, scoreCutoff, evalBlast, lowComplexityFilter, checkCoorthologsRefOff, rbh, rep, aligner, cpus, debug, silentOff, noCleanup) = args @@ -72,7 +72,7 @@ def hamstr(args): ### (1) Do hmmsearch for query hmm against search taxon fasta hmm_hits = hmm_fn.do_hmmsearch( - hmm_file, search_fa, evalHmmer, scoreCutoff, hitLimit, cpus, debug) + hmm_file, search_fa, evalHmmer, scoreCutoff, hitLimit, hmmScoreType, cpus, debug) output_fn.print_debug(debug, 'Sorted HMM hits', hmm_hits) ### (2) Read fasta file of refspec and search taxon refspec_seqs = fasta_fn.read_fasta(refspec_fa) @@ -83,72 +83,78 @@ def hamstr(args): silentOff, 'WARNING: No HMM hit found!') else: for hmm_hit in hmm_hits: - if not hmm_hit == seed_id: # only if search taxon == refspec - hmm_hit_fa = '%s/hmm_%s_%s_%s.fa' % ( - outpath, seqName, search_taxon, hmm_hit) - with open(hmm_hit_fa, 'w') as hmm_fa_out: - hmm_fa_out.write('>%s\n%s' % (hmm_hit, search_seqs.fetch(hmm_hit))) - blast_xml = blast_fn.do_blastsearch( - hmm_hit_fa, refspec_db, evalBlast = evalBlast, lowComplexityFilter = lowComplexityFilter) - blast_out = blast_fn.parse_blast_xml(blast_xml) - output_fn.print_debug(debug, 'BLAST hits', blast_out) - if noCleanup == False: - os.remove(hmm_hit_fa) - ### (4) check reciprocity - ### (4a) if 
refspec_seq_id == best blast hit - if len(blast_out['hits'].keys()) > 0: + hmm_hit_fa = '%s/hmm_%s_%s_%s.fa' % ( + outpath, seqName, search_taxon, hmm_hit) + with open(hmm_hit_fa, 'w') as hmm_fa_out: + hmm_fa_out.write('>%s\n%s' % (hmm_hit, search_seqs.fetch(hmm_hit))) + blast_xml = blast_fn.do_blastsearch( + hmm_hit_fa, refspec_db, evalBlast = evalBlast, lowComplexityFilter = lowComplexityFilter) + blast_out = blast_fn.parse_blast_xml(blast_xml) + output_fn.print_debug(debug, 'BLAST hits', blast_out) + if noCleanup == False: + os.remove(hmm_hit_fa) + ### (4) check reciprocity + ### (4a) if refspec_seq_id == best blast hit + if len(blast_out['hits'].keys()) > 0: + best_blast_hit = list(blast_out['hits'].keys())[0] + if best_blast_hit == hmm_hit and len(blast_out['hits'].keys()) > 1: best_blast_hit = list(blast_out['hits'].keys())[0] - if best_blast_hit == hmm_hit and len(blast_out['hits'].keys()) > 1: - best_blast_hit = list(blast_out['hits'].keys())[1] - if seed_id == best_blast_hit: - output_fn.print_stdout( - silentOff, - '%s accepted (best blast hit is ref)' % (blast_out['query'])) - ortho_candi[hmm_hit] = search_seqs.fetch(hmm_hit) - continue - else: - ### (4b) else, check for co-ortholog ref - if checkCoorthologsRefOff == False: - aln_fa = '%s/blast_%s_%s_%s_%s_%s.fa' % ( - outpath, seqName, seed_id, search_taxon, - hmm_hit, best_blast_hit) - with open(aln_fa, 'w') as aln_fa_out: - aln_fa_out.write( - '>%s\n%s\n>%s\n%s\n>%s\n%s' % ( - seed_id, refspec_seqs.fetch(seed_id), - hmm_hit, search_seqs.fetch(hmm_hit), - best_blast_hit, refspec_seqs.fetch(best_blast_hit) - ) + if seed_id == best_blast_hit: + output_fn.print_stdout( + silentOff, + '%s accepted (best blast hit is ref)' % (blast_out['query'])) + ortho_candi[hmm_hit] = search_seqs.fetch(hmm_hit) + continue + else: + ### (4b) else, check for co-ortholog ref + if checkCoorthologsRefOff == False: + aln_fa = '%s/blast_%s_%s_%s_%s_%s.fa' % ( + outpath, seqName, seed_id, search_taxon, + hmm_hit, best_blast_hit) 
+ with open(aln_fa, 'w') as aln_fa_out: + aln_fa_out.write( + '>%s\n%s\n>%s\n%s\n>%s\n%s' % ( + seed_id, refspec_seqs.fetch(seed_id), + hmm_hit, search_seqs.fetch(hmm_hit), + best_blast_hit, refspec_seqs.fetch(best_blast_hit) ) - fasta_fn.remove_dup(aln_fa) - aln_seq = align_fn.do_align(aligner, aln_fa) - output_fn.print_debug( - debug, 'Alignment for checking co-ortholog ref', aln_seq) - br_dist = align_fn.calc_Kimura_dist(aln_seq, best_blast_hit, seed_id, debug) - bh_dist = align_fn.calc_Kimura_dist(aln_seq, best_blast_hit, hmm_hit, debug) - output_fn.print_debug( - debug, 'Check if distance blast_vs_ref < blast_vs_hmm', - 'd_br = %s; d_bh = %s' % (br_dist, bh_dist)) - if noCleanup == False: - os.remove(aln_fa) - if br_dist == bh_dist == 0 or br_dist < bh_dist: - output_fn.print_stdout( - silentOff, - '%s accepted (best blast hit is co-ortholog to ref)' - % (blast_out['query']) - ) - ortho_candi[hmm_hit] = search_seqs.fetch(hmm_hit) - continue + ) + fasta_fn.remove_dup(aln_fa) + aln_seq = align_fn.do_align(aligner, aln_fa) + output_fn.print_debug( + debug, 'Alignment for checking co-ortholog ref', aln_seq) + br_dist = align_fn.calc_Kimura_dist(aln_seq, best_blast_hit, seed_id, debug) + bh_dist = align_fn.calc_Kimura_dist(aln_seq, best_blast_hit, hmm_hit, debug) + output_fn.print_debug( + debug, 'Check if distance blast_vs_ref < blast_vs_hmm', + 'd_br = %s; d_bh = %s' % (br_dist, bh_dist)) + if noCleanup == False: + os.remove(aln_fa) + if br_dist == bh_dist == 0 or br_dist < bh_dist: + output_fn.print_stdout( + silentOff, + '%s accepted (best blast hit is co-ortholog to ref)' + % (blast_out['query']) + ) + ortho_candi[hmm_hit] = search_seqs.fetch(hmm_hit) + continue + + # remove seed protein from candidata list + if search_taxon == refspec: + if seed_id in ortho_candi: + ortho_candi.pop(seed_id) ### (5) check co-ortholog if more than 1 HMM hits are accepted if len(ortho_candi) == 0: output_fn.print_stdout( silentOff, 'WARNING: Reciprocity not fulfulled! 
No ortholog found!') else: + output_fn.print_debug( + debug, 'Candidates for checking co-orthologs', ortho_candi.keys()) best_ortho = list(ortho_candi.keys())[0] - if not best_ortho == seed_id: - ortho_final = fasta_fn.add_seq_to_dict( - ortho_final, '%s|%s|%s|1' % (seqName, search_taxon, best_ortho), - ortho_candi[best_ortho]) + ortho_final = fasta_fn.add_seq_to_dict( + ortho_final, '%s|%s|%s|1' % (seqName, search_taxon, best_ortho), + ortho_candi[best_ortho]) + if rep == False: if len(ortho_candi) > 1: aln_co_fa = '%s/coortho_%s_%s.fa' % ( @@ -196,7 +202,7 @@ def run_hamstr(args): (seqName, refspec, pathArgs, orthoArgs, otherArgs) = args (outpath, hmmpath, corepath, searchpath, annopath) = pathArgs (checkCoorthologsRefOff, rbh, rep, evalBlast, lowComplexityFilter, - evalHmmer, hitLimit, scoreCutoff, aligner) = orthoArgs + evalHmmer, hitLimit, hmmScoreType, scoreCutoff, aligner) = orthoArgs (searchTaxa, cpus, debug, silentOff, noCleanup, force, append) = otherArgs hamstr_jobs = [] @@ -219,7 +225,7 @@ def run_hamstr(args): hamstr_jobs.append([ seqName, hmmpath, corepath, searchpath, outpath, refspec, seed_id, search_taxon, - evalHmmer, hitLimit, scoreCutoff, + evalHmmer, hitLimit, hmmScoreType, scoreCutoff, evalBlast, lowComplexityFilter, checkCoorthologsRefOff, rbh, rep, aligner, cpus, debug, silentOff, noCleanup @@ -239,7 +245,7 @@ def run_hamstr(args): hamstr_jobs.append([ seqName, hmmpath, corepath, searchpath, outpath, refspec, seed_id, search_taxon, - evalHmmer, hitLimit, scoreCutoff, + evalHmmer, hitLimit, hmmScoreType, scoreCutoff, evalBlast, lowComplexityFilter, checkCoorthologsRefOff, rbh, rep, aligner, cpus, debug, silentOff, noCleanup diff --git a/fdog/libs/preparation.py b/fdog/libs/preparation.py index bb46719..0308418 100644 --- a/fdog/libs/preparation.py +++ b/fdog/libs/preparation.py @@ -185,6 +185,10 @@ def identify_seed_id(seqFile, refspec, corepath, debug, silentOff): output_fn.print_debug(debug, 'Identify seed ID', 'Input seed ID not found!') # 
otherwise, perform blast search blast_xml = blast_fn.do_blastsearch(seqFile, refspec_db, evalBlast = 0.001) + if not blast_xml: + print(f'ERROR: No blast output!') + print(f'You can check it by running:\nblastp -query {seqFile} -db {corepath}/{refspec}/{refspec} -evalue 0.001 -outfmt 7') + sys.exit() blast_out = blast_fn.parse_blast_xml(blast_xml) if len(blast_out['hits']) < 1: print(f'ERROR: Cannot find seed sequence {blast_out["query"]} in genome of reference species!') diff --git a/fdog/mergeOutput.py b/fdog/mergeOutput.py index 2d5a276..e85ada5 100644 --- a/fdog/mergeOutput.py +++ b/fdog/mergeOutput.py @@ -39,7 +39,7 @@ def main(): version = get_distribution('fdog').version parser = argparse.ArgumentParser(description='You are running fDOG version ' + str(version) + '.') parser.add_argument('-i', '--input', - help='Input directory, where all single output (.extended.fa, .phyloprofile, _forward.domains, _reverse.domains) can be found', + help='Input directory, where all single output (o_g.fa, .phyloprofile, _forward.domains, _reverse.domains) can be found', action='store', default='', required=True) parser.add_argument('-o', '--output', help='Output name', action='store', default='', required=True) args = parser.parse_args() @@ -55,6 +55,7 @@ def main(): domains_0 = None domains_1 = None ex_fasta = None + og_fasta = None lines_seen = set() for infile in ldir(directory): if infile.endswith('.phyloprofile') and not infile == out + '.phyloprofile': @@ -90,6 +91,14 @@ def main(): lines = reader.readlines() for line in lines: ex_fasta_out.write(line) + elif infile.endswith('_og.fa') and not infile == out + '_og.fa': + if not og_fasta: + og_fasta = out + '_og.fa' + og_fasta_out = open(og_fasta, 'w') + with open(directory + '/' + infile, 'r') as reader: + lines = reader.readlines() + for line in lines: + og_fasta_out.write(line) if phyloprofile: phyloprofile_out.close() if domains_0: @@ -98,6 +107,9 @@ def main(): domains_1.close() if ex_fasta: ex_fasta_out.close() + 
if og_fasta: + og_fasta_out.close() + ex_fasta = og_fasta createConfigPP(phyloprofile, domains_0, ex_fasta, directory, out) print('Done! Output files:\n%s/%s.*' % (directory,out)) diff --git a/fdog/runMulti.py b/fdog/runMulti.py index 68a072c..5763fee 100644 --- a/fdog/runMulti.py +++ b/fdog/runMulti.py @@ -221,6 +221,8 @@ def main(): action='store', default=0.0001, type=float) ortho_options.add_argument('--hitLimit', help='number of hits of the initial pHMM based search that should be evaluated via a reverse search. Default: 10', action='store', default=10, type=int) + ortho_options.add_argument('--hmmScoreType', help='Choose type of hmm score (best domain or full sequence score) for validating HMM candidates (NOTE: applied also for the core compilation). Default: domain', + action='store', choices=['domain','sequence'], default='domain') ortho_options.add_argument('--scoreCutoff', help='Define the percent range of the hmms core of the best hit up to which a candidate of the hmmsearch will be subjected for further evaluation. 
Default: 10', action='store', default=10, type=int) @@ -295,6 +297,7 @@ def main(): evalBlast = args.evalBlast evalHmmer = args.evalHmmer hitLimit = args.hitLimit + hmmScoreType = args.hmmScoreType scoreCutoff = args.scoreCutoff # fas arguments @@ -365,7 +368,7 @@ def main(): coreArgs = [minDist, maxDist, coreSize, coreTaxa, distDeviation, alnStrategy, fasOff] orthoCoreArgs = [CorecheckCoorthologsOff, rbh, True, evalBlast/10, - lowComplexityFilter, evalHmmer/10, coreHitLimit, + lowComplexityFilter, evalHmmer/10, coreHitLimit, hmmScoreType, scoreCutoff, aligner] # rep = True; e-value cutoff is 10x more stringent than from ortho search otherCoreArgs = [cpus, debugCore, silentOff, noCleanup, force, append] core_options = [coreArgs, orthoCoreArgs, otherCoreArgs] @@ -417,7 +420,7 @@ def main(): ### do ortholog search orthoArgs = [checkCoorthologsRefOff, rbh, rep, evalBlast, - lowComplexityFilter, evalHmmer, hitLimit, scoreCutoff, aligner] + lowComplexityFilter, evalHmmer, hitLimit, hmmScoreType, scoreCutoff, aligner] otherArgs = [searchTaxa, cpus, debug, silentOff, noCleanup, force, append] ortho_options = [orthoArgs, otherArgs, pathArgs, refspec] ortho_runtime = search_ortholog(ortho_options, seeds, inFol, outpath) diff --git a/fdog/runSingle.py b/fdog/runSingle.py index 8f0de85..ddf5844 100644 --- a/fdog/runSingle.py +++ b/fdog/runSingle.py @@ -93,6 +93,8 @@ def main(): action='store', default=0.0001, type=float) ortho_options.add_argument('--hitLimit', help='number of hits of the initial pHMM based search that should be evaluated via a reverse search. Default: 10', action='store', default=10, type=int) + ortho_options.add_argument('--hmmScoreType', help='Choose type of hmm score (best domain or full sequence score) for validating HMM candidates (NOTE: applied also for the core compilation). 
Default: domain', + action='store', choices=['domain','sequence'], default='domain') ortho_options.add_argument('--scoreCutoff', help='Define the percent range of the hmms core of the best hit up to which a candidate of the hmmsearch will be subjected for further evaluation. Default: 10', action='store', default=10, type=int) @@ -166,6 +168,7 @@ def main(): evalBlast = args.evalBlast evalHmmer = args.evalHmmer hitLimit = args.hitLimit + hmmScoreType = args.hmmScoreType scoreCutoff = args.scoreCutoff # fas arguments @@ -236,7 +239,7 @@ def main(): coreArgs = [minDist, maxDist, coreSize, coreTaxa, distDeviation, alnStrategy, fasOff] orthoCoreArgs = [CorecheckCoorthologsOff, rbh, True, evalBlast/10, - lowComplexityFilter, evalHmmer/10, coreHitLimit, + lowComplexityFilter, evalHmmer/10, coreHitLimit, hmmScoreType, scoreCutoff, aligner] # rep = True; e-value cutoff is 10x more stringent than from ortho search otherCoreArgs = [cpus, debugCore, silentOff, noCleanup, force, append] print('##### COMPILING CORE SET FOR %s #####' % seqName) @@ -275,7 +278,7 @@ def main(): searchTaxa = ','.join(searchTaxa) # do ortholog search orthoArgs = [checkCoorthologsRefOff, rbh, rep, evalBlast, - lowComplexityFilter, evalHmmer, hitLimit, scoreCutoff, aligner] + lowComplexityFilter, evalHmmer, hitLimit, hmmScoreType, scoreCutoff, aligner] otherArgs = [searchTaxa, cpus, debug, silentOff, noCleanup, force, append] hamstr_out = ortho_fn.run_hamstr([seqName, refspec, pathArgs, orthoArgs, otherArgs]) output_fn.write_hamstr(hamstr_out, outpath, seqName, force, append) diff --git a/fdog/setupfDog.py b/fdog/setupfDog.py index 679598f..8ed4224 100644 --- a/fdog/setupfDog.py +++ b/fdog/setupfDog.py @@ -254,7 +254,7 @@ def main(): if check_conda_env() == True: req_file = '%s/data/conda_requirements.yml' % fdogPath print('=> Dependencies in %s' % req_file) - conda_install_cmd = 'conda install -c bioconda --file %s -y' % (req_file) + conda_install_cmd = 'conda install -c bioconda -c conda-forge --file 
%s -y' % (req_file) try: subprocess.call([conda_install_cmd], shell = True) except: diff --git a/setup.py b/setup.py index 79c7325..86a5690 100644 --- a/setup.py +++ b/setup.py @@ -26,7 +26,7 @@ setup( name="fdog", - version="0.1.24", + version="0.1.26", python_requires='>=3.7.0', description="Feature-aware Directed OrtholoG search tool", long_description=long_description, @@ -44,6 +44,7 @@ 'PyYAML', 'pyhmmer', 'pysam', + 'pandas', 'greedyFAS>=1.11.2' ], entry_points={ @@ -58,6 +59,7 @@ "fdog.mergeOutput = fdog.mergeOutput:main", "fdog.uninstall = fdog.removefDog:main", "fdog.assembly = fdog.fDOGassembly:main", + "fdog.addAssembly = fdog.addAssembly:main", "fdog.addCoreGroup = fdog.makeCoreGroupFromFasta:main"], }, license="GPL-3.0", From 05283e009be6d2c877d89b75ed6c96d0d97fb311 Mon Sep 17 00:00:00 2001 From: Hannah Muelbaier <47216555+mueli94@users.noreply.github.com> Date: Mon, 12 Feb 2024 09:56:07 +0100 Subject: [PATCH 206/229] Fdog assembly (#40) --- fdog/fDOGassembly.py | 3 +++ fdog/libs/orthosearch.py | 31 +++++++++++++++++++++++++++++++ fdog/runMulti.py | 4 +++- fdog/runSingle.py | 4 +++- setup.py | 2 +- 5 files changed, 41 insertions(+), 3 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 0766df9..e3ae118 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -988,6 +988,9 @@ def createGff(ortholog_sequences, out_folder, tool): if genes != []: gff_file_sp = gff_folder + '/' + genes[0].split('|')[1] + '.gff' for gene in genes: + if gene == '': + continue + #print(gene.split('|')) group, species, gene = gene.split('|') #print(group, species, gene) region = '_'.join(gene.split('_')[0:-1]) diff --git a/fdog/libs/orthosearch.py b/fdog/libs/orthosearch.py index 4c06cf4..9dbc31a 100644 --- a/fdog/libs/orthosearch.py +++ b/fdog/libs/orthosearch.py @@ -278,3 +278,34 @@ def run_hamstr(args): ### return return({**{seed_id_mod:seed_seq}, **hamstr_out}) + + +def get_search_taxa_ids(searchTaxa, searchpath): + """ Get taxonomy IDs for 
search taxa + Either from searchTaxa_dir, or from user input list (--searchTaxa) + Return dictionary {taxID:@@Ver} + """ + tax_ids = {} + if not searchTaxa == '' and not len(searchTaxa) == 0: + ignored_taxa = [] + if os.path.exists(os.path.abspath(searchTaxa)): + search_taxa = general_fn.read_file(searchTaxa) + else: + search_taxa = searchTaxa.split(',') + + for search_taxon in search_taxa: + if not os.path.exists( + os.path.abspath( + '%s/%s/%s.fa' % (searchpath,search_taxon,search_taxon))): + ignored_taxa.append(search_taxon) + else: + id = search_taxon.split('@')[1] + if not id in tax_ids: + tax_ids[id] = search_taxon + if len(ignored_taxa) > 0: + print( + 'WARNING: %s taxa cannot be found at %s\n%s' + % (len(ignored_taxa), searchpath, ignored_taxa)) + else: + tax_ids = general_fn.get_ids_from_folder(searchpath, 'searchTaxa_dir') + return(tax_ids) diff --git a/fdog/runMulti.py b/fdog/runMulti.py index 5763fee..571411b 100644 --- a/fdog/runMulti.py +++ b/fdog/runMulti.py @@ -350,6 +350,8 @@ def main(): check_fas = fas_fn.check_fas_executable() if check_fas == 0: sys.exit('ERROR: FAS is not executable! You still can use fDOG with --fasOff!') + check_time = time.time() + print('==> Preparation finished in ' + '{:5.3f}s'.format(check_time - begin)) ### START Path(outpath).mkdir(parents=True, exist_ok=True) @@ -401,7 +403,7 @@ def main(): exit('ERROR: Taxon group "%s" invalid!' % group) ### create taxonomy tree from list of search taxa searchTaxa = [] - tax_ids = core_fn.get_core_taxa_ids(coreTaxa, corepath) + tax_ids = ortho_fn.get_search_taxa_ids(searchTaxa, searchpath) for tax_id in tax_ids.keys(): check = tree_fn.check_taxon_group(group_id[group][0], tax_id, ncbi) diff --git a/fdog/runSingle.py b/fdog/runSingle.py index ddf5844..2feee77 100644 --- a/fdog/runSingle.py +++ b/fdog/runSingle.py @@ -225,6 +225,8 @@ def main(): check_fas = fas_fn.check_fas_executable() if check_fas == 0: sys.exit('ERROR: FAS is not executable! 
You still can use fDOG with --fasOff!') + check_time = time.time() + print('==> Preparation finished in ' + '{:5.3f}s'.format(check_time - begin)) ##### Identify seed ID from refspec genome if reuseCore: @@ -265,7 +267,7 @@ def main(): exit('ERROR: Taxon group "%s" invalid!' % group) ### create taxonomy tree from list of search taxa searchTaxa = [] - tax_ids = core_fn.get_core_taxa_ids(coreTaxa, corepath) + tax_ids = ortho_fn.get_search_taxa_ids(searchTaxa, searchpath) for tax_id in tax_ids.keys(): check = tree_fn.check_taxon_group(group_id[group][0], tax_id, ncbi) diff --git a/setup.py b/setup.py index 86a5690..84c0438 100644 --- a/setup.py +++ b/setup.py @@ -26,7 +26,7 @@ setup( name="fdog", - version="0.1.26", + version="0.1.27", python_requires='>=3.7.0', description="Feature-aware Directed OrtholoG search tool", long_description=long_description, From 08510c06daee41f953b1d7f00de84137fe6181b5 Mon Sep 17 00:00:00 2001 From: Hannah Muelbaier <47216555+mueli94@users.noreply.github.com> Date: Mon, 4 Mar 2024 14:06:34 +0100 Subject: [PATCH 207/229] Fdog assembly (#41) fDOG-Assembly bug fix during distance calculation: AS U will be scored as C --- fdog/fDOGassembly.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index e3ae118..cfe3e77 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -291,7 +291,7 @@ def metaeuk_single(regions, candidatesOutFile, length_extension, ass_name, group file, start, end = extract_sequence_from_to(tmp_path + name, tmp_path + key + ".fasta", start, end) region.write(file + "\t" + str(start) + "\t" + str(end) + "\n") #metaeuk call - cmd = "metaeuk easy-predict " + file + " " + db + " " + tmp_path + name + " " + tmp_path + "/metaeuk --min-exon-aa 5 --max-overlap 5 --min-intron 1 --overlap 1" + cmd = "metaeuk easy-predict " + file + " " + db + " " + tmp_path + name + " " + tmp_path + "/metaeuk --min-exon-aa 5 --max-overlap 5 --min-intron 1 --overlap 1 
--remove-tmp-files" #print(cmd) # other parameteres used by BUSCO with metazoa set--max-intron 130000 --max-seq-len 160000 --min-exon-aa 5 --max-overlap 5 --min-intron 1 --overlap 1 starting_subprocess(cmd, mode) @@ -340,8 +340,16 @@ def searching_for_db(assembly_path): def get_distance_biopython(file, matrix): #print(file) aln = AlignIO.read(open(file), 'fasta') - calculator = DistanceCalculator(matrix) - dm = calculator.get_distance(aln) + try: + calculator = DistanceCalculator(matrix) + dm = calculator.get_distance(aln) + except ValueError: + #print('The amino acid U is scored as C during distance calculation for file %s'%(file)) + for record in aln: + new_seq = record.seq.replace('U', 'C') + record.seq = new_seq + calculator = DistanceCalculator(matrix) + dm = calculator.get_distance(aln) return dm def readFasta(fasta): From 60ae899fe228e83a616fb8593af72d15d8a888ea Mon Sep 17 00:00:00 2001 From: Hannah Muelbaier <47216555+mueli94@users.noreply.github.com> Date: Mon, 4 Mar 2024 14:48:19 +0100 Subject: [PATCH 208/229] Fdog assembly (#42) Update fDOG version Bug fix addAssembly user output --------- Co-authored-by: trvinh --- fdog/addAssembly.py | 8 ++++---- fdog/libs/alignment.py | 4 ++++ fdog/libs/orthosearch.py | 16 ++++++++-------- setup.py | 2 +- 4 files changed, 17 insertions(+), 13 deletions(-) diff --git a/fdog/addAssembly.py b/fdog/addAssembly.py index b0e901d..1e6b374 100644 --- a/fdog/addAssembly.py +++ b/fdog/addAssembly.py @@ -39,7 +39,7 @@ def parse_file(path): def main(): #################### handle user input ##################################### - version = '0.0.1' + version = '0.0.2' ################### initialize parser ###################################### parser = argparse.ArgumentParser(description='You are running fdog.addAssembly version ' + str(version) + '.') ################## required arguments ###################################### @@ -86,11 +86,11 @@ def main(): if ln == False: assembly_folder = out_folder + name os.system('mkdir %s 
>/dev/null 2>&1' % (assembly_folder)) - os.system("cp %fa %s/%s.fa" %(fa, assembly_folder, name)) + os.system("cp %s %s/%s.fa" %(fa, assembly_folder, name)) else: assembly_folder = out_folder + name os.system('mkdir %s >/dev/null 2>&1' % (assembly_folder)) - os.system("ln %fa %s/%s.fa" %(fa, assembly_folder, name)) + os.system("ln %s %s/%s.fa" %(fa, assembly_folder, name)) else: print("%s Fasta format not valid or header includes |"%(fa)) @@ -102,7 +102,7 @@ def main(): if ln == False: assembly_folder = out_folder + name os.system('mkdir %s >/dev/null 2>&1' % (assembly_folder)) - os.system("cp %s/%fa %s/%s.fa" %(fasta, fa, assembly_folder, name)) + os.system("cp %s/%s %s/%s.fa" %(fasta, fa, assembly_folder, name)) else: assembly_folder = out_folder + name os.system('mkdir %s >/dev/null 2>&1' % (assembly_folder)) diff --git a/fdog/libs/alignment.py b/fdog/libs/alignment.py index 0704bbf..fd27270 100644 --- a/fdog/libs/alignment.py +++ b/fdog/libs/alignment.py @@ -143,6 +143,10 @@ def calc_aln_score(fa1, fa2, aln_strategy = 'local', debugCore = False): fdog_path = os.path.realpath(__file__).replace('/libs/alignment.py','') fa1_filename = fa1.split("/")[-1] fa2_filename = fa2.split("/")[-1] + if os.path.exists(fa1_filename): + if fa2_filename == fa1_filename: + fa2_filename = f'{fa2_filename}.core' + fa1_filename = f'{fa1_filename}.core' os.symlink(fa1, fa1_filename) if not fa2_filename == fa1_filename: os.symlink(fa2, fa2_filename) diff --git a/fdog/libs/orthosearch.py b/fdog/libs/orthosearch.py index 9dbc31a..68b64ae 100644 --- a/fdog/libs/orthosearch.py +++ b/fdog/libs/orthosearch.py @@ -86,7 +86,7 @@ def hamstr(args): hmm_hit_fa = '%s/hmm_%s_%s_%s.fa' % ( outpath, seqName, search_taxon, hmm_hit) with open(hmm_hit_fa, 'w') as hmm_fa_out: - hmm_fa_out.write('>%s\n%s' % (hmm_hit, search_seqs.fetch(hmm_hit))) + hmm_fa_out.write('>%s_%s\n%s' % (search_taxon, hmm_hit, search_seqs.fetch(hmm_hit))) blast_xml = blast_fn.do_blastsearch( hmm_hit_fa, refspec_db, evalBlast = 
evalBlast, lowComplexityFilter = lowComplexityFilter) blast_out = blast_fn.parse_blast_xml(blast_xml) @@ -113,9 +113,9 @@ def hamstr(args): hmm_hit, best_blast_hit) with open(aln_fa, 'w') as aln_fa_out: aln_fa_out.write( - '>%s\n%s\n>%s\n%s\n>%s\n%s' % ( + '>%s\n%s\n>%s_%s\n%s\n>%s\n%s' % ( seed_id, refspec_seqs.fetch(seed_id), - hmm_hit, search_seqs.fetch(hmm_hit), + search_taxon, hmm_hit, search_seqs.fetch(hmm_hit), best_blast_hit, refspec_seqs.fetch(best_blast_hit) ) ) @@ -124,7 +124,7 @@ def hamstr(args): output_fn.print_debug( debug, 'Alignment for checking co-ortholog ref', aln_seq) br_dist = align_fn.calc_Kimura_dist(aln_seq, best_blast_hit, seed_id, debug) - bh_dist = align_fn.calc_Kimura_dist(aln_seq, best_blast_hit, hmm_hit, debug) + bh_dist = align_fn.calc_Kimura_dist(aln_seq, best_blast_hit, f'{search_taxon}_{hmm_hit}', debug) output_fn.print_debug( debug, 'Check if distance blast_vs_ref < blast_vs_hmm', 'd_br = %s; d_bh = %s' % (br_dist, bh_dist)) @@ -139,7 +139,7 @@ def hamstr(args): ortho_candi[hmm_hit] = search_seqs.fetch(hmm_hit) continue - # remove seed protein from candidata list + # remove seed protein from candidate list if search_taxon == refspec: if seed_id in ortho_candi: ortho_candi.pop(seed_id) @@ -164,18 +164,18 @@ def hamstr(args): (seed_id, refspec_seqs.fetch(seed_id))) for cand in ortho_candi: aln_co_fa_out.write(('>%s\n%s\n') % - (cand, ortho_candi[cand])) + (f'{search_taxon}_{cand}', ortho_candi[cand])) aln_co_seq = align_fn.do_align(aligner, aln_co_fa) output_fn.print_debug( debug, 'Alignment for checking co-orthologs', aln_co_seq) if noCleanup == False: os.remove(aln_co_fa) best_dist = align_fn.calc_Kimura_dist( - aln_co_seq, seed_id, best_ortho, debug) + aln_co_seq, seed_id, f'{search_taxon}_{best_ortho}', debug) for cand in ortho_candi: if not cand == best_ortho: candi_dist = align_fn.calc_Kimura_dist( - aln_co_seq, best_ortho, cand, debug) + aln_co_seq, f'{search_taxon}_{best_ortho}', f'{search_taxon}_{cand}', debug) 
output_fn.print_debug( debug, 'Check if distance bestHmm_vs_ref > ' diff --git a/setup.py b/setup.py index 84c0438..34edadd 100644 --- a/setup.py +++ b/setup.py @@ -26,7 +26,7 @@ setup( name="fdog", - version="0.1.27", + version="0.1.28", python_requires='>=3.7.0', description="Feature-aware Directed OrtholoG search tool", long_description=long_description, From c38687e927c0bdc9a9db4442cf2e17e9cb3f18f0 Mon Sep 17 00:00:00 2001 From: Hannah Muelbaier <47216555+mueli94@users.noreply.github.com> Date: Thu, 13 Jun 2024 11:29:42 +0200 Subject: [PATCH 209/229] Update of some functions to improve I/O operations (#44) --- fdog/fDOGassembly.py | 84 ++++++++++++++++++++++++++++++-------------- 1 file changed, 57 insertions(+), 27 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index cfe3e77..a84a388 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -44,15 +44,15 @@ def check_path(path, exit=True): else: return 0 -def check_ref_sepc(species_list, fasta_file): - file = open(fasta_file, "r") - lines = file.readlines() - species_file = [] - - for line in lines: - if line[0] == ">": - species = line.split("|")[1] - species_file.append(species) +def check_ref_spec(species_list, fasta_file): + """ Checks if reference species is part of the input ortholog group + """ + species_file = {} + with open(fasta_file,"r") as lines: + for line in lines: + if line[0] == ">": + species = line.split("|")[1] + species_file.add(species) for species in species_list: if species in species_file: return species @@ -241,6 +241,8 @@ def extract_sequence_from_to(name, file, start, end): def augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, augustus_ref_species, ass_name, group, tmp_path, mode): """Gene prediction with software Augustus for all candidate regions. 
The resulting AS sequences will be written in a tmp file.""" output = open(candidatesOutFile, "w") + region = open(candidatesOutFile.replace(".candidates.fa", ".regions.txt"), "w") + region.write("Contig/scaffold" + "\t" + "start" + "\t" + "end" + "\n") for key in regions: locations = regions[key] counter = 0 @@ -251,12 +253,17 @@ def augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, aug end = str(i[1] + length_extension) name = key + "_" + str(counter) # augutus call - cmd = "augustus --protein=1 --proteinprofile=" + profile_path + " --predictionStart=" + start + " --predictionEnd=" + end + " --species=" + augustus_ref_species + " " + tmp_path + key + ".fasta > " + tmp_path + name + ".gff" + cmd = "augustus --protein=1 --gff3=on --proteinprofile=" + profile_path + " --predictionStart=" + start + " --predictionEnd=" + end + " --species=" + augustus_ref_species + " " + tmp_path + key + ".fasta > " + tmp_path + name + ".gff" #print(cmd) starting_subprocess(cmd, 'silent') # transfer augustus output to AS sequence + #print(tmp_path) + #print(key) cmd = "getAnnoFasta.pl --seqfile=" + tmp_path + key + ".fasta " + tmp_path + name + ".gff" + #print(cmd) starting_subprocess(cmd, mode) + #write region in region file + region.write(key + "\t" + str(start) + "\t" + str(end) + "\n") # parsing header and sequences try: sequence_file = open(tmp_path + name + ".aa", "r") @@ -273,6 +280,7 @@ def augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, aug pass #print("No gene found in region with ID" + name + " in species " + ass_name + " , continuing with next region") output.close() + region.close() def metaeuk_single(regions, candidatesOutFile, length_extension, ass_name, group, tmp_path, mode, db): output = open(candidatesOutFile, "w") @@ -290,9 +298,10 @@ def metaeuk_single(regions, candidatesOutFile, length_extension, ass_name, group name = key + "_" + str(counter) file, start, end = extract_sequence_from_to(tmp_path + name, tmp_path + 
key + ".fasta", start, end) region.write(file + "\t" + str(start) + "\t" + str(end) + "\n") - #metaeuk call - cmd = "metaeuk easy-predict " + file + " " + db + " " + tmp_path + name + " " + tmp_path + "/metaeuk --min-exon-aa 5 --max-overlap 5 --min-intron 1 --overlap 1 --remove-tmp-files" + #metaeuk call sensitive + #cmd = "metaeuk easy-predict " + file + " " + db + " " + tmp_path + name + " " + tmp_path + "/metaeuk --min-exon-aa 5 --max-overlap 5 --min-intron 1 --overlap 1 --remove-tmp-files -s 6" #print(cmd) + cmd = "metaeuk easy-predict " + file + " " + db + " " + tmp_path + name + " " + tmp_path + "/metaeuk --max-intron 130000 --max-seq-len 160000 --min-exon-aa 15 --max-overlap 15 --min-intron 5 --overlap 1 -s 4.5 --remove-tmp-files" # other parameteres used by BUSCO with metazoa set--max-intron 130000 --max-seq-len 160000 --min-exon-aa 5 --max-overlap 5 --min-intron 1 --overlap 1 starting_subprocess(cmd, mode) # parsing header and sequences @@ -325,7 +334,7 @@ def metaeuk_single(regions, candidatesOutFile, length_extension, ass_name, group gff_file.close() except FileNotFoundError: pass - + region.write() output.close() def searching_for_db(assembly_path): @@ -338,8 +347,10 @@ def searching_for_db(assembly_path): return check def get_distance_biopython(file, matrix): + """ Reads alignment file and returns distance matrix """ #print(file) - aln = AlignIO.read(open(file), 'fasta') + input_handle = open(file) + aln = AlignIO.read(input_handle, 'fasta') try: calculator = DistanceCalculator(matrix) dm = calculator.get_distance(aln) @@ -350,6 +361,7 @@ def get_distance_biopython(file, matrix): record.seq = new_seq calculator = DistanceCalculator(matrix) dm = calculator.get_distance(aln) + input_handle.close() return dm def readFasta(fasta): @@ -989,6 +1001,7 @@ def createGff(ortholog_sequences, out_folder, tool): #print(out_folder) gff_folder = out_folder + "/gff/" os.system('mkdir %s >/dev/null 2>&1' %(gff_folder)) + types_set = set(['gene', 'CDS', 'transcript', 
'mRNA', 'exon']) for s in ortholog_sequences: genes = s[0] #print(genes) @@ -1021,10 +1034,23 @@ def createGff(ortholog_sequences, out_folder, tool): counter += 1 if counter == gene_count: if source == 'AUGUSTUS': - att = att.replace('g' + str(gene_count), '_'.join(gene.split('.')[:-1])) + att = att.replace('g' + str(gene_count), group + '_' + '_'.join(gene.split('.')[:-1])) att = att.replace('"', '') + if type not in types_set: + continue elif source == 'MetaEuk': - att = 'gene_id ' + gene + '; ' + att + #att = 'ID=' + group + '_' + gene + '; ' + att + if type == 'gene': + att_entries = att.split(';') + for x in att_entries: + if x.startswith('Target_ID='): + target = x + elif x.startswith('TCS_ID='): + parent_prefix = x.replace('TCS_ID=', '') + att = att.replace('TCS_ID=', 'ID=') + att = att.replace(parent_prefix, group + '_' + gene) + att = att.replace(target + ';', '') + phase = 0 data.append([contig, source, type, int(start), int(end), score, strand, phase, att]) else: continue @@ -1140,14 +1166,10 @@ def main(): return 1 # output modes - if debug == True and silent == True: - print("It's not possible to use booth modes, please restart and use --debug or --silent") - return 1 + if debug == True: + mode = 'debug' else: - if debug == True: - mode = 'debug' - else: - mode = 'normal' + mode = 'normal' #checking paths if dataPath == '': @@ -1240,7 +1262,7 @@ def main(): ########### is/are fDOG reference species part of ortholog group? 
########## - fdog_ref_species = check_ref_sepc(fdog_ref_species, fasta_path) + fdog_ref_species = check_ref_spec(fdog_ref_species, fasta_path) ###################### create tmp folder ################################### @@ -1254,15 +1276,23 @@ def main(): if augustus == True: group_computation_time_start = time.time() - consensus_path = consensusSequence(core_path, group, mode, out) - profile_path = blockProfiles(core_path, group, mode, out, msaTool) + consensus_path = core_path + '/' + group + '/' + group + '.con' + if check_path(consensus_path, exit=False) == 1: + consensus_path = consensusSequence(core_path, group, mode, out) + print(consensus_path) + profile_path = core_path + '/' + group + '/' + group + '.prfl' + if check_path(profile_path, exit=False) == 1: + profile_path = blockProfiles(core_path, group, mode, out, msaTool) + print(profile_path) group_computation_time_end = time.time() time_group = group_computation_time_end - group_computation_time_start else: #print("test") profile_path = "" group_computation_time_start = time.time() - consensus_path = consensusSequence(core_path, group, mode, out) + consensus_path = core_path + '/' + group + '.con' + if check_path(consensus_path, exit=False) == 1: + consensus_path = consensusSequence(core_path, group, mode, out) #concatinade core_group sequences if metaeuk should be run without tblastn group_computation_time_end = time.time() time_group = group_computation_time_end - group_computation_time_start From dddd57a1dcde85b525624ddd508b94f72610e757 Mon Sep 17 00:00:00 2001 From: Hannah Muelbaier <47216555+mueli94@users.noreply.github.com> Date: Thu, 13 Jun 2024 13:20:17 +0200 Subject: [PATCH 210/229] v 0.1.5.1 (#45) * bug fix debug mode * updated check_ref_spec function * updated check_ref_spec function, added region output if augustus is used * update get_distance_function * bug fix check_ref_spec function * v 0.1.5.1 * update output containing candidate regions * bugfix --------- Co-authored-by: trvinh --- 
fdog/fDOGassembly.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index a84a388..915025f 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -47,7 +47,7 @@ def check_path(path, exit=True): def check_ref_spec(species_list, fasta_file): """ Checks if reference species is part of the input ortholog group """ - species_file = {} + species_file = set() with open(fasta_file,"r") as lines: for line in lines: if line[0] == ">": @@ -297,7 +297,7 @@ def metaeuk_single(regions, candidatesOutFile, length_extension, ass_name, group end = str(i[1] + length_extension) name = key + "_" + str(counter) file, start, end = extract_sequence_from_to(tmp_path + name, tmp_path + key + ".fasta", start, end) - region.write(file + "\t" + str(start) + "\t" + str(end) + "\n") + region.write(key + "\t" + str(start) + "\t" + str(end) + "\n") #metaeuk call sensitive #cmd = "metaeuk easy-predict " + file + " " + db + " " + tmp_path + name + " " + tmp_path + "/metaeuk --min-exon-aa 5 --max-overlap 5 --min-intron 1 --overlap 1 --remove-tmp-files -s 6" #print(cmd) @@ -334,7 +334,7 @@ def metaeuk_single(regions, candidatesOutFile, length_extension, ass_name, group gff_file.close() except FileNotFoundError: pass - region.write() + region.close() output.close() def searching_for_db(assembly_path): @@ -1074,7 +1074,7 @@ def main(): #################### handle user input ##################################### start = time.time() - version = '0.1.5' + version = '0.1.5.1' ################### initialize parser ###################################### parser = argparse.ArgumentParser(description='You are running fdog.assembly version ' + str(version) + '.') parser.add_argument('--version', action='version', version=str(version)) From 3a3081e052e14fe0b019167573c22b85292cadc6 Mon Sep 17 00:00:00 2001 From: Hannah Muelbaier <47216555+mueli94@users.noreply.github.com> Date: Tue, 18 Jun 2024 14:57:30 +0200 Subject: [PATCH 211/229] 
Improvements fdog.addAssembly script (#47) * Updated fdog.addAssembly module * improved call of main fuction * Improved help function --- fdog/addAssembly.py | 63 +++++++++++++++++++--------------- fdog/makeCoreGroupFromFasta.py | 3 +- 2 files changed, 38 insertions(+), 28 deletions(-) diff --git a/fdog/addAssembly.py b/fdog/addAssembly.py index 1e6b374..171adb6 100644 --- a/fdog/addAssembly.py +++ b/fdog/addAssembly.py @@ -32,27 +32,26 @@ def parse_file(path): line = line.rstrip() ncbi, name = line.split("\t") id_dict[ncbi] = name - + file.close() return id_dict def main(): - + print("#################################") #################### handle user input ##################################### - version = '0.0.2' + version = '0.0.3' ################### initialize parser ###################################### parser = argparse.ArgumentParser(description='You are running fdog.addAssembly version ' + str(version) + '.') ################## required arguments ###################################### required = parser.add_argument_group('Required arguments') required.add_argument('--fasta', help='Path to fasta file or folder', action='store', default='', required=True) required.add_argument('--out', help='Path to output folder.', action='store', default='', required=True) - required.add_argument('--ncbi', help='NCBI number of species or mapping file', action='store', default='', required=True) + required.add_argument('--ncbi', help='NCBI ID of species or a mapping file (tab separated) containing the NCBI ID and the corresponding file name placed in the folder given by --fasta. 
', action='store', default='', required=True) required.add_argument('--ver', help='Version', action='store', default='', required=True) optional = parser.add_argument_group('Optional arguments') - optional.add_argument('--link', help='link files not copy', action='store_true', default = True) + optional.add_argument('--link', help='links fasta files instead of copying them', action='store_true', default = False) args = parser.parse_args() - fasta = args.fasta if check_path(fasta) == False: print("%s does not exists. Exiting ..."%(fasta)) @@ -68,21 +67,24 @@ def main(): id_dict = {} if check_path(ncbi) == False: - if isdigit(ncbi) and format == "File": - id_dict[ncbi] = fasta_file + if ncbi.isdigit() and format == "File": + id_dict[ncbi] = fasta else: print("%s is no file or digit. Exiting ..."%(ncbi)) sys.exit() elif check_path(ncbi) == "File": + print("Parsing mapping file ...") id_dict = parse_file(ncbi) + print("... done") else: print("%s is no file or digit. Exiting ..."%(ncbi)) sys.exit() - + #print(format) + #print(fasta) if format == "File": - fa = id_dict[id] + fa = id_dict[ncbi] if check_fasta(fa): - name = addTaxon_fn.generate_spec_name(id, "", ver) + name = addTaxon_fn.generate_spec_name(ncbi, "", ver) if ln == False: assembly_folder = out_folder + name os.system('mkdir %s >/dev/null 2>&1' % (assembly_folder)) @@ -93,23 +95,30 @@ def main(): os.system("ln %s %s/%s.fa" %(fa, assembly_folder, name)) else: print("%s Fasta format not valid or header includes |"%(fa)) - - for id in id_dict: - fa = id_dict[id] - fasta = os.path.abspath(fasta) + '/' - if check_fasta(fasta + fa): - name = addTaxon_fn.generate_spec_name(id, "", ver) - if ln == False: - assembly_folder = out_folder + name - os.system('mkdir %s >/dev/null 2>&1' % (assembly_folder)) - os.system("cp %s/%s %s/%s.fa" %(fasta, fa, assembly_folder, name)) + + else: + for sp in id_dict: + print("Adding species %s"%(sp)) + #print(id_dict) + fa = id_dict[sp] + fasta = os.path.abspath(fasta) + '/' + 
#print(fa) + #print(fasta) + fasta_path = fasta + fa + if check_fasta(fasta_path): + name = addTaxon_fn.generate_spec_name(sp, "", ver) + if ln == False: + assembly_folder = out_folder + name + os.system('mkdir %s >/dev/null 2>&1' % (assembly_folder)) + os.system("cp %s %s/%s.fa" %(fasta_path, assembly_folder, name)) + else: + assembly_folder = out_folder + name + os.system('mkdir %s >/dev/null 2>&1' % (assembly_folder)) + os.system("ln -s %s %s/%s.fa" %(fasta_path, assembly_folder, name)) else: - assembly_folder = out_folder + name - os.system('mkdir %s >/dev/null 2>&1' % (assembly_folder)) - os.system("ln -s %s/%s %s/%s.fa" %(fasta, fa, assembly_folder, name)) - else: - print("%s Fasta format not valid or header includes |"%(fa)) + print("%s Fasta format not valid or header includes |"%(fasta_path)) print("DONE, files can be found: %s"%(out_folder)) -main() +if __name__ == '__main__': + main() diff --git a/fdog/makeCoreGroupFromFasta.py b/fdog/makeCoreGroupFromFasta.py index b81e872..566c203 100644 --- a/fdog/makeCoreGroupFromFasta.py +++ b/fdog/makeCoreGroupFromFasta.py @@ -96,4 +96,5 @@ def main(): print('Core group located at %s. 
Fasta file: %s; MSA: %s; HMM: %s' % (out_folder, fasta_file, aln_file, hmm_file)) -main() +if __name__ == '__main__': + main() From 01c94ba08c8d5217356943bb21f5f714c592cae7 Mon Sep 17 00:00:00 2001 From: Hannah Muelbaier <47216555+mueli94@users.noreply.github.com> Date: Thu, 20 Jun 2024 16:16:52 +0200 Subject: [PATCH 212/229] bugfix using --strict (#48) --- fdog/fDOGassembly.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 915025f..b052c8d 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -71,10 +71,13 @@ def starting_subprocess(cmd, mode, time_out = None): try: if mode == 'debug': result = subprocess.run(cmd, shell=True, timeout = time_out) + return result elif mode == 'silent': result = subprocess.run(cmd, stdout = subprocess.PIPE, stderr = subprocess.PIPE, shell=True, timeout = time_out) + return result elif mode == 'normal': result = subprocess.run(cmd, stdout = subprocess.PIPE, shell=True, timeout = time_out) + return result except subprocess.TimeoutExpired: return 1 @@ -550,16 +553,19 @@ def backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, eva orthologs = set({}) for species in seed: - print("backward search in species %s\n" %species) + #print("backward search in species %s\n" %species) orthologs_new = set({}) try: id_ref = seedDic[species] except KeyError: #print("The species " + species + " isn't part of the core ortholog group, ... 
exciting") return 0, seed - - cmd = "blastp -db " + blast_dir_path + species + "/" + species + " -outfmt '6 sseqid qseqid evalue' -max_target_seqs 10 -seg " + filter + " -out " + tmp_path + "/blast_" + species + " -evalue " + str(evalue_cut_off) + " -query " + candidatesOutFile - starting_subprocess(cmd, mode) + #cmd = "blastp -db " + blast_dir_path + fdog_ref_species + "/" + fdog_ref_species + " -outfmt '6 sseqid qseqid evalue' -max_target_seqs 10 -out " + tmp_path + "blast_" + fdog_ref_species + " -evalue " + str(evalue_cut_off) + " -query " + candidatesOutFile + cmd = "blastp -db " + blast_dir_path + species + "/" + species + " -outfmt '6 sseqid qseqid evalue' -max_target_seqs 10 -out " + tmp_path + "/blast_" + species + " -evalue " + str(evalue_cut_off) + " -query " + candidatesOutFile + results = starting_subprocess(cmd, mode) + if results.returncode != 0: + print("Blastp failed with the command: %s"%(results.args)) + sys.exit() alg_file = open(tmp_path + "/blast_" + species, "r") lines = alg_file.readlines() alg_file.close() From d79e25366ee797b39dfd6b4017ba48470954db0c Mon Sep 17 00:00:00 2001 From: mueli94 Date: Mon, 24 Mar 2025 20:50:20 +0100 Subject: [PATCH 213/229] Improved FAS error output --- fdog/fDOGassembly.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index b052c8d..6358868 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -862,10 +862,16 @@ def run_fas(cmd): output = process.stdout.readline().decode().split('\n') error = process.stderr.readline().decode().split('\n') if error: + cmd_out = '' for line in error: line.strip() if 'error' in line or 'Error' in line: - print ("Error running FAS with %s"%(' '.join(cmd))) + for i in cmd: + if '|' in i: + cmd_out += " '"+ str(i) + "'" + else: + cmd_out += " " + str(i) + print ("Error running FAS with %s"%(cmd_out)) process.terminate() sys.exit() return output From 7b656afa7743fee503a4698219541651e8d6bdf5 Mon Sep 17 
00:00:00 2001 From: Hannah Date: Wed, 10 Sep 2025 21:24:14 +0200 Subject: [PATCH 214/229] Merge remote-tracking branch 'upstream/master' into fdog_goes_assembly --- .github/workflows/github_build.yml | 10 +++++----- fdog/addTaxa.py | 7 +++---- fdog/addTaxon.py | 6 +++--- fdog/checkData.py | 25 ++++++++++++++----------- fdog/libs/addtaxon.py | 1 - fdog/libs/blast.py | 29 ++++++++++++++--------------- fdog/libs/corecompile.py | 2 +- fdog/libs/preparation.py | 22 ++++++++++------------ fdog/libs/tree.py | 14 ++++++++------ fdog/mergeOutput.py | 6 +++--- fdog/removefDog.py | 6 +++--- fdog/runMulti.py | 8 ++++---- 12 files changed, 68 insertions(+), 68 deletions(-) diff --git a/.github/workflows/github_build.yml b/.github/workflows/github_build.yml index 10127d8..856dbfb 100644 --- a/.github/workflows/github_build.yml +++ b/.github/workflows/github_build.yml @@ -18,7 +18,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: [3.8] + python-version: [3.12] steps: - uses: actions/checkout@v2 - name: Set up Python ${{ matrix.python-version }} @@ -44,17 +44,17 @@ jobs: echo "TEST fdog.setup" fdog.setup -d /home/runner/work/fDOG/fDOG/dt --woFAS echo "TEST fdog.checkData" - fdog.checkData -s /home/runner/work/fDOG/fDOG/dt/searchTaxa_dir -c /home/runner/work/fDOG/fDOG/dt/coreTaxa_dir -a /home/runner/work/fDOG/fDOG/dt/annotation_dir --reblast + fdog.checkData -s /home/runner/work/fDOG/fDOG/dt/searchTaxa_dir -c /home/runner/work/fDOG/fDOG/dt/coreTaxa_dir -a /home/runner/work/fDOG/fDOG/dt/annotation_dir --reblast --ignoreAnno echo "TEST fdog.showTaxa" fdog.showTaxa echo "TEST fdog.run" - fdog.run --seqFile infile.fa --jobName test --refspec HUMAN@9606@3 --fasOff --group mammalia + fdog.run --seqFile infile.fa --jobName test --refspec HUMAN@9606@qfo24_02 --fasOff --group mammalia mkdir seeds path=$(fdog.setup -d ./ --getSourcepath); a="1 2 3"; for i in ${a[@]}; do cp $path/data/infile.fa seeds/$i.fa; done echo "TEST fdogs.run" - fdogs.run --seqFolder seeds --jobName 
test_multi --refspec HUMAN@9606@3 --fasOff --searchTaxa PARTE@5888@3,THAPS@35128@3 --hmmScoreType sequence + fdogs.run --seqFolder seeds --jobName test_multi --refspec HUMAN@9606@qfo24_02 --fasOff --searchTaxa PARTE@5888@qfo24_02,THAPS@35128@qfo24_02 --hmmScoreType sequence echo "TEST fdog.addTaxon" - head /home/runner/work/fDOG/fDOG/dt/searchTaxa_dir/HUMAN@9606@3/HUMAN@9606@3.fa > hm.fa + head /home/runner/work/fDOG/fDOG/dt/searchTaxa_dir/HUMAN@9606@qfo24_02/HUMAN@9606@qfo24_02.fa > hm.fa fdog.addTaxon -f hm.fa -i 9606 -o ./ -c -a ls - name: Deploy diff --git a/fdog/addTaxa.py b/fdog/addTaxa.py index c03a9a8..e5982c8 100644 --- a/fdog/addTaxa.py +++ b/fdog/addTaxa.py @@ -26,12 +26,11 @@ from Bio import SeqIO import multiprocessing as mp from tqdm import tqdm -from ete3 import NCBITaxa import re import shutil from datetime import datetime import time -from pkg_resources import get_distribution +from importlib.metadata import version, PackageNotFoundError from collections import OrderedDict import fdog.libs.zzz as general_fn @@ -66,8 +65,8 @@ def parse_map_file(mapping_file, folIn): def main(): - version = get_distribution('fdog').version - parser = argparse.ArgumentParser(description='You are running fDOG version ' + str(version) + '.') + fdog_version = version("fdog") + parser = argparse.ArgumentParser(description='You are running fDOG version ' + str(fdog_version) + '.') required = parser.add_argument_group('required arguments') optional = parser.add_argument_group('optional arguments') required.add_argument('-i', '--input', help='Path to input folder', action='store', default='', required=True) diff --git a/fdog/addTaxon.py b/fdog/addTaxon.py index bd17fe0..3751f2e 100755 --- a/fdog/addTaxon.py +++ b/fdog/addTaxon.py @@ -26,7 +26,7 @@ import shutil import multiprocessing as mp from datetime import datetime -from pkg_resources import get_distribution +from importlib.metadata import version, PackageNotFoundError import fdog.libs.zzz as general_fn import 
fdog.libs.tree as tree_fn @@ -34,8 +34,8 @@ def main(): - version = get_distribution('fdog').version - parser = argparse.ArgumentParser(description='You are running fDOG version ' + str(version) + '.') + fdog_version = version("fdog") + parser = argparse.ArgumentParser(description='You are running fDOG version ' + str(fdog_version) + '.') required = parser.add_argument_group('required arguments') optional = parser.add_argument_group('optional arguments') required.add_argument('-f', '--fasta', help='FASTA file of input taxon', action='store', default='', required=True) diff --git a/fdog/checkData.py b/fdog/checkData.py index 74b3b93..755240b 100644 --- a/fdog/checkData.py +++ b/fdog/checkData.py @@ -26,14 +26,12 @@ import subprocess import shutil from Bio import SeqIO -from ete3 import NCBITaxa +from ete4 import NCBITaxa import re from datetime import datetime import multiprocessing as mp from tqdm import tqdm -from pkg_resources import get_distribution -from Bio.Blast.Applications import NcbiblastpCommandline - +from importlib.metadata import version, PackageNotFoundError import fdog.libs.zzz as general_fn import fdog.libs.blast as blast_fn @@ -176,13 +174,18 @@ def run_check_fasta(checkDir, replace, delete, concat): def check_blastdb(args): """ Check for outdated blastdb """ - (query, taxon, coreTaxa_dir, searchTaxa_dir) = args - blast_db = '%s/%s/%s' % (coreTaxa_dir, taxon, taxon) + query, taxon, coreTaxa_dir, searchTaxa_dir = args + blast_db = f"{coreTaxa_dir}/{taxon}/{taxon}" + try: - blastp_cline = NcbiblastpCommandline(query = query, db = blast_db) - stdout, stderr = blastp_cline() - except: + result = subprocess.run( + ["blastp", "-query", query, "-db", blast_db], + capture_output=True, text=True, check=True + ) + return(result.stdout) + except subprocess.CalledProcessError as e: return([query, blast_db]) + fai_in_genome = "%s/%s/%s.fa.fai" % (searchTaxa_dir, taxon, taxon) fai_in_blast = "%s/%s/%s.fa.fai" % (coreTaxa_dir, taxon, taxon) # check if 
fai_in_blast is a valid symlink @@ -418,8 +421,8 @@ def run_check(args): return(caution) def main(): - version = get_distribution('fdog').version - parser = argparse.ArgumentParser(description='You are running fDOG version ' + str(version) + '.') + fdog_version = version("fdog") + parser = argparse.ArgumentParser(description='You are running fDOG version ' + str(fdog_version) + '.') parser.add_argument('-s', '--searchTaxa_dir', help='Path to search taxa directory (e.g. fdog_dataPath/searchTaxa_dir)', action='store', default='') parser.add_argument('-c', '--coreTaxa_dir', help='Path to blastDB directory (e.g. fdog_dataPath/coreTaxa_dir)', action='store', default='') parser.add_argument('-a', '--annotation_dir', help='Path to feature annotation directory (e.g. fdog_dataPath/annotation_dir)', action='store', default='') diff --git a/fdog/libs/addtaxon.py b/fdog/libs/addtaxon.py index 995eeb1..2da4066 100644 --- a/fdog/libs/addtaxon.py +++ b/fdog/libs/addtaxon.py @@ -20,7 +20,6 @@ from pathlib import Path from Bio import SeqIO import subprocess -from ete3 import NCBITaxa import re from datetime import datetime from collections import OrderedDict diff --git a/fdog/libs/blast.py b/fdog/libs/blast.py index 2cb4609..e40532a 100644 --- a/fdog/libs/blast.py +++ b/fdog/libs/blast.py @@ -17,7 +17,6 @@ import os import sys -from Bio.Blast.Applications import NcbiblastpCommandline import xml.etree.ElementTree as ET import subprocess @@ -29,21 +28,21 @@ def do_blastsearch( """ Perform blastp search for a query fasta file Return an XML string contains blast result """ - filter = 'no' - if lowComplexityFilter == True: - filter = 'yes' + filter_value = "yes" if lowComplexityFilter else "no" try: - blastp_cline = NcbiblastpCommandline( - query = query, db = blast_db, evalue = evalBlast, seg = filter, - max_target_seqs = 10, outfmt = 5) - stdout, stderr = blastp_cline() - return(stdout) - except: - sys.exit( - 'ERROR: Error running blastp search for %s against %s\n%s' - % (query, 
blast_db, NcbiblastpCommandline( - query = query, db = blast_db, evalue = evalBlast, seg = filter, - max_target_seqs = 10, outfmt = 5))) + cmd = [ + "blastp", + "-query", query, + "-db", blast_db, + "-evalue", str(evalBlast), + "-seg", filter_value, + "-max_target_seqs", "10", + "-outfmt", "5" + ] + result = subprocess.run(cmd, capture_output=True, text=True, check=True) + return result.stdout + except subprocess.CalledProcessError as e: + sys.exit(f"ERROR: Error running BLASTP search for {query} against {blast_db}\n{e.stderr}") def parse_blast_xml(blast_output): diff --git a/fdog/libs/corecompile.py b/fdog/libs/corecompile.py index bd57e07..99e9bbd 100644 --- a/fdog/libs/corecompile.py +++ b/fdog/libs/corecompile.py @@ -19,7 +19,7 @@ import os import shutil from pathlib import Path -from ete3 import NCBITaxa +from ete4 import NCBITaxa from Bio import SeqIO import time diff --git a/fdog/libs/preparation.py b/fdog/libs/preparation.py index 2d79f82..f18e141 100644 --- a/fdog/libs/preparation.py +++ b/fdog/libs/preparation.py @@ -17,10 +17,10 @@ import sys import os +import subprocess from pathlib import Path from Bio import SeqIO -from Bio.Blast.Applications import NcbiblastpCommandline -from ete3 import NCBITaxa +from ete4 import NCBITaxa import fdog.libs.zzz as general_fn import fdog.libs.fasta as fasta_fn @@ -107,17 +107,15 @@ def check_input(args): def check_blast_version(corepath, refspec): """ Check if blast DBs in corepath is compatible with blastp version """ - fdog_path = os.path.realpath(__file__).replace('/libs/preparation.py','') - query = fdog_path + '/data/infile.fa' - blast_db = '%s/%s/%s' % (corepath, refspec, refspec) + fdog_path = os.path.realpath(__file__).replace('/libs/preparation.py', '') + query = os.path.join(fdog_path, 'data', 'infile.fa') + blast_db = os.path.join(corepath, refspec, refspec) try: - blastp_cline = NcbiblastpCommandline( - query = query, db = blast_db) - stdout, stderr = blastp_cline() - except: - sys.exit( - 'ERROR: Error 
running blast (probably conflict with BLAST DBs versions)\n%s' - % (NcbiblastpCommandline(query = query, db = blast_db))) + cmd = ["blastp", "-query", query, "-db", blast_db] + result = subprocess.run(cmd, capture_output=True, text=True, check=True) + except subprocess.CalledProcessError as e: + sys.exit(f"ERROR: Error running BLAST (probably conflict with BLAST DB versions)\n{e.stderr}") + def check_ranks_core_taxa(corepath, refspec, minDist, maxDist): """ Check if refspec (or all core taxa) have a valid minDist and maxDist tax ID diff --git a/fdog/libs/tree.py b/fdog/libs/tree.py index b2de19f..57efcaf 100644 --- a/fdog/libs/tree.py +++ b/fdog/libs/tree.py @@ -16,7 +16,7 @@ ####################################################################### import re -from ete3 import NCBITaxa +from ete4 import NCBITaxa import fdog.libs.zzz as general_fn @@ -57,8 +57,9 @@ def get_ancestor(id1, id2, ncbi): Return dictionary {ancestor_id: ancestor_rank} """ tree = ncbi.get_topology([id1, id2], intermediate_nodes = False) - ancestor = tree.get_common_ancestor(id1, id2).name - return(ncbi.get_rank([ancestor])) + ancestor_name = tree.common_ancestor(id1, id2) + ancestor_id = int(ancestor_name.name) + return(ncbi.get_rank([ancestor_id])) def check_common_ancestor(ref_id, ancestor, minDist, maxDist, ncbi): @@ -68,6 +69,7 @@ def check_common_ancestor(ref_id, ancestor, minDist, maxDist, ncbi): """ ref_lineage = ncbi.get_lineage(ref_id) (min_ref, max_ref) = get_rank_range(ref_lineage, minDist, maxDist, ncbi) + ancestor = int(ancestor) if not ancestor in ref_lineage: return(0) ancestor_index = len(ref_lineage) - ref_lineage.index(ancestor) - 1 @@ -78,7 +80,7 @@ def check_common_ancestor(ref_id, ancestor, minDist, maxDist, ncbi): def remove_clade(tree, node_id): """ Remove a clade from a tree """ - removed_clade = tree.search_nodes(name = str(node_id))[0] + removed_clade = list(tree.search_nodes(name = str(node_id)))[0] removed_node = removed_clade.detach() return(tree) @@ -96,12 +98,12 
@@ def get_leaves_dict(spec_lineage, tree, min_index, max_index): for i in range(len(spec_lineage)): if i >= min_index and i <= max_index: curr_node = spec_lineage[i] - node = tree.search_nodes(name = str(curr_node)) + node = list(tree.search_nodes(name = str(curr_node))) if len(node) > 0: for leaf in node: node_dict[spec_lineage[i]] = [] for t in leaf.traverse(): - if t.is_leaf(): + if t.is_leaf: if not t.name in already_added: already_added.append(t.name) node_dict[spec_lineage[i]].append(t.name) diff --git a/fdog/mergeOutput.py b/fdog/mergeOutput.py index fa36d4d..ffebb5c 100644 --- a/fdog/mergeOutput.py +++ b/fdog/mergeOutput.py @@ -21,7 +21,7 @@ from os import listdir as ldir import argparse import yaml -from pkg_resources import get_distribution +from importlib.metadata import version, PackageNotFoundError from Bio import SeqIO def createConfigPP(phyloprofile, domains_0, ex_fasta, directory, out): @@ -37,8 +37,8 @@ def createConfigPP(phyloprofile, domains_0, ex_fasta, directory, out): def main(): - version = get_distribution('fdog').version - parser = argparse.ArgumentParser(description='You are running fDOG version ' + str(version) + '.') + fdog_version = version("fdog") + parser = argparse.ArgumentParser(description='You are running fDOG version ' + str(fdog_version) + '.') parser.add_argument('-i', '--input', help='Input directory, where all single output (o_g.fa, .phyloprofile, _forward.domains, _reverse.domains) can be found', action='store', default='', required=True) diff --git a/fdog/removefDog.py b/fdog/removefDog.py index 9a28b41..bd34c90 100644 --- a/fdog/removefDog.py +++ b/fdog/removefDog.py @@ -20,7 +20,7 @@ import argparse import subprocess import shutil -from pkg_resources import get_distribution +from importlib.metadata import version, PackageNotFoundError import fdog.setupfDog as setupfDog_fn @@ -48,8 +48,8 @@ def query_yes_no(question, default='yes'): def main(): - version = get_distribution('fdog').version - parser = 
argparse.ArgumentParser(description='You are running fDOG version ' + str(version) + '.') + fdog_version = version("fdog") + parser = argparse.ArgumentParser(description='You are running fDOG version ' + str(fdog_version) + '.') parser.add_argument('--all', help='Remove fdog together with all files/data within the installed fdog directory', action='store_true', default=False) args = parser.parse_args() data = args.all diff --git a/fdog/runMulti.py b/fdog/runMulti.py index 1bce014..b1b3587 100644 --- a/fdog/runMulti.py +++ b/fdog/runMulti.py @@ -25,8 +25,8 @@ import shutil import multiprocessing as mp from tqdm import tqdm -from ete3 import NCBITaxa -from pkg_resources import get_distribution +from ete4 import NCBITaxa +from importlib.metadata import version, PackageNotFoundError import time import fdog.libs.zzz as general_fn @@ -161,8 +161,8 @@ def join_outputs(outpath, jobName, seeds, keep, silentOff): def main(): - version = get_distribution('fdog').version - parser = argparse.ArgumentParser(description='You are running fDOG version ' + str(version) + '.', + fdog_version = version("fdog") + parser = argparse.ArgumentParser(description='You are running fDOG version ' + str(fdog_version) + '.', epilog="For more information on certain options, please refer to the wiki pages " "on github: https://github.com/BIONF/fDOG/wiki") required = parser.add_argument_group('Required arguments') From 5414d1b0e9596cf4b7e4056e33c0d5c2900744f1 Mon Sep 17 00:00:00 2001 From: Hannah Date: Wed, 10 Sep 2025 21:25:15 +0200 Subject: [PATCH 215/229] update --- README.md | 10 +++++----- fdog/runSingle.py | 8 ++++---- fdog/setPaths.py | 6 +++--- fdog/setupfDog.py | 48 ++++++++++++++++++++++++++--------------------- fdog/showTaxa.py | 2 +- setup.py | 6 +++--- 6 files changed, 43 insertions(+), 37 deletions(-) diff --git a/README.md b/README.md index 082a46d..96750e0 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ # fDOG - Feature-aware Directed OrtholoG search +[![published in: 
MBE](https://img.shields.io/badge/published%20in-MBE-ff69b4)](https://doi.org/10.1093/molbev/msaf120) [![PyPI version](https://badge.fury.io/py/fdog.svg)](https://pypi.org/project/fdog/) [![License: GPL v3](https://img.shields.io/badge/License-GPLv3-blue.svg)](https://www.gnu.org/licenses/gpl-3.0) -[![Build Status](https://travis-ci.com/BIONF/fDOG.svg?branch=master)](https://travis-ci.com/BIONF/fDOG) ![Github Build](https://github.com/BIONF/fDOG/workflows/build/badge.svg) # Poster fDOG - Assembly @@ -21,7 +21,7 @@ https://github.com/BIONF/fDOG/blob/gh-pages/www/Poster_fDOG_Assembly.pdf # How to install -*fDOG* tool is distributed as a python package called *fdog*. It is compatible with [Python ≥ v3.7](https://www.python.org/downloads/). +*fDOG* tool is distributed as a python package called *fdog*. It is compatible with [Python ≥ v3.12](https://www.python.org/downloads/). ## Install the fDOG package You can install *fdog* using `pip`: @@ -73,7 +73,7 @@ Please find more information in [our wiki](https://github.com/BIONF/fDOG/wiki) t # fDOG data set -Within the data package we provide a set of 78 reference taxa. They can be automatically downloaded during the setup. This data comes "ready to use" with the *fdog* framework. Species data must be present in the three directories listed below: +Within the data package we provide a set of [81 reference taxa](https://ftp.ebi.ac.uk/pub/databases/reference_proteomes/QfO/QfO_release_2024_02.tar.gz). They will be automatically downloaded during the setup. This data comes "ready to use" with the *fdog* framework. Species data must be present in the three directories listed below: * searchTaxa_dir (Contains sub-directories for proteome fasta files for each species) * coreTaxa_dir (Contains sub-directories for BLAST databases made with `makeblastdb` out of your proteomes) @@ -81,7 +81,7 @@ Within the data package we provide a set of 78 reference taxa. 
They can be autom For each species/taxon there is a sub-directory named in accordance to the naming schema ([Species acronym]@[NCBI ID]@[Proteome version]) -*fdog* is not limited to those 78 taxa. If needed the user can manually add further gene sets (multiple fasta format) using provided functions. +*fdog* is not limited to those 81 reference taxa. If needed the user can manually add further gene sets (multiple fasta format) using provided functions. ## Adding a new gene set into fDOG For adding **one gene set**, please use the `fdog.addTaxon` function: @@ -114,7 +114,7 @@ _**NOTE:** After adding new taxa into *fdog*, you should [check for the validity Any bug reports or comments, suggestions are highly appreciated. Please [open an issue on GitHub](https://github.com/BIONF/fDOG/issues/new) or be in touch via email. # How to cite -Ebersberger, I., Strauss, S. & von Haeseler, A. HaMStR: Profile hidden markov model based search for orthologs in ESTs. BMC Evol Biol 9, 157 (2009), [doi:10.1186/1471-2148-9-157](https://doi.org/10.1186/1471-2148-9-157) +Tran V, Langschied F, Muelbaier H, Dosch J, Arthen F, Balint M, Ebersberger I. 2025. Feature architecture-aware ortholog search with fDOG reveals the distribution of plant cell wall-degrading enzymes across life. Molecular Biology and Evolution:msaf120. 
https://doi.org/10.1093/molbev/msaf120 # Contributors - [Ingo Ebersberger](https://github.com/ebersber) diff --git a/fdog/runSingle.py b/fdog/runSingle.py index a781a63..9520249 100644 --- a/fdog/runSingle.py +++ b/fdog/runSingle.py @@ -19,8 +19,8 @@ import os import argparse import subprocess -from ete3 import NCBITaxa -from pkg_resources import get_distribution +from ete4 import NCBITaxa +from importlib.metadata import version, PackageNotFoundError import time import fdog.libs.zzz as general_fn @@ -33,8 +33,8 @@ def main(): - version = get_distribution('fdog').version - parser = argparse.ArgumentParser(description='You are running fDOG version ' + str(version) + '.', + fdog_version = version("fdog") + parser = argparse.ArgumentParser(description='You are running fDOG version ' + str(fdog_version) + '.', epilog="For more information on certain options, please refer to the wiki pages " "on github: https://github.com/BIONF/fDOG/wiki") required = parser.add_argument_group('Required arguments') diff --git a/fdog/setPaths.py b/fdog/setPaths.py index 3b6e501..7271427 100644 --- a/fdog/setPaths.py +++ b/fdog/setPaths.py @@ -19,7 +19,7 @@ import os import argparse -from pkg_resources import get_distribution +from importlib.metadata import version, PackageNotFoundError import fdog.libs.zzz as general_fn import fdog.checkData as check_data_fn @@ -65,8 +65,8 @@ def check_data(searchpath, corepath, annopath): def main(): - version = get_distribution('fdog').version - parser = argparse.ArgumentParser(description='You are running fDOG version ' + str(version) + '.') + fdog_version = version("fdog") + parser = argparse.ArgumentParser(description='You are running fDOG version ' + str(fdog_version) + '.') required = parser.add_argument_group('required arguments') optional = parser.add_argument_group('optional arguments') required.add_argument('--searchpath', help='Path to search taxa folder (e.g. 
fdog_data/searchTaxa_dir)', action='store', default='', required=True) diff --git a/fdog/setupfDog.py b/fdog/setupfDog.py index 2de9c06..184de6f 100644 --- a/fdog/setupfDog.py +++ b/fdog/setupfDog.py @@ -22,9 +22,9 @@ import argparse import subprocess import shutil -from ete3 import NCBITaxa +from ete4 import NCBITaxa from pathlib import Path -from pkg_resources import get_distribution +from importlib.metadata import version, PackageNotFoundError import fdog.libs.zzz as general_fn import fdog.libs.fas as fas_fn @@ -147,25 +147,24 @@ def check_dependencies(fdogPath): def download_data(dataPath, resetData): """ Downloade pre-calculated fDOG data """ - data_fdog_file = "data_HaMStR-2019c.tar.gz" + data_fdog_file = "data_fDOG_2024.tar.gz" checksum_data = "1748371655 621731824 $data_fdog_file" genome_path = '%s/searchTaxa_dir' % dataPath Path(genome_path).mkdir(parents = True, exist_ok = True) - if len(general_fn.read_dir(genome_path)) < 1 or resetData: data_url = 'https://applbio.biologie.uni-frankfurt.de/download/hamstr_qfo' if os.path.exists(data_fdog_file) and resetData: os.remove(data_fdog_file) - # general_fn.download_file(data_url, data_fdog_file) - ####### temporary solution while the uni network does not work ######### - wgetCmd = 'wget "https://www.dropbox.com/scl/fi/t2ln18k0jthc3y74s591q/data_HaMStR-2019c.tar.gz?rlkey=c66nc3eslqyn2a6k6ey4e678r&st=plzvbllv&dl=0"' - try: - subprocess.run([wgetCmd], shell=True, check=True) - shutil.move("data_HaMStR-2019c.tar.gz?rlkey=c66nc3eslqyn2a6k6ey4e678r&st=plzvbllv&dl=0", "data_HaMStR-2019c.tar.gz") - except: - print('Problem occurred while download demo data from dropbox') - ######################################################################## + general_fn.download_file(data_url, data_fdog_file) + # ####### temporary solution while the uni network does not work ######### + # wgetCmd = 'wget "https://www.dropbox.com/scl/fi/t2ln18k0jthc3y74s591q/data_HaMStR-2019c.tar.gz?rlkey=c66nc3eslqyn2a6k6ey4e678r&st=plzvbllv&dl=0"' 
+ # try: + # subprocess.run([wgetCmd], shell=True, check=True) + # shutil.move("data_HaMStR-2019c.tar.gz?rlkey=c66nc3eslqyn2a6k6ey4e678r&st=plzvbllv&dl=0", "data_HaMStR-2019c.tar.gz") + # except: + # print('Problem occurred while download demo data from dropbox') + # ######################################################################## try: print('Extracting %s...' % data_fdog_file) shutil.unpack_archive(data_fdog_file, dataPath, 'gztar') @@ -175,7 +174,7 @@ def download_data(dataPath, resetData): os.rename('%s/genome_dir' % dataPath, '%s/searchTaxa_dir' % dataPath) os.rename('%s/blast_dir' % dataPath, '%s/coreTaxa_dir' % dataPath) os.rename('%s/weight_dir' % dataPath, '%s/annotation_dir' % dataPath) - check_cmd = 'fdog.checkData -s %s/searchTaxa_dir -c %s/coreTaxa_dir -a %s/annotation_dir --reblast' % (dataPath, dataPath, dataPath) + check_cmd = 'fdog.checkData -s %s/searchTaxa_dir -c %s/coreTaxa_dir -a %s/annotation_dir --reblast --ignoreAnno' % (dataPath, dataPath, dataPath) try: print('Checking downloaded data...') subprocess.run([check_cmd], stdout = subprocess.DEVNULL, check = True, shell = True) @@ -201,8 +200,8 @@ def write_pathconfig(fdogPath, dataPath): def main(): - version = get_distribution('fdog').version - parser = argparse.ArgumentParser(description='You are running fDOG version ' + str(version) + '.') + fdog_version = version("fdog") + parser = argparse.ArgumentParser(description='You are running fDOG version ' + str(fdog_version) + '.') required = parser.add_argument_group('required arguments') optional = parser.add_argument_group('optional arguments') required.add_argument('-d', '--dataPath', help='Output path for fDOG data', action='store', default='', required=True) @@ -234,7 +233,7 @@ def main(): ### check if pathconfig file exists pathconfigFile = '%s/bin/pathconfig.yml' % fdogPath - demo_cmd = 'fdog.run --seqFile infile.fa --jobName test --refspec HUMAN@9606@3' + demo_cmd = 'fdog.run --seqFile infile.fa --jobName test --refspec 
HUMAN@9606@qfo24_02' if os.path.exists(pathconfigFile) and not force: check_fas = 1 if not woFAS: @@ -247,7 +246,7 @@ def main(): print('You can test fDOG using the following command:\n%s --fasOff' % demo_cmd) sys.exit() - ### get ncbi taxonomy database for ete3 + ### get ncbi taxonomy database for ete4 print('*** Creating local NCBI taxonomy database...') ncbi = NCBITaxa() @@ -262,11 +261,18 @@ def main(): if check_conda_env() == True: req_file = '%s/data/conda_requirements.yml' % fdogPath print('=> Dependencies in %s' % req_file) - conda_install_cmd = 'conda install -c bioconda --file %s -y' % (req_file) + + install_cmd = f'install -c bioconda --file {req_file} -y' + if shutil.which("micromamba"): + install_cmd = f'micromamba {install_cmd}' + elif shutil.which("mamba"): + install_cmd = f'mamba {install_cmd}' + else: + install_cmd = f'conda {install_cmd}' try: - subprocess.call([conda_install_cmd], shell = True) + subprocess.call(install_cmd, shell=True) except: - sys.exit('\033[91mERROR: Cannot install conda packages in %s!\033[0m' % req_file) + sys.exit(f'\033[91mERROR: Cannot install conda packages in {req_file}!\033[0m') else: install_cmd = 'sudo apt-get install -y -qq ' sys.exit('\033[91mERROR: Please install these tools manually:\n%s\nusing the command: %s!\033[0m' % (', '.join(missing_tools), install_cmd)) diff --git a/fdog/showTaxa.py b/fdog/showTaxa.py index 7bb27c4..fd41ada 100644 --- a/fdog/showTaxa.py +++ b/fdog/showTaxa.py @@ -17,7 +17,7 @@ import sys import os -from ete3 import NCBITaxa +from ete4 import NCBITaxa import fdog.libs.zzz as general_fn diff --git a/setup.py b/setup.py index dfe2971..ba8a988 100644 --- a/setup.py +++ b/setup.py @@ -26,8 +26,8 @@ setup( name="fdog", - version="0.1.35", - python_requires='>=3.7.0', + version="1.0.0", + python_requires='>=3.12.0', description="Feature-aware Directed OrtholoG search tool", long_description=long_description, long_description_content_type="text/markdown", @@ -39,7 +39,7 @@ install_requires=[ 
'biopython', 'tqdm', - 'ete3', + 'ete4', 'six', 'PyYAML', 'pyhmmer', From 5b458ea4c2ec00fa14b0693909002763dbd126d6 Mon Sep 17 00:00:00 2001 From: Hannah Date: Wed, 10 Sep 2025 21:25:20 +0200 Subject: [PATCH 216/229] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 96750e0..654f792 100644 --- a/README.md +++ b/README.md @@ -61,7 +61,7 @@ You will get a warning if any of the dependencies are not ready to use, please s *fdog* will run smoothly with the provided sample input file 'infile.fa' if everything is set correctly. ``` -fdog.run --seqFile infile.fa --jobName test --refspec HUMAN@9606@3 +fdog.run --seqFile infile.fa --jobName test --refspec HUMAN@9606@qfo24_02 ``` The output files with the prefix `test` will be saved at your current working directory. You can have an overview about all available options with the command From ebe8e8a9ef09fc723ce22302bc7306cdfe828abe Mon Sep 17 00:00:00 2001 From: Hannah Date: Wed, 10 Sep 2025 21:28:58 +0200 Subject: [PATCH 217/229] Revert "Merge remote-tracking branch 'upstream/master' into fdog_goes_assembly" This reverts commit 7b656afa7743fee503a4698219541651e8d6bdf5. 
--- .github/workflows/github_build.yml | 10 +++++----- fdog/addTaxa.py | 7 ++++--- fdog/addTaxon.py | 6 +++--- fdog/checkData.py | 25 +++++++++++-------------- fdog/libs/addtaxon.py | 1 + fdog/libs/blast.py | 29 +++++++++++++++-------------- fdog/libs/corecompile.py | 2 +- fdog/libs/preparation.py | 22 ++++++++++++---------- fdog/libs/tree.py | 14 ++++++-------- fdog/mergeOutput.py | 6 +++--- fdog/removefDog.py | 6 +++--- fdog/runMulti.py | 8 ++++---- 12 files changed, 68 insertions(+), 68 deletions(-) diff --git a/.github/workflows/github_build.yml b/.github/workflows/github_build.yml index 856dbfb..10127d8 100644 --- a/.github/workflows/github_build.yml +++ b/.github/workflows/github_build.yml @@ -18,7 +18,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: [3.12] + python-version: [3.8] steps: - uses: actions/checkout@v2 - name: Set up Python ${{ matrix.python-version }} @@ -44,17 +44,17 @@ jobs: echo "TEST fdog.setup" fdog.setup -d /home/runner/work/fDOG/fDOG/dt --woFAS echo "TEST fdog.checkData" - fdog.checkData -s /home/runner/work/fDOG/fDOG/dt/searchTaxa_dir -c /home/runner/work/fDOG/fDOG/dt/coreTaxa_dir -a /home/runner/work/fDOG/fDOG/dt/annotation_dir --reblast --ignoreAnno + fdog.checkData -s /home/runner/work/fDOG/fDOG/dt/searchTaxa_dir -c /home/runner/work/fDOG/fDOG/dt/coreTaxa_dir -a /home/runner/work/fDOG/fDOG/dt/annotation_dir --reblast echo "TEST fdog.showTaxa" fdog.showTaxa echo "TEST fdog.run" - fdog.run --seqFile infile.fa --jobName test --refspec HUMAN@9606@qfo24_02 --fasOff --group mammalia + fdog.run --seqFile infile.fa --jobName test --refspec HUMAN@9606@3 --fasOff --group mammalia mkdir seeds path=$(fdog.setup -d ./ --getSourcepath); a="1 2 3"; for i in ${a[@]}; do cp $path/data/infile.fa seeds/$i.fa; done echo "TEST fdogs.run" - fdogs.run --seqFolder seeds --jobName test_multi --refspec HUMAN@9606@qfo24_02 --fasOff --searchTaxa PARTE@5888@qfo24_02,THAPS@35128@qfo24_02 --hmmScoreType sequence + fdogs.run --seqFolder seeds 
--jobName test_multi --refspec HUMAN@9606@3 --fasOff --searchTaxa PARTE@5888@3,THAPS@35128@3 --hmmScoreType sequence echo "TEST fdog.addTaxon" - head /home/runner/work/fDOG/fDOG/dt/searchTaxa_dir/HUMAN@9606@qfo24_02/HUMAN@9606@qfo24_02.fa > hm.fa + head /home/runner/work/fDOG/fDOG/dt/searchTaxa_dir/HUMAN@9606@3/HUMAN@9606@3.fa > hm.fa fdog.addTaxon -f hm.fa -i 9606 -o ./ -c -a ls - name: Deploy diff --git a/fdog/addTaxa.py b/fdog/addTaxa.py index e5982c8..c03a9a8 100644 --- a/fdog/addTaxa.py +++ b/fdog/addTaxa.py @@ -26,11 +26,12 @@ from Bio import SeqIO import multiprocessing as mp from tqdm import tqdm +from ete3 import NCBITaxa import re import shutil from datetime import datetime import time -from importlib.metadata import version, PackageNotFoundError +from pkg_resources import get_distribution from collections import OrderedDict import fdog.libs.zzz as general_fn @@ -65,8 +66,8 @@ def parse_map_file(mapping_file, folIn): def main(): - fdog_version = version("fdog") - parser = argparse.ArgumentParser(description='You are running fDOG version ' + str(fdog_version) + '.') + version = get_distribution('fdog').version + parser = argparse.ArgumentParser(description='You are running fDOG version ' + str(version) + '.') required = parser.add_argument_group('required arguments') optional = parser.add_argument_group('optional arguments') required.add_argument('-i', '--input', help='Path to input folder', action='store', default='', required=True) diff --git a/fdog/addTaxon.py b/fdog/addTaxon.py index 3751f2e..bd17fe0 100755 --- a/fdog/addTaxon.py +++ b/fdog/addTaxon.py @@ -26,7 +26,7 @@ import shutil import multiprocessing as mp from datetime import datetime -from importlib.metadata import version, PackageNotFoundError +from pkg_resources import get_distribution import fdog.libs.zzz as general_fn import fdog.libs.tree as tree_fn @@ -34,8 +34,8 @@ def main(): - fdog_version = version("fdog") - parser = argparse.ArgumentParser(description='You are running fDOG version ' 
+ str(fdog_version) + '.') + version = get_distribution('fdog').version + parser = argparse.ArgumentParser(description='You are running fDOG version ' + str(version) + '.') required = parser.add_argument_group('required arguments') optional = parser.add_argument_group('optional arguments') required.add_argument('-f', '--fasta', help='FASTA file of input taxon', action='store', default='', required=True) diff --git a/fdog/checkData.py b/fdog/checkData.py index 755240b..74b3b93 100644 --- a/fdog/checkData.py +++ b/fdog/checkData.py @@ -26,12 +26,14 @@ import subprocess import shutil from Bio import SeqIO -from ete4 import NCBITaxa +from ete3 import NCBITaxa import re from datetime import datetime import multiprocessing as mp from tqdm import tqdm -from importlib.metadata import version, PackageNotFoundError +from pkg_resources import get_distribution +from Bio.Blast.Applications import NcbiblastpCommandline + import fdog.libs.zzz as general_fn import fdog.libs.blast as blast_fn @@ -174,18 +176,13 @@ def run_check_fasta(checkDir, replace, delete, concat): def check_blastdb(args): """ Check for outdated blastdb """ - query, taxon, coreTaxa_dir, searchTaxa_dir = args - blast_db = f"{coreTaxa_dir}/{taxon}/{taxon}" - + (query, taxon, coreTaxa_dir, searchTaxa_dir) = args + blast_db = '%s/%s/%s' % (coreTaxa_dir, taxon, taxon) try: - result = subprocess.run( - ["blastp", "-query", query, "-db", blast_db], - capture_output=True, text=True, check=True - ) - return(result.stdout) - except subprocess.CalledProcessError as e: + blastp_cline = NcbiblastpCommandline(query = query, db = blast_db) + stdout, stderr = blastp_cline() + except: return([query, blast_db]) - fai_in_genome = "%s/%s/%s.fa.fai" % (searchTaxa_dir, taxon, taxon) fai_in_blast = "%s/%s/%s.fa.fai" % (coreTaxa_dir, taxon, taxon) # check if fai_in_blast is a valid symlink @@ -421,8 +418,8 @@ def run_check(args): return(caution) def main(): - fdog_version = version("fdog") - parser = 
argparse.ArgumentParser(description='You are running fDOG version ' + str(fdog_version) + '.') + version = get_distribution('fdog').version + parser = argparse.ArgumentParser(description='You are running fDOG version ' + str(version) + '.') parser.add_argument('-s', '--searchTaxa_dir', help='Path to search taxa directory (e.g. fdog_dataPath/searchTaxa_dir)', action='store', default='') parser.add_argument('-c', '--coreTaxa_dir', help='Path to blastDB directory (e.g. fdog_dataPath/coreTaxa_dir)', action='store', default='') parser.add_argument('-a', '--annotation_dir', help='Path to feature annotation directory (e.g. fdog_dataPath/annotation_dir)', action='store', default='') diff --git a/fdog/libs/addtaxon.py b/fdog/libs/addtaxon.py index 2da4066..995eeb1 100644 --- a/fdog/libs/addtaxon.py +++ b/fdog/libs/addtaxon.py @@ -20,6 +20,7 @@ from pathlib import Path from Bio import SeqIO import subprocess +from ete3 import NCBITaxa import re from datetime import datetime from collections import OrderedDict diff --git a/fdog/libs/blast.py b/fdog/libs/blast.py index e40532a..2cb4609 100644 --- a/fdog/libs/blast.py +++ b/fdog/libs/blast.py @@ -17,6 +17,7 @@ import os import sys +from Bio.Blast.Applications import NcbiblastpCommandline import xml.etree.ElementTree as ET import subprocess @@ -28,21 +29,21 @@ def do_blastsearch( """ Perform blastp search for a query fasta file Return an XML string contains blast result """ - filter_value = "yes" if lowComplexityFilter else "no" + filter = 'no' + if lowComplexityFilter == True: + filter = 'yes' try: - cmd = [ - "blastp", - "-query", query, - "-db", blast_db, - "-evalue", str(evalBlast), - "-seg", filter_value, - "-max_target_seqs", "10", - "-outfmt", "5" - ] - result = subprocess.run(cmd, capture_output=True, text=True, check=True) - return result.stdout - except subprocess.CalledProcessError as e: - sys.exit(f"ERROR: Error running BLASTP search for {query} against {blast_db}\n{e.stderr}") + blastp_cline = NcbiblastpCommandline( 
+ query = query, db = blast_db, evalue = evalBlast, seg = filter, + max_target_seqs = 10, outfmt = 5) + stdout, stderr = blastp_cline() + return(stdout) + except: + sys.exit( + 'ERROR: Error running blastp search for %s against %s\n%s' + % (query, blast_db, NcbiblastpCommandline( + query = query, db = blast_db, evalue = evalBlast, seg = filter, + max_target_seqs = 10, outfmt = 5))) def parse_blast_xml(blast_output): diff --git a/fdog/libs/corecompile.py b/fdog/libs/corecompile.py index 99e9bbd..bd57e07 100644 --- a/fdog/libs/corecompile.py +++ b/fdog/libs/corecompile.py @@ -19,7 +19,7 @@ import os import shutil from pathlib import Path -from ete4 import NCBITaxa +from ete3 import NCBITaxa from Bio import SeqIO import time diff --git a/fdog/libs/preparation.py b/fdog/libs/preparation.py index f18e141..2d79f82 100644 --- a/fdog/libs/preparation.py +++ b/fdog/libs/preparation.py @@ -17,10 +17,10 @@ import sys import os -import subprocess from pathlib import Path from Bio import SeqIO -from ete4 import NCBITaxa +from Bio.Blast.Applications import NcbiblastpCommandline +from ete3 import NCBITaxa import fdog.libs.zzz as general_fn import fdog.libs.fasta as fasta_fn @@ -107,15 +107,17 @@ def check_input(args): def check_blast_version(corepath, refspec): """ Check if blast DBs in corepath is compatible with blastp version """ - fdog_path = os.path.realpath(__file__).replace('/libs/preparation.py', '') - query = os.path.join(fdog_path, 'data', 'infile.fa') - blast_db = os.path.join(corepath, refspec, refspec) + fdog_path = os.path.realpath(__file__).replace('/libs/preparation.py','') + query = fdog_path + '/data/infile.fa' + blast_db = '%s/%s/%s' % (corepath, refspec, refspec) try: - cmd = ["blastp", "-query", query, "-db", blast_db] - result = subprocess.run(cmd, capture_output=True, text=True, check=True) - except subprocess.CalledProcessError as e: - sys.exit(f"ERROR: Error running BLAST (probably conflict with BLAST DB versions)\n{e.stderr}") - + blastp_cline = 
NcbiblastpCommandline( + query = query, db = blast_db) + stdout, stderr = blastp_cline() + except: + sys.exit( + 'ERROR: Error running blast (probably conflict with BLAST DBs versions)\n%s' + % (NcbiblastpCommandline(query = query, db = blast_db))) def check_ranks_core_taxa(corepath, refspec, minDist, maxDist): """ Check if refspec (or all core taxa) have a valid minDist and maxDist tax ID diff --git a/fdog/libs/tree.py b/fdog/libs/tree.py index 57efcaf..b2de19f 100644 --- a/fdog/libs/tree.py +++ b/fdog/libs/tree.py @@ -16,7 +16,7 @@ ####################################################################### import re -from ete4 import NCBITaxa +from ete3 import NCBITaxa import fdog.libs.zzz as general_fn @@ -57,9 +57,8 @@ def get_ancestor(id1, id2, ncbi): Return dictionary {ancestor_id: ancestor_rank} """ tree = ncbi.get_topology([id1, id2], intermediate_nodes = False) - ancestor_name = tree.common_ancestor(id1, id2) - ancestor_id = int(ancestor_name.name) - return(ncbi.get_rank([ancestor_id])) + ancestor = tree.get_common_ancestor(id1, id2).name + return(ncbi.get_rank([ancestor])) def check_common_ancestor(ref_id, ancestor, minDist, maxDist, ncbi): @@ -69,7 +68,6 @@ def check_common_ancestor(ref_id, ancestor, minDist, maxDist, ncbi): """ ref_lineage = ncbi.get_lineage(ref_id) (min_ref, max_ref) = get_rank_range(ref_lineage, minDist, maxDist, ncbi) - ancestor = int(ancestor) if not ancestor in ref_lineage: return(0) ancestor_index = len(ref_lineage) - ref_lineage.index(ancestor) - 1 @@ -80,7 +78,7 @@ def check_common_ancestor(ref_id, ancestor, minDist, maxDist, ncbi): def remove_clade(tree, node_id): """ Remove a clade from a tree """ - removed_clade = list(tree.search_nodes(name = str(node_id)))[0] + removed_clade = tree.search_nodes(name = str(node_id))[0] removed_node = removed_clade.detach() return(tree) @@ -98,12 +96,12 @@ def get_leaves_dict(spec_lineage, tree, min_index, max_index): for i in range(len(spec_lineage)): if i >= min_index and i <= max_index: 
curr_node = spec_lineage[i] - node = list(tree.search_nodes(name = str(curr_node))) + node = tree.search_nodes(name = str(curr_node)) if len(node) > 0: for leaf in node: node_dict[spec_lineage[i]] = [] for t in leaf.traverse(): - if t.is_leaf: + if t.is_leaf(): if not t.name in already_added: already_added.append(t.name) node_dict[spec_lineage[i]].append(t.name) diff --git a/fdog/mergeOutput.py b/fdog/mergeOutput.py index ffebb5c..fa36d4d 100644 --- a/fdog/mergeOutput.py +++ b/fdog/mergeOutput.py @@ -21,7 +21,7 @@ from os import listdir as ldir import argparse import yaml -from importlib.metadata import version, PackageNotFoundError +from pkg_resources import get_distribution from Bio import SeqIO def createConfigPP(phyloprofile, domains_0, ex_fasta, directory, out): @@ -37,8 +37,8 @@ def createConfigPP(phyloprofile, domains_0, ex_fasta, directory, out): def main(): - fdog_version = version("fdog") - parser = argparse.ArgumentParser(description='You are running fDOG version ' + str(fdog_version) + '.') + version = get_distribution('fdog').version + parser = argparse.ArgumentParser(description='You are running fDOG version ' + str(version) + '.') parser.add_argument('-i', '--input', help='Input directory, where all single output (o_g.fa, .phyloprofile, _forward.domains, _reverse.domains) can be found', action='store', default='', required=True) diff --git a/fdog/removefDog.py b/fdog/removefDog.py index bd34c90..9a28b41 100644 --- a/fdog/removefDog.py +++ b/fdog/removefDog.py @@ -20,7 +20,7 @@ import argparse import subprocess import shutil -from importlib.metadata import version, PackageNotFoundError +from pkg_resources import get_distribution import fdog.setupfDog as setupfDog_fn @@ -48,8 +48,8 @@ def query_yes_no(question, default='yes'): def main(): - fdog_version = version("fdog") - parser = argparse.ArgumentParser(description='You are running fDOG version ' + str(fdog_version) + '.') + version = get_distribution('fdog').version + parser = 
argparse.ArgumentParser(description='You are running fDOG version ' + str(version) + '.') parser.add_argument('--all', help='Remove fdog together with all files/data within the installed fdog directory', action='store_true', default=False) args = parser.parse_args() data = args.all diff --git a/fdog/runMulti.py b/fdog/runMulti.py index b1b3587..1bce014 100644 --- a/fdog/runMulti.py +++ b/fdog/runMulti.py @@ -25,8 +25,8 @@ import shutil import multiprocessing as mp from tqdm import tqdm -from ete4 import NCBITaxa -from importlib.metadata import version, PackageNotFoundError +from ete3 import NCBITaxa +from pkg_resources import get_distribution import time import fdog.libs.zzz as general_fn @@ -161,8 +161,8 @@ def join_outputs(outpath, jobName, seeds, keep, silentOff): def main(): - fdog_version = version("fdog") - parser = argparse.ArgumentParser(description='You are running fDOG version ' + str(fdog_version) + '.', + version = get_distribution('fdog').version + parser = argparse.ArgumentParser(description='You are running fDOG version ' + str(version) + '.', epilog="For more information on certain options, please refer to the wiki pages " "on github: https://github.com/BIONF/fDOG/wiki") required = parser.add_argument_group('Required arguments') From 3c4972801419f164c531218315a11f120627d8dc Mon Sep 17 00:00:00 2001 From: Hannah Date: Wed, 10 Sep 2025 21:29:05 +0200 Subject: [PATCH 218/229] Revert "update" This reverts commit 5414d1b0e9596cf4b7e4056e33c0d5c2900744f1. 
--- README.md | 10 +++++----- fdog/runSingle.py | 8 ++++---- fdog/setPaths.py | 6 +++--- fdog/setupfDog.py | 48 +++++++++++++++++++++-------------------------- fdog/showTaxa.py | 2 +- setup.py | 6 +++--- 6 files changed, 37 insertions(+), 43 deletions(-) diff --git a/README.md b/README.md index 654f792..2345438 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ # fDOG - Feature-aware Directed OrtholoG search -[![published in: MBE](https://img.shields.io/badge/published%20in-MBE-ff69b4)](https://doi.org/10.1093/molbev/msaf120) [![PyPI version](https://badge.fury.io/py/fdog.svg)](https://pypi.org/project/fdog/) [![License: GPL v3](https://img.shields.io/badge/License-GPLv3-blue.svg)](https://www.gnu.org/licenses/gpl-3.0) +[![Build Status](https://travis-ci.com/BIONF/fDOG.svg?branch=master)](https://travis-ci.com/BIONF/fDOG) ![Github Build](https://github.com/BIONF/fDOG/workflows/build/badge.svg) # Poster fDOG - Assembly @@ -21,7 +21,7 @@ https://github.com/BIONF/fDOG/blob/gh-pages/www/Poster_fDOG_Assembly.pdf # How to install -*fDOG* tool is distributed as a python package called *fdog*. It is compatible with [Python ≥ v3.12](https://www.python.org/downloads/). +*fDOG* tool is distributed as a python package called *fdog*. It is compatible with [Python ≥ v3.7](https://www.python.org/downloads/). ## Install the fDOG package You can install *fdog* using `pip`: @@ -73,7 +73,7 @@ Please find more information in [our wiki](https://github.com/BIONF/fDOG/wiki) t # fDOG data set -Within the data package we provide a set of [81 reference taxa](https://ftp.ebi.ac.uk/pub/databases/reference_proteomes/QfO/QfO_release_2024_02.tar.gz). They will be automatically downloaded during the setup. This data comes "ready to use" with the *fdog* framework. Species data must be present in the three directories listed below: +Within the data package we provide a set of 78 reference taxa. They can be automatically downloaded during the setup. 
This data comes "ready to use" with the *fdog* framework. Species data must be present in the three directories listed below: * searchTaxa_dir (Contains sub-directories for proteome fasta files for each species) * coreTaxa_dir (Contains sub-directories for BLAST databases made with `makeblastdb` out of your proteomes) @@ -81,7 +81,7 @@ Within the data package we provide a set of [81 reference taxa](https://ftp.ebi. For each species/taxon there is a sub-directory named in accordance to the naming schema ([Species acronym]@[NCBI ID]@[Proteome version]) -*fdog* is not limited to those 81 reference taxa. If needed the user can manually add further gene sets (multiple fasta format) using provided functions. +*fdog* is not limited to those 78 taxa. If needed the user can manually add further gene sets (multiple fasta format) using provided functions. ## Adding a new gene set into fDOG For adding **one gene set**, please use the `fdog.addTaxon` function: @@ -114,7 +114,7 @@ _**NOTE:** After adding new taxa into *fdog*, you should [check for the validity Any bug reports or comments, suggestions are highly appreciated. Please [open an issue on GitHub](https://github.com/BIONF/fDOG/issues/new) or be in touch via email. # How to cite -Tran V, Langschied F, Muelbaier H, Dosch J, Arthen F, Balint M, Ebersberger I. 2025. Feature architecture-aware ortholog search with fDOG reveals the distribution of plant cell wall-degrading enzymes across life. Molecular Biology and Evolution:msaf120. https://doi.org/10.1093/molbev/msaf120 +Ebersberger, I., Strauss, S. & von Haeseler, A. HaMStR: Profile hidden markov model based search for orthologs in ESTs. 
BMC Evol Biol 9, 157 (2009), [doi:10.1186/1471-2148-9-157](https://doi.org/10.1186/1471-2148-9-157) # Contributors - [Ingo Ebersberger](https://github.com/ebersber) diff --git a/fdog/runSingle.py b/fdog/runSingle.py index 9520249..a781a63 100644 --- a/fdog/runSingle.py +++ b/fdog/runSingle.py @@ -19,8 +19,8 @@ import os import argparse import subprocess -from ete4 import NCBITaxa -from importlib.metadata import version, PackageNotFoundError +from ete3 import NCBITaxa +from pkg_resources import get_distribution import time import fdog.libs.zzz as general_fn @@ -33,8 +33,8 @@ def main(): - fdog_version = version("fdog") - parser = argparse.ArgumentParser(description='You are running fDOG version ' + str(fdog_version) + '.', + version = get_distribution('fdog').version + parser = argparse.ArgumentParser(description='You are running fDOG version ' + str(version) + '.', epilog="For more information on certain options, please refer to the wiki pages " "on github: https://github.com/BIONF/fDOG/wiki") required = parser.add_argument_group('Required arguments') diff --git a/fdog/setPaths.py b/fdog/setPaths.py index 7271427..3b6e501 100644 --- a/fdog/setPaths.py +++ b/fdog/setPaths.py @@ -19,7 +19,7 @@ import os import argparse -from importlib.metadata import version, PackageNotFoundError +from pkg_resources import get_distribution import fdog.libs.zzz as general_fn import fdog.checkData as check_data_fn @@ -65,8 +65,8 @@ def check_data(searchpath, corepath, annopath): def main(): - fdog_version = version("fdog") - parser = argparse.ArgumentParser(description='You are running fDOG version ' + str(fdog_version) + '.') + version = get_distribution('fdog').version + parser = argparse.ArgumentParser(description='You are running fDOG version ' + str(version) + '.') required = parser.add_argument_group('required arguments') optional = parser.add_argument_group('optional arguments') required.add_argument('--searchpath', help='Path to search taxa folder (e.g. 
fdog_data/searchTaxa_dir)', action='store', default='', required=True) diff --git a/fdog/setupfDog.py b/fdog/setupfDog.py index 184de6f..2de9c06 100644 --- a/fdog/setupfDog.py +++ b/fdog/setupfDog.py @@ -22,9 +22,9 @@ import argparse import subprocess import shutil -from ete4 import NCBITaxa +from ete3 import NCBITaxa from pathlib import Path -from importlib.metadata import version, PackageNotFoundError +from pkg_resources import get_distribution import fdog.libs.zzz as general_fn import fdog.libs.fas as fas_fn @@ -147,24 +147,25 @@ def check_dependencies(fdogPath): def download_data(dataPath, resetData): """ Downloade pre-calculated fDOG data """ - data_fdog_file = "data_fDOG_2024.tar.gz" + data_fdog_file = "data_HaMStR-2019c.tar.gz" checksum_data = "1748371655 621731824 $data_fdog_file" genome_path = '%s/searchTaxa_dir' % dataPath Path(genome_path).mkdir(parents = True, exist_ok = True) + if len(general_fn.read_dir(genome_path)) < 1 or resetData: data_url = 'https://applbio.biologie.uni-frankfurt.de/download/hamstr_qfo' if os.path.exists(data_fdog_file) and resetData: os.remove(data_fdog_file) - general_fn.download_file(data_url, data_fdog_file) - # ####### temporary solution while the uni network does not work ######### - # wgetCmd = 'wget "https://www.dropbox.com/scl/fi/t2ln18k0jthc3y74s591q/data_HaMStR-2019c.tar.gz?rlkey=c66nc3eslqyn2a6k6ey4e678r&st=plzvbllv&dl=0"' - # try: - # subprocess.run([wgetCmd], shell=True, check=True) - # shutil.move("data_HaMStR-2019c.tar.gz?rlkey=c66nc3eslqyn2a6k6ey4e678r&st=plzvbllv&dl=0", "data_HaMStR-2019c.tar.gz") - # except: - # print('Problem occurred while download demo data from dropbox') - # ######################################################################## + # general_fn.download_file(data_url, data_fdog_file) + ####### temporary solution while the uni network does not work ######### + wgetCmd = 'wget 
"https://www.dropbox.com/scl/fi/t2ln18k0jthc3y74s591q/data_HaMStR-2019c.tar.gz?rlkey=c66nc3eslqyn2a6k6ey4e678r&st=plzvbllv&dl=0"' + try: + subprocess.run([wgetCmd], shell=True, check=True) + shutil.move("data_HaMStR-2019c.tar.gz?rlkey=c66nc3eslqyn2a6k6ey4e678r&st=plzvbllv&dl=0", "data_HaMStR-2019c.tar.gz") + except: + print('Problem occurred while download demo data from dropbox') + ######################################################################## try: print('Extracting %s...' % data_fdog_file) shutil.unpack_archive(data_fdog_file, dataPath, 'gztar') @@ -174,7 +175,7 @@ def download_data(dataPath, resetData): os.rename('%s/genome_dir' % dataPath, '%s/searchTaxa_dir' % dataPath) os.rename('%s/blast_dir' % dataPath, '%s/coreTaxa_dir' % dataPath) os.rename('%s/weight_dir' % dataPath, '%s/annotation_dir' % dataPath) - check_cmd = 'fdog.checkData -s %s/searchTaxa_dir -c %s/coreTaxa_dir -a %s/annotation_dir --reblast --ignoreAnno' % (dataPath, dataPath, dataPath) + check_cmd = 'fdog.checkData -s %s/searchTaxa_dir -c %s/coreTaxa_dir -a %s/annotation_dir --reblast' % (dataPath, dataPath, dataPath) try: print('Checking downloaded data...') subprocess.run([check_cmd], stdout = subprocess.DEVNULL, check = True, shell = True) @@ -200,8 +201,8 @@ def write_pathconfig(fdogPath, dataPath): def main(): - fdog_version = version("fdog") - parser = argparse.ArgumentParser(description='You are running fDOG version ' + str(fdog_version) + '.') + version = get_distribution('fdog').version + parser = argparse.ArgumentParser(description='You are running fDOG version ' + str(version) + '.') required = parser.add_argument_group('required arguments') optional = parser.add_argument_group('optional arguments') required.add_argument('-d', '--dataPath', help='Output path for fDOG data', action='store', default='', required=True) @@ -233,7 +234,7 @@ def main(): ### check if pathconfig file exists pathconfigFile = '%s/bin/pathconfig.yml' % fdogPath - demo_cmd = 'fdog.run --seqFile infile.fa 
--jobName test --refspec HUMAN@9606@qfo24_02' + demo_cmd = 'fdog.run --seqFile infile.fa --jobName test --refspec HUMAN@9606@3' if os.path.exists(pathconfigFile) and not force: check_fas = 1 if not woFAS: @@ -246,7 +247,7 @@ def main(): print('You can test fDOG using the following command:\n%s --fasOff' % demo_cmd) sys.exit() - ### get ncbi taxonomy database for ete4 + ### get ncbi taxonomy database for ete3 print('*** Creating local NCBI taxonomy database...') ncbi = NCBITaxa() @@ -261,18 +262,11 @@ def main(): if check_conda_env() == True: req_file = '%s/data/conda_requirements.yml' % fdogPath print('=> Dependencies in %s' % req_file) - - install_cmd = f'install -c bioconda --file {req_file} -y' - if shutil.which("micromamba"): - install_cmd = f'micromamba {install_cmd}' - elif shutil.which("mamba"): - install_cmd = f'mamba {install_cmd}' - else: - install_cmd = f'conda {install_cmd}' + conda_install_cmd = 'conda install -c bioconda --file %s -y' % (req_file) try: - subprocess.call(install_cmd, shell=True) + subprocess.call([conda_install_cmd], shell = True) except: - sys.exit(f'\033[91mERROR: Cannot install conda packages in {req_file}!\033[0m') + sys.exit('\033[91mERROR: Cannot install conda packages in %s!\033[0m' % req_file) else: install_cmd = 'sudo apt-get install -y -qq ' sys.exit('\033[91mERROR: Please install these tools manually:\n%s\nusing the command: %s!\033[0m' % (', '.join(missing_tools), install_cmd)) diff --git a/fdog/showTaxa.py b/fdog/showTaxa.py index fd41ada..7bb27c4 100644 --- a/fdog/showTaxa.py +++ b/fdog/showTaxa.py @@ -17,7 +17,7 @@ import sys import os -from ete4 import NCBITaxa +from ete3 import NCBITaxa import fdog.libs.zzz as general_fn diff --git a/setup.py b/setup.py index ba8a988..dfe2971 100644 --- a/setup.py +++ b/setup.py @@ -26,8 +26,8 @@ setup( name="fdog", - version="1.0.0", - python_requires='>=3.12.0', + version="0.1.35", + python_requires='>=3.7.0', description="Feature-aware Directed OrtholoG search tool", 
long_description=long_description, long_description_content_type="text/markdown", @@ -39,7 +39,7 @@ install_requires=[ 'biopython', 'tqdm', - 'ete4', + 'ete3', 'six', 'PyYAML', 'pyhmmer', From 26ffa5428b9f2ea4a047e7622b88c5e844a42b44 Mon Sep 17 00:00:00 2001 From: Hannah Date: Wed, 10 Sep 2025 21:29:10 +0200 Subject: [PATCH 219/229] Revert "Update README.md" This reverts commit 5b458ea4c2ec00fa14b0693909002763dbd126d6. --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 2345438..082a46d 100644 --- a/README.md +++ b/README.md @@ -61,7 +61,7 @@ You will get a warning if any of the dependencies are not ready to use, please s *fdog* will run smoothly with the provided sample input file 'infile.fa' if everything is set correctly. ``` -fdog.run --seqFile infile.fa --jobName test --refspec HUMAN@9606@qfo24_02 +fdog.run --seqFile infile.fa --jobName test --refspec HUMAN@9606@3 ``` The output files with the prefix `test` will be saved at your current working directory. 
You can have an overview about all available options with the command From ec1eec6ebf26c1735dcf769e074d2c2ba585d51e Mon Sep 17 00:00:00 2001 From: Hannah Muelbaier <47216555+HannahBioI@users.noreply.github.com> Date: Sun, 14 Sep 2025 16:06:00 +0200 Subject: [PATCH 220/229] fDOG update (#55) * removed bio.applications * version bump * update python version in github build * Update github_build.yml (#50) * replaced pkg_resources by importlib.metadata * replaced pkg_resources by importlib.metadata * Update README.md * replaced bio.applications in checkData * added option to use micromamba and mamba install * add option to use micromamba, mamba or conda * replace ete3 by ete4 * replace ete3 by ete4 * replace ete3 by ete4 * replace ete3 by ete4 * replace ete3 by ete4 * version bump * Update README.md * Update README.md (#53) * Update README.md * Update README.md (#54) * Update README.md --------- Co-authored-by: trvinh --- .github/workflows/github_build.yml | 10 +++---- README.md | 12 ++++---- fdog/addTaxa.py | 7 ++--- fdog/addTaxon.py | 6 ++-- fdog/checkData.py | 25 +++++++++------- fdog/libs/addtaxon.py | 1 - fdog/libs/blast.py | 29 +++++++++--------- fdog/libs/corecompile.py | 2 +- fdog/libs/preparation.py | 22 +++++++------- fdog/libs/tree.py | 14 +++++---- fdog/mergeOutput.py | 6 ++-- fdog/removefDog.py | 6 ++-- fdog/runMulti.py | 8 ++--- fdog/runSingle.py | 8 ++--- fdog/setPaths.py | 6 ++-- fdog/setupfDog.py | 48 +++++++++++++++++------------- fdog/showTaxa.py | 2 +- setup.py | 6 ++-- 18 files changed, 112 insertions(+), 106 deletions(-) diff --git a/.github/workflows/github_build.yml b/.github/workflows/github_build.yml index 10127d8..856dbfb 100644 --- a/.github/workflows/github_build.yml +++ b/.github/workflows/github_build.yml @@ -18,7 +18,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: [3.8] + python-version: [3.12] steps: - uses: actions/checkout@v2 - name: Set up Python ${{ matrix.python-version }} @@ -44,17 +44,17 @@ jobs: echo "TEST 
fdog.setup" fdog.setup -d /home/runner/work/fDOG/fDOG/dt --woFAS echo "TEST fdog.checkData" - fdog.checkData -s /home/runner/work/fDOG/fDOG/dt/searchTaxa_dir -c /home/runner/work/fDOG/fDOG/dt/coreTaxa_dir -a /home/runner/work/fDOG/fDOG/dt/annotation_dir --reblast + fdog.checkData -s /home/runner/work/fDOG/fDOG/dt/searchTaxa_dir -c /home/runner/work/fDOG/fDOG/dt/coreTaxa_dir -a /home/runner/work/fDOG/fDOG/dt/annotation_dir --reblast --ignoreAnno echo "TEST fdog.showTaxa" fdog.showTaxa echo "TEST fdog.run" - fdog.run --seqFile infile.fa --jobName test --refspec HUMAN@9606@3 --fasOff --group mammalia + fdog.run --seqFile infile.fa --jobName test --refspec HUMAN@9606@qfo24_02 --fasOff --group mammalia mkdir seeds path=$(fdog.setup -d ./ --getSourcepath); a="1 2 3"; for i in ${a[@]}; do cp $path/data/infile.fa seeds/$i.fa; done echo "TEST fdogs.run" - fdogs.run --seqFolder seeds --jobName test_multi --refspec HUMAN@9606@3 --fasOff --searchTaxa PARTE@5888@3,THAPS@35128@3 --hmmScoreType sequence + fdogs.run --seqFolder seeds --jobName test_multi --refspec HUMAN@9606@qfo24_02 --fasOff --searchTaxa PARTE@5888@qfo24_02,THAPS@35128@qfo24_02 --hmmScoreType sequence echo "TEST fdog.addTaxon" - head /home/runner/work/fDOG/fDOG/dt/searchTaxa_dir/HUMAN@9606@3/HUMAN@9606@3.fa > hm.fa + head /home/runner/work/fDOG/fDOG/dt/searchTaxa_dir/HUMAN@9606@qfo24_02/HUMAN@9606@qfo24_02.fa > hm.fa fdog.addTaxon -f hm.fa -i 9606 -o ./ -c -a ls - name: Deploy diff --git a/README.md b/README.md index 082a46d..654f792 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ # fDOG - Feature-aware Directed OrtholoG search +[![published in: MBE](https://img.shields.io/badge/published%20in-MBE-ff69b4)](https://doi.org/10.1093/molbev/msaf120) [![PyPI version](https://badge.fury.io/py/fdog.svg)](https://pypi.org/project/fdog/) [![License: GPL v3](https://img.shields.io/badge/License-GPLv3-blue.svg)](https://www.gnu.org/licenses/gpl-3.0) -[![Build 
Status](https://travis-ci.com/BIONF/fDOG.svg?branch=master)](https://travis-ci.com/BIONF/fDOG) ![Github Build](https://github.com/BIONF/fDOG/workflows/build/badge.svg) # Poster fDOG - Assembly @@ -21,7 +21,7 @@ https://github.com/BIONF/fDOG/blob/gh-pages/www/Poster_fDOG_Assembly.pdf # How to install -*fDOG* tool is distributed as a python package called *fdog*. It is compatible with [Python ≥ v3.7](https://www.python.org/downloads/). +*fDOG* tool is distributed as a python package called *fdog*. It is compatible with [Python ≥ v3.12](https://www.python.org/downloads/). ## Install the fDOG package You can install *fdog* using `pip`: @@ -61,7 +61,7 @@ You will get a warning if any of the dependencies are not ready to use, please s *fdog* will run smoothly with the provided sample input file 'infile.fa' if everything is set correctly. ``` -fdog.run --seqFile infile.fa --jobName test --refspec HUMAN@9606@3 +fdog.run --seqFile infile.fa --jobName test --refspec HUMAN@9606@qfo24_02 ``` The output files with the prefix `test` will be saved at your current working directory. You can have an overview about all available options with the command @@ -73,7 +73,7 @@ Please find more information in [our wiki](https://github.com/BIONF/fDOG/wiki) t # fDOG data set -Within the data package we provide a set of 78 reference taxa. They can be automatically downloaded during the setup. This data comes "ready to use" with the *fdog* framework. Species data must be present in the three directories listed below: +Within the data package we provide a set of [81 reference taxa](https://ftp.ebi.ac.uk/pub/databases/reference_proteomes/QfO/QfO_release_2024_02.tar.gz). They will be automatically downloaded during the setup. This data comes "ready to use" with the *fdog* framework. 
Species data must be present in the three directories listed below: * searchTaxa_dir (Contains sub-directories for proteome fasta files for each species) * coreTaxa_dir (Contains sub-directories for BLAST databases made with `makeblastdb` out of your proteomes) @@ -81,7 +81,7 @@ Within the data package we provide a set of 78 reference taxa. They can be autom For each species/taxon there is a sub-directory named in accordance to the naming schema ([Species acronym]@[NCBI ID]@[Proteome version]) -*fdog* is not limited to those 78 taxa. If needed the user can manually add further gene sets (multiple fasta format) using provided functions. +*fdog* is not limited to those 81 reference taxa. If needed the user can manually add further gene sets (multiple fasta format) using provided functions. ## Adding a new gene set into fDOG For adding **one gene set**, please use the `fdog.addTaxon` function: @@ -114,7 +114,7 @@ _**NOTE:** After adding new taxa into *fdog*, you should [check for the validity Any bug reports or comments, suggestions are highly appreciated. Please [open an issue on GitHub](https://github.com/BIONF/fDOG/issues/new) or be in touch via email. # How to cite -Ebersberger, I., Strauss, S. & von Haeseler, A. HaMStR: Profile hidden markov model based search for orthologs in ESTs. BMC Evol Biol 9, 157 (2009), [doi:10.1186/1471-2148-9-157](https://doi.org/10.1186/1471-2148-9-157) +Tran V, Langschied F, Muelbaier H, Dosch J, Arthen F, Balint M, Ebersberger I. 2025. Feature architecture-aware ortholog search with fDOG reveals the distribution of plant cell wall-degrading enzymes across life. Molecular Biology and Evolution:msaf120. 
https://doi.org/10.1093/molbev/msaf120 # Contributors - [Ingo Ebersberger](https://github.com/ebersber) diff --git a/fdog/addTaxa.py b/fdog/addTaxa.py index c03a9a8..e5982c8 100644 --- a/fdog/addTaxa.py +++ b/fdog/addTaxa.py @@ -26,12 +26,11 @@ from Bio import SeqIO import multiprocessing as mp from tqdm import tqdm -from ete3 import NCBITaxa import re import shutil from datetime import datetime import time -from pkg_resources import get_distribution +from importlib.metadata import version, PackageNotFoundError from collections import OrderedDict import fdog.libs.zzz as general_fn @@ -66,8 +65,8 @@ def parse_map_file(mapping_file, folIn): def main(): - version = get_distribution('fdog').version - parser = argparse.ArgumentParser(description='You are running fDOG version ' + str(version) + '.') + fdog_version = version("fdog") + parser = argparse.ArgumentParser(description='You are running fDOG version ' + str(fdog_version) + '.') required = parser.add_argument_group('required arguments') optional = parser.add_argument_group('optional arguments') required.add_argument('-i', '--input', help='Path to input folder', action='store', default='', required=True) diff --git a/fdog/addTaxon.py b/fdog/addTaxon.py index bd17fe0..3751f2e 100755 --- a/fdog/addTaxon.py +++ b/fdog/addTaxon.py @@ -26,7 +26,7 @@ import shutil import multiprocessing as mp from datetime import datetime -from pkg_resources import get_distribution +from importlib.metadata import version, PackageNotFoundError import fdog.libs.zzz as general_fn import fdog.libs.tree as tree_fn @@ -34,8 +34,8 @@ def main(): - version = get_distribution('fdog').version - parser = argparse.ArgumentParser(description='You are running fDOG version ' + str(version) + '.') + fdog_version = version("fdog") + parser = argparse.ArgumentParser(description='You are running fDOG version ' + str(fdog_version) + '.') required = parser.add_argument_group('required arguments') optional = parser.add_argument_group('optional arguments') 
required.add_argument('-f', '--fasta', help='FASTA file of input taxon', action='store', default='', required=True) diff --git a/fdog/checkData.py b/fdog/checkData.py index 74b3b93..755240b 100644 --- a/fdog/checkData.py +++ b/fdog/checkData.py @@ -26,14 +26,12 @@ import subprocess import shutil from Bio import SeqIO -from ete3 import NCBITaxa +from ete4 import NCBITaxa import re from datetime import datetime import multiprocessing as mp from tqdm import tqdm -from pkg_resources import get_distribution -from Bio.Blast.Applications import NcbiblastpCommandline - +from importlib.metadata import version, PackageNotFoundError import fdog.libs.zzz as general_fn import fdog.libs.blast as blast_fn @@ -176,13 +174,18 @@ def run_check_fasta(checkDir, replace, delete, concat): def check_blastdb(args): """ Check for outdated blastdb """ - (query, taxon, coreTaxa_dir, searchTaxa_dir) = args - blast_db = '%s/%s/%s' % (coreTaxa_dir, taxon, taxon) + query, taxon, coreTaxa_dir, searchTaxa_dir = args + blast_db = f"{coreTaxa_dir}/{taxon}/{taxon}" + try: - blastp_cline = NcbiblastpCommandline(query = query, db = blast_db) - stdout, stderr = blastp_cline() - except: + result = subprocess.run( + ["blastp", "-query", query, "-db", blast_db], + capture_output=True, text=True, check=True + ) + return(result.stdout) + except subprocess.CalledProcessError as e: return([query, blast_db]) + fai_in_genome = "%s/%s/%s.fa.fai" % (searchTaxa_dir, taxon, taxon) fai_in_blast = "%s/%s/%s.fa.fai" % (coreTaxa_dir, taxon, taxon) # check if fai_in_blast is a valid symlink @@ -418,8 +421,8 @@ def run_check(args): return(caution) def main(): - version = get_distribution('fdog').version - parser = argparse.ArgumentParser(description='You are running fDOG version ' + str(version) + '.') + fdog_version = version("fdog") + parser = argparse.ArgumentParser(description='You are running fDOG version ' + str(fdog_version) + '.') parser.add_argument('-s', '--searchTaxa_dir', help='Path to search taxa directory 
(e.g. fdog_dataPath/searchTaxa_dir)', action='store', default='') parser.add_argument('-c', '--coreTaxa_dir', help='Path to blastDB directory (e.g. fdog_dataPath/coreTaxa_dir)', action='store', default='') parser.add_argument('-a', '--annotation_dir', help='Path to feature annotation directory (e.g. fdog_dataPath/annotation_dir)', action='store', default='') diff --git a/fdog/libs/addtaxon.py b/fdog/libs/addtaxon.py index 995eeb1..2da4066 100644 --- a/fdog/libs/addtaxon.py +++ b/fdog/libs/addtaxon.py @@ -20,7 +20,6 @@ from pathlib import Path from Bio import SeqIO import subprocess -from ete3 import NCBITaxa import re from datetime import datetime from collections import OrderedDict diff --git a/fdog/libs/blast.py b/fdog/libs/blast.py index 2cb4609..e40532a 100644 --- a/fdog/libs/blast.py +++ b/fdog/libs/blast.py @@ -17,7 +17,6 @@ import os import sys -from Bio.Blast.Applications import NcbiblastpCommandline import xml.etree.ElementTree as ET import subprocess @@ -29,21 +28,21 @@ def do_blastsearch( """ Perform blastp search for a query fasta file Return an XML string contains blast result """ - filter = 'no' - if lowComplexityFilter == True: - filter = 'yes' + filter_value = "yes" if lowComplexityFilter else "no" try: - blastp_cline = NcbiblastpCommandline( - query = query, db = blast_db, evalue = evalBlast, seg = filter, - max_target_seqs = 10, outfmt = 5) - stdout, stderr = blastp_cline() - return(stdout) - except: - sys.exit( - 'ERROR: Error running blastp search for %s against %s\n%s' - % (query, blast_db, NcbiblastpCommandline( - query = query, db = blast_db, evalue = evalBlast, seg = filter, - max_target_seqs = 10, outfmt = 5))) + cmd = [ + "blastp", + "-query", query, + "-db", blast_db, + "-evalue", str(evalBlast), + "-seg", filter_value, + "-max_target_seqs", "10", + "-outfmt", "5" + ] + result = subprocess.run(cmd, capture_output=True, text=True, check=True) + return result.stdout + except subprocess.CalledProcessError as e: + sys.exit(f"ERROR: Error 
running BLASTP search for {query} against {blast_db}\n{e.stderr}") def parse_blast_xml(blast_output): diff --git a/fdog/libs/corecompile.py b/fdog/libs/corecompile.py index bd57e07..99e9bbd 100644 --- a/fdog/libs/corecompile.py +++ b/fdog/libs/corecompile.py @@ -19,7 +19,7 @@ import os import shutil from pathlib import Path -from ete3 import NCBITaxa +from ete4 import NCBITaxa from Bio import SeqIO import time diff --git a/fdog/libs/preparation.py b/fdog/libs/preparation.py index 2d79f82..f18e141 100644 --- a/fdog/libs/preparation.py +++ b/fdog/libs/preparation.py @@ -17,10 +17,10 @@ import sys import os +import subprocess from pathlib import Path from Bio import SeqIO -from Bio.Blast.Applications import NcbiblastpCommandline -from ete3 import NCBITaxa +from ete4 import NCBITaxa import fdog.libs.zzz as general_fn import fdog.libs.fasta as fasta_fn @@ -107,17 +107,15 @@ def check_input(args): def check_blast_version(corepath, refspec): """ Check if blast DBs in corepath is compatible with blastp version """ - fdog_path = os.path.realpath(__file__).replace('/libs/preparation.py','') - query = fdog_path + '/data/infile.fa' - blast_db = '%s/%s/%s' % (corepath, refspec, refspec) + fdog_path = os.path.realpath(__file__).replace('/libs/preparation.py', '') + query = os.path.join(fdog_path, 'data', 'infile.fa') + blast_db = os.path.join(corepath, refspec, refspec) try: - blastp_cline = NcbiblastpCommandline( - query = query, db = blast_db) - stdout, stderr = blastp_cline() - except: - sys.exit( - 'ERROR: Error running blast (probably conflict with BLAST DBs versions)\n%s' - % (NcbiblastpCommandline(query = query, db = blast_db))) + cmd = ["blastp", "-query", query, "-db", blast_db] + result = subprocess.run(cmd, capture_output=True, text=True, check=True) + except subprocess.CalledProcessError as e: + sys.exit(f"ERROR: Error running BLAST (probably conflict with BLAST DB versions)\n{e.stderr}") + def check_ranks_core_taxa(corepath, refspec, minDist, maxDist): """ Check if 
refspec (or all core taxa) have a valid minDist and maxDist tax ID diff --git a/fdog/libs/tree.py b/fdog/libs/tree.py index b2de19f..57efcaf 100644 --- a/fdog/libs/tree.py +++ b/fdog/libs/tree.py @@ -16,7 +16,7 @@ ####################################################################### import re -from ete3 import NCBITaxa +from ete4 import NCBITaxa import fdog.libs.zzz as general_fn @@ -57,8 +57,9 @@ def get_ancestor(id1, id2, ncbi): Return dictionary {ancestor_id: ancestor_rank} """ tree = ncbi.get_topology([id1, id2], intermediate_nodes = False) - ancestor = tree.get_common_ancestor(id1, id2).name - return(ncbi.get_rank([ancestor])) + ancestor_name = tree.common_ancestor(id1, id2) + ancestor_id = int(ancestor_name.name) + return(ncbi.get_rank([ancestor_id])) def check_common_ancestor(ref_id, ancestor, minDist, maxDist, ncbi): @@ -68,6 +69,7 @@ def check_common_ancestor(ref_id, ancestor, minDist, maxDist, ncbi): """ ref_lineage = ncbi.get_lineage(ref_id) (min_ref, max_ref) = get_rank_range(ref_lineage, minDist, maxDist, ncbi) + ancestor = int(ancestor) if not ancestor in ref_lineage: return(0) ancestor_index = len(ref_lineage) - ref_lineage.index(ancestor) - 1 @@ -78,7 +80,7 @@ def check_common_ancestor(ref_id, ancestor, minDist, maxDist, ncbi): def remove_clade(tree, node_id): """ Remove a clade from a tree """ - removed_clade = tree.search_nodes(name = str(node_id))[0] + removed_clade = list(tree.search_nodes(name = str(node_id)))[0] removed_node = removed_clade.detach() return(tree) @@ -96,12 +98,12 @@ def get_leaves_dict(spec_lineage, tree, min_index, max_index): for i in range(len(spec_lineage)): if i >= min_index and i <= max_index: curr_node = spec_lineage[i] - node = tree.search_nodes(name = str(curr_node)) + node = list(tree.search_nodes(name = str(curr_node))) if len(node) > 0: for leaf in node: node_dict[spec_lineage[i]] = [] for t in leaf.traverse(): - if t.is_leaf(): + if t.is_leaf: if not t.name in already_added: already_added.append(t.name) 
node_dict[spec_lineage[i]].append(t.name) diff --git a/fdog/mergeOutput.py b/fdog/mergeOutput.py index fa36d4d..ffebb5c 100644 --- a/fdog/mergeOutput.py +++ b/fdog/mergeOutput.py @@ -21,7 +21,7 @@ from os import listdir as ldir import argparse import yaml -from pkg_resources import get_distribution +from importlib.metadata import version, PackageNotFoundError from Bio import SeqIO def createConfigPP(phyloprofile, domains_0, ex_fasta, directory, out): @@ -37,8 +37,8 @@ def createConfigPP(phyloprofile, domains_0, ex_fasta, directory, out): def main(): - version = get_distribution('fdog').version - parser = argparse.ArgumentParser(description='You are running fDOG version ' + str(version) + '.') + fdog_version = version("fdog") + parser = argparse.ArgumentParser(description='You are running fDOG version ' + str(fdog_version) + '.') parser.add_argument('-i', '--input', help='Input directory, where all single output (o_g.fa, .phyloprofile, _forward.domains, _reverse.domains) can be found', action='store', default='', required=True) diff --git a/fdog/removefDog.py b/fdog/removefDog.py index 9a28b41..bd34c90 100644 --- a/fdog/removefDog.py +++ b/fdog/removefDog.py @@ -20,7 +20,7 @@ import argparse import subprocess import shutil -from pkg_resources import get_distribution +from importlib.metadata import version, PackageNotFoundError import fdog.setupfDog as setupfDog_fn @@ -48,8 +48,8 @@ def query_yes_no(question, default='yes'): def main(): - version = get_distribution('fdog').version - parser = argparse.ArgumentParser(description='You are running fDOG version ' + str(version) + '.') + fdog_version = version("fdog") + parser = argparse.ArgumentParser(description='You are running fDOG version ' + str(fdog_version) + '.') parser.add_argument('--all', help='Remove fdog together with all files/data within the installed fdog directory', action='store_true', default=False) args = parser.parse_args() data = args.all diff --git a/fdog/runMulti.py b/fdog/runMulti.py index 
1bce014..b1b3587 100644 --- a/fdog/runMulti.py +++ b/fdog/runMulti.py @@ -25,8 +25,8 @@ import shutil import multiprocessing as mp from tqdm import tqdm -from ete3 import NCBITaxa -from pkg_resources import get_distribution +from ete4 import NCBITaxa +from importlib.metadata import version, PackageNotFoundError import time import fdog.libs.zzz as general_fn @@ -161,8 +161,8 @@ def join_outputs(outpath, jobName, seeds, keep, silentOff): def main(): - version = get_distribution('fdog').version - parser = argparse.ArgumentParser(description='You are running fDOG version ' + str(version) + '.', + fdog_version = version("fdog") + parser = argparse.ArgumentParser(description='You are running fDOG version ' + str(fdog_version) + '.', epilog="For more information on certain options, please refer to the wiki pages " "on github: https://github.com/BIONF/fDOG/wiki") required = parser.add_argument_group('Required arguments') diff --git a/fdog/runSingle.py b/fdog/runSingle.py index a781a63..9520249 100644 --- a/fdog/runSingle.py +++ b/fdog/runSingle.py @@ -19,8 +19,8 @@ import os import argparse import subprocess -from ete3 import NCBITaxa -from pkg_resources import get_distribution +from ete4 import NCBITaxa +from importlib.metadata import version, PackageNotFoundError import time import fdog.libs.zzz as general_fn @@ -33,8 +33,8 @@ def main(): - version = get_distribution('fdog').version - parser = argparse.ArgumentParser(description='You are running fDOG version ' + str(version) + '.', + fdog_version = version("fdog") + parser = argparse.ArgumentParser(description='You are running fDOG version ' + str(fdog_version) + '.', epilog="For more information on certain options, please refer to the wiki pages " "on github: https://github.com/BIONF/fDOG/wiki") required = parser.add_argument_group('Required arguments') diff --git a/fdog/setPaths.py b/fdog/setPaths.py index 3b6e501..7271427 100644 --- a/fdog/setPaths.py +++ b/fdog/setPaths.py @@ -19,7 +19,7 @@ import os import argparse 
-from pkg_resources import get_distribution +from importlib.metadata import version, PackageNotFoundError import fdog.libs.zzz as general_fn import fdog.checkData as check_data_fn @@ -65,8 +65,8 @@ def check_data(searchpath, corepath, annopath): def main(): - version = get_distribution('fdog').version - parser = argparse.ArgumentParser(description='You are running fDOG version ' + str(version) + '.') + fdog_version = version("fdog") + parser = argparse.ArgumentParser(description='You are running fDOG version ' + str(fdog_version) + '.') required = parser.add_argument_group('required arguments') optional = parser.add_argument_group('optional arguments') required.add_argument('--searchpath', help='Path to search taxa folder (e.g. fdog_data/searchTaxa_dir)', action='store', default='', required=True) diff --git a/fdog/setupfDog.py b/fdog/setupfDog.py index 2de9c06..184de6f 100644 --- a/fdog/setupfDog.py +++ b/fdog/setupfDog.py @@ -22,9 +22,9 @@ import argparse import subprocess import shutil -from ete3 import NCBITaxa +from ete4 import NCBITaxa from pathlib import Path -from pkg_resources import get_distribution +from importlib.metadata import version, PackageNotFoundError import fdog.libs.zzz as general_fn import fdog.libs.fas as fas_fn @@ -147,25 +147,24 @@ def check_dependencies(fdogPath): def download_data(dataPath, resetData): """ Downloade pre-calculated fDOG data """ - data_fdog_file = "data_HaMStR-2019c.tar.gz" + data_fdog_file = "data_fDOG_2024.tar.gz" checksum_data = "1748371655 621731824 $data_fdog_file" genome_path = '%s/searchTaxa_dir' % dataPath Path(genome_path).mkdir(parents = True, exist_ok = True) - if len(general_fn.read_dir(genome_path)) < 1 or resetData: data_url = 'https://applbio.biologie.uni-frankfurt.de/download/hamstr_qfo' if os.path.exists(data_fdog_file) and resetData: os.remove(data_fdog_file) - # general_fn.download_file(data_url, data_fdog_file) - ####### temporary solution while the uni network does not work ######### - wgetCmd = 'wget 
"https://www.dropbox.com/scl/fi/t2ln18k0jthc3y74s591q/data_HaMStR-2019c.tar.gz?rlkey=c66nc3eslqyn2a6k6ey4e678r&st=plzvbllv&dl=0"' - try: - subprocess.run([wgetCmd], shell=True, check=True) - shutil.move("data_HaMStR-2019c.tar.gz?rlkey=c66nc3eslqyn2a6k6ey4e678r&st=plzvbllv&dl=0", "data_HaMStR-2019c.tar.gz") - except: - print('Problem occurred while download demo data from dropbox') - ######################################################################## + general_fn.download_file(data_url, data_fdog_file) + # ####### temporary solution while the uni network does not work ######### + # wgetCmd = 'wget "https://www.dropbox.com/scl/fi/t2ln18k0jthc3y74s591q/data_HaMStR-2019c.tar.gz?rlkey=c66nc3eslqyn2a6k6ey4e678r&st=plzvbllv&dl=0"' + # try: + # subprocess.run([wgetCmd], shell=True, check=True) + # shutil.move("data_HaMStR-2019c.tar.gz?rlkey=c66nc3eslqyn2a6k6ey4e678r&st=plzvbllv&dl=0", "data_HaMStR-2019c.tar.gz") + # except: + # print('Problem occurred while download demo data from dropbox') + # ######################################################################## try: print('Extracting %s...' 
% data_fdog_file) shutil.unpack_archive(data_fdog_file, dataPath, 'gztar') @@ -175,7 +174,7 @@ def download_data(dataPath, resetData): os.rename('%s/genome_dir' % dataPath, '%s/searchTaxa_dir' % dataPath) os.rename('%s/blast_dir' % dataPath, '%s/coreTaxa_dir' % dataPath) os.rename('%s/weight_dir' % dataPath, '%s/annotation_dir' % dataPath) - check_cmd = 'fdog.checkData -s %s/searchTaxa_dir -c %s/coreTaxa_dir -a %s/annotation_dir --reblast' % (dataPath, dataPath, dataPath) + check_cmd = 'fdog.checkData -s %s/searchTaxa_dir -c %s/coreTaxa_dir -a %s/annotation_dir --reblast --ignoreAnno' % (dataPath, dataPath, dataPath) try: print('Checking downloaded data...') subprocess.run([check_cmd], stdout = subprocess.DEVNULL, check = True, shell = True) @@ -201,8 +200,8 @@ def write_pathconfig(fdogPath, dataPath): def main(): - version = get_distribution('fdog').version - parser = argparse.ArgumentParser(description='You are running fDOG version ' + str(version) + '.') + fdog_version = version("fdog") + parser = argparse.ArgumentParser(description='You are running fDOG version ' + str(fdog_version) + '.') required = parser.add_argument_group('required arguments') optional = parser.add_argument_group('optional arguments') required.add_argument('-d', '--dataPath', help='Output path for fDOG data', action='store', default='', required=True) @@ -234,7 +233,7 @@ def main(): ### check if pathconfig file exists pathconfigFile = '%s/bin/pathconfig.yml' % fdogPath - demo_cmd = 'fdog.run --seqFile infile.fa --jobName test --refspec HUMAN@9606@3' + demo_cmd = 'fdog.run --seqFile infile.fa --jobName test --refspec HUMAN@9606@qfo24_02' if os.path.exists(pathconfigFile) and not force: check_fas = 1 if not woFAS: @@ -247,7 +246,7 @@ def main(): print('You can test fDOG using the following command:\n%s --fasOff' % demo_cmd) sys.exit() - ### get ncbi taxonomy database for ete3 + ### get ncbi taxonomy database for ete4 print('*** Creating local NCBI taxonomy database...') ncbi = NCBITaxa() @@ 
-262,11 +261,18 @@ def main(): if check_conda_env() == True: req_file = '%s/data/conda_requirements.yml' % fdogPath print('=> Dependencies in %s' % req_file) - conda_install_cmd = 'conda install -c bioconda --file %s -y' % (req_file) + + install_cmd = f'install -c bioconda --file {req_file} -y' + if shutil.which("micromamba"): + install_cmd = f'micromamba {install_cmd}' + elif shutil.which("mamba"): + install_cmd = f'mamba {install_cmd}' + else: + install_cmd = f'conda {install_cmd}' try: - subprocess.call([conda_install_cmd], shell = True) + subprocess.call(install_cmd, shell=True) except: - sys.exit('\033[91mERROR: Cannot install conda packages in %s!\033[0m' % req_file) + sys.exit(f'\033[91mERROR: Cannot install conda packages in {req_file}!\033[0m') else: install_cmd = 'sudo apt-get install -y -qq ' sys.exit('\033[91mERROR: Please install these tools manually:\n%s\nusing the command: %s!\033[0m' % (', '.join(missing_tools), install_cmd)) diff --git a/fdog/showTaxa.py b/fdog/showTaxa.py index 7bb27c4..fd41ada 100644 --- a/fdog/showTaxa.py +++ b/fdog/showTaxa.py @@ -17,7 +17,7 @@ import sys import os -from ete3 import NCBITaxa +from ete4 import NCBITaxa import fdog.libs.zzz as general_fn diff --git a/setup.py b/setup.py index dfe2971..ba8a988 100644 --- a/setup.py +++ b/setup.py @@ -26,8 +26,8 @@ setup( name="fdog", - version="0.1.35", - python_requires='>=3.7.0', + version="1.0.0", + python_requires='>=3.12.0', description="Feature-aware Directed OrtholoG search tool", long_description=long_description, long_description_content_type="text/markdown", @@ -39,7 +39,7 @@ install_requires=[ 'biopython', 'tqdm', - 'ete3', + 'ete4', 'six', 'PyYAML', 'pyhmmer', From 5831add49f8601e548fd3b3fbe7a067cd082e586 Mon Sep 17 00:00:00 2001 From: Hannah Muelbaier <47216555+HannahBioI@users.noreply.github.com> Date: Sun, 14 Sep 2025 19:52:56 +0200 Subject: [PATCH 221/229] added test Assembly for fDOG-Assembly --- .../HOMSA@9606@v1/HOMSA@9606@v1.fa | 702 ++++++++++++++++++ 
.../blast_dir/HOMSA@9606@v1.fa.ndb | Bin 0 -> 32768 bytes .../blast_dir/HOMSA@9606@v1.fa.nhr | Bin 0 -> 75 bytes .../blast_dir/HOMSA@9606@v1.fa.nin | Bin 0 -> 160 bytes .../blast_dir/HOMSA@9606@v1.fa.njs | 24 + .../blast_dir/HOMSA@9606@v1.fa.nog | Bin 0 -> 36 bytes .../blast_dir/HOMSA@9606@v1.fa.nos | Bin 0 -> 27 bytes .../blast_dir/HOMSA@9606@v1.fa.not | Bin 0 -> 20 bytes .../blast_dir/HOMSA@9606@v1.fa.nsq | Bin 0 -> 14002 bytes .../blast_dir/HOMSA@9606@v1.fa.ntf | Bin 0 -> 16384 bytes .../blast_dir/HOMSA@9606@v1.fa.nto | Bin 0 -> 8 bytes 11 files changed, 726 insertions(+) create mode 100644 fdog/data/assembly_dir/HOMSA@9606@v1/HOMSA@9606@v1.fa create mode 100644 fdog/data/assembly_dir/HOMSA@9606@v1/blast_dir/HOMSA@9606@v1.fa.ndb create mode 100644 fdog/data/assembly_dir/HOMSA@9606@v1/blast_dir/HOMSA@9606@v1.fa.nhr create mode 100644 fdog/data/assembly_dir/HOMSA@9606@v1/blast_dir/HOMSA@9606@v1.fa.nin create mode 100644 fdog/data/assembly_dir/HOMSA@9606@v1/blast_dir/HOMSA@9606@v1.fa.njs create mode 100644 fdog/data/assembly_dir/HOMSA@9606@v1/blast_dir/HOMSA@9606@v1.fa.nog create mode 100644 fdog/data/assembly_dir/HOMSA@9606@v1/blast_dir/HOMSA@9606@v1.fa.nos create mode 100644 fdog/data/assembly_dir/HOMSA@9606@v1/blast_dir/HOMSA@9606@v1.fa.not create mode 100644 fdog/data/assembly_dir/HOMSA@9606@v1/blast_dir/HOMSA@9606@v1.fa.nsq create mode 100644 fdog/data/assembly_dir/HOMSA@9606@v1/blast_dir/HOMSA@9606@v1.fa.ntf create mode 100644 fdog/data/assembly_dir/HOMSA@9606@v1/blast_dir/HOMSA@9606@v1.fa.nto diff --git a/fdog/data/assembly_dir/HOMSA@9606@v1/HOMSA@9606@v1.fa b/fdog/data/assembly_dir/HOMSA@9606@v1/HOMSA@9606@v1.fa new file mode 100644 index 0000000..41a868a --- /dev/null +++ b/fdog/data/assembly_dir/HOMSA@9606@v1/HOMSA@9606@v1.fa @@ -0,0 +1,702 @@ +>CM000680.2 79953000-80009000 +CTCCTGCCTTGCGCCCCCAGGCTGGCGTTGGAACTTCTTTAGAGCCAGCGCTGCGGAGCCCCAGCACCAGGAACGTAGAG +GTGAGTAAAAGGCACAGCAGGAAGCCTGGGAGAGAAAAAGCAGCTGCCCCTGCTCAGCCGCGCCCTAGCCTGAGCCCAGC 
+AGCCTTTGGGATCATCTGTCCTCTCTGCCCTGGAGGTTGGAGTAAGCTCCGGACCTCCTTGACCCTCACGGGAGCACTGG +GGTCCCAGACGCTATCCTCGGTTATCTTAAATCCAGAGGCTGTGGAGGCATGGGGCAAATTGTGCCCGCAGCCTGGGGCC +TAGAGCTGCATCTGTGCACGGGGCCTCCAGGAGCCCGGCTCAGTGGACGAAATCTGGCATGCTCGGACGTCACCCCACAG +GCCTCACGCCTCAGAGGCACCCCGAAAGGTGAGTTGATGACTGCAGAGTGAGTTTGTACCTTTTCGAACACCACCTGCCG +GGGGTCcgagcccccacccccagtctACATCAgccaatcacagctcattgcaggtGCTGCGCTTGTCACATAGCTTCCTG +CAAACAAGAGAAGCCGACCAAGCCATGCACAGCTGGAAAATggcccttataaagccatcaagTCCTAATCCAGGCAACCT +GTAAATATTACCTCGTAtgggaaaacaatttttttttctaatgtcattaaggatcttgagatagagagattgtcctggat +tatccaggtgggccctaaacactgtggctcacgcctataatcccagcactttgggagcctgaggcaggtaaatctcttga +gcccaggagttcgagaccagcctgagcaacatggtgagacctcgtctctacaaaagatagaaaaattatctgggcatggt +agtgtgcacctgtagtcccagctactcctgaggtggaaggattgcttgaggccaggaggttgaggcagcagtgagccagg +atcgcaccactgcactctagcctcggtgacaaagtgagaccctatctcaaaaataaatacatacaacagagacagggtct +ctgtgttggccaggctggtcttgaactcctggcctcaagcgatcctcctgccttggcctctcaaagtgctgggatgacag +gcatgagccagcgtgcccggccACATGCATCTTtgtaagagggaggcagaaggaaatTTGGTGCCACACAGAAGAGAGGG +CCATGGGAAGATGGAGCAGAGATGAAGATGGGGCCTTGAAAATTGACGTGATGCAGTCCCAAGCCAAGTAACGCCGGAGC +CACCAGAGGCCCCAGAAACAAGGAGCAGATTCTCCCCAGGAGCCATGCGAGGGAGCCTGgccctgccaatgccttgattt +cAGCACATTAAAAGtgattttggacttttggcctccagagctgtgaagGAATAAACTTCTGCTGTTGTAAAAGCCACTTA +GCTGTGGTCATTTGTTGCAGGAGCCAATGAGTGACTGACAAGATCAGAGCTGTGACACAGAGGAAGCAGAGCTGTGCCTC +ATGCTTTCCAGGCTCTGCTCATGAGAAACAGGTCACTGGGTTGGCCCGGTGCAGATTAAGGTGTCACGTAGGAGCATGAA +TACCTCAGGGAACCACTTAAGAATTTGCCTAACACATGGTGCttataaaaggaaaagcaaacaaagcAACCTTGTTCTGA +GAGCTGGGACATCAGAGGAAGATTGGAGCCACTGTTTCTCACTCagcctatttctttttattttatttatttattttata +gagacggggagctcactatgttggccagtttggtctttgaactcctggcttcaaaggatcctctcacctaggcctcccaa +agtgctgggattacaggagtgagccaccttgctGGTCGCGCACCCCATTTTTTAAGGAAACCTGTACGGAAGGTGGATTG +GAAGTCTCACTTTAGGAGCCTGGCTCAGAAGTTCCGCAACTTCAACTGGTGTCATGTGACGTGCTTGGCAGTCACCACCC +CCATCCTTACAGCAAAGAATAGCAGGGCAGCCTTTCATGGGCCCATCAGAGAACTAAGGCTGCCAGACAAACTGGCACCC 
+TGCTCTGAAGAGACAGGCACATCCAGGGAGAGACAGCACCTGAGAGCTGCTCACCCAGAGCAGAAACTCCTGGATAAACT +GGCTTTAGCAGAAAAATTAGACAACATGCAAGACCAGATAGGTGATATCAGcagaaagatggaaacaataataaaaaaaa +aatgctagaagtcaaaaacacaatagaaataaagaatgctCATTAGTTGACTAgacagaatgaaggaaagaatcagtgaa +tatGAAGATAAGTCAAcagaaacttcccaaactgaaattcaaGGAGAAAaggtgatgaaaaaaaaaaaaatcaactgatc +TAGTTTGAgtgtgtgtccccaccaaatctcatgttgagttatAATCACCagttttggaggtggggcttggcagggggtgt +ttggatcatgggggcgggttcctcatgaatggcttgcaCCATCGCTTCATCCCCTTGTGATGAGTAagctctcactctga +gttcatgtgagatctgtttttgtttttgtttttgttttgagacggagtctcgttctgttgcccagactggagtgcagtgg +cacagcacagtctcagcttactgcaacctcctgggtttaagtgattcttgtgcctcagtctcccgagtagctgggattac +aggcttgtgccactatgcctggctaatttattgtatttttagtagagatggggtttcaccatgttgaccaggctggtctc +aaactcctgacctcttgatccacccacctcggcctcccaaagtgctgcgattataggcatgagccaccgcgcccgcctga +gatctgtttttctttaaagtgtCTGGCACCTACCCAcaccctcactctctctctctcttgctcctgcttttaccGTGTga +aatgcctgctcccactttgccttccgccatgagtaaaagttccctgaggcccccaagaagctgagcagatgctggtgcca +tgctggTATGGCCTgcaagaactgtgagccaattaaatctcttctctttataaattactctgtctcaggtatttctttat +agcaatgcaagaacagcctaatacaccaACAGAAAACACCATCCAAGAACAACATCAAAGGATATGATTGGTACAATGGG +AGAATcggaaggaggaaagaggagaaggggCAGGAGAAATGGTGGAATGGCAAGAATGTCCCCaaattaatgacagacac +caAACCATAGATCCAGGAAACATACAGAACATCCATCAggacaaataacaaaaaacataaagcaaaacaaaacaaaaaac +cacacctAGGCTTGTCACATTCAAACTGAATAAACCTGCCTGACTGAACCCTCGTCCACCAGCCACAGCTACACCTCTGA +CAAGACAAGAGACAGATTTCAGTAACTCTCCTGGTAAGAGACCACTGACCATGGCTGGTCCCGGCTGGTTTACAAACTCT +GTGCACCAAGTGCCTTTGAGTCTTAGAAAGACCTTTTGATGTATAGGGCCTAACTGTAATACATGTAAATGTTACATCTc +taccccaaagtgaacatgggtcaTGTGATACATACAGCAAGACccccttcatgaatattcatagctcctcatGTAACCTA +TTAAGTATATTTAGCCAACCAAGTCAGCATAAAGCTCGTGCCCCATCCTCTCCTCCTCTGAAGGGCCTGTGTCTGGTCTT +GTGGGATGTCTATCTTGCaagttatttaagaaatagtCTCCTTTTCTAAATGCAGTGTTGTGTGATTTTTTAAGTTAACA +AGAggaatatgaataaaaatgaccGTGGACTTCTTGTTAGAAATCATGCGGATAACAAAAGGGTAgagtgaaatcttttt +ttttttttttttttgagacagagtctcgctctgttgtccaggctggagtgcagtggcgcaatctcggctcactacaagct 
+ccgcctcccgggttcacgccattctcctgtctcagcctcccgagtagctgggactacaggcacccgccaccacatccggc +taatttttttgtatttttagtagagacggggtttcacagtgttagccaggatggtctcgatctcctgacctcgtgatctg +cctgcctcggcctcccaaagtgttgggattacaggcgtgagccactgcgcctggccgtgaAATCCTTAAAGTGttgcagt +tgaaaaaaaaaaaatctgcccacCTAGAATTCCATATAGCAAAAGTATCCTTAAGGtgcaggaaaaataaagactttttc +agaaaaacaaaaactgagggaatgtATTGCCGGTAGACCTGCCCTGTGAGAAATGTTAGAGAAGAGACAAGGAAGATGAT +ACAGGTCAGAGGCTTAGATCTACATCAGGAAAGGAAAAGCATCAGAGGCAGAATAAATGAAGAGAATTAAtgtctcttgt +ttttcttttttttctttcttttttttttttttaagacagagttttgctcttgttgcccaggatggagtgcaatggcgcaa +tctcagctcactgcaacctccacctcccaggttcaagcaattctgcctcagcttcctgagtagcttggctTACAaggaca +tgccaccatgcccagctaattttttttttttttttttttttttgagacagagtctcgctctgtcacccaggctggagtgc +agtggtgcaatatcagctaactgcaagctccgcctcccgggttcacgccattctcctgcctcagcctctcaggtgctgga +actacaggcgcccgccaccgcgcccggctaattttttgtatttttagtagagatggggtttcactgtgttagccaggatg +gtctcgatctcctgacctcgtgacctgcccaccttggcctcccaaagtgctgggattacaggcgtgagccaccgtgcccg +gcctataattttttgtatttttagtagagaccgggtttcgccatgttcgccaggctggtctcgaactcctgtcctcaggt +aaTCAACCCATCTCAGTCtcgcagagtgctgggattatacgcgtgAACCACCTcgttattcttttttgtttttgaggcgg +agtcttgctctgttgcccaggctggagtgcagtggcgcgatctcagctcactgcaagctccgcctcccgggttcaggcca +ttcttctgcctcagccttccgagtagctgggactacaggcgcccaccaccatgcccagctaatcttttgtatttttatag +agacggggtttcactgtgttagccaggatggtcttgatctcctgacctcatgatccgcccgcctcggcctaccaaagtgc +tgggattacaggtgtgagccaccgcgcccggccaacctcgttattcttaattgatctaaagACAACTGTTAAAAGCCATA +AAGTAACAATGTACTGGGTGATTACAGCTTGTGGACAAAATGTCACAAGGGACAGTGGGAGGAGCTGGGACTTTTCTGAT +GTAAAATACTTCCATTGTGCTGCCCTCCCTACCACGGGTCTAAAGCTATATTCCACAGCTTAAGCCACTTGGTCACACAC +TGAAACCCCTGGGCAGCTTTAAAACCACAAATGCTCAAGTGCCCGGGCCCACCCGCAGAGCACCGATAAAGGGGACTGTG +CCAGTCAGGGTAGCCTTCTAAGCCTTCCCCAGAAGTTCTGATGTGCAGCCATATTTGAGAgccactgttttaaaattcac +acCTACAAAAGCTGCATGGTACAGGAGGTCCTCATTACCCACAATCATGACTTCTTGGAAAGACCACTTACTCAAACAGG +ACCAGAATTGCATAGTAAATAATTCATGGGATTAGAAAATAGTTGGTTGATATTGCATTAACAATGAAAATTAAACTTAT 
+GAAAATATTGcaattattttagaataagtaaAACAGagcgggccaggtgcggtggctcactcctataatcccaagacttt +gggaggctcaggtgggtggatcccttgagctcaggagttcaagaccagcctggtcaacatagtgagaccccccatctcta +ctaaaaatacaaaaattagtggggtacattggtgtgcatctgtagtcccagctactcaggggtctgaggtgggaggatca +cctgagcccaggaggtcaaggctgcagtgagtcgtaattctgccattgcactccagcctgggcaacagagcaagaccctg +tctcaaaaaaagaaaggagcaacAGTCTCTACTCAGATCTGTATACCCAAAtctatacccaaaagaactgcttttatata +atttttaaataaaacttttcaaacACAAAAGAGCGTAACATAATGAAAGGCATGTATCCATCAACCAGCATCAATAATTA +TAAAATCGCTCTTTTAATGAGTATAAAATAAGAACTGAACTTAACGTTTAATTGGCAGCGTTTCTCCATTTCTGATAATA +ACAGTGCACCCTAAAATCAGCAGCGTCTTAGATTTAGGGCCTCAGAAACAAGGCCACCCACACTGGATCCGAGGGGCAAC +TGGGCACCTCCATGGACACCCATGCTTCATAAAGACAGCTGCACCCTACACTGGGTTGGACTGGGACGTTTTCAGCTATC +CAAAATAAAACGTTGGTTTTCTAATTTAAAGCTTTGTTTCATGTCTACATTCTGCTCTTGTAGTCTGCATCTTCGCTCAC +GTAATTAGCTATTTCAGTCTTACCCAGACTGTTTTCTTCCCAATTGTCTGTCCTCACACCAGGGAAAgactggctaattt +ctgtaaCTTCAGAATACAcattaatattctttaatatttagaTTCCCATGGTCAATTATTCATCCTGGGCTAGTACTCGG +CAGTTTCTCTTTTGTCAGTTCATGCGCCACCCAGGCAGCATATGCTGAGGAAGGCACAATTTCTACATTTATCCTAAGCC +AAACATGAGCAGGAAAACTGAGAGCTATTCTAAACATAATTTTATCGGGTCAAAAAGAAACAACTGCTAGGCCGATCTGG +CCGATGACTCCTCCAGTAGCTGTGAAATCGAGAATGCACATCATTTTCTTGCTGTGTTCCTAAGGTAGAGGAGACCACTT +TGAAGACCCGCAGTCCATATTGGCATTGTAGAATTTGAGCAGAGggccattttgttctgtactaagaaaaattcttctgc +cttgagatgctgctaatctgtaaccctacccccaaccctgtgctccctgaaacacgtgctgtgtcaactcagggttaaat +ggattaagggctgtgcaggatgtgctttgttaaacaaatgcttgaaggcagcttgctcgttaagagtcatcaccactccc +taacctcaaaccactccctaatctcaagtactcagagacacaaaacactgcggaaggctgcagggacctctgcctgggaa +agccaggtattgtccaaggtttctccccatgtgatagtctgaaatatggcctcatgggaagggaaagacctgaccgtccc +ccagcccgacacccgtaaagggtctttGCTgagaggattagtaaaagaggaaagaacgCCTCTTTGaggttgagataaga +ggaaggcttctgtctcctgctcgtccctgggcaatgaaatgtctcggtgtaaagccaattgtatattccatctactgaga +taggggaaaaccgccttagggatggaggtgggacatgctggcagcaatactgctcctTAAGGCATTGAGATATTCGtgta +tatgcacatcaaaagcacagcacttttttctttacgttgtttatgatgcagagacatttgttcacgtTTTTACcttctga 
+ccttctctccactattatcctattaTCCTGCTGCCACGcccgataatgatcaataaatactaagggaactcagaggccgg +tgccaGTGTGGATCCGTATGCTGaacgccggtcccctgggccccctttttgtttctttatactttgtgtctctttctttt +ccaagtctctcgttccacctaacAAGAAACatccacaggtgtggaggggcaacccaccccttcactgTGTTGTCTCCATC +AGATCCAGCTGAAGAGCTTTGCTCGAAAGCCAGTCCTTGGCCAGCATGGCCTCGTGCCTGTAAGTCCAGcgctttggggg +gccgaggtgggaggatcacttgagcccaggagtttgagaccagccctggcaacatatggagatccctgtctctagaaaaa +taataaaaaatagtagccgggcatggtggcgtgcctctgtagtcccacctactcgggaggctgaggcaggaggatccctt +gagcccagttcaaggctacagtgagctatgattgcaccactgcactccagcctgggtgacacagcaaaacccaatcccag +aaaaaagaaaggcagtcCTGTCATCTCCACAGACCACCCGGCCTATGTCGGTGAGGCCTGAGGCTGCTGTGTGCGGCGAT +GCTGGGTGGTCCTACTCGGGCTGCCCAAGAAGCCTGACCTGAAAAAGCAGCCAGGGCTGGAGGGACGGCCCAATATGAGC +CTGGCCGATGCCggctggagaggagagaggcGTTCCCAGGTCAGAGGCAGGCTCGTCTCGTCTTCTGCAGGGCCAGTTCC +AGGCTCGTGTGGGCCAGTGGGGCTCCCTGCTCGTTTAGCTGTCTGTGGTTAGGAACGCATAGCCCATGAGGACAGGGTGC +CCAGCTTGAACCTGCACCCAGAGTCAGCCAGTCAACATCTGAAGATAACGGGTTCCACAGGGTGCCCACAAGCTCTCCAA +AGTCCCAACGCCAGTCTGCCTTCACGGCTGTCCGAAGTGCTTTATCACTGGTGAGCTCAGTGCTGTCAAGAGCTCTGTGG +CCCCTGCTCCGCCTGAGTAAGGGAGGAGCGGCAGCCAGGGAGCAGCCCCGCCCTGGAGACACCGAAAGCCCTTGGGTCTG +CGCCTGCCCCCACCACGAGGGAGCCGTCCCCTGAAGCCATGAGCTTGATGCTGGGCTGTCCCTGGAGTAAAGCTTCCGGG +AACTCCGTGCCTGCAGAGTAAGGGAAGAGGGGACGTTCTAGAACTTtctggaagggaggaaggagggcagTCTGACTGGC +TCCCTGCTCATTTAGCTGTCTGTGGTTAGGAAAGCTGCCTGTGGGCTGATGAGGAACCCTGTCGTGGCCCTGCCGTGGGG +CACCACTGCCCATCTCCTCCCGCCTCCAGAGATGAGCGCCCTCTGAAGAGCTTGGAGGGCTGACGCCAGCCCGCTGTGTC +TGCCTCCTCTCTACAGGCGTAATTTCCTTTTCCCCTAAGTTAGCTTGAGTGACTTCTGGTTCTTGACACTGAGGTGTGTT +TCTGATGGAAACACCACCGGCTTGTACTCCTTGCCCCCCACGCCAGGCAGGCTTTCCTGGGTGCAGGCCGTCCACCCTGT +ACTGTGGCATCTTctggccctgcctcctcctcctcagcagcCACCCACACTGGCTTTCCCGAGAAGCCAGAAGCCGCCCG +GCAGGATTCCTTAGGCCTGAATGGTCCTGGGAGTGGGCCGGGAGCCACAGCACAGCAGCCACTTTCCACCCACACCGCGT +CTTTCTCAGCCGTCCACACTGGCCCCTACAACTAAGTTTGTATCTCAGCCTCtctcagagacagaaaaaaagaggaggat +gCTAAAAGCACACAAAAAGTGGCAAAATggccatgcttttaaaaaataattttaacagcttcattgaaatgtaattcaca 
+taccacaaaACCCACCCATTTAAAGGGCATAATTCAACAGTTTTGAGTATATTTACAGGGTTGTGAAACCTTCTTAcgat +ctaattttagaacatttttgtccCCCCACAAAGAAACCCCAGACCTCCTAAATCTCTCCGCACTCTCCCTCGTCCCTGCA +GCCCCCAGCAACAGAAGTCAACTTTCTGTCTCCGTGGCCTTGCCTGTTCCAACTCTCCATGGACTTACCACTCAGGGTCT +TCACGCCTGGCCTCTGCCACCCAGCGTTGTTCCGGAGGCCCGCCTGCTTCGTAGCGTGCGTCAGCGCCTCACTCCTTTGT +GTGGCTGAGGAATATTCTATCATCTGCATTTTGTCACCTACTTATTGGgggtggacatttgggtggtttccaccTTTTGA +GCATTACTGATGCTGCAGTAGACTTGGTGCACTCGCGCTGTCGTTTCGGTATATGCCTAGGGGTGGCGCGGCTGGGCCCT +GTGGAAACTGCTTCACTTCTGAGGACCCACCCGACTTTTTCACAGCACTGCAGCCTTTCACAGCCCACCGGCTGCGTGGG +GGGGGCCACAATCCCCACACGCCTCTCGGACTTACGTTTGTCTCTCTGATTCTAGCCATGCTGGTGGGTTGCAGTGggag +ctcattgtggttttggtttgtgttttcCTGATGACTGGTTGTAATGAGAAAAACCGAGAATCACATTTAAAACCTTACCC +TAAAGTGGGAGTTGCTGAGAGACCAAAGAATGACTGGGCGGAGTCCAGCTTGGTGAGTAGGTGAGTTTATTAGGACTCAC +ATGTGGGGCACTCGTGGGCAGCAGGACAACTCTAGAGATGAGCGTGCTTCTGTCCCTAAGCTGCTTTTGAGCTAATTTTC +TGCCTCTTTGCGCCGTCTGTGTGATGGGCCTGCTCCCTGTGGTGGTTCTCACATCCCCTCCGGGCTGTTTGGGTTCTCAG +GGACACCTGCTCCTCGCTGGGCACCGTGGCCTGGACTCACCGCCTGGCCTTTAGGGTTCAGGCCACAGACATCCGCCCTG +AAGTAACCTGGCGGGAACCGCCACGCCACAGCGGTGGTGCTGGGTGGCTGTGTTTTATTTATCCGCACATGGAGACACCA +GAGCTTCCAGGTGTGGTCACTGGGCTGTGCCCTAGGGATGCTGGGGTGACCAGACATAGCCCTGTTCCCACAGCGCTGCA +CAGGGGCACAGACATGGCAACAAGACTGGTAACCCTAAAACACCGTCCTGGGAGGTCACCCACAAGGTGACGAGCCACAA +CCCAGGTGTGAAAGAGGCCAGGgaaggcccggcgtggtggcccAGGCCTgcggtcccagcactttgggaggccgaggcag +gaggatcacttgtgcccaggaattcgagaccagcctgggcaacatagtaagatcccgtctctacaaaaaataaaaaagat +cagccGGACCTGGTGGCGCgcgcccgtggtcccagctactggccaggccgaggcaggaggatcgcttgggcccgggaagt +cgaggctgcggGGAGCCATAgtcgccactgcaccccagcctgggccacagagcgagccCCCGTCTCTGAGAAACGCAGCA +GGGGTGACCGAGGCCCATGCACTGCCCGTGCGGAGCGTTTTGGGGGTGTAGGTGGCCGGTGGCGCCCACGGGCCCTCTTG +AGTAAGGCGGCTCCGCCCAGGCCGTCCCGGGGCCTCGGCTCGGCTCGGCTCGGGAAGCCGCAGAGCCTGGGGGCGCGGAC +CAGTCCTCCGAGGCGGCCGCTCGGTGACATTGCGTCCCTGCAGGTGCAGCGCCCGCCTCTCCGCTCCGGCCCCGCCTCCG +CCCTGGAACGCAGCGCGCTCCGCCCGAGGCCTCCCGGCGGCCCATACGGGAATCGCGGAGCTTAGCTGTCGCCACCTCGC 
+GCCGGGTCCGCGCGGCCCACGGGACCCCCCACTGACGCCCCCGGCCAGCGGTCCACATGGACGTGCGGGGCCCTGAAGCC +CCCGGCGGGCGCGCGCTGCGGGACGCGGTGAGCCCCTCCCCGACTCCTGCTTCTCTCTGGATGGGGGCGCCCCTGCGGTT +CTGGGGGGTTTTGAGGTCCTGGGGGGGGAGCCTGCGGTGCTGGGGGTGCCCTGCGGTCTTCGTGGGGCCCTGAGGTCCTG +GGGGGCCTGCGATCCTGAGGACTCCTGTGGTCCTGAGGAGTCCTGAGTGCCTGGGAGGCCTGTGGTCCTGGGGGGGCCCC +TGAAGACCTGGGGGACCCTGCGGTCCTGGGGGGGCCTGAGGTGCAGGGGGGAGCCCTGCTGTCCTGGGGGACCCTGCGAT +CCTCGGGGGGCCTGAGGTCCTGGGGGGAGCCCTGCAGTACTGGGGGGCATTGAGGTCGGGGTCAGTGGGGAAACAACTGG +GCCAAAGGGTGCCCGAAAGGGGACCCACGAGTGTCGCAGCGCCCAGGGCTCCGGCCTCTTCCGGGAAATCCTCCCGCTCC +CCAGGCCTTCCTACTTGCCAAAGAGTTCCAGGCCCACAAGAGGACTGGCTATGAGGAAGAGACCTGGAATCTGAAGGAAT +GTGTTGGGCGTTGTGCAAACCCTAACGTAAATTTCCTGACAAAGGTAGAAAGCCCTGGCATGGTTCAGAGGTGGGGCCTC +CTCCTATGTCGACGGGATTCTAGATTCACACCATGGTACGTGGGGCTCCCTGGGATCTTGAACCCCCAGCTGGGAGAATT +TTGTGCTTCTCAGCCTCACTGTTCTCATCTCACCCTGGACACGGTCGCAGGGTGAGGAGTAGATTACACCGTGAATGGTG +TGGGACGTCAGTCTGTAAACTACAAAGCAGTATATAAACAGGAAACGCTTTCAGTAGCAATGGTGGTATTTCTGTGTGAC +CTGGTAAGTAATTTCATAAGAATTCTTGAGGAGCGCAGGGTCACGAGATTTTCTCCTGCATCCTCCTCTGCATGCTTTAT +GGCCTTGGCTCTGGGCCAAGGTGTGTGATCTCCTGCATTCATGGTGTGTGGTGTGAGGAGGGGGATGGGTCCCTGCCCCG +TCTGCACGTGGCCCTCATGATTCCCGCGCtgcttgttgaaaagactttccttTCCCTCTGAATCCAAAGGCCTTAGCACC +TTTGTCAGAAGTCAACAGATTTATGGATGGGTTTATCccatgatatttattttttttatttttattttttgagacggagt +gtcactctgtcgcccaggctggagtgcagtggcgcgatctcggctcactgcaagctccgcctcccgggttcacgccattc +tcctgcctcagcctcccgagtagctgggactacaggcgcccgccactgcacccagctaattttttgtatttttagtagag +atggggtttcacctagtttgccaggatggtctcgatctcctgacctcctgatccacccgcctcggcctcccaaagtgctg +ggattacaggcgtgagccaccgcgcccggccttatcccatgatatttaaaatgtcatacgtgggccaggtgtggtggctc +atgcctgtgatcccagcactttgggaggtcaaggtgggcagatcacctgaggccatgagttcaagaccagcctggccaac +atggtgaaactccgtctctactgaaagtataaacattagctgggcctggtggcacatgcttgtaatcccagctacttgga +ggctgaggtgggagaatcacttgaatccaggaggcggaggttgtagtgagctgagatcgcgccattgcactccagcctgg +gcaacaaaacaagacttcatctcagaaaaaaaaaaaaactcatatgccaacagtaaatatttattcaattgtCTTACGGT 
+TTATTTTTTCAGGCAGAAAATCTATTTCAGGAACTTCAGGAACATTTTCAAGCTCTGACGGCAACATTAAACCTCAGAAA +TATCCTTTTCTACCTTTAACAAATGCTGTGATTCTTTCGGACTGGTAGATTATCATGGAGTATCTTTTTGTTGTCTGGTA +GTAGTAGGTAATAGTTTACTTAGGATTTCCCAGTATTTACTTCTGTGCTTTTATGTGGCTTCCTGATGTGTTAATTACCC +CTCACCTATAGCAAAAGCTGTACctccggccgggtgcagtggctcacgcctgtaatcccagcactttgggaggccgaggt +gggcagatcacgaggtcaggagatggagaccatcctggctaacacagtgaaaccccgtctctactaaaaatacaaaaaat +tagcccggcatggtggcgggcgcctgtagtcccaactactcaggaggctgaggcaggagaatggtgtgaacccgggaggt +ggagcttgcagtgagccgagatcgcgccactgcactccagcctgggtgacagagcgagactccgtctcaaaaaaaaaaaa +aaaaaagctctaccTCCTGCTTGACTGGCAACACAGGATGTCTTGTGCAGCAAGTATAAAGgacaaaaacacaaacatat +aaCACTTCCCAGTGGGGCTTCCATCAGTCCCCAACCGGACAGTGCCCCCTAGACCATAGACCCCACAGAGCATAACACTG +AAGCACAGGATCCCAGATACGGCTCTCccaaactttgttttgtttagtttaacATCGCCAACTCACTGTTTTGTCTTAGA +AACATGAAATCAAACAGAatgagaatataattttttttggagttggagcAAATCTAGATAAGGGTCACAATTGCCCCAAG +ACTTGCCCAATTTGAGCTCATGCTACACAATTAAATATATTCCTTAAATATTGTGTGCTTGCAACTTTTGATTTTGTAAA +TGGGTTCCCACAGGACAAAAATGAgtcttaaacataaaattaaaccaTATGAATGTTAGTTTTTTAGTTAATTTCAGCTG +GGTGGtagtctatagtcccagctactcgagaggctgaggcagaaggatcgcttgagcccaggagttcagggctgcagtga +gctgtgatcacaccactgcactccagcctgggggaagaGCAAGACCctacaaacaaaaaagagttaattccattatattt +atttttacatacccAGAGTTTGACTAAAATATACCAGACAATCTCCTGTCCCCAAATCCATGTCCAAGCAAGGGGAGCCA +CGTTTTCTAAGCTCACAGTTTAAAGGTTAAAGAGACACACTGAGGAAAACTCAGGGAAAGAAGCTGATTTGCATGGACAC +TAGGCCTGTCGTTGATTCCCTCATTTCAAAAGTTGCATGTGCCATGGGAGGCGGAGTCTCTGAGGACATCCTGGCTGTGC +CTCGGCGGCTCTGGACTCCAGCTCTACGCAGAGCGCCTTAACACTGTACTGGAAGAAATGGGAAATCGCATTGAGGACTT +ACAGAAGAATGTCAAGGACTTAATGGTGCAAGCTGGCATTGAAAATTCTATTAAAGAACAAATGGTAAGGTTATTAGCAA +ACTATGTCAACCGTTTTCGTATGTTTTGACTGCTTTCATCTTGAAACATACAATCCCATATTTGTTCACTATGGGAATTG +CTGCCTCTGTGCTGCATGAAAACTGCTCAGGAATAATAAATTTCCAAatgctctcttctttttttgaggtggagtcttgc +tctgtcgcccaggctggagtgcagtggcatgatcttggctcactgcaaactccacctcccgggtttaagccattctcctg +cctcagcctcccaggtagctggaactacagacatgcaccaccacacccagctaatttttgtattttagtacagacggggt 
+ttcactatgttggccaggctggtctccaactcacaacctcaggtgatccacccaccttggcctcccagtgctgcaattac +aggcgtgagccaccacgcccggccacacccagctagtttttgtatttttagtagagacggagtttcaccttattggccag +gctggtctcgaactcctgacctcaggtgatccacccgcctcggcctcccaaagtgcctccTCTGCAGGGCATCTGAGGCC +ACCCAGTGCAACCCGAACCTCATCAGGGCACTGTAGGCATTCTCACAGCTTCTGATGCTGAATGTAAACATGCCACAAAA +GAATGCTAGGATCAGAGGCATGTTTAAAGCCATtagtaaaaataacagaaaaaatgattcagaagccaggcgcggtggct +cagcctgtaatcccagcactttgggaggccgaggtgggcggatcacgaggtcaggagatcgagaccatcctgtgaatggt +gaaagcccgtctctactaaaaatacaaaaaaaaaaaaaaaatagccgggtgtggtggcgggcgcctgtggtcccagctac +ttgggaggctgaggcgggagaatgccgtgaacccgggaggcagagcttgcagtgagccgagactgtgccacggcactcca +gcctgggcgacagagcgagacaacatgaaagaaaagaaaggaaagaaagaaagaaagaaagaaagaggagagagaggaga +gagaggagaggagagagagagagggagggagggagggagggagggagggagggagggagagagagagagaaaggaaagaa +agaaagaaagaaaaagaaagaaagaaagaaagaaagaaagaaagaaagaaagaaagaaagaaagaaagaaagaaaaaaaa +acgatGCAGAATAGCGCCCACGTTGAGCAGGTCCTCCCACCTGTGCCCACGCGGATGCTTCCCCAGGGATCCTCTGAACC +TCCAGGTCAGAAAGCTGGTGAGCCGTGAATTCACcctgcattttaatttttgggTGTGGTCTGTTCAGGTAGGCAGAGAT +GGCAGTGCTCTGATTCTGCTGGTGGACTGAAGTCCTCATCCCTTCTCTGCGTGCGCTCAGAGCAGAGGCAGTCTTCCCCC +AGACACCTAACCAAGTTTCCTGCAGACACACTGGCATCATAACCTTCAGACGTTTATTCTGAAAGTCTTAAAGCAGTATG +TCAGTGGCCCCATACTCTGTAGCCTTCTGCCCCTTGCTTTTTTGTTCAGTACTGAGATTCATGTCGACACGTGTCGCTCT +AGGTCAGTCGCCAACTGCTGCCTGATAGTCCTCTGATGATTTACTTTATATTTGACTAATATTGTTTCCAAACAATGCTG +CACTATTCTTGTCTACTTCTTTGTGTACACGTTGCCAGTTTCTCTAGGGGACTTGGTTATTCTGTACCTAGGGGCTTGTC +ACCTGCATTAGCTGCCACAAGTAATTTCTGGACTTTGCCTAGAAGCCATCCTGGGGAGATGGGAAATAGAATTCACTAGG +TACCCAGAGAGCCACTAGTATATCCAACTTGATTAAAGCACCACAGTGCCACAGACGAGGCAAGGCTTGGCAGCCATGCA +GGCCTAGGTTTCAAATTCTTATCCTCTGTTGCTGGCTGTAACCTGGGGAAATCTAagctttagttttcttctctGCCAAA +TATCTACCTGGTAAGTGGTTGTAAGGATTTAAGAAAATTGCTTGCAAAGCACTTAAGTCAGGGCAAGGTAGACAGCAGGT +ACATAATGAGATAAACTTGAAATACAGAGTCCAAATCATTATGATTTCAACTGTACTGCAGCCACAAAGCACCAGTGAGA +ACGTTATTAAAATCAGGGGTGAACATACTTAATACACCTGATAGGAGTTACGGCCTGAACGTTTATGTCTCCCTAAATAG 
+CTGAAGACCTAACTGCAGCAtgtctgtatttggagatggggcctctacAGAAGTCATTAAGGTTACATCGGtcctgaggg +tggggccctcatccAACAGGATTCGTCTTTCTGAGAAGAGACGCAAGGGGCTCGCCTGCTCTCCCCTCCGTGCCTGCACC +AGAGAAGGAGGCCATCGGCAAGCCAAGGAGAGCCCTCCCCAGAAATCACACGGACCagcaccttgatctgggacttccag +aaaacagaactgTGAAAAGacaagtttctgttgtttaaaccacccaatctgaagtattttgttatggcagtcctgggcag +actaatacaatatgcaACACATTtacaataaatatacaataaacttACAAATGtggaatgtaattattttatctttttct +ggtAACGCCAGTAAAGCTGTTATCCTAAGAGACATAATTCACACATCATCCACCAATTATGCAATTGACAGAATTGTACC +ATTATCACAATCTTAGGGCACTTTTTCTGTTTATGCCTGCTCGGCTTATGTAAGCACATTTTCATtacatcaaaaataaa +actgcCCATTAGCTGTCACCTCctatttccccctccctcccagtCTTAAGCAACCACTAATCCGTCTCTATGGATGTGCC +TATTCTAGAAATtccgtataaatggaatcacacaatatgtagtAACTGGCTTTTCACTTACAGTACTTCATTCtgtttat +gactgaataatattccgttctATGGATGGACATTTTGTTTAaccgttcatctgttgatggacacgtgggtttccacattt +tggcttcTGTGTACGAGTTTCTATGTGGACATATGCGTTCAATTCTCCTGGGTATgaaactgctgggtcaaatggtaaca +ccatgtttatctttttttatttttttgagacagggtcttgctcagttgcccaggctggagtgcagtggcacaaccatggc +ttactgcagccttgacctctggggcttaagtgatcctcctgcctcagcccgccgagtagctgggcccacaggcacgcacc +accatgcccagctaattgttctgtattttttgtagagacagggtctcaccaggttgcccaggctggtcgtgaactcctgg +gctcaagcaatctgcctgtctcagccttccaaagtgctgggattacaggcttgagacacctgcacccggcccatgtttaa +ctttttgacgACCTGCCTGCCTGTGTTACCACTTtgccaccagcagtgtgtgagggctccaatttctccacatccttgca +aaCGCTTGTTGCGATCTTTTTGTATCTAGCCGAACTTGTAGGTGTGAAGTGgcgtctcactgtggttttgatttgcacct +TCCTTAACAAGCAGTGATGCTGAGTGTCTCTTCGTGTGCTTCCTGGCCGTCTGTTTCACCTGCTCTGGAGAAATAGCAGG +AGTGTAATACTGAGTGTCTGTTACATGCCAGGTACTCGAGGTCACATACACGTTCTCCCTAGCCCCTCATTCCGATACGC +TGTTGCACTCTCCTACTTTATACATAGAAAAATCAGCACTTGGGAGATTAAACTGCAGAtttgcatttgaattttaaatc +catttaaaaaGCACACACTCATTTGTACTGTATGTAAAAAATGTGCTTCATTCTGTAAGTACCCTCCCGTTTCCACAACA +GCAATCAATCTCCCGACAGCAGCGAGCCCTGTGCACCACTCCTGTGCCCGGCAGGCACCACGTGCTTCTGCGTCCACATC +TGTGTGCAGGGCAGACACTATTATCCGCCTTTACCAAGGAGGAAACAAAGCTCAGAGAAGGTACCCTGCCCAAGGTCACT +TGGTGGGGCAGAGCCTGGGTCTGAGGTCAGAGGGCCCACGGTGTGCTCAAGTGCCCCCATCCCTGCCCTTCCTCACTTTC 
+ACTTCAGAGCTCGAGTGTGCTGAGACGGGACACGAAGATCCCAACCAACCATCCACACTGCTAGGATGACAGAGGGCAAG +AGAAACCTGAGACTGTAATaaggacattttcttttctttttttgagacggagtcttgctctgttgcccaggctggagtgc +agtggtgtgatctcggctcactgaaacctccacctcccgggttcaagcaattctcctgcctcagccttccaagtagctgg +gattacaggcacgcgccaccatgcccagctaatttttgtatttttagtagagacgggatttcaccatgttggccaggctg +gtctcgaacacctgacttcgtgatctgcccacctcggcctcccaaagtgctgggattacaggtgtgagccaccgtgcctg +gccaataaggaAATTTTCTAACTATATGAACTGAAGGGCCATGGCTTCTACAGAAAATATGTTTACATCAAATACAATCT +tgggaagaataaaaaatagctcTCCTATTCCTTACAGGGAAGgctataaacaatattttattgtactgtttTTATAACCA +GAGTAAACCTTTGGATTCTGTCATGGATTGAATTGTGTAACcgcaaaatttatatattgaagtcctaacccccagcaccT +CAGACTGTGACTTTTGTGGACATAAGATCCttgcaaatgtaattagttaagattaGGCCATGCTGGAGCAGGGTGGGCCC +CAATCCAACACAGCTGGTGTCACTACGAAAAGGGGAAATTTTGGACACCAGCACCCACACAGGGAGGACACAGGTGAAGG +CAGAGATGGGGCGAAGTATCCACACACCAAGGAACTGAAAATACCACCAGCAGCCCCCAGAAGCCAGGGACAGGCCTGAG +CAGATTCCCCTCACGGCCTCAGCAGTGACCCCCCACCCGcgacaccctgatctcagaccCCCGCCTGCAAAGCTGTGAGA +GCACACATCTCTGTTAAAGCCCAGCTCGCGGCATATGCTGTCACCGCAGCCCCAGCAAACCAACCCAGTATGCAAAACGC +ACAGGCTCACAGGATAAACACCTTCGTTTTACTCCAAGGGTAAGAATTAACTTTGACTAGGAAAATCAGTAATCTATTCA +TTAAGTTTCCTGAGAACAAACAAGTAGGCCTGCTCCTCTCACCACGTGCTTGTTTATTTCGGTGAGTTAAGACCATGTTA +TCAATTCCATCTCAGAGGTGCTCCAGCCCTGGAGCTTCCTGTATTTTCCAAAGGCTTTAAATAGCTTAAAACGTTTCCAT +ACAAAAAGGGCTCCACGACATTTATCCGCGCAGACTGAGGGCGCCTCAGTAGCGGTACTTGGTGGAGTAGTCCTTGGGGG +ACACCACCAGGCCGCGGCCTTTGCGGGCCCCGCGGTACACCGTCTCGATGATGTCCACCATCTCCTGCTTGTCCTCCATG +GCCCAGTTAATCTTGTTGTTGTTGCCAGTCCCCAAGTCAATCATGATGTGCTTGTTCCTGCAACGAGAAACAAGGGCATC +CATTCTCATAgaagtctctttaaaaaagaattccttGAAAACAATGCCGTATTTTTCCCACTGTGACAAGCTCTTGTTAA +CATCACTGAAGAACGCATAAAATCGAGAGTGTATGCCATGATGCAGGAGAATACAGGAAGCAgctaaatgtgtttttaaa +gatGCAATGCATTGTCCACCATGCTCTTTCctgcatttttctgtatttatttaggaCAGTTATTCTAAGGGGAAATTAAA +AGGTCAAAGgagggcaggcgcagtggctcatgcctgtaatcccaactctatgggaggctgaggcaggaggatcacttgag +gcccggagtCCGAGATCAACATGGATAACAAAATGAGACCCCCATCTAATTTGTACTAATTTACCTGATAAAAGCAATGT 
+GTAAGTTTTTCACAGTGCTCTTATGAGCAATGACATAAATTGTATTAAAAGCCCTTCAACTATGAAATGAAGATGACATT +TTGCATGTTTATGGTTAGGGAAGTAGATAATTCTTTACTTATCTTTCAAGAGACTAGTTCTTTTCCTCATGAATACTCAT +GTGCACTTTACAAGATaaggatattaaccctttgttgTACTGGCATAAATTTATCCTGGTTTGTTGaagtttctttttga +gatggggtctcactgtcatcTAGGCTAGGGCATAGTGGCAAAATCAACCTCCTggcccaaatgatcctcccacctcagcc +ctctaagtaactgggaccacaggcacgagccaccacacccagataaatttctagttttttgtaaagatggggtgtcccca +tgttggccaggctggtctcaaactcatggcctcaagtgatcctcccacctcggcctcccaaagtgctgggattacaaacg +tgagccactgtgccggcctgattatttttcttagagatgcGAGCTATGTTGCTAAGGCCGGACTCAAACCCCTAGCTCCA +GTCCCACCCCGGCCTGCAGAGTGGCTGGGCTGCAGGCATACCCCACCTAAAGACTGTTTTTTAACGTTCACTTCCACAAG +ATCATGCATTTGGCTTCCACCTACTATTCCGTAGAAAGTATTCCCCGTGAAGTCTCTGGTTCAGGAGAGGCAAATCTGAT +CAGGGTCTGGAGCCCACCTCTGGCCGCCCTCTCATTCTCCATGCTGCAGGCAGGCCCTACATGCAGCATGCACCAAGCTC +CTTCTACAATGACTGGGGGCTGCCCGTAGCACCCTCCCTGCAGCCCTATCTGACCTGGGTGTACTTCTTCGGTATTTATA +TCTCATAAACATCTGCCTCCTTTCACTTCTTCAGAACCCCCTGCTAATTTACTCAGCTGTTTCTATCAGACAACGTCCTA +TGAGAGTGGGAATCCTGTCTTATCTGACATTAGGCTCCACCACTAGACCACAAACCTGGTGGGGGCTCAAGATATATGTT +TGGTGCATAAGTTAACTAAAGTACCGTTTGTCTTAAAATTCTAAATCACAGAAACAGCAGAAGGAAGTTTCATCTCCTTT +TCTGAGTACAGCATCAGTCCTAAAAATATCAAACTACACTAAGTAGGCTGCTCCCGCCACCCAAGGCTGTTGCCTGCAGC +CTCCTTGCCCCTCACAGCATCTCCagccccaggctggcctccacTGCACGCTGTGGGGGGCAGAGACCTAAGACGGCCCT +CCCTGTTCTCACCTGAAGAATTCTGAGGGGCAAAATAGAAAAGGCCTCTATTGCCTCGAACAGGCTGTTAACAGAAATAT +GGACTTCAGGAACACCTGGTGTGGGCTTggaaggaagtgaggagtgtGTTACTGGGAAACGGAGAAAAGGGGACCCCCGT +TACAGAGTGGCCGAGGAACAGGGACACGGGGTCCGCAGTTATAGGAAGTCAGAGCTCAAAAGCAACACACTCGGGATATT +TAGCTGCGGAAACTTCCAAGCAAAGTCTCGGAGGTGCTGCCTTCAAGAGAGGAAAGAGGCAAATTAGTGTAAGAACTGTT +AACAAAAGGAATAAATTCTTTAGCCTCTCCGGATGGCCAAAGGGGCTGAAATTCAGAAATGGCTGCTGAACACACAGTAG +AGAAAAGGATGAGGGTCAGCCACGCAACCTTTGCTGAAACCTAGGAAAGATGAAAAGTTCACAAGAAATGCAGGGTGCAC +CTCACAGATCTTCTCAACCTAACAGGAGGCACCTAGGAAGCCAAGGGCGTGTCCAGGGGAGCGTATCCCGACGCCCAGAA +CCACGGCAGTGCCTCACGCTTTGGcagcctgaggcgggaggatggctcgaagccaggagttcaagcttgctgtgatggtg 
+ccactgcactccaccctgggtgagagtgagactccatctcaataaataaataaaacaaggtcACTATCCTGGATTATTTG +GGCAAGTCTAATCTTATCACAGTCCTCTAAAACCAGAGCATTTTCTCAGGCAGAACTTTTCAGGTCAGAGAACCTGTGTG +AAAAGTTCTCAATGTGCCGGCATGCCAGTGCTGGCTTTGAGGGTAACTGGGTCTCTGGCAGCAGAGTGGCCTCAGGTAAC +AGCAGCAAGGAAGCAGATCTCAGACCCACCACCACGAAGAGCCAGACTCGGCCCACAACCCAAACAAGCCCCTCAAACAC +TCCAGATAAGCACCCAGCCCATGGTTCAGaccttgtgagaccctgagcaggaATCCAGCGGGCCCTGCCTGGATGTCTGA +CTTACAGAACTGTGAGCTCAGAAAAGCGGGGTGATTTAAGCTGCTCAATGCGTGCTAATTTgttacaaagaatacaaaac +taACGCATACCCTATACAACCCACCTGACCTTCCTTACCGAGTAGGAGGCTCCACTTTTGGAGAATGGTCTAATGCTCAT +GTAGCTAATTCACCAACCTCTATGGTAACTAAACAAGTAAAAACAGCATCAGATGGTGATTTCTTTCCCCATTAAATTTC +CATAATGAAATTTTTGAGCTGATAAAATAATCTTGTATTTATAAGCAAAGTAGTTCAGGTGTTCCTCTTGGATATAAAAA +AGGACACCGAAGTTGATgataacatatttaataaattaggtttttgatttttttttttgagacggagttttgctcttgtt +gcctaggctggagtgcaatggcatgatctcggctaactgcaacctccacctcctgggtacaagtgattctcctgcctcag +cctcccagtagctgggattacaggtatgcaccaccacgcccggctaattttgtatttttagtagagactcggtttcacca +tcttagcccagctggtctcgaactcctgaccttgtgatccacccacctcggcctcccaaagtgctgggattacaggcgct +ttgagccactgtgcccggcctaaatttgTGATTTTACAACATTTTTGATGAACAGATTATAGTTCCAGGTATCTAAAATG +TCAGTAACAACATCTTTCATTGAATTACACTTCACCTAGGTTTCCCCGCAAAGTTATTTCAAAAGGGTAATTTGTTACTG +AGTAAATGCTAAAACTAGAATTgttactttgaaaattatttcaaagattcACACTGATTGCCTGGTGATACAAACGATTT +TGGGACATGACTGTACAACCAActtccttcaaaataaaataatctaaagaGATCTTGATTACATTAAGCTTCCAAACTGA +CAATATTCAATTTCAGTAACAACTTTAAGATAACGTACCTGAAGAAAAACATGACAGTACATGGATCGTATAACTCATAC +ATTTTGTTGAAGTCAGGCACTTCTGTAATATCCACAAGATAAATAACTGCAAAATTTTTAACCtaaaaggagattaaaaa +aaaaaactgaaataacagAACACTTTGGCTTTCATTTTATAGAAgcacatttataaaaacaaaaccaagaattTGTGTGG +ATCAgggcaatttttgttttgtttttttgagacagggtctcactctgtcatcctagagtgcaatggtgcaatcagctcac +tgcagcctcgacctcccaggctcaagctatcctcccacctcagcctcccaaagtgctgggattataggagtgagccatca +ctcctggccaACGGGGCAATTTTCAAACTAATTCTTCAACATTTTTTCCCAGTGCAGCAAACACGCGGGAAGATCATGTG +ACACTACTGGGACCTTGGCCACCGGCTCCTTCCGCCTCACGCCACCGTTTCCCAGAATAGTTCAGGAATTGGCCGCTGGC 
+TCCTTCCGCCTCACTTTACCATCTCCCAGAATCGCCCAGGAATTAAATTTCACTCAGGTAACAAGTTATCAAAGTCCTGC +AGAAAACGTCCACAGACATATTATTTATGCTGATCTAGTAATCATCTGTATGAACCTTTACAAATAGTTACATGCATCTC +ACAACTGCTCTAAAATATACCTGTAGAATCCTGCCATCATTTCCCCGCAATTAGGTCTGAACACTTGCAAATATTAATAG +TACCTCCAATAATACtgcatataacacattttctgttattttttctttaattgcaaGTTTATCAGAGATTtagaaaagtc +agaaaaataaaaagaaaataagatcacCTCCAATCCCAACACCTAGGGGTGACTGCTGCTAtcttttttccagagacagg +atctggttctgctgcccaggctggagtgcagtggcaagatcttggctcactgcaacctctgcctcctgggctcaagcgat +cctcccatctccgcctcccaagcagccgggactacaggcacgtaccatcacacctggctaatttttgtattttttgtaga +gacagggttttgccatgttgcccaggctagtcttgaactcctggactcaagtgatacgcctgccttggcctcccaaagtg +ctgcgattacaggcatgagccaccacgcctggcaatatttttactatattctttcccctttcttatagatatatattttt +taatttgttcataTTATTTCTGTTCAGACATAGCAAAATATGGTCATATTGTGATCGTACTACAATATTTTGTATGCTGC +TTTAATTttgagcattcttttttttttcttttttttgagacggagtctcgatctgtcgcctggtctggagtgcagttgca +tgatcttggctcactgcaacctccacctcccaggttcaagctattctccagcctcaaactctcgtgtagctgggattaca +ggtgtgtgccaccacgtctggctaatttttgtatttttagtagagacgaggtttcaccatattggccaggctgatctcga +actcctgacctcaggtgatccacccacctcggcctcccaaagtgctgggattacaggcgtgagccacagcgccggcCAAT +TAAGAGCATTCTTTTACATACATTACCTACTTTCCAAAAATGGCAAACTTCTATGGCTGCGTGATTCATCAAATGGCTAT +GCCATTACTTTAccatttgatatagtttggatatttgtccctgcccaaatctcaggttgaaacGTAACTCCCAGCGTTgc +aggtgaggcctggtgggaggcgactgGATCATGGGATGAATTTCTCATCAATGGTTGAGCgtcatcctcttggtgctgtc +ctcggggtagtgagctgaggttgttCCAAGTGTGTGACACTCCGCCCCAGCTTTCACCGTGCGACGTGCctgttcccctt +tgccttctgccatgattggaaggtcctgaggcctccccagaagcaaatgccagtaccacacttcctgtacagcctgcaga +actgtgagccaattaaacttcttttctcatAAGTTACCCggtctcatgtatttctttatagcaatgcgagaagaGCCTCC +TACACCATTCTTCTGTTACTCTCTGATTGTTCATTAATATTATTCATCTGTGAACATTTTCCCCCTTAAATTTGTATACA +TACTTCATTATTTCCTAAAGACAAATTCCTAAAATTCTGGAACAAAGGATATGACATTTTAAAGGCTCTTTATAAATACT +AACTCAATGCCCTTTAGAAGCTTGTAGCCACTTACATGTGTTATGAGTTAAACTGTGTCCCCAAAACAGTTGAAGTCCTA +AACCCCCAAAGTGACAGCGCAGCTTATTTGGAAACGGTACTGCAGGCGTAATCAGTTAaggtaagatgaggtcatactgg 
+agtaggtgGGCCCCTAATCTggtgtgactggtgtccttagagGAAAATGGCCACATAAAGACACAGGGACAGGAGGCTCT +GGGGGCATCGAGGACTGCAGAGGCGTCTACAAGCCTAGGAGCTCCAAGAGGCTGCTGGCagcaccagaagctgggagaag +cATGAGGACTCTCCTTGGCAGGTTTCAGAGAGAGCCCAGCCTGGAAAACACCTCGATTCTGGACTTCTGGtatccagaat +tgtgagagaagtttctgttgttttaagcccctcAGCTTGCAGCATCCTCTCACAGCAGTCACGGGAAATCAATGGGATAT +GCACaacatttcttcatataaaactaaatCTTTACAACCTGACTGTAACTTACAGGTGAAATAATTATATCTGGGATTTC +CATGAAACACACAGCCTAAAAAAATGAGGGGTTAGGTAACACAAGACCAGCAAAATACCAGCAATTGTTGAAGTGGAACA +ATAGGAACAGACGGGTTCACTATATTCTCTactttgggaaatattttttaaaaattaagatggaaatgtaaaccatggag +cagggtggggagagggggagggggaaagcCCACACTTCACAAAGACTCCACTGCCCTTCAGCTTTGGTGAAAAAAACCGT +ATCAGAACTCCAGCACAGGGCCCACTGCAGGAGCGTGCGCCTCGCCATCCATGTGCTTTCAAAACCCCAAGGTGAGCACA +CGGCCCACTGCAGGAGCATGCAACTCGCCATCCATGTGCTTTCAAAACCCCAAAGTGAGCACACGGCCCACTGCAGGGGC +ATGCGACTCCCCGTCCATGTGCTTTCAAAACCCCAAAGTGATGTTTGCATTAGACATGGGCCCCTCTGAGTCCGGTCACT +CTGACAACGCCCATTTCTGAAACTCTAGTTTCCCGCAGCGTCCACAGTGAGAATGTGTTGGTCAGGTTGCCAGAATCCAG +CTTTATTTTCATGGTAAACTaataaactgtcttttgatttgaCTTAAACACAAAATACTTCCCTACAGAAGCTAAAATAA +ATACACCAACAAAATAAATGGTCACAGGCATTAGGAAagtcaaaataatatttcattattctcCCTTCAATGCCTTGCTC +AGGGACAGATACAATAAATACCAGTAACATCATATTAACTTACGGAATCACCATGCAAATAATAAATTACTTGAAATATC +ATTTGGACAGGGTTGTAAATTGTgtacttaagaaaaaagaaaggtcacAACAAATTTTTACCATCCCTTTTCTTGCAcgt +aaaaaatagtaaatattataaGCATCTGTGGTTTTTAATAAtttgtaggcatttaataaacagAGTgcagggccaggtga +ggtggctcacgcctgtaatcccagcactttgggaggctgaggcgggcggatcacttgaggtcaggagttcgagaccaacc +tggccaacgtggtgcaaccctgtctctactaaaaacacaaaaattagctgggtgtggcggtgcacgcctgtaatcccagc +tacttgggaggctgaggcaggagaatcacctgaacccaggatgtggagattgcagtgagctgagaccgtgccactgcact +ccagcctgaaggacaaagcgagactccttctcaaactaacaacaacaacaataatcaGAGGGCAACTGAGTAACTGTAAC +TTTAAAGATACAAATGTGACTCTCCTTTTGCCTTCAAGCTCCGGCCCACTGCATCTAACCCGGAGAATTCTAACAGAGGC +AAACTACCACATAAAGCTCACGTGCCACGAGAACGTGAACATCCTCTGATTTCACCACAGACCAGTTTCATGAACACTTG +AGAATCCACCATGTATGAGGCACTTTTCACTCATGGATTCTAATTCTTTCAACACCTCTAAATAGATGTCGCCGATGTTT 
+TAGGTGATGGAATTCAGGCTCAGAAAGGTCACGTGAGTGATGCCCAGCCGCACGGGGAAGGGGCAGAGCTGAGATGCAct +ctgtctggctccaaagcctgaACTCTGGATGACAGAATCAGCCTAACCCACAGGAGACGGGGCGGCAGTGTGGTCACGGG +GTGGCAGGGAAATATCTGTCCTTAGAGAATTCCATGTGTAAAGAGAAAGACACAAGACCAGCAGCCAGGTCTCAGATCCA +GCACCAGGGCAAGGAAGCAGAAGTGCAGGACAGTGGGGACCGATGCGCAGGTGGACTGGCCCACAGAGGACCTTTTCCAC +TTGGGATTCTGGGCCATGTATTTATTTCTAAGAAAGCAGGAATTGAAGAGTTCAGTTTTGTATATACTGAGTTTGAGAAA +ACTAAGGGACATCCAGAGAAAAAGGCCCTCCAGGTAGCGACTGGCCTGGAGCTGTACAATGGAGTCTGGGCTGGAGAGAG +CTACCTTACTGCCGCCACCTACATCCAGCAGTGATCCCAGCCAGGGTGGCAGGTGCCAGGCTTGGGAGTGAGTGCAGAAG +CCCACTGTAGAGTTGGGGGCAGGAAAGGGATGCCAAGGAGGCAGTCTCAGAAACCAAGGCAGGAGACTCCGGGGAGCTGT +GCCGAGTGCCCCAGAGAGGTTACCAGAAAGGACGATATGGGAACGGGGACACTGCAGGGGCTGAAGGACTGAGGAACAGA +GGACTTCAGTGTGGACAACTTTCAAAAGCCGTGGCtatggagagagagggaagttACTGTGGGACTTGGGAGTTCAGATA +GGAGAGGCTgaaagaggcaggcagaggggagctccccagcctcccaccacCCCATTTTCTTGGTGAAATGGCATCTGTGG +GGTGACTGGAGGAGCATGAGCTGAGGGCAAACATCAGCAAACATCCAGCAGCATCTGCTGGTGGCGACATTTCCACGGTA +ACCAAGCGCTTGGAGGAGGAGCTGGGTGAGTGCAGCCACACTGCTCTGCCAAGTCATCCTGTGACTTCCTCAACAAaggc +agctttttcttttcttttttagatgggatctcactttgtcacccaggctggagcagagtggcgtgatctcggctcactgc +aacctctgtttcccagcctcaagagatcctcccacctcagcctcccgagtagctgggattacaagcgcccaccaccacat +ctggctaagttttgtatttttagtagagatggggtttcaacatgttggccaagctggtctcgaactcttgacctcagatg +attcgcctgcctcggcttcccaaactgctggaattacaggtttaAGCCACGGCGCTGGcctcaaattgtttttaaattct +ttgattgCCTTGAATTGTAAGTATCTTTTCCAACTCTGTTGCCTATACACCTCATAAGCATGCTTTTTCTTTAGGCAAGT +CAGCAATTAAAGTGCTCAGCAGTTCTTCCGCATGCCTCTAAGTCCACCTGCCAAGTTACAATGACATAACTCTGCAAAAC +ACATGACCAAACTGGCACATGATACCAACCACTGTTCTACACTTTTGTATGGAGTCCCTCAACCAGCTAAGAAAACACCT +AAAATTCCGGAAAAAGTGTTCTGGGCAGTTGATTAGCACAACAAATTtttgaagtgtttatttttacatttttaggtaAT +TCATGTCAAAGCcataaaaaaaactgaaaaacttgaCTTTCTAAAATTAATGTTTGCCTTAATCAGCTCTCACAATAACT +TGCAAACTTAGTAATGTGAGAGAAGTAATCACAGTAATAACAGTGGCATTACTGCCCATGTGACAGGAGTGCTCTAGAAA +CCTTTATTAATCCCACCACAACCCTATGAGTGAGGCACAAGTATGCCCTTTTTACAGGTAAGATGACCCACGGGGCCAAG 
+TTCGTAACGCTCATGCAGTTATGAAGTGCCAAGAGCCTAGCTAGAGTCTGCACCCTCAACCGCCGCTCAACACCACTCCT +CATACCTTGTGGCTAGAGTCTGCACCCTCAACCGCCACTCAACACCACCCCTCATACCTTGTGGCTAGAGTCTGCACCCT +CAACCACGGCTCAACATCACCCCTCATACCTTGTGGCTAGCTACAAGGctgttttttttccagagacaaaaATCCAGAGG +AAACACCAAAAATATCCATTCTGTTTGTTTCTACAAGATTCCTCATCTAGCTGAGTGTTGACTCTCACTGGTTCAGTAGG +CGTTCTGTGATCCTTAATGTGATACTTAAGATACACCCCTTAATATGGCTCCTATGAAgttccataaatatatatctgCC +TTACACACAATTTATTAGAATGTACTatcatgggccaggcatggtggctcacacttgtaatcccagaattttgggaggct +gaggtgactggctcacttgaggacaggagtttgagaccagccggccaatatggcaaaaccacatctctaccaaaaataca +aaaattagcagggtgtggtggtggtgcaggcctgtagtcccagctactcaggaggctgaggttgcagtgagccaagatcg +tgccactgcactctagcctggacaacacaagaagactcttgtctcaaacaaacaaaaaaaatgtactATCATGAGTCAAT +AATATTACtgaggaatttttttgttttgtttttagagatggggtctcgctctatcacccagggtggagtgcaacggcacg +atgatagcttactgcagccttgacctcctgagctcatactctgtcttagcctcctgagtagtggggttacaggcacatgc +tacaaTGCCCAGCTCACTGAGACTATTTTAAACTCTATTTGCGTGGGCtttttaacacaaaaaataaacgTCCCATTCAG +TCTTCTCACCCATCATGGGATTTAGGTAACAGTGAACCCCAAGCACTGCTGTGAGTGTTCCTACAAAATGGAAAGACAGC +TCAAAGGCACATGAAATTGTCCCCTCTGAGAGCGTTCCTACAAAATAGAAAGACAGCTCAAAGGCACATGAAATTGTCTC +CTGTGACGGCGTTCCCACAAAATGGAAAGACAGCTCAAAGGTCCATGAAATTGTCTCCTGTGACGGTGTTCCTACAAAAT +GGAAAGATAGCTCAAAGGTACATGAAATTGTCTCCTGTGATGGTGTTCCTACAAAATGGAAAGACAGCTCAAAGGTCCAT +GAAATTGTCTCCTGTGACAGTGTTCCTACAAAATGGAAAGACAGCTCGTACATGAAATTGTCTCGCTTCAAAACTGCATA +AAATCAGCTGCtaaatagaaaactttttttttttgagacagggtctcgctctgtcacccaggctggcatgcagtggcgcg +atcacggcttgtcacagccttgaactcctggactcatatcctcccacctcggcctaccaagtagttgggaccacaggcac +atgccacaacgcctagctaatgtttgcattttttgtacagacaaggtttcaccatgttgcccgggctgttctcgaactcc +tgggctcaagggaaccacccacctcggcctcccaaaatgctgggaccacaggcgtgagacACAGTGCCCAGatgaaaact +ttatttattattatcactTTATCAAATGAATTGCCCCACATTTATTAGCAAACTCTAGGCATCTGTGAAATCTTTTAAGC +ATAGCTCTGTTTCCAAACTGACTTTATTCCTGAAGTTTACAGCTCAACTCTAATCCAGCATGAAGTCAAGTGGGCCCTGT +GGAAGCAGCAAATCAATACAACACATCTAGGAAAAATCATATCAAGTCCAGCCTGAACATTTGCAACTGTTACTCACTTT 
+AAAACAGCTGGAATATCCAGACAGAACTCGGAACAAGGTAAGGAAGAGTCCCCATTATCTGCCATTTCTGCacagggaaa +tttttaaatagttatattGTTCTTTATACTAAAAccttatatttcaaaaatttctgtTGAGCTTCTCCAAAAAAGGGCTT +CCAGAATTAGGAAGTCATATTTGCCCATagactaagaagaaaataaaacatgaatattCTTTTAATGCATTTCAggtgtg +cactttttttttttttggagacagagtctcactcctgttgcccaggctggagttcagtggtgtaatcatggctcactgca +gtcttgacttcccgggctcaggtgattctcccacctcagcctcccaagtagctgggactacaggcacgtgccaccacgcc +cggcatttgtacttttagtagagaaggggtttcaccatattgtccaggctggtctcgaatccctgggctcaagtgatcgc +ctacgctggcctcccaaagtgctgggattacagacatgaactaccgcgcctggcctaggttTGCCCTTTTAATAACTATA +TTATAATATTCGCAttaaaagtttattaataaaGCTATCCAAATTTttggacaaaaacaaaaatcagactattttaaaag +accaaaaaaaatacCCCAAAGTGAGCCCATTTCAACTGTCCTTAAGTGTGTGTCCACTATGTGTGTCCACTAGTGATACA +GTGATTCATTCTGTAATAACAAAGAAATTGGACATAAAGGAGTTGGAAAAGAGCAAATCTGGCAAAGACCCTAAACACTT +ACATTAACACATAAAATCTGTTTTCTAGCTAAGAGCATCTTGCTGAGCTGTGCTCGGATGTGATGAGACAAAGGCCCTCA +CATCTCCAGCCATCCTAGCCTCGAGCTGCCGGCAGCAGGATCTGAACACCACCCGATGTTTATGCCACCACCTGCAATGT +AGAACAGTGAAAGATTCTAGAATTCATAACTCAAAGTCAATACAGCAGTGATTTACTTATTGGGAAACTGCTCTTAGTCA +ATTCTTCGCTGGATTTCCAGTATCTGCCCTTGTCCACTACAGCTCAAGCAATGACAAACTGATGTGTGGCACTGCCTCTC +TCATCATCTCTAATCCCCGGCTGGCAGAACCCCCAGCGTGTGAGAGAACTGCTGCTCTCCCGAATTCCCTAGGTCCCTCC +TCAAGTCctcagtgtccaggaggaagaggctgcgAGATGGCTGGTACTAAAGCTACTCCACCTCCATCCCTGACCCCCAT +GGAAGGTGGTTCAAGGTCAAGTCGCATCAGAATTCCCTGAGGGGCTTCTTTAAAAAGCCATTGCTGGACTTCACCCCATA +CCCATAGAATCAGAATCACAGATGCATTCTAGTAAGCACTCCAAGGGAGTCTTGTAATAACTAAAATATAACAACAGTTC +CTAACTTCATGCCTCTCCTACTAAATaagaaatagcaaaaattaataaatggcaGGCCAGATAGTCTTTCCTTTCCTGAA +TATAAAATATGTCCTTACATCTGTTACTCTTTACTCAAAGGAGTTATTTTAACCCAGGCCTACTTTTTAGCATCTGTGAA +AGAAAAGATAGCTTCAATGTTACTTTTACTCTTCATTAAAAGGAATAACCTCTCAAATTACTTATGGAAAATTAATatgg +caaatattttattaacaccCTAGTATTTGTGCCCTTTGCCCTATTGCTACAGCTGCAAAACCTCAATAAAAGTCATCCTA +AAATTGCTATGAGGAACcacagaaaaagaatgaaatctggTTATAAAATAGTGTCCTACAACTTTTATTGTCAGCAAGTA +AATGCCAAGACCACTCAGTGTACTAATATTAGAAATTCAGAGAACTAATTCTAACAAAgcatatttctattttctcctgA 
+TGTTTACTGCACCTCAATCACTGTAAGCTGTAAACCACATGACTCACAAAAAGCGTGGTGGGATAAAAAAACACACCCAC +AGAAAAGCAGGATTTCTCATGCTATGAACATGAAAGTCCTGCTTTATGCATACAACGAATAACTCAGGTACCAAAAAAAG +ACGTCCTTTTAAATTAAGGATCCGCGCTCTCAATAAAACCTGCAAGTGCATTTTGTTCACTGAACTATAAGTTTGGGAAA +AACGCCTTATCAACAGAAATAATTAATTCATTTGAGAGGAGGGGGATCATAAAccctttaaaatatgaaaaactacAAGA +GCTCCCCGCAGAAAAAGGCGTGAAACGCCGGCACAGTCCCCGAGAGCGCTCGACCGCAGCGAGGGGAGGAATCGCCTGAA +CGACGGAGCCGCGGCCCCTCCTCGGGGAACAGCTCGCGCCCCCAGGCGACgccggcagggcagaggcgcggGGCAGATGC +GGAAGCACAGCCCGCAGAGCGGGAGAGTCCGGCGCGCTACCTTCTCGGCGATGCTGTACAGGACCTCGTCCATCTTCATG +CACGTAGGATCCCAGTCGTGGCCGAAGCGGATGACGACCACGCGGTCCTCCTCCGAGAGGATGGCCTGGTCCACCTGCCA +GCCGTTGTGCAGGTGCGGGAGCATGTACGACATGGCGGCCCGCGCGCTCGCCGCCGCCCAAGGCGGGGCGCCAGGGAGGG +CCCAGCGAGGTGGGCTCAGCCGGCCCCTCACTCCCCGGCCCCCGCCGCCCCCGGGCCCACGGACGAAATCCGGTCCCGCC +CGCACACGCAAACTCCGCTGGGACTGCCACCCGGCAGAACGTCTGGGCGCGCACGCACCGACGCCGTGCGTGCTGACGGC +ATGCGCGCGCGCTAGCGCCGTGCGTGCTGACGGCATGTGCGTATAGGCGCCGCGCGAACGTGTAGTTGGCCGGGTGGAGC +GGCGGCGACCTGGGCACCGTGGAGCGCCGTTGGTTCCGTCATAGCAGCTGTCGGAGTGGGGGTTCCTCCCCAGCGTCCAG +GCGGCCTGGTGGTCCTGAGAAGCCCCGGGCTCGCCGTGCCCTGCCCCCACGCTCCCGCCCCGAGGCCGCCCGCCGCCGCC +CGGGCTGTCCTCCAGCCACGGATGGGGACGTCCAGAAAGGCCCGGAATGCCCGGCACTGCGGCtcgttttcttcctttct +ggtgCTTGTTTCTGTGTGTTACTAAGACAGTTCATGTGTGTCATTTGTGAAACTTGATCATAAAAATGTGTCGTTGTTGC +CATACCCAACCACAGCAGAGTCGAGCAGCGGGGTAGGGAGGAAGCACTGGGCAGGCCGTCCTTGAGGAATGTCACCATCA +GGCCGGCTGCTGAGCTGCCTGTTGTAACCTGAAACCaggttttttaatttgtctttttataaagaaaaaaaatttttttt +tgagacgtgcagtgacgtgatcttggcccactgcagcctccgcctcccggggtcaagcgcttctcctgcctccgcctccc +cataagctgggactacaaggcgccAGCCCGGCTAGTGTTTTTTTGtacttagtagagacggggtttaaccatgttggcca +ggctggtcttgaactcctgacctcaagtgatctgcccgcctcggcctcccaaagcgctgaaaTTACAGGAagaagccacc +gcgcccgacctaaAGCCAGTTTTATTCCATAGCTTCAGCATAACTTCCACCTCCAGGACTGATCTGGCCCCCTGGCTTCG +CTCACCAGTCAGAGCTCCCCAAACCCTTACTAGTGCCAGTGAACTTTCTCAAAGAGAAATAagttaatatttctcttttt +aaaataaaacctctaACCTCTGTTCTTCTGAGAGAGCACTTTGGGTTCATGCTGGAGACTCCCTGGTTTGCAAACTGATA 
+TTGATAGTAAAACTCTTGTCTACTATCTAGCTATCCTGGTGGTCCTTTAGatgacaaatttaaaacaaatcacgcccggg +cgcggtggctcacgcctgtaatcccagcactttgggaggccgaggccggcggatcgcctatggtcgggagttcaagacca +gtctggccaacatggtgaaaccccgtctctactaaaaatacaaaaaattacccgggcatggtggcgcccgcctataatcc +cagctactagggaggctgaggcaggagaatcgcttgaacccagggggtggaggttgcagtgagctgagatcgcaccactt +cactccagcctgggtgacagagtgagactccatctcaaaaacaacaacaaatcactATGCAGATGCTGTGGTGTCTCTCA +GATGGGCTGTGGCATGGTGAGAACTACAGTCTGAAGAGCCTGCTGGCATGGGGATTGAAATGGGTTCCATGTTTGGTTTC +CTCTGGGACACAGCACCATAAGGTGCTTTGATAATATGCACCCACAATTTAAATACTAGAATCACaagtgcagctcactg +ccagaTTAAGCTTTTTACATAATACAGAATATTTCCCAATTATATACGAAATGCTTTATTGTAATGAAACAATTTCAGGT +TGGTAGGATATTAGAGATCATCTAaatctctttcattttataattgagtAAAGCAGAGGCACAGAATATAAGCATAGTGG +TTAACAGTGAGGGTTTTAGTTACATAAAGGCGTGTTCAGGTTCATCTGTGCCTTGCATAAGTTATCTCAacaaacttcag +tttcctcatccatgagATGGGATGATAAAACAAAATCTCCTGTACTTCATTGGCACCAGGTACACACTGGAGAAATCTGA +GCCCTTACTAAGGTCAGGCTATTGCCTTCTAAAAAATTCCTTACCGTCATTTTGGTAGTCACTCATTTCAGATACAAACA +CAGTTGCTTAACACATCACAAGCATTTTATTCACAGGAGAGTTGTTCCAAGCTAAGGCATAAGACTCAAATGTCcccaaa +taatttattgaaactcattttttattgtgttactTATAACAAGATTTATCGCCTTAATTTTTAaggacattaagtacatt +catattatgcaaccatcaccactatccatctccagataaatcttttctttaataaaaacttttggccgggcgcggtggct +cacgcctgtaatcccagcactttgggaggccgaggcgggcggatcacgaggtcaggagatcgagaccatcctggctaaca +cggtgaaaccccgtctctactaaaaatacaaaaaaaaaattagccgggcgcggtggcgggcgcctgtagtcccagctact +cgggaggctgaggcaggagaatggcgtgaacccgggaggcggagcttgcagtgagccgagattgtaccactgcactccag +cctgggcgacagagccagactccgtctcaaaaaaaaaaaaaaaaaaaaacttttgttgaGATCTAATCcatatactataa +aattcaccaaTTGAAGGTGAGATGGTTTTTCAGTGTTGTGGAACCAataccacaatctaattttaggaGAATGTTATCAC +CCCGAAAAGAAAACCTATACCCATTAACAGTCATTTCCCATTTCCCTCCCattccctcagcccctggcaatcactaatct +attttctgcctctatagatttacctattctagacatttcatatagaTGGAAACACAAAATATGGAGTCGTTTGTGACtgc +tttcatttaacataatgttctcaaggtccATATTGTAGCAGACATCAACAGCTAGTTCTTTTTAATTGCTGAtgaatatt +tcattaaataaaacaaaaaattagaaactattccattatatggatatactacattttatccatcagttgattgacatttg 
+ggttgtttgcatttttttactattatgagtaatgctgctatgaacattcatgtacaagcttTTGTGTGGACTTAAGTTTT +CATGTGGTAactgtttaacatttttaagtattgctaaattgttttccaaattggctacaccattttacattctcaccagg +aATGTGTGAGGGTTtcagtttttccacattcttgccCATGCTTGTTATTTATGTACCTCGTGAACcccaaaaatctgaga +caggtctcaattaatttagaaagtttatttttccaaggttgaggatgcatcCATGACActgcctcaggaggtcctgatga +taTGTGCCcgaggtggtcggggcacagcttagttttatacattttagggaggcatgagacatcaatcaatatatgtaaga +agaacattggttcagtctggaaaggcgggacaacttgaagcaaggACAGGAAGACTCAAAGCGggaagggggcttccagg +tcacagatatgTGAGAGATGAACAGTTGCATTCTTGTGAGTTTCTGATcagcctttccaaaggagacaGTCAGCTGTGCA +TCtgtctcagtgagcagagggatgactgactagaatgggaggcaggttgaaGGGACCCAAGATACTTTCCTTTCACAGCC +TTGTATTATGGTCATACTGGTAGGTATAAAATGATAAGgtggttttcatttgccttttccTAATGGTTAGTGACTGTTCC +TCAGTGGCCAGGCCTATACAAATCTGCCCCAAAGTCAGAGGAAGCTGAGACGCCAAAGAAAGGGACTGGCAAATCTAGTT +CTTTAGagagaaacatttaatagggacttatGTACAGAAGCCACATCTGTGTCTCTGGCAGTGGTGAGACAAGGTGGTGG +TTTCCCAACCATCacccccaggcccagggccttcccagacccagggcttacatactgtgaaaggaaaataaatcttgtga +ccccaaactcattaagccaaagggaaaagttaagctgggaactgggtcaggcaaacctgcctccccttttggttcctaaa +taagagGGCTGCAAGATGATAACCTACACGCCTCCCCcatattttgcccacaaggaaattcctggtGAGCTCCAAGATTT +CTGCCCTAAGGTGTTCCTGTTAAAATTTCGCCATGGTAATGTAAATCGATAGCTTGTCTTTACAGGTGCAGTCATCCCCC +TGCCTACCAGatacaaatgcatatctgattgttcccctgcccttttgttatgttttgtctaTGTTACCTTATGTAATAAT +GCAGATTCCTCTGCCCCATTTGTCTGTCatcttatgtaaaaaaaaaaaaaaaaaaaaaaaaatgcagattcactgaacCA +AAGGCATGAATGACTATTTTTCCCTACCCTCCTCTTATGTGAAATTTGTGTACTTCTCAatatcccaccctttccccttt +aaatttggagccctcaaaataatctttggagaaaggcGTAGACCTGTCTCCCAGGCACGAGTCCTTATCTTTGGCAGATA +AGCCTCCTGGAATGAGACTGGTCTCATCATTTTTTTCGACTGACATGTGGTAACCACTAAGGGAACCTGGGCGAAGGTGG +CCCAGCCTGCAGCAGCTCTCCTATTGGTGCTTGGTACTGGTCTGGGCACCTTGTAGCCTAAACTGGTAGGACAATTTGCT +GAAGTCTCGGACCTCTTTTTTCCAGGGATCCCTGATCTTCCAGTGTTTTTCAGTTGGGGGTCAGAGGTTTatttgctgtt +taaaaaaaaaaaaatctccttttttcctggaatttccaCTCACTTCCATCAAGGAAGGCAGCCTGTCTGCTGCTGCATCT +GTGGAGAGCACTTTTCAGCTTGGGCTCCTATCACTAGGTAAGGAATTGGTTTGGGATTTTGTCttgcaaattctttttaa 
+atgactaaagtTAGCATGAACAACCCACTGGTGTTAATTTCTGCTTACACTTGAGTGCTCAGAAGTCATATAATTTGCGT +GATCACTGTTAGTTTTGCTTAGCTGTTTTGTTGTTTCCGTCTTGTTGGGTTGTGTGTgttgggttgtgtgtgtgtttcag +tccTTTCAGCTACCGGATTTGACCAACTCCAAACCCTCTAGCTTATGAGTGTGgaattttctaaagaaataagAGCACTT +TACTCCCCTCAGCCTTTCGGGGCATTCTTAGGCAACTGAGAATCGCGTGAGGGTGTCTGGGAGGAAAGCACCTTAAGACG +TGCAACAGCTCTGAGCTGGTTTTCTCCCCAGAAAAACATGCTTAAGGTCTAATCTCATCTGGTAGGTGCATATAAGGAGC +TGACCCTTCCCACACCTGGAGCCCCTGACACACTTTACCAGGTAGCCGCGACAAGCGTGGACTGAACTGGTTCAAGGAAT +AATGGCCCTGAAGAGCCAGGTCCCCAGGCAGCACACTTTGGGTCTCACACACATCCCAACTTGGTTGAATCCGAAGGGGA +ACTTTAAATTATGGGGAATGAGGCCTCTAAATTAGCTAAAACCCCAGCAGCTGAGGAACATAAAGTTCCACCTTTAGAAA +CTCCGGCCAGGTACATGCAAAATACTTACGGCAAATTGTCATGCAAATATTTAACCAAGTGGACCACTATAACCAAAGTA +GTTCTAAGTTACAATGGCCTAAATGGGGATCTTTTGAGATGCCCAATTTAGTGTACCCGCGAACCAGAATGGAAAACGCG +GGCACAAAAACTAAGAAACCGGAATGGGAGAGCTACTTTCTTGGTACCTGGAGAGTAGCCAGCGGGGGGAAAGATCACCT +CACCTCCCTACAGGAAGCCGACAACTTAGAACTGCCAATCGAGAACTCTCTAATCATTTATCTCTCTTAAAGAAATCCTC +CCAGGGGAAAGATCACCTCATCTCCCTATAGGAGGCCAACAAACAACTCAGAACTGCCAATCGAGAACTCTCTAATCGTT +TTATCTCTcttaaagcaatcctcccagggGAAAGATCACCTCATCTCCCTACAGGAGGCCAACAAACAACTCAGAACTGC +CAATCGAGAACTCTCTAATCGTTTATCTCTCTTAAAGAAATCCTCCCAGGGGAAAGATCACCTCATCTCCCTACAGGAGG +CCAACAAACAACTCAGAACTGCCAATCGAGAACTCTCTAATCGTTTCTCTCTCTTAAAGAAATCCTCCAAATCCCTTACT +TCCCCCTTAGTGCCTCCCCTACCTCCACCTACACCCCCACTCTACCCCTAACCCTCTGGACTTCCTGGACCAGACCTCTC +CCGTCCTTTTCCTTCAGGTTCCAATCTACCCTATTCTCCTCCTTGCCTGCCACTCACTGAACCGAAGGCAGTAGGGAATC +TAACTTCCAAGACCCTACTTCCTGCCGATTCTCTGGTATCGCCAGTGGCAGCCTCTCATCTGGAAGACATGGAAAAGGGG +ACACCATAGGGACTCCTCTCATGATCAACCCGTTTCGGGAACAACTGTTAACAGTTAGGGGAACCCCTGTGATTGTCTGT +CAACCCTGGTTAATGGCTGAGTTGCGAGGCATAAAGAATTTCCTGCCTCCCGTAAAGATCCAATTGGGTTTGCTCAAGAG +TTTGAGCTCATTATCAGAATCTATGACCCGGGTCATTCAGACCTTTATCAGCTGGTCCACATACTGGTCTCAGAAGCTAA +AGCTAAGTGGCTGGAAAAGGCACAGTGGTCAGACATTGTAGCAGACCTGATCTCTAAAGGCCCAGGAAGGCCAAATCAGC +CAGCCCCAAATCCTGAAGACAGACGCAGAGATGCTCGTGAACGATGACTGTTCTGTTAAATAGCATTCCTTCAGTGTCCC 
+GGAGGGTCGTGGATTGGAGGAAAATCCAGCAATGCTGCCAGAGCACAAAGGAATCCGTTTTAGATTATTTCACACGTTTT +GATAAAACTTTTAGACAATATTGCAGGATGTCAGCTGATTgctatgaaaacaataaaaatgctaCAATATTAAATGCAAA +TTTATTAGACTAGGTGATGATTTAGCCACCCTTAGAAAATGCCACATGATAAATTGGGCCACAGCCAGAACTAATGAACT +AGTTAGCTGACCAGTTATCCCGCACtgtgataaaaaggaaaaacagaagattGCCCAAGTTATGCGTTTACAGTTAAAGC +AATTAACTTTTCAAACCTCTCAGCCCCAGAAAGATTTTAAGCTCATAGGTCTGAGGACTCTTCCCTCCCAGTCTGTTACT +AGTGTAAAAGACCAGGacaacggccgggcgcggtggctcacgcctgtaatcccagaactttgggaggctgaggcgggcag +atcatgaggtcaggagatcgagaccatcttggctaacatggtgaaaccccgtctctactaaaaatgcaaaaaaaattagc +cgggcatggtggcgggcgcctgtagtcccagctacttgggaggctgaggcaggagaatggcgtgaacctgggaggcggag +cttgcagtgagcccacatcgcgccactgcactccagcctgggcgacagagcaagactctgactcaaaaaaaaaaaaaaaa +aaaaaaaacggccaggACAGCTTAAGAGGGATTGCAttaggctgaaatgaaagaaaagacaagaaaatgcaGCTCAGGAA +GACTAGGGTGCTCTGAGAAAGTACAGGGTTTCACTGCTCCAAATATTCTACCCTGACAAATTGGGAGAGATTGATATAAT +GATAAACCAGGAGCTTACAACTGCCTGAATTTGACACAGGAGTGGCTATATCTGTTATAAATCCCACCTAATTTAGAAAC +CCCATACGTGAGAGTAATGAAAGAATTAACATGGTGGCTGTGTCTAATAAAACTCTCTCATGTTTTAAGTCTAAACCTAA +ACCTTAACATTTCTTTGGGTTCAGATGCCCCGCCATGGGCTCTAAGTGTGACATGCTCCGTTGGTCCCATGTGTTTCTAA +TGTGCCCTGCAGCCCCGTCAATCTTTTGGGCTGTGATCTCGTCAACATCCATAATGCTCATATCTCTTTTTCATCGAAAG +GTGAGCTTCTTCTAGAATTGAAGCCAGGGACCAACAATATCAACTTAGAAAACATCCTGACAGCAGGAtgcagtcactca +tgcctgtaatcccagcactttgggaggccgaggcaggtggctcacttgaagttaggagttagagatcagccaggccaaca +tggtgaaacctcgtctctagtaaaaatacaaaaattagccaggcgtggtagcgcatgcctgtaatcccagctacttggga +ggctgaggcaggaaaatcacttgaacccaggaggcggaggttgcagtgagccaagactgtgccactgcactccagcctgg +gcgacagagggagactccatctcacaaagaaaaaacaagaccTGACATTGCACCACAATTTAGTACTAGTAATGTTAAGA +CACCATTTTGTGACTGGGGAAGGGAaatgagacagagaaggaaatactagaagagaagagagaacattGGAATAAAGAGC +AGAAAACAGTAAAACTCCTTTTAGCTTCCCCAGTCTTCCTGTTAACTCCAGAAGTGGAGCATTTGCTTAAGGATGTCCCT +TCCCACTTATACTCTCAGTCAAATACAGATATAGGGAAAATATTCTCAGCCACTTCCATAAAGGTAGAGATAAACCTGAA +GAACCCCCTACCCAACCTCAAACAGTATCCTCTCTGACAGGAAGCTACAGATGGAATCGCCCCTGTCATCCAAGATTAAC 
+TGAAAAGGGGGCTCAACTATTCCCTGCACAAGCCCCTGCAACAGCCCCATATTccctgtaaaacaaaaacaaaaacaaaa +aaaaacaagcaggagGGGACGGAGATTTGTACATGACTTGAGGGCAATAAATATGGCAATACCCAGACACCCAATAATCC +CCAACCCACATATCCTTCTATCAACTATACCCAGTACCAGCTAGTATTTCTCAGTTGCAGATCTCTGCAGTGCCTTCTTT +AGTATTCCTGTAGATCCAGACAGCCGGTATTTGTTTACCTTTACTTGGAAAGAAGGGCAATATATGTGGACGGTAATGCC +TCAAGGGTATACACAAAGTCCCACTTACTTTTCccaaatattaaaagctgattTAGAGGATTTAATTTTTCCCCAGGGCT +CAACACTCATTCAGTATGTAGGTGACCTTCTTTGTTCAGACACACTATCTTCCTCCCAGGAAGATGGTCTATATTTACTC +AAACagccaccaaaggacacaaagtgTCCAAAGACAAACTTCAGCGATGCTTGCTGCAAGTTAAGCATTTGGAGGCATAT +TATCTCAGTCAAAGGACTGAGTATTAACCCTGACAGAGTGAGAGGAATTTTAGCTTTCCCAATGCCTGTTACTAATAAAC +AGCTTAGAGGATTTTGGGTCCTGGCTGGCTATTGTAGAAACTGGATACCAGTGGACAGGcccggtggctcaggcctgtaa +tcccagcactttgggaggctgaggcgggtggatcacgaggtcaggagatcgagaccattctggctaacacggtgaaaccc +cctctgtactaaaaatacaaaaaaattagctgggcgtggtggcagcacctgtagtcccagctactctggaggctgaggca +ggagaatggtgtggacctgggaggcagagcttgcagtgagccgagattgcaccactgcactccagactgaacgacagagc +gagactccgtctcaaaaaaaaaaaaaagaaagaaactggataCCAAATTTCTTCCTTATGGCTCAACCTCTCTATGCATA +CCTAAAAAATGAACAACCTGATCCTGCCTTGTGGAGTCCTGAGGGACAATCAGCTGTACAACAAATAAAGGAAATTCTAA +CTAATGCCCCAGCCTTAGGGCACCCAAACTACAAACTGCCTTTCTCCCTTTTCACACACAAAACTGGAGGTACTGCATCC +AGGGTACTGATCCAGAAACATGGTGATCATCAGAGGCCTATAGGCTATTTTAGCCAACACCTGGACCCGGTGGCTTGAGG +GCTGCCTCCTTGTGTGAGAGCAGTAGCAACCATGGCCCTTCTGTACAAGTCTGTTGAAGAAATAAGTATGGGTTCCCCCC +TTACCATTTCTGTGCCACATTCTCCTGAGACCCTTCTAAACTCTCATCATACTCAACGTGTGTCTGTCAACCGGTCAGCC +TCTTATCAAATTTTGCTTGTACCATCTTCCAATATTACTACTTCCAGTATAATAATCTTAATCTGGCCACTCTCTTGTCA +GGCCCTTCTGACAAGACCCCTCATGACTGTGTTCTGATGACTGACTTCTCACCCCAGGACAGACCTACAAGAGATGCCAC +TGGATCATGCTAAAATAGAATGGTATACGGATGGGTCTTATTTAAGAGGAGAGGATGGAAATTTTGGAGCAGGATATGCT +GTGGTTTCCTTACTAGAGGTAATTAAAGCCGGTCCTCTTCCCGAAGCCAGATCATCTCAAGTGGCTGAGTTGACTGCCCT +GACCCGAGCTTGTCAATTGGCAAAATACAAGGCTGCAAACATTTGCACTGACAGCTGCTATGCTTTTGGGGTTTGCATGA +CTTTGGGATGCTATGGAAAGATGGAGGATATTTAGCCTCCTCAGGGCAACCCATAAAAAATGTACAAGTATCAGAGCTGT 
+TAGAAGCTattctaggccgggtgcagtgactcacgcctgtaatcccagcactttgggaggccgaggcgggcggatcacat +gaggtcaggagttcaagaccagcctggccgacatagtgaaaccccgtctctactaaaaatacaaaaattagccaggcatg +gtggcgggtgcctgtaatcccagctacttgggaggttgaagcaggagaattgcttgaacccgggaggtggaggttacagt +gagccaagactgcgccactgcacttcagcctgagtgacaagagtgagactccatctcaaaaaaaaaaaaaaaaaaaaagc +tattctaaaacCAAAATAGTTCACAACCATAAAAATCCCAGGTCACTTTAAATTAAACACCACAGAAGTTCAGGGTAACC +AATTGGCTCATGCCACAGCTAAAAGAGCAGCATTTGAGCCAGCCCCAATCCAGAAAATGACCATAAAACTCAAAACACTT +AAAAAcatatgatatagtttggctgtgtccccacccaaatcttaccttacattgtagctcccacaattctcacatgttgt +gggagggacccagtgggagataattgaatcacgggggcagtttccctcatgctgttctcatggtagtgagtaagtctcat +gagatctgatgactttataaggggaaaccccttttgcttggctcccattctctcttgcttgctgccatgtaagatatgcc +tttccccttccaccatgattgtgaggcctccccagccatgtgaaactgtgagtccattaaacctctttttcttgataaat +tacccagtcttgggtgtgtcttcatcagtagtgtgaaaacggagtaatacagtaaattggtaccagcagagtgtggtggt +gttgtaaagatacccaaaaatgtggaagtaactttggaactgggtaacaggtagaggtcggaacagtttggagggctcag +aagaagacaggaacatgtgggaaagtttggaactccctagagacttgttgaatggatttgaccaaaatgctgataatgat +atggacaatgaaatccaggctgaggtggtctcagatggagagaaggaacttgttgggaactagaataaaggtgactcttg +ctatgttttaggaAAGAgaatggtggcattttgcccctgccctagacatctgtgaaactttgaacttgagggagatgatt +taggtaTCTGGcaggaaatttctaagcaaagaaaagcattcaagaggtggctTGGGTGCTGTTAATAAGATTAAGTTTTt +gaagggaaacagcataaaagtttagaaaatgtgCAGTCTGACGatgagatagaaaaggaaaacccattttctgtggagaa +attcaagacagttgcagaaatttgcataagtaatgaggagccaaattttttgtttgtttgttttgtttttttgagatggg +agtctccctctgtcgcccaggctggagtgcagtggcatgatctcggctcaccacaccctcccttcctgggttcaagcaat +tctcctgcctcagcctcctgagtggctgggattacaggcgcctgactccactcccagctaatttttgtatttttagtaga +gacggggtttcaccatgttggccaggctggtcttgaactcctgatctcaagtgatctgcctgccttggcctcccaaagtg +ctggaattacaggcgtgagcaaggagccaaatgttagtcaccaagacaatgagaaaatgtctccagggcatgtcagagac +attcgtggcagcccctcccatcacaggcctggaggcctagtaggaaaaaatgttttttggtgcctggcccagggcccccc +tgctgtgtgcagtctagtaaggacttggtgccctgtgtcccagccactctagcaatggctaaaaggggccaaggtatagc 
+ttGGGTTGTGGCTTCAAAGAATGGAAGctctaagccttggcagcttccacgtgatgtggagcctgtgggtgcacagaagt +caagaattgaggtttgggaacctctgcctagatttcagaggatatatggaaatgcctggacgTCCAGGCAGGAGTTTGCT +GCAGGgctggggccctcatggagaacctctgctagggcagtgcagaagaaaaatgtgaggttgaagcccccacacagagt +cctcactagggcactgcctagtggagttgtgagaagagggccactgtcctccaaacCCCACACTGGTAGATCCATCAACA +GCTTGCTCTGTGGCCCTGGAagagccacagacactcaactccagcccatgaaagcagccaagagtggggtataccctgca +aagccacagggcagaactgcccaaggccatggaaacccaccttttgcatcagtgtgacccggatgtgagacatggtgtca +aaggaggtcattttggaactttaaagtttaatgactgccccactggatttcggacttgcatgggccttTAGTCCCTtcgt +tttggctaatttctctaATTtcgaatggctgtatttacccaatgactatacccccattgtatctaggaagtaactaactt +acttttgattttacaggctcataggcagaagggacttgctgtGTCCCAGATGAGagtttggactgtgaacttttgagtta +atgctaaaaggactttgggggactgttgggaaggcattattgcttttgaaatatgaggacatgagatttgggaggggcca +gggatgcaatggtattgtttggctgtgtccccacccaaatctcatcttgaattatggctttcataatttccatgtgtttt +ggGAGGGATCTggagggagataattgaattaggGGGGCAGTTTTTCCCCATACTTTTCTCCTGGTAGTGAacaaatctca +caagatctgatggttttataacgggAAACCCTTTCAtgtggctctcattctctcttgcctgctaccatgtgaCTGAAGAA +GCCTCACAAAACATGCAAACAGTACAGATTTCAACAATCCTGAACCAGCAAACAGATGGTCAAGGAATCTGCAAGCCCTG +TGGCACATCTTAATCCGTTAGGTCCTCGTGTCTAGGTGCATGCTGATGAGAATATGTTCCACCAAATCCAGGCTGCCCTC +CTAGGGTGATGGGAAAGCAGAGGGGTTAAGTATGGCACAATCCACAGTTCAGTTCACCAAGTAAGGATGTGAGtaaaatg +ctaaaaacaaaaaagccactgAAGAGTTGCTTATAGAGTCCAAACTCATGTAGAAAAGGCCGTGTCCCTCAAGGCTTTCA +TTGTCACCAACAAATCTGGTATTGAGATTGCCAAGAATGATTACTCAGCCATTAATTTTGCAGAGTAAGaagaaccaatt +tttaaaatgaggctgggtgtaTGGCAGCTCTGAGCATGCCCATATTTAGTATGAAAATGGATGGCACCCCCATTCTAGGA +AATCTCCGCCTTTTCCCtagaaaaccacatgattattccACCCCCTAATTAGAAGAGCACATAAAGTTAGAAACCCAAAC +TCCTTTGTGCATGACCTCTGTCCTAAGTGTGAGCTTTCGCTTTGCAATCAAAGCTCCTGGCTTTCCACTTCACGGACTcg +gccctgaattctttctcacTGTGGTGCCAAAAACCTGGACACCAGCTGTGGCTGGCATCCCACAGGCATCGGTAGACCCT +CTTGAACCCTCCAGCAACAGGTGAGGGACAGGAAGatggcagggagggaggccaggATGTCCTCAGGTGAGTCCAAGTAG +GAAGGCCCACTGCTGGGAAGGGGGCCTGTTGTCCAGTTGGTCACCCTTGAGCTGTCCTGGGAGATGGGTGTGTCTGTCCA 
+GCTTCAGCCTGGCCAAGCTTACGCACTGACTGCGGCTTTCTCTGCTGTCTTGGCTCCTGGGTGGTCATCATCATATGTCA +TCCTCTGTGGATGACATTTCCTCAAGTCCAGCGCTGGGTGGCTAGGATAGGCCTAGGATGTCCATCCAGCTGGCTACATC +CTCAAGAGCTCATGGGCCCAGCAATGGCCTGAGGAGTACTTCAAGGTGTGTCCAGCACAGGTCCTGTGCAGTGTGACACC +CCACCAGGTGACTTCAGGGTGTGTGCCTGCTGCCTGGACCCAGAGGGCAAGCATGTGAAATCACAGCTCTCTCTTTGGGA +ACAAAAGGAGAAGCAGTTTTTGCATGACTTAGTTCCCAAGCTTAGTTTTCCTTTGGAATTGTGAGTTTGGGGTCTTGAAA +TTCTATTTTCCTTCCAcagcttctttctcttcccttttccttgtctcttcagttataatttttttgttgttgttaatatt +aACTGTCAGGAATTTCTGTTTCCAGAACCTGGCTGATTACATGGTCAGTGATTTCTAGGGCAGcaggagaaatattttaa +aagctacttTTAAGAAGCATTATTTTATTGGAGATGGTGTAtttgttcattctcatgctgctatgaaaaaaCACCTGAGA +ataggtaattcataaaggaaaggggtttaagtgactcacggttccacattgctggggaggcctcaggaaatttacaatca +atgcagaaggcaggtacCCTCTCCACAGGGCTGCAGGATGCAATGAATGCAAGCAGGGCAAATGCCAGACGCTtaacaaa +accatcagatcccgtgagactcactcactatcacaacagcacggggaaaaccacccccacgATCCGGTCACCTCCGCTTG +GTCCTgctcttgacacatggggattacaattcaagatgagattttgggtggggacacagccagaccatgtCAGATAGGTc +actggtctcaaagtcctggcttcAAGAAATCATTCCTTCTTATCCTCCCATatttctgggatcacaggcatgagccacca +taagctacttaaaaaaaaaaaaaagcctggcagtGTAGGCAGGAAAGTTAAGGGATTCCTTGGGGCAGGGTTGGCAGGTA +AGTAGAGGGATTCCCTGAGGTTTATGTTTCCTCCAGGGCTGTTGCTTGCCAGGCGTTTGATGTGCCTGCCCTGAGGGCAG +CCTGAGGCCTGACACCACCACCCAAAGCCTGGACTCTGGGACCGTGGCTCTCAGTGCATGAACTCAGGGAAACTCTCCCT +TCAGAGAAAGGTGGGTAGAGGTGGGTGCCTCTCCCACCTTACCTGTGGGTAGGAAGGGGGCTGCATCTCTCAAGCCAGAA +ACCTGAAGCAGGCACCGAGTGCTCCCTGTCTGAGCCTGGGTGGAAGGGCAGAGATATGTTCCTGAGAATTCCCAGCAATC +TGGGGGGGATTCGCACTTCCTTGCAGCAGGAGCTCTGGGTTACTTTGCTGGTTGAAGAGGAAAGTAAGTGCCAGGCCCTC +GGCTTCGTGAAAGGGGTGGGTGGGCAGGTCAAGGTGGGCTGTGGAACTCCTCTGTGTACACATTGAATTTAGAACTTTAG +CTTTGACCCATGCAGAAGTGAATGTGCACGCTCTGAGAACTCATGAGAAAATCACCAGAGAAAACAAGCCTTGAGCAGAG +TCCCAAGAAACCACAAACAAAACATCGAGTGCGCAGTGGCTGCCGGTCCTGAGATGATCAGACCCAGAATAGGAATGCAT +CTGTGTTTAAAGGAGTGCAAAGCAGTGAGGAGGAGAAAGCATCAGAACTATTGAAGAGTCCAGAACAGAAGCCAGGGAAA +TTCCCAGAAATGAAAATCAGGACAACTAACATTAGAAACAGGATGAATTGGCTAAAAAGGTTAGACATAGCTAAAGAGCT 
+GCAGTGATAAAAGGGCTGGACTGAATGAGCCCGCTTATGTGAAGTTCAAACACAGGCAAAGCCAAACGTGACCTAGAGAC +ACAGACGTGCAGTAGAACCATGAGGAAGGGCAAGGGATGGTTAACACGAGAGTCTCGTGAGATTGTGATGATCTCCTGGG +GGAACAGCATGCAGAGGGAAGGCCCTGGTGACACCTGAGGCCAATGGGGCATTCTGGACTCGAAGGTTGTTGAGCAGCgg +ccctggcctccacccaccagatgccagtggtGGCCCCTCCCTGTGATGACAACCCAGGCTGTCTGCAGACTTGGCAAGCT +CCCCTGGGGGGAGAAAAATTGCCCCCAGGTGACAGCCATTGAATTAGAGGTTAAGAGTGTTTCAATTGCTTATGCTGTCT +TTCTTAGGGTGGGCAGTGCCTATGTAGGTATTCattctgtattatttaaaatgtctacaTGTGCACCCTTTATTTTAGGA +TATGTTtttcaggaataaaataataatatacattctCTGGCCTAGTAACAGAGAATCCTACTGGATAAATGAATATGGAA +AATAGTTTTCCTGGGAGCTTATAGCAGGGCTTCTCAGTGTTATTTGCTATTTAACTCCGTATTGAAAAAtgttggcaggg +cacagtggctcacatctgtaatcccaacactttgggaggctgaggcgggcagatcacttgaggtcagaagttcaagacca +gcctggccaacatggcaaaaccctgtctctactaaaagtacaagaattagccagcatgatggtgcatgcctgtaatccca +gctacttgggaggctgaggcaggagaattgcttgaaccgcgaggcagaggttgcagttagctgagatcgcaccactgcac +tccagcctgggtgacagagcaagactctgtttcaaaaaagaaaaaagaaaaagaataagaaaaaatatctagGAAGAAGA +ACTTTCCAAAAACAGCTTTATCTAGTGTGGGGACACAATTTtgtagtcccaagtactcaggaggctgaggcaggaggacc +acttgaccttgggggtttgaggccagcctgggcaagatagaccttatctctaaaaaataaacaaggccaggtgtggtggc +tcacgcctgtaatcccagcactttgggaggctaaggcgggcggatcacgaggtcaggagttcgagaccagcctggccaat +atggtgaaaccttgtctctactaaaaatacgaaaattagccgggtgtggtggcatgcgcctgtagtcccagctacttggg +aggctgaggcaggagaatcgcttgaacccaggaggcagaggttgcagtgagccaagatcatgccattgcactccagcctg +ggtgacagagtgagactctgtctcaaaaaaaaaaggaaagtttattagGAAGAATGCTGTaacagagtgcagtggggcgC +CTCAGAGAGAGGATTGAGCGCCCAGTGGTGGATTTTCCATAGGAGCATTTATGGACCTTAAGGCTGGAGCTTAGGGTTGT +AAACTGAGTTTCAGCATGGCATTCCAGAGatgtttagaaattttatttacttataaaagTTGAAAGAGGCCTGGAATTGA +AGCGGTGTCATTTGTCTGGGGTAATACCTGAGGTTTGTTGCCTCATGTCAAGGAAATTGAGGACGTAGACCCACAATGAG +TGAGTTTTAGAGTgggagtttaataggcaaaagagaaAAGTTCCCTTGTGCAGAGTGAGGGGGGTTCTGAGAGGATTTCC +CCCTTTGCAACCAGATGGTTTTATCGATGAGCTTGAgaaggtggtgtctgatttacatagggcacagaggattggttgga +ccaggtgtgccatttacatAGCACTGGAAGAAGCGGCCAACTCATCCTTTTATTATGCAAATCTTTTTATGCAAATGGAG 
+TCTCTACGCGCCAGGCGCCACGTTGCCTGCTTTTTTACTGCACATGTGAtgacaaagaaaatggaagggGAACCTCCATG +TTGAACATACTTGGCTTCCAGGTatcccttttctattggcacagctgccggcATTCACCTATGCAGGCTTCCAGCTGGCT +TATCTgtgtctgcagctcaattttacaagctgctctttgttagaaaagaaataatttgggggctgctttttgttaaaagg +aaaGCCTTACTGAGGACTCTCTTACCCTTGCagcaggacgagccacagacaaaacctctcagacaccgagttgtagaagg +aagggctttattcagctgggagcatcggcaagctaCTGTCTCAAAATGCGAGGTCCCTgaatgcacaatttctgtccttt +ttaagggctcacaacactaaagatttcataTGAAAGGATTGTGATTGACTGAGCAATCTAGGGGATACGTAACAGGGGTG +TCGTGCAGAGAGAGTCggagagaaacagaacagagcagggagtttcacaatgttcttccATACAATGCCTGAAATCTATG +GGTAACATCGGGTTCTAAGTcaagagttgatttttaactactaggtttaggccaggcaggcccaggcctggttttgggCC +TGGCGctgggctgcctgtctttgatttcacttccttggtttttttttttaatcaggtactgagtataaaacaatatgaaa +caatatgagagggtctctctcttccctcaccctCACTAATGGCCTAAATAAACAATATGAGAgggcctctctcttccctc +accctCACTAACTGCCTAAATGATTTCTTTCTAGTTCCTGTATCATGATCAGATGCCGTCTTCAGATACTAGGGAAGTTT +GATTACTTCTAAATTCCCCAGATAAGGAGTTTTGCCTCCAGATGGCCTGTTTGATGGTCACCAGGTGGTCTTGGCTCCCT +TCTGAATTGCCCAGGTAAGAAGATTTGTCTCTGGGGCCTGTCCAATGGTCACCAGATGATTTTTGCCCTCCtcacagcct +aggcaacatagtaagacccccatctctacaaaaaatgcaaaatcagccaggcgtggcagcgcacgcctgtggtcccagct +actcagaggggctgagacaggaggatcgcttgaacccaggaggtctgGGTTGTAATGAGCCGTgactgcacctctgcact +ccagcctgggagatggagtcttaaaaaaaaatgtttttccggGTGTTATGAGTGTGCTCTCCAGCTTCAGTGTGACTTTG +TTTGTTGGAGACTTGGGTAAGTAtccaaattgtttttcagatGTAAATGTTAGTAGCTCTCATTCATGGGAAGTTATGGT +CTGCTCTTTAAGTTCTATGGGGCTATCTTCACATCTGTGGCACAAGGCTGGTCAGTTTTACATCTGAGAAAAGTGAGTCT +GGGGAAGTGCCCCTGCCATCCATCCCACAGCAGGCAAGCAGCCTCCTCCgcattctcctcttcctcttctcatgcCTGGT +CCTCTCCTTTTGTCATCTTTTCTTCCCTCACCCCATCTTTATTGTATCTCTTTTTGTCTGTctctaaacaatattttaat +tggTTGCATTGTTTTTATGTTCCCTGTTTTTAATATTCATGacattcccttcctctcccccttcctGTCTTCTCCACATT +AAGTTCACCCTCTTTGTACTTTTACCTGAGTGCTTCCCCCCGttctcctccaccccaccccacttcaGGCAATTCTGAGT +GGTACAGAGGTGTCACATCCATCTCCTGGGACTGTATGAGTCTGTCTCACTTTCCTTTGAAAGAAGCAGTTGGAAATTCA +ttctacttcatttttctttctttctttctttcttttgagatggagtctcgctctgtcccccaggctggagtgcagtggca 
+cgatcgtggctcactgcaacctccgcctcctgagttcaagcgattcttctgcctcagactcccaagtagctgggattaca +ggcgtgagccaccacgcccgactaatttttgtattttcagtagagacaaggtgtcaccatattggccaggctggtctcca +g diff --git a/fdog/data/assembly_dir/HOMSA@9606@v1/blast_dir/HOMSA@9606@v1.fa.ndb b/fdog/data/assembly_dir/HOMSA@9606@v1/blast_dir/HOMSA@9606@v1.fa.ndb new file mode 100644 index 0000000000000000000000000000000000000000..6b7012de9d3116078f9be14b2faa0dcc7f482a3c GIT binary patch literal 32768 zcmeI*ze>YE9Ki9P@lS)xNeRA!xRlr-=oWEtDGokCs1^#^Y@PdBF4CvyC>=Tq_6;q%*@Tqw9Vl-0tg_0 z00IagfB*srAbeVS%!Yf?k!kugNuD3=WWSay-&_vYwx-WZZT@&NN;jRa%k5I0 zD7%ZUtjvn&-x^X7KmY**5I_I{1Q0*~0R#}(If3le{j9i<|MqThe;r>%VH8g>-I-VSIgi3ll=ekMzZ)nkcD|S z3o-!o;ZP%h00IagfB*srAbE}nvD^&8FpMt4Uf-EcC+o%7#XV?F` uvNTJ+|7Te!2q1s}0tg_000IagfB*srY*pa5*Z3Z{glL#M1B3l;^NfYq@2q5l*}T1eGh-%U`Gc_GXpb+GDB3JURokh zzX*y7pqO4>N)k|9JUF#L!O%oU!N|bKR6)Vq%Gg{Xz!z*GqfNdR`K?e5{s^Dyj}C=Z;@1E^ms43yx8c1;;@x z?OrV8wDOtipIFoDb#H9Q(IE-$h3@!aSp%!IDYTS|2|!#ow(*9u0SmFNHttUGBbBC& zZqNSk;lfm^G2aLs&*{U+qmtXN3Gx`A`NELEb<{egM!+M8q+3O(r&%PbwNZoRC20@n zFyP2|<)X8U`jUK$J}uNanrugKnC6FM6TZZn2VbBxz?>~lO98Hj4ryc&nmO%-)%Ui{ z*c9zt`s@y6(3zbFzm|<}JxTQT+=bLsrgD|7K{%v+RE<>yE)OCXOhSL2;g4zubY9QK zB8zYuD8An)!N!}H&5kMbPDbO6M9%2>lm$AHlbQzHEZGGVCYCHltnjk~b28Jda3xc4 zly&wp#alsO6nk+AL-L*DtyW!a$NH=%pNY$et7+TuL5roXI)Hqbwdo2oTW*9ayIMh9 zk(${IOzf%9X!NVBbJ;`vJ0xYZI_@M;E(6;~iF9UPh38~rw->*|Mya~E)L&DDM^mL1 zeKreX-8cU0=566qtvexDlc-oYCe6CmM&VQBo^{Y}))Kg2GRH*13q0(RaO|{gu?Z5V zMZKa`{Q(E~01FQ=zS7`^M_84?x&Z)#3%VGMK!ht|F2W_#5;0kZ5Ks0)x(0Dtqrgc& z9fZyd7XZ-L%6+-j))sB7Kiux(IuxSh$xAEi+IHOKckWZXe3G$WCV9lROm{JYnNNMQ z##Jjy)yme-IhQuu$(a>0VNijSIFY3(sb6i`T&6gU=GfM}x7#heD5!=GJW@Gk$vd#V zQv|jYD9M<}#bV#8-<);z@U?90jT6k=6~&%zbLD9=md0Q(lIe4Mr&y@Fk^BK1{;y6zJ#x}hr5;=9IhPpiBh~8F{MPgAP><~i`nmNuKS?v5e&&VzAORa^ z9Co%_wTu4n{O9`hkSB$=@BJoaQd-8Tl&VqCih8|U3R!}x7X3us_K->S7oEjE_X~Xi zcoeH6D+7Zp6bK-|+}>c|`a{#AWrXkZR7)Mm47`b#nA;=fmT9~5aIRiRsDX4>m7)exk_n?lol_U*v+>( zx9W-GS=w(XC!ZdOpwEga;@0_tgt)j*Z=~94qMhK%vWqsi^-l;({PU>!Px{%tNaLI 
z*+w0V|E0AFXs#&Td{sMCSHsu5;Rt@W9Iynd38|@IXDVhI^^zi5XXy#Y?ul*<5N*4N zJ41$(CTxBRfCw(uEvhA+Kw_`xlrNLScLifM8zkLvUw#xitB|t?9x{b#%NF50NrVY( zICG-RGPoB*Fiw-b1Vh7bKL@+qzp{ME~y@Y$yLY_O-hTsrT5|Z>oF#s{X`T1TgjT=x%4!-?WoxdDAGLH>Ua<#NAI2~L+&;D73e~oed zWJeXhr>#`@Ciu-Izhrs-KGgW(;(GHl97nMcxU@vW+Jnq*p*dP&y`a^B6urHx_lf6ne3_E%F9(u-yA z8H1o<#rE<>*F#i>#bm~}kKGoIK1XLGtI#W)g9z{hE7__fe!TN^v_Ew&hI?$T&0l;U ze7mc<6}6o5M8hCi2fzJCmyyMaP190;XjW*Cneqp^?ePx=0 zzC51XsDvMAPG7Cuy7J$5{Xv7;@L<)%ue)n`xOCsBQw@X>(LrdRPmW&av`fRRNK9l(F}LvX z4k=u4p4v5XLs@s-q2_k*G`T`)iKpg=&gW94&jH;4y@w~SHT)vrI1v-x@|cwsxv|-# zb-SLHTQf4EjBX>Jf@4hdNdSmW2z8haH$E=#{eefdn`^uIO-aM!-gB(TRH`Xg+s2-4 z{~6gWoBh>nbpy&`+Xb-9TeC)Df-X4Xjb(&fjor`#}wEU#ZRv;|# zDvSBg0`*iK)|@P*$Bkr7$1zfE=WC}5i%OPeB@Fm4P>CW&_r+FA(&9uqvtec22WWgU zxi_|m9A{^zV7J=+h~=0sc#ap%N+>iwk!W5znq`Byew=?b){f~TsS#eUAX?*XEJn!K zUs7n-M}n(P`*$f{iz!kh8~{Syz6A=x=vIQOyj1`_bf@j`q%Sn+8B7x!A*yv+LA7kz6qvZ8kH51{1HtSFK|!;^zMN;!knhkYrcLEE~8OCQ~(?PTZ+4N7vsUiQT%^o5ym@TrBv#_r=`!XctSB-L06N z`12H}MNXx28lDb$P|k67o{Gy&i!ZSY&I`(i;uccu*Q?-0C#brb3pJk-3-JxD@`Ys; z+M~e7(l~p=K{zDm6!qn?8;sLmP&a~Z$Ox6V@caPt#uxzhvD^V0gsI%Y?U{} z+?+UHOLd^u3H~kNnQ`thDA2Wnl_bgO+$(cEM8>>G-D_E=t1 zBzqvQJ?e0-NmvGB7wWWrrBhDK`47Ixo?bO^k}Ek*CkZ4Lg^;HNasY~bkq60`_e@P| zlmFBTJpDO%V+!;T6jQk3!03&VLz0mqRS){+1y&y4x=`p ztwqZ6^I2YOsi^aL%sM`PY=Ppn@f~Y-5|R=-I2U_CGWu_#4f{#hGK(=;3cja6$POB%sd_Ph9%{Ai;OX1+Sy7?17tp zm6O^UioH3xq~IOiU_)iDDK9E#FvDiS-)GY7$1$d^tI-QNRTwn)&I8{&U(J&vhPFIV zcX+$bwpqUyZkyrR`50K{FinkfwL)h#8r{|gcN2eDQ;GS2F$KzkprHz2{G1;Fq;X%T zjKy|>OJ*kCnm;@;T=819(tP(}U$;wdADaB={bsAmw_1|70yUwXegPX-w;s=`=Otr5bFZcY&CVZbj=h6~Tdy?gB6OR$^fXZb8^o^z;34#__A zWt_RXZa2WcuzVv0sE76#g|)@@hv!YST04r$0fs;~jUF|u6y@dnQuVTXy`6kW7tRi zV`g-C9klIRD7UMSPNz$%TQA*Yl#dkuio20$_>K)RDEl&kX90)!24r?O1h6Bxeghy4 zgU0(u@((U)cKqS1G*1cJ(&+IBRjyyGIm%+R$|kjnQLZ8dW{xqTnii_j-|rvMpVvfn zaK0IKw0^a4lS;7}b+9c10Q%s%+xH7Pu4ro=!LP6VXg zx{a$`5fy$A>A?X8eu%<5KUCv6`ultGtA;zd|G0=LetY#tJ$$mQ;pPp_ zJyOCBk9nuU;MC=lJX|S%`m4;=V+A9E+Vt!!e=i6K#QivUNMu%)y7*@C{x`C+vbbdk 
z^#J|<2LSL7S=txmpRye&L{8Cx+ojR=9^;xL_Yk%XHZ6W8G8I(Wu*eZVA{u5n(IIBZE~AM{C@EQxtC+JS(KcjTz@d_=lk{N z|6w?U#A%gK4@;Oc@An3#x{oR_fhECyy9t}S4?drDk@l7Etc^eGt6nm__F@*+sF+U? zAPOaQRyBHQLw(P1*};yl{kK-FiOxhzUO;JOp{%M;0iN&}-(GlUS_h6VeW}=FVO~F5 z)V<$$dzE`M8B?&@RIBqG?_gwL1>DlZJqAa3DYPEQ3NZyCS)UI8#PydCt@cn4=lm+T z+~G@)Kg_g@XIU@3Q)bBp35cfE`$j)d?d3(okp#cy%`D^ro$9zbI^{eSi`20TtF+NU z#zZ6Bk7ZcL=E^Nd0G#joY?$>hxBBIXqf0#An-j{XwW^dma=*G zKnRHQIzB7Z&(M7&axH%gcl6njA^i`PDVrVMthb5ptRvRny8}e#u6rOo)}^g!CA{p)7yC3pO>!c(0uCI?qewIAfgvG+y2pU zQE24-6iS!(1Lx{7iyBcKo~ku9C9F@TwSP>aLBfP9Nt#=KR!XJm0J7%^`C7s&N-qTAH=jS9<%^lb|HsUbBx!c@%{Jl4!_e@jh zvCSA8ir+T}RtlsEdp*wq;$XRzh-I~nPTSLu5NR8lp&z_^2kfB$ir7@BIcLR0@k-G?=WNS;OJiz|s5z%L zRLwbKBOj_&m4X{sZ&LhU$NcJ0_{U?8o%-(#i}-heX;5=cYN2ZxdhB4oR%(?`QV-Xy z!wA#kDTlZMUPv!WdTQ)mX}Im}nyX$J=Kxb~j}PCT+-*Do*n73*6d#Q@#c?u~^PyI; z`MyvW$cMtY3WBgGFfyp5E%+ax8t9H8-o(%z71W@E%d1U&9fZ}<;vT|U7m>6H=~dIX z4a#Q}>y~KZiS(FUvB?P*h?7-d#CW1jx^&j9Wa!%>JkX!ik2wG@hF{T${ptOPeHT<~ z>jKxkacq6HgNnce^ik;yM8QDS#Pq#4 z-)8b|&>sF^(U9YmTxW>2`%bW3d;AB(AAg&9Y%FE%Qo@T|^mkX+VwbEgRr2pY?U%|23*Iq9~^t6}Dl1Frg6ZQkMDs5E#gUbL(dmkK! 
z*#<`7cdOd5{{7%ouZV5mwfpwER?rj+&voR%rz(oqq2X4G9xENU&e)X)3$AF7#oft{ zl&!pa8YispLgmX!us?kv(n~_7*f4apYKV6KC?uL#fj3=LjaGF_7Cub-E}<^6a8Z3G4>=CCuElpLZeMV=Bm}7f=2?}Wy(@w!oVqn z7qERQh$cfTtTKR#4#f1-m=Q71aE6uWN}iD7rS=${|YEn{ONJi`UjhD2VO~C zjaYPrMG_*K)%{+~*>q3;{PeVQnWXvk)RAgpuw{_uVqG!h)ym>E-A>VipO#jk@$NVc zzK-s?G-cco@BUb%H+V2u*KY1OF|xXyil=YK1YM)skV*tZ+4sUaTt?StF)>evn21eY zB^aSY84&!F!0AM|m9Lj!7lox|Wi1uU@8!S>K%L2{o}4P4M$NJM6E25S#kLr0L$jKJ z7k_e-0x-@0YG5Tncpy9Q%f`zq8lW`r#0$nrpGf->uozrjq7n2Eq2pDLKjxoO0=&S@ z13ZOsW%UI#+yu>3qoUwKYGiF2spc1tuZs}G%2wLXgxr>6(h8G6k=qO5f4P8@P#07%vRd3a6)iWyj&H_TG zb$%J&6GtWt;P^Ik^ zu>LOEHQ({6!&AomD*y_cx;3LQ&<*ZW&8Q{0pL3Lq6@fa(7+lMJ8_!bb81@6*tNXeZ z>4jQK>c<+ZbuHz<-b2f`3gZbI{Ng1os_C+aiRnI)P+#!x;4`t{QbEVVHq_P;sZ$>Xpw;QvpWl4}89SWp)U?k}59PYHpNp72L2NV0sxo2l;0hP^ehRi73o#SImn+Q1!NFWWh>AbCTvc;-n?iz7h6 z>$JPDCxcEb#3v)T(6zM^Pe=Fhe$741%O05&#u@`3;vgi2!y6}{4ot&s@R7sMMqGQZ zxdy2?1F95geusP39@hy01|amTup=J+ruHqYIZxF{7QypAYnH%v4ym+CE;jdP2&mNo z-%z6H0@}A27iwdINLhpVof66Fz`EycB>t?!F;?$PFzQC-u1h5C#f#~; zO7S$UOYhVoWy3P@*!ceA)B>!-deqUv>fGPVm>PU6ct5*IlH9s< zsovR^w>#pzZ1^VHcKy`9&r>&IXEzIH<{hYZbdol>@Ig=#2ToP*`p8v_6shZzub5nFj*jI_bC0odO*%Qx?iq%9+LH``U(G5Bq3 z2@UV`8Fx_bR-mC+e+3jx^q6KgCK4uH*F0+!;Xp$h5n#H$uM?JG4K~i-ofK*uzHC`P z^lCFfS>~n^K_T-7q9}D~t+21C1CfJ=KxV;3>!piaz$y33BL((k7V4K9m!hIt6@X}+ z>7cVjSw*=2?_uF{>p-$@wqaE;zt%f5`mt~6pk}1A@Ai>&dGxr`55Y|TA~aL$g5ty% zBOzJ7?M?R!(?}`k;k#OTTFpvyrYHz(5d+s2xgIix=@~Q;B2c;**MZ>ydXhnGu|52d zo*t>weL(BiG^{sR4{E`EjV`y}1uyq@sdcs?4d`A}jAHw9fL^29E9>z^l^*)xwN(+W z)t=PSP<-Go#=yLDC)a`}&iO4vbHQIIGg%so2g1NFcpBaH@=ZRUL zU-lSBW8uaf;Tp+^k*G(!V1=2FU-!fyc1$nt_isA-X3Ey9aXLdSEwlJ<&!yHPnR&q+ zHXTIOVte7-1)4z>pWWdvH=Pp_xUZKIv;4x}LAdDw#(YvUZlr*qPSCpibYb~7Hyp|r zh{y~Tapx+#@ZFxG3q*jXx`ZfS_2z7G584}%5TK%wyZ>fWv)=_)j8|766B1c*rbeWY zToS2)-%3cXn3koIImn$_lulk`1z(CVwnJr}tl(UCjz&5>L!iF#xm;%`t7D>=Ww`S7 zwsNg?(5Y%4B(m%YrzW{#{Yr)1B$Cg!&mP6ou_h6li`)fIVS_o(dYV}abk8>;;XO-) zU!#e4jlW3dp-LK@@jeH6>f6!+wPac^4W%UkYecY_3z^m*EA zI`-}9$cH-_>HvcdT!>$hsdnM8`$OT|CrdW}Nte^?H9sN)Z@Dqyyd``VH>tI@HsGG( 
zZU8L`+WUZE*m<_-tF#X@IwH+22B^C4ZT-hQtS6hCt2kwR%wUh9Ckim^?u2I=ZEHQD zr2z2rZX~$YzhHM!$k7PTLWPXLRf%4{+MRe6uepZCK*zUA=>VNl5)|d+wjE{J1px!X z^)~ms(S+q-wQE-acN&uHv&(hiJNJblS@YD!Ia-6?J`@PpHyn@6s?iPlf!e*b@4&h( zsomSOnA*XX-xEsyQs~kr6TYJg9JXUuJ72Bo{Wk*8am3Zu1FR|sz`(eRdmv)~*aXv3 zg!W%bbY330*!K8fOHa%CW@9G^JRhp#QU=NQtc1h>EsQ#i0OP6)@!6hp*~^#mX0k9< zw6KvviOp{%C-JCp4dPcT{Y0$H7ZJj}dI1`01{&<)&%9rg1LqQ({Qk>ElU~mzk0jUz z1g2}{Ef>s6hr`A^PO@<(r1R4V)*Stx8y6na2Oo=<2q$;I`JHRLUZ=*>_#|(}>3CP}3&0Uq9|Gy{ zgk)p#71%e{h|`;52Fe*&C0F&Ttj8x2kQeK|y?cyd5z3V^cqDNi-0g!ISnD)-iEfRZ zy*LMF0VsN);nUzbuQDP6WB_Kb0bV}tu~=KOb~K30FmvwJ`v4Xr;x{$4-8Ruu=1Ok? z&7GtDPSYqe2=Gh8Z2-`ah|oqUhHp)VgCwQ>q5f%Fgl7QI=dr6tP;b7cxoXOsBX)-- zFY4}aKZ$jRgQ3ufX;fXVtH~bN3y=UQK8^$2nLKb7K6>VhOP}I-6=`vkUss%{qFkzh zUy38&IdIk8S3Do; zW7ERxyZqLgYY);Fs=oU^?mF47F3#Sb{O^Wp{Mne`=&IuZ~v1Lw~(&wAgu@#6ZQb3b2eePnn2yHDP=UvJ31^KWY9sfxJvr#~LXFIC9z zQ^>!NZc>GkESr zKj%-iOsJ6pC2q8C@P=AUj2jJATgf_cHfLXUs2=S5?W<9LHJmUhy1BD)YPk0W6ADk& z#@_hqw06ke#^EIE&J9yiX6{1t!hdNbv28Rk_F!wsWyK&8tlN!dr~o3x=`XPsANAyYx&?swgM+=TN*0@GdDX)3mdicO zsWx+!@~d3=hVPDoQ4>@U>QFkPCnE>09(wK`AS*T}9kEpz>EYZq0$=?KE*)r`gee>r zQ3FW#G5v|*z3yod-wv+V6)Ic};dQ+15#3oKtdGxbyFQU>Xo(fHk!hbiR}C=zUDDA4 zLQil7m|66`tEu{EY&>A&{g3EHCo?pzM^151Q{h!EgS9^P3WIwYnvj6b+q5!}z z)a$mdvg3QteN@@cYo4?FsInh@whx=`z-|6k*#Q>2)1LaFy{=F!G_@tKW4&$pa-eZw zyb6`}Wav16YkDZE(+N#hP@+b~(F^(qJ<;QoS6816TrgUHmB$SN`#5#9uoP&G?x%zI zLU*mx2k|c__Kq?q2Zlu=7~lhNAnMips|57xFqf<8%KIJHRPP788s~0JMuhj+Dwi|6 zRN;$S9U7{xfm_gy5oonx#CyLXg2A%A!bK6ZIUAn5w)WuCWHi7tiy(zUpX;uvBJcCO zJ-mVi1P7{D|Lhsz+e-Wg)gg2rFWl1I-OU$*h`1lZ2Hi}zNDW9Y%f#GFtxF;dS@~w= zX|zWqPb+5dDUqo3rV`n>)Y+PsXMQlPrt*GjdE}Lp*dd$UBa5NzzE|KY4)@oUu^Zj@ zoy|AI3xv}>`m>Qv$W-RfkqCS9d{37YB*JL5FSEZVmFU^Vs1#1a8ib0-C@i^(AQ-hI zRrmC^=m10$Ez!W?u1I^p6Wbc|dv-t;J$7e9)Md|t%xLz+jYgqogHTXvOK||2b=u)6 zM}&^^>2Cb;aPmY?Lr7Tk>@zEvw;OYJloh+%uWI&vw$|9q_hS@AEG>EVi}no?fB|lu zX{-}&a8Nb+j6UJZRlZ9ad`*lUO#$!N(7^i2ES>+$SQ@qa$K$n>$}2XKvT5_?FtqCH zDc`%s@inHksY0@VkC$FtEWOJfuZ(?oTI+5(bS_m|&QOiijLsy5bgZ}6i5=FqSJaQw 
zm#8FVis0ap!Lmw0a|z2+hzwTP%IU8U-bEels8Kb`ped_f@h;Yr{=~&6Xx4j+oMJ@N zEA%h9<{wn0wjC=Sr;>NtmsjeSmsiA+^;;*=8CsDhW2IhgWvY0DYqmviv^e%{9 z&+2EHjk)hhCqQ5dsRlAclb-Ig^9O3-%p-rhEbabjjghFWPbZXFVTb7ns45K&GXyK} zj21(`!;>N6O2~B}cZz2WoEM4l7i$%MiD1P4vB^k;=+nV5y`!V(+qcWyvpET~^?vR@Yp@rNR*= zQ3K<2y4YWiY1!rYQ=j-S+s|ABK_!CYqTE2al7>{;M!2V4%s4kJHeJIn5)UoXQsZrP ztIKqPgkqz2eAC=99dcO}7>_6GGc7IoR>dv^5+V(VJMG| z=WFZ7R9Ap1OTBH>lI_)7lNzqzTY*OmFnb>kHhAq7#ykfXhPxiEf|Iliv|Ma6Ts*$z zTmh5;5m$?J`+w?rq77Ad;xO=J?$y=e!_7Ri4SEQt*I8sqfSvt2?c>yGfBGiheY&aQ zOUtmG`0y6ZQ;dTE*?xli%SQa07y6vnRJ&_x8?|eMJ*h3S%+R=>+FnuUA|>u-zYklqNkts!hnyaP$8F2xH<=vL@Q>--V1Xq$TV2xggyO_X`=N`GVRQh1%E&UtAM3 zj@nwN2?GHzv}L09Yqy%0 zy@)I%BBD%`jnHV+iv5s8Zk9YT!#QW8OLcr>MlB9WoxpH6d%ys#Fey~$`4!E{%hAvL zX&R3xc+td>bYVd1!uMo3$EG-LyvxPme?WJbzTW}q#B*Gt_!Tg^K+Kg1Jf{g&ak z(D~V<3FV!-Lkw`Fy8;S`b_?l6hI}HPV|g0CkTTaQ+#1 zQ_^3oDeVtE4e)|dJ!W?+B)oL@k{*~V3K}0jONzC}ViJI}icwWi5ODr3%fr4m7pJk9 z>&`pfcX6-KkZsaX$}6EHFC#;&jTd4$u;zA%o}+vt`hCRstdW$NktAY-9U45pw4`~@ zX2Ko1y=y!XIdCv3#I-xov&w9usVZp-5l62o zS+~p3^ZYAwLPg>Q2Po*IBKv)w!<<4p444(5;!3y3b5vb{B@tl#!ZshRTu z`Je_~(8ZM|u7Gy|uaTVRwtjslTb<4RR$RVPD=tQA#bv*5;OoPaCO>{`tb0LFpRE0z z+ADXM+MpLxonbjybWioNsU7-(cul~|Ax&2>`JRPn)i5Y^yWuC`Q-;=?D+r$<&zQ#NP7&q9(Lme$2D zTYweaaM8{)>m1=?yXXoTu$p{l%Y3aEVu>;may!cC7c6%t=at6b64x0Y z*rbQuv`}hQfhbGfx%}MKAv5`6M;{2WSg9&volYO0w3tk`x4PQ*dI1F;vxa~o0vxL4 zb5C5a>Z(%9;Gv*Bt}lQ5AJc~cWttJ2LWCbAM8;KyI(k(^qLs>zaIzj ziKX5T2$ZsD0;=$vfb!@;-nj+J^xW07zaIvMI z5FbMxvSZs_=mQw2c4ikyRnc>8{ISOJomtLfeDH<9YCv7&-4{FU3+oUc?J=7j{Kn^b z^8#TjTsHv9QrbQx8S{EYak4$9RLA3~0WR(w)m?9G|o7`?mUu@EG zh}h9ub?MG53cmf{Oqo3PWy~MSTWALKHy;k##NgmF@~$V!@t}ppmI_{cn6HyL*GA4m zDl=Owww`#4P$_kp?PZl~b4svRxz|;>ePNBvXW*~>LYqkIFCB)CEURj2#(C=f@}Z8e z8Vya6vR_^X(*iv6I$nLojagqV9+plBZbh{)5rQC~H_#EM(D0ZfVQ`B9X)JB(f>EN! 
zlY5tz`ebu?i2e;@eFBSep}~jCw}0Ze02>JruSYiCZfP`#2LiN%Q(a0AJ7H+=xOV49 z2dme2u^Re|N3CWdUhlWMfvXF`1Sr|PmN72TZ7Ka-8Hg!nrx4fwb*X4_b2C^t3otD=E?HRc2&#PhJSJI_ zZVhj#AwU-F?d=G@6o;J1hp9#8?9@8yE@@{dHhYPkC0`yNDaErROAnZvpH+-=l8f2G z;V`!*3FmgYg64!TX}ERU?n2(kNXe}*8j>a_=PxiTdIuYp@bJgWAuU)UVQRg;O;AG3 zz}0H|a))dgDK#)11Yaea`mhm@*df`0wsh1@a5vGC5&Y(6QtZYqL%ar6t+cNWXf>57 zRz@zfneTnXGn@8!6n9`%yLS(9ZD4IauMPdwZ@Awwy;n zL3`Z25tVmi=kncUAbr8jCs7Uj&R@Dd`%Bk$lVFyHpX2EtLoGvsBE)C>2}?g$ocmx?>h3X z#_;x5B&6#0WYRBN%hVVipK8a%8k4ozX7W!Lq|V`51TG^+jBuz&3R@ZESX1G{)H;?Y z%?4qbO2Fn6Q^#%O^Y*waD^$b=USD44SFo{dUBRgD-CmsJfx2exDAj*(>*R96U_-S~ zi?ZAkk@i9BLrO+C&e=frJ@fK#78r{J1jJ?|M4QcSu7K zos$P6S2y!5Ds7|XHC$*BsRbXA_WVAiZT)ECT=Qn1kU!hsHxR)&vIlN9YBkwYq54(g zG(fjr&nEkHBQEVU+ttghOs_*yQ+k+CN^2XKLu;Kn3h-U94-(SC-g)@{4a$-oaB40t zsM1&f#Hjmw90Q;>)-w(JCiku6eUm$FZt?^Vcc1@(nY&+b1@-CK5Jm?E1b_h2AON=u zsSvdxgx?!$t22b|Iz>JZ?^o#zi;SK6ttuONz+uzZV*poJm4i_IlgwXbUgzAu)MZ}n z5ZEjP`b?Ut9Rlz_)JSky82*v!se^a)qTPI{{Vs4?VZGpo%iH6s&=#(`lSo0?))#FNOY+8N(;e*; z-vI7b`9l-f^t1{ocj>kRWPvu4WjVaM&8*uSG25C}_t(D3-bxiN&AdNciMzcxPiiaN z3oV#O+>E42_SW(}Ro>tV>928ig+B>ORpvp^=v9q4Mfd<*E)S)*Qu1jFItwwt>SmrIw$%Jk-%C9l*=2-v%Ts z4H}$z%=CsSmQq%0hcx1?*84$MTeg__V|9lb2@QYnkxmr|xJI_U%aFn5NRKyDzyGe$ zxlP>wm#cdRFLc-u2;{0oWz|ASwV-)RVG(;@rRpa1=D2FOaRa@0&5R_9$xwHdNTi+! zK2ADkMKhb5F4WgvV(%ZtOzbmTRWgcH><NZw&3qJ;&udhbD_yU1&)Yc4HI}KO2v1$?=*zMif=gqO>YLTMC$7vM+ z?%1e*GQG?oB{@$il{=O@jgKOas>HtFCl47B;bl)2Src8GKGmK>p28rz#gM(QX+TSJ zoo*2;ER1MfX)U09yGhxYpv^8)B+B8Gi5P5D%F)*o%5kYE5Z%2f+g16FuI#xDr*9-N zqLVmDBN4ho6lL!o)j28pC;CXVNw7G7Gv_X@VaUUiTK;Z&TxucNf>^h6*EcO96#GM! 
yHQ)R_C;Fo$`_1`%k5oW)3HWA$Mav{R{`DirvEv8T9;x_wxcWK(D?Lc@1YSi@p{p+HqW9@Fr1T_$d!a|DnwR0}#*Gvzd|71X z{g6z)-S85T*RV^r-#=duIdzJkyQgkAju_6m?CR~ezpidNTg?3z?>t+V5+Fc;009C7 z2oNAZfB*pkw+qz1`!D+c`t+KWnseOrpYYhnoWK3cJb?fK0t5&UAV7cs0RjXF5J-XS zzayHM#HQV}xgruEK!5-N0t5&UAV7cs0RsO);N@6E6QfwqKVBEpx7F%>xjxnP9Q72v tW~JgB=Xra^(*N(D`o%y literal 0 HcmV?d00001 diff --git a/fdog/data/assembly_dir/HOMSA@9606@v1/blast_dir/HOMSA@9606@v1.fa.nto b/fdog/data/assembly_dir/HOMSA@9606@v1/blast_dir/HOMSA@9606@v1.fa.nto new file mode 100644 index 0000000000000000000000000000000000000000..20d5cb86e6dff1f3684dc229a358a2ea697cecfb GIT binary patch literal 8 KcmZQ%fB*mh5C8%I literal 0 HcmV?d00001 From 62de82b3252e2904dd9669fea7af7a59f0e20135 Mon Sep 17 00:00:00 2001 From: Hannah Muelbaier <47216555+HannahBioI@users.noreply.github.com> Date: Sun, 14 Sep 2025 19:56:33 +0200 Subject: [PATCH 222/229] fDOG update (#56) * removed bio.applications * version bump * update python version in github build * Update github_build.yml (#50) * replaced pkg_resources by importlib.metadata * replaced pkg_resources by importlib.metadata * Update README.md * replaced bio.applications in checkData * added option to use micromamba and mamba install * add option to use micromamba, mamba or conda * replace ete3 by ete4 * replace ete3 by ete4 * replace ete3 by ete4 * replace ete3 by ete4 * replace ete3 by ete4 * version bump * Update README.md * Update README.md (#53) * Update README.md * Update README.md (#54) * Update README.md --------- Co-authored-by: trvinh From 23cc9bd05c989d9f773f59b565d2e859d8730c0d Mon Sep 17 00:00:00 2001 From: Hannah Muelbaier <47216555+HannahBioI@users.noreply.github.com> Date: Sun, 14 Sep 2025 20:04:52 +0200 Subject: [PATCH 223/229] Fdog assembly update to 0.1.5.2 (#58) * added fDOG-Assembly workflow * added fDOG-Assembly dependencies * adjustments to new muscle version and fDOG version * further changes to adapt to new fDOG version * script to produce msa and hmm in the format fDOG-Assembly 
requires from a fasta file * script to produce msa and hmm in the format fDOG-Assembly requires from a fasta file * Bugfix muscle v5 command in fdog.addCoreGroup * fixed augustus version * further updates to adapt fDOG-Assembly to the new fDOG version. Additionally bugfix in co-ortholog detection if MSA crashed due to too many sequences. * contig fasta files where delited from the tmp folder to save memory * Revert "Merge branch 'fdog-assembly' of https://github.com/mueli94/fDOG-Assembly into fdog-assembly" This reverts commit d688bbb6f9498d3e8e02a20a8321a91b9215a23d, reversing changes made to 5068b7acf249209c0b4d951b23fa7c2d79af32ab. * bugfix for installation with miniconda and removal of MetaEuk tmp folder * Update .gitignore * add function to output only best isoform * added function to only output one isoform * new gff function, fixed problem that muscle can not work with path containing @ sysmbols, adapted mergeOutput that merges files with _og ending * Fixed bug where fDOG-Assembly does not terminate if FAS raises an error * bugfix createGff function * bugfix createGff function * bugfix gff output * remove MetaEuk tmp files * bug fix during distance calculation: AS U (selenocystein) will be scored as C (cystein) * bug fix user output * bug fix debug mode * updated check_ref_spec function * updated check_ref_spec function, added region output if augustus is used * update get_distance_function * bug fix check_ref_spec function * v 0.1.5.1 * update output containing candidate regions * bugfix * Updated fdog.addAssembly module * improved call of main fuction * Improved help function * bugfix using --strict * bug fix consensus sequence will now use precomputed consensus sequences when offered also with MetaEuk * day end -> working on merging overlapping candidate regions * removed bio.applications * version bump * update python version in github build * Update github_build.yml (#50) * replaced pkg_resources by importlib.metadata * replaced pkg_resources by 
importlib.metadata * Update README.md * replaced bio.applications in checkData * added option to use micromamba and mamba install * add option to use micromamba, mamba or conda * replace ete3 by ete4 * replace ete3 by ete4 * replace ete3 by ete4 * replace ete3 by ete4 * replace ete3 by ete4 * version bump * Update README.md * Update README.md (#53) * Update README.md * Update README.md (#54) * Update README.md * update * fDOG merge * CheckCoorthologsRef is on per default * region merge start, experimental * bug fix checkCoorthologsRefOff * --checkCoorthologs parameter is enabled per default, has to be turned off by using --checkCoorthologsOff --------- Co-authored-by: Hannah <47216555+mueli94@users.noreply.github.com> Co-authored-by: trvinh --- .idea/.gitignore | 8 ++++++++ .idea/fDOG-Assembly.iml | 14 ++++++++++++++ .idea/inspectionProfiles/profiles_settings.xml | 6 ++++++ .idea/misc.xml | 4 ++++ .idea/modules.xml | 8 ++++++++ .idea/vcs.xml | 6 ++++++ fdog/fDOGassembly.py | 9 +++++---- fdog/mergeOutput.py | 6 ++++++ 8 files changed, 57 insertions(+), 4 deletions(-) create mode 100644 .idea/.gitignore create mode 100644 .idea/fDOG-Assembly.iml create mode 100644 .idea/inspectionProfiles/profiles_settings.xml create mode 100644 .idea/misc.xml create mode 100644 .idea/modules.xml create mode 100644 .idea/vcs.xml diff --git a/.idea/.gitignore b/.idea/.gitignore new file mode 100644 index 0000000..13566b8 --- /dev/null +++ b/.idea/.gitignore @@ -0,0 +1,8 @@ +# Default ignored files +/shelf/ +/workspace.xml +# Editor-based HTTP Client requests +/httpRequests/ +# Datasource local storage ignored files +/dataSources/ +/dataSources.local.xml diff --git a/.idea/fDOG-Assembly.iml b/.idea/fDOG-Assembly.iml new file mode 100644 index 0000000..8e5446a --- /dev/null +++ b/.idea/fDOG-Assembly.iml @@ -0,0 +1,14 @@ + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml new file 
mode 100644 index 0000000..105ce2d --- /dev/null +++ b/.idea/inspectionProfiles/profiles_settings.xml @@ -0,0 +1,6 @@ + + + + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 0000000..b44cad8 --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,4 @@ + + + + \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 0000000..6773d30 --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000..35eb1dd --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 6358868..518840a 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -258,7 +258,7 @@ def augustus_ppx(regions, candidatesOutFile, length_extension, profile_path, aug # augutus call cmd = "augustus --protein=1 --gff3=on --proteinprofile=" + profile_path + " --predictionStart=" + start + " --predictionEnd=" + end + " --species=" + augustus_ref_species + " " + tmp_path + key + ".fasta > " + tmp_path + name + ".gff" #print(cmd) - starting_subprocess(cmd, 'silent') + starting_subprocess(cmd, 'normal') # transfer augustus output to AS sequence #print(tmp_path) #print(key) @@ -1086,7 +1086,7 @@ def main(): #################### handle user input ##################################### start = time.time() - version = '0.1.5.1' + version = '0.1.5.2' ################### initialize parser ###################################### parser = argparse.ArgumentParser(description='You are running fdog.assembly version ' + str(version) + '.') parser.add_argument('--version', action='version', version=str(version)) @@ -1108,7 +1108,7 @@ def main(): optional.add_argument('--evalBlast', help='E-value cut-off for the Blast search. 
(default: 0.00001)', action='store', default=0.00001, type=float) optional.add_argument('--strict', help='An ortholog is only then accepted when the reciprocity is fulfilled for each sequence in the core set', action='store_true', default=False) optional.add_argument('--msaTool', help='Choose between mafft-linsi or muscle for the multiple sequence alignment. (default:muscle)', choices=['mafft-linsi', 'muscle'], action='store', default='muscle') - optional.add_argument('--checkCoorthologsRef', help='During the final ortholog search, accept an ortholog also when its best hit in the reverse search is not the core ortholog itself, but a co-ortholog of it', action='store_true', default=False) + optional.add_argument('--checkCoorthologsOff', help='During the final ortholog search, accept an ortholog also when its best hit in the reverse search is not the core ortholog itself, but a co-ortholog of it', action='store_false', default=True) optional.add_argument('--scoringmatrix', help='Choose a scoring matrix for the distance criteria used by the option --checkCoorthologsRef. 
(default: blosum62)', choices=['identity', 'blastn', 'trans', 'benner6', 'benner22', 'benner74', 'blosum100', 'blosum30', 'blosum35', 'blosum40', 'blosum45', 'blosum50', 'blosum55', 'blosum60', 'blosum62', 'blosum65', 'blosum70', 'blosum75', 'blosum80', 'blosum85', 'blosum90', 'blosum95', 'feng', 'fitch', 'genetic', 'gonnet', 'grant', 'ident', 'johnson', 'levin', 'mclach', 'miyata', 'nwsgappep', 'pam120', 'pam180', 'pam250', 'pam30', 'pam300', 'pam60', 'pam90', 'rao', 'risler', 'structure'], action='store', default='blosum62') optional.add_argument('--coreTaxa', help='List of core taxa used during --strict', action='store', nargs="+", default=[]) #optional.add_argument('--filter', help='Switch the low complexity filter for the blast search on.', action='store', default='no') @@ -1139,7 +1139,8 @@ def main(): #I/O tmp = args.tmp strict = args.strict - checkCoorthologs = args.checkCoorthologsRef + checkCoorthologs = args.checkCoorthologsOff + print(checkCoorthologs) #others average_intron_length = args.avIntron length_extension = args.lengthExtension diff --git a/fdog/mergeOutput.py b/fdog/mergeOutput.py index ffebb5c..7949dde 100644 --- a/fdog/mergeOutput.py +++ b/fdog/mergeOutput.py @@ -117,6 +117,11 @@ def main(): if not ex_fasta: ex_fasta = out + '.extended.fa' ex_fasta_out = open(ex_fasta, 'w') + inSeq = SeqIO.to_dict((SeqIO.parse(open(directory + '/' + infile), 'fasta'))) + for seq in inSeq: + if not seq in fa_seq_id: + ex_fasta_out.write('>%s\n%s\n' % (seq, inSeq[seq].seq)) + fa_seq_id.add(seq) with open(directory + '/' + infile, 'r') as reader: lines = reader.readlines() for line in lines: @@ -129,6 +134,7 @@ def main(): lines = reader.readlines() for line in lines: og_fasta_out.write(line) + if phyloprofile: phyloprofile_out.close() if domains_0: From f13becc8eb3c2d00e779607a9f060873ebd2efc5 Mon Sep 17 00:00:00 2001 From: AnnamTran Date: Sat, 4 Oct 2025 17:09:46 +0200 Subject: [PATCH 224/229] creating assembly_dir in setup; auto detect datapath in 
fdog-assembly --- fdog/fDOGassembly.py | 49 +++++++++++++++++++++------------------- fdog/libs/preparation.py | 4 ++-- fdog/libs/tree.py | 2 +- fdog/setupfDog.py | 4 ++++ 4 files changed, 33 insertions(+), 26 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 518840a..2efd0fe 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -30,6 +30,8 @@ import shutil import multiprocessing as mp import fdog.libs.alignment as align_fn +import fdog.libs.zzz as general_fn +import fdog.libs.fas as fas_fn from tqdm import tqdm from pathlib import Path import pandas as pd @@ -59,12 +61,6 @@ def check_ref_spec(species_list, fasta_file): print("Reference species is not part of the ortholog group. Exciting ...") sys.exit() -def load_config(config_file): - with open(config_file, 'r') as stream: - try: - return yaml.safe_load(stream) - except yaml.YAMLError as exc: - print(exc) def starting_subprocess(cmd, mode, time_out = None): @@ -830,7 +826,7 @@ def coorthologs(candidate_names, tmp_path, candidatesFile, fasta, fdog_ref_speci if to_add != min_name: checked.append(to_add) elif name in tested and isoforms == False: - pass + pass else: checked.append(name) #print(checked) @@ -1140,7 +1136,7 @@ def main(): tmp = args.tmp strict = args.strict checkCoorthologs = args.checkCoorthologsOff - print(checkCoorthologs) + # print(checkCoorthologs) #others average_intron_length = args.avIntron length_extension = args.lengthExtension @@ -1187,25 +1183,28 @@ def main(): #checking paths if dataPath == '': fdogPath = os.path.realpath(__file__).replace('/fDOGassembly.py','') - configFile = fdogPath + '/bin/pathconfig.txt' + configFile = fdogPath + '/bin/pathconfig.yml' if not os.path.exists(configFile): - sys.exit('No pathconfig.txt found. 
Please run fdog.setup (https://github.com/BIONF/fDOG/wiki/Installation#setup-fdog) or give a dataPath') - if pathFile == '': - with open(configFile) as f: - dataPath = f.readline().strip() - else: - cfg = load_config(pathFile) - try: - dataPath = cfg['dataPath'] - except: - dataPath = 'config' + sys.exit( + f'No pathconfig.yml found at {pathconfigFile}. Please run fdog.setup ' + + '(https://github.com/BIONF/fDOG/wiki/Installation#setup-fdog).') + if pathFile: + configFile = os.path.abspath(pathFile) + cfg = general_fn.load_config(configFile) + try: + dataPath = cfg['datapath'] + except: + dataPath = os.getcwd() if out == '': out = os.getcwd() else: if out[-1] != "/": out = out + "/" + if not os.path.exists(out): + os.mkdir(out) check_path(out) + out = os.path.abspath(out) if os.path.exists(out + '/' + group): if append != True and force != True: @@ -1217,7 +1216,6 @@ def main(): out = out + '/' + group + '/' elif append == True: out = out + '/' + group + '/' - else: os.system('mkdir ' + out + '/' + group + ' >/dev/null 2>&1') out = out + '/' + group + '/' @@ -1379,7 +1377,7 @@ def main(): tmp_path = out + '/tmp/' fas_seed_id = createFasInput(orthologsOutFile, mappingFile) cmd = ['fas.run', '--seed', fasta_path , '--query' , orthologsOutFile , '--annotation_dir' , tmp_path + 'anno_dir' ,'--bidirectional', '--tsv', '--phyloprofile', mappingFile, '--seed_id', fas_seed_id, '--out_dir', out, '--out_name', group] - #print(cmd) + # print(cmd) fas_out = run_fas(cmd) clean_fas(out + group + "_forward.domains", 'domains') clean_fas(out + group + "_reverse.domains", 'domains') @@ -1393,8 +1391,13 @@ def main(): ################# remove tmp folder ######################################## - print("fDOG-Assembly finished completely in " + str(end-start) + "seconds.") - print("Group preparation: %s \t Ortholog search: %s \t FAS: %s \n" % (str(time_group), str(time_ortholog), str(time_fas))) + print( + f"fDOG-Assembly finished completely in {round(end-start,2)}s (" + f" Group 
preparation: {round(time_group,2)}s \t" + f"Ortholog search: {round(time_ortholog,2)}s \t" + f"FAS: {round(time_fas,2)}s)" + ) + print(f"Outputs are saved at {out}") sys.stdout = sys.__stdout__ cleanup(tmp, tmp_folder) diff --git a/fdog/libs/preparation.py b/fdog/libs/preparation.py index f18e141..18c2a9f 100644 --- a/fdog/libs/preparation.py +++ b/fdog/libs/preparation.py @@ -40,7 +40,7 @@ def parsing_paths(args): pathconfigFile = fdog_path + '/bin/pathconfig.yml' if not os.path.exists(pathconfigFile): sys.exit( - f'No pathconfig.txt found at {pathconfigFile}. Please run fdog.setup ' + f'No pathconfig.yml found at {pathconfigFile}. Please run fdog.setup ' + '(https://github.com/BIONF/fDOG/wiki/Installation#setup-fdog).') if pathFile: @@ -48,7 +48,7 @@ def parsing_paths(args): cfg = general_fn.load_config(pathconfigFile) try: - data_path = cfg['dataPath'] + data_path = cfg['datapath'] except: data_path = os.getcwd() diff --git a/fdog/libs/tree.py b/fdog/libs/tree.py index 57efcaf..fd3b55e 100644 --- a/fdog/libs/tree.py +++ b/fdog/libs/tree.py @@ -135,7 +135,7 @@ def abbr_ncbi_name(ncbi_name): E.g. 
"Homo sapiens" -> "HOMSA" """ if not ncbi_name.startswith('UNK'): - ncbi_name = re.sub('[^a-zA-Z1-9\s]+', '', ncbi_name) + ncbi_name = re.sub('[^a-zA-Z1-9\\s]+', '', ncbi_name) tax_name = ncbi_name.split() name = tax_name[0][:3].upper()+tax_name[1][:2].upper() else: diff --git a/fdog/setupfDog.py b/fdog/setupfDog.py index 184de6f..cad5582 100644 --- a/fdog/setupfDog.py +++ b/fdog/setupfDog.py @@ -174,6 +174,9 @@ def download_data(dataPath, resetData): os.rename('%s/genome_dir' % dataPath, '%s/searchTaxa_dir' % dataPath) os.rename('%s/blast_dir' % dataPath, '%s/coreTaxa_dir' % dataPath) os.rename('%s/weight_dir' % dataPath, '%s/annotation_dir' % dataPath) + if not 'assembly_path' in general_fn.read_dir(dataPath): + os.makedirs(f'{dataPath}/assembly_path') + shutil.copytree(f'{get_source_path()}/data/assembly_dir', f'{dataPath}/assembly_dir') check_cmd = 'fdog.checkData -s %s/searchTaxa_dir -c %s/coreTaxa_dir -a %s/annotation_dir --reblast --ignoreAnno' % (dataPath, dataPath, dataPath) try: print('Checking downloaded data...') @@ -197,6 +200,7 @@ def write_pathconfig(fdogPath, dataPath): cf.write('corepath: \'%s/coreTaxa_dir\'\n' % dataPath) cf.write('searchpath: \'%s/searchTaxa_dir\'\n' % dataPath) cf.write('annopath: \'%s/annotation_dir\'\n' % dataPath) + cf.write('assemblypath: \'%s/assembly_dir\'\n' % dataPath) def main(): From 34df42a6633aef1cf26ddf36ab8b8d5512a57fe3 Mon Sep 17 00:00:00 2001 From: AnnamTran Date: Mon, 6 Oct 2025 09:50:48 +0200 Subject: [PATCH 225/229] fixed undefined var --- fdog/fDOGassembly.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fdog/fDOGassembly.py b/fdog/fDOGassembly.py index 2efd0fe..850304a 100644 --- a/fdog/fDOGassembly.py +++ b/fdog/fDOGassembly.py @@ -960,7 +960,7 @@ def blockProfiles(core_path, group, mode, out, msaTool): check_path(fasta_path) if msaTool == "muscle": if align_fn.get_muscle_version(msaTool) == 'v3': - print("muscle -quiet -in " + output_file + " -out " + aln_file) + print("muscle 
-quiet -in " + fasta_path + " -out " + msa_path) else: cmd = "muscle -quiet -align " + fasta_path + " -output " + msa_path elif msaTool == "mafft-linsi": @@ -1186,7 +1186,7 @@ def main(): configFile = fdogPath + '/bin/pathconfig.yml' if not os.path.exists(configFile): sys.exit( - f'No pathconfig.yml found at {pathconfigFile}. Please run fdog.setup ' + f'No pathconfig.yml found at {configFile}. Please run fdog.setup ' + '(https://github.com/BIONF/fDOG/wiki/Installation#setup-fdog).') if pathFile: configFile = os.path.abspath(pathFile) @@ -1239,8 +1239,8 @@ def main(): ################## How to handle std output and std error ################## if mode == 'silent': - sys.stderr = f - sys.stdout = f + sys.stderr = False + sys.stdout = False else: pass From b0ae1b3562f554e42f3469322314d7750c7f1267 Mon Sep 17 00:00:00 2001 From: AnnamTran Date: Mon, 6 Oct 2025 10:09:21 +0200 Subject: [PATCH 226/229] removed hmmemit from dependencies.txt --- fdog/data/dependencies.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/fdog/data/dependencies.txt b/fdog/data/dependencies.txt index 28d26a4..4911591 100644 --- a/fdog/data/dependencies.txt +++ b/fdog/data/dependencies.txt @@ -5,5 +5,4 @@ mafft muscle augustus metaeuk -hmmemit tblastn From af6e4009c9ed2b0935130e67a1b7cfe6a8948a4e Mon Sep 17 00:00:00 2001 From: AnnamTran Date: Mon, 6 Oct 2025 10:27:07 +0200 Subject: [PATCH 227/229] removed tblasn from dependencies.txt; added check fdog.assembly to build workflow --- .github/workflows/github_build.yml | 2 ++ fdog/data/dependencies.txt | 1 - 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/github_build.yml b/.github/workflows/github_build.yml index 856dbfb..4499c96 100644 --- a/.github/workflows/github_build.yml +++ b/.github/workflows/github_build.yml @@ -49,6 +49,8 @@ jobs: fdog.showTaxa echo "TEST fdog.run" fdog.run --seqFile infile.fa --jobName test --refspec HUMAN@9606@qfo24_02 --fasOff --group mammalia + echo "TEST fdog.assembly" + fdog.assembly 
--gene test --refSpec HUMAN@9606@qfo24_02 --augustus --augustusRefSpec human --coregroupPath core_orthologs/ --out test_assembly --fasoff mkdir seeds path=$(fdog.setup -d ./ --getSourcepath); a="1 2 3"; for i in ${a[@]}; do cp $path/data/infile.fa seeds/$i.fa; done echo "TEST fdogs.run" diff --git a/fdog/data/dependencies.txt b/fdog/data/dependencies.txt index 4911591..6b36db4 100644 --- a/fdog/data/dependencies.txt +++ b/fdog/data/dependencies.txt @@ -5,4 +5,3 @@ mafft muscle augustus metaeuk -tblastn From fa9031d198a53c43ff838716e0e3dd6714cec915 Mon Sep 17 00:00:00 2001 From: AnnamTran Date: Mon, 6 Oct 2025 10:43:58 +0200 Subject: [PATCH 228/229] updated github_build --- .github/workflows/github_build.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/github_build.yml b/.github/workflows/github_build.yml index 4499c96..215b7f4 100644 --- a/.github/workflows/github_build.yml +++ b/.github/workflows/github_build.yml @@ -41,6 +41,11 @@ jobs: pwd pip install . path=$(fdog.setup -d ./ --getSourcepath); for i in $(less $path/data/dependencies.txt); do sudo apt-get install -y -qq $i; done + path=$(fdog.setup -d ./ --getSourcepath); ls $path/data/ + echo "#########################################" + pwd + ls + echo "#########################################" echo "TEST fdog.setup" fdog.setup -d /home/runner/work/fDOG/fDOG/dt --woFAS echo "TEST fdog.checkData" From 39075f641925cd666d6fe6df0d8263a726bf022e Mon Sep 17 00:00:00 2001 From: AnnamTran Date: Mon, 6 Oct 2025 11:00:39 +0200 Subject: [PATCH 229/229] updated setup.py and manifest.in --- MANIFEST.in | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/MANIFEST.in b/MANIFEST.in index a85e62a..54177f6 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1 +1 @@ -recursive-include fDOG * +recursive-include fdog/data * diff --git a/setup.py b/setup.py index ba8a988..53b242c 100644 --- a/setup.py +++ b/setup.py @@ -35,7 +35,7 @@ author_email="tran@bio.uni-frankfurt.de", 
url="https://github.com/BIONF/fDOG", packages=find_packages(), - package_data={'': ['*']}, + include_package_data=True, install_requires=[ 'biopython', 'tqdm',