diff --git a/src/scarap/callers.py b/src/scarap/callers.py index 572e168..8dd6d18 100644 --- a/src/scarap/callers.py +++ b/src/scarap/callers.py @@ -50,15 +50,20 @@ def run_orthofinder(faafins, dout, logfout, threads, engine = "blast"): def run_mafft(fin_seqs, fout_aln, threads = 1, options = [], retry=False): args = ["mafft"] + options + ["--thread", str(threads), fin_seqs] - with open(fout_aln, "w") as hout_aln: - result = subprocess.run(args, stdout = hout_aln, + + result = subprocess.run(args, + stdout = subprocess.PIPE, stderr = subprocess.PIPE) - if result.returncode == 0: return() - if not retry and result.stderr.splitlines()[-1][0:17] == b"Illegal character": - og = filename_from_path(fin_seqs) - logging.warning(f"added --amino and --anysymbol flags to mafft for {og}") - options = options + ["--amino", "--anysymbol"] - run_mafft(fin_seqs, fout_aln, threads, options, retry=True) + + with gzip.open(fout_aln, 'wb') as hout_aln: + hout_aln.write(result.stdout) + + if result.returncode == 0: return() + if not retry and result.stderr.splitlines()[-1][0:17] == b"Illegal character": + og = filename_from_path(fin_seqs) + logging.warning(f"added --amino and --anysymbol flags to mafft for {og}") + options = options + ["--amino", "--anysymbol"] + run_mafft(fin_seqs, fout_aln, threads, options, retry=True) def run_mafft_parallel(fins_aa_seqs, fouts_aa_seqs_aligned): with concurrent.futures.ProcessPoolExecutor() as executor: diff --git a/src/scarap/constants.py b/src/scarap/constants.py new file mode 100644 index 0000000..cdfd6d5 --- /dev/null +++ b/src/scarap/constants.py @@ -0,0 +1,16 @@ +# Description: Constants used in the scarap package + +# OUTPUT FOLDERS +TMP = 'tmp' +LOG = 'logs' +SFAMD = 'superfamilies' + +# OUTPUT FILES +PANF = 'pangenome.tsv' +PPANF = 'pseudopangenome.tsv' +GENEF = 'genes.tsv' + +# ALIGNMENT +ALN_OUTFMT = '.aln.gz' +REPSEQ_ALN = 'repseqs' + ALN_OUTFMT +STOCKHOLM_ALN = "alis.sto.gz" \ No newline at end of file diff --git a/src/scarap/helpers.py 
b/src/scarap/helpers.py index f011b69..e03d9cb 100644 --- a/src/scarap/helpers.py +++ b/src/scarap/helpers.py @@ -6,6 +6,7 @@ from scarap.readerswriters import * from scarap.computers import * from scarap.callers import * +from scarap.constants import * # used by the build and search modules def run_profilesearch(fins_faas, fins_alis, fout_hits, dout_tmp, threads): @@ -14,7 +15,7 @@ def run_profilesearch(fins_faas, fins_alis, fout_hits, dout_tmp, threads): dout_mmseqs = os.path.join(dout_tmp, "mmseqs2") dout_logs = os.path.join(dout_tmp, "mmseqs2_logs") dout_rubbish = os.path.join(dout_tmp, "rubbish") - fout_sto = os.path.join(dout_tmp, "alis.sto") + fout_sto = os.path.join(dout_tmp, STOCKHOLM_ALN) # create tmp subfolders for dir in [dout_mmseqs, dout_logs, dout_rubbish]: diff --git a/src/scarap/modules.py b/src/scarap/modules.py index 03aa032..2121ef2 100644 --- a/src/scarap/modules.py +++ b/src/scarap/modules.py @@ -5,7 +5,6 @@ import shutil from argparse import Namespace -from concurrent.futures import ProcessPoolExecutor from statistics import median, mean from random import sample @@ -15,6 +14,7 @@ from scarap.pan import * from scarap.readerswriters import * from scarap.utils import * +from scarap.constants import * def run_pan(args): if "species" in args and not args.species is None: @@ -24,7 +24,7 @@ def run_pan(args): def run_pan_nonhier(args): - pangenomefout = os.path.join(args.outfolder, "pangenome.tsv") + pangenomefout = os.path.join(args.outfolder, PANF) if os.path.isfile(pangenomefout): logging.info("existing pangenome detected - moving on") return() @@ -76,8 +76,8 @@ def run_pan_hier(args): pseudogenomesdio = os.path.join(args.outfolder, "pseudogenomes") pseudogenomesfio = os.path.join(args.outfolder, "pseudogenomes.tsv") pseudopandio = os.path.join(args.outfolder, "pseudopangenome") - pseudopanfio = os.path.join(args.outfolder, "pseudopangenome.tsv") - pangenomefout = os.path.join(args.outfolder, "pangenome.tsv") + pseudopanfio = 
os.path.join(args.outfolder, PPANF) + pangenomefout = os.path.join(args.outfolder, PANF) os.makedirs(speciespansdio, exist_ok = True) os.makedirs(pseudogenomesdio, exist_ok = True) os.makedirs(pseudopandio, exist_ok = True) @@ -104,7 +104,7 @@ def run_pan_hier(args): write_lines(faapaths_sub, faapathsfio) run_pan_nonhier(Namespace(faa_files = faapathsfio, outfolder = dout, **nonhier_args)) - shutil.move(os.path.join(dout, "pangenome.tsv"), speciespanfio) + shutil.move(os.path.join(dout, PANF), speciespanfio) shutil.rmtree(dout) logging.info("PHASE 2: constructing pseudogenome per species") @@ -114,7 +114,7 @@ def run_pan_hier(args): species = filename_from_path(speciespanfio) logging.info(f"constructing pseudogenome for {species}") speciespan = read_genes(speciespanfio) - tmpdio = os.path.join(args.outfolder, "temp") + tmpdio = os.path.join(args.outfolder, TMP) pseudogenomes[ix] = create_pseudogenome(speciespan, faapaths, tmpdio) pseudogenomes[ix]["species"] = species pseudogenomes = pd.concat(pseudogenomes) @@ -125,7 +125,7 @@ def run_pan_hier(args): gather_orthogroup_sequences(pseudogenomes, faapaths, pseudogenomesdio) run_pan_nonhier(Namespace(faa_files = pseudogenomesdio, outfolder = pseudopandio, **nonhier_args)) - shutil.move(os.path.join(pseudopandio, "pangenome.tsv"), pseudopanfio) + shutil.move(os.path.join(pseudopandio, PANF), pseudopanfio) shutil.rmtree(pseudogenomesdio) shutil.rmtree(pseudopandio) @@ -162,7 +162,7 @@ def run_build(args): dout_tmp = os.path.join(dout, "tmp") fout_cutoffs = os.path.join(dout, "cutoffs.csv") fout_hits = os.path.join(dout, "hits.tsv") - fout_genes = os.path.join(dout, "genes.tsv") + fout_genes = os.path.join(dout, GENEF) if os.path.isfile(fout_cutoffs): logging.info("existing database detected - moving on") @@ -192,7 +192,7 @@ def run_build(args): logging.info("aligning orthogroups") orthogroups = [os.path.splitext(f)[0] for f in os.listdir(dout_ogseqs)] fouts_ogseqs = make_paths(orthogroups, dout_ogseqs, ".fasta") - 
fouts_alis = make_paths(orthogroups, dout_alis, ".aln") + fouts_alis = make_paths(orthogroups, dout_alis, ALN_OUTFMT) run_mafft_parallel(fouts_ogseqs, fouts_alis) # run profile search (function does its own logging) @@ -244,7 +244,7 @@ def run_search(args): din_alis = os.path.join(din_db, "alignments") fin_cutoffs = os.path.join(din_db, "cutoffs.csv") fout_hits = os.path.join(dout, "hits.tsv") - fout_genes = os.path.join(dout, "genes.tsv") + fout_genes = os.path.join(dout, GENEF) if os.path.isfile(fout_genes): logging.info("existing search results detected - moving on") @@ -306,7 +306,7 @@ def run_filter(args): logging.info("filtering orthogroups") orthogroups = read_lines(args.orthogroups) pangenome = filter_groups(pangenome, orthogroups) - write_tsv(pangenome, os.path.join(args.outfolder, "pangenome.tsv")) + write_tsv(pangenome, os.path.join(args.outfolder, PANF)) def run_concat(args): @@ -630,9 +630,9 @@ def run_core(args): dout_seedcore = os.path.join(dout, "seedcore") fout_seedpaths = os.path.join(dout, "seedpaths.txt") fout_nonseedpaths = os.path.join(dout, "nonseedpaths.txt") - fout_seedpan = os.path.join(dout_seedpan, "pangenome.tsv") - fout_genes_core = os.path.join(dout_seedcore, "genes.tsv") - fout_genes = os.path.join(dout, "genes.tsv") + fout_seedpan = os.path.join(dout_seedpan, PANF) + fout_genes_core = os.path.join(dout_seedcore, GENEF) + fout_genes = os.path.join(dout, GENEF) # make output subfolders for dir in [dout_seedpan, dout_seedcore]: diff --git a/src/scarap/pan.py b/src/scarap/pan.py index d2a46d7..aa4a4bc 100644 --- a/src/scarap/pan.py +++ b/src/scarap/pan.py @@ -15,6 +15,7 @@ from scarap.readerswriters import * from scarap.computers import * from scarap.callers import * +from scarap.constants import * ## helpers - ficlin module (F) @@ -41,7 +42,7 @@ def update_seedmatrix(seedmatrix, sequences, dout_tmp, threads): # construct target and prefilter dbs makedirs_smart(f"{dout_tmp}") - for dir in ["sequenceDB", "prefDB", "logs", "tmp"]: + for 
dir in ["sequenceDB", "prefDB", "logs", TMP]: makedirs_smart(f"{dout_tmp}/{dir}") write_fasta(sequences, f"{dout_tmp}/seqs.fasta") run_mmseqs(["createdb", f"{dout_tmp}/seqs.fasta", @@ -481,9 +482,9 @@ def split_family_FH(pan, sequences, hclust, ficlin, min_reps, max_reps, reps = [s.id for s in repseqs] # logging.info(f"aligning {len(reps)} sequences") write_fasta(repseqs, f"{dio_tmp}/repseqs.fasta") - run_mafft(f"{dio_tmp}/repseqs.fasta", f"{dio_tmp}/repseqs.aln", + run_mafft(f"{dio_tmp}/repseqs.fasta", f"{dio_tmp}/{REPSEQ_ALN}", threads, ["--amino", "--anysymbol"]) - aln = read_fasta(f"{dio_tmp}/repseqs.aln") + aln = read_fasta(f"{dio_tmp}/{REPSEQ_ALN}") idmat = identity_matrix(aln) distmat = distmat_from_idmat(idmat) hclust = cluster.hierarchy.linkage(distmat, method = "average") @@ -553,9 +554,9 @@ def split_family_FT(pan, sequences, tree, ficlin, min_reps, max_reps, reps = [s.id for s in repseqs] # logging.info(f"aligning {len(reps)} sequences") write_fasta(repseqs, f"{dio_tmp}/repseqs.fasta") - run_mafft(f"{dio_tmp}/repseqs.fasta", f"{dio_tmp}/repseqs.aln", + run_mafft(f"{dio_tmp}/repseqs.fasta", f"{dio_tmp}/{REPSEQ_ALN}", threads, ["--amino"]) - run_iqtree(f"{dio_tmp}/repseqs.aln", f"{dio_tmp}/tree", threads, + run_iqtree(f"{dio_tmp}/{REPSEQ_ALN}", f"{dio_tmp}/tree", threads, ["-m", "LG"]) tree = Tree(f"{dio_tmp}/tree/tree.treefile") @@ -595,7 +596,7 @@ def split_family_P(pan, sequences, threads, dio_tmp): run_mafft(f"{dio_tmp}/seqs.fasta", f"{dio_tmp}/seqs.aln", threads) # create sequence database - for dir in ["sequenceDB", "tmp", "resultDB", "logs"]: + for dir in ["sequenceDB", TMP, "resultDB", "logs"]: makedirs_smart(f"{dio_tmp}/{dir}") run_mmseqs(["createdb", f"{dio_tmp}/seqs.fasta", f"{dio_tmp}/sequenceDB/db"], f"{dio_tmp}/logs/createdb.log") @@ -613,7 +614,7 @@ def split_family_P(pan, sequences, threads, dio_tmp): # update profiles for i in range(5): if report: print(f"iteration {i}") - for dir in ["msaDB", "profileDB", "searchDB", "tmp"]: + for dir in 
["msaDB", "profileDB", "searchDB", TMP]: makedirs_smart(f"{dio_tmp}/{dir}") open(f"{dio_tmp}/profiles.sto", "w").close() for profile, profilegenes in genes.groupby("profile"): @@ -965,16 +966,16 @@ def infer_superfamilies(faafins, dout, threads): f"{dout}/logs/profile2consensus.log") run_mmseqs(["search", f"{dout}/profileDB/db", f"{dout}/profileDB/db_consensus", f"{dout}/alignmentDB/db", - f"{dout}/tmp", "-c", "0.5", "--cov-mode", "1"], - f"{dout}/logs/search.log", + f"{dout}/{TMP}", "-c", "0.5", "--cov-mode", "1"], + f"{dout}/{LOG}/search.log", skip_if_exists = f"{dout}/alignmentDB/db.index", threads = threads) run_mmseqs(["clust", f"{dout}/profileDB/db", f"{dout}/alignmentDB/db", f"{dout}/clusterDB/db", "--cluster-mode", "2"], - f"{dout}/logs/clust.log", + f"{dout}/{LOG}/clust.log", skip_if_exists = f"{dout}/clusterDB/db.index", threads = threads) # create the pangenome file - logging.info("compiling pangenome file with superfamilies") + logging.info(f"compiling pangenome file with {SFAMD}") # why --full-header option? 
--> to avoid MMseqs2 extracting the # UniqueIdentifier part of sequences in UniProtKB format # (see https://www.uniprot.org/help/fasta-headers) @@ -983,7 +984,7 @@ def infer_superfamilies(faafins, dout, threads): f"{dout}/logs/createtsv_preclusters.log") run_mmseqs(["createtsv", f"{dout}/sequenceDB/db", f"{dout}/sequenceDB/db", f"{dout}/clusterDB/db", f"{dout}/clusters.tsv", "--full-header"], - f"{dout}/logs/createtsv_clusters.log") + f"{dout}/{LOG}/createtsv_clusters.log") preclustertable = pd.read_csv(f"{dout}/preclusters.tsv", sep = "\t", names = ["precluster", "gene"]) preclustertable = preclustertable.applymap(lambda x: x.split(" ")[0]) @@ -1001,7 +1002,7 @@ def infer_superfamilies(faafins, dout, threads): genes["orthogroup"] = [namedict[f] for f in genes["orthogroup"]] # write pangenome file - write_tsv(genes, f"{dout}/genes.tsv") + write_tsv(genes, f"{dout}/{GENEF}") def infer_pangenome(faafins, splitstrategy, min_reps, max_reps, max_align, dout, threads): @@ -1037,61 +1038,61 @@ def infer_pangenome(faafins, splitstrategy, min_reps, max_reps, max_align, dout, padded_counts(len(pangenome.index))] logging.info("writing pangenome file") - write_tsv(pangenome, f"{dout}/pangenome.tsv") + write_tsv(pangenome, f"{dout}/{PANF}") return() - logging.info("STAGE 1: creation of superfamilies") - os.makedirs(f"{dout}/superfamilies", exist_ok = True) - infer_superfamilies(faafins, f"{dout}/superfamilies", threads) - genes_superfams = pd.read_csv(f"{dout}/superfamilies/genes.tsv", + logging.info(f"STAGE 1: creation of {SFAMD}") + os.makedirs(f"{dout}/{SFAMD}", exist_ok = True) + infer_superfamilies(faafins, f"{dout}/{SFAMD}", threads) + genes_superfams = pd.read_csv(f"{dout}/{SFAMD}/{GENEF}", sep = "\t", names = ["gene", "orthogroup"]) pangenome = pd.merge(genes, genes_superfams, on = "gene") if splitstrategy == "S": logging.info("writing pangenome file") - write_tsv(pangenome, f"{dout}/pangenome.tsv") + write_tsv(pangenome, f"{dout}/{PANF}") logging.info("removing 
temporary folders") - shutil.rmtree(f"{dout}/superfamilies") + shutil.rmtree(f"{dout}/{SFAMD}") return() - logging.info("STAGE 2: splitting of superfamilies") + logging.info(f"STAGE 2: splitting of {SFAMD}") - logging.info("gathering sequences of superfamilies") - os.makedirs(f"{dout}/superfamilies/superfamilies", exist_ok = True) - dio_fastas = f"{dout}/superfamilies/fastas" + logging.info(f"gathering sequences of {SFAMD}") + os.makedirs(f"{dout}/{SFAMD}/{SFAMD}", exist_ok = True) + dio_fastas = f"{dout}/{SFAMD}/fastas" gather_orthogroup_sequences(pangenome, faafins, dio_fastas) - logging.info("counting splitable superfamilies") - os.makedirs(f"{dout}/tmp", exist_ok = True) + logging.info(f"counting splittable {SFAMD}") + os.makedirs(f"{dout}/{TMP}", exist_ok = True) pangenome = pangenome.groupby("orthogroup") orthogroups = pangenome.aggregate({"genome": split_possible}) - splitable = orthogroups[orthogroups.genome].index.tolist() - pangenome_splitable = [pan for name, pan in pangenome if name in splitable] - pangenome_splitable.sort(key = lambda pan: len(pan.index), reverse = True) - n = len(pangenome_splitable) - logging.info(f"found {n} splitable superfamilies") + splittable = orthogroups[orthogroups.genome].index.tolist() + pangenome_splittable = [pan for name, pan in pangenome if name in splittable] + pangenome_splittable.sort(key = lambda pan: len(pan.index), reverse = True) + n = len(pangenome_splittable) + logging.info(f"found {n} splittable {SFAMD}") if n == 0: logging.info("writing pangenome file") pangenome = pangenome.obj # to "ungroup" - write_tsv(pangenome, f"{dout}/pangenome.tsv") + write_tsv(pangenome, f"{dout}/{PANF}") logging.info("removing temporary folders") - shutil.rmtree(f"{dout}/superfamilies") - shutil.rmtree(f"{dout}/tmp") + shutil.rmtree(f"{dout}/{SFAMD}") + shutil.rmtree(f"{dout}/{TMP}") return() - logging.info("splitting superfamilies") + logging.info(f"splitting {SFAMD}") with ProcessPoolExecutor(max_workers = threads // tpp) as 
executor: - pangenome_splitable = executor.map(split_superfamily, - pangenome_splitable, [splitstrategy] * n, [dio_fastas] * n, + pangenome_splittable = executor.map(split_superfamily, + pangenome_splittable, [splitstrategy] * n, [dio_fastas] * n, [min_reps] * n, [max_reps] * n, [max_align] * n, [tpp] * n, - [f"{dout}/tmp"] * n) + [f"{dout}/{TMP}"] * n) print("") - pangenome = pd.concat(list(pangenome_splitable) + - [pan for name, pan in pangenome if not name in splitable]) + pangenome = pd.concat(list(pangenome_splittable) + + [pan for name, pan in pangenome if not name in splittable]) logging.info("assigning names to the gene families") nametable = pd.DataFrame({"old": pangenome["orthogroup"].unique()}) @@ -1102,8 +1103,8 @@ def infer_pangenome(faafins, splitstrategy, min_reps, max_reps, max_align, dout, pangenome["orthogroup"] = [namedict[f] for f in pangenome["orthogroup"]] logging.info("writing pangenome file") - write_tsv(pangenome, f"{dout}/pangenome.tsv") + write_tsv(pangenome, f"{dout}/{PANF}") logging.info("removing temporary folders") - shutil.rmtree(f"{dout}/superfamilies") - shutil.rmtree(f"{dout}/tmp") + shutil.rmtree(f"{dout}/{SFAMD}") + shutil.rmtree(f"{dout}/{TMP}") diff --git a/src/scarap/readerswriters.py b/src/scarap/readerswriters.py index 90a6815..905403a 100644 --- a/src/scarap/readerswriters.py +++ b/src/scarap/readerswriters.py @@ -115,14 +115,14 @@ def extract_genes(fins_genomes, threads = 1): return(genes) def fastas2stockholm(fins_alis, fout_sto): - with open(fout_sto, "a") as hout_sto: + with gzip.open(fout_sto, "ab") as hout_sto: for fin_ali in fins_alis: orthogroup = os.path.splitext(os.path.basename(fin_ali))[0] - with open(fin_ali) as hin_ali: + with gzip.open(fin_ali, 'rt') as hin_ali: ali = AlignIO.read(hin_ali, "fasta") with StringIO() as vf: AlignIO.write(ali, vf, "stockholm") sto = vf.getvalue().splitlines() sto.insert(1, "#=GF ID " + orthogroup) for line in sto: - hout_sto.write(line + "\n") + hout_sto.write((line + 
"\n").encode())