Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 13 additions & 8 deletions src/scarap/callers.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,15 +50,20 @@ def run_orthofinder(faafins, dout, logfout, threads, engine = "blast"):

def run_mafft(fin_seqs, fout_aln, threads = 1, options = [], retry=False):
args = ["mafft"] + options + ["--thread", str(threads), fin_seqs]
with open(fout_aln, "w") as hout_aln:
result = subprocess.run(args, stdout = hout_aln,

result = subprocess.run(args,
stdout = subprocess.PIPE,
stderr = subprocess.PIPE)
if result.returncode == 0: return()
if not retry and result.stderr.splitlines()[-1][0:17] == b"Illegal character":
og = filename_from_path(fin_seqs)
logging.warning(f"added --amino and --anysymbol flags to mafft for {og}")
options = options + ["--amino", "--anysymbol"]
run_mafft(fin_seqs, fout_aln, threads, options, retry=True)

with gzip.open(fout_aln, 'wb') as hout_aln:
hout_aln.write(result.stdout)

if result.returncode == 0: return()
if not retry and result.stderr.splitlines()[-1][0:17] == b"Illegal character":
og = filename_from_path(fin_seqs)
logging.warning(f"added --amino and --anysymbol flags to mafft for {og}")
options = options + ["--amino", "--anysymbol"]
run_mafft(fin_seqs, fout_aln, threads, options, retry=True)

def run_mafft_parallel(fins_aa_seqs, fouts_aa_seqs_aligned):
with concurrent.futures.ProcessPoolExecutor() as executor:
Expand Down
16 changes: 16 additions & 0 deletions src/scarap/constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Description: Constants used in the scarap package

# OUTPUT FOLDERS
TMP = 'tmp'                    # scratch folder for intermediate files
LOG = 'logs'                   # folder for external-tool log files
SFAMD = 'superfamilies'        # folder for superfamily-stage output

# OUTPUT FILES
PANF = 'pangenome.tsv'
# NOTE(review): fixed typo — was 'psuedopangenome.tsv', which silently
# renamed the output file relative to the pre-existing hard-coded
# "pseudopangenome.tsv" path this constant replaces.
PPANF = 'pseudopangenome.tsv'
GENEF = 'genes.tsv'

# ALIGNMENT
ALN_OUTFMT = '.aln.gz'         # gzip-compressed FASTA alignment extension
REPSEQ_ALN = 'repseqs' + ALN_OUTFMT
STOCKHOLM_ALN = "alis.sto.gz"  # gzip-compressed Stockholm alignment bundle
3 changes: 2 additions & 1 deletion src/scarap/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from scarap.readerswriters import *
from scarap.computers import *
from scarap.callers import *
from scarap.constants import *

# used by the build and search modules
def run_profilesearch(fins_faas, fins_alis, fout_hits, dout_tmp, threads):
Expand All @@ -14,7 +15,7 @@ def run_profilesearch(fins_faas, fins_alis, fout_hits, dout_tmp, threads):
dout_mmseqs = os.path.join(dout_tmp, "mmseqs2")
dout_logs = os.path.join(dout_tmp, "mmseqs2_logs")
dout_rubbish = os.path.join(dout_tmp, "rubbish")
fout_sto = os.path.join(dout_tmp, "alis.sto")
fout_sto = os.path.join(dout_tmp, STOCKHOLM_ALN)

# create tmp subfolders
for dir in [dout_mmseqs, dout_logs, dout_rubbish]:
Expand Down
28 changes: 14 additions & 14 deletions src/scarap/modules.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
import shutil

from argparse import Namespace
from concurrent.futures import ProcessPoolExecutor
from statistics import median, mean
from random import sample

Expand All @@ -15,6 +14,7 @@
from scarap.pan import *
from scarap.readerswriters import *
from scarap.utils import *
from scarap.constants import *

def run_pan(args):
if "species" in args and not args.species is None:
Expand All @@ -24,7 +24,7 @@ def run_pan(args):

def run_pan_nonhier(args):

pangenomefout = os.path.join(args.outfolder, "pangenome.tsv")
pangenomefout = os.path.join(args.outfolder, PANF)
if os.path.isfile(pangenomefout):
logging.info("existing pangenome detected - moving on")
return()
Expand Down Expand Up @@ -76,8 +76,8 @@ def run_pan_hier(args):
pseudogenomesdio = os.path.join(args.outfolder, "pseudogenomes")
pseudogenomesfio = os.path.join(args.outfolder, "pseudogenomes.tsv")
pseudopandio = os.path.join(args.outfolder, "pseudopangenome")
pseudopanfio = os.path.join(args.outfolder, "pseudopangenome.tsv")
pangenomefout = os.path.join(args.outfolder, "pangenome.tsv")
pseudopanfio = os.path.join(args.outfolder, PPANF)
pangenomefout = os.path.join(args.outfolder, PANF)
os.makedirs(speciespansdio, exist_ok = True)
os.makedirs(pseudogenomesdio, exist_ok = True)
os.makedirs(pseudopandio, exist_ok = True)
Expand All @@ -104,7 +104,7 @@ def run_pan_hier(args):
write_lines(faapaths_sub, faapathsfio)
run_pan_nonhier(Namespace(faa_files = faapathsfio, outfolder = dout,
**nonhier_args))
shutil.move(os.path.join(dout, "pangenome.tsv"), speciespanfio)
shutil.move(os.path.join(dout, PANF), speciespanfio)
shutil.rmtree(dout)

logging.info("PHASE 2: constructing pseudogenome per species")
Expand All @@ -114,7 +114,7 @@ def run_pan_hier(args):
species = filename_from_path(speciespanfio)
logging.info(f"constructing pseudogenome for {species}")
speciespan = read_genes(speciespanfio)
tmpdio = os.path.join(args.outfolder, "temp")
tmpdio = os.path.join(args.outfolder, TMP)
pseudogenomes[ix] = create_pseudogenome(speciespan, faapaths, tmpdio)
pseudogenomes[ix]["species"] = species
pseudogenomes = pd.concat(pseudogenomes)
Expand All @@ -125,7 +125,7 @@ def run_pan_hier(args):
gather_orthogroup_sequences(pseudogenomes, faapaths, pseudogenomesdio)
run_pan_nonhier(Namespace(faa_files = pseudogenomesdio,
outfolder = pseudopandio, **nonhier_args))
shutil.move(os.path.join(pseudopandio, "pangenome.tsv"), pseudopanfio)
shutil.move(os.path.join(pseudopandio, PANF), pseudopanfio)
shutil.rmtree(pseudogenomesdio)
shutil.rmtree(pseudopandio)

Expand Down Expand Up @@ -162,7 +162,7 @@ def run_build(args):
dout_tmp = os.path.join(dout, "tmp")
fout_cutoffs = os.path.join(dout, "cutoffs.csv")
fout_hits = os.path.join(dout, "hits.tsv")
fout_genes = os.path.join(dout, "genes.tsv")
fout_genes = os.path.join(dout, GENEF)

if os.path.isfile(fout_cutoffs):
logging.info("existing database detected - moving on")
Expand Down Expand Up @@ -192,7 +192,7 @@ def run_build(args):
logging.info("aligning orthogroups")
orthogroups = [os.path.splitext(f)[0] for f in os.listdir(dout_ogseqs)]
fouts_ogseqs = make_paths(orthogroups, dout_ogseqs, ".fasta")
fouts_alis = make_paths(orthogroups, dout_alis, ".aln")
fouts_alis = make_paths(orthogroups, dout_alis, ALN_OUTFMT)
run_mafft_parallel(fouts_ogseqs, fouts_alis)

# run profile search (function does its own logging)
Expand Down Expand Up @@ -244,7 +244,7 @@ def run_search(args):
din_alis = os.path.join(din_db, "alignments")
fin_cutoffs = os.path.join(din_db, "cutoffs.csv")
fout_hits = os.path.join(dout, "hits.tsv")
fout_genes = os.path.join(dout, "genes.tsv")
fout_genes = os.path.join(dout, GENEF)

if os.path.isfile(fout_genes):
logging.info("existing search results detected - moving on")
Expand Down Expand Up @@ -306,7 +306,7 @@ def run_filter(args):
logging.info("filtering orthogroups")
orthogroups = read_lines(args.orthogroups)
pangenome = filter_groups(pangenome, orthogroups)
write_tsv(pangenome, os.path.join(args.outfolder, "pangenome.tsv"))
write_tsv(pangenome, os.path.join(args.outfolder, PANF))

def run_concat(args):

Expand Down Expand Up @@ -630,9 +630,9 @@ def run_core(args):
dout_seedcore = os.path.join(dout, "seedcore")
fout_seedpaths = os.path.join(dout, "seedpaths.txt")
fout_nonseedpaths = os.path.join(dout, "nonseedpaths.txt")
fout_seedpan = os.path.join(dout_seedpan, "pangenome.tsv")
fout_genes_core = os.path.join(dout_seedcore, "genes.tsv")
fout_genes = os.path.join(dout, "genes.tsv")
fout_seedpan = os.path.join(dout_seedpan, PANF)
fout_genes_core = os.path.join(dout_seedcore, GENEF)
fout_genes = os.path.join(dout, GENEF)

# make output subfolders
for dir in [dout_seedpan, dout_seedcore]:
Expand Down
87 changes: 44 additions & 43 deletions src/scarap/pan.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from scarap.readerswriters import *
from scarap.computers import *
from scarap.callers import *
from scarap.constants import *

## helpers - ficlin module (F)

Expand All @@ -41,7 +42,7 @@ def update_seedmatrix(seedmatrix, sequences, dout_tmp, threads):

# construct target and prefilter dbs
makedirs_smart(f"{dout_tmp}")
for dir in ["sequenceDB", "prefDB", "logs", "tmp"]:
for dir in ["sequenceDB", "prefDB", "logs", TMP]:
makedirs_smart(f"{dout_tmp}/{dir}")
write_fasta(sequences, f"{dout_tmp}/seqs.fasta")
run_mmseqs(["createdb", f"{dout_tmp}/seqs.fasta",
Expand Down Expand Up @@ -481,9 +482,9 @@ def split_family_FH(pan, sequences, hclust, ficlin, min_reps, max_reps,
reps = [s.id for s in repseqs]
# logging.info(f"aligning {len(reps)} sequences")
write_fasta(repseqs, f"{dio_tmp}/repseqs.fasta")
run_mafft(f"{dio_tmp}/repseqs.fasta", f"{dio_tmp}/repseqs.aln",
run_mafft(f"{dio_tmp}/repseqs.fasta", f"{dio_tmp}/{REPSEQ_ALN}",
threads, ["--amino", "--anysymbol"])
aln = read_fasta(f"{dio_tmp}/repseqs.aln")
aln = read_fasta(f"{dio_tmp}/{REPSEQ_ALN}")
idmat = identity_matrix(aln)
distmat = distmat_from_idmat(idmat)
hclust = cluster.hierarchy.linkage(distmat, method = "average")
Expand Down Expand Up @@ -553,9 +554,9 @@ def split_family_FT(pan, sequences, tree, ficlin, min_reps, max_reps,
reps = [s.id for s in repseqs]
# logging.info(f"aligning {len(reps)} sequences")
write_fasta(repseqs, f"{dio_tmp}/repseqs.fasta")
run_mafft(f"{dio_tmp}/repseqs.fasta", f"{dio_tmp}/repseqs.aln",
run_mafft(f"{dio_tmp}/repseqs.fasta", f"{dio_tmp}/{REPSEQ_ALN}",
threads, ["--amino"])
run_iqtree(f"{dio_tmp}/repseqs.aln", f"{dio_tmp}/tree", threads,
run_iqtree(f"{dio_tmp}/{REPSEQ_ALN}", f"{dio_tmp}/tree", threads,
["-m", "LG"])
tree = Tree(f"{dio_tmp}/tree/tree.treefile")

Expand Down Expand Up @@ -595,7 +596,7 @@ def split_family_P(pan, sequences, threads, dio_tmp):
run_mafft(f"{dio_tmp}/seqs.fasta", f"{dio_tmp}/seqs.aln", threads)

# create sequence database
for dir in ["sequenceDB", "tmp", "resultDB", "logs"]:
for dir in ["sequenceDB", TMP, "resultDB", "logs"]:
makedirs_smart(f"{dio_tmp}/{dir}")
run_mmseqs(["createdb", f"{dio_tmp}/seqs.fasta",
f"{dio_tmp}/sequenceDB/db"], f"{dio_tmp}/logs/createdb.log")
Expand All @@ -613,7 +614,7 @@ def split_family_P(pan, sequences, threads, dio_tmp):
# update profiles
for i in range(5):
if report: print(f"iteration {i}")
for dir in ["msaDB", "profileDB", "searchDB", "tmp"]:
for dir in ["msaDB", "profileDB", "searchDB", TMP]:
makedirs_smart(f"{dio_tmp}/{dir}")
open(f"{dio_tmp}/profiles.sto", "w").close()
for profile, profilegenes in genes.groupby("profile"):
Expand Down Expand Up @@ -965,16 +966,16 @@ def infer_superfamilies(faafins, dout, threads):
f"{dout}/logs/profile2consensus.log")
run_mmseqs(["search", f"{dout}/profileDB/db",
f"{dout}/profileDB/db_consensus", f"{dout}/alignmentDB/db",
f"{dout}/tmp", "-c", "0.5", "--cov-mode", "1"],
f"{dout}/logs/search.log",
f"{dout}/{TMP}", "-c", "0.5", "--cov-mode", "1"],
f"{dout}/{LOG}/search.log",
skip_if_exists = f"{dout}/alignmentDB/db.index", threads = threads)
run_mmseqs(["clust", f"{dout}/profileDB/db", f"{dout}/alignmentDB/db",
f"{dout}/clusterDB/db", "--cluster-mode", "2"],
f"{dout}/logs/clust.log",
f"{dout}/{LOG}/clust.log",
skip_if_exists = f"{dout}/clusterDB/db.index", threads = threads)

# create the pangenome file
logging.info("compiling pangenome file with superfamilies")
logging.info(f"compiling pangenome file with {SFAMD}")
# why --full-header option? --> to avoid MMseqs2 extracting the
# UniqueIdentifier part of sequences in UniProtKB format
# (see https://www.uniprot.org/help/fasta-headers)
Expand All @@ -983,7 +984,7 @@ def infer_superfamilies(faafins, dout, threads):
f"{dout}/logs/createtsv_preclusters.log")
run_mmseqs(["createtsv", f"{dout}/sequenceDB/db", f"{dout}/sequenceDB/db",
f"{dout}/clusterDB/db", f"{dout}/clusters.tsv", "--full-header"],
f"{dout}/logs/createtsv_clusters.log")
f"{dout}/{LOG}/createtsv_clusters.log")
preclustertable = pd.read_csv(f"{dout}/preclusters.tsv", sep = "\t",
names = ["precluster", "gene"])
preclustertable = preclustertable.applymap(lambda x: x.split(" ")[0])
Expand All @@ -1001,7 +1002,7 @@ def infer_superfamilies(faafins, dout, threads):
genes["orthogroup"] = [namedict[f] for f in genes["orthogroup"]]

# write pangenome file
write_tsv(genes, f"{dout}/genes.tsv")
write_tsv(genes, f"{dout}/{GENEF}")

def infer_pangenome(faafins, splitstrategy, min_reps, max_reps, max_align, dout,
threads):
Expand Down Expand Up @@ -1037,61 +1038,61 @@ def infer_pangenome(faafins, splitstrategy, min_reps, max_reps, max_align, dout,
padded_counts(len(pangenome.index))]

logging.info("writing pangenome file")
write_tsv(pangenome, f"{dout}/pangenome.tsv")
write_tsv(pangenome, f"{dout}/{PANF}")

return()

logging.info("STAGE 1: creation of superfamilies")
os.makedirs(f"{dout}/superfamilies", exist_ok = True)
infer_superfamilies(faafins, f"{dout}/superfamilies", threads)
genes_superfams = pd.read_csv(f"{dout}/superfamilies/genes.tsv",
logging.info(f"STAGE 1: creation of {SFAMD}")
os.makedirs(f"{dout}/{SFAMD}", exist_ok = True)
infer_superfamilies(faafins, f"{dout}/{SFAMD}", threads)
genes_superfams = pd.read_csv(f"{dout}/{SFAMD}/{GENEF}",
sep = "\t", names = ["gene", "orthogroup"])
pangenome = pd.merge(genes, genes_superfams, on = "gene")

if splitstrategy == "S":

logging.info("writing pangenome file")
write_tsv(pangenome, f"{dout}/pangenome.tsv")
write_tsv(pangenome, f"{dout}/{PANF}")
logging.info("removing temporary folders")
shutil.rmtree(f"{dout}/superfamilies")
shutil.rmtree(f"{dout}/{SFAMD}")
return()

logging.info("STAGE 2: splitting of superfamilies")
logging.info(f"STAGE 2: splitting of {SFAMD}")

logging.info("gathering sequences of superfamilies")
os.makedirs(f"{dout}/superfamilies/superfamilies", exist_ok = True)
dio_fastas = f"{dout}/superfamilies/fastas"
logging.info(f"gathering sequences of {SFAMD}")
os.makedirs(f"{dout}/{SFAMD}/{SFAMD}", exist_ok = True)
dio_fastas = f"{dout}/{SFAMD}/fastas"
gather_orthogroup_sequences(pangenome, faafins, dio_fastas)

logging.info("counting splitable superfamilies")
os.makedirs(f"{dout}/tmp", exist_ok = True)
logging.info(f"counting splittable {SFAMD}")
os.makedirs(f"{dout}/{TMP}", exist_ok = True)
pangenome = pangenome.groupby("orthogroup")
orthogroups = pangenome.aggregate({"genome": split_possible})
splitable = orthogroups[orthogroups.genome].index.tolist()
pangenome_splitable = [pan for name, pan in pangenome if name in splitable]
pangenome_splitable.sort(key = lambda pan: len(pan.index), reverse = True)
n = len(pangenome_splitable)
logging.info(f"found {n} splitable superfamilies")
splittable = orthogroups[orthogroups.genome].index.tolist()
pangenome_splittable = [pan for name, pan in pangenome if name in splittable]
pangenome_splittable.sort(key = lambda pan: len(pan.index), reverse = True)
n = len(pangenome_splittable)
logging.info(f"found {n} splittable {SFAMD}")

if n == 0:

logging.info("writing pangenome file")
pangenome = pangenome.obj # to "ungroup"
write_tsv(pangenome, f"{dout}/pangenome.tsv")
write_tsv(pangenome, f"{dout}/{PANF}")
logging.info("removing temporary folders")
shutil.rmtree(f"{dout}/superfamilies")
shutil.rmtree(f"{dout}/tmp")
shutil.rmtree(f"{dout}/{SFAMD}")
shutil.rmtree(f"{dout}/{TMP}")
return()

logging.info("splitting superfamilies")
logging.info(f"splitting {SFAMD}")
with ProcessPoolExecutor(max_workers = threads // tpp) as executor:
pangenome_splitable = executor.map(split_superfamily,
pangenome_splitable, [splitstrategy] * n, [dio_fastas] * n,
pangenome_splittable = executor.map(split_superfamily,
pangenome_splittable, [splitstrategy] * n, [dio_fastas] * n,
[min_reps] * n, [max_reps] * n, [max_align] * n, [tpp] * n,
[f"{dout}/tmp"] * n)
[f"{dout}/{TMP}"] * n)
print("")
pangenome = pd.concat(list(pangenome_splitable) +
[pan for name, pan in pangenome if not name in splitable])
pangenome = pd.concat(list(pangenome_splittable) +
[pan for name, pan in pangenome if not name in splittable])

logging.info("assigning names to the gene families")
nametable = pd.DataFrame({"old": pangenome["orthogroup"].unique()})
Expand All @@ -1102,8 +1103,8 @@ def infer_pangenome(faafins, splitstrategy, min_reps, max_reps, max_align, dout,
pangenome["orthogroup"] = [namedict[f] for f in pangenome["orthogroup"]]

logging.info("writing pangenome file")
write_tsv(pangenome, f"{dout}/pangenome.tsv")
write_tsv(pangenome, f"{dout}/{PANF}")

logging.info("removing temporary folders")
shutil.rmtree(f"{dout}/superfamilies")
shutil.rmtree(f"{dout}/tmp")
shutil.rmtree(f"{dout}/{SFAMD}")
shutil.rmtree(f"{dout}/{TMP}")
6 changes: 3 additions & 3 deletions src/scarap/readerswriters.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,14 +115,14 @@ def extract_genes(fins_genomes, threads = 1):
return(genes)

def fastas2stockholm(fins_alis, fout_sto):
with open(fout_sto, "a") as hout_sto:
with gzip.open(fout_sto, "ab") as hout_sto:
for fin_ali in fins_alis:
orthogroup = os.path.splitext(os.path.basename(fin_ali))[0]
with open(fin_ali) as hin_ali:
with gzip.open(fin_ali, 'rt') as hin_ali:
ali = AlignIO.read(hin_ali, "fasta")
with StringIO() as vf:
AlignIO.write(ali, vf, "stockholm")
sto = vf.getvalue().splitlines()
sto.insert(1, "#=GF ID " + orthogroup)
for line in sto:
hout_sto.write(line + "\n")
hout_sto.write((line + "\n").encode())