Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,10 @@ jobs:
strategy:
matrix:
include:
- python-version: "3.8"
- python-version: "3.10"
os: ubuntu-22.04
env-file: "test/environment_min.yml"
- python-version: "3.12"
- python-version: "3.14"
os: ubuntu-24.04
env-file: "test/environment_max.yml"

Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ You can also install SCARAP manually by cloning it and installing the following
* [Python3](https://www.python.org/) version >= 3.10, < 3.15
* Python packages (see pyproject.toml file for versions):
* [biopython](https://biopython.org/)
* [ete3](http://etetoolkit.org/)
* [ete4](http://etetoolkit.org/)
* [numpy](https://numpy.org/)
* [scipy](https://www.scipy.org/)
* [pandas](https://pandas.pydata.org/)
Expand Down
12 changes: 6 additions & 6 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,18 +14,18 @@ maintainers = [
]
description = "A toolkit for prokaryotic comparative genomics"
readme = "README.md"
requires-python = ">=3.8,<3.13"
requires-python = ">=3.10,<3.15"
classifiers = [
"Programming Language :: Python :: 3",
"License :: OSI Approved :: GNU General Public License v3 (GPLv3)",
"Operating System :: OS Independent"
]
dependencies = [
"biopython>=1.79,<2",
"ete3>=3.1,<4",
"numpy>=1.18,<3",
"scipy>=1.4,<2",
"pandas>=1.3,<4"
"biopython>=1.85,<2",
"ete4>=4.1",
"numpy>=2.02,<3",
"scipy>=1.13.1,<2",
"pandas>=2.3.1,<4"
]

[project.urls]
Expand Down
2 changes: 2 additions & 0 deletions src/scarap/module_wrappers.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,8 @@ def run_pan_withchecks(args):
check_infile(args.species)

logging.info("checking dependencies")
if args.method in ["T-nl","FT"]:
check_tool("iqtree")
if args.method in ["O-B", "O-D"]:
check_tool("orthofinder")
elif args.method == "S":
Expand Down
36 changes: 20 additions & 16 deletions src/scarap/pan.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

from Bio import AlignIO, Align
from copy import copy
from ete3 import Tree
from ete4 import Tree
from concurrent.futures import ProcessPoolExecutor
from scipy import cluster

Expand Down Expand Up @@ -105,8 +105,9 @@ def update_seedmatrix(seedmatrix, sequences, dout_tmp, threads):

# give warning if some sequences don't align to their cluster seed
ids_to_seed = np.amax(seedmatrix, 1)
if np.any(ids_to_seed == 0):
logging.warning("ficlin: one or more sequences do not align to any "
seeds_not_aligned = np.count_nonzero(ids_to_seed == 0)
if seeds_not_aligned:
logging.warning(f"ficlin: {seeds_not_aligned} sequences do not align to any "
"seed")

# remove temporary output folder
Expand Down Expand Up @@ -162,24 +163,26 @@ def split_pan(pan, tree):
Args:
pan (DataFrame): A gene table with at least the columns reprf and
orthogroup.
tree: An ete3 tree (= the root node of a tree)
tree: An ete4 tree (= the root node of a tree)

Returns:
[pan1, pan2, tree1, tree2]
"""

# midpoint root the tree
midoutgr = tree.get_midpoint_outgroup()
if tree.root.dist:
tree.root.dist = None

if midoutgr != tree:
tree.set_outgroup(midoutgr)

# split tree at root
tree1 = tree.children[0].copy()
tree2 = tree.children[1].copy()

# split pan
reps_subfam1 = tree1.get_leaf_names()
reps_subfam2 = tree2.get_leaf_names()
reps_subfam1 = tree1.leaf_names()
reps_subfam2 = tree2.leaf_names()
pan1 = pan[pan["rep"].isin(reps_subfam1)].copy()
pan2 = pan[pan["rep"].isin(reps_subfam2)].copy()

Expand All @@ -194,7 +197,7 @@ def lowest_cn_roots(tree, pan):
"""Determine the set of lowest copy-number roots.

Args:
tree: ete3 tree object where the leaf names correspond to the values of
tree: ete4 tree object where the leaf names correspond to the values of
the reprf column in pan.
pan (DataFrame): Table with at least the columns reprf and genome.

Expand All @@ -213,7 +216,7 @@ def lowest_cn_roots(tree, pan):
min_av_cn = 100000000

# loop over all nodes except the root
for node in tree.iter_descendants():
for node in tree.descendants():
# initialize empty genome lists for partition 1 and 2
genomes1 = []
genomes2 = []
Expand Down Expand Up @@ -299,7 +302,7 @@ def correct_root(root, tree, pan):
min_av_cn = 100000000

# loop over all nodes except the root
for node in tree.iter_descendants():
for node in tree.descendants():
genomes1, genomes2 = partition_genomes(reprfs_genomes, node)
overlap = set(genomes1) & set(genomes2) # intersection
# if genomes that overlap in the midpoint bipartition do not all
Expand Down Expand Up @@ -429,11 +432,12 @@ def split_family_T_nl(pan, sequences, threads, dio_tmp):
run_mafft(f"{dio_tmp}/seqs.fasta", f"{dio_tmp}/seqs.aln", threads)
run_iqtree(f"{dio_tmp}/seqs.aln", f"{dio_tmp}/tree", threads,
["-m", "LG+F+G4"])
tree = Tree(f"{dio_tmp}/tree/tree.treefile")
with open(f"{dio_tmp}/tree/tree.treefile", "r") as ftree:
tree = Tree(ftree)
midoutgr = tree.get_midpoint_outgroup()
genes_subfam1 = midoutgr.get_leaf_names()
genes_subfam1 = midoutgr.leaf_names()
midoutgr.detach()
genes_subfam2 = tree.get_leaf_names()
genes_subfam2 = tree.leaf_names()
pan1 = pan.loc[genes_subfam1].copy()
pan2 = pan.loc[genes_subfam2].copy()
family = pan.orthogroup.tolist()[0]
Expand Down Expand Up @@ -558,8 +562,8 @@ def split_family_FT(pan, sequences, tree, ficlin, min_reps, max_reps,
threads, ["--amino"])
run_iqtree(f"{dio_tmp}/repseqs.aln", f"{dio_tmp}/tree", threads,
["-m", "LG"])
tree = Tree(f"{dio_tmp}/tree/tree.treefile")

with open(f"{dio_tmp}/tree/tree.treefile", "r") as ftree:
tree = Tree(ftree)
# split pan based on midpoint root
pan1, pan2, tree1, tree2 = split_pan(pan, tree)
sequences1 = [s for s in sequences if s.id in pan1.index]
Expand Down Expand Up @@ -792,7 +796,7 @@ def split_family_recursive_FT(pan, sequences, tree, ficlin, min_reps,
columns gene, genome and orthogroup.
sequences (list): A list with one SeqRecord object per row in pan, in
the same order.
tree: An ete3 tree object.
tree: An ete4 tree object.
ficlin (bool): Should ficlin be used to pick representatives?
min_reps (int): The minimum number of representatives to use.
max_reps (int): The maximum number of representatives to use.
Expand Down
4 changes: 2 additions & 2 deletions test/environment_max.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,9 @@ channels:
dependencies:
- mafft=7.526
- bioconda::mmseqs2=18.8cc5c
- python=3.12 # maximum allowed by ete v3
- python=3.14
- biopython=1.86
- ete3=3.1.3 # ete v4 not compatible
- conda-forge::ete4=4.3.0
- numpy=2.4.2
- scipy=1.17.0
- pandas=3.0.1
Expand Down
16 changes: 8 additions & 8 deletions test/environment_min.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,14 @@ channels:
- conda-forge
- nodefaults
dependencies:
- bioconda::mafft=7.407
- bioconda::mmseqs2=11.e1a1c
- python=3.8 # 3.7 lacks importlib.metadata
- biopython=1.79
- ete3=3.1.2
- numpy=1.18.5
- scipy=1.4.1
- pandas=1.3.5
- bioconda::mafft=7.525
- bioconda::mmseqs2=18.8cc5c
- python=3.10 # 3.10 is earliest version with ete4 on conda-forge
- biopython=1.85
- conda-forge::ete4=4.1.1
- numpy=2.02
- scipy=1.13.1
- pandas=2.3.1
- pip
- pip:
- ..
Loading