diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 329e4a7..57daae8 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -17,10 +17,10 @@ jobs: strategy: matrix: include: - - python-version: "3.8" + - python-version: "3.10" os: ubuntu-22.04 env-file: "test/environment_min.yml" - - python-version: "3.12" + - python-version: "3.14" os: ubuntu-24.04 env-file: "test/environment_max.yml" diff --git a/README.md b/README.md index c4493d1..9b73b72 100644 --- a/README.md +++ b/README.md @@ -34,7 +34,7 @@ You can also install SCARAP manually by cloning it and installing the following - * [Python3](https://www.python.org/) version >= 3.8, < 3.13 + * [Python3](https://www.python.org/) version >= 3.10, < 3.15 * Python packages (see pyproject.toml file for versions): * [biopython](https://biopython.org/) - * [ete3](http://etetoolkit.org/) + * [ete4](http://etetoolkit.org/) * [numpy](https://numpy.org/) * [scipy](https://www.scipy.org/) * [pandas](https://pandas.pydata.org/) diff --git a/pyproject.toml b/pyproject.toml index 8b9c01f..f3b15cc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,18 +14,18 @@ maintainers = [ ] description = "A toolkit for prokaryotic comparative genomics" readme = "README.md" -requires-python = ">=3.8,<3.13" +requires-python = ">=3.10,<3.15" classifiers = [ "Programming Language :: Python :: 3", "License :: OSI Approved :: GNU General Public License v3 (GPLv3)", "Operating System :: OS Independent" ] dependencies = [ - "biopython>=1.79,<2", - "ete3>=3.1,<4", - "numpy>=1.18,<3", - "scipy>=1.4,<2", - "pandas>=1.3,<4" + "biopython>=1.85,<2", + "ete4>=4.1", + "numpy>=2.0.2,<3", + "scipy>=1.13.1,<2", + "pandas>=2.3.1,<4" ] [project.urls] diff --git a/src/scarap/module_wrappers.py b/src/scarap/module_wrappers.py index 3a2d5a9..d69032e 100644 --- a/src/scarap/module_wrappers.py +++ b/src/scarap/module_wrappers.py @@ -58,6 +58,8 @@ def run_pan_withchecks(args): check_infile(args.species) logging.info("checking dependencies") + if args.method in ["T-nl","FT"]: + 
check_tool("iqtree") if args.method in ["O-B", "O-D"]: check_tool("orthofinder") elif args.method == "S": diff --git a/src/scarap/pan.py b/src/scarap/pan.py index e77db3a..c958bde 100644 --- a/src/scarap/pan.py +++ b/src/scarap/pan.py @@ -7,7 +7,7 @@ from Bio import AlignIO, Align from copy import copy -from ete3 import Tree +from ete4 import Tree from concurrent.futures import ProcessPoolExecutor from scipy import cluster @@ -105,8 +105,9 @@ def update_seedmatrix(seedmatrix, sequences, dout_tmp, threads): # give warning if some sequences don't align to their cluster seed ids_to_seed = np.amax(seedmatrix, 1) - if np.any(ids_to_seed == 0): - logging.warning("ficlin: one or more sequences do not align to any " + seeds_not_aligned = np.count_nonzero(ids_to_seed == 0) + if seeds_not_aligned: + logging.warning(f"ficlin: {seeds_not_aligned} sequences do not align to any " "seed") # remove temporary output folder @@ -162,7 +163,7 @@ def split_pan(pan, tree): Args: pan (DataFrame): A gene table with at least the columns reprf and orthogroup. - tree: An ete3 tree (= the root node of a tree) + tree: An ete4 tree (= the root node of a tree) Returns: [pan1, pan2, tree1, tree2] @@ -170,16 +171,18 @@ def split_pan(pan, tree): # midpoint root the tree midoutgr = tree.get_midpoint_outgroup() + if tree.root.dist: + tree.root.dist = None + if midoutgr != tree: tree.set_outgroup(midoutgr) - # split tree at root tree1 = tree.children[0].copy() tree2 = tree.children[1].copy() # split pan - reps_subfam1 = tree1.get_leaf_names() - reps_subfam2 = tree2.get_leaf_names() + reps_subfam1 = tree1.leaf_names() + reps_subfam2 = tree2.leaf_names() pan1 = pan[pan["rep"].isin(reps_subfam1)].copy() pan2 = pan[pan["rep"].isin(reps_subfam2)].copy() @@ -194,7 +197,7 @@ def lowest_cn_roots(tree, pan): """Determine the set of lowest copy-number roots. 
Args: - tree: ete3 tree object where the leaf names correspond to the values of + tree: ete4 tree object where the leaf names correspond to the values of the reprf column in pan. pan (DataFrame): Table with at least the columns reprf and genome. @@ -213,7 +216,7 @@ def lowest_cn_roots(tree, pan): min_av_cn = 100000000 # loop over all nodes except the root - for node in tree.iter_descendants(): + for node in tree.descendants(): # initialize empty genome lists for partition 1 and 2 genomes1 = [] genomes2 = [] @@ -299,7 +302,7 @@ def correct_root(root, tree, pan): min_av_cn = 100000000 # loop over all nodes except the root - for node in tree.iter_descendants(): + for node in tree.descendants(): genomes1, genomes2 = partition_genomes(reprfs_genomes, node) overlap = set(genomes1) & set(genomes2) # intersection # if genomes that overlap in the midpoint bipartition do not all @@ -429,11 +432,12 @@ def split_family_T_nl(pan, sequences, threads, dio_tmp): run_mafft(f"{dio_tmp}/seqs.fasta", f"{dio_tmp}/seqs.aln", threads) run_iqtree(f"{dio_tmp}/seqs.aln", f"{dio_tmp}/tree", threads, ["-m", "LG+F+G4"]) - tree = Tree(f"{dio_tmp}/tree/tree.treefile") + with open(f"{dio_tmp}/tree/tree.treefile", "r") as ftree: + tree = Tree(ftree) midoutgr = tree.get_midpoint_outgroup() - genes_subfam1 = midoutgr.get_leaf_names() + genes_subfam1 = midoutgr.leaf_names() midoutgr.detach() - genes_subfam2 = tree.get_leaf_names() + genes_subfam2 = tree.leaf_names() pan1 = pan.loc[genes_subfam1].copy() pan2 = pan.loc[genes_subfam2].copy() family = pan.orthogroup.tolist()[0] @@ -558,8 +562,8 @@ def split_family_FT(pan, sequences, tree, ficlin, min_reps, max_reps, threads, ["--amino"]) run_iqtree(f"{dio_tmp}/repseqs.aln", f"{dio_tmp}/tree", threads, ["-m", "LG"]) - tree = Tree(f"{dio_tmp}/tree/tree.treefile") - + with open(f"{dio_tmp}/tree/tree.treefile", "r") as ftree: + tree = Tree(ftree) # split pan based on midpoint root pan1, pan2, tree1, tree2 = split_pan(pan, tree) sequences1 = [s for s in 
sequences if s.id in pan1.index] @@ -792,7 +796,7 @@ def split_family_recursive_FT(pan, sequences, tree, ficlin, min_reps, columns gene, genome and orthogroup. sequences (list): A list with one SeqRecord object per row in pan, in the same order. - tree: An ete3 tree object. - finclin (bool): Should ficlin be used to pick representatives? + tree: An ete4 tree object. + ficlin (bool): Should ficlin be used to pick representatives? min_reps (int): The minimum number of representatives to use. max_reps (int): The maximum number of representatives to use. diff --git a/test/environment_max.yml b/test/environment_max.yml index 63b0402..59d50d3 100644 --- a/test/environment_max.yml +++ b/test/environment_max.yml @@ -5,9 +5,9 @@ channels: dependencies: - mafft=7.526 - bioconda::mmseqs2=18.8cc5c - - python=3.12 # maximum allowed by ete v3 + - python=3.14 - biopython=1.86 - - ete3=3.1.3 # ete v4 not compatible + - conda-forge::ete4=4.3.0 - numpy=2.4.2 - scipy=1.17.0 - pandas=3.0.1 diff --git a/test/environment_min.yml b/test/environment_min.yml index 53bc9c8..0f777b7 100644 --- a/test/environment_min.yml +++ b/test/environment_min.yml @@ -6,14 +6,14 @@ channels: - conda-forge - nodefaults dependencies: - - bioconda::mafft=7.407 - - bioconda::mmseqs2=11.e1a1c - - python=3.8 # 3.7 lacks importlib.metadata - - biopython=1.79 - - ete3=3.1.2 - - numpy=1.18.5 - - scipy=1.4.1 - - pandas=1.3.5 + - bioconda::mafft=7.525 + - bioconda::mmseqs2=18.8cc5c + - python=3.10 # 3.10 is earliest version with ete4 on conda-forge + - biopython=1.85 + - conda-forge::ete4=4.1.1 + - numpy=2.0.2 + - scipy=1.13.1 + - pandas=2.3.1 - pip - pip: - ..