From e9bbc3715aed3d8440c596df66f92798785cd6bd Mon Sep 17 00:00:00 2001 From: TheOaphidian Date: Tue, 22 Oct 2024 16:48:46 +0200 Subject: [PATCH 01/13] Add iqtree dependency for tree module --- environment.yml | 1 + src/scarap/module_wrappers.py | 2 ++ 2 files changed, 3 insertions(+) diff --git a/environment.yml b/environment.yml index eb8d550..56d2763 100644 --- a/environment.yml +++ b/environment.yml @@ -5,6 +5,7 @@ dependencies: - pip - bioconda::mafft - bioconda::mmseqs2 +- bioconda::iqtree - pip: - -r requirements.txt - . diff --git a/src/scarap/module_wrappers.py b/src/scarap/module_wrappers.py index da8e30d..7967e35 100644 --- a/src/scarap/module_wrappers.py +++ b/src/scarap/module_wrappers.py @@ -59,6 +59,8 @@ def run_pan_withchecks(args): check_infile(args.species) logging.info("checking dependencies") + if args.method in ["T-nl","FT"]: + check_tool("iqtree") if args.method in ["O-B", "O-D"]: check_tool("orthofinder") elif args.method == "S": From 70fca03f33266b7517cbb556584c575a542a952c Mon Sep 17 00:00:00 2001 From: TheOaphidian Date: Tue, 22 Oct 2024 16:54:54 +0200 Subject: [PATCH 02/13] Replace ete3 with ete4 And other minor deprecation issue with applymap --- requirements.txt | 2 +- src/scarap/modules.py | 4 ++-- src/scarap/pan.py | 6 +++--- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/requirements.txt b/requirements.txt index cdcda22..cf95622 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ biopython>=1.67 -ete3>=3.1.1 +https://github.com/etetoolkit/ete/archive/ete4.zip numpy>=1.16.5 scipy>=1.4.1 pandas diff --git a/src/scarap/modules.py b/src/scarap/modules.py index 03aa032..9af6ef9 100644 --- a/src/scarap/modules.py +++ b/src/scarap/modules.py @@ -203,7 +203,7 @@ def run_build(args): hits = pd.read_csv(fout_hits, sep = "\t", names = colnames, usecols = [0, 1, 2]) hits[["gene", "profile"]] = hits[["gene", "profile"]].\ - applymap(lambda x: x.split(" ")[0]) + map(lambda x: x.split(" ")[0]) cutoffs = 
train_cutoffs(hits, pangenome) if core_filter != 0 or max_cores != 0: @@ -263,7 +263,7 @@ def run_search(args): hits = pd.read_csv(fout_hits, sep = "\t", names = colnames, usecols = [0, 1, 2]) hits[["gene", "profile"]] = hits[["gene", "profile"]].\ - applymap(lambda x: x.split(" ")[0]) + map(lambda x: x.split(" ")[0]) colnames = ["profile", "cutoff"] cutoffs = pd.read_csv(fin_cutoffs, sep = "\t", names = colnames) diff --git a/src/scarap/pan.py b/src/scarap/pan.py index d2a46d7..100eacf 100644 --- a/src/scarap/pan.py +++ b/src/scarap/pan.py @@ -7,7 +7,7 @@ from Bio import AlignIO, Align from copy import copy -from ete3 import Tree +from ete4 import Tree from concurrent.futures import ProcessPoolExecutor from scipy import cluster @@ -986,10 +986,10 @@ def infer_superfamilies(faafins, dout, threads): f"{dout}/logs/createtsv_clusters.log") preclustertable = pd.read_csv(f"{dout}/preclusters.tsv", sep = "\t", names = ["precluster", "gene"]) - preclustertable = preclustertable.applymap(lambda x: x.split(" ")[0]) + preclustertable = preclustertable.map(lambda x: x.split(" ")[0]) clustertable = pd.read_csv(f"{dout}/clusters.tsv", sep = "\t", names = ["cluster", "precluster"]) - clustertable = clustertable.applymap(lambda x: x.split(" ")[0]) + clustertable = clustertable.map(lambda x: x.split(" ")[0]) genes = pd.merge(preclustertable, clustertable, on = "precluster") genes = genes.rename(columns = {"cluster": "orthogroup"}) genes = genes.drop(["precluster"], axis = 1) From 875ea11e1d613b91a424cb2692a42b0d0e0c8140 Mon Sep 17 00:00:00 2001 From: TheOaphidian Date: Wed, 23 Oct 2024 11:33:47 +0200 Subject: [PATCH 03/13] Explicit opening of file for loading the tree in ete4 --- src/scarap/pan.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/scarap/pan.py b/src/scarap/pan.py index 100eacf..3210f1a 100644 --- a/src/scarap/pan.py +++ b/src/scarap/pan.py @@ -428,7 +428,8 @@ def split_family_T_nl(pan, sequences, threads, dio_tmp): 
run_mafft(f"{dio_tmp}/seqs.fasta", f"{dio_tmp}/seqs.aln", threads) run_iqtree(f"{dio_tmp}/seqs.aln", f"{dio_tmp}/tree", threads, ["-m", "LG+F+G4"]) - tree = Tree(f"{dio_tmp}/tree/tree.treefile") + with open(f"{dio_tmp}/tree/tree.treefile", "r") as ftree: + tree = Tree(ftree) midoutgr = tree.get_midpoint_outgroup() genes_subfam1 = midoutgr.get_leaf_names() midoutgr.detach() @@ -557,8 +558,8 @@ def split_family_FT(pan, sequences, tree, ficlin, min_reps, max_reps, threads, ["--amino"]) run_iqtree(f"{dio_tmp}/repseqs.aln", f"{dio_tmp}/tree", threads, ["-m", "LG"]) - tree = Tree(f"{dio_tmp}/tree/tree.treefile") - + with open(f"{dio_tmp}/tree/tree.treefile", "r") as ftree: + tree = Tree(ftree) # split pan based on midpoint root pan1, pan2, tree1, tree2 = split_pan(pan, tree) sequences1 = [s for s in sequences if s.id in pan1.index] From 4f4ea35b88147612bd6ab74f4ed4b6dfc299b1d1 Mon Sep 17 00:00:00 2001 From: TheOaphidian Date: Wed, 23 Oct 2024 12:01:52 +0200 Subject: [PATCH 04/13] Pythonic syntax update of getters in ete4 --- src/scarap/pan.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/scarap/pan.py b/src/scarap/pan.py index 3210f1a..49cdd6e 100644 --- a/src/scarap/pan.py +++ b/src/scarap/pan.py @@ -177,8 +177,8 @@ def split_pan(pan, tree): tree2 = tree.children[1].copy() # split pan - reps_subfam1 = tree1.get_leaf_names() - reps_subfam2 = tree2.get_leaf_names() + reps_subfam1 = tree1.leaf_names() + reps_subfam2 = tree2.leaf_names() pan1 = pan[pan["rep"].isin(reps_subfam1)].copy() pan2 = pan[pan["rep"].isin(reps_subfam2)].copy() @@ -212,7 +212,7 @@ def lowest_cn_roots(tree, pan): min_av_cn = 100000000 # loop over all nodes except the root - for node in tree.iter_descendants(): + for node in tree.descendants(): # initialize empty genome lists for partition 1 and 2 genomes1 = [] genomes2 = [] @@ -298,7 +298,7 @@ def correct_root(root, tree, pan): min_av_cn = 100000000 # loop over all nodes except the root - for node in 
tree.iter_descendants(): + for node in tree.descendants(): genomes1, genomes2 = partition_genomes(reprfs_genomes, node) overlap = set(genomes1) & set(genomes2) # intersection # if genomes that overlap in the midpoint bipartition do not all @@ -431,9 +431,9 @@ def split_family_T_nl(pan, sequences, threads, dio_tmp): with open(f"{dio_tmp}/tree/tree.treefile", "r") as ftree: tree = Tree(ftree) midoutgr = tree.get_midpoint_outgroup() - genes_subfam1 = midoutgr.get_leaf_names() + genes_subfam1 = midoutgr.leaf_names() midoutgr.detach() - genes_subfam2 = tree.get_leaf_names() + genes_subfam2 = tree.leaf_names() pan1 = pan.loc[genes_subfam1].copy() pan2 = pan.loc[genes_subfam2].copy() family = pan.orthogroup.tolist()[0] From ead264284009bca4ff8bcb0af8c1d9efa0da0297 Mon Sep 17 00:00:00 2001 From: TheOaphidian Date: Wed, 23 Oct 2024 14:39:03 +0200 Subject: [PATCH 05/13] Remove root distance prior to midpoint rooting --- src/scarap/pan.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/scarap/pan.py b/src/scarap/pan.py index 49cdd6e..10158c3 100644 --- a/src/scarap/pan.py +++ b/src/scarap/pan.py @@ -169,9 +169,11 @@ def split_pan(pan, tree): # midpoint root the tree midoutgr = tree.get_midpoint_outgroup() + if tree.root.dist: + tree.root.dist = None + if midoutgr != tree: tree.set_outgroup(midoutgr) - # split tree at root tree1 = tree.children[0].copy() tree2 = tree.children[1].copy() From 5dce51fdc60b7b16c72b0eb2547c7ef4ae94b6e4 Mon Sep 17 00:00:00 2001 From: Tim Van Rillaer Date: Mon, 11 Aug 2025 09:27:40 +0200 Subject: [PATCH 06/13] Show actual number of sequences not aligned in warning --- src/scarap/pan.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/scarap/pan.py b/src/scarap/pan.py index d2a46d7..2d62f3f 100644 --- a/src/scarap/pan.py +++ b/src/scarap/pan.py @@ -104,8 +104,9 @@ def update_seedmatrix(seedmatrix, sequences, dout_tmp, threads): # give warning if some sequences don't align to their cluster seed
ids_to_seed = np.amax(seedmatrix, 1) - if np.any(ids_to_seed == 0): - logging.warning("ficlin: one or more sequences do not align to any " + seeds_not_aligned = np.count_nonzero(ids_to_seed == 0) + if seeds_not_aligned: + logging.warning(f"ficlin: {seeds_not_aligned} sequences do not align to any " "seed") # remove temporary output folder From a28193643311e6bebb5e1428eaeef943d23ef10d Mon Sep 17 00:00:00 2001 From: TheOaphidian Date: Fri, 20 Feb 2026 16:25:52 +0100 Subject: [PATCH 07/13] Change references to ete3 -> ete4 --- README.md | 2 +- pyproject.toml | 2 +- test/environment_max.yml | 2 +- test/environment_min.yml | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index c4493d1..9b73b72 100644 --- a/README.md +++ b/README.md @@ -34,7 +34,7 @@ You can also install SCARAP manually by cloning it and installing the following * [Python3](https://www.python.org/) version >= 3.8, < 3.13 * Python packages (see pyproject.toml file for versions): * [biopython](https://biopython.org/) - * [ete3](http://etetoolkit.org/) + * [ete4](http://etetoolkit.org/) * [numpy](https://numpy.org/) * [scipy](https://www.scipy.org/) * [pandas](https://pandas.pydata.org/) diff --git a/pyproject.toml b/pyproject.toml index 8b9c01f..a7f2911 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,7 +22,7 @@ classifiers = [ ] dependencies = [ "biopython>=1.79,<2", - "ete3>=3.1,<4", + "ete4>=4.3", "numpy>=1.18,<3", "scipy>=1.4,<2", "pandas>=1.3,<4" diff --git a/test/environment_max.yml b/test/environment_max.yml index 63b0402..b5273ef 100644 --- a/test/environment_max.yml +++ b/test/environment_max.yml @@ -7,7 +7,7 @@ dependencies: - bioconda::mmseqs2=18.8cc5c - python=3.12 # maximum allowed by ete v3 - biopython=1.86 - - ete3=3.1.3 # ete v4 not compatible + - conda-forge::ete4=4.3.0 - numpy=2.4.2 - scipy=1.17.0 - pandas=3.0.1 diff --git a/test/environment_min.yml b/test/environment_min.yml index 53bc9c8..aed3e36 100644 --- a/test/environment_min.yml +++ 
b/test/environment_min.yml @@ -10,7 +10,7 @@ dependencies: - bioconda::mmseqs2=11.e1a1c - python=3.8 # 3.7 lacks importlib.metadata - biopython=1.79 - - ete3=3.1.2 + - conda-forge::ete4=4.3.0 - numpy=1.18.5 - scipy=1.4.1 - pandas=1.3.5 From d5f473f989b072386c060c89cf82c1eb27cf02bb Mon Sep 17 00:00:00 2001 From: TheOaphidian Date: Fri, 20 Feb 2026 16:33:26 +0100 Subject: [PATCH 08/13] Update python versions to those supported for ete4 on conda-forge --- test/environment_max.yml | 2 +- test/environment_min.yml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/test/environment_max.yml b/test/environment_max.yml index b5273ef..59d50d3 100644 --- a/test/environment_max.yml +++ b/test/environment_max.yml @@ -5,7 +5,7 @@ channels: dependencies: - mafft=7.526 - bioconda::mmseqs2=18.8cc5c - - python=3.12 # maximum allowed by ete v3 + - python=3.14 - biopython=1.86 - conda-forge::ete4=4.3.0 - numpy=2.4.2 diff --git a/test/environment_min.yml b/test/environment_min.yml index aed3e36..d185c75 100644 --- a/test/environment_min.yml +++ b/test/environment_min.yml @@ -8,9 +8,9 @@ channels: dependencies: - bioconda::mafft=7.407 - bioconda::mmseqs2=11.e1a1c - - python=3.8 # 3.7 lacks importlib.metadata + - python=3.9 # 3.9 is earliest version with ete4 on conda-forge - biopython=1.79 - - conda-forge::ete4=4.3.0 + - conda-forge::ete4=4.1.1 - numpy=1.18.5 - scipy=1.4.1 - pandas=1.3.5 From c55fe168c136043849404abdddc01f7efaa2f3b6 Mon Sep 17 00:00:00 2001 From: TheOaphidian Date: Fri, 20 Feb 2026 16:35:18 +0100 Subject: [PATCH 09/13] Update python versions of the ci --- .github/workflows/ci.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 329e4a7..3a6b865 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -17,10 +17,10 @@ jobs: strategy: matrix: include: - - python-version: "3.8" + - python-version: "3.9" os: ubuntu-22.04 env-file: "test/environment_min.yml" - - 
python-version: "3.12" + - python-version: "3.14" os: ubuntu-24.04 env-file: "test/environment_max.yml" From 266b27a2526fede57452b68d9db862c5244cef59 Mon Sep 17 00:00:00 2001 From: TheOaphidian Date: Fri, 20 Feb 2026 16:42:37 +0100 Subject: [PATCH 10/13] Update other packages to be compatible with newer python versions --- test/environment_min.yml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/test/environment_min.yml b/test/environment_min.yml index d185c75..2ccf4ed 100644 --- a/test/environment_min.yml +++ b/test/environment_min.yml @@ -6,14 +6,14 @@ channels: - conda-forge - nodefaults dependencies: - - bioconda::mafft=7.407 - - bioconda::mmseqs2=11.e1a1c + - bioconda::mafft=7.525 + - bioconda::mmseqs2=18.8cc5c - python=3.9 # 3.9 is earliest version with ete4 on conda-forge - - biopython=1.79 + - biopython=1.85 - conda-forge::ete4=4.1.1 - - numpy=1.18.5 - - scipy=1.4.1 - - pandas=1.3.5 + - numpy=2.02 + - scipy=1.13.1 + - pandas=2.3.1 - pip - pip: - .. From c0091d5df023889d50060822c2cfbdfd958eb735 Mon Sep 17 00:00:00 2001 From: TheOaphidian Date: Fri, 20 Feb 2026 16:46:36 +0100 Subject: [PATCH 11/13] Update min python to 3.10 --- .github/workflows/ci.yml | 2 +- test/environment_min.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 3a6b865..57daae8 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -17,7 +17,7 @@ jobs: strategy: matrix: include: - - python-version: "3.9" + - python-version: "3.10" os: ubuntu-22.04 env-file: "test/environment_min.yml" - python-version: "3.14" diff --git a/test/environment_min.yml b/test/environment_min.yml index 2ccf4ed..0f777b7 100644 --- a/test/environment_min.yml +++ b/test/environment_min.yml @@ -8,7 +8,7 @@ channels: dependencies: - bioconda::mafft=7.525 - bioconda::mmseqs2=18.8cc5c - - python=3.9 # 3.9 is earliest version with ete4 on conda-forge + - python=3.10 # 3.10 is earliest version with 
ete4 on conda-forge - biopython=1.85 - conda-forge::ete4=4.1.1 - numpy=2.02 From dad7ee2ee5b1edf35452ddf735a017df17922d43 Mon Sep 17 00:00:00 2001 From: TheOaphidian Date: Fri, 20 Feb 2026 16:49:32 +0100 Subject: [PATCH 12/13] Update python version in toml --- pyproject.toml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index a7f2911..f3b15cc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,18 +14,18 @@ maintainers = [ ] description = "A toolkit for prokaryotic comparative genomics" readme = "README.md" -requires-python = ">=3.8,<3.13" +requires-python = ">=3.10,<3.15" classifiers = [ "Programming Language :: Python :: 3", "License :: OSI Approved :: GNU General Public License v3 (GPLv3)", "Operating System :: OS Independent" ] dependencies = [ - "biopython>=1.79,<2", - "ete4>=4.3", - "numpy>=1.18,<3", - "scipy>=1.4,<2", - "pandas>=1.3,<4" + "biopython>=1.85,<2", + "ete4>=4.1", + "numpy>=2.02,<3", + "scipy>=1.13.1,<2", + "pandas>=2.3.1,<4" ] [project.urls] From e7611001e0f9b33c03c1f2064654ac92d3ff52b6 Mon Sep 17 00:00:00 2001 From: TheOaphidian Date: Fri, 20 Feb 2026 17:04:50 +0100 Subject: [PATCH 13/13] Replace references to ete3 with ete4 --- src/scarap/pan.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/scarap/pan.py b/src/scarap/pan.py index be2f327..c958bde 100644 --- a/src/scarap/pan.py +++ b/src/scarap/pan.py @@ -163,7 +163,7 @@ def split_pan(pan, tree): Args: pan (DataFrame): A gene table with at least the columns reprf and orthogroup. - tree: An ete3 tree (= the root node of a tree) + tree: An ete4 tree (= the root node of a tree) Returns: [pan1, pan2, tree1, tree2] @@ -197,7 +197,7 @@ def lowest_cn_roots(tree, pan): """Determine the set of lowest copy-number roots. Args: - tree: ete3 tree object where the leaf names correspond to the values of + tree: ete4 tree object where the leaf names correspond to the values of the reprf column in pan. 
pan (DataFrame): Table with at least the columns reprf and genome. @@ -796,7 +796,7 @@ def split_family_recursive_FT(pan, sequences, tree, ficlin, min_reps, columns gene, genome and orthogroup. sequences (list): A list with one SeqRecord object per row in pan, in the same order. - tree: An ete3 tree object. + tree: An ete4 tree object. finclin (bool): Should ficlin be used to pick representatives? min_reps (int): The minimum number of representatives to use. max_reps (int): The maximum number of representatives to use.