From 6cc2dfc7e132b15913bca415ee41383e30c849c5 Mon Sep 17 00:00:00 2001 From: eead-csic-compbio Date: Wed, 27 Aug 2025 10:35:17 +0200 Subject: [PATCH 01/11] # NOTE: tries only the bulky all-vs-all file (GBs) --- phylogenomics/PlantCompUtils.pm | 58 ++++++++++++++++----------------- 1 file changed, 28 insertions(+), 30 deletions(-) diff --git a/phylogenomics/PlantCompUtils.pm b/phylogenomics/PlantCompUtils.pm index dc2362f3b..0ea47a848 100644 --- a/phylogenomics/PlantCompUtils.pm +++ b/phylogenomics/PlantCompUtils.pm @@ -1,7 +1,7 @@ package PlantCompUtils; require Exporter; -# Copyright [2019-2023] EMBL-European Bioinformatics Institute +# Copyright [2019-2025] EMBL-European Bioinformatics Institute @ISA = qw(Exporter); @EXPORT_OK = qw( @@ -23,8 +23,7 @@ use Time::HiRes; use HTTP::Tiny; use DBI; -# Fungi Protists Metazoa have collections and one all-vs-all TSV file -# This code won't work there +# Only tested in Plants; Fungi Protists Metazoa have collections, code will need tweaking our @DIVISIONS = qw( Plants ); our $FTPURL = 'ftp.ensemblgenomes.org'; our $COMPARADIR = '/pub/xxx/current/tsv/ensembl-compara/homologies'; @@ -258,7 +257,7 @@ sub download_GTF_file { # download compressed TSV file from FTP site, renames it # and saves it in $targetdir; uses FTP globals defined above -# NOTE: if species file is not found it tries the bulky all-vs-all file +# NOTE: tries only the bulky all-vs-all file (GBs) sub download_compara_TSV_file { my ( $dir, $ref_genome, $targetdir ) = @_; @@ -274,32 +273,31 @@ sub download_compara_TSV_file { || die "# ERROR(download_compara_TSV_file): cannot change working directory to $dir " . $ftp->message(); - # find out which file is to be downloaded - if ( $ftp->cwd($ref_genome) ) { - foreach my $file ( $ftp->ls() ) { - if ( $file =~ m/protein_default.homologies.tsv.gz/ ) { - $compara_file = $file; - $stored_compara_file = "$targetdir/$compara_file"; - $stored_compara_file =~ s/tsv.gz/$ref_genome.tsv.gz/; - last; - } - } - } - else { # try all-vs-all file instead (Fungi, Protists, Metazoa) - - print "# WARNING(download_compara_TSV_file): cannot find ". - "$ref_genome in $dir, try all-vs-all\n"; - - foreach my $file ( $ftp->ls() ) { - if ( $file =~ m/protein_default.homologies.tsv.gz/ ) { - $compara_file = $file; - $stored_compara_file = "$targetdir/$compara_file"; - foreach my $div (@DIVISIONS) { - if ( $dir =~ m/($div)/i ) { - $div = $1; - $stored_compara_file =~ s/tsv.gz/$div.tsv.gz/; - last; - } + # find file to be downloaded + + # species-specific Compara TSV files are not complete anymore (Aug2025) + #if ( 0 && $ftp->cwd($ref_genome) ) { + # foreach my $file ( $ftp->ls() ) { + # if ( $file =~ m/protein_default.homologies.tsv.gz/ ) { + # $compara_file = $file; + # $stored_compara_file = "$targetdir/$compara_file"; + # $stored_compara_file =~ s/tsv.gz/$ref_genome.tsv.gz/; + # last; + # } + # } + #} else { # try all-vs-all file instead (Fungi, Protists, Metazoa) + + print "# WARNING(download_compara_TSV_file): try all-vs-all\n"; + + foreach my $file ( $ftp->ls() ) { + if ( $file =~ m/protein_default.homologies.tsv.gz/ ) { + $compara_file = $file; + $stored_compara_file = "$targetdir/$compara_file"; + foreach my $div (@DIVISIONS) { + if ( $dir =~ m/($div)/i ) { + $div = $1; + $stored_compara_file =~ s/tsv.gz/$div.tsv.gz/; + last; } } } From d8578da2034946083145697422da804245675959 Mon Sep 17 00:00:00 2001 From: eead-csic-compbio Date: Wed, 27 Aug 2025 10:37:33 +0200 Subject: [PATCH 02/11] adapted to all-vs-all TSV file --- phylogenomics/ens_single-copy_core_genes.pl | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/phylogenomics/ens_single-copy_core_genes.pl b/phylogenomics/ens_single-copy_core_genes.pl index 0893baec7..37b2a5069 100755 --- a/phylogenomics/ens_single-copy_core_genes.pl +++ b/phylogenomics/ens_single-copy_core_genes.pl @@ -15,10 +15,10 @@ ); # Retrieves single-copy orthologous genes/proteins shared by (plant) species in clade -# by querying pre-computed Compara data from Ensembl Genomes with a reference genome. +# by querying pre-computed Compara data from Ensembl (Genomes) with a reference genome. # Multiple copies are optionally allowed for selected or all species. # -# Copyright [2019-2023] EMBL-European Bioinformatics Institute +# Copyright [2019-2025] EMBL-European Bioinformatics Institute # Ensembl Genomes my $RESTURL = 'http://rest.ensembl.org'; @@ -273,15 +273,23 @@ sub help_message { $wga_coverage, $high_confidence ) = split(/\t/); - if ( $species ne $ref_genome ) { + next if( !$supported{$species} || !$supported{$hom_species} ); + + # ref genome forced to be species as opposed to hom_species + if( $hom_species eq $ref_genome ) { + ($gene_stable_id, $hom_gene_stable_id) = ($hom_gene_stable_id, $gene_stable_id); + ($prot_stable_id, $hom_prot_stable_id) = ($hom_prot_stable_id, $prot_stable_id); + ($species, $hom_species) = ($hom_species, $species); + ($identity, $hom_identity) = ($hom_identity, $identity); + } + + if ( $species ne $ref_genome ) { if ( keys(%present) == $n_of_species ) { last; } # in case all-vs-all file is used else { next } } - next if ( !$supported{$hom_species} || $hom_species eq $ref_genome ); - if ( defined($high_confidence) ) { next if ( $LOWCONF == 0 @@ -298,7 +306,7 @@ sub help_message { && $homology_type eq 'ortholog_one2many' ) ) { - + # add $ref_genome protein if ( !$core{$gene_stable_id} ) { From 0ce517c62c2dc806df31c1a710914dbf4699de7c Mon Sep 17 00:00:00 2001 From: eead-csic-compbio Date: Wed, 27 Aug 2025 10:37:45 +0200 Subject: [PATCH 03/11] retested --- phylogenomics/ens_sequences.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/phylogenomics/ens_sequences.pl b/phylogenomics/ens_sequences.pl index c0b3945d2..60f5e5caa 100755 --- a/phylogenomics/ens_sequences.pl +++ b/phylogenomics/ens_sequences.pl @@ -18,7 +18,7 @@ # Uses canonical transcripts, used in the gene tree analysis, # which usually are the longest translation with no stop codons # -# Copyright [2019-2021] EMBL-European Bioinformatics Institute +# Copyright [2019-2025] EMBL-European Bioinformatics Institute # Ensembl Genomes my $RESTURL = 'http://rest.ensembl.org'; From beff29b60bec7aae5e1c43601dab330377f43740 Mon Sep 17 00:00:00 2001 From: eead-csic-compbio Date: Wed, 27 Aug 2025 13:02:31 +0200 Subject: [PATCH 04/11] sub get_gene_coords_GTF_file can handle long chr names --- phylogenomics/PlantCompUtils.pm | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/phylogenomics/PlantCompUtils.pm b/phylogenomics/PlantCompUtils.pm index 0ea47a848..4100e0b45 100644 --- a/phylogenomics/PlantCompUtils.pm +++ b/phylogenomics/PlantCompUtils.pm @@ -179,11 +179,11 @@ sub get_gene_coords_GTF_file { || die "# ERROR(get_gene_coords_GTF_file): cannot open $GTF_filename\n"; while ( my $line = ) { - #1 araport11 gene 3631 5899 . + . gene_id "AT1G01010";... + #1 araport11 gene 3631 5899 . + . gene_id "AT1G01010";... + #C3 brad gene 4809 5027 . - . gene_id "Bo3g025160";... if ( $line =~ - m/^([^#])\t[^\t]+\tgene\t(\d+)\t(\d+)\t[^\t]\t(\S+)\t[^\t]\tgene_id "([^";]+)/ - ) - { + m/^([^#]+)\t[^\t]+\tgene\t(\d+)\t(\d+)\t[^\t]\t(\S+)\t[^\t]\tgene_id "([^";]+)/) { + ( $chr, $start, $end, $strand, $geneid ) = ( $1, $2, $3, $4, $5 ); push( @chr_sorted_gene_ids, [ $geneid, $chr, $start, $end, $strand ] ); From 8de3453a5d8b7e47dc120e3a268029d69a534327 Mon Sep 17 00:00:00 2001 From: eead-csic-compbio Date: Wed, 27 Aug 2025 13:04:53 +0200 Subject: [PATCH 05/11] retested on Ensembl Plants 61 --- phylogenomics/ens_syntelogs.pl | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/phylogenomics/ens_syntelogs.pl b/phylogenomics/ens_syntelogs.pl index 3457ce495..67fe6a368 100755 --- a/phylogenomics/ens_syntelogs.pl +++ b/phylogenomics/ens_syntelogs.pl @@ -16,9 +16,9 @@ ); # Retrieves orthologous, syntenic genes (syntelogs) shared by (plant) species in clade -# by querying pre-computed Compara data from Ensembl Genomes with a reference genome. +# by querying pre-computed Compara data from Ensembl (Genomes) with a reference genome. # -# Copyright [2019-2023] EMBL-European Bioinformatics Institute +# Copyright [2019-2025] EMBL-European Bioinformatics Institute # Ensembl Genomes my $RESTURL = 'http://rest.ensembl.org'; @@ -255,6 +255,16 @@ sub help_message { $wga_coverage, $high_confidence ) = split(/\t/); + next if( !$supported{$species} || !$supported{$hom_species} ); + + # ref genome forced to be species as opposed to hom_species + if( $hom_species eq $ref_genome ) { + ($gene_stable_id, $hom_gene_stable_id) = ($hom_gene_stable_id, $gene_stable_id); + ($prot_stable_id, $hom_prot_stable_id) = ($hom_prot_stable_id, $prot_stable_id); + ($species, $hom_species) = ($hom_species, $species); + ($identity, $hom_identity) = ($hom_identity, $identity); + } + if ( $species ne $ref_genome ) { if ( keys(%present) == $n_of_species ) { last; @@ -262,8 +272,6 @@ sub help_message { else { next } } - next if ( !$supported{$hom_species} || $hom_species eq $ref_genome ); - if ( defined($high_confidence) ) { next if ( $LOWCONF == 0 @@ -273,8 +281,7 @@ sub help_message { next if ( $goc_ssynt eq 'NULL' || $goc_ssynt < $GOC ); if ( $homology_type eq 'ortholog_one2one' - || $homology_type eq 'ortholog_one2many' ) - { + || $homology_type eq 'ortholog_one2many' ) { # add $ref_genome protein if ( !$synt{$gene_stable_id} ) { @@ -302,7 +309,7 @@ sub help_message { $chrcoords{$gene_stable_id} = "$gene->[1]:$gene->[2]-$gene->[3]:$gene->[4]"; } -} +} # check GOC availability foreach $hom_species (@supported_species) { From 0352f917291e2f0f97589fe6bcc719f52bb41c52 Mon Sep 17 00:00:00 2001 From: brunocontrerasmoreira Date: Thu, 28 Aug 2025 11:11:59 +0200 Subject: [PATCH 06/11] indented example Co-authored-by: Jorge Alvarez-Jarreta --- phylogenomics/PlantCompUtils.pm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/phylogenomics/PlantCompUtils.pm b/phylogenomics/PlantCompUtils.pm index 4100e0b45..8bb211e4a 100644 --- a/phylogenomics/PlantCompUtils.pm +++ b/phylogenomics/PlantCompUtils.pm @@ -180,7 +180,7 @@ sub get_gene_coords_GTF_file { while ( my $line = ) { #1 araport11 gene 3631 5899 . + . gene_id "AT1G01010";... - #C3 brad gene 4809 5027 . - . gene_id "Bo3g025160";... + #C3 brad gene 4809 5027 . - . gene_id "Bo3g025160";... if ( $line =~ m/^([^#]+)\t[^\t]+\tgene\t(\d+)\t(\d+)\t[^\t]\t(\S+)\t[^\t]\tgene_id "([^";]+)/) { From 5dab0cfa0a32584d1ce84dd3496e1fc8c02c1ed3 Mon Sep 17 00:00:00 2001 From: brunocontrerasmoreira Date: Thu, 28 Aug 2025 11:12:32 +0200 Subject: [PATCH 07/11] removed code for parsing split TSV file Co-authored-by: Jorge Alvarez-Jarreta --- phylogenomics/PlantCompUtils.pm | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/phylogenomics/PlantCompUtils.pm b/phylogenomics/PlantCompUtils.pm index 8bb211e4a..330491e36 100644 --- a/phylogenomics/PlantCompUtils.pm +++ b/phylogenomics/PlantCompUtils.pm @@ -274,19 +274,6 @@ sub download_compara_TSV_file { . $ftp->message(); # find file to be downloaded - - # species-specific Compara TSV files are not complete anymore (Aug2025) - #if ( 0 && $ftp->cwd($ref_genome) ) { - # foreach my $file ( $ftp->ls() ) { - # if ( $file =~ m/protein_default.homologies.tsv.gz/ ) { - # $compara_file = $file; - # $stored_compara_file = "$targetdir/$compara_file"; - # $stored_compara_file =~ s/tsv.gz/$ref_genome.tsv.gz/; - # last; - # } - # } - #} else { # try all-vs-all file instead (Fungi, Protists, Metazoa) - print "# WARNING(download_compara_TSV_file): try all-vs-all\n"; foreach my $file ( $ftp->ls() ) { From 4dfdce14d2772036b71927b103705082313b96a6 Mon Sep 17 00:00:00 2001 From: brunocontrerasmoreira Date: Thu, 28 Aug 2025 11:13:49 +0200 Subject: [PATCH 08/11] fixed blanks Co-authored-by: Jorge Alvarez-Jarreta --- phylogenomics/ens_single-copy_core_genes.pl | 1 - 1 file changed, 1 deletion(-) diff --git a/phylogenomics/ens_single-copy_core_genes.pl b/phylogenomics/ens_single-copy_core_genes.pl index 37b2a5069..971a7dc2f 100755 --- a/phylogenomics/ens_single-copy_core_genes.pl +++ b/phylogenomics/ens_single-copy_core_genes.pl @@ -306,7 +306,6 @@ sub help_message { && $homology_type eq 'ortholog_one2many' ) ) { - # add $ref_genome protein if ( !$core{$gene_stable_id} ) { From 34d33aa2188a62f33c8ee6924c01f00f8de41dd7 Mon Sep 17 00:00:00 2001 From: brunocontrerasmoreira Date: Thu, 28 Aug 2025 11:14:11 +0200 Subject: [PATCH 09/11] Ensembl is enough Co-authored-by: Jorge Alvarez-Jarreta --- phylogenomics/ens_syntelogs.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/phylogenomics/ens_syntelogs.pl b/phylogenomics/ens_syntelogs.pl index 67fe6a368..0c542bf0a 100755 --- a/phylogenomics/ens_syntelogs.pl +++ b/phylogenomics/ens_syntelogs.pl @@ -16,7 +16,7 @@ ); # Retrieves orthologous, syntenic genes (syntelogs) shared by (plant) species in clade -# by querying pre-computed Compara data from Ensembl (Genomes) with a reference genome. +# by querying pre-computed Compara data from Ensembl with a reference genome. # # Copyright [2019-2025] EMBL-European Bioinformatics Institute From e0cc3d7aed761875b8f4289c48995275310889f3 Mon Sep 17 00:00:00 2001 From: brunocontrerasmoreira Date: Thu, 28 Aug 2025 11:14:30 +0200 Subject: [PATCH 10/11] Ensembl is enough Co-authored-by: Jorge Alvarez-Jarreta --- phylogenomics/ens_single-copy_core_genes.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/phylogenomics/ens_single-copy_core_genes.pl b/phylogenomics/ens_single-copy_core_genes.pl index 971a7dc2f..606adb538 100755 --- a/phylogenomics/ens_single-copy_core_genes.pl +++ b/phylogenomics/ens_single-copy_core_genes.pl @@ -15,7 +15,7 @@ ); # Retrieves single-copy orthologous genes/proteins shared by (plant) species in clade -# by querying pre-computed Compara data from Ensembl (Genomes) with a reference genome. +# by querying pre-computed Compara data from Ensembl with a reference genome. # Multiple copies are optionally allowed for selected or all species. # # Copyright [2019-2025] EMBL-European Bioinformatics Institute From a16bf485b64fe952fdc5a5a49e167375140e0354 Mon Sep 17 00:00:00 2001 From: brunocontrerasmoreira Date: Thu, 28 Aug 2025 11:14:51 +0200 Subject: [PATCH 11/11] fixed blank Co-authored-by: Jorge Alvarez-Jarreta --- phylogenomics/ens_syntelogs.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/phylogenomics/ens_syntelogs.pl b/phylogenomics/ens_syntelogs.pl index 0c542bf0a..550f98c59 100755 --- a/phylogenomics/ens_syntelogs.pl +++ b/phylogenomics/ens_syntelogs.pl @@ -309,7 +309,7 @@ sub help_message { $chrcoords{$gene_stable_id} = "$gene->[1]:$gene->[2]-$gene->[3]:$gene->[4]"; } -} +} # check GOC availability foreach $hom_species (@supported_species) {