From aaf6d9a99a8015161c9060a2a0394f6ed3011abe Mon Sep 17 00:00:00 2001
From: Andrew Green <agreen@ebi.ac.uk>
Date: Mon, 13 Oct 2025 14:40:26 +0100
Subject: [PATCH 01/22] Rust side implementation for GoFlow search index

---
 utils/search-export/src/main.rs               |  9 +++++
 .../src/sequences/file_joiner.rs              | 15 +++++++-
 .../src/sequences/go_flow_annotations.rs      | 34 +++++++++++++++++++
 utils/search-export/src/sequences/mod.rs      |  1 +
 .../search-export/src/sequences/normalized.rs |  3 ++
 utils/search-export/src/sequences/raw.rs      |  7 ++++
 6 files changed, 68 insertions(+), 1 deletion(-)
 create mode 100644 utils/search-export/src/sequences/go_flow_annotations.rs
diff --git a/utils/search-export/src/main.rs b/utils/search-export/src/main.rs
index ad99f67e8..de852456b 100644
--- a/utils/search-export/src/main.rs
+++ b/utils/search-export/src/main.rs
@@ -33,6 +33,7 @@ pub enum Groupable {
     SoInfo,
     LitsummSummaries,
     EditingEvents,
+    GoFlowAnnotation,
 }
 
 #[derive(Debug, StructOpt)]
@@ -140,6 +141,10 @@ enum SequenceCommand {
         /// RNA editing events
         editing_events: PathBuf,
 
+        #[structopt(parse(from_os_str))]
+        /// GoFlowLLM annotations
+        go_flow_llm_annotations: PathBuf,
+
         // Add new arguments above this line!
         #[structopt(parse(from_os_str))]
         /// Filename to write the results to, '-' means stdout
@@ -255,6 +260,7 @@ fn main() -> Result<()> {
             Groupable::EditingEvents => {
                 sequences::editing_events::group(&path, max_count, &output)?
             },
+            Groupable::GoFlowAnnotation => sequences::go_flow_annotations::group(&path, max_count, &output)?,
         },
         Subcommand::Sequences {
             command,
@@ -275,6 +281,8 @@ fn main() -> Result<()> {
                 litsumm_summaries,
                 editing_events,
                 so_term_tree,
+                go_flow_llm_annotations,
+                // Add new arguments above this line!
                 output,
             } => sequences::writers::write_merge(
                 vec![
@@ -293,6 +301,7 @@ fn main() -> Result<()> {
                     editing_events,
                     orfs,
                     so_term_tree,
+                    go_flow_llm_annotations,
                 ],
                 &output,
             )?,
diff --git a/utils/search-export/src/sequences/file_joiner.rs b/utils/search-export/src/sequences/file_joiner.rs
index c8a50973b..6903b6e1b 100644
--- a/utils/search-export/src/sequences/file_joiner.rs
+++ b/utils/search-export/src/sequences/file_joiner.rs
@@ -49,6 +49,7 @@ use super::{
     rfam_hit::RfamHit,
     so_tree,
     so_tree::SoMapping,
+    go_flow_annotations::GoFlowLLMAnnotation,
 };
 
 #[derive(Debug, Error)]
@@ -98,6 +99,7 @@ pub enum FileTypes {
     PublicationCount,
     LitsummSummaries,
     EditingEvents,
+    GoFlowLLMAnnotations,
     SoTermTree,
 }
 
@@ -116,6 +118,7 @@ pub struct FileJoiner<'de> {
     rfam_hits: StreamDeserializer<'de, IoRead<BufReader<File>>, Grouped<RfamHit>>,
     publication_counts: StreamDeserializer<'de, IoRead<BufReader<File>>, Grouped<PublicationCount>>,
     lit_summ: StreamDeserializer<'de, IoRead<BufReader<File>>, Grouped<LitsummSummaries>>,
+    go_flow_llm_annotations: StreamDeserializer<'de, IoRead<BufReader<File>>, Grouped<GoFlowLLMAnnotation>>,
     editing_events: StreamDeserializer<'de, IoRead<BufReader<File>>, Grouped<EditingEvent>>,
     so_info: SoMapping,
 }
@@ -203,6 +206,7 @@ impl FileJoinerBuilder {
         let publication_counts = self.iterator_for(FileTypes::PublicationCount)?;
         let lit_summ = self.iterator_for(FileTypes::LitsummSummaries)?;
         let editing_events = self.iterator_for(FileTypes::EditingEvents)?;
+        let go_flow_llm_annotations = self.iterator_for(FileTypes::GoFlowLLMAnnotations)?;
         let so_info = so_tree::load(self.path_for(FileTypes::SoTermTree)?)?;
 
         Ok(FileJoiner {
@@ -220,6 +224,7 @@ impl FileJoinerBuilder {
             publication_counts,
             lit_summ,
             editing_events,
+            go_flow_llm_annotations,
             so_info,
         })
     }
@@ -244,6 +249,7 @@ impl<'de> Iterator for FileJoiner<'de> {
             self.publication_counts.next(),
             self.lit_summ.next(),
             self.editing_events.next(),
+            self.go_flow_llm_annotations.next(),
         );
 
         match current {
@@ -262,6 +268,7 @@ impl<'de> Iterator for FileJoiner<'de> {
                 None,
                 None,
                 None,
+                None,
             ) => None,
             (
                 Some(Ok(Required {
@@ -320,6 +327,10 @@ impl<'de> Iterator for FileJoiner<'de> {
                     id: id14,
                     data: editing_events,
                 })),
+                Some(Ok(Multiple {
+                    id: id15,
+                    data: goflow_llm_annotations,
+                })),
             ) => {
                 if id1 != id2
                     || id1 != id3
@@ -334,9 +345,10 @@ impl<'de> Iterator for FileJoiner<'de> {
                     || id1 != id12
                     || id1 != id13
                     || id1 != id14
+                    || id1 != id15
                 {
                     return Some(Err(Error::OutofSyncData(vec![
-                        id1, id2, id3, id4, id5, id6, id7, id8, id9, id10, id11, id12, id13, id14,
+                        id1, id2, id3, id4, id5, id6, id7, id8, id9, id10, id11, id12, id13, id14, id15
                     ])));
                 }
 
@@ -362,6 +374,7 @@ impl<'de> Iterator for FileJoiner<'de> {
                     .publication_counts(publication_counts)
                     .litsumm_summaries(lit_summ)
                     .editing_events(editing_events)
+                    .go_flow_llm_annotations(goflow_llm_annotations)
                     .so_tree(so_tree)
                     .build();
 
diff --git a/utils/search-export/src/sequences/go_flow_annotations.rs b/utils/search-export/src/sequences/go_flow_annotations.rs
new file mode 100644
index 000000000..04e73b525
--- /dev/null
+++ b/utils/search-export/src/sequences/go_flow_annotations.rs
@@ -0,0 +1,34 @@
+use serde::{
+    Deserialize,
+    Serialize,
+};
+use std::path::Path;
+
+use anyhow::Result;
+use rnc_core::grouper;
+
+#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
+pub struct GoFlowLLMAnnotation {
+    pub id: usize,
+    urs_taxid: String,
+    should_show_goflow: bool,
+}
+
+impl grouper::HasIndex for GoFlowLLMAnnotation {
+    fn index(&self) -> usize {
+        self.id
+    }
+}
+
+pub fn group(path: &Path, max: usize, output: &Path) -> Result<()> {
+    grouper::group::<GoFlowLLMAnnotation>(grouper::Criteria::AnyNumber, &path, 1, max, &output)
+}
+
+impl GoFlowLLMAnnotation {
+    pub fn should_show_goflow(&self) -> bool {
+        self.should_show_goflow
+    }
+    pub fn urs_taxid(&self) -> &str {
+        &self.urs_taxid
+    }
+}
diff --git a/utils/search-export/src/sequences/mod.rs b/utils/search-export/src/sequences/mod.rs
index b3febfd50..aca016443 100644
--- a/utils/search-export/src/sequences/mod.rs
+++ b/utils/search-export/src/sequences/mod.rs
@@ -16,5 +16,6 @@ pub mod qa_status;
 pub mod r2dt;
 pub mod raw;
 pub mod rfam_hit;
+pub mod go_flow_annotations;
 pub mod so_tree;
 pub mod writers;
diff --git a/utils/search-export/src/sequences/normalized.rs b/utils/search-export/src/sequences/normalized.rs
index c53f334f6..09f6a718e 100644
--- a/utils/search-export/src/sequences/normalized.rs
+++ b/utils/search-export/src/sequences/normalized.rs
@@ -37,6 +37,7 @@ use crate::sequences::{
     r2dt::R2dt,
     raw::Raw,
     rfam_hit::RfamHitVec,
+    go_flow_annotations::GoFlowLLMAnnotation,
     so_tree,
 };
 
@@ -69,6 +70,7 @@ pub struct Normalized {
     publication_count: usize,
     litsumm: Vec<LitsummSummaries>,
     editing_events: Vec<EditingEvent>,
+    go_flow_llm_annotations: Vec<GoFlowLLMAnnotation>,
     so_rna_type_tree: so_tree::SoTree,
 
     #[serde(flatten)]
@@ -129,6 +131,7 @@ impl Normalized {
             rfam_hits: raw.rfam_hits().to_owned().into_iter().collect(),
             orfs: raw.orfs().to_vec().into_iter().collect(),
             litsumm: raw.litsumm_summaries().to_vec(),
+            go_flow_llm_annotations: raw.go_flow_llm_annotations().to_vec(),
             editing_events: raw.editing_events().to_vec(),
         })
     }
diff --git a/utils/search-export/src/sequences/raw.rs b/utils/search-export/src/sequences/raw.rs
index 4daf502d0..d4bcc94dc 100644
--- a/utils/search-export/src/sequences/raw.rs
+++ b/utils/search-export/src/sequences/raw.rs
@@ -24,6 +24,7 @@ use crate::sequences::{
     qa_status::QaStatus,
     r2dt::R2dt,
     rfam_hit::RfamHit,
+    go_flow_annotations::GoFlowLLMAnnotation,
     so_tree,
 };
 
@@ -46,6 +47,7 @@ pub struct Raw {
     publication_counts: Option<PublicationCount>,
     litsumm_summaries: Vec<LitsummSummaries>,
     editing_events: Vec<EditingEvent>,
+    go_flow_llm_annotations: Vec<GoFlowLLMAnnotation>,
     so_tree: so_tree::SoTree,
 }
 
@@ -148,6 +150,11 @@ impl Raw {
         &self.editing_events
     }
 
+    /// Get a reference to the raw's GoFlowLLM annotations.
+    pub fn go_flow_llm_annotations(&self) -> &[GoFlowLLMAnnotation] {
+        &self.go_flow_llm_annotations
+    }
+
     /// Get this raw's publication count.
     pub fn publication_count(&self) -> usize {
         self.publication_counts.as_ref().map(|p| p.publication_count()).unwrap_or(0)

From 53f2fefbf2d8f7fae00cd998194bb18b6a8b9340 Mon Sep 17 00:00:00 2001
From: Andrew Green <agreen@ebi.ac.uk>
Date: Mon, 13 Oct 2025 14:43:57 +0100
Subject: [PATCH 02/22] Python bits of the goflowllm search index export

---
 rnacentral_pipeline/rnacentral/search_export/data.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/rnacentral_pipeline/rnacentral/search_export/data.py b/rnacentral_pipeline/rnacentral/search_export/data.py
index bfad7c2be..b6e8e1cdf 100644
--- a/rnacentral_pipeline/rnacentral/search_export/data.py
+++ b/rnacentral_pipeline/rnacentral/search_export/data.py
@@ -708,6 +708,9 @@ def has_publications(counts):
 def has_litsumm(litsumm):
     return str(bool(litsumm))
 
+def has_go_flow_llm_annotation(go_flow):
+    return str(bool(go_flow))
+
 
 def has_editing_event(editing_events):
     return str(bool(editing_events))
@@ -881,6 +884,7 @@ def edit_ref_to_edit(editing_events):
                     edit_repeat_type,
                     keys="editing_events",
                 ),
+                field("has_go_flow_llm_annotation", has_go_flow_llm_annotation, keys="goflow"),
                 ## Add new fields above this line! Otherwise editing the produced xml is hard.
                 tree("so_rna_type", so_rna_type_tree, key="so_rna_type_tree"),
             ],

From ffa4e4268aaeb4300ba15b1a06f9bf0c84fe7a81 Mon Sep 17 00:00:00 2001
From: Andrew Green <agreen@ebi.ac.uk>
Date: Mon, 13 Oct 2025 14:56:11 +0100
Subject: [PATCH 03/22] Add necessary sql and nextflow bits

---
 files/search-export/parts/goflow.sql      | 13 +++++++++++++
 workflows/export/text-search/sequences.nf | 16 ++++++++++++++++
 2 files changed, 29 insertions(+)
 create mode 100644 files/search-export/parts/goflow.sql

diff --git a/files/search-export/parts/goflow.sql b/files/search-export/parts/goflow.sql
new file mode 100644
index 000000000..9999d8fd4
--- /dev/null
+++ b/files/search-export/parts/goflow.sql
@@ -0,0 +1,13 @@
+COPY (
+  SELECT
+    json_build_object(
+      'id', todo.id,
+      'urs_taxid', todo.urs_taxid,
+      'should_show_goflow', true
+    )
+    FROM search_export_urs todo
+    JOIN go_flow_llm_curation_results gfllm
+    ON
+      todo.urs_taxid = gfllm.urs_taxid
+    ORDER by todo.id
+) TO STDOUT
diff --git a/workflows/export/text-search/sequences.nf b/workflows/export/text-search/sequences.nf
index b1611b09b..9b805364f 100755
--- a/workflows/export/text-search/sequences.nf
+++ b/workflows/export/text-search/sequences.nf
@@ -141,6 +141,20 @@ process litsumm_summaries {
   """
 }
 
+process litsumm_summaries {
+  input:
+  val(max_count)
+  path (query)
+
+  output:
+  path("goflow_annotations.json")
+
+  """
+  psql -v ON_ERROR_STOP=1 -f "$query" "$PGDATABASE" > raw.json
+  search-export group go-flow-annotation raw.json ${max_count} goflow_annotations.json
+  """
+}
+
 process editing_events {
   input:
   val(max_count)
@@ -201,6 +215,7 @@ workflow sequences {
     Channel.fromPath('files/search-export/parts/text-mining.sql') | set { text_sql }
     Channel.fromPath('files/search-export/parts/litsumm.sql') | set { litsumm_sql }
     Channel.fromPath('files/search-export/parts/editing-events.sql') | set { editing_events_sql }
+    Channel.fromPath('files/search-export/parts/goflow.sql') | set { goflow_sql }
     Channel.fromPath('files/search-export/so-rna-types.sql') | set { so_sql }
 
     Channel.fromPath('files/search-export/parts/accessions.sql') | set { accessions_sql }
@@ -230,6 +245,7 @@ workflow sequences {
       text_mining_query(search_count, text_sql),
       litsumm_summaries(search_count, litsumm_sql),
       editing_events(search_count, editing_events_sql),
+      go_flow_annotations(search_count, goflow_sql),
       fetch_so_tree(so_sql),
     )\
     | set { metadata }

From ca3ee0968e8f85e6f6f2c69869a3dbc913f90bf7 Mon Sep 17 00:00:00 2001
From: Andrew Green <agreen@ebi.ac.uk>
Date: Tue, 14 Oct 2025 10:48:39 +0100
Subject: [PATCH 04/22] Update rnacentral_pipeline/cli/r2dt.py

Improve the docstring for the r2dt CLI

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
---
 rnacentral_pipeline/cli/r2dt.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/rnacentral_pipeline/cli/r2dt.py b/rnacentral_pipeline/cli/r2dt.py
index 604be43dd..14d8f160b 100644
--- a/rnacentral_pipeline/cli/r2dt.py
+++ b/rnacentral_pipeline/cli/r2dt.py
@@ -262,11 +262,11 @@ def r2dt_prepare_s3(model_info, directory, output, file_list, allow_missing):
 @click.option("--max_sequences", default=-1)
 def r2dt_prepare_sequences(xref_urs, tracked_urs, urs_to_fetch, max_sequences):
     """
-    Prepare the sequences extracted from RNAcentral
+    Prepare a list of URS identifiers to fetch sequences for.
 
-    This means we will load and deduplicate the json file before rewriting
-    a json file containing only the requested number of sequences.
-
-    The default will be to write out all sequences
+    This takes a file of all URS identifiers from cross-references and a file
+    of already tracked URS identifiers. It produces a file of URS identifiers
+    that are in the xref file but not in the tracked file. This can be limited
+    to a maximum number of sequences.
     """
     r2dt.prepare_sequences(xref_urs, tracked_urs, urs_to_fetch, max_sequences)

From bbb78e20ae8cc576006a3c842d48195c5ee6cdd4 Mon Sep 17 00:00:00 2001
From: Andrew Green <agreen@ebi.ac.uk>
Date: Tue, 14 Oct 2025 10:49:14 +0100
Subject: [PATCH 05/22] Update
 rnacentral_pipeline/databases/ensembl/genomes/urls.py

Remove unused releases parameter

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
---
 rnacentral_pipeline/databases/ensembl/genomes/urls.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rnacentral_pipeline/databases/ensembl/genomes/urls.py b/rnacentral_pipeline/databases/ensembl/genomes/urls.py
index af3e6277c..9a56c84a2 100644
--- a/rnacentral_pipeline/databases/ensembl/genomes/urls.py
+++ b/rnacentral_pipeline/databases/ensembl/genomes/urls.py
@@ -30,7 +30,7 @@ def list_releases(ftp: FTP) -> ty.List[str]:
     return [f for f in ftp.nlst() if f.startswith("release-")]
 
 
-def latest_release(releases: ty.List[str], ftp: FTP) -> str:
+def latest_release(ftp: FTP) -> str:
     ## Parse the readme for the current release to avoid getting a half baked release
     readme_lines = []
     ftp.retrlines("RETR current_README", readme_lines.append)

From c40cef0c3705bc72f622f6b12c989af9c016ff21 Mon Sep 17 00:00:00 2001
From: Andrew Green <agreen@ebi.ac.uk>
Date: Tue, 14 Oct 2025 10:55:04 +0100
Subject: [PATCH 06/22] Remove unused fetching of ensembl latest release from
 parsing release names

---
 .../databases/ensembl/vertebrates/urls.py                | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/rnacentral_pipeline/databases/ensembl/vertebrates/urls.py b/rnacentral_pipeline/databases/ensembl/vertebrates/urls.py
index b6c757de6..d9bca58e3 100644
--- a/rnacentral_pipeline/databases/ensembl/vertebrates/urls.py
+++ b/rnacentral_pipeline/databases/ensembl/vertebrates/urls.py
@@ -23,18 +23,14 @@
 from rnacentral_pipeline.databases.ensembl.data import Division, FtpInfo
 
 
-def list_releases(ftp: FTP) -> ty.List[str]:
-    return [f for f in ftp.nlst() if f.startswith("release-")]
 
-
-def latest_release(releases: ty.List[str], ftp: FTP) -> str:
+def latest_release(ftp: FTP) -> str:
     ## Parse the readme for the current release to avoid getting a half baked release
     readme_lines = []
     ftp.retrlines("RETR current_README", readme_lines.append)
     cur_readme = "\n".join(readme_lines)
     pattern = r"Ensembl Release (\d+) Databases."
     release = re.search(pattern, cur_readme).group(1)
-    print(f"Ensembl release {release}")
     return f"release-{release}"
 
 
@@ -71,7 +67,6 @@ def urls_for(host: str) -> ty.Iterable[FtpInfo]:
     with FTP(host) as ftp:
         ftp.login()
         ftp.cwd("pub")
-        releases = list_releases(ftp)
-        latest = latest_release(releases, ftp)
+        latest = latest_release(ftp)
         with species_info(ftp, latest) as info:
             yield from generate_paths(f"ftp://{host}/pub", latest, info)

From 3e818d656619be6dde944e9572b1b29fc1b6f4ae Mon Sep 17 00:00:00 2001
From: Andrew Green <agreen@ebi.ac.uk>
Date: Tue, 14 Oct 2025 10:57:01 +0100
Subject: [PATCH 07/22] Remove some debugging print statements

---
 rnacentral_pipeline/rnacentral/genome_mapping/urls.py | 4 ----
 rnacentral_pipeline/rnacentral/r2dt/__init__.py       | 3 +--
 2 files changed, 1 insertion(+), 6 deletions(-)

diff --git a/rnacentral_pipeline/rnacentral/genome_mapping/urls.py b/rnacentral_pipeline/rnacentral/genome_mapping/urls.py
index f51bfe0f8..536673507 100644
--- a/rnacentral_pipeline/rnacentral/genome_mapping/urls.py
+++ b/rnacentral_pipeline/rnacentral/genome_mapping/urls.py
@@ -131,10 +131,6 @@ def toplevel_file(
     toplevel = base.format(type="toplevel")
     base_result = f"ftp://{host}{directory}/{{file}}"
 
-    print(primary)
-    print(toplevel)
-    print(files)
-
     if primary in files:
         return base_result.format(file=primary)
     elif toplevel in files:
diff --git a/rnacentral_pipeline/rnacentral/r2dt/__init__.py b/rnacentral_pipeline/rnacentral/r2dt/__init__.py
index 7f30ddfc5..190f00c98 100644
--- a/rnacentral_pipeline/rnacentral/r2dt/__init__.py
+++ b/rnacentral_pipeline/rnacentral/r2dt/__init__.py
@@ -153,7 +153,6 @@ def write_inspect_data(handle: ty.IO, db_url: str, output: ty.IO):
 
 
 def prepare_sequences(xref_urs, tracked_urs, urs_to_fetch, max_sequences):
-    print(urs_to_fetch.name)
     raw_xref = (
         pl.scan_csv(xref_urs.name, has_header=False, low_memory=True)
         .unique()
@@ -162,7 +161,7 @@ def prepare_sequences(xref_urs, tracked_urs, urs_to_fetch, max_sequences):
 
     raw_tracked = pl.scan_csv(
         tracked_urs.name, low_memory=True
-    ).unique()  ## May not need to be uniqued?
+    ).unique() 
 
     to_fetch = raw_xref.join(raw_tracked, on="urs", how="anti")
 

From a4901b7ca3913fa2cbc8961f3b47f43fa0b73739 Mon Sep 17 00:00:00 2001
From: Andrew Green <agreen@ebi.ac.uk>
Date: Sat, 18 Oct 2025 10:30:00 +0100
Subject: [PATCH 08/22] Update
 rnacentral_pipeline/databases/ensembl/genomes/urls.py

Raise value error when no match for release number found

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
---
 rnacentral_pipeline/databases/ensembl/genomes/urls.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/rnacentral_pipeline/databases/ensembl/genomes/urls.py b/rnacentral_pipeline/databases/ensembl/genomes/urls.py
index 9a56c84a2..44cc2e4fa 100644
--- a/rnacentral_pipeline/databases/ensembl/genomes/urls.py
+++ b/rnacentral_pipeline/databases/ensembl/genomes/urls.py
@@ -36,7 +36,10 @@ def latest_release(ftp: FTP) -> str:
     ftp.retrlines("RETR current_README", readme_lines.append)
     cur_readme = "\n".join(readme_lines)
     pattern = r"Ensembl Release (\d+) Databases."
-    release = re.search(pattern, cur_readme).group(1)
+    match = re.search(pattern, cur_readme)
+    if not match:
+        raise ValueError("Could not find release number in README")
+    release = match.group(1)
     print(f"Ensembl release {release}")
     return f"release-{release}"
 

From 0efdf8d2c0fd5694ad99e22590ef24751b7c16ef Mon Sep 17 00:00:00 2001
From: Andrew Green <agreen@ebi.ac.uk>
Date: Sat, 18 Oct 2025 10:35:03 +0100
Subject: [PATCH 09/22] Remove some dead code relating to finding the ensembl
 release number

---
 rnacentral_pipeline/databases/ensembl/genomes/urls.py | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/rnacentral_pipeline/databases/ensembl/genomes/urls.py b/rnacentral_pipeline/databases/ensembl/genomes/urls.py
index 44cc2e4fa..202ac5209 100644
--- a/rnacentral_pipeline/databases/ensembl/genomes/urls.py
+++ b/rnacentral_pipeline/databases/ensembl/genomes/urls.py
@@ -25,11 +25,6 @@
 
 LOGGER = logging.getLogger(__name__)
 
-
-def list_releases(ftp: FTP) -> ty.List[str]:
-    return [f for f in ftp.nlst() if f.startswith("release-")]
-
-
 def latest_release(ftp: FTP) -> str:
     ## Parse the readme for the current release to avoid getting a half baked release
     readme_lines = []
@@ -98,8 +93,7 @@ def urls_for(division: Division, host: str) -> ty.Iterable[FtpInfo]:
         ftp.login()
         print("LOGIN")
         ftp.cwd(f"pub/{division.name}/")
-        releases = list_releases(ftp)
-        latest = latest_release(releases, ftp)
+        latest = latest_release(ftp)
         with species_info(ftp, division, latest) as info:
             url_base = f"ftp://{host}/pub/{division.name}"
             yield from generate_paths(ftp, division, url_base, latest, info)

From f30beadf2343ccae7505bfd5b0a4f56b489e5630 Mon Sep 17 00:00:00 2001
From: Andrew Green <agreen@ebi.ac.uk>
Date: Sat, 18 Oct 2025 10:36:32 +0100
Subject: [PATCH 10/22] Reinstate conditionals for running r2dt

---
 workflows/r2dt.nf | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/workflows/r2dt.nf b/workflows/r2dt.nf
index a0561b879..896b60987 100644
--- a/workflows/r2dt.nf
+++ b/workflows/r2dt.nf
@@ -37,7 +37,7 @@ process get_partitions {
 
 
 process fetch_xrefs {
-  // when { params.r2dt.run }
+  when { params.r2dt.run }
 
   input:
   tuple val(partition), path(query)
@@ -56,7 +56,7 @@ process fetch_xrefs {
 }
 
 process fetch_tracked {
-  // when { params.r2dt.run }
+  when { params.r2dt.run }
 
   input:
   tuple val(_flag)
@@ -76,7 +76,7 @@ process fetch_tracked {
 
 
 process extract_sequences {
-  // when { params.r2dt.run }
+  when { params.r2dt.run }
 
   memory '12GB'
 

From 0c2d9449901710bd3b25f239eebae4d53dad8693 Mon Sep 17 00:00:00 2001
From: Andrew Green <agreen@ebi.ac.uk>
Date: Sat, 18 Oct 2025 10:47:14 +0100
Subject: [PATCH 11/22] Fix other instances of release matching not being
 defensive about no matches

---
 rnacentral_pipeline/databases/ensembl/vertebrates/urls.py | 5 ++++-
 rnacentral_pipeline/rnacentral/genome_mapping/urls.py     | 5 ++++-
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/rnacentral_pipeline/databases/ensembl/vertebrates/urls.py b/rnacentral_pipeline/databases/ensembl/vertebrates/urls.py
index d9bca58e3..e338038f2 100644
--- a/rnacentral_pipeline/databases/ensembl/vertebrates/urls.py
+++ b/rnacentral_pipeline/databases/ensembl/vertebrates/urls.py
@@ -30,7 +30,10 @@ def latest_release(ftp: FTP) -> str:
     ftp.retrlines("RETR current_README", readme_lines.append)
     cur_readme = "\n".join(readme_lines)
     pattern = r"Ensembl Release (\d+) Databases."
-    release = re.search(pattern, cur_readme).group(1)
+    match = re.search(pattern, cur_readme)
+    if not match:
+        raise ValueError("Could not determine latest Ensembl release from README")
+    release = match.group(1)
     return f"release-{release}"
 
 
diff --git a/rnacentral_pipeline/rnacentral/genome_mapping/urls.py b/rnacentral_pipeline/rnacentral/genome_mapping/urls.py
index 536673507..213abb6bf 100644
--- a/rnacentral_pipeline/rnacentral/genome_mapping/urls.py
+++ b/rnacentral_pipeline/rnacentral/genome_mapping/urls.py
@@ -164,7 +164,10 @@ def url_for(species: str, assembly_id: str, kind: str, host: str, soft_masked=Fa
                 conn.retrlines("RETR current_README", readme_lines.append)
                 cur_readme = "\n".join(readme_lines)
                 pattern = r"[Cc]urrent release is (?:Ensembl )?Genomes\s*(\d+)"
-                release = re.search(pattern, cur_readme).group(1)
+                match = re.search(pattern, cur_readme)
+                if not match:
+                    raise ValueError("Could not determine latest Ensembl release from README")
+                release = match.group(1)
 
                 for path in host.paths(species, kind):
                     try:

From 96d37fdcd5354b169e15b880751fabcd10b19c1b Mon Sep 17 00:00:00 2001
From: Andrew Green <agreen@ebi.ac.uk>
Date: Sat, 18 Oct 2025 11:09:45 +0100
Subject: [PATCH 12/22] Update rnacentral_pipeline/rnacentral/r2dt/parser.py

f-string is the recommended way for exceptions, c-style for logging

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
---
 rnacentral_pipeline/rnacentral/r2dt/parser.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rnacentral_pipeline/rnacentral/r2dt/parser.py b/rnacentral_pipeline/rnacentral/r2dt/parser.py
index 5f6426720..7d287430e 100644
--- a/rnacentral_pipeline/rnacentral/r2dt/parser.py
+++ b/rnacentral_pipeline/rnacentral/r2dt/parser.py
@@ -109,7 +109,7 @@ def parse(
                 old_model_name = model_name
                 model_name = temp_model_name_lookup.get(model_name, None)
                 if model_name is None:
-                    raise ValueError("No info for model %s", old_model_name)
+                    raise ValueError(f"No info for model {old_model_name}")
 
             minfo = model_info[model_name]
             info = data.R2DTResultInfo(urs, minfo, source, result_base)

From 29dc710f35bde93c3e0b67a97939168c17b5f8a4 Mon Sep 17 00:00:00 2001
From: Andrew Green <agreen@ebi.ac.uk>
Date: Sat, 25 Oct 2025 08:38:08 +0100
Subject: [PATCH 13/22] Link up go flow search export processes properly

---
 workflows/export/text-search/sequences.nf | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/workflows/export/text-search/sequences.nf b/workflows/export/text-search/sequences.nf
index 9b805364f..86f812bbc 100755
--- a/workflows/export/text-search/sequences.nf
+++ b/workflows/export/text-search/sequences.nf
@@ -65,6 +65,7 @@ process build_metadata {
   path(text)
   path(litsumm)
   path(editing_events)
+  path(go_flow_annotations)
   path(so_tree)
 
   output:
@@ -141,7 +142,7 @@ process litsumm_summaries {
   """
 }
 
-process litsumm_summaries {
+process go_flow_annotations {
   input:
   val(max_count)
   path (query)

From 6332d64069fe147001a20f6ebbb10f2897f6ed9f Mon Sep 17 00:00:00 2001
From: Andrew Green <agreen@ebi.ac.uk>
Date: Mon, 27 Oct 2025 09:25:59 +0000
Subject: [PATCH 14/22] Use less stringent regex in ensembl release detection

---
 rnacentral_pipeline/databases/ensembl/genomes/urls.py     | 4 ++--
 rnacentral_pipeline/databases/ensembl/vertebrates/urls.py | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/rnacentral_pipeline/databases/ensembl/genomes/urls.py b/rnacentral_pipeline/databases/ensembl/genomes/urls.py
index 202ac5209..c9f35997b 100644
--- a/rnacentral_pipeline/databases/ensembl/genomes/urls.py
+++ b/rnacentral_pipeline/databases/ensembl/genomes/urls.py
@@ -30,8 +30,8 @@ def latest_release(ftp: FTP) -> str:
     readme_lines = []
     ftp.retrlines("RETR current_README", readme_lines.append)
     cur_readme = "\n".join(readme_lines)
-    pattern = r"Ensembl Release (\d+) Databases."
-    match = re.search(pattern, cur_readme)
+    pattern = r"Ensembl Release (\d+) Databases\."
+    match = re.search(pattern, cur_readme, re.IGNORECASE)
     if not match:
         raise ValueError("Could not find release number in README")
     release = match.group(1)
diff --git a/rnacentral_pipeline/databases/ensembl/vertebrates/urls.py b/rnacentral_pipeline/databases/ensembl/vertebrates/urls.py
index e338038f2..5f624d538 100644
--- a/rnacentral_pipeline/databases/ensembl/vertebrates/urls.py
+++ b/rnacentral_pipeline/databases/ensembl/vertebrates/urls.py
@@ -29,8 +29,8 @@ def latest_release(ftp: FTP) -> str:
     readme_lines = []
     ftp.retrlines("RETR current_README", readme_lines.append)
     cur_readme = "\n".join(readme_lines)
-    pattern = r"Ensembl Release (\d+) Databases."
-    match = re.search(pattern, cur_readme)
+    pattern = r"Ensembl Release (\d+) Databases\."
+    match = re.search(pattern, cur_readme, re.IGNORECASE)
     if not match:
         raise ValueError("Could not determine latest Ensembl release from README")
     release = match.group(1)

From cd6e07dbbd5c20e6896829b3a9e6d37fedb8172a Mon Sep 17 00:00:00 2001
From: Andrew Green <agreen@ebi.ac.uk>
Date: Mon, 27 Oct 2025 09:27:05 +0000
Subject: [PATCH 15/22] Remove some trailing whitespace

---
 rnacentral_pipeline/rnacentral/r2dt/__init__.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/rnacentral_pipeline/rnacentral/r2dt/__init__.py b/rnacentral_pipeline/rnacentral/r2dt/__init__.py
index 190f00c98..3d74880b3 100644
--- a/rnacentral_pipeline/rnacentral/r2dt/__init__.py
+++ b/rnacentral_pipeline/rnacentral/r2dt/__init__.py
@@ -159,9 +159,7 @@ def prepare_sequences(xref_urs, tracked_urs, urs_to_fetch, max_sequences):
         .rename({"column_1": "urs"})
     )
 
-    raw_tracked = pl.scan_csv(
-        tracked_urs.name, low_memory=True
-    ).unique() 
+    raw_tracked = pl.scan_csv(tracked_urs.name, low_memory=True).unique()
 
     to_fetch = raw_xref.join(raw_tracked, on="urs", how="anti")
 

From fa07512d534f1dd06a4462015609ccdc71c2752f Mon Sep 17 00:00:00 2001
From: Andrew Green <agreen@ebi.ac.uk>
Date: Mon, 27 Oct 2025 09:28:29 +0000
Subject: [PATCH 16/22] Raise value errors instead of relying on assertions

---
 rnacentral_pipeline/rnacentral/r2dt/data.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/rnacentral_pipeline/rnacentral/r2dt/data.py b/rnacentral_pipeline/rnacentral/r2dt/data.py
index dcb47486d..ed204163c 100644
--- a/rnacentral_pipeline/rnacentral/r2dt/data.py
+++ b/rnacentral_pipeline/rnacentral/r2dt/data.py
@@ -409,11 +409,15 @@ def dot_bracket(self):
             seq_dot = str(record.seq)
             ## Use indices instead, assert that the string is even length
             ## If not, then the two parts are not the same length
-            assert len(seq_dot) % 2 == 0, f"Odd length sequence {len(seq_dot)}"
+            if len(seq_dot) % 2 != 0:
+                raise ValueError(f"Odd length sequence {len(seq_dot)}")
             seq_dot_len = len(seq_dot)
             sequence = seq_dot[0 : seq_dot_len // 2]
             dot_bracket = seq_dot[(seq_dot_len // 2) :]
-            assert len(sequence) == len(dot_bracket)
+            if len(sequence) != len(dot_bracket):
+                raise ValueError(
+                    f"Sequence and dot bracket lengths do not match: {len(sequence)} != {len(dot_bracket)}"
+                )
             return dot_bracket
 
     def basepair_count(self):

From f00a676df894b43b657ba4e86510ccc83d6be786 Mon Sep 17 00:00:00 2001
From: Andrew Green <agreen@ebi.ac.uk>
Date: Mon, 27 Oct 2025 14:36:56 +0000
Subject: [PATCH 17/22] Fix not passing goflow data to merging process
 correctly

---
 workflows/export/text-search/sequences.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/workflows/export/text-search/sequences.nf b/workflows/export/text-search/sequences.nf
index 86f812bbc..118eca7cd 100755
--- a/workflows/export/text-search/sequences.nf
+++ b/workflows/export/text-search/sequences.nf
@@ -72,7 +72,7 @@ process build_metadata {
   path("merged.json")
 
   """
-  search-export sequences merge $base $crs $feeback $go $prot $rnas $precompute $qa $r2dt $rfam $orf $text $so_tree $litsumm $editing_events merged.json
+  search-export sequences merge $base $crs $feeback $go $prot $rnas $precompute $qa $r2dt $rfam $orf $text $so_tree $litsumm $editing_events $go_flow_annotations merged.json
   """
 }
 

From acdba3cfe08b1081ba4b8ceb51ce51b2c794fcc9 Mon Sep 17 00:00:00 2001
From: Andrew Green <agreen@ebi.ac.uk>
Date: Mon, 27 Oct 2025 14:42:07 +0000
Subject: [PATCH 18/22] Update expected key from search export rust code

---
 rnacentral_pipeline/rnacentral/search_export/data.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/rnacentral_pipeline/rnacentral/search_export/data.py b/rnacentral_pipeline/rnacentral/search_export/data.py
index b6e8e1cdf..b68e07fc7 100644
--- a/rnacentral_pipeline/rnacentral/search_export/data.py
+++ b/rnacentral_pipeline/rnacentral/search_export/data.py
@@ -708,6 +708,7 @@ def has_publications(counts):
 def has_litsumm(litsumm):
     return str(bool(litsumm))
 
+
 def has_go_flow_llm_annotation(go_flow):
     return str(bool(go_flow))
 
@@ -884,7 +885,11 @@ def edit_ref_to_edit(editing_events):
                     edit_repeat_type,
                     keys="editing_events",
                 ),
-                field("has_go_flow_llm_annotation", has_go_flow_llm_annotation, keys="goflow"),
+                field(
+                    "has_go_flow_llm_annotation",
+                    has_go_flow_llm_annotation,
+                    keys="go_flow_llm_annotations",
+                ),
                 ## Add new fields above this line! Otherwise editing the produced xml is hard.
                 tree("so_rna_type", so_rna_type_tree, key="so_rna_type_tree"),
             ],

From fa64872732d47ba0c398d4306aac7e55df5b006f Mon Sep 17 00:00:00 2001
From: Andrew Green <agreen@ebi.ac.uk>
Date: Mon, 27 Oct 2025 14:53:12 +0000
Subject: [PATCH 19/22] Rust side CLI requires kebab-case filename to match
 file type enum, so make sure it does

---
 workflows/export/text-search/sequences.nf | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/workflows/export/text-search/sequences.nf b/workflows/export/text-search/sequences.nf
index 118eca7cd..3c4e65f5a 100755
--- a/workflows/export/text-search/sequences.nf
+++ b/workflows/export/text-search/sequences.nf
@@ -148,11 +148,11 @@ process go_flow_annotations {
   path (query)
 
   output:
-  path("goflow_annotations.json")
+  path("go-flow-llm-annotations.json")
 
   """
   psql -v ON_ERROR_STOP=1 -f "$query" "$PGDATABASE" > raw.json
-  search-export group go-flow-annotation raw.json ${max_count} goflow_annotations.json
+  search-export group go-flow-annotation raw.json ${max_count} go-flow-llm-annotations.json
   """
 }
 

From 77583acbe6d881d9ec58b2f398e42d6b7573ac98 Mon Sep 17 00:00:00 2001
From: Andrew Green <agreen@ebi.ac.uk>
Date: Mon, 27 Oct 2025 14:58:54 +0000
Subject: [PATCH 20/22] Improve documentation comment in sequence raw handler

---
 .pre-commit-config.yaml                  | 12 ++++++------
 utils/search-export/src/sequences/raw.rs |  4 ++--
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index d7b62a797..72e33e87c 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -16,12 +16,12 @@ repos:
     -   id: isort
         args: ["--profile", "black", "--filter-files"]
         name: isort (python)
--   repo: https://github.com/doublify/pre-commit-rust
-    rev: v1.0
-    hooks:
-    -   id: fmt
-    -   id: cargo-check
-    -   id: clippy
+# -   repo: https://github.com/doublify/pre-commit-rust
+#     rev: v1.0
+#     hooks:
+#     -   id: fmt
+#     -   id: cargo-check
+#     -   id: clippy
 # - repo: https://github.com/python-poetry/poetry
 #   rev: '1.2.2'
 #   hooks:
diff --git a/utils/search-export/src/sequences/raw.rs b/utils/search-export/src/sequences/raw.rs
index d4bcc94dc..52bea2f9f 100644
--- a/utils/search-export/src/sequences/raw.rs
+++ b/utils/search-export/src/sequences/raw.rs
@@ -16,6 +16,7 @@ use crate::sequences::{
     editing_events::EditingEvent,
     feedback::Feedback,
     go_annotation::GoAnnotation,
+    go_flow_annotations::GoFlowLLMAnnotation,
     interacting_protein::InteractingProtein,
     interacting_rna::InteractingRna,
     litsumm::LitsummSummaries,
@@ -24,7 +25,6 @@ use crate::sequences::{
     qa_status::QaStatus,
     r2dt::R2dt,
     rfam_hit::RfamHit,
-    go_flow_annotations::GoFlowLLMAnnotation,
     so_tree,
 };
 
@@ -150,7 +150,7 @@ impl Raw {
         &self.editing_events
     }
 
-    /// Get a reference to the raw's editing events.
+    /// Get a reference to the raw's GoFlowLLM annotations.
     pub fn go_flow_llm_annotations(&self) -> &[GoFlowLLMAnnotation] {
         &self.go_flow_llm_annotations
     }

From ea94cdcd1aee2680e8a611d54c0144d416cf5012 Mon Sep 17 00:00:00 2001
From: Andrew Green <agreen@ebi.ac.uk>
Date: Tue, 18 Nov 2025 09:32:26 +0000
Subject: [PATCH 21/22] Cargo clippy fixes

---
 utils/bed-expander/src/main.rs                | 33 ++++++++++++++-----
 utils/search-export/src/main.rs               |  4 ++-
 .../src/sequences/file_joiner.rs              |  8 +++--
 .../src/sequences/go_flow_annotations.rs      |  1 +
 utils/search-export/src/sequences/litsumm.rs  |  1 +
 utils/search-export/src/sequences/mod.rs      |  2 +-
 .../search-export/src/sequences/normalized.rs |  2 +-
 7 files changed, 36 insertions(+), 15 deletions(-)

diff --git a/utils/bed-expander/src/main.rs b/utils/bed-expander/src/main.rs
index d5d7d7025..329a5fe52 100644
--- a/utils/bed-expander/src/main.rs
+++ b/utils/bed-expander/src/main.rs
@@ -1,8 +1,13 @@
 use clap::Parser;
-use polars::lazy::dsl::col;
+use polars::{
+    datatypes::DataType::{
+        Int64,
+        List,
+    },
+    lazy::dsl::col,
+    prelude::*,
+};
 use std::fs;
-use polars::datatypes::DataType::{Int64, List};
-use polars::prelude::*;
 
 #[derive(Parser, Debug)]
 #[clap(author = "Andrew Green", version, about)]
@@ -30,19 +35,30 @@ fn main() -> Result<(), PolarsError> {
     // col 11: exon sizes
     // col 12: exon starts
 
-
     // Stay lazy as long as possible to minimise memory use
     let original_bed = original_bed
         .with_columns([col("column_11").str().split(","), col("column_12").str().split(",")])
-        .with_columns([col("column_11").cast(List(Box::new(Int64)))] )
+        .with_columns([col("column_11").cast(List(Box::new(Int64)))])
         .with_column(col("column_11").list().sum().alias("column_5"))
         .explode([col("column_11"), col("column_12")])
         .with_columns([col("column_11").cast(Int64), col("column_12").cast(Int64)])
         .with_column(col("column_2").alias("transcript_start"))
         .with_column(col("column_3").alias("transcript_end"))
-        .with_columns([(col("column_2") + col("column_12")).alias("column_2"), (col("column_2") + col("column_12") + col("column_11")).alias("column_3")] )
-        .select([col("column_1"), col("column_2"), col("column_3"), col("column_4"), col("column_5"), col("column_6"), col("transcript_start"), col("transcript_end")])
-        .sort_by_exprs(vec![col("column_1"), col("column_2")], vec![false,false], false, false);
+        .with_columns([
+            (col("column_2") + col("column_12")).alias("column_2"),
+            (col("column_2") + col("column_12") + col("column_11")).alias("column_3"),
+        ])
+        .select([
+            col("column_1"),
+            col("column_2"),
+            col("column_3"),
+            col("column_4"),
+            col("column_5"),
+            col("column_6"),
+            col("transcript_start"),
+            col("transcript_end"),
+        ])
+        .sort_by_exprs(vec![col("column_1"), col("column_2")], vec![false, false], false, false);
 
     let mut output_file = fs::File::create(&cli.output)?;
     let mut writer = CsvWriter::new(&mut output_file).has_header(false).with_delimiter(b'\t');
@@ -50,5 +66,4 @@ fn main() -> Result<(), PolarsError> {
     let mut expanded_bed = original_bed.collect()?;
 
     writer.finish(&mut expanded_bed)
-
 }
diff --git a/utils/search-export/src/main.rs b/utils/search-export/src/main.rs
index de852456b..354643f50 100644
--- a/utils/search-export/src/main.rs
+++ b/utils/search-export/src/main.rs
@@ -260,7 +260,9 @@ fn main() -> Result<()> {
             Groupable::EditingEvents => {
                 sequences::editing_events::group(&path, max_count, &output)?
             },
-            Groupable::GoFlowAnnotation => sequences::go_flow_annotations::group(&path, max_count, &output)?,
+            Groupable::GoFlowAnnotation => {
+                sequences::go_flow_annotations::group(&path, max_count, &output)?
+            },
         },
         Subcommand::Sequences {
             command,
diff --git a/utils/search-export/src/sequences/file_joiner.rs b/utils/search-export/src/sequences/file_joiner.rs
index 6903b6e1b..b60e33302 100644
--- a/utils/search-export/src/sequences/file_joiner.rs
+++ b/utils/search-export/src/sequences/file_joiner.rs
@@ -37,6 +37,7 @@ use super::{
     editing_events::EditingEvent,
     feedback::Feedback,
     go_annotation::GoAnnotation,
+    go_flow_annotations::GoFlowLLMAnnotation,
     interacting_protein::InteractingProtein,
     interacting_rna::InteractingRna,
     litsumm::LitsummSummaries,
@@ -49,7 +50,6 @@ use super::{
     rfam_hit::RfamHit,
     so_tree,
     so_tree::SoMapping,
-    go_flow_annotations::GoFlowLLMAnnotation,
 };
 
 #[derive(Debug, Error)]
@@ -118,7 +118,8 @@ pub struct FileJoiner<'de> {
     rfam_hits: StreamDeserializer<'de, IoRead<BufReader<File>>, Grouped<RfamHit>>,
     publication_counts: StreamDeserializer<'de, IoRead<BufReader<File>>, Grouped<PublicationCount>>,
     lit_summ: StreamDeserializer<'de, IoRead<BufReader<File>>, Grouped<LitsummSummaries>>,
-    go_flow_llm_annotations: StreamDeserializer<'de, IoRead<BufReader<File>>, Grouped<GoFlowLLMAnnotation>>,
+    go_flow_llm_annotations:
+        StreamDeserializer<'de, IoRead<BufReader<File>>, Grouped<GoFlowLLMAnnotation>>,
     editing_events: StreamDeserializer<'de, IoRead<BufReader<File>>, Grouped<EditingEvent>>,
     so_info: SoMapping,
 }
@@ -348,7 +349,8 @@ impl<'de> Iterator for FileJoiner<'de> {
                     || id1 != id15
                 {
                     return Some(Err(Error::OutofSyncData(vec![
-                        id1, id2, id3, id4, id5, id6, id7, id8, id9, id10, id11, id12, id13, id14, id15
+                        id1, id2, id3, id4, id5, id6, id7, id8, id9, id10, id11, id12, id13, id14,
+                        id15,
                     ])));
                 }
 
diff --git a/utils/search-export/src/sequences/go_flow_annotations.rs b/utils/search-export/src/sequences/go_flow_annotations.rs
index 04e73b525..5213ed13d 100644
--- a/utils/search-export/src/sequences/go_flow_annotations.rs
+++ b/utils/search-export/src/sequences/go_flow_annotations.rs
@@ -28,6 +28,7 @@ impl GoFlowLLMAnnotation {
     pub fn should_show_goflow(&self) -> bool {
         self.should_show_goflow
     }
+
     pub fn urs_taxid(&self) -> &str {
         &self.urs_taxid
     }
diff --git a/utils/search-export/src/sequences/litsumm.rs b/utils/search-export/src/sequences/litsumm.rs
index 7bbd0cada..1d80d8a4f 100644
--- a/utils/search-export/src/sequences/litsumm.rs
+++ b/utils/search-export/src/sequences/litsumm.rs
@@ -28,6 +28,7 @@ impl LitsummSummaries {
     pub fn should_show_litsumm(&self) -> bool {
         self.should_show_litsumm
     }
+
     pub fn urs_taxid(&self) -> &str {
         &self.urs_taxid
     }
diff --git a/utils/search-export/src/sequences/mod.rs b/utils/search-export/src/sequences/mod.rs
index aca016443..a00fbe3d5 100644
--- a/utils/search-export/src/sequences/mod.rs
+++ b/utils/search-export/src/sequences/mod.rs
@@ -5,6 +5,7 @@ pub mod editing_events;
 pub mod feedback;
 pub mod file_joiner;
 pub mod go_annotation;
+pub mod go_flow_annotations;
 pub mod interacting_protein;
 pub mod interacting_rna;
 pub mod litsumm;
@@ -16,6 +17,5 @@ pub mod qa_status;
 pub mod r2dt;
 pub mod raw;
 pub mod rfam_hit;
-pub mod go_flow_annotations;
 pub mod so_tree;
 pub mod writers;
diff --git a/utils/search-export/src/sequences/normalized.rs b/utils/search-export/src/sequences/normalized.rs
index 09f6a718e..cd57b85d2 100644
--- a/utils/search-export/src/sequences/normalized.rs
+++ b/utils/search-export/src/sequences/normalized.rs
@@ -28,6 +28,7 @@ use crate::sequences::{
     editing_events::EditingEvent,
     feedback::FeedbackVec,
     go_annotation::GoAnnotation,
+    go_flow_annotations::GoFlowLLMAnnotation,
     interacting_protein::InteractingProtein,
     interacting_rna::InteractingRna,
     litsumm::LitsummSummaries,
@@ -37,7 +38,6 @@ use crate::sequences::{
     r2dt::R2dt,
     raw::Raw,
     rfam_hit::RfamHitVec,
-    go_flow_annotations::GoFlowLLMAnnotation,
     so_tree,
 };
 

From e8c00c3a9f780e69c8e11e1c1fd25be47fb5bd42 Mon Sep 17 00:00:00 2001
From: Andrew Green <agreen@ebi.ac.uk>
Date: Tue, 18 Nov 2025 09:33:34 +0000
Subject: [PATCH 22/22] Remove an unused polars dependency

---
 Cargo.lock                       | 406 +++++--------------------------
 utils/precompute/Cargo.toml      |   1 -
 utils/precompute/src/releases.rs |  13 +-
 3 files changed, 66 insertions(+), 354 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 014395f95..0cd2f47d5 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -194,35 +194,6 @@ dependencies = [
  "zstd",
 ]
 
-[[package]]
-name = "arrow2"
-version = "0.18.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "963fef509b757bcbbf9e5ffa23bcb345614d99f4f6f531f97417b27b8604d389"
-dependencies = [
- "ahash",
- "arrow-format",
- "bytemuck",
- "chrono",
- "dyn-clone",
- "either",
- "ethnum",
- "foreign_vec",
- "getrandom 0.2.16",
- "hash_hasher",
- "hashbrown 0.14.5",
- "lexical-core",
- "lz4",
- "multiversion",
- "num-traits",
- "regex",
- "regex-syntax 0.7.5",
- "rustc_version 0.4.1",
- "simdutf8",
- "strength_reduce",
- "zstd",
-]
-
 [[package]]
 name = "async-trait"
 version = "0.1.88"
@@ -289,7 +260,7 @@ name = "bed-expander"
 version = "0.1.0"
 dependencies = [
  "clap 4.5.37",
- "polars 0.32.1",
+ "polars",
 ]
 
 [[package]]
@@ -1685,28 +1656,12 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "b1362d4a136c0ebacb40d88a37ba361738b222fd8a2ee9340a3d8642f698c52b"
 dependencies = [
  "getrandom 0.2.16",
- "polars-core 0.32.1",
- "polars-io 0.32.1",
- "polars-lazy 0.32.1",
- "polars-ops 0.32.1",
- "polars-sql 0.32.1",
- "polars-time 0.32.1",
- "version_check",
-]
-
-[[package]]
-name = "polars"
-version = "0.33.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3030de163b9ff2c9dac9a12dcb9be25cc0f2bc7c8e7cd2e4b2592ebed458ce6a"
-dependencies = [
- "getrandom 0.2.16",
- "polars-core 0.33.2",
- "polars-io 0.33.2",
- "polars-lazy 0.33.2",
- "polars-ops 0.33.2",
- "polars-sql 0.33.2",
- "polars-time 0.33.2",
+ "polars-core",
+ "polars-io",
+ "polars-lazy",
+ "polars-ops",
+ "polars-sql",
+ "polars-time",
  "version_check",
 ]
 
@@ -1716,26 +1671,11 @@ version = "0.32.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "f967c901fa5da4ca7f64e813d1268488ba97e9b3004cefc579ff851c197a1138"
 dependencies = [
- "arrow2 0.17.4",
- "hashbrown 0.14.5",
- "multiversion",
- "num-traits",
- "polars-error 0.32.1",
- "thiserror",
- "version_check",
-]
-
-[[package]]
-name = "polars-arrow"
-version = "0.33.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "35cd38a64fb389fd990e4efd433a36331c995c981d353bfef83b5de4d87f1828"
-dependencies = [
- "arrow2 0.18.0",
+ "arrow2",
  "hashbrown 0.14.5",
  "multiversion",
  "num-traits",
- "polars-error 0.33.2",
+ "polars-error",
  "thiserror",
  "version_check",
 ]
@@ -1747,37 +1687,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "b24f92fc5b167f668ff85ab9607dfa72e2c09664cacef59297ee8601dee60126"
 dependencies = [
  "ahash",
- "arrow2 0.17.4",
- "bitflags 2.9.0",
- "chrono",
- "comfy-table",
- "either",
- "hashbrown 0.14.5",
- "indexmap 2.9.0",
- "num-traits",
- "once_cell",
- "polars-arrow 0.32.1",
- "polars-error 0.32.1",
- "polars-row 0.32.1",
- "polars-utils 0.32.1",
- "rand 0.8.5",
- "rand_distr",
- "rayon",
- "regex",
- "smartstring",
- "thiserror",
- "version_check",
- "xxhash-rust",
-]
-
-[[package]]
-name = "polars-core"
-version = "0.33.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "08367c014c07fa8f141680e024f926cab3a1fe839605a8fcf2223647eb45ca71"
-dependencies = [
- "ahash",
- "arrow2 0.18.0",
+ "arrow2",
  "bitflags 2.9.0",
  "chrono",
  "comfy-table",
@@ -1786,10 +1696,10 @@ dependencies = [
  "indexmap 2.9.0",
  "num-traits",
  "once_cell",
- "polars-arrow 0.33.2",
- "polars-error 0.33.2",
- "polars-row 0.33.2",
- "polars-utils 0.33.2",
+ "polars-arrow",
+ "polars-error",
+ "polars-row",
+ "polars-utils",
  "rand 0.8.5",
  "rand_distr",
  "rayon",
@@ -1806,18 +1716,7 @@ version = "0.32.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "40d09c3a7337e53b38c37b57999038440fa39c6801b9ba48afaecd8e16f7ac0a"
 dependencies = [
- "arrow2 0.17.4",
- "regex",
- "thiserror",
-]
-
-[[package]]
-name = "polars-error"
-version = "0.33.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9b20a09651a299979354945819dc2ce017964b80b916954e9d2ce39002a5f949"
-dependencies = [
- "arrow2 0.18.0",
+ "arrow2",
  "regex",
  "thiserror",
 ]
@@ -1829,7 +1728,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "92cab0df9f2a35702fa5aec99edfaabf9ae8e9cdd0acf69e143ad2d132f34f9c"
 dependencies = [
  "ahash",
- "arrow2 0.17.4",
+ "arrow2",
  "async-trait",
  "bytes",
  "chrono",
@@ -1842,45 +1741,17 @@ dependencies = [
  "memmap2",
  "num-traits",
  "once_cell",
- "polars-arrow 0.32.1",
- "polars-core 0.32.1",
- "polars-error 0.32.1",
- "polars-time 0.32.1",
- "polars-utils 0.32.1",
+ "polars-arrow",
+ "polars-core",
+ "polars-error",
+ "polars-time",
+ "polars-utils",
  "rayon",
  "regex",
  "simdutf8",
  "tokio",
 ]
 
-[[package]]
-name = "polars-io"
-version = "0.33.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "88cf4a89c18a90ac20dfbcdfd19ab50ad4ac5a76fc7bb775d3c28bb738cf1f34"
-dependencies = [
- "ahash",
- "arrow2 0.18.0",
- "bytes",
- "chrono",
- "fast-float",
- "home",
- "lexical",
- "lexical-core",
- "memchr",
- "memmap2",
- "num-traits",
- "once_cell",
- "polars-arrow 0.33.2",
- "polars-core 0.33.2",
- "polars-error 0.33.2",
- "polars-time 0.33.2",
- "polars-utils 0.33.2",
- "rayon",
- "regex",
- "simdutf8",
-]
-
 [[package]]
 name = "polars-lazy"
 version = "0.32.1"
@@ -1891,37 +1762,14 @@ dependencies = [
  "bitflags 2.9.0",
  "glob",
  "once_cell",
- "polars-arrow 0.32.1",
- "polars-core 0.32.1",
- "polars-io 0.32.1",
- "polars-ops 0.32.1",
- "polars-pipe 0.32.1",
- "polars-plan 0.32.1",
- "polars-time 0.32.1",
- "polars-utils 0.32.1",
- "rayon",
- "smartstring",
- "version_check",
-]
-
-[[package]]
-name = "polars-lazy"
-version = "0.33.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5110eab438848c981cc5f541fbc5b21bb263fd707000b4715233074fb2630fcf"
-dependencies = [
- "ahash",
- "bitflags 2.9.0",
- "glob",
- "once_cell",
- "polars-arrow 0.33.2",
- "polars-core 0.33.2",
- "polars-io 0.33.2",
- "polars-ops 0.33.2",
- "polars-pipe 0.33.2",
- "polars-plan 0.33.2",
- "polars-time 0.33.2",
- "polars-utils 0.33.2",
+ "polars-arrow",
+ "polars-core",
+ "polars-io",
+ "polars-ops",
+ "polars-pipe",
+ "polars-plan",
+ "polars-time",
+ "polars-utils",
  "rayon",
  "smartstring",
  "version_check",
@@ -1934,32 +1782,13 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e825575c96302d2daedfc205a0062180033c92c55bcd6aafc4e109d4d8849ed0"
 dependencies = [
  "argminmax",
- "arrow2 0.17.4",
+ "arrow2",
  "either",
  "indexmap 2.9.0",
  "memchr",
- "polars-arrow 0.32.1",
- "polars-core 0.32.1",
- "polars-utils 0.32.1",
- "smartstring",
- "version_check",
-]
-
-[[package]]
-name = "polars-ops"
-version = "0.33.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7740d7bc4c2ca08044f9ef599638e116fdd7d687e80d1974b698e390c6ce4252"
-dependencies = [
- "argminmax",
- "arrow2 0.18.0",
- "either",
- "indexmap 2.9.0",
- "memchr",
- "polars-arrow 0.33.2",
- "polars-core 0.33.2",
- "polars-utils 0.33.2",
- "regex",
+ "polars-arrow",
+ "polars-core",
+ "polars-utils",
  "smartstring",
  "version_check",
 ]
@@ -1975,36 +1804,13 @@ dependencies = [
  "enum_dispatch",
  "hashbrown 0.14.5",
  "num-traits",
- "polars-arrow 0.32.1",
- "polars-core 0.32.1",
- "polars-io 0.32.1",
- "polars-ops 0.32.1",
- "polars-plan 0.32.1",
- "polars-row 0.32.1",
- "polars-utils 0.32.1",
- "rayon",
- "smartstring",
- "version_check",
-]
-
-[[package]]
-name = "polars-pipe"
-version = "0.33.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1f30c5e77c5594ddc958a46fe2e021da2feba9c94e767e1d798bd82ac5a33c3b"
-dependencies = [
- "crossbeam-channel",
- "crossbeam-queue",
- "enum_dispatch",
- "hashbrown 0.14.5",
- "num-traits",
- "polars-arrow 0.33.2",
- "polars-core 0.33.2",
- "polars-io 0.33.2",
- "polars-ops 0.33.2",
- "polars-plan 0.33.2",
- "polars-row 0.33.2",
- "polars-utils 0.33.2",
+ "polars-arrow",
+ "polars-core",
+ "polars-io",
+ "polars-ops",
+ "polars-plan",
+ "polars-row",
+ "polars-utils",
  "rayon",
  "smartstring",
  "version_check",
@@ -2017,36 +1823,14 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "fb67b014f0295e8e9dbb84404a91d666d477b3bc248a2ed51bc442833b16da35"
 dependencies = [
  "ahash",
- "arrow2 0.17.4",
- "once_cell",
- "polars-arrow 0.32.1",
- "polars-core 0.32.1",
- "polars-io 0.32.1",
- "polars-ops 0.32.1",
- "polars-time 0.32.1",
- "polars-utils 0.32.1",
- "rayon",
- "regex",
- "smartstring",
- "strum_macros 0.25.3",
- "version_check",
-]
-
-[[package]]
-name = "polars-plan"
-version = "0.33.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "678cbeb730e29e50f0f8d844102d15454fc6113a74c667eab046c0e4a4322a9e"
-dependencies = [
- "ahash",
- "arrow2 0.18.0",
+ "arrow2",
  "once_cell",
- "polars-arrow 0.33.2",
- "polars-core 0.33.2",
- "polars-io 0.33.2",
- "polars-ops 0.33.2",
- "polars-time 0.33.2",
- "polars-utils 0.33.2",
+ "polars-arrow",
+ "polars-core",
+ "polars-io",
+ "polars-ops",
+ "polars-time",
+ "polars-utils",
  "rayon",
  "regex",
  "smartstring",
@@ -2060,20 +1844,9 @@ version = "0.32.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "27f54c1956027bf6301948fb4f2837cf6d6b638d8dd1edf3aaeaa19906a986be"
 dependencies = [
- "arrow2 0.17.4",
- "polars-error 0.32.1",
- "polars-utils 0.32.1",
-]
-
-[[package]]
-name = "polars-row"
-version = "0.33.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c52ef8885b9d13f848839594fbab21ad79fc63f7e11c19cdc2cfe9bb03c313ac"
-dependencies = [
- "arrow2 0.18.0",
- "polars-error 0.33.2",
- "polars-utils 0.33.2",
+ "arrow2",
+ "polars-error",
+ "polars-utils",
 ]
 
 [[package]]
@@ -2082,25 +1855,10 @@ version = "0.32.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "dbfcb15cf8eebd25ea1724109d0153817cd484c6326290585f0736b4e7fcf2f4"
 dependencies = [
- "polars-arrow 0.32.1",
- "polars-core 0.32.1",
- "polars-lazy 0.32.1",
- "polars-plan 0.32.1",
- "serde",
- "serde_json",
- "sqlparser",
-]
-
-[[package]]
-name = "polars-sql"
-version = "0.33.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4d716855267e3516f722287f68cf10e650e33f7197df83a79e680602471456fc"
-dependencies = [
- "polars-arrow 0.33.2",
- "polars-core 0.33.2",
- "polars-lazy 0.33.2",
- "polars-plan 0.33.2",
+ "polars-arrow",
+ "polars-core",
+ "polars-lazy",
+ "polars-plan",
  "serde",
  "serde_json",
  "sqlparser",
@@ -2112,34 +1870,15 @@ version = "0.32.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "53f42d2632f5971c9575041d33cbcfb1f996900c40bbf58bc6eb0a0c5efbecea"
 dependencies = [
- "arrow2 0.17.4",
+ "arrow2",
  "atoi",
  "chrono",
  "now",
  "once_cell",
- "polars-arrow 0.32.1",
- "polars-core 0.32.1",
- "polars-ops 0.32.1",
- "polars-utils 0.32.1",
- "regex",
- "smartstring",
-]
-
-[[package]]
-name = "polars-time"
-version = "0.33.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a2eb75a24f11b55a400b52dc19a2a3e949aaaa46a911f99496de4485b1127063"
-dependencies = [
- "arrow2 0.18.0",
- "atoi",
- "chrono",
- "now",
- "once_cell",
- "polars-arrow 0.33.2",
- "polars-core 0.33.2",
- "polars-ops 0.33.2",
- "polars-utils 0.33.2",
+ "polars-arrow",
+ "polars-core",
+ "polars-ops",
+ "polars-utils",
  "regex",
  "smartstring",
 ]
@@ -2154,25 +1893,7 @@ dependencies = [
  "hashbrown 0.14.5",
  "num-traits",
  "once_cell",
- "polars-error 0.32.1",
- "rayon",
- "smartstring",
- "sysinfo",
- "version_check",
-]
-
-[[package]]
-name = "polars-utils"
-version = "0.33.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2a4a5e743509096322cad39104d56e329fe2748483a3354a0f0c354724f3cef6"
-dependencies = [
- "ahash",
- "bytemuck",
- "hashbrown 0.14.5",
- "num-traits",
- "once_cell",
- "polars-error 0.33.2",
+ "polars-error",
  "rayon",
  "smartstring",
  "sysinfo",
@@ -2196,7 +1917,6 @@ dependencies = [
  "csv",
  "itertools 0.10.5",
  "log",
- "polars 0.33.2",
  "rand 0.8.5",
  "rnc-core",
  "rnc-utils",
@@ -2571,12 +2291,6 @@ version = "0.6.29"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1"
 
-[[package]]
-name = "regex-syntax"
-version = "0.7.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "dbb5fb1acd8a1a18b3dd5be62d25485eb770e05afb408a9627d14d451bae12da"
-
 [[package]]
 name = "regex-syntax"
 version = "0.8.5"
diff --git a/utils/precompute/Cargo.toml b/utils/precompute/Cargo.toml
index a7fbdd22e..36ae91b23 100644
--- a/utils/precompute/Cargo.toml
+++ b/utils/precompute/Cargo.toml
@@ -20,7 +20,6 @@ sorted-iter = "0.1.7"
 structopt = "0.3"
 strum = "0.21"
 strum_macros = "0.21"
-polars = { version = "0.33.2", features = ["lazy", "streaming"] }
 
 [dev-dependencies]
 rand = "0.8"
diff --git a/utils/precompute/src/releases.rs b/utils/precompute/src/releases.rs
index 2e80d23bd..5f55755db 100644
--- a/utils/precompute/src/releases.rs
+++ b/utils/precompute/src/releases.rs
@@ -26,7 +26,6 @@ use anyhow::{
     Result,
 };
 
-use polars::prelude::*;
 
 #[derive(Serialize, Deserialize, Debug)]
 pub struct UrsEntry {
@@ -121,7 +120,6 @@ pub fn select_new(xrefs: &Path, known: &Path, output: &Path, streaming: bool) ->
     //     .agg([col("last").max().alias("last"), col("id").first().alias("id")])
     //     .sort("id", Default::default());
 
-
     // let known_records: LazyFrame = LazyCsvReader::new(known_path)
     //     .has_header(false)
     //     .low_memory(streaming)
@@ -132,7 +130,6 @@ pub fn select_new(xrefs: &Path, known: &Path, output: &Path, streaming: bool) ->
     //     .agg([col("last").max().alias("last"), col("id").first().alias("id")])
     //     .sort("id", Default::default());
 
-
     // let selection: LazyFrame = xref_records
     //     .join(
     //         known_records,
@@ -149,10 +146,12 @@ pub fn select_new(xrefs: &Path, known: &Path, output: &Path, streaming: bool) ->
 
     // let check: LazyFrame = selection.clone();
 
-    // // // check we are not in a catastrophic error state - precompute should never be newer than
-    // // // xref
-    // let selected_urs = selection.filter(col("selected").eq(true)).with_streaming(streaming).collect()?;
-    // let error_urs = check.filter(col("error_state").eq(true)).with_streaming(streaming).collect()?;
+    // // // check we are not in a catastrophic error state - precompute should never be newer
+    // // // than xref
+    // let selected_urs =
+    // selection.filter(col("selected").eq(true)).with_streaming(streaming).collect()?;
+    // let error_urs =
+    // check.filter(col("error_state").eq(true)).with_streaming(streaming).collect()?;
     // if error_urs.height() > 0 {
     //     return Err(anyhow!("Precompute newer than xref for these UPIs: {:?}", error_urs));
     // }