From aaf6d9a99a8015161c9060a2a0394f6ed3011abe Mon Sep 17 00:00:00 2001 From: Andrew Green Date: Mon, 13 Oct 2025 14:40:26 +0100 Subject: [PATCH 01/22] Rust side implementation for GoFlow search index --- utils/search-export/src/main.rs | 9 +++++ .../src/sequences/file_joiner.rs | 15 +++++++- .../src/sequences/go_flow_annotations.rs | 34 +++++++++++++++++++ utils/search-export/src/sequences/mod.rs | 1 + .../search-export/src/sequences/normalized.rs | 3 ++ utils/search-export/src/sequences/raw.rs | 7 ++++ 6 files changed, 68 insertions(+), 1 deletion(-) create mode 100644 utils/search-export/src/sequences/go_flow_annotations.rs diff --git a/utils/search-export/src/main.rs b/utils/search-export/src/main.rs index ad99f67e8..de852456b 100644 --- a/utils/search-export/src/main.rs +++ b/utils/search-export/src/main.rs @@ -33,6 +33,7 @@ pub enum Groupable { SoInfo, LitsummSummaries, EditingEvents, + GoFlowAnnotation, } #[derive(Debug, StructOpt)] @@ -140,6 +141,10 @@ enum SequenceCommand { /// RNA editing events editing_events: PathBuf, + #[structopt(parse(from_os_str))] + /// GoFlowLLM annotations + go_flow_llm_annotations: PathBuf, + // Add new arguments above this line! #[structopt(parse(from_os_str))] /// Filename to write the results to, '-' means stdout @@ -255,6 +260,7 @@ fn main() -> Result<()> { Groupable::EditingEvents => { sequences::editing_events::group(&path, max_count, &output)? }, + Groupable::GoFlowAnnotation => sequences::go_flow_annotations::group(&path, max_count, &output)?, }, Subcommand::Sequences { command, @@ -275,6 +281,8 @@ fn main() -> Result<()> { litsumm_summaries, editing_events, so_term_tree, + go_flow_llm_annotations, + // Add new arguments above this line! 
output, } => sequences::writers::write_merge( vec![ @@ -293,6 +301,7 @@ fn main() -> Result<()> { editing_events, orfs, so_term_tree, + go_flow_llm_annotations, ], &output, )?, diff --git a/utils/search-export/src/sequences/file_joiner.rs b/utils/search-export/src/sequences/file_joiner.rs index c8a50973b..6903b6e1b 100644 --- a/utils/search-export/src/sequences/file_joiner.rs +++ b/utils/search-export/src/sequences/file_joiner.rs @@ -49,6 +49,7 @@ use super::{ rfam_hit::RfamHit, so_tree, so_tree::SoMapping, + go_flow_annotations::GoFlowLLMAnnotation, }; #[derive(Debug, Error)] @@ -98,6 +99,7 @@ pub enum FileTypes { PublicationCount, LitsummSummaries, EditingEvents, + GoFlowLLMAnnotations, SoTermTree, } @@ -116,6 +118,7 @@ pub struct FileJoiner<'de> { rfam_hits: StreamDeserializer<'de, IoRead>, Grouped>, publication_counts: StreamDeserializer<'de, IoRead>, Grouped>, lit_summ: StreamDeserializer<'de, IoRead>, Grouped>, + go_flow_llm_annotations: StreamDeserializer<'de, IoRead>, Grouped>, editing_events: StreamDeserializer<'de, IoRead>, Grouped>, so_info: SoMapping, } @@ -203,6 +206,7 @@ impl FileJoinerBuilder { let publication_counts = self.iterator_for(FileTypes::PublicationCount)?; let lit_summ = self.iterator_for(FileTypes::LitsummSummaries)?; let editing_events = self.iterator_for(FileTypes::EditingEvents)?; + let go_flow_llm_annotations = self.iterator_for(FileTypes::GoFlowLLMAnnotations)?; let so_info = so_tree::load(self.path_for(FileTypes::SoTermTree)?)?; Ok(FileJoiner { @@ -220,6 +224,7 @@ impl FileJoinerBuilder { publication_counts, lit_summ, editing_events, + go_flow_llm_annotations, so_info, }) } @@ -244,6 +249,7 @@ impl<'de> Iterator for FileJoiner<'de> { self.publication_counts.next(), self.lit_summ.next(), self.editing_events.next(), + self.go_flow_llm_annotations.next(), ); match current { @@ -262,6 +268,7 @@ impl<'de> Iterator for FileJoiner<'de> { None, None, None, + None, ) => None, ( Some(Ok(Required { @@ -320,6 +327,10 @@ impl<'de> Iterator for 
FileJoiner<'de> { id: id14, data: editing_events, })), + Some(Ok(Multiple { + id: id15, + data: goflow_llm_annotations, + })), ) => { if id1 != id2 || id1 != id3 @@ -334,9 +345,10 @@ impl<'de> Iterator for FileJoiner<'de> { || id1 != id12 || id1 != id13 || id1 != id14 + || id1 != id15 { return Some(Err(Error::OutofSyncData(vec![ - id1, id2, id3, id4, id5, id6, id7, id8, id9, id10, id11, id12, id13, id14, + id1, id2, id3, id4, id5, id6, id7, id8, id9, id10, id11, id12, id13, id14, id15 ]))); } @@ -362,6 +374,7 @@ impl<'de> Iterator for FileJoiner<'de> { .publication_counts(publication_counts) .litsumm_summaries(lit_summ) .editing_events(editing_events) + .go_flow_llm_annotations(goflow_llm_annotations) .so_tree(so_tree) .build(); diff --git a/utils/search-export/src/sequences/go_flow_annotations.rs b/utils/search-export/src/sequences/go_flow_annotations.rs new file mode 100644 index 000000000..04e73b525 --- /dev/null +++ b/utils/search-export/src/sequences/go_flow_annotations.rs @@ -0,0 +1,34 @@ +use serde::{ + Deserialize, + Serialize, +}; +use std::path::Path; + +use anyhow::Result; +use rnc_core::grouper; + +#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] +pub struct GoFlowLLMAnnotation { + pub id: usize, + urs_taxid: String, + should_show_goflow: bool, +} + +impl grouper::HasIndex for GoFlowLLMAnnotation { + fn index(&self) -> usize { + self.id + } +} + +pub fn group(path: &Path, max: usize, output: &Path) -> Result<()> { + grouper::group::(grouper::Criteria::AnyNumber, &path, 1, max, &output) +} + +impl GoFlowLLMAnnotation { + pub fn should_show_goflow(&self) -> bool { + self.should_show_goflow + } + pub fn urs_taxid(&self) -> &str { + &self.urs_taxid + } +} diff --git a/utils/search-export/src/sequences/mod.rs b/utils/search-export/src/sequences/mod.rs index b3febfd50..aca016443 100644 --- a/utils/search-export/src/sequences/mod.rs +++ b/utils/search-export/src/sequences/mod.rs @@ -16,5 +16,6 @@ pub mod qa_status; pub mod r2dt; pub mod raw; pub 
mod rfam_hit; +pub mod go_flow_annotations; pub mod so_tree; pub mod writers; diff --git a/utils/search-export/src/sequences/normalized.rs b/utils/search-export/src/sequences/normalized.rs index c53f334f6..09f6a718e 100644 --- a/utils/search-export/src/sequences/normalized.rs +++ b/utils/search-export/src/sequences/normalized.rs @@ -37,6 +37,7 @@ use crate::sequences::{ r2dt::R2dt, raw::Raw, rfam_hit::RfamHitVec, + go_flow_annotations::GoFlowLLMAnnotation, so_tree, }; @@ -69,6 +70,7 @@ pub struct Normalized { publication_count: usize, litsumm: Vec, editing_events: Vec, + go_flow_llm_annotations: Vec, so_rna_type_tree: so_tree::SoTree, #[serde(flatten)] @@ -129,6 +131,7 @@ impl Normalized { rfam_hits: raw.rfam_hits().to_owned().into_iter().collect(), orfs: raw.orfs().to_vec().into_iter().collect(), litsumm: raw.litsumm_summaries().to_vec(), + go_flow_llm_annotations: raw.go_flow_llm_annotations().to_vec(), editing_events: raw.editing_events().to_vec(), }) } diff --git a/utils/search-export/src/sequences/raw.rs b/utils/search-export/src/sequences/raw.rs index 4daf502d0..d4bcc94dc 100644 --- a/utils/search-export/src/sequences/raw.rs +++ b/utils/search-export/src/sequences/raw.rs @@ -24,6 +24,7 @@ use crate::sequences::{ qa_status::QaStatus, r2dt::R2dt, rfam_hit::RfamHit, + go_flow_annotations::GoFlowLLMAnnotation, so_tree, }; @@ -46,6 +47,7 @@ pub struct Raw { publication_counts: Option, litsumm_summaries: Vec, editing_events: Vec, + go_flow_llm_annotations: Vec, so_tree: so_tree::SoTree, } @@ -148,6 +150,11 @@ impl Raw { &self.editing_events } + /// Get a reference to the raw's editing events. + pub fn go_flow_llm_annotations(&self) -> &[GoFlowLLMAnnotation] { + &self.go_flow_llm_annotations + } + /// Get this raw's publication count. 
pub fn publication_count(&self) -> usize { self.publication_counts.as_ref().map(|p| p.publication_count()).unwrap_or(0) From 53f2fefbf2d8f7fae00cd998194bb18b6a8b9340 Mon Sep 17 00:00:00 2001 From: Andrew Green Date: Mon, 13 Oct 2025 14:43:57 +0100 Subject: [PATCH 02/22] Python bits of the goflowllm search index export --- rnacentral_pipeline/rnacentral/search_export/data.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/rnacentral_pipeline/rnacentral/search_export/data.py b/rnacentral_pipeline/rnacentral/search_export/data.py index bfad7c2be..b6e8e1cdf 100644 --- a/rnacentral_pipeline/rnacentral/search_export/data.py +++ b/rnacentral_pipeline/rnacentral/search_export/data.py @@ -708,6 +708,9 @@ def has_publications(counts): def has_litsumm(litsumm): return str(bool(litsumm)) +def has_go_flow_llm_annotation(go_flow): + return str(bool(go_flow)) + def has_editing_event(editing_events): return str(bool(editing_events)) @@ -881,6 +884,7 @@ def edit_ref_to_edit(editing_events): edit_repeat_type, keys="editing_events", ), + field("has_go_flow_llm_annotation", has_go_flow_llm_annotation, keys="goflow"), ## Add new fields above this line! Otherwise editing the produced xml is hard. 
tree("so_rna_type", so_rna_type_tree, key="so_rna_type_tree"), ], From ffa4e4268aaeb4300ba15b1a06f9bf0c84fe7a81 Mon Sep 17 00:00:00 2001 From: Andrew Green Date: Mon, 13 Oct 2025 14:56:11 +0100 Subject: [PATCH 03/22] Add necessary sql and nextflow bits --- files/search-export/parts/goflow.sql | 13 +++++++++++++ workflows/export/text-search/sequences.nf | 16 ++++++++++++++++ 2 files changed, 29 insertions(+) create mode 100644 files/search-export/parts/goflow.sql diff --git a/files/search-export/parts/goflow.sql b/files/search-export/parts/goflow.sql new file mode 100644 index 000000000..9999d8fd4 --- /dev/null +++ b/files/search-export/parts/goflow.sql @@ -0,0 +1,13 @@ +COPY ( + SELECT + json_build_object( + 'id', todo.id, + 'urs_taxid', todo.urs_taxid, + 'should_show_goflow', true + ) + FROM search_export_urs todo + JOIN go_flow_llm_curation_results gfllm + ON + todo.urs_taxid = gfllm.urs_taxid + ORDER by todo.id +) TO STDOUT diff --git a/workflows/export/text-search/sequences.nf b/workflows/export/text-search/sequences.nf index b1611b09b..9b805364f 100755 --- a/workflows/export/text-search/sequences.nf +++ b/workflows/export/text-search/sequences.nf @@ -141,6 +141,20 @@ process litsumm_summaries { """ } +process litsumm_summaries { + input: + val(max_count) + path (query) + + output: + path("goflow_annotations.json") + + """ + psql -v ON_ERROR_STOP=1 -f "$query" "$PGDATABASE" > raw.json + search-export group go-flow-annotation raw.json ${max_count} goflow_annotations.json + """ +} + process editing_events { input: val(max_count) @@ -201,6 +215,7 @@ workflow sequences { Channel.fromPath('files/search-export/parts/text-mining.sql') | set { text_sql } Channel.fromPath('files/search-export/parts/litsumm.sql') | set { litsumm_sql } Channel.fromPath('files/search-export/parts/editing-events.sql') | set { editing_events_sql } + Channel.fromPath('files/search-export/parts/goflow.sql') | set { goflow_sql } Channel.fromPath('files/search-export/so-rna-types.sql') | set { 
so_sql } Channel.fromPath('files/search-export/parts/accessions.sql') | set { accessions_sql } @@ -230,6 +245,7 @@ workflow sequences { text_mining_query(search_count, text_sql), litsumm_summaries(search_count, litsumm_sql), editing_events(search_count, editing_events_sql), + go_flow_annotations(search_count, goflow_sql), fetch_so_tree(so_sql), )\ | set { metadata } From ca3ee0968e8f85e6f6f2c69869a3dbc913f90bf7 Mon Sep 17 00:00:00 2001 From: Andrew Green Date: Tue, 14 Oct 2025 10:48:39 +0100 Subject: [PATCH 04/22] Update rnacentral_pipeline/cli/r2dt.py Improve the docstring for the r2dt CLI Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- rnacentral_pipeline/cli/r2dt.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/rnacentral_pipeline/cli/r2dt.py b/rnacentral_pipeline/cli/r2dt.py index 604be43dd..14d8f160b 100644 --- a/rnacentral_pipeline/cli/r2dt.py +++ b/rnacentral_pipeline/cli/r2dt.py @@ -262,11 +262,11 @@ def r2dt_prepare_s3(model_info, directory, output, file_list, allow_missing): @click.option("--max_sequences", default=-1) def r2dt_prepare_sequences(xref_urs, tracked_urs, urs_to_fetch, max_sequences): """ - Prepare the sequences extracted from RNAcentral + Prepare a list of URS identifiers to fetch sequences for. - This means we will load and deduplicate the json file before rewriting - a json file containing only the requested number of sequences. - - The default will be to write out all sequences + This takes a file of all URS identifiers from cross-references and a file + of already tracked URS identifiers. It produces a file of URS identifiers + that are in the xref file but not in the tracked file. This can be limited + to a maximum number of sequences. 
""" r2dt.prepare_sequences(xref_urs, tracked_urs, urs_to_fetch, max_sequences) From bbb78e20ae8cc576006a3c842d48195c5ee6cdd4 Mon Sep 17 00:00:00 2001 From: Andrew Green Date: Tue, 14 Oct 2025 10:49:14 +0100 Subject: [PATCH 05/22] Update rnacentral_pipeline/databases/ensembl/genomes/urls.py Remove unused releases parameter Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- rnacentral_pipeline/databases/ensembl/genomes/urls.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rnacentral_pipeline/databases/ensembl/genomes/urls.py b/rnacentral_pipeline/databases/ensembl/genomes/urls.py index af3e6277c..9a56c84a2 100644 --- a/rnacentral_pipeline/databases/ensembl/genomes/urls.py +++ b/rnacentral_pipeline/databases/ensembl/genomes/urls.py @@ -30,7 +30,7 @@ def list_releases(ftp: FTP) -> ty.List[str]: return [f for f in ftp.nlst() if f.startswith("release-")] -def latest_release(releases: ty.List[str], ftp: FTP) -> str: +def latest_release(ftp: FTP) -> str: ## Parse the readme for the current release to avoid getting a half baked release readme_lines = [] ftp.retrlines("RETR current_README", readme_lines.append) From c40cef0c3705bc72f622f6b12c989af9c016ff21 Mon Sep 17 00:00:00 2001 From: Andrew Green Date: Tue, 14 Oct 2025 10:55:04 +0100 Subject: [PATCH 06/22] Remove unused fetching of ensembl latest release from parsing release names --- .../databases/ensembl/vertebrates/urls.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/rnacentral_pipeline/databases/ensembl/vertebrates/urls.py b/rnacentral_pipeline/databases/ensembl/vertebrates/urls.py index b6c757de6..d9bca58e3 100644 --- a/rnacentral_pipeline/databases/ensembl/vertebrates/urls.py +++ b/rnacentral_pipeline/databases/ensembl/vertebrates/urls.py @@ -23,18 +23,14 @@ from rnacentral_pipeline.databases.ensembl.data import Division, FtpInfo -def list_releases(ftp: FTP) -> ty.List[str]: - return [f for f in ftp.nlst() if 
f.startswith("release-")] - -def latest_release(releases: ty.List[str], ftp: FTP) -> str: +def latest_release(ftp: FTP) -> str: ## Parse the readme for the current release to avoid getting a half baked release readme_lines = [] ftp.retrlines("RETR current_README", readme_lines.append) cur_readme = "\n".join(readme_lines) pattern = r"Ensembl Release (\d+) Databases." release = re.search(pattern, cur_readme).group(1) - print(f"Ensembl release {release}") return f"release-{release}" @@ -71,7 +67,6 @@ def urls_for(host: str) -> ty.Iterable[FtpInfo]: with FTP(host) as ftp: ftp.login() ftp.cwd("pub") - releases = list_releases(ftp) - latest = latest_release(releases, ftp) + latest = latest_release(ftp) with species_info(ftp, latest) as info: yield from generate_paths(f"ftp://{host}/pub", latest, info) From 3e818d656619be6dde944e9572b1b29fc1b6f4ae Mon Sep 17 00:00:00 2001 From: Andrew Green Date: Tue, 14 Oct 2025 10:57:01 +0100 Subject: [PATCH 07/22] Remove some debugging print statements --- rnacentral_pipeline/rnacentral/genome_mapping/urls.py | 4 ---- rnacentral_pipeline/rnacentral/r2dt/__init__.py | 3 +-- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/rnacentral_pipeline/rnacentral/genome_mapping/urls.py b/rnacentral_pipeline/rnacentral/genome_mapping/urls.py index f51bfe0f8..536673507 100644 --- a/rnacentral_pipeline/rnacentral/genome_mapping/urls.py +++ b/rnacentral_pipeline/rnacentral/genome_mapping/urls.py @@ -131,10 +131,6 @@ def toplevel_file( toplevel = base.format(type="toplevel") base_result = f"ftp://{host}{directory}/{{file}}" - print(primary) - print(toplevel) - print(files) - if primary in files: return base_result.format(file=primary) elif toplevel in files: diff --git a/rnacentral_pipeline/rnacentral/r2dt/__init__.py b/rnacentral_pipeline/rnacentral/r2dt/__init__.py index 7f30ddfc5..190f00c98 100644 --- a/rnacentral_pipeline/rnacentral/r2dt/__init__.py +++ b/rnacentral_pipeline/rnacentral/r2dt/__init__.py @@ -153,7 +153,6 @@ def 
write_inspect_data(handle: ty.IO, db_url: str, output: ty.IO): def prepare_sequences(xref_urs, tracked_urs, urs_to_fetch, max_sequences): - print(urs_to_fetch.name) raw_xref = ( pl.scan_csv(xref_urs.name, has_header=False, low_memory=True) .unique() @@ -162,7 +161,7 @@ def prepare_sequences(xref_urs, tracked_urs, urs_to_fetch, max_sequences): raw_tracked = pl.scan_csv( tracked_urs.name, low_memory=True - ).unique() ## May not need to be uniqued? + ).unique() to_fetch = raw_xref.join(raw_tracked, on="urs", how="anti") From a4901b7ca3913fa2cbc8961f3b47f43fa0b73739 Mon Sep 17 00:00:00 2001 From: Andrew Green Date: Sat, 18 Oct 2025 10:30:00 +0100 Subject: [PATCH 08/22] Update rnacentral_pipeline/databases/ensembl/genomes/urls.py Raise value error when no match for release number found Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- rnacentral_pipeline/databases/ensembl/genomes/urls.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/rnacentral_pipeline/databases/ensembl/genomes/urls.py b/rnacentral_pipeline/databases/ensembl/genomes/urls.py index 9a56c84a2..44cc2e4fa 100644 --- a/rnacentral_pipeline/databases/ensembl/genomes/urls.py +++ b/rnacentral_pipeline/databases/ensembl/genomes/urls.py @@ -36,7 +36,10 @@ def latest_release(ftp: FTP) -> str: ftp.retrlines("RETR current_README", readme_lines.append) cur_readme = "\n".join(readme_lines) pattern = r"Ensembl Release (\d+) Databases." 
- release = re.search(pattern, cur_readme).group(1) + match = re.search(pattern, cur_readme) + if not match: + raise ValueError("Could not find release number in README") + release = match.group(1) print(f"Ensembl release {release}") return f"release-{release}" From 0efdf8d2c0fd5694ad99e22590ef24751b7c16ef Mon Sep 17 00:00:00 2001 From: Andrew Green Date: Sat, 18 Oct 2025 10:35:03 +0100 Subject: [PATCH 09/22] Remove some dead code relating to finding the ensembl release number --- rnacentral_pipeline/databases/ensembl/genomes/urls.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/rnacentral_pipeline/databases/ensembl/genomes/urls.py b/rnacentral_pipeline/databases/ensembl/genomes/urls.py index 44cc2e4fa..202ac5209 100644 --- a/rnacentral_pipeline/databases/ensembl/genomes/urls.py +++ b/rnacentral_pipeline/databases/ensembl/genomes/urls.py @@ -25,11 +25,6 @@ LOGGER = logging.getLogger(__name__) - -def list_releases(ftp: FTP) -> ty.List[str]: - return [f for f in ftp.nlst() if f.startswith("release-")] - - def latest_release(ftp: FTP) -> str: ## Parse the readme for the current release to avoid getting a half baked release readme_lines = [] @@ -98,8 +93,7 @@ def urls_for(division: Division, host: str) -> ty.Iterable[FtpInfo]: ftp.login() print("LOGIN") ftp.cwd(f"pub/{division.name}/") - releases = list_releases(ftp) - latest = latest_release(releases, ftp) + latest = latest_release(ftp) with species_info(ftp, division, latest) as info: url_base = f"ftp://{host}/pub/{division.name}" yield from generate_paths(ftp, division, url_base, latest, info) From f30beadf2343ccae7505bfd5b0a4f56b489e5630 Mon Sep 17 00:00:00 2001 From: Andrew Green Date: Sat, 18 Oct 2025 10:36:32 +0100 Subject: [PATCH 10/22] Reinstate conditionals for running r2dt --- workflows/r2dt.nf | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/workflows/r2dt.nf b/workflows/r2dt.nf index a0561b879..896b60987 100644 --- a/workflows/r2dt.nf +++ 
b/workflows/r2dt.nf @@ -37,7 +37,7 @@ process get_partitions { process fetch_xrefs { - // when { params.r2dt.run } + when { params.r2dt.run } input: tuple val(partition), path(query) @@ -56,7 +56,7 @@ process fetch_xrefs { } process fetch_tracked { - // when { params.r2dt.run } + when { params.r2dt.run } input: tuple val(_flag) @@ -76,7 +76,7 @@ process fetch_tracked { process extract_sequences { - // when { params.r2dt.run } + when { params.r2dt.run } memory '12GB' From 0c2d9449901710bd3b25f239eebae4d53dad8693 Mon Sep 17 00:00:00 2001 From: Andrew Green Date: Sat, 18 Oct 2025 10:47:14 +0100 Subject: [PATCH 11/22] Fix other instances of release matching not being defensive about no matches --- rnacentral_pipeline/databases/ensembl/vertebrates/urls.py | 5 ++++- rnacentral_pipeline/rnacentral/genome_mapping/urls.py | 5 ++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/rnacentral_pipeline/databases/ensembl/vertebrates/urls.py b/rnacentral_pipeline/databases/ensembl/vertebrates/urls.py index d9bca58e3..e338038f2 100644 --- a/rnacentral_pipeline/databases/ensembl/vertebrates/urls.py +++ b/rnacentral_pipeline/databases/ensembl/vertebrates/urls.py @@ -30,7 +30,10 @@ def latest_release(ftp: FTP) -> str: ftp.retrlines("RETR current_README", readme_lines.append) cur_readme = "\n".join(readme_lines) pattern = r"Ensembl Release (\d+) Databases." 
- release = re.search(pattern, cur_readme).group(1) + match = re.search(pattern, cur_readme) + if not match: + raise ValueError("Could not determine latest Ensembl release from README") + release = match.group(1) return f"release-{release}" diff --git a/rnacentral_pipeline/rnacentral/genome_mapping/urls.py b/rnacentral_pipeline/rnacentral/genome_mapping/urls.py index 536673507..213abb6bf 100644 --- a/rnacentral_pipeline/rnacentral/genome_mapping/urls.py +++ b/rnacentral_pipeline/rnacentral/genome_mapping/urls.py @@ -164,7 +164,10 @@ def url_for(species: str, assembly_id: str, kind: str, host: str, soft_masked=Fa conn.retrlines("RETR current_README", readme_lines.append) cur_readme = "\n".join(readme_lines) pattern = r"[Cc]urrent release is (?:Ensembl )?Genomes\s*(\d+)" - release = re.search(pattern, cur_readme).group(1) + match = re.search(pattern, cur_readme) + if not match: + raise ValueError("Could not determine latest Ensembl release from README") + release = match.group(1) for path in host.paths(species, kind): try: From 96d37fdcd5354b169e15b880751fabcd10b19c1b Mon Sep 17 00:00:00 2001 From: Andrew Green Date: Sat, 18 Oct 2025 11:09:45 +0100 Subject: [PATCH 12/22] Update rnacentral_pipeline/rnacentral/r2dt/parser.py f-string is the recommended way for exceptions, c-style for logging Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- rnacentral_pipeline/rnacentral/r2dt/parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rnacentral_pipeline/rnacentral/r2dt/parser.py b/rnacentral_pipeline/rnacentral/r2dt/parser.py index 5f6426720..7d287430e 100644 --- a/rnacentral_pipeline/rnacentral/r2dt/parser.py +++ b/rnacentral_pipeline/rnacentral/r2dt/parser.py @@ -109,7 +109,7 @@ def parse( old_model_name = model_name model_name = temp_model_name_lookup.get(model_name, None) if model_name is None: - raise ValueError("No info for model %s", old_model_name) + raise ValueError(f"No info for model 
{old_model_name}") minfo = model_info[model_name] info = data.R2DTResultInfo(urs, minfo, source, result_base) From 29dc710f35bde93c3e0b67a97939168c17b5f8a4 Mon Sep 17 00:00:00 2001 From: Andrew Green Date: Sat, 25 Oct 2025 08:38:08 +0100 Subject: [PATCH 13/22] Link up go flow search export processes properly --- workflows/export/text-search/sequences.nf | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/workflows/export/text-search/sequences.nf b/workflows/export/text-search/sequences.nf index 9b805364f..86f812bbc 100755 --- a/workflows/export/text-search/sequences.nf +++ b/workflows/export/text-search/sequences.nf @@ -65,6 +65,7 @@ process build_metadata { path(text) path(litsumm) path(editing_events) + path(go_flow_annotations) path(so_tree) output: @@ -141,7 +142,7 @@ process litsumm_summaries { """ } -process litsumm_summaries { +process go_flow_annotations { input: val(max_count) path (query) From 6332d64069fe147001a20f6ebbb10f2897f6ed9f Mon Sep 17 00:00:00 2001 From: Andrew Green Date: Mon, 27 Oct 2025 09:25:59 +0000 Subject: [PATCH 14/22] Use less stringent regex in ensembl release detection --- rnacentral_pipeline/databases/ensembl/genomes/urls.py | 4 ++-- rnacentral_pipeline/databases/ensembl/vertebrates/urls.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/rnacentral_pipeline/databases/ensembl/genomes/urls.py b/rnacentral_pipeline/databases/ensembl/genomes/urls.py index 202ac5209..c9f35997b 100644 --- a/rnacentral_pipeline/databases/ensembl/genomes/urls.py +++ b/rnacentral_pipeline/databases/ensembl/genomes/urls.py @@ -30,8 +30,8 @@ def latest_release(ftp: FTP) -> str: readme_lines = [] ftp.retrlines("RETR current_README", readme_lines.append) cur_readme = "\n".join(readme_lines) - pattern = r"Ensembl Release (\d+) Databases." - match = re.search(pattern, cur_readme) + pattern = r"Ensembl Release (\d+) Databases\." 
+ match = re.search(pattern, cur_readme, re.IGNORECASE) if not match: raise ValueError("Could not find release number in README") release = match.group(1) diff --git a/rnacentral_pipeline/databases/ensembl/vertebrates/urls.py b/rnacentral_pipeline/databases/ensembl/vertebrates/urls.py index e338038f2..5f624d538 100644 --- a/rnacentral_pipeline/databases/ensembl/vertebrates/urls.py +++ b/rnacentral_pipeline/databases/ensembl/vertebrates/urls.py @@ -29,8 +29,8 @@ def latest_release(ftp: FTP) -> str: readme_lines = [] ftp.retrlines("RETR current_README", readme_lines.append) cur_readme = "\n".join(readme_lines) - pattern = r"Ensembl Release (\d+) Databases." - match = re.search(pattern, cur_readme) + pattern = r"Ensembl Release (\d+) Databases\." + match = re.search(pattern, cur_readme, re.IGNORECASE) if not match: raise ValueError("Could not determine latest Ensembl release from README") release = match.group(1) From cd6e07dbbd5c20e6896829b3a9e6d37fedb8172a Mon Sep 17 00:00:00 2001 From: Andrew Green Date: Mon, 27 Oct 2025 09:27:05 +0000 Subject: [PATCH 15/22] Remove some trailing whitespace --- rnacentral_pipeline/rnacentral/r2dt/__init__.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/rnacentral_pipeline/rnacentral/r2dt/__init__.py b/rnacentral_pipeline/rnacentral/r2dt/__init__.py index 190f00c98..3d74880b3 100644 --- a/rnacentral_pipeline/rnacentral/r2dt/__init__.py +++ b/rnacentral_pipeline/rnacentral/r2dt/__init__.py @@ -159,9 +159,7 @@ def prepare_sequences(xref_urs, tracked_urs, urs_to_fetch, max_sequences): .rename({"column_1": "urs"}) ) - raw_tracked = pl.scan_csv( - tracked_urs.name, low_memory=True - ).unique() + raw_tracked = pl.scan_csv(tracked_urs.name, low_memory=True).unique() to_fetch = raw_xref.join(raw_tracked, on="urs", how="anti") From fa07512d534f1dd06a4462015609ccdc71c2752f Mon Sep 17 00:00:00 2001 From: Andrew Green Date: Mon, 27 Oct 2025 09:28:29 +0000 Subject: [PATCH 16/22] Raise value errors instead of relying on 
assertions --- rnacentral_pipeline/rnacentral/r2dt/data.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/rnacentral_pipeline/rnacentral/r2dt/data.py b/rnacentral_pipeline/rnacentral/r2dt/data.py index dcb47486d..ed204163c 100644 --- a/rnacentral_pipeline/rnacentral/r2dt/data.py +++ b/rnacentral_pipeline/rnacentral/r2dt/data.py @@ -409,11 +409,15 @@ def dot_bracket(self): seq_dot = str(record.seq) ## Use indices instead, assert that the string is even length ## If not, then the two parts are not the same length - assert len(seq_dot) % 2 == 0, f"Odd length sequence {len(seq_dot)}" + if len(seq_dot) % 2 != 0: + raise ValueError(f"Odd length sequence {len(seq_dot)}") seq_dot_len = len(seq_dot) sequence = seq_dot[0 : seq_dot_len // 2] dot_bracket = seq_dot[(seq_dot_len // 2) :] - assert len(sequence) == len(dot_bracket) + if len(sequence) != len(dot_bracket): + raise ValueError( + f"Sequence and dot bracket lengths do not match: {len(sequence)} != {len(dot_bracket)}" + ) return dot_bracket def basepair_count(self): From f00a676df894b43b657ba4e86510ccc83d6be786 Mon Sep 17 00:00:00 2001 From: Andrew Green Date: Mon, 27 Oct 2025 14:36:56 +0000 Subject: [PATCH 17/22] Fix not passing goflow data to merging process correctly --- workflows/export/text-search/sequences.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/export/text-search/sequences.nf b/workflows/export/text-search/sequences.nf index 86f812bbc..118eca7cd 100755 --- a/workflows/export/text-search/sequences.nf +++ b/workflows/export/text-search/sequences.nf @@ -72,7 +72,7 @@ process build_metadata { path("merged.json") """ - search-export sequences merge $base $crs $feeback $go $prot $rnas $precompute $qa $r2dt $rfam $orf $text $so_tree $litsumm $editing_events merged.json + search-export sequences merge $base $crs $feeback $go $prot $rnas $precompute $qa $r2dt $rfam $orf $text $so_tree $litsumm $editing_events $go_flow_annotations merged.json """ } From 
acdba3cfe08b1081ba4b8ceb51ce51b2c794fcc9 Mon Sep 17 00:00:00 2001 From: Andrew Green Date: Mon, 27 Oct 2025 14:42:07 +0000 Subject: [PATCH 18/22] Update expected key from search export rust code --- rnacentral_pipeline/rnacentral/search_export/data.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/rnacentral_pipeline/rnacentral/search_export/data.py b/rnacentral_pipeline/rnacentral/search_export/data.py index b6e8e1cdf..b68e07fc7 100644 --- a/rnacentral_pipeline/rnacentral/search_export/data.py +++ b/rnacentral_pipeline/rnacentral/search_export/data.py @@ -708,6 +708,7 @@ def has_publications(counts): def has_litsumm(litsumm): return str(bool(litsumm)) + def has_go_flow_llm_annotation(go_flow): return str(bool(go_flow)) @@ -884,7 +885,11 @@ def edit_ref_to_edit(editing_events): edit_repeat_type, keys="editing_events", ), - field("has_go_flow_llm_annotation", has_go_flow_llm_annotation, keys="goflow"), + field( + "has_go_flow_llm_annotation", + has_go_flow_llm_annotation, + keys="go_flow_llm_annotations", + ), ## Add new fields above this line! Otherwise editing the produced xml is hard. 
tree("so_rna_type", so_rna_type_tree, key="so_rna_type_tree"), ], From fa64872732d47ba0c398d4306aac7e55df5b006f Mon Sep 17 00:00:00 2001 From: Andrew Green Date: Mon, 27 Oct 2025 14:53:12 +0000 Subject: [PATCH 19/22] Rust side CLI requires kebab-case filename to match file type enum, so make sure it does --- workflows/export/text-search/sequences.nf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/workflows/export/text-search/sequences.nf b/workflows/export/text-search/sequences.nf index 118eca7cd..3c4e65f5a 100755 --- a/workflows/export/text-search/sequences.nf +++ b/workflows/export/text-search/sequences.nf @@ -148,11 +148,11 @@ process go_flow_annotations { path (query) output: - path("goflow_annotations.json") + path("go-flow-llm-annotations.json") """ psql -v ON_ERROR_STOP=1 -f "$query" "$PGDATABASE" > raw.json - search-export group go-flow-annotation raw.json ${max_count} goflow_annotations.json + search-export group go-flow-annotation raw.json ${max_count} go-flow-llm-annotations.json """ } From 77583acbe6d881d9ec58b2f398e42d6b7573ac98 Mon Sep 17 00:00:00 2001 From: Andrew Green Date: Mon, 27 Oct 2025 14:58:54 +0000 Subject: [PATCH 20/22] Improve documentation comment in sequence raw handler --- .pre-commit-config.yaml | 12 ++++++------ utils/search-export/src/sequences/raw.rs | 4 ++-- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index d7b62a797..72e33e87c 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -16,12 +16,12 @@ repos: - id: isort args: ["--profile", "black", "--filter-files"] name: isort (python) -- repo: https://github.com/doublify/pre-commit-rust - rev: v1.0 - hooks: - - id: fmt - - id: cargo-check - - id: clippy +# - repo: https://github.com/doublify/pre-commit-rust +# rev: v1.0 +# hooks: +# - id: fmt +# - id: cargo-check +# - id: clippy # - repo: https://github.com/python-poetry/poetry # rev: '1.2.2' # hooks: diff --git 
a/utils/search-export/src/sequences/raw.rs b/utils/search-export/src/sequences/raw.rs index d4bcc94dc..52bea2f9f 100644 --- a/utils/search-export/src/sequences/raw.rs +++ b/utils/search-export/src/sequences/raw.rs @@ -16,6 +16,7 @@ use crate::sequences::{ editing_events::EditingEvent, feedback::Feedback, go_annotation::GoAnnotation, + go_flow_annotations::GoFlowLLMAnnotation, interacting_protein::InteractingProtein, interacting_rna::InteractingRna, litsumm::LitsummSummaries, @@ -24,7 +25,6 @@ use crate::sequences::{ qa_status::QaStatus, r2dt::R2dt, rfam_hit::RfamHit, - go_flow_annotations::GoFlowLLMAnnotation, so_tree, }; @@ -150,7 +150,7 @@ impl Raw { &self.editing_events } - /// Get a reference to the raw's editing events. + /// Get a reference to the raw's GoFlowLLM annotations. pub fn go_flow_llm_annotations(&self) -> &[GoFlowLLMAnnotation] { &self.go_flow_llm_annotations } From ea94cdcd1aee2680e8a611d54c0144d416cf5012 Mon Sep 17 00:00:00 2001 From: Andrew Green Date: Tue, 18 Nov 2025 09:32:26 +0000 Subject: [PATCH 21/22] Cargo clippy fixes --- utils/bed-expander/src/main.rs | 33 ++++++++++++++----- utils/search-export/src/main.rs | 4 ++- .../src/sequences/file_joiner.rs | 8 +++-- .../src/sequences/go_flow_annotations.rs | 1 + utils/search-export/src/sequences/litsumm.rs | 1 + utils/search-export/src/sequences/mod.rs | 2 +- .../search-export/src/sequences/normalized.rs | 2 +- 7 files changed, 36 insertions(+), 15 deletions(-) diff --git a/utils/bed-expander/src/main.rs b/utils/bed-expander/src/main.rs index d5d7d7025..329a5fe52 100644 --- a/utils/bed-expander/src/main.rs +++ b/utils/bed-expander/src/main.rs @@ -1,8 +1,13 @@ use clap::Parser; -use polars::lazy::dsl::col; +use polars::{ + datatypes::DataType::{ + Int64, + List, + }, + lazy::dsl::col, + prelude::*, +}; use std::fs; -use polars::datatypes::DataType::{Int64, List}; -use polars::prelude::*; #[derive(Parser, Debug)] #[clap(author = "Andrew Green", version, about)] @@ -30,19 +35,30 @@ fn main() ->
Result<(), PolarsError> { // col 11: exon sizes // col 12: exon starts - // Stay lazy as long as possible to minimise memory use let original_bed = original_bed .with_columns([col("column_11").str().split(","), col("column_12").str().split(",")]) - .with_columns([col("column_11").cast(List(Box::new(Int64)))] ) + .with_columns([col("column_11").cast(List(Box::new(Int64)))]) .with_column(col("column_11").list().sum().alias("column_5")) .explode([col("column_11"), col("column_12")]) .with_columns([col("column_11").cast(Int64), col("column_12").cast(Int64)]) .with_column(col("column_2").alias("transcript_start")) .with_column(col("column_3").alias("transcript_end")) - .with_columns([(col("column_2") + col("column_12")).alias("column_2"), (col("column_2") + col("column_12") + col("column_11")).alias("column_3")] ) - .select([col("column_1"), col("column_2"), col("column_3"), col("column_4"), col("column_5"), col("column_6"), col("transcript_start"), col("transcript_end")]) - .sort_by_exprs(vec![col("column_1"), col("column_2")], vec![false,false], false, false); + .with_columns([ + (col("column_2") + col("column_12")).alias("column_2"), + (col("column_2") + col("column_12") + col("column_11")).alias("column_3"), + ]) + .select([ + col("column_1"), + col("column_2"), + col("column_3"), + col("column_4"), + col("column_5"), + col("column_6"), + col("transcript_start"), + col("transcript_end"), + ]) + .sort_by_exprs(vec![col("column_1"), col("column_2")], vec![false, false], false, false); let mut output_file = fs::File::create(&cli.output)?; let mut writer = CsvWriter::new(&mut output_file).has_header(false).with_delimiter(b'\t'); @@ -50,5 +66,4 @@ fn main() -> Result<(), PolarsError> { let mut expanded_bed = original_bed.collect()?; writer.finish(&mut expanded_bed) - } diff --git a/utils/search-export/src/main.rs b/utils/search-export/src/main.rs index de852456b..354643f50 100644 --- a/utils/search-export/src/main.rs +++ b/utils/search-export/src/main.rs @@ -260,7 +260,9 
@@ fn main() -> Result<()> { Groupable::EditingEvents => { sequences::editing_events::group(&path, max_count, &output)? }, - Groupable::GoFlowAnnotation => sequences::go_flow_annotations::group(&path, max_count, &output)?, + Groupable::GoFlowAnnotation => { + sequences::go_flow_annotations::group(&path, max_count, &output)? + }, }, Subcommand::Sequences { command, diff --git a/utils/search-export/src/sequences/file_joiner.rs b/utils/search-export/src/sequences/file_joiner.rs index 6903b6e1b..b60e33302 100644 --- a/utils/search-export/src/sequences/file_joiner.rs +++ b/utils/search-export/src/sequences/file_joiner.rs @@ -37,6 +37,7 @@ use super::{ editing_events::EditingEvent, feedback::Feedback, go_annotation::GoAnnotation, + go_flow_annotations::GoFlowLLMAnnotation, interacting_protein::InteractingProtein, interacting_rna::InteractingRna, litsumm::LitsummSummaries, @@ -49,7 +50,6 @@ use super::{ rfam_hit::RfamHit, so_tree, so_tree::SoMapping, - go_flow_annotations::GoFlowLLMAnnotation, }; #[derive(Debug, Error)] @@ -118,7 +118,8 @@ pub struct FileJoiner<'de> { rfam_hits: StreamDeserializer<'de, IoRead>, Grouped>, publication_counts: StreamDeserializer<'de, IoRead>, Grouped>, lit_summ: StreamDeserializer<'de, IoRead>, Grouped>, - go_flow_llm_annotations: StreamDeserializer<'de, IoRead>, Grouped>, + go_flow_llm_annotations: + StreamDeserializer<'de, IoRead>, Grouped>, editing_events: StreamDeserializer<'de, IoRead>, Grouped>, so_info: SoMapping, } @@ -348,7 +349,8 @@ impl<'de> Iterator for FileJoiner<'de> { || id1 != id15 { return Some(Err(Error::OutofSyncData(vec![ - id1, id2, id3, id4, id5, id6, id7, id8, id9, id10, id11, id12, id13, id14, id15 + id1, id2, id3, id4, id5, id6, id7, id8, id9, id10, id11, id12, id13, id14, + id15, ]))); } diff --git a/utils/search-export/src/sequences/go_flow_annotations.rs b/utils/search-export/src/sequences/go_flow_annotations.rs index 04e73b525..5213ed13d 100644 --- a/utils/search-export/src/sequences/go_flow_annotations.rs +++ 
b/utils/search-export/src/sequences/go_flow_annotations.rs @@ -28,6 +28,7 @@ impl GoFlowLLMAnnotation { pub fn should_show_goflow(&self) -> bool { self.should_show_goflow } + pub fn urs_taxid(&self) -> &str { &self.urs_taxid } diff --git a/utils/search-export/src/sequences/litsumm.rs b/utils/search-export/src/sequences/litsumm.rs index 7bbd0cada..1d80d8a4f 100644 --- a/utils/search-export/src/sequences/litsumm.rs +++ b/utils/search-export/src/sequences/litsumm.rs @@ -28,6 +28,7 @@ impl LitsummSummaries { pub fn should_show_litsumm(&self) -> bool { self.should_show_litsumm } + pub fn urs_taxid(&self) -> &str { &self.urs_taxid } diff --git a/utils/search-export/src/sequences/mod.rs b/utils/search-export/src/sequences/mod.rs index aca016443..a00fbe3d5 100644 --- a/utils/search-export/src/sequences/mod.rs +++ b/utils/search-export/src/sequences/mod.rs @@ -5,6 +5,7 @@ pub mod editing_events; pub mod feedback; pub mod file_joiner; pub mod go_annotation; +pub mod go_flow_annotations; pub mod interacting_protein; pub mod interacting_rna; pub mod litsumm; @@ -16,6 +17,5 @@ pub mod qa_status; pub mod r2dt; pub mod raw; pub mod rfam_hit; -pub mod go_flow_annotations; pub mod so_tree; pub mod writers; diff --git a/utils/search-export/src/sequences/normalized.rs b/utils/search-export/src/sequences/normalized.rs index 09f6a718e..cd57b85d2 100644 --- a/utils/search-export/src/sequences/normalized.rs +++ b/utils/search-export/src/sequences/normalized.rs @@ -28,6 +28,7 @@ use crate::sequences::{ editing_events::EditingEvent, feedback::FeedbackVec, go_annotation::GoAnnotation, + go_flow_annotations::GoFlowLLMAnnotation, interacting_protein::InteractingProtein, interacting_rna::InteractingRna, litsumm::LitsummSummaries, @@ -37,7 +38,6 @@ use crate::sequences::{ r2dt::R2dt, raw::Raw, rfam_hit::RfamHitVec, - go_flow_annotations::GoFlowLLMAnnotation, so_tree, }; From e8c00c3a9f780e69c8e11e1c1fd25be47fb5bd42 Mon Sep 17 00:00:00 2001 From: Andrew Green Date: Tue, 18 Nov 2025 09:33:34 
+0000 Subject: [PATCH 22/22] Remove an unused polars dependency --- Cargo.lock | 406 +++++-------------------------- utils/precompute/Cargo.toml | 1 - utils/precompute/src/releases.rs | 13 +- 3 files changed, 66 insertions(+), 354 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 014395f95..0cd2f47d5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -194,35 +194,6 @@ dependencies = [ "zstd", ] -[[package]] -name = "arrow2" -version = "0.18.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "963fef509b757bcbbf9e5ffa23bcb345614d99f4f6f531f97417b27b8604d389" -dependencies = [ - "ahash", - "arrow-format", - "bytemuck", - "chrono", - "dyn-clone", - "either", - "ethnum", - "foreign_vec", - "getrandom 0.2.16", - "hash_hasher", - "hashbrown 0.14.5", - "lexical-core", - "lz4", - "multiversion", - "num-traits", - "regex", - "regex-syntax 0.7.5", - "rustc_version 0.4.1", - "simdutf8", - "strength_reduce", - "zstd", -] - [[package]] name = "async-trait" version = "0.1.88" @@ -289,7 +260,7 @@ name = "bed-expander" version = "0.1.0" dependencies = [ "clap 4.5.37", - "polars 0.32.1", + "polars", ] [[package]] @@ -1685,28 +1656,12 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b1362d4a136c0ebacb40d88a37ba361738b222fd8a2ee9340a3d8642f698c52b" dependencies = [ "getrandom 0.2.16", - "polars-core 0.32.1", - "polars-io 0.32.1", - "polars-lazy 0.32.1", - "polars-ops 0.32.1", - "polars-sql 0.32.1", - "polars-time 0.32.1", - "version_check", -] - -[[package]] -name = "polars" -version = "0.33.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3030de163b9ff2c9dac9a12dcb9be25cc0f2bc7c8e7cd2e4b2592ebed458ce6a" -dependencies = [ - "getrandom 0.2.16", - "polars-core 0.33.2", - "polars-io 0.33.2", - "polars-lazy 0.33.2", - "polars-ops 0.33.2", - "polars-sql 0.33.2", - "polars-time 0.33.2", + "polars-core", + "polars-io", + "polars-lazy", + "polars-ops", + "polars-sql", + "polars-time", "version_check", 
] @@ -1716,26 +1671,11 @@ version = "0.32.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f967c901fa5da4ca7f64e813d1268488ba97e9b3004cefc579ff851c197a1138" dependencies = [ - "arrow2 0.17.4", - "hashbrown 0.14.5", - "multiversion", - "num-traits", - "polars-error 0.32.1", - "thiserror", - "version_check", -] - -[[package]] -name = "polars-arrow" -version = "0.33.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "35cd38a64fb389fd990e4efd433a36331c995c981d353bfef83b5de4d87f1828" -dependencies = [ - "arrow2 0.18.0", + "arrow2", "hashbrown 0.14.5", "multiversion", "num-traits", - "polars-error 0.33.2", + "polars-error", "thiserror", "version_check", ] @@ -1747,37 +1687,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b24f92fc5b167f668ff85ab9607dfa72e2c09664cacef59297ee8601dee60126" dependencies = [ "ahash", - "arrow2 0.17.4", - "bitflags 2.9.0", - "chrono", - "comfy-table", - "either", - "hashbrown 0.14.5", - "indexmap 2.9.0", - "num-traits", - "once_cell", - "polars-arrow 0.32.1", - "polars-error 0.32.1", - "polars-row 0.32.1", - "polars-utils 0.32.1", - "rand 0.8.5", - "rand_distr", - "rayon", - "regex", - "smartstring", - "thiserror", - "version_check", - "xxhash-rust", -] - -[[package]] -name = "polars-core" -version = "0.33.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08367c014c07fa8f141680e024f926cab3a1fe839605a8fcf2223647eb45ca71" -dependencies = [ - "ahash", - "arrow2 0.18.0", + "arrow2", "bitflags 2.9.0", "chrono", "comfy-table", @@ -1786,10 +1696,10 @@ dependencies = [ "indexmap 2.9.0", "num-traits", "once_cell", - "polars-arrow 0.33.2", - "polars-error 0.33.2", - "polars-row 0.33.2", - "polars-utils 0.33.2", + "polars-arrow", + "polars-error", + "polars-row", + "polars-utils", "rand 0.8.5", "rand_distr", "rayon", @@ -1806,18 +1716,7 @@ version = "0.32.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum 
= "40d09c3a7337e53b38c37b57999038440fa39c6801b9ba48afaecd8e16f7ac0a" dependencies = [ - "arrow2 0.17.4", - "regex", - "thiserror", -] - -[[package]] -name = "polars-error" -version = "0.33.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b20a09651a299979354945819dc2ce017964b80b916954e9d2ce39002a5f949" -dependencies = [ - "arrow2 0.18.0", + "arrow2", "regex", "thiserror", ] @@ -1829,7 +1728,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "92cab0df9f2a35702fa5aec99edfaabf9ae8e9cdd0acf69e143ad2d132f34f9c" dependencies = [ "ahash", - "arrow2 0.17.4", + "arrow2", "async-trait", "bytes", "chrono", @@ -1842,45 +1741,17 @@ dependencies = [ "memmap2", "num-traits", "once_cell", - "polars-arrow 0.32.1", - "polars-core 0.32.1", - "polars-error 0.32.1", - "polars-time 0.32.1", - "polars-utils 0.32.1", + "polars-arrow", + "polars-core", + "polars-error", + "polars-time", + "polars-utils", "rayon", "regex", "simdutf8", "tokio", ] -[[package]] -name = "polars-io" -version = "0.33.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "88cf4a89c18a90ac20dfbcdfd19ab50ad4ac5a76fc7bb775d3c28bb738cf1f34" -dependencies = [ - "ahash", - "arrow2 0.18.0", - "bytes", - "chrono", - "fast-float", - "home", - "lexical", - "lexical-core", - "memchr", - "memmap2", - "num-traits", - "once_cell", - "polars-arrow 0.33.2", - "polars-core 0.33.2", - "polars-error 0.33.2", - "polars-time 0.33.2", - "polars-utils 0.33.2", - "rayon", - "regex", - "simdutf8", -] - [[package]] name = "polars-lazy" version = "0.32.1" @@ -1891,37 +1762,14 @@ dependencies = [ "bitflags 2.9.0", "glob", "once_cell", - "polars-arrow 0.32.1", - "polars-core 0.32.1", - "polars-io 0.32.1", - "polars-ops 0.32.1", - "polars-pipe 0.32.1", - "polars-plan 0.32.1", - "polars-time 0.32.1", - "polars-utils 0.32.1", - "rayon", - "smartstring", - "version_check", -] - -[[package]] -name = "polars-lazy" -version = "0.33.2" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "5110eab438848c981cc5f541fbc5b21bb263fd707000b4715233074fb2630fcf" -dependencies = [ - "ahash", - "bitflags 2.9.0", - "glob", - "once_cell", - "polars-arrow 0.33.2", - "polars-core 0.33.2", - "polars-io 0.33.2", - "polars-ops 0.33.2", - "polars-pipe 0.33.2", - "polars-plan 0.33.2", - "polars-time 0.33.2", - "polars-utils 0.33.2", + "polars-arrow", + "polars-core", + "polars-io", + "polars-ops", + "polars-pipe", + "polars-plan", + "polars-time", + "polars-utils", "rayon", "smartstring", "version_check", @@ -1934,32 +1782,13 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e825575c96302d2daedfc205a0062180033c92c55bcd6aafc4e109d4d8849ed0" dependencies = [ "argminmax", - "arrow2 0.17.4", + "arrow2", "either", "indexmap 2.9.0", "memchr", - "polars-arrow 0.32.1", - "polars-core 0.32.1", - "polars-utils 0.32.1", - "smartstring", - "version_check", -] - -[[package]] -name = "polars-ops" -version = "0.33.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7740d7bc4c2ca08044f9ef599638e116fdd7d687e80d1974b698e390c6ce4252" -dependencies = [ - "argminmax", - "arrow2 0.18.0", - "either", - "indexmap 2.9.0", - "memchr", - "polars-arrow 0.33.2", - "polars-core 0.33.2", - "polars-utils 0.33.2", - "regex", + "polars-arrow", + "polars-core", + "polars-utils", "smartstring", "version_check", ] @@ -1975,36 +1804,13 @@ dependencies = [ "enum_dispatch", "hashbrown 0.14.5", "num-traits", - "polars-arrow 0.32.1", - "polars-core 0.32.1", - "polars-io 0.32.1", - "polars-ops 0.32.1", - "polars-plan 0.32.1", - "polars-row 0.32.1", - "polars-utils 0.32.1", - "rayon", - "smartstring", - "version_check", -] - -[[package]] -name = "polars-pipe" -version = "0.33.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1f30c5e77c5594ddc958a46fe2e021da2feba9c94e767e1d798bd82ac5a33c3b" -dependencies = [ - "crossbeam-channel", - "crossbeam-queue", - 
"enum_dispatch", - "hashbrown 0.14.5", - "num-traits", - "polars-arrow 0.33.2", - "polars-core 0.33.2", - "polars-io 0.33.2", - "polars-ops 0.33.2", - "polars-plan 0.33.2", - "polars-row 0.33.2", - "polars-utils 0.33.2", + "polars-arrow", + "polars-core", + "polars-io", + "polars-ops", + "polars-plan", + "polars-row", + "polars-utils", "rayon", "smartstring", "version_check", @@ -2017,36 +1823,14 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fb67b014f0295e8e9dbb84404a91d666d477b3bc248a2ed51bc442833b16da35" dependencies = [ "ahash", - "arrow2 0.17.4", - "once_cell", - "polars-arrow 0.32.1", - "polars-core 0.32.1", - "polars-io 0.32.1", - "polars-ops 0.32.1", - "polars-time 0.32.1", - "polars-utils 0.32.1", - "rayon", - "regex", - "smartstring", - "strum_macros 0.25.3", - "version_check", -] - -[[package]] -name = "polars-plan" -version = "0.33.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "678cbeb730e29e50f0f8d844102d15454fc6113a74c667eab046c0e4a4322a9e" -dependencies = [ - "ahash", - "arrow2 0.18.0", + "arrow2", "once_cell", - "polars-arrow 0.33.2", - "polars-core 0.33.2", - "polars-io 0.33.2", - "polars-ops 0.33.2", - "polars-time 0.33.2", - "polars-utils 0.33.2", + "polars-arrow", + "polars-core", + "polars-io", + "polars-ops", + "polars-time", + "polars-utils", "rayon", "regex", "smartstring", @@ -2060,20 +1844,9 @@ version = "0.32.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "27f54c1956027bf6301948fb4f2837cf6d6b638d8dd1edf3aaeaa19906a986be" dependencies = [ - "arrow2 0.17.4", - "polars-error 0.32.1", - "polars-utils 0.32.1", -] - -[[package]] -name = "polars-row" -version = "0.33.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c52ef8885b9d13f848839594fbab21ad79fc63f7e11c19cdc2cfe9bb03c313ac" -dependencies = [ - "arrow2 0.18.0", - "polars-error 0.33.2", - "polars-utils 0.33.2", + "arrow2", + "polars-error", + "polars-utils", ] 
[[package]] @@ -2082,25 +1855,10 @@ version = "0.32.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dbfcb15cf8eebd25ea1724109d0153817cd484c6326290585f0736b4e7fcf2f4" dependencies = [ - "polars-arrow 0.32.1", - "polars-core 0.32.1", - "polars-lazy 0.32.1", - "polars-plan 0.32.1", - "serde", - "serde_json", - "sqlparser", -] - -[[package]] -name = "polars-sql" -version = "0.33.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4d716855267e3516f722287f68cf10e650e33f7197df83a79e680602471456fc" -dependencies = [ - "polars-arrow 0.33.2", - "polars-core 0.33.2", - "polars-lazy 0.33.2", - "polars-plan 0.33.2", + "polars-arrow", + "polars-core", + "polars-lazy", + "polars-plan", "serde", "serde_json", "sqlparser", @@ -2112,34 +1870,15 @@ version = "0.32.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "53f42d2632f5971c9575041d33cbcfb1f996900c40bbf58bc6eb0a0c5efbecea" dependencies = [ - "arrow2 0.17.4", + "arrow2", "atoi", "chrono", "now", "once_cell", - "polars-arrow 0.32.1", - "polars-core 0.32.1", - "polars-ops 0.32.1", - "polars-utils 0.32.1", - "regex", - "smartstring", -] - -[[package]] -name = "polars-time" -version = "0.33.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a2eb75a24f11b55a400b52dc19a2a3e949aaaa46a911f99496de4485b1127063" -dependencies = [ - "arrow2 0.18.0", - "atoi", - "chrono", - "now", - "once_cell", - "polars-arrow 0.33.2", - "polars-core 0.33.2", - "polars-ops 0.33.2", - "polars-utils 0.33.2", + "polars-arrow", + "polars-core", + "polars-ops", + "polars-utils", "regex", "smartstring", ] @@ -2154,25 +1893,7 @@ dependencies = [ "hashbrown 0.14.5", "num-traits", "once_cell", - "polars-error 0.32.1", - "rayon", - "smartstring", - "sysinfo", - "version_check", -] - -[[package]] -name = "polars-utils" -version = "0.33.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"2a4a5e743509096322cad39104d56e329fe2748483a3354a0f0c354724f3cef6" -dependencies = [ - "ahash", - "bytemuck", - "hashbrown 0.14.5", - "num-traits", - "once_cell", - "polars-error 0.33.2", + "polars-error", "rayon", "smartstring", "sysinfo", @@ -2196,7 +1917,6 @@ dependencies = [ "csv", "itertools 0.10.5", "log", - "polars 0.33.2", "rand 0.8.5", "rnc-core", "rnc-utils", @@ -2571,12 +2291,6 @@ version = "0.6.29" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1" -[[package]] -name = "regex-syntax" -version = "0.7.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dbb5fb1acd8a1a18b3dd5be62d25485eb770e05afb408a9627d14d451bae12da" - [[package]] name = "regex-syntax" version = "0.8.5" diff --git a/utils/precompute/Cargo.toml b/utils/precompute/Cargo.toml index a7fbdd22e..36ae91b23 100644 --- a/utils/precompute/Cargo.toml +++ b/utils/precompute/Cargo.toml @@ -20,7 +20,6 @@ sorted-iter = "0.1.7" structopt = "0.3" strum = "0.21" strum_macros = "0.21" -polars = { version = "0.33.2", features = ["lazy", "streaming"] } [dev-dependencies] rand = "0.8" diff --git a/utils/precompute/src/releases.rs b/utils/precompute/src/releases.rs index 2e80d23bd..5f55755db 100644 --- a/utils/precompute/src/releases.rs +++ b/utils/precompute/src/releases.rs @@ -26,7 +26,6 @@ use anyhow::{ Result, }; -use polars::prelude::*; #[derive(Serialize, Deserialize, Debug)] pub struct UrsEntry { @@ -121,7 +120,6 @@ pub fn select_new(xrefs: &Path, known: &Path, output: &Path, streaming: bool) -> // .agg([col("last").max().alias("last"), col("id").first().alias("id")]) // .sort("id", Default::default()); - // let known_records: LazyFrame = LazyCsvReader::new(known_path) // .has_header(false) // .low_memory(streaming) @@ -132,7 +130,6 @@ pub fn select_new(xrefs: &Path, known: &Path, output: &Path, streaming: bool) -> // .agg([col("last").max().alias("last"), 
col("id").first().alias("id")]) // .sort("id", Default::default()); - // let selection: LazyFrame = xref_records // .join( // known_records, @@ -149,10 +146,12 @@ pub fn select_new(xrefs: &Path, known: &Path, output: &Path, streaming: bool) -> // let check: LazyFrame = selection.clone(); - // // // check we are not in a catastrophic error state - precompute should never be newer than - // // // xref - // let selected_urs = selection.filter(col("selected").eq(true)).with_streaming(streaming).collect()?; - // let error_urs = check.filter(col("error_state").eq(true)).with_streaming(streaming).collect()?; + // // // check we are not in a catastrophic error state - precompute should never be newer + // // // than xref + // let selected_urs = + // selection.filter(col("selected").eq(true)).with_streaming(streaming).collect()?; + // let error_urs = + // check.filter(col("error_state").eq(true)).with_streaming(streaming).collect()?; // if error_urs.height() > 0 { // return Err(anyhow!("Precompute newer than xref for these UPIs: {:?}", error_urs)); // }