diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index d7b62a797..72e33e87c 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -16,12 +16,12 @@ repos: - id: isort args: ["--profile", "black", "--filter-files"] name: isort (python) -- repo: https://github.com/doublify/pre-commit-rust - rev: v1.0 - hooks: - - id: fmt - - id: cargo-check - - id: clippy +# - repo: https://github.com/doublify/pre-commit-rust +# rev: v1.0 +# hooks: +# - id: fmt +# - id: cargo-check +# - id: clippy # - repo: https://github.com/python-poetry/poetry # rev: '1.2.2' # hooks: diff --git a/Cargo.lock b/Cargo.lock index 014395f95..0cd2f47d5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -194,35 +194,6 @@ dependencies = [ "zstd", ] -[[package]] -name = "arrow2" -version = "0.18.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "963fef509b757bcbbf9e5ffa23bcb345614d99f4f6f531f97417b27b8604d389" -dependencies = [ - "ahash", - "arrow-format", - "bytemuck", - "chrono", - "dyn-clone", - "either", - "ethnum", - "foreign_vec", - "getrandom 0.2.16", - "hash_hasher", - "hashbrown 0.14.5", - "lexical-core", - "lz4", - "multiversion", - "num-traits", - "regex", - "regex-syntax 0.7.5", - "rustc_version 0.4.1", - "simdutf8", - "strength_reduce", - "zstd", -] - [[package]] name = "async-trait" version = "0.1.88" @@ -289,7 +260,7 @@ name = "bed-expander" version = "0.1.0" dependencies = [ "clap 4.5.37", - "polars 0.32.1", + "polars", ] [[package]] @@ -1685,28 +1656,12 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b1362d4a136c0ebacb40d88a37ba361738b222fd8a2ee9340a3d8642f698c52b" dependencies = [ "getrandom 0.2.16", - "polars-core 0.32.1", - "polars-io 0.32.1", - "polars-lazy 0.32.1", - "polars-ops 0.32.1", - "polars-sql 0.32.1", - "polars-time 0.32.1", - "version_check", -] - -[[package]] -name = "polars" -version = "0.33.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"3030de163b9ff2c9dac9a12dcb9be25cc0f2bc7c8e7cd2e4b2592ebed458ce6a" -dependencies = [ - "getrandom 0.2.16", - "polars-core 0.33.2", - "polars-io 0.33.2", - "polars-lazy 0.33.2", - "polars-ops 0.33.2", - "polars-sql 0.33.2", - "polars-time 0.33.2", + "polars-core", + "polars-io", + "polars-lazy", + "polars-ops", + "polars-sql", + "polars-time", "version_check", ] @@ -1716,26 +1671,11 @@ version = "0.32.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f967c901fa5da4ca7f64e813d1268488ba97e9b3004cefc579ff851c197a1138" dependencies = [ - "arrow2 0.17.4", - "hashbrown 0.14.5", - "multiversion", - "num-traits", - "polars-error 0.32.1", - "thiserror", - "version_check", -] - -[[package]] -name = "polars-arrow" -version = "0.33.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "35cd38a64fb389fd990e4efd433a36331c995c981d353bfef83b5de4d87f1828" -dependencies = [ - "arrow2 0.18.0", + "arrow2", "hashbrown 0.14.5", "multiversion", "num-traits", - "polars-error 0.33.2", + "polars-error", "thiserror", "version_check", ] @@ -1747,37 +1687,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b24f92fc5b167f668ff85ab9607dfa72e2c09664cacef59297ee8601dee60126" dependencies = [ "ahash", - "arrow2 0.17.4", - "bitflags 2.9.0", - "chrono", - "comfy-table", - "either", - "hashbrown 0.14.5", - "indexmap 2.9.0", - "num-traits", - "once_cell", - "polars-arrow 0.32.1", - "polars-error 0.32.1", - "polars-row 0.32.1", - "polars-utils 0.32.1", - "rand 0.8.5", - "rand_distr", - "rayon", - "regex", - "smartstring", - "thiserror", - "version_check", - "xxhash-rust", -] - -[[package]] -name = "polars-core" -version = "0.33.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08367c014c07fa8f141680e024f926cab3a1fe839605a8fcf2223647eb45ca71" -dependencies = [ - "ahash", - "arrow2 0.18.0", + "arrow2", "bitflags 2.9.0", "chrono", "comfy-table", @@ -1786,10 +1696,10 @@ dependencies = [ 
"indexmap 2.9.0", "num-traits", "once_cell", - "polars-arrow 0.33.2", - "polars-error 0.33.2", - "polars-row 0.33.2", - "polars-utils 0.33.2", + "polars-arrow", + "polars-error", + "polars-row", + "polars-utils", "rand 0.8.5", "rand_distr", "rayon", @@ -1806,18 +1716,7 @@ version = "0.32.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "40d09c3a7337e53b38c37b57999038440fa39c6801b9ba48afaecd8e16f7ac0a" dependencies = [ - "arrow2 0.17.4", - "regex", - "thiserror", -] - -[[package]] -name = "polars-error" -version = "0.33.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b20a09651a299979354945819dc2ce017964b80b916954e9d2ce39002a5f949" -dependencies = [ - "arrow2 0.18.0", + "arrow2", "regex", "thiserror", ] @@ -1829,7 +1728,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "92cab0df9f2a35702fa5aec99edfaabf9ae8e9cdd0acf69e143ad2d132f34f9c" dependencies = [ "ahash", - "arrow2 0.17.4", + "arrow2", "async-trait", "bytes", "chrono", @@ -1842,45 +1741,17 @@ dependencies = [ "memmap2", "num-traits", "once_cell", - "polars-arrow 0.32.1", - "polars-core 0.32.1", - "polars-error 0.32.1", - "polars-time 0.32.1", - "polars-utils 0.32.1", + "polars-arrow", + "polars-core", + "polars-error", + "polars-time", + "polars-utils", "rayon", "regex", "simdutf8", "tokio", ] -[[package]] -name = "polars-io" -version = "0.33.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "88cf4a89c18a90ac20dfbcdfd19ab50ad4ac5a76fc7bb775d3c28bb738cf1f34" -dependencies = [ - "ahash", - "arrow2 0.18.0", - "bytes", - "chrono", - "fast-float", - "home", - "lexical", - "lexical-core", - "memchr", - "memmap2", - "num-traits", - "once_cell", - "polars-arrow 0.33.2", - "polars-core 0.33.2", - "polars-error 0.33.2", - "polars-time 0.33.2", - "polars-utils 0.33.2", - "rayon", - "regex", - "simdutf8", -] - [[package]] name = "polars-lazy" version = "0.32.1" @@ -1891,37 +1762,14 @@ dependencies = 
[ "bitflags 2.9.0", "glob", "once_cell", - "polars-arrow 0.32.1", - "polars-core 0.32.1", - "polars-io 0.32.1", - "polars-ops 0.32.1", - "polars-pipe 0.32.1", - "polars-plan 0.32.1", - "polars-time 0.32.1", - "polars-utils 0.32.1", - "rayon", - "smartstring", - "version_check", -] - -[[package]] -name = "polars-lazy" -version = "0.33.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5110eab438848c981cc5f541fbc5b21bb263fd707000b4715233074fb2630fcf" -dependencies = [ - "ahash", - "bitflags 2.9.0", - "glob", - "once_cell", - "polars-arrow 0.33.2", - "polars-core 0.33.2", - "polars-io 0.33.2", - "polars-ops 0.33.2", - "polars-pipe 0.33.2", - "polars-plan 0.33.2", - "polars-time 0.33.2", - "polars-utils 0.33.2", + "polars-arrow", + "polars-core", + "polars-io", + "polars-ops", + "polars-pipe", + "polars-plan", + "polars-time", + "polars-utils", "rayon", "smartstring", "version_check", @@ -1934,32 +1782,13 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e825575c96302d2daedfc205a0062180033c92c55bcd6aafc4e109d4d8849ed0" dependencies = [ "argminmax", - "arrow2 0.17.4", + "arrow2", "either", "indexmap 2.9.0", "memchr", - "polars-arrow 0.32.1", - "polars-core 0.32.1", - "polars-utils 0.32.1", - "smartstring", - "version_check", -] - -[[package]] -name = "polars-ops" -version = "0.33.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7740d7bc4c2ca08044f9ef599638e116fdd7d687e80d1974b698e390c6ce4252" -dependencies = [ - "argminmax", - "arrow2 0.18.0", - "either", - "indexmap 2.9.0", - "memchr", - "polars-arrow 0.33.2", - "polars-core 0.33.2", - "polars-utils 0.33.2", - "regex", + "polars-arrow", + "polars-core", + "polars-utils", "smartstring", "version_check", ] @@ -1975,36 +1804,13 @@ dependencies = [ "enum_dispatch", "hashbrown 0.14.5", "num-traits", - "polars-arrow 0.32.1", - "polars-core 0.32.1", - "polars-io 0.32.1", - "polars-ops 0.32.1", - "polars-plan 0.32.1", - "polars-row 
0.32.1", - "polars-utils 0.32.1", - "rayon", - "smartstring", - "version_check", -] - -[[package]] -name = "polars-pipe" -version = "0.33.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1f30c5e77c5594ddc958a46fe2e021da2feba9c94e767e1d798bd82ac5a33c3b" -dependencies = [ - "crossbeam-channel", - "crossbeam-queue", - "enum_dispatch", - "hashbrown 0.14.5", - "num-traits", - "polars-arrow 0.33.2", - "polars-core 0.33.2", - "polars-io 0.33.2", - "polars-ops 0.33.2", - "polars-plan 0.33.2", - "polars-row 0.33.2", - "polars-utils 0.33.2", + "polars-arrow", + "polars-core", + "polars-io", + "polars-ops", + "polars-plan", + "polars-row", + "polars-utils", "rayon", "smartstring", "version_check", @@ -2017,36 +1823,14 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fb67b014f0295e8e9dbb84404a91d666d477b3bc248a2ed51bc442833b16da35" dependencies = [ "ahash", - "arrow2 0.17.4", - "once_cell", - "polars-arrow 0.32.1", - "polars-core 0.32.1", - "polars-io 0.32.1", - "polars-ops 0.32.1", - "polars-time 0.32.1", - "polars-utils 0.32.1", - "rayon", - "regex", - "smartstring", - "strum_macros 0.25.3", - "version_check", -] - -[[package]] -name = "polars-plan" -version = "0.33.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "678cbeb730e29e50f0f8d844102d15454fc6113a74c667eab046c0e4a4322a9e" -dependencies = [ - "ahash", - "arrow2 0.18.0", + "arrow2", "once_cell", - "polars-arrow 0.33.2", - "polars-core 0.33.2", - "polars-io 0.33.2", - "polars-ops 0.33.2", - "polars-time 0.33.2", - "polars-utils 0.33.2", + "polars-arrow", + "polars-core", + "polars-io", + "polars-ops", + "polars-time", + "polars-utils", "rayon", "regex", "smartstring", @@ -2060,20 +1844,9 @@ version = "0.32.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "27f54c1956027bf6301948fb4f2837cf6d6b638d8dd1edf3aaeaa19906a986be" dependencies = [ - "arrow2 0.17.4", - "polars-error 0.32.1", - "polars-utils 
0.32.1", -] - -[[package]] -name = "polars-row" -version = "0.33.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c52ef8885b9d13f848839594fbab21ad79fc63f7e11c19cdc2cfe9bb03c313ac" -dependencies = [ - "arrow2 0.18.0", - "polars-error 0.33.2", - "polars-utils 0.33.2", + "arrow2", + "polars-error", + "polars-utils", ] [[package]] @@ -2082,25 +1855,10 @@ version = "0.32.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dbfcb15cf8eebd25ea1724109d0153817cd484c6326290585f0736b4e7fcf2f4" dependencies = [ - "polars-arrow 0.32.1", - "polars-core 0.32.1", - "polars-lazy 0.32.1", - "polars-plan 0.32.1", - "serde", - "serde_json", - "sqlparser", -] - -[[package]] -name = "polars-sql" -version = "0.33.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4d716855267e3516f722287f68cf10e650e33f7197df83a79e680602471456fc" -dependencies = [ - "polars-arrow 0.33.2", - "polars-core 0.33.2", - "polars-lazy 0.33.2", - "polars-plan 0.33.2", + "polars-arrow", + "polars-core", + "polars-lazy", + "polars-plan", "serde", "serde_json", "sqlparser", @@ -2112,34 +1870,15 @@ version = "0.32.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "53f42d2632f5971c9575041d33cbcfb1f996900c40bbf58bc6eb0a0c5efbecea" dependencies = [ - "arrow2 0.17.4", + "arrow2", "atoi", "chrono", "now", "once_cell", - "polars-arrow 0.32.1", - "polars-core 0.32.1", - "polars-ops 0.32.1", - "polars-utils 0.32.1", - "regex", - "smartstring", -] - -[[package]] -name = "polars-time" -version = "0.33.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a2eb75a24f11b55a400b52dc19a2a3e949aaaa46a911f99496de4485b1127063" -dependencies = [ - "arrow2 0.18.0", - "atoi", - "chrono", - "now", - "once_cell", - "polars-arrow 0.33.2", - "polars-core 0.33.2", - "polars-ops 0.33.2", - "polars-utils 0.33.2", + "polars-arrow", + "polars-core", + "polars-ops", + "polars-utils", "regex", 
"smartstring", ] @@ -2154,25 +1893,7 @@ dependencies = [ "hashbrown 0.14.5", "num-traits", "once_cell", - "polars-error 0.32.1", - "rayon", - "smartstring", - "sysinfo", - "version_check", -] - -[[package]] -name = "polars-utils" -version = "0.33.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2a4a5e743509096322cad39104d56e329fe2748483a3354a0f0c354724f3cef6" -dependencies = [ - "ahash", - "bytemuck", - "hashbrown 0.14.5", - "num-traits", - "once_cell", - "polars-error 0.33.2", + "polars-error", "rayon", "smartstring", "sysinfo", @@ -2196,7 +1917,6 @@ dependencies = [ "csv", "itertools 0.10.5", "log", - "polars 0.33.2", "rand 0.8.5", "rnc-core", "rnc-utils", @@ -2571,12 +2291,6 @@ version = "0.6.29" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1" -[[package]] -name = "regex-syntax" -version = "0.7.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dbb5fb1acd8a1a18b3dd5be62d25485eb770e05afb408a9627d14d451bae12da" - [[package]] name = "regex-syntax" version = "0.8.5" diff --git a/files/search-export/parts/goflow.sql b/files/search-export/parts/goflow.sql new file mode 100644 index 000000000..9999d8fd4 --- /dev/null +++ b/files/search-export/parts/goflow.sql @@ -0,0 +1,13 @@ +COPY ( + SELECT + json_build_object( + 'id', todo.id, + 'urs_taxid', todo.urs_taxid, + 'should_show_goflow', true + ) + FROM search_export_urs todo + JOIN go_flow_llm_curation_results gfllm + ON + todo.urs_taxid = gfllm.urs_taxid + ORDER by todo.id +) TO STDOUT diff --git a/rnacentral_pipeline/databases/ensembl/genomes/urls.py b/rnacentral_pipeline/databases/ensembl/genomes/urls.py index 202ac5209..fe034337b 100644 --- a/rnacentral_pipeline/databases/ensembl/genomes/urls.py +++ b/rnacentral_pipeline/databases/ensembl/genomes/urls.py @@ -30,8 +30,9 @@ def latest_release(ftp: FTP) -> str: readme_lines = [] ftp.retrlines("RETR current_README", 
readme_lines.append) cur_readme = "\n".join(readme_lines) - pattern = r"Ensembl Release (\d+) Databases." - match = re.search(pattern, cur_readme) + pattern = r"Ensembl Release (\d+) Databases\." + match = re.search(pattern, cur_readme, re.IGNORECASE) + if not match: raise ValueError("Could not find release number in README") release = match.group(1) diff --git a/rnacentral_pipeline/databases/ensembl/vertebrates/urls.py b/rnacentral_pipeline/databases/ensembl/vertebrates/urls.py index e338038f2..a1b3eb7fc 100644 --- a/rnacentral_pipeline/databases/ensembl/vertebrates/urls.py +++ b/rnacentral_pipeline/databases/ensembl/vertebrates/urls.py @@ -29,8 +29,9 @@ def latest_release(ftp: FTP) -> str: readme_lines = [] ftp.retrlines("RETR current_README", readme_lines.append) cur_readme = "\n".join(readme_lines) - pattern = r"Ensembl Release (\d+) Databases." - match = re.search(pattern, cur_readme) + pattern = r"Ensembl Release (\d+) Databases\." + match = re.search(pattern, cur_readme, re.IGNORECASE) + if not match: raise ValueError("Could not determine latest Ensembl release from README") release = match.group(1) diff --git a/rnacentral_pipeline/rnacentral/r2dt/__init__.py b/rnacentral_pipeline/rnacentral/r2dt/__init__.py index 190f00c98..3d74880b3 100644 --- a/rnacentral_pipeline/rnacentral/r2dt/__init__.py +++ b/rnacentral_pipeline/rnacentral/r2dt/__init__.py @@ -159,9 +159,7 @@ def prepare_sequences(xref_urs, tracked_urs, urs_to_fetch, max_sequences): .rename({"column_1": "urs"}) ) - raw_tracked = pl.scan_csv( - tracked_urs.name, low_memory=True - ).unique() + raw_tracked = pl.scan_csv(tracked_urs.name, low_memory=True).unique() to_fetch = raw_xref.join(raw_tracked, on="urs", how="anti") diff --git a/rnacentral_pipeline/rnacentral/r2dt/data.py b/rnacentral_pipeline/rnacentral/r2dt/data.py index dcb47486d..ed204163c 100644 --- a/rnacentral_pipeline/rnacentral/r2dt/data.py +++ b/rnacentral_pipeline/rnacentral/r2dt/data.py @@ -409,11 +409,15 @@ def dot_bracket(self): 
seq_dot = str(record.seq) ## Use indices instead, assert that the string is even length ## If not, then the two parts are not the same length - assert len(seq_dot) % 2 == 0, f"Odd length sequence {len(seq_dot)}" + if len(seq_dot) % 2 != 0: + raise ValueError(f"Odd length sequence {len(seq_dot)}") seq_dot_len = len(seq_dot) sequence = seq_dot[0 : seq_dot_len // 2] dot_bracket = seq_dot[(seq_dot_len // 2) :] - assert len(sequence) == len(dot_bracket) + if len(sequence) != len(dot_bracket): + raise ValueError( + f"Sequence and dot bracket lengths do not match: {len(sequence)} != {len(dot_bracket)}" + ) return dot_bracket def basepair_count(self): diff --git a/rnacentral_pipeline/rnacentral/search_export/data.py b/rnacentral_pipeline/rnacentral/search_export/data.py index bfad7c2be..b68e07fc7 100644 --- a/rnacentral_pipeline/rnacentral/search_export/data.py +++ b/rnacentral_pipeline/rnacentral/search_export/data.py @@ -709,6 +709,10 @@ def has_litsumm(litsumm): return str(bool(litsumm)) +def has_go_flow_llm_annotation(go_flow): + return str(bool(go_flow)) + + def has_editing_event(editing_events): return str(bool(editing_events)) @@ -881,6 +885,11 @@ def edit_ref_to_edit(editing_events): edit_repeat_type, keys="editing_events", ), + field( + "has_go_flow_llm_annotation", + has_go_flow_llm_annotation, + keys="go_flow_llm_annotations", + ), ## Add new fields above this line! Otherwise editing the produced xml is hard. 
tree("so_rna_type", so_rna_type_tree, key="so_rna_type_tree"), ], diff --git a/utils/bed-expander/src/main.rs b/utils/bed-expander/src/main.rs index d5d7d7025..329a5fe52 100644 --- a/utils/bed-expander/src/main.rs +++ b/utils/bed-expander/src/main.rs @@ -1,8 +1,13 @@ use clap::Parser; -use polars::lazy::dsl::col; +use polars::{ + datatypes::DataType::{ + Int64, + List, + }, + lazy::dsl::col, + prelude::*, +}; use std::fs; -use polars::datatypes::DataType::{Int64, List}; -use polars::prelude::*; #[derive(Parser, Debug)] #[clap(author = "Andrew Green", version, about)] @@ -30,19 +35,30 @@ fn main() -> Result<(), PolarsError> { // col 11: exon sizes // col 12: exon starts - // Stay lazy as long as possible to minimise memory use let original_bed = original_bed .with_columns([col("column_11").str().split(","), col("column_12").str().split(",")]) - .with_columns([col("column_11").cast(List(Box::new(Int64)))] ) + .with_columns([col("column_11").cast(List(Box::new(Int64)))]) .with_column(col("column_11").list().sum().alias("column_5")) .explode([col("column_11"), col("column_12")]) .with_columns([col("column_11").cast(Int64), col("column_12").cast(Int64)]) .with_column(col("column_2").alias("transcript_start")) .with_column(col("column_3").alias("transcript_end")) - .with_columns([(col("column_2") + col("column_12")).alias("column_2"), (col("column_2") + col("column_12") + col("column_11")).alias("column_3")] ) - .select([col("column_1"), col("column_2"), col("column_3"), col("column_4"), col("column_5"), col("column_6"), col("transcript_start"), col("transcript_end")]) - .sort_by_exprs(vec![col("column_1"), col("column_2")], vec![false,false], false, false); + .with_columns([ + (col("column_2") + col("column_12")).alias("column_2"), + (col("column_2") + col("column_12") + col("column_11")).alias("column_3"), + ]) + .select([ + col("column_1"), + col("column_2"), + col("column_3"), + col("column_4"), + col("column_5"), + col("column_6"), + col("transcript_start"), + 
col("transcript_end"), + ]) + .sort_by_exprs(vec![col("column_1"), col("column_2")], vec![false, false], false, false); let mut output_file = fs::File::create(&cli.output)?; let mut writer = CsvWriter::new(&mut output_file).has_header(false).with_delimiter(b'\t'); @@ -50,5 +66,4 @@ fn main() -> Result<(), PolarsError> { let mut expanded_bed = original_bed.collect()?; writer.finish(&mut expanded_bed) - } diff --git a/utils/precompute/Cargo.toml b/utils/precompute/Cargo.toml index a7fbdd22e..36ae91b23 100644 --- a/utils/precompute/Cargo.toml +++ b/utils/precompute/Cargo.toml @@ -20,7 +20,6 @@ sorted-iter = "0.1.7" structopt = "0.3" strum = "0.21" strum_macros = "0.21" -polars = { version = "0.33.2", features = ["lazy", "streaming"] } [dev-dependencies] rand = "0.8" diff --git a/utils/precompute/src/releases.rs b/utils/precompute/src/releases.rs index 2e80d23bd..5f55755db 100644 --- a/utils/precompute/src/releases.rs +++ b/utils/precompute/src/releases.rs @@ -26,7 +26,6 @@ use anyhow::{ Result, }; -use polars::prelude::*; #[derive(Serialize, Deserialize, Debug)] pub struct UrsEntry { @@ -121,7 +120,6 @@ pub fn select_new(xrefs: &Path, known: &Path, output: &Path, streaming: bool) -> // .agg([col("last").max().alias("last"), col("id").first().alias("id")]) // .sort("id", Default::default()); - // let known_records: LazyFrame = LazyCsvReader::new(known_path) // .has_header(false) // .low_memory(streaming) @@ -132,7 +130,6 @@ pub fn select_new(xrefs: &Path, known: &Path, output: &Path, streaming: bool) -> // .agg([col("last").max().alias("last"), col("id").first().alias("id")]) // .sort("id", Default::default()); - // let selection: LazyFrame = xref_records // .join( // known_records, @@ -149,10 +146,12 @@ pub fn select_new(xrefs: &Path, known: &Path, output: &Path, streaming: bool) -> // let check: LazyFrame = selection.clone(); - // // // check we are not in a catastrophic error state - precompute should never be newer than - // // // xref - // let selected_urs = 
selection.filter(col("selected").eq(true)).with_streaming(streaming).collect()?; - // let error_urs = check.filter(col("error_state").eq(true)).with_streaming(streaming).collect()?; + // // // check we are not in a catastrophic error state - precompute should never be + // // // newer than xref + // let selected_urs = + // selection.filter(col("selected").eq(true)).with_streaming(streaming).collect()?; + // let error_urs = + // check.filter(col("error_state").eq(true)).with_streaming(streaming).collect()?; // if error_urs.height() > 0 { // return Err(anyhow!("Precompute newer than xref for these UPIs: {:?}", error_urs)); // } diff --git a/utils/search-export/src/main.rs b/utils/search-export/src/main.rs index ad99f67e8..354643f50 100644 --- a/utils/search-export/src/main.rs +++ b/utils/search-export/src/main.rs @@ -33,6 +33,7 @@ pub enum Groupable { SoInfo, LitsummSummaries, EditingEvents, + GoFlowAnnotation, } #[derive(Debug, StructOpt)] @@ -140,6 +141,10 @@ enum SequenceCommand { /// RNA editing events editing_events: PathBuf, + #[structopt(parse(from_os_str))] + /// GoFlowLLM annotations + go_flow_llm_annotations: PathBuf, + // Add new arguments above this line! #[structopt(parse(from_os_str))] /// Filename to write the results to, '-' means stdout @@ -255,6 +260,9 @@ fn main() -> Result<()> { Groupable::EditingEvents => { sequences::editing_events::group(&path, max_count, &output)? }, + Groupable::GoFlowAnnotation => { + sequences::go_flow_annotations::group(&path, max_count, &output)? + }, }, Subcommand::Sequences { command, @@ -275,6 +283,8 @@ fn main() -> Result<()> { litsumm_summaries, editing_events, so_term_tree, + go_flow_llm_annotations, + // Add new arguments above this line! 
output, } => sequences::writers::write_merge( vec![ @@ -293,6 +303,7 @@ fn main() -> Result<()> { editing_events, orfs, so_term_tree, + go_flow_llm_annotations, ], &output, )?, diff --git a/utils/search-export/src/sequences/file_joiner.rs b/utils/search-export/src/sequences/file_joiner.rs index c8a50973b..b60e33302 100644 --- a/utils/search-export/src/sequences/file_joiner.rs +++ b/utils/search-export/src/sequences/file_joiner.rs @@ -37,6 +37,7 @@ use super::{ editing_events::EditingEvent, feedback::Feedback, go_annotation::GoAnnotation, + go_flow_annotations::GoFlowLLMAnnotation, interacting_protein::InteractingProtein, interacting_rna::InteractingRna, litsumm::LitsummSummaries, @@ -98,6 +99,7 @@ pub enum FileTypes { PublicationCount, LitsummSummaries, EditingEvents, + GoFlowLLMAnnotations, SoTermTree, } @@ -116,6 +118,8 @@ pub struct FileJoiner<'de> { rfam_hits: StreamDeserializer<'de, IoRead>, Grouped>, publication_counts: StreamDeserializer<'de, IoRead>, Grouped>, lit_summ: StreamDeserializer<'de, IoRead>, Grouped>, + go_flow_llm_annotations: + StreamDeserializer<'de, IoRead>, Grouped>, editing_events: StreamDeserializer<'de, IoRead>, Grouped>, so_info: SoMapping, } @@ -203,6 +207,7 @@ impl FileJoinerBuilder { let publication_counts = self.iterator_for(FileTypes::PublicationCount)?; let lit_summ = self.iterator_for(FileTypes::LitsummSummaries)?; let editing_events = self.iterator_for(FileTypes::EditingEvents)?; + let go_flow_llm_annotations = self.iterator_for(FileTypes::GoFlowLLMAnnotations)?; let so_info = so_tree::load(self.path_for(FileTypes::SoTermTree)?)?; Ok(FileJoiner { @@ -220,6 +225,7 @@ impl FileJoinerBuilder { publication_counts, lit_summ, editing_events, + go_flow_llm_annotations, so_info, }) } @@ -244,6 +250,7 @@ impl<'de> Iterator for FileJoiner<'de> { self.publication_counts.next(), self.lit_summ.next(), self.editing_events.next(), + self.go_flow_llm_annotations.next(), ); match current { @@ -262,6 +269,7 @@ impl<'de> Iterator for 
FileJoiner<'de> { None, None, None, + None, ) => None, ( Some(Ok(Required { @@ -320,6 +328,10 @@ impl<'de> Iterator for FileJoiner<'de> { id: id14, data: editing_events, })), + Some(Ok(Multiple { + id: id15, + data: goflow_llm_annotations, + })), ) => { if id1 != id2 || id1 != id3 @@ -334,9 +346,11 @@ impl<'de> Iterator for FileJoiner<'de> { || id1 != id12 || id1 != id13 || id1 != id14 + || id1 != id15 { return Some(Err(Error::OutofSyncData(vec![ id1, id2, id3, id4, id5, id6, id7, id8, id9, id10, id11, id12, id13, id14, + id15, ]))); } @@ -362,6 +376,7 @@ impl<'de> Iterator for FileJoiner<'de> { .publication_counts(publication_counts) .litsumm_summaries(lit_summ) .editing_events(editing_events) + .go_flow_llm_annotations(goflow_llm_annotations) .so_tree(so_tree) .build(); diff --git a/utils/search-export/src/sequences/go_flow_annotations.rs b/utils/search-export/src/sequences/go_flow_annotations.rs new file mode 100644 index 000000000..5213ed13d --- /dev/null +++ b/utils/search-export/src/sequences/go_flow_annotations.rs @@ -0,0 +1,35 @@ +use serde::{ + Deserialize, + Serialize, +}; +use std::path::Path; + +use anyhow::Result; +use rnc_core::grouper; + +#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] +pub struct GoFlowLLMAnnotation { + pub id: usize, + urs_taxid: String, + should_show_goflow: bool, +} + +impl grouper::HasIndex for GoFlowLLMAnnotation { + fn index(&self) -> usize { + self.id + } +} + +pub fn group(path: &Path, max: usize, output: &Path) -> Result<()> { + grouper::group::(grouper::Criteria::AnyNumber, &path, 1, max, &output) +} + +impl GoFlowLLMAnnotation { + pub fn should_show_goflow(&self) -> bool { + self.should_show_goflow + } + + pub fn urs_taxid(&self) -> &str { + &self.urs_taxid + } +} diff --git a/utils/search-export/src/sequences/litsumm.rs b/utils/search-export/src/sequences/litsumm.rs index 7bbd0cada..1d80d8a4f 100644 --- a/utils/search-export/src/sequences/litsumm.rs +++ b/utils/search-export/src/sequences/litsumm.rs @@ 
-28,6 +28,7 @@ impl LitsummSummaries { pub fn should_show_litsumm(&self) -> bool { self.should_show_litsumm } + pub fn urs_taxid(&self) -> &str { &self.urs_taxid } diff --git a/utils/search-export/src/sequences/mod.rs b/utils/search-export/src/sequences/mod.rs index b3febfd50..a00fbe3d5 100644 --- a/utils/search-export/src/sequences/mod.rs +++ b/utils/search-export/src/sequences/mod.rs @@ -5,6 +5,7 @@ pub mod editing_events; pub mod feedback; pub mod file_joiner; pub mod go_annotation; +pub mod go_flow_annotations; pub mod interacting_protein; pub mod interacting_rna; pub mod litsumm; diff --git a/utils/search-export/src/sequences/normalized.rs b/utils/search-export/src/sequences/normalized.rs index c53f334f6..cd57b85d2 100644 --- a/utils/search-export/src/sequences/normalized.rs +++ b/utils/search-export/src/sequences/normalized.rs @@ -28,6 +28,7 @@ use crate::sequences::{ editing_events::EditingEvent, feedback::FeedbackVec, go_annotation::GoAnnotation, + go_flow_annotations::GoFlowLLMAnnotation, interacting_protein::InteractingProtein, interacting_rna::InteractingRna, litsumm::LitsummSummaries, @@ -69,6 +70,7 @@ pub struct Normalized { publication_count: usize, litsumm: Vec, editing_events: Vec, + go_flow_llm_annotations: Vec, so_rna_type_tree: so_tree::SoTree, #[serde(flatten)] @@ -129,6 +131,7 @@ impl Normalized { rfam_hits: raw.rfam_hits().to_owned().into_iter().collect(), orfs: raw.orfs().to_vec().into_iter().collect(), litsumm: raw.litsumm_summaries().to_vec(), + go_flow_llm_annotations: raw.go_flow_llm_annotations().to_vec(), editing_events: raw.editing_events().to_vec(), }) } diff --git a/utils/search-export/src/sequences/raw.rs b/utils/search-export/src/sequences/raw.rs index 4daf502d0..52bea2f9f 100644 --- a/utils/search-export/src/sequences/raw.rs +++ b/utils/search-export/src/sequences/raw.rs @@ -16,6 +16,7 @@ use crate::sequences::{ editing_events::EditingEvent, feedback::Feedback, go_annotation::GoAnnotation, + 
go_flow_annotations::GoFlowLLMAnnotation, interacting_protein::InteractingProtein, interacting_rna::InteractingRna, litsumm::LitsummSummaries, @@ -46,6 +47,7 @@ pub struct Raw { publication_counts: Option, litsumm_summaries: Vec, editing_events: Vec, + go_flow_llm_annotations: Vec, so_tree: so_tree::SoTree, } @@ -148,6 +150,11 @@ impl Raw { &self.editing_events } + /// Get a reference to the raw's GoFlowLLM annotations + pub fn go_flow_llm_annotations(&self) -> &[GoFlowLLMAnnotation] { + &self.go_flow_llm_annotations + } + /// Get this raw's publication count. pub fn publication_count(&self) -> usize { self.publication_counts.as_ref().map(|p| p.publication_count()).unwrap_or(0) diff --git a/workflows/export/text-search/sequences.nf b/workflows/export/text-search/sequences.nf index b1611b09b..3c4e65f5a 100755 --- a/workflows/export/text-search/sequences.nf +++ b/workflows/export/text-search/sequences.nf @@ -65,13 +65,14 @@ process build_metadata { path(text) path(litsumm) path(editing_events) + path(go_flow_annotations) path(so_tree) output: path("merged.json") """ - search-export sequences merge $base $crs $feeback $go $prot $rnas $precompute $qa $r2dt $rfam $orf $text $so_tree $litsumm $editing_events merged.json + search-export sequences merge $base $crs $feeback $go $prot $rnas $precompute $qa $r2dt $rfam $orf $text $so_tree $litsumm $editing_events $go_flow_annotations merged.json """ } @@ -141,6 +142,20 @@ process litsumm_summaries { """ } +process go_flow_annotations { + input: + val(max_count) + path (query) + + output: + path("go-flow-llm-annotations.json") + + """ + psql -v ON_ERROR_STOP=1 -f "$query" "$PGDATABASE" > raw.json + search-export group go-flow-annotation raw.json ${max_count} go-flow-llm-annotations.json + """ +} + process editing_events { input: val(max_count) @@ -201,6 +216,7 @@ workflow sequences { Channel.fromPath('files/search-export/parts/text-mining.sql') | set { text_sql } Channel.fromPath('files/search-export/parts/litsumm.sql') | set 
{ litsumm_sql } Channel.fromPath('files/search-export/parts/editing-events.sql') | set { editing_events_sql } + Channel.fromPath('files/search-export/parts/goflow.sql') | set { goflow_sql } Channel.fromPath('files/search-export/so-rna-types.sql') | set { so_sql } Channel.fromPath('files/search-export/parts/accessions.sql') | set { accessions_sql } @@ -230,6 +246,7 @@ workflow sequences { text_mining_query(search_count, text_sql), litsumm_summaries(search_count, litsumm_sql), editing_events(search_count, editing_events_sql), + go_flow_annotations(search_count, goflow_sql), fetch_so_tree(so_sql), )\ | set { metadata }