From b3f2e3a0aba6917e1e213af88047a93ff2431ceb Mon Sep 17 00:00:00 2001 From: Andrew Green Date: Thu, 15 Sep 2022 13:52:46 +0100 Subject: [PATCH] Changes in pipeline to account for multiple assemblies I have checked the workflows that use these files and I'm pretty sure the changes are ok The full list of potentially affected files: files/repeats/find-assemblies.sql files/genome-mapping/post.sql files/genome-mapping/find_species.sql files/ftp-export/genome_coordinates/known-coordinates.sql files/genome-mapping/load.ctl files/genes/species.sql files/genes/schema.sql files/import-data/post-release/001__regions.sql files/import-data/post-release/001__coordinate-systems.sql files/import-data/post-release/001__ensembl-pseudogenes.sql files/import-data/post-release/001__locations.sql files/import-data/post-release/002__Cleanup_assembly_table.sql files/import-data/ensembl/known-assemblies.sql files/import-data/pre-release/000__assemblies.sql workflows/databases/mirgenedb.nf rnacentral_pipeline/databases/ensembl/metadata/assemblies.py If one is not touched in this commit, then I think it didn't need modification --- files/ftp-export/genome_coordinates/known-coordinates.sql | 1 + files/genes/species.sql | 1 + files/genome-mapping/find_species.sql | 1 + files/import-data/post-release/001__coordinate-systems.sql | 1 + files/import-data/post-release/001__ensembl-pseudogenes.sql | 1 + files/import-data/post-release/001__locations.sql | 2 ++ files/repeats/find-assemblies.sql | 2 +- workflows/databases/mirgenedb.nf | 2 +- 8 files changed, 9 insertions(+), 2 deletions(-) diff --git a/files/ftp-export/genome_coordinates/known-coordinates.sql b/files/ftp-export/genome_coordinates/known-coordinates.sql index bdbef5bc3..6e06fc488 100644 --- a/files/ftp-export/genome_coordinates/known-coordinates.sql +++ b/files/ftp-export/genome_coordinates/known-coordinates.sql @@ -4,4 +4,5 @@ select distinct assembly.ensembl_url, assembly.taxid from ensembl_assembly assembly +where assembly.selected_genome = true ) TO STDOUT CSV diff --git a/files/genes/species.sql b/files/genes/species.sql index c6383d65d..3b3337912 100644 --- a/files/genes/species.sql +++ b/files/genes/species.sql @@ -2,4 +2,5 @@ COPY ( select distinct assembly_id, taxid from ensembl_assembly + where selected_genome = true ) TO STDOUT CSV diff --git a/files/genome-mapping/find_species.sql b/files/genome-mapping/find_species.sql index c87f6e67d..7c8476ba8 100644 --- a/files/genome-mapping/find_species.sql +++ b/files/genome-mapping/find_species.sql @@ -7,4 +7,5 @@ COPY ( FROM ensembl_assembly WHERE division NOT IN ('EnsemblProtists', 'EnsemblFungi') + AND selected_genome = true ) TO STDOUT CSV; diff --git a/files/import-data/post-release/001__coordinate-systems.sql b/files/import-data/post-release/001__coordinate-systems.sql index fd5d7dd6e..95546630a 100644 --- a/files/import-data/post-release/001__coordinate-systems.sql +++ b/files/import-data/post-release/001__coordinate-systems.sql @@ -17,6 +17,7 @@ SELECT load.karyotype_rank FROM load_coordinate_info load JOIN ensembl_assembly ensembl ON ensembl.assembly_id = load.assembly_id +WHERE ensembl.selected_genome = true ) ON CONFLICT (chromosome, assembly_id) DO UPDATE SET diff --git a/files/import-data/post-release/001__ensembl-pseudogenes.sql b/files/import-data/post-release/001__ensembl-pseudogenes.sql index 6aa064e1f..786b7a7db 100644 --- a/files/import-data/post-release/001__ensembl-pseudogenes.sql +++ b/files/import-data/post-release/001__ensembl-pseudogenes.sql @@ -25,6 +25,7 @@ from load_ensembl_pseudogenes load join ensembl_assembly assem on assem.assembly_id = load.assembly_id +where assem.selected_genome = true ) ON CONFLICT (md5(region_name)) DO NOTHING; INSERT INTO ensembl_pseudogene_exons ( diff --git a/files/import-data/post-release/001__locations.sql b/files/import-data/post-release/001__locations.sql index a1d1fe72f..5252fccab 100644 --- a/files/import-data/post-release/001__locations.sql +++ b/files/import-data/post-release/001__locations.sql @@ -27,6 +27,8 @@ on assembly.assembly_id = load.assembly_id WHERE load.chromosome is not null +AND + assembly.selected_genome = true ON CONFLICT (accession, name, local_start, local_end, assembly_id) DO NOTHING ; diff --git a/files/repeats/find-assemblies.sql b/files/repeats/find-assemblies.sql index b98f76e20..3c1f91d85 100644 --- a/files/repeats/find-assemblies.sql +++ b/files/repeats/find-assemblies.sql @@ -7,5 +7,5 @@ FROM ensembl_assembly species WHERE exists(select 1 from rnc_sequence_regions reg where reg.assembly_id = species.assembly_id) and species.division != 'EnsemblFungi' + and species.selected_genome = true ) TO STDOUT CSV; - diff --git a/workflows/databases/mirgenedb.nf b/workflows/databases/mirgenedb.nf index 7277a017e..bd88aea98 100644 --- a/workflows/databases/mirgenedb.nf +++ b/workflows/databases/mirgenedb.nf @@ -7,7 +7,7 @@ process mirgenedb { """ scp $params.databases.mirgenedb.remote mirgenedb.json psql \ - --command='COPY (select assembly_id,assembly_ucsc from ensembl_assembly where assembly_ucsc is not null) TO STDOUT (FORMAT CSV)' \ + --command='COPY (select assembly_id,assembly_ucsc from ensembl_assembly where assembly_ucsc is not null and selected_genome = true) TO STDOUT (FORMAT CSV)' \ "$PGDATABASE" > assemblies.tsv rnac mirgenedb parse assemblies.tsv mirgenedb.json . """