From 78729f600e2928ebf92cebebbdf9e0cb06bc142c Mon Sep 17 00:00:00 2001 From: Amanda Warkow Date: Thu, 22 Jan 2026 19:12:10 -0500 Subject: [PATCH 01/12] =?UTF-8?q?=F0=9F=8E=89=20add=20source=20models?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- dbt_project/models/access/demographics.sql | 15 ------- dbt_project/models/access/investigator.sql | 11 ----- dbt_project/models/access/study.sql | 41 ------------------- dbt_project/models/access/subject.sql | 10 ----- .../models/access/subject_assertation.sql | 40 ------------------ .../src/kf_sd_1nns3k8v_src_clinical.sql | 6 +++ .../sd_1nns3k8v/src/kf_sd_1nns3k8v_src_s3.sql | 6 +++ .../src/kf_sd_1nns3k8v_src_sequencing.sql | 6 +++ 8 files changed, 18 insertions(+), 117 deletions(-) delete mode 100644 dbt_project/models/access/demographics.sql delete mode 100644 dbt_project/models/access/investigator.sql delete mode 100644 dbt_project/models/access/study.sql delete mode 100644 dbt_project/models/access/subject.sql delete mode 100644 dbt_project/models/access/subject_assertation.sql create mode 100644 dbt_project/models/kids_first/sd_1nns3k8v/src/kf_sd_1nns3k8v_src_clinical.sql create mode 100644 dbt_project/models/kids_first/sd_1nns3k8v/src/kf_sd_1nns3k8v_src_s3.sql create mode 100644 dbt_project/models/kids_first/sd_1nns3k8v/src/kf_sd_1nns3k8v_src_sequencing.sql diff --git a/dbt_project/models/access/demographics.sql b/dbt_project/models/access/demographics.sql deleted file mode 100644 index 07c2711..0000000 --- a/dbt_project/models/access/demographics.sql +++ /dev/null @@ -1,15 +0,0 @@ -{{ config( - schema='access' -) }} - -select distinct - participant_id as subject_id, - gender as sex, - race, - ethnicity, - 'Not Applicable' as down_syndrome_status, - 'Not Reported' as age_at_last_vital_status, -- ideally needs to be pulled from outcome data - 'Not Reported' as vital_status, - 'Not Reported' as age_at_first_engagement, - external_id -from {{ ref('stable_participant') }} 
\ No newline at end of file diff --git a/dbt_project/models/access/investigator.sql b/dbt_project/models/access/investigator.sql deleted file mode 100644 index 2e76303..0000000 --- a/dbt_project/models/access/investigator.sql +++ /dev/null @@ -1,11 +0,0 @@ -{{ config( - schema='access' -) }} - -select distinct - name, - institution, - 'Principle Investigator' as investigator_title, - 'Not Reported' as email, - external_id -from {{ ref('stable_investigator') }} \ No newline at end of file diff --git a/dbt_project/models/access/study.sql b/dbt_project/models/access/study.sql deleted file mode 100644 index 8cf8518..0000000 --- a/dbt_project/models/access/study.sql +++ /dev/null @@ -1,41 +0,0 @@ -{{ config( - schema='access' -) }} - --- WIP --- should some of the constants be assigned upstream in the stable tables? --- it would make it easier in the access model when bringing in every study --- these fields are not explictly provided in kf - -select distinct - s.study_id, -- do we want to use the KF study id or the dewrangle one? - 'X01' as funding_source, -- is this the funding source for every kf study? - s.name as study_title, -- is study title the full study name? - s.short_code as study_code, - s.short_name as study_short_name, - 'KF' as program, -- see enum list; should this always be a constant? or should we bother mapping what we have in study.program? 
- 'Not Reported' as study_description, -- we have this just not stored in any db - s.domain as research_domain, -- current linkml enums don't quite match - 'Pediatric' as participant_lifespan_age, -- all kf studies should be set to pediatric - 'Not Reported' as selection_criteria, -- we may have this info in intake sheets, but not stored in any db for kf - 'Longitudinal' as study_design, -- not stored in a db, but might be in intake sheets ; otherwise not explicitly provided for kf - 'Unknown' as clinical_data_source_type, - 'Genomics' as data_category, -- this should be derived from available seq types in DS; need to review best apporach - 'Not Reported' as website, - count(distinct p.participant_id) as expected_number_of_participants, -- should this be a count of every pt in ds? - count(case when p.visible='true' then participant_id end) as actual_number_of_participants, -- should this be a count of only vis pt in ds? - 'Not Reported' as acknowledgements, - 'Not Reported' as citation_statement, - 'Not Reported' as doi, - i.name as contact, - s.parent_study_id as parent_study, - i.name as principle_investigator, - s.external_id -- using phs number for now -from {{ ref('stable_investigator') }} as i -inner join {{ ref('stable_study') }} as s on i.investigator_id=s.investigator_id-- think about how this would be adjusted to account for all stable_study tables in KF -inner join {{ ref('stable_participant') }} as p on s.study_id=p.study_id -group by - s.study_id, funding_source, study_title, study_code, study_short_name, - program, study_description, research_domain, participant_lifespan_age, selection_criteria, - study_design, clinical_data_source_type, data_category, website, acknowledgements, citation_statement, doi, s.external_id, - contact, parent_study, principle_investigator \ No newline at end of file diff --git a/dbt_project/models/access/subject.sql b/dbt_project/models/access/subject.sql deleted file mode 100644 index 1d1c71d..0000000 --- 
a/dbt_project/models/access/subject.sql +++ /dev/null @@ -1,10 +0,0 @@ -{{ config( - schema='access' -) }} - -select distinct - participant_id as subject_id, - 'KF participant' as subject_type, -- adding in KF as a source identifier - external_id --- organism_type -- not sure if this is needed. could be set to 'human' if needed -from {{ ref('stable_participant') }} \ No newline at end of file diff --git a/dbt_project/models/access/subject_assertation.sql b/dbt_project/models/access/subject_assertation.sql deleted file mode 100644 index cd98781..0000000 --- a/dbt_project/models/access/subject_assertation.sql +++ /dev/null @@ -1,40 +0,0 @@ -{{ config( - schema='access' -) }} - -( - select distinct - diagnosis_id as assertion_id, - participant_id as subject_id, - -- assertion_provenance, -- don't think we have this available in KF - age_at_event_days as age_at_assertion, -- could also use age_at_event - -- age_at_event, - -- age_at_resolution, - mondo_id_diagnosis as concept_source, -- using mondo as the primary ontology for diagnoses - -- value_number, - source_text_diagnosis as value_source -- not sure this is right place - -- value_units, - -- value_units_source - from {{ ref('stable_diagnosis') }} - -) - -union all - -( - select distinct - phenotype_id as assertion_id, - participant_id as subject_id, - -- assertion_provenance, -- don't think we have this available in KF - age_at_event_days as age_at_assertion, -- could also use age_at_event - -- age_at_event, - -- age_at_resolution, - hpo_id_phenotype as concept_source, -- using hpo as the primary ontology for phenotypes - -- value_number, - source_text_phenotype as value_source -- not sure this is right place - -- value_units, - -- value_units_source - from {{ ref('stable_phenotype') }} -) - - diff --git a/dbt_project/models/kids_first/sd_1nns3k8v/src/kf_sd_1nns3k8v_src_clinical.sql b/dbt_project/models/kids_first/sd_1nns3k8v/src/kf_sd_1nns3k8v_src_clinical.sql new file mode 100644 index 0000000..0efe7e1 --- 
/dev/null +++ b/dbt_project/models/kids_first/sd_1nns3k8v/src/kf_sd_1nns3k8v_src_clinical.sql @@ -0,0 +1,6 @@ +{{ config( + schema='src' +) }} + +select * +from {{ ref('kids_first_update') }} \ No newline at end of file diff --git a/dbt_project/models/kids_first/sd_1nns3k8v/src/kf_sd_1nns3k8v_src_s3.sql b/dbt_project/models/kids_first/sd_1nns3k8v/src/kf_sd_1nns3k8v_src_s3.sql new file mode 100644 index 0000000..e06c9a4 --- /dev/null +++ b/dbt_project/models/kids_first/sd_1nns3k8v/src/kf_sd_1nns3k8v_src_s3.sql @@ -0,0 +1,6 @@ +{{ config( + schema='src' +) }} + +select * +from {{ ref('s3_scrape_cody') }} \ No newline at end of file diff --git a/dbt_project/models/kids_first/sd_1nns3k8v/src/kf_sd_1nns3k8v_src_sequencing.sql b/dbt_project/models/kids_first/sd_1nns3k8v/src/kf_sd_1nns3k8v_src_sequencing.sql new file mode 100644 index 0000000..a0ddfdd --- /dev/null +++ b/dbt_project/models/kids_first/sd_1nns3k8v/src/kf_sd_1nns3k8v_src_sequencing.sql @@ -0,0 +1,6 @@ +{{ config( + schema='src' +) }} + +select * +from {{ ref('sample') }} \ No newline at end of file From 4e2b0fd2dd9b9663cdef7c37672016fed574e226 Mon Sep 17 00:00:00 2001 From: Amanda Warkow Date: Thu, 22 Jan 2026 19:13:08 -0500 Subject: [PATCH 02/12] =?UTF-8?q?=E2=99=BB=EF=B8=8F=20fix=20mismatched=20f?= =?UTF-8?q?ilenames?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../sd_1nns3k8v/int/kf_sd_1nns3k8v_int_clinical.sql | 6 ++++++ .../kids_first/sd_1nns3k8v/int/kf_sd_1nns3k8v_int_s3.sql | 6 ++++++ .../sd_1nns3k8v/int/kf_sd_1nns3k8v_int_sequencing.sql | 6 ++++++ .../sd_1nns3k8v_test/int/kf_sd_1nns3k8v_int_bsgf.sql | 2 +- .../sd_1nns3k8v_test/int/kf_sd_1nns3k8v_int_diagnosis.sql | 2 +- .../sd_1nns3k8v_test/int/kf_sd_1nns3k8v_int_family.sql | 2 +- .../int/kf_sd_1nns3k8v_int_genomic_files.sql | 2 +- .../int/kf_sd_1nns3k8v_int_investigator.sql | 2 +- .../sd_1nns3k8v_test/int/kf_sd_1nns3k8v_int_participant.sql | 2 +- 
.../sd_1nns3k8v_test/int/kf_sd_1nns3k8v_int_phenotype.sql | 2 +- .../sd_1nns3k8v_test/int/kf_sd_1nns3k8v_int_segf.sql | 2 +- .../int/kf_sd_1nns3k8v_int_sequencing_experiment.sql | 2 +- .../sd_1nns3k8v_test/int/kf_sd_1nns3k8v_int_specimens.sql | 2 +- .../sd_1nns3k8v_test/int/kf_sd_1nns3k8v_int_study.sql | 2 +- .../sd_1nns3k8v_test/stable/kf_sd_1nns3k8v_stable_bsgf.sql | 2 +- .../stable/kf_sd_1nns3k8v_stable_diagnosis.sql | 2 +- .../stable/kf_sd_1nns3k8v_stable_family.sql | 2 +- .../stable/kf_sd_1nns3k8v_stable_genomic_file.sql | 2 +- .../stable/kf_sd_1nns3k8v_stable_investigator.sql | 2 +- .../stable/kf_sd_1nns3k8v_stable_participant.sql | 2 +- .../stable/kf_sd_1nns3k8v_stable_phenotype.sql | 2 +- .../sd_1nns3k8v_test/stable/kf_sd_1nns3k8v_stable_segf.sql | 2 +- .../stable/kf_sd_1nns3k8v_stable_sequencing_experiment.sql | 2 +- .../stable/kf_sd_1nns3k8v_stable_specimens.sql | 2 +- .../sd_1nns3k8v_test/stable/kf_sd_1nns3k8v_stable_study.sql | 2 +- 25 files changed, 40 insertions(+), 22 deletions(-) create mode 100644 dbt_project/models/kids_first/sd_1nns3k8v/int/kf_sd_1nns3k8v_int_clinical.sql create mode 100644 dbt_project/models/kids_first/sd_1nns3k8v/int/kf_sd_1nns3k8v_int_s3.sql create mode 100644 dbt_project/models/kids_first/sd_1nns3k8v/int/kf_sd_1nns3k8v_int_sequencing.sql diff --git a/dbt_project/models/kids_first/sd_1nns3k8v/int/kf_sd_1nns3k8v_int_clinical.sql b/dbt_project/models/kids_first/sd_1nns3k8v/int/kf_sd_1nns3k8v_int_clinical.sql new file mode 100644 index 0000000..0efe7e1 --- /dev/null +++ b/dbt_project/models/kids_first/sd_1nns3k8v/int/kf_sd_1nns3k8v_int_clinical.sql @@ -0,0 +1,6 @@ +{{ config( + schema='src' +) }} + +select * +from {{ ref('kids_first_update') }} \ No newline at end of file diff --git a/dbt_project/models/kids_first/sd_1nns3k8v/int/kf_sd_1nns3k8v_int_s3.sql b/dbt_project/models/kids_first/sd_1nns3k8v/int/kf_sd_1nns3k8v_int_s3.sql new file mode 100644 index 0000000..e06c9a4 --- /dev/null +++ 
b/dbt_project/models/kids_first/sd_1nns3k8v/int/kf_sd_1nns3k8v_int_s3.sql @@ -0,0 +1,6 @@ +{{ config( + schema='src' +) }} + +select * +from {{ ref('s3_scrape_cody') }} \ No newline at end of file diff --git a/dbt_project/models/kids_first/sd_1nns3k8v/int/kf_sd_1nns3k8v_int_sequencing.sql b/dbt_project/models/kids_first/sd_1nns3k8v/int/kf_sd_1nns3k8v_int_sequencing.sql new file mode 100644 index 0000000..a0ddfdd --- /dev/null +++ b/dbt_project/models/kids_first/sd_1nns3k8v/int/kf_sd_1nns3k8v_int_sequencing.sql @@ -0,0 +1,6 @@ +{{ config( + schema='src' +) }} + +select * +from {{ ref('sample') }} \ No newline at end of file diff --git a/dbt_project/models/kids_first/sd_1nns3k8v_test/int/kf_sd_1nns3k8v_int_bsgf.sql b/dbt_project/models/kids_first/sd_1nns3k8v_test/int/kf_sd_1nns3k8v_int_bsgf.sql index 1880852..903b922 100644 --- a/dbt_project/models/kids_first/sd_1nns3k8v_test/int/kf_sd_1nns3k8v_int_bsgf.sql +++ b/dbt_project/models/kids_first/sd_1nns3k8v_test/int/kf_sd_1nns3k8v_int_bsgf.sql @@ -16,5 +16,5 @@ select distinct external_id, -- i think we can leave this out - it's rarely populated/used, visibility_reason, visibility_comment -from {{ ref('src_bsgf') }} +from {{ ref('kf_sd_1nns3k8v_src_bsgf') }} diff --git a/dbt_project/models/kids_first/sd_1nns3k8v_test/int/kf_sd_1nns3k8v_int_diagnosis.sql b/dbt_project/models/kids_first/sd_1nns3k8v_test/int/kf_sd_1nns3k8v_int_diagnosis.sql index c1f0f61..493eae3 100644 --- a/dbt_project/models/kids_first/sd_1nns3k8v_test/int/kf_sd_1nns3k8v_int_diagnosis.sql +++ b/dbt_project/models/kids_first/sd_1nns3k8v_test/int/kf_sd_1nns3k8v_int_diagnosis.sql @@ -20,4 +20,4 @@ select distinct uberon_id_tumor_location, spatial_descriptor -from {{ ref('src_diagnosis') }} \ No newline at end of file +from {{ ref('kf_sd_1nns3k8v_src_diagnosis') }} \ No newline at end of file diff --git a/dbt_project/models/kids_first/sd_1nns3k8v_test/int/kf_sd_1nns3k8v_int_family.sql 
b/dbt_project/models/kids_first/sd_1nns3k8v_test/int/kf_sd_1nns3k8v_int_family.sql index cbf2906..42c91ee 100644 --- a/dbt_project/models/kids_first/sd_1nns3k8v_test/int/kf_sd_1nns3k8v_int_family.sql +++ b/dbt_project/models/kids_first/sd_1nns3k8v_test/int/kf_sd_1nns3k8v_int_family.sql @@ -13,4 +13,4 @@ select distinct visible, visibility_reason, visibility_comment -from {{ ref('src_family') }} \ No newline at end of file +from {{ ref('kf_sd_1nns3k8v_src_family') }} \ No newline at end of file diff --git a/dbt_project/models/kids_first/sd_1nns3k8v_test/int/kf_sd_1nns3k8v_int_genomic_files.sql b/dbt_project/models/kids_first/sd_1nns3k8v_test/int/kf_sd_1nns3k8v_int_genomic_files.sql index f34cf82..e223ec9 100644 --- a/dbt_project/models/kids_first/sd_1nns3k8v_test/int/kf_sd_1nns3k8v_int_genomic_files.sql +++ b/dbt_project/models/kids_first/sd_1nns3k8v_test/int/kf_sd_1nns3k8v_int_genomic_files.sql @@ -32,4 +32,4 @@ select distinct cavatica_file_id, cavatica_volume -from {{ ref('src_genomic_files') }} \ No newline at end of file +from {{ ref('kf_sd_1nns3k8v_src_genomic_files') }} \ No newline at end of file diff --git a/dbt_project/models/kids_first/sd_1nns3k8v_test/int/kf_sd_1nns3k8v_int_investigator.sql b/dbt_project/models/kids_first/sd_1nns3k8v_test/int/kf_sd_1nns3k8v_int_investigator.sql index f0719b9..60fc967 100644 --- a/dbt_project/models/kids_first/sd_1nns3k8v_test/int/kf_sd_1nns3k8v_int_investigator.sql +++ b/dbt_project/models/kids_first/sd_1nns3k8v_test/int/kf_sd_1nns3k8v_int_investigator.sql @@ -14,4 +14,4 @@ select distinct visible, visibility_reason, visibility_comment -from {{ ref('src_investigator') }} \ No newline at end of file +from {{ ref('kf_sd_1nns3k8v_src_investigator') }} \ No newline at end of file diff --git a/dbt_project/models/kids_first/sd_1nns3k8v_test/int/kf_sd_1nns3k8v_int_participant.sql b/dbt_project/models/kids_first/sd_1nns3k8v_test/int/kf_sd_1nns3k8v_int_participant.sql index 7327230..51c2866 100644 --- 
a/dbt_project/models/kids_first/sd_1nns3k8v_test/int/kf_sd_1nns3k8v_int_participant.sql +++ b/dbt_project/models/kids_first/sd_1nns3k8v_test/int/kf_sd_1nns3k8v_int_participant.sql @@ -22,4 +22,4 @@ select distinct visible, visibility_reason, -- can we standardize this a bit more? maybe release status instead of reason? and try to standardize more? visibility_comment -from {{ ref('src_participant') }} \ No newline at end of file +from {{ ref('kf_sd_1nns3k8v_src_participant') }} \ No newline at end of file diff --git a/dbt_project/models/kids_first/sd_1nns3k8v_test/int/kf_sd_1nns3k8v_int_phenotype.sql b/dbt_project/models/kids_first/sd_1nns3k8v_test/int/kf_sd_1nns3k8v_int_phenotype.sql index b755230..f7613d4 100644 --- a/dbt_project/models/kids_first/sd_1nns3k8v_test/int/kf_sd_1nns3k8v_int_phenotype.sql +++ b/dbt_project/models/kids_first/sd_1nns3k8v_test/int/kf_sd_1nns3k8v_int_phenotype.sql @@ -16,4 +16,4 @@ select distinct snomed_id_phenotype, external_id -from {{ ref('src_phenotype') }} +from {{ ref('kf_sd_1nns3k8v_src_phenotype') }} diff --git a/dbt_project/models/kids_first/sd_1nns3k8v_test/int/kf_sd_1nns3k8v_int_segf.sql b/dbt_project/models/kids_first/sd_1nns3k8v_test/int/kf_sd_1nns3k8v_int_segf.sql index 05f30ff..446d4f5 100644 --- a/dbt_project/models/kids_first/sd_1nns3k8v_test/int/kf_sd_1nns3k8v_int_segf.sql +++ b/dbt_project/models/kids_first/sd_1nns3k8v_test/int/kf_sd_1nns3k8v_int_segf.sql @@ -13,4 +13,4 @@ select distinct kf_id as segf_id, visibility_reason, visibility_comment -from {{ ref ('src_segf') }} \ No newline at end of file +from {{ ref ('kf_sd_1nns3k8v_src_segf') }} \ No newline at end of file diff --git a/dbt_project/models/kids_first/sd_1nns3k8v_test/int/kf_sd_1nns3k8v_int_sequencing_experiment.sql b/dbt_project/models/kids_first/sd_1nns3k8v_test/int/kf_sd_1nns3k8v_int_sequencing_experiment.sql index 749e045..42e4060 100644 --- a/dbt_project/models/kids_first/sd_1nns3k8v_test/int/kf_sd_1nns3k8v_int_sequencing_experiment.sql +++ 
b/dbt_project/models/kids_first/sd_1nns3k8v_test/int/kf_sd_1nns3k8v_int_sequencing_experiment.sql @@ -55,4 +55,4 @@ select distinct umi_barcode_read, umi_barcode_size -from {{ ref('src_sequencing_experiments') }} \ No newline at end of file +from {{ ref('kf_sd_1nns3k8v_src_sequencing_experiments') }} \ No newline at end of file diff --git a/dbt_project/models/kids_first/sd_1nns3k8v_test/int/kf_sd_1nns3k8v_int_specimens.sql b/dbt_project/models/kids_first/sd_1nns3k8v_test/int/kf_sd_1nns3k8v_int_specimens.sql index 7ec9c69..f7dc671 100644 --- a/dbt_project/models/kids_first/sd_1nns3k8v_test/int/kf_sd_1nns3k8v_int_specimens.sql +++ b/dbt_project/models/kids_first/sd_1nns3k8v_test/int/kf_sd_1nns3k8v_int_specimens.sql @@ -43,4 +43,4 @@ select distinct amount, amount_units, cell_entity -from {{ ref('src_specimens') }} \ No newline at end of file +from {{ ref('kf_sd_1nns3k8v_src_specimens') }} \ No newline at end of file diff --git a/dbt_project/models/kids_first/sd_1nns3k8v_test/int/kf_sd_1nns3k8v_int_study.sql b/dbt_project/models/kids_first/sd_1nns3k8v_test/int/kf_sd_1nns3k8v_int_study.sql index 57f527f..7645f82 100644 --- a/dbt_project/models/kids_first/sd_1nns3k8v_test/int/kf_sd_1nns3k8v_int_study.sql +++ b/dbt_project/models/kids_first/sd_1nns3k8v_test/int/kf_sd_1nns3k8v_int_study.sql @@ -27,4 +27,4 @@ select distinct biobank_name, -- has been NA for kids first, only used for CBTN, should we keep? biobank_request_instructions, -- has been NA for kids first, only used for CBTN, should we keep? biobank_request_link -- has been NA for kids first, only used for CBTN, should we keep? 
-from {{ ref('src_study') }} \ No newline at end of file +from {{ ref('kf_sd_1nns3k8v_src_study') }} \ No newline at end of file diff --git a/dbt_project/models/kids_first/sd_1nns3k8v_test/stable/kf_sd_1nns3k8v_stable_bsgf.sql b/dbt_project/models/kids_first/sd_1nns3k8v_test/stable/kf_sd_1nns3k8v_stable_bsgf.sql index 77691a7..5efcb54 100644 --- a/dbt_project/models/kids_first/sd_1nns3k8v_test/stable/kf_sd_1nns3k8v_stable_bsgf.sql +++ b/dbt_project/models/kids_first/sd_1nns3k8v_test/stable/kf_sd_1nns3k8v_stable_bsgf.sql @@ -16,4 +16,4 @@ select distinct external_id, -- i think we can leave this out - it's rarely populated/used, visibility_reason, visibility_comment -from {{ ref('int_bsgf') }} \ No newline at end of file +from {{ ref('kf_sd_1nns3k8v_int_bsgf') }} \ No newline at end of file diff --git a/dbt_project/models/kids_first/sd_1nns3k8v_test/stable/kf_sd_1nns3k8v_stable_diagnosis.sql b/dbt_project/models/kids_first/sd_1nns3k8v_test/stable/kf_sd_1nns3k8v_stable_diagnosis.sql index 95a8076..cf672d0 100644 --- a/dbt_project/models/kids_first/sd_1nns3k8v_test/stable/kf_sd_1nns3k8v_stable_diagnosis.sql +++ b/dbt_project/models/kids_first/sd_1nns3k8v_test/stable/kf_sd_1nns3k8v_stable_diagnosis.sql @@ -19,4 +19,4 @@ select distinct uberon_id_tumor_location, spatial_descriptor -from {{ ref('int_diagnosis') }} \ No newline at end of file +from {{ ref('kf_sd_1nns3k8v_int_diagnosis') }} \ No newline at end of file diff --git a/dbt_project/models/kids_first/sd_1nns3k8v_test/stable/kf_sd_1nns3k8v_stable_family.sql b/dbt_project/models/kids_first/sd_1nns3k8v_test/stable/kf_sd_1nns3k8v_stable_family.sql index 9c68b90..f64c0ea 100644 --- a/dbt_project/models/kids_first/sd_1nns3k8v_test/stable/kf_sd_1nns3k8v_stable_family.sql +++ b/dbt_project/models/kids_first/sd_1nns3k8v_test/stable/kf_sd_1nns3k8v_stable_family.sql @@ -10,6 +10,6 @@ select distinct visible, visibility_reason, visibility_comment -from {{ ref('int_family') }} +from {{ ref('kf_sd_1nns3k8v_int_family') }} diff 
--git a/dbt_project/models/kids_first/sd_1nns3k8v_test/stable/kf_sd_1nns3k8v_stable_genomic_file.sql b/dbt_project/models/kids_first/sd_1nns3k8v_test/stable/kf_sd_1nns3k8v_stable_genomic_file.sql index 48ce216..e40e1b1 100644 --- a/dbt_project/models/kids_first/sd_1nns3k8v_test/stable/kf_sd_1nns3k8v_stable_genomic_file.sql +++ b/dbt_project/models/kids_first/sd_1nns3k8v_test/stable/kf_sd_1nns3k8v_stable_genomic_file.sql @@ -27,4 +27,4 @@ select distinct file_version_descriptor, cavatica_volume -from {{ ref('int_genomic_files') }} \ No newline at end of file +from {{ ref('kf_sd_1nns3k8v_int_genomic_files') }} \ No newline at end of file diff --git a/dbt_project/models/kids_first/sd_1nns3k8v_test/stable/kf_sd_1nns3k8v_stable_investigator.sql b/dbt_project/models/kids_first/sd_1nns3k8v_test/stable/kf_sd_1nns3k8v_stable_investigator.sql index 0a3151a..0a8a032 100644 --- a/dbt_project/models/kids_first/sd_1nns3k8v_test/stable/kf_sd_1nns3k8v_stable_investigator.sql +++ b/dbt_project/models/kids_first/sd_1nns3k8v_test/stable/kf_sd_1nns3k8v_stable_investigator.sql @@ -14,4 +14,4 @@ select distinct visible, visibility_reason, visibility_comment -from {{ ref('int_investigator') }} \ No newline at end of file +from {{ ref('kf_sd_1nns3k8v_int_investigator') }} \ No newline at end of file diff --git a/dbt_project/models/kids_first/sd_1nns3k8v_test/stable/kf_sd_1nns3k8v_stable_participant.sql b/dbt_project/models/kids_first/sd_1nns3k8v_test/stable/kf_sd_1nns3k8v_stable_participant.sql index a10b326..d283148 100644 --- a/dbt_project/models/kids_first/sd_1nns3k8v_test/stable/kf_sd_1nns3k8v_stable_participant.sql +++ b/dbt_project/models/kids_first/sd_1nns3k8v_test/stable/kf_sd_1nns3k8v_stable_participant.sql @@ -21,4 +21,4 @@ select distinct visible, visibility_reason, visibility_comment -from {{ ref('int_participant') }} \ No newline at end of file +from {{ ref('kf_sd_1nns3k8v_int_participant') }} \ No newline at end of file diff --git 
a/dbt_project/models/kids_first/sd_1nns3k8v_test/stable/kf_sd_1nns3k8v_stable_phenotype.sql b/dbt_project/models/kids_first/sd_1nns3k8v_test/stable/kf_sd_1nns3k8v_stable_phenotype.sql index 057de7b..b38262b 100644 --- a/dbt_project/models/kids_first/sd_1nns3k8v_test/stable/kf_sd_1nns3k8v_stable_phenotype.sql +++ b/dbt_project/models/kids_first/sd_1nns3k8v_test/stable/kf_sd_1nns3k8v_stable_phenotype.sql @@ -13,4 +13,4 @@ select distinct observed, snomed_id_phenotype, external_id -from {{ ref('int_phenotype') }} +from {{ ref('kf_sd_1nns3k8v_int_phenotype') }} diff --git a/dbt_project/models/kids_first/sd_1nns3k8v_test/stable/kf_sd_1nns3k8v_stable_segf.sql b/dbt_project/models/kids_first/sd_1nns3k8v_test/stable/kf_sd_1nns3k8v_stable_segf.sql index 7bf0a44..2677a68 100644 --- a/dbt_project/models/kids_first/sd_1nns3k8v_test/stable/kf_sd_1nns3k8v_stable_segf.sql +++ b/dbt_project/models/kids_first/sd_1nns3k8v_test/stable/kf_sd_1nns3k8v_stable_segf.sql @@ -13,4 +13,4 @@ select distinct segf_id, visibility_reason, visibility_comment -from {{ ref ('int_segf') }} \ No newline at end of file +from {{ ref ('kf_sd_1nns3k8v_int_segf') }} \ No newline at end of file diff --git a/dbt_project/models/kids_first/sd_1nns3k8v_test/stable/kf_sd_1nns3k8v_stable_sequencing_experiment.sql b/dbt_project/models/kids_first/sd_1nns3k8v_test/stable/kf_sd_1nns3k8v_stable_sequencing_experiment.sql index ba14384..74fdaab 100644 --- a/dbt_project/models/kids_first/sd_1nns3k8v_test/stable/kf_sd_1nns3k8v_stable_sequencing_experiment.sql +++ b/dbt_project/models/kids_first/sd_1nns3k8v_test/stable/kf_sd_1nns3k8v_stable_sequencing_experiment.sql @@ -53,4 +53,4 @@ select distinct umi_barcode_offset, umi_barcode_read, umi_barcode_size -from {{ ref('int_sequencing_experiment') }} +from {{ ref('kf_sd_1nns3k8v_int_sequencing_experiment') }} diff --git a/dbt_project/models/kids_first/sd_1nns3k8v_test/stable/kf_sd_1nns3k8v_stable_specimens.sql 
b/dbt_project/models/kids_first/sd_1nns3k8v_test/stable/kf_sd_1nns3k8v_stable_specimens.sql index 9a28948..d651227 100644 --- a/dbt_project/models/kids_first/sd_1nns3k8v_test/stable/kf_sd_1nns3k8v_stable_specimens.sql +++ b/dbt_project/models/kids_first/sd_1nns3k8v_test/stable/kf_sd_1nns3k8v_stable_specimens.sql @@ -40,4 +40,4 @@ select distinct amount, amount_units, cell_entity -from {{ ref('int_specimens') }} \ No newline at end of file +from {{ ref('kf_sd_1nns3k8v_int_specimens') }} \ No newline at end of file diff --git a/dbt_project/models/kids_first/sd_1nns3k8v_test/stable/kf_sd_1nns3k8v_stable_study.sql b/dbt_project/models/kids_first/sd_1nns3k8v_test/stable/kf_sd_1nns3k8v_stable_study.sql index 615fbc1..77d94a9 100644 --- a/dbt_project/models/kids_first/sd_1nns3k8v_test/stable/kf_sd_1nns3k8v_stable_study.sql +++ b/dbt_project/models/kids_first/sd_1nns3k8v_test/stable/kf_sd_1nns3k8v_stable_study.sql @@ -27,4 +27,4 @@ select distinct biobank_name, -- has been NA for kids first, only used for CBTN, should we keep? biobank_request_instructions, -- has been NA for kids first, only used for CBTN, should we keep? biobank_request_link -- has been NA for kids first, only used for CBTN, should we keep? 
-from {{ ref('int_study') }} \ No newline at end of file +from {{ ref('kf_sd_1nns3k8v_int_study') }} \ No newline at end of file From 22e1de3ebd3030dcce973eabd553e6c0312df828 Mon Sep 17 00:00:00 2001 From: Amanda Warkow Date: Fri, 23 Jan 2026 12:57:36 -0500 Subject: [PATCH 03/12] =?UTF-8?q?=E2=9C=A8=20add=20int=20models?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../int/kf_sd_1nns3k8v_int_clinical.sql | 21 ++++++++++-- .../sd_1nns3k8v/int/kf_sd_1nns3k8v_int_s3.sql | 10 ++++-- .../int/kf_sd_1nns3k8v_int_sequencing.sql | 32 +++++++++++++++++-- 3 files changed, 55 insertions(+), 8 deletions(-) diff --git a/dbt_project/models/kids_first/sd_1nns3k8v/int/kf_sd_1nns3k8v_int_clinical.sql b/dbt_project/models/kids_first/sd_1nns3k8v/int/kf_sd_1nns3k8v_int_clinical.sql index 0efe7e1..1cacf62 100644 --- a/dbt_project/models/kids_first/sd_1nns3k8v/int/kf_sd_1nns3k8v_int_clinical.sql +++ b/dbt_project/models/kids_first/sd_1nns3k8v/int/kf_sd_1nns3k8v_int_clinical.sql @@ -1,6 +1,21 @@ {{ config( - schema='src' + schema='int' ) }} -select * -from {{ ref('kids_first_update') }} \ No newline at end of file +select distinct + "Family ID" as family_id, + "Participant ID" as participant_id, + "Family Relationship Target Participant ID" as family_relationship_target_participant_id, + "alliquot_id" as aliquot_id, + "Consent Group" as consent_group, + "Age at Sample" as age_at_sample, + "Sample Composition" as sample_composition, + "Sample Tissue Type" as sample_tissue_type, + "Race" as race, + "Sex" as sex, + "Ethnicity" as ethnicity, + "Analyte Type" as analyte_type, + "Age at Phenotype" as age_at_phenotype, + "Affected Status" as affected_status, + "Vital Status" as vital_status +from {{ ref('kf_sd_1nns3k8v_src_clinical') }} \ No newline at end of file diff --git a/dbt_project/models/kids_first/sd_1nns3k8v/int/kf_sd_1nns3k8v_int_s3.sql b/dbt_project/models/kids_first/sd_1nns3k8v/int/kf_sd_1nns3k8v_int_s3.sql index e06c9a4..4873529 
100644 --- a/dbt_project/models/kids_first/sd_1nns3k8v/int/kf_sd_1nns3k8v_int_s3.sql +++ b/dbt_project/models/kids_first/sd_1nns3k8v/int/kf_sd_1nns3k8v_int_s3.sql @@ -1,6 +1,12 @@ {{ config( - schema='src' + schema='int' ) }} -select * +select distinct + "Bucket" as bucket, + "Key" as key, + "LastModified" as last_modified, + "ETag" as etag, + "Size" as size, + "StorageClass" as storage_class from {{ ref('s3_scrape_cody') }} \ No newline at end of file diff --git a/dbt_project/models/kids_first/sd_1nns3k8v/int/kf_sd_1nns3k8v_int_sequencing.sql b/dbt_project/models/kids_first/sd_1nns3k8v/int/kf_sd_1nns3k8v_int_sequencing.sql index a0ddfdd..ecccf19 100644 --- a/dbt_project/models/kids_first/sd_1nns3k8v/int/kf_sd_1nns3k8v_int_sequencing.sql +++ b/dbt_project/models/kids_first/sd_1nns3k8v/int/kf_sd_1nns3k8v_int_sequencing.sql @@ -1,6 +1,32 @@ {{ config( - schema='src' + schema='int' ) }} -select * -from {{ ref('sample') }} \ No newline at end of file +select distinct + "entity:sample_id" as sample_id, + "project" as project, + "collaborator_sample_id" as collaborator_sample_id, + -- "version" as version, + "data_type" as data_type, + -- "pdo" as pdo, + -- "release_date" as release_date, + -- "reference_sequence_name" as reference_sequence_name, + "collaborator_participant_id" as collaborator_participant_id, + "cram_path" as cram_path, + "crai_path" as crai_path, + "md5_path" as md5_path, + "root_sample_id" as root_sample_id, + "mean_coverage" as mean_coverage, + "adapter_rate" as adapter_rate, + -- "pf_hq_aligned_q20_bases" as pf_hq_aligned_q20_bases, + -- "pf_hq_aligned_reads" as pf_hq_aligned_reads, + -- "pf_mismatch_rate" as pf_mismatch_rate, + -- "pf_noise_reads" as pf_noise_reads, + -- "pf_reads" as pf_reads, + -- "pf_reads_aligned" as pf_reads_aligned, + "total_reads" as total_reads, + "mean_read_length" as mean_read_length, + -- "pf_reads_rate" as pf_reads_rate, + -- "pf_reads_aligned_rate" as pf_reads_aligned_rate, + -- "pf_aligned_bases" as pf_aligned_bases, + 
-- "reads_aligned_in_pairs" as reads_aligned_in_pairs, \ No newline at end of file From e53c69a920e2024147e3772a67c879169382ffc9 Mon Sep 17 00:00:00 2001 From: Amanda Warkow Date: Fri, 23 Jan 2026 13:33:33 -0500 Subject: [PATCH 04/12] =?UTF-8?q?=F0=9F=93=9D=20add=20docs?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../sd_1nns3k8v/int/kf_sd_1nns3k8v_int.yml | 91 +++++++++++++++++++ .../int/kf_sd_1nns3k8v_int_sequencing.sql | 2 +- 2 files changed, 92 insertions(+), 1 deletion(-) create mode 100644 dbt_project/models/kids_first/sd_1nns3k8v/int/kf_sd_1nns3k8v_int.yml diff --git a/dbt_project/models/kids_first/sd_1nns3k8v/int/kf_sd_1nns3k8v_int.yml b/dbt_project/models/kids_first/sd_1nns3k8v/int/kf_sd_1nns3k8v_int.yml new file mode 100644 index 0000000..ab4bd47 --- /dev/null +++ b/dbt_project/models/kids_first/sd_1nns3k8v/int/kf_sd_1nns3k8v_int.yml @@ -0,0 +1,91 @@ +version: 2 + +models: + +- name: kf_sd_1nns3k8v_int_clinical + description: '{{ doc("int_clinical") }}' + config: + meta: + study: kf_sd_1nns3k8v + columns: + - name: family_id + description: '{{ doc("family_id") }}' + - name: participant_id + description: '{{ doc("participant_id") }}' + - name: family_relationship_target_participant_id + description: '{{ doc("family_relationship_target_participant_id") }}' + - name: aliquot_id + description: '{{ doc("aliquot_id") }}' + - name: consent_group + description: '{{ doc("consent_group") }}' + - name: age_at_sample + description: '{{ doc("age_at_sample") }}' + - name: sample_composition + description: '{{ doc("sample_composition") }}' + - name: sample_tissue_type + description: '{{ doc("sample_tissue_type") }}' + - name: race + description: '{{ doc("race") }}' + - name: sex + description: '{{ doc("sex") }}' + - name: ethnicity + description: '{{ doc("ethnicity") }}' + - name: analyte_type + description: '{{ doc("analyte_type") }}' + - name: age_at_phenotype + description: '{{ doc("age_at_phenotype") }}' + - name: 
affected_status + description: '{{ doc("affected_status") }}' + - name: vital_status + description: '{{ doc("vital_status") }}' + +- name: kf_sd_1nns3k8v_int_sequencing + description: '{{ doc("int_sequencing") }}' + config: + meta: + study: kf_sd_1nns3k8v + columns: + - name: sample_id + description: '{{ doc("sample_id") }}' + - name: project + description: '{{ doc("project") }}' + - name: collaborator_sample_id + description: '{{ doc("collaborator_sample_id") }}' + - name: data_type + description: '{{ doc("data_type") }}' + - name: collaborator_participant_id + description: '{{ doc("collaborator_participant_id") }}' + - name: cram_path + description: '{{ doc("cram_path") }}' + - name: crai_path + description: '{{ doc("crai_path") }}' + - name: md5_path + description: '{{ doc("md5_path") }}' + - name: root_sample_id + description: '{{ doc("root_sample_id") }}' + - name: mean_coverage + description: '{{ doc("mean_coverage") }}' + - name: total_reads + description: '{{ doc("total_reads") }}' + - name: mean_read_length + description: '{{ doc("mean_read_length") }}' + + +- name: kf_sd_1nns3k8v_int_s3 + description: '{{ doc("int_s3") }}' + config: + meta: + study: kf_sd_1nns3k8v + columns: + - name: bucket + description: '{{ doc("bucket") }}' + - name: key + description: '{{ doc("key") }}' + - name: last_modified + description: '{{ doc("last_modified") }}' + - name: etag + description: '{{ doc("etag") }}' + - name: size + description: '{{ doc("size") }}' + - name: storage_class + description: '{{ doc("storage_class") }}' \ No newline at end of file diff --git a/dbt_project/models/kids_first/sd_1nns3k8v/int/kf_sd_1nns3k8v_int_sequencing.sql b/dbt_project/models/kids_first/sd_1nns3k8v/int/kf_sd_1nns3k8v_int_sequencing.sql index ecccf19..33a4b26 100644 --- a/dbt_project/models/kids_first/sd_1nns3k8v/int/kf_sd_1nns3k8v_int_sequencing.sql +++ b/dbt_project/models/kids_first/sd_1nns3k8v/int/kf_sd_1nns3k8v_int_sequencing.sql @@ -17,7 +17,7 @@ select distinct "md5_path" as 
md5_path, "root_sample_id" as root_sample_id, "mean_coverage" as mean_coverage, - "adapter_rate" as adapter_rate, + -- "adapter_rate" as adapter_rate, -- "pf_hq_aligned_q20_bases" as pf_hq_aligned_q20_bases, -- "pf_hq_aligned_reads" as pf_hq_aligned_reads, -- "pf_mismatch_rate" as pf_mismatch_rate, From a3098653a3aff2b58a45300e7765487fc344c038 Mon Sep 17 00:00:00 2001 From: Amanda Warkow Date: Fri, 23 Jan 2026 13:33:52 -0500 Subject: [PATCH 05/12] =?UTF-8?q?=E2=9C=A8=20add=20DAG?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- dags/kids_first/kf_sd_1nns3k8v.py | 36 +++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) create mode 100644 dags/kids_first/kf_sd_1nns3k8v.py diff --git a/dags/kids_first/kf_sd_1nns3k8v.py b/dags/kids_first/kf_sd_1nns3k8v.py new file mode 100644 index 0000000..4d86a13 --- /dev/null +++ b/dags/kids_first/kf_sd_1nns3k8v.py @@ -0,0 +1,36 @@ +from airflow.sdk import Variable + +from cosmos import ( + DbtDag, + ProjectConfig, + ProfileConfig, + ExecutionConfig, + RenderConfig, +) +from cosmos.profiles import PostgresUserPasswordProfileMapping + +profile_config = ProfileConfig( + # make sure target_name and profile_mapping align + profile_name=Variable.get("DBT_PROFILE_NAME"), + target_name="prd", + profile_mapping=PostgresUserPasswordProfileMapping( + conn_id="postgres_prd_svc", + profile_args={"schema": "prd"}, + ), +) + +example_study_dag = DbtDag( + project_config=ProjectConfig( + Variable.get("DBT_PROJECT_DIR"), + install_dbt_deps=True, + ), + profile_config=profile_config, + execution_config=ExecutionConfig( + dbt_executable_path=Variable.get("DBT_EXECUTABLE_PATH"), + ), + render_config=RenderConfig(select=["config.meta.study:kf_sd_1nns3k8v"]), + # normal dag parameters + schedule="@daily", + dag_id="kf_sd_1nns3k8v_dbt_dag", + tags=["POC", "Kids First"], +) \ No newline at end of file From 4f70d82b1bfcc75614469facabb56b336ffce780 Mon Sep 17 00:00:00 2001 From: Amanda Warkow Date: 
Fri, 23 Jan 2026 13:43:26 -0500 Subject: [PATCH 06/12] =?UTF-8?q?=F0=9F=93=9D=20add=20missing=20doc?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../sd_1nns3k8v/src/kf_sd_1nns3k8v_src.yml | 91 +++++++++++++++++++ 1 file changed, 91 insertions(+) create mode 100644 dbt_project/models/kids_first/sd_1nns3k8v/src/kf_sd_1nns3k8v_src.yml diff --git a/dbt_project/models/kids_first/sd_1nns3k8v/src/kf_sd_1nns3k8v_src.yml b/dbt_project/models/kids_first/sd_1nns3k8v/src/kf_sd_1nns3k8v_src.yml new file mode 100644 index 0000000..7cb7c73 --- /dev/null +++ b/dbt_project/models/kids_first/sd_1nns3k8v/src/kf_sd_1nns3k8v_src.yml @@ -0,0 +1,91 @@ +version: 2 + +models: + +- name: kf_sd_1nns3k8v_src_clinical + description: '{{ doc("src_clinical") }}' + config: + meta: + study: kf_sd_1nns3k8v + columns: + - name: family_id + description: '{{ doc("family_id") }}' + - name: participant_id + description: '{{ doc("participant_id") }}' + - name: family_relationship_target_participant_id + description: '{{ doc("family_relationship_target_participant_id") }}' + - name: aliquot_id + description: '{{ doc("aliquot_id") }}' + - name: consent_group + description: '{{ doc("consent_group") }}' + - name: age_at_sample + description: '{{ doc("age_at_sample") }}' + - name: sample_composition + description: '{{ doc("sample_composition") }}' + - name: sample_tissue_type + description: '{{ doc("sample_tissue_type") }}' + - name: race + description: '{{ doc("race") }}' + - name: sex + description: '{{ doc("sex") }}' + - name: ethnicity + description: '{{ doc("ethnicity") }}' + - name: analyte_type + description: '{{ doc("analyte_type") }}' + - name: age_at_phenotype + description: '{{ doc("age_at_phenotype") }}' + - name: affected_status + description: '{{ doc("affected_status") }}' + - name: vital_status + description: '{{ doc("vital_status") }}' + +- name: kf_sd_1nns3k8v_src_sequencing + description: '{{ doc("src_sequencing") }}' + config: + meta: + 
study: kf_sd_1nns3k8v + columns: + - name: sample_id + description: '{{ doc("sample_id") }}' + - name: project + description: '{{ doc("project") }}' + - name: collaborator_sample_id + description: '{{ doc("collaborator_sample_id") }}' + - name: data_type + description: '{{ doc("data_type") }}' + - name: collaborator_participant_id + description: '{{ doc("collaborator_participant_id") }}' + - name: cram_path + description: '{{ doc("cram_path") }}' + - name: crai_path + description: '{{ doc("crai_path") }}' + - name: md5_path + description: '{{ doc("md5_path") }}' + - name: root_sample_id + description: '{{ doc("root_sample_id") }}' + - name: mean_coverage + description: '{{ doc("mean_coverage") }}' + - name: total_reads + description: '{{ doc("total_reads") }}' + - name: mean_read_length + description: '{{ doc("mean_read_length") }}' + + +- name: kf_sd_1nns3k8v_src_s3 + description: '{{ doc("src_s3") }}' + config: + meta: + study: kf_sd_1nns3k8v + columns: + - name: bucket + description: '{{ doc("bucket") }}' + - name: key + description: '{{ doc("key") }}' + - name: last_modified + description: '{{ doc("last_modified") }}' + - name: etag + description: '{{ doc("etag") }}' + - name: size + description: '{{ doc("size") }}' + - name: storage_class + description: '{{ doc("storage_class") }}' \ No newline at end of file From e7be05e802e07579feeb2aba6eb8b6c4d29ca95d Mon Sep 17 00:00:00 2001 From: Amanda Warkow Date: Fri, 23 Jan 2026 13:46:09 -0500 Subject: [PATCH 07/12] =?UTF-8?q?=E2=99=BB=EF=B8=8F=20add=20back=20mistake?= =?UTF-8?q?nly=20removed=20models?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- dbt_project/models/access/demographics.sql | 15 +++++++ dbt_project/models/access/investigator.sql | 11 +++++ dbt_project/models/access/study.sql | 41 +++++++++++++++++++ dbt_project/models/access/subject.sql | 10 +++++ .../models/access/subject_assertation.sql | 40 ++++++++++++++++++ 5 files changed, 117 insertions(+) create 
mode 100644 dbt_project/models/access/demographics.sql create mode 100644 dbt_project/models/access/investigator.sql create mode 100644 dbt_project/models/access/study.sql create mode 100644 dbt_project/models/access/subject.sql create mode 100644 dbt_project/models/access/subject_assertation.sql diff --git a/dbt_project/models/access/demographics.sql b/dbt_project/models/access/demographics.sql new file mode 100644 index 0000000..07c2711 --- /dev/null +++ b/dbt_project/models/access/demographics.sql @@ -0,0 +1,15 @@ +{{ config( + schema='access' +) }} + +select distinct + participant_id as subject_id, + gender as sex, + race, + ethnicity, + 'Not Applicable' as down_syndrome_status, + 'Not Reported' as age_at_last_vital_status, -- ideally needs to be pulled from outcome data + 'Not Reported' as vital_status, + 'Not Reported' as age_at_first_engagement, + external_id +from {{ ref('stable_participant') }} \ No newline at end of file diff --git a/dbt_project/models/access/investigator.sql b/dbt_project/models/access/investigator.sql new file mode 100644 index 0000000..2e76303 --- /dev/null +++ b/dbt_project/models/access/investigator.sql @@ -0,0 +1,11 @@ +{{ config( + schema='access' +) }} + +select distinct + name, + institution, + 'Principle Investigator' as investigator_title, + 'Not Reported' as email, + external_id +from {{ ref('stable_investigator') }} \ No newline at end of file diff --git a/dbt_project/models/access/study.sql b/dbt_project/models/access/study.sql new file mode 100644 index 0000000..8cf8518 --- /dev/null +++ b/dbt_project/models/access/study.sql @@ -0,0 +1,41 @@ +{{ config( + schema='access' +) }} + +-- WIP +-- should some of the constants be assigned upstream in the stable tables? +-- it would make it easier in the access model when bringing in every study +-- these fields are not explictly provided in kf + +select distinct + s.study_id, -- do we want to use the KF study id or the dewrangle one? 
+ 'X01' as funding_source, -- is this the funding source for every kf study? + s.name as study_title, -- is study title the full study name? + s.short_code as study_code, + s.short_name as study_short_name, + 'KF' as program, -- see enum list; should this always be a constant? or should we bother mapping what we have in study.program? + 'Not Reported' as study_description, -- we have this just not stored in any db + s.domain as research_domain, -- current linkml enums don't quite match + 'Pediatric' as participant_lifespan_age, -- all kf studies should be set to pediatric + 'Not Reported' as selection_criteria, -- we may have this info in intake sheets, but not stored in any db for kf + 'Longitudinal' as study_design, -- not stored in a db, but might be in intake sheets ; otherwise not explicitly provided for kf + 'Unknown' as clinical_data_source_type, + 'Genomics' as data_category, -- this should be derived from available seq types in DS; need to review best apporach + 'Not Reported' as website, + count(distinct p.participant_id) as expected_number_of_participants, -- should this be a count of every pt in ds? + count(case when p.visible='true' then participant_id end) as actual_number_of_participants, -- should this be a count of only vis pt in ds? 
+ 'Not Reported' as acknowledgements, + 'Not Reported' as citation_statement, + 'Not Reported' as doi, + i.name as contact, + s.parent_study_id as parent_study, + i.name as principle_investigator, + s.external_id -- using phs number for now +from {{ ref('stable_investigator') }} as i +inner join {{ ref('stable_study') }} as s on i.investigator_id=s.investigator_id-- think about how this would be adjusted to account for all stable_study tables in KF +inner join {{ ref('stable_participant') }} as p on s.study_id=p.study_id +group by + s.study_id, funding_source, study_title, study_code, study_short_name, + program, study_description, research_domain, participant_lifespan_age, selection_criteria, + study_design, clinical_data_source_type, data_category, website, acknowledgements, citation_statement, doi, s.external_id, + contact, parent_study, principle_investigator \ No newline at end of file diff --git a/dbt_project/models/access/subject.sql b/dbt_project/models/access/subject.sql new file mode 100644 index 0000000..1d1c71d --- /dev/null +++ b/dbt_project/models/access/subject.sql @@ -0,0 +1,10 @@ +{{ config( + schema='access' +) }} + +select distinct + participant_id as subject_id, + 'KF participant' as subject_type, -- adding in KF as a source identifier + external_id +-- organism_type -- not sure if this is needed. 
could be set to 'human' if needed +from {{ ref('stable_participant') }} \ No newline at end of file diff --git a/dbt_project/models/access/subject_assertation.sql b/dbt_project/models/access/subject_assertation.sql new file mode 100644 index 0000000..cd98781 --- /dev/null +++ b/dbt_project/models/access/subject_assertation.sql @@ -0,0 +1,40 @@ +{{ config( + schema='access' +) }} + +( + select distinct + diagnosis_id as assertion_id, + participant_id as subject_id, + -- assertion_provenance, -- don't think we have this available in KF + age_at_event_days as age_at_assertion, -- could also use age_at_event + -- age_at_event, + -- age_at_resolution, + mondo_id_diagnosis as concept_source, -- using mondo as the primary ontology for diagnoses + -- value_number, + source_text_diagnosis as value_source -- not sure this is right place + -- value_units, + -- value_units_source + from {{ ref('stable_diagnosis') }} + +) + +union all + +( + select distinct + phenotype_id as assertion_id, + participant_id as subject_id, + -- assertion_provenance, -- don't think we have this available in KF + age_at_event_days as age_at_assertion, -- could also use age_at_event + -- age_at_event, + -- age_at_resolution, + hpo_id_phenotype as concept_source, -- using hpo as the primary ontology for phenotypes + -- value_number, + source_text_phenotype as value_source -- not sure this is right place + -- value_units, + -- value_units_source + from {{ ref('stable_phenotype') }} +) + + From 4b19372958994138da2b47454bff85398859a158 Mon Sep 17 00:00:00 2001 From: Amanda Warkow Date: Fri, 23 Jan 2026 14:31:50 -0500 Subject: [PATCH 08/12] =?UTF-8?q?=F0=9F=93=9D=20fix=20doc=20errors?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../docs_fields.md | 88 +++++++++++++++++++ .../docs_tables.md | 26 ++++++ .../models/access/subject_assertation.sql | 40 --------- .../sd_1nns3k8v/int/kf_sd_1nns3k8v_int.yml | 10 +-- .../int/kf_sd_1nns3k8v_int_sequencing.sql | 4 
+- .../sd_1nns3k8v/src/kf_sd_1nns3k8v_src.yml | 12 +-- 6 files changed, 123 insertions(+), 57 deletions(-) delete mode 100644 dbt_project/models/access/subject_assertation.sql diff --git a/dbt_project/models/_metadata_description_files/docs_fields.md b/dbt_project/models/_metadata_description_files/docs_fields.md index 06b7cba..4c2ccf5 100644 --- a/dbt_project/models/_metadata_description_files/docs_fields.md +++ b/dbt_project/models/_metadata_description_files/docs_fields.md @@ -600,3 +600,91 @@ Link to request specimens from study biobank. +## Cody Fields +{% docs family_relationship_target_participant_id %} +Participant ID of Proband of Family +{% enddocs %} + +{% docs aliquot_id %} +Aliquot ID from sequencing manifest +{% enddocs %} + +{% docs consent_group %} +Consent group of study +{% enddocs %} + +{% docs age_at_sample %} +Age sample was collected from sequencing manifest +{% enddocs %} + +{% docs sample_composition %} +Composition of specimen +{% enddocs %} + +{% docs sample_tissue_type %} +Tissue type of collected specimen +{% enddocs %} + +{% docs sex %} +Sex of participant +{% enddocs %} + +{% docs age_at_phenotype %} +Age of participant when phenotype was asserted +{% enddocs %} + +{% docs vital_status %} +Vital status of participant +{% enddocs %} + +{% docs collaborator_sample_id %} +Sample ID submitted by PI to sequencing center +{% enddocs %} + +{% docs collaborator_participant_id %} +Participant ID submitted by PI to sequencing center +{% enddocs %} + +{% docs cram_path %} +path of cram file from sequencing manifest +{% enddocs %} + +{% docs crai_path %} +path of crai file from sequencing manifest +{% enddocs %} + +{% docs md5_path %} +path of md5 file from sequencing manifest +{% enddocs %} + +{% docs root_sample_id %} +Specimen sample ID from sequencing manifest +{% enddocs %} + +{% docs mean_coverage %} +mean coverage value from sequencing manifest +{% enddocs %} + +{% docs bucket %} +s3 bucket for file +{% enddocs %} + +{% docs key %} +s3 key 
for file +{% enddocs %} + +{% docs last_modified %} +last time s3 file was modified +{% enddocs %} + +{% docs etag %} +etag hash from s3 manifest +{% enddocs %} + +{% docs size %} +size of file from s3 manifest +{% enddocs %} + +{% docs storage_class %} +s3 storage class for file +{% enddocs %} \ No newline at end of file diff --git a/dbt_project/models/_metadata_description_files/docs_tables.md b/dbt_project/models/_metadata_description_files/docs_tables.md index 5d03390..d254b48 100644 --- a/dbt_project/models/_metadata_description_files/docs_tables.md +++ b/dbt_project/models/_metadata_description_files/docs_tables.md @@ -136,4 +136,30 @@ Stable table for int_specimens. Finalized mapping of transformed dataservice ent {% docs stable_study %} Stable table for int_study. Finalized mapping of transformed dataservice entities that are ready to be brought into the access layer. +{% enddocs %} + + +## Kids First Study SD_1NNS3K8V +{% docs kf_sd_1nns3k8v_src_clinical %} +Source table for Cody study source clinical data +{% enddocs %} + +{% docs kf_sd_1nns3k8v_src_sequencing %} +Source table for Cody study source sequencing data +{% enddocs %} + +{% docs kf_sd_1nns3k8v_src_s3 %} +Source table for Cody study s3 file manifest +{% enddocs %} + +{% docs kf_sd_1nns3k8v_int_clinical %} +Intermediate table for src_clinical. Transforms dataservice entities for better usability and clarity. Excludes certain entities that are not needed. +{% enddocs %} + +{% docs kf_sd_1nns3k8v_int_sequencing %} +Intermediate table for src_sequencing. Transforms dataservice entities for better usability and clarity. Excludes certain entities that are not needed. +{% enddocs %} + +{% docs kf_sd_1nns3k8v_int_s3 %} +Intermediate table for src_s3. Transforms dataservice entities for better usability and clarity. Excludes certain entities that are not needed. 
{% enddocs %} \ No newline at end of file diff --git a/dbt_project/models/access/subject_assertation.sql b/dbt_project/models/access/subject_assertation.sql deleted file mode 100644 index cd98781..0000000 --- a/dbt_project/models/access/subject_assertation.sql +++ /dev/null @@ -1,40 +0,0 @@ -{{ config( - schema='access' -) }} - -( - select distinct - diagnosis_id as assertion_id, - participant_id as subject_id, - -- assertion_provenance, -- don't think we have this available in KF - age_at_event_days as age_at_assertion, -- could also use age_at_event - -- age_at_event, - -- age_at_resolution, - mondo_id_diagnosis as concept_source, -- using mondo as the primary ontology for diagnoses - -- value_number, - source_text_diagnosis as value_source -- not sure this is right place - -- value_units, - -- value_units_source - from {{ ref('stable_diagnosis') }} - -) - -union all - -( - select distinct - phenotype_id as assertion_id, - participant_id as subject_id, - -- assertion_provenance, -- don't think we have this available in KF - age_at_event_days as age_at_assertion, -- could also use age_at_event - -- age_at_event, - -- age_at_resolution, - hpo_id_phenotype as concept_source, -- using hpo as the primary ontology for phenotypes - -- value_number, - source_text_phenotype as value_source -- not sure this is right place - -- value_units, - -- value_units_source - from {{ ref('stable_phenotype') }} -) - - diff --git a/dbt_project/models/kids_first/sd_1nns3k8v/int/kf_sd_1nns3k8v_int.yml b/dbt_project/models/kids_first/sd_1nns3k8v/int/kf_sd_1nns3k8v_int.yml index ab4bd47..ec4f3ea 100644 --- a/dbt_project/models/kids_first/sd_1nns3k8v/int/kf_sd_1nns3k8v_int.yml +++ b/dbt_project/models/kids_first/sd_1nns3k8v/int/kf_sd_1nns3k8v_int.yml @@ -3,7 +3,7 @@ version: 2 models: - name: kf_sd_1nns3k8v_int_clinical - description: '{{ doc("int_clinical") }}' + description: '{{ doc("kf_sd_1nns3k8v_int_clinical") }}' config: meta: study: kf_sd_1nns3k8v @@ -40,19 +40,15 @@ models: 
description: '{{ doc("vital_status") }}' - name: kf_sd_1nns3k8v_int_sequencing - description: '{{ doc("int_sequencing") }}' + description: '{{ doc("kf_sd_1nns3k8v_int_sequencing") }}' config: meta: study: kf_sd_1nns3k8v columns: - name: sample_id description: '{{ doc("sample_id") }}' - - name: project - description: '{{ doc("project") }}' - name: collaborator_sample_id description: '{{ doc("collaborator_sample_id") }}' - - name: data_type - description: '{{ doc("data_type") }}' - name: collaborator_participant_id description: '{{ doc("collaborator_participant_id") }}' - name: cram_path @@ -72,7 +68,7 @@ models: - name: kf_sd_1nns3k8v_int_s3 - description: '{{ doc("int_s3") }}' + description: '{{ doc("kf_sd_1nns3k8v_int_s3") }}' config: meta: study: kf_sd_1nns3k8v diff --git a/dbt_project/models/kids_first/sd_1nns3k8v/int/kf_sd_1nns3k8v_int_sequencing.sql b/dbt_project/models/kids_first/sd_1nns3k8v/int/kf_sd_1nns3k8v_int_sequencing.sql index 33a4b26..69d3794 100644 --- a/dbt_project/models/kids_first/sd_1nns3k8v/int/kf_sd_1nns3k8v_int_sequencing.sql +++ b/dbt_project/models/kids_first/sd_1nns3k8v/int/kf_sd_1nns3k8v_int_sequencing.sql @@ -4,10 +4,10 @@ select distinct "entity:sample_id" as sample_id, - "project" as project, + -- "project" as project, "collaborator_sample_id" as collaborator_sample_id, -- "version" as version, - "data_type" as data_type, + -- "data_type" as data_type, -- "pdo" as pdo, -- "release_date" as release_date, -- "reference_sequence_name" as reference_sequence_name, diff --git a/dbt_project/models/kids_first/sd_1nns3k8v/src/kf_sd_1nns3k8v_src.yml b/dbt_project/models/kids_first/sd_1nns3k8v/src/kf_sd_1nns3k8v_src.yml index 7cb7c73..d4ae549 100644 --- a/dbt_project/models/kids_first/sd_1nns3k8v/src/kf_sd_1nns3k8v_src.yml +++ b/dbt_project/models/kids_first/sd_1nns3k8v/src/kf_sd_1nns3k8v_src.yml @@ -3,12 +3,12 @@ version: 2 models: - name: kf_sd_1nns3k8v_src_clinical - description: '{{ doc("src_clinical") }}' + description: '{{ 
doc("kf_sd_1nns3k8v_src_clinical") }}' config: meta: study: kf_sd_1nns3k8v columns: - - name: family_id + - name: "family_id" description: '{{ doc("family_id") }}' - name: participant_id description: '{{ doc("participant_id") }}' @@ -40,19 +40,15 @@ models: description: '{{ doc("vital_status") }}' - name: kf_sd_1nns3k8v_src_sequencing - description: '{{ doc("src_sequencing") }}' + description: '{{ doc("kf_sd_1nns3k8v_src_sequencing") }}' config: meta: study: kf_sd_1nns3k8v columns: - name: sample_id description: '{{ doc("sample_id") }}' - - name: project - description: '{{ doc("project") }}' - name: collaborator_sample_id description: '{{ doc("collaborator_sample_id") }}' - - name: data_type - description: '{{ doc("data_type") }}' - name: collaborator_participant_id description: '{{ doc("collaborator_participant_id") }}' - name: cram_path @@ -72,7 +68,7 @@ models: - name: kf_sd_1nns3k8v_src_s3 - description: '{{ doc("src_s3") }}' + description: '{{ doc("kf_sd_1nns3k8v_src_s3") }}' config: meta: study: kf_sd_1nns3k8v From 5028c13b027c700adad6ea850f8467ae4d0ad110 Mon Sep 17 00:00:00 2001 From: Amanda Warkow Date: Tue, 27 Jan 2026 15:05:48 -0500 Subject: [PATCH 09/12] =?UTF-8?q?=E2=9C=85=20resolve=20seed=20errors?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../int/kf_sd_1nns3k8v_int_sequencing.sql | 6 +++-- dbt_project/seeds/_seeds.yml | 23 ++++++++++++++++++- 2 files changed, 26 insertions(+), 3 deletions(-) diff --git a/dbt_project/models/kids_first/sd_1nns3k8v/int/kf_sd_1nns3k8v_int_sequencing.sql b/dbt_project/models/kids_first/sd_1nns3k8v/int/kf_sd_1nns3k8v_int_sequencing.sql index 69d3794..33b1689 100644 --- a/dbt_project/models/kids_first/sd_1nns3k8v/int/kf_sd_1nns3k8v_int_sequencing.sql +++ b/dbt_project/models/kids_first/sd_1nns3k8v/int/kf_sd_1nns3k8v_int_sequencing.sql @@ -25,8 +25,10 @@ select distinct -- "pf_reads" as pf_reads, -- "pf_reads_aligned" as pf_reads_aligned, "total_reads" as total_reads, - 
"mean_read_length" as mean_read_length, + "mean_read_length" as mean_read_length -- "pf_reads_rate" as pf_reads_rate, -- "pf_reads_aligned_rate" as pf_reads_aligned_rate, -- "pf_aligned_bases" as pf_aligned_bases, - -- "reads_aligned_in_pairs" as reads_aligned_in_pairs, \ No newline at end of file + -- "reads_aligned_in_pairs" as reads_aligned_in_pairs, + +from {{ ref('kf_sd_1nns3k8v_src_sequencing') }} \ No newline at end of file diff --git a/dbt_project/seeds/_seeds.yml b/dbt_project/seeds/_seeds.yml index 3ea41e7..09a2d9f 100644 --- a/dbt_project/seeds/_seeds.yml +++ b/dbt_project/seeds/_seeds.yml @@ -9,4 +9,25 @@ seeds: - name: letter description: A letter of the alphabet - name: letter_grouping - description: The grouping that the letter belongs to \ No newline at end of file + description: The grouping that the letter belongs to + + - name: s3_scrape_cody + config: + column_types: + Size: bigint + + - name: sample + config: + column_types: + total_reads: bigint + pf_aligned_bases: bigint + pf_hq_aligned_bases: bigint + pf_hq_aligned_q20_bases: bigint + genome_territory: bigint + library-1_estimated_library_size: bigint + # pf_reads: bigint + # pf_reads_aligned: bigint + # reads_aligned_in_pairs: bigint + # pf_hq_aligned_reads: bigint + # library-1_read_pairs: bigint + # pf_reads_improper_pairs: bigint \ No newline at end of file From 3d1eaf1d7235deae88cf086edacb0485b7994c67 Mon Sep 17 00:00:00 2001 From: Amanda Warkow Date: Tue, 27 Jan 2026 15:18:02 -0500 Subject: [PATCH 10/12] =?UTF-8?q?=E2=9C=8F=EF=B8=8F=20lint=20code?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../example_study/my_first_dbt_model.sql | 7 ++- .../int/kf_sd_1nns3k8v_int_clinical.sql | 30 +++++------ .../sd_1nns3k8v/int/kf_sd_1nns3k8v_int_s3.sql | 12 ++--- .../int/kf_sd_1nns3k8v_int_sequencing.sql | 54 +++++++++---------- 4 files changed, 51 insertions(+), 52 deletions(-) diff --git 
a/dbt_project/models/kids_first/example_study/my_first_dbt_model.sql b/dbt_project/models/kids_first/example_study/my_first_dbt_model.sql index 5b51873..279006b 100644 --- a/dbt_project/models/kids_first/example_study/my_first_dbt_model.sql +++ b/dbt_project/models/kids_first/example_study/my_first_dbt_model.sql @@ -1,4 +1,3 @@ - /* Welcome to your first dbt model! Did you know that you can also configure models directly within SQL files? @@ -11,9 +10,9 @@ with source_data as ( - select 1 as id - union all - select null as id + select 1 as id + union all + select null as id ) diff --git a/dbt_project/models/kids_first/sd_1nns3k8v/int/kf_sd_1nns3k8v_int_clinical.sql b/dbt_project/models/kids_first/sd_1nns3k8v/int/kf_sd_1nns3k8v_int_clinical.sql index 1cacf62..404613a 100644 --- a/dbt_project/models/kids_first/sd_1nns3k8v/int/kf_sd_1nns3k8v_int_clinical.sql +++ b/dbt_project/models/kids_first/sd_1nns3k8v/int/kf_sd_1nns3k8v_int_clinical.sql @@ -3,19 +3,19 @@ ) }} select distinct - "Family ID" as family_id, - "Participant ID" as participant_id, - "Family Relationship Target Participant ID" as family_relationship_target_participant_id, - "alliquot_id" as aliquot_id, - "Consent Group" as consent_group, - "Age at Sample" as age_at_sample, - "Sample Composition" as sample_composition, - "Sample Tissue Type" as sample_tissue_type, - "Race" as race, - "Sex" as sex, - "Ethnicity" as ethnicity, - "Analyte Type" as analyte_type, - "Age at Phenotype" as age_at_phenotype, - "Affected Status" as affected_status, - "Vital Status" as vital_status + "Family ID" as family_id, + "Participant ID" as participant_id, + "Family Relationship Target Participant ID" as family_relationship_target_participant_id, + alliquot_id as aliquot_id, + "Consent Group" as consent_group, + "Age at Sample" as age_at_sample, + "Sample Composition" as sample_composition, + "Sample Tissue Type" as sample_tissue_type, + "Race" as race, + "Sex" as sex, + "Ethnicity" as ethnicity, + "Analyte Type" as 
analyte_type, + "Age at Phenotype" as age_at_phenotype, + "Affected Status" as affected_status, + "Vital Status" as vital_status from {{ ref('kf_sd_1nns3k8v_src_clinical') }} \ No newline at end of file diff --git a/dbt_project/models/kids_first/sd_1nns3k8v/int/kf_sd_1nns3k8v_int_s3.sql b/dbt_project/models/kids_first/sd_1nns3k8v/int/kf_sd_1nns3k8v_int_s3.sql index 4873529..bb15b5e 100644 --- a/dbt_project/models/kids_first/sd_1nns3k8v/int/kf_sd_1nns3k8v_int_s3.sql +++ b/dbt_project/models/kids_first/sd_1nns3k8v/int/kf_sd_1nns3k8v_int_s3.sql @@ -3,10 +3,10 @@ ) }} select distinct - "Bucket" as bucket, - "Key" as key, - "LastModified" as last_modified, - "ETag" as etag, - "Size" as size, - "StorageClass" as storage_class + "Bucket" as bucket, + "Key" as key, + "LastModified" as last_modified, + "ETag" as etag, + "Size" as size, + "StorageClass" as storage_class from {{ ref('s3_scrape_cody') }} \ No newline at end of file diff --git a/dbt_project/models/kids_first/sd_1nns3k8v/int/kf_sd_1nns3k8v_int_sequencing.sql b/dbt_project/models/kids_first/sd_1nns3k8v/int/kf_sd_1nns3k8v_int_sequencing.sql index 33b1689..8ae9f7a 100644 --- a/dbt_project/models/kids_first/sd_1nns3k8v/int/kf_sd_1nns3k8v_int_sequencing.sql +++ b/dbt_project/models/kids_first/sd_1nns3k8v/int/kf_sd_1nns3k8v_int_sequencing.sql @@ -3,32 +3,32 @@ ) }} select distinct - "entity:sample_id" as sample_id, - -- "project" as project, - "collaborator_sample_id" as collaborator_sample_id, - -- "version" as version, - -- "data_type" as data_type, - -- "pdo" as pdo, - -- "release_date" as release_date, - -- "reference_sequence_name" as reference_sequence_name, - "collaborator_participant_id" as collaborator_participant_id, - "cram_path" as cram_path, - "crai_path" as crai_path, - "md5_path" as md5_path, - "root_sample_id" as root_sample_id, - "mean_coverage" as mean_coverage, - -- "adapter_rate" as adapter_rate, - -- "pf_hq_aligned_q20_bases" as pf_hq_aligned_q20_bases, - -- "pf_hq_aligned_reads" as 
pf_hq_aligned_reads, - -- "pf_mismatch_rate" as pf_mismatch_rate, - -- "pf_noise_reads" as pf_noise_reads, - -- "pf_reads" as pf_reads, - -- "pf_reads_aligned" as pf_reads_aligned, - "total_reads" as total_reads, - "mean_read_length" as mean_read_length - -- "pf_reads_rate" as pf_reads_rate, - -- "pf_reads_aligned_rate" as pf_reads_aligned_rate, - -- "pf_aligned_bases" as pf_aligned_bases, - -- "reads_aligned_in_pairs" as reads_aligned_in_pairs, + "entity:sample_id" as sample_id, + -- "project" as project, + collaborator_sample_id, + -- "version" as version, + -- "data_type" as data_type, + -- "pdo" as pdo, + -- "release_date" as release_date, + -- "reference_sequence_name" as reference_sequence_name, + collaborator_participant_id, + cram_path, + crai_path, + md5_path, + root_sample_id, + mean_coverage, + -- "adapter_rate" as adapter_rate, + -- "pf_hq_aligned_q20_bases" as pf_hq_aligned_q20_bases, + -- "pf_hq_aligned_reads" as pf_hq_aligned_reads, + -- "pf_mismatch_rate" as pf_mismatch_rate, + -- "pf_noise_reads" as pf_noise_reads, + -- "pf_reads" as pf_reads, + -- "pf_reads_aligned" as pf_reads_aligned, + total_reads, + mean_read_length +-- "pf_reads_rate" as pf_reads_rate, +-- "pf_reads_aligned_rate" as pf_reads_aligned_rate, +-- "pf_aligned_bases" as pf_aligned_bases, +-- "reads_aligned_in_pairs" as reads_aligned_in_pairs, from {{ ref('kf_sd_1nns3k8v_src_sequencing') }} \ No newline at end of file From 5271e2d29cfacfd8c2fc3f46e9bc7211e4ae8b21 Mon Sep 17 00:00:00 2001 From: Amanda Warkow Date: Tue, 27 Jan 2026 19:01:57 -0500 Subject: [PATCH 11/12] =?UTF-8?q?=F0=9F=93=9D=20fix=20documentation?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../docs_fields.md | 39 ++++++++++++++++++- dbt_project/seeds/_seeds.yml | 1 + 2 files changed, 38 insertions(+), 2 deletions(-) diff --git a/dbt_project/models/_metadata_description_files/docs_fields.md b/dbt_project/models/_metadata_description_files/docs_fields.md index 
4c2ccf5..e027cc2 100644 --- a/dbt_project/models/_metadata_description_files/docs_fields.md +++ b/dbt_project/models/_metadata_description_files/docs_fields.md @@ -598,9 +598,10 @@ Link to request specimens from study biobank. {% enddocs %} +## Kids First Source Data Fields +### Clinical Data -## Cody Fields {% docs family_relationship_target_participant_id %} Participant ID of Proband of Family {% enddocs %} @@ -637,6 +638,35 @@ Age of participant when phenotype was asserted Vital status of participant {% enddocs %} + +### Broad Manifest + +{% docs collaborator_sample_id %} +Sample ID submitted by PI to sequencing center +{% enddocs %} + +{% docs collaborator_participant_id %} +Participant ID submitted by PI to sequencing center +{% enddocs %} + +{% docs cram_path %} +path of cram file from sequencing manifest +{% enddocs %} + +{% docs crai_path %} +path of crai file from sequencing manifest +{% enddocs %} + +{% docs md5_path %} +path of md5 file from sequencing manifest +{% enddocs %} + +{% docs root_sample_id %} +Specimen sample ID from sequencing manifest +{% enddocs %} + +### S3 Scrape + {% docs collaborator_sample_id %} Sample ID submitted by PI to sequencing center {% enddocs %} @@ -687,4 +717,9 @@ size of file from s3 manifest {% docs storage_class %} s3 storage class for file -{% enddocs %} \ No newline at end of file +{% enddocs %} + + +## Study Specific Fields + +# SD_1NNS3K8V Fields \ No newline at end of file diff --git a/dbt_project/seeds/_seeds.yml b/dbt_project/seeds/_seeds.yml index 09a2d9f..d98ab0f 100644 --- a/dbt_project/seeds/_seeds.yml +++ b/dbt_project/seeds/_seeds.yml @@ -19,6 +19,7 @@ seeds: - name: sample config: column_types: + # these were all the columns that had the highest integers and I narrowed it down to what would pass total_reads: bigint pf_aligned_bases: bigint pf_hq_aligned_bases: bigint From 25e1bea62a6ddee55d180e9697d98f2f5ad632e7 Mon Sep 17 00:00:00 2001 From: Amanda Warkow Date: Wed, 4 Feb 2026 13:36:10 -0500 Subject: 
[PATCH 12/12] =?UTF-8?q?=E2=99=BB=EF=B8=8F=20make=20seed=20files?= =?UTF-8?q?=20prd=20sources?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .sqlfluff | 2 +- .../docs_fields.md | 24 ------------------- .../sd_1nns3k8v/src/kf_sd_1nns3k8v_src.yml | 10 ++++++++ .../src/kf_sd_1nns3k8v_src_clinical.sql | 2 +- .../sd_1nns3k8v/src/kf_sd_1nns3k8v_src_s3.sql | 2 +- .../src/kf_sd_1nns3k8v_src_sequencing.sql | 2 +- dbt_project/profiles.yml | 2 +- 7 files changed, 15 insertions(+), 29 deletions(-) diff --git a/.sqlfluff b/.sqlfluff index 995d6dc..7b88f5a 100644 --- a/.sqlfluff +++ b/.sqlfluff @@ -10,7 +10,7 @@ project_dir = dbt_project target = dev # If needed, uncomment the line below to specify the directory where your # profiles.yml file is located -profiles_dir = ~/.dbt/include +profiles_dir = ~/.dbt/ [sqlfluff:indentation] tab_space_size = 2 diff --git a/dbt_project/models/_metadata_description_files/docs_fields.md b/dbt_project/models/_metadata_description_files/docs_fields.md index e027cc2..a5cd9aa 100644 --- a/dbt_project/models/_metadata_description_files/docs_fields.md +++ b/dbt_project/models/_metadata_description_files/docs_fields.md @@ -667,30 +667,6 @@ Specimen sample ID from sequencing manifest ### S3 Scrape -{% docs collaborator_sample_id %} -Sample ID submitted by PI to sequencing center -{% enddocs %} - -{% docs collaborator_participant_id %} -Participant ID submitted by PI to sequencing center -{% enddocs %} - -{% docs cram_path %} -path of cram file from sequencing manifest -{% enddocs %} - -{% docs crai_path %} -path of crai file from sequencing manifest -{% enddocs %} - -{% docs md5_path %} -path of md5 file from sequencing manifest -{% enddocs %} - -{% docs root_sample_id %} -Specimen sample ID from sequencing manifest -{% enddocs %} - {% docs mean_coverage %} mean coverage value from sequencing manifest {% enddocs %} diff --git a/dbt_project/models/kids_first/sd_1nns3k8v/src/kf_sd_1nns3k8v_src.yml 
b/dbt_project/models/kids_first/sd_1nns3k8v/src/kf_sd_1nns3k8v_src.yml index d4ae549..3daf953 100644 --- a/dbt_project/models/kids_first/sd_1nns3k8v/src/kf_sd_1nns3k8v_src.yml +++ b/dbt_project/models/kids_first/sd_1nns3k8v/src/kf_sd_1nns3k8v_src.yml @@ -1,5 +1,15 @@ version: 2 +sources: + - name: kf_sd_1nns3k8v_src + database: includewarehouse + schema: prd_import + tables: + - name: kids_first_update + - name: sample + - name: s3_scrape_cody + + models: - name: kf_sd_1nns3k8v_src_clinical diff --git a/dbt_project/models/kids_first/sd_1nns3k8v/src/kf_sd_1nns3k8v_src_clinical.sql b/dbt_project/models/kids_first/sd_1nns3k8v/src/kf_sd_1nns3k8v_src_clinical.sql index 0efe7e1..83f5178 100644 --- a/dbt_project/models/kids_first/sd_1nns3k8v/src/kf_sd_1nns3k8v_src_clinical.sql +++ b/dbt_project/models/kids_first/sd_1nns3k8v/src/kf_sd_1nns3k8v_src_clinical.sql @@ -3,4 +3,4 @@ ) }} select * -from {{ ref('kids_first_update') }} \ No newline at end of file +from {{ source('kf_sd_1nns3k8v_src', 'kids_first_update') }} \ No newline at end of file diff --git a/dbt_project/models/kids_first/sd_1nns3k8v/src/kf_sd_1nns3k8v_src_s3.sql b/dbt_project/models/kids_first/sd_1nns3k8v/src/kf_sd_1nns3k8v_src_s3.sql index e06c9a4..d05fff0 100644 --- a/dbt_project/models/kids_first/sd_1nns3k8v/src/kf_sd_1nns3k8v_src_s3.sql +++ b/dbt_project/models/kids_first/sd_1nns3k8v/src/kf_sd_1nns3k8v_src_s3.sql @@ -3,4 +3,4 @@ ) }} select * -from {{ ref('s3_scrape_cody') }} \ No newline at end of file +from {{ source('kf_sd_1nns3k8v_src', 's3_scrape_cody') }} \ No newline at end of file diff --git a/dbt_project/models/kids_first/sd_1nns3k8v/src/kf_sd_1nns3k8v_src_sequencing.sql b/dbt_project/models/kids_first/sd_1nns3k8v/src/kf_sd_1nns3k8v_src_sequencing.sql index a0ddfdd..12ecfce 100644 --- a/dbt_project/models/kids_first/sd_1nns3k8v/src/kf_sd_1nns3k8v_src_sequencing.sql +++ b/dbt_project/models/kids_first/sd_1nns3k8v/src/kf_sd_1nns3k8v_src_sequencing.sql @@ -3,4 +3,4 @@ ) }} select * -from {{ 
ref('sample') }} \ No newline at end of file +from {{ source('kf_sd_1nns3k8v_src', 'sample') }} \ No newline at end of file diff --git a/dbt_project/profiles.yml b/dbt_project/profiles.yml index 6269949..5388117 100644 --- a/dbt_project/profiles.yml +++ b/dbt_project/profiles.yml @@ -26,7 +26,7 @@ include_dbt_sandbox: user: "{{ env_var('INCLUDEWAREHOUSE_SCV_USERNAME') }}" password: "{{ env_var('INCLUDEWAREHOUSE_SCV_PASSWORD') }}" port: 5432 - dbname: postgres + dbname: includewarehouse schema: prd threads: 4