diff --git a/dags/kids_first/dataservice_studies.py b/dags/kids_first/dataservice_studies.py new file mode 100644 index 0000000..3c3a1d5 --- /dev/null +++ b/dags/kids_first/dataservice_studies.py @@ -0,0 +1,36 @@ +from airflow.sdk import Variable + +from cosmos import ( + DbtDag, + ProjectConfig, + ProfileConfig, + ExecutionConfig, + RenderConfig, +) +from cosmos.profiles import PostgresUserPasswordProfileMapping + +profile_config = ProfileConfig( + # make sure target_name and profile_mapping align + profile_name=Variable.get("DBT_PROFILE_NAME"), + target_name="prd", + profile_mapping=PostgresUserPasswordProfileMapping( + conn_id="postgres_prd_svc", + profile_args={"schema": "prd"}, + ), +) + +example_study_dag = DbtDag( + project_config=ProjectConfig( + Variable.get("DBT_PROJECT_DIR"), + install_dbt_deps=True, + ), + profile_config=profile_config, + execution_config=ExecutionConfig( + dbt_executable_path=Variable.get("DBT_EXECUTABLE_PATH"), + ), + render_config=RenderConfig(select=["config.meta.study:kf_dataservice_study"]), + # normal dag parameters + schedule="@daily", + dag_id="kf_dataservice_studies", + tags=["POC", "Kids First"], +) \ No newline at end of file diff --git a/dbt_project/models/_metadata_description_files/docs_fields.md b/dbt_project/models/_metadata_description_files/docs_fields.md index a5cd9aa..aee4563 100644 --- a/dbt_project/models/_metadata_description_files/docs_fields.md +++ b/dbt_project/models/_metadata_description_files/docs_fields.md @@ -128,6 +128,32 @@ The dewrangle generated id for a family. This id is a lower-cased version of the Denotes type of family using a set of enums, such as proband only or trio. Not currently populated in Kids First dataservie, but is calculcated by the portal etl and displayed on the Kids First portal. {% enddocs %} +### family relationship fields + +{% docs participant1_id %} +The kf id of one person in the family relationship. 
+{% enddocs %} + +{% docs participant2_id %} +The kf id of the second person in the family relationship. +{% enddocs %} + +{% docs participant1_to_participant2_relation %} +A descriptor that indicates person 1's genetic relationship to person 2. Is typically mother, father, child, or sibling. +{% enddocs %} + +{% docs participant2_to_participant1_relation %} +A descriptor that indicates person 2's genetic relationship to person 1. Is typically null, mother, father, son/daughter, brother/sister. +{% enddocs %} + +{% docs relationship_id %} +The Kids First assigned kf id that represents a genetic relationship between two participants. In the format, "FR_XXXXXXXX" +{% enddocs %} + +{% docs source_text_notes %} +Additional text notes from source describing the relationship. Not typically populated. +{% enddocs %} + ### genomic_file fields {% docs dewrangle_genomic_file_id %} @@ -204,6 +230,16 @@ The dewrangle generated id for an investigator. This id is a lower-cased version The name of the investigator's institution. {% enddocs %} +### outcome fields + +{% docs vital_status %} +The patient's reported state of being alive or deceased. +{% enddocs %} + +{% docs disease_related %} +A yes or no field indicating whether a patient's deceased vital status is a result of the disease. +{% enddocs %} + ### participant fields {% docs alias_group_id %} @@ -260,6 +296,34 @@ Denotes whether a phenotype is negative or positive The ID of the term from Systematized Nomenclature of Medicine --Clinical Terms which encodes clinical terminology. Not actively populated. {% enddocs %} +### sample fields + +{% docs sample_event_key %} +Identifier for event when sample was first drawn +{% enddocs %} + +{% docs tissue_type %} +Description of the kind of tissue collected if its a tissue type sample. +{% enddocs %} + +{% docs sample_type %} +The kind of material of the sample. +{% enddocs %} + +{% docs anatomical_location %} +The anatomical location of collection. 
+{% enddocs %} + +{% docs external_collection_id %} +Identifier for the collection event +{% enddocs %} + +### sequencing center fields + +{% docs sequencing_center_name %} +The official name of the sequencing center used to generate source genomic file outputs. +{% enddocs %} + ### sequencing experiment fields {% docs dewrangle_sequencing_experiment_id %} @@ -634,10 +698,6 @@ Sex of pariticipant Age of participant when phenotype was asserted {% enddocs %} -{% docs vital_status %} -Vital status of participant -{% enddocs %} - ### Broad Manifest diff --git a/dbt_project/models/_metadata_description_files/docs_tables.md b/dbt_project/models/_metadata_description_files/docs_tables.md index d254b48..ff03c6f 100644 --- a/dbt_project/models/_metadata_description_files/docs_tables.md +++ b/dbt_project/models/_metadata_description_files/docs_tables.md @@ -2,140 +2,188 @@ ## Kids First Dataservice Tables - Source Stage -{% docs src_bsgf %} +{% docs kf_ds_src_bsgf %} Kids First Dataservivce source table for linking specimens to genomic files. One file may be linked to many specimens. {% enddocs %} -{% docs src_diagnosis %} +{% docs kf_ds_src_diagnosis %} Kids First Dataservice source table for harmonized conditions curated to MONDO codes at the patient level. All conditions in this table are implied to be observed in patients. Each row represents one condition per patient. {% enddocs %} -{% docs src_family %} -Kids First Dataservice source table that holds family ids for each participant. This table can be joined to src_participants to obtain participant to family id mappings. +{% docs kf_ds_src_family %} +Kids First Dataservice source table that holds family ids for each participant. This table can be joined to kf_ds_src_participants to obtain participant to family id mappings. {% enddocs %} -{% docs src_genomic_files %} -Kids First Dataservice source table that holds raw and harmonized genomic file outputs. 
This table provides file and bioinformatic workflow metadata for each file. Must be joined to src_bsgf to obtain specimen to file mappings. +{% docs kf_ds_src_family_relationship %} +Kids First Dataservice source table that holds family relationships for each participant. Usually only reports relationships for duos, trios, or trios+. {% enddocs %} -{% docs src_investigator %} +{% docs kf_ds_src_genomic_file %} +Kids First Dataservice source table that holds raw and harmonized genomic file outputs. This table provides file and bioinformatic workflow metadata for each file. Must be joined to kf_ds_src_bsgf to obtain specimen to file mappings. +{% enddocs %} + +{% docs kf_ds_src_investigator %} Kids First Dataservice source table for investigator information. Only contains minimal contact information for the Principle Investigator of a study. One investigator may be associated to multiple study ids. {% enddocs %} -{% docs src_participant %} -Kids First Dataservice source table for participant demographic information. Also contains information regarding a participant's affected status. Links each participant to an assigned family id from src_family and an assigned study id from src_study. +{% docs kf_ds_src_outcome %} +Kids First Dataservice source table for outcome information. Reports the vital status of patients and whether or not death was disease related. +{% enddocs %} + +{% docs kf_ds_src_participant %} +Kids First Dataservice source table for participant demographic information. Also contains information regarding a participant's affected status. Links each participant to an assigned family id from kf_ds_src_family and an assigned study id from kf_ds_src_study. {% enddocs %} -{% docs src_phenotype %} +{% docs kf_ds_src_phenotype %} Kids First Dataservice source table for harmonized conditions curated to HPO codes at the patient level. Conditions can be observed or not observed in a patient. Each row represents one condition and observation status per patient. 
{% enddocs %} -{% docs src_segf %} +{% docs kf_ds_src_sample %} +Kids First Dataservice source table for samples. +{% enddocs %} + +{% docs kf_ds_src_segf %} Kids First Dataservice source table for linking sequencing experiments to genomic files. Multiple files can be linked to one sequencing experiment. {% enddocs %} -{% docs src_sequencing_experiments %} +{% docs kf_ds_src_sequencing_center %} +Kids First Dataservice source table for sequencing center information. +{% enddocs %} + +{% docs kf_ds_src_sequencing_experiment %} Kids First Dataservice source table for sequencing experiments that holds sequencing metadata. {% enddocs %} -{% docs src_specimens %} +{% docs kf_ds_src_biospecimen %} Kids First Dataservice source table for biospecimen information. Contains specimen collection information and specimen material information, as well as VBR specific entities to support CBTN VBR fields. Each row represents one aliquot per participant. {% enddocs %} -{% docs src_study %} +{% docs kf_ds_src_study %} Kids First Dataservice source table for study metadata. Contains full and short study names, study codes, study program, and dbgap phs numbers. {% enddocs %} ## Kids First Dataservice Tables - Int Stage -{% docs int_bsgf %} -Intermediate table for src_bsgf. Transforms dataservice entities for better usability and clarity. Excludes certain entites that are not needed. +{% docs kf_ds_int_bsgf %} +Intermediate table for kf_ds_src_bsgf. Transforms dataservice entities for better usability and clarity. Excludes certain entities that are not needed. +{% enddocs %} + +{% docs kf_ds_int_diagnosis %} +Intermediate table for kf_ds_src_diagnosis. Transforms dataservice entities for better usability and clarity. Excludes certain entities that are not needed. +{% enddocs %} + +{% docs kf_ds_int_family %} +Intermediate table for kf_ds_src_family. Transforms dataservice entities for better usability and clarity. Excludes certain entities that are not needed.
{% enddocs %} -{% docs int_diagnosis %} -Intermediate table for src_diagnosis. Transforms dataservice entities for better usability and clarity. Excludes certain entites that are not needed. +{% docs kf_ds_int_family_relationship %} +Intermediate table for kf_ds_src_family_relationship. Transforms dataservice entities for better usability and clarity. Excludes certain entities that are not needed. {% enddocs %} -{% docs int_family %} -Intermediate table for src_family. Transforms dataservice entities for better usability and clarity. Excludes certain entites that are not needed. +{% docs kf_ds_int_genomic_file %} +Intermediate table for kf_ds_src_genomic_file. Transforms dataservice entities for better usability and clarity. Excludes certain entities that are not needed. {% enddocs %} -{% docs int_genomic_files %} -Intermediate table for src_genomic_files. Transforms dataservice entities for better usability and clarity. Excludes certain entites that are not needed. +{% docs kf_ds_int_investigator %} +Intermediate table for kf_ds_src_investigator. Transforms dataservice entities for better usability and clarity. Excludes certain entities that are not needed. {% enddocs %} -{% docs int_investigator %} -Intermediate table for src_investigator. Transforms dataservice entities for better usability and clarity. Excludes certain entites that are not needed. +{% docs kf_ds_int_outcome %} +Intermediate table for kf_ds_src_outcome. Transforms dataservice entities for better usability and clarity. Excludes certain entities that are not needed. {% enddocs %} -{% docs int_participant %} -Intermediate table for src_participant. Transforms dataservice entities for better usability and clarity. Excludes certain entites that are not needed. +{% docs kf_ds_int_participant %} +Intermediate table for kf_ds_src_participant. Transforms dataservice entities for better usability and clarity. Excludes certain entities that are not needed.
{% enddocs %} -{% docs int_phenotype %} -Intermediate table for src_phenotype. Transforms dataservice entities for better usability and clarity. Excludes certain entites that are not needed. +{% docs kf_ds_int_phenotype %} +Intermediate table for kf_ds_src_phenotype. Transforms dataservice entities for better usability and clarity. Excludes certain entities that are not needed. {% enddocs %} -{% docs int_segf %} -Intermediate table for src_segf. Transforms dataservice entities for better usability and clarity. Excludes certain entites that are not needed. +{% docs kf_ds_int_sample %} +Intermediate table for kf_ds_src_sample. Transforms dataservice entities for better usability and clarity. Excludes certain entities that are not needed. {% enddocs %} -{% docs int_sequencing_experiment %} -Intermediate table for src_sequencing_experiments. Transforms dataservice entities for better usability and clarity. Excludes certain entites that are not needed. +{% docs kf_ds_int_sequencing_center %} +Intermediate table for kf_ds_src_sequencing_center. Transforms dataservice entities for better usability and clarity. Excludes certain entities that are not needed. {% enddocs %} -{% docs int_specimens %} -Intermediate table for src_specimens. Transforms dataservice entities for better usability and clarity. Excludes certain entites that are not needed. +{% docs kf_ds_int_segf %} +Intermediate table for kf_ds_src_segf. Transforms dataservice entities for better usability and clarity. Excludes certain entities that are not needed. {% enddocs %} -{% docs int_study %} -Intermediate table for src_study. Transforms dataservice entities for better usability and clarity. Excludes certain entites that are not needed. +{% docs kf_ds_int_sequencing_experiment %} +Intermediate table for kf_ds_src_sequencing_experiment. Transforms dataservice entities for better usability and clarity. Excludes certain entities that are not needed.
+{% enddocs %} + +{% docs kf_ds_int_biospecimen %} +Intermediate table for kf_ds_src_biospecimen. Transforms dataservice entities for better usability and clarity. Excludes certain entities that are not needed. +{% enddocs %} + +{% docs kf_ds_int_study %} +Intermediate table for kf_ds_src_study. Transforms dataservice entities for better usability and clarity. Excludes certain entities that are not needed. {% enddocs %} ## Kids First Dataservice Tables - Stable Stage -{% docs stable_bsgf %} -Stable table for int_bsgf. Finalized mapping of transformed dataservice entities that are ready to be brought into the access layer. +{% docs kf_ds_stable_bsgf %} +Stable table for kf_ds_int_bsgf. Finalized mapping of transformed dataservice entities that are ready to be brought into the access layer. +{% enddocs %} + +{% docs kf_ds_stable_diagnosis %} +Stable table for kf_ds_int_diagnosis. Finalized mapping of transformed dataservice entities that are ready to be brought into the access layer. +{% enddocs %} + +{% docs kf_ds_stable_family %} +Stable table for kf_ds_int_family. Finalized mapping of transformed dataservice entities that are ready to be brought into the access layer. +{% enddocs %} + +{% docs kf_ds_stable_family_relationship %} +Stable table for kf_ds_int_family_relationship. Finalized mapping of transformed dataservice entities that are ready to be brought into the access layer. +{% enddocs %} + +{% docs kf_ds_stable_genomic_file %} +Stable table for kf_ds_int_genomic_file. Finalized mapping of transformed dataservice entities that are ready to be brought into the access layer. {% enddocs %} -{% docs stable_diagnosis %} -Stable table for int_diagnosis. Finalized mapping of transformed dataservice entities that are ready to be brought into the access layer. +{% docs kf_ds_stable_investigator %} +Stable table for kf_ds_int_investigator. Finalized mapping of transformed dataservice entities that are ready to be brought into the access layer.
{% enddocs %} -{% docs stable_family %} -Stable table for int_family. Finalized mapping of transformed dataservice entities that are ready to be brought into the access layer. +{% docs kf_ds_stable_outcome %} +Stable table for kf_ds_int_outcome. Finalized mapping of transformed dataservice entities that are ready to be brought into the access layer. {% enddocs %} -{% docs stable_genomic_file %} -Stable table for int_families. Finalized mapping of transformed dataservice entities that are ready to be brought into the access layer. +{% docs kf_ds_stable_participant %} +Stable table for kf_ds_int_participant. Finalized mapping of transformed dataservice entities that are ready to be brought into the access layer. {% enddocs %} -{% docs stable_investigator %} -Stable table for int_investigator. Finalized mapping of transformed dataservice entities that are ready to be brought into the access layer. +{% docs kf_ds_stable_phenotype %} +Stable table for kf_ds_int_phenotype. Finalized mapping of transformed dataservice entities that are ready to be brought into the access layer. {% enddocs %} -{% docs stable_participant %} -Stable table for int_participant. Finalized mapping of transformed dataservice entities that are ready to be brought into the access layer. +{% docs kf_ds_stable_sample %} +Stable table for kf_ds_int_sample. Finalized mapping of transformed dataservice entities that are ready to be brought into the access layer. {% enddocs %} -{% docs stable_phenotype %} -Stable table for int_phenotype. Finalized mapping of transformed dataservice entities that are ready to be brought into the access layer. +{% docs kf_ds_stable_sequencing_center %} +Stable table for kf_ds_int_sequencing_center. Finalized mapping of transformed dataservice entities that are ready to be brought into the access layer. {% enddocs %} -{% docs stable_segf %} -Stable table for int_segf. Finalized mapping of transformed dataservice entities that are ready to be brought into the access layer. 
+{% docs kf_ds_stable_segf %} +Stable table for kf_ds_int_segf. Finalized mapping of transformed dataservice entities that are ready to be brought into the access layer. {% enddocs %} -{% docs stable_sequencing_experiment %} -Stable table for int_sequencing_experiment. Finalized mapping of transformed dataservice entities that are ready to be brought into the access layer. +{% docs kf_ds_stable_sequencing_experiment %} +Stable table for kf_ds_int_sequencing_experiment. Finalized mapping of transformed dataservice entities that are ready to be brought into the access layer. {% enddocs %} -{% docs stable_specimens %} -Stable table for int_specimens. Finalized mapping of transformed dataservice entities that are ready to be brought into the access layer. +{% docs kf_ds_stable_biospecimen %} +Stable table for kf_ds_int_biospecimen. Finalized mapping of transformed dataservice entities that are ready to be brought into the access layer. {% enddocs %} -{% docs stable_study %} -Stable table for int_study. Finalized mapping of transformed dataservice entities that are ready to be brought into the access layer. +{% docs kf_ds_stable_study %} +Stable table for kf_ds_int_study. Finalized mapping of transformed dataservice entities that are ready to be brought into the access layer. 
{% enddocs %} diff --git a/dbt_project/models/kids_first/dataservice_studies/int/kf_ds_int.yaml b/dbt_project/models/kids_first/dataservice_studies/int/kf_ds_int.yaml new file mode 100644 index 0000000..0feb58c --- /dev/null +++ b/dbt_project/models/kids_first/dataservice_studies/int/kf_ds_int.yaml @@ -0,0 +1,673 @@ +version: 2 + +models: + +- name: kf_ds_int_bsgf + description: '{{ doc("kf_ds_int_bsgf") }}' + config: + meta: + study: kf_dataservice_study + columns: + - name: uuid + description: '{{ doc("uuid") }}' + - name: bsgf_id + description: '{{ doc("bsgf_id") }}' + - name: created_at + description: '{{ doc("created_at") }}' + - name: modified_at + description: '{{ doc("modified_at") }}' + - name: external_id + description: '{{ doc("external_id") }}' + - name: visible + description: '{{ doc("visible") }}' + - name: visibility_reason + description: '{{ doc("visibility_reason") }}' + - name: visibility_comment + description: '{{ doc("visibility_comment") }}' + - name: genomic_file_id + description: '{{ doc("genomic_file_id") }}' + - name: biospecimen_id + description: '{{ doc("specimen_id") }}' + + +- name: kf_ds_int_diagnosis + description: '{{ doc("kf_ds_int_diagnosis") }}' + config: + meta: + study: kf_dataservice_study + columns: + - name: uuid + description: '{{ doc("uuid") }}' + - name: diagnosis_id + description: '{{ doc("diagnosis_id") }}' + - name: dewrangle_diagnosis_id + description: '{{ doc("dewrangle_diagnosis_id") }}' + - name: created_at + description: '{{ doc("created_at") }}' + - name: modified_at + description: '{{ doc("modified_at") }}' + - name: external_id + description: '{{ doc("external_id") }}' + - name: visible + description: '{{ doc("visible") }}' + - name: visibility_reason + description: '{{ doc("visibility_reason") }}' + - name: visibility_comment + description: '{{ doc("visibility_comment") }}' + - name: participant_id + description: '{{ doc("participant_id") }}' + - name: source_text_diagnosis + description: '{{ 
doc("source_text_diagnosis") }}' + - name: age_at_event_days + description: '{{ doc("age_at_event_days") }}' + - name: mondo_id_diagnosis + description: '{{ doc("mondo_id_diagnosis") }}' + - name: icd_id_diagnosis + description: '{{ doc("icd_id_diagnosis") }}' + - name: diagnosis_category + description: '{{ doc("diagnosis_category") }}' + - name: source_text_tumor_location + description: '{{ doc("source_text_tumor_location") }}' + - name: uberon_id_tumor_location + description: '{{ doc("uberon_id_tumor_location") }}' + - name: spatial_descriptor + description: '{{ doc("spatial_descriptor") }}' + - name: observed + description: '{{ doc("observed") }}' + +- name: kf_ds_int_family_relationship + description: '{{ doc("kf_ds_int_family_relationship") }}' + config: + meta: + study: kf_dataservice_study + columns: + - name: uuid + description: '{{ doc("uuid") }}' + - name: relationship_id + description: '{{ doc("relationship_id") }}' + - name: created_at + description: '{{ doc("created_at") }}' + - name: modified_at + description: '{{ doc("modified_at") }}' + - name: external_id + description: '{{ doc("external_id") }}' + - name: visible + description: '{{ doc("visible") }}' + - name: visibility_reason + description: '{{ doc("visibility_reason") }}' + - name: visibility_comment + description: '{{ doc("visibility_comment") }}' + - name: participant1_id + description: '{{ doc("participant1_id") }}' + - name: participant2_id + description: '{{ doc("participant2_id") }}' + - name: participant1_to_participant2_relation + description: '{{ doc("participant1_to_participant2_relation") }}' + - name: participant2_to_participant1_relation + description: '{{ doc("participant2_to_participant1_relation") }}' + - name: source_text_notes + description: '{{ doc("source_text_notes") }}' + +- name: kf_ds_int_family + description: '{{ doc("kf_ds_int_family") }}' + config: + meta: + study: kf_dataservice_study + columns: + - name: uuid + description: '{{ doc("uuid") }}' + - name: family_id + 
description: '{{ doc("family_id") }}' + - name: dewrangle_family_id + description: '{{ doc("dewrangle_family_id") }}' + - name: created_at + description: '{{ doc("created_at") }}' + - name: modified_at + description: '{{ doc("modified_at") }}' + - name: external_id + description: '{{ doc("external_id") }}' + - name: visible + description: '{{ doc("visible") }}' + - name: visibility_reason + description: '{{ doc("visibility_reason") }}' + - name: visibility_comment + description: '{{ doc("visibility_comment") }}' + - name: family_type + description: '{{ doc("family_type") }}' + +- name: kf_ds_int_genomic_file + description: '{{ doc("kf_ds_int_genomic_file") }}' + config: + meta: + study: kf_dataservice_study + columns: + - name: uuid + description: '{{ doc("uuid") }}' + - name: genomic_file_id + description: '{{ doc("genomic_file_id") }}' + - name: dewrangle_genomic_file_id + description: '{{ doc("dewrangle_genomic_file_id") }}' + - name: created_at + description: '{{ doc("created_at") }}' + - name: modified_at + description: '{{ doc("modified_at") }}' + - name: external_id + description: '{{ doc("external_id") }}' + - name: visible + description: '{{ doc("visible") }}' + - name: visibility_reason + description: '{{ doc("visibility_reason") }}' + - name: visibility_comment + description: '{{ doc("visibility_comment") }}' + - name: is_harmonized + description: '{{ doc("is_harmonized") }}' + - name: reference_genome + description: '{{ doc("reference_genome") }}' + - name: controlled_access + description: '{{ doc("controlled_access") }}' + - name: availability + description: '{{ doc("availability") }}' + - name: paired_end + description: '{{ doc("paired_end") }}' + - name: data_type + description: '{{ doc("data_type") }}' + - name: file_format + description: '{{ doc("file_format") }}' + - name: data_category + description: '{{ doc("data_category") }}' + - name: workflow_tool + description: '{{ doc("workflow_tool") }}' + - name: workflow_type + description: '{{ 
doc("workflow_type") }}' + - name: workflow_version + description: '{{ doc("workflow_version") }}' + - name: workflow_endpoint + description: '{{ doc("workflow_endpoint") }}' + - name: file_version_descriptor + description: '{{ doc("file_version_descriptor") }}' + - name: cavatica_file_id + description: '{{ doc("cavatica_file_id") }}' + - name: cavatica_volume + description: '{{ doc("cavatica_volume") }}' +- name: kf_ds_int_investigator + description: '{{ doc("kf_ds_int_investigator") }}' + config: + meta: + study: kf_dataservice_study + columns: + - name: uuid + description: '{{ doc("uuid") }}' + - name: investigator_id + description: '{{ doc("investigator_id") }}' + - name: dewrangle_investigator_id + description: '{{ doc("dewrangle_investigator_id") }}' + - name: created_at + description: '{{ doc("created_at") }}' + - name: modified_at + description: '{{ doc("modified_at") }}' + - name: external_id + description: '{{ doc("external_id") }}' + - name: visible + description: '{{ doc("visible") }}' + - name: visibility_reason + description: '{{ doc("visibility_reason") }}' + - name: visibility_comment + description: '{{ doc("visibility_comment") }}' + - name: name + description: '{{ doc("name") }}' + - name: institution + description: '{{ doc("institution") }}' + +- name: kf_ds_int_outcome + description: '{{ doc("kf_ds_int_outcome") }}' + config: + meta: + study: kf_dataservice_study + columns: + - name: uuid + description: '{{ doc("uuid") }}' + - name: kf_id + description: '{{ doc("kf_id") }}' + - name: created_at + description: '{{ doc("created_at") }}' + - name: modified_at + description: '{{ doc("modified_at") }}' + - name: external_id + description: '{{ doc("external_id") }}' + - name: visible + description: '{{ doc("visible") }}' + - name: visibility_reason + description: '{{ doc("visibility_reason") }}' + - name: visibility_comment + description: '{{ doc("visibility_comment") }}' + - name: vital_status + description: '{{ doc("vital_status") }}' + - name: 
disease_related + description: '{{ doc("disease_related") }}' + - name: age_at_event_days + description: '{{ doc("age_at_event_days") }}' + +- name: kf_ds_int_participant + description: '{{ doc("kf_ds_int_participant") }}' + config: + meta: + study: kf_dataservice_study + columns: + - name: uuid + description: '{{ doc("uuid") }}' + - name: participant_id + description: '{{ doc("participant_id") }}' + - name: dewrangle_participant_id + description: '{{ doc("dewrangle_participant_id") }}' + - name: created_at + description: '{{ doc("created_at") }}' + - name: modified_at + description: '{{ doc("modified_at") }}' + - name: external_id + description: '{{ doc("external_id") }}' + - name: visible + description: '{{ doc("visible") }}' + - name: visibility_reason + description: '{{ doc("visibility_reason") }}' + - name: visibility_comment + description: '{{ doc("visibility_comment") }}' + - name: alias_group_id + description: '{{ doc("alias_group_id") }}' + - name: study_id + description: '{{ doc("study_id") }}' + - name: dewrangle_study_id + description: '{{ doc("dewrangle_study_id") }}' + - name: family_id + description: '{{ doc("family_id") }}' + - name: is_proband + description: '{{ doc("is_proband") }}' + - name: race + description: '{{ doc("race") }}' + - name: ethnicity + description: '{{ doc("ethnicity") }}' + - name: gender + description: '{{ doc("gender") }}' + - name: affected_status + description: '{{ doc("affected_status") }}' + - name: species + description: '{{ doc("species") }}' +- name: kf_ds_int_phenotype + description: '{{ doc("kf_ds_int_phenotype") }}' + config: + meta: + study: kf_dataservice_study + columns: + - name: uuid + description: '{{ doc("uuid") }}' + - name: phenotype_id + description: '{{ doc("phenotype_id") }}' + - name: dewrangle_phenotype_id + description: '{{ doc("dewrangle_phenotype_id") }}' + - name: created_at + description: '{{ doc("created_at") }}' + - name: modified_at + description: '{{ doc("modified_at") }}' + - name: external_id 
+ description: '{{ doc("external_id") }}' + - name: visible + description: '{{ doc("visible") }}' + - name: visibility_reason + description: '{{ doc("visibility_reason") }}' + - name: visibility_comment + description: '{{ doc("visibility_comment") }}' + - name: participant_id + description: '{{ doc("participant_id") }}' + - name: source_text_phenotype + description: '{{ doc("source_text_phenotype") }}' + - name: age_at_event_days + description: '{{ doc("age_at_event_days") }}' + - name: hpo_id_phenotype + description: '{{ doc("hpo_id_phenotype") }}' + - name: observed + description: '{{ doc("observed") }}' + - name: snomed_id_phenotype + description: '{{ doc("snomed_id_phenotype") }}' + +- name: kf_ds_int_sample + description: '{{ doc("kf_ds_int_sample") }}' + config: + meta: + study: kf_dataservice_study + columns: + - name: uuid + description: '{{ doc("uuid") }}' + - name: kf_id + description: '{{ doc("kf_id") }}' + - name: created_at + description: '{{ doc("created_at") }}' + - name: modified_at + description: '{{ doc("modified_at") }}' + - name: external_id + description: '{{ doc("external_id") }}' + - name: visible + description: '{{ doc("visible") }}' + - name: visibility_reason + description: '{{ doc("visibility_reason") }}' + - name: visibility_comment + description: '{{ doc("visibility_comment") }}' + - name: external_sample_id + description: '{{ doc("sample_event_key") }}' + - name: tissue_type + description: '{{ doc("tissue_type") }}' + - name: composition + description: '{{ doc("sample_type") }}' + - name: anatomical_location + description: '{{ doc("anatomical_location") }}' + - name: age_at_event_days + description: '{{ doc("age_at_event_days") }}' + - name: participant_id + description: '{{ doc("participant_id") }}' + - name: method_of_sample_procurement + description: '{{ doc("method_of_sample_procurement") }}' + - name: has_matched_normal_sample + description: '{{ doc("has_matched_normal_sample") }}' + - name: external_collection_id + description:
'{{ doc("external_collection_id") }}' + - name: volume_ul + description: '{{ doc("volume_ul") }}' + - name: preservation_method + description: '{{ doc("preservation_method") }}' + - name: amount + description: '{{ doc("amount") }}' + - name: amount_units + description: '{{ doc("amount_units") }}' + +- name: kf_ds_int_sequencing_center + description: '{{ doc("kf_ds_int_sequencing_center") }}' + config: + meta: + study: kf_dataservice_study + columns: + - name: uuid + description: '{{ doc("uuid") }}' + - name: kf_id + description: '{{ doc("kf_id") }}' + - name: created_at + description: '{{ doc("created_at") }}' + - name: modified_at + description: '{{ doc("modified_at") }}' + - name: external_id + description: '{{ doc("external_id") }}' + - name: visible + description: '{{ doc("visible") }}' + - name: visibility_reason + description: '{{ doc("visibility_reason") }}' + - name: visibility_comment + description: '{{ doc("visibility_comment") }}' + - name: sequencing_center_name + description: '{{ doc("sequencing_center_name") }}' + +- name: kf_ds_int_segf + description: '{{ doc("kf_ds_int_segf") }}' + config: + meta: + study: kf_dataservice_study + columns: + - name: uuid + description: '{{ doc("uuid") }}' + - name: segf_id + description: '{{ doc("kf_id") }}' + - name: created_at + description: '{{ doc("created_at") }}' + - name: modified_at + description: '{{ doc("modified_at") }}' + - name: external_id + description: '{{ doc("external_id") }}' + - name: visible + description: '{{ doc("visible") }}' + - name: visibility_reason + description: '{{ doc("visibility_reason") }}' + - name: visibility_comment + description: '{{ doc("visibility_comment") }}' + - name: sequencing_experiment_id + description: '{{ doc("sequencing_experiment_id") }}' + - name: genomic_file_id + description: '{{ doc("genomic_file_id") }}' +- name: kf_ds_int_sequencing_experiment + description: '{{ doc("kf_ds_int_sequencing_experiment") }}' + config: + meta: + study: kf_dataservice_study + columns: 
+ - name: uuid + description: '{{ doc("uuid") }}' + - name: sequencing_experiment_id + description: '{{ doc("sequencing_experiment_id") }}' + - name: dewrangle_sequencing_experiment_id + description: '{{ doc("dewrangle_sequencing_experiment_id") }}' + - name: created_at + description: '{{ doc("created_at") }}' + - name: modified_at + description: '{{ doc("modified_at") }}' + - name: external_id + description: '{{ doc("external_id") }}' + - name: visible + description: '{{ doc("visible") }}' + - name: visibility_reason + description: '{{ doc("visibility_reason") }}' + - name: visibility_comment + description: '{{ doc("visibility_comment") }}' + - name: sequencing_center_id + description: '{{ doc("sequencing_center_id") }}' + - name: experiment_date + description: '{{ doc("experiment_date") }}' + - name: experiment_strategy + description: '{{ doc("experiment_strategy") }}' + - name: is_paired_end + description: '{{ doc("is_paired_end") }}' + - name: platform + description: '{{ doc("platform") }}' + - name: instrument_model + description: '{{ doc("instrument_model") }}' + - name: library_name + description: '{{ doc("library_name") }}' + - name: library_strand + description: '{{ doc("library_strand") }}' + - name: library_prep + description: '{{ doc("library_prep") }}' + - name: library_selection + description: '{{ doc("library_selection") }}' + - name: max_insert_size + description: '{{ doc("max_insert_size") }}' + - name: mean_insert_size + description: '{{ doc("mean_insert_size") }}' + - name: mean_depth + description: '{{ doc("mean_depth") }}' + - name: total_reads + description: '{{ doc("total_reads") }}' + - name: mean_read_length + description: '{{ doc("mean_read_length") }}' + - name: adapter_sequencing + description: '{{ doc("adapter_sequencing") }}' + - name: is_adapter_trimmed + description: '{{ doc("is_adapter_trimmed") }}' + - name: read_pair_number + description: '{{ doc("read_pair_number") }}' + - name: target_capture_kit + description: '{{ 
doc("target_capture_kit") }}' + - name: acquisition_type + description: '{{ doc("acquisition_type") }}' + - name: cdna_read + description: '{{ doc("cdna_read") }}' + - name: cdna_read_offset + description: '{{ doc("cdna_read_offset") }}' + - name: cell_barcode_offset + description: '{{ doc("cell_barcode_offset") }}' + - name: cell_barcode_read + description: '{{ doc("cell_barcode_read") }}' + - name: cell_barcode_size + description: '{{ doc("cell_barcode_size") }}' + - name: chromatography_approach + description: '{{ doc("chromatography_approach") }}' + - name: end_bias + description: '{{ doc("end_bias") }}' + - name: enrichment_approach + description: '{{ doc("enrichment_approach") }}' + - name: fraction_number + description: '{{ doc("fraction_number") }}' + - name: fractionation_approach + description: '{{ doc("fractionation_approach") }}' + - name: ion_fragmentation + description: '{{ doc("ion_fragmentation") }}' + - name: library_construction + description: '{{ doc("library_construction") }}' + - name: mass_spec_rawfile_conversion + description: '{{ doc("mass_spec_rawfile_conversion") }}' + - name: proteomics_experiment + description: '{{ doc("proteomics_experiment") }}' + - name: quantification_label_id + description: '{{ doc("quantification_label_id") }}' + - name: quantification_labeling_method + description: '{{ doc("quantification_labeling_method") }}' + - name: quantification_technique + description: '{{ doc("quantification_technique") }}' + - name: sequencing_mode + description: '{{ doc("sequencing_mode") }}' + - name: target_cell_number + description: '{{ doc("target_cell_number") }}' + - name: umi_barcode_offset + description: '{{ doc("umi_barcode_offset") }}' + - name: umi_barcode_read + description: '{{ doc("umi_barcode_read") }}' + - name: umi_barcode_size + description: '{{ doc("umi_barcode_size") }}' + +- name: kf_ds_int_biospecimen + description: '{{ doc("kf_ds_int_biospecimen") }}' + config: + meta: + study: kf_dataservice_study + columns: + - 
name: uuid + description: '{{ doc("uuid") }}' + - name: specimen_id + description: '{{ doc("specimen_id") }}' + - name: dewrangle_specimen_id + description: '{{ doc("dewrangle_specimen_id") }}' + - name: created_at + description: '{{ doc("created_at") }}' + - name: modified_at + description: '{{ doc("modified_at") }}' + - name: external_id + description: '{{ doc("external_id") }}' + - name: visible + description: '{{ doc("visible") }}' + - name: visibility_reason + description: '{{ doc("visibility_reason") }}' + - name: visibility_comment + description: '{{ doc("visibility_comment") }}' + - name: external_sample_id + description: '{{ doc("external_sample_id") }}' + - name: external_aliquot_id + description: '{{ doc("external_aliquot_id") }}' + - name: source_text_tissue_type + description: '{{ doc("source_text_tissue_type") }}' + - name: composition + description: '{{ doc("composition") }}' + - name: source_text_anatomical_site + description: '{{ doc("source_text_anatomical_site") }}' + - name: age_at_event_days + description: '{{ doc("age_at_event_days") }}' + - name: source_text_tumor_descriptor + description: '{{ doc("source_text_tumor_descriptor") }}' + - name: analyte_type + description: '{{ doc("analyte_type") }}' + - name: participant_id + description: '{{ doc("participant_id") }}' + - name: sequencing_center_id + description: '{{ doc("sequencing_center_id") }}' + - name: dbgap_consent_code + description: '{{ doc("dbgap_consent_code") }}' + - name: consent_type + description: '{{ doc("consent_type") }}' + - name: method_of_sample_procurement + description: '{{ doc("method_of_sample_procurement") }}' + - name: sample_id + description: '{{ doc("sample_id") }}' + - name: specimen_status + description: '{{ doc("specimen_status") }}' + - name: has_matched_normal_sample + description: '{{ doc("has_matched_normal_sample") }}' + - name: shipment_origin + description: '{{ doc("shipment_origin") }}' + - name: concentration_mg_per_ml + description: '{{ 
doc("concentration_mg_per_ml") }}' + - name: volume_ul + description: '{{ doc("volume_ul") }}' + - name: shipment_date + description: '{{ doc("shipment_date") }}' + - name: uberon_id_anatomical_site + description: '{{ doc("uberon_id_anatomical_site") }}' + - name: ncit_id_tissue_type + description: '{{ doc("ncit_id_tissue_type") }}' + - name: ncit_id_anatomical_site + description: '{{ doc("ncit_id_anatomical_site") }}' + - name: spatial_descriptor + description: '{{ doc("spatial_descriptor") }}' + - name: preservation_method + description: '{{ doc("preservation_method") }}' + - name: amount + description: '{{ doc("amount") }}' + - name: amount_units + description: '{{ doc("amount_units") }}' + - name: cell_entity + description: '{{ doc("cell_entity") }}' +- name: kf_ds_int_study + description: '{{ doc("kf_ds_int_study") }}' + config: + meta: + study: kf_dataservice_study + columns: + - name: uuid + description: '{{ doc("uuid") }}' + - name: study_id + description: '{{ doc("study_id") }}' + - name: dewrangle_study_id + description: '{{ doc("dewrangle_study_id") }}' + - name: created_at + description: '{{ doc("created_at") }}' + - name: modified_at + description: '{{ doc("modified_at") }}' + - name: external_id + description: '{{ doc("external_id") }}' + - name: visible + description: '{{ doc("visible") }}' + - name: visibility_reason + description: '{{ doc("visibility_reason") }}' + - name: visibility_comment + description: '{{ doc("visibility_comment") }}' + - name: data_access_authority + description: '{{ doc("data_access_authority") }}' + - name: version + description: '{{ doc("version") }}' + - name: name + description: '{{ doc("name") }}' + - name: short_name + description: '{{ doc("short_name") }}' + - name: attribution + description: '{{ doc("attribution") }}' + - name: release_status + description: '{{ doc("release_status") }}' + - name: investigator_id + description: '{{ doc("investigator_id") }}' + - name: short_code + description: '{{ doc("short_code") 
}}' + - name: domain + description: '{{ doc("domain") }}' + - name: program + description: '{{ doc("program") }}' + - name: parent_study_id + description: '{{ doc("parent_study_id") }}' + - name: biobank_email + description: '{{ doc("biobank_email") }}' + - name: biobank_name + description: '{{ doc("biobank_name") }}' + - name: biobank_request_instructions + description: '{{ doc("biobank_request_instructions") }}' + - name: biobank_request_link + description: '{{ doc("biobank_request_link") }}' \ No newline at end of file diff --git a/dbt_project/models/kids_first/dataservice_studies/int/kf_ds_int_biospecimen.sql b/dbt_project/models/kids_first/dataservice_studies/int/kf_ds_int_biospecimen.sql new file mode 100644 index 0000000..781ea0b --- /dev/null +++ b/dbt_project/models/kids_first/dataservice_studies/int/kf_ds_int_biospecimen.sql @@ -0,0 +1,46 @@ +{{ config( + schema='int' +) }} + +-- need to review all fields in ds and which ones are needed + -- for now, extracting the main ones we use +select distinct + uuid, -- we can leave this out; it's not used, + created_at, + modified_at, + kf_id as specimen_id, + lower(replace(kf_id, '_', '-')) as dewrangle_specimen_id, + external_sample_id, + external_aliquot_id, + source_text_tissue_type, -- can rename without source text prefix? + composition, + source_text_anatomical_site, -- can rename without source text prefix? + age_at_event_days, + source_text_tumor_descriptor, -- can rename without source text prefix? + analyte_type, + participant_id, + sequencing_center_id, + dbgap_consent_code, -- maybe this should be on the pt level? + consent_type, -- maybe this should be on the pt level? + method_of_sample_procurement, + sample_id, -- is this needed? haven't exported 'sample' table for kf + specimen_status, -- should this be set as a constant for all kf studies ? 
+ has_matched_normal_sample, + visible, + visibility_reason, + visibility_comment, + + -- these are fields we should discuss if needed + shipment_origin, + concentration_mg_per_ml, + volume_ul, + shipment_date, + uberon_id_anatomical_site, + ncit_id_tissue_type, + ncit_id_anatomical_site, + spatial_descriptor, + preservation_method, + amount, + amount_units, + cell_entity +from {{ ref('kf_ds_src_biospecimen') }} diff --git a/dbt_project/models/kids_first/dataservice_studies/int/kf_ds_int_bsgf.sql b/dbt_project/models/kids_first/dataservice_studies/int/kf_ds_int_bsgf.sql new file mode 100644 index 0000000..d8274a1 --- /dev/null +++ b/dbt_project/models/kids_first/dataservice_studies/int/kf_ds_int_bsgf.sql @@ -0,0 +1,19 @@ +{{ config( + schema='int' +) }} + +-- we currently use this table in DS to link specimens/gfs +-- not sure if we would need it here, since we can easily add specimen ids to a files model + +select distinct + uuid, -- we can leave this out, it's not used + created_at, + modified_at, + genomic_file_id, + biospecimen_id, + kf_id as bsgf_id, + visible, + external_id, -- i think we can leave this out - it's rarely populated/used, + visibility_reason, + visibility_comment +from {{ ref('kf_ds_src_bsgf') }} diff --git a/dbt_project/models/kids_first/dataservice_studies/int/kf_ds_int_diagnosis.sql b/dbt_project/models/kids_first/dataservice_studies/int/kf_ds_int_diagnosis.sql new file mode 100644 index 0000000..f38ab70 --- /dev/null +++ b/dbt_project/models/kids_first/dataservice_studies/int/kf_ds_int_diagnosis.sql @@ -0,0 +1,22 @@ +{{ config( + schema='int' +) }} + +select distinct + uuid, -- we can leave this out; it's not used, + kf_id as diagnosis_id, + lower(replace(kf_id, 'DG_', 'cn-')) as dewrangle_diagnosis_id, + participant_id, + source_text_diagnosis, + age_at_event_days, + mondo_id_diagnosis, + icd_id_diagnosis, + 'Positive' as observed, -- implied by presence in diagnosis table + + -- unsure if necessary to include these fields + 
diagnosis_category, + external_id, + source_text_tumor_location, + uberon_id_tumor_location, + spatial_descriptor +from {{ ref('kf_ds_src_diagnosis') }} diff --git a/dbt_project/models/kids_first/dataservice_studies/int/kf_ds_int_family.sql b/dbt_project/models/kids_first/dataservice_studies/int/kf_ds_int_family.sql new file mode 100644 index 0000000..315773b --- /dev/null +++ b/dbt_project/models/kids_first/dataservice_studies/int/kf_ds_int_family.sql @@ -0,0 +1,16 @@ +{{ config( + schema='int' +) }} + +select distinct + uuid, -- we can leave this out; isn't used, + created_at, + modified_at, + kf_id as family_id, + lower(replace(kf_id, '_', '-')) as dewrangle_family_id, + external_id, + family_type, -- not historically populated but it should be; can use logic rules to calculate + visible, + visibility_reason, + visibility_comment +from {{ ref('kf_ds_src_family') }} diff --git a/dbt_project/models/kids_first/dataservice_studies/int/kf_ds_int_family_relationship.sql b/dbt_project/models/kids_first/dataservice_studies/int/kf_ds_int_family_relationship.sql new file mode 100644 index 0000000..7457ce7 --- /dev/null +++ b/dbt_project/models/kids_first/dataservice_studies/int/kf_ds_int_family_relationship.sql @@ -0,0 +1,19 @@ +{{ config( + schema='int' +) }} + +select distinct + uuid, -- we can leave this out, it's not used + created_at, + modified_at, + external_id, -- i think we can leave this out - it's rarely populated/used, + participant1_id, + participant2_id, + participant1_to_participant2_relation, + participant2_to_participant1_relation, + kf_id as relationship_id, + visible, + visibility_reason, + visibility_comment, + source_text_notes -- don't think we need this? 
we never populate it +from {{ ref('kf_ds_src_family_relationship') }} diff --git a/dbt_project/models/kids_first/dataservice_studies/int/kf_ds_int_genomic_file.sql b/dbt_project/models/kids_first/dataservice_studies/int/kf_ds_int_genomic_file.sql new file mode 100644 index 0000000..fa751ba --- /dev/null +++ b/dbt_project/models/kids_first/dataservice_studies/int/kf_ds_int_genomic_file.sql @@ -0,0 +1,34 @@ +{{ config( + schema='int' +) }} + +select distinct + uuid, -- we can leave this out; it's not used, + kf_id as genomic_file_id, + lower(replace(kf_id, '_', 'dr-')) as dewrangle_genomic_file_id, -- NOTE(review): only the '_' is replaced, so the letters before it stay in front of 'dr-'; confirm whether replacing the whole kf prefix was intended, as kf_ds_int_diagnosis does with 'DG_' + created_at, -- think these could be useful when answering questions about changes over time + modified_at, -- ^^^ + external_id, + is_harmonized, + reference_genome, + controlled_access, -- would be nice to incorporate mapping logic for this field based on file type, location and harmonization + availability, + paired_end, + visible, + visibility_reason, + visibility_comment, + + -- should we discuss with bix about standardizing these values? 
I know ingest mapping logic has changed over time + data_type, + file_format, + data_category, + workflow_tool, + workflow_type, + workflow_version, + workflow_endpoint, + file_version_descriptor, -- should discuss with bix about reliability of these values in dataservice currently + + -- could be useful after delivery but would be null during source load + cavatica_file_id, + cavatica_volume +from {{ ref('kf_ds_src_genomic_file') }} diff --git a/dbt_project/models/kids_first/dataservice_studies/int/kf_ds_int_investigator.sql b/dbt_project/models/kids_first/dataservice_studies/int/kf_ds_int_investigator.sql new file mode 100644 index 0000000..3f496b7 --- /dev/null +++ b/dbt_project/models/kids_first/dataservice_studies/int/kf_ds_int_investigator.sql @@ -0,0 +1,17 @@ +{{ config( + schema='int' +) }} + +select distinct + uuid, -- we can leave this out; isn't used, + created_at, + modified_at, + kf_id as investigator_id, + lower(replace(kf_id, '_', '-')) as dewrangle_investigator_id, + external_id, + name, + institution, + visible, + visibility_reason, + visibility_comment +from {{ ref('kf_ds_src_investigator') }} diff --git a/dbt_project/models/kids_first/dataservice_studies/int/kf_ds_int_outcome.sql b/dbt_project/models/kids_first/dataservice_studies/int/kf_ds_int_outcome.sql new file mode 100644 index 0000000..d7085c8 --- /dev/null +++ b/dbt_project/models/kids_first/dataservice_studies/int/kf_ds_int_outcome.sql @@ -0,0 +1,18 @@ +{{ config( + schema='int' +) }} + +select distinct + uuid, -- we can leave this out, it's not used + created_at, + modified_at, + external_id, -- i think we can leave this out - it's rarely populated/used, + vital_status, + disease_related, + age_at_event_days, + participant_id, + kf_id, + visible, + visibility_reason, + visibility_comment +from {{ ref('kf_ds_src_outcome') }} diff --git a/dbt_project/models/kids_first/dataservice_studies/int/kf_ds_int_participant.sql 
b/dbt_project/models/kids_first/dataservice_studies/int/kf_ds_int_participant.sql new file mode 100644 index 0000000..8902c9e --- /dev/null +++ b/dbt_project/models/kids_first/dataservice_studies/int/kf_ds_int_participant.sql @@ -0,0 +1,25 @@ +{{ config( + schema='int' +) }} + +select distinct + uuid, -- we can leave this out ; it's not used, + created_at, + modified_at, + alias_group_id, -- we can leave this out; it's not used + study_id, + lower(replace(study_id, '_', '-')) as dewrangle_study_id, + kf_id as participant_id, + lower(replace(kf_id, '_', '-')) as dewrangle_participant_id, + external_id, + family_id, + is_proband, + race, + ethnicity, + gender, + affected_status, + species, + visible, + visibility_reason, -- can we standardize this a bit more? maybe release status instead of reason? and try to standardize more? + visibility_comment +from {{ ref('kf_ds_src_participant') }} diff --git a/dbt_project/models/kids_first/dataservice_studies/int/kf_ds_int_phenotype.sql b/dbt_project/models/kids_first/dataservice_studies/int/kf_ds_int_phenotype.sql new file mode 100644 index 0000000..b44a446 --- /dev/null +++ b/dbt_project/models/kids_first/dataservice_studies/int/kf_ds_int_phenotype.sql @@ -0,0 +1,18 @@ +{{ config( + schema='int' +) }} + +select distinct + uuid, -- we can leave this out; it's not used, + kf_id as phenotype_id, + lower(replace(kf_id, 'PH_', 'cn-')) as dewrangle_phenotype_id, -- swap the whole PH_ prefix for cn-, as kf_ds_int_diagnosis does with DG_ + participant_id, + source_text_phenotype, + age_at_event_days, + hpo_id_phenotype, + observed, + + -- additional fields that may be included + snomed_id_phenotype, + external_id +from {{ ref('kf_ds_src_phenotype') }} diff --git a/dbt_project/models/kids_first/dataservice_studies/int/kf_ds_int_sample.sql b/dbt_project/models/kids_first/dataservice_studies/int/kf_ds_int_sample.sql new file mode 100644 index 0000000..4f58c84 --- /dev/null +++ b/dbt_project/models/kids_first/dataservice_studies/int/kf_ds_int_sample.sql @@ -0,0 +1,27 @@ +{{ config( + schema='int' +) }} + +select distinct + uuid, -- we can leave 
this out, it's not used + created_at, + modified_at, + external_id, + kf_id, + age_at_event_days, + sample_event_key, + tissue_type, + sample_type, + anatomical_location, + volume_ul, + method_of_sample_procurement, + preservation_method, + participant_id, + external_collection_id, + has_matched_normal_sample, + amount, + amount_units, + visible, + visibility_reason, + visibility_comment +from {{ ref('kf_ds_src_sample') }} diff --git a/dbt_project/models/kids_first/dataservice_studies/int/kf_ds_int_segf.sql b/dbt_project/models/kids_first/dataservice_studies/int/kf_ds_int_segf.sql new file mode 100644 index 0000000..9b8fc4c --- /dev/null +++ b/dbt_project/models/kids_first/dataservice_studies/int/kf_ds_int_segf.sql @@ -0,0 +1,16 @@ +{{ config( + schema='int' +) }} + +select distinct + created_at, + modified_at, + uuid, -- we can leave this out, it's not used + visible, + sequencing_experiment_id, + genomic_file_id, + external_id, -- this is rarely populated, it could be left out + kf_id as segf_id, + visibility_reason, + visibility_comment +from {{ ref('kf_ds_src_segf') }} diff --git a/dbt_project/models/kids_first/dataservice_studies/int/kf_ds_int_sequencing_center.sql b/dbt_project/models/kids_first/dataservice_studies/int/kf_ds_int_sequencing_center.sql new file mode 100644 index 0000000..5558404 --- /dev/null +++ b/dbt_project/models/kids_first/dataservice_studies/int/kf_ds_int_sequencing_center.sql @@ -0,0 +1,17 @@ +{{ config( + schema='int' +) }} + +-- this entity as a whole may not be needed - a simple "seq center name" may be all that's needed in the seq exp table + +select distinct + uuid, -- we can leave this out, it's not used + created_at, + modified_at, + external_id, + kf_id, + name as sequencing_center_name, + visible, + visibility_reason, + visibility_comment +from {{ ref('kf_ds_src_sequencing_center') }} diff --git a/dbt_project/models/kids_first/dataservice_studies/int/kf_ds_int_sequencing_experiment.sql 
b/dbt_project/models/kids_first/dataservice_studies/int/kf_ds_int_sequencing_experiment.sql new file mode 100644 index 0000000..7b82514 --- /dev/null +++ b/dbt_project/models/kids_first/dataservice_studies/int/kf_ds_int_sequencing_experiment.sql @@ -0,0 +1,57 @@ +{{ config( + schema='int' +) }} + +select distinct + uuid, -- we can leave this out; it's not used, + kf_id as sequencing_experiment_id, + lower(replace(kf_id, '_', '-')) as dewrangle_sequencing_experiment_id, + sequencing_center_id, + external_id, + experiment_date, + experiment_strategy, + is_paired_end, + platform, + instrument_model, + visible, + visibility_reason, + visibility_comment, + + -- I think these should be included? talk with BIX + library_name, + library_strand, + library_prep, + library_selection, + max_insert_size, + mean_insert_size, + mean_depth, + total_reads, + mean_read_length, + adapter_sequencing, + is_adapter_trimmed, + read_pair_number, + target_capture_kit, + acquisition_type, + cdna_read, + cdna_read_offset, + cell_barcode_offset, + cell_barcode_read, + cell_barcode_size, + chromatography_approach, + end_bias, + enrichment_approach, + fraction_number, + fractionation_approach, + ion_fragmentation, + library_construction, + mass_spec_rawfile_conversion, + proteomics_experiment, + quantification_label_id, + quantification_labeling_method, + quantification_technique, + sequencing_mode, + target_cell_number, + umi_barcode_offset, + umi_barcode_read, + umi_barcode_size +from {{ ref('kf_ds_src_sequencing_experiment') }} diff --git a/dbt_project/models/kids_first/dataservice_studies/int/kf_ds_int_study.sql b/dbt_project/models/kids_first/dataservice_studies/int/kf_ds_int_study.sql new file mode 100644 index 0000000..2835ce5 --- /dev/null +++ b/dbt_project/models/kids_first/dataservice_studies/int/kf_ds_int_study.sql @@ -0,0 +1,34 @@ +{{ config( + schema='int' +) }} + +select distinct + created_at, + modified_at, + uuid, -- we can leave this out, it's not used + data_access_authority, 
-- usually value is dbgap here + external_id,-- usually we make this the phs number - would be worth renaming to reflect that + version, -- seems useful, but is rarely ever used. + name, -- this is the full name of the study; might be worth renaming + short_name, -- do we need name and short name? + attribution, -- not exactly sure what would go here, we rarely populate it + release_status, -- like the idea of this, but it's not consistently updated + investigator_id, -- like the idea of this, but again not consistently populated, should it be? + kf_id as study_id, + lower(replace(kf_id, '_', '-')) as dewrangle_study_id, + visible, + short_code, + domain, -- this is research domain e.g., CANCER vs SBD + program, + visibility_reason, + visibility_comment, + parent_study_id, + biobank_email, -- has been NA for kids first, only used for CBTN, should we keep? + biobank_name, -- has been NA for kids first, only used for CBTN, should we keep? + biobank_request_instructions, -- has been NA for kids first, only used for CBTN, should we keep? + biobank_request_link -- has been NA for kids first, only used for CBTN, should we keep? 
+from {{ ref('kf_ds_src_study') }} +where lower(program) in ( + 'kids first', + 'kf/include' +) \ No newline at end of file diff --git a/dbt_project/models/kids_first/dataservice_studies/src/kf_ds_src.yaml b/dbt_project/models/kids_first/dataservice_studies/src/kf_ds_src.yaml new file mode 100644 index 0000000..425f33a --- /dev/null +++ b/dbt_project/models/kids_first/dataservice_studies/src/kf_ds_src.yaml @@ -0,0 +1,655 @@ +version: 2 + +models: + +- name: kf_ds_src_bsgf + description: '{{ doc("kf_ds_src_bsgf") }}' + config: + meta: + study: kf_dataservice_study + columns: + - name: uuid + description: '{{ doc("uuid") }}' + - name: kf_id + description: '{{ doc("kf_id") }}' + - name: created_at + description: '{{ doc("created_at") }}' + - name: modified_at + description: '{{ doc("modified_at") }}' + - name: external_id + description: '{{ doc("external_id") }}' + - name: visible + description: '{{ doc("visible") }}' + - name: visibility_reason + description: '{{ doc("visibility_reason") }}' + - name: visibility_comment + description: '{{ doc("visibility_comment") }}' + - name: genomic_file_id + description: '{{ doc("genomic_file_id") }}' + - name: biospecimen_id + description: '{{ doc("specimen_id") }}' + +- name: kf_ds_src_diagnosis + description: '{{ doc("kf_ds_src_diagnosis") }}' + config: + meta: + study: kf_dataservice_study + columns: + - name: uuid + description: '{{ doc("uuid") }}' + - name: kf_id + description: '{{ doc("kf_id") }}' + - name: created_at + description: '{{ doc("created_at") }}' + - name: modified_at + description: '{{ doc("modified_at") }}' + - name: external_id + description: '{{ doc("external_id") }}' + - name: visible + description: '{{ doc("visible") }}' + - name: visibility_reason + description: '{{ doc("visibility_reason") }}' + - name: visibility_comment + description: '{{ doc("visibility_comment") }}' + - name: participant_id + description: '{{ doc("participant_id") }}' + - name: source_text_diagnosis + description: '{{ 
doc("source_text_diagnosis") }}' + - name: age_at_event_days + description: '{{ doc("age_at_event_days") }}' + - name: mondo_id_diagnosis + description: '{{ doc("mondo_id_diagnosis") }}' + - name: icd_id_diagnosis + description: '{{ doc("icd_id_diagnosis") }}' + - name: diagnosis_category + description: '{{ doc("diagnosis_category") }}' + - name: source_text_tumor_location + description: '{{ doc("source_text_tumor_location") }}' + - name: uberon_id_tumor_location + description: '{{ doc("uberon_id_tumor_location") }}' + - name: spatial_descriptor + description: '{{ doc("spatial_descriptor") }}' + +- name: kf_ds_src_family_relationship + description: '{{ doc("kf_ds_src_family_relationship") }}' + config: + meta: + study: kf_dataservice_study + columns: + - name: uuid + description: '{{ doc("uuid") }}' + - name: kf_id + description: '{{ doc("kf_id") }}' + - name: created_at + description: '{{ doc("created_at") }}' + - name: modified_at + description: '{{ doc("modified_at") }}' + - name: external_id + description: '{{ doc("external_id") }}' + - name: visible + description: '{{ doc("visible") }}' + - name: visibility_reason + description: '{{ doc("visibility_reason") }}' + - name: visibility_comment + description: '{{ doc("visibility_comment") }}' + - name: participant1_id + description: '{{ doc("participant1_id") }}' + - name: participant2_id + description: '{{ doc("participant2_id") }}' + - name: participant1_to_participant2_relation + description: '{{ doc("participant1_to_participant2_relation") }}' + - name: participant2_to_participant1_relation + description: '{{ doc("participant2_to_participant1_relation") }}' + - name: source_text_notes + description: '{{ doc("source_text_notes") }}' + +- name: kf_ds_src_family + description: '{{ doc("kf_ds_src_family") }}' + config: + meta: + study: kf_dataservice_study + columns: + - name: uuid + description: '{{ doc("uuid") }}' + - name: kf_id + description: '{{ doc("kf_id") }}' + - name: created_at + description: '{{ 
doc("created_at") }}' + - name: modified_at + description: '{{ doc("modified_at") }}' + - name: external_id + description: '{{ doc("external_id") }}' + - name: visible + description: '{{ doc("visible") }}' + - name: visibility_reason + description: '{{ doc("visibility_reason") }}' + - name: visibility_comment + description: '{{ doc("visibility_comment") }}' + - name: family_type + description: '{{ doc("family_type") }}' + +- name: kf_ds_src_genomic_file + description: '{{ doc("kf_ds_src_genomic_file") }}' + config: + meta: + study: kf_dataservice_study + columns: + - name: uuid + description: '{{ doc("uuid") }}' + - name: kf_id + description: '{{ doc("kf_id") }}' + - name: created_at + description: '{{ doc("created_at") }}' + - name: modified_at + description: '{{ doc("modified_at") }}' + - name: external_id + description: '{{ doc("external_id") }}' + - name: visible + description: '{{ doc("visible") }}' + - name: visibility_reason + description: '{{ doc("visibility_reason") }}' + - name: visibility_comment + description: '{{ doc("visibility_comment") }}' + - name: is_harmonized + description: '{{ doc("is_harmonized") }}' + - name: reference_genome + description: '{{ doc("reference_genome") }}' + - name: controlled_access + description: '{{ doc("controlled_access") }}' + - name: availability + description: '{{ doc("availability") }}' + - name: paired_end + description: '{{ doc("paired_end") }}' + - name: data_type + description: '{{ doc("data_type") }}' + - name: file_format + description: '{{ doc("file_format") }}' + - name: data_category + description: '{{ doc("data_category") }}' + - name: workflow_tool + description: '{{ doc("workflow_tool") }}' + - name: workflow_type + description: '{{ doc("workflow_type") }}' + - name: workflow_version + description: '{{ doc("workflow_version") }}' + - name: workflow_endpoint + description: '{{ doc("workflow_endpoint") }}' + - name: file_version_descriptor + description: '{{ doc("file_version_descriptor") }}' + - name: 
cavatica_file_id + description: '{{ doc("cavatica_file_id") }}' + - name: cavatica_volume + description: '{{ doc("cavatica_volume") }}' + +- name: kf_ds_src_investigator + description: '{{ doc("kf_ds_src_investigator") }}' + config: + meta: + study: kf_dataservice_study + columns: + - name: uuid + description: '{{ doc("uuid") }}' + - name: kf_id + description: '{{ doc("kf_id") }}' + - name: created_at + description: '{{ doc("created_at") }}' + - name: modified_at + description: '{{ doc("modified_at") }}' + - name: external_id + description: '{{ doc("external_id") }}' + - name: visible + description: '{{ doc("visible") }}' + - name: visibility_reason + description: '{{ doc("visibility_reason") }}' + - name: visibility_comment + description: '{{ doc("visibility_comment") }}' + - name: name + description: '{{ doc("name") }}' + - name: institution + description: '{{ doc("institution") }}' + +- name: kf_ds_src_outcome + description: '{{ doc("kf_ds_src_outcome") }}' + config: + meta: + study: kf_dataservice_study + columns: + - name: uuid + description: '{{ doc("uuid") }}' + - name: kf_id + description: '{{ doc("kf_id") }}' + - name: created_at + description: '{{ doc("created_at") }}' + - name: modified_at + description: '{{ doc("modified_at") }}' + - name: external_id + description: '{{ doc("external_id") }}' + - name: visible + description: '{{ doc("visible") }}' + - name: visibility_reason + description: '{{ doc("visibility_reason") }}' + - name: visibility_comment + description: '{{ doc("visibility_comment") }}' + - name: vital_status + description: '{{ doc("vital_status") }}' + - name: disease_related + description: '{{ doc("disease_related") }}' + - name: age_at_event_days + description: '{{ doc("age_at_event_days") }}' + +- name: kf_ds_src_participant + description: '{{ doc("kf_ds_src_participant") }}' + config: + meta: + study: kf_dataservice_study + columns: + - name: uuid + description: '{{ doc("uuid") }}' + - name: kf_id + description: '{{ doc("kf_id") }}' + - 
name: created_at + description: '{{ doc("created_at") }}' + - name: modified_at + description: '{{ doc("modified_at") }}' + - name: external_id + description: '{{ doc("external_id") }}' + - name: visible + description: '{{ doc("visible") }}' + - name: visibility_reason + description: '{{ doc("visibility_reason") }}' + - name: visibility_comment + description: '{{ doc("visibility_comment") }}' + - name: alias_group_id + description: '{{ doc("alias_group_id") }}' + - name: study_id + description: '{{ doc("study_id") }}' + - name: family_id + description: '{{ doc("family_id") }}' + - name: is_proband + description: '{{ doc("is_proband") }}' + - name: race + description: '{{ doc("race") }}' + - name: ethnicity + description: '{{ doc("ethnicity") }}' + - name: gender + description: '{{ doc("gender") }}' + - name: affected_status + description: '{{ doc("affected_status") }}' + - name: species + description: '{{ doc("species") }}' + +- name: kf_ds_src_phenotype + description: '{{ doc("kf_ds_src_phenotype") }}' + config: + meta: + study: kf_dataservice_study + columns: + - name: uuid + description: '{{ doc("uuid") }}' + - name: kf_id + description: '{{ doc("kf_id") }}' + - name: created_at + description: '{{ doc("created_at") }}' + - name: modified_at + description: '{{ doc("modified_at") }}' + - name: external_id + description: '{{ doc("external_id") }}' + - name: visible + description: '{{ doc("visible") }}' + - name: visibility_reason + description: '{{ doc("visibility_reason") }}' + - name: visibility_comment + description: '{{ doc("visibility_comment") }}' + - name: participant_id + description: '{{ doc("participant_id") }}' + - name: source_text_phenotype + description: '{{ doc("source_text_phenotype") }}' + - name: age_at_event_days + description: '{{ doc("age_at_event_days") }}' + - name: hpo_id_phenotype + description: '{{ doc("hpo_id_phenotype") }}' + - name: observed + description: '{{ doc("observed") }}' + - name: snomed_id_phenotype + description: '{{ 
doc("snomed_id_phenotype") }}' + +- name: kf_ds_src_sample + description: '{{ doc("kf_ds_src_sample") }}' + config: + meta: + study: kf_dataservice_study + columns: + - name: uuid + description: '{{ doc("uuid") }}' + - name: kf_id + description: '{{ doc("kf_id") }}' + - name: created_at + description: '{{ doc("created_at") }}' + - name: modified_at + description: '{{ doc("modified_at") }}' + - name: external_id + description: '{{ doc("external_id") }}' + - name: visible + description: '{{ doc("visible") }}' + - name: visibility_reason + description: '{{ doc("visibility_reason") }}' + - name: visibility_comment + description: '{{ doc("visibility_comment") }}' + - name: external_sample_id + description: '{{ doc("sample_event_key") }}' + - name: tissue_type + description: '{{ doc("tissue_type") }}' + - name: composition + description: '{{ doc("sample_type") }}' + - name: anatomical_location + description: '{{ doc("anatomical_location") }}' + - name: age_at_event_days + description: '{{ doc("age_at_event_days") }}' + - name: participant_id + description: '{{ doc("participant_id") }}' + - name: method_of_sample_procurement + description: '{{ doc("method_of_sample_procurement") }}' + - name: has_matched_normal_sample + description: '{{ doc("has_matched_normal_sample") }}' + - name: external_collection_id + description: '{{ doc("external_collection_id") }}' + - name: volume_ul + description: '{{ doc("volume_ul") }}' + - name: preservation_method + description: '{{ doc("preservation_method") }}' + - name: amount + description: '{{ doc("amount") }}' + - name: amount_units + description: '{{ doc("amount_units") }}' + + +- name: kf_ds_src_segf + description: '{{ doc("kf_ds_src_segf") }}' + config: + meta: + study: kf_dataservice_study + columns: + - name: uuid + description: '{{ doc("uuid") }}' + - name: kf_id + description: '{{ doc("kf_id") }}' + - name: created_at + description: '{{ doc("created_at") }}' + - name: modified_at + description: '{{ doc("modified_at") }}' + - 
name: external_id + description: '{{ doc("external_id") }}' + - name: visible + description: '{{ doc("visible") }}' + - name: visibility_reason + description: '{{ doc("visibility_reason") }}' + - name: visibility_comment + description: '{{ doc("visibility_comment") }}' + - name: sequencing_experiment_id + description: '{{ doc("sequencing_experiment_id") }}' + - name: genomic_file_id + description: '{{ doc("genomic_file_id") }}' + +- name: kf_ds_src_sequencing_center + description: '{{ doc("kf_ds_src_sequencing_center") }}' + config: + meta: + study: kf_dataservice_study + columns: + - name: uuid + description: '{{ doc("uuid") }}' + - name: kf_id + description: '{{ doc("kf_id") }}' + - name: created_at + description: '{{ doc("created_at") }}' + - name: modified_at + description: '{{ doc("modified_at") }}' + - name: external_id + description: '{{ doc("external_id") }}' + - name: visible + description: '{{ doc("visible") }}' + - name: visibility_reason + description: '{{ doc("visibility_reason") }}' + - name: visibility_comment + description: '{{ doc("visibility_comment") }}' + - name: name + description: '{{ doc("name") }}' + +- name: kf_ds_src_sequencing_experiment + description: '{{ doc("kf_ds_src_sequencing_experiment") }}' + config: + meta: + study: kf_dataservice_study + columns: + - name: uuid + description: '{{ doc("uuid") }}' + - name: kf_id + description: '{{ doc("kf_id") }}' + - name: created_at + description: '{{ doc("created_at") }}' + - name: modified_at + description: '{{ doc("modified_at") }}' + - name: external_id + description: '{{ doc("external_id") }}' + - name: visible + description: '{{ doc("visible") }}' + - name: visibility_reason + description: '{{ doc("visibility_reason") }}' + - name: visibility_comment + description: '{{ doc("visibility_comment") }}' + - name: sequencing_center_id + description: '{{ doc("sequencing_center_id") }}' + - name: experiment_date + description: '{{ doc("experiment_date") }}' + - name: experiment_strategy + 
description: '{{ doc("experiment_strategy") }}' + - name: is_paired_end + description: '{{ doc("is_paired_end") }}' + - name: platform + description: '{{ doc("platform") }}' + - name: instrument_model + description: '{{ doc("instrument_model") }}' + - name: library_name + description: '{{ doc("library_name") }}' + - name: library_strand + description: '{{ doc("library_strand") }}' + - name: library_prep + description: '{{ doc("library_prep") }}' + - name: library_selection + description: '{{ doc("library_selection") }}' + - name: max_insert_size + description: '{{ doc("max_insert_size") }}' + - name: mean_insert_size + description: '{{ doc("mean_insert_size") }}' + - name: mean_depth + description: '{{ doc("mean_depth") }}' + - name: total_reads + description: '{{ doc("total_reads") }}' + - name: mean_read_length + description: '{{ doc("mean_read_length") }}' + - name: adapter_sequencing + description: '{{ doc("adapter_sequencing") }}' + - name: is_adapter_trimmed + description: '{{ doc("is_adapter_trimmed") }}' + - name: read_pair_number + description: '{{ doc("read_pair_number") }}' + - name: target_capture_kit + description: '{{ doc("target_capture_kit") }}' + - name: acquisition_type + description: '{{ doc("acquisition_type") }}' + - name: cdna_read + description: '{{ doc("cdna_read") }}' + - name: cdna_read_offset + description: '{{ doc("cdna_read_offset") }}' + - name: cell_barcode_offset + description: '{{ doc("cell_barcode_offset") }}' + - name: cell_barcode_read + description: '{{ doc("cell_barcode_read") }}' + - name: cell_barcode_size + description: '{{ doc("cell_barcode_size") }}' + - name: chromatography_approach + description: '{{ doc("chromatography_approach") }}' + - name: end_bias + description: '{{ doc("end_bias") }}' + - name: enrichment_approach + description: '{{ doc("enrichment_approach") }}' + - name: fraction_number + description: '{{ doc("fraction_number") }}' + - name: fractionation_approach + description: '{{ doc("fractionation_approach") 
}}' + - name: ion_fragmentation + description: '{{ doc("ion_fragmentation") }}' + - name: library_construction + description: '{{ doc("library_construction") }}' + - name: mass_spec_rawfile_conversion + description: '{{ doc("mass_spec_rawfile_conversion") }}' + - name: proteomics_experiment + description: '{{ doc("proteomics_experiment") }}' + - name: quantification_label_id + description: '{{ doc("quantification_label_id") }}' + - name: quantification_labeling_method + description: '{{ doc("quantification_labeling_method") }}' + - name: quantification_technique + description: '{{ doc("quantification_technique") }}' + - name: sequencing_mode + description: '{{ doc("sequencing_mode") }}' + - name: target_cell_number + description: '{{ doc("target_cell_number") }}' + - name: umi_barcode_offset + description: '{{ doc("umi_barcode_offset") }}' + - name: umi_barcode_read + description: '{{ doc("umi_barcode_read") }}' + - name: umi_barcode_size + description: '{{ doc("umi_barcode_size") }}' + +- name: kf_ds_src_biospecimen + description: '{{ doc("kf_ds_src_biospecimen") }}' + config: + meta: + study: kf_dataservice_study + columns: + - name: uuid + description: '{{ doc("uuid") }}' + - name: kf_id + description: '{{ doc("kf_id") }}' + - name: created_at + description: '{{ doc("created_at") }}' + - name: modified_at + description: '{{ doc("modified_at") }}' + - name: external_id + description: '{{ doc("external_id") }}' + - name: visible + description: '{{ doc("visible") }}' + - name: visibility_reason + description: '{{ doc("visibility_reason") }}' + - name: visibility_comment + description: '{{ doc("visibility_comment") }}' + - name: external_sample_id + description: '{{ doc("external_sample_id") }}' + - name: external_aliquot_id + description: '{{ doc("external_aliquot_id") }}' + - name: source_text_tissue_type + description: '{{ doc("source_text_tissue_type") }}' + - name: composition + description: '{{ doc("composition") }}' + - name: source_text_anatomical_site + 
description: '{{ doc("source_text_anatomical_site") }}' + - name: age_at_event_days + description: '{{ doc("age_at_event_days") }}' + - name: source_text_tumor_descriptor + description: '{{ doc("source_text_tumor_descriptor") }}' + - name: analyte_type + description: '{{ doc("analyte_type") }}' + - name: participant_id + description: '{{ doc("participant_id") }}' + - name: sequencing_center_id + description: '{{ doc("sequencing_center_id") }}' + - name: dbgap_consent_code + description: '{{ doc("dbgap_consent_code") }}' + - name: consent_type + description: '{{ doc("consent_type") }}' + - name: method_of_sample_procurement + description: '{{ doc("method_of_sample_procurement") }}' + - name: sample_id + description: '{{ doc("sample_id") }}' + - name: specimen_status + description: '{{ doc("specimen_status") }}' + - name: has_matched_normal_sample + description: '{{ doc("has_matched_normal_sample") }}' + - name: shipment_origin + description: '{{ doc("shipment_origin") }}' + - name: concentration_mg_per_ml + description: '{{ doc("concentration_mg_per_ml") }}' + - name: volume_ul + description: '{{ doc("volume_ul") }}' + - name: shipment_date + description: '{{ doc("shipment_date") }}' + - name: uberon_id_anatomical_site + description: '{{ doc("uberon_id_anatomical_site") }}' + - name: ncit_id_tissue_type + description: '{{ doc("ncit_id_tissue_type") }}' + - name: ncit_id_anatomical_site + description: '{{ doc("ncit_id_anatomical_site") }}' + - name: spatial_descriptor + description: '{{ doc("spatial_descriptor") }}' + - name: preservation_method + description: '{{ doc("preservation_method") }}' + - name: amount + description: '{{ doc("amount") }}' + - name: amount_units + description: '{{ doc("amount_units") }}' + - name: cell_entity + description: '{{ doc("cell_entity") }}' + +- name: kf_ds_src_study + description: '{{ doc("kf_ds_src_study") }}' + config: + meta: + study: kf_dataservice_study + columns: + - name: uuid + description: '{{ doc("uuid") }}' + - name: 
kf_id + description: '{{ doc("kf_id") }}' + - name: created_at + description: '{{ doc("created_at") }}' + - name: modified_at + description: '{{ doc("modified_at") }}' + - name: external_id + description: '{{ doc("external_id") }}' + - name: visible + description: '{{ doc("visible") }}' + - name: visibility_reason + description: '{{ doc("visibility_reason") }}' + - name: visibility_comment + description: '{{ doc("visibility_comment") }}' + - name: data_access_authority + description: '{{ doc("data_access_authority") }}' + - name: version + description: '{{ doc("version") }}' + - name: name + description: '{{ doc("name") }}' + - name: short_name + description: '{{ doc("short_name") }}' + - name: attribution + description: '{{ doc("attribution") }}' + - name: release_status + description: '{{ doc("release_status") }}' + - name: investigator_id + description: '{{ doc("investigator_id") }}' + - name: short_code + description: '{{ doc("short_code") }}' + - name: domain + description: '{{ doc("domain") }}' + - name: program + description: '{{ doc("program") }}' + - name: parent_study_id + description: '{{ doc("parent_study_id") }}' + - name: biobank_email + description: '{{ doc("biobank_email") }}' + - name: biobank_name + description: '{{ doc("biobank_name") }}' + - name: biobank_request_instructions + description: '{{ doc("biobank_request_instructions") }}' + - name: biobank_request_link + description: '{{ doc("biobank_request_link") }}' \ No newline at end of file diff --git a/dbt_project/models/kids_first/dataservice_studies/src/kf_ds_src_biospecimen.sql b/dbt_project/models/kids_first/dataservice_studies/src/kf_ds_src_biospecimen.sql new file mode 100644 index 0000000..f52c2b2 --- /dev/null +++ b/dbt_project/models/kids_first/dataservice_studies/src/kf_ds_src_biospecimen.sql @@ -0,0 +1,6 @@ +{{ config( + schema='src' +) }} + +select * +from {{ ref('ds_specimens') }} diff --git a/dbt_project/models/kids_first/dataservice_studies/src/kf_ds_src_bsgf.sql 
b/dbt_project/models/kids_first/dataservice_studies/src/kf_ds_src_bsgf.sql new file mode 100644 index 0000000..089ca60 --- /dev/null +++ b/dbt_project/models/kids_first/dataservice_studies/src/kf_ds_src_bsgf.sql @@ -0,0 +1,6 @@ +{{ config( + schema='src' +) }} + +select * +from {{ ref('ds_bsgfs') }} diff --git a/dbt_project/models/kids_first/dataservice_studies/src/kf_ds_src_diagnosis.sql b/dbt_project/models/kids_first/dataservice_studies/src/kf_ds_src_diagnosis.sql new file mode 100644 index 0000000..e7824f2 --- /dev/null +++ b/dbt_project/models/kids_first/dataservice_studies/src/kf_ds_src_diagnosis.sql @@ -0,0 +1,6 @@ +{{ config( + schema='src' +) }} + +select * +from {{ ref('ds_diagnoses') }} diff --git a/dbt_project/models/kids_first/dataservice_studies/src/kf_ds_src_family.sql b/dbt_project/models/kids_first/dataservice_studies/src/kf_ds_src_family.sql new file mode 100644 index 0000000..052d446 --- /dev/null +++ b/dbt_project/models/kids_first/dataservice_studies/src/kf_ds_src_family.sql @@ -0,0 +1,6 @@ +{{ config( + schema='src' +) }} + +select * +from {{ ref('ds_families') }} diff --git a/dbt_project/models/kids_first/dataservice_studies/src/kf_ds_src_family_relationship.sql b/dbt_project/models/kids_first/dataservice_studies/src/kf_ds_src_family_relationship.sql new file mode 100644 index 0000000..ec75898 --- /dev/null +++ b/dbt_project/models/kids_first/dataservice_studies/src/kf_ds_src_family_relationship.sql @@ -0,0 +1,6 @@ +{{ config( + schema='src' +) }} + +select * +from {{ ref('ds_family_relationships') }} diff --git a/dbt_project/models/kids_first/dataservice_studies/src/kf_ds_src_genomic_file.sql b/dbt_project/models/kids_first/dataservice_studies/src/kf_ds_src_genomic_file.sql new file mode 100644 index 0000000..855f6b8 --- /dev/null +++ b/dbt_project/models/kids_first/dataservice_studies/src/kf_ds_src_genomic_file.sql @@ -0,0 +1,6 @@ +{{ config( + schema='src' +) }} + +select * +from {{ ref('ds_genomic_files') }} diff --git
a/dbt_project/models/kids_first/dataservice_studies/src/kf_ds_src_investigator.sql b/dbt_project/models/kids_first/dataservice_studies/src/kf_ds_src_investigator.sql new file mode 100644 index 0000000..2024eeb --- /dev/null +++ b/dbt_project/models/kids_first/dataservice_studies/src/kf_ds_src_investigator.sql @@ -0,0 +1,6 @@ +{{ config( + schema='src' +) }} + +select * +from {{ ref('ds_investigators') }} diff --git a/dbt_project/models/kids_first/dataservice_studies/src/kf_ds_src_outcome.sql b/dbt_project/models/kids_first/dataservice_studies/src/kf_ds_src_outcome.sql new file mode 100644 index 0000000..f3c3c72 --- /dev/null +++ b/dbt_project/models/kids_first/dataservice_studies/src/kf_ds_src_outcome.sql @@ -0,0 +1,6 @@ +{{ config( + schema='src' +) }} + +select * +from {{ ref('ds_outcomes') }} diff --git a/dbt_project/models/kids_first/dataservice_studies/src/kf_ds_src_participant.sql b/dbt_project/models/kids_first/dataservice_studies/src/kf_ds_src_participant.sql new file mode 100644 index 0000000..d188eb5 --- /dev/null +++ b/dbt_project/models/kids_first/dataservice_studies/src/kf_ds_src_participant.sql @@ -0,0 +1,6 @@ +{{ config( + schema='src' +) }} + +select * +from {{ ref('ds_participants') }} diff --git a/dbt_project/models/kids_first/dataservice_studies/src/kf_ds_src_phenotype.sql b/dbt_project/models/kids_first/dataservice_studies/src/kf_ds_src_phenotype.sql new file mode 100644 index 0000000..89baf40 --- /dev/null +++ b/dbt_project/models/kids_first/dataservice_studies/src/kf_ds_src_phenotype.sql @@ -0,0 +1,6 @@ +{{ config( + schema='src' +) }} + +select * +from {{ ref('ds_phenotypes') }} diff --git a/dbt_project/models/kids_first/dataservice_studies/src/kf_ds_src_sample.sql b/dbt_project/models/kids_first/dataservice_studies/src/kf_ds_src_sample.sql new file mode 100644 index 0000000..1d2e8dd --- /dev/null +++ b/dbt_project/models/kids_first/dataservice_studies/src/kf_ds_src_sample.sql @@ -0,0 +1,6 @@ +{{ config( + schema='src' +) }} + +select * +from {{
ref('ds_samples') }} diff --git a/dbt_project/models/kids_first/dataservice_studies/src/kf_ds_src_segf.sql b/dbt_project/models/kids_first/dataservice_studies/src/kf_ds_src_segf.sql new file mode 100644 index 0000000..507e45a --- /dev/null +++ b/dbt_project/models/kids_first/dataservice_studies/src/kf_ds_src_segf.sql @@ -0,0 +1,6 @@ +{{ config( + schema='src' +) }} + +select * +from {{ ref('ds_segfs') }} diff --git a/dbt_project/models/kids_first/dataservice_studies/src/kf_ds_src_sequencing_center.sql b/dbt_project/models/kids_first/dataservice_studies/src/kf_ds_src_sequencing_center.sql new file mode 100644 index 0000000..07baf06 --- /dev/null +++ b/dbt_project/models/kids_first/dataservice_studies/src/kf_ds_src_sequencing_center.sql @@ -0,0 +1,6 @@ +{{ config( + schema='src' +) }} + +select * +from {{ ref('ds_sequencing_centers') }} diff --git a/dbt_project/models/kids_first/dataservice_studies/src/kf_ds_src_sequencing_experiment.sql b/dbt_project/models/kids_first/dataservice_studies/src/kf_ds_src_sequencing_experiment.sql new file mode 100644 index 0000000..0a44ca7 --- /dev/null +++ b/dbt_project/models/kids_first/dataservice_studies/src/kf_ds_src_sequencing_experiment.sql @@ -0,0 +1,6 @@ +{{ config( + schema='src' +) }} + +select * +from {{ ref('ds_ses') }} diff --git a/dbt_project/models/kids_first/dataservice_studies/src/kf_ds_src_study.sql b/dbt_project/models/kids_first/dataservice_studies/src/kf_ds_src_study.sql new file mode 100644 index 0000000..35a5c46 --- /dev/null +++ b/dbt_project/models/kids_first/dataservice_studies/src/kf_ds_src_study.sql @@ -0,0 +1,6 @@ +{{ config( + schema='src' +) }} + +select * +from {{ ref('ds_studies') }} diff --git a/dbt_project/models/kids_first/dataservice_studies/stable/kf_ds_stable.yaml b/dbt_project/models/kids_first/dataservice_studies/stable/kf_ds_stable.yaml new file mode 100644 index 0000000..f68eac0 --- /dev/null +++ b/dbt_project/models/kids_first/dataservice_studies/stable/kf_ds_stable.yaml @@ -0,0 +1,677 @@
+version: 2 + +models: + +- name: kf_ds_stable_bsgf + description: '{{ doc("kf_ds_stable_bsgf") }}' + config: + meta: + study: kf_dataservice_study + columns: + - name: uuid + description: '{{ doc("uuid") }}' + - name: bsgf_id + description: '{{ doc("bsgf_id") }}' + - name: created_at + description: '{{ doc("created_at") }}' + - name: modified_at + description: '{{ doc("modified_at") }}' + - name: external_id + description: '{{ doc("external_id") }}' + - name: visible + description: '{{ doc("visible") }}' + - name: visibility_reason + description: '{{ doc("visibility_reason") }}' + - name: visibility_comment + description: '{{ doc("visibility_comment") }}' + - name: genomic_file_id + description: '{{ doc("genomic_file_id") }}' + - name: biospecimen_id + description: '{{ doc("specimen_id") }}' + +- name: kf_ds_stable_diagnosis + description: '{{ doc("kf_ds_stable_diagnosis") }}' + config: + meta: + study: kf_dataservice_study + columns: + - name: uuid + description: '{{ doc("uuid") }}' + - name: diagnosis_id + description: '{{ doc("diagnosis_id") }}' + - name: dewrangle_diagnosis_id + description: '{{ doc("dewrangle_diagnosis_id") }}' + - name: created_at + description: '{{ doc("created_at") }}' + - name: modified_at + description: '{{ doc("modified_at") }}' + - name: external_id + description: '{{ doc("external_id") }}' + - name: visible + description: '{{ doc("visible") }}' + - name: visibility_reason + description: '{{ doc("visibility_reason") }}' + - name: visibility_comment + description: '{{ doc("visibility_comment") }}' + - name: participant_id + description: '{{ doc("participant_id") }}' + - name: source_text_diagnosis + description: '{{ doc("source_text_diagnosis") }}' + - name: age_at_event_days + description: '{{ doc("age_at_event_days") }}' + - name: mondo_id_diagnosis + description: '{{ doc("mondo_id_diagnosis") }}' + - name: icd_id_diagnosis + description: '{{ doc("icd_id_diagnosis") }}' + - name: diagnosis_category + description: '{{ 
doc("diagnosis_category") }}' + - name: source_text_tumor_location + description: '{{ doc("source_text_tumor_location") }}' + - name: uberon_id_tumor_location + description: '{{ doc("uberon_id_tumor_location") }}' + - name: spatial_descriptor + description: '{{ doc("spatial_descriptor") }}' + - name: observed + description: '{{ doc("observed") }}' + +- name: kf_ds_stable_family_relationship + description: '{{ doc("kf_ds_stable_family_relationship") }}' + config: + meta: + study: kf_dataservice_study + columns: + - name: uuid + description: '{{ doc("uuid") }}' + - name: relationship_id + description: '{{ doc("relationship_id") }}' + - name: created_at + description: '{{ doc("created_at") }}' + - name: modified_at + description: '{{ doc("modified_at") }}' + - name: external_id + description: '{{ doc("external_id") }}' + - name: visible + description: '{{ doc("visible") }}' + - name: visibility_reason + description: '{{ doc("visibility_reason") }}' + - name: visibility_comment + description: '{{ doc("visibility_comment") }}' + - name: participant1_id + description: '{{ doc("participant1_id") }}' + - name: participant2_id + description: '{{ doc("participant2_id") }}' + - name: participant1_to_participant2_relation + description: '{{ doc("participant1_to_participant2_relation") }}' + - name: participant2_to_participant1_relation + description: '{{ doc("participant2_to_participant1_relation") }}' + - name: source_text_notes + description: '{{ doc("source_text_notes") }}' + + +- name: kf_ds_stable_family + description: '{{ doc("kf_ds_stable_family") }}' + config: + meta: + study: kf_dataservice_study + columns: + - name: uuid + description: '{{ doc("uuid") }}' + - name: family_id + description: '{{ doc("family_id") }}' + - name: dewrangle_family_id + description: '{{ doc("dewrangle_family_id") }}' + - name: created_at + description: '{{ doc("created_at") }}' + - name: modified_at + description: '{{ doc("modified_at") }}' + - name: external_id + description: '{{ 
doc("external_id") }}' + - name: visible + description: '{{ doc("visible") }}' + - name: visibility_reason + description: '{{ doc("visibility_reason") }}' + - name: visibility_comment + description: '{{ doc("visibility_comment") }}' + - name: family_type + description: '{{ doc("family_type") }}' + +- name: kf_ds_stable_genomic_file + description: '{{ doc("kf_ds_stable_genomic_file") }}' + config: + meta: + study: kf_dataservice_study + columns: + - name: uuid + description: '{{ doc("uuid") }}' + - name: genomic_file_id + description: '{{ doc("genomic_file_id") }}' + - name: dewrangle_genomic_file_id + description: '{{ doc("dewrangle_genomic_file_id") }}' + - name: created_at + description: '{{ doc("created_at") }}' + - name: modified_at + description: '{{ doc("modified_at") }}' + - name: external_id + description: '{{ doc("external_id") }}' + - name: visible + description: '{{ doc("visible") }}' + - name: visibility_reason + description: '{{ doc("visibility_reason") }}' + - name: visibility_comment + description: '{{ doc("visibility_comment") }}' + - name: is_harmonized + description: '{{ doc("is_harmonized") }}' + - name: reference_genome + description: '{{ doc("reference_genome") }}' + - name: controlled_access + description: '{{ doc("controlled_access") }}' + - name: availability + description: '{{ doc("availability") }}' + - name: paired_end + description: '{{ doc("paired_end") }}' + - name: data_type + description: '{{ doc("data_type") }}' + - name: file_format + description: '{{ doc("file_format") }}' + - name: data_category + description: '{{ doc("data_category") }}' + - name: workflow_tool + description: '{{ doc("workflow_tool") }}' + - name: workflow_type + description: '{{ doc("workflow_type") }}' + - name: workflow_version + description: '{{ doc("workflow_version") }}' + - name: workflow_endpoint + description: '{{ doc("workflow_endpoint") }}' + - name: file_version_descriptor + description: '{{ doc("file_version_descriptor") }}' + - name: 
cavatica_file_id + description: '{{ doc("cavatica_file_id") }}' + - name: cavatica_volume + description: '{{ doc("cavatica_volume") }}' + +- name: kf_ds_stable_investigator + description: '{{ doc("kf_ds_stable_investigator") }}' + config: + meta: + study: kf_dataservice_study + columns: + - name: uuid + description: '{{ doc("uuid") }}' + - name: investigator_id + description: '{{ doc("investigator_id") }}' + - name: dewrangle_investigator_id + description: '{{ doc("dewrangle_investigator_id") }}' + - name: created_at + description: '{{ doc("created_at") }}' + - name: modified_at + description: '{{ doc("modified_at") }}' + - name: external_id + description: '{{ doc("external_id") }}' + - name: visible + description: '{{ doc("visible") }}' + - name: visibility_reason + description: '{{ doc("visibility_reason") }}' + - name: visibility_comment + description: '{{ doc("visibility_comment") }}' + - name: name + description: '{{ doc("name") }}' + - name: institution + description: '{{ doc("institution") }}' + +- name: kf_ds_stable_outcome + description: '{{ doc("kf_ds_stable_outcome") }}' + config: + meta: + study: kf_dataservice_study + columns: + - name: uuid + description: '{{ doc("uuid") }}' + - name: kf_id + description: '{{ doc("kf_id") }}' + - name: created_at + description: '{{ doc("created_at") }}' + - name: modified_at + description: '{{ doc("modified_at") }}' + - name: external_id + description: '{{ doc("external_id") }}' + - name: visible + description: '{{ doc("visible") }}' + - name: visibility_reason + description: '{{ doc("visibility_reason") }}' + - name: visibility_comment + description: '{{ doc("visibility_comment") }}' + - name: vital_status + description: '{{ doc("vital_status") }}' + - name: disease_related + description: '{{ doc("disease_related") }}' + - name: age_at_event_days + description: '{{ doc("age_at_event_days") }}' + +- name: kf_ds_stable_participant + description: '{{ doc("kf_ds_stable_participant") }}' + config: + meta: + study: 
kf_dataservice_study + columns: + - name: uuid + description: '{{ doc("uuid") }}' + - name: participant_id + description: '{{ doc("participant_id") }}' + - name: dewrangle_participant_id + description: '{{ doc("dewrangle_participant_id") }}' + - name: created_at + description: '{{ doc("created_at") }}' + - name: modified_at + description: '{{ doc("modified_at") }}' + - name: external_id + description: '{{ doc("external_id") }}' + - name: visible + description: '{{ doc("visible") }}' + - name: visibility_reason + description: '{{ doc("visibility_reason") }}' + - name: visibility_comment + description: '{{ doc("visibility_comment") }}' + - name: alias_group_id + description: '{{ doc("alias_group_id") }}' + - name: study_id + description: '{{ doc("study_id") }}' + - name: dewrangle_study_id + description: '{{ doc("dewrangle_study_id") }}' + - name: family_id + description: '{{ doc("family_id") }}' + - name: is_proband + description: '{{ doc("is_proband") }}' + - name: race + description: '{{ doc("race") }}' + - name: ethnicity + description: '{{ doc("ethnicity") }}' + - name: gender + description: '{{ doc("gender") }}' + - name: affected_status + description: '{{ doc("affected_status") }}' + - name: species + description: '{{ doc("species") }}' + +- name: kf_ds_stable_phenotype + description: '{{ doc("kf_ds_stable_phenotype") }}' + config: + meta: + study: kf_dataservice_study + columns: + - name: uuid + description: '{{ doc("uuid") }}' + - name: phenotype_id + description: '{{ doc("phenotype_id") }}' + - name: dewrangle_phenotype_id + description: '{{ doc("dewrangle_phenotype_id") }}' + - name: created_at + description: '{{ doc("created_at") }}' + - name: modified_at + description: '{{ doc("modified_at") }}' + - name: external_id + description: '{{ doc("external_id") }}' + - name: visible + description: '{{ doc("visible") }}' + - name: visibility_reason + description: '{{ doc("visibility_reason") }}' + - name: visibility_comment + description: '{{ 
doc("visibility_comment") }}' + - name: participant_id + description: '{{ doc("participant_id") }}' + - name: source_text_phenotype + description: '{{ doc("source_text_phenotype") }}' + - name: age_at_event_days + description: '{{ doc("age_at_event_days") }}' + - name: hpo_id_phenotype + description: '{{ doc("hpo_id_phenotype") }}' + - name: observed + description: '{{ doc("observed") }}' + - name: snomed_id_phenotype + description: '{{ doc("snomed_id_phenotype") }}' + +- name: kf_ds_stable_sample + description: '{{ doc("kf_ds_stable_sample") }}' + config: + meta: + study: kf_dataservice_study + columns: + - name: uuid + description: '{{ doc("uuid") }}' + - name: kf_id + description: '{{ doc("kf_id") }}' + - name: created_at + description: '{{ doc("created_at") }}' + - name: modified_at + description: '{{ doc("modified_at") }}' + - name: external_id + description: '{{ doc("external_id") }}' + - name: visible + description: '{{ doc("visible") }}' + - name: visibility_reason + description: '{{ doc("visibility_reason") }}' + - name: visibility_comment + description: '{{ doc("visibility_comment") }}' + - name: external_sample_id + description: '{{ doc("sample_event_key") }}' + - name: tissue_type + description: '{{ doc("tissue_type") }}' + - name: composition + description: '{{ doc("sample_type") }}' + - name: anatomical_location + description: '{{ doc("anatomical_location") }}' + - name: age_at_event_days + description: '{{ doc("age_at_event_days") }}' + - name: participant_id + description: '{{ doc("participant_id") }}' + - name: method_of_sample_procurement + description: '{{ doc("method_of_sample_procurement") }}' + - name: has_matched_normal_sample + description: '{{ doc("has_matched_normal_sample") }}' + - name: external_collection_id + description: '{{ doc("external_collection_id") }}' + - name: volume_ul + description: '{{ doc("volume_ul") }}' + - name: preservation_method + description: '{{ doc("preservation_method") }}' + - name: amount + description: '{{
doc("amount") }}' + - name: amount_units + description: '{{ doc("amount_units") }}' + +- name: kf_ds_stable_sequencing_center + description: '{{ doc("kf_ds_stable_sequencing_center") }}' + config: + meta: + study: kf_dataservice_study + columns: + - name: uuid + description: '{{ doc("uuid") }}' + - name: kf_id + description: '{{ doc("kf_id") }}' + - name: created_at + description: '{{ doc("created_at") }}' + - name: modified_at + description: '{{ doc("modified_at") }}' + - name: external_id + description: '{{ doc("external_id") }}' + - name: visible + description: '{{ doc("visible") }}' + - name: visibility_reason + description: '{{ doc("visibility_reason") }}' + - name: visibility_comment + description: '{{ doc("visibility_comment") }}' + - name: sequencing_center_name + description: '{{ doc("sequencing_center_name") }}' + +- name: kf_ds_stable_segf + description: '{{ doc("kf_ds_stable_segf") }}' + config: + meta: + study: kf_dataservice_study + columns: + - name: uuid + description: '{{ doc("uuid") }}' + - name: segf_id + description: '{{ doc("segf_id") }}' + - name: created_at + description: '{{ doc("created_at") }}' + - name: modified_at + description: '{{ doc("modified_at") }}' + - name: external_id + description: '{{ doc("external_id") }}' + - name: visible + description: '{{ doc("visible") }}' + - name: visibility_reason + description: '{{ doc("visibility_reason") }}' + - name: visibility_comment + description: '{{ doc("visibility_comment") }}' + - name: sequencing_experiment_id + description: '{{ doc("sequencing_experiment_id") }}' + - name: genomic_file_id + description: '{{ doc("genomic_file_id") }}' + +- name: kf_ds_stable_sequencing_experiment + description: '{{ doc("kf_ds_stable_sequencing_experiment") }}' + config: + meta: + study: kf_dataservice_study + columns: + - name: uuid + description: '{{ doc("uuid") }}' + - name: sequencing_experiment_id + description: '{{ doc("sequencing_experiment_id") }}' + - name: dewrangle_sequencing_experiment_id + 
description: '{{ doc("dewrangle_sequencing_experiment_id") }}' + - name: created_at + description: '{{ doc("created_at") }}' + - name: modified_at + description: '{{ doc("modified_at") }}' + - name: external_id + description: '{{ doc("external_id") }}' + - name: visible + description: '{{ doc("visible") }}' + - name: visibility_reason + description: '{{ doc("visibility_reason") }}' + - name: visibility_comment + description: '{{ doc("visibility_comment") }}' + - name: sequencing_center_id + description: '{{ doc("sequencing_center_id") }}' + - name: experiment_date + description: '{{ doc("experiment_date") }}' + - name: experiment_strategy + description: '{{ doc("experiment_strategy") }}' + - name: is_paired_end + description: '{{ doc("is_paired_end") }}' + - name: platform + description: '{{ doc("platform") }}' + - name: instrument_model + description: '{{ doc("instrument_model") }}' + - name: library_name + description: '{{ doc("library_name") }}' + - name: library_strand + description: '{{ doc("library_strand") }}' + - name: library_prep + description: '{{ doc("library_prep") }}' + - name: library_selection + description: '{{ doc("library_selection") }}' + - name: max_insert_size + description: '{{ doc("max_insert_size") }}' + - name: mean_insert_size + description: '{{ doc("mean_insert_size") }}' + - name: mean_depth + description: '{{ doc("mean_depth") }}' + - name: total_reads + description: '{{ doc("total_reads") }}' + - name: mean_read_length + description: '{{ doc("mean_read_length") }}' + - name: adapter_sequencing + description: '{{ doc("adapter_sequencing") }}' + - name: is_adapter_trimmed + description: '{{ doc("is_adapter_trimmed") }}' + - name: read_pair_number + description: '{{ doc("read_pair_number") }}' + - name: target_capture_kit + description: '{{ doc("target_capture_kit") }}' + - name: acquisition_type + description: '{{ doc("acquisition_type") }}' + - name: cdna_read + description: '{{ doc("cdna_read") }}' + - name: cdna_read_offset + 
description: '{{ doc("cdna_read_offset") }}' + - name: cell_barcode_offset + description: '{{ doc("cell_barcode_offset") }}' + - name: cell_barcode_read + description: '{{ doc("cell_barcode_read") }}' + - name: cell_barcode_size + description: '{{ doc("cell_barcode_size") }}' + - name: chromatography_approach + description: '{{ doc("chromatography_approach") }}' + - name: end_bias + description: '{{ doc("end_bias") }}' + - name: enrichment_approach + description: '{{ doc("enrichment_approach") }}' + - name: fraction_number + description: '{{ doc("fraction_number") }}' + - name: fractionation_approach + description: '{{ doc("fractionation_approach") }}' + - name: ion_fragmentation + description: '{{ doc("ion_fragmentation") }}' + - name: library_construction + description: '{{ doc("library_construction") }}' + - name: mass_spec_rawfile_conversion + description: '{{ doc("mass_spec_rawfile_conversion") }}' + - name: proteomics_experiment + description: '{{ doc("proteomics_experiment") }}' + - name: quantification_label_id + description: '{{ doc("quantification_label_id") }}' + - name: quantification_labeling_method + description: '{{ doc("quantification_labeling_method") }}' + - name: quantification_technique + description: '{{ doc("quantification_technique") }}' + - name: sequencing_mode + description: '{{ doc("sequencing_mode") }}' + - name: target_cell_number + description: '{{ doc("target_cell_number") }}' + - name: umi_barcode_offset + description: '{{ doc("umi_barcode_offset") }}' + - name: umi_barcode_read + description: '{{ doc("umi_barcode_read") }}' + - name: umi_barcode_size + description: '{{ doc("umi_barcode_size") }}' + +- name: kf_ds_stable_biospecimen + description: '{{ doc("kf_ds_stable_biospecimen") }}' + config: + meta: + study: kf_dataservice_study + columns: + - name: uuid + description: '{{ doc("uuid") }}' + - name: specimen_id + description: '{{ doc("specimen_id") }}' + - name: dewrangle_specimen_id + description: '{{ 
doc("dewrangle_specimen_id") }}' + - name: created_at + description: '{{ doc("created_at") }}' + - name: modified_at + description: '{{ doc("modified_at") }}' + - name: external_id + description: '{{ doc("external_id") }}' + - name: visible + description: '{{ doc("visible") }}' + - name: visibility_reason + description: '{{ doc("visibility_reason") }}' + - name: visibility_comment + description: '{{ doc("visibility_comment") }}' + - name: external_sample_id + description: '{{ doc("external_sample_id") }}' + - name: external_aliquot_id + description: '{{ doc("external_aliquot_id") }}' + - name: source_text_tissue_type + description: '{{ doc("source_text_tissue_type") }}' + - name: composition + description: '{{ doc("composition") }}' + - name: source_text_anatomical_site + description: '{{ doc("source_text_anatomical_site") }}' + - name: age_at_event_days + description: '{{ doc("age_at_event_days") }}' + - name: source_text_tumor_descriptor + description: '{{ doc("source_text_tumor_descriptor") }}' + - name: analyte_type + description: '{{ doc("analyte_type") }}' + - name: participant_id + description: '{{ doc("participant_id") }}' + - name: sequencing_center_id + description: '{{ doc("sequencing_center_id") }}' + - name: dbgap_consent_code + description: '{{ doc("dbgap_consent_code") }}' + - name: consent_type + description: '{{ doc("consent_type") }}' + - name: method_of_sample_procurement + description: '{{ doc("method_of_sample_procurement") }}' + - name: sample_id + description: '{{ doc("sample_id") }}' + - name: specimen_status + description: '{{ doc("specimen_status") }}' + - name: has_matched_normal_sample + description: '{{ doc("has_matched_normal_sample") }}' + - name: shipment_origin + description: '{{ doc("shipment_origin") }}' + - name: concentration_mg_per_ml + description: '{{ doc("concentration_mg_per_ml") }}' + - name: volume_ul + description: '{{ doc("volume_ul") }}' + - name: shipment_date + description: '{{ doc("shipment_date") }}' + - name: 
uberon_id_anatomical_site + description: '{{ doc("uberon_id_anatomical_site") }}' + - name: ncit_id_tissue_type + description: '{{ doc("ncit_id_tissue_type") }}' + - name: ncit_id_anatomical_site + description: '{{ doc("ncit_id_anatomical_site") }}' + - name: spatial_descriptor + description: '{{ doc("spatial_descriptor") }}' + - name: preservation_method + description: '{{ doc("preservation_method") }}' + - name: amount + description: '{{ doc("amount") }}' + - name: amount_units + description: '{{ doc("amount_units") }}' + - name: cell_entity + description: '{{ doc("cell_entity") }}' + +- name: kf_ds_stable_study + description: '{{ doc("kf_ds_stable_study") }}' + config: + meta: + study: kf_dataservice_study + columns: + - name: uuid + description: '{{ doc("uuid") }}' + - name: study_id + description: '{{ doc("study_id") }}' + - name: dewrangle_study_id + description: '{{ doc("dewrangle_study_id") }}' + - name: created_at + description: '{{ doc("created_at") }}' + - name: modified_at + description: '{{ doc("modified_at") }}' + - name: external_id + description: '{{ doc("external_id") }}' + - name: visible + description: '{{ doc("visible") }}' + - name: visibility_reason + description: '{{ doc("visibility_reason") }}' + - name: visibility_comment + description: '{{ doc("visibility_comment") }}' + - name: data_access_authority + description: '{{ doc("data_access_authority") }}' + - name: version + description: '{{ doc("version") }}' + - name: name + description: '{{ doc("name") }}' + - name: short_name + description: '{{ doc("short_name") }}' + - name: attribution + description: '{{ doc("attribution") }}' + - name: release_status + description: '{{ doc("release_status") }}' + - name: investigator_id + description: '{{ doc("investigator_id") }}' + - name: short_code + description: '{{ doc("short_code") }}' + - name: domain + description: '{{ doc("domain") }}' + - name: program + description: '{{ doc("program") }}' + - name: parent_study_id + description: '{{ 
doc("parent_study_id") }}' + - name: biobank_email + description: '{{ doc("biobank_email") }}' + - name: biobank_name + description: '{{ doc("biobank_name") }}' + - name: biobank_request_instructions + description: '{{ doc("biobank_request_instructions") }}' + - name: biobank_request_link + description: '{{ doc("biobank_request_link") }}' \ No newline at end of file diff --git a/dbt_project/models/kids_first/dataservice_studies/stable/kf_ds_stable_biospecimen.sql b/dbt_project/models/kids_first/dataservice_studies/stable/kf_ds_stable_biospecimen.sql new file mode 100644 index 0000000..5bee206 --- /dev/null +++ b/dbt_project/models/kids_first/dataservice_studies/stable/kf_ds_stable_biospecimen.sql @@ -0,0 +1,46 @@ +{{ config( + schema='stable' +) }} + +-- need to review all fields in ds and which ones are needed +-- for now, extracting the main ones we use +select distinct + uuid, -- we can leave this out; it's not used, + created_at, + modified_at, + specimen_id, + dewrangle_specimen_id, + external_sample_id, + external_aliquot_id, + source_text_tissue_type, -- can rename without source text prefix? + composition, + source_text_anatomical_site, -- can rename without source text prefix? + age_at_event_days, + source_text_tumor_descriptor, -- can rename without source text prefix? + analyte_type, + participant_id, + sequencing_center_id, + dbgap_consent_code, -- maybe this should be on the pt level? + consent_type, -- maybe this should be on the pt level? + method_of_sample_procurement, + sample_id, -- is this needed? haven't exported 'sample' table for kf + specimen_status, -- should this be set as a constant for all kf studies ? 
+ has_matched_normal_sample, + visible, + visibility_reason, + visibility_comment, + + -- these are fields we should discuss if needed + shipment_origin, + concentration_mg_per_ml, + volume_ul, + shipment_date, + uberon_id_anatomical_site, + ncit_id_tissue_type, + ncit_id_anatomical_site, + spatial_descriptor, + preservation_method, + amount, + amount_units, + cell_entity +from {{ ref('kf_ds_int_biospecimen') }} \ No newline at end of file diff --git a/dbt_project/models/kids_first/dataservice_studies/stable/kf_ds_stable_bsgf.sql b/dbt_project/models/kids_first/dataservice_studies/stable/kf_ds_stable_bsgf.sql new file mode 100644 index 0000000..0f42bc8 --- /dev/null +++ b/dbt_project/models/kids_first/dataservice_studies/stable/kf_ds_stable_bsgf.sql @@ -0,0 +1,19 @@ +{{ config( + schema='stable' +) }} + +-- we currently use this table in DS to link specimens/gfs +-- not sure if we would need it here, since we can easily add specimen ids to a files model + +select distinct + uuid, -- we can leave this out, it's not used + created_at, + modified_at, + genomic_file_id, + biospecimen_id, + bsgf_id, + visible, + external_id, -- i think we can leave this out - it's rarely populated/used, + visibility_reason, + visibility_comment +from {{ ref('kf_ds_int_bsgf') }} \ No newline at end of file diff --git a/dbt_project/models/kids_first/dataservice_studies/stable/kf_ds_stable_diagnosis.sql b/dbt_project/models/kids_first/dataservice_studies/stable/kf_ds_stable_diagnosis.sql new file mode 100644 index 0000000..dc16b27 --- /dev/null +++ b/dbt_project/models/kids_first/dataservice_studies/stable/kf_ds_stable_diagnosis.sql @@ -0,0 +1,22 @@ +{{ config( + schema='stable' +) }} + +select distinct + uuid, -- we can leave this out; it's not used, + diagnosis_id, + dewrangle_diagnosis_id, + participant_id, + source_text_diagnosis, + age_at_event_days, + mondo_id_diagnosis, + icd_id_diagnosis, + observed, -- implied by presence in diagnosis table + + -- unsure if necessary to include 
these fields + diagnosis_category, + external_id, + source_text_tumor_location, + uberon_id_tumor_location, + spatial_descriptor +from {{ ref('kf_ds_int_diagnosis') }} \ No newline at end of file diff --git a/dbt_project/models/kids_first/dataservice_studies/stable/kf_ds_stable_family.sql b/dbt_project/models/kids_first/dataservice_studies/stable/kf_ds_stable_family.sql new file mode 100644 index 0000000..15ce859 --- /dev/null +++ b/dbt_project/models/kids_first/dataservice_studies/stable/kf_ds_stable_family.sql @@ -0,0 +1,16 @@ +{{ config( + schema='stable' +) }} + +select distinct + uuid, -- we can leave this out; isn't used, + created_at, + modified_at, + family_id, + dewrangle_family_id, + external_id, + family_type, -- not historically populated but it should be; can use logic rules to calculate + visible, + visibility_reason, + visibility_comment +from {{ ref('kf_ds_int_family') }} \ No newline at end of file diff --git a/dbt_project/models/kids_first/dataservice_studies/stable/kf_ds_stable_family_relationship.sql b/dbt_project/models/kids_first/dataservice_studies/stable/kf_ds_stable_family_relationship.sql new file mode 100644 index 0000000..381ae64 --- /dev/null +++ b/dbt_project/models/kids_first/dataservice_studies/stable/kf_ds_stable_family_relationship.sql @@ -0,0 +1,19 @@ +{{ config( + schema='stable' +) }} + +select distinct + uuid, -- we can leave this out, it's not used + created_at, + modified_at, + external_id, -- i think we can leave this out - it's rarely populated/used, + participant1_id, + participant2_id, + participant1_to_participant2_relation, + participant2_to_participant1_relation, + relationship_id, + visible, + visibility_reason, + visibility_comment, + source_text_notes -- don't think we need this? 
we never populate it +from {{ ref('kf_ds_int_family_relationship') }} \ No newline at end of file diff --git a/dbt_project/models/kids_first/dataservice_studies/stable/kf_ds_stable_genomic_file.sql b/dbt_project/models/kids_first/dataservice_studies/stable/kf_ds_stable_genomic_file.sql new file mode 100644 index 0000000..7683746 --- /dev/null +++ b/dbt_project/models/kids_first/dataservice_studies/stable/kf_ds_stable_genomic_file.sql @@ -0,0 +1,34 @@ +{{ config( + schema='stable' +) }} + +select distinct + uuid, -- we can leave this out; it's not used, + genomic_file_id, + dewrangle_genomic_file_id, + created_at, -- think these could be useful when answering questions about changes over time + modified_at, -- ^^^ + external_id, + is_harmonized, + reference_genome, + controlled_access, -- would be nice to incorporate mapping logic for this field based on file type, location and harmonization + availability, + paired_end, + visible, + visibility_reason, + visibility_comment, + + -- should we discuss with bix about standardizing these values? 
I know ingest mapping logic has changed over time + data_type, + file_format, + data_category, + workflow_tool, + workflow_type, + workflow_version, + workflow_endpoint, + file_version_descriptor, -- should discuss with bix about reliability of these values in dataservice currently + + -- could be useful after delivery but would be null during source load + cavatica_file_id, + cavatica_volume +from {{ ref('kf_ds_int_genomic_file') }} \ No newline at end of file diff --git a/dbt_project/models/kids_first/dataservice_studies/stable/kf_ds_stable_investigator.sql b/dbt_project/models/kids_first/dataservice_studies/stable/kf_ds_stable_investigator.sql new file mode 100644 index 0000000..9f7d5f1 --- /dev/null +++ b/dbt_project/models/kids_first/dataservice_studies/stable/kf_ds_stable_investigator.sql @@ -0,0 +1,17 @@ +{{ config( + schema='stable' +) }} + +select distinct + uuid, -- we can leave this out; isn't used, + created_at, + modified_at, + investigator_id, + dewrangle_investigator_id, + external_id, + name, + institution, + visible, + visibility_reason, + visibility_comment +from {{ ref('kf_ds_int_investigator') }} \ No newline at end of file diff --git a/dbt_project/models/kids_first/dataservice_studies/stable/kf_ds_stable_outcome.sql b/dbt_project/models/kids_first/dataservice_studies/stable/kf_ds_stable_outcome.sql new file mode 100644 index 0000000..81c0572 --- /dev/null +++ b/dbt_project/models/kids_first/dataservice_studies/stable/kf_ds_stable_outcome.sql @@ -0,0 +1,18 @@ +{{ config( + schema='stable' +) }} + +select distinct + uuid, -- we can leave this out, it's not used + created_at, + modified_at, + external_id, -- i think we can leave this out - it's rarely populated/used, + vital_status, + disease_related, + age_at_event_days, + participant_id, + kf_id, + visible, + visibility_reason, + visibility_comment +from {{ ref('kf_ds_int_outcome') }} \ No newline at end of file diff --git 
a/dbt_project/models/kids_first/dataservice_studies/stable/kf_ds_stable_participant.sql b/dbt_project/models/kids_first/dataservice_studies/stable/kf_ds_stable_participant.sql new file mode 100644 index 0000000..ce54158 --- /dev/null +++ b/dbt_project/models/kids_first/dataservice_studies/stable/kf_ds_stable_participant.sql @@ -0,0 +1,21 @@ +select distinct + uuid, -- we can leave this out ; it's not used, + created_at, + modified_at, + alias_group_id, -- we can leave this out; it's not used + study_id, + dewrangle_study_id, + participant_id, + dewrangle_participant_id, + external_id, + family_id, + is_proband, + race, + ethnicity, + gender, + affected_status, + species, + visible, + visibility_reason, -- can we standardize this a bit more? maybe release status instead of reason? and try to standardize more? + visibility_comment +from {{ ref('kf_ds_int_participant') }} \ No newline at end of file diff --git a/dbt_project/models/kids_first/dataservice_studies/stable/kf_ds_stable_phenotype.sql b/dbt_project/models/kids_first/dataservice_studies/stable/kf_ds_stable_phenotype.sql new file mode 100644 index 0000000..8d19f98 --- /dev/null +++ b/dbt_project/models/kids_first/dataservice_studies/stable/kf_ds_stable_phenotype.sql @@ -0,0 +1,14 @@ +select distinct + uuid, -- we can leave this out; it's not used, + phenotype_id, + dewrangle_phenotype_id, + participant_id, + source_text_phenotype, + age_at_event_days, + hpo_id_phenotype, + observed, + + -- additional fields that may be included + snomed_id_phenotype, + external_id +from {{ ref('kf_ds_int_phenotype') }} diff --git a/dbt_project/models/kids_first/dataservice_studies/stable/kf_ds_stable_sample.sql b/dbt_project/models/kids_first/dataservice_studies/stable/kf_ds_stable_sample.sql new file mode 100644 index 0000000..2d62d94 --- /dev/null +++ b/dbt_project/models/kids_first/dataservice_studies/stable/kf_ds_stable_sample.sql @@ -0,0 +1,27 @@ +{{ config( + schema='stable' +) }} + +select distinct + uuid, -- we can 
leave this out, it's not used + created_at, + modified_at, + external_id, + kf_id, + age_at_event_days, + sample_event_key, + tissue_type, + sample_type, + anatomical_location, + volume_ul, + method_of_sample_procurement, + preservation_method, + participant_id, + external_collection_id, + has_matched_normal_sample, + amount, + amount_units, + visible, + visibility_reason, + visibility_comment +from {{ ref('kf_ds_int_sample') }} \ No newline at end of file diff --git a/dbt_project/models/kids_first/dataservice_studies/stable/kf_ds_stable_segf.sql b/dbt_project/models/kids_first/dataservice_studies/stable/kf_ds_stable_segf.sql new file mode 100644 index 0000000..0ff00cd --- /dev/null +++ b/dbt_project/models/kids_first/dataservice_studies/stable/kf_ds_stable_segf.sql @@ -0,0 +1,16 @@ +{{ config( + schema='stable' +) }} + +select distinct + created_at, + modified_at, + uuid, -- we can leave this out, it's not used + visible, + sequencing_experiment_id, + genomic_file_id, + external_id, -- this is rarely populated, it could be left out + segf_id, + visibility_reason, + visibility_comment +from {{ ref('kf_ds_int_segf') }} \ No newline at end of file diff --git a/dbt_project/models/kids_first/dataservice_studies/stable/kf_ds_stable_sequencing_center.sql b/dbt_project/models/kids_first/dataservice_studies/stable/kf_ds_stable_sequencing_center.sql new file mode 100644 index 0000000..166d3d2 --- /dev/null +++ b/dbt_project/models/kids_first/dataservice_studies/stable/kf_ds_stable_sequencing_center.sql @@ -0,0 +1,16 @@ +{{ config( + schema='stable' +) }} + + +select distinct + uuid, -- we can leave this out, it's not used + created_at, + modified_at, + external_id, + kf_id, + sequencing_center_name, + visible, + visibility_reason, + visibility_comment +from {{ ref('kf_ds_int_sequencing_center') }} \ No newline at end of file diff --git a/dbt_project/models/kids_first/dataservice_studies/stable/kf_ds_stable_sequencing_experiment.sql 
b/dbt_project/models/kids_first/dataservice_studies/stable/kf_ds_stable_sequencing_experiment.sql new file mode 100644 index 0000000..f80b2e4 --- /dev/null +++ b/dbt_project/models/kids_first/dataservice_studies/stable/kf_ds_stable_sequencing_experiment.sql @@ -0,0 +1,57 @@ +{{ config( + schema='stable' +) }} + +select distinct + uuid, -- we can leave this out; it's not used, + sequencing_experiment_id, + dewrangle_sequencing_experiment_id, + sequencing_center_id, + external_id, + experiment_date, + experiment_strategy, + is_paired_end, + platform, + instrument_model, + visible, + visibility_reason, + visibility_comment, + + -- I think these should be included? talk with BIX + library_name, + library_strand, + library_prep, + library_selection, + max_insert_size, + mean_insert_size, + mean_depth, + total_reads, + mean_read_length, + adapter_sequencing, + is_adapter_trimmed, + read_pair_number, + target_capture_kit, + acquisition_type, + cdna_read, + cdna_read_offset, + cell_barcode_offset, + cell_barcode_read, + cell_barcode_size, + chromatography_approach, + end_bias, + enrichment_approach, + fraction_number, + fractionation_approach, + ion_fragmentation, + library_construction, + mass_spec_rawfile_conversion, + proteomics_experiment, + quantification_label_id, + quantification_labeling_method, + quantification_technique, + sequencing_mode, + target_cell_number, + umi_barcode_offset, + umi_barcode_read, + umi_barcode_size +from {{ ref('kf_ds_int_sequencing_experiment') }} \ No newline at end of file diff --git a/dbt_project/models/kids_first/dataservice_studies/stable/kf_ds_stable_study.sql b/dbt_project/models/kids_first/dataservice_studies/stable/kf_ds_stable_study.sql new file mode 100644 index 0000000..7095b0b --- /dev/null +++ b/dbt_project/models/kids_first/dataservice_studies/stable/kf_ds_stable_study.sql @@ -0,0 +1,30 @@ +{{ config( + schema='stable' +) }} + +select distinct + created_at, + modified_at, + uuid, -- we can leave this out, it's not used + 
data_access_authority, -- usually value is dbgap here + external_id,-- usually we make this the phs number - would be worth renaming to reflect that + version, -- seems useful, but is rarely ever used. + name, -- this is the full name of the study; might be worth renaming + short_name, -- do we need name and short name? + attribution, -- not exactly sure what would go here, we rarely populate it + release_status, -- like the idea of this, but it's not consistently updated + investigator_id, -- like the idea of this, but again not consistently populated, should it be? + study_id, + dewrangle_study_id, + visible, + short_code, + domain, -- this is research domain e.g., CANCER vs SBD + program, + visibility_reason, + visibility_comment, + parent_study_id, + biobank_email, -- has been NA for kids first, only used for CBTN, should we keep? + biobank_name, -- has been NA for kids first, only used for CBTN, should we keep? + biobank_request_instructions, -- has been NA for kids first, only used for CBTN, should we keep? + biobank_request_link -- has been NA for kids first, only used for CBTN, should we keep? 
+from {{ ref('kf_ds_int_study') }} \ No newline at end of file diff --git a/dbt_project/seeds/_seeds.yml b/dbt_project/seeds/_seeds.yml index d98ab0f..a7883b3 100644 --- a/dbt_project/seeds/_seeds.yml +++ b/dbt_project/seeds/_seeds.yml @@ -15,6 +15,7 @@ seeds: config: column_types: Size: bigint + LastModified: varchar - name: sample config: @@ -26,9 +27,13 @@ seeds: pf_hq_aligned_q20_bases: bigint genome_territory: bigint library-1_estimated_library_size: bigint - # pf_reads: bigint - # pf_reads_aligned: bigint - # reads_aligned_in_pairs: bigint - # pf_hq_aligned_reads: bigint - # library-1_read_pairs: bigint - # pf_reads_improper_pairs: bigint \ No newline at end of file + pf_reads: bigint + pf_reads_aligned: bigint + reads_aligned_in_pairs: bigint + pf_hq_aligned_reads: bigint + library-1_read_pairs: bigint + pf_reads_improper_pairs: bigint + - name: ds_ses + config: + column_types: + total_reads: bigint \ No newline at end of file