Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .sqlfluff
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ project_dir = dbt_project
target = dev
# If needed, uncomment the line below to specify the directory where your
# profiles.yml file is located
profiles_dir = ~/.dbt/include
profiles_dir = ~/.dbt/

[sqlfluff:indentation]
tab_space_size = 2
Expand Down
36 changes: 36 additions & 0 deletions dags/kids_first/kf_sd_1nns3k8v.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
from airflow.sdk import Variable

from cosmos import (
DbtDag,
ProjectConfig,
ProfileConfig,
ExecutionConfig,
RenderConfig,
)
from cosmos.profiles import PostgresUserPasswordProfileMapping

# dbt profile used by Cosmos for this study's DAG.
# The profile name is read from the Airflow Variable DBT_PROFILE_NAME; the
# target is fixed to "prd" and credentials come from the Airflow connection
# "postgres_prd_svc" via the Postgres user/password profile mapping.
profile_config = ProfileConfig(
    profile_name=Variable.get("DBT_PROFILE_NAME"),
    # target_name must stay aligned with the connection and schema that the
    # profile_mapping below points at.
    target_name="prd",
    profile_mapping=PostgresUserPasswordProfileMapping(
        conn_id="postgres_prd_svc",
        profile_args={"schema": "prd"},
    ),
)

# Cosmos DbtDag for study kf_sd_1nns3k8v.
# Rendering is limited to dbt nodes matching the selector
# "config.meta.study:kf_sd_1nns3k8v".
# NOTE(review): the Variable.get(...) calls below run whenever this module is
# evaluated — confirm that parse-time Variable lookups are acceptable here.
example_study_dag = DbtDag(
    # Regular Airflow DAG parameters.
    dag_id="kf_sd_1nns3k8v_dbt_dag",
    schedule="@daily",
    tags=["POC", "Kids First"],
    # dbt / Cosmos configuration.
    project_config=ProjectConfig(
        Variable.get("DBT_PROJECT_DIR"),
        install_dbt_deps=True,
    ),
    profile_config=profile_config,
    execution_config=ExecutionConfig(
        dbt_executable_path=Variable.get("DBT_EXECUTABLE_PATH"),
    ),
    render_config=RenderConfig(
        select=["config.meta.study:kf_sd_1nns3k8v"],
    ),
)
99 changes: 99 additions & 0 deletions dbt_project/models/_metadata_description_files/docs_fields.md
Original file line number Diff line number Diff line change
Expand Up @@ -598,5 +598,104 @@ Link to request specimens from study biobank.
{% enddocs %}


## Kids First Source Data Fields

### Clinical Data

{% docs family_relationship_target_participant_id %}
Participant ID of Proband of Family
{% enddocs %}

{% docs aliquot_id %}
Aliquot ID from sequencing manifest
{% enddocs %}

{% docs consent_group %}
Consent group of study
{% enddocs %}

{% docs age_at_sample %}
Age at which the sample was collected, per the sequencing manifest
{% enddocs %}

{% docs sample_composition %}
Composition of specimen
{% enddocs %}

{% docs sample_tissue_type %}
Tissue type of collected specimen
{% enddocs %}

{% docs sex %}
Sex of participant
{% enddocs %}

{% docs age_at_phenotype %}
Age of participant when phenotype was asserted
{% enddocs %}

{% docs vital_status %}
Vital status of participant
{% enddocs %}


### Broad Manifest

{% docs collaborator_sample_id %}
Sample ID submitted by PI to sequencing center
{% enddocs %}

{% docs collaborator_participant_id %}
Participant ID submitted by PI to sequencing center
{% enddocs %}

{% docs cram_path %}
path of cram file from sequencing manifest
{% enddocs %}

{% docs crai_path %}
path of crai file from sequencing manifest
{% enddocs %}

{% docs md5_path %}
path of md5 file from sequencing manifest
{% enddocs %}

{% docs root_sample_id %}
Specimen sample ID from sequencing manifest
{% enddocs %}

### S3 Scrape

{% docs mean_coverage %}
mean coverage value from sequencing manifest
{% enddocs %}

{% docs bucket %}
s3 bucket for file
{% enddocs %}

{% docs key %}
s3 key for file
{% enddocs %}

{% docs last_modified %}
last time s3 file was modified
{% enddocs %}

{% docs etag %}
etag hash from s3 manifest
{% enddocs %}

{% docs size %}
size of file from s3 manifest
{% enddocs %}

{% docs storage_class %}
s3 storage class for file
{% enddocs %}


## Study Specific Fields

### SD_1NNS3K8V Fields
26 changes: 26 additions & 0 deletions dbt_project/models/_metadata_description_files/docs_tables.md
Original file line number Diff line number Diff line change
Expand Up @@ -136,4 +136,30 @@ Stable table for int_specimens. Finalized mapping of transformed dataservice ent

{% docs stable_study %}
Stable table for int_study. Finalized mapping of transformed dataservice entities that are ready to be brought into the access layer.
{% enddocs %}


## Kids First Study SD_1NNS3K8V
{% docs kf_sd_1nns3k8v_src_clinical %}
Source table for Cody study source clinical data
{% enddocs %}

{% docs kf_sd_1nns3k8v_src_sequencing %}
Source table for Cody study source sequencing data
{% enddocs %}

{% docs kf_sd_1nns3k8v_src_s3 %}
Source table for Cody study s3 file manifest
{% enddocs %}

{% docs kf_sd_1nns3k8v_int_clinical %}
Intermediate table for src_clinical. Transforms dataservice entities for better usability and clarity. Excludes certain entities that are not needed.
{% enddocs %}

{% docs kf_sd_1nns3k8v_int_sequencing %}
Intermediate table for src_sequencing. Transforms dataservice entities for better usability and clarity. Excludes certain entities that are not needed.
{% enddocs %}

{% docs kf_sd_1nns3k8v_int_s3 %}
Intermediate table for src_s3. Transforms dataservice entities for better usability and clarity. Excludes certain entities that are not needed.
{% enddocs %}
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@

/*
Welcome to your first dbt model!
Did you know that you can also configure models directly within SQL files?
Expand All @@ -11,9 +10,9 @@

with source_data as (

select 1 as id
union all
select null as id
select 1 as id
union all
select null as id

)

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
version: 2

models:

- name: kf_sd_1nns3k8v_int_clinical
description: '{{ doc("kf_sd_1nns3k8v_int_clinical") }}'
config:
meta:
study: kf_sd_1nns3k8v
columns:
- name: family_id
description: '{{ doc("family_id") }}'
- name: participant_id
description: '{{ doc("participant_id") }}'
- name: family_relationship_target_participant_id
description: '{{ doc("family_relationship_target_participant_id") }}'
- name: aliquot_id
description: '{{ doc("aliquot_id") }}'
- name: consent_group
description: '{{ doc("consent_group") }}'
- name: age_at_sample
description: '{{ doc("age_at_sample") }}'
- name: sample_composition
description: '{{ doc("sample_composition") }}'
- name: sample_tissue_type
description: '{{ doc("sample_tissue_type") }}'
- name: race
description: '{{ doc("race") }}'
- name: sex
description: '{{ doc("sex") }}'
- name: ethnicity
description: '{{ doc("ethnicity") }}'
- name: analyte_type
description: '{{ doc("analyte_type") }}'
- name: age_at_phenotype
description: '{{ doc("age_at_phenotype") }}'
- name: affected_status
description: '{{ doc("affected_status") }}'
- name: vital_status
description: '{{ doc("vital_status") }}'

- name: kf_sd_1nns3k8v_int_sequencing
description: '{{ doc("kf_sd_1nns3k8v_int_sequencing") }}'
config:
meta:
study: kf_sd_1nns3k8v
columns:
- name: sample_id
description: '{{ doc("sample_id") }}'
- name: collaborator_sample_id
description: '{{ doc("collaborator_sample_id") }}'
- name: collaborator_participant_id
description: '{{ doc("collaborator_participant_id") }}'
- name: cram_path
description: '{{ doc("cram_path") }}'
- name: crai_path
description: '{{ doc("crai_path") }}'
- name: md5_path
description: '{{ doc("md5_path") }}'
- name: root_sample_id
description: '{{ doc("root_sample_id") }}'
- name: mean_coverage
description: '{{ doc("mean_coverage") }}'
- name: total_reads
description: '{{ doc("total_reads") }}'
- name: mean_read_length
description: '{{ doc("mean_read_length") }}'


- name: kf_sd_1nns3k8v_int_s3
description: '{{ doc("kf_sd_1nns3k8v_int_s3") }}'
config:
meta:
study: kf_sd_1nns3k8v
columns:
- name: bucket
description: '{{ doc("bucket") }}'
- name: key
description: '{{ doc("key") }}'
- name: last_modified
description: '{{ doc("last_modified") }}'
- name: etag
description: '{{ doc("etag") }}'
- name: size
description: '{{ doc("size") }}'
- name: storage_class
description: '{{ doc("storage_class") }}'
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
-- Intermediate clinical model for study SD_1NNS3K8V.
-- Renames the quoted, human-readable source headers to snake_case and
-- returns de-duplicated rows (select distinct). Built in the 'int' schema.
{{ config(schema='int') }}

select distinct
    src."Family ID" as family_id,
    src."Participant ID" as participant_id,
    src."Family Relationship Target Participant ID"
        as family_relationship_target_participant_id,
    -- NOTE(review): "alliquot_id" (double l) presumably mirrors the upstream
    -- column name as delivered; verify against the source table.
    src.alliquot_id as aliquot_id,
    src."Consent Group" as consent_group,
    src."Age at Sample" as age_at_sample,
    src."Sample Composition" as sample_composition,
    src."Sample Tissue Type" as sample_tissue_type,
    src."Race" as race,
    src."Sex" as sex,
    src."Ethnicity" as ethnicity,
    src."Analyte Type" as analyte_type,
    src."Age at Phenotype" as age_at_phenotype,
    src."Affected Status" as affected_status,
    src."Vital Status" as vital_status
from {{ ref('kf_sd_1nns3k8v_src_clinical') }} as src
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
-- Intermediate S3 manifest model for study SD_1NNS3K8V.
-- Renames the mixed-case source columns to snake_case and returns
-- de-duplicated rows (select distinct). Built in the 'int' schema.
-- NOTE(review): the upstream ref is 's3_scrape_cody', which does not follow
-- the kf_sd_1nns3k8v_src_* naming used by the other models of this study;
-- confirm this is the intended source model.
{{ config(schema='int') }}

select distinct
    src."Bucket" as bucket,
    src."Key" as key,
    src."LastModified" as last_modified,
    src."ETag" as etag,
    src."Size" as size,
    src."StorageClass" as storage_class
from {{ ref('s3_scrape_cody') }} as src
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
-- Intermediate sequencing-manifest model for study SD_1NNS3K8V.
-- Exposes a subset of the manifest columns and returns de-duplicated rows
-- (select distinct). Built in the 'int' schema.
--
-- Columns deliberately left out of this model (presumably present upstream;
-- add them to the select list if they become needed):
--   project, version, data_type, pdo, release_date, reference_sequence_name,
--   adapter_rate, pf_hq_aligned_q20_bases, pf_hq_aligned_reads,
--   pf_mismatch_rate, pf_noise_reads, pf_reads, pf_reads_aligned,
--   pf_reads_rate, pf_reads_aligned_rate, pf_aligned_bases,
--   reads_aligned_in_pairs
{{ config(schema='int') }}

select distinct
    -- The source header carries an "entity:" prefix; expose it as sample_id.
    src."entity:sample_id" as sample_id,
    src.collaborator_sample_id,
    src.collaborator_participant_id,
    src.cram_path,
    src.crai_path,
    src.md5_path,
    src.root_sample_id,
    src.mean_coverage,
    src.total_reads,
    src.mean_read_length
from {{ ref('kf_sd_1nns3k8v_src_sequencing') }} as src
Loading