diff --git a/bq/utilities.py b/bq/bq_utilities.py
similarity index 63%
rename from bq/utilities.py
rename to bq/bq_utilities.py
index 19f1866..090f250 100644
--- a/bq/utilities.py
+++ b/bq/bq_utilities.py
@@ -22,8 +22,52 @@
import pytz
import argparse
import json5
-
-
+from datetime import datetime, timedelta, timezone
+import requests
+import yaml
+
+
+def get_data_from_comet(path, branch="current"):
+ file_url = f"https://raw.githubusercontent.com/ImagingDataCommons/idc-comet/{branch}/{path}"
+ headers = {
+ "Authorization": f"token {settings.GITHUB_TOKEN}",
+ "Accept": "application/vnd.github.v3+json"
+ }
+ response = requests.get(file_url, headers=headers)
+ if response.status_code == 200:
+ # Parse the YAML payload and return its "programs" section
+ metadata = yaml.load(StringIO(response.text), Loader=yaml.Loader)
+ return metadata["programs"]
+ else:
+ print(f"Failed to retrieve file. Status code: {response.status_code}")
+ exit(1)
+
+# Create a table from a data frame. The table will be deleted after the time limit expires
+def create_temp_table_from_df(client, table_id, schema, df, expire_in_minutes=10):
+ table = bigquery.Table(table_id)
+
+ # Set expiration to expire_in_minutes minutes from now
+ expiration_duration = timedelta(minutes=expire_in_minutes)
+ table.expires = datetime.now(timezone.utc) + expiration_duration
+ try:
+ client.create_table(table, exists_ok=True)
+ # print(f"Table {table_id} created/updated with expiration at {table.expires}")
+ except Exception as e:
+ print(f"Error setting table metadata: {e}")
+ exit(1)
+
+ job_config = bigquery.LoadJobConfig(
+ schema=schema,
+ write_disposition="WRITE_TRUNCATE"
+ )
+ # Load the dataframe into the table and wait for completion
+ job = client.load_table_from_dataframe(
+ df, table_id, job_config=job_config
+ )
+ job.result() # Wait for job to complete
+
+
+# Read the file at the file path into a dataframe. The file is assumed to be JSON formatted
def read_json_to_dataframe(file_path):
with open(file_path) as f:
definitions = json5.load(f)
@@ -74,33 +118,6 @@ def json_file_to_bq(args, file_path, lifetime=None):
return
- # # Initialize the BigQuery client
- # client = bigquery.Client()
- #
- # # Define the BigQuery table reference
- # table_ref = f'{args.project}.{args.bq_dataset_id}.{args.table_id}'
- #
- # # Create the BigQuery table if it doesn't exist
- # try:
- # client.get_table(table_ref)
- # except:
- # table = bigquery.Table(table_ref)
- # client.create_table(table)
- #
- # # Write the DataFrame data to BigQuery
- # job_config = bigquery.LoadJobConfig(write_disposition='WRITE_TRUNCATE')
- # job = client.load_table_from_dataframe(df, table_ref, job_config=job_config)
- # job.result()
- #
- # if lifetime:
- # table = client.get_table(table_ref) # Get the table object
- # expiration_time = datetime.now(pytz.utc) + timedelta(minutes=lifetime)
- # table.expires = expiration_time
- # client.update_table(table, ["expires"])
- #
- # print('Data imported successfully!')
-
-
if __name__ == '__main__':
parser = argparse.ArgumentParser()
diff --git a/bq/generate_tables_and_views/all_joined.py b/bq/generate_tables_and_views/all_joined.py
index 0d9b1b6..53b4017 100644
--- a/bq/generate_tables_and_views/all_joined.py
+++ b/bq/generate_tables_and_views/all_joined.py
@@ -23,6 +23,9 @@
import settings
from google.cloud import bigquery
from utilities.bq_helpers import create_BQ_dataset
+from utilities.tcia_helpers import get_tcia_collection_manager_data
+import pandas as pd
+from bq.bq_utilities import create_temp_table_from_df
# Flatten the version/collection/... hierarchy
# Note that we no longer include license here as the license can change over time.
@@ -119,7 +122,16 @@ def create_all_flattened(client):
def create_all_sources(client):
- table_id = f"{settings.DEV_PROJECT}.{settings.BQ_DEV_INT_DATASET}.all_sources"
+ # Create a temporary table of names of all TCIA analysis results
+ tcia_analysis_result_dois= [row['result_doi'].lower() for row in get_tcia_collection_manager_data('analysis-results')]
+ df = pd.DataFrame(tcia_analysis_result_dois, columns=['source_doi'])
+ table_id = f"{settings.DEV_PROJECT}.{settings.BQ_DEV_INT_DATASET}.tcia_analysis_result_dois"
+ schema = [
+ bigquery.SchemaField("source_doi", "STRING")
+ ]
+ create_temp_table_from_df(client, table_id, schema, df, 60)
+
+
query = f"""
with basics as (
SELECT distinct
@@ -136,17 +148,28 @@ def create_all_sources(client):
ON af.source_doi = dtc.source_doi
LEFT JOIN `{settings.DEV_PROJECT}.{settings.BQ_DEV_INT_DATASET}.metadata_sunset` ms
ON af.source_doi = ms.source_doi
+),
+analysis_result_dois as (
+ SELECT DISTINCT source_doi
+ FROM `{settings.DEV_PROJECT}.{settings.BQ_DEV_INT_DATASET}.tcia_analysis_result_dois`
+UNION ALL
+ SELECT DISTINCT source_doi
+ FROM `{settings.DEV_PROJECT}.{settings.BQ_DEV_INT_DATASET}.analysis_results_metadata_idc_source`
)
SELECT
- *,
+ basics.*,
+ if(analysis_result_dois.source_doi IS NULL, False, True) analysis_result,
if(Type='Open', 'idc-arch-open', if(Type='Cr', 'idc-arch-cr', if(Type='Defaced', 'idc-arch-defaced', if(Type='Redacted','idc-arch-redacted','idc-arch-excluded')))) dev_bucket,
if(Type='Open', 'idc-open-data', if(Type='Cr', 'idc-open-cr', if(Type='Defaced', 'idc-open-idc1', NULL))) pub_gcs_bucket,
if(Type='Open', 'idc-open-data', if(Type='Cr', 'idc-open-data-cr', if(Type='Defaced', 'idc-open-data-two', NULL))) pub_aws_bucket,
FROM basics
-- ORDER by collection_id, source_doi, dev_bucket, pub_gcs_bucket, pub_aws_bucket
-ORDER by collection_id, source_doi, pub_gcs_bucket, pub_aws_bucket
+LEFT JOIN analysis_result_dois
+ON basics.source_doi = analysis_result_dois.source_doi
+ORDER by collection_id, basics.source_doi, pub_gcs_bucket, pub_aws_bucket
"""
# Make an API request to create the view.
+ table_id = f"{settings.DEV_PROJECT}.{settings.BQ_DEV_INT_DATASET}.all_sources"
client.delete_table(table_id, not_found_ok=True)
job_config = bigquery.QueryJobConfig(destination=table_id)
query_job = client.query(query,job_config=job_config)
@@ -159,7 +182,7 @@ def create_all_joined(client):
view = bigquery.Table(view_id)
view.view_query = f"""
-- SELECT af.*, ac.source, ac.Class, ac.Access, ac.metadata_sunset, ac.dev_bucket, ac.pub_gcs_bucket, ac.pub_aws_bucket
-SELECT af.*, ac.source, ac.Type, ac.Access, ac.metadata_sunset, ac.dev_bucket, ac.pub_gcs_bucket, ac.pub_aws_bucket
+SELECT af.*, ac.source, ac.Type, ac.Access, ac.metadata_sunset, ac.analysis_result, ac.dev_bucket, ac.pub_gcs_bucket, ac.pub_aws_bucket
FROM `{settings.DEV_PROJECT}.{settings.BQ_DEV_INT_DATASET}.all_flattened` af
JOIN `{settings.DEV_PROJECT}.{settings.BQ_DEV_INT_DATASET}.all_sources` ac
ON af.source_doi = ac.source_doi
diff --git a/bq/generate_tables_and_views/analysis_results_end_user_descriptions.py b/bq/generate_tables_and_views/analysis_results_end_user_descriptions.py
index b99729e..4bf5d9d 100644
--- a/bq/generate_tables_and_views/analysis_results_end_user_descriptions.py
+++ b/bq/generate_tables_and_views/analysis_results_end_user_descriptions.py
@@ -22,7 +22,7 @@
import pandas as pd
from google.cloud import bigquery
import markdownify
-from bq.utilities import read_json_to_dataframe, dataframe_to_bq
+from bq.bq_utilities import read_json_to_dataframe, dataframe_to_bq
# Get the descriptions of collections that are only sourced from IDC
diff --git a/bq/generate_tables_and_views/analysis_results_metadata.py b/bq/generate_tables_and_views/analysis_results_metadata.py
index e294f7f..74d9741 100644
--- a/bq/generate_tables_and_views/analysis_results_metadata.py
+++ b/bq/generate_tables_and_views/analysis_results_metadata.py
@@ -21,35 +21,42 @@
import os
import json
import time
-from re import split as re_split
+from bq.bq_utilities import create_temp_table_from_df
+import pandas as pd
from google.cloud import bigquery
from utilities.bq_helpers import load_BQ_from_json
-# from bq.generate_tables_and_views.analysis_results_metadata.schema import analysis_results_metadata_schema
-from utilities.tcia_helpers import get_all_tcia_metadata
+from utilities.tcia_helpers import get_tcia_collection_manager_data
from utilities.logging_config import successlogger, progresslogger, errlogger
-# from python_settings import settings
+
import settings
import requests
analysis_results_metadata_schema = [
- bigquery.SchemaField('ID', 'STRING', mode='REQUIRED', description='Results ID'),
- bigquery.SchemaField('Title', 'STRING', mode='REQUIRED', description='Descriptive title'),
- bigquery.SchemaField('source_doi','STRING', mode='NULLABLE', description='DOI that can be resolved at doi.org to a wiki page'),
+ bigquery.SchemaField('analysis_result_name', 'STRING', mode='REQUIRED', description='Analysis result name as used externally by IDC webapp'),
+ bigquery.SchemaField('analysis_result_id', 'STRING', mode='REQUIRED', description='Analysis result ID as used internally by IDC webapp'),
+ bigquery.SchemaField('analysis_result_title', 'STRING', mode='REQUIRED', description='Descriptive title of this analysis result'),
+ bigquery.SchemaField('source_doi','STRING', mode='NULLABLE', description='DOI that can be resolved at doi.org to an information page'),
bigquery.SchemaField('source_url','STRING', mode='REQUIRED', description='URL of a wiki page'),
- bigquery.SchemaField('CancerTypes','STRING', mode='REQUIRED', description='Type(s) of cancer analyzed'),
- bigquery.SchemaField('TumorLocations', 'STRING', mode='REQUIRED', description='Body location that was analyzed'),
- bigquery.SchemaField('Subjects', 'INTEGER', mode='REQUIRED', description='Number of subjects whose data was analyzed'),
- bigquery.SchemaField('Collections', 'STRING', mode='REQUIRED', description='collection_names of original data collections analyzed'),
- bigquery.SchemaField('Modalities', 'STRING', mode='REQUIRED', description='Modalities of this analysis result'),
- bigquery.SchemaField('Updated', 'DATE', mode='REQUIRED', description='Most recent update reported by TCIA'),
+ bigquery.SchemaField('cancer_types','STRING', mode='REQUIRED', description='Type(s) of cancer analyzed'),
+ bigquery.SchemaField('tumor_locations', 'STRING', mode='REQUIRED', description='Body location that was analyzed'),
+ bigquery.SchemaField('subjects', 'INTEGER', mode='REQUIRED', description='Number of subjects whose data was analyzed'),
+ bigquery.SchemaField('collections', 'STRING', mode='REQUIRED', description='collection_names of original data collections analyzed'),
+ bigquery.SchemaField('modalities', 'STRING', mode='REQUIRED', description='Modalities of this analysis result'),
+ bigquery.SchemaField('updated', 'DATE', mode='REQUIRED', description='Most recent update reported by TCIA'),
bigquery.SchemaField('license_url', 'STRING', mode='REQUIRED', description='URL of license of this analysis result'),
bigquery.SchemaField('license_long_name', 'STRING', mode='REQUIRED', description='Long name of license of this analysis result'),
bigquery.SchemaField('license_short_name', 'STRING', mode='REQUIRED', description='Short name of license of this analysis result'),
- bigquery.SchemaField('Description', 'STRING', mode='REQUIRED',
- description='Analysis result description'),
- bigquery.SchemaField('Citation', 'STRING', mode='NULLABLE',
- description='Citation to be used for this source'),
- bigquery.SchemaField('Access', 'STRING', mode='REQUIRED', description='Deprecated: Access is always Public'),
+ bigquery.SchemaField('description', 'STRING', mode='REQUIRED',
+ description='Description of this analysis result'),
+ bigquery.SchemaField('citation', 'STRING', mode='NULLABLE',
+ description='Citation to be used for this analysis result'),
+ # Deprecations
+ bigquery.SchemaField('ID', 'STRING', mode='REQUIRED',
+ description='DEPRECATED: Duplicate of analysis_result_name'),
+ bigquery.SchemaField('Title', 'STRING', mode='REQUIRED', description='DEPRECATED: Duplicate of analysis_result_title'),
+ bigquery.SchemaField('CancerTypes', 'STRING', mode='REQUIRED', description='DEPRECATED: Duplicate of cancer_types'),
+ bigquery.SchemaField('TumorLocations', 'STRING', mode='REQUIRED', description='DEPRECATED: Duplicate of tumor_locations'),
+ bigquery.SchemaField('Access', 'STRING', mode='REQUIRED', description='DEPRECATED: Access is always Public'),
]
@@ -62,41 +69,24 @@ def get_descriptions(client,args):
return descriptions
-def get_idc_sourced_analysis_metadata(client):
+def get_idc_sourced_analysis_result_metadata(client):
query = f"""
-- SELECT DISTINCT ID, Title, Access, source_doi, Updated
- SELECT DISTINCT ID, Title, source_doi, Updated
+ SELECT DISTINCT analysis_result_name, analysis_result_title, source_doi, updated
FROM `{settings.DEV_PROJECT}.{settings.BQ_DEV_INT_DATASET}.analysis_results_metadata_idc_source`
"""
- results = [dict(row) for row in client.query(query).result()]
- metadata = {row["source_doi"]:row for row in results}
+ metadata = [dict(row) for row in client.query(query).result()]
+ # metadata = {row["source_doi"]:row for row in results}
return metadata
# Get a list of subjects per source_doi
def get_citation(source_url):
- temp_citations = {
-
-# "https://doi.org/10.5281/zenodo.15643312": "Krishnaswamy, D., Bridge, C., Clunie, D., & Fedorov, A. (2025). PROSTATEx-Targets: Point annotations of lesion targets for the PROSTATEx collection[Dataset]. Zenodo. https://doi.org/10.5281/zenodo.15643312",
-# "https://doi.org/10.5281/zenodo.15643334": "Krishnaswamy, D., Bridge, C., Clunie, D., & Fedorov, A. (2025). NLST-Sybil: Expert annotations of tumor regions in the NLST CT images[Dataset]. Zenodo. https://doi.org/10.5281/zenodo.15643334",
-# "https://doi.org/10.5281/zenodo.16989819": "Krishnaswamy, D., Bridge, C., Clunie, D., & Fedorov, A. (2025). Lung-PET-CT-Dx-Annotations: Expert annotation of lung tumors for the Lung-PET-CT-Dx collection[Dataset]. Zenodo. https://doi.org/10.5281/zenodo.16989819",
-#
-# "https://doi.org/10.5281/zenodo.17362624": "Krishnaswamy, D., Clunie, D., & Fedorov, A. (2025). NLSTSeg: Expert lesion segmentations and radiomics features for NLST CT images[Dataset]. Zenodo. https://doi.org/10.5281/zenodo.17362624",
-# "https://doi.org/10.5281/zenodo.17470190": "Bridge, C., Zheng, Y., Gevaert, O., Clunie, D., & Fedorov, A. (2025). TCGA-GBM360: GBM360 aggressiveness maps for a subset of TCGA pathology slides[Dataset]. Zenodo. https://doi.org/10.5281/zenodo.17470190",
-# "https://doi.org/10.5281/zenodo.16966285": "Bridge, C., Abousamra, S., Saltz, J., Gupta, R., Kurc, T., Zhang, Y., Zhao, T., Batiste, R., Samaras, D., Bremer, E., Shroyer, K. R., Nguyen, V., Singh, P., Hou, L., Le, H., Van Arnam, J., Shmulevich, I., \
-# Rao, A. U. K., Lazar, A. J., Sharma, A., Thorsson, V., Shankar, A., Chen, C., Clunie, D., & Fedorov, A. (2025). TCGA-SBU-TIL-Maps: AI-derived Tumor Infiltrating Lymphocyte maps for the TCGA collections[Dataset]. Zenodo. https://doi.org/10.5281/zenodo.16966285"
-
- }
-
- # header = {"Accept": "text/x-bibliography; style=apa"}
header = {"Accept": "text/x-bibliography; style=elsevier-vancouver-no-et-al"}
citation = requests.get(source_url, headers=header).text
if citation.startswith(""):
- try:
- citation = temp_citations[source_url]
- except:
- breakpoint()
-
+ errlogger.error(f'No citation for {source_url}')
+ exit(1)
return citation
@@ -108,36 +98,21 @@ def get_citation(source_url):
# 3. Get cumulative metadata for each remaining AR
def get_analysis_results_metadata(client, analysis_metadata):
schema = [
- bigquery.SchemaField("ID", "STRING", mode="REQUIRED"),
- bigquery.SchemaField("Title", "STRING", mode="REQUIRED"),
- # bigquery.SchemaField("Access", "STRING", mode="REQUIRED"),
+ bigquery.SchemaField("analysis_result_name", "STRING", mode="REQUIRED"),
+ bigquery.SchemaField("analysis_result_title", "STRING", mode="REQUIRED"),
bigquery.SchemaField("source_doi", "STRING", mode="REQUIRED"),
- bigquery.SchemaField("Updated", "STRING", mode="REQUIRED"),
+ bigquery.SchemaField("updated", "STRING", mode="REQUIRED"),
]
- table_id = 'gen_analysis_results_metadata'
- table_ref = client.dataset('whc_dev').table(table_id)
- table = bigquery.Table(table_ref, schema=schema)
- client.delete_table(table, not_found_ok=True)
- table = client.create_table(table,["schema"])
- client.update_table(table,["schema"])
- try:
- errors = client.insert_rows_json(table_ref, [v for k,v in analysis_metadata.items()])
+ table_name = 'gen_analysis_results_metadata'
+ table_id = f"{settings.DEV_PROJECT}.{settings.BQ_DEV_INT_DATASET}.{table_name}"
+ df = pd.DataFrame(analysis_metadata)
+ create_temp_table_from_df(client, table_id, schema, df, expire_in_minutes=30)
- except:
- try:
- # Known bug. Need to try to insert twice.
- errors = client.insert_rows_json(table_ref, [v for k, v in analysis_metadata.items()])
- except Exception as exc:
- print("Encountered errors while inserting rows:", exc)
- exit(1)
- if errors != []:
- print("Encountered errors while inserting rows:")
- exit(1)
query = f"""
WITH ocm AS (
-- Flatten
SELECT * except(Updated, sources, CancerTypes, TumorLocations, Subjects)
- FROM `idc-dev-etl.{settings.BQ_DEV_EXT_DATASET}.original_collections_metadata` ocm,
+ FROM `{settings.DEV_PROJECT}.{settings.BQ_DEV_EXT_DATASET}.original_collections_metadata` ocm,
UNNEST(sources) AS srcs, UNNEST(SPLIT(CancerTypes, ',')) as CTypes,
UNNEST(SPLIT(TumorLocations, ',')) TLocations,
UNNEST(SPLIT(srcs.ImageTypes, ',')) ITypes
@@ -146,23 +121,32 @@ def get_analysis_results_metadata(client, analysis_metadata):
s1 AS (
SELECT DISTINCT garm.*, ocm.collection_id, ocm.CTypes, ocm.TLocations, ocm.ITypes
FROM ocm
- JOIN `idc-dev-etl.whc_dev.{table_id}` garm
+ JOIN `{settings.DEV_PROJECT}.{settings.BQ_DEV_INT_DATASET}.{table_name}` garm
ON ocm.source_doi = garm.source_doi
)
-SELECT DISTINCT ID, Title, s1.source_doi source_doi, CONCAT("https://doi.org/",s1.source_doi) source_url,
+SELECT DISTINCT
+ etl_functions.name_to_id(analysis_result_name) analysis_result_id,
+ analysis_result_name,
+ analysis_result_title,
+ s1.source_doi source_doi,
+ CONCAT("https://doi.org/", s1.source_doi) source_url,
+ STRING_AGG(DISTINCT TRIM(s1.CTypes, ' '), ", " ORDER BY TRIM(CTypes, ' ')) cancer_types,
+ STRING_AGG(DISTINCT TRIM(TLocations, ' '), ", " ORDER BY TRIM(TLocations, ' ')) tumor_locations,
+ COUNT(DISTINCT ajpac.submitter_case_id) subjects,
+ STRING_AGG( DISTINCT TRIM(s1.collection_id, ' '), ", " ORDER BY TRIM(s1.collection_id, ' ')) collections,
+ STRING_AGG( DISTINCT TRIM(Modality, ' '), ", " ORDER BY TRIM(Modality, ' ')) modalities,
+ updated,
+ license_url, license_long_name, license_short_name,
+ "Public" Access,
+ analysis_result_name ID,
+ analysis_result_title Title,
STRING_AGG(DISTINCT TRIM(s1.CTypes, ' '), ", " ORDER BY TRIM(CTypes, ' ')) CancerTypes,
STRING_AGG(DISTINCT TRIM(TLocations, ' '), ", " ORDER BY TRIM(TLocations, ' ')) TumorLocations,
- COUNT(DISTINCT ajpac.submitter_case_id) Subjects,
- STRING_AGG( DISTINCT TRIM(s1.collection_id, ' '), ", " ORDER BY TRIM(s1.collection_id, ' ')) Collections,
- STRING_AGG( DISTINCT TRIM(Modality, ' '), ", " ORDER BY TRIM(Modality, ' ')) Modalities,
- Updated,
- license_url, license_long_name, license_short_name,
- "Public" Access
FROM s1
- JOIN `idc-dev-etl.{settings.BQ_DEV_INT_DATASET}.all_joined_public_and_current` ajpac
+ JOIN `{settings.DEV_PROJECT}.{settings.BQ_DEV_INT_DATASET}.all_joined_public_and_current` ajpac
ON s1.source_doi = ajpac.source_doi
- JOIN `idc-dev-etl.{settings.BQ_DEV_EXT_DATASET}.dicom_metadata` dm
+ JOIN `{settings.DEV_PROJECT}.{settings.BQ_DEV_EXT_DATASET}.dicom_metadata` dm
ON ajpac.sop_instance_uid = dm.SOPInstanceUID
GROUP BY ID, Title, source_doi, source_url, Updated, license_url, license_long_name, license_short_name
ORDER BY ID
@@ -171,23 +155,26 @@ def get_analysis_results_metadata(client, analysis_metadata):
results = {row['source_doi']:dict(row) for row in client.query(query)}
return results
-def get_tcia_sourced_analysis_metadata(BQ_client):
- tcia_ars = get_all_tcia_metadata('analysis-results')
- ar_metadata = {}
+def get_tcia_sourced_analysis_result_metadata(BQ_client):
+ tcia_ars = get_tcia_collection_manager_data('analysis-results')
+ ar_metadata = []
for ar in tcia_ars:
- ar_metadata[ar['result_doi'].lower()] = dict(
- ID = ar['result_short_title'],
- Title = ar['result_title'],
+ ar_metadata.append(
+ dict(
+ analysis_result_name = ar['result_short_title'],
+ analysis_result_title = ar['result_title'],
# Access = ar['result_page_accessibility'],
source_doi = ar['result_doi'].lower(),
- Updated = ar['date_updated'],
+ updated = ar['date_updated'],
+ )
)
return ar_metadata
def build_metadata(args, BQ_client):
- tcia_analysis_metadata = get_tcia_sourced_analysis_metadata(BQ_client)
- idc_analysis_metadata = get_idc_sourced_analysis_metadata(BQ_client)
- all_analysis_results = idc_analysis_metadata | tcia_analysis_metadata
+ # Get some basic metadata for each tcia-sourced and idc-sourced analysis result
+ tcia_analysis_metadata = get_tcia_sourced_analysis_result_metadata(BQ_client)
+ idc_analysis_metadata = get_idc_sourced_analysis_result_metadata(BQ_client)
+ all_analysis_results = idc_analysis_metadata + tcia_analysis_metadata
analysis_results_metadata = get_analysis_results_metadata(BQ_client, all_analysis_results)
# Get analysis results descriptions
@@ -195,31 +182,31 @@ def build_metadata(args, BQ_client):
rows = []
for source_doi, analysis_data in analysis_results_metadata.items():
- # analysis_data['source_url'] = f'https://doi.org/{analysis_data["source_doi"]}'
try:
- analysis_data['Description'] = descriptions[analysis_data['ID']]
+ analysis_data['description'] = descriptions[analysis_data['ID']]
except Exception as exc:
errlogger.error(f'No description found for {analysis_data["ID"]}')
analysis_data['Citation'] = get_citation(analysis_data['source_url'])
rows.append(json.dumps(analysis_data))
- metadata = '\n'.join(rows)
- return metadata
+ return rows
def gen_table(args):
BQ_client = bigquery.Client(project=settings.DEV_PROJECT)
- metadata = build_metadata(args, BQ_client)
+ rows = build_metadata(args, BQ_client)
+ metadata = '\n'.join(rows)
try:
job = load_BQ_from_json(BQ_client, settings.DEV_PROJECT, settings.BQ_DEV_EXT_DATASET, args.bqtable_name, metadata, analysis_results_metadata_schema,
write_disposition='WRITE_TRUNCATE',
table_description='Metadata of Analysis Results. These are the results of analysis performed against Original Collections hosted by IDC.')
+ while not job.state == 'DONE':
+ progresslogger.info('Status: {}'.format(job.state))
+ time.sleep(args.period * 60)
+ successlogger.info(f"{time.asctime()}: Completed {args.bqtable_name}")
+ return
except Exception as exc:
print(f'Error {exc}')
- exit
- while not job.state == 'DONE':
- progresslogger.info('Status: {}'.format(job.state))
- time.sleep(args.period * 60)
- successlogger.info(f"{time.asctime()}: Completed {args.bqtable_name}")
+ exit(1)
if __name__ == '__main__':
parser =argparse.ArgumentParser()
diff --git a/bq/generate_tables_and_views/analysis_results_metadata_idc_source.py b/bq/generate_tables_and_views/analysis_results_metadata_idc_source.py
index cd958c9..6390cfc 100644
--- a/bq/generate_tables_and_views/analysis_results_metadata_idc_source.py
+++ b/bq/generate_tables_and_views/analysis_results_metadata_idc_source.py
@@ -18,7 +18,7 @@
# spreadsheet in Google Drive
import settings
import argparse
-from bq.utilities import json_file_to_bq
+from bq.bq_utilities import json_file_to_bq
if __name__ == '__main__':
parser = argparse.ArgumentParser()
@@ -29,5 +29,5 @@
args = parser.parse_args()
print('args: {}'.format(args))
- json_file_path = f"{settings.PROJECT_PATH}/bq/generate_tables_and_views/table_generation_jsons/idc_analysis_results_metadata.json"
+ json_file_path = f"{settings.PROJECT_PATH}/bq/generate_tables_and_views/table_generation_jsons/idc_analysis_results_metadata.json5"
json_file_to_bq(args, json_file_path)
\ No newline at end of file
diff --git a/bq/generate_tables_and_views/analysis_results_tooltip_descriptions.py b/bq/generate_tables_and_views/analysis_results_tooltip_descriptions.py
index ac77681..531cb4c 100644
--- a/bq/generate_tables_and_views/analysis_results_tooltip_descriptions.py
+++ b/bq/generate_tables_and_views/analysis_results_tooltip_descriptions.py
@@ -22,7 +22,7 @@
import pandas as pd
from google.cloud import bigquery
import markdownify
-from bq.utilities import read_json_to_dataframe, dataframe_to_bq
+from bq.bq_utilities import read_json_to_dataframe, dataframe_to_bq
import re
diff --git a/bq/generate_tables_and_views/auxiliary_metadata_table/schema.py b/bq/generate_tables_and_views/auxiliary_metadata_table/schema.py
index a263c3d..68ec0eb 100644
--- a/bq/generate_tables_and_views/auxiliary_metadata_table/schema.py
+++ b/bq/generate_tables_and_views/auxiliary_metadata_table/schema.py
@@ -51,19 +51,17 @@
bigquery.SchemaField('study_hash', 'STRING', mode='NULLABLE', description='md5 hash of the data in the this version of the study containing this instance'),
bigquery.SchemaField('study_init_idc_version', 'INTEGER', mode='NULLABLE', description='The IDC version in which the study containing this instance first appeared'),
bigquery.SchemaField('study_revised_idc_version', 'INTEGER', mode='NULLABLE', description='The IDC version in which this version of the study containing this instance first appeared'),
- bigquery.SchemaField('study_final_idc_version', 'INTEGER', mode='NULLABLE', description='The IDC version in which this version of the study containing this instance last appeared. If 0, thise is the current version.'),
bigquery.SchemaField('SeriesInstanceUID', 'STRING', mode='NULLABLE', description='DICOM series containing this instance'),
bigquery.SchemaField('series_uuid', 'STRING', mode='NULLABLE', description='UUID of this version of the series containing this instance'),
bigquery.SchemaField('series_gcs_url', 'STRING', mode='NULLABLE', description='URL of the Google Cloud Storage (GCS) folder of the series containing this instance'),
bigquery.SchemaField('series_aws_url', 'STRING', mode='NULLABLE', description='URL of the Amazon Web Services (AWS) folder of the series containing this instance'),
- bigquery.SchemaField('Source_DOI', 'STRING', mode='NULLABLE', description='The DOI of a wiki page that describes the original collection or analysis result that includes this instance'),
- bigquery.SchemaField('Source_URL', 'STRING', mode='NULLABLE', description='The URL of a wiki page that describes the original collection or analysis result that includes this instance'),
+ bigquery.SchemaField('source_DOI', 'STRING', mode='NULLABLE', description='The DOI of a wiki page that describes the original collection or analysis result that includes this instance'),
+ bigquery.SchemaField('source_URL', 'STRING', mode='NULLABLE', description='The URL of a wiki page that describes the original collection or analysis result that includes this instance'),
bigquery.SchemaField('versioned_Source_DOI', 'STRING', mode='NULLABLE', description='If present, the DOI of a wiki page that describes the original collection or analysis result that includes this version of this instance'),
bigquery.SchemaField('series_instances', 'INTEGER', mode='NULLABLE', description='Number of instances in the version of the study containing this instance'),
bigquery.SchemaField('series_hash', 'STRING', mode='NULLABLE', description='md5 hash of the data in the this version of the series containing this instance'),
bigquery.SchemaField('series_init_idc_version', 'INTEGER', mode='NULLABLE', description='The IDC version in which the series containing this instance first appeared'),
bigquery.SchemaField('series_revised_idc_version', 'INTEGER', mode='NULLABLE', description='The IDC version in which this version of the series containing this instance first appeared'),
- bigquery.SchemaField('series_final_idc_version', 'INTEGER', mode='NULLABLE', description='The IDC version in which this version of the series containing this instance last appeared. If 0, thise is the current version.'),
bigquery.SchemaField('SOPInstanceUID', 'STRING', mode='NULLABLE', description='DICOM instance containing this instance version'),
bigquery.SchemaField('instance_uuid', 'STRING', mode='NULLABLE', description='UUID of this version of this instance'),
bigquery.SchemaField('gcs_url', 'STRING', mode='NULLABLE', description='URL of the Google Cloud Storage (GCS) object containing the current version of this instance' ),
@@ -72,16 +70,14 @@
bigquery.SchemaField('aws_bucket', 'STRING', mode='NULLABLE', description='Name to the Amazon Web Services (AWS) bucket containing the current version of this instance'),
bigquery.SchemaField('instance_size', 'INTEGER', mode='NULLABLE', description='Size in bytes of this version of this instance'),
bigquery.SchemaField('instance_hash', 'STRING', mode='NULLABLE', description='md5 hash of the data in the this version of this instance'),
- # bigquery.SchemaField('instance_source', 'STRING', mode='NULLABLE', description='Source of the instance, either tcia or idc'),
bigquery.SchemaField('instance_init_idc_version', 'INTEGER', mode='NULLABLE', description='The IDC version in which this instance first appeared'),
bigquery.SchemaField('instance_revised_idc_version', 'INTEGER', mode='NULLABLE', description='The IDC version in which this instance first appeared'),
bigquery.SchemaField('license_url', 'STRING', mode='NULLABLE', description='URL of license of this instance'),
bigquery.SchemaField('license_long_name', 'STRING', mode='NULLABLE', description='Long name of license of this instance'),
bigquery.SchemaField('license_short_name', 'STRING', mode='NULLABLE', description='Short name of license of this instance'),
- bigquery.SchemaField('instance_final_idc_version', 'INTEGER', mode='NULLABLE', description='DEPRECATED: The IDC version in which this instance last appeared. If 0, thise is the current version.'),
+ bigquery.SchemaField('study_final_idc_version', 'INTEGER', mode='NULLABLE', description='DEPRECATED: The IDC version in which this version of the study containing this instance last appeared. If 0, this is the current version.'),
+ bigquery.SchemaField('series_final_idc_version', 'INTEGER', mode='NULLABLE', description='DEPRECATED: The IDC version in which this version of the series containing this instance last appeared. If 0, this is the current version.'),
+ bigquery.SchemaField('instance_final_idc_version', 'INTEGER', mode='NULLABLE', description='DEPRECATED: The IDC version in which this instance last appeared. If 0, this is the current version.'),
bigquery.SchemaField('submitter_case_id', 'STRING', mode='NULLABLE', description='DEPRECATED: Identical to column PatientID'),
bigquery.SchemaField('access', 'STRING', mode='NULLABLE', description='DEPRECATED (all data is Public): Collection access status: Public or Limited'),
- # bigquery.SchemaField('tcia_api_collection_id', 'STRING', mode='NULLABLE', description='DEPRECATED: Collection name as used externally by IDC webapp'),
- # bigquery.SchemaField('idc_webapp_collection_id', 'STRING', mode='NULLABLE', description='DEPRECATED: Collection ID as used internally by IDC webapp and accepted by the IDC API'),
-
]
diff --git a/bq/generate_tables_and_views/collection_program_map.py b/bq/generate_tables_and_views/collection_program_map.py
index b93c6b9..e5b4aa5 100644
--- a/bq/generate_tables_and_views/collection_program_map.py
+++ b/bq/generate_tables_and_views/collection_program_map.py
@@ -23,7 +23,7 @@
import settings
import argparse
-from utilities.tcia_helpers import get_all_tcia_metadata
+from utilities.tcia_helpers import get_tcia_collection_manager_data
from google.cloud import bigquery
SCHEMA = [
@@ -44,7 +44,7 @@ def gen_table(args):
# Get a list of the program of each TCIA sourced collection
tcia_programs = [(row['collection_short_title'].lower().replace('-', '_').replace(' ', '_'),
row['program'][0]) if type(row['program']) == list else "" for
- row in get_all_tcia_metadata(type="collections", query_param="&_fields=collection_short_title,program") \
+ row in get_tcia_collection_manager_data(type="collections", query_param="&_fields=collection_short_title,program") \
if row['collection_short_title'] != 'TEST-PAGE']
all_programs = idc_programs
diff --git a/bq/generate_tables_and_views/derived_tables/BQ_Table_Building/derived_data_views/schema/dicom_all.json b/bq/generate_tables_and_views/derived_tables/BQ_Table_Building/derived_data_views/schema/dicom_all.json
index 5a66fe9..8d71562 100644
--- a/bq/generate_tables_and_views/derived_tables/BQ_Table_Building/derived_data_views/schema/dicom_all.json
+++ b/bq/generate_tables_and_views/derived_tables/BQ_Table_Building/derived_data_views/schema/dicom_all.json
@@ -65,7 +65,7 @@
"type": "STRING"
},
{
- "description": "Collection access status: Public or Limited",
+ "description": "DEPRECATED: Collection access status: Public or Limited",
"mode": "NULLABLE",
"name": "access",
"type": "STRING"
@@ -94,6 +94,12 @@
"name": "patient_revised_idc_version",
"type": "STRING"
},
+ {
+ "description": "UUID of this version of study containing this instance",
+ "mode": "NULLABLE",
+ "name": "crdc_study_uuid",
+ "type": "STRING"
+ },
{
"description": "md5 hash of the data in the this version of the study containing this instance",
"mode": "NULLABLE",
@@ -115,7 +121,7 @@
{
"description": "UUID of this version of series containing this instance",
"mode": "NULLABLE",
- "name": "series_uuid",
+ "name": "crdc_series_uuid",
"type": "STRING"
},
{
@@ -151,7 +157,7 @@
{
"description": "UUID of this version of this instance",
"mode": "NULLABLE",
- "name": "instance_uuid",
+ "name": "crdc_instance_uuid",
"type": "STRING"
},
{
@@ -244,24 +250,6 @@
"name": "license_short_name",
"type": "STRING"
},
- {
- "description": "DEPRECATED: UUID of this version of study containing this instance",
- "mode": "NULLABLE",
- "name": "crdc_study_uuid",
- "type": "STRING"
- },
- {
- "description": "DEPRECATED: Duplicate of series_uuid",
- "mode": "NULLABLE",
- "name": "crdc_series_uuid",
- "type": "STRING"
- },
- {
- "description": "DEPRECATED: Duplicate of instance_uuid",
- "mode": "NULLABLE",
- "name": "crdc_instance_uuid",
- "type": "STRING"
- },
{
"description": "TBD",
"mode": "NULLABLE",
diff --git a/bq/generate_tables_and_views/doi_to_access.py b/bq/generate_tables_and_views/doi_to_access.py
index 22e8d53..c2aa1c6 100644
--- a/bq/generate_tables_and_views/doi_to_access.py
+++ b/bq/generate_tables_and_views/doi_to_access.py
@@ -18,7 +18,7 @@
#
import settings
import argparse
-from bq.utilities import json_file_to_bq
+from bq.bq_utilities import json_file_to_bq
if __name__ == '__main__':
parser = argparse.ArgumentParser()
diff --git a/bq/generate_tables_and_views/gen_licenses_table.py b/bq/generate_tables_and_views/gen_licenses_table.py
index ba23635..c85da5b 100644
--- a/bq/generate_tables_and_views/gen_licenses_table.py
+++ b/bq/generate_tables_and_views/gen_licenses_table.py
@@ -25,7 +25,7 @@
import settings
from google.cloud import bigquery
from utilities.bq_helpers import load_BQ_from_json, delete_BQ_Table
-from utilities.tcia_helpers import get_all_tcia_metadata
+from utilities.tcia_helpers import get_tcia_collection_manager_data
from utilities.logging_config import progresslogger, errlogger
import pandas as pd
@@ -61,10 +61,10 @@
def get_tcia_original_collection_licenses(client, args, tcia_downloads_metadata):
# Get all the collection manager collections data:
try:
- tcia_collection_metadata = {row['collection_short_title']:row for row in get_all_tcia_metadata('collections')}
+ tcia_collection_metadata = {row['collection_short_title']:row for row in get_tcia_collection_manager_data('collections')}
except Exception as exc:
pass
- tcia_license_metadata = {row['license_label']:row for row in get_all_tcia_metadata('licenses')}
+ tcia_license_metadata = {row['license_label']:row for row in get_tcia_collection_manager_data('licenses')}
tcia_licenses = []
for collection_name, collection_metadata in tcia_collection_metadata.items():
@@ -123,9 +123,9 @@ def get_tcia_dois(client, args):
# These are licenses of analysis results sourced from TCIA and therefore TCIA sets the licenses
def get_tcia_analysis_results_licenses(client, args, tcia_downloads_metadata):
# Get TCIA Collection Manager analysis-results metadata of all TCIA analysis results
- all_tcia_analysis_results_metadata = {row['result_short_title']: row for row in get_all_tcia_metadata('analysis-results')}
+ all_tcia_analysis_results_metadata = {row['result_short_title']: row for row in get_tcia_collection_manager_data('analysis-results')}
# Get all the download and license info from the collection manager.
- tcia_license_metadata = {row['license_label']:row for row in get_all_tcia_metadata('licenses')}
+ tcia_license_metadata = {row['license_label']:row for row in get_tcia_collection_manager_data('licenses')}
tcia_licenses = []
# Get the license for each AR that IDC has.
@@ -199,7 +199,7 @@ def construct_licenses_table(args):
idc_collection_licenses = get_idc_collection_licences(args)
idc_analysis_results_licenses = get_idc_analysis_results_licences(args)
- tcia_downloads_metadata = {row['id']:row for row in get_all_tcia_metadata('downloads')}
+ tcia_downloads_metadata = {row['id']:row for row in get_tcia_collection_manager_data('downloads')}
tcia_analysis_results_licenses = get_tcia_analysis_results_licenses(client, args, tcia_downloads_metadata)
tcia_collection_licenses = get_tcia_original_collection_licenses(client, args, tcia_downloads_metadata)
all_licenses = pd.concat([idc_collection_licenses, idc_analysis_results_licenses, tcia_collection_licenses, tcia_analysis_results_licenses])
diff --git a/bq/generate_tables_and_views/metadata_sunset.py b/bq/generate_tables_and_views/metadata_sunset.py
index ee9da6c..b863b33 100644
--- a/bq/generate_tables_and_views/metadata_sunset.py
+++ b/bq/generate_tables_and_views/metadata_sunset.py
@@ -18,7 +18,7 @@
# spreadsheet in Google Drive
import settings
import argparse
-from bq.utilities import json_file_to_bq
+from bq.bq_utilities import json_file_to_bq
if __name__ == '__main__':
diff --git a/bq/generate_tables_and_views/obsolete/licenses.py b/bq/generate_tables_and_views/obsolete/licenses.py
index 834b367..50050ee 100644
--- a/bq/generate_tables_and_views/obsolete/licenses.py
+++ b/bq/generate_tables_and_views/obsolete/licenses.py
@@ -25,7 +25,7 @@
import settings
from google.cloud import bigquery
from utilities.bq_helpers import load_BQ_from_json, delete_BQ_Table
-from utilities.tcia_helpers import get_all_tcia_metadata
+from utilities.tcia_helpers import get_tcia_collection_manager_data
from utilities.logging_config import progresslogger, errlogger
LICENSE_NAME_MAP = {
@@ -128,11 +128,11 @@ def get_idc_sourced_collection_licenses(client):
def get_tcia_original_collection_licenses(client, args, tcia_sourced_subcollections):
# Get all the collection manager collections data:
try:
- tcia_collection_metadata = {row['collection_short_title']:row for row in get_all_tcia_metadata('collections')}
+ tcia_collection_metadata = {row['collection_short_title']:row for row in get_tcia_collection_manager_data('collections')}
except Exception as exc:
pass
- tcia_downloads_metadata = {row['id']:row for row in get_all_tcia_metadata('downloads')}
- tcia_licese_metadata = {row['license_label']:row for row in get_all_tcia_metadata('licenses')}
+ tcia_downloads_metadata = {row['id']:row for row in get_tcia_collection_manager_data('downloads')}
+ tcia_licese_metadata = {row['license_label']:row for row in get_tcia_collection_manager_data('licenses')}
tcia_licenses = []
for collection_id, values in tcia_sourced_subcollections.items():
@@ -233,18 +233,18 @@ def get_tcia_analysis_results_licenses(client, args):
idc_ar_dois = get_tcia_dois(client, args)
# Get all the collection manager collections data
- tcia_collection_metadata = {row['collection_short_title']: row for row in get_all_tcia_metadata('collections')}
+ tcia_collection_metadata = {row['collection_short_title']: row for row in get_tcia_collection_manager_data('collections')}
# Get TCIA Collection Manager analysis-results metadata of all TCIA analysis results
- all_tcia_analysis_results_metadata = get_all_tcia_metadata('analysis-results')
+ all_tcia_analysis_results_metadata = get_tcia_collection_manager_data('analysis-results')
# Keep only the metadata of analysis results which IDC has
# These are the only ARs for which we need licenses
tcia_ar_metadata = {row['result_short_title']:row for row in all_tcia_analysis_results_metadata \
if row['result_doi'].lower() in idc_ar_dois}
# Get all the download and license info from the collection manager.
- tcia_downloads_metadata = {row['id']:row for row in get_all_tcia_metadata('downloads')}
- tcia_license_metadata = {row['license_label']:row for row in get_all_tcia_metadata('licenses')}
+ tcia_downloads_metadata = {row['id']:row for row in get_tcia_collection_manager_data('downloads')}
+ tcia_license_metadata = {row['license_label']:row for row in get_tcia_collection_manager_data('licenses')}
tcia_licenses = []
# Get the license for each AR that IDC has.
diff --git a/bq/generate_tables_and_views/original_collections_end_user_descriptions.py b/bq/generate_tables_and_views/original_collections_end_user_descriptions.py
index 373ad96..aaf7e07 100644
--- a/bq/generate_tables_and_views/original_collections_end_user_descriptions.py
+++ b/bq/generate_tables_and_views/original_collections_end_user_descriptions.py
@@ -22,7 +22,7 @@
import pandas as pd
from google.cloud import bigquery
import markdownify
-from bq.utilities import read_json_to_dataframe, dataframe_to_bq
+from bq.bq_utilities import read_json_to_dataframe, dataframe_to_bq
# Get the descriptions of collections that are only sourced from IDC
diff --git a/bq/generate_tables_and_views/original_collections_metadata/gen_original_collection_metadata.py b/bq/generate_tables_and_views/original_collections_metadata/gen_original_collection_metadata.py
index 297a6d1..e7e7a93 100644
--- a/bq/generate_tables_and_views/original_collections_metadata/gen_original_collection_metadata.py
+++ b/bq/generate_tables_and_views/original_collections_metadata/gen_original_collection_metadata.py
@@ -22,10 +22,10 @@
from google.cloud import bigquery
from utilities.bq_helpers import load_BQ_from_json, delete_BQ_Table
from bq.generate_tables_and_views.original_collections_metadata.schema import data_collections_metadata_schema
-from utilities.tcia_helpers import get_all_tcia_metadata
+from utilities.tcia_helpers import get_tcia_collection_manager_data
from utilities.logging_config import progresslogger, errlogger
from python_settings import settings
-from bq.utilities import read_json_to_dataframe
+from bq.bq_utilities import read_json_to_dataframe
import requests
import pandas as pd
@@ -36,10 +36,10 @@ def add_programs(client, args, collection_metadata):
programs = {row['collection_id'].lower(): row['program'] for row in client.query(query).result()}
for collection_name, metadata in collection_metadata.items():
try:
- metadata["Program"] = programs[metadata['collection_id']]
+ metadata["program"] = programs[metadata['collection_id']]
except Exception as exc:
errlogger.error(f'No program for {collection_name}')
- metadata["Program"] = ""
+ metadata["program"] = ""
return collection_metadata
@@ -80,33 +80,34 @@ def add_case_counts(client, args, collection_metadata):
case_counts = {c['collection_id']: c['cases'] for c in client.query(query).result()}
for collection in collection_metadata:
try:
- collection_metadata[collection]['Subjects'] = case_counts[collection]
+ collection_metadata[collection]['subjects'] = case_counts[collection]
except Exception as exc:
errlogger.error(f'No case counts for {collection}')
- collection_metadata[collection]['Subjects'] = ""
+ collection_metadata[collection]['subjects'] = ""
return collection_metadata
-# Generate a per-collection list of the modalities across all instances in each collection
-def add_image_modalities(client, args, collection_metadata):
- query = f"""
- SELECT DISTINCT
- REPLACE(REPLACE(LOWER(collection_id),'-','_'),' ','_') AS idc_webapp_collection_id,
- STRING_AGG(DISTINCT modality, ", " ORDER BY modality) ImageTypes
- FROM
- `idc-dev-etl.{settings.BQ_DEV_INT_DATASET}.all_joined_public_and_current`
- JOIN
- `idc-dev-etl.idc_v{settings.CURRENT_VERSION}_pub.dicom_metadata`
- ON
- sop_instance_uid = SOPInstanceUID
- GROUP BY
- idc_webapp_collection_id
- ORDER BY
- idc_webapp_collection_id """
-
- imageTypes = {c['idc_webapp_collection_id'].lower().replace(' ','_').replace('-','_'): c['ImageTypes'] for c in client.query(query).result()}
- for collection in collection_metadata:
- collection_metadata[collection]['ImageTypes'] = imageTypes[collection]
+# # Generate a per-collection list of the modalities across all instances in each collection
+# def add_image_modalities(client, args, collection_metadata):
+# query = f"""
+# SELECT DISTINCT
+# REPLACE(REPLACE(LOWER(collection_id),'-','_'),' ','_') AS idc_webapp_collection_id,
+# STRING_AGG(DISTINCT modality, ", " ORDER BY modality) ImageTypes
+# FROM
+# `idc-dev-etl.{settings.BQ_DEV_INT_DATASET}.all_joined_public_and_current`
+# JOIN
+# `idc-dev-etl.idc_v{settings.CURRENT_VERSION}_pub.dicom_metadata`
+# ON
+# sop_instance_uid = SOPInstanceUID
+# GROUP BY
+# idc_webapp_collection_id
+# ORDER BY
+# idc_webapp_collection_id """
+#
+# imageTypes = {c['idc_webapp_collection_id'].lower().replace(' ','_').replace('-','_'): c['ImageTypes'] for c in client.query(query).result()}
+# for collection in collection_metadata:
+# collection_metadata[collection]['modalities'] = imageTypes[collection]
+# collection_metadata[collection]['ImageTypes'] = imageTypes[collection]
return collection_metadata
@@ -125,16 +126,20 @@ def get_original_collections_metadata_idc_source(client, args):
idc_only_metadata[row['collection_name']] = dict(
collection_name=row['collection_name'],
collection_id=row['collection_id'],
- Title=row['title'],
- CancerTypes=row['CancerTypes'],
- TumorLocations=row['TumorLocations'],
+ collection_title=row['title'],
+ cancer_types=row['CancerTypes'],
+ tumor_locations=row['TumorLocations'],
# Subjects = 0,
- Species=row['Species'],
- Sources=[],
+ species=row['Species'],
+ sources=[],
+ supporting_data=row['SupportingData'],
+ status=row['Status'],
+ updated = None,
+ # Deprecations
+ Title = row['title'],
+ CancerTypes = row['CancerTypes'],
+ TumorLocations = row['TumorLocations'],
SupportingData=row['SupportingData'],
- Status=row['Status'],
- Updated = None
-# Updated=row['Updated'] if row['Updated'] != 'NA' else None,
)
return idc_only_metadata
@@ -156,7 +161,7 @@ def get_citation(source_url):
def get_original_collections_metadata_tcia_source(client, args, idc_collections):
- tcia_collection_metadata = get_all_tcia_metadata('collections')
+ tcia_collection_metadata = get_tcia_collection_manager_data('collections')
metadata = {}
for collection_name, values in idc_collections.items():
# Find the collection manager entry corresponding to a collection that IDC has
@@ -165,32 +170,34 @@ def get_original_collections_metadata_tcia_source(client, args, idc_collections)
if collection_name == collection['collection_short_title'])
except Exception as exc:
errlogger.error(f'No collection manager data for {collection_name}')
- id_map = {
- 'ACRIN-NSCLC-FDG-PET': 'ACRIN 6668',
- 'CT COLONOGRAPHY': 'ACRIN 6664',
- 'Prostate-Anatomical-Edge-Cases': 'Prostate Anatomical Edge Cases',
- 'QIN-BREAST': 'QIN-Breast'
- }
+ exit(1)
try:
metadata[collection_name] = dict(
collection_name=collection_name,
collection_id=values['collection_id'],
+ collection_title=collection_metadata['collection_title'],
+ cancer_types=", ".join(collection_metadata['cancer_types']) \
+ if isinstance(collection_metadata['cancer_types'], list) else '',
+ tumor_locations=", ".join(collection_metadata['cancer_locations']) \
+ if isinstance(collection_metadata['cancer_locations'], list) else '',
+ subjects=0,
+ species=", ".join(collection_metadata['species']) \
+ if isinstance(collection_metadata['species'], list) else '',
+ sources = [],
+ supporting_data=", ".join(collection_metadata['supporting_data']) \
+ if isinstance(collection_metadata['supporting_data'], list) else '',
+ status=collection_metadata['collection_status'],
+ updated=None,
+ # Deprecations
Title=collection_metadata['collection_title'],
CancerTypes=", ".join(collection_metadata['cancer_types']) \
if isinstance(collection_metadata['cancer_types'], list) else '',
TumorLocations=", ".join(collection_metadata['cancer_locations']) \
if isinstance(collection_metadata['cancer_locations'], list) else '',
- Subjects=0,
- Species=", ".join(collection_metadata['species']) \
- if isinstance(collection_metadata['species'], list) else '',
- Sources = [],
SupportingData=", ".join(collection_metadata['supporting_data']) \
if isinstance(collection_metadata['supporting_data'], list) else '',
- Status=collection_metadata['collection_status'],
- Updated=None
- # Updated=collection_metadata['date_updated'].split('T')[0]
- )
+ )
except Exception as exc:
print(exc)
@@ -210,7 +217,7 @@ def get_idc_sourced_analysis_results_metadata():
return idc_sourced_original_collections_metadata
-# Get all the collections in this version.
+# Get metadata of all the collections in this version.
# For each collection, determine whether collection level metadata (as opposed to per-source metadata) is sourced
# from tcia or idc. We get this collection level metadata from tcia if we get radiology or pathology or both from
# tcia. Otherwise, we get collection level metadata from idc maintained table/file.
@@ -238,8 +245,6 @@ def get_collection_metadata(client, args):
# Merge the TCIA collection metadata.
collection_metadata = tcia_sourced_collections
- # for collection_id, metadata in idc_and_tcia_collection_metadata.items():
- # collection_metadata[collection_id]['Sources'].extend(metadata['Sources'])
collection_metadata |= idc_only_collections
return collection_metadata
@@ -258,7 +263,7 @@ def add_descriptions(client, args, collection_metadata):
)
for collection, metadata in collection_metadata.items():
try:
- metadata['Description'] = descriptions[collection.lower().replace('-','_').replace(' ','_')]['description']
+ metadata['description'] = descriptions[collection.lower().replace('-','_').replace(' ','_')]['description']
except Exception as exc:
errlogger.error(f'No description for {collection}: {exc}')
# collection_metadata[collection]['Description'] = ""
@@ -276,18 +281,18 @@ def add_licenses(client, doi, collection_metadata):
licenses = {row['source_doi']: row['license'] for row in client.query(query)}
for collection, metadata in collection_metadata.items():
- for source in metadata['Sources']:
+ for source in metadata['sources']:
try:
- source["License"] = licenses[source['source_doi']]
+ source["license"] = licenses[source['source_doi']]
except Exception as exc:
errlogger.error(f'No license for {collection}, {source["source_doi"]}: {exc}')
- source["License"] = {"license_doi": "", "license_long_name": "", "license_short_name": ""}
+ source["license"] = {"license_doi": "", "license_long_name": "", "license_short_name": ""}
progresslogger.info('Added licenses')
return collection_metadata
def add_citations(collection_metadata):
for collection, data in collection_metadata.items():
- for source in data['Sources']:
+ for source in data['sources']:
if source['source_doi']:
try:
citation = get_citation(source['source_url'])
@@ -296,7 +301,7 @@ def add_citations(collection_metadata):
citation = source['source_url']
else:
citation = source['source_url']
- source['Citation'] = citation
+ source['citation'] = citation
progresslogger.info('Added citations')
return collection_metadata
@@ -321,8 +326,9 @@ def add_modalities(client, collection_metadata):
modalities[collection_name] = {row['source_url']: row['modalities']}
for collection_name, metadata in collection_metadata.items():
- for source in metadata['Sources']:
+ for source in metadata['sources']:
try:
+ source['modalities'] = modalities[collection_name][source['source_url'].lower()]
source['ImageTypes'] = modalities[collection_name][source['source_url'].lower()]
except:
errlogger.error(f'No modality for {collection_name}, {source["source_url"].lower()}')
@@ -341,10 +347,10 @@ def add_updates(client,collection_metadata):
timestamps = {c['collection_name']:c['version_timestamp'] for c in client.query(query).result()}
for collection_name in collection_metadata:
try:
- collection_metadata[collection_name]['Updated'] = timestamps[collection_name]
+ collection_metadata[collection_name]['updated'] = timestamps[collection_name]
except Exception as exc:
errlogger.error(f'No timestamp for {collection_name}')
- collection_metadata[collection_name]['Updated'] = ""
+ collection_metadata[collection_name]['updated'] = ""
progresslogger.info('Added updates')
return collection_metadata
@@ -355,49 +361,53 @@ def add_sources(client, collection_metadata):
FROM `{settings.DEV_PROJECT}.{settings.BQ_DEV_INT_DATASET}.all_joined_public_and_current`
"""
for row in client.query(query).result():
- collection_metadata[row['collection_name']]['Sources'].append(
+ collection_metadata[row['collection_name']]['sources'].append(
{
- "ID": "",
- "Type": "",
- "Access": "Public",
+ "source_id": "",
+ "source_type": "",
"source_doi": row["source_doi"],
"source_url": f'https://doi.org/{row["source_doi"]}',
- "ImageTypes": "",
- "License": {
+ "modalities": "",
+ "license": {
"license_url": "",
"license_long_name": "",
"license_short_name": ""
},
- "Citation": ""}
+ "citation": "",
+ "access": "Public",
+ "ImageTypes": "",
+ }
)
return collection_metadata
def add_ids(client, collection_metadata):
+
+ # Build a dictionary of source_id and type across all collections and analysis results
tcia_collection_metadata = { data['collection_doi'].lower(): {
- "ID": data['collection_short_title'],
- "Type": "original data"
- } for data in get_all_tcia_metadata('collections')}
+ "source_id": data['collection_short_title'].lower().replace('-', '_'),
+ "source_type": "original data"
+ } for data in get_tcia_collection_manager_data('collections')}
tcia_analysis_results_metadata = {data['result_doi'].lower(): {
- "ID": data['result_short_title'],
- "Type": 'analysis result'
- } for data in get_all_tcia_metadata('analysis-results')}
+ "source_id": data['result_short_title'].lower().replace('-', '_'),
+ "source_type": 'analysis result'
+ } for data in get_tcia_collection_manager_data('analysis-results')}
idc_collection_metadata = {data['source_doi'].lower(): {
- "ID": data['collection_name'],
- "Type": "original data"
+ "source_id": data['collection_id'],
+ "source_type": "original data"
} for index, data in read_json_to_dataframe(f'{settings.PROJECT_PATH}/bq/generate_tables_and_views/table_generation_jsons/idc_original_collections_metadata.json5').iterrows()}
idc_analysis_results_metadata = {data['source_doi'].lower(): {
- "ID": data['ID'],
- "Type": "analysis result"
- } for index, data in read_json_to_dataframe(f'{settings.PROJECT_PATH}/bq/generate_tables_and_views/table_generation_jsons/idc_analysis_results_metadata.json').iterrows()}
+ "source_id": data['ID'].lower().replace('-', '_').replace(' ', '_',),
+ "source_type": "analysis result"
+ } for index, data in read_json_to_dataframe(f'{settings.PROJECT_PATH}/bq/generate_tables_and_views/table_generation_jsons/idc_analysis_results_metadata.json5').iterrows()}
source_data = tcia_collection_metadata | tcia_analysis_results_metadata | idc_collection_metadata | idc_analysis_results_metadata
for collection, metadata in collection_metadata.items():
- for source in metadata['Sources']:
+ for source in metadata['sources']:
try:
- source['ID'] = source_data[source['source_doi']]['ID']
- source['Type'] = source_data[source['source_doi']]['Type']
+ source['source_id'] = source_data[source['source_doi']]['source_id']
+ source['source_type'] = source_data[source['source_doi']]['source_type']
except:
- errlogger.error(f'No ID for {collection}:{source["source_doi"]}')
+ errlogger.error(f'No source_id for {collection}:{source["source_doi"]}')
return collection_metadata
def build_metadata(client, args):
@@ -425,14 +435,14 @@ def gen_collections_table(args):
if args.use_cached_metadata:
with open(args.cached_metadata_file) as f:
- metadata = json.load(f)
+ all_metadata = json.load(f)
else:
- metadata = build_metadata(BQ_client, args)
+ all_metadata = build_metadata(BQ_client, args)
with open(args.cached_metadata_file, 'w') as f:
- json.dump(metadata, f)
+ json.dump(all_metadata, f)
# Drop any collections that do not have any sources. This is probably only needed during development
- metadata = [row for row in metadata if len(row['Sources']) > 0]
+ metadata = [row for row in all_metadata if len(row['sources']) > 0]
pass
metadata_json = '\n'.join([json.dumps(row) for row in
sorted(metadata, key=lambda d: d['collection_name'])])
@@ -443,10 +453,10 @@ def gen_collections_table(args):
settings.DEV_PROJECT,
settings.BQ_DEV_EXT_DATASET if args.access=='Public' else settings.BQ_DEV_INT_DATASET , args.bqtable_name, metadata_json,
data_collections_metadata_schema, write_disposition='WRITE_TRUNCATE')
- pass
+ return
except Exception as exc:
errlogger.error(f'Table creation failed: {exc}')
- exit
+ exit
if __name__ == '__main__':
diff --git a/bq/generate_tables_and_views/original_collections_metadata/schema.py b/bq/generate_tables_and_views/original_collections_metadata/schema.py
index a886eb8..899f10f 100644
--- a/bq/generate_tables_and_views/original_collections_metadata/schema.py
+++ b/bq/generate_tables_and_views/original_collections_metadata/schema.py
@@ -20,25 +20,26 @@
data_collections_metadata_schema = [
bigquery.SchemaField('collection_name', 'STRING', mode='REQUIRED', description='Collection name as used externally by IDC webapp'),
bigquery.SchemaField('collection_id', 'STRING', mode='REQUIRED', description='Collection ID as used internally by IDC webapp'),
- bigquery.SchemaField('Title', 'STRING', mode='REQUIRED', description='Collection ID as used internally by IDC webapp'),
- bigquery.SchemaField('CancerTypes','STRING', mode='REQUIRED', description='Cancer type of this collection '),
- bigquery.SchemaField('TumorLocations','STRING', mode='REQUIRED', description='Body location that was studied'),
- bigquery.SchemaField('Subjects', 'INTEGER', mode='REQUIRED', description='Number of subjects in collection'),
- bigquery.SchemaField('Species', 'STRING', mode='REQUIRED', description="Species of collection subjects"),
+ bigquery.SchemaField('collection_title', 'STRING', mode='REQUIRED',
+ description='Descriptive title of this collection'),
+ bigquery.SchemaField('cancer_types', 'STRING', mode='REQUIRED', description='Cancer types in this collection '),
+ bigquery.SchemaField('tumor_locations', 'STRING', mode='REQUIRED',
+ description='Tumor locations in this collection'),
+ bigquery.SchemaField('subjects', 'INTEGER', mode='REQUIRED', description='Number of subjects in this collection'),
+ bigquery.SchemaField('species', 'STRING', mode='REQUIRED', description="Species of collection subjects"),
bigquery.SchemaField(
- "Sources",
+ "sources",
"RECORD",
mode="REPEATED",
fields=[
- bigquery.SchemaField('ID', 'STRING', mode='NULLABLE', description='Original collection or Analysis result ID'),
- bigquery.SchemaField('Type', 'STRING', mode='NULLABLE', description='Original collection or Analysis result'),
- bigquery.SchemaField('Access', 'STRING', mode='NULLABLE', description='Limited or Public'),
+ bigquery.SchemaField('source_id', 'STRING', mode='NULLABLE', description='collection_id or analysis_result_id of this source'),
+ bigquery.SchemaField('source_type', 'STRING', mode='NULLABLE', description='"original collection" or "analysis result"'),
bigquery.SchemaField('source_doi', 'STRING', mode='NULLABLE',
description='DOI that can be resolved at doi.org to a information page of this source'),
bigquery.SchemaField('source_url', 'STRING', mode='REQUIRED',
- description='URL of source information page'),
- bigquery.SchemaField('ImageTypes', 'STRING', mode='NULLABLE',
- description='Enumeration of types/modalities of instances from this source'),
+ description='URL of the information page of this source'),
+ bigquery.SchemaField('modalities', 'STRING', mode='NULLABLE',
+ description='Enumeration of modalities of instances from this source'),
bigquery.SchemaField(
"license",
"RECORD",
@@ -51,20 +52,25 @@
description='Short name of license of this (sub)collection')
]
),
- bigquery.SchemaField('Citation', 'STRING', mode='NULLABLE',
+ bigquery.SchemaField('citation', 'STRING', mode='NULLABLE',
description='Citation to be used for this source'),
+ bigquery.SchemaField('access', 'STRING', mode='NULLABLE', description='DEPRECATED: All IDC data is public'),
+ bigquery.SchemaField('ImageTypes', 'STRING', mode='NULLABLE',
+ description='DEPRECATED: Duplicate of modalities'),
],
description='Array of metadata for each source of instance data in this collection'
),
- bigquery.SchemaField('SupportingData', 'STRING', mode='NULLABLE', description='Type(s) of addional available data'),
- bigquery.SchemaField('Program', 'STRING', mode='REQUIRED', description='Program to which this collection belongs'),
- bigquery.SchemaField('Status', 'STRING', mode='NULLABLE', description='Collection status: Ongoing or Complete'),
- bigquery.SchemaField('Updated', 'DATE', mode='NULLABLE', description='Date of most recent update'),
- bigquery.SchemaField('Description', 'STRING', mode='REQUIRED', description='Description of collection (HTML format)'),
- # bigquery.SchemaField('DOI', 'STRING', mode='NULLABLE',
- # description='DEPRECATED: Duplicate of source_doi'),
- # bigquery.SchemaField('URL', 'STRING', mode='NULLABLE', description='DEPRECATED: Duplicate of source_url'),
- # bigquery.SchemaField('CancerType', 'STRING', mode='NULLABLE', description='DEPRECATED: Duplicate of CancerTypes '),
- # bigquery.SchemaField('Location', 'STRING', mode='NULLABLE',
- # description='DEPRECATED: Duplicate of TumorLocations'),
+ bigquery.SchemaField('supporting_data', 'STRING', mode='NULLABLE', description='Type(s) of additional available data'),
+ bigquery.SchemaField('program', 'STRING', mode='REQUIRED', description='Program to which this collection belongs'),
+ bigquery.SchemaField('status', 'STRING', mode='NULLABLE', description='Collection status: Ongoing or Complete'),
+ bigquery.SchemaField('updated', 'DATE', mode='NULLABLE', description='Date of most recent update'),
+ bigquery.SchemaField('description', 'STRING', mode='REQUIRED', description='Description of collection (HTML format)'),
+ # Deprecations
+ bigquery.SchemaField('Title', 'STRING', mode='REQUIRED',
+ description='DEPRECATED: Duplicate of collection_title'),
+ bigquery.SchemaField('CancerTypes', 'STRING', mode='REQUIRED', description='DEPRECATED: Duplicate of cancer_types'),
+ bigquery.SchemaField('TumorLocations', 'STRING', mode='REQUIRED',
+ description='DEPRECATED: Duplicate of tumor_locations'),
+ bigquery.SchemaField('SupportingData', 'STRING', mode='NULLABLE',
+ description='DEPRECATED: Duplicate of supporting_data'),
]
\ No newline at end of file
diff --git a/bq/generate_tables_and_views/original_collections_metadata_idc_source.py b/bq/generate_tables_and_views/original_collections_metadata_idc_source.py
index 75db769..83ca4fc 100644
--- a/bq/generate_tables_and_views/original_collections_metadata_idc_source.py
+++ b/bq/generate_tables_and_views/original_collections_metadata_idc_source.py
@@ -18,7 +18,7 @@
# spreadsheet in Google Drive
import settings
import argparse
-from bq.utilities import json_file_to_bq
+from bq.bq_utilities import json_file_to_bq
if __name__ == '__main__':
parser = argparse.ArgumentParser()
diff --git a/bq/generate_tables_and_views/original_collections_tooltip_descriptions.py b/bq/generate_tables_and_views/original_collections_tooltip_descriptions.py
index 1a2e58d..da430da 100644
--- a/bq/generate_tables_and_views/original_collections_tooltip_descriptions.py
+++ b/bq/generate_tables_and_views/original_collections_tooltip_descriptions.py
@@ -22,7 +22,7 @@
import pandas as pd
from google.cloud import bigquery
import markdownify
-from bq.utilities import read_json_to_dataframe, dataframe_to_bq
+from bq.bq_utilities import read_json_to_dataframe, dataframe_to_bq
import re
diff --git a/bq/generate_tables_and_views/program_metadata.py b/bq/generate_tables_and_views/program_metadata.py
index ca48ba8..731e71d 100644
--- a/bq/generate_tables_and_views/program_metadata.py
+++ b/bq/generate_tables_and_views/program_metadata.py
@@ -14,3 +14,54 @@
# limitations under the License.
#
+
+# This script generates the BQ program_metadata table.
+import argparse
+import sys
+import json
+from google.cloud import bigquery
+import hashlib
+
+import settings
+from utilities.bq_helpers import load_BQ_from_json
+from utilities.logging_config import successlogger, errlogger
+from bq.bq_utilities import get_data_from_comet
+
+version_metadata_schema = [
+ bigquery.SchemaField('program_name', 'STRING', mode='REQUIRED', description='Short program name'),
+ bigquery.SchemaField('program_id', 'STRING', mode='REQUIRED', description="Lower cased short program name"),
+ bigquery.SchemaField('program_title', 'STRING', mode='REQUIRED', description='Descriptive program title'),
+ bigquery.SchemaField('program_url', 'STRING', mode='REQUIRED', description='URL of program information page'),
+ bigquery.SchemaField('program_description', 'STRING', mode='REQUIRED', description='Brief program description'),
+ ]
+
+
+
+def gen_version_metadata_table(args):
+ client = bigquery.Client(project=args.src_project)
+ rows = get_data_from_comet(args.path, branch=args.comet_branch)
+ for row in rows:
+ row["program_url"] = "None" if row["program_url"] is None else row["program_url"]
+ metadata_json = '\n'.join([json.dumps(row) for row in
+ sorted(rows, key=lambda d: d['program_name'])])
+ try:
+ job = load_BQ_from_json(client, args.dst_project, args.bqdataset_name, args.bqtable_name, metadata_json,
+ version_metadata_schema, write_disposition='WRITE_TRUNCATE')
+ successlogger.info('program_metadata table generation completed')
+ return
+ except Exception as exc:
+ errlogger.info(f'Error creating BQ table; {exc}')
+ exit(1)
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser()
+ parser.add_argument('--version', default=settings.CURRENT_VERSION, help='Max IDC version for which to build the table')
+ parser.add_argument('--src_project', default='idc-dev-etl')
+ parser.add_argument('--dst_project', default='idc-dev-etl')
+ parser.add_argument('--bqdataset_name', default=f'idc_v{settings.CURRENT_VERSION}_pub', help='BQ dataset name')
+ parser.add_argument('--bqtable_name', default=f'program_metadata', help='BQ table name')
+ parser.add_argument('--comet_branch', default='release/v24', help="idc_comet github branch")
+ parser.add_argument("--path", default="vocabularies/programs.yaml", help="Path from branch to file")
+
+ args = parser.parse_args()
+ print("{}".format(args), file=sys.stdout)
+ gen_version_metadata_table(args)
\ No newline at end of file
diff --git a/bq/generate_tables_and_views/table_generation_jsons/idc_analysis_results_metadata.json5 b/bq/generate_tables_and_views/table_generation_jsons/idc_analysis_results_metadata.json5
index f46c50e..004331e 100644
--- a/bq/generate_tables_and_views/table_generation_jsons/idc_analysis_results_metadata.json5
+++ b/bq/generate_tables_and_views/table_generation_jsons/idc_analysis_results_metadata.json5
@@ -1,142 +1,142 @@
[
/*
{
- "ID": "template",
- "Title": "",
+ "analysis_result_name": "template",
+ "analysis_result_title": "",
"source_doi": "",
- "current versioned_source_doi": "",
- "Updated": "",
+ "current_versioned_source_doi": "",
+ "updated": "",
"license_url": "",
"license_long_name": "",
"license_short_name": ""
},
*/
{
- "ID": "BAMF-AIMI-Annotations",
- "Title": "Image segmentations produced by the AIMI Annotations initiative",
+ "analysis_result_name": "BAMF-AIMI-Annotations",
+ "analysis_result_title": "Image segmentations produced by the AIMI Annotations initiative",
"source_doi": "10.5281/zenodo.8345959",
- "current versioned_source_doi": "",
- "Updated": "2023-11-07",
+ "current_versioned_source_doi": "",
+ "updated": "2023-11-07",
"license_url": "https://creativecommons.org/licenses/by/4.0/",
"license_long_name": "Creative Commons Attribution 4.0 International License",
"license_short_name": "CC BY 4.0"
},
{
- "ID": "Lung-PET-CT-Dx-Annotations",
- "Title": "Expert annotation of lung tumors for the Lung-PET-CT-Dx collection",
+ "analysis_result_name": "Lung-PET-CT-Dx-Annotations",
+ "analysis_result_title": "Expert annotation of lung tumors for the Lung-PET-CT-Dx collection",
"source_doi": "10.5281/zenodo.16989819",
- "current versioned_source_doi": "10.5281/zenodo.16989820",
- "Updated": "2024-02-24",
+ "current_versioned_source_doi": "10.5281/zenodo.16989820",
+ "updated": "2024-02-24",
"license_url": "https://creativecommons.org/licenses/by/4.0/",
"license_long_name": "Creative Commons Attribution 4.0 International License",
"license_short_name": "CC BY 4.0"
},
{
- "ID": "NLST-Sybil",
- "Title": "Expert annotations of tumor regions in the NLST CT images",
+ "analysis_result_name": "NLST-Sybil",
+ "analysis_result_title": "Expert annotations of tumor regions in the NLST CT images",
"source_doi": "10.5281/zenodo.15643334",
- "current versioned_source_doi": "10.5281/zenodo.15643335",
- "Updated": "2024-02-24",
+ "current_versioned_source_doi": "10.5281/zenodo.15643335",
+ "updated": "2024-02-24",
"license_url": "https://creativecommons.org/licenses/by/4.0/",
"license_long_name": "Creative Commons Attribution 4.0 International License",
"license_short_name": "CC BY 4.0"
},
{
- "ID": "NLSTSeg",
- "Title": "Expert tumor segmentations and radiomics features for NLST CT images",
+ "analysis_result_name": "NLSTSeg",
+ "analysis_result_title": "Expert tumor segmentations and radiomics features for NLST CT images",
"source_doi": "10.5281/zenodo.17362624",
- "current versioned_source_doi": "10.5281/zenodo.17362625",
- "Updated": "2024-02-24",
+ "current_versioned_source_doi": "10.5281/zenodo.17362625",
+ "updated": "2024-02-24",
"license_url": "https://creativecommons.org/licenses/by/4.0/",
"license_long_name": "Creative Commons Attribution 4.0 International License",
"license_short_name": "CC BY 4.0"
},
{
- "ID": "nnU-Net-BPR-annotations",
- "Title": "AI-derived annotations for the NLST and NSCLC-Radiomics computed tomography imaging collections",
+ "analysis_result_name": "nnU-Net-BPR-annotations",
+ "analysis_result_title": "AI-derived annotations for the NLST and NSCLC-Radiomics computed tomography imaging collections",
"source_doi": "10.5281/zenodo.7473970",
- "current versioned_source_doi": "",
- "Updated": "2023-05-26",
+ "current_versioned_source_doi": "",
+ "updated": "2023-05-26",
"license_url": "https://creativecommons.org/licenses/by/4.0/",
"license_long_name": "Creative Commons Attribution 4.0 International License",
"license_short_name": "CC BY 4.0"
},
{
- "ID": "Pan-Cancer-Nuclei-Seg-DICOM",
- "Title": "DICOM converted Dataset of Segmented Nuclei in Hematoxylin and Eosin Stained Histopathology Images (Pan-Cancer-Nuclei-Seg)",
+ "analysis_result_name": "Pan-Cancer-Nuclei-Seg-DICOM",
+ "analysis_result_title": "DICOM converted Dataset of Segmented Nuclei in Hematoxylin and Eosin Stained Histopathology Images (Pan-Cancer-Nuclei-Seg)",
"source_doi": "10.5281/zenodo.11099004",
- "current versioned_source_doi": "",
- "Updated": "2024-08-05",
+ "current_versioned_source_doi": "",
+ "updated": "2024-08-05",
"license_url": "https://creativecommons.org/licenses/by/3.0/",
"license_long_name": "Creative Commons Attribution 3.0 International",
"license_short_name": "CC BY 3.0"
},
{
- "ID": "Pancreas-CT-SEG",
- "Title": "DICOM converted annotations for the Pancreas-CT collection",
+ "analysis_result_name": "Pancreas-CT-SEG",
+ "analysis_result_title": "DICOM converted annotations for the Pancreas-CT collection",
"source_doi": "10.5281/zenodo.12130275",
- "current versioned_source_doi": "",
- "Updated": "2024-07-16",
+ "current_versioned_source_doi": "",
+ "updated": "2024-07-16",
"license_url": "https://creativecommons.org/licenses/by/4.0/",
"license_long_name": "Creative Commons Attribution 4.0 International License",
"license_short_name": "CC BY 4.0"
},
{
- "ID": "Prostate-MRI-US-Biopsy-DICOM-Annotations",
- "Title": "DICOM converted annotations for the Prostate-MRI-US-Biopsy collection",
+ "analysis_result_name": "Prostate-MRI-US-Biopsy-DICOM-Annotations",
+ "analysis_result_title": "DICOM converted annotations for the Prostate-MRI-US-Biopsy collection",
"source_doi": "10.5281/zenodo.10069910",
- "current versioned_source_doi": "",
- "Updated": "2023-11-03",
+ "current_versioned_source_doi": "",
+ "updated": "2023-11-03",
"license_url": "https://creativecommons.org/licenses/by/4.0/",
"license_long_name": "Creative Commons Attribution 4.0 International License",
"license_short_name": "CC BY 4.0"
},
{
- "ID": "PROSTATEx-Targets",
- "Title": "Point annotations of lesion targets for the PROSTATEx collection",
+ "analysis_result_name": "PROSTATEx-Targets",
+ "analysis_result_title": "Point annotations of lesion targets for the PROSTATEx collection",
"source_doi": "10.5281/zenodo.15643312",
- "current versioned_source_doi": "10.5281/zenodo.15643313",
- "Updated": "2024-02-24",
+ "current_versioned_source_doi": "10.5281/zenodo.15643313",
+ "updated": "2024-02-24",
"license_url": "https://creativecommons.org/licenses/by/4.0/",
"license_long_name": "Creative Commons Attribution 4.0 International License",
"license_short_name": "CC BY 4.0"
},
{
- "ID": "RMS-Mutation-Prediction-Expert-Annotations",
- "Title": "Expert annotations of the tissue types for the RMS-Mutation-Prediction microscopy images",
+ "analysis_result_name": "RMS-Mutation-Prediction-Expert-Annotations",
+ "analysis_result_title": "Expert annotations of the tissue types for the RMS-Mutation-Prediction microscopy images",
"source_doi": "10.5281/zenodo.10462857",
- "current versioned_source_doi": "",
- "Updated": "2024-02-24",
+ "current_versioned_source_doi": "",
+ "updated": "2024-02-24",
"license_url": "https://creativecommons.org/licenses/by/4.0/",
"license_long_name": "Creative Commons Attribution 4.0 International License",
"license_short_name": "CC BY 4.0"
},
{
- "ID": "TCGA-GBM360",
- "Title": "GBM360 aggressiveness maps for a subset of TCGA pathology slides",
+ "analysis_result_name": "TCGA-GBM360",
+ "analysis_result_title": "GBM360 aggressiveness maps for a subset of TCGA pathology slides",
"source_doi": "10.5281/zenodo.17470190",
- "current versioned_source_doi": "10.5281/zenodo.17470191",
- "Updated": "2024-02-24",
+ "current_versioned_source_doi": "10.5281/zenodo.17470191",
+ "updated": "2024-02-24",
"license_url": "https://creativecommons.org/licenses/by/4.0/",
"license_long_name": "Creative Commons Attribution 4.0 International License",
"license_short_name": "CC BY 4.0"
},
{
- "ID": "TCGA-SBU-TIL-Maps",
- "Title": "AI-derived Tumor Infiltrating Lymphocyte maps for the TCGA collections",
+ "analysis_result_name": "TCGA-SBU-TIL-Maps",
+ "analysis_result_title": "AI-derived Tumor Infiltrating Lymphocyte maps for the TCGA collections",
"source_doi": "10.5281/zenodo.16966285",
- "current versioned_source_doi": "10.5281/zenodo.16966285",
- "Updated": "2024-02-24",
+ "current_versioned_source_doi": "10.5281/zenodo.16966285",
+ "updated": "2024-02-24",
"license_url": "https://creativecommons.org/licenses/by/4.0/",
"license_long_name": "Creative Commons Attribution 4.0 International License",
"license_short_name": "CC BY 4.0"
},
{
- "ID": "TotalSegmentator-CT-Segmentations",
- "Title": "AI-driven enrichment of NCI Imaging Data Commons CT images with volumetric segmentations and radiomics features",
+ "analysis_result_name": "TotalSegmentator-CT-Segmentations",
+ "analysis_result_title": "AI-driven enrichment of NCI Imaging Data Commons CT images with volumetric segmentations and radiomics features",
"source_doi": "10.5281/zenodo.8347011",
- "current versioned_source_doi": "",
- "Updated": "2024-02-24",
+ "current_versioned_source_doi": "",
+ "updated": "2024-02-24",
"license_url": "https://creativecommons.org/licenses/by/4.0/",
"license_long_name": "Creative Commons Attribution 4.0 International License",
"license_short_name": "CC BY 4.0"
diff --git a/bq/generate_tables_and_views/tcia_pathology_metadata.py b/bq/generate_tables_and_views/tcia_pathology_metadata.py
index e799127..80d817b 100644
--- a/bq/generate_tables_and_views/tcia_pathology_metadata.py
+++ b/bq/generate_tables_and_views/tcia_pathology_metadata.py
@@ -20,7 +20,7 @@
import os
import sys
import json
-from utilities.tcia_helpers import get_all_tcia_metadata
+from utilities.tcia_helpers import get_tcia_collection_manager_data
from utilities.logging_config import successlogger, progresslogger, errlogger
from google.cloud import bigquery
from utilities.bq_helpers import load_BQ_from_json
@@ -91,9 +91,9 @@ def get_aspera_hash(data):
def gen_table(args):
- collections = get_all_tcia_metadata("collections")
+ collections = get_tcia_collection_manager_data("collections")
# collections = [ c for c in get_all_tcia_metadata("collections") if c['collection_page_accessibility'] == "Public"]
- downloads = get_all_tcia_metadata("downloads")
+ downloads = get_tcia_collection_manager_data("downloads")
pathology_downloads = {download['id']:download for download in downloads if download['download_type']=='Pathology Images'}
# Add the id and slug of the parent collection to each pathology_download
diff --git a/bq/generate_tables_and_views/tcia_pathology_metadata_comparison.py b/bq/generate_tables_and_views/tcia_pathology_metadata_comparison.py
index 9defd71..b370eca 100644
--- a/bq/generate_tables_and_views/tcia_pathology_metadata_comparison.py
+++ b/bq/generate_tables_and_views/tcia_pathology_metadata_comparison.py
@@ -19,7 +19,7 @@
import argparse
import sys
import json
-from utilities.tcia_helpers import get_all_tcia_metadata, get_url
+from utilities.tcia_helpers import get_tcia_collection_manager_data, get_url
from google.cloud import bigquery
from utilities.bq_helpers import load_BQ_from_json
from bq.generate_tables_and_views.original_collections_metadata.schema import data_collections_metadata_schema
diff --git a/bq/generate_tables_and_views/utils/json_to_bq_table.py b/bq/generate_tables_and_views/utils/json_to_bq_table.py
index 04e84d7..04a2b4b 100644
--- a/bq/generate_tables_and_views/utils/json_to_bq_table.py
+++ b/bq/generate_tables_and_views/utils/json_to_bq_table.py
@@ -19,7 +19,7 @@
import pandas as pd
from google.cloud import bigquery
-from bq.utilities import read_json_to_dataframe
+from bq.bq_utilities import read_json_to_dataframe
from datetime import datetime, timedelta
import pytz
diff --git a/clinical/compare_tcia_clinical_downloads.py b/clinical/compare_tcia_clinical_downloads.py
index 18d209a..1582078 100644
--- a/clinical/compare_tcia_clinical_downloads.py
+++ b/clinical/compare_tcia_clinical_downloads.py
@@ -19,7 +19,7 @@
import argparse
import sys
import json
-from utilities.tcia_helpers import get_all_tcia_metadata, get_url
+from utilities.tcia_helpers import get_tcia_collection_manager_data, get_url
from google.cloud import bigquery
from utilities.bq_helpers import load_BQ_from_json
from bq.generate_tables_and_views.original_collections_metadata.schema import data_collections_metadata_schema
diff --git a/clinical/tcia_clinical_metadata.py b/clinical/tcia_clinical_metadata.py
index 01bcf45..dc45fa4 100644
--- a/clinical/tcia_clinical_metadata.py
+++ b/clinical/tcia_clinical_metadata.py
@@ -19,7 +19,7 @@
import argparse
import sys
import json
-from utilities.tcia_helpers import get_all_tcia_metadata, get_url
+from utilities.tcia_helpers import get_tcia_collection_manager_data, get_url
from google.cloud import bigquery
from utilities.bq_helpers import load_BQ_from_json
from bq.generate_tables_and_views.original_collections_metadata.schema import data_collections_metadata_schema
@@ -79,20 +79,20 @@ def get_raw_data():
all_idc_collections = client.list_rows(client.get_table(f'{settings.DEV_PROJECT}.{settings.BQ_DEV_INT_DATASET}.all_sources')).to_dataframe()
all_idc_source_dois = all_idc_collections[['source_doi', 'Access']].copy()
# Get all TCIA collections which we also have
- all_tcia_collection_metadata = get_all_tcia_metadata("collections")
+ all_tcia_collection_metadata = get_tcia_collection_manager_data("collections")
public_tcia_collections = [ c for c in all_tcia_collection_metadata if \
c['collection_doi'].lower() in list(all_idc_source_dois['source_doi']) and \
all_idc_source_dois[all_idc_source_dois["source_doi"] == (c['collection_doi'].lower())].iloc[0]['Access'] == "Public"
]
# Get all TCIA analysis results which we also have
- all_tcia_ar_metadata = get_all_tcia_metadata('analysis-results')
+ all_tcia_ar_metadata = get_tcia_collection_manager_data('analysis-results')
public_analysis_results = [c for c in all_tcia_ar_metadata if \
c['result_doi'].lower() in list(all_idc_source_dois['source_doi']) and \
all_idc_source_dois[all_idc_source_dois["source_doi"] == (c['result_doi'].lower())].iloc[0]['Access'] == "Public"
]
# Get TCIA clinical downloads
- downloads = {d['id']: d for d in get_all_tcia_metadata("downloads")}
+ downloads = {d['id']: d for d in get_tcia_collection_manager_data("downloads")}
clinical_downloads = {id: data for id, data in downloads.items() if likely_clinical(data)}
# Associate 0 or 1 collection with each clinical download
diff --git a/ingestion/utilities/get_collection_dois_urls_licenses.py b/ingestion/utilities/get_collection_dois_urls_licenses.py
index 896f4ae..72c6151 100644
--- a/ingestion/utilities/get_collection_dois_urls_licenses.py
+++ b/ingestion/utilities/get_collection_dois_urls_licenses.py
@@ -25,7 +25,7 @@
from utilities.logging_config import errlogger
logger = logging.getLogger(__name__)
-from utilities.tcia_helpers import get_internal_series_ids, series_drill_down, get_all_tcia_metadata
+from utilities.tcia_helpers import get_internal_series_ids, series_drill_down, get_tcia_collection_manager_data
from utilities.tcia_helpers_v4 import get_TCIA_series_metadata_per_patient
from idc.models import IDC_Collection, IDC_Patient, IDC_Study, IDC_Series
from python_settings import settings
@@ -202,7 +202,7 @@ def get_patient_urls_idc(sess, collection, patient):
# data sourced from TCIA.
def get_licenses_tcia(collection, patient, third_party="no", server=""):
# license_types = get_license_info()
- license_types = {l['license_url']: l['license_label'] for l in get_all_tcia_metadata(type="licenses")}
+ license_types = {l['license_url']: l['license_label'] for l in get_tcia_collection_manager_data(type="licenses")}
series_licenses = {}
series_metadata = get_TCIA_series_metadata_per_patient(collection, patient)
for series in series_metadata:
diff --git a/preingestion/detect_tcia_collection_name_changes.py b/preingestion/detect_tcia_collection_name_changes.py
index 8381240..7a8e3e1 100644
--- a/preingestion/detect_tcia_collection_name_changes.py
+++ b/preingestion/detect_tcia_collection_name_changes.py
@@ -22,7 +22,7 @@
from idc.models import Patient, Study, Series, Collection, All_Collections
-from utilities.tcia_helpers import get_all_tcia_metadata
+from utilities.tcia_helpers import get_tcia_collection_manager_data
from sqlalchemy import and_, or_
from utilities.sqlalchemy_helpers import sa_session
from utilities.logging_config import successlogger, progresslogger, errlogger
@@ -40,8 +40,8 @@ def compare_dois():
filter(and_(or_(Series.sources == [True, False], Series.sources == [True, True]), All_Collections.access=="Public")).all()
idc_dois = {row.source_doi.lower(): row.collection_id for row in rows if row.source_doi }
- tcia_original_dois = {row['collection_doi'].lower(): row['collection_short_title'] for row in get_all_tcia_metadata(type="collections", query_param="&_fields=collection_short_title,collection_doi")}
- tcia_analysis_dois = {row['result_doi'].lower(): row['result_short_title'] for row in get_all_tcia_metadata(type="analysis-results", query_param="&_fields=result_short_title,result_doi")}
+ tcia_original_dois = {row['collection_doi'].lower(): row['collection_short_title'] for row in get_tcia_collection_manager_data(type="collections", query_param="&_fields=collection_short_title,collection_doi")}
+ tcia_analysis_dois = {row['result_doi'].lower(): row['result_short_title'] for row in get_tcia_collection_manager_data(type="analysis-results", query_param="&_fields=result_short_title,result_doi")}
for doi in idc_dois:
if idc_dois[doi] in args.ignored:
diff --git a/settings.py b/settings.py
index 01698f9..94d3b5d 100644
--- a/settings.py
+++ b/settings.py
@@ -160,3 +160,6 @@
 AWS_ACCESS_KEY_ID = os.environ.get('AWS_ACCESS_KEY_ID', '')
 AWS_SECRET_ACCESS_KEY = os.environ.get('AWS_SECRET_ACCESS_KEY', '')
+
+GITHUB_TOKEN = os.environ.get('GITHUB_TOKEN')
+
diff --git a/utilities/tcia_helpers.py b/utilities/tcia_helpers.py
index 1e88753..0813469 100644
--- a/utilities/tcia_helpers.py
+++ b/utilities/tcia_helpers.py
@@ -456,7 +456,7 @@ def get_collection_license_info():
return licenses
-def get_all_tcia_metadata(type, query_param=''):
+def get_tcia_collection_manager_data(type, query_param=''):
if query_param:
url = f"https://cancerimagingarchive.net/api/v1/{type}/?per_page=100&{query_param}"
else:
@@ -480,7 +480,7 @@ def get_all_tcia_metadata(type, query_param=''):
print('Error accessing the API:', response.status_code)
exit
-def get_all_tcia_metadata_v2(type, query_param=''):
+def get_tcia_collection_manager_data_v2(type, query_param=''):
page = 1
collections = []
while True:
@@ -505,7 +505,7 @@ def get_all_tcia_metadata_v2(type, query_param=''):
if __name__ == "__main__":
- c = get_all_tcia_metadata_v2("collections", query_param='')
+ c = get_tcia_collection_manager_data_v2("collections", query_param='')
pass
# access_token = get_access_token(auth_server=NLST_AUTH_URL)[0]