diff --git a/bq/utilities.py b/bq/bq_utilities.py similarity index 63% rename from bq/utilities.py rename to bq/bq_utilities.py index 19f1866..090f250 100644 --- a/bq/utilities.py +++ b/bq/bq_utilities.py @@ -22,8 +22,52 @@ import pytz import argparse import json5 - - +from datetime import datetime, timedelta, timezone +import requests +import yaml + + +def get_data_from_comet(path, branch="current"): + file_url = f"https://raw.githubusercontent.com/ImagingDataCommons/idc-comet/{branch}/{path}" + headers = { + "Authorization": f"token {settings.GITHUB_TOKEN}", + "Accept": "application/vnd.github.v3+json" + } + response = requests.get(file_url, headers=headers) + if response.status_code == 200: + # Parse the returned YAML content and extract the program metadata + metadata = yaml.load(StringIO(response.text), Loader=yaml.Loader) + return metadata["programs"] + else: + print(f"Failed to retrieve file. Status code: {response.status_code}") + exit(1) + +# Create a table from a data frame. The table will be deleted after the time limit expires +def create_temp_table_from_df(client, table_id, schema, df, expire_in_minutes=10): + table = bigquery.Table(table_id) + + # Set expiration to expire_in_minutes minutes from now + expiration_duration = timedelta(minutes=expire_in_minutes) + table.expires = datetime.now(timezone.utc) + expiration_duration + try: + client.create_table(table, exists_ok=True) + # print(f"Table {table_id} created/updated with expiration at {table.expires}") + except Exception as e: + print(f"Error setting table metadata: {e}") + exit(1) + + job_config = bigquery.LoadJobConfig( + schema=schema, + write_disposition="WRITE_TRUNCATE" + ) + # Load data + job = client.load_table_from_dataframe( + df, table_id, job_config=job_config + ) + job.result() # Wait for job to complete + + +# Read the file at the file path into a dataframe. 
The file is assumed to be JSON formatted def read_json_to_dataframe(file_path): with open(file_path) as f: definitions = json5.load(f) @@ -74,33 +118,6 @@ def json_file_to_bq(args, file_path, lifetime=None): return - # # Initialize the BigQuery client - # client = bigquery.Client() - # - # # Define the BigQuery table reference - # table_ref = f'{args.project}.{args.bq_dataset_id}.{args.table_id}' - # - # # Create the BigQuery table if it doesn't exist - # try: - # client.get_table(table_ref) - # except: - # table = bigquery.Table(table_ref) - # client.create_table(table) - # - # # Write the DataFrame data to BigQuery - # job_config = bigquery.LoadJobConfig(write_disposition='WRITE_TRUNCATE') - # job = client.load_table_from_dataframe(df, table_ref, job_config=job_config) - # job.result() - # - # if lifetime: - # table = client.get_table(table_ref) # Get the table object - # expiration_time = datetime.now(pytz.utc) + timedelta(minutes=lifetime) - # table.expires = expiration_time - # client.update_table(table, ["expires"]) - # - # print('Data imported successfully!') - - if __name__ == '__main__': parser = argparse.ArgumentParser() diff --git a/bq/generate_tables_and_views/all_joined.py b/bq/generate_tables_and_views/all_joined.py index 0d9b1b6..53b4017 100644 --- a/bq/generate_tables_and_views/all_joined.py +++ b/bq/generate_tables_and_views/all_joined.py @@ -23,6 +23,9 @@ import settings from google.cloud import bigquery from utilities.bq_helpers import create_BQ_dataset +from utilities.tcia_helpers import get_tcia_collection_manager_data +import pandas as pd +from bq.bq_utilities import create_temp_table_from_df # Flatten the version/collection/... hierarchy # Note that we no longer include license here as the license can change over time. 
@@ -119,7 +122,16 @@ def create_all_flattened(client): def create_all_sources(client): - table_id = f"{settings.DEV_PROJECT}.{settings.BQ_DEV_INT_DATASET}.all_sources" + # Create a temporary table of names of all TCIA analysis results + tcia_analysis_result_dois= [row['result_doi'].lower() for row in get_tcia_collection_manager_data('analysis-results')] + df = pd.DataFrame(tcia_analysis_result_dois, columns=['source_doi']) + table_id = f"{settings.DEV_PROJECT}.{settings.BQ_DEV_INT_DATASET}.tcia_analysis_result_dois" + schema = [ + bigquery.SchemaField("source_doi", "STRING") + ] + create_temp_table_from_df(client, table_id, schema, df, 60) + + query = f""" with basics as ( SELECT distinct @@ -136,17 +148,28 @@ def create_all_sources(client): ON af.source_doi = dtc.source_doi LEFT JOIN `{settings.DEV_PROJECT}.{settings.BQ_DEV_INT_DATASET}.metadata_sunset` ms ON af.source_doi = ms.source_doi +), +analysis_result_dois as ( + SELECT DISTINCT source_doi + FROM `{settings.DEV_PROJECT}.{settings.BQ_DEV_INT_DATASET}.tcia_analysis_result_dois` +UNION ALL + SELECT DISTINCT source_doi + FROM `{settings.DEV_PROJECT}.{settings.BQ_DEV_INT_DATASET}.analysis_results_metadata_idc_source` ) SELECT - *, + basics.*, + if(analysis_result_dois.source_doi IS NULL, False, True) analysis_result, if(Type='Open', 'idc-arch-open', if(Type='Cr', 'idc-arch-cr', if(Type='Defaced', 'idc-arch-defaced', if(Type='Redacted','idc-arch-redacted','idc-arch-excluded')))) dev_bucket, if(Type='Open', 'idc-open-data', if(Type='Cr', 'idc-open-cr', if(Type='Defaced', 'idc-open-idc1', NULL))) pub_gcs_bucket, if(Type='Open', 'idc-open-data', if(Type='Cr', 'idc-open-data-cr', if(Type='Defaced', 'idc-open-data-two', NULL))) pub_aws_bucket, FROM basics -- ORDER by collection_id, source_doi, dev_bucket, pub_gcs_bucket, pub_aws_bucket -ORDER by collection_id, source_doi, pub_gcs_bucket, pub_aws_bucket +LEFT JOIN analysis_result_dois +ON basics.source_doi = analysis_result_dois.source_doi +ORDER by collection_id, 
basics.source_doi, pub_gcs_bucket, pub_aws_bucket """ # Make an API request to create the view. + table_id = f"{settings.DEV_PROJECT}.{settings.BQ_DEV_INT_DATASET}.all_sources" client.delete_table(table_id, not_found_ok=True) job_config = bigquery.QueryJobConfig(destination=table_id) query_job = client.query(query,job_config=job_config) @@ -159,7 +182,7 @@ def create_all_joined(client): view = bigquery.Table(view_id) view.view_query = f""" -- SELECT af.*, ac.source, ac.Class, ac.Access, ac.metadata_sunset, ac.dev_bucket, ac.pub_gcs_bucket, ac.pub_aws_bucket -SELECT af.*, ac.source, ac.Type, ac.Access, ac.metadata_sunset, ac.dev_bucket, ac.pub_gcs_bucket, ac.pub_aws_bucket +SELECT af.*, ac.source, ac.Type, ac.Access, ac.metadata_sunset, ac.analysis_result, ac.dev_bucket, ac.pub_gcs_bucket, ac.pub_aws_bucket FROM `{settings.DEV_PROJECT}.{settings.BQ_DEV_INT_DATASET}.all_flattened` af JOIN `{settings.DEV_PROJECT}.{settings.BQ_DEV_INT_DATASET}.all_sources` ac ON af.source_doi = ac.source_doi diff --git a/bq/generate_tables_and_views/analysis_results_end_user_descriptions.py b/bq/generate_tables_and_views/analysis_results_end_user_descriptions.py index b99729e..4bf5d9d 100644 --- a/bq/generate_tables_and_views/analysis_results_end_user_descriptions.py +++ b/bq/generate_tables_and_views/analysis_results_end_user_descriptions.py @@ -22,7 +22,7 @@ import pandas as pd from google.cloud import bigquery import markdownify -from bq.utilities import read_json_to_dataframe, dataframe_to_bq +from bq.bq_utilities import read_json_to_dataframe, dataframe_to_bq # Get the descriptions of collections that are only sourced from IDC diff --git a/bq/generate_tables_and_views/analysis_results_metadata.py b/bq/generate_tables_and_views/analysis_results_metadata.py index e294f7f..74d9741 100644 --- a/bq/generate_tables_and_views/analysis_results_metadata.py +++ b/bq/generate_tables_and_views/analysis_results_metadata.py @@ -21,35 +21,42 @@ import os import json import time -from re import 
split as re_split +from bq.bq_utilities import create_temp_table_from_df +import pandas as pd from google.cloud import bigquery from utilities.bq_helpers import load_BQ_from_json -# from bq.generate_tables_and_views.analysis_results_metadata.schema import analysis_results_metadata_schema -from utilities.tcia_helpers import get_all_tcia_metadata +from utilities.tcia_helpers import get_tcia_collection_manager_data from utilities.logging_config import successlogger, progresslogger, errlogger -# from python_settings import settings + import settings import requests analysis_results_metadata_schema = [ - bigquery.SchemaField('ID', 'STRING', mode='REQUIRED', description='Results ID'), - bigquery.SchemaField('Title', 'STRING', mode='REQUIRED', description='Descriptive title'), - bigquery.SchemaField('source_doi','STRING', mode='NULLABLE', description='DOI that can be resolved at doi.org to a wiki page'), + bigquery.SchemaField('analysis_result_name', 'STRING', mode='REQUIRED', description='Analysis result name as used externally by IDC webapp'), + bigquery.SchemaField('analysis_result_id', 'STRING', mode='REQUIRED', description='Analysis result ID as used internally by IDC webapp'), + bigquery.SchemaField('analysis_result_title', 'STRING', mode='REQUIRED', description='Descriptive title of this analysis result'), + bigquery.SchemaField('source_doi','STRING', mode='NULLABLE', description='DOI that can be resolved at doi.org to an information page'), bigquery.SchemaField('source_url','STRING', mode='REQUIRED', description='URL of a wiki page'), - bigquery.SchemaField('CancerTypes','STRING', mode='REQUIRED', description='Type(s) of cancer analyzed'), - bigquery.SchemaField('TumorLocations', 'STRING', mode='REQUIRED', description='Body location that was analyzed'), - bigquery.SchemaField('Subjects', 'INTEGER', mode='REQUIRED', description='Number of subjects whose data was analyzed'), - bigquery.SchemaField('Collections', 'STRING', mode='REQUIRED', 
description='collection_names of original data collections analyzed'), - bigquery.SchemaField('Modalities', 'STRING', mode='REQUIRED', description='Modalities of this analysis result'), - bigquery.SchemaField('Updated', 'DATE', mode='REQUIRED', description='Most recent update reported by TCIA'), + bigquery.SchemaField('cancer_types','STRING', mode='REQUIRED', description='Type(s) of cancer analyzed'), + bigquery.SchemaField('tumor_locations', 'STRING', mode='REQUIRED', description='Body location that was analyzed'), + bigquery.SchemaField('subjects', 'INTEGER', mode='REQUIRED', description='Number of subjects whose data was analyzed'), + bigquery.SchemaField('collections', 'STRING', mode='REQUIRED', description='collection_names of original data collections analyzed'), + bigquery.SchemaField('modalities', 'STRING', mode='REQUIRED', description='Modalities of this analysis result'), + bigquery.SchemaField('updated', 'DATE', mode='REQUIRED', description='Most recent update reported by TCIA'), bigquery.SchemaField('license_url', 'STRING', mode='REQUIRED', description='URL of license of this analysis result'), bigquery.SchemaField('license_long_name', 'STRING', mode='REQUIRED', description='Long name of license of this analysis result'), bigquery.SchemaField('license_short_name', 'STRING', mode='REQUIRED', description='Short name of license of this analysis result'), - bigquery.SchemaField('Description', 'STRING', mode='REQUIRED', - description='Analysis result description'), - bigquery.SchemaField('Citation', 'STRING', mode='NULLABLE', - description='Citation to be used for this source'), - bigquery.SchemaField('Access', 'STRING', mode='REQUIRED', description='Deprecated: Access is always Public'), + bigquery.SchemaField('description', 'STRING', mode='REQUIRED', + description='Description of this analysis result'), + bigquery.SchemaField('citation', 'STRING', mode='NULLABLE', + description='Citation to be used for this analysis result'), + # Deprecations + 
bigquery.SchemaField('ID', 'STRING', mode='REQUIRED', + description='DEPRECATED: Duplicate of analysis_result_name'), + bigquery.SchemaField('Title', 'STRING', mode='REQUIRED', description='DEPRECATED: Duplicate of analysis_result_title'), + bigquery.SchemaField('CancerTypes', 'STRING', mode='REQUIRED', description='DEPRECATED: Duplicate of cancer_types'), + bigquery.SchemaField('TumorLocations', 'STRING', mode='REQUIRED', description='DEPRECATED: Duplicate of tumor_locations'), + bigquery.SchemaField('Access', 'STRING', mode='REQUIRED', description='DEPRECATED: Access is always Public'), ] @@ -62,41 +69,24 @@ def get_descriptions(client,args): return descriptions -def get_idc_sourced_analysis_metadata(client): +def get_idc_sourced_analysis_result_metadata(client): query = f""" -- SELECT DISTINCT ID, Title, Access, source_doi, Updated - SELECT DISTINCT ID, Title, source_doi, Updated + SELECT DISTINCT analysis_result_name, analysis_result_title, source_doi, updated FROM `{settings.DEV_PROJECT}.{settings.BQ_DEV_INT_DATASET}.analysis_results_metadata_idc_source` """ - results = [dict(row) for row in client.query(query).result()] - metadata = {row["source_doi"]:row for row in results} + metadata = [dict(row) for row in client.query(query).result()] + # metadata = {row["source_doi"]:row for row in results} return metadata # Get a list of subjects per source_doi def get_citation(source_url): - temp_citations = { - -# "https://doi.org/10.5281/zenodo.15643312": "Krishnaswamy, D., Bridge, C., Clunie, D., & Fedorov, A. (2025). PROSTATEx-Targets: Point annotations of lesion targets for the PROSTATEx collection[Dataset]. Zenodo. https://doi.org/10.5281/zenodo.15643312", -# "https://doi.org/10.5281/zenodo.15643334": "Krishnaswamy, D., Bridge, C., Clunie, D., & Fedorov, A. (2025). NLST-Sybil: Expert annotations of tumor regions in the NLST CT images[Dataset]. Zenodo. 
https://doi.org/10.5281/zenodo.15643334", -# "https://doi.org/10.5281/zenodo.16989819": "Krishnaswamy, D., Bridge, C., Clunie, D., & Fedorov, A. (2025). Lung-PET-CT-Dx-Annotations: Expert annotation of lung tumors for the Lung-PET-CT-Dx collection[Dataset]. Zenodo. https://doi.org/10.5281/zenodo.16989819", -# -# "https://doi.org/10.5281/zenodo.17362624": "Krishnaswamy, D., Clunie, D., & Fedorov, A. (2025). NLSTSeg: Expert lesion segmentations and radiomics features for NLST CT images[Dataset]. Zenodo. https://doi.org/10.5281/zenodo.17362624", -# "https://doi.org/10.5281/zenodo.17470190": "Bridge, C., Zheng, Y., Gevaert, O., Clunie, D., & Fedorov, A. (2025). TCGA-GBM360: GBM360 aggressiveness maps for a subset of TCGA pathology slides[Dataset]. Zenodo. https://doi.org/10.5281/zenodo.17470190", -# "https://doi.org/10.5281/zenodo.16966285": "Bridge, C., Abousamra, S., Saltz, J., Gupta, R., Kurc, T., Zhang, Y., Zhao, T., Batiste, R., Samaras, D., Bremer, E., Shroyer, K. R., Nguyen, V., Singh, P., Hou, L., Le, H., Van Arnam, J., Shmulevich, I., \ -# Rao, A. U. K., Lazar, A. J., Sharma, A., Thorsson, V., Shankar, A., Chen, C., Clunie, D., & Fedorov, A. (2025). TCGA-SBU-TIL-Maps: AI-derived Tumor Infiltrating Lymphocyte maps for the TCGA collections[Dataset]. Zenodo. https://doi.org/10.5281/zenodo.16966285" - - } - - # header = {"Accept": "text/x-bibliography; style=apa"} header = {"Accept": "text/x-bibliography; style=elsevier-vancouver-no-et-al"} citation = requests.get(source_url, headers=header).text if citation.startswith(""): - try: - citation = temp_citations[source_url] - except: - breakpoint() - + errlogger.error(f'No citation for {source_url}') + exit(1) return citation @@ -108,36 +98,21 @@ def get_citation(source_url): # 3. 
Get cumulative metadata for each remaining AR def get_analysis_results_metadata(client, analysis_metadata): schema = [ - bigquery.SchemaField("ID", "STRING", mode="REQUIRED"), - bigquery.SchemaField("Title", "STRING", mode="REQUIRED"), - # bigquery.SchemaField("Access", "STRING", mode="REQUIRED"), + bigquery.SchemaField("analysis_result_name", "STRING", mode="REQUIRED"), + bigquery.SchemaField("analysis_result_title", "STRING", mode="REQUIRED"), bigquery.SchemaField("source_doi", "STRING", mode="REQUIRED"), - bigquery.SchemaField("Updated", "STRING", mode="REQUIRED"), + bigquery.SchemaField("updated", "STRING", mode="REQUIRED"), ] - table_id = 'gen_analysis_results_metadata' - table_ref = client.dataset('whc_dev').table(table_id) - table = bigquery.Table(table_ref, schema=schema) - client.delete_table(table, not_found_ok=True) - table = client.create_table(table,["schema"]) - client.update_table(table,["schema"]) - try: - errors = client.insert_rows_json(table_ref, [v for k,v in analysis_metadata.items()]) + table_name = 'gen_analysis_results_metadata' + table_id = f"{settings.DEV_PROJECT}.{settings.BQ_DEV_INT_DATASET}.{table_name}" + df = pd.DataFrame(analysis_metadata) + create_temp_table_from_df(client, table_id, schema, df, expire_in_minutes=30) - except: - try: - # Known bug. Need to try to insert twice. 
- errors = client.insert_rows_json(table_ref, [v for k, v in analysis_metadata.items()]) - except Exception as exc: - print("Encountered errors while inserting rows:", exc) - exit(1) - if errors != []: - print("Encountered errors while inserting rows:") - exit(1) query = f""" WITH ocm AS ( -- Flatten SELECT * except(Updated, sources, CancerTypes, TumorLocations, Subjects) - FROM `idc-dev-etl.{settings.BQ_DEV_EXT_DATASET}.original_collections_metadata` ocm, + FROM `{settings.DEV_PROJECT}.{settings.BQ_DEV_EXT_DATASET}.original_collections_metadata` ocm, UNNEST(sources) AS srcs, UNNEST(SPLIT(CancerTypes, ',')) as CTypes, UNNEST(SPLIT(TumorLocations, ',')) TLocations, UNNEST(SPLIT(srcs.ImageTypes, ',')) ITypes @@ -146,23 +121,32 @@ def get_analysis_results_metadata(client, analysis_metadata): s1 AS ( SELECT DISTINCT garm.*, ocm.collection_id, ocm.CTypes, ocm.TLocations, ocm.ITypes FROM ocm - JOIN `idc-dev-etl.whc_dev.{table_id}` garm + JOIN `{settings.DEV_PROJECT}.{settings.BQ_DEV_INT_DATASET}.{table_name}` garm ON ocm.source_doi = garm.source_doi ) -SELECT DISTINCT ID, Title, s1.source_doi source_doi, CONCAT("https://doi.org/",s1.source_doi) source_url, +SELECT DISTINCT + etl_functions.name_to_id(analysis_result_name) analysis_result_id, + analysis_result_name, + analysis_result_title, + s1.source_doi source_doi, + CONCAT("https://doi.org/", s1.source_doi) source_url, + STRING_AGG(DISTINCT TRIM(s1.CTypes, ' '), ", " ORDER BY TRIM(CTypes, ' ')) cancer_types, + STRING_AGG(DISTINCT TRIM(TLocations, ' '), ", " ORDER BY TRIM(TLocations, ' ')) tumor_locations, + COUNT(DISTINCT ajpac.submitter_case_id) subjects, + STRING_AGG( DISTINCT TRIM(s1.collection_id, ' '), ", " ORDER BY TRIM(s1.collection_id, ' ')) collections, + STRING_AGG( DISTINCT TRIM(Modality, ' '), ", " ORDER BY TRIM(Modality, ' ')) modalities, + updated, + license_url, license_long_name, license_short_name, + "Public" Access, + analysis_result_name ID, + analysis_result_title Title, STRING_AGG(DISTINCT 
TRIM(s1.CTypes, ' '), ", " ORDER BY TRIM(CTypes, ' ')) CancerTypes, STRING_AGG(DISTINCT TRIM(TLocations, ' '), ", " ORDER BY TRIM(TLocations, ' ')) TumorLocations, - COUNT(DISTINCT ajpac.submitter_case_id) Subjects, - STRING_AGG( DISTINCT TRIM(s1.collection_id, ' '), ", " ORDER BY TRIM(s1.collection_id, ' ')) Collections, - STRING_AGG( DISTINCT TRIM(Modality, ' '), ", " ORDER BY TRIM(Modality, ' ')) Modalities, - Updated, - license_url, license_long_name, license_short_name, - "Public" Access FROM s1 - JOIN `idc-dev-etl.{settings.BQ_DEV_INT_DATASET}.all_joined_public_and_current` ajpac + JOIN `{settings.DEV_PROJECT}.{settings.BQ_DEV_INT_DATASET}.all_joined_public_and_current` ajpac ON s1.source_doi = ajpac.source_doi - JOIN `idc-dev-etl.{settings.BQ_DEV_EXT_DATASET}.dicom_metadata` dm + JOIN `{settings.DEV_PROJECT}.{settings.BQ_DEV_EXT_DATASET}.dicom_metadata` dm ON ajpac.sop_instance_uid = dm.SOPInstanceUID GROUP BY ID, Title, source_doi, source_url, Updated, license_url, license_long_name, license_short_name ORDER BY ID @@ -171,23 +155,26 @@ def get_analysis_results_metadata(client, analysis_metadata): results = {row['source_doi']:dict(row) for row in client.query(query)} return results -def get_tcia_sourced_analysis_metadata(BQ_client): - tcia_ars = get_all_tcia_metadata('analysis-results') - ar_metadata = {} +def get_tcia_sourced_analysis_result_metadata(BQ_client): + tcia_ars = get_tcia_collection_manager_data('analysis-results') + ar_metadata = [] for ar in tcia_ars: - ar_metadata[ar['result_doi'].lower()] = dict( - ID = ar['result_short_title'], - Title = ar['result_title'], + ar_metadata.append( + dict( + analysis_result_name = ar['result_short_title'], + analysis_result_title = ar['result_title'], # Access = ar['result_page_accessibility'], source_doi = ar['result_doi'].lower(), - Updated = ar['date_updated'], + updated = ar['date_updated'], + ) ) return ar_metadata def build_metadata(args, BQ_client): - tcia_analysis_metadata = 
get_tcia_sourced_analysis_metadata(BQ_client) - idc_analysis_metadata = get_idc_sourced_analysis_metadata(BQ_client) - all_analysis_results = idc_analysis_metadata | tcia_analysis_metadata + # Get some basic metadata for each tcia-sourced and idc-sourced analysis result + tcia_analysis_metadata = get_tcia_sourced_analysis_result_metadata(BQ_client) + idc_analysis_metadata = get_idc_sourced_analysis_result_metadata(BQ_client) + all_analysis_results = idc_analysis_metadata + tcia_analysis_metadata analysis_results_metadata = get_analysis_results_metadata(BQ_client, all_analysis_results) # Get analysis results descriptions @@ -195,31 +182,31 @@ def build_metadata(args, BQ_client): rows = [] for source_doi, analysis_data in analysis_results_metadata.items(): - # analysis_data['source_url'] = f'https://doi.org/{analysis_data["source_doi"]}' try: - analysis_data['Description'] = descriptions[analysis_data['ID']] + analysis_data['description'] = descriptions[analysis_data['ID']] except Exception as exc: errlogger.error(f'No description found for {analysis_data["ID"]}') analysis_data['Citation'] = get_citation(analysis_data['source_url']) rows.append(json.dumps(analysis_data)) - metadata = '\n'.join(rows) - return metadata + return rows def gen_table(args): BQ_client = bigquery.Client(project=settings.DEV_PROJECT) - metadata = build_metadata(args, BQ_client) + rows = build_metadata(args, BQ_client) + metadata = '\n'.join(rows) try: job = load_BQ_from_json(BQ_client, settings.DEV_PROJECT, settings.BQ_DEV_EXT_DATASET, args.bqtable_name, metadata, analysis_results_metadata_schema, write_disposition='WRITE_TRUNCATE', table_description='Metadata of Analysis Results. 
These are the results of analysis performed against Original Collections hosted by IDC.') + while not job.state == 'DONE': + progresslogger.info('Status: {}'.format(job.state)) + time.sleep(args.period * 60) + successlogger.info(f"{time.asctime()}: Completed {args.bqtable_name}") + return except Exception as exc: print(f'Error {exc}') - exit - while not job.state == 'DONE': - progresslogger.info('Status: {}'.format(job.state)) - time.sleep(args.period * 60) - successlogger.info(f"{time.asctime()}: Completed {args.bqtable_name}") + exit(1) if __name__ == '__main__': parser =argparse.ArgumentParser() diff --git a/bq/generate_tables_and_views/analysis_results_metadata_idc_source.py b/bq/generate_tables_and_views/analysis_results_metadata_idc_source.py index cd958c9..6390cfc 100644 --- a/bq/generate_tables_and_views/analysis_results_metadata_idc_source.py +++ b/bq/generate_tables_and_views/analysis_results_metadata_idc_source.py @@ -18,7 +18,7 @@ # spreadsheet in Google Drive import settings import argparse -from bq.utilities import json_file_to_bq +from bq.bq_utilities import json_file_to_bq if __name__ == '__main__': parser = argparse.ArgumentParser() @@ -29,5 +29,5 @@ args = parser.parse_args() print('args: {}'.format(args)) - json_file_path = f"{settings.PROJECT_PATH}/bq/generate_tables_and_views/table_generation_jsons/idc_analysis_results_metadata.json" + json_file_path = f"{settings.PROJECT_PATH}/bq/generate_tables_and_views/table_generation_jsons/idc_analysis_results_metadata.json5" json_file_to_bq(args, json_file_path) \ No newline at end of file diff --git a/bq/generate_tables_and_views/analysis_results_tooltip_descriptions.py b/bq/generate_tables_and_views/analysis_results_tooltip_descriptions.py index ac77681..531cb4c 100644 --- a/bq/generate_tables_and_views/analysis_results_tooltip_descriptions.py +++ b/bq/generate_tables_and_views/analysis_results_tooltip_descriptions.py @@ -22,7 +22,7 @@ import pandas as pd from google.cloud import bigquery import 
markdownify -from bq.utilities import read_json_to_dataframe, dataframe_to_bq +from bq.bq_utilities import read_json_to_dataframe, dataframe_to_bq import re diff --git a/bq/generate_tables_and_views/auxiliary_metadata_table/schema.py b/bq/generate_tables_and_views/auxiliary_metadata_table/schema.py index a263c3d..68ec0eb 100644 --- a/bq/generate_tables_and_views/auxiliary_metadata_table/schema.py +++ b/bq/generate_tables_and_views/auxiliary_metadata_table/schema.py @@ -51,19 +51,17 @@ bigquery.SchemaField('study_hash', 'STRING', mode='NULLABLE', description='md5 hash of the data in the this version of the study containing this instance'), bigquery.SchemaField('study_init_idc_version', 'INTEGER', mode='NULLABLE', description='The IDC version in which the study containing this instance first appeared'), bigquery.SchemaField('study_revised_idc_version', 'INTEGER', mode='NULLABLE', description='The IDC version in which this version of the study containing this instance first appeared'), - bigquery.SchemaField('study_final_idc_version', 'INTEGER', mode='NULLABLE', description='The IDC version in which this version of the study containing this instance last appeared. 
If 0, thise is the current version.'), bigquery.SchemaField('SeriesInstanceUID', 'STRING', mode='NULLABLE', description='DICOM series containing this instance'), bigquery.SchemaField('series_uuid', 'STRING', mode='NULLABLE', description='UUID of this version of the series containing this instance'), bigquery.SchemaField('series_gcs_url', 'STRING', mode='NULLABLE', description='URL of the Google Cloud Storage (GCS) folder of the series containing this instance'), bigquery.SchemaField('series_aws_url', 'STRING', mode='NULLABLE', description='URL of the Amazon Web Services (AWS) folder of the series containing this instance'), - bigquery.SchemaField('Source_DOI', 'STRING', mode='NULLABLE', description='The DOI of a wiki page that describes the original collection or analysis result that includes this instance'), - bigquery.SchemaField('Source_URL', 'STRING', mode='NULLABLE', description='The URL of a wiki page that describes the original collection or analysis result that includes this instance'), + bigquery.SchemaField('source_DOI', 'STRING', mode='NULLABLE', description='The DOI of a wiki page that describes the original collection or analysis result that includes this instance'), + bigquery.SchemaField('source_URL', 'STRING', mode='NULLABLE', description='The URL of a wiki page that describes the original collection or analysis result that includes this instance'), bigquery.SchemaField('versioned_Source_DOI', 'STRING', mode='NULLABLE', description='If present, the DOI of a wiki page that describes the original collection or analysis result that includes this version of this instance'), bigquery.SchemaField('series_instances', 'INTEGER', mode='NULLABLE', description='Number of instances in the version of the study containing this instance'), bigquery.SchemaField('series_hash', 'STRING', mode='NULLABLE', description='md5 hash of the data in the this version of the series containing this instance'), bigquery.SchemaField('series_init_idc_version', 'INTEGER', 
mode='NULLABLE', description='The IDC version in which the series containing this instance first appeared'), bigquery.SchemaField('series_revised_idc_version', 'INTEGER', mode='NULLABLE', description='The IDC version in which this version of the series containing this instance first appeared'), - bigquery.SchemaField('series_final_idc_version', 'INTEGER', mode='NULLABLE', description='The IDC version in which this version of the series containing this instance last appeared. If 0, thise is the current version.'), bigquery.SchemaField('SOPInstanceUID', 'STRING', mode='NULLABLE', description='DICOM instance containing this instance version'), bigquery.SchemaField('instance_uuid', 'STRING', mode='NULLABLE', description='UUID of this version of this instance'), bigquery.SchemaField('gcs_url', 'STRING', mode='NULLABLE', description='URL of the Google Cloud Storage (GCS) object containing the current version of this instance' ), @@ -72,16 +70,14 @@ bigquery.SchemaField('aws_bucket', 'STRING', mode='NULLABLE', description='Name to the Amazon Web Services (AWS) bucket containing the current version of this instance'), bigquery.SchemaField('instance_size', 'INTEGER', mode='NULLABLE', description='Size in bytes of this version of this instance'), bigquery.SchemaField('instance_hash', 'STRING', mode='NULLABLE', description='md5 hash of the data in the this version of this instance'), - # bigquery.SchemaField('instance_source', 'STRING', mode='NULLABLE', description='Source of the instance, either tcia or idc'), bigquery.SchemaField('instance_init_idc_version', 'INTEGER', mode='NULLABLE', description='The IDC version in which this instance first appeared'), bigquery.SchemaField('instance_revised_idc_version', 'INTEGER', mode='NULLABLE', description='The IDC version in which this instance first appeared'), bigquery.SchemaField('license_url', 'STRING', mode='NULLABLE', description='URL of license of this instance'), bigquery.SchemaField('license_long_name', 'STRING', 
mode='NULLABLE', description='Long name of license of this instance'), bigquery.SchemaField('license_short_name', 'STRING', mode='NULLABLE', description='Short name of license of this instance'), - bigquery.SchemaField('instance_final_idc_version', 'INTEGER', mode='NULLABLE', description='DEPRECATED: The IDC version in which this instance last appeared. If 0, thise is the current version.'), + bigquery.SchemaField('study_final_idc_version', 'INTEGER', mode='NULLABLE', description='DEPRECATED: The IDC version in which this version of the study containing this instance last appeared. If 0, this is the current version.'), + bigquery.SchemaField('series_final_idc_version', 'INTEGER', mode='NULLABLE', description='DEPRECATED: The IDC version in which this version of the series containing this instance last appeared. If 0, this is the current version.'), + bigquery.SchemaField('instance_final_idc_version', 'INTEGER', mode='NULLABLE', description='DEPRECATED: The IDC version in which this instance last appeared. 
If 0, this is the current version.'), bigquery.SchemaField('submitter_case_id', 'STRING', mode='NULLABLE', description='DEPRECATED: Identical to column PatientID'), bigquery.SchemaField('access', 'STRING', mode='NULLABLE', description='DEPRECATED (all data is Public): Collection access status: Public or Limited'), - # bigquery.SchemaField('tcia_api_collection_id', 'STRING', mode='NULLABLE', description='DEPRECATED: Collection name as used externally by IDC webapp'), - # bigquery.SchemaField('idc_webapp_collection_id', 'STRING', mode='NULLABLE', description='DEPRECATED: Collection ID as used internally by IDC webapp and accepted by the IDC API'), - ] diff --git a/bq/generate_tables_and_views/collection_program_map.py b/bq/generate_tables_and_views/collection_program_map.py index b93c6b9..e5b4aa5 100644 --- a/bq/generate_tables_and_views/collection_program_map.py +++ b/bq/generate_tables_and_views/collection_program_map.py @@ -23,7 +23,7 @@ import settings import argparse -from utilities.tcia_helpers import get_all_tcia_metadata +from utilities.tcia_helpers import get_tcia_collection_manager_data from google.cloud import bigquery SCHEMA = [ @@ -44,7 +44,7 @@ def gen_table(args): # Get a list of the program of each TCIA sourced collection tcia_programs = [(row['collection_short_title'].lower().replace('-', '_').replace(' ', '_'), row['program'][0]) if type(row['program']) == list else "" for - row in get_all_tcia_metadata(type="collections", query_param="&_fields=collection_short_title,program") \ + row in get_tcia_collection_manager_data(type="collections", query_param="&_fields=collection_short_title,program") \ if row['collection_short_title'] != 'TEST-PAGE'] all_programs = idc_programs diff --git a/bq/generate_tables_and_views/derived_tables/BQ_Table_Building/derived_data_views/schema/dicom_all.json b/bq/generate_tables_and_views/derived_tables/BQ_Table_Building/derived_data_views/schema/dicom_all.json index 5a66fe9..8d71562 100644 --- 
a/bq/generate_tables_and_views/derived_tables/BQ_Table_Building/derived_data_views/schema/dicom_all.json +++ b/bq/generate_tables_and_views/derived_tables/BQ_Table_Building/derived_data_views/schema/dicom_all.json @@ -65,7 +65,7 @@ "type": "STRING" }, { - "description": "Collection access status: Public or Limited", + "description": "DEPRECATED: Collection access status: Public or Limited", "mode": "NULLABLE", "name": "access", "type": "STRING" @@ -94,6 +94,12 @@ "name": "patient_revised_idc_version", "type": "STRING" }, + { + "description": "UUID of this version of study containing this instance", + "mode": "NULLABLE", + "name": "crdc_study_uuid", + "type": "STRING" + }, { "description": "md5 hash of the data in the this version of the study containing this instance", "mode": "NULLABLE", @@ -115,7 +121,7 @@ { "description": "UUID of this version of series containing this instance", "mode": "NULLABLE", - "name": "series_uuid", + "name": "crdc_series_uuid", "type": "STRING" }, { @@ -151,7 +157,7 @@ { "description": "UUID of this version of this instance", "mode": "NULLABLE", - "name": "instance_uuid", + "name": "crdc_instance_uuid", "type": "STRING" }, { @@ -244,24 +250,6 @@ "name": "license_short_name", "type": "STRING" }, - { - "description": "DEPRECATED: UUID of this version of study containing this instance", - "mode": "NULLABLE", - "name": "crdc_study_uuid", - "type": "STRING" - }, - { - "description": "DEPRECATED: Duplicate of series_uuid", - "mode": "NULLABLE", - "name": "crdc_series_uuid", - "type": "STRING" - }, - { - "description": "DEPRECATED: Duplicate of instance_uuid", - "mode": "NULLABLE", - "name": "crdc_instance_uuid", - "type": "STRING" - }, { "description": "TBD", "mode": "NULLABLE", diff --git a/bq/generate_tables_and_views/doi_to_access.py b/bq/generate_tables_and_views/doi_to_access.py index 22e8d53..c2aa1c6 100644 --- a/bq/generate_tables_and_views/doi_to_access.py +++ b/bq/generate_tables_and_views/doi_to_access.py @@ -18,7 +18,7 @@ # import 
settings import argparse -from bq.utilities import json_file_to_bq +from bq.bq_utilities import json_file_to_bq if __name__ == '__main__': parser = argparse.ArgumentParser() diff --git a/bq/generate_tables_and_views/gen_licenses_table.py b/bq/generate_tables_and_views/gen_licenses_table.py index ba23635..c85da5b 100644 --- a/bq/generate_tables_and_views/gen_licenses_table.py +++ b/bq/generate_tables_and_views/gen_licenses_table.py @@ -25,7 +25,7 @@ import settings from google.cloud import bigquery from utilities.bq_helpers import load_BQ_from_json, delete_BQ_Table -from utilities.tcia_helpers import get_all_tcia_metadata +from utilities.tcia_helpers import get_tcia_collection_manager_data from utilities.logging_config import progresslogger, errlogger import pandas as pd @@ -61,10 +61,10 @@ def get_tcia_original_collection_licenses(client, args, tcia_downloads_metadata): # Get all the collection manager collections data: try: - tcia_collection_metadata = {row['collection_short_title']:row for row in get_all_tcia_metadata('collections')} + tcia_collection_metadata = {row['collection_short_title']:row for row in get_tcia_collection_manager_data('collections')} except Exception as exc: pass - tcia_license_metadata = {row['license_label']:row for row in get_all_tcia_metadata('licenses')} + tcia_license_metadata = {row['license_label']:row for row in get_tcia_collection_manager_data('licenses')} tcia_licenses = [] for collection_name, collection_metadata in tcia_collection_metadata.items(): @@ -123,9 +123,9 @@ def get_tcia_dois(client, args): # These are licenses of analysis results sourced from TCIA and therefore TCIA sets the licenses def get_tcia_analysis_results_licenses(client, args, tcia_downloads_metadata): # Get TCIA Collection Manager analysis-results metadata of all TCIA analysis results - all_tcia_analysis_results_metadata = {row['result_short_title']: row for row in get_all_tcia_metadata('analysis-results')} + all_tcia_analysis_results_metadata = 
{row['result_short_title']: row for row in get_tcia_collection_manager_data('analysis-results')} # Get all the download and license info from the collection manager. - tcia_license_metadata = {row['license_label']:row for row in get_all_tcia_metadata('licenses')} + tcia_license_metadata = {row['license_label']:row for row in get_tcia_collection_manager_data('licenses')} tcia_licenses = [] # Get the license for each AR that IDC has. @@ -199,7 +199,7 @@ def construct_licenses_table(args): idc_collection_licenses = get_idc_collection_licences(args) idc_analysis_results_licenses = get_idc_analysis_results_licences(args) - tcia_downloads_metadata = {row['id']:row for row in get_all_tcia_metadata('downloads')} + tcia_downloads_metadata = {row['id']:row for row in get_tcia_collection_manager_data('downloads')} tcia_analysis_results_licenses = get_tcia_analysis_results_licenses(client, args, tcia_downloads_metadata) tcia_collection_licenses = get_tcia_original_collection_licenses(client, args, tcia_downloads_metadata) all_licenses = pd.concat([idc_collection_licenses, idc_analysis_results_licenses, tcia_collection_licenses, tcia_analysis_results_licenses]) diff --git a/bq/generate_tables_and_views/metadata_sunset.py b/bq/generate_tables_and_views/metadata_sunset.py index ee9da6c..b863b33 100644 --- a/bq/generate_tables_and_views/metadata_sunset.py +++ b/bq/generate_tables_and_views/metadata_sunset.py @@ -18,7 +18,7 @@ # spreadsheet in Google Drive import settings import argparse -from bq.utilities import json_file_to_bq +from bq.bq_utilities import json_file_to_bq if __name__ == '__main__': diff --git a/bq/generate_tables_and_views/obsolete/licenses.py b/bq/generate_tables_and_views/obsolete/licenses.py index 834b367..50050ee 100644 --- a/bq/generate_tables_and_views/obsolete/licenses.py +++ b/bq/generate_tables_and_views/obsolete/licenses.py @@ -25,7 +25,7 @@ import settings from google.cloud import bigquery from utilities.bq_helpers import load_BQ_from_json, 
delete_BQ_Table -from utilities.tcia_helpers import get_all_tcia_metadata +from utilities.tcia_helpers import get_tcia_collection_manager_data from utilities.logging_config import progresslogger, errlogger LICENSE_NAME_MAP = { @@ -128,11 +128,11 @@ def get_idc_sourced_collection_licenses(client): def get_tcia_original_collection_licenses(client, args, tcia_sourced_subcollections): # Get all the collection manager collections data: try: - tcia_collection_metadata = {row['collection_short_title']:row for row in get_all_tcia_metadata('collections')} + tcia_collection_metadata = {row['collection_short_title']:row for row in get_tcia_collection_manager_data('collections')} except Exception as exc: pass - tcia_downloads_metadata = {row['id']:row for row in get_all_tcia_metadata('downloads')} - tcia_licese_metadata = {row['license_label']:row for row in get_all_tcia_metadata('licenses')} + tcia_downloads_metadata = {row['id']:row for row in get_tcia_collection_manager_data('downloads')} + tcia_licese_metadata = {row['license_label']:row for row in get_tcia_collection_manager_data('licenses')} tcia_licenses = [] for collection_id, values in tcia_sourced_subcollections.items(): @@ -233,18 +233,18 @@ def get_tcia_analysis_results_licenses(client, args): idc_ar_dois = get_tcia_dois(client, args) # Get all the collection manager collections data - tcia_collection_metadata = {row['collection_short_title']: row for row in get_all_tcia_metadata('collections')} + tcia_collection_metadata = {row['collection_short_title']: row for row in get_tcia_collection_manager_data('collections')} # Get TCIA Collection Manager analysis-results metadata of all TCIA analysis results - all_tcia_analysis_results_metadata = get_all_tcia_metadata('analysis-results') + all_tcia_analysis_results_metadata = get_tcia_collection_manager_data('analysis-results') # Keep only the metadata of analysis results which IDC has # These are the only ARs for which we need licenses tcia_ar_metadata = 
{row['result_short_title']:row for row in all_tcia_analysis_results_metadata \ if row['result_doi'].lower() in idc_ar_dois} # Get all the download and license info from the collection manager. - tcia_downloads_metadata = {row['id']:row for row in get_all_tcia_metadata('downloads')} - tcia_license_metadata = {row['license_label']:row for row in get_all_tcia_metadata('licenses')} + tcia_downloads_metadata = {row['id']:row for row in get_tcia_collection_manager_data('downloads')} + tcia_license_metadata = {row['license_label']:row for row in get_tcia_collection_manager_data('licenses')} tcia_licenses = [] # Get the license for each AR that IDC has. diff --git a/bq/generate_tables_and_views/original_collections_end_user_descriptions.py b/bq/generate_tables_and_views/original_collections_end_user_descriptions.py index 373ad96..aaf7e07 100644 --- a/bq/generate_tables_and_views/original_collections_end_user_descriptions.py +++ b/bq/generate_tables_and_views/original_collections_end_user_descriptions.py @@ -22,7 +22,7 @@ import pandas as pd from google.cloud import bigquery import markdownify -from bq.utilities import read_json_to_dataframe, dataframe_to_bq +from bq.bq_utilities import read_json_to_dataframe, dataframe_to_bq # Get the descriptions of collections that are only sourced from IDC diff --git a/bq/generate_tables_and_views/original_collections_metadata/gen_original_collection_metadata.py b/bq/generate_tables_and_views/original_collections_metadata/gen_original_collection_metadata.py index 297a6d1..e7e7a93 100644 --- a/bq/generate_tables_and_views/original_collections_metadata/gen_original_collection_metadata.py +++ b/bq/generate_tables_and_views/original_collections_metadata/gen_original_collection_metadata.py @@ -22,10 +22,10 @@ from google.cloud import bigquery from utilities.bq_helpers import load_BQ_from_json, delete_BQ_Table from bq.generate_tables_and_views.original_collections_metadata.schema import data_collections_metadata_schema -from 
utilities.tcia_helpers import get_all_tcia_metadata +from utilities.tcia_helpers import get_tcia_collection_manager_data from utilities.logging_config import progresslogger, errlogger from python_settings import settings -from bq.utilities import read_json_to_dataframe +from bq.bq_utilities import read_json_to_dataframe import requests import pandas as pd @@ -36,10 +36,10 @@ def add_programs(client, args, collection_metadata): programs = {row['collection_id'].lower(): row['program'] for row in client.query(query).result()} for collection_name, metadata in collection_metadata.items(): try: - metadata["Program"] = programs[metadata['collection_id']] + metadata["program"] = programs[metadata['collection_id']] except Exception as exc: errlogger.error(f'No program for {collection_name}') - metadata["Program"] = "" + metadata["program"] = "" return collection_metadata @@ -80,33 +80,34 @@ def add_case_counts(client, args, collection_metadata): case_counts = {c['collection_id']: c['cases'] for c in client.query(query).result()} for collection in collection_metadata: try: - collection_metadata[collection]['Subjects'] = case_counts[collection] + collection_metadata[collection]['subjects'] = case_counts[collection] except Exception as exc: errlogger.error(f'No case counts for {collection}') - collection_metadata[collection]['Subjects'] = "" + collection_metadata[collection]['subjects'] = "" return collection_metadata -# Generate a per-collection list of the modalities across all instances in each collection -def add_image_modalities(client, args, collection_metadata): - query = f""" - SELECT DISTINCT - REPLACE(REPLACE(LOWER(collection_id),'-','_'),' ','_') AS idc_webapp_collection_id, - STRING_AGG(DISTINCT modality, ", " ORDER BY modality) ImageTypes - FROM - `idc-dev-etl.{settings.BQ_DEV_INT_DATASET}.all_joined_public_and_current` - JOIN - `idc-dev-etl.idc_v{settings.CURRENT_VERSION}_pub.dicom_metadata` - ON - sop_instance_uid = SOPInstanceUID - GROUP BY - 
idc_webapp_collection_id - ORDER BY - idc_webapp_collection_id """ - - imageTypes = {c['idc_webapp_collection_id'].lower().replace(' ','_').replace('-','_'): c['ImageTypes'] for c in client.query(query).result()} - for collection in collection_metadata: - collection_metadata[collection]['ImageTypes'] = imageTypes[collection] +# # Generate a per-collection list of the modalities across all instances in each collection +# def add_image_modalities(client, args, collection_metadata): +# query = f""" +# SELECT DISTINCT +# REPLACE(REPLACE(LOWER(collection_id),'-','_'),' ','_') AS idc_webapp_collection_id, +# STRING_AGG(DISTINCT modality, ", " ORDER BY modality) ImageTypes +# FROM +# `idc-dev-etl.{settings.BQ_DEV_INT_DATASET}.all_joined_public_and_current` +# JOIN +# `idc-dev-etl.idc_v{settings.CURRENT_VERSION}_pub.dicom_metadata` +# ON +# sop_instance_uid = SOPInstanceUID +# GROUP BY +# idc_webapp_collection_id +# ORDER BY +# idc_webapp_collection_id """ +# +# imageTypes = {c['idc_webapp_collection_id'].lower().replace(' ','_').replace('-','_'): c['ImageTypes'] for c in client.query(query).result()} +# for collection in collection_metadata: +# collection_metadata[collection]['modalities'] = imageTypes[collection] +# collection_metadata[collection]['ImageTypes'] = imageTypes[collection] return collection_metadata @@ -125,16 +126,20 @@ def get_original_collections_metadata_idc_source(client, args): idc_only_metadata[row['collection_name']] = dict( collection_name=row['collection_name'], collection_id=row['collection_id'], - Title=row['title'], - CancerTypes=row['CancerTypes'], - TumorLocations=row['TumorLocations'], + collection_title=row['title'], + cancer_types=row['CancerTypes'], + tumor_locations=row['TumorLocations'], # Subjects = 0, - Species=row['Species'], - Sources=[], + species=row['Species'], + sources=[], + supporting_data=row['SupportingData'], + status=row['Status'], + updated = None, + # Deprecations + Title = row['title'], + CancerTypes = 
row['CancerTypes'], + TumorLocations = row['TumorLocations'], SupportingData=row['SupportingData'], - Status=row['Status'], - Updated = None -# Updated=row['Updated'] if row['Updated'] != 'NA' else None, ) return idc_only_metadata @@ -156,7 +161,7 @@ def get_citation(source_url): def get_original_collections_metadata_tcia_source(client, args, idc_collections): - tcia_collection_metadata = get_all_tcia_metadata('collections') + tcia_collection_metadata = get_tcia_collection_manager_data('collections') metadata = {} for collection_name, values in idc_collections.items(): # Find the collection manager entry corresponding to a collection that IDC has @@ -165,32 +170,34 @@ def get_original_collections_metadata_tcia_source(client, args, idc_collections) if collection_name == collection['collection_short_title']) except Exception as exc: errlogger.error(f'No collection manager data for {collection_name}') - id_map = { - 'ACRIN-NSCLC-FDG-PET': 'ACRIN 6668', - 'CT COLONOGRAPHY': 'ACRIN 6664', - 'Prostate-Anatomical-Edge-Cases': 'Prostate Anatomical Edge Cases', - 'QIN-BREAST': 'QIN-Breast' - } + exit(1) try: metadata[collection_name] = dict( collection_name=collection_name, collection_id=values['collection_id'], + collection_title=collection_metadata['collection_title'], + cancer_types=", ".join(collection_metadata['cancer_types']) \ + if isinstance(collection_metadata['cancer_types'], list) else '', + tumor_locations=", ".join(collection_metadata['cancer_locations']) \ + if isinstance(collection_metadata['cancer_locations'], list) else '', + subjects=0, + species=", ".join(collection_metadata['species']) \ + if isinstance(collection_metadata['species'], list) else '', + sources = [], + supporting_data=", ".join(collection_metadata['supporting_data']) \ + if isinstance(collection_metadata['supporting_data'], list) else '', + status=collection_metadata['collection_status'], + updated=None, + # Deprecations Title=collection_metadata['collection_title'], CancerTypes=", 
".join(collection_metadata['cancer_types']) \ if isinstance(collection_metadata['cancer_types'], list) else '', TumorLocations=", ".join(collection_metadata['cancer_locations']) \ if isinstance(collection_metadata['cancer_locations'], list) else '', - Subjects=0, - Species=", ".join(collection_metadata['species']) \ - if isinstance(collection_metadata['species'], list) else '', - Sources = [], SupportingData=", ".join(collection_metadata['supporting_data']) \ if isinstance(collection_metadata['supporting_data'], list) else '', - Status=collection_metadata['collection_status'], - Updated=None - # Updated=collection_metadata['date_updated'].split('T')[0] - ) + ) except Exception as exc: print(exc) @@ -210,7 +217,7 @@ def get_idc_sourced_analysis_results_metadata(): return idc_sourced_original_collections_metadata -# Get all the collections in this version. +# Get metadata of all the collections in this version. # For each collection, determine whether collection level metadata (as opposed to per-source metadata) is sourced # from tcia or idc. We get this collection level metadata from tcia if we get radiology or pathology or both from # tcia. Otherwise, we get collection level metadata from idc maintained table/file. @@ -238,8 +245,6 @@ def get_collection_metadata(client, args): # Merge the TCIA collection metadata. 
collection_metadata = tcia_sourced_collections - # for collection_id, metadata in idc_and_tcia_collection_metadata.items(): - # collection_metadata[collection_id]['Sources'].extend(metadata['Sources']) collection_metadata |= idc_only_collections return collection_metadata @@ -258,7 +263,7 @@ def add_descriptions(client, args, collection_metadata): ) for collection, metadata in collection_metadata.items(): try: - metadata['Description'] = descriptions[collection.lower().replace('-','_').replace(' ','_')]['description'] + metadata['description'] = descriptions[collection.lower().replace('-','_').replace(' ','_')]['description'] except Exception as exc: errlogger.error(f'No description for {collection}: {exc}') # collection_metadata[collection]['Description'] = "" @@ -276,18 +281,18 @@ def add_licenses(client, doi, collection_metadata): licenses = {row['source_doi']: row['license'] for row in client.query(query)} for collection, metadata in collection_metadata.items(): - for source in metadata['Sources']: + for source in metadata['sources']: try: - source["License"] = licenses[source['source_doi']] + source["license"] = licenses[source['source_doi']] except Exception as exc: errlogger.error(f'No license for {collection}, {source["source_doi"]}: {exc}') - source["License"] = {"license_doi": "", "license_long_name": "", "license_short_name": ""} + source["license"] = {"license_doi": "", "license_long_name": "", "license_short_name": ""} progresslogger.info('Added licenses') return collection_metadata def add_citations(collection_metadata): for collection, data in collection_metadata.items(): - for source in data['Sources']: + for source in data['sources']: if source['source_doi']: try: citation = get_citation(source['source_url']) @@ -296,7 +301,7 @@ def add_citations(collection_metadata): citation = source['source_url'] else: citation = source['source_url'] - source['Citation'] = citation + source['citation'] = citation progresslogger.info('Added citations') return 
collection_metadata @@ -321,8 +326,9 @@ def add_modalities(client, collection_metadata): modalities[collection_name] = {row['source_url']: row['modalities']} for collection_name, metadata in collection_metadata.items(): - for source in metadata['Sources']: + for source in metadata['sources']: try: + source['modalities'] = modalities[collection_name][source['source_url'].lower()] source['ImageTypes'] = modalities[collection_name][source['source_url'].lower()] except: errlogger.error(f'No modality for {collection_name}, {source["source_url"].lower()}') @@ -341,10 +347,10 @@ def add_updates(client,collection_metadata): timestamps = {c['collection_name']:c['version_timestamp'] for c in client.query(query).result()} for collection_name in collection_metadata: try: - collection_metadata[collection_name]['Updated'] = timestamps[collection_name] + collection_metadata[collection_name]['updated'] = timestamps[collection_name] except Exception as exc: errlogger.error(f'No timestamp for {collection_name}') - collection_metadata[collection_name]['Updated'] = "" + collection_metadata[collection_name]['updated'] = "" progresslogger.info('Added updates') return collection_metadata @@ -355,49 +361,53 @@ def add_sources(client, collection_metadata): FROM `{settings.DEV_PROJECT}.{settings.BQ_DEV_INT_DATASET}.all_joined_public_and_current` """ for row in client.query(query).result(): - collection_metadata[row['collection_name']]['Sources'].append( + collection_metadata[row['collection_name']]['sources'].append( { - "ID": "", - "Type": "", - "Access": "Public", + "source_id": "", + "source_type": "", "source_doi": row["source_doi"], "source_url": f'https://doi.org/{row["source_doi"]}', - "ImageTypes": "", - "License": { + "modalities": "", + "license": { "license_url": "", "license_long_name": "", "license_short_name": "" }, - "Citation": ""} + "citation": "", + "access": "Public", + "ImageTypes": "", + } ) return collection_metadata def add_ids(client, collection_metadata): + + # 
Build a dictionary of source_id and type across all collections and analysis results tcia_collection_metadata = { data['collection_doi'].lower(): { - "ID": data['collection_short_title'], - "Type": "original data" - } for data in get_all_tcia_metadata('collections')} + "source_id": data['collection_short_title'].lower().replace('-', '_'), + "source_type": "original data" + } for data in get_tcia_collection_manager_data('collections')} tcia_analysis_results_metadata = {data['result_doi'].lower(): { - "ID": data['result_short_title'], - "Type": 'analysis result' - } for data in get_all_tcia_metadata('analysis-results')} + "source_id": data['result_short_title'].lower().replace('-', '_'), + "source_type": 'analysis result' + } for data in get_tcia_collection_manager_data('analysis-results')} idc_collection_metadata = {data['source_doi'].lower(): { - "ID": data['collection_name'], - "Type": "original data" + "source_id": data['collection_id'], + "source_type": "original data" } for index, data in read_json_to_dataframe(f'{settings.PROJECT_PATH}/bq/generate_tables_and_views/table_generation_jsons/idc_original_collections_metadata.json5').iterrows()} idc_analysis_results_metadata = {data['source_doi'].lower(): { - "ID": data['ID'], - "Type": "analysis result" - } for index, data in read_json_to_dataframe(f'{settings.PROJECT_PATH}/bq/generate_tables_and_views/table_generation_jsons/idc_analysis_results_metadata.json').iterrows()} + "source_id": data['ID'].lower().replace('-', '_').replace(' ', '_',), + "source_type": "analysis result" + } for index, data in read_json_to_dataframe(f'{settings.PROJECT_PATH}/bq/generate_tables_and_views/table_generation_jsons/idc_analysis_results_metadata.json5').iterrows()} source_data = tcia_collection_metadata | tcia_analysis_results_metadata | idc_collection_metadata | idc_analysis_results_metadata for collection, metadata in collection_metadata.items(): - for source in metadata['Sources']: + for source in metadata['sources']: try: - 
source['ID'] = source_data[source['source_doi']]['ID'] - source['Type'] = source_data[source['source_doi']]['Type'] + source['source_id'] = source_data[source['source_doi']]['source_id'] + source['source_type'] = source_data[source['source_doi']]['source_type'] except: - errlogger.error(f'No ID for {collection}:{source["source_doi"]}') + errlogger.error(f'No source_id for {collection}:{source["source_doi"]}') return collection_metadata def build_metadata(client, args): @@ -425,14 +435,14 @@ def gen_collections_table(args): if args.use_cached_metadata: with open(args.cached_metadata_file) as f: - metadata = json.load(f) + all_metadata = json.load(f) else: - metadata = build_metadata(BQ_client, args) + all_metadata = build_metadata(BQ_client, args) with open(args.cached_metadata_file, 'w') as f: - json.dump(metadata, f) + json.dump(all_metadata, f) # Drop any collections that do not have any sources. This is probably only needed during development - metadata = [row for row in metadata if len(row['Sources']) > 0] + metadata = [row for row in all_metadata if len(row['sources']) > 0] pass metadata_json = '\n'.join([json.dumps(row) for row in sorted(metadata, key=lambda d: d['collection_name'])]) @@ -443,10 +453,10 @@ def gen_collections_table(args): settings.DEV_PROJECT, settings.BQ_DEV_EXT_DATASET if args.access=='Public' else settings.BQ_DEV_INT_DATASET , args.bqtable_name, metadata_json, data_collections_metadata_schema, write_disposition='WRITE_TRUNCATE') - pass + return except Exception as exc: errlogger.error(f'Table creation failed: {exc}') - exit + exit if __name__ == '__main__': diff --git a/bq/generate_tables_and_views/original_collections_metadata/schema.py b/bq/generate_tables_and_views/original_collections_metadata/schema.py index a886eb8..899f10f 100644 --- a/bq/generate_tables_and_views/original_collections_metadata/schema.py +++ b/bq/generate_tables_and_views/original_collections_metadata/schema.py @@ -20,25 +20,26 @@ data_collections_metadata_schema = [ 
bigquery.SchemaField('collection_name', 'STRING', mode='REQUIRED', description='Collection name as used externally by IDC webapp'), bigquery.SchemaField('collection_id', 'STRING', mode='REQUIRED', description='Collection ID as used internally by IDC webapp'), - bigquery.SchemaField('Title', 'STRING', mode='REQUIRED', description='Collection ID as used internally by IDC webapp'), - bigquery.SchemaField('CancerTypes','STRING', mode='REQUIRED', description='Cancer type of this collection '), - bigquery.SchemaField('TumorLocations','STRING', mode='REQUIRED', description='Body location that was studied'), - bigquery.SchemaField('Subjects', 'INTEGER', mode='REQUIRED', description='Number of subjects in collection'), - bigquery.SchemaField('Species', 'STRING', mode='REQUIRED', description="Species of collection subjects"), + bigquery.SchemaField('collection_title', 'STRING', mode='REQUIRED', + description='Descriptive title of this collection'), + bigquery.SchemaField('cancer_types', 'STRING', mode='REQUIRED', description='Cancer types in this collection '), + bigquery.SchemaField('tumor_locations', 'STRING', mode='REQUIRED', + description='Tumor locations in this collection'), + bigquery.SchemaField('subjects', 'INTEGER', mode='REQUIRED', description='Number of subjects in this collection'), + bigquery.SchemaField('species', 'STRING', mode='REQUIRED', description="Species of collection subjects"), bigquery.SchemaField( - "Sources", + "sources", "RECORD", mode="REPEATED", fields=[ - bigquery.SchemaField('ID', 'STRING', mode='NULLABLE', description='Original collection or Analysis result ID'), - bigquery.SchemaField('Type', 'STRING', mode='NULLABLE', description='Original collection or Analysis result'), - bigquery.SchemaField('Access', 'STRING', mode='NULLABLE', description='Limited or Public'), + bigquery.SchemaField('source_id', 'STRING', mode='NULLABLE', description='collection_id or analysis_result_id of this source'), + bigquery.SchemaField('source_type', 'STRING', 
mode='NULLABLE', description='"original collection" or "analysis result"'), bigquery.SchemaField('source_doi', 'STRING', mode='NULLABLE', description='DOI that can be resolved at doi.org to a information page of this source'), bigquery.SchemaField('source_url', 'STRING', mode='REQUIRED', - description='URL of source information page'), - bigquery.SchemaField('ImageTypes', 'STRING', mode='NULLABLE', - description='Enumeration of types/modalities of instances from this source'), + description='URL of the information page of this sourc'), + bigquery.SchemaField('modalities', 'STRING', mode='NULLABLE', + description='URL of the information page of this source'), bigquery.SchemaField( "license", "RECORD", @@ -51,20 +52,25 @@ description='Short name of license of this (sub)collection') ] ), - bigquery.SchemaField('Citation', 'STRING', mode='NULLABLE', + bigquery.SchemaField('citation', 'STRING', mode='NULLABLE', description='Citation to be used for this source'), + bigquery.SchemaField('access', 'STRING', mode='NULLABLE', description='DEPRECATED: All IDC data is public'), + bigquery.SchemaField('ImageTypes', 'STRING', mode='NULLABLE', + description='DEPRECATED: Duplicate of modalities'), ], description='Array of metadata for each source of instance data in this collection' ), - bigquery.SchemaField('SupportingData', 'STRING', mode='NULLABLE', description='Type(s) of addional available data'), - bigquery.SchemaField('Program', 'STRING', mode='REQUIRED', description='Program to which this collection belongs'), - bigquery.SchemaField('Status', 'STRING', mode='NULLABLE', description='Collection status: Ongoing or Complete'), - bigquery.SchemaField('Updated', 'DATE', mode='NULLABLE', description='Date of most recent update'), - bigquery.SchemaField('Description', 'STRING', mode='REQUIRED', description='Description of collection (HTML format)'), - # bigquery.SchemaField('DOI', 'STRING', mode='NULLABLE', - # description='DEPRECATED: Duplicate of source_doi'), - # 
bigquery.SchemaField('URL', 'STRING', mode='NULLABLE', description='DEPRECATED: Duplicate of source_url'), - # bigquery.SchemaField('CancerType', 'STRING', mode='NULLABLE', description='DEPRECATED: Duplicate of CancerTypes '), - # bigquery.SchemaField('Location', 'STRING', mode='NULLABLE', - # description='DEPRECATED: Duplicate of TumorLocations'), + bigquery.SchemaField('supporting_data', 'STRING', mode='NULLABLE', description='Type(s) of addional available data'), + bigquery.SchemaField('program', 'STRING', mode='REQUIRED', description='Program to which this collection belongs'), + bigquery.SchemaField('status', 'STRING', mode='NULLABLE', description='Collection status: Ongoing or Complete'), + bigquery.SchemaField('updated', 'DATE', mode='NULLABLE', description='Date of most recent update'), + bigquery.SchemaField('description', 'STRING', mode='REQUIRED', description='Description of collection (HTML format)'), + # Deprecations + bigquery.SchemaField('Title', 'STRING', mode='REQUIRED', + description='Deprecated: Duplicate of collection_title'), + bigquery.SchemaField('CancerTypes', 'STRING', mode='REQUIRED', description='DEPRECATED: Duplicate of cancer_types'), + bigquery.SchemaField('TumorLocations', 'STRING', mode='REQUIRED', + description='DEPRECATED: Duplicate of tumor_locations'), + bigquery.SchemaField('SupportingData', 'STRING', mode='NULLABLE', + description='DEPRECATED: Duplicate of supporting_data'), ] \ No newline at end of file diff --git a/bq/generate_tables_and_views/original_collections_metadata_idc_source.py b/bq/generate_tables_and_views/original_collections_metadata_idc_source.py index 75db769..83ca4fc 100644 --- a/bq/generate_tables_and_views/original_collections_metadata_idc_source.py +++ b/bq/generate_tables_and_views/original_collections_metadata_idc_source.py @@ -18,7 +18,7 @@ # spreadsheet in Google Drive import settings import argparse -from bq.utilities import json_file_to_bq +from bq.bq_utilities import json_file_to_bq if __name__ == 
'__main__': parser = argparse.ArgumentParser() diff --git a/bq/generate_tables_and_views/original_collections_tooltip_descriptions.py b/bq/generate_tables_and_views/original_collections_tooltip_descriptions.py index 1a2e58d..da430da 100644 --- a/bq/generate_tables_and_views/original_collections_tooltip_descriptions.py +++ b/bq/generate_tables_and_views/original_collections_tooltip_descriptions.py @@ -22,7 +22,7 @@ import pandas as pd from google.cloud import bigquery import markdownify -from bq.utilities import read_json_to_dataframe, dataframe_to_bq +from bq.bq_utilities import read_json_to_dataframe, dataframe_to_bq import re diff --git a/bq/generate_tables_and_views/program_metadata.py b/bq/generate_tables_and_views/program_metadata.py index ca48ba8..731e71d 100644 --- a/bq/generate_tables_and_views/program_metadata.py +++ b/bq/generate_tables_and_views/program_metadata.py @@ -14,3 +14,54 @@ # limitations under the License. # + +# This script generates the BQ program_metadata table. +import argparse +import sys +import json +from google.cloud import bigquery +import hashlib + +import settings +from utilities.bq_helpers import load_BQ_from_json +from utilities.logging_config import successlogger, errlogger +from bq.bq_utilities import get_data_from_comet + +version_metadata_schema = [ + bigquery.SchemaField('program_name', 'STRING', mode='REQUIRED', description='Short program name'), + bigquery.SchemaField('program_id', 'STRING', mode='REQUIRED', description="Lower cased short program name"), + bigquery.SchemaField('program_title', 'STRING', mode='REQUIRED', description='Descriptive program title'), + bigquery.SchemaField('program_url', 'STRING', mode='REQUIRED', description='URL of program information page'), + bigquery.SchemaField('program_description', 'STRING', mode='REQUIRED', description='Brief program description'), + ] + + + +def gen_version_metadata_table(args): + client = bigquery.Client(project=args.src_project) + rows = get_data_from_comet(args.path, 
branch=args.comet_branch) + for row in rows: + row["program_url"] = "None" if row["program_url"] is None else row["program_url"] + metadata_json = '\n'.join([json.dumps(row) for row in + sorted(rows, key=lambda d: d['program_name'])]) + try: + job = load_BQ_from_json(client, args.dst_project, args.bqdataset_name, args.bqtable_name, metadata_json, + version_metadata_schema, write_disposition='WRITE_TRUNCATE') + successlogger.info('program_metadata table generation completed') + return + except Exception as exc: + errlogger.info(f'Error creating BQ table; {exc}') + exit(1) +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--version', default=settings.CURRENT_VERSION, help='Max IDC version for which to build the table') + parser.add_argument('--src_project', default='idc-dev-etl') + parser.add_argument('--dst_project', default='idc-dev-etl') + parser.add_argument('--bqdataset_name', default=f'idc_v{settings.CURRENT_VERSION}_pub', help='BQ dataset name') + parser.add_argument('--bqtable_name', default=f'program_metadata', help='BQ table name') + parser.add_argument('--comet_branch', default='release/v24', help="idc_comet github branch") + parser.add_argument("--path", default="vocabularies/programs.yaml", help="Path from branch to file") + + args = parser.parse_args() + print("{}".format(args), file=sys.stdout) + gen_version_metadata_table(args) \ No newline at end of file diff --git a/bq/generate_tables_and_views/table_generation_jsons/idc_analysis_results_metadata.json5 b/bq/generate_tables_and_views/table_generation_jsons/idc_analysis_results_metadata.json5 index f46c50e..004331e 100644 --- a/bq/generate_tables_and_views/table_generation_jsons/idc_analysis_results_metadata.json5 +++ b/bq/generate_tables_and_views/table_generation_jsons/idc_analysis_results_metadata.json5 @@ -1,142 +1,142 @@ [ /* { - "ID": "template", - "Title": "", + "analysis_result_name": "template", + "analysis_result_title": "", "source_doi": "", - "current 
versioned_source_doi": "", - "Updated": "", + "current_versioned_source_doi": "", + "updated": "", "license_url": "", "license_long_name": "", "license_short_name": "" }, */ { - "ID": "BAMF-AIMI-Annotations", - "Title": "Image segmentations produced by the AIMI Annotations initiative", + "analysis_result_name": "BAMF-AIMI-Annotations", + "analysis_result_title": "Image segmentations produced by the AIMI Annotations initiative", "source_doi": "10.5281/zenodo.8345959", - "current versioned_source_doi": "", - "Updated": "2023-11-07", + "current_versioned_source_doi": "", + "updated": "2023-11-07", "license_url": "https://creativecommons.org/licenses/by/4.0/", "license_long_name": "Creative Commons Attribution 4.0 International License", "license_short_name": "CC BY 4.0" }, { - "ID": "Lung-PET-CT-Dx-Annotations", - "Title": "Expert annotation of lung tumors for the Lung-PET-CT-Dx collection", + "analysis_result_name": "Lung-PET-CT-Dx-Annotations", + "analysis_result_title": "Expert annotation of lung tumors for the Lung-PET-CT-Dx collection", "source_doi": "10.5281/zenodo.16989819", - "current versioned_source_doi": "10.5281/zenodo.16989820", - "Updated": "2024-02-24", + "current_versioned_source_doi": "10.5281/zenodo.16989820", + "updated": "2024-02-24", "license_url": "https://creativecommons.org/licenses/by/4.0/", "license_long_name": "Creative Commons Attribution 4.0 International License", "license_short_name": "CC BY 4.0" }, { - "ID": "NLST-Sybil", - "Title": "Expert annotations of tumor regions in the NLST CT images", + "analysis_result_name": "NLST-Sybil", + "analysis_result_title": "Expert annotations of tumor regions in the NLST CT images", "source_doi": "10.5281/zenodo.15643334", - "current versioned_source_doi": "10.5281/zenodo.15643335", - "Updated": "2024-02-24", + "current_versioned_source_doi": "10.5281/zenodo.15643335", + "updated": "2024-02-24", "license_url": "https://creativecommons.org/licenses/by/4.0/", "license_long_name": "Creative Commons 
Attribution 4.0 International License", "license_short_name": "CC BY 4.0" }, { - "ID": "NLSTSeg", - "Title": "Expert tumor segmentations and radiomics features for NLST CT images", + "analysis_result_name": "NLSTSeg", + "analysis_result_title": "Expert tumor segmentations and radiomics features for NLST CT images", "source_doi": "10.5281/zenodo.17362624", - "current versioned_source_doi": "10.5281/zenodo.17362625", - "Updated": "2024-02-24", + "current_versioned_source_doi": "10.5281/zenodo.17362625", + "updated": "2024-02-24", "license_url": "https://creativecommons.org/licenses/by/4.0/", "license_long_name": "Creative Commons Attribution 4.0 International License", "license_short_name": "CC BY 4.0" }, { - "ID": "nnU-Net-BPR-annotations", - "Title": "AI-derived annotations for the NLST and NSCLC-Radiomics computed tomography imaging collections", + "analysis_result_name": "nnU-Net-BPR-annotations", + "analysis_result_title": "AI-derived annotations for the NLST and NSCLC-Radiomics computed tomography imaging collections", "source_doi": "10.5281/zenodo.7473970", - "current versioned_source_doi": "", - "Updated": "2023-05-26", + "current_versioned_source_doi": "", + "updated": "2023-05-26", "license_url": "https://creativecommons.org/licenses/by/4.0/", "license_long_name": "Creative Commons Attribution 4.0 International License", "license_short_name": "CC BY 4.0" }, { - "ID": "Pan-Cancer-Nuclei-Seg-DICOM", - "Title": "DICOM converted Dataset of Segmented Nuclei in Hematoxylin and Eosin Stained Histopathology Images (Pan-Cancer-Nuclei-Seg)", + "analysis_result_name": "Pan-Cancer-Nuclei-Seg-DICOM", + "analysis_result_title": "DICOM converted Dataset of Segmented Nuclei in Hematoxylin and Eosin Stained Histopathology Images (Pan-Cancer-Nuclei-Seg)", "source_doi": "10.5281/zenodo.11099004", - "current versioned_source_doi": "", - "Updated": "2024-08-05", + "current_versioned_source_doi": "", + "updated": "2024-08-05", "license_url": 
"https://creativecommons.org/licenses/by/3.0/", "license_long_name": "Creative Commons Attribution 3.0 International", "license_short_name": "CC BY 3.0" }, { - "ID": "Pancreas-CT-SEG", - "Title": "DICOM converted annotations for the Pancreas-CT collection", + "analysis_result_name": "Pancreas-CT-SEG", + "analysis_result_title": "DICOM converted annotations for the Pancreas-CT collection", "source_doi": "10.5281/zenodo.12130275", - "current versioned_source_doi": "", - "Updated": "2024-07-16", + "current_versioned_source_doi": "", + "updated": "2024-07-16", "license_url": "https://creativecommons.org/licenses/by/4.0/", "license_long_name": "Creative Commons Attribution 4.0 International License", "license_short_name": "CC BY 4.0" }, { - "ID": "Prostate-MRI-US-Biopsy-DICOM-Annotations", - "Title": "DICOM converted annotations for the Prostate-MRI-US-Biopsy collection", + "analysis_result_name": "Prostate-MRI-US-Biopsy-DICOM-Annotations", + "analysis_result_title": "DICOM converted annotations for the Prostate-MRI-US-Biopsy collection", "source_doi": "10.5281/zenodo.10069910", - "current versioned_source_doi": "", - "Updated": "2023-11-03", + "current_versioned_source_doi": "", + "updated": "2023-11-03", "license_url": "https://creativecommons.org/licenses/by/4.0/", "license_long_name": "Creative Commons Attribution 4.0 International License", "license_short_name": "CC BY 4.0" }, { - "ID": "PROSTATEx-Targets", - "Title": "Point annotations of lesion targets for the PROSTATEx collection", + "analysis_result_name": "PROSTATEx-Targets", + "analysis_result_title": "Point annotations of lesion targets for the PROSTATEx collection", "source_doi": "10.5281/zenodo.15643312", - "current versioned_source_doi": "10.5281/zenodo.15643313", - "Updated": "2024-02-24", + "current_versioned_source_doi": "10.5281/zenodo.15643313", + "updated": "2024-02-24", "license_url": "https://creativecommons.org/licenses/by/4.0/", "license_long_name": "Creative Commons Attribution 4.0 
International License", "license_short_name": "CC BY 4.0" }, { - "ID": "RMS-Mutation-Prediction-Expert-Annotations", - "Title": "Expert annotations of the tissue types for the RMS-Mutation-Prediction microscopy images", + "analysis_result_name": "RMS-Mutation-Prediction-Expert-Annotations", + "analysis_result_title": "Expert annotations of the tissue types for the RMS-Mutation-Prediction microscopy images", "source_doi": "10.5281/zenodo.10462857", - "current versioned_source_doi": "", - "Updated": "2024-02-24", + "current_versioned_source_doi": "", + "updated": "2024-02-24", "license_url": "https://creativecommons.org/licenses/by/4.0/", "license_long_name": "Creative Commons Attribution 4.0 International License", "license_short_name": "CC BY 4.0" }, { - "ID": "TCGA-GBM360", - "Title": "GBM360 aggressiveness maps for a subset of TCGA pathology slides", + "analysis_result_name": "TCGA-GBM360", + "analysis_result_title": "GBM360 aggressiveness maps for a subset of TCGA pathology slides", "source_doi": "10.5281/zenodo.17470190", - "current versioned_source_doi": "10.5281/zenodo.17470191", - "Updated": "2024-02-24", + "current_versioned_source_doi": "10.5281/zenodo.17470191", + "updated": "2024-02-24", "license_url": "https://creativecommons.org/licenses/by/4.0/", "license_long_name": "Creative Commons Attribution 4.0 International License", "license_short_name": "CC BY 4.0" }, { - "ID": "TCGA-SBU-TIL-Maps", - "Title": "AI-derived Tumor Infiltrating Lymphocyte maps for the TCGA collections", + "analysis_result_name": "TCGA-SBU-TIL-Maps", + "analysis_result_title": "AI-derived Tumor Infiltrating Lymphocyte maps for the TCGA collections", "source_doi": "10.5281/zenodo.16966285", - "current versioned_source_doi": "10.5281/zenodo.16966285", - "Updated": "2024-02-24", + "current_versioned_source_doi": "10.5281/zenodo.16966285", + "updated": "2024-02-24", "license_url": "https://creativecommons.org/licenses/by/4.0/", "license_long_name": "Creative Commons Attribution 4.0 
International License", "license_short_name": "CC BY 4.0" }, { - "ID": "TotalSegmentator-CT-Segmentations", - "Title": "AI-driven enrichment of NCI Imaging Data Commons CT images with volumetric segmentations and radiomics features", + "analysis_result_name": "TotalSegmentator-CT-Segmentations", + "analysis_result_title": "AI-driven enrichment of NCI Imaging Data Commons CT images with volumetric segmentations and radiomics features", "source_doi": "10.5281/zenodo.8347011", - "current versioned_source_doi": "", - "Updated": "2024-02-24", + "current_versioned_source_doi": "", + "updated": "2024-02-24", "license_url": "https://creativecommons.org/licenses/by/4.0/", "license_long_name": "Creative Commons Attribution 4.0 International License", "license_short_name": "CC BY 4.0" diff --git a/bq/generate_tables_and_views/tcia_pathology_metadata.py b/bq/generate_tables_and_views/tcia_pathology_metadata.py index e799127..80d817b 100644 --- a/bq/generate_tables_and_views/tcia_pathology_metadata.py +++ b/bq/generate_tables_and_views/tcia_pathology_metadata.py @@ -20,7 +20,7 @@ import os import sys import json -from utilities.tcia_helpers import get_all_tcia_metadata +from utilities.tcia_helpers import get_tcia_collection_manager_data from utilities.logging_config import successlogger, progresslogger, errlogger from google.cloud import bigquery from utilities.bq_helpers import load_BQ_from_json @@ -91,9 +91,9 @@ def get_aspera_hash(data): def gen_table(args): - collections = get_all_tcia_metadata("collections") + collections = get_tcia_collection_manager_data("collections") # collections = [ c for c in get_all_tcia_metadata("collections") if c['collection_page_accessibility'] == "Public"] - downloads = get_all_tcia_metadata("downloads") + downloads = get_tcia_collection_manager_data("downloads") pathology_downloads = {download['id']:download for download in downloads if download['download_type']=='Pathology Images'} # Add the id and slug of the parent collection to each 
pathology_download diff --git a/bq/generate_tables_and_views/tcia_pathology_metadata_comparison.py b/bq/generate_tables_and_views/tcia_pathology_metadata_comparison.py index 9defd71..b370eca 100644 --- a/bq/generate_tables_and_views/tcia_pathology_metadata_comparison.py +++ b/bq/generate_tables_and_views/tcia_pathology_metadata_comparison.py @@ -19,7 +19,7 @@ import argparse import sys import json -from utilities.tcia_helpers import get_all_tcia_metadata, get_url +from utilities.tcia_helpers import get_tcia_collection_manager_data, get_url from google.cloud import bigquery from utilities.bq_helpers import load_BQ_from_json from bq.generate_tables_and_views.original_collections_metadata.schema import data_collections_metadata_schema diff --git a/bq/generate_tables_and_views/utils/json_to_bq_table.py b/bq/generate_tables_and_views/utils/json_to_bq_table.py index 04e84d7..04a2b4b 100644 --- a/bq/generate_tables_and_views/utils/json_to_bq_table.py +++ b/bq/generate_tables_and_views/utils/json_to_bq_table.py @@ -19,7 +19,7 @@ import pandas as pd from google.cloud import bigquery -from bq.utilities import read_json_to_dataframe +from bq.bq_utilities import read_json_to_dataframe from datetime import datetime, timedelta import pytz diff --git a/clinical/compare_tcia_clinical_downloads.py b/clinical/compare_tcia_clinical_downloads.py index 18d209a..1582078 100644 --- a/clinical/compare_tcia_clinical_downloads.py +++ b/clinical/compare_tcia_clinical_downloads.py @@ -19,7 +19,7 @@ import argparse import sys import json -from utilities.tcia_helpers import get_all_tcia_metadata, get_url +from utilities.tcia_helpers import get_tcia_collection_manager_data, get_url from google.cloud import bigquery from utilities.bq_helpers import load_BQ_from_json from bq.generate_tables_and_views.original_collections_metadata.schema import data_collections_metadata_schema diff --git a/clinical/tcia_clinical_metadata.py b/clinical/tcia_clinical_metadata.py index 01bcf45..dc45fa4 100644 --- 
a/clinical/tcia_clinical_metadata.py +++ b/clinical/tcia_clinical_metadata.py @@ -19,7 +19,7 @@ import argparse import sys import json -from utilities.tcia_helpers import get_all_tcia_metadata, get_url +from utilities.tcia_helpers import get_tcia_collection_manager_data, get_url from google.cloud import bigquery from utilities.bq_helpers import load_BQ_from_json from bq.generate_tables_and_views.original_collections_metadata.schema import data_collections_metadata_schema @@ -79,20 +79,20 @@ def get_raw_data(): all_idc_collections = client.list_rows(client.get_table(f'{settings.DEV_PROJECT}.{settings.BQ_DEV_INT_DATASET}.all_sources')).to_dataframe() all_idc_source_dois = all_idc_collections[['source_doi', 'Access']].copy() # Get all TCIA collections which we also have - all_tcia_collection_metadata = get_all_tcia_metadata("collections") + all_tcia_collection_metadata = get_tcia_collection_manager_data("collections") public_tcia_collections = [ c for c in all_tcia_collection_metadata if \ c['collection_doi'].lower() in list(all_idc_source_dois['source_doi']) and \ all_idc_source_dois[all_idc_source_dois["source_doi"] == (c['collection_doi'].lower())].iloc[0]['Access'] == "Public" ] # Get all TCIA analysis results which we also have - all_tcia_ar_metadata = get_all_tcia_metadata('analysis-results') + all_tcia_ar_metadata = get_tcia_collection_manager_data('analysis-results') public_analysis_results = [c for c in all_tcia_ar_metadata if \ c['result_doi'].lower() in list(all_idc_source_dois['source_doi']) and \ all_idc_source_dois[all_idc_source_dois["source_doi"] == (c['result_doi'].lower())].iloc[0]['Access'] == "Public" ] # Get TCIA clinical downloads - downloads = {d['id']: d for d in get_all_tcia_metadata("downloads")} + downloads = {d['id']: d for d in get_tcia_collection_manager_data("downloads")} clinical_downloads = {id: data for id, data in downloads.items() if likely_clinical(data)} # Associate 0 or 1 collection with each clinical download diff --git 
a/ingestion/utilities/get_collection_dois_urls_licenses.py b/ingestion/utilities/get_collection_dois_urls_licenses.py index 896f4ae..72c6151 100644 --- a/ingestion/utilities/get_collection_dois_urls_licenses.py +++ b/ingestion/utilities/get_collection_dois_urls_licenses.py @@ -25,7 +25,7 @@ from utilities.logging_config import errlogger logger = logging.getLogger(__name__) -from utilities.tcia_helpers import get_internal_series_ids, series_drill_down, get_all_tcia_metadata +from utilities.tcia_helpers import get_internal_series_ids, series_drill_down, get_tcia_collection_manager_data from utilities.tcia_helpers_v4 import get_TCIA_series_metadata_per_patient from idc.models import IDC_Collection, IDC_Patient, IDC_Study, IDC_Series from python_settings import settings @@ -202,7 +202,7 @@ def get_patient_urls_idc(sess, collection, patient): # data sourced from TCIA. def get_licenses_tcia(collection, patient, third_party="no", server=""): # license_types = get_license_info() - license_types = {l['license_url']: l['license_label'] for l in get_all_tcia_metadata(type="licenses")} + license_types = {l['license_url']: l['license_label'] for l in get_tcia_collection_manager_data(type="licenses")} series_licenses = {} series_metadata = get_TCIA_series_metadata_per_patient(collection, patient) for series in series_metadata: diff --git a/preingestion/detect_tcia_collection_name_changes.py b/preingestion/detect_tcia_collection_name_changes.py index 8381240..7a8e3e1 100644 --- a/preingestion/detect_tcia_collection_name_changes.py +++ b/preingestion/detect_tcia_collection_name_changes.py @@ -22,7 +22,7 @@ from idc.models import Patient, Study, Series, Collection, All_Collections -from utilities.tcia_helpers import get_all_tcia_metadata +from utilities.tcia_helpers import get_tcia_collection_manager_data from sqlalchemy import and_, or_ from utilities.sqlalchemy_helpers import sa_session from utilities.logging_config import successlogger, progresslogger, errlogger @@ -40,8 +40,8 
@@ def compare_dois(): filter(and_(or_(Series.sources == [True, False], Series.sources == [True, True]), All_Collections.access=="Public")).all() idc_dois = {row.source_doi.lower(): row.collection_id for row in rows if row.source_doi } - tcia_original_dois = {row['collection_doi'].lower(): row['collection_short_title'] for row in get_all_tcia_metadata(type="collections", query_param="&_fields=collection_short_title,collection_doi")} - tcia_analysis_dois = {row['result_doi'].lower(): row['result_short_title'] for row in get_all_tcia_metadata(type="analysis-results", query_param="&_fields=result_short_title,result_doi")} + tcia_original_dois = {row['collection_doi'].lower(): row['collection_short_title'] for row in get_tcia_collection_manager_data(type="collections", query_param="&_fields=collection_short_title,collection_doi")} + tcia_analysis_dois = {row['result_doi'].lower(): row['result_short_title'] for row in get_tcia_collection_manager_data(type="analysis-results", query_param="&_fields=result_short_title,result_doi")} for doi in idc_dois: if idc_dois[doi] in args.ignored: diff --git a/settings.py b/settings.py index 01698f9..94d3b5d 100644 --- a/settings.py +++ b/settings.py @@ -160,3 +160,7 @@ AWS_ACCESS_KEY_ID = os.environ.get('AWS_ACCESS_KEY_ID', '') AWS_SECRET_ACCESS_KEY = os.environ.get('AWS_SECRET_ACCESS_KEY', '') + +GITHUB_TOKEN = os.environ.get('GITHUB_TOKEN') + +pass diff --git a/utilities/tcia_helpers.py b/utilities/tcia_helpers.py index 1e88753..0813469 100644 --- a/utilities/tcia_helpers.py +++ b/utilities/tcia_helpers.py @@ -456,7 +456,7 @@ def get_collection_license_info(): return licenses -def get_all_tcia_metadata(type, query_param=''): +def get_tcia_collection_manager_data(type, query_param=''): if query_param: url = f"https://cancerimagingarchive.net/api/v1/{type}/?per_page=100&{query_param}" else: @@ -480,7 +480,7 @@ def get_all_tcia_metadata(type, query_param=''): print('Error accessing the API:', response.status_code) exit -def 
get_all_tcia_metadata_v2(type, query_param=''): +def get_tcia_collection_manager_data_v2(type, query_param=''): page = 1 collections = [] while True: @@ -505,7 +505,7 @@ def get_all_tcia_metadata_v2(type, query_param=''): if __name__ == "__main__": - c = get_all_tcia_metadata_v2("collections", query_param='') + c = get_tcia_collection_manager_data_v2("collections", query_param='') pass # access_token = get_access_token(auth_server=NLST_AUTH_URL)[0]