Commit e2b199f

Merging conflicts. Adding to main build process

2 parents 6ba0a19 + a4ce8f2

79 files changed: +509090 −917 lines

.github/workflows/build.yml (24 additions, 0 deletions)

@@ -177,6 +177,30 @@ jobs:
       push: true
       platforms: linux/amd64

+  build-pancpdo:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+      - name: Set up QEMU
+        uses: docker/setup-qemu-action@v3
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+      - name: Login to DockerHub
+        uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.DOCKERHUB_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_PASSWORD }}
+      - name: Build and push pancpdo
+        uses: docker/build-push-action@v3
+        with:
+          file: ./build/docker/Dockerfile.pancpdo
+          tags: |
+            sgosline/pancpdo:latest
+            sgosline/pancpdo:${{ github.ref_name }}
+          push: true
+          platforms: linux/amd64

   build-upload:
     runs-on: ubuntu-latest
     steps:

.github/workflows/main.yml (2 additions, 1 deletion)

@@ -4,6 +4,7 @@ on:
   push:
     tags:
       - '*' # Triggers the workflow only on version tags
+  workflow_dispatch: # Allows manual triggering of the workflow

 # Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages
 permissions:
@@ -44,4 +45,4 @@ jobs:
     steps:
       - name: Deploy to GitHub Pages
         id: deployment
-        uses: actions/deploy-pages@v4
+        uses: actions/deploy-pages@v4

.gitignore (2 additions, 0 deletions)

@@ -18,3 +18,5 @@ tests/__pycache__
 dist
 build/lib
 build/local
+coderdata/_version.py
+local/

README.md (2 additions, 0 deletions)

@@ -1,5 +1,7 @@
 ## Cancer Omics Drug Experiment Response Dataset

+
+
 There is a recent explosion of deep learning algorithms that tackle the computational problem of predicting drug treatment outcome from baseline molecular measurements. To support this, we have built a benchmark dataset that harmonizes diverse datasets to better assess algorithm performance.

 This package collects diverse sets of paired molecular datasets with corresponding drug sensitivity data. All data here is reprocessed and standardized so it can be easily used as a benchmark dataset for the

build/README.md (6 additions, 8 deletions)

@@ -10,11 +10,10 @@ are added.

 ## build_all.py script

-This script initializes all docker containers, builds all datasets, validates them, and uploads them to figshare and pypi.
+This script initializes all docker containers, builds all datasets, validates them, and uploads them to figshare.

 It requires the following authorization tokens to be set in the local environment depending on the use case:
 `SYNAPSE_AUTH_TOKEN`: Required for beataml and mpnst datasets. Join the [CoderData team](https://www.synapse.org/#!Team:3503472) on Synapse and generate an access token.
-`PYPI_TOKEN`: This token is required to upload to PyPI.
 `FIGSHARE_TOKEN`: This token is required to upload to Figshare.
 `GITHUB_TOKEN`: This token is required to upload to GitHub.

@@ -25,21 +24,20 @@ It requires the following authorization tokens to be set in the local environmen
 - `--omics`: Processes and builds the omics data files.
 - `--drugs`: Processes and builds the drug data files.
 - `--exp`: Processes and builds the experiment data files.
-- `--all`: Executes all available processes above (docker, samples, omics, drugs, exp). This does not run the validate, figshare, or pypi commands.
+- `--all`: Executes all available processes above (docker, samples, omics, drugs, exp). This does not run the validate or figshare commands.
 - `--validate`: Validates the generated datasets using the schema check scripts. This is automatically included if data upload occurs.
 - `--figshare`: Uploads the datasets to Figshare. FIGSHARE_TOKEN must be set in local environment.
-- `--pypi`: Uploads the package to PyPI. PYPI_TOKEN must be set in local environment.
 - `--high_mem`: Utilizes high memory mode for concurrent data processing. This has been successfully tested using 32 or more vCPUs.
 - `--dataset`: Specifies the datasets to process (default='broad_sanger,hcmi,beataml,mpnst,cptac').
-- `--version`: Specifies the version number for the PyPI package and Figshare upload title (e.g., "0.1.29"). This is required for figshare and PyPI upload steps. This must be a higher version than previously published versions.
+- `--version`: Specifies the version number for the Figshare upload title (e.g., "0.1.29"). This must be a higher version than previously published versions.
 - `--github-username`: GitHub username matching the GITHUB_TOKEN. Required to push the new Tag to the GitHub Repository.
 - `--github-email`: GitHub email matching the GITHUB_TOKEN. Required to push the new Tag to the GitHub Repository.

 **Example usage**:
-- Build all datasets and upload to Figshare and PyPI and GitHub.
-  Required tokens for the following command: `SYNAPSE_AUTH_TOKEN`, `PYPI_TOKEN`, `FIGSHARE_TOKEN`, `GITHUB_TOKEN`.
+- Build all datasets and upload to Figshare and GitHub.
+  Required tokens for the following command: `SYNAPSE_AUTH_TOKEN`, `FIGSHARE_TOKEN`, `GITHUB_TOKEN`.
 ```bash
-python build/build_all.py --all --high_mem --validate --pypi --figshare --version 0.1.41 --github-username jjacobson95 --github-email jeremy.jacobson3402@gmail.com
+python build/build_all.py --all --high_mem --validate --figshare --version 0.1.41 --github-username jjacobson95 --github-email jeremy.jacobson3402@gmail.com
 ```

 - Build only the experiment files.

build/beatAML/GetBeatAML.py (1 addition, 1 deletion)

@@ -653,7 +653,7 @@ def generate_drug_list(drug_map_path,drug_path):
     # New Transcriptomics Data
     print("Starting Transcriptomics Data")
     ##first run conversion tool
-    os.system("python tpmFromCounts.py --counts "+transcriptomics_file)
+    os.system("python tpmFromCounts.py --counts {} --out_file {}".format(transcriptomics_file, 'tpm_'+transcriptomics_file))


     t_df = pd.read_csv('tpm_'+transcriptomics_file, sep = '\t')
New file (50 additions, 0 deletions)

@@ -0,0 +1,50 @@
+import synapseclient
+import pandas as pd
+import numpy as np
+import argparse
+import os
+
+
+def get_bladder_pdo_samples(synLoginObject, maxval):
+
+    # download from Synapse..
+    samples_syn = synLoginObject.get('syn64765486')
+    # and read the file
+    samples_df = pd.read_csv(samples_syn.path, sep="\t")
+
+    samples = samples_df[['Sample ID', 'Patient ID', 'Cancer Type Detailed', 'Sample Class']]
+    samples = samples.rename({"Sample ID": 'other_id', 'Patient ID': 'common_name', 'Cancer Type Detailed': 'cancer_type', 'Sample Class': 'model_type'}, axis=1)
+
+    samples.loc[:, ['species']] = 'Homo sapiens(Human)'
+    samples.loc[:, ['other_id_source']] = 'Synapse'
+    samples.loc[:, ['other_names']] = ''
+    samples.loc[:, ['cancer_type']] = samples['cancer_type'].str.lower()
+    samples.loc[:, ['model_type']] = samples['model_type'].str.lower()
+
+    # continue improve_sample_id numbering after the previous file's maximum
+    samples['improve_sample_id'] = range(maxval+1, maxval+1+samples.shape[0])
+
+    return samples
+
+
+if __name__ == "__main__":
+
+    parser = argparse.ArgumentParser(description="This script handles downloading, processing and formatting of sample files for the Bladder PDO project into a single samplesheet")
+
+    parser.add_argument('-t', '--token', type=str, help='Synapse Token')
+
+    parser.add_argument("-p", '--prevSamples', nargs="?", type=str, default="", const="", help="Use this to provide previous sample file, will run sample file generation")
+
+    args = parser.parse_args()
+
+    print("Logging into Synapse")
+    PAT = args.token
+    synObject = synapseclient.login(authToken=PAT)
+
+    if (args.prevSamples):
+        prev_max_improve_id = max(pd.read_csv(args.prevSamples).improve_sample_id)
+    else:
+        prev_max_improve_id = 0
+
+    bladder_pdo_samples = get_bladder_pdo_samples(synObject, prev_max_improve_id)
+
+    bladder_pdo_samples.to_csv("/tmp/bladderpdo_samples.csv", index=False)
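The improve_sample_id continuation logic above (start numbering after the previous samplesheet's maximum) can be exercised in isolation. A minimal sketch; the helper name and sample rows are invented for illustration, only the `range(maxval+1, ...)` assignment mirrors the script:

```python
import pandas as pd

def assign_improve_ids(samples: pd.DataFrame, maxval: int) -> pd.DataFrame:
    # continue numbering from the previous samplesheet's maximum id
    samples = samples.copy()
    samples['improve_sample_id'] = range(maxval + 1, maxval + 1 + samples.shape[0])
    return samples

# hypothetical samplesheet rows
df = pd.DataFrame({'other_id': ['Organoid_1', 'Tumor_1', 'Organoid_2']})
out = assign_improve_ids(df, maxval=100)
print(out['improve_sample_id'].tolist())  # [101, 102, 103]
```

With no previous file, `maxval` is 0 and ids start at 1, matching the `prev_max_improve_id = 0` fallback in the script.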
New file (132 additions, 0 deletions)

@@ -0,0 +1,132 @@
+import synapseclient
+import pandas as pd
+import numpy as np
+import argparse
+import os
+import wget
+import gzip
+import subprocess
+import math
+
+def get_copy_call(a):
+    """
+    Helper function - determine copy call for a value.
+    """
+
+    if a is None:
+        return float('nan')
+
+    if math.isnan(a):
+        return float('nan')
+
+    a_val = math.log2(float(a)+0.000001)
+    if a_val < 0.5210507:
+        return 'deep del'
+    elif a_val < 0.7311832:
+        return 'het loss'
+    elif a_val < 1.214125:
+        return 'diploid'
+    elif a_val < 1.422233:
+        return 'gain'
+    else:
+        return 'amp'
+
+def get_bladder_pdo_transcriptomics(GEO_id_link_table, samples, genes):
+
+    bladderpdo_url = 'https://ftp.ncbi.nlm.nih.gov/geo/series/GSE103nnn/GSE103990/suppl/GSE103990_Normalized_counts.txt.gz'
+    transcriptomic_txt = wget.download(bladderpdo_url)
+    transcriptomics = pd.read_csv(transcriptomic_txt, compression='gzip', sep="\t")
+    subprocess.call(["/usr/bin/Rscript", "--vanilla", "obtainGSMidLink.R"])
+
+    GEO_ids_link = pd.read_csv("./gsmlinkDf.csv")
+    # normalize each sample column to counts-per-million
+    fpkm_totals = transcriptomics.iloc[:, 1:43].sum()
+    transcriptomics.iloc[:, 1:43] = transcriptomics.iloc[:, 1:43].div(fpkm_totals).mul(1e6)
+    transcriptomics['ensembl'] = transcriptomics['Unnamed: 0'].str.split("_", expand=True)[0]
+    mapped_df = transcriptomics.merge(genes[['entrez_id', 'other_id']].drop_duplicates(), left_on='ensembl', right_on='other_id', how='left')
+    # transform data to long format
+    mapped_df = mapped_df.drop('other_id', axis=1)
+    value_variables = transcriptomics.columns[transcriptomics.columns.str.contains("M")]
+    melted_txomics = mapped_df.melt(id_vars="entrez_id", value_vars=value_variables, var_name='sample_name')
+    # use info from GEO to get sample IDs
+    txomics_with_GEOid = melted_txomics.merge(GEO_ids_link, how='left', left_on="sample_name", right_on='RNAid')
+    # use samplesheet to link sample_ids to improve ids
+    txomics_with_GEOid['sampleid'] = txomics_with_GEOid['sampleid'].str.replace("org", "Organoid_")
+    txomics_with_GEOid['sampleid'] = txomics_with_GEOid['sampleid'].str.replace("tumor", "Tumor")
+    txomics_with_improveid = txomics_with_GEOid.merge(samples, left_on="sampleid", right_on="other_id", how="left")
+    final_transcriptomics = txomics_with_improveid[['entrez_id', 'value', 'improve_sample_id']]
+    final_transcriptomics['source'] = "Gene Expression Omnibus"
+    final_transcriptomics['study'] = "Lee etal 2018 Bladder PDOs"
+    final_transcriptomics = final_transcriptomics.rename({'value': 'transcriptomics'}, axis=1)
+    # remove duplicates
+    toreturn = final_transcriptomics.drop_duplicates()
+
+    return toreturn
+
+def get_bladder_pdo_mutations(synObject, samples, genes):
+    print(samples.head())
+    mutations = synObject.get("syn64765525")
+    mutations_df = pd.read_csv(mutations.path, sep='\t')
+    mutations_df['mutation'] = mutations_df['HGVSc'].str.split(":", expand=True)[1]
+    #samplesheet = pd.read_csv(samples)
+    selectioncols_mutations = mutations_df[['Entrez_Gene_Id', "Variant_Classification", "Tumor_Sample_Barcode", "mutation"]]
+    merged_mutations = selectioncols_mutations.merge(samples, left_on="Tumor_Sample_Barcode", right_on="other_id", how="left")
+    merged_mutations_renamed = merged_mutations.rename({"Entrez_Gene_Id": 'entrez_id', 'Variant_Classification': "variant_classification"}, axis=1)
+    print(merged_mutations_renamed.head())
+    final_mutations = merged_mutations_renamed[['entrez_id', "mutation", "variant_classification", "improve_sample_id"]]
+    final_mutations['study'] = "Lee etal 2018 Bladder PDOs"
+    print(final_mutations.head())
+    return final_mutations
+
+def get_bladder_pdo_copynumber(synObject, samples, genes):
+    segfile = synObject.get("syn64765499")
+    segfile_df = pd.read_csv(segfile.path, sep='\t')
+
+    segfile_df.to_csv("bladder_segfile.csv")
+    subprocess.call(["/usr/bin/Rscript", "--vanilla", "CNV-segfile-annotation.R", "bladder_segfile.csv", "bladder_annotated_segfile.csv"])
+    copynumber = pd.read_csv("bladder_annotated_segfile.csv")
+    copynumber['copy_number'] = np.exp2(copynumber['score'].div(2))*2
+    copynumber['copy_call'] = [get_copy_call(a) for a in copynumber['copy_number']]
+    copynumber_with_improveids = copynumber.merge(samples, left_on='ID', right_on='other_id', how='left')
+    copynumber_with_correct_colnames = copynumber_with_improveids.rename({"ENTREZID": 'entrez_id'}, axis=1)
+    final_copynumber = copynumber_with_correct_colnames[['entrez_id', 'improve_sample_id', 'copy_number', 'copy_call']]
+    final_copynumber['source'] = "Synapse"
+    final_copynumber['study'] = "Lee etal 2018 Bladder PDOs"
+
+    return final_copynumber
+
+
+if __name__ == "__main__":
+    print('in main')
+    parser = argparse.ArgumentParser(description="This script handles downloading, processing and formatting of omics data files for the Bladder PDO project")
+    parser.add_argument('-s', '--samples', help='Path to sample file', default=None)
+    parser.add_argument('-g', '--genes', help='Path to genes file', default=None)
+    parser.add_argument('-c', '--copy', help='Flag to capture copy number data', action='store_true', default=False)
+    parser.add_argument('-m', '--mutation', help='Flag to capture mutation data', action='store_true', default=False)
+    parser.add_argument('-e', '--expression', help='Flag to capture transcriptomic data', action='store_true', default=False)
+    parser.add_argument('-i', '--geolink', help=".csv file that is the output of 'CNV-segfile-annotation.R'")
+    parser.add_argument('-t', '--token', help='Synapse token')
+
+    args = parser.parse_args()
+    print("Logging into Synapse")
+    PAT = args.token
+    synObject = synapseclient.login(authToken=PAT)
+    print('gene file is:')
+    print(args.genes)
+    print('sample file is :')
+    print(args.samples)
+    genes = pd.read_csv(args.genes)
+    samples = pd.read_csv(args.samples)
+
+    if args.expression:
+        get_bladder_pdo_transcriptomics(args.geolink, samples, genes).to_csv("/tmp/bladderpdo_transcriptomics.csv", index=False)
+
+    if args.mutation:
+        get_bladder_pdo_mutations(synObject, samples, genes).to_csv('/tmp/bladderpdo_mutations.csv', index=False)
+
+    if args.copy:
+        get_bladder_pdo_copynumber(synObject, samples, genes).to_csv("/tmp/bladderpdo_copynumber.csv", index=False)
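The log2 thresholds in `get_copy_call` map a copy-number value to a discrete call. A small standalone check of the boundaries, with the function body copied from the diff above (the probe values are made up):

```python
import math

def get_copy_call(a):
    """Map a copy-number value to a discrete call via log2 thresholds."""
    if a is None:
        return float('nan')
    if math.isnan(a):
        return float('nan')
    # small pseudocount avoids log2(0)
    a_val = math.log2(float(a) + 0.000001)
    if a_val < 0.5210507:
        return 'deep del'
    elif a_val < 0.7311832:
        return 'het loss'
    elif a_val < 1.214125:
        return 'diploid'
    elif a_val < 1.422233:
        return 'gain'
    else:
        return 'amp'

print(get_copy_call(1.0))  # deep del  (log2(1) ~ 0, below 0.5210507)
print(get_copy_call(2.0))  # diploid   (log2(2) = 1, inside [0.7311832, 1.214125))
print(get_copy_call(4.0))  # amp       (log2(4) = 2, at or above 1.422233)
```

Note the thresholds operate on log2 of the copy-number value itself, so a diploid value of 2 lands in the 'diploid' band as expected.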
New file (54 additions, 0 deletions)

@@ -0,0 +1,54 @@
+import synapseclient
+import pandas as pd
+import numpy as np
+import argparse
+import os
+# for testing locally
+#from utils.pubchem_retrieval import update_dataframe_and_write_tsv
+# for building in docker
+from pubchem_retrieval import update_dataframe_and_write_tsv
+
+
+def create_bladder_pdo_drugs_file(synObject, prevDrugFilepath, outputPath):
+    bladder_dir = synObject.get('syn64765430')
+    filenames = list(synObject.getChildren(parent='syn64765430', includeTypes=['file']))
+    bladder_drugs = pd.DataFrame(columns=['drugNames'])
+    # '-4' - there are 4 non-drug files in this directory.
+    for i in range(len(filenames)-4):
+        bladder_drugs.loc[i, 'drugNames'] = filenames[i]['name'].split(")")[1].split("(")[0].split(".")[0].strip()
+
+    # get unique drugs
+    newdrugnames = bladder_drugs['drugNames'].unique()
+    # use helper functions in pubchem_retrieval.py
+    alldrugs = []
+    newdrugs = []
+    if prevDrugFilepath is not None and prevDrugFilepath != "":
+        prevdrugs = [pd.read_csv(t, sep='\t') for t in prevDrugFilepath.split(',')]
+        alldrugs = pd.concat(prevdrugs).drop_duplicates()
+
+        imps = alldrugs[alldrugs.chem_name.isin(newdrugnames)]
+        newdrugs = alldrugs[alldrugs.improve_drug_id.isin(imps.improve_drug_id)]
+
+        ##write drugs
+        newdrugs.to_csv(outputPath, sep='\t', index=False)
+
+    if len(alldrugs) == 0 or len(newdrugnames) > len(set(newdrugs.improve_drug_id)):  # we have more names we didn't match
+        print('Missing drugs in existing file, querying pubchem')
+        update_dataframe_and_write_tsv(newdrugnames, outputPath)
+
+
+if __name__ == "__main__":
+
+    parser = argparse.ArgumentParser(description="This script handles downloading, processing and formatting of drug data files for the Lee Bladder PDO project")
+    parser.add_argument('-d', '--prevDrugFilePath', help='Path to a previous drug file for bladderpdo', nargs="?", default=None)
+    parser.add_argument('-o', '--outputPath', help='Output path for updated bladderpdo drug file', default="/tmp/bladderpdo_drugs.tsv")
+    parser.add_argument('-t', '--token', help='Synapse token')
+
+    args = parser.parse_args()
+    print("Logging into Synapse")
+    PAT = args.token
+    synObject = synapseclient.login(authToken=PAT)
+    if args.prevDrugFilePath:
+        previousDrugs = args.prevDrugFilePath
+    else:
+        previousDrugs = None
+    create_bladder_pdo_drugs_file(synObject, previousDrugs, args.outputPath)
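The chained split in `create_bladder_pdo_drugs_file` extracts a drug name from the Synapse file names; the exact "(N) Name (abbrev).ext" naming pattern is an assumption here, and the file name below is invented for illustration:

```python
def parse_drug_name(filename: str) -> str:
    # keep text after the first ')', then drop any trailing '(...)' and the extension
    return filename.split(")")[1].split("(")[0].split(".")[0].strip()

# hypothetical file name following the assumed "(N) Name (abbrev).ext" pattern
print(parse_drug_name("(1) Gemcitabine (GEM).xlsx"))  # Gemcitabine
```

A file name without a ')' would raise IndexError on the `[1]`, which is presumably why the loop skips the four non-drug files in the directory.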
