ncats
diff --git a/‎requirements.txt
Lines changed: 3 additions & 1 deletion b/‎requirements.txt
Lines changed: 3 additions & 1 deletion
diff --git a/‎src/constants.py
Lines changed: 116 additions & 1 deletion b/‎src/constants.py
Lines changed: 116 additions & 1 deletion
diff --git a/‎src/core/config.py
Lines changed: 4 additions & 2 deletions b/‎src/core/config.py
Lines changed: 4 additions & 2 deletions
diff --git a/‎src/core/etl.py
Lines changed: 2 additions & 1 deletion b/‎src/core/etl.py
Lines changed: 2 additions & 1 deletion
diff --git a/‎src/id_resolvers/cell_line_resolver.py
Lines changed: 27 additions & 0 deletions b/‎src/id_resolvers/cell_line_resolver.py
Lines changed: 27 additions & 0 deletions
diff --git a/‎src/input_adapters/ccle/experiment_and_project.py
Lines changed: 53 additions & 0 deletions b/‎src/input_adapters/ccle/experiment_and_project.py
Lines changed: 53 additions & 0 deletions
@@ -18,4 +18,6 @@ GEOparse
 
 graphene
 Flask
-pyyaml
+pyyaml
+
+networkx
@@ -14,6 +14,8 @@ class DataSourceName(SimpleEnum):
     TargetGraphNCBI = "Pharos 2.0 CSV (NCBI)"
     ChEMBL = "ChEMBL"
     CCLE = "Cancer Cell Line Encyclopedia (CCLE)"
+    Cellosaurus = "Cellosaurus"
+    CLO = "Cell Line Ontology (CLO)"
     GO = "Gene Ontology (GO)"
     DrugCentral = "DrugCentral"
     JensenLab = "JensenLab"
@@ -110,4 +112,117 @@ class Prefix(SimpleEnum):
     SLP = "SLP"
     TranscriptSymbol = "Transcript"
     Vega = "Vega"
-    Wikidata = "Wikidata"
+    Wikidata = "Wikidata"
+
+    # cell line data sources
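+    # 4DN = "4DN"  # NOTE(review): presumably disabled because a Python identifier cannot start with a digit - confirm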
+    Cellosaurus = "Cellosaurus"
+    CCLE_ID = "CCLE_ID"
+    Abcam = "Abcam"
+    ABCD = "ABCD"
+    Abeomics = "Abeomics"
+    ABM = "ABM"
+    AddexBio = "AddexBio"
+    ArrayExpress = "ArrayExpress"
+    ATCC = "ATCC"
+    BCGO = "BCGO"
+    BCRC = "BCRC"
+    BCRJ = "BCRJ"
+    BEI_Resources = "BEI_Resources"
+    BioGRID_ORCS_Cell_line = "BioGRID_ORCS_Cell_line"
+    BTO = "BTO"
+    BioSample = "BioSample"
+    BioSamples = "BioSamples"
+    cancercelllines = "cancercelllines"
+    CancerTools = "CancerTools"
+    CBA = "CBA"
+    CCLV = "CCLV"
+    CCRID = "CCRID"
+    CCTCC = "CCTCC"
+    Cell_Biolabs = "Cell_Biolabs"
+    Cell_Model_Passport = "Cell_Model_Passport"
+    CGH_DB = "CGH-DB"
+    ChEMBL_Cells = "ChEMBL-Cells"
+    ChEMBL_Targets = "ChEMBL-Targets"
+    CLDB = "CLDB"
+    CLO = "CLO"
+    CLS = "CLS"
+    ColonAtlas = "ColonAtlas"
+    Coriell = "Coriell"
+    Cosmic = "Cosmic"
+    Cosmic_CLP = "Cosmic-CLP"
+    dbGAP = "dbGAP"
+    dbMHC = "dbMHC"
+    DepMap = "DepMap"
+    DGRC = "DGRC"
+    DiscoverX = "DiscoverX"
+    DSHB = "DSHB"
+    DSMZ = "DSMZ"
+    DSMZCellDive = "DSMZCellDive"
+    EBiSC = "EBiSC"
+    ECACC = "ECACC"
+    EGA = "EGA"
+    ENCODE = "ENCODE"
+    ESTDAB = "ESTDAB"
+    FCDI = "FCDI"
+    FCS_free = "FCS-free"
+    FlyBase_Cell_line = "FlyBase_Cell_line"
+    GDSC = "GDSC"
+    GeneCopoeia = "GeneCopoeia"
+    Genomeditech = "Genomeditech"
+    GEO = "GEO"
+    HipSci = "HipSci"
+    Horizon_Discovery = "Horizon_Discovery"
+    hPSCreg = "hPSCreg"
+    Hysigen = "Hysigen"
+    IARC_TP53 = "IARC_TP53"
+    IBRC = "IBRC"
+    ICLC = "ICLC"
+    ICLDB = "ICLDB"
+    IGRhCellID = "IGRhCellID"
+    IGSR = "IGSR"
+    IHW = "IHW"
+    Imanis = "Imanis"
+    Innoprot = "Innoprot"
+    IPD_IMGT_HLA = "IPD-IMGT/HLA"
+    IZSLER = "IZSLER"
+    JCRB = "JCRB"
+    KCB = "KCB"
+    KCLB = "KCLB"
+    Kerafast = "Kerafast"
+    KYinno = "KYinno"
+    LiGeA = "LiGeA"
+    LIMORE = "LIMORE"
+    LINCS_HMS = "LINCS_HMS"
+    LINCS_LDP = "LINCS_LDP"
+    Lonza = "Lonza"
+    MCCL = "MCCL"
+    MeSH = "MeSH"
+    MetaboLights = "MetaboLights"
+    Millipore = "Millipore"
+    MMRRC = "MMRRC"
+    NCBI_Iran = "NCBI_Iran"
+    NCI_DTP = "NCI-DTP"
+    NHCDR = "NHCDR"
+    NIHhESC = "NIHhESC"
+    NISES = "NISES"
+    NRFC = "NRFC"
+    PerkinElmer = "PerkinElmer"
+    PharmacoDB = "PharmacoDB"
+    PRIDE = "PRIDE"
+    Progenetix = "Progenetix"
+    PubChem_Cell_line = "PubChem_Cell_line"
+    RCB = "RCB"
+    RIKEN_BRC_EPD = "RIKEN_BRC_EPD"
+    Rockland = "Rockland"
+    RSCB = "RSCB"
+    Sigma_Aldrich = "Sigma-Aldrich"
+    SKIP = "SKIP"
+    SKY_M_FISH_CGH = "SKY/M-FISH/CGH"
+    SLKBase = "SLKBase"
+    TKG = "TKG"
+    TNGB = "TNGB"
+    TOKU_E = "TOKU-E"
+    Ubigene = "Ubigene"
+    WiCell = "WiCell"
+    Ximbio = "Ximbio"
@@ -108,9 +108,9 @@ def create_output_adapters(self) -> List[OutputAdapter]:
         return output_adapters
 
     def create_node_adapters(self) -> List[NodeInputAdapter]:
-        if 'input_adapters' not in self.config_dict:
-            raise Exception('Configuration yaml files must contain at least one input adapter')
         node_adapters = []
+        if 'input_adapters' not in self.config_dict:
+            return node_adapters
         if 'nodes' not in self.config_dict['input_adapters']:
             return node_adapters
         config = self.config_dict['input_adapters']['nodes']
@@ -124,6 +124,8 @@ def create_node_adapters(self) -> List[NodeInputAdapter]:
 
     def create_edge_adapters(self) -> List[RelationshipInputAdapter]:
         edge_adapters = []
+        if 'input_adapters' not in self.config_dict:
+            return edge_adapters
         if 'edges' not in self.config_dict['input_adapters']:
             return edge_adapters
         config = self.config_dict['input_adapters']['edges']
 
@@ -32,9 +32,10 @@ def do_etl(self, testing = False):
             if testing:
                 resolved_list = resolved_list[0:20000]
 
-            self.labeler.assign_all_labels(resolved_list)
 
             for output_adapter in self.output_adapters:
+                resolved_list = output_adapter.preprocess_objects(resolved_list)
+                self.labeler.assign_all_labels(resolved_list)
                 output_adapter.store(resolved_list)
 
         for output_adapter in self.output_adapters:
 
@@ -0,0 +1,27 @@
+import xml.etree.ElementTree as ET
+from typing import List
+from src.constants import Prefix
+from src.id_resolvers.sqlite_cache_resolver import SqliteCacheResolver, MatchingPair
+from src.models.node import EquivalentId
+
+
+class CellosaurusCellLineResolver(SqliteCacheResolver):
+    """Supplies Cellosaurus accession / cross-reference ID pairs parsed from an XML file."""
+    file_path: str  # path of the XML file parsed by matching_ids()
+
+    def __init__(self, file_path: str, **kwargs):
+        self.file_path = file_path
+        SqliteCacheResolver.__init__(self, **kwargs)
+
+    def matching_ids(self) -> List[MatchingPair]:
+        """Yield an 'exact' self-match per cell line, then one pair per cell-line database xref."""
+        xref_query = './xref-list/xref[@category="Cell line databases/resources"]'
+        for entry in ET.parse(self.file_path).getroot().findall('./cell-line-list/cell-line'):
+            primary = entry.find('./accession-list/accession[@type="primary"]').text
+            cell_line_id = EquivalentId(id=primary, type=Prefix.Cellosaurus).id_str()
+            yield MatchingPair(id=cell_line_id, match=cell_line_id, type='exact')
+            for xref in entry.findall(xref_query):
+                source = Prefix.parse(xref.get('database'))
+                # the pair's type is the source database's prefix value, not 'exact'
+                xref_id = EquivalentId(id=xref.get('accession'), type=source).id_str()
+                yield MatchingPair(id=cell_line_id, match=xref_id, type=source.value)
@@ -0,0 +1,53 @@
+import os
+from abc import ABC
+from datetime import date, datetime
+from typing import List, Union
+
+from src.constants import DataSourceName
+from src.interfaces.input_adapter import NodeInputAdapter, RelationshipInputAdapter
+from src.models.datasource_version_info import DatasourceVersionInfo
+from src.models.node import Node, Relationship
+from src.models.pounce.experiment import Experiment
+
+
+class CCLEInputAdapter(NodeInputAdapter, RelationshipInputAdapter, ABC):
+    """Common base for CCLE adapters: fixes the data source, version and experiment name."""
+    def __init__(self):
+        # Both parent initialisers are invoked explicitly, node adapter first.
+        NodeInputAdapter.__init__(self)
+        RelationshipInputAdapter.__init__(self)
+
+    def get_datasource_name(self) -> DataSourceName:
+        return DataSourceName.CCLE
+
+    def get_version(self) -> DatasourceVersionInfo:
+        return DatasourceVersionInfo(version="CCLE 2019")
+
+    def get_experiment_name(self):
+        return f"{self.get_datasource_name().value} - {self.get_version().version}"
+
+class CCLEFileInputAdapter(CCLEInputAdapter, ABC):
+    """CCLE adapter backed by a local file; the file's mtime date is used as the download date."""
+    file_path: str
+    download_date: date
+
+    def __init__(self, file_path: str):
+        self.file_path = file_path
+        modified_at = datetime.fromtimestamp(os.path.getmtime(file_path))
+        self.download_date = modified_at.date()
+        CCLEInputAdapter.__init__(self)
+
+    def get_version(self) -> DatasourceVersionInfo:
+        version_info = DatasourceVersionInfo(version="CCLE 2019", download_date=self.download_date)
+        return version_info
+
+
+class ExperimentAdapter(CCLEInputAdapter):
+    """Produces the single Experiment object that describes the CCLE data set."""
+    def get_all(self) -> List[Union[Node, Relationship]]:
+        ccle_experiment = Experiment(
+            id=self.get_experiment_name(), name=self.get_datasource_name().value,
+            type='RNA-seq',
+            description='The Cancer Cell Line Encyclopedia (CCLE) project started in 2008 as a collaboration between the Broad Institute, and the Novartis Institutes for Biomedical Research and its Genomics Institute of the Novartis Research Foundation. The goal is to conduct a detailed genetic and pharmacologic characterization of a large panel of human cancer models, to develop integrated computational analyses that link distinct pharmacologic vulnerabilities to genomic patterns and to translate cell line integrative genomics into cancer patient stratification. Later the MD Anderson and Harvard Medical school joined the project. As of summer of 2018 CCLE continues its efforts as part of the Broad Cancer Dependency Map Project.',
+            category="in vitro")
+        return [ccle_experiment]