Skip to content

Commit 0d24385

Browse files
Merge pull request #11 from KeithKelleher/main
update pounce loading code
2 parents b29bf46 + f7da460 commit 0d24385

20 files changed

+697
-77
lines changed

requirements.txt

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,4 +18,6 @@ GEOparse
1818

1919
graphene
2020
Flask
21-
pyyaml
21+
pyyaml
22+
23+
networkx

src/constants.py

Lines changed: 116 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,8 @@ class DataSourceName(SimpleEnum):
1414
TargetGraphNCBI = "Pharos 2.0 CSV (NCBI)"
1515
ChEMBL = "ChEMBL"
1616
CCLE = "Cancer Cell Line Encyclopedia (CCLE)"
17+
Cellosaurus = "Cellosaurus"
18+
CLO = "Cell Line Ontology (CLO)"
1719
GO = "Gene Ontology (GO)"
1820
DrugCentral = "DrugCentral"
1921
JensenLab = "JensenLab"
@@ -110,4 +112,117 @@ class Prefix(SimpleEnum):
110112
SLP = "SLP"
111113
TranscriptSymbol = "Transcript"
112114
Vega = "Vega"
113-
Wikidata = "Wikidata"
115+
Wikidata = "Wikidata"
116+
117+
# cell line data sources
118+
# 4DN = "4DN"
119+
Cellosaurus = "Cellosaurus"
120+
CCLE_ID = "CCLE_ID"
121+
Abcam = "Abcam"
122+
ABCD = "ABCD"
123+
Abeomics = "Abeomics"
124+
ABM = "ABM"
125+
AddexBio = "AddexBio"
126+
ArrayExpress = "ArrayExpress"
127+
ATCC = "ATCC"
128+
BCGO = "BCGO"
129+
BCRC = "BCRC"
130+
BCRJ = "BCRJ"
131+
BEI_Resources = "BEI_Resources"
132+
BioGRID_ORCS_Cell_line = "BioGRID_ORCS_Cell_line"
133+
BTO = "BTO"
134+
BioSample = "BioSample"
135+
BioSamples = "BioSamples"
136+
cancercelllines = "cancercelllines"
137+
CancerTools = "CancerTools"
138+
CBA = "CBA"
139+
CCLV = "CCLV"
140+
CCRID = "CCRID"
141+
CCTCC = "CCTCC"
142+
Cell_Biolabs = "Cell_Biolabs"
143+
Cell_Model_Passport = "Cell_Model_Passport"
144+
CGH_DB = "CGH-DB"
145+
ChEMBL_Cells = "ChEMBL-Cells"
146+
ChEMBL_Targets = "ChEMBL-Targets"
147+
CLDB = "CLDB"
148+
CLO = "CLO"
149+
CLS = "CLS"
150+
ColonAtlas = "ColonAtlas"
151+
Coriell = "Coriell"
152+
Cosmic = "Cosmic"
153+
Cosmic_CLP = "Cosmic-CLP"
154+
dbGAP = "dbGAP"
155+
dbMHC = "dbMHC"
156+
DepMap = "DepMap"
157+
DGRC = "DGRC"
158+
DiscoverX = "DiscoverX"
159+
DSHB = "DSHB"
160+
DSMZ = "DSMZ"
161+
DSMZCellDive = "DSMZCellDive"
162+
EBiSC = "EBiSC"
163+
ECACC = "ECACC"
164+
EGA = "EGA"
165+
ENCODE = "ENCODE"
166+
ESTDAB = "ESTDAB"
167+
FCDI = "FCDI"
168+
FCS_free = "FCS-free"
169+
FlyBase_Cell_line = "FlyBase_Cell_line"
170+
GDSC = "GDSC"
171+
GeneCopoeia = "GeneCopoeia"
172+
Genomeditech = "Genomeditech"
173+
GEO = "GEO"
174+
HipSci = "HipSci"
175+
Horizon_Discovery = "Horizon_Discovery"
176+
hPSCreg = "hPSCreg"
177+
Hysigen = "Hysigen"
178+
IARC_TP53 = "IARC_TP53"
179+
IBRC = "IBRC"
180+
ICLC = "ICLC"
181+
ICLDB = "ICLDB"
182+
IGRhCellID = "IGRhCellID"
183+
IGSR = "IGSR"
184+
IHW = "IHW"
185+
Imanis = "Imanis"
186+
Innoprot = "Innoprot"
187+
IPD_IMGT_HLA = "IPD-IMGT/HLA"
188+
IZSLER = "IZSLER"
189+
JCRB = "JCRB"
190+
KCB = "KCB"
191+
KCLB = "KCLB"
192+
Kerafast = "Kerafast"
193+
KYinno = "KYinno"
194+
LiGeA = "LiGeA"
195+
LIMORE = "LIMORE"
196+
LINCS_HMS = "LINCS_HMS"
197+
LINCS_LDP = "LINCS_LDP"
198+
Lonza = "Lonza"
199+
MCCL = "MCCL"
200+
MeSH = "MeSH"
201+
MetaboLights = "MetaboLights"
202+
Millipore = "Millipore"
203+
MMRRC = "MMRRC"
204+
NCBI_Iran = "NCBI_Iran"
205+
NCI_DTP = "NCI-DTP"
206+
NHCDR = "NHCDR"
207+
NIHhESC = "NIHhESC"
208+
NISES = "NISES"
209+
NRFC = "NRFC"
210+
PerkinElmer = "PerkinElmer"
211+
PharmacoDB = "PharmacoDB"
212+
PRIDE = "PRIDE"
213+
Progenetix = "Progenetix"
214+
PubChem_Cell_line = "PubChem_Cell_line"
215+
RCB = "RCB"
216+
RIKEN_BRC_EPD = "RIKEN_BRC_EPD"
217+
Rockland = "Rockland"
218+
RSCB = "RSCB"
219+
Sigma_Aldrich = "Sigma-Aldrich"
220+
SKIP = "SKIP"
221+
SKY_M_FISH_CGH = "SKY/M-FISH/CGH"
222+
SLKBase = "SLKBase"
223+
TKG = "TKG"
224+
TNGB = "TNGB"
225+
TOKU_E = "TOKU-E"
226+
Ubigene = "Ubigene"
227+
WiCell = "WiCell"
228+
Ximbio = "Ximbio"

src/core/config.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -108,9 +108,9 @@ def create_output_adapters(self) -> List[OutputAdapter]:
108108
return output_adapters
109109

110110
def create_node_adapters(self) -> List[NodeInputAdapter]:
111-
if 'input_adapters' not in self.config_dict:
112-
raise Exception('Configuration yaml files must contain at least one input adapter')
113111
node_adapters = []
112+
if 'input_adapters' not in self.config_dict:
113+
return node_adapters
114114
if 'nodes' not in self.config_dict['input_adapters']:
115115
return node_adapters
116116
config = self.config_dict['input_adapters']['nodes']
@@ -124,6 +124,8 @@ def create_node_adapters(self) -> List[NodeInputAdapter]:
124124

125125
def create_edge_adapters(self) -> List[RelationshipInputAdapter]:
126126
edge_adapters = []
127+
if 'input_adapters' not in self.config_dict:
128+
return edge_adapters
127129
if 'edges' not in self.config_dict['input_adapters']:
128130
return edge_adapters
129131
config = self.config_dict['input_adapters']['edges']

src/core/etl.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,9 +32,10 @@ def do_etl(self, testing = False):
3232
if testing:
3333
resolved_list = resolved_list[0:20000]
3434

35-
self.labeler.assign_all_labels(resolved_list)
3635

3736
for output_adapter in self.output_adapters:
37+
resolved_list = output_adapter.preprocess_objects(resolved_list)
38+
self.labeler.assign_all_labels(resolved_list)
3839
output_adapter.store(resolved_list)
3940

4041
for output_adapter in self.output_adapters:
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
import xml.etree.ElementTree as ET
2+
from typing import List
3+
from src.constants import Prefix
4+
from src.id_resolvers.sqlite_cache_resolver import SqliteCacheResolver, MatchingPair
5+
from src.models.node import EquivalentId
6+
7+
8+
class CellosaurusCellLineResolver(SqliteCacheResolver):
    """Resolve cell line identifiers using a Cellosaurus XML file.

    Every ``cell-line`` entry in the file contributes one self-match for its
    primary Cellosaurus accession plus one match per cross-reference listed
    under the "Cell line databases/resources" category.
    """

    # Path of the Cellosaurus XML file parsed by matching_ids().
    file_path: str

    def __init__(self, file_path: str, **kwargs):
        """Store the XML path, then initialise the cache resolver base.

        :param file_path: path of the Cellosaurus XML file to parse
        :param kwargs: passed through unchanged to ``SqliteCacheResolver``
        """
        # file_path is assigned before the base initialiser runs.
        # NOTE(review): presumably the base constructor may call
        # matching_ids() to populate its cache - confirm before reordering.
        self.file_path = file_path
        SqliteCacheResolver.__init__(self, **kwargs)

    def matching_ids(self) -> List[MatchingPair]:
        """Yield the ``MatchingPair`` records derived from the XML file.

        Although annotated as a list, this is a generator: pairs are produced
        lazily while walking the parsed tree. Entries that lack a primary
        accession (or whose primary accession is empty) are skipped, because
        there is no identifier to anchor their matches to.
        """
        root = ET.parse(self.file_path).getroot()
        for cell_line in root.findall('./cell-line-list/cell-line'):
            primary = cell_line.find('./accession-list/accession[@type="primary"]')
            # find() returns None when no element matches; skip instead of
            # failing with an AttributeError on ``None.text``.
            if primary is None or not primary.text:
                continue
            cellosaurus_id = EquivalentId(id=primary.text, type=Prefix.Cellosaurus).id_str()
            # Each cell line always matches itself exactly.
            yield MatchingPair(id=cellosaurus_id, match=cellosaurus_id, type='exact')
            for xref in cell_line.findall('./xref-list/xref[@category="Cell line databases/resources"]'):
                # NOTE(review): assumes Prefix.parse returns a Prefix member for
                # every database name that occurs in the file - confirm.
                prefix = Prefix.parse(xref.get('database'))
                match_id = EquivalentId(id=xref.get('accession'), type=prefix).id_str()
                yield MatchingPair(id=cellosaurus_id, match=match_id, type=prefix.value)
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
import os
2+
from abc import ABC
3+
from datetime import date, datetime
4+
from typing import List, Union
5+
6+
from src.constants import DataSourceName
7+
from src.interfaces.input_adapter import NodeInputAdapter, RelationshipInputAdapter
8+
from src.models.datasource_version_info import DatasourceVersionInfo
9+
from src.models.node import Node, Relationship
10+
from src.models.pounce.experiment import Experiment
11+
12+
13+
class CCLEInputAdapter(NodeInputAdapter, RelationshipInputAdapter, ABC):
    """Shared base for Cancer Cell Line Encyclopedia (CCLE) input adapters.

    Combines the node and relationship adapter interfaces and supplies the
    data source name, version info and experiment name used for CCLE data.
    """

    def __init__(self):
        # Both adapter bases are initialised explicitly, node adapter first.
        NodeInputAdapter.__init__(self)
        RelationshipInputAdapter.__init__(self)

    def get_datasource_name(self) -> DataSourceName:
        """Return the data source enum member for CCLE."""
        return DataSourceName.CCLE

    def get_version(self) -> DatasourceVersionInfo:
        """Return version info carrying only the version label."""
        version_info = DatasourceVersionInfo(version="CCLE 2019")
        return version_info

    def get_experiment_name(self):
        """Return the experiment name: data source value, " - ", version label."""
        source_label = self.get_datasource_name().value
        version_label = self.get_version().version
        return f"{source_label} - {version_label}"
28+
29+
class CCLEFileInputAdapter(CCLEInputAdapter, ABC):
    """CCLE adapter base that reads its data from a file on disk.

    The file's last-modification time, reduced to a calendar date, is
    reported as the download date in the version info.
    """

    # Path of the CCLE source file.
    file_path: str
    # Date part of the source file's modification time.
    download_date: date

    def __init__(self, file_path: str):
        """Remember the file path and derive the download date from its mtime.

        :param file_path: path of the CCLE source file
        """
        self.file_path = file_path
        modified_at = datetime.fromtimestamp(os.path.getmtime(file_path))
        self.download_date = modified_at.date()
        CCLEInputAdapter.__init__(self)

    def get_version(self) -> DatasourceVersionInfo:
        """Return version info with the version label and the download date."""
        version_info = DatasourceVersionInfo(
            version="CCLE 2019",
            download_date=self.download_date,
        )
        return version_info
43+
44+
45+
class ExperimentAdapter(CCLEInputAdapter):
    """Adapter that emits the single Experiment node describing CCLE."""

    def get_all(self) -> List[Union[Node, Relationship]]:
        """Return a one-element list holding the CCLE ``Experiment``.

        The experiment id is the adapter's experiment name and its name is
        the data source's display value.
        """
        experiment_id = self.get_experiment_name()
        experiment_name = self.get_datasource_name().value
        ccle_experiment = Experiment(
            id=experiment_id,
            name=experiment_name,
            type='RNA-seq',
            description='The Cancer Cell Line Encyclopedia (CCLE) project started in 2008 as a collaboration between the Broad Institute, and the Novartis Institutes for Biomedical Research and its Genomics Institute of the Novartis Research Foundation. The goal is to conduct a detailed genetic and pharmacologic characterization of a large panel of human cancer models, to develop integrated computational analyses that link distinct pharmacologic vulnerabilities to genomic patterns and to translate cell line integrative genomics into cancer patient stratification. Later the MD Anderson and Harvard Medical school joined the project. As of summer of 2018 CCLE continues its efforts as part of the Broad Cancer Dependency Map Project.',
            category="in vitro",
        )
        return [ccle_experiment]

0 commit comments

Comments
 (0)