From 26099d9efa855342cb0a5f7b711aa7e2b3cba4ed Mon Sep 17 00:00:00 2001 From: lspindler2509 Date: Mon, 18 Mar 2024 09:54:35 +0100 Subject: [PATCH 01/32] added parsers from other repo --- nedrexdb/db/parsers/cosmic.py | 226 +++++++++++++++++++++++++++++++++ nedrexdb/db/parsers/intogen.py | 106 ++++++++++++++++ nedrexdb/db/parsers/ncg.py | 95 ++++++++++++++ 3 files changed, 427 insertions(+) create mode 100644 nedrexdb/db/parsers/cosmic.py create mode 100644 nedrexdb/db/parsers/intogen.py create mode 100644 nedrexdb/db/parsers/ncg.py diff --git a/nedrexdb/db/parsers/cosmic.py b/nedrexdb/db/parsers/cosmic.py new file mode 100644 index 0000000..7570c08 --- /dev/null +++ b/nedrexdb/db/parsers/cosmic.py @@ -0,0 +1,226 @@ +import gzip as _gzip +from csv import DictReader as _DictReader +from pathlib import Path as _Path + +from more_itertools import chunked as _chunked +from tqdm import tqdm as _tqdm + +from nedrexdb.db import MongoInstance +from nedrexdb.db.models.edges.variant_affects_gene import VariantAffectsGene +from nedrexdb.db.models.edges.variant_associated_with_disorder import VariantAssociatedWithDisorder +from nedrexdb.db.models.nodes.gene import Gene +from nedrexdb.db.models.nodes.genomic_variant import GenomicVariant +from nedrexdb.db.parsers import _get_file_location_factory +from nedrexdb.logger import logger + +get_file_location = _get_file_location_factory("cosmic") +get_clinvar_file_location = _get_file_location_factory("clinvar") + + +# g_dot_re = re.compile("(..?):g\.(\d*)_?(\d*)(.*)") + + +def get_gdot2clinvar(fname: str) -> dict[str, str]: + from nedrexdb.db.parsers.clinvar import ClinVarVCFParser + vcf_parser = ClinVarVCFParser(fname) + gdot2clinvar = {} + for row in vcf_parser.iter_rows(): + full_gdot = row['INFO'].get('CLNHGVS') + if full_gdot: + _, gdot = row['INFO']['CLNHGVS'].split(':', maxsplit=1) + gdot2clinvar[f"{row['CHROM']}:{gdot}"] = f"clinvar.{row['ID']}" + return gdot2clinvar + + +def get_cancer2mondo(mapping_fname: _Path) -> dict[tuple: str]: + mapping_columns = ['SITE_PRIMARY_COSMIC', 'SITE_SUBTYPE1_COSMIC', + 'SITE_SUBTYPE2_COSMIC', 'SITE_SUBTYPE3_COSMIC', 'HISTOLOGY_COSMIC', + 'HIST_SUBTYPE1_COSMIC', 'HIST_SUBTYPE2_COSMIC', 'HIST_SUBTYPE3_COSMIC'] + cancer2mondo = {} + with open(mapping_fname, newline='') as mapping_file: + reader = _DictReader(mapping_file, delimiter="\t") + cancer2mondo = {tuple( + row[column] for column in mapping_columns): row['mapped_curie'] for row in reader} + return cancer2mondo + + +class COSMICRow: + def __init__(self, row): + self._row = row + + def get_HGVSG(self): + return self._row["HGVSG"] + + def get_COSMIC(self): + return f"cosmic.{self._row['GENOMIC_MUTATION_ID']}" + + def get_symbol(self): + return self._row["Gene name"] + + def get_cancer_tuple(self) -> tuple: + return tuple( + self._row[column] for column in ['Primary site', 'Site subtype 1', 'Site subtype 2', 'Site subtype 3', + 'Primary histology', 'Histology subtype 1', 'Histology subtype 2', + 'Histology subtype 3']) + + def get_mutation_status(self): + return self._row['Mutation somatic status'] + + # def get_variant(self, gdot2clinvar) -> GenomicVariant: + # match = g_dot_re.search(self._row["HGVSG"]) + + # if variant_id: + # return GenomicVariant(primaryDomainId=variant_id, domainIds=[cosmic_id], dataSources=['COSMIC']) + # if match: + # chrom, pos_start, pos_end, mut = match.group(1, 2, 3, 4) + # pos_start = int(pos_start) + # if pos_end: pos_end = int(pos_end) + # if mut == 'del': + # pos_start -= 1 + # if not pos_end: + # pos_end = pos_start + 1 + # genomic_variants = chr_pos_type2id[(chrom, pos_start, 'Deletion')] + # variant = [variant for variant in genomic_variants if + # len(variant["referenceSequence"]) == pos_end + 1 - pos_start] + # elif mut == "dup": + # genomic_variants = chr_pos_type2id[(chrom, pos_start, 'Duplication')] + # variant = [variant for variant in genomic_variants if + # len(variant["alternativeSequence"]) == pos_end + 2 - pos_start] + # if variant: + # breakpoint() + # elif mut.startswith('ins'): + # insertion = mut.strip('ins') + # genomic_variants = chr_pos_type2id[(chrom, pos_start, 'Insertion')] + # variant = [variant for variant in genomic_variants if + # variant["referenceSequence"] + insertion == variant["alternativeSequence"]] + # elif mut.startswith('delins'): + # insertion = mut.strip('delins') + # if not pos_end: + # pos_end = pos_start + 1 + # genomic_variants = chr_pos_type2id[(chrom, pos_start, 'Indel')] + # variant = [variant for variant in genomic_variants if + # insertion == variant["alternativeSequence"] and + # len(variant["referenceSequence"]) == pos_end + 1 - pos_start] + # else: + # if '>' not in mut: + # breakpoint() + # mut_from, mut_to = mut.split('>', 1) + # genomic_variants = chr_pos_type2id[(chrom, pos_start, 'Single Nucleotide Variant')] + # variant = [variant for variant in genomic_variants if + # variant['referenceSequence'] == mut_from and + # variant['alternativeSequence'] == mut_to] + # assert len(variant) <= 1, f"More than one matching variant found in Nedrex for {self._row['HGVSG']}" + # if variant: + # variant = variant[0] + # variant["domainIds"].append(cosmic_id) + # return GenomicVariant(**variant) + # return None + + def parse(self, gdot2clinvar: dict[str, str], symbol2entrez: dict[str, str], cancer2mondo: dict[tuple, str]) -> \ + tuple[GenomicVariant, VariantAffectsGene, VariantAssociatedWithDisorder]: + variant_id = gdot2clinvar.get(self.get_HGVSG()) + genomic_variant = None + variant_gene = None + variant_disorder = None + if variant_id: + cosmic_id = self.get_COSMIC() + asserted_by = ["cosmic"] + genomic_variant = GenomicVariant(primaryDomainId=variant_id, domainIds=[ + cosmic_id], dataSources=asserted_by) + # data_update = genomic_variant.generate_dataSource_update() + # MongoInstance.DB[GenomicVariant.collection_name].bulk_write([data_update]) + gene_id = symbol2entrez[self.get_symbol()] + variant_gene = VariantAffectsGene(sourceDomainId=variant_id, targetDomainId=gene_id, + dataSources=asserted_by) + mondo_id = cancer2mondo.get(self.get_cancer_tuple()) + # if mondo_id: + variant_disorder = VariantAssociatedWithDisorder(accession=cosmic_id, dataSources=asserted_by, + sourceDomainId=variant_id, + targetDomainId=mondo_id, + reviewStatus=self.get_mutation_status()) + # else: + # variant_disorder = None + + return genomic_variant, variant_gene, variant_disorder + + +class COSMICParser: + COLUMN_NAMES = ['HGVSG', 'Gene name', 'GENOMIC_MUTATION_ID', 'Mutation somatic status', 'Primary site', + 'Site subtype 1', 'Site subtype 2', + 'Site subtype 3', + 'Primary histology', 'Histology subtype 1', 'Histology subtype 2', + 'Histology subtype 3'] + + def __init__(self, f: _Path): + self.f = f + + if self.f.name.endswith(".gz") or self.f.name.endswith(".gzip"): + self.gzipped = True + else: + self.gzipped = False + + def parse(self, mapping_fname: _Path): + if self.gzipped: + f = _gzip.open(self.f, "rt") + else: + f = self.f.open() + + reader = _DictReader(f, delimiter="\t") + f_dict = [{column: row[column] + for column in self.COLUMN_NAMES} for row in reader] + f.close() + + all_symbols = {row['Gene name'] for row in f_dict} + symbol2entrez = {gene["approvedSymbol"]: gene["primaryDomainId"] for gene in + Gene.find(MongoInstance.DB, {"approvedSymbol": {"$in": list(all_symbols)}})} + non_approved_symbols = all_symbols - symbol2entrez.keys() + for symbol in non_approved_symbols: + genes = [gene["primaryDomainId"] + for gene in Gene.find(MongoInstance.DB, {"symbols": symbol})] + assert len(genes) == 1, f"Multiple genes found for the symbol { + symbol}" + symbol2entrez.update({symbol: genes[0]}) + assert not (non_approved_symbols - symbol2entrez.keys()), \ + f"Not all symbols could be mapped: { + non_approved_symbols - symbol2entrez.keys()}" + + # id2genomic_variant = {genomic_variant['primaryDomainId']: genomic_variant for genomic_variant in + # GenomicVariant.find(MongoInstance.DB)} + # chr_pos_type2id = defaultdict(list) + # this_id = + # genomic_variant['primaryDomainId'], genomic_variant + # chr_pos_type2id[ + # (genomic_variant['chromosome'], genomic_variant['position'], genomic_variant['variantType'])].append( + # genomic_variant) + gdot2clinvar = get_gdot2clinvar( + get_clinvar_file_location("human_data")) + cancer2mondo = get_cancer2mondo(mapping_fname) + + updates = (COSMICRow(row).parse( + gdot2clinvar, symbol2entrez, cancer2mondo) for row in f_dict) + for chunk in _tqdm(_chunked(updates, 10_000), leave=False, desc="Parsing COSMIC"): + if not chunk: + continue + genomic_variant_updates, variant_gene_updates, variant_disorder_updates = [], [], [] + for genomic_variant, variant_gene, variant_disorder in chunk: + if genomic_variant: + genomic_variant_updates.append( + genomic_variant.generate_update()) + variant_gene_updates.append(variant_gene.generate_update()) + if variant_disorder: + variant_disorder_updates.append( + variant_disorder.generate_update()) + + for this_collection_name, these_updates in zip([GenomicVariant.collection_name, VariantAffectsGene.collection_name, VariantAssociatedWithDisorder.collection_name], + [genomic_variant_updates, variant_gene_updates, variant_disorder_updates]): + bulk_write_results = MongoInstance.DB[this_collection_name].bulk_write( + these_updates) + if bulk_write_results.bulk_api_result['writeErrors'] or bulk_write_results.bulk_api_result['writeConcernErrors']: + print(bulk_write_results.bulk_api_result) + + +def parse_gene_disease_associations(): + logger.info("Parsing COSMIC") + fname = get_file_location("census") + mapping_fname = get_file_location("mapping") + COSMICParser(fname).parse(mapping_fname) diff --git a/nedrexdb/db/parsers/intogen.py b/nedrexdb/db/parsers/intogen.py new file mode 100644 index 0000000..d2cdbbf --- /dev/null +++ b/nedrexdb/db/parsers/intogen.py @@ -0,0 +1,106 @@ +import gzip as _gzip +from csv import DictReader as _DictReader +from itertools import chain as _chain +from pathlib import Path as _Path + +import requests +from more_itertools import chunked as _chunked +from tqdm import tqdm as _tqdm + +from nedrexdb.db import MongoInstance +from nedrexdb.db.models.edges.gene_associated_with_disorder import GeneAssociatedWithDisorder +from nedrexdb.db.models.nodes.gene import Gene +from nedrexdb.db.parsers import _get_file_location_factory + +get_file_location = _get_file_location_factory("intogen") + + +def biomart_symbol_transcript_to_entrez(symbol_list: list[str], filter_by: str = "hgnc_symbol", batch_size: int = 100): + import xml.etree.ElementTree as ET + query = ET.Element("Query", virtualSchemaName="default", formatter="CSV", header="0", uniqueRows="0", count="", + datasetConfigVersion="0.6") + dataset = ET.SubElement( + query, "Dataset", name="hsapiens_gene_ensembl", interface="default") + ET.SubElement(dataset, "Filter", name=filter_by, value="{tr_ids}") + ET.SubElement(dataset, "Attribute", name=filter_by) + ET.SubElement(dataset, "Attribute", name="entrezgene_id") + tree = ET.ElementTree(query) + xml_string = f'{ + ET.tostring(tree.getroot(), encoding="unicode")}' + + symbol2entrez: dict[str, str] = dict() + for i in range(0, len(symbol_list), batch_size): + response = requests.get( + f'http://www.ensembl.org/biomart/martservice?query={format(xml_string.format(tr_ids=",".join(symbol_list[i:i + batch_size])))}') + response.raise_for_status() + symbol2entrez.update(dict(row.split(',', 1) + for row in response.content.decode('utf-8').splitlines())) + return symbol2entrez + + +class IntOGenRow: + def __init__(self, row): + self._row = row + + def parse(self, intogen2mondo: dict[str, list[str]], symbol2entrez: dict[str, str]) -> list[ + GeneAssociatedWithDisorder]: + sourceDomainId = symbol2entrez[self._row["SYMBOL"]] + asserted_by = ["intogen"] + disorders = intogen2mondo[self._row["CANCER_TYPE"]] + + gawds = [ + GeneAssociatedWithDisorder( + sourceDomainId=sourceDomainId, targetDomainId=disorder.replace( + "MONDO:", "mondo."), + dataSources=asserted_by + ) + for disorder in disorders + ] + + return gawds + + +class IntOGenParser: + def __init__(self, f: _Path, mapping: _Path): + self.f = f + + if self.f.name.endswith(".gz") or self.f.name.endswith(".gzip"): + self.gzipped = True + else: + self.gzipped = False + + import json + self.intogen2mondo = json.load(open(mapping))["mondo_id"] + + def parse(self): + if self.gzipped: + f = _gzip.open(self.f, "rt") + else: + f = self.f.open() + + reader = _DictReader(f, delimiter="\t") + f_dict = [{"SYMBOL": row['SYMBOL'], "CANCER_TYPE": row['CANCER_TYPE']} + for row in reader] + + symbol2entrez = {gene["approvedSymbol"]: gene["primaryDomainId"] + for gene in Gene.find(MongoInstance.DB)} + + updates = (IntOGenRow(row).parse(self.intogen2mondo, symbol2entrez) + for row in f_dict) + for chunk in _tqdm(_chunked(updates, 1_000), leave=False, desc="Parsing IntOGen"): + chunk = list(_chain(*chunk)) + chunk = [gawd.generate_update() for gawd in chunk] + + if not chunk: + continue + + MongoInstance.DB[GeneAssociatedWithDisorder.collection_name].bulk_write( + chunk) + + f.close() + + +def parse_gene_disease_associations(): + fname = get_file_location("drivers") + mapping_fname = get_file_location("mapping") + IntOGenParser(fname, mapping_fname).parse() diff --git a/nedrexdb/db/parsers/ncg.py b/nedrexdb/db/parsers/ncg.py new file mode 100644 index 0000000..23c9f1e --- /dev/null +++ b/nedrexdb/db/parsers/ncg.py @@ -0,0 +1,95 @@ +import gzip as _gzip +from collections import defaultdict as _defaultdict +from csv import DictReader as _DictReader +from itertools import chain as _chain +from pathlib import Path as _Path + +from more_itertools import chunked as _chunked +from tqdm import tqdm as _tqdm + +from nedrexdb.db import MongoInstance +from nedrexdb.db.models.edges.gene_associated_with_disorder import GeneAssociatedWithDisorder +from nedrexdb.db.models.nodes.disorder import Disorder +from nedrexdb.db.models.nodes.gene import Gene +from nedrexdb.db.parsers import _get_file_location_factory + +get_file_location = _get_file_location_factory("ncg") + + +def _umls_to_nedrex_map() -> dict[str, list[str]]: + d = _defaultdict(list) + + for dis in Disorder.find(MongoInstance.DB): + umls_ids = [acc for acc in dis["domainIds"] if acc.startswith("umls.")] + for umls_id in umls_ids: + d[umls_id].append(dis["primaryDomainId"]) + + return d + + +class NCGRow: + def __init__(self, row): + self._row = row + + def get_gene_id(self): + return f"entrez.{self._row['entrez'].strip()}" + + def parse(self, ncg2mondo: dict[str, list[str]]) -> list[GeneAssociatedWithDisorder]: + sourceDomainId = self.get_gene_id() + asserted_by = ["ncg"] + disorders = ncg2mondo[self._row["cancer_type"]] + + gawds = [ + GeneAssociatedWithDisorder( + sourceDomainId=sourceDomainId, targetDomainId=disorder.replace( + "MONDO:", "mondo."), + dataSources=asserted_by + ) + for disorder in disorders + ] + + return gawds + + +class NCGParser: + def __init__(self, f: _Path, mapping: _Path): + self.f = f + + if self.f.name.endswith(".gz") or self.f.name.endswith(".gzip"): + self.gzipped = True + else: + self.gzipped = False + + import json + self.ncg2mondo = json.load(open(mapping))["mondo_id"] + + def parse(self): + if self.gzipped: + f = _gzip.open(self.f, "rt") + else: + f = self.f.open() + + reader = _DictReader(f, delimiter="\t") + + genes = {gene["primaryDomainId"] + for gene in Gene.find(MongoInstance.DB)} + + updates = (NCGRow(row).parse(self.ncg2mondo) for row in reader) + for chunk in _tqdm(_chunked(updates, 1_000), leave=False, desc="Parsing NCG"): + chunk = list(_chain(*chunk)) + chunk = [gawd.generate_update() + for gawd in chunk if gawd.sourceDomainId in genes] + + if not chunk: + continue + + MongoInstance.DB[GeneAssociatedWithDisorder.collection_name].bulk_write( + chunk) + + f.close() + + +def parse_gene_disease_associations(): + fname = get_file_location("annotation") + mapping_fname = get_file_location("mapping") + NCGParser(fname, mapping_fname).parse() From 4b345319c5cc0f47f1b7b2ebbecdd041c24ac309 Mon Sep 17 00:00:00 2001 From: lspindler2509 Date: Mon, 18 Mar 2024 11:24:59 +0100 Subject: [PATCH 02/32] add new parsers to build.py --- build.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/build.py b/build.py index dd379b3..da862ef 100755 --- a/build.py +++ b/build.py @@ -32,6 +32,9 @@ sider, uberon, repotrial, + cosmic, + ncg, + intogen ) from nedrexdb.post_integration import trim_uberon, drop_empty_collections @@ -99,6 +102,10 @@ def update(conf, download): iid.parse_ppis() intact.parse() + cosmic.parse_gene_disease_associations() + intogen.parse_gene_disease_associations() + ncg.parse_gene_disease_associations() + if version == "licensed": omim.parse_gene_disease_associations() From 259afccc09523495f7d77fcd85ae92a9f2ad4ff5 Mon Sep 17 00:00:00 2001 From: lspindler2509 Date: Mon, 18 Mar 2024 14:34:08 +0100 Subject: [PATCH 03/32] different formatting --- nedrexdb/db/parsers/cosmic.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/nedrexdb/db/parsers/cosmic.py b/nedrexdb/db/parsers/cosmic.py index 7570c08..5448112 100644 --- a/nedrexdb/db/parsers/cosmic.py +++ b/nedrexdb/db/parsers/cosmic.py @@ -177,8 +177,7 @@ def parse(self, mapping_fname: _Path): for symbol in non_approved_symbols: genes = [gene["primaryDomainId"] for gene in Gene.find(MongoInstance.DB, {"symbols": symbol})] - assert len(genes) == 1, f"Multiple genes found for the symbol { - symbol}" + assert len(genes) == 1, f"Multiple genes found for the symbol {symbol}" symbol2entrez.update({symbol: genes[0]}) assert not (non_approved_symbols - symbol2entrez.keys()), \ f"Not all symbols could be mapped: { From ba3710e23fd5128f9fcb745a3a8ffafeb394b96b Mon Sep 17 00:00:00 2001 From: lspindler2509 Date: Mon, 18 Mar 2024 14:35:02 +0100 Subject: [PATCH 04/32] different formatting --- nedrexdb/db/parsers/cosmic.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/nedrexdb/db/parsers/cosmic.py b/nedrexdb/db/parsers/cosmic.py index 5448112..8993e34 100644 --- a/nedrexdb/db/parsers/cosmic.py +++ b/nedrexdb/db/parsers/cosmic.py @@ -180,8 +180,7 @@ def parse(self, mapping_fname: _Path): assert len(genes) == 1, f"Multiple genes found for the symbol {symbol}" symbol2entrez.update({symbol: genes[0]}) assert not (non_approved_symbols - symbol2entrez.keys()), \ - f"Not all symbols could be mapped: { - non_approved_symbols - symbol2entrez.keys()}" + f"Not all symbols could be mapped: {non_approved_symbols - symbol2entrez.keys()}" # id2genomic_variant = {genomic_variant['primaryDomainId']: genomic_variant for genomic_variant in # GenomicVariant.find(MongoInstance.DB)} From 79a7ac942c2ed72d5d99de1c78a372e7b761e957 Mon Sep 17 00:00:00 2001 From: lspindler2509 Date: Mon, 18 Mar 2024 14:36:09 +0100 Subject: [PATCH 05/32] different formatting --- nedrexdb/db/parsers/intogen.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/nedrexdb/db/parsers/intogen.py b/nedrexdb/db/parsers/intogen.py index d2cdbbf..9aba397 100644 --- a/nedrexdb/db/parsers/intogen.py +++ b/nedrexdb/db/parsers/intogen.py @@ -25,8 +25,7 @@ def biomart_symbol_transcript_to_entrez(symbol_list: list[str], filter_by: str = ET.SubElement(dataset, "Attribute", name=filter_by) ET.SubElement(dataset, "Attribute", name="entrezgene_id") tree = ET.ElementTree(query) - xml_string = f'{ - ET.tostring(tree.getroot(), encoding="unicode")}' + xml_string = f'{ET.tostring(tree.getroot(), encoding="unicode")}' symbol2entrez: dict[str, str] = dict() for i in range(0, len(symbol_list), batch_size): From 27f4c73896c8d30a51d9be8e9d139e724a5c8e9e Mon Sep 17 00:00:00 2001 From: lspindler2509 Date: Mon, 18 Mar 2024 14:53:39 +0100 Subject: [PATCH 06/32] updated uniprot parser --- nedrexdb/db/parsers/uniprot.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/nedrexdb/db/parsers/uniprot.py b/nedrexdb/db/parsers/uniprot.py index 1c7f517..90e09ff 100644 --- a/nedrexdb/db/parsers/uniprot.py +++ b/nedrexdb/db/parsers/uniprot.py @@ -79,12 +79,22 @@ def get_gene_name(self) -> str: if not gene_name: pass else: + if isinstance(gene_name, list): + name = None + for i in gene_name: + for k,v in i.items(): + if not name: + name = v if isinstance(v, str) else v[0] + if k == "Name": + name = v + gene_name = name if gene_name.startswith("Name="): gene_name = gene_name.replace("Name=", "").split(";", 1)[0] gene_name = self._CURLY_REGEX.split(gene_name)[0].strip() return gene_name + def get_comments(self) -> str: return self._record.annotations.get("comment", "") From aba817320167aa9bb08a820723be3f4df985521a Mon Sep 17 00:00:00 2001 From: lspindler2509 Date: Mon, 18 Mar 2024 16:44:22 +0100 Subject: [PATCH 07/32] just quick call to download a specific file --- nedrexdb/downloaders/biogrid.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/nedrexdb/downloaders/biogrid.py b/nedrexdb/downloaders/biogrid.py index 44ac966..0db14eb 100644 --- a/nedrexdb/downloaders/biogrid.py +++ b/nedrexdb/downloaders/biogrid.py @@ -71,3 +71,7 @@ def download_biogrid(): if counter != 1: raise _AssumptionError("more than one BioGRID file containing 'Homo_sapiens' was found") + + +if __name__ == "__main__": + download_biogrid() From f45ba0a9fa4f36473e9d327b853437a42170df4e Mon Sep 17 00:00:00 2001 From: lspindler2509 Date: Mon, 18 Mar 2024 16:48:24 +0100 Subject: [PATCH 08/32] temp: correct path --- nedrexdb/downloaders/biogrid.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nedrexdb/downloaders/biogrid.py b/nedrexdb/downloaders/biogrid.py index 0db14eb..3e8a719 100644 --- a/nedrexdb/downloaders/biogrid.py +++ b/nedrexdb/downloaders/biogrid.py @@ -42,7 +42,7 @@ def download_biogrid(): zip_fname = url.rsplit("/", 1)[1] target_fname = _config.get("sources.biogrid.human_data.filename") - biogrid_dir = _Path(_config.get("db.root_directory")) / _config.get("sources.directory") / "biogrid" + biogrid_dir = "/nfs/data3/nedrex_data/downloads/biogrid" biogrid_dir.mkdir(exist_ok=True, parents=True) From 0cb50045a44f4ee4d76663f7477969b685991190 Mon Sep 17 00:00:00 2001 From: lspindler2509 Date: Mon, 18 Mar 2024 16:51:26 +0100 Subject: [PATCH 09/32] adjust path --- nedrexdb/downloaders/biogrid.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nedrexdb/downloaders/biogrid.py b/nedrexdb/downloaders/biogrid.py index 3e8a719..0fda5f5 100644 --- a/nedrexdb/downloaders/biogrid.py +++ b/nedrexdb/downloaders/biogrid.py @@ -42,7 +42,7 @@ def download_biogrid(): zip_fname = url.rsplit("/", 1)[1] target_fname = _config.get("sources.biogrid.human_data.filename") - biogrid_dir = "/nfs/data3/nedrex_data/downloads/biogrid" + biogrid_dir = _Path("/nfs/data3/nedrex_data/downloads/biogrid") biogrid_dir.mkdir(exist_ok=True, parents=True) From e553804e964b45f45453f5dcac6ff362e35a9e54 Mon Sep 17 00:00:00 2001 From: lspindler2509 Date: Mon, 18 Mar 2024 16:53:42 +0100 Subject: [PATCH 10/32] add other path --- nedrexdb/downloaders/biogrid.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nedrexdb/downloaders/biogrid.py b/nedrexdb/downloaders/biogrid.py index 0fda5f5..9ddf829 100644 --- a/nedrexdb/downloaders/biogrid.py +++ b/nedrexdb/downloaders/biogrid.py @@ -41,7 +41,7 @@ def download_biogrid(): ) zip_fname = url.rsplit("/", 1)[1] - target_fname = _config.get("sources.biogrid.human_data.filename") + target_fname = _Path("homo_sapiens.tab3") biogrid_dir = _Path("/nfs/data3/nedrex_data/downloads/biogrid") biogrid_dir.mkdir(exist_ok=True, parents=True) From e236ddc0850a1646b146b6bc7c997b92554c7e4b Mon Sep 17 00:00:00 2001 From: lspindler2509 Date: Mon, 18 Mar 2024 16:55:29 +0100 Subject: [PATCH 11/32] redo adjustments for local download --- nedrexdb/downloaders/biogrid.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/nedrexdb/downloaders/biogrid.py b/nedrexdb/downloaders/biogrid.py index 9ddf829..9f8ee81 100644 --- a/nedrexdb/downloaders/biogrid.py +++ b/nedrexdb/downloaders/biogrid.py @@ -41,8 +41,8 @@ def download_biogrid(): ) zip_fname = url.rsplit("/", 1)[1] - target_fname = _Path("homo_sapiens.tab3") - biogrid_dir = _Path("/nfs/data3/nedrex_data/downloads/biogrid") + target_fname = _config.get("sources.biogrid.human_data.filename") + biogrid_dir = _Path(_config.get("db.root_directory")) / _config.get("sources.directory") / "biogrid" biogrid_dir.mkdir(exist_ok=True, parents=True) @@ -72,6 +72,3 @@ def download_biogrid(): if counter != 1: raise _AssumptionError("more than one BioGRID file containing 'Homo_sapiens' was found") - -if __name__ == "__main__": - download_biogrid() From fdaf77f1dcd3eecdd283c136967ec49336dc3f5a Mon Sep 17 00:00:00 2001 From: lspindler2509 Date: Tue, 19 Mar 2024 12:35:57 +0100 Subject: [PATCH 12/32] different order of parsers --- build.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/build.py b/build.py index da862ef..57eeca1 100755 --- a/build.py +++ b/build.py @@ -102,15 +102,15 @@ def update(conf, download): iid.parse_ppis() intact.parse() - cosmic.parse_gene_disease_associations() - intogen.parse_gene_disease_associations() - ncg.parse_gene_disease_associations() - if version == "licensed": omim.parse_gene_disease_associations() sider.parse() uniprot.parse_idmap() + + cosmic.parse_gene_disease_associations() + intogen.parse_gene_disease_associations() + ncg.parse_gene_disease_associations() from nedrexdb.analyses import molecule_similarity From 1caf32bcee062e283832768bf4e8c765e02b9c1f Mon Sep 17 00:00:00 2001 From: lspindler2509 Date: Tue, 19 Mar 2024 14:00:09 +0100 Subject: [PATCH 13/32] temp for debugging --- nedrexdb/db/parsers/ncg.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/nedrexdb/db/parsers/ncg.py b/nedrexdb/db/parsers/ncg.py index 23c9f1e..5eb801c 100644 --- a/nedrexdb/db/parsers/ncg.py +++ b/nedrexdb/db/parsers/ncg.py @@ -37,6 +37,7 @@ def get_gene_id(self): def parse(self, ncg2mondo: dict[str, list[str]]) -> list[GeneAssociatedWithDisorder]: sourceDomainId = self.get_gene_id() asserted_by = ["ncg"] + print(ncg2mondo) disorders = ncg2mondo[self._row["cancer_type"]] gawds = [ @@ -90,6 +91,8 @@ def parse(self): def parse_gene_disease_associations(): - fname = get_file_location("annotation") - mapping_fname = get_file_location("mapping") + fname = "/nfs/data3/nedrex_data/downloads/ncg/NCG_cancerdrivers_systemslevelproperties.tsv" #get_file_location("annotation") + mapping_fname = "/nfs/data3/nedrex_data/downloads/ncg/ncg2mondo.json" #get_file_location("mapping") NCGParser(fname, mapping_fname).parse() + +parse_gene_disease_associations() \ No newline at end of file From 443974bd2dc50c15f4c850dd9636d78347a358f8 Mon Sep 17 00:00:00 2001 From: lspindler2509 Date: Tue, 19 Mar 2024 14:02:08 +0100 Subject: [PATCH 14/32] as path --- nedrexdb/db/parsers/ncg.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nedrexdb/db/parsers/ncg.py b/nedrexdb/db/parsers/ncg.py index 5eb801c..0202ddf 100644 --- a/nedrexdb/db/parsers/ncg.py +++ b/nedrexdb/db/parsers/ncg.py @@ -91,8 +91,8 @@ def parse(self): def parse_gene_disease_associations(): - fname = "/nfs/data3/nedrex_data/downloads/ncg/NCG_cancerdrivers_systemslevelproperties.tsv" #get_file_location("annotation") - mapping_fname = "/nfs/data3/nedrex_data/downloads/ncg/ncg2mondo.json" #get_file_location("mapping") + fname = _Path("/nfs/data3/nedrex_data/downloads/ncg/NCG_cancerdrivers_systemslevelproperties.tsv") #get_file_location("annotation") + mapping_fname = _Path("/nfs/data3/nedrex_data/downloads/ncg/ncg2mondo.json") #get_file_location("mapping") NCGParser(fname, mapping_fname).parse() parse_gene_disease_associations() \ No newline at end of file From 46d9d55d72352f6013e2be9b7316ca32095add27 Mon Sep 17 00:00:00 2001 From: lspindler2509 Date: Tue, 19 Mar 2024 14:03:43 +0100 Subject: [PATCH 15/32] redo paths, add logs --- nedrexdb/db/parsers/ncg.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/nedrexdb/db/parsers/ncg.py b/nedrexdb/db/parsers/ncg.py index 0202ddf..9d12996 100644 --- a/nedrexdb/db/parsers/ncg.py +++ b/nedrexdb/db/parsers/ncg.py @@ -39,6 +39,7 @@ def parse(self, ncg2mondo: dict[str, list[str]]) -> list[GeneAssociatedWithDisor asserted_by = ["ncg"] print(ncg2mondo) disorders = ncg2mondo[self._row["cancer_type"]] + print(disorders) gawds = [ GeneAssociatedWithDisorder( @@ -91,8 +92,8 @@ def parse(self): def parse_gene_disease_associations(): - fname = _Path("/nfs/data3/nedrex_data/downloads/ncg/NCG_cancerdrivers_systemslevelproperties.tsv") #get_file_location("annotation") - mapping_fname = _Path("/nfs/data3/nedrex_data/downloads/ncg/ncg2mondo.json") #get_file_location("mapping") + fname = get_file_location("annotation") + mapping_fname = get_file_location("mapping") + print(fname) + print(mapping_fname) NCGParser(fname, mapping_fname).parse() - -parse_gene_disease_associations() \ No newline at end of file From a78222240051abb20a8f4793f027ced6c04c9672 Mon Sep 17 00:00:00 2001 From: lspindler2509 Date: Tue, 19 Mar 2024 14:04:24 +0100 Subject: [PATCH 16/32] different order --- build.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/build.py b/build.py index 57eeca1..15df211 100755 --- a/build.py +++ b/build.py @@ -73,6 +73,8 @@ def update(conf, download): ncbi.parse_gene_info() uberon.parse() uniprot.parse_proteins() + + ncg.parse_gene_disease_associations() # Sources that add node type but require existing nodes, too clinvar.parse() @@ -110,7 +112,7 @@ def update(conf, download): cosmic.parse_gene_disease_associations() intogen.parse_gene_disease_associations() - ncg.parse_gene_disease_associations() + from nedrexdb.analyses import molecule_similarity From 4a583cae8baff67896c7e3de3912601ce5f88109 Mon Sep 17 00:00:00 2001 From: lspindler2509 Date: Tue, 19 Mar 2024 14:14:54 +0100 Subject: [PATCH 17/32] added log --- nedrexdb/db/parsers/ncg.py | 1 + 1 file changed, 1 insertion(+) diff --git a/nedrexdb/db/parsers/ncg.py b/nedrexdb/db/parsers/ncg.py index 9d12996..b04df0d 100644 --- a/nedrexdb/db/parsers/ncg.py +++ b/nedrexdb/db/parsers/ncg.py @@ -40,6 +40,7 @@ def parse(self, ncg2mondo: dict[str, list[str]]) -> list[GeneAssociatedWithDisor print(ncg2mondo) disorders = ncg2mondo[self._row["cancer_type"]] print(disorders) + print(self._row["cancer_type"]) gawds = [ GeneAssociatedWithDisorder( From 313ce7d2acc3c5e6ecef657da311b8528a9c9fdd Mon Sep 17 00:00:00 2001 From: lspindler2509 Date: Tue, 19 Mar 2024 14:40:01 +0100 Subject: [PATCH 18/32] delete unnecessary prints --- nedrexdb/db/parsers/ncg.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/nedrexdb/db/parsers/ncg.py b/nedrexdb/db/parsers/ncg.py index b04df0d..3bc146b 100644 --- a/nedrexdb/db/parsers/ncg.py +++ b/nedrexdb/db/parsers/ncg.py @@ -37,10 +37,10 @@ def get_gene_id(self): def parse(self, ncg2mondo: dict[str, list[str]]) -> list[GeneAssociatedWithDisorder]: sourceDomainId = self.get_gene_id() asserted_by = ["ncg"] - print(ncg2mondo) + if self._row["cancer_type"] == None: + self._row["cancer_type"] = "MONDO:0021040" + disorders = ncg2mondo[self._row["cancer_type"]] - print(disorders) - print(self._row["cancer_type"]) gawds = [ GeneAssociatedWithDisorder( @@ -95,6 +95,4 @@ def parse(self): def parse_gene_disease_associations(): fname = get_file_location("annotation") mapping_fname = get_file_location("mapping") - print(fname) - print(mapping_fname) NCGParser(fname, mapping_fname).parse() From 98bf7e51acc1c40f6771e7da80be5bfe1b6f7b6d Mon Sep 17 00:00:00 2001 From: lspindler2509 Date: Tue, 19 Mar 2024 17:37:15 +0100 Subject: [PATCH 19/32] added test script for db --- nedrexdb/db/parsers/db_test.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100644 nedrexdb/db/parsers/db_test.py diff --git a/nedrexdb/db/parsers/db_test.py b/nedrexdb/db/parsers/db_test.py new file mode 100644 index 0000000..434c468 --- /dev/null +++ b/nedrexdb/db/parsers/db_test.py @@ -0,0 +1,18 @@ +import pymongo +from configparser import ConfigParser + + + +# Holen Sie sich die MongoDB-Konfiguration für die Entwicklungsdatenbank +mongo_port = 27019 +mongo_db_name = "licensed_nedrex_dev" + +# Verbinde dich mit der MongoDB-Datenbank +client = pymongo.MongoClient('localhost', mongo_port) +db = client[mongo_db_name] + +# Beispielabfrage +collection = db['GeneAssociatedWithDisorder'] +num_documents = collection.count_documents({}) + +print("Anzahl der Dokumente:", num_documents) From 6a82afa4c04270f435f6e76b4a8105aa2bb2ad17 Mon Sep 17 00:00:00 2001 From: lspindler2509 Date: Tue, 19 Mar 2024 17:41:23 +0100 Subject: [PATCH 20/32] use live version --- nedrexdb/db/parsers/db_test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nedrexdb/db/parsers/db_test.py b/nedrexdb/db/parsers/db_test.py index 434c468..82aab79 100644 --- a/nedrexdb/db/parsers/db_test.py +++ b/nedrexdb/db/parsers/db_test.py @@ -4,8 +4,8 @@ # Holen Sie sich die MongoDB-Konfiguration für die Entwicklungsdatenbank -mongo_port = 27019 -mongo_db_name = "licensed_nedrex_dev" +mongo_port = 27020 +mongo_db_name = "licensed_nedrex_live" # Verbinde dich mit der MongoDB-Datenbank client = pymongo.MongoClient('localhost', mongo_port) From a9393145bc9e3c91c5c7c4aab6f9c8bec64c4959 Mon Sep 17 00:00:00 2001 From: lspindler2509 Date: Tue, 19 Mar 2024 17:45:41 +0100 Subject: [PATCH 21/32] adjust test for db --- nedrexdb/db/parsers/db_test.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/nedrexdb/db/parsers/db_test.py b/nedrexdb/db/parsers/db_test.py index 82aab79..5282dd9 100644 --- a/nedrexdb/db/parsers/db_test.py +++ b/nedrexdb/db/parsers/db_test.py @@ -12,7 +12,8 @@ db = client[mongo_db_name] # Beispielabfrage -collection = db['GeneAssociatedWithDisorder'] -num_documents = collection.count_documents({}) +collections = db.list_collection_names() -print("Anzahl der Dokumente:", num_documents) +# Ausgabe der Collections +for collection in collections: + print(collection) From 8b22a76c2aa437d41c633c4f4c0f29cf6c8af4c9 Mon Sep 17 00:00:00 2001 From: lspindler2509 Date: Tue, 19 Mar 2024 17:55:08 +0100 Subject: [PATCH 22/32] different test --- nedrexdb/db/parsers/db_test.py | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/nedrexdb/db/parsers/db_test.py b/nedrexdb/db/parsers/db_test.py index 5282dd9..c013ae5 100644 --- a/nedrexdb/db/parsers/db_test.py +++ b/nedrexdb/db/parsers/db_test.py @@ -1,19 +1,12 @@ import pymongo -from configparser import ConfigParser +# Verbindung zur MongoDB-Datenbank herstellen +client = pymongo.MongoClient('localhost', 27020) +db = client['licensed_nedrex_live'] # Datenbank auswählen +# Ein zufälliges Element aus der Sammlung abrufen +random_document = db['gene_associated_with_disorder'].find_one() -# Holen Sie sich die MongoDB-Konfiguration für die Entwicklungsdatenbank -mongo_port = 27020 -mongo_db_name = "licensed_nedrex_live" - -# Verbinde dich mit der MongoDB-Datenbank -client = pymongo.MongoClient('localhost', mongo_port) -db = client[mongo_db_name] - -# Beispielabfrage -collections = db.list_collection_names() - -# Ausgabe der Collections -for collection in collections: - print(collection) +# Ergebnis ausgeben +print(random_document) +print(random_document) From f86ad417250ce34e1619c30b8961266f4f05bb93 Mon Sep 17 00:00:00 2001 From: lspindler2509 Date: Thu, 4 Apr 2024 13:18:03 +0200 Subject: [PATCH 23/32] check used files --- build.py | 2 +- nedrexdb/db/parsers/cosmic.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/build.py b/build.py index 15df211..709f947 100755 --- a/build.py +++ b/build.py @@ -74,6 +74,7 @@ def update(conf, download): uberon.parse() uniprot.parse_proteins() + cosmic.parse_gene_disease_associations() ncg.parse_gene_disease_associations() # Sources that add node type but require existing nodes, too @@ -110,7 +111,6 @@ def update(conf, download): sider.parse() uniprot.parse_idmap() - cosmic.parse_gene_disease_associations() intogen.parse_gene_disease_associations() diff --git a/nedrexdb/db/parsers/cosmic.py b/nedrexdb/db/parsers/cosmic.py index 8993e34..82c790e 100644 --- a/nedrexdb/db/parsers/cosmic.py +++ b/nedrexdb/db/parsers/cosmic.py @@ -221,4 +221,6 @@ def parse_gene_disease_associations(): logger.info("Parsing COSMIC") fname = get_file_location("census") mapping_fname = get_file_location("mapping") + print(fname, mapping_fname) + print(fname[0]) COSMICParser(fname).parse(mapping_fname) From 9e23ace38da0e4ea501e794d8337ce3220a7fa95 Mon Sep 17 00:00:00 2001 From: lspindler2509 Date: Thu, 4 Apr 2024 13:45:13 +0200 Subject: [PATCH 24/32] analyse data structure --- nedrexdb/db/parsers/cosmic.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/nedrexdb/db/parsers/cosmic.py b/nedrexdb/db/parsers/cosmic.py index 82c790e..2f6a01b 100644 --- a/nedrexdb/db/parsers/cosmic.py +++ b/nedrexdb/db/parsers/cosmic.py @@ -215,12 +215,13 @@ def parse(self, mapping_fname: _Path): these_updates) if bulk_write_results.bulk_api_result['writeErrors'] or bulk_write_results.bulk_api_result['writeConcernErrors']: print(bulk_write_results.bulk_api_result) + + print(variant_gene_updates) + def parse_gene_disease_associations(): logger.info("Parsing COSMIC") fname = get_file_location("census") mapping_fname = get_file_location("mapping") - print(fname, mapping_fname) - print(fname[0]) COSMICParser(fname).parse(mapping_fname) From 72d561cf2ef100f333fd493258960bbfcc5e372a Mon Sep 17 00:00:00 2001 From: lspindler2509 Date: Thu, 4 Apr 2024 13:58:50 +0200 Subject: [PATCH 25/32] different log --- nedrexdb/db/parsers/cosmic.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/nedrexdb/db/parsers/cosmic.py b/nedrexdb/db/parsers/cosmic.py index 2f6a01b..44efa0e 100644 --- a/nedrexdb/db/parsers/cosmic.py +++ b/nedrexdb/db/parsers/cosmic.py @@ -204,8 +204,10 @@ def parse(self, mapping_fname: _Path): if genomic_variant: genomic_variant_updates.append( genomic_variant.generate_update()) + print("Variant affects gene: ", variant_gene, "\n") variant_gene_updates.append(variant_gene.generate_update()) if variant_disorder: + print("Variant associated with disorder: ", variant_disorder, "\n") variant_disorder_updates.append( variant_disorder.generate_update()) @@ -215,9 +217,7 @@ def parse(self, mapping_fname: _Path): these_updates) if bulk_write_results.bulk_api_result['writeErrors'] or bulk_write_results.bulk_api_result['writeConcernErrors']: print(bulk_write_results.bulk_api_result) - - print(variant_gene_updates) - + def parse_gene_disease_associations(): From 3e41244ee2286734fae911623ec4c842843c884c Mon Sep 17 00:00:00 2001 From: lspindler2509 Date: Thu, 4 Apr 2024 14:03:50 +0200 Subject: [PATCH 26/32] add log --- nedrexdb/db/parsers/cosmic.py | 1 + 1 file changed, 1 insertion(+) diff --git a/nedrexdb/db/parsers/cosmic.py b/nedrexdb/db/parsers/cosmic.py index 44efa0e..7ad8ba5 100644 --- a/nedrexdb/db/parsers/cosmic.py +++ b/nedrexdb/db/parsers/cosmic.py @@ -199,6 +199,7 @@ def parse(self, mapping_fname: _Path): for chunk in _tqdm(_chunked(updates, 10_000), leave=False, desc="Parsing COSMIC"): if not chunk: continue + print(chunk) genomic_variant_updates, variant_gene_updates, variant_disorder_updates = [], [], [] for genomic_variant, variant_gene, variant_disorder in chunk: if genomic_variant: From e6229d2322451e832e4c71d41a45bdcb6f2b140a Mon Sep 17 00:00:00 2001 From: lspindler2509 Date: Thu, 4 Apr 2024 14:21:08 +0200 Subject: [PATCH 27/32] create gene_disorder relation, not yet into db --- nedrexdb/db/parsers/cosmic.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/nedrexdb/db/parsers/cosmic.py b/nedrexdb/db/parsers/cosmic.py index 7ad8ba5..0087dd1 100644 --- a/nedrexdb/db/parsers/cosmic.py +++ b/nedrexdb/db/parsers/cosmic.py @@ -8,6 +8,7 @@ from nedrexdb.db import MongoInstance from nedrexdb.db.models.edges.variant_affects_gene import VariantAffectsGene from nedrexdb.db.models.edges.variant_associated_with_disorder import VariantAssociatedWithDisorder +from nedrexdb.db.models.edges.gene_associated_with_disorder import GeneAssociatedWithDisorder from nedrexdb.db.models.nodes.gene import Gene from nedrexdb.db.models.nodes.genomic_variant import GenomicVariant from nedrexdb.db.parsers import _get_file_location_factory @@ -199,8 +200,7 @@ def parse(self, mapping_fname: _Path): for chunk in _tqdm(_chunked(updates, 10_000), leave=False, desc="Parsing COSMIC"): if not chunk: continue - print(chunk) - genomic_variant_updates, variant_gene_updates, variant_disorder_updates = [], [], [] + genomic_variant_updates, variant_gene_updates, variant_disorder_updates, gene_disorder_updates = [], [], [], [] for genomic_variant, variant_gene, variant_disorder in chunk: if genomic_variant: genomic_variant_updates.append( @@ -211,6 +211,9 @@ def parse(self, mapping_fname: _Path): print("Variant associated with disorder: ", variant_disorder, "\n") variant_disorder_updates.append( variant_disorder.generate_update()) + gene_disorder = variant_disorder + gene_disorder.sourceDomainId = variant_gene.targetDomainId + print("Gene associated with disorder: ", gene_disorder, "\n") for this_collection_name, these_updates in zip([GenomicVariant.collection_name, VariantAffectsGene.collection_name, VariantAssociatedWithDisorder.collection_name], [genomic_variant_updates, variant_gene_updates, variant_disorder_updates]): From 4669f231a75daae87340db3984813690e6165c1f Mon Sep 17 00:00:00 2001 From: lspindler2509 Date: Thu, 4 Apr 2024 14:35:31 +0200 Subject: [PATCH 28/32] correct attributes, write in DB --- nedrexdb/db/parsers/cosmic.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/nedrexdb/db/parsers/cosmic.py b/nedrexdb/db/parsers/cosmic.py index 0087dd1..a1f1104 100644 --- a/nedrexdb/db/parsers/cosmic.py +++ b/nedrexdb/db/parsers/cosmic.py @@ -211,12 +211,14 @@ def parse(self, mapping_fname: _Path): print("Variant associated with disorder: ", variant_disorder, "\n") variant_disorder_updates.append( variant_disorder.generate_update()) - gene_disorder = variant_disorder + gene_disorder = variant_gene.copy() gene_disorder.sourceDomainId = variant_gene.targetDomainId + gene_disorder.targetDomainId = variant_disorder.targetDomainId print("Gene associated with disorder: ", gene_disorder, "\n") + gene_disorder_updates.append(gene_disorder.generate_update()) - for this_collection_name, these_updates in zip([GenomicVariant.collection_name, VariantAffectsGene.collection_name, VariantAssociatedWithDisorder.collection_name], - [genomic_variant_updates, variant_gene_updates, variant_disorder_updates]): + for this_collection_name, these_updates in zip([GenomicVariant.collection_name, VariantAffectsGene.collection_name, VariantAssociatedWithDisorder.collection_name, GeneAssociatedWithDisorder.collection_name], + [genomic_variant_updates, variant_gene_updates, variant_disorder_updates, gene_disorder_updates]): bulk_write_results = MongoInstance.DB[this_collection_name].bulk_write( these_updates) if bulk_write_results.bulk_api_result['writeErrors'] or bulk_write_results.bulk_api_result['writeConcernErrors']: From e6b4cd59527b4797322a0bac8a8dbd67a2d1b639 Mon Sep 17 00:00:00 2001 From: lspindler2509 Date: Thu, 4 Apr 2024 14:45:08 +0200 Subject: [PATCH 29/32] deleted logs --- nedrexdb/db/parsers/cosmic.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/nedrexdb/db/parsers/cosmic.py b/nedrexdb/db/parsers/cosmic.py index a1f1104..c5725d6 100644 --- a/nedrexdb/db/parsers/cosmic.py +++ b/nedrexdb/db/parsers/cosmic.py @@ -205,16 +205,13 @@ def parse(self, mapping_fname: _Path): if genomic_variant: genomic_variant_updates.append( genomic_variant.generate_update()) - print("Variant affects gene: ", variant_gene, "\n") variant_gene_updates.append(variant_gene.generate_update()) if variant_disorder: - print("Variant associated with disorder: ", variant_disorder, "\n") variant_disorder_updates.append( variant_disorder.generate_update()) gene_disorder = variant_gene.copy() gene_disorder.sourceDomainId = variant_gene.targetDomainId gene_disorder.targetDomainId = variant_disorder.targetDomainId - print("Gene associated with disorder: ", gene_disorder, "\n") gene_disorder_updates.append(gene_disorder.generate_update()) for this_collection_name, these_updates in zip([GenomicVariant.collection_name, VariantAffectsGene.collection_name, VariantAssociatedWithDisorder.collection_name, GeneAssociatedWithDisorder.collection_name], From ac425495297f2f618f6c403578612d354d204218 Mon Sep 17 00:00:00 2001 From: lspindler2509 Date: Tue, 21 May 2024 10:04:22 +0200 Subject: [PATCH 30/32] adjusted edgetype --- nedrexdb/db/parsers/cosmic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nedrexdb/db/parsers/cosmic.py b/nedrexdb/db/parsers/cosmic.py index c5725d6..3db8436 100644 --- a/nedrexdb/db/parsers/cosmic.py +++ b/nedrexdb/db/parsers/cosmic.py @@ -212,7 +212,7 @@ def parse(self, mapping_fname: _Path): gene_disorder = variant_gene.copy() gene_disorder.sourceDomainId = variant_gene.targetDomainId gene_disorder.targetDomainId = variant_disorder.targetDomainId - gene_disorder_updates.append(gene_disorder.generate_update()) + gene_disorder_updates.append(GeneAssociatedWithDisorder(gene_disorder).generate_update()) for this_collection_name, these_updates in zip([GenomicVariant.collection_name, VariantAffectsGene.collection_name, VariantAssociatedWithDisorder.collection_name, GeneAssociatedWithDisorder.collection_name], [genomic_variant_updates, variant_gene_updates, variant_disorder_updates, gene_disorder_updates]): From 0664fb04e97d55e689eadf6581182e76a6d69581 Mon Sep 17 00:00:00 2001 From: lspindler2509 Date: Tue, 21 May 2024 10:50:49 +0200 Subject: [PATCH 31/32] adjusted type parsing --- nedrexdb/db/parsers/cosmic.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/nedrexdb/db/parsers/cosmic.py b/nedrexdb/db/parsers/cosmic.py index 3db8436..0892697 100644 --- a/nedrexdb/db/parsers/cosmic.py +++ b/nedrexdb/db/parsers/cosmic.py @@ -209,10 +209,11 @@ def parse(self, mapping_fname: _Path): if variant_disorder: variant_disorder_updates.append( variant_disorder.generate_update()) - gene_disorder = variant_gene.copy() - gene_disorder.sourceDomainId = variant_gene.targetDomainId - gene_disorder.targetDomainId = variant_disorder.targetDomainId - gene_disorder_updates.append(GeneAssociatedWithDisorder(gene_disorder).generate_update()) + gene_disorder = GeneAssociatedWithDisorder(accession=variant_gene.accession, dataSources=variant_gene.dataSources, + sourceDomainId=variant_gene.targetDomainId, + targetDomainId=variant_disorder.targetDomainId, + reviewStatus=variant_gene.reviewStatus) + gene_disorder_updates.append(gene_disorder.generate_update()) for this_collection_name, these_updates in zip([GenomicVariant.collection_name, VariantAffectsGene.collection_name, VariantAssociatedWithDisorder.collection_name, GeneAssociatedWithDisorder.collection_name], [genomic_variant_updates, variant_gene_updates, variant_disorder_updates, gene_disorder_updates]): From 619d64e17b31c46061dddb2343a144f0fb5d919d Mon Sep 17 00:00:00 2001 From: lspindler2509 Date: Tue, 21 May 2024 11:16:16 +0200 Subject: [PATCH 32/32] adjusted parsing --- nedrexdb/db/parsers/cosmic.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/nedrexdb/db/parsers/cosmic.py b/nedrexdb/db/parsers/cosmic.py index 0892697..fd1d6a8 100644 --- a/nedrexdb/db/parsers/cosmic.py +++ b/nedrexdb/db/parsers/cosmic.py @@ -209,10 +209,9 @@ def parse(self, mapping_fname: _Path): if variant_disorder: variant_disorder_updates.append( variant_disorder.generate_update()) - gene_disorder = GeneAssociatedWithDisorder(accession=variant_gene.accession, dataSources=variant_gene.dataSources, + gene_disorder = GeneAssociatedWithDisorder(dataSources=["cosmic"], sourceDomainId=variant_gene.targetDomainId, - targetDomainId=variant_disorder.targetDomainId, - reviewStatus=variant_gene.reviewStatus) + targetDomainId=variant_disorder.targetDomainId) gene_disorder_updates.append(gene_disorder.generate_update()) for this_collection_name, these_updates in zip([GenomicVariant.collection_name, VariantAffectsGene.collection_name, VariantAssociatedWithDisorder.collection_name, GeneAssociatedWithDisorder.collection_name],