Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion data_conversion/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ export DEBUG_INDENT ?= 0
touch $@

.PRECIOUS: %/annotations.csv
%/annotations.csv: %/export_annotations.tsv
%/annotations.csv: %/export_annotations.touch
sed -E 's/UNKNOWN:[0-9]+//g' $*/export_annotations.tsv > $@

.PRECIOUS: %/functionome_release.gaf
Expand Down
21 changes: 8 additions & 13 deletions data_conversion/gene_info_from_gafs.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,35 +45,29 @@ def extract_from_annotation_gaf(self, annotation_gafs):
continue
self.add_genes_from_row(r)

@staticmethod
def handle_gene_name(incoming_gene_name: str) -> str:
if incoming_gene_name == "Uncharacterized protein":
return "Unnamed gene"
return incoming_gene_name

def add_genes_from_row(self, csv_row: List):
gene_id = "{}:{}".format(csv_row[0], csv_row[1])
gene_symbol = csv_row[2]
gene_name = self.handle_gene_name(csv_row[9])
gene_name = csv_row[9]
gene_taxon = csv_row[12].replace("taxon:", "")
self.gene_info_dict[gene_id] = {
"gene": gene_id,
"gene_symbol": gene_symbol,
"gene_name": gene_name,
"unnamed_gene": gene_name == "Unnamed gene",
"has_gene_symbol": gene_symbol != gene_id.split(":", maxsplit=1)[1],
"taxon_id": gene_taxon,
}
with_from_raw = csv_row[7]
if "|" in with_from_raw:
with_gene_id = with_from_raw.split("|", maxsplit=1)[1]
with_gene_symbol = csv_row[18]
with_gene_name = self.handle_gene_name(csv_row[19])
with_gene_name = csv_row[19]
with_gene_taxon_id = csv_row[20].replace("taxon:", "")
self.gene_info_dict[with_gene_id] = {
"gene": with_gene_id,
"gene_symbol": with_gene_symbol,
"gene_name": with_gene_name,
"unnamed_gene": with_gene_name == "Unnamed gene",
"has_gene_symbol": with_gene_symbol != with_gene_id.split(":", maxsplit=1)[1],
"taxon_id": with_gene_taxon_id,
}

Expand All @@ -99,14 +93,15 @@ def fill_in_gene_symbol_name(self, gene_dat_file: str):
"taxon_id": "9606",
}
gene_symbol = r[2]
gene_name = self.handle_gene_name(r[1])
gene_name = r[1]
if self.gene_info_dict[gene_id]["gene_symbol"] == "":
self.gene_info_dict[gene_id]["gene_symbol"] = gene_symbol
self.gene_info_dict[gene_id]["has_gene_symbol"] = True
if self.gene_info_dict[gene_id]["gene_symbol"] == "":
# Duplicating symbol filling logic from createGAF.pl
self.gene_info_dict[gene_id]["gene_symbol"] = gene_id.split(":", maxsplit=1)[1]
self.gene_info_dict[gene_id]["has_gene_symbol"] = False
self.gene_info_dict[gene_id]["gene_name"] = gene_name
self.gene_info_dict[gene_id]["unnamed_gene"] = gene_name == "Unnamed gene"
self.gene_info_dict[gene_id]["long_id"] = long_id
# self.gene_info_dict[gene_id]["taxon_id"] = species_oscode_to_taxon_id[oscode]

Expand Down Expand Up @@ -146,7 +141,7 @@ def gene_info_list(self):
"gene": gene,
"gene_symbol": gene_info["gene_symbol"],
"gene_name": gene_info["gene_name"],
"unnamed_gene": gene_info["unnamed_gene"],
"has_gene_symbol": gene_info["has_gene_symbol"],
"taxon_id": gene_info["taxon_id"],
"panther_family": gene_info.get("panther_family"), # can be None
"long_id": gene_info.get("long_id"), # can be None
Expand Down