From c4060de768e9a71fcc8303b3f0cae78c3d99d697 Mon Sep 17 00:00:00 2001 From: dustine32 Date: Mon, 5 Jan 2026 17:44:52 -0800 Subject: [PATCH] Swap unnamed_gene field for has_gene_symbol for #89 --- data_conversion/Makefile | 2 +- data_conversion/gene_info_from_gafs.py | 21 ++++++++------------- 2 files changed, 9 insertions(+), 14 deletions(-) diff --git a/data_conversion/Makefile b/data_conversion/Makefile index 4f0a8de..1eb8bf0 100644 --- a/data_conversion/Makefile +++ b/data_conversion/Makefile @@ -126,7 +126,7 @@ export DEBUG_INDENT ?= 0 touch $@ .PRECIOUS: %/annotations.csv -%/annotations.csv: %/export_annotations.tsv +%/annotations.csv: %/export_annotations.touch sed -E 's/UNKNOWN:[0-9]+//g' $*/export_annotations.tsv > $@ .PRECIOUS: %/functionome_release.gaf diff --git a/data_conversion/gene_info_from_gafs.py b/data_conversion/gene_info_from_gafs.py index 4a07821..f904204 100644 --- a/data_conversion/gene_info_from_gafs.py +++ b/data_conversion/gene_info_from_gafs.py @@ -45,35 +45,29 @@ def extract_from_annotation_gaf(self, annotation_gafs): continue self.add_genes_from_row(r) - @staticmethod - def handle_gene_name(incoming_gene_name: str) -> str: - if incoming_gene_name == "Uncharacterized protein": - return "Unnamed gene" - return incoming_gene_name - def add_genes_from_row(self, csv_row: List): gene_id = "{}:{}".format(csv_row[0], csv_row[1]) gene_symbol = csv_row[2] - gene_name = self.handle_gene_name(csv_row[9]) + gene_name = csv_row[9] gene_taxon = csv_row[12].replace("taxon:", "") self.gene_info_dict[gene_id] = { "gene": gene_id, "gene_symbol": gene_symbol, "gene_name": gene_name, - "unnamed_gene": gene_name == "Unnamed gene", + "has_gene_symbol": gene_symbol != gene_id.split(":", maxsplit=1)[1], "taxon_id": gene_taxon, } with_from_raw = csv_row[7] if "|" in with_from_raw: with_gene_id = with_from_raw.split("|", maxsplit=1)[1] with_gene_symbol = csv_row[18] - with_gene_name = self.handle_gene_name(csv_row[19]) + with_gene_name = csv_row[19] with_gene_taxon_id = csv_row[20].replace("taxon:", "") self.gene_info_dict[with_gene_id] = { "gene": with_gene_id, "gene_symbol": with_gene_symbol, "gene_name": with_gene_name, - "unnamed_gene": with_gene_name == "Unnamed gene", + "has_gene_symbol": with_gene_symbol != with_gene_id.split(":", maxsplit=1)[1], "taxon_id": with_gene_taxon_id, } @@ -99,14 +93,15 @@ def fill_in_gene_symbol_name(self, gene_dat_file: str): "taxon_id": "9606", } gene_symbol = r[2] - gene_name = self.handle_gene_name(r[1]) + gene_name = r[1] if self.gene_info_dict[gene_id]["gene_symbol"] == "": self.gene_info_dict[gene_id]["gene_symbol"] = gene_symbol + self.gene_info_dict[gene_id]["has_gene_symbol"] = True if self.gene_info_dict[gene_id]["gene_symbol"] == "": # Duplicating symbol filling logic from createGAF.pl self.gene_info_dict[gene_id]["gene_symbol"] = gene_id.split(":", maxsplit=1)[1] + self.gene_info_dict[gene_id]["has_gene_symbol"] = False self.gene_info_dict[gene_id]["gene_name"] = gene_name - self.gene_info_dict[gene_id]["unnamed_gene"] = gene_name == "Unnamed gene" self.gene_info_dict[gene_id]["long_id"] = long_id # self.gene_info_dict[gene_id]["taxon_id"] = species_oscode_to_taxon_id[oscode] @@ -146,7 +141,7 @@ def gene_info_list(self): "gene": gene, "gene_symbol": gene_info["gene_symbol"], "gene_name": gene_info["gene_name"], - "unnamed_gene": gene_info["unnamed_gene"], + "has_gene_symbol": gene_info["has_gene_symbol"], "taxon_id": gene_info["taxon_id"], "panther_family": gene_info.get("panther_family"), # can be None "long_id": gene_info.get("long_id"), # can be None