From 8f876072e5feda1156087abf6a0a7c9f1b3bdd88 Mon Sep 17 00:00:00 2001 From: Alex Shepard Date: Thu, 6 Mar 2025 11:29:20 -0800 Subject: [PATCH 1/8] support coreml and tflite model testing moves model creation into factories which produce the appropriate model based on the model_path string. geo_inferrer and vision_inferrer are abstract base classes. adds type annotations for the vision & geo inferrer bits, just for the hell of it. --- generate_thresholds.py | 2 +- lib/__init__.py | 0 lib/geo_inferrer.py | 48 ++ lib/geo_inferrer_coreml.py | 21 + lib/geo_inferrer_factory.py | 20 + ...tf_gp_elev_model.py => geo_inferrer_tf.py} | 56 +-- lib/geo_inferrer_tflite.py | 33 ++ lib/inat_inferrer.py | 425 ++++++++++++------ lib/model_test_data_export_manager.py | 2 +- lib/vision_inferrer.py | 65 +-- lib/vision_inferrer_coreml.py | 42 ++ lib/vision_inferrer_factory.py | 20 + lib/vision_inferrer_tf.py | 37 ++ lib/vision_inferrer_tflite.py | 41 ++ lib/vision_testing.py | 4 +- taxon_range_evaluation.py | 2 +- tests/conftest.py | 31 +- tests/test_inat_inferrer.py | 54 +-- tests/test_tf_gp_elev_model.py | 13 +- tests/test_vision_inferrer.py | 14 +- 20 files changed, 651 insertions(+), 279 deletions(-) create mode 100644 lib/__init__.py create mode 100644 lib/geo_inferrer.py create mode 100644 lib/geo_inferrer_coreml.py create mode 100644 lib/geo_inferrer_factory.py rename lib/{tf_gp_elev_model.py => geo_inferrer_tf.py} (53%) create mode 100644 lib/geo_inferrer_tflite.py create mode 100644 lib/vision_inferrer_coreml.py create mode 100644 lib/vision_inferrer_factory.py create mode 100644 lib/vision_inferrer_tf.py create mode 100644 lib/vision_inferrer_tflite.py diff --git a/generate_thresholds.py b/generate_thresholds.py index 47ff87a..f69e9f5 100644 --- a/generate_thresholds.py +++ b/generate_thresholds.py @@ -13,7 +13,7 @@ from sklearn.metrics import precision_recall_curve import warnings from lib.model_taxonomy_dataframe import ModelTaxonomyDataframe -from lib.tf_gp_elev_model import TFGeoPriorModelElev +from lib.geo_inferrer_tf import TFGeoPriorModelElev def ignore_shapely_deprecation_warning(message, category, filename, lineno, file=None, line=None): diff --git a/lib/__init__.py b/lib/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/lib/geo_inferrer.py b/lib/geo_inferrer.py new file mode 100644 index 0000000..4caca4c --- /dev/null +++ b/lib/geo_inferrer.py @@ -0,0 +1,48 @@ +from abc import ABC, abstractmethod +import math + +import numpy as np +import tensorflow as tf + + +class GeoInferrer(ABC): + @abstractmethod + def __init__(self, model_path: str): + """Subclasses must implement this constructor.""" + pass + + @abstractmethod + def predict( + self, latitude: float, longitude: float, elevation: float + ) -> np.ndarray: + """ + given a location, calculate geo results + + Subclasses must implement this method. 
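+
+        Returns a numpy array of scores, one per leaf class.
+
+        Illustrative sketch (the model path here is hypothetical; a
+        concrete subclass is normally obtained via
+        GeoInferrerFactory.create, which dispatches on the model path):
+
+            inferrer = GeoInferrerFactory.create("path/to/model.h5")
+            scores = inferrer.predict(42.0, -71.0, 10.0)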
+ """ + pass + + @staticmethod + def encode_loc(latitude, longitude, elevation): + latitude = np.array(latitude) + longitude = np.array(longitude) + elevation = np.array(elevation) + elevation = elevation.astype("float32") + grid_lon = longitude.astype("float32") / 180.0 + grid_lat = latitude.astype("float32") / 90.0 + + elevation[elevation > 0] = elevation[elevation > 0] / 6574.0 + elevation[elevation < 0] = elevation[elevation < 0] / 32768.0 + norm_elev = elevation + + norm_loc = tf.stack([grid_lon, grid_lat], axis=1) + + encoded_loc = tf.concat( + [ + tf.sin(norm_loc * math.pi), + tf.cos(norm_loc * math.pi), + tf.expand_dims(norm_elev, axis=1), + ], + axis=1, + ) + return encoded_loc diff --git a/lib/geo_inferrer_coreml.py b/lib/geo_inferrer_coreml.py new file mode 100644 index 0000000..4b8dcd5 --- /dev/null +++ b/lib/geo_inferrer_coreml.py @@ -0,0 +1,21 @@ +import coremltools as ct +import numpy as np + +from lib.geo_inferrer import GeoInferrer + + +class CoremlGeoPriorModelElev(GeoInferrer): + + def __init__(self, model_path: str): + self.model_path = model_path + self.gpmodel = ct.models.MLModel(self.model_path) + + def predict( + self, latitude: float, longitude: float, elevation: float + ) -> np.ndarray: + encoded_loc = GeoInferrer.encode_loc( + [latitude], [longitude], [elevation] + ).numpy() + out_dict = self.gpmodel.predict({"input_1": encoded_loc}) + preds = out_dict["Identity"][0] + return preds diff --git a/lib/geo_inferrer_factory.py b/lib/geo_inferrer_factory.py new file mode 100644 index 0000000..052a6fd --- /dev/null +++ b/lib/geo_inferrer_factory.py @@ -0,0 +1,20 @@ +from sys import platform + +from lib.geo_inferrer import GeoInferrer +from lib.geo_inferrer_coreml import CoremlGeoPriorModelElev +from lib.geo_inferrer_tflite import TFLiteGeoPriorModelElev +from lib.geo_inferrer_tf import TFGeoPriorModelElev + + +class GeoInferrerFactory: + @staticmethod + def create(model_path: str) -> GeoInferrer: + if "mlmodel" in model_path: + assert platform == "darwin", "CoreML models can only be used on macOS" + return CoremlGeoPriorModelElev(model_path) + elif "tflite" in model_path: + return TFLiteGeoPriorModelElev(model_path) + elif "h5" in model_path: + return TFGeoPriorModelElev(model_path) + else: + raise ValueError(f"Unsupported model format in path: {model_path}") diff --git a/lib/tf_gp_elev_model.py b/lib/geo_inferrer_tf.py similarity index 53% rename from lib/tf_gp_elev_model.py rename to lib/geo_inferrer_tf.py index e086d59..30a84e0 100644 --- a/lib/tf_gp_elev_model.py +++ b/lib/geo_inferrer_tf.py @@ -1,31 +1,34 @@ +import os + import tensorflow as tf import numpy as np -import math -import os + from lib.res_layer import ResLayer +from lib.geo_inferrer import GeoInferrer os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3" -class TFGeoPriorModelElev: +class TFGeoPriorModelElev(GeoInferrer): - def __init__(self, model_path): + def __init__(self, model_path: str): # initialize the geo model for inference tf.config.set_visible_devices([], "GPU") visible_devices = tf.config.get_visible_devices() for device in visible_devices: assert device.device_type != "GPU" self.gpmodel = tf.keras.models.load_model( - model_path, - custom_objects={"ResLayer": ResLayer}, - compile=False + model_path, custom_objects={"ResLayer": ResLayer}, compile=False ) - def predict(self, latitude, longitude, elevation): - encoded_loc = TFGeoPriorModelElev.encode_loc([latitude], [longitude], [elevation]) - return self.gpmodel(tf.convert_to_tensor( - tf.expand_dims(encoded_loc[0], axis=0) - ), training=False)[0] + def 
predict(
+        self, latitude: float, longitude: float, elevation: float
+    ) -> np.ndarray:
+        encoded_loc = GeoInferrer.encode_loc([latitude], [longitude], [elevation])
+        output = self.gpmodel(
+            tf.convert_to_tensor(tf.expand_dims(encoded_loc[0], axis=0)), training=False
+        )[0]
+        return output
 
     def features_for_one_class_elevation(self, latitude, longitude, elevation):
         """Evaluates the model for a single class and multiple locations
@@ -55,31 +58,10 @@ def eval_one_class_elevation_from_features(self, features, class_of_interest):
         # process just the one class
         return tf.math.sigmoid(
             tf.matmul(
-                tf.expand_dims(self.gpmodel.layers[5].weights[0][:, class_of_interest], axis=0),
+                tf.expand_dims(
+                    self.gpmodel.layers[5].weights[0][:, class_of_interest], axis=0
+                ),
                 features,
-                transpose_b=True
+                transpose_b=True,
             )
         ).numpy()
-
-    @staticmethod
-    def encode_loc(latitude, longitude, elevation):
-        latitude = np.array(latitude)
-        longitude = np.array(longitude)
-        elevation = np.array(elevation)
-        elevation = elevation.astype("float32")
-        grid_lon = longitude.astype("float32") / 180.0
-        grid_lat = latitude.astype("float32") / 90.0
-
-        elevation[elevation > 0] = elevation[elevation > 0] / 6574.0
-        elevation[elevation < 0] = elevation[elevation < 0] / 32768.0
-        norm_elev = elevation
-
-        norm_loc = tf.stack([grid_lon, grid_lat], axis=1)
-
-        encoded_loc = tf.concat([
-            tf.sin(norm_loc * math.pi),
-            tf.cos(norm_loc * math.pi),
-            tf.expand_dims(norm_elev, axis=1),
-
-        ], axis=1)
-        return encoded_loc
diff --git a/lib/geo_inferrer_tflite.py b/lib/geo_inferrer_tflite.py
new file mode 100644
index 0000000..842a995
--- /dev/null
+++ b/lib/geo_inferrer_tflite.py
@@ -0,0 +1,33 @@
+import numpy as np
+import tensorflow as tf
+
+from lib.geo_inferrer import GeoInferrer
+
+
+class TFLiteGeoPriorModelElev(GeoInferrer):
+
+    def __init__(self, model_path: str):
+        self.model_path = model_path
+        self.interpreter = tf.lite.Interpreter(model_path=self.model_path)
+        self.interpreter.allocate_tensors()
+
+    def predict(
+        self, latitude: float, longitude: float, elevation: float
+    ) -> np.ndarray:
+        encoded_loc = GeoInferrer.encode_loc(
+            [latitude], [longitude], [elevation]
+        ).numpy()
+
+        input_details = self.interpreter.get_input_details()
+        output_details = self.interpreter.get_output_details()
+
+        input_dtype = input_details[0]["dtype"]
+        encoded_loc = encoded_loc.astype(input_dtype)
+
+        self.interpreter.set_tensor(
+            input_details[0]["index"],
+            encoded_loc,
+        )
+        self.interpreter.invoke()
+        output_data = self.interpreter.get_tensor(output_details[0]["index"])
+        return output_data[0]
diff --git a/lib/inat_inferrer.py b/lib/inat_inferrer.py
index 73107b8..ef54844 100644
--- a/lib/inat_inferrer.py
+++ b/lib/inat_inferrer.py
@@ -16,8 +16,8 @@
 import asyncio
 from PIL import Image
 
-from lib.tf_gp_elev_model import TFGeoPriorModelElev
-from lib.vision_inferrer import VisionInferrer
+from lib.geo_inferrer_factory import GeoInferrerFactory
+from lib.vision_inferrer_factory import VisionInferrerFactory
 from lib.model_taxonomy_dataframe import ModelTaxonomyDataframe
 
 pd.options.mode.copy_on_write = True
@@ -42,13 +42,18 @@ def __init__(self, config):
     def setup_taxonomy(self):
         self.taxonomy = ModelTaxonomyDataframe(
             self.config["taxonomy_path"],
-            self.config["tf_elev_thresholds"] if "tf_elev_thresholds" in self.config else None
+            (
+                self.config["tf_elev_thresholds"]
+                if "tf_elev_thresholds" in self.config
+                else None
+            ),
        )
 
    def check_for_modified_synonyms(self):
        # only run the refresh check again if `SYNONYMS_CHECK_FREQUENCY` 
seconds have passed if not hasattr(self, "synonym_refresh_check_time") or ( - time.time() - self.synonym_refresh_check_time > InatInferrer.SYNONYMS_CHECK_FREQUENCY + time.time() - self.synonym_refresh_check_time + > InatInferrer.SYNONYMS_CHECK_FREQUENCY ): self.refresh_synonyms_if_modified() @@ -56,8 +61,9 @@ def refresh_synonyms_if_modified(self): self.synonym_refresh_check_time = time.time() # only process the synonyms file if it has changed since last being processed if os.path.exists(self.config["synonyms_path"]) and ( - not hasattr(self, "synonyms_path_updated_at") or # noqa: W504 - os.path.getmtime(self.config["synonyms_path"]) != self.synonyms_path_updated_at + not hasattr(self, "synonyms_path_updated_at") # noqa: W504 + or os.path.getmtime(self.config["synonyms_path"]) + != self.synonyms_path_updated_at ): self.setup_synonyms() @@ -79,8 +85,8 @@ def setup_synonyms(self): "rank_level": float, "name": pd.StringDtype(), "iconic_taxon_id": "Int64", - "rank": pd.StringDtype() - } + "rank": pd.StringDtype(), + }, ) # create a dict indexed by model_taxon_id for efficient synonym mappings at inference time @@ -103,7 +109,11 @@ def setup_synonym_taxonomy(self): synonym_taxonomy = ModelTaxonomyDataframe( self.config["synonyms_taxonomy_path"], - self.config["tf_elev_thresholds"] if "tf_elev_thresholds" in self.config else None + ( + self.config["tf_elev_thresholds"] + if "tf_elev_thresholds" in self.config + else None + ), ) # ensure the leaf_class_ids from the synonym taxonomy are identical # to the taxonomy generated at data export time @@ -112,11 +122,15 @@ def setup_synonym_taxonomy(self): print(error) return - synonym_taxon_ids = np.unique(pd.array(self.synonyms["taxon_id"].dropna().values)) + synonym_taxon_ids = np.unique( + pd.array(self.synonyms["taxon_id"].dropna().values) + ) synonym_taxonomy_taxon_ids = np.unique( - pd.array(synonym_taxonomy.df[ - synonym_taxonomy.df.taxon_id.isin(synonym_taxon_ids) - ]["taxon_id"].values) + pd.array( + synonym_taxonomy.df[ + synonym_taxonomy.df.taxon_id.isin(synonym_taxon_ids) + ]["taxon_id"].values + ) ) synonym_taxon_ids_not_present_in_taxonomy = np.setdiff1d( synonym_taxon_ids, synonym_taxonomy_taxon_ids @@ -124,23 +138,25 @@ def setup_synonym_taxonomy(self): # ensure all taxa referenced in the synonym mappings file are present in the # updated taxonomy that should include all original taxa plus all synonyms if synonym_taxon_ids_not_present_in_taxonomy.size > 0: - error = "There are taxa in the synonyms file not present in the synonyms " + \ - f"taxonomy: {synonym_taxon_ids_not_present_in_taxonomy}" + error = ( + "There are taxa in the synonyms file not present in the synonyms " + + f"taxonomy: {synonym_taxon_ids_not_present_in_taxonomy}" + ) print(error) return synonym_taxonomy.leaf_df["has_synonyms"] = False # mark taxa that should be replaced or removed as having synonyms - for index, taxon in self.taxonomy.leaf_df[self.taxonomy.leaf_df["taxon_id"].isin( - self.synonyms["model_taxon_id"] - )].iterrows(): + for index, taxon in self.taxonomy.leaf_df[ + self.taxonomy.leaf_df["taxon_id"].isin(self.synonyms["model_taxon_id"]) + ].iterrows(): synonym_taxonomy.leaf_df.loc[taxon["leaf_class_id"], "has_synonyms"] = True # replace the originally exported taxonomy with the updated taxonomy that includes synonyms self.taxonomy = synonym_taxonomy def setup_vision_model(self): - self.vision_inferrer = VisionInferrer( + self.vision_inferrer = VisionInferrerFactory.create( self.config["vision_model_path"] ) @@ -150,8 +166,12 @@ def 
setup_elevation_dataframe(self): return # load elevation data stored at H3 resolution 4 - self.geo_elevation_cells = pd.read_csv(self.config["elevation_h3_r4"]). \ - sort_values("h3_04").set_index("h3_04").sort_index() + self.geo_elevation_cells = ( + pd.read_csv(self.config["elevation_h3_r4"]) + .sort_values("h3_04") + .set_index("h3_04") + .sort_index() + ) self.geo_elevation_cells = InatInferrer.add_lat_lng_to_h3_geo_dataframe( self.geo_elevation_cells ) @@ -173,7 +193,11 @@ def setup_elevation_dataframe_from_worldclim(self, resolution): im_df = im_df.melt(id_vars=["index"]) im_df.columns = ["lat", "lng", "elevation"] elev_dfh3 = im_df.h3.geo_to_h3(resolution) - elev_dfh3 = elev_dfh3.drop(columns=["lng", "lat"]).groupby(f"h3_0{resolution}").mean() + elev_dfh3 = ( + elev_dfh3.drop(columns=["lng", "lat"]) + .groupby(f"h3_0{resolution}") + .mean() + ) def setup_geo_model(self): self.geo_elevation_model = None @@ -184,19 +208,25 @@ def setup_geo_model(self): if self.geo_elevation_cells is None: return - self.geo_elevation_model = TFGeoPriorModelElev(self.config["tf_geo_elevation_model_path"]) - self.geo_model_features = self.geo_elevation_model.features_for_one_class_elevation( - latitude=list(self.geo_elevation_cells.lat), - longitude=list(self.geo_elevation_cells.lng), - elevation=list(self.geo_elevation_cells.elevation) + self.geo_elevation_model = GeoInferrerFactory.create( + self.config["tf_geo_elevation_model_path"] ) + if hasattr(self.geo_elevation_model, "features_for_one_class_elevation"): + self.geo_model_features = ( + self.geo_elevation_model.features_for_one_class_elevation( + latitude=list(self.geo_elevation_cells.lat), + longitude=list(self.geo_elevation_cells.lng), + elevation=list(self.geo_elevation_cells.elevation), + ) + ) + def vision_predict(self, image, debug=False): if debug: start_time = time.time() results = self.vision_inferrer.process_image(image) if debug: - print("Vision Time: %0.2fms" % ((time.time() - start_time) * 1000.)) + print("Vision Time: %0.2fms" % ((time.time() - start_time) * 1000.0)) return results def geo_model_predict(self, lat, lng, debug=False): @@ -214,9 +244,10 @@ def geo_model_predict(self, lat, lng, debug=False): # get the average elevation of the above H3 cell elevation = self.geo_elevation_cells.loc[h3_cell].elevation geo_scores = self.geo_elevation_model.predict( - h3_cell_centroid[0], h3_cell_centroid[1], float(elevation)) + h3_cell_centroid[0], h3_cell_centroid[1], float(elevation) + ) if debug: - print("Geo Time: %0.2fms" % ((time.time() - start_time) * 1000.)) + print("Geo Time: %0.2fms" % ((time.time() - start_time) * 1000.0)) return geo_scores def lookup_taxon(self, taxon_id): @@ -232,6 +263,11 @@ def lookup_taxon(self, taxon_id): def predictions_for_image(self, file_path, lat, lng, filter_taxon, debug=False): if debug: start_time = time.time() + + # if isinstance(self.vision_inferrer, VisionInferrerCoreML): + # image = self.vision_inferrer.load_image(file_path) + # vision_model_results = self.vision_predict(image, debug) + # else: image = InatInferrer.prepare_image_for_inference(file_path) vision_model_results = self.vision_predict(image, debug) raw_vision_scores = vision_model_results["predictions"] @@ -244,16 +280,18 @@ def predictions_for_image(self, file_path, lat, lng, filter_taxon, debug=False): # possible value, and thus all its taxa will not be considered "expected nearby" combined_scores["geo_threshold"] = combined_scores["geo_threshold"].fillna(1) if debug: - print("Prediction Time: %0.2fms" % ((time.time() - start_time) * 
1000.))
+            print("Prediction Time: %0.2fms" % ((time.time() - start_time) * 1000.0))
         return {
             "combined_scores": combined_scores,
-            "features": vision_model_results["features"]
+            "features": vision_model_results["features"],
         }
 
-    def combine_results(self, raw_vision_scores, raw_geo_scores, filter_taxon, debug=False):
+    def combine_results(
+        self, raw_vision_scores, raw_geo_scores, filter_taxon, debug=False
+    ):
         if debug:
             start_time = time.time()
-        no_geo_scores = (raw_geo_scores is None)
+        no_geo_scores = raw_geo_scores is None
 
         # make a copy of the model taxonomy leaf nodes to be used for storing results. Skip any
         # filtering at this stage as the taxonomy dataframe needs to have the same number of
@@ -264,8 +302,11 @@ def combine_results(self, raw_vision_scores, raw_geo_scores, filter_taxon, debug
         # add a column for geo scores
         leaf_scores["geo_score"] = 0 if no_geo_scores else raw_geo_scores
         # set a lower limit for geo scores if there are any
-        leaf_scores["normalized_geo_score"] = 0 if no_geo_scores \
+        leaf_scores["normalized_geo_score"] = (
+            0
+            if no_geo_scores
             else leaf_scores["geo_score"].clip(InatInferrer.MINIMUM_GEO_SCORE, None)
+        )
 
         # if filtering by a taxon, restrict results to that taxon and its descendants
         if filter_taxon is not None:
@@ -275,8 +316,9 @@ def combine_results(self, raw_vision_scores, raw_geo_scores, filter_taxon, debug
             )
             # normalize the vision scores so they add up to 1 after filtering
             sum_of_vision_scores = leaf_scores["vision_score"].sum()
-            leaf_scores["normalized_vision_score"] = \
+            leaf_scores["normalized_vision_score"] = (
                 leaf_scores["vision_score"] / sum_of_vision_scores
+            )
         else:
             # when not filtering by a taxon, the normalized vision score is the same as the original
             leaf_scores["normalized_vision_score"] = leaf_scores["vision_score"]
@@ -288,18 +330,26 @@ def combine_results(self, raw_vision_scores, raw_geo_scores, filter_taxon, debug
         else:
             # the combined score is simply the normalized vision score
             # multiplied by the normalized geo score
-            leaf_scores["combined_score"] = leaf_scores["normalized_vision_score"] * \
-                leaf_scores["normalized_geo_score"]
+            leaf_scores["combined_score"] = (
+                leaf_scores["normalized_vision_score"]
+                * leaf_scores["normalized_geo_score"]
+            )
 
-        sum_of_root_node_aggregated_combined_scores = leaf_scores["combined_score"].sum()
+        sum_of_root_node_aggregated_combined_scores = leaf_scores[
+            "combined_score"
+        ].sum()
         if sum_of_root_node_aggregated_combined_scores > 0:
-            leaf_scores["normalized_combined_score"] = leaf_scores[
-                "combined_score"] / sum_of_root_node_aggregated_combined_scores
+            leaf_scores["normalized_combined_score"] = (
+                leaf_scores["combined_score"]
+                / sum_of_root_node_aggregated_combined_scores
+            )
         else:
             leaf_scores["normalized_combined_score"] = 0
 
         if debug:
-            print("Score Combining Time: %0.2fms" % ((time.time() - start_time) * 1000.))
+            print(
+                "Score Combining Time: %0.2fms" % ((time.time() - start_time) * 1000.0)
+            )
         leaf_scores.reset_index(drop=True, inplace=True)
         return leaf_scores
 
@@ -339,18 +389,24 @@ def map_result_synonyms(self, leaf_scores, debug=False):
             leaf_scores = leaf_scores.query("has_synonyms == False")
         if replacements:
             # inject the synonym replacements into leaf_scores
-            leaf_scores = pd.concat([
-                leaf_scores,
-                pd.DataFrame.from_dict(replacements, orient="index")
-            ], axis=0)
+            leaf_scores = pd.concat(
+                [leaf_scores, pd.DataFrame.from_dict(replacements, orient="index")],
+                axis=0,
+            )
         if debug:
-            print("Synonym Mapping Time: %0.2fms" % ((time.time() - start_time) * 1000.))
+            print(
+                
"Synonym Mapping Time: %0.2fms" % ((time.time() - start_time) * 1000.0) + ) return leaf_scores - def aggregate_results(self, leaf_scores, debug=False, - score_ratio_cutoff=0.001, - max_leaf_scores_to_consider=None, - column_for_cutoff="combined_score"): + def aggregate_results( + self, + leaf_scores, + debug=False, + score_ratio_cutoff=0.001, + max_leaf_scores_to_consider=None, + column_for_cutoff="combined_score", + ): if debug: start_time = time.time() @@ -359,26 +415,38 @@ def aggregate_results(self, leaf_scores, debug=False, # copy columns from the already calculated leaf scores including scores # and class_id columns which will not be populated for synonyms in the taxonomy - all_node_scores = pd.merge(all_node_scores, leaf_scores[[ - "taxon_id", "vision_score", "normalized_vision_score", "geo_score", "combined_score", - "normalized_geo_score", "leaf_class_id", "iconic_class_id", "spatial_class_id"]], + all_node_scores = pd.merge( + all_node_scores, + leaf_scores[ + [ + "taxon_id", + "vision_score", + "normalized_vision_score", + "geo_score", + "combined_score", + "normalized_geo_score", + "leaf_class_id", + "iconic_class_id", + "spatial_class_id", + ] + ], on="taxon_id", how="left", - suffixes=["_x", None] + suffixes=["_x", None], ).set_index("taxon_id", drop=False) # calculate the highest combined score from leaf_scores - top_combined_score = leaf_scores.sort_values( - column_for_cutoff, ascending=False - ).head(1)[column_for_cutoff].values[0] + top_combined_score = ( + leaf_scores.sort_values(column_for_cutoff, ascending=False) + .head(1)[column_for_cutoff] + .values[0] + ) # define some cutoff based on a percentage of the top combined score. Taxa with # scores below the cutoff will be ignored when aggregating scores up the taxonomy cutoff = top_combined_score * score_ratio_cutoff # restrict score aggregation to results where the combined score is above the cutoff - scores_to_aggregate = leaf_scores.query( - f"{column_for_cutoff} > {cutoff}" - ) + scores_to_aggregate = leaf_scores.query(f"{column_for_cutoff} > {cutoff}") if max_leaf_scores_to_consider is not None: scores_to_aggregate = scores_to_aggregate.sort_values( column_for_cutoff, ascending=False @@ -391,7 +459,7 @@ def aggregate_results(self, leaf_scores, debug=False, scores_to_aggregate["normalized_vision_score"], scores_to_aggregate["geo_score"], scores_to_aggregate["combined_score"], - scores_to_aggregate["geo_threshold"] + scores_to_aggregate["geo_threshold"], ): # loop through the pre-calculated ancestors of this result's taxon for ancestor_taxon_id in self.taxonomy.taxon_ancestors[taxon_id]: @@ -399,23 +467,36 @@ def aggregate_results(self, leaf_scores, debug=False, if ancestor_taxon_id not in aggregated_scores: aggregated_scores[ancestor_taxon_id] = {} aggregated_scores[ancestor_taxon_id]["aggregated_vision_score"] = 0 - aggregated_scores[ancestor_taxon_id]["aggregated_combined_score"] = 0 - aggregated_scores[ancestor_taxon_id]["aggregated_geo_score"] = 0 aggregated_scores[ancestor_taxon_id][ - "aggregated_geo_threshold" - ] = geo_threshold if (ancestor_taxon_id == taxon_id) else 1.0 + "aggregated_combined_score" + ] = 0 + aggregated_scores[ancestor_taxon_id]["aggregated_geo_score"] = 0 + aggregated_scores[ancestor_taxon_id]["aggregated_geo_threshold"] = ( + geo_threshold if (ancestor_taxon_id == taxon_id) else 1.0 + ) # aggregated vision and combined scores are sums of descendant scores - aggregated_scores[ancestor_taxon_id]["aggregated_vision_score"] += vision_score - 
aggregated_scores[ancestor_taxon_id]["aggregated_combined_score"] += combined_score + aggregated_scores[ancestor_taxon_id][ + "aggregated_vision_score" + ] += vision_score + aggregated_scores[ancestor_taxon_id][ + "aggregated_combined_score" + ] += combined_score # aggregated geo score is the max of descendant geo scores - if geo_score > aggregated_scores[ancestor_taxon_id]["aggregated_geo_score"]: - aggregated_scores[ancestor_taxon_id]["aggregated_geo_score"] = geo_score + if ( + geo_score + > aggregated_scores[ancestor_taxon_id]["aggregated_geo_score"] + ): + aggregated_scores[ancestor_taxon_id][ + "aggregated_geo_score" + ] = geo_score # aggregated geo threshold is the min of descendant geo thresholds - if ancestor_taxon_id != taxon_id and geo_threshold < aggregated_scores[ - ancestor_taxon_id - ]["aggregated_geo_threshold"]: + if ( + ancestor_taxon_id != taxon_id + and geo_threshold + < aggregated_scores[ancestor_taxon_id]["aggregated_geo_threshold"] + ): aggregated_scores[ancestor_taxon_id][ "aggregated_geo_threshold" ] = geo_threshold @@ -429,16 +510,22 @@ def aggregate_results(self, leaf_scores, debug=False, # calculate normalized scores so all values add to 1, to be used for thresholding sum_of_root_node_aggregated_vision_scores = all_node_scores.query( - "parent_taxon_id.isnull()")["aggregated_vision_score"].sum() - all_node_scores["normalized_aggregated_vision_score"] = all_node_scores[ - "aggregated_vision_score"] / sum_of_root_node_aggregated_vision_scores + "parent_taxon_id.isnull()" + )["aggregated_vision_score"].sum() + all_node_scores["normalized_aggregated_vision_score"] = ( + all_node_scores["aggregated_vision_score"] + / sum_of_root_node_aggregated_vision_scores + ) sum_of_root_node_aggregated_combined_scores = all_node_scores.query( - "parent_taxon_id.isnull()")["aggregated_combined_score"].sum() - all_node_scores["normalized_aggregated_combined_score"] = all_node_scores[ - "aggregated_combined_score"] / sum_of_root_node_aggregated_combined_scores + "parent_taxon_id.isnull()" + )["aggregated_combined_score"].sum() + all_node_scores["normalized_aggregated_combined_score"] = ( + all_node_scores["aggregated_combined_score"] + / sum_of_root_node_aggregated_combined_scores + ) if debug: - print("Aggregation Time: %0.2fms" % ((time.time() - start_time) * 1000.)) + print("Aggregation Time: %0.2fms" % ((time.time() - start_time) * 1000.0)) # InatInferrer.print_aggregated_scores(all_node_scores) return all_node_scores @@ -460,13 +547,19 @@ def h3_04_geo_results_for_taxon_and_cell(self, taxon_id, lat, lng): return None h3_cell = h3.geo_to_h3(lat_float, lng_float, 4) - return float(self.geo_elevation_model.eval_one_class_elevation_from_features( - [self.geo_model_features[self.geo_elevation_cell_indices[h3_cell]]], - int(taxon["leaf_class_id"]) - )[0][0]) / taxon["geo_threshold"] + return ( + float( + self.geo_elevation_model.eval_one_class_elevation_from_features( + [self.geo_model_features[self.geo_elevation_cell_indices[h3_cell]]], + int(taxon["leaf_class_id"]), + )[0][0] + ) + / taxon["geo_threshold"] + ) - def h3_04_geo_results_for_taxon(self, taxon_id, bounds=[], - thresholded=False, raw_results=False): + def h3_04_geo_results_for_taxon( + self, taxon_id, bounds=[], thresholded=False, raw_results=False + ): if (self.geo_elevation_cells is None) or (self.geo_elevation_model is None): return try: @@ -478,11 +571,14 @@ def h3_04_geo_results_for_taxon(self, taxon_id, bounds=[], return geo_scores = self.geo_elevation_model.eval_one_class_elevation_from_features( - 
self.geo_model_features, int(taxon["leaf_class_id"]))
+            self.geo_model_features, int(taxon["leaf_class_id"])
+        )
         geo_score_cells = self.geo_elevation_cells.copy()
         geo_score_cells["geo_score"] = tf.squeeze(geo_scores).numpy()
         if thresholded:
-            geo_score_cells = geo_score_cells.query(f'geo_score >= {taxon["geo_threshold"]}')
+            geo_score_cells = geo_score_cells.query(
+                f'geo_score >= {taxon["geo_threshold"]}'
+            )
         else:
             # return scores more than 10% of the taxon threshold, or more than 0.0001, whichever
             # is smaller. This reduces data needed to be rendered client-side for the Data Layer
@@ -493,36 +589,48 @@
         if bounds:
             min = geo_score_cells["geo_score"].min()
             max = geo_score_cells["geo_score"].max()
-            geo_score_cells = InatInferrer.filter_geo_dataframe_by_bounds(geo_score_cells, bounds)
+            geo_score_cells = InatInferrer.filter_geo_dataframe_by_bounds(
+                geo_score_cells, bounds
+            )
             if min == max:
                 # all scores are the same, so no transform is needed and all cells get the max value
                 geo_score_cells["geo_score"] = 1
             else:
                 # perform a log transform based on the min/max score for the unbounded set
-                geo_score_cells["geo_score"] = \
-                    (np.log10(geo_score_cells["geo_score"]) - math.log10(min)) / \
-                    (math.log10(max) - math.log10(min))
+                geo_score_cells["geo_score"] = (
+                    np.log10(geo_score_cells["geo_score"]) - math.log10(min)
+                ) / (math.log10(max) - math.log10(min))
 
         if raw_results:
             return geo_score_cells
-        return dict(zip(geo_score_cells.index.astype(str), geo_score_cells["geo_score"]))
+        return dict(
+            zip(geo_score_cells.index.astype(str), geo_score_cells["geo_score"])
+        )
 
     def h3_04_taxon_range(self, taxon_id, bounds=[]):
-        taxon_range_path = os.path.join(self.config["taxon_ranges_path"], f"{taxon_id}.csv")
+        taxon_range_path = os.path.join(
+            self.config["taxon_ranges_path"], f"{taxon_id}.csv"
+        )
         if not os.path.exists(taxon_range_path):
             return None
-        taxon_range_df = pd.read_csv(taxon_range_path, names=["h3_04"], header=None). 
\ - sort_values("h3_04").set_index("h3_04").sort_index() + taxon_range_df = ( + pd.read_csv(taxon_range_path, names=["h3_04"], header=None) + .sort_values("h3_04") + .set_index("h3_04") + .sort_index() + ) taxon_range_df = InatInferrer.add_lat_lng_to_h3_geo_dataframe(taxon_range_df) if bounds: - taxon_range_df = InatInferrer.filter_geo_dataframe_by_bounds(taxon_range_df, bounds) + taxon_range_df = InatInferrer.filter_geo_dataframe_by_bounds( + taxon_range_df, bounds + ) taxon_range_df["value"] = 1 return dict(zip(taxon_range_df.index.astype(str), taxon_range_df["value"])) def h3_04_taxon_range_comparison(self, taxon_id, bounds=[]): - geomodel_results = self.h3_04_geo_results_for_taxon( - taxon_id, bounds, thresholded=True - ) or {} + geomodel_results = ( + self.h3_04_geo_results_for_taxon(taxon_id, bounds, thresholded=True) or {} + ) taxon_range_results = self.h3_04_taxon_range(taxon_id, bounds) or {} combined_results = {} for cell_key in geomodel_results: @@ -537,7 +645,8 @@ def h3_04_taxon_range_comparison(self, taxon_id, bounds=[]): def h3_04_bounds(self, taxon_id): geomodel_results = self.h3_04_geo_results_for_taxon( - taxon_id, bounds=None, thresholded=True, raw_results=True) + taxon_id, bounds=None, thresholded=True, raw_results=True + ) if geomodel_results is None: return swlat = geomodel_results["lat"].min() @@ -551,16 +660,15 @@ def h3_04_bounds(self, taxon_id): if swlng == nelng: swlng -= 0.3 nelng += 0.3 - return { - "swlat": swlat, - "swlng": swlng, - "nelat": nelat, - "nelng": nelng - } + return {"swlat": swlat, "swlng": swlng, "nelat": nelat, "nelng": nelng} def common_ancestor_from_leaf_scores( - self, leaf_scores, debug=False, score_to_use="combined_score", disallow_humans=False, - common_ancestor_rank_type=None + self, + leaf_scores, + debug=False, + score_to_use="combined_score", + disallow_humans=False, + common_ancestor_rank_type=None, ): if leaf_scores.empty: return None @@ -569,22 +677,29 @@ def common_ancestor_from_leaf_scores( debug=debug, score_ratio_cutoff=InatInferrer.COMMON_ANCESTOR_CUTOFF_RATIO, max_leaf_scores_to_consider=InatInferrer.COMMON_ANCESTOR_WINDOW, - column_for_cutoff=score_to_use + column_for_cutoff=score_to_use, ) return self.common_ancestor_from_aggregated_scores( aggregated_scores, debug=debug, score_to_use=score_to_use, disallow_humans=disallow_humans, - common_ancestor_rank_type=common_ancestor_rank_type + common_ancestor_rank_type=common_ancestor_rank_type, ) def common_ancestor_from_aggregated_scores( - self, aggregated_scores, debug=False, score_to_use="combined_score", disallow_humans=False, - common_ancestor_rank_type=None + self, + aggregated_scores, + debug=False, + score_to_use="combined_score", + disallow_humans=False, + common_ancestor_rank_type=None, ): - aggregated_score_to_use = "normalized_aggregated_vision_score" if \ - score_to_use == "vision_score" else "normalized_aggregated_combined_score" + aggregated_score_to_use = ( + "normalized_aggregated_vision_score" + if score_to_use == "vision_score" + else "normalized_aggregated_combined_score" + ) common_ancestor_query = f"{aggregated_score_to_use} > 0.78 and rank_level >= 20" if common_ancestor_rank_type == "major": common_ancestor_query += " and rank_level % 10 == 0" @@ -592,20 +707,28 @@ def common_ancestor_from_aggregated_scores( common_ancestor_query += " and rank_level <= 33" # if using combined scores to aggregate, and there are taxa expected nearby, # then add a query filter to only look at nearby taxa as common ancestor candidates - if aggregated_score_to_use == 
"normalized_aggregated_combined_score" and not \ - aggregated_scores.query("aggregated_geo_score >= aggregated_geo_threshold").empty: - common_ancestor_query += " and aggregated_geo_score >= aggregated_geo_threshold" + if ( + aggregated_score_to_use == "normalized_aggregated_combined_score" + and not aggregated_scores.query( + "aggregated_geo_score >= aggregated_geo_threshold" + ).empty + ): + common_ancestor_query += ( + " and aggregated_geo_score >= aggregated_geo_threshold" + ) common_ancestor_candidates = aggregated_scores.query( common_ancestor_query - ).sort_values( - by=["rank_level"] - ) + ).sort_values(by=["rank_level"]) if common_ancestor_candidates.empty: return None common_ancestor = common_ancestor_candidates.iloc[0] - if disallow_humans and self.taxonomy.human_taxon is not None and \ - common_ancestor["taxon_id"] == self.taxonomy.human_taxon["parent_taxon_id"]: + if ( + disallow_humans + and self.taxonomy.human_taxon is not None + and common_ancestor["taxon_id"] + == self.taxonomy.human_taxon["parent_taxon_id"] + ): return None return common_ancestor @@ -615,10 +738,11 @@ def limit_leaf_scores_that_include_humans(self, leaf_scores): return leaf_scores top_results = leaf_scores.sort_values( - "combined_score", - ascending=False + "combined_score", ascending=False ).reset_index(drop=True) - human_results = top_results.query(f"taxon_id == {self.taxonomy.human_taxon['taxon_id']}") + human_results = top_results.query( + f"taxon_id == {self.taxonomy.human_taxon['taxon_id']}" + ) # there is only 1 result, or humans aren't in the top results if human_results.empty or top_results.index.size == 1: return leaf_scores @@ -627,8 +751,10 @@ def limit_leaf_scores_that_include_humans(self, leaf_scores): human_result_index = human_results.index[0] # if humans is first and has a substantially higher score than the next, return only humans if human_result_index == 0: - human_score_margin = top_results.iloc[0]["combined_score"] / \ - top_results.iloc[1]["combined_score"] + human_score_margin = ( + top_results.iloc[0]["combined_score"] + / top_results.iloc[1]["combined_score"] + ) if human_score_margin > 1.5: return top_results.head(1) @@ -639,8 +765,12 @@ async def embeddings_for_photos(self, photos): response = {} async with aiohttp.ClientSession() as session: queue = asyncio.Queue() - workers = [asyncio.create_task(self.embeddings_worker_task(queue, response, session)) - for _ in range(5)] + workers = [ + asyncio.create_task( + self.embeddings_worker_task(queue, response, session) + ) + for _ in range(5) + ] for photo in photos: queue.put_nowait(photo) await queue.join() @@ -675,7 +805,7 @@ def signature_for_image(self, image_path, debug=False): image = InatInferrer.prepare_image_for_inference(image_path) signature = self.vision_inferrer.process_image(image)["features"] if debug: - print("Signature Time: %0.2fms" % ((time.time() - start_time) * 1000.)) + print("Signature Time: %0.2fms" % ((time.time() - start_time) * 1000.0)) if signature is None: return return signature.numpy().tolist() @@ -723,7 +853,7 @@ def prepare_image_for_inference(file_path): image, [new_height, new_width], method=tf.image.ResizeMethod.AREA, - preserve_aspect_ratio=True + preserve_aspect_ratio=True, ) # determine the upper-left corner that needs to be used to grab the square crop offset_height = math.floor((new_height - eventual_size) / 2) @@ -767,9 +897,11 @@ def filter_geo_dataframe_by_bounds(geo_df, bounds): # query for cells wtihin the buffered bounds, and potentially # on the other side of the antimeridian - query 
= f"lat >= {bounds[0] - buffer} and lat <= {bounds[2] + buffer} and " + \ - f" ((lng >= {bounds[1] - buffer} and lng <= {bounds[3] + buffer})" + \ - f" {antimedirian_condition})" + query = ( + f"lat >= {bounds[0] - buffer} and lat <= {bounds[2] + buffer} and " + + f" ((lng >= {bounds[1] - buffer} and lng <= {bounds[3] + buffer})" + + f" {antimedirian_condition})" + ) return geo_df.query(query) @staticmethod @@ -778,14 +910,17 @@ def print_aggregated_scores(aggregated_scores): "normalized_aggregated_combined_score > 0.005" ) print("\nTree of aggregated results:") - ModelTaxonomyDataframe.print(thresholded_results, display_taxon_lambda=( - lambda row: f"{row.name} [" - f"ID:{row.taxon_id}, " - f"V:{round(row.aggregated_vision_score, 4)}, " - f"NV:{round(row.normalized_aggregated_vision_score, 4)}, " - f"G:{round(row.aggregated_geo_score, 4)}, " - f"GT:{round(row.aggregated_geo_threshold, 4)}, " - f"C:{round(row.aggregated_combined_score, 4)}, " - f"NC:{round(row.normalized_aggregated_combined_score, 4)}]" - )) + ModelTaxonomyDataframe.print( + thresholded_results, + display_taxon_lambda=( + lambda row: f"{row.name} [" + f"ID:{row.taxon_id}, " + f"V:{round(row.aggregated_vision_score, 4)}, " + f"NV:{round(row.normalized_aggregated_vision_score, 4)}, " + f"G:{round(row.aggregated_geo_score, 4)}, " + f"GT:{round(row.aggregated_geo_threshold, 4)}, " + f"C:{round(row.aggregated_combined_score, 4)}, " + f"NC:{round(row.normalized_aggregated_combined_score, 4)}]" + ), + ) print("") diff --git a/lib/model_test_data_export_manager.py b/lib/model_test_data_export_manager.py index 3aa35f9..ecf5450 100644 --- a/lib/model_test_data_export_manager.py +++ b/lib/model_test_data_export_manager.py @@ -14,7 +14,7 @@ def load_train_data_photo_ids(self): return self.train_data_photo_ids = pd.concat( - map(lambda x: pd.read_csv(x, usecols=["photo_id"]), + map(lambda x: pd.read_parquet(x, columns=["photo_id"]), self.cmd_args["exclude_train_photos_path"]) ).drop_duplicates("photo_id").set_index("photo_id").sort_index().index diff --git a/lib/vision_inferrer.py b/lib/vision_inferrer.py index 04aebcb..42f937b 100644 --- a/lib/vision_inferrer.py +++ b/lib/vision_inferrer.py @@ -1,35 +1,36 @@ +from abc import ABC, abstractmethod +from typing import Optional, TypedDict + +import numpy as np import tensorflow as tf -class VisionInferrer: - - def __init__(self, model_path): - self.model_path = model_path - self.prepare_tf_model() - - # initialize the TF model given the configured path - def prepare_tf_model(self): - # disable GPU processing - tf.config.set_visible_devices([], "GPU") - visible_devices = tf.config.get_visible_devices() - for device in visible_devices: - assert device.device_type != "GPU" - - full_model = tf.keras.models.load_model(self.model_path, compile=False) - self.layered_model = tf.keras.Model( - inputs=full_model.inputs, - outputs=[ - full_model.layers[4].output, - full_model.layers[2].output - ] - ) - self.layered_model.compile() - - # given an image object (usually coming from prepare_image_for_inference), - # calculate vision results for the image - def process_image(self, image): - layer_results = self.layered_model(tf.convert_to_tensor(image), training=False) - return { - "predictions": layer_results[0][0], - "features": layer_results[1][0], - } +class VisionResults(TypedDict): + predictions: np.ndarray + features: Optional[np.ndarray] + + +class VisionInferrer(ABC): + @abstractmethod + def __init__(self, model_path: str): + """Subclasses must implement this constructor.""" + pass + + 
@abstractmethod + def prepare_model(self): + """ + Initialize the model. + + Subclasses must implement this method. + """ + pass + + @abstractmethod + def process_image(self, image: tf.Tensor) -> VisionResults: + """ + given an image object (usually coming from prepare_image_for_inference), + calculate vision results for the image + + Subclasses must implement this method. + """ + pass diff --git a/lib/vision_inferrer_coreml.py b/lib/vision_inferrer_coreml.py new file mode 100644 index 0000000..2718eed --- /dev/null +++ b/lib/vision_inferrer_coreml.py @@ -0,0 +1,42 @@ +import coremltools as ct +from PIL import Image +import tensorflow as tf + +from lib.vision_inferrer import VisionInferrer, VisionResults + + +class VisionInferrerCoreML(VisionInferrer): + """Vision Inferrer for the CoreML variant of iNat vision models. + Our implementation expects a single PIL image in the range [0, 255). + """ + + def __init__(self, model_path: str): + self.model_path = model_path + self.prepare_model() + + def prepare_model(self): + """initialize the CoreML model given the configured path""" + self.model = ct.models.MLModel(self.model_path) + spec = self.model.get_spec() + self.input_name = spec.description.input[0].name + + def process_image(self, image_tensor: tf.Tensor) -> VisionResults: + """given an image object (coming from prepare_image_for_inference), + calculate & return vision results for the image.""" + # coreml expects a PIL image so we have to convert from tf + # first we convert from floats [0, 1) to ints [0, 255) + image = tf.image.convert_image_dtype(image_tensor, dtype=tf.uint8) + + # Remove batch dimension if present and convert to NumPy array + image_numpy = image.numpy() + if image_numpy.ndim == 4: + image_numpy = image_numpy[0] + + # Create PIL Image from NumPy array + image_pil = Image.fromarray(image_numpy) + + out_dict = self.model.predict({self.input_name: image_pil}) + preds = out_dict["Identity"][0] + + # don't return features, not relevant for coreml at this point + return {"predictions": preds, "features": None} diff --git a/lib/vision_inferrer_factory.py b/lib/vision_inferrer_factory.py new file mode 100644 index 0000000..ffccc58 --- /dev/null +++ b/lib/vision_inferrer_factory.py @@ -0,0 +1,20 @@ +from sys import platform + +from lib.vision_inferrer import VisionInferrer +from lib.vision_inferrer_coreml import VisionInferrerCoreML +from lib.vision_inferrer_tflite import VisionInferrerTFLite +from lib.vision_inferrer_tf import VisionInferrerTF + + +class VisionInferrerFactory: + @staticmethod + def create(model_path: str) -> VisionInferrer: + if "mlmodel" in model_path: + assert platform == "darwin", "CoreML models can only be used on macOS" + return VisionInferrerCoreML(model_path) + elif "tflite" in model_path: + return VisionInferrerTFLite(model_path) + elif "h5" in model_path: + return VisionInferrerTF(model_path) + else: + raise ValueError(f"Unsupported model format in path: {model_path}") diff --git a/lib/vision_inferrer_tf.py b/lib/vision_inferrer_tf.py new file mode 100644 index 0000000..63ef88b --- /dev/null +++ b/lib/vision_inferrer_tf.py @@ -0,0 +1,37 @@ +import tensorflow as tf + +from lib.vision_inferrer import VisionInferrer, VisionResults + + +class VisionInferrerTF(VisionInferrer): + """Vision Inferrer for the TF variant of iNat vision models. + Our implementation expects inputs in the range [0, 1). 
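+
+    Illustrative sketch (the model path here is hypothetical; inferrers
+    are normally obtained via VisionInferrerFactory.create, which
+    dispatches on the model path):
+
+        inferrer = VisionInferrerFactory.create("path/to/model.h5")
+        results = inferrer.process_image(image_tensor)
+        scores, features = results["predictions"], results["features"]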
+ """ + + def __init__(self, model_path: str): + self.model_path = model_path + self.prepare_model() + + def prepare_model(self): + """initialize the TF model given the configured path""" + # disable GPU processing + tf.config.set_visible_devices([], "GPU") + visible_devices = tf.config.get_visible_devices() + for device in visible_devices: + assert device.device_type != "GPU" + + full_model = tf.keras.models.load_model(self.model_path, compile=False) + self.layered_model = tf.keras.Model( + inputs=full_model.inputs, + outputs=[full_model.layers[-1].output, full_model.layers[2].output], + ) + self.layered_model.compile() + + def process_image(self, image: tf.Tensor) -> VisionResults: + """given an image object (coming from prepare_image_for_inference), + calculate & return vision results for the image.""" + layer_results = self.layered_model(tf.convert_to_tensor(image), training=False) + return { + "predictions": layer_results[0][0], + "features": layer_results[1][0], + } diff --git a/lib/vision_inferrer_tflite.py b/lib/vision_inferrer_tflite.py new file mode 100644 index 0000000..a22ce34 --- /dev/null +++ b/lib/vision_inferrer_tflite.py @@ -0,0 +1,41 @@ +import tensorflow as tf + +from lib.vision_inferrer import VisionInferrer, VisionResults + + +class VisionInferrerTFLite(VisionInferrer): + """Vision Inferrer for the tflite variant of iNat vision models. + Our implementation expects inputs in the range [0, 255). + """ + + def __init__(self, model_path: str): + self.model_path = model_path + self.prepare_model() + + def prepare_model(self): + """initialize the tflite model given the configured path""" + self.interpreter = tf.lite.Interpreter(model_path=self.model_path) + self.interpreter.allocate_tensors() + + self.input_details = self.interpreter.get_input_details() + self.output_details = self.interpreter.get_output_details() + + def process_image(self, image_tensor: tf.Tensor) -> VisionResults: + """given an image object (coming from prepare_image_for_inference), + calculate & return vision results for the image.""" + # tflite expects an image in range [0, 255] not [0, 1] + image_tensor = image_tensor * 255 + + # set the input to tflite model + input_dtype = self.input_details[0]["dtype"] + input_data = image_tensor.numpy().astype(input_dtype) + self.interpreter.set_tensor(self.input_details[0]["index"], input_data) + + # execute the tflite model + self.interpreter.invoke() + + # extract the output + output_data = self.interpreter.get_tensor(self.output_details[0]["index"]) + + # don't return features, not relevant for tflite at this point + return {"predictions": output_data[0], "features": None} diff --git a/lib/vision_testing.py b/lib/vision_testing.py index 142fe8a..8ed853c 100644 --- a/lib/vision_testing.py +++ b/lib/vision_testing.py @@ -435,7 +435,9 @@ def summarize_result_subset( ) / sum_of_precision_and_recall summary["top_score"] = top_normalized_score - summary["matching_score"] = self.matching_score(observation, working_results, normalized_score_column) + summary["matching_score"] = self.matching_score( + observation, working_results, normalized_score_column + ) return summary diff --git a/taxon_range_evaluation.py b/taxon_range_evaluation.py index 1d00336..da9ddef 100644 --- a/taxon_range_evaluation.py +++ b/taxon_range_evaluation.py @@ -18,7 +18,7 @@ from sklearn.metrics import auc from sklearn.metrics import precision_recall_curve from lib.model_taxonomy_dataframe import ModelTaxonomyDataframe -from lib.tf_gp_elev_model import TFGeoPriorModelElev +from 
lib.geo_inferrer_tf import TFGeoPriorModelElev def evaluate_p_r(thres, gdfb, tr_h3, world, plot): diff --git a/tests/conftest.py b/tests/conftest.py index f58935f..1e0a365 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -9,7 +9,7 @@ def taxonomy(): yield ModelTaxonomyDataframe( os.path.realpath(os.path.dirname(__file__) + "/fixtures/taxonomy.csv"), - os.path.realpath(os.path.dirname(__file__) + "/fixtures/thresholds.csv") + os.path.realpath(os.path.dirname(__file__) + "/fixtures/thresholds.csv"), ) @@ -22,18 +22,23 @@ def taxon(request, taxonomy): @pytest.fixture() def inatInferrer(request, mocker): config = { - "vision_model_path": "vision_model_path", - "tf_geo_elevation_model_path": "tf_geo_elevation_model_path", - "taxonomy_path": - os.path.realpath(os.path.dirname(__file__) + "/fixtures/taxonomy.csv"), - "elevation_h3_r4": - os.path.realpath(os.path.dirname(__file__) + "/fixtures/elevation.csv"), - "tf_elev_thresholds": - os.path.realpath(os.path.dirname(__file__) + "/fixtures/thresholds.csv"), - "taxon_ranges_path": - os.path.realpath(os.path.dirname(__file__) + "/fixtures/taxon_ranges"), - "synonyms_path": - os.path.realpath(os.path.dirname(__file__) + "/fixtures/synonyms.csv") + "vision_model_path": "vision_model_path.h5", + "tf_geo_elevation_model_path": "tf_geo_elevation_model_path.h5", + "taxonomy_path": os.path.realpath( + os.path.dirname(__file__) + "/fixtures/taxonomy.csv" + ), + "elevation_h3_r4": os.path.realpath( + os.path.dirname(__file__) + "/fixtures/elevation.csv" + ), + "tf_elev_thresholds": os.path.realpath( + os.path.dirname(__file__) + "/fixtures/thresholds.csv" + ), + "taxon_ranges_path": os.path.realpath( + os.path.dirname(__file__) + "/fixtures/taxon_ranges" + ), + "synonyms_path": os.path.realpath( + os.path.dirname(__file__) + "/fixtures/synonyms.csv" + ), } mocker.patch("tensorflow.keras.models.load_model", return_value=MagicMock()) mocker.patch("tensorflow.keras.Model", return_value=MagicMock()) diff --git a/tests/test_inat_inferrer.py b/tests/test_inat_inferrer.py index d5023f2..e9ca6d6 100644 --- a/tests/test_inat_inferrer.py +++ b/tests/test_inat_inferrer.py @@ -13,24 +13,20 @@ def test_initialization(self, inatInferrer): assert isinstance(inatInferrer.synonyms, pd.DataFrame) assert isinstance(inatInferrer.geo_elevation_cells, pd.DataFrame) tf.keras.models.load_model.assert_any_call( - inatInferrer.config["vision_model_path"], - compile=False + inatInferrer.config["vision_model_path"], compile=False ) tf.keras.models.load_model.assert_any_call( inatInferrer.config["tf_geo_elevation_model_path"], custom_objects={"ResLayer": ResLayer}, - compile=False + compile=False, ) def test_predictions_for_image(self, inatInferrer): - test_image_path = \ - os.path.realpath(os.path.dirname(__file__) + "/fixtures/lamprocapnos_spectabilis.jpeg") + test_image_path = os.path.realpath( + os.path.dirname(__file__) + "/fixtures/lamprocapnos_spectabilis.jpeg" + ) results = inatInferrer.predictions_for_image( - file_path=test_image_path, - lat=42, - lng=-71, - filter_taxon=None, - debug=True + file_path=test_image_path, lat=42, lng=-71, filter_taxon=None, debug=True ) combined_scores = results["combined_scores"] assert isinstance(combined_scores, pd.DataFrame) @@ -62,14 +58,11 @@ def test_lookup_taxon_with_invalid_taxon(self, inatInferrer): assert inatInferrer.lookup_taxon(999999999) is None def test_aggregate_results(self, inatInferrer): - test_image_path = \ - os.path.realpath(os.path.dirname(__file__) + "/fixtures/lamprocapnos_spectabilis.jpeg") + test_image_path = 
os.path.realpath( + os.path.dirname(__file__) + "/fixtures/lamprocapnos_spectabilis.jpeg" + ) predictions_for_image = inatInferrer.predictions_for_image( - file_path=test_image_path, - lat=42, - lng=-71, - filter_taxon=None, - debug=True + file_path=test_image_path, lat=42, lng=-71, filter_taxon=None, debug=True ) combined_scores = predictions_for_image["combined_scores"] combined_scores.normalized_vision_score = 0.5 @@ -77,8 +70,7 @@ def test_aggregate_results(self, inatInferrer): combined_scores.combined_score = 0.25 combined_scores.geo_threshold = 0.001 aggregated_scores = inatInferrer.aggregate_results( - leaf_scores=combined_scores, - debug=True + leaf_scores=combined_scores, debug=True ) assert "aggregated_vision_score" in aggregated_scores.columns assert "aggregated_geo_score" in aggregated_scores.columns @@ -88,17 +80,13 @@ def test_aggregate_results(self, inatInferrer): @pytest.mark.parametrize("taxon", ["Aramus guarauna"], indirect=True) def test_h3_04_taxon_range_comparison(self, mocker, inatInferrer, taxon): - inatInferrer.h3_04_geo_results_for_taxon = MagicMock(return_value={ - "aa": "0.1", - "ab": "0.1" - }) - inatInferrer.h3_04_taxon_range = MagicMock(return_value={ - "ab": "0.1", - "bb": "0.1" - }) - range_comparison_results = inatInferrer.h3_04_taxon_range_comparison(taxon["taxon_id"]) - assert range_comparison_results == { - "aa": 0, - "ab": 0.5, - "bb": 1 - } + inatInferrer.h3_04_geo_results_for_taxon = MagicMock( + return_value={"aa": "0.1", "ab": "0.1"} + ) + inatInferrer.h3_04_taxon_range = MagicMock( + return_value={"ab": "0.1", "bb": "0.1"} + ) + range_comparison_results = inatInferrer.h3_04_taxon_range_comparison( + taxon["taxon_id"] + ) + assert range_comparison_results == {"aa": 0, "ab": 0.5, "bb": 1} diff --git a/tests/test_tf_gp_elev_model.py b/tests/test_tf_gp_elev_model.py index fb56d3b..f215e8b 100644 --- a/tests/test_tf_gp_elev_model.py +++ b/tests/test_tf_gp_elev_model.py @@ -1,10 +1,11 @@ import pytest import tensorflow as tf -from lib.res_layer import ResLayer -from lib.tf_gp_elev_model import TFGeoPriorModelElev from unittest.mock import MagicMock import unittest.mock as mock +from lib.res_layer import ResLayer +from lib.geo_inferrer_tf import TFGeoPriorModelElev + class TestTfGpModel: def test_initialization_with_unknown_model_path(self): @@ -16,9 +17,7 @@ def test_initialization(self, mocker): mocker.patch("tensorflow.keras.models.load_model", return_value=MagicMock()) TFGeoPriorModelElev(model_path) tf.keras.models.load_model.assert_called_once_with( - model_path, - custom_objects={"ResLayer": ResLayer}, - compile=False + model_path, custom_objects={"ResLayer": ResLayer}, compile=False ) def test_predict(self, mocker): @@ -41,5 +40,7 @@ def test_eval_one_class_elevation_from_features(self, mocker): mocker.patch("tensorflow.matmul", return_value=MagicMock()) mocker.patch("tensorflow.expand_dims", return_value=MagicMock()) tf_gp_model = TFGeoPriorModelElev(model_path) - tf_gp_model.eval_one_class_elevation_from_features("features", "class_of_interest") + tf_gp_model.eval_one_class_elevation_from_features( + "features", "class_of_interest" + ) tf.math.sigmoid.assert_called_once diff --git a/tests/test_vision_inferrer.py b/tests/test_vision_inferrer.py index 5327b33..2bf788f 100644 --- a/tests/test_vision_inferrer.py +++ b/tests/test_vision_inferrer.py @@ -1,6 +1,6 @@ import tensorflow as tf from unittest.mock import MagicMock -from lib.vision_inferrer import VisionInferrer +from lib.vision_inferrer_tf import VisionInferrerTF class TestVisionInferrer: @@ 
-8,21 +8,17 @@ def test_initialization(self, mocker): mocker.patch("tensorflow.keras.models.load_model", return_value=MagicMock()) mocker.patch("tensorflow.keras.Model", return_value=MagicMock()) model_path = "model_path" - inferrer = VisionInferrer(model_path) + inferrer = VisionInferrerTF(model_path) assert inferrer.model_path == model_path - tf.keras.models.load_model.assert_called_once_with( - model_path, - compile=False - ) + tf.keras.models.load_model.assert_called_once_with(model_path, compile=False) def test_process_image(self, mocker): mocker.patch("tensorflow.keras.models.load_model", return_value=MagicMock()) mocker.patch("tensorflow.keras.Model", return_value=MagicMock()) model_path = "model_path" - inferrer = VisionInferrer(model_path) + inferrer = VisionInferrerTF(model_path) theimage = "theimage" inferrer.process_image(theimage) inferrer.layered_model.assert_called_once_with( - tf.convert_to_tensor(theimage), - training=False + tf.convert_to_tensor(theimage), training=False ) From ee4783589c7b6824a3efae521d569242fe8bb3fb Mon Sep 17 00:00:00 2001 From: Alex Shepard Date: Thu, 6 Mar 2025 12:58:14 -0800 Subject: [PATCH 2/8] add coremltools to requirements --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index a66056b..f00f48e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,7 @@ aiofiles==24.1.0 aiohttp==3.11.2;python_version>="3.11" aiohttp==3.10.11;python_version=="3.8" +coremltools==8.2 flake8==7.0.0 flake8-quotes==3.4.0 Flask[async]==3.0.2 From e9e944ace640c67d4d8d9e800399f8f0fd3bf990 Mon Sep 17 00:00:00 2001 From: Alex Shepard Date: Thu, 6 Mar 2025 13:41:08 -0800 Subject: [PATCH 3/8] whitespace & formatting --- lib/inat_inferrer.py | 414 +++++++++++++++---------------------------- 1 file changed, 142 insertions(+), 272 deletions(-) diff --git a/lib/inat_inferrer.py b/lib/inat_inferrer.py index ef54844..4aa2039 100644 --- a/lib/inat_inferrer.py +++ b/lib/inat_inferrer.py @@ -42,18 +42,13 @@ def __init__(self, config): def setup_taxonomy(self): self.taxonomy = ModelTaxonomyDataframe( self.config["taxonomy_path"], - ( - self.config["tf_elev_thresholds"] - if "tf_elev_thresholds" in self.config - else None - ), + self.config["tf_elev_thresholds"] if "tf_elev_thresholds" in self.config else None ) def check_for_modified_synonyms(self): # only run the refresh check again if `SYNONYMS_CHECK_FREQUENCY` seconds have passed if not hasattr(self, "synonym_refresh_check_time") or ( - time.time() - self.synonym_refresh_check_time - > InatInferrer.SYNONYMS_CHECK_FREQUENCY + time.time() - self.synonym_refresh_check_time > InatInferrer.SYNONYMS_CHECK_FREQUENCY ): self.refresh_synonyms_if_modified() @@ -61,9 +56,8 @@ def refresh_synonyms_if_modified(self): self.synonym_refresh_check_time = time.time() # only process the synonyms file if it has changed since last being processed if os.path.exists(self.config["synonyms_path"]) and ( - not hasattr(self, "synonyms_path_updated_at") # noqa: W504 - or os.path.getmtime(self.config["synonyms_path"]) - != self.synonyms_path_updated_at + not hasattr(self, "synonyms_path_updated_at") or # noqa: W504 + os.path.getmtime(self.config["synonyms_path"]) != self.synonyms_path_updated_at ): self.setup_synonyms() @@ -85,8 +79,8 @@ def setup_synonyms(self): "rank_level": float, "name": pd.StringDtype(), "iconic_taxon_id": "Int64", - "rank": pd.StringDtype(), - }, + "rank": pd.StringDtype() + } ) # create a dict indexed by model_taxon_id for efficient synonym mappings at inference time @@ 
-109,11 +103,7 @@ def setup_synonym_taxonomy(self): synonym_taxonomy = ModelTaxonomyDataframe( self.config["synonyms_taxonomy_path"], - ( - self.config["tf_elev_thresholds"] - if "tf_elev_thresholds" in self.config - else None - ), + self.config["tf_elev_thresholds"] if "tf_elev_thresholds" in self.config else None ) # ensure the leaf_class_ids from the synonym taxonomy are identical # to the taxonomy generated at data export time @@ -122,15 +112,11 @@ def setup_synonym_taxonomy(self): print(error) return - synonym_taxon_ids = np.unique( - pd.array(self.synonyms["taxon_id"].dropna().values) - ) + synonym_taxon_ids = np.unique(pd.array(self.synonyms["taxon_id"].dropna().values)) synonym_taxonomy_taxon_ids = np.unique( - pd.array( - synonym_taxonomy.df[ - synonym_taxonomy.df.taxon_id.isin(synonym_taxon_ids) - ]["taxon_id"].values - ) + pd.array(synonym_taxonomy.df[ + synonym_taxonomy.df.taxon_id.isin(synonym_taxon_ids) + ]["taxon_id"].values) ) synonym_taxon_ids_not_present_in_taxonomy = np.setdiff1d( synonym_taxon_ids, synonym_taxonomy_taxon_ids @@ -138,18 +124,16 @@ def setup_synonym_taxonomy(self): # ensure all taxa referenced in the synonym mappings file are present in the # updated taxonomy that should include all original taxa plus all synonyms if synonym_taxon_ids_not_present_in_taxonomy.size > 0: - error = ( - "There are taxa in the synonyms file not present in the synonyms " - + f"taxonomy: {synonym_taxon_ids_not_present_in_taxonomy}" - ) + error = "There are taxa in the synonyms file not present in the synonyms " + \ + f"taxonomy: {synonym_taxon_ids_not_present_in_taxonomy}" print(error) return synonym_taxonomy.leaf_df["has_synonyms"] = False # mark taxa that should be replaced or removed as having synonyms - for index, taxon in self.taxonomy.leaf_df[ - self.taxonomy.leaf_df["taxon_id"].isin(self.synonyms["model_taxon_id"]) - ].iterrows(): + for index, taxon in self.taxonomy.leaf_df[self.taxonomy.leaf_df["taxon_id"].isin( + self.synonyms["model_taxon_id"] + )].iterrows(): synonym_taxonomy.leaf_df.loc[taxon["leaf_class_id"], "has_synonyms"] = True # replace the originally exported taxonomy with the updated taxonomy that includes synonyms @@ -166,12 +150,8 @@ def setup_elevation_dataframe(self): return # load elevation data stored at H3 resolution 4 - self.geo_elevation_cells = ( - pd.read_csv(self.config["elevation_h3_r4"]) - .sort_values("h3_04") - .set_index("h3_04") - .sort_index() - ) + self.geo_elevation_cells = pd.read_csv(self.config["elevation_h3_r4"]). 
\ + sort_values("h3_04").set_index("h3_04").sort_index() self.geo_elevation_cells = InatInferrer.add_lat_lng_to_h3_geo_dataframe( self.geo_elevation_cells ) @@ -193,11 +173,7 @@ def setup_elevation_dataframe_from_worldclim(self, resolution): im_df = im_df.melt(id_vars=["index"]) im_df.columns = ["lat", "lng", "elevation"] elev_dfh3 = im_df.h3.geo_to_h3(resolution) - elev_dfh3 = ( - elev_dfh3.drop(columns=["lng", "lat"]) - .groupby(f"h3_0{resolution}") - .mean() - ) + elev_dfh3 = elev_dfh3.drop(columns=["lng", "lat"]).groupby(f"h3_0{resolution}").mean() def setup_geo_model(self): self.geo_elevation_model = None @@ -213,20 +189,19 @@ def setup_geo_model(self): ) if hasattr(self.geo_elevation_model, "features_for_one_class_elevation"): - self.geo_model_features = ( - self.geo_elevation_model.features_for_one_class_elevation( - latitude=list(self.geo_elevation_cells.lat), - longitude=list(self.geo_elevation_cells.lng), - elevation=list(self.geo_elevation_cells.elevation), - ) + self.geo_model_features = self.geo_elevation_model.features_for_one_class_elevation( + latitude=list(self.geo_elevation_cells.lat), + longitude=list(self.geo_elevation_cells.lng), + elevation=list(self.geo_elevation_cells.elevation) ) + def vision_predict(self, image, debug=False): if debug: start_time = time.time() results = self.vision_inferrer.process_image(image) if debug: - print("Vision Time: %0.2fms" % ((time.time() - start_time) * 1000.0)) + print("Vision Time: %0.2fms" % ((time.time() - start_time) * 1000.)) return results def geo_model_predict(self, lat, lng, debug=False): @@ -244,10 +219,9 @@ def geo_model_predict(self, lat, lng, debug=False): # get the average elevation of the above H3 cell elevation = self.geo_elevation_cells.loc[h3_cell].elevation geo_scores = self.geo_elevation_model.predict( - h3_cell_centroid[0], h3_cell_centroid[1], float(elevation) - ) + h3_cell_centroid[0], h3_cell_centroid[1], float(elevation)) if debug: - print("Geo Time: %0.2fms" % ((time.time() - start_time) * 1000.0)) + print("Geo Time: %0.2fms" % ((time.time() - start_time) * 1000.)) return geo_scores def lookup_taxon(self, taxon_id): @@ -263,11 +237,6 @@ def lookup_taxon(self, taxon_id): def predictions_for_image(self, file_path, lat, lng, filter_taxon, debug=False): if debug: start_time = time.time() - - # if isinstance(self.vision_inferrer, VisionInferrerCoreML): - # image = self.vision_inferrer.load_image(file_path) - # vision_model_results = self.vision_predict(image, debug) - # else: image = InatInferrer.prepare_image_for_inference(file_path) vision_model_results = self.vision_predict(image, debug) raw_vision_scores = vision_model_results["predictions"] @@ -280,18 +249,16 @@ def predictions_for_image(self, file_path, lat, lng, filter_taxon, debug=False): # possible value, and thus all its taxa will not be considered "expected nearby" combined_scores["geo_threshold"] = combined_scores["geo_threshold"].fillna(1) if debug: - print("Prediction Time: %0.2fms" % ((time.time() - start_time) * 1000.0)) + print("Prediction Time: %0.2fms" % ((time.time() - start_time) * 1000.)) return { "combined_scores": combined_scores, - "features": vision_model_results["features"], + "features": vision_model_results["features"] } - def combine_results( - self, raw_vision_scores, raw_geo_scores, filter_taxon, debug=False - ): + def combine_results(self, raw_vision_scores, raw_geo_scores, filter_taxon, debug=False): if debug: start_time = time.time() - no_geo_scores = raw_geo_scores is None + no_geo_scores = (raw_geo_scores is None) # make a copy 
of the model taxonomy leaf nodes to be used for storing results. Skip any # filtering at this stage as the taxonomy dataframe needs to have the same number of @@ -302,11 +269,8 @@ def combine_results( # add a column for geo scores leaf_scores["geo_score"] = 0 if no_geo_scores else raw_geo_scores # set a lower limit for geo scores if there are any - leaf_scores["normalized_geo_score"] = ( - 0 - if no_geo_scores + leaf_scores["normalized_geo_score"] = 0 if no_geo_scores \ else leaf_scores["geo_score"].clip(InatInferrer.MINIMUM_GEO_SCORE, None) - ) # if filtering by a taxon, restrict results to that taxon and its descendants if filter_taxon is not None: @@ -316,9 +280,8 @@ def combine_results( ) # normalize the vision scores so they add up to 1 after filtering sum_of_vision_scores = leaf_scores["vision_score"].sum() - leaf_scores["normalized_vision_score"] = ( + leaf_scores["normalized_vision_score"] = \ leaf_scores["vision_score"] / sum_of_vision_scores - ) else: # when not filtering by a taxon, the normalized vision score is the same as the original leaf_scores["normalized_vision_score"] = leaf_scores["vision_score"] @@ -330,26 +293,18 @@ def combine_results( else: # the combined score is simply the normalized vision score # multiplied by the normalized geo score - leaf_scores["combined_score"] = ( - leaf_scores["normalized_vision_score"] - * leaf_scores["normalized_geo_score"] - ) + leaf_scores["combined_score"] = leaf_scores["normalized_vision_score"] * \ + leaf_scores["normalized_geo_score"] - sum_of_root_node_aggregated_combined_scores = leaf_scores[ - "combined_score" - ].sum() + sum_of_root_node_aggregated_combined_scores = leaf_scores["combined_score"].sum() if sum_of_root_node_aggregated_combined_scores > 0: - leaf_scores["normalized_combined_score"] = ( - leaf_scores["combined_score"] - / sum_of_root_node_aggregated_combined_scores - ) + leaf_scores["normalized_combined_score"] = leaf_scores[ "combined_score"] / sum_of_root_node_aggregated_combined_scores else: leaf_scores["normalized_combined_score"] = 0 if debug: - print( - "Score Combining Time: %0.2fms" % ((time.time() - start_time) * 1000.0) - ) + print("Score Combining Time: %0.2fms" % ((time.time() - start_time) * 1000.)) leaf_scores.reset_index(drop=True, inplace=True) return leaf_scores @@ -389,24 +344,18 @@ def map_result_synonyms(self, leaf_scores, debug=False): leaf_scores = leaf_scores.query("has_synonyms == False") if replacements: # inject the synonym replacements into leaf_scores - leaf_scores = pd.concat( - [leaf_scores, pd.DataFrame.from_dict(replacements, orient="index")], - axis=0, - ) + leaf_scores = pd.concat([ + leaf_scores, + pd.DataFrame.from_dict(replacements, orient="index") + ], axis=0) if debug: - print( - "Synonym Mapping Time: %0.2fms" % ((time.time() - start_time) * 1000.0) - ) + print("Synonym Mapping Time: %0.2fms" % ((time.time() - start_time) * 1000.)) return leaf_scores - def aggregate_results( - self, - leaf_scores, - debug=False, - score_ratio_cutoff=0.001, - max_leaf_scores_to_consider=None, - column_for_cutoff="combined_score", - ): + def aggregate_results(self, leaf_scores, debug=False, + score_ratio_cutoff=0.001, + max_leaf_scores_to_consider=None, + column_for_cutoff="combined_score"): if debug: start_time = time.time() @@ -415,38 +364,26 @@ def aggregate_results( # copy columns from the already calculated leaf scores including scores # and class_id columns which will not be populated for synonyms in the taxonomy - all_node_scores = pd.merge( - all_node_scores, - leaf_scores[ - [ -
"taxon_id", - "vision_score", - "normalized_vision_score", - "geo_score", - "combined_score", - "normalized_geo_score", - "leaf_class_id", - "iconic_class_id", - "spatial_class_id", - ] - ], + all_node_scores = pd.merge(all_node_scores, leaf_scores[[ + "taxon_id", "vision_score", "normalized_vision_score", "geo_score", "combined_score", + "normalized_geo_score", "leaf_class_id", "iconic_class_id", "spatial_class_id"]], on="taxon_id", how="left", - suffixes=["_x", None], + suffixes=["_x", None] ).set_index("taxon_id", drop=False) # calculate the highest combined score from leaf_scores - top_combined_score = ( - leaf_scores.sort_values(column_for_cutoff, ascending=False) - .head(1)[column_for_cutoff] - .values[0] - ) + top_combined_score = leaf_scores.sort_values( + column_for_cutoff, ascending=False + ).head(1)[column_for_cutoff].values[0] # define some cutoff based on a percentage of the top combined score. Taxa with # scores below the cutoff will be ignored when aggregating scores up the taxonomy cutoff = top_combined_score * score_ratio_cutoff # restrict score aggregation to results where the combined score is above the cutoff - scores_to_aggregate = leaf_scores.query(f"{column_for_cutoff} > {cutoff}") + scores_to_aggregate = leaf_scores.query( + f"{column_for_cutoff} > {cutoff}" + ) if max_leaf_scores_to_consider is not None: scores_to_aggregate = scores_to_aggregate.sort_values( column_for_cutoff, ascending=False @@ -459,7 +396,7 @@ def aggregate_results( scores_to_aggregate["normalized_vision_score"], scores_to_aggregate["geo_score"], scores_to_aggregate["combined_score"], - scores_to_aggregate["geo_threshold"], + scores_to_aggregate["geo_threshold"] ): # loop through the pre-calculated ancestors of this result's taxon for ancestor_taxon_id in self.taxonomy.taxon_ancestors[taxon_id]: @@ -467,36 +404,23 @@ def aggregate_results( if ancestor_taxon_id not in aggregated_scores: aggregated_scores[ancestor_taxon_id] = {} aggregated_scores[ancestor_taxon_id]["aggregated_vision_score"] = 0 - aggregated_scores[ancestor_taxon_id][ - "aggregated_combined_score" - ] = 0 + aggregated_scores[ancestor_taxon_id]["aggregated_combined_score"] = 0 aggregated_scores[ancestor_taxon_id]["aggregated_geo_score"] = 0 - aggregated_scores[ancestor_taxon_id]["aggregated_geo_threshold"] = ( - geo_threshold if (ancestor_taxon_id == taxon_id) else 1.0 - ) + aggregated_scores[ancestor_taxon_id][ + "aggregated_geo_threshold" + ] = geo_threshold if (ancestor_taxon_id == taxon_id) else 1.0 # aggregated vision and combined scores are sums of descendant scores - aggregated_scores[ancestor_taxon_id][ - "aggregated_vision_score" - ] += vision_score - aggregated_scores[ancestor_taxon_id][ - "aggregated_combined_score" - ] += combined_score + aggregated_scores[ancestor_taxon_id]["aggregated_vision_score"] += vision_score + aggregated_scores[ancestor_taxon_id]["aggregated_combined_score"] += combined_score # aggregated geo score is the max of descendant geo scores - if ( - geo_score - > aggregated_scores[ancestor_taxon_id]["aggregated_geo_score"] - ): - aggregated_scores[ancestor_taxon_id][ - "aggregated_geo_score" - ] = geo_score + if geo_score > aggregated_scores[ancestor_taxon_id]["aggregated_geo_score"]: + aggregated_scores[ancestor_taxon_id]["aggregated_geo_score"] = geo_score # aggregated geo threshold is the min of descendant geo thresholds - if ( - ancestor_taxon_id != taxon_id - and geo_threshold - < aggregated_scores[ancestor_taxon_id]["aggregated_geo_threshold"] - ): + if ancestor_taxon_id != taxon_id and 
geo_threshold < aggregated_scores[ + ancestor_taxon_id + ]["aggregated_geo_threshold"]: aggregated_scores[ancestor_taxon_id][ "aggregated_geo_threshold" ] = geo_threshold @@ -510,22 +434,16 @@ def aggregate_results( # calculate normalized scores so all values add to 1, to be used for thresholding sum_of_root_node_aggregated_vision_scores = all_node_scores.query( - "parent_taxon_id.isnull()" - )["aggregated_vision_score"].sum() - all_node_scores["normalized_aggregated_vision_score"] = ( - all_node_scores["aggregated_vision_score"] - / sum_of_root_node_aggregated_vision_scores - ) + "parent_taxon_id.isnull()")["aggregated_vision_score"].sum() + all_node_scores["normalized_aggregated_vision_score"] = all_node_scores[ + "aggregated_vision_score"] / sum_of_root_node_aggregated_vision_scores sum_of_root_node_aggregated_combined_scores = all_node_scores.query( - "parent_taxon_id.isnull()" - )["aggregated_combined_score"].sum() - all_node_scores["normalized_aggregated_combined_score"] = ( - all_node_scores["aggregated_combined_score"] - / sum_of_root_node_aggregated_combined_scores - ) + "parent_taxon_id.isnull()")["aggregated_combined_score"].sum() + all_node_scores["normalized_aggregated_combined_score"] = all_node_scores[ + "aggregated_combined_score"] / sum_of_root_node_aggregated_combined_scores if debug: - print("Aggregation Time: %0.2fms" % ((time.time() - start_time) * 1000.0)) + print("Aggregation Time: %0.2fms" % ((time.time() - start_time) * 1000.)) # InatInferrer.print_aggregated_scores(all_node_scores) return all_node_scores @@ -547,19 +465,13 @@ def h3_04_geo_results_for_taxon_and_cell(self, taxon_id, lat, lng): return None h3_cell = h3.geo_to_h3(lat_float, lng_float, 4) - return ( - float( - self.geo_elevation_model.eval_one_class_elevation_from_features( - [self.geo_model_features[self.geo_elevation_cell_indices[h3_cell]]], - int(taxon["leaf_class_id"]), - )[0][0] - ) - / taxon["geo_threshold"] - ) + return float(self.geo_elevation_model.eval_one_class_elevation_from_features( + [self.geo_model_features[self.geo_elevation_cell_indices[h3_cell]]], + int(taxon["leaf_class_id"]) + )[0][0]) / taxon["geo_threshold"] - def h3_04_geo_results_for_taxon( - self, taxon_id, bounds=[], thresholded=False, raw_results=False - ): + def h3_04_geo_results_for_taxon(self, taxon_id, bounds=[], + thresholded=False, raw_results=False): if (self.geo_elevation_cells is None) or (self.geo_elevation_model is None): return try: @@ -571,14 +483,11 @@ def h3_04_geo_results_for_taxon( return geo_scores = self.geo_elevation_model.eval_one_class_elevation_from_features( - self.geo_model_features, int(taxon["leaf_class_id"]) - ) + self.geo_model_features, int(taxon["leaf_class_id"])) geo_score_cells = self.geo_elevation_cells.copy() geo_score_cells["geo_score"] = tf.squeeze(geo_scores).numpy() if thresholded: - geo_score_cells = geo_score_cells.query( - f'geo_score >= {taxon["geo_threshold"]}' - ) + geo_score_cells = geo_score_cells.query(f'geo_score >= {taxon["geo_threshold"]}') else: # return scores more than 10% of the taxon threshold, or more than 0.0001, whichever # is smaller. 
This reduces data needed to be rendered client-side for the Data Layer @@ -589,48 +498,36 @@ def h3_04_geo_results_for_taxon( if bounds: min = geo_score_cells["geo_score"].min() max = geo_score_cells["geo_score"].max() - geo_score_cells = InatInferrer.filter_geo_dataframe_by_bounds( - geo_score_cells, bounds - ) + geo_score_cells = InatInferrer.filter_geo_dataframe_by_bounds(geo_score_cells, bounds) if min == max: # all scores are the same, so no transform is needed and all cells get the max value geo_score_cells["geo_score"] = 1 else: # perform a log transform based on the min/max score for the unbounded set - geo_score_cells["geo_score"] = ( - np.log10(geo_score_cells["geo_score"]) - math.log10(min) - ) / (math.log10(max) - math.log10(min)) + geo_score_cells["geo_score"] = \ + (np.log10(geo_score_cells["geo_score"]) - math.log10(min)) / \ + (math.log10(max) - math.log10(min)) if raw_results: return geo_score_cells - return dict( - zip(geo_score_cells.index.astype(str), geo_score_cells["geo_score"]) - ) + return dict(zip(geo_score_cells.index.astype(str), geo_score_cells["geo_score"])) def h3_04_taxon_range(self, taxon_id, bounds=[]): - taxon_range_path = os.path.join( - self.config["taxon_ranges_path"], f"{taxon_id}.csv" - ) + taxon_range_path = os.path.join(self.config["taxon_ranges_path"], f"{taxon_id}.csv") if not os.path.exists(taxon_range_path): return None - taxon_range_df = ( - pd.read_csv(taxon_range_path, names=["h3_04"], header=None) - .sort_values("h3_04") - .set_index("h3_04") - .sort_index() - ) + taxon_range_df = pd.read_csv(taxon_range_path, names=["h3_04"], header=None). \ sort_values("h3_04").set_index("h3_04").sort_index() taxon_range_df = InatInferrer.add_lat_lng_to_h3_geo_dataframe(taxon_range_df) if bounds: - taxon_range_df = InatInferrer.filter_geo_dataframe_by_bounds( - taxon_range_df, bounds - ) + taxon_range_df = InatInferrer.filter_geo_dataframe_by_bounds(taxon_range_df, bounds) taxon_range_df["value"] = 1 return dict(zip(taxon_range_df.index.astype(str), taxon_range_df["value"])) def h3_04_taxon_range_comparison(self, taxon_id, bounds=[]): - geomodel_results = ( - self.h3_04_geo_results_for_taxon(taxon_id, bounds, thresholded=True) or {} - ) + geomodel_results = self.h3_04_geo_results_for_taxon( + taxon_id, bounds, thresholded=True + ) or {} taxon_range_results = self.h3_04_taxon_range(taxon_id, bounds) or {} combined_results = {} for cell_key in geomodel_results: @@ -645,8 +542,7 @@ def h3_04_taxon_range_comparison(self, taxon_id, bounds=[]): def h3_04_bounds(self, taxon_id): geomodel_results = self.h3_04_geo_results_for_taxon( - taxon_id, bounds=None, thresholded=True, raw_results=True - ) + taxon_id, bounds=None, thresholded=True, raw_results=True) if geomodel_results is None: return swlat = geomodel_results["lat"].min() @@ -660,15 +556,16 @@ def h3_04_bounds(self, taxon_id): if swlng == nelng: swlng -= 0.3 nelng += 0.3 - return {"swlat": swlat, "swlng": swlng, "nelat": nelat, "nelng": nelng} + return { + "swlat": swlat, + "swlng": swlng, + "nelat": nelat, + "nelng": nelng + } def common_ancestor_from_leaf_scores( - self, - leaf_scores, - debug=False, - score_to_use="combined_score", - disallow_humans=False, - common_ancestor_rank_type=None, + self, leaf_scores, debug=False, score_to_use="combined_score", disallow_humans=False, + common_ancestor_rank_type=None ): if leaf_scores.empty: return None @@ -677,29 +574,22 @@ def common_ancestor_from_leaf_scores( debug=debug, score_ratio_cutoff=InatInferrer.COMMON_ANCESTOR_CUTOFF_RATIO,
max_leaf_scores_to_consider=InatInferrer.COMMON_ANCESTOR_WINDOW, - column_for_cutoff=score_to_use, + column_for_cutoff=score_to_use ) return self.common_ancestor_from_aggregated_scores( aggregated_scores, debug=debug, score_to_use=score_to_use, disallow_humans=disallow_humans, - common_ancestor_rank_type=common_ancestor_rank_type, + common_ancestor_rank_type=common_ancestor_rank_type ) def common_ancestor_from_aggregated_scores( - self, - aggregated_scores, - debug=False, - score_to_use="combined_score", - disallow_humans=False, - common_ancestor_rank_type=None, + self, aggregated_scores, debug=False, score_to_use="combined_score", disallow_humans=False, + common_ancestor_rank_type=None ): - aggregated_score_to_use = ( - "normalized_aggregated_vision_score" - if score_to_use == "vision_score" - else "normalized_aggregated_combined_score" - ) + aggregated_score_to_use = "normalized_aggregated_vision_score" if \ + score_to_use == "vision_score" else "normalized_aggregated_combined_score" common_ancestor_query = f"{aggregated_score_to_use} > 0.78 and rank_level >= 20" if common_ancestor_rank_type == "major": common_ancestor_query += " and rank_level % 10 == 0" @@ -707,28 +597,20 @@ def common_ancestor_from_aggregated_scores( common_ancestor_query += " and rank_level <= 33" # if using combined scores to aggregate, and there are taxa expected nearby, # then add a query filter to only look at nearby taxa as common ancestor candidates - if ( - aggregated_score_to_use == "normalized_aggregated_combined_score" - and not aggregated_scores.query( - "aggregated_geo_score >= aggregated_geo_threshold" - ).empty - ): - common_ancestor_query += ( - " and aggregated_geo_score >= aggregated_geo_threshold" - ) + if aggregated_score_to_use == "normalized_aggregated_combined_score" and not \ + aggregated_scores.query("aggregated_geo_score >= aggregated_geo_threshold").empty: + common_ancestor_query += " and aggregated_geo_score >= aggregated_geo_threshold" common_ancestor_candidates = aggregated_scores.query( common_ancestor_query - ).sort_values(by=["rank_level"]) + ).sort_values( + by=["rank_level"] + ) if common_ancestor_candidates.empty: return None common_ancestor = common_ancestor_candidates.iloc[0] - if ( - disallow_humans - and self.taxonomy.human_taxon is not None - and common_ancestor["taxon_id"] - == self.taxonomy.human_taxon["parent_taxon_id"] - ): + if disallow_humans and self.taxonomy.human_taxon is not None and \ + common_ancestor["taxon_id"] == self.taxonomy.human_taxon["parent_taxon_id"]: return None return common_ancestor @@ -738,11 +620,10 @@ def limit_leaf_scores_that_include_humans(self, leaf_scores): return leaf_scores top_results = leaf_scores.sort_values( - "combined_score", ascending=False + "combined_score", + ascending=False ).reset_index(drop=True) - human_results = top_results.query( - f"taxon_id == {self.taxonomy.human_taxon['taxon_id']}" - ) + human_results = top_results.query(f"taxon_id == {self.taxonomy.human_taxon['taxon_id']}") # there is only 1 result, or humans aren't in the top results if human_results.empty or top_results.index.size == 1: return leaf_scores @@ -751,10 +632,8 @@ def limit_leaf_scores_that_include_humans(self, leaf_scores): human_result_index = human_results.index[0] # if humans is first and has a substantially higher score than the next, return only humans if human_result_index == 0: - human_score_margin = ( - top_results.iloc[0]["combined_score"] - / top_results.iloc[1]["combined_score"] - ) + human_score_margin = top_results.iloc[0]["combined_score"] / \ 
+ top_results.iloc[1]["combined_score"] if human_score_margin > 1.5: return top_results.head(1) @@ -765,12 +644,8 @@ async def embeddings_for_photos(self, photos): response = {} async with aiohttp.ClientSession() as session: queue = asyncio.Queue() - workers = [ - asyncio.create_task( - self.embeddings_worker_task(queue, response, session) - ) - for _ in range(5) - ] + workers = [asyncio.create_task(self.embeddings_worker_task(queue, response, session)) + for _ in range(5)] for photo in photos: queue.put_nowait(photo) await queue.join() @@ -805,7 +680,7 @@ def signature_for_image(self, image_path, debug=False): image = InatInferrer.prepare_image_for_inference(image_path) signature = self.vision_inferrer.process_image(image)["features"] if debug: - print("Signature Time: %0.2fms" % ((time.time() - start_time) * 1000.0)) + print("Signature Time: %0.2fms" % ((time.time() - start_time) * 1000.)) if signature is None: return return signature.numpy().tolist() @@ -853,7 +728,7 @@ def prepare_image_for_inference(file_path): image, [new_height, new_width], method=tf.image.ResizeMethod.AREA, - preserve_aspect_ratio=True, + preserve_aspect_ratio=True ) # determine the upper-left corner that needs to be used to grab the square crop offset_height = math.floor((new_height - eventual_size) / 2) @@ -897,11 +772,9 @@ def filter_geo_dataframe_by_bounds(geo_df, bounds): # query for cells within the buffered bounds, and potentially # on the other side of the antimeridian - query = ( - f"lat >= {bounds[0] - buffer} and lat <= {bounds[2] + buffer} and " - + f" ((lng >= {bounds[1] - buffer} and lng <= {bounds[3] + buffer})" - + f" {antimedirian_condition})" - ) + query = f"lat >= {bounds[0] - buffer} and lat <= {bounds[2] + buffer} and " + \ + f" ((lng >= {bounds[1] - buffer} and lng <= {bounds[3] + buffer})" + \ + f" {antimedirian_condition})" return geo_df.query(query) @staticmethod @@ -910,17 +783,14 @@ def print_aggregated_scores(aggregated_scores): "normalized_aggregated_combined_score > 0.005" ) print("\nTree of aggregated results:") - ModelTaxonomyDataframe.print( - thresholded_results, - display_taxon_lambda=( - lambda row: f"{row.name} [" - f"ID:{row.taxon_id}, " - f"V:{round(row.aggregated_vision_score, 4)}, " - f"NV:{round(row.normalized_aggregated_vision_score, 4)}, " - f"G:{round(row.aggregated_geo_score, 4)}, " - f"GT:{round(row.aggregated_geo_threshold, 4)}, " - f"C:{round(row.aggregated_combined_score, 4)}, " - f"NC:{round(row.normalized_aggregated_combined_score, 4)}]" - ), - ) + ModelTaxonomyDataframe.print(thresholded_results, display_taxon_lambda=( + lambda row: f"{row.name} [" + f"ID:{row.taxon_id}, " + f"V:{round(row.aggregated_vision_score, 4)}, " + f"NV:{round(row.normalized_aggregated_vision_score, 4)}, " + f"G:{round(row.aggregated_geo_score, 4)}, " + f"GT:{round(row.aggregated_geo_threshold, 4)}, " + f"C:{round(row.aggregated_combined_score, 4)}, " + f"NC:{round(row.normalized_aggregated_combined_score, 4)}]" + )) print("") From 532eff2b68d5b741476642d327c8ce3ee0e4e8ec Mon Sep 17 00:00:00 2001 From: Alex Shepard Date: Thu, 6 Mar 2025 13:41:35 -0800 Subject: [PATCH 4/8] whitespace and formatting --- lib/geo_inferrer_tf.py | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/lib/geo_inferrer_tf.py b/lib/geo_inferrer_tf.py index 30a84e0..79c5a1c 100644 --- a/lib/geo_inferrer_tf.py +++ b/lib/geo_inferrer_tf.py @@ -18,16 +18,16 @@ def __init__(self, model_path: str): for device in visible_devices: assert device.device_type != "GPU" self.gpmodel =
tf.keras.models.load_model( - model_path, custom_objects={"ResLayer": ResLayer}, compile=False + model_path, + custom_objects={"ResLayer": ResLayer}, + compile=False ) - def predict( - self, latitude: float, longitude: float, elevation: float - ) -> np.ndarray: + def predict(self, latitude: float, longitude: float, elevation: float) -> np.ndarray: encoded_loc = GeoInferrer.encode_loc([latitude], [longitude], [elevation]) - output = self.gpmodel( - tf.convert_to_tensor(tf.expand_dims(encoded_loc[0], axis=0)), training=False - )[0] + output = self.gpmodel(tf.convert_to_tensor( + tf.expand_dims(encoded_loc[0], axis=0) + ), training=False)[0] return output def features_for_one_class_elevation(self, latitude, longitude, elevation): @@ -58,10 +58,8 @@ def eval_one_class_elevation_from_features(self, features, class_of_interest): # process just the one class return tf.math.sigmoid( tf.matmul( - tf.expand_dims( - self.gpmodel.layers[5].weights[0][:, class_of_interest], axis=0 - ), + tf.expand_dims(self.gpmodel.layers[5].weights[0][:, class_of_interest], axis=0), features, - transpose_b=True, + transpose_b=True ) ).numpy() From 5f4645194e42234791490034b076a22f4b9d059f Mon Sep 17 00:00:00 2001 From: Alex Shepard Date: Thu, 6 Mar 2025 13:42:30 -0800 Subject: [PATCH 5/8] roll back switch to parquet here --- lib/model_test_data_export_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/model_test_data_export_manager.py b/lib/model_test_data_export_manager.py index ecf5450..3aa35f9 100644 --- a/lib/model_test_data_export_manager.py +++ b/lib/model_test_data_export_manager.py @@ -14,7 +14,7 @@ def load_train_data_photo_ids(self): return self.train_data_photo_ids = pd.concat( - map(lambda x: pd.read_parquet(x, columns=["photo_id"]), + map(lambda x: pd.read_csv(x, usecols=["photo_id"]), self.cmd_args["exclude_train_photos_path"]) ).drop_duplicates("photo_id").set_index("photo_id").sort_index().index From 52defdb00291ea62a7f868780082e5aab21e66fd Mon Sep 17 00:00:00 2001 From: Alex Shepard Date: Thu, 6 Mar 2025 13:50:29 -0800 Subject: [PATCH 6/8] whitespace & formatting cleanup --- lib/vision_testing.py | 4 +-- tests/conftest.py | 27 ++++++++----------- tests/test_inat_inferrer.py | 54 ++++++++++++++++++++++--------------- 3 files changed, 45 insertions(+), 40 deletions(-) diff --git a/lib/vision_testing.py b/lib/vision_testing.py index 8ed853c..142fe8a 100644 --- a/lib/vision_testing.py +++ b/lib/vision_testing.py @@ -435,9 +435,7 @@ def summarize_result_subset( ) / sum_of_precision_and_recall summary["top_score"] = top_normalized_score - summary["matching_score"] = self.matching_score( - observation, working_results, normalized_score_column - ) + summary["matching_score"] = self.matching_score(observation, working_results, normalized_score_column) return summary diff --git a/tests/conftest.py b/tests/conftest.py index 1e0a365..4a58158 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -9,7 +9,7 @@ def taxonomy(): yield ModelTaxonomyDataframe( os.path.realpath(os.path.dirname(__file__) + "/fixtures/taxonomy.csv"), - os.path.realpath(os.path.dirname(__file__) + "/fixtures/thresholds.csv"), + os.path.realpath(os.path.dirname(__file__) + "/fixtures/thresholds.csv") ) @@ -24,21 +24,16 @@ def inatInferrer(request, mocker): config = { "vision_model_path": "vision_model_path.h5", "tf_geo_elevation_model_path": "tf_geo_elevation_model_path.h5", - "taxonomy_path": os.path.realpath( - os.path.dirname(__file__) + "/fixtures/taxonomy.csv" - ), - "elevation_h3_r4": os.path.realpath( - 
os.path.dirname(__file__) + "/fixtures/elevation.csv" - ), - "tf_elev_thresholds": os.path.realpath( - os.path.dirname(__file__) + "/fixtures/thresholds.csv" - ), - "taxon_ranges_path": os.path.realpath( - os.path.dirname(__file__) + "/fixtures/taxon_ranges" - ), - "synonyms_path": os.path.realpath( - os.path.dirname(__file__) + "/fixtures/synonyms.csv" - ), + "taxonomy_path": + os.path.realpath(os.path.dirname(__file__) + "/fixtures/taxonomy.csv"), + "elevation_h3_r4": + os.path.realpath(os.path.dirname(__file__) + "/fixtures/elevation.csv"), + "tf_elev_thresholds": + os.path.realpath(os.path.dirname(__file__) + "/fixtures/thresholds.csv"), + "taxon_ranges_path": + os.path.realpath(os.path.dirname(__file__) + "/fixtures/taxon_ranges"), + "synonyms_path": + os.path.realpath(os.path.dirname(__file__) + "/fixtures/synonyms.csv") } mocker.patch("tensorflow.keras.models.load_model", return_value=MagicMock()) mocker.patch("tensorflow.keras.Model", return_value=MagicMock()) diff --git a/tests/test_inat_inferrer.py b/tests/test_inat_inferrer.py index e9ca6d6..d5023f2 100644 --- a/tests/test_inat_inferrer.py +++ b/tests/test_inat_inferrer.py @@ -13,20 +13,24 @@ def test_initialization(self, inatInferrer): assert isinstance(inatInferrer.synonyms, pd.DataFrame) assert isinstance(inatInferrer.geo_elevation_cells, pd.DataFrame) tf.keras.models.load_model.assert_any_call( - inatInferrer.config["vision_model_path"], compile=False + inatInferrer.config["vision_model_path"], + compile=False ) tf.keras.models.load_model.assert_any_call( inatInferrer.config["tf_geo_elevation_model_path"], custom_objects={"ResLayer": ResLayer}, - compile=False, + compile=False ) def test_predictions_for_image(self, inatInferrer): - test_image_path = os.path.realpath( - os.path.dirname(__file__) + "/fixtures/lamprocapnos_spectabilis.jpeg" - ) + test_image_path = \ + os.path.realpath(os.path.dirname(__file__) + "/fixtures/lamprocapnos_spectabilis.jpeg") results = inatInferrer.predictions_for_image( - file_path=test_image_path, lat=42, lng=-71, filter_taxon=None, debug=True + file_path=test_image_path, + lat=42, + lng=-71, + filter_taxon=None, + debug=True ) combined_scores = results["combined_scores"] assert isinstance(combined_scores, pd.DataFrame) @@ -58,11 +62,14 @@ def test_lookup_taxon_with_invalid_taxon(self, inatInferrer): assert inatInferrer.lookup_taxon(999999999) is None def test_aggregate_results(self, inatInferrer): - test_image_path = os.path.realpath( - os.path.dirname(__file__) + "/fixtures/lamprocapnos_spectabilis.jpeg" - ) + test_image_path = \ + os.path.realpath(os.path.dirname(__file__) + "/fixtures/lamprocapnos_spectabilis.jpeg") predictions_for_image = inatInferrer.predictions_for_image( - file_path=test_image_path, lat=42, lng=-71, filter_taxon=None, debug=True + file_path=test_image_path, + lat=42, + lng=-71, + filter_taxon=None, + debug=True ) combined_scores = predictions_for_image["combined_scores"] combined_scores.normalized_vision_score = 0.5 @@ -70,7 +77,8 @@ def test_aggregate_results(self, inatInferrer): combined_scores.combined_score = 0.25 combined_scores.geo_threshold = 0.001 aggregated_scores = inatInferrer.aggregate_results( - leaf_scores=combined_scores, debug=True + leaf_scores=combined_scores, + debug=True ) assert "aggregated_vision_score" in aggregated_scores.columns assert "aggregated_geo_score" in aggregated_scores.columns @@ -80,13 +88,17 @@ def test_aggregate_results(self, inatInferrer): @pytest.mark.parametrize("taxon", ["Aramus guarauna"], indirect=True) def 
test_h3_04_taxon_range_comparison(self, mocker, inatInferrer, taxon): - inatInferrer.h3_04_geo_results_for_taxon = MagicMock( - return_value={"aa": "0.1", "ab": "0.1"} - ) - inatInferrer.h3_04_taxon_range = MagicMock( - return_value={"ab": "0.1", "bb": "0.1"} - ) - range_comparison_results = inatInferrer.h3_04_taxon_range_comparison( - taxon["taxon_id"] - ) - assert range_comparison_results == {"aa": 0, "ab": 0.5, "bb": 1} + inatInferrer.h3_04_geo_results_for_taxon = MagicMock(return_value={ + "aa": "0.1", + "ab": "0.1" + }) + inatInferrer.h3_04_taxon_range = MagicMock(return_value={ + "ab": "0.1", + "bb": "0.1" + }) + range_comparison_results = inatInferrer.h3_04_taxon_range_comparison(taxon["taxon_id"]) + assert range_comparison_results == { + "aa": 0, + "ab": 0.5, + "bb": 1 + } From bc2db1397f782e111ada852c98262ee8a9beeb08 Mon Sep 17 00:00:00 2001 From: Alex Shepard Date: Thu, 6 Mar 2025 13:52:27 -0800 Subject: [PATCH 7/8] whitespace & formatting cleanup --- tests/test_tf_gp_elev_model.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/tests/test_tf_gp_elev_model.py b/tests/test_tf_gp_elev_model.py index f215e8b..0821a6b 100644 --- a/tests/test_tf_gp_elev_model.py +++ b/tests/test_tf_gp_elev_model.py @@ -1,10 +1,9 @@ import pytest import tensorflow as tf -from unittest.mock import MagicMock -import unittest.mock as mock - from lib.res_layer import ResLayer from lib.geo_inferrer_tf import TFGeoPriorModelElev +from unittest.mock import MagicMock +import unittest.mock as mock class TestTfGpModel: @@ -17,7 +16,9 @@ def test_initialization(self, mocker): mocker.patch("tensorflow.keras.models.load_model", return_value=MagicMock()) TFGeoPriorModelElev(model_path) tf.keras.models.load_model.assert_called_once_with( - model_path, custom_objects={"ResLayer": ResLayer}, compile=False + model_path, + custom_objects={"ResLayer": ResLayer}, + compile=False ) def test_predict(self, mocker): @@ -40,7 +41,5 @@ def test_eval_one_class_elevation_from_features(self, mocker): mocker.patch("tensorflow.matmul", return_value=MagicMock()) mocker.patch("tensorflow.expand_dims", return_value=MagicMock()) tf_gp_model = TFGeoPriorModelElev(model_path) - tf_gp_model.eval_one_class_elevation_from_features( - "features", "class_of_interest" - ) + tf_gp_model.eval_one_class_elevation_from_features("features", "class_of_interest") tf.math.sigmoid.assert_called_once From 5391732f728a9d353297b126f8f01403f7e565a3 Mon Sep 17 00:00:00 2001 From: Alex Shepard Date: Thu, 6 Mar 2025 13:54:12 -0800 Subject: [PATCH 8/8] whitespace & formatting --- tests/test_vision_inferrer.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tests/test_vision_inferrer.py b/tests/test_vision_inferrer.py index 2bf788f..7693870 100644 --- a/tests/test_vision_inferrer.py +++ b/tests/test_vision_inferrer.py @@ -10,7 +10,10 @@ def test_initialization(self, mocker): model_path = "model_path" inferrer = VisionInferrerTF(model_path) assert inferrer.model_path == model_path - tf.keras.models.load_model.assert_called_once_with(model_path, compile=False) + tf.keras.models.load_model.assert_called_once_with( + model_path, + compile=False + ) def test_process_image(self, mocker): mocker.patch("tensorflow.keras.models.load_model", return_value=MagicMock()) @@ -20,5 +23,6 @@ def test_process_image(self, mocker): theimage = "theimage" inferrer.process_image(theimage) inferrer.layered_model.assert_called_once_with( - tf.convert_to_tensor(theimage), training=False + tf.convert_to_tensor(theimage), + 
training=False )
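
A note for reviewers: the reformatted aggregate_results in patch 3/8 is hard to follow through the diff noise. Below is a minimal standalone sketch of the aggregation rule those hunks implement — vision and combined scores sum up each taxon's ancestor chain, the geo score propagates as a max, and the geo threshold as a min. The taxon_ancestors mapping and score tuples are toy values for illustration, not data from the repo; taxon_ancestors stands in for self.taxonomy.taxon_ancestors, and each ancestor list includes the taxon itself, as the real loop assumes.

    # Sketch of the ancestor aggregation in patch 3/8's aggregate_results,
    # with toy taxonomy and leaf scores.
    taxon_ancestors = {
        101: [1, 10, 101],
        102: [1, 10, 102],
    }

    # (taxon_id, vision_score, geo_score, combined_score, geo_threshold)
    leaf_scores = [
        (101, 0.6, 0.9, 0.54, 0.02),
        (102, 0.3, 0.4, 0.12, 0.05),
    ]

    aggregated = {}
    for taxon_id, vision, geo, combined, threshold in leaf_scores:
        for ancestor_id in taxon_ancestors[taxon_id]:
            agg = aggregated.setdefault(ancestor_id, {
                "aggregated_vision_score": 0.0,
                "aggregated_combined_score": 0.0,
                "aggregated_geo_score": 0.0,
                # a leaf seeds its own threshold; ancestors start at 1.0
                "aggregated_geo_threshold": threshold if ancestor_id == taxon_id else 1.0,
            })
            # vision and combined scores are sums of descendant scores
            agg["aggregated_vision_score"] += vision
            agg["aggregated_combined_score"] += combined
            # geo score is the max of descendant geo scores
            agg["aggregated_geo_score"] = max(agg["aggregated_geo_score"], geo)
            # geo threshold is the min of descendant geo thresholds
            if ancestor_id != taxon_id:
                agg["aggregated_geo_threshold"] = min(
                    agg["aggregated_geo_threshold"], threshold
                )

    # interior node 10: vision 0.9, combined 0.66, geo max 0.9, threshold min 0.02
    print(aggregated[10])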
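Similarly, the reworked test_h3_04_taxon_range_comparison in patch 6/8 pins down the comparison convention: a cell predicted only by the geo model scores 0, a cell in both the geo model output and the curated taxon range scores 0.5, and a cell only in the taxon range scores 1. A standalone sketch of that merge, reconstructed from the test fixtures rather than the full method body, using the same toy cell keys as the test:

    # Merge convention behind h3_04_taxon_range_comparison: the geo model
    # predicts {aa, ab} and the taxon range is {ab, bb}, as in the test.
    geomodel_results = {"aa": "0.1", "ab": "0.1"}
    taxon_range_results = {"ab": "0.1", "bb": "0.1"}

    combined = {}
    for cell_key in geomodel_results:
        # 0.5 where the geo model and taxon range agree, 0 where only the model predicts
        combined[cell_key] = 0.5 if cell_key in taxon_range_results else 0
    for cell_key in taxon_range_results:
        # 1 where only the taxon range covers the cell
        if cell_key not in combined:
            combined[cell_key] = 1

    assert combined == {"aa": 0, "ab": 0.5, "bb": 1}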
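One more sketch, for the bounded path of h3_04_geo_results_for_taxon in patch 3/8: when bounds are supplied, each remaining cell's score is rescaled with a log transform anchored to the min/max of the unbounded result set, presumably so bounded requests stay on the same scale as the full set. The numbers below are toy values; in the method, min and max come from the full cell set before bounds filtering.

    import math

    # log-transform rescaling used for bounded geo results
    unbounded_min, unbounded_max = 1e-4, 0.8
    cell_scores = [1e-4, 0.01, 0.8]

    rescaled = [
        (math.log10(s) - math.log10(unbounded_min))
        / (math.log10(unbounded_max) - math.log10(unbounded_min))
        for s in cell_scores
    ]
    print(rescaled)  # 0.0 at the min, 1.0 at the max, log-spaced in between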