From be04d88a1d7b98daafe8c16fd56a69c451a2ccce Mon Sep 17 00:00:00 2001 From: Herbie Bradley Date: Fri, 15 Jul 2022 23:26:27 +0100 Subject: [PATCH 1/4] update pre-commit config --- .pre-commit-config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 538b35e..43b804e 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,10 +1,10 @@ repos: - repo: https://github.com/psf/black - rev: 20.8b1 + rev: 22.6.0 hooks: - id: black language_version: python3.8 - repo: https://github.com/pycqa/pylint - rev: pylint-2.6.0 + rev: v2.14.4 hooks: - id: pylint From 1b827a0192e191b2a47a1d7a3ac9738272418d00 Mon Sep 17 00:00:00 2001 From: Herbie Bradley Date: Fri, 15 Jul 2022 23:43:04 +0100 Subject: [PATCH 2/4] fix: improve geograph loading perf. --- geograph/geograph.py | 124 ++++++++++++++++++++++--------------------- pylintrc | 12 ----- 2 files changed, 64 insertions(+), 72 deletions(-) diff --git a/geograph/geograph.py b/geograph/geograph.py index efd3e9f..104d85d 100644 --- a/geograph/geograph.py +++ b/geograph/geograph.py @@ -412,48 +412,57 @@ def _load_from_dataframe( # Reset index to ensure consistent indices df = df.reset_index(drop=True) - # Using this list and iterating through it is slightly faster than - # iterating through df due to the dataframe overhead - geom: List[shapely.Polygon] = df["geometry"].tolist() - # this dict maps polygon row numbers in df to a list - # of neighbouring polygon row numbers - graph_dict = {} - - if tolerance > 0: - # Expand the borders of the polygons by `tolerance``` - new_polygons: List[shapely.Polygon] = ( - df["geometry"].buffer(tolerance).tolist() - ) + # pylint: disable=protected-access + if gpd._compat.USE_PYGEOS: + if tolerance > 0: + neighbour_arr = df.sindex.query_bulk( + df["geometry"].buffer(tolerance), predicate="intersects" + ).transpose() + else: + neighbour_arr = df.sindex.query_bulk( + df["geometry"], predicate="intersects" + ).transpose() + else: + # this dict maps polygon row numbers in df to a list + # of neighbouring polygon row numbers + graph_dict = {} + if tolerance > 0: + # Expand the borders of the polygons by `tolerance``` + new_polygons: gpd.GeoSeries = df["geometry"].buffer(tolerance) # Creating nodes (=vertices) and finding neighbors for index, polygon in tqdm( - enumerate(geom), + enumerate(df["geometry"]), desc="Step 1 of 2: Creating nodes and finding neighbours", - total=len(geom), + total=len(df), ): - if tolerance > 0: - # find the indexes of all polygons which intersect with this one - neighbours = df.sindex.query( - new_polygons[index], predicate="intersects" - ) - else: - neighbours = df.sindex.query(polygon, predicate="intersects") - - graph_dict[index] = neighbours - # add each polygon as a node to the graph with useful attributes - self.graph.add_node( - index, - rep_point=polygon.representative_point(), - area=polygon.area, - perimeter=polygon.length, - bounds=polygon.bounds, - ) + # pylint: disable=protected-access + if not gpd._compat.USE_PYGEOS: + if tolerance > 0: + # find the indexes of all polygons which intersect with this one + neighbours = df.sindex.query( + new_polygons[index], predicate="intersects" + ) + else: + neighbours = df.sindex.query(polygon, predicate="intersects") + + graph_dict[index] = neighbours + # TODO: factor out for use_pygeos + self.graph.add_node(index) # iterate through the dict and add edges between neighbouring polygons - for polygon_id, neighbours in tqdm( - graph_dict.items(), desc="Step 2 of 2: Adding edges" - ): - for neighbour_id in neighbours: - if polygon_id != neighbour_id: - self.graph.add_edge(polygon_id, neighbour_id) + # pylint: disable=protected-access + if gpd._compat.USE_PYGEOS: + for index, neighbour in tqdm( + neighbour_arr, desc="Step 2 of 2: Adding edges" + ): + if index != neighbour: + self.graph.add_edge(index, neighbour) + else: + for polygon_id, neighbours in tqdm( + graph_dict.items(), desc="Step 2 of 2: Adding edges" + ): + for neighbour_id in neighbours: + if polygon_id != neighbour_id: + self.graph.add_edge(polygon_id, neighbour_id) # add index name df.index.name = "node_index" @@ -677,12 +686,12 @@ def add_habitat( # barrier polygon. If this results in multiple polygons, # then the barrier polygon fully cuts the buffered polygon. # - # We find the resulting buffered polygon fragment that contains - # the original node polygon, by selecting the polygon that - # contains a representative point sampled from the original - # polygon. - # We then check if that polygon fragment intersects the - # neighbour polygon we're trying to reach. + # We find the resulting buffered polygon fragment that contains + # the original node polygon, by selecting the polygon that + # contains a representative point sampled from the original + # polygon. + # We then check if that polygon fragment intersects the + # neighbour polygon we're trying to reach. # If it does not, there is no path. # If it does, there may be a path or there may not - it # would require complex pathfinding code to discover, but @@ -792,7 +801,9 @@ def get_graph_components( """ components: List[set] = list(nx.connected_components(self.graph)) if calc_polygons: - geom = [self.df["geometry"].loc[comp].unary_union for comp in components] + geom = [ + self.df["geometry"].loc[list(comp)].unary_union for comp in components + ] gdf = gpd.GeoDataFrame( {"geometry": geom, "class_label": -1}, crs=self.df.crs ) @@ -840,7 +851,7 @@ def get_metric( result = metrics._get_metric( name=name, geo_graph=self, class_value=class_value, **metric_kwargs ) - if name in self.class_metrics.keys(): + if name in self.class_metrics: self.class_metrics[name][class_value] = result else: self.class_metrics[name] = {class_value: result} @@ -958,14 +969,7 @@ def _add_node( node_data = dict(data.items()) # Add node to graph - self.graph.add_node( - node_id, - rep_point=node_data["geometry"].representative_point(), - area=node_data["geometry"].area, - perimeter=node_data["geometry"].length, - class_label=node_data["class_label"], - bounds=node_data["geometry"].bounds, - ) + self.graph.add_node(node_id) # Add node data to dataframe missing_cols = { @@ -1111,25 +1115,25 @@ class label in `valid_classes`, as long as they are less than f"and {self.graph.number_of_edges()} edges.", ) - def _load_from_graph_path(self, load_path: pathlib.Path) -> None: + def _load_from_graph_path(self, graph_path: pathlib.Path) -> None: """ Load networkx graph and dataframe objects from a pickle file. Args: - load_path (pathlib.Path): Path to a pickle file. Can be compressed + graph_path (pathlib.Path): Path to a pickle file. Can be compressed with gzip or bz2. Returns: gpd.GeoDataFrame: The dataframe containing polygon objects. """ - if load_path.suffix == ".bz2": - with bz2.BZ2File(load_path, "rb") as bz2_file: + if graph_path.suffix == ".bz2": + with bz2.BZ2File(graph_path, "rb") as bz2_file: data = pickle.load(bz2_file) - elif load_path.suffix == ".gz": - with gzip.GzipFile(load_path, "rb") as gz_file: + elif graph_path.suffix == ".gz": + with gzip.GzipFile(graph_path, "rb") as gz_file: data = pickle.loads(gz_file.read()) else: - with open(load_path, "rb") as file: + with open(graph_path, "rb") as file: data = pickle.load(file) self.df = data["dataframe"] self.name = data["name"] diff --git a/pylintrc b/pylintrc index cbac165..c015dac 100644 --- a/pylintrc +++ b/pylintrc @@ -160,12 +160,6 @@ disable=abstract-method, # mypackage.mymodule.MyReporterClass. output-format=text -# Put messages in a separate file for each module / package specified on the -# command line instead of printing them on stdout. Reports (if any) will be -# written in a file name "pylint_global.[txt|html]". This option is deprecated -# and it will be removed in Pylint 2.0. -files-output=no - # Tells whether to display a full report or only the messages reports=no @@ -284,12 +278,6 @@ ignore-long-lines=(?x)( # else. single-line-if-stmt=yes -# List of optional constructs for which whitespace checking is disabled. `dict- -# separator` is used to allow tabulation in dicts, etc.: {1 : 1,\n222: 2}. -# `trailing-comma` allows a space between comma and closing bracket: (a, ). -# `empty-line` allows space-only lines. -no-space-check= - # Maximum number of lines in a module max-module-lines=99999 From 92e8bc03733be455805053a4d50f5e75da0b96ec Mon Sep 17 00:00:00 2001 From: Herbie Bradley Date: Fri, 15 Jul 2022 23:59:16 +0100 Subject: [PATCH 3/4] fix: remove unnecessary node attributes from comps --- geograph/geograph.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/geograph/geograph.py b/geograph/geograph.py index 104d85d..d696c83 100644 --- a/geograph/geograph.py +++ b/geograph/geograph.py @@ -1257,16 +1257,6 @@ def _load_from_dataframe( self.graph = nx.complete_graph(len(df)) else: self.graph = nx.empty_graph(len(df)) - # Add node attributes - for node in tqdm( - self.graph.nodes, desc="Constructing graph", total=len(self.graph) - ): - polygon = geom[node] - self.graph.nodes[node]["rep_point"] = polygon.representative_point() - self.graph.nodes[node]["area"] = polygon.area - self.graph.nodes[node]["perimeter"] = polygon.length - self.graph.nodes[node]["bounds"] = polygon.bounds - # Add edge attributes if necessary if self.has_distance_edges: for u, v, attrs in tqdm( From 3226af14aefa8b8878efcf250bbc26115a32e091 Mon Sep 17 00:00:00 2001 From: Herbie Bradley Date: Mon, 3 Oct 2022 14:06:58 +0100 Subject: [PATCH 4/4] Metrics update --- geograph/metrics.py | 55 ++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 50 insertions(+), 5 deletions(-) diff --git a/geograph/metrics.py b/geograph/metrics.py index eadc68c..81da538 100644 --- a/geograph/metrics.py +++ b/geograph/metrics.py @@ -2,7 +2,8 @@ from __future__ import annotations from dataclasses import dataclass -from typing import TYPE_CHECKING, Any, Optional, Union +from itertools import combinations +from typing import TYPE_CHECKING, Any, Dict, Optional, Union import networkx as nx import numpy as np @@ -10,7 +11,9 @@ if TYPE_CHECKING: import geograph - +# TODO: refactor this file to return a tuple with the values to put inside the +# metric such that the metric is only created once by the calling GeoGraph +# since dataclass creation is slow # define a metric dataclass with < <= => > == comparisons that work as you would # expect intuitively @dataclass() @@ -52,6 +55,8 @@ def __ge__(self, o: object) -> bool: ######################################################################################## # 1. Landscape level metrics ######################################################################################## + + def _num_patches(geo_graph: geograph.GeoGraph) -> Metric: return Metric( value=len(geo_graph.df), @@ -164,7 +169,7 @@ def _simpson_diversity_index(geo_graph: geograph.GeoGraph) -> Metric: ) return Metric( - value=1 - np.sum(class_prop_of_landscape ** 2), + value=1 - np.sum(class_prop_of_landscape**2), name="simpson_diversity_index", description=description, variant="conventional", @@ -391,7 +396,7 @@ def _class_effective_mesh_size( ) return Metric( - value=np.sum(class_areas ** 2) / total_area, + value=np.sum(class_areas**2) / total_area, name=f"effective_mesh_size_class={class_value}", description=description, variant="conventional", @@ -413,7 +418,7 @@ def _class_effective_mesh_size( } ######################################################################################## -# 3. Habitat componment level metrics +# 3. Habitat component level metrics ######################################################################################## @@ -482,10 +487,49 @@ def _avg_component_isolation(geo_graph: geograph.GeoGraph) -> Metric: ) +def _habitat_iic( + geo_graph: geograph.GeoGraph, + get_total_area: bool = False, + shortest_path_cutoff: Optional[int] = None, +) -> Metric: + if get_total_area: + # Most efficient way to get area of convex hull of the GeoGraph + total_area = geo_graph.components.df.dissolve().convex_hull.values[0].area + iic = 0.0 + idx_dict = dict(zip(geo_graph.df.index.values, range(len(geo_graph.df)))) + path_lengths: Dict = dict( + nx.all_pairs_shortest_path_length(geo_graph.graph, cutoff=shortest_path_cutoff) + ) + for x in combinations(geo_graph.df.index.values, 2): + if x[1] not in path_lengths[x[0]]: + continue + iic += ( + geo_graph.graph.nodes[x[0]]["area"] * geo_graph.graph.nodes[x[1]]["area"] + ) / (1 + path_lengths[x[0]][x[1]]) + if get_total_area: + # for node in geo_graph.graph.nodes: + # iic += geo_graph.graph.nodes[node]["area"] ** 2 + return Metric( + value=iic / total_area, + name="habitat_iic", + description="The habitat IIC metric", + variant="component", + unit="dimensionless", + ) + return Metric( + value=iic, + name="habitat_iic", + description="The habitat IIC metric", + variant="component", + unit="dimensionless", + ) + + COMPONENT_METRICS_DICT = { "num_components": _num_components, "avg_component_area": _avg_component_area, "avg_component_isolation": _avg_component_isolation, + "habitat_iic": _habitat_iic, } @@ -493,6 +537,7 @@ def _avg_component_isolation(geo_graph: geograph.GeoGraph) -> Metric: # 4. Define access interface for GeoGraph ######################################################################################## + STANDARD_METRICS = ["num_components", "avg_patch_area", "total_area"]