324 changes: 324 additions & 0 deletions knime_extension/src/nodes/spatialtool.py
@@ -1692,3 +1692,327 @@ def execute(self, exec_context: knext.ExecutionContext, input_table1, input_tabl
# append region id column
gdf[self._COL_ID] = range(1, (gdf.shape[0] + 1))
return knut.to_table(gdf, exec_context)


############################################
# Mapclassify
############################################


@knext.node(
name="Mapclassifier",
node_type=knext.NodeType.MANIPULATOR,
icon_path=__NODE_ICON_PATH + "Mapclassifier.png",
category=__category,
after="",
)
@knext.input_table(
name="Input Table",
description="Input table with targeted columns for classification.",
)
@knext.output_table(
name="Output Table",
description="Output table with classified result.",
)
@knut.geo_node_description(
short_description="Classifies numeric columns using Mapclassify algorithms.",
description="""Apply different classification schemes provided by the
[mapclassify](https://pysal.org/mapclassify/) library to selected numeric columns.
Choose a classifier, configure its parameters, and either replace the original
values with class labels or append new columns with the classification result.
The iterative classifiers `JenksCaspall`, `JenksCaspallForced`, and `JenksCaspallSampled`
may take noticeably longer on large datasets because they refine class boundaries
through multiple passes. The `HeadTailBreaks` classifier automatically determines
the number of classes based on the data distribution.""",
references={
"mapclassify documentation": "https://pysal.org/mapclassify/",
},
)
class Mapclassifier:

# classification scheme options for classifier_param
class ClassModes(knext.EnumParameterOptions):
BOXPLOT = ( # mapclassify.BoxPlot(y[, hinge])
"Boxplot",
"""PURPOSE: Creates class breaks based on the statistical properties of a box plot distribution.
HOW IT WORKS: Uses the quartiles (Q1, median, Q3) and interquartile range (IQR) to identify outliers and create meaningful breaks. Typically creates 6 classes: lower outlier, < Q1, Q1-median, median-Q3, > Q3, and upper outlier.
BEST FOR: Identifying and highlighting outliers in your data while maintaining interpretable breaks based on statistical distribution.""",
)
EQUALINTERVAL = ( # mapclassify.EqualInterval(y[, k])
"EqualInterval",
"""PURPOSE: Divides the data range into equal-sized intervals.
HOW IT WORKS: Takes the difference between maximum and minimum values, then divides by the number of desired classes to create intervals of equal width.
BEST FOR: Data that is relatively evenly distributed and when you want consistent interval sizes for easy interpretation.""",
)
FISHERJ = ( # mapclassify.FisherJenks(y[, k])
"FisherJenks",
"""PURPOSE: Finds optimal class breaks that minimize within-class variance while maximizing between-class variance.
HOW IT WORKS: Uses dynamic programming to find the optimal groupings that create the most homogeneous classes possible.
BEST FOR: Most types of data as it adapts to the natural clustering in your dataset. Considered one of the most statistically robust methods.""",
)
FISHERJ_SAMPLED = ( # mapclassify.FisherJenksSampled(y[, k, pct, ...])
"FisherJanksSampled",
"""PURPOSE: Same optimization as FisherJenks but uses a random sample for computational efficiency.
HOW IT WORKS: Applies the Fisher-Jenks algorithm to a subset of the data, making it faster for large datasets.
BEST FOR: Large datasets where standard Fisher-Jenks would be computationally expensive but you still want optimal breaks.""",
)
HEADT_BREAKS = ( # mapclassify.HeadTailBreaks(y)
"HeadTailBreaks",
"""PURPOSE: Recursively divides data around the mean, designed specifically for heavy-tailed distributions.
HOW IT WORKS: Splits data at the arithmetic mean, then recursively applies the same process to the "head" (above-mean values) until stopping criteria are met.
BEST FOR: Highly skewed data with heavy tails, such as city populations, income distributions, or social media network data.""",
)
JENKS_CAS = ( # mapclassify.JenksCaspall(y[, k])
"JenksCaspall",
"""PURPOSE: An iterative optimization method that moves class boundaries to minimize within-class variance.
HOW IT WORKS: Starts with initial class breaks and iteratively moves boundaries to improve the goodness of variance fit.
BEST FOR: When you want optimized breaks similar to Fisher-Jenks but prefer an iterative approach that can be stopped at any point.""",
)
JENKS_CASFORCED = ( # mapclassify.JenksCaspallForced(y[, k])
"JenksCaspallForced",
"""PURPOSE: Similar to JenksCaspall but allows forcing specific values to be class boundaries.
HOW IT WORKS: Performs the iterative optimization while ensuring certain predetermined values remain as class breaks.
BEST FOR: When you have meaningful breakpoints (like 0, poverty line, etc.) that must be preserved while optimizing the remaining breaks.""",
)
JENKS_CASSAMPLED = ( # mapclassify.JenksCaspallSampled(y[, k, pct])
"JenksCaspallSampled",
"""PURPOSE: Applies JenksCaspall optimization to a random sample of the data.
HOW IT WORKS: Uses sampling to make the iterative process computationally feasible for large datasets.
BEST FOR: Large datasets where full JenksCaspall would be too slow but you want iteratively optimized breaks.""",
)
MAXIMUMBREAKS = ( # mapclassify.MaximumBreaks(y[, k, mindiff])
"MaximumBreaks",
"""PURPOSE: Places class breaks at the largest gaps in the sorted data values.
HOW IT WORKS: Identifies the biggest jumps between consecutive values and uses these as natural breaking points.
BEST FOR: Data with clear natural clusters or gaps, where you want breaks at the most obvious discontinuities.""",
)
NATURALBREAKS = ( # mapclassify.NaturalBreaks(y[, k, initial, ...])
"NaturalBreaks",
"""PURPOSE: Identifies class breaks that minimize variance within classes while maximizing variance between classes.
HOW IT WORKS: Uses one-dimensional k-means clustering to group similar values, so the breaks approximate the optimal Fisher-Jenks solution but can vary slightly between runs.
BEST FOR: General-purpose classification when a faster, approximate alternative to Fisher-Jenks is acceptable.""",
)
PERCENTILES = ( # mapclassify.Percentiles(y[, pct])
"Percentiles",
"""PURPOSE: Creates class breaks at specified percentile values.
HOW IT WORKS: Divides data based on percentile ranks (e.g., quintiles at 20th, 40th, 60th, 80th percentiles).
BEST FOR: When you want equal numbers of observations in each class, or when working with data where relative position matters more than absolute values.""",
)
PRETTYBREAKS = ( # mapclassify.PrettyBreaks(y[, k])
"PrettyBreaks",
"""PURPOSE: Creates "nice" round numbers as class breaks for improved readability.
HOW IT WORKS: Chooses aesthetically pleasing break points (round numbers) that are close to optimal statistical breaks.
BEST FOR: Maps intended for general audiences where readability and round numbers are more important than statistical optimization.""",
)
QUANTILES = ( # mapclassify.Quantiles(y[, k])
"Quantiles",
"""PURPOSE: Divides data so each class contains an equal number of observations.
HOW IT WORKS: Sorts data and creates breaks at quantile boundaries to ensure equal sample sizes per class.
BEST FOR: Comparing relative rankings across areas, or when you want to ensure balanced representation across all classes.""",
)
STDMEAN = ( # mapclassify.StdMean(y[, multiples, anchor])
"StdMean",
"""PURPOSE: Creates classes based on standard deviations from the mean.
HOW IT WORKS: Sets breaks at intervals of standard deviations above and below the mean (e.g., mean±1σ, mean±2σ).
BEST FOR: Normally distributed data where you want to highlight areas that are statistically typical vs. unusual relative to the average.""",
)

@classmethod
def get_default(cls):
return cls.FISHERJ

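# Illustrative sketch (not executed by the node): all classifiers above follow the
# same mapclassify call pattern, e.g.
#   import mapclassify as mc
#   mc.FisherJenks([1, 2, 3, 20, 21], k=2).yb  # -> array([0, 0, 0, 1, 1]), 0-based class labels
# The `yb` attribute of the fitted classifier is what execute() writes back into the table.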
# classifier_param for selecting the classification method
classifier_param = knext.EnumParameter(
label="Classifier Selection",
description="Select the classifier that you want to apply to the targeted columns.",
default_value=ClassModes.get_default().name,
enum=ClassModes,
)

# targeted column selection parameter
class_col = knext.MultiColumnParameter(
"Targeted columns",
"""The numeric columns to classify. Each selected column is classified independently
with the chosen classifier.""",
column_filter=knut.is_numeric,
)

# k_param for classifiers that take a fixed number of classes
k_param = knext.IntParameter(
label="Number of classes",
description="""The number of classes (k) to create. This option is hidden for BoxPlot,
Percentiles, StdMean, and HeadTailBreaks, which determine their class breaks without a
user-specified k.""",
default_value=5,
min_value=2,
).rule(
knext.OneOf(
classifier_param,
[
ClassModes.BOXPLOT.name,
ClassModes.PERCENTILES.name,
ClassModes.STDMEAN.name,
ClassModes.HEADT_BREAKS.name,
],
),
knext.Effect.HIDE,
)

# hinge_param for BoxPlot
hinge_param = knext.DoubleParameter(
label="Hinge value for BoxPlot",
description="""Multiplier of the interquartile range (IQR) that sets the outer class breaks
(the outlier fences) below the first and above the third quartile.""",
default_value=1.5,
).rule(knext.OneOf(classifier_param, [ClassModes.BOXPLOT.name]), knext.Effect.SHOW)
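# Worked example (illustrative values): with hinge=1.5, Q1=10, Q3=30 (IQR=20),
# the outlier fences fall at 10 - 1.5 * 20 = -20 and 30 + 1.5 * 20 = 60.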

# *** Settings for the sampled classifiers ***

# pct_param for FisherJenksSampled and JenksCaspallSampled
pct_param = knext.DoubleParameter(
label="Sampling percentage",
description="""The fraction of the data (between 0 and 1) that is randomly sampled to determine the class breaks.""",
default_value=0.10,
).rule(
knext.OneOf(
classifier_param,
[
ClassModes.FISHERJ_SAMPLED.name,
ClassModes.JENKS_CASSAMPLED.name,
],
),
knext.Effect.SHOW,
)

# determines whether the classification result is truncated to the number of classes specified by the k parameter
jc_sampledtruncate = knext.BoolParameter(
"Truncate for JenksCaspallSampled",
"""If checked, JenksCaspallSampled truncates its classification result to the number of classes specified by the k parameter.""",
default_value=False,
).rule(
knext.OneOf(classifier_param, [ClassModes.JENKS_CASSAMPLED.name]),
knext.Effect.SHOW,
)

# *** Formatting Settings ***

# determines whether classification results are appended as new columns or replace the selected columns
append_replace = knext.BoolParameter(
"Append Classification Results",
"""If checked, the node appends the classification result as new columns (with suffix `_classified`) to the input table.
If unchecked, the values of the selected columns are replaced by their class labels.""",
default_value=False,
)

def configure(self, configure_context, input_schema):
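# The output schema depends on the selected columns and on whether results are
# appended or replace the originals, so no schema is declared at configure time.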
return None

def execute(self, exec_context: knext.ExecutionContext, input_table):
import mapclassify as mc
import numpy as np
import pandas as pd

df = input_table.to_pandas()

if not self.class_col:
LOGGER.warning("No target columns selected for Mapclassifier node.")
return knut.to_table(df, exec_context)

selected_mode = self.classifier_param

exec_context.set_progress(0.1, "Preparing classification.")

def std_mean_multiples(k: int) -> list:
if k < 2:
return [-1, 0, 1]
half = k // 2
multiples = list(range(-half, half + 1))
if k % 2 == 0 and 0 in multiples:
multiples.remove(0)
return multiples
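# Examples: k=4 -> [-2, -1, 1, 2] (breaks at mean±1σ and mean±2σ);
#           k=5 -> [-2, -1, 0, 1, 2] (adds a break at the mean itself).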

def classify_series(series: pd.Series) -> pd.Series:
valid = series.dropna()
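# Classify only the non-missing values; missing entries are restored as <NA>
# when the result is reindexed below.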
if valid.empty:
return pd.Series(pd.NA, index=series.index, dtype="Int64")

values = valid.to_numpy()
mode = selected_mode

if mode == self.ClassModes.EQUALINTERVAL.name:
classifier = mc.EqualInterval(values, k=self.k_param)
elif mode == self.ClassModes.FISHERJ.name:
classifier = mc.FisherJenks(values, k=self.k_param)
elif mode == self.ClassModes.FISHERJ_SAMPLED.name:
classifier = mc.FisherJenksSampled(
values, k=self.k_param, pct=self.pct_param
)
elif mode == self.ClassModes.JENKS_CAS.name:
classifier = mc.JenksCaspall(values, k=self.k_param)
elif mode == self.ClassModes.JENKS_CASFORCED.name:
classifier = mc.JenksCaspallForced(values, k=self.k_param)
elif mode == self.ClassModes.JENKS_CASSAMPLED.name:
classifier = mc.JenksCaspallSampled(
values,
k=self.k_param,
pct=self.pct_param,
truncate=self.jc_sampledtruncate,
)
elif mode == self.ClassModes.PRETTYBREAKS.name:
classifier = mc.PrettyBreaks(values, k=self.k_param)
elif mode == self.ClassModes.QUANTILES.name:
classifier = mc.Quantiles(values, k=self.k_param)
elif mode == self.ClassModes.BOXPLOT.name:
classifier = mc.BoxPlot(values, hinge=self.hinge_param)
elif mode == self.ClassModes.HEADT_BREAKS.name:
classifier = mc.HeadTailBreaks(values)
elif mode == self.ClassModes.MAXIMUMBREAKS.name:
classifier = mc.MaximumBreaks(values, k=self.k_param)
elif mode == self.ClassModes.NATURALBREAKS.name:
classifier = mc.NaturalBreaks(values, k=self.k_param)
elif mode == self.ClassModes.PERCENTILES.name:
if self.k_param < 2:
raise ValueError(
"Percentiles classifier requires at least 2 classes."
)
percentiles = list(np.linspace(0, 100, self.k_param + 1)[1:-1])
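# E.g. self.k_param=5 -> percentiles [20, 40, 60, 80], i.e. five quantile classes.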
classifier = mc.Percentiles(values, pct=percentiles)
elif mode == self.ClassModes.STDMEAN.name:
multiples = std_mean_multiples(self.k_param)
classifier = mc.StdMean(values, multiples=multiples)
else:
raise ValueError(f"Unsupported classifier mode: {mode}")

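# `yb` holds the 0-based class label of every classified value; reindexing onto the
# original index turns rows that were NaN into <NA> via the nullable Int64 dtype.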
classified = pd.Series(classifier.yb, index=valid.index, dtype="int64")
result = classified.reindex(series.index).astype("Int64")
return result

result_df = df.copy()
existing_columns = set(result_df.columns)

for idx, col in enumerate(self.class_col, start=1):
knut.check_canceled(exec_context)
exec_context.set_progress(
0.1 + 0.8 * idx / len(self.class_col),
f"Classifying column '{col}' ({idx}/{len(self.class_col)})",
)
classified_series = classify_series(df[col])

if self.append_replace:
new_col_name = knut.get_unique_name(
f"{col}_classified", list(existing_columns)
)
existing_columns.add(new_col_name)
result_df[new_col_name] = classified_series
else:
result_df[col] = classified_series

exec_context.set_progress(1.0, "Classification complete.")

return knut.to_table(result_df, exec_context)