############################################
# Mapclassify
############################################


@knext.node(
    name="Mapclassifier",
    node_type=knext.NodeType.MANIPULATOR,
    icon_path=__NODE_ICON_PATH + "Mapclassifier.png",
    category=__category,
    after="",
)
@knext.input_table(
    name="Input Table",
    description="Input table with targeted columns for classification.",
)
@knext.output_table(
    name="Output Table",
    description="Output table with classified result.",
)
@knut.geo_node_description(
    short_description="Classifies numeric columns using Mapclassify algorithms.",
    description="""Apply different classification schemes provided by the
    [mapclassify](https://pysal.org/mapclassify/) library to selected numeric columns.
    Choose a classifier, configure its parameters, and either replace the original
    values with class labels or append new columns with the classification result.
    The iterative classifiers `JenksCaspall`, `JenksCaspallForced`, and `JenksCaspallSampled`
    may take noticeably longer on large datasets because they refine class boundaries
    through multiple passes. The `HeadTailBreaks` classifier automatically determines
    the number of classes based on the data distribution.""",
    references={
        "mapclassify documentation": "https://pysal.org/mapclassify/",
    },
)
class Mapclassifier:
    """Applies a mapclassify classification scheme to the selected numeric columns.

    For each selected column the non-missing values are passed to the chosen
    classifier and the resulting 0-based class labels (mapclassify's ``yb``
    attribute) either replace the original values or are appended as new
    columns. Missing values remain missing in the output.
    """

    # Classification schemes offered in the node dialog. Each member value is
    # (display label, dialog description); the member *name* is what the
    # show/hide rules below reference.
    class ClassModes(knext.EnumParameterOptions):
        BOXPLOT = (  # mapclassify.BoxPlot(y[, hinge])
            "Boxplot",
            """PURPOSE: Creates class breaks based on the statistical properties of a box plot distribution.
            HOW IT WORKS: Uses the quartiles (Q1, median, Q3) and interquartile range (IQR) to identify outliers and create meaningful breaks. Typically creates 6 classes: lower outlier, < Q1, Q1-median, median-Q3, > Q3, and upper outlier.
            BEST FOR: Identifying and highlighting outliers in your data while maintaining interpretable breaks based on statistical distribution.""",
        )
        EQUALINTERVAL = (  # mapclassify.EqualInterval(y[, k])
            "EqualInterval",
            """PURPOSE: Divides the data range into equal-sized intervals.
            HOW IT WORKS: Takes the difference between maximum and minimum values, then divides by the number of desired classes to create intervals of equal width.
            BEST FOR: Data that is relatively evenly distributed and when you want consistent interval sizes for easy interpretation.""",
        )
        FISHERJ = (  # mapclassify.FisherJenks(y[, k])
            "FisherJenks",
            """PURPOSE: Finds optimal class breaks that minimize within-class variance while maximizing between-class variance.
            HOW IT WORKS: Uses dynamic programming to find the optimal groupings that create the most homogeneous classes possible.
            BEST FOR: Most types of data as it adapts to the natural clustering in your dataset. Considered one of the most statistically robust methods.""",
        )
        FISHERJ_SAMPLED = (  # mapclassify.FisherJenksSampled(y[, k, pct, ...])
            "FisherJenksSampled",
            """PURPOSE: Same optimization as FisherJenks but uses a random sample for computational efficiency.
            HOW IT WORKS: Applies the Fisher-Jenks algorithm to a subset of the data, making it faster for large datasets.
            BEST FOR: Large datasets where standard Fisher-Jenks would be computationally expensive but you still want optimal breaks.""",
        )
        HEADT_BREAKS = (  # mapclassify.HeadTailBreaks(y)
            "HeadTailBreaks",
            """PURPOSE: Recursively divides data around the mean, designed specifically for heavy-tailed distributions.
            HOW IT WORKS: Splits data at the arithmetic mean, then recursively applies the same process to the "head" (above-mean values) until stopping criteria are met.
            BEST FOR: Highly skewed data with heavy tails, such as city populations, income distributions, or social media network data.""",
        )
        JENKS_CAS = (  # mapclassify.JenksCaspall(y[, k])
            "JenksCaspall",
            """PURPOSE: An iterative optimization method that moves class boundaries to minimize within-class variance.
            HOW IT WORKS: Starts with initial class breaks and iteratively moves boundaries to improve the goodness of variance fit.
            BEST FOR: When you want optimized breaks similar to Fisher-Jenks but prefer an iterative approach that can be stopped at any point.""",
        )
        JENKS_CASFORCED = (  # mapclassify.JenksCaspallForced(y[, k])
            "JenksCaspallForced",
            """PURPOSE: Similar to JenksCaspall but allows forcing specific values to be class boundaries.
            HOW IT WORKS: Performs the iterative optimization while ensuring certain predetermined values remain as class breaks.
            BEST FOR: When you have meaningful breakpoints (like 0, poverty line, etc.) that must be preserved while optimizing the remaining breaks.""",
        )
        JENKS_CASSAMPLED = (  # mapclassify.JenksCaspallSampled(y[, k, pct])
            "JenksCaspallSampled",
            """PURPOSE: Applies JenksCaspall optimization to a random sample of the data.
            HOW IT WORKS: Uses sampling to make the iterative process computationally feasible for large datasets.
            BEST FOR: Large datasets where full JenksCaspall would be too slow but you want iteratively optimized breaks.""",
        )
        MAXIMUMBREAKS = (  # mapclassify.MaximumBreaks(y[, k, mindiff])
            "MaximumBreaks",
            """PURPOSE: Places class breaks at the largest gaps in the sorted data values.
            HOW IT WORKS: Identifies the biggest jumps between consecutive values and uses these as natural breaking points.
            BEST FOR: Data with clear natural clusters or gaps, where you want breaks at the most obvious discontinuities.""",
        )
        NATURALBREAKS = (  # mapclassify.NaturalBreaks(y[, k, initial, ...])
            "NaturalBreaks",
            """PURPOSE: Identifies class breaks that minimize variance within classes while maximizing variance between classes.
            HOW IT WORKS: Uses an iterative (k-means based) optimization on the sorted values to find natural groupings.
            BEST FOR: Data with natural clusters when an optimized, data-driven set of breaks is desired.""",
        )
        PERCENTILES = (  # mapclassify.Percentiles(y[, pct])
            "Percentiles",
            """PURPOSE: Creates class breaks at specified percentile values.
            HOW IT WORKS: Divides data based on percentile ranks (e.g., quintiles at 20th, 40th, 60th, 80th percentiles).
            BEST FOR: When you want equal numbers of observations in each class, or when working with data where relative position matters more than absolute values.""",
        )
        PRETTYBREAKS = (  # mapclassify.PrettyBreaks(y[, k])
            "PrettyBreaks",
            """PURPOSE: Creates "nice" round numbers as class breaks for improved readability.
            HOW IT WORKS: Chooses aesthetically pleasing break points (round numbers) that are close to optimal statistical breaks.
            BEST FOR: Maps intended for general audiences where readability and round numbers are more important than statistical optimization.""",
        )
        QUANTILES = (  # mapclassify.Quantiles(y[, k])
            "Quantiles",
            """PURPOSE: Divides data so each class contains an equal number of observations.
            HOW IT WORKS: Sorts data and creates breaks at quantile boundaries to ensure equal sample sizes per class.
            BEST FOR: Comparing relative rankings across areas, or when you want to ensure balanced representation across all classes.""",
        )
        STDMEAN = (  # mapclassify.StdMean(y[, multiples, anchor])
            "StdMean",
            """PURPOSE: Creates classes based on standard deviations from the mean.
            HOW IT WORKS: Sets breaks at intervals of standard deviations above and below the mean (e.g., mean±1σ, mean±2σ).
            BEST FOR: Normally distributed data where you want to highlight areas that are statistically typical vs. unusual relative to the average.""",
        )

        @classmethod
        def get_default(cls):
            # FisherJenks is the most broadly applicable scheme, so it is the default.
            return cls.FISHERJ

    # Classifier selection; drives the visibility rules on the parameters below.
    classifier_param = knext.EnumParameter(
        label="Classifier Selection",
        description="Select the classifier that you want to apply to the targeted columns.",
        default_value=ClassModes.get_default().name,
        enum=ClassModes,
    )

    # Numeric columns to classify; each selected column is classified independently.
    class_col = knext.MultiColumnParameter(
        "Targeted columns",
        """The numeric columns whose values should be classified. Each selected column is
        classified independently with the selected classifier. Missing values are
        ignored during classification and stay missing in the output.
        """,
        column_filter=knut.is_numeric,
    )

    # Number of classes. Hidden only for classifiers that derive the class count
    # purely from the data (Boxplot, HeadTailBreaks). Percentiles and StdMean DO
    # use this value (percentile grid / standard-deviation multiples are computed
    # from it), so it stays visible for them.
    k_param = knext.IntParameter(
        label="Number of classes",
        description="""The number of classes (bins) to create (default is 5, minimum is 2).
        For the Percentiles classifier this determines the percentile grid
        (e.g. 5 classes use the 20th/40th/60th/80th percentiles); for StdMean it
        determines how many standard-deviation multiples around the mean are used.
        It is ignored by Boxplot and HeadTailBreaks, which derive the number of
        classes from the data.
        """,
        default_value=5,
        min_value=2,
    ).rule(
        knext.OneOf(
            classifier_param,
            [
                ClassModes.BOXPLOT.name,
                ClassModes.HEADT_BREAKS.name,
            ],
        ),
        knext.Effect.HIDE,
    )

    # hinge multiplier for the Boxplot classifier (IQR fence factor)
    hinge_param = knext.DoubleParameter(
        label="Hinge prompt for Boxplot",
        description="""The hinge value is used to determine the lower and upper quartiles of the data.""",
        default_value=1.5,
    ).rule(knext.OneOf(classifier_param, [ClassModes.BOXPLOT.name]), knext.Effect.SHOW)

    # *** SETTINGS FOR THE SAMPLED CLASSIFIERS ***

    # Fraction of the data sampled by FisherJenksSampled / JenksCaspallSampled.
    pct_param = knext.DoubleParameter(
        label="Sampling percentage for sampled classifiers",
        description="""The fraction of the data to be randomly sampled to determine the class
        breaks for the FisherJenksSampled and JenksCaspallSampled classifiers.""",
        default_value=0.10,
    ).rule(
        knext.OneOf(
            classifier_param,
            [
                ClassModes.FISHERJ_SAMPLED.name,
                ClassModes.JENKS_CASSAMPLED.name,
            ],
        ),
        knext.Effect.SHOW,
    )

    # Whether JenksCaspallSampled truncates the result to exactly k classes.
    jc_sampledtruncate = knext.BoolParameter(
        "Truncate for JenksCaspallSampled",
        """If checked, the node will truncate the classification result to the number of classes specified by the k parameter.""",
        default_value=False,
    ).rule(
        knext.OneOf(classifier_param, [ClassModes.JENKS_CASSAMPLED.name]),
        knext.Effect.SHOW,
    )

    # *** Formatting Settings ***

    # Whether class labels are appended as new columns or replace the originals.
    append_replace = knext.BoolParameter(
        "Append Classification Results",
        """If checked, the node appends the classification results as new columns
        named '<column>_classified'. If unchecked, the values in the selected
        columns are replaced by the class labels.""",
        default_value=False,
    )

    def configure(self, configure_context, input_schema):
        # The output schema depends on the dialog settings (append vs. replace),
        # so it is left to be determined at execution time.
        return None

    def execute(self, exec_context: knext.ExecutionContext, input_table):
        """Classify every selected column and return the resulting table."""
        import mapclassify as mc
        import numpy as np
        import pandas as pd

        df = input_table.to_pandas()

        # Nothing selected: pass the input through unchanged (best effort, warn).
        if not self.class_col:
            LOGGER.warning("No target columns selected for Mapclassifier node.")
            return knut.to_table(df, exec_context)

        selected_mode = self.classifier_param

        exec_context.set_progress(0.1, "Preparing classification.")

        def std_mean_multiples(k: int) -> list:
            # Build a symmetric list of standard-deviation multiples around the
            # mean for k classes, e.g. k=5 -> [-2, -1, 0, 1, 2]. For even k the
            # 0 multiple is dropped to keep the class count balanced.
            if k < 2:
                return [-1, 0, 1]
            half = k // 2
            multiples = list(range(-half, half + 1))
            if k % 2 == 0 and 0 in multiples:
                multiples.remove(0)
            return multiples

        def classify_series(series: pd.Series) -> pd.Series:
            # Classify one column; NaNs are excluded before classification and
            # reinstated as missing (pandas nullable Int64) afterwards.
            valid = series.dropna()
            if valid.empty:
                return pd.Series(pd.NA, index=series.index, dtype="Int64")

            values = valid.to_numpy()
            mode = selected_mode

            if mode == self.ClassModes.EQUALINTERVAL.name:
                classifier = mc.EqualInterval(values, k=self.k_param)
            elif mode == self.ClassModes.FISHERJ.name:
                classifier = mc.FisherJenks(values, k=self.k_param)
            elif mode == self.ClassModes.FISHERJ_SAMPLED.name:
                classifier = mc.FisherJenksSampled(
                    values, k=self.k_param, pct=self.pct_param
                )
            elif mode == self.ClassModes.JENKS_CAS.name:
                classifier = mc.JenksCaspall(values, k=self.k_param)
            elif mode == self.ClassModes.JENKS_CASFORCED.name:
                classifier = mc.JenksCaspallForced(values, k=self.k_param)
            elif mode == self.ClassModes.JENKS_CASSAMPLED.name:
                classifier = mc.JenksCaspallSampled(
                    values,
                    k=self.k_param,
                    pct=self.pct_param,
                    truncate=self.jc_sampledtruncate,
                )
            elif mode == self.ClassModes.PRETTYBREAKS.name:
                classifier = mc.PrettyBreaks(values, k=self.k_param)
            elif mode == self.ClassModes.QUANTILES.name:
                classifier = mc.Quantiles(values, k=self.k_param)
            elif mode == self.ClassModes.BOXPLOT.name:
                classifier = mc.BoxPlot(values, hinge=self.hinge_param)
            elif mode == self.ClassModes.HEADT_BREAKS.name:
                classifier = mc.HeadTailBreaks(values)
            elif mode == self.ClassModes.MAXIMUMBREAKS.name:
                classifier = mc.MaximumBreaks(values, k=self.k_param)
            elif mode == self.ClassModes.NATURALBREAKS.name:
                classifier = mc.NaturalBreaks(values, k=self.k_param)
            elif mode == self.ClassModes.PERCENTILES.name:
                if self.k_param < 2:
                    raise ValueError(
                        "Percentiles classifier requires at least 2 classes."
                    )
                # k classes -> k-1 interior percentile cut points, e.g. k=5 ->
                # [20, 40, 60, 80].
                percentiles = list(np.linspace(0, 100, self.k_param + 1)[1:-1])
                classifier = mc.Percentiles(values, pct=percentiles)
            elif mode == self.ClassModes.STDMEAN.name:
                multiples = std_mean_multiples(self.k_param)
                classifier = mc.StdMean(values, multiples=multiples)
            else:
                raise ValueError(f"Unsupported classifier mode: {mode}")

            # yb holds the 0-based bin index per observation; reindexing onto the
            # full index reintroduces the missing rows as NA (hence Int64).
            classified = pd.Series(classifier.yb, index=valid.index, dtype="int64")
            result = classified.reindex(series.index).astype("Int64")
            return result

        result_df = df.copy()
        existing_columns = set(result_df.columns)

        for idx, col in enumerate(self.class_col, start=1):
            knut.check_canceled(exec_context)
            exec_context.set_progress(
                0.1 + 0.8 * idx / len(self.class_col),
                f"Classifying column '{col}' ({idx}/{len(self.class_col)})",
            )
            classified_series = classify_series(df[col])

            if self.append_replace:
                # Append under a name guaranteed not to collide with any column
                # already present (including ones added in this loop).
                new_col_name = knut.get_unique_name(
                    f"{col}_classified", list(existing_columns)
                )
                existing_columns.add(new_col_name)
                result_df[new_col_name] = classified_series
            else:
                result_df[col] = classified_series

        exec_context.set_progress(1.0, "Classification complete.")

        return knut.to_table(result_df, exec_context)