324 changes: 324 additions & 0 deletions knime_extension/src/nodes/spatialtool.py
@@ -1692,3 +1692,327 @@ def execute(self, exec_context: knext.ExecutionContext, input_table1, input_tabl
# append region id column
gdf[self._COL_ID] = range(1, (gdf.shape[0] + 1))
return knut.to_table(gdf, exec_context)


############################################
# Mapclassify
############################################


@knext.node(
name="Mapclassifier",
node_type=knext.NodeType.MANIPULATOR,
icon_path=__NODE_ICON_PATH + "Mapclassifier.png",
category=__category,
after="",
)
@knext.input_table(
name="Input Table",
description="Input table with targeted columns for classification.",
)
@knext.output_table(
name="Output Table",
description="Output table with classified result.",
)
@knut.geo_node_description(
short_description="Classifies numeric columns using Mapclassify algorithms.",
description="""Apply different classification schemes provided by the
[mapclassify](https://pysal.org/mapclassify/) library to selected numeric columns.
Choose a classifier, configure its parameters, and either replace the original
values with class labels or append new columns with the classification result.
The iterative classifiers `JenksCaspall`, `JenksCaspallForced`, and `JenksCaspallSampled`
may take noticeably longer on large datasets because they refine class boundaries
through multiple passes. The `HeadTailBreaks` classifier automatically determines
the number of classes based on the data distribution.""",
references={
"mapclassify documentation": "https://pysal.org/mapclassify/",
},
)
class Mapclassifier:

# classification scheme options for classifier_param
class ClassModes(knext.EnumParameterOptions):
BOXPLOT = ( # mapclassify.BoxPlot(y[, hinge])
"Boxplot",
"""PURPOSE: Creates class breaks based on the statistical properties of a box plot distribution.
HOW IT WORKS: Uses the quartiles (Q1, median, Q3) and interquartile range (IQR) to identify outliers and create meaningful breaks. Typically creates 6 classes: lower outlier, < Q1, Q1-median, median-Q3, > Q3, and upper outlier.
BEST FOR: Identifying and highlighting outliers in your data while maintaining interpretable breaks based on statistical distribution.""",
)
EQUALINTERVAL = ( # mapclassify.EqualInterval(y[, k])
"EqualInterval",
"""PURPOSE: Divides the data range into equal-sized intervals.
HOW IT WORKS: Takes the difference between maximum and minimum values, then divides by the number of desired classes to create intervals of equal width.
BEST FOR: Data that is relatively evenly distributed and when you want consistent interval sizes for easy interpretation.""",
)
FISHERJ = ( # mapclassify.FisherJenks(y[, k])
"FisherJenks",
"""PURPOSE: Finds optimal class breaks that minimize within-class variance while maximizing between-class variance.
HOW IT WORKS: Uses dynamic programming to find the optimal groupings that create the most homogeneous classes possible.
BEST FOR: Most types of data as it adapts to the natural clustering in your dataset. Considered one of the most statistically robust methods.""",
)
FISHERJ_SAMPLED = ( # mapclassify.FisherJenksSampled(y[, k, pct, ...])
"FisherJanksSampled",
"""PURPOSE: Same optimization as FisherJenks but uses a random sample for computational efficiency.
HOW IT WORKS: Applies the Fisher-Jenks algorithm to a subset of the data, making it faster for large datasets.
BEST FOR: Large datasets where standard Fisher-Jenks would be computationally expensive but you still want optimal breaks.""",
)
HEADT_BREAKS = ( # mapclassify.HeadTailBreaks(y)
"HeadTailBreaks",
"""PURPOSE: Recursively divides data around the mean, designed specifically for heavy-tailed distributions.
HOW IT WORKS: Splits data at the arithmetic mean, then recursively applies the same process to the "head" (above-mean values) until stopping criteria are met.
BEST FOR: Highly skewed data with heavy tails, such as city populations, income distributions, or social media network data.""",
)
JENKS_CAS = ( # mapclassify.JenksCaspall(y[, k])
"JenksCaspall",
"""PURPOSE: An iterative optimization method that moves class boundaries to minimize within-class variance.
HOW IT WORKS: Starts with initial class breaks and iteratively moves boundaries to improve the goodness of variance fit.
BEST FOR: When you want optimized breaks similar to Fisher-Jenks but prefer an iterative approach that can be stopped at any point.""",
)
JENKS_CASFORCED = ( # mapclassify.JenksCaspallForced(y[, k])
"JenksCaspallForced",
"""PURPOSE: Similar to JenksCaspall but allows forcing specific values to be class boundaries.
HOW IT WORKS: Performs the iterative optimization while ensuring certain predetermined values remain as class breaks.
BEST FOR: When you have meaningful breakpoints (like 0, poverty line, etc.) that must be preserved while optimizing the remaining breaks.""",
)
JENKS_CASSAMPLED = ( # mapclassify.JenksCaspallSampled(y[, k, pct])
"JenksCaspallSampled",
"""PURPOSE: Applies JenksCaspall optimization to a random sample of the data.
HOW IT WORKS: Uses sampling to make the iterative process computationally feasible for large datasets.
BEST FOR: Large datasets where full JenksCaspall would be too slow but you want iteratively optimized breaks.""",
)
MAXIMUMBREAKS = ( # mapclassify.MaximumBreaks(y[, k, mindiff])
"MaximumBreaks",
"""PURPOSE: Places class breaks at the largest gaps in the sorted data values.
HOW IT WORKS: Identifies the biggest jumps between consecutive values and uses these as natural breaking points.
BEST FOR: Data with clear natural clusters or gaps, where you want breaks at the most obvious discontinuities.""",
)
NATURALBREAKS = ( # mapclassify.NaturalBreaks(y[, k, initial, ...])
"NaturalBreaks",
"""PURPOSE: Identifies class breaks that minimize variance within classes while maximizing variance between classes.
HOW IT WORKS: Uses one-dimensional k-means clustering to group similar values, so the breaks approximate the optimal Fisher-Jenks solution but can vary slightly between runs.
BEST FOR: General-purpose classification when a faster, approximate alternative to Fisher-Jenks is acceptable.""",
)
PERCENTILES = ( # mapclassify.Percentiles(y[, pct])
"Percentiles",
"""PURPOSE: Creates class breaks at specified percentile values.
HOW IT WORKS: Divides data based on percentile ranks (e.g., quintiles at 20th, 40th, 60th, 80th percentiles).
BEST FOR: When you want equal numbers of observations in each class, or when working with data where relative position matters more than absolute values.""",
)
PRETTYBREAKS = ( # mapclassify.PrettyBreaks(y[, k])
"PrettyBreaks",
"""PURPOSE: Creates "nice" round numbers as class breaks for improved readability.
HOW IT WORKS: Chooses aesthetically pleasing break points (round numbers) that are close to optimal statistical breaks.
BEST FOR: Maps intended for general audiences where readability and round numbers are more important than statistical optimization.""",
)
QUANTILES = ( # mapclassify.Quantiles(y[, k])
"Quantiles",
"""PURPOSE: Divides data so each class contains an equal number of observations.
HOW IT WORKS: Sorts data and creates breaks at quantile boundaries to ensure equal sample sizes per class.
BEST FOR: Comparing relative rankings across areas, or when you want to ensure balanced representation across all classes.""",
)
STDMEAN = ( # mapclassify.StdMean(y[, multiples, anchor])
"StdMean",
"""PURPOSE: Creates classes based on standard deviations from the mean.
HOW IT WORKS: Sets breaks at intervals of standard deviations above and below the mean (e.g., mean±1σ, mean±2σ).
BEST FOR: Normally distributed data where you want to highlight areas that are statistically typical vs. unusual relative to the average.""",
)

@classmethod
def get_default(cls):
return cls.FISHERJ

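# Illustrative sketch (not executed by the node): all classifiers above follow the
# same mapclassify call pattern, e.g.
#   import mapclassify as mc
#   mc.FisherJenks([1, 2, 3, 20, 21], k=2).yb  # -> array([0, 0, 0, 1, 1]), 0-based class labels
# The `yb` attribute of the fitted classifier is what execute() writes back into the table.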
# classifier_param for selecting the classification method
classifier_param = knext.EnumParameter(
label="Classifier Selection",
description="Select the classifier that you want to apply to the targeted columns.",
default_value=ClassModes.get_default().name,
enum=ClassModes,
)

# targeted column selection parameter
class_col = knext.MultiColumnParameter(
"Targeted columns",
"""The numeric columns to classify. Each selected column is classified independently
with the chosen classifier.""",
column_filter=knut.is_numeric,
)

# k_param for classifiers that take a fixed number of classes
k_param = knext.IntParameter(
label="Number of classes",
description="""The number of classes (k) to create. This option is hidden for BoxPlot,
Percentiles, StdMean, and HeadTailBreaks, which determine their class breaks without a
user-specified k.""",
default_value=5,
min_value=2,
).rule(
knext.OneOf(
classifier_param,
[
ClassModes.BOXPLOT.name,
ClassModes.PERCENTILES.name,
ClassModes.STDMEAN.name,
ClassModes.HEADT_BREAKS.name,
],
),
knext.Effect.HIDE,
)

# hinge_param for BoxPlot
hinge_param = knext.DoubleParameter(
label="Hinge value for BoxPlot",
description="""Multiplier of the interquartile range (IQR) that sets the outer class breaks
(the outlier fences) below the first and above the third quartile.""",
default_value=1.5,
).rule(knext.OneOf(classifier_param, [ClassModes.BOXPLOT.name]), knext.Effect.SHOW)
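# Worked example (illustrative values): with hinge=1.5, Q1=10, Q3=30 (IQR=20),
# the outlier fences fall at 10 - 1.5 * 20 = -20 and 30 + 1.5 * 20 = 60.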

# *** Settings for the sampled classifiers ***

# pct_param for FisherJenksSampled and JenksCaspallSampled
pct_param = knext.DoubleParameter(
label="Sampling percentage",
description="""The fraction of the data (between 0 and 1) that is randomly sampled to determine the class breaks.""",
default_value=0.10,
).rule(
knext.OneOf(
classifier_param,
[
ClassModes.FISHERJ_SAMPLED.name,
ClassModes.JENKS_CASSAMPLED.name,
],
),
knext.Effect.SHOW,
)

# determines whether the classification result is truncated to the number of classes specified by the k parameter
jc_sampledtruncate = knext.BoolParameter(
"Truncate for JenksCaspallSampled",
"""If checked, JenksCaspallSampled truncates its classification result to the number of classes specified by the k parameter.""",
default_value=False,
).rule(
knext.OneOf(classifier_param, [ClassModes.JENKS_CASSAMPLED.name]),
knext.Effect.SHOW,
)

# *** Formatting Settings ***

# determines whether classification results are appended as new columns or replace the selected columns
append_replace = knext.BoolParameter(
"Append Classification Results",
"""If checked, the node appends the classification result as new columns (with suffix `_classified`) to the input table.
If unchecked, the values of the selected columns are replaced by their class labels.""",
default_value=False,
)

def configure(self, configure_context, input_schema):
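# The output schema depends on the selected columns and on whether results are
# appended or replace the originals, so no schema is declared at configure time.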
return None

def execute(self, exec_context: knext.ExecutionContext, input_table):
import mapclassify as mc
import numpy as np
import pandas as pd

df = input_table.to_pandas()

if not self.class_col:
LOGGER.warning("No target columns selected for Mapclassifier node.")
return knut.to_table(df, exec_context)

selected_mode = self.classifier_param

exec_context.set_progress(0.1, "Preparing classification.")

def std_mean_multiples(k: int) -> list:
if k < 2:
return [-1, 0, 1]
half = k // 2
multiples = list(range(-half, half + 1))
if k % 2 == 0 and 0 in multiples:
multiples.remove(0)
return multiples
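# Examples: k=4 -> [-2, -1, 1, 2] (breaks at mean±1σ and mean±2σ);
#           k=5 -> [-2, -1, 0, 1, 2] (adds a break at the mean itself).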

def classify_series(series: pd.Series) -> pd.Series:
valid = series.dropna()
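# Classify only the non-missing values; missing entries are restored as <NA>
# when the result is reindexed below.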
if valid.empty:
return pd.Series(pd.NA, index=series.index, dtype="Int64")

values = valid.to_numpy()
mode = selected_mode

if mode == self.ClassModes.EQUALINTERVAL.name:
classifier = mc.EqualInterval(values, k=self.k_param)
elif mode == self.ClassModes.FISHERJ.name:
classifier = mc.FisherJenks(values, k=self.k_param)
elif mode == self.ClassModes.FISHERJ_SAMPLED.name:
classifier = mc.FisherJenksSampled(
values, k=self.k_param, pct=self.pct_param
)
elif mode == self.ClassModes.JENKS_CAS.name:
classifier = mc.JenksCaspall(values, k=self.k_param)
elif mode == self.ClassModes.JENKS_CASFORCED.name:
classifier = mc.JenksCaspallForced(values, k=self.k_param)
elif mode == self.ClassModes.JENKS_CASSAMPLED.name:
classifier = mc.JenksCaspallSampled(
values,
k=self.k_param,
pct=self.pct_param,
truncate=self.jc_sampledtruncate,
)
elif mode == self.ClassModes.PRETTYBREAKS.name:
classifier = mc.PrettyBreaks(values, k=self.k_param)
elif mode == self.ClassModes.QUANTILES.name:
classifier = mc.Quantiles(values, k=self.k_param)
elif mode == self.ClassModes.BOXPLOT.name:
classifier = mc.BoxPlot(values, hinge=self.hinge_param)
elif mode == self.ClassModes.HEADT_BREAKS.name:
classifier = mc.HeadTailBreaks(values)
elif mode == self.ClassModes.MAXIMUMBREAKS.name:
classifier = mc.MaximumBreaks(values, k=self.k_param)
elif mode == self.ClassModes.NATURALBREAKS.name:
classifier = mc.NaturalBreaks(values, k=self.k_param)
elif mode == self.ClassModes.PERCENTILES.name:
if self.k_param < 2:
raise ValueError(
"Percentiles classifier requires at least 2 classes."
)
percentiles = list(np.linspace(0, 100, self.k_param + 1)[1:-1])
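# E.g. self.k_param=5 -> percentiles [20, 40, 60, 80], i.e. five quantile classes.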
classifier = mc.Percentiles(values, pct=percentiles)
elif mode == self.ClassModes.STDMEAN.name:
multiples = std_mean_multiples(self.k_param)
classifier = mc.StdMean(values, multiples=multiples)
else:
raise ValueError(f"Unsupported classifier mode: {mode}")

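# `yb` holds the 0-based class label of every classified value; reindexing onto the
# original index turns rows that were NaN into <NA> via the nullable Int64 dtype.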
classified = pd.Series(classifier.yb, index=valid.index, dtype="int64")
result = classified.reindex(series.index).astype("Int64")
return result

result_df = df.copy()
existing_columns = set(result_df.columns)

for idx, col in enumerate(self.class_col, start=1):
knut.check_canceled(exec_context)
exec_context.set_progress(
0.1 + 0.8 * idx / len(self.class_col),
f"Classifying column '{col}' ({idx}/{len(self.class_col)})",
)
classified_series = classify_series(df[col])

if self.append_replace:
new_col_name = knut.get_unique_name(
f"{col}_classified", list(existing_columns)
)
existing_columns.add(new_col_name)
result_df[new_col_name] = classified_series
else:
result_df[col] = classified_series

exec_context.set_progress(1.0, "Classification complete.")

return knut.to_table(result_df, exec_context)