logicalclocks · manu-sj · Apr 28, 2025 · Apr 28, 2025 · Apr 29, 2025
diff --git a/python/hsfs/builtin_transformations.py b/python/hsfs/builtin_transformations.py
@@ -45,7 +45,9 @@ def robust_scaler(feature: pd.Series, statistics=feature_statistics) -> pd.Serie
 
 @udf(int, drop=["feature"], mode="pandas")
 def label_encoder(feature: pd.Series, statistics=feature_statistics) -> pd.Series:
-    unique_data = sorted([value for value in statistics.feature.unique_values])
+    unique_data = sorted(
+        [value for value in statistics.feature.unique_values if not pd.isna(value)]
+    )
     value_to_index = {value: index for index, value in enumerate(unique_data)}
     # Unknown categories not present in training dataset are encoded as -1.
     return pd.Series(

diff --git a/python/tests/engine/test_python_spark_transformation_functions.py b/python/tests/engine/test_python_spark_transformation_functions.py
@@ -16,6 +16,7 @@
 from __future__ import annotations
 
 import datetime
+import math
 import os
 import statistics
 
@@ -610,6 +611,139 @@ def tf_fun(col_0):
             td, spark_df, expected_spark_df, transformation_functions
         )
 
+    def test_apply_builtin_label_encoder(self, mocker):
+        # Arrange
+        mocker.patch("hopsworks_common.client.get_instance")
+        mocker.patch("hsfs.core.statistics_engine.StatisticsEngine._save_statistics")
+        spark_engine = spark.Engine()
+
+        schema = StructType(
+            [
+                StructField("col_0", IntegerType(), True),
+                StructField("col_1", StringType(), True),
+                StructField("col_2", BooleanType(), True),
+            ]
+        )
+        df = pd.DataFrame(
+            data={
+                "col_0": [1, 2, 3],
+                "col_1": ["test_1", "test_2", "test_1"],
+                "col_2": [True, False, True],
+            }
+        )
+        spark_df = spark_engine._spark_session.createDataFrame(df, schema=schema)
+
+        expected_schema = StructType(
+            [
+                StructField("col_0", IntegerType(), True),
+                StructField("col_2", BooleanType(), True),
+                StructField("label_encoder_col_1_", LongType(), True),
+            ]
+        )
+        expected_df = pd.DataFrame(
+            data={
+                "col_0": [1, 2, 3],
+                "col_2": [True, False, True],
+                "label_encoder_col_1_": [0, 1, 0],
+            }
+        )
+        expected_spark_df = spark_engine._spark_session.createDataFrame(
+            expected_df, schema=expected_schema
+        )
+
+        # Arrange
+        from hsfs.builtin_transformations import label_encoder
+
+        td = self._create_training_dataset()
+
+        transformation_functions = [
+            transformation_function.TransformationFunction(
+                hopsworks_udf=label_encoder("col_1"),
+                featurestore_id=99,
+                transformation_type=TransformationType.MODEL_DEPENDENT,
+            )
+        ]
+
+        extended_statistics = {"unique_values": ["test_1", "test_2"]}
+        transformation_functions[0].transformation_statistics = [
+            FeatureDescriptiveStatistics(
+                feature_name="col_1", extended_statistics=extended_statistics
+            )
+        ]
+
+        # Assert
+        self._validate_on_python_engine(td, df, expected_df, transformation_functions)
+        self._validate_on_spark_engine(
+            td, spark_df, expected_spark_df, transformation_functions
+        )
+
+    def test_apply_builtin_label_encoder_null_values(self, mocker):
+        # Arrange
+        mocker.patch("hopsworks_common.client.get_instance")
+        mocker.patch("hsfs.core.statistics_engine.StatisticsEngine._save_statistics")
+        spark_engine = spark.Engine()
+
+        schema = StructType(
+            [
+                StructField("col_0", IntegerType(), True),
+                StructField("col_1", StringType(), True),
+                StructField("col_2", BooleanType(), True),
+            ]
+        )
+        df = pd.DataFrame(
+            data={
+                "col_0": [1, 2, 3, 4],
+                "col_1": ["test_1", "test_2", None, "test_1"],
+                "col_2": [True, False, True, True],
+            }
+        )
+        spark_df = spark_engine._spark_session.createDataFrame(df, schema=schema)
+
+        expected_schema = StructType(
+            [
+                StructField("col_0", IntegerType(), True),
+                StructField("col_2", BooleanType(), True),
+                StructField("label_encoder_col_1_", LongType(), True),
+            ]
+        )
+        expected_df = pd.DataFrame(
+            data={
+                "col_0": [1, 2, 3, 4],
+                "col_2": [True, False, True, True],
+                "label_encoder_col_1_": [0, 1, math.nan, 0],
+            }
+        )
+        data = [(1, True, 0), (2, False, 1), (3, True, None), (4, True, 0)]
+        expected_spark_df = spark_engine._spark_session.createDataFrame(
+            data, schema=expected_schema
+        )
+
+        # Arrange
+        from hsfs.builtin_transformations import label_encoder
+
+        td = self._create_training_dataset()
+
+        transformation_functions = [
+            transformation_function.TransformationFunction(
+                hopsworks_udf=label_encoder("col_1"),
+                featurestore_id=99,
+                transformation_type=TransformationType.MODEL_DEPENDENT,
+            )
+        ]
+
+        extended_statistics = {"unique_values": ["test_1", None, "test_2"]}
+        transformation_functions[0].transformation_statistics = [
+            FeatureDescriptiveStatistics(
+                feature_name="col_1", extended_statistics=extended_statistics
+            )
+        ]
+
+        # Assert
+        self._validate_on_python_engine(td, df, expected_df, transformation_functions)
+        self._validate_on_spark_engine(
+            td, spark_df, expected_spark_df, transformation_functions
+        )
+
     def test_apply_plus_one_int_python(self, mocker):
         # Arrange
         mocker.patch("hopsworks_common.client.get_instance")