Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
230a4bb
First working version of extended isolation forest training and scori…
jverbus Mar 9, 2025
39810d7
Updated rough draft code for EIF.
jverbus Mar 10, 2025
e6fbc8d
Refactor Extended Isolation Forest for clearer logic more in line wit…
jverbus Mar 13, 2025
b88ea95
Got standard isolation forest R/W working after major refactor. Still…
jverbus Mar 14, 2025
bceea39
Fixed package structure.
jverbus Mar 14, 2025
bbc25a0
WORK IN PROGRESS - Have prototype extended isolation forest read / wr…
jverbus Mar 24, 2025
9c5b224
Did linting for eif code.
jverbus Mar 25, 2025
a56c1ac
fix(EIF): align hyperplane split + path test with paper; correct inte…
jverbus Aug 30, 2025
817501c
fix(EIF): retry degenerate hyperplane splits instead of premature l…
jverbus Mar 10, 2026
306f84e
chore(EIF): remove dead code, unused imports, and fix test descriptio…
jverbus Mar 10, 2026
a9b7697
fix(EIF): validate extensionLevel at fit time instead of silent clamping
jverbus Mar 10, 2026
aaf5e7e
fix: fail fast on empty partition in shared tree training
jverbus Mar 10, 2026
b5bf855
fix: use actual tree count instead of numEstimators param in scoring
jverbus Mar 10, 2026
7ef4bf4
test(EIF): replace toString tree comparison with structural equality …
jverbus Mar 10, 2026
ad62bc4
docs: add Extended Isolation Forest documentation to README
jverbus Mar 10, 2026
645361e
Added citation info to readme.
jverbus Mar 10, 2026
b3af421
fix(EIF): use strict < for hyperplane split and stop fit() from mutat…
jverbus Mar 10, 2026
680648d
fix(EIF): match reference implementation split semantics instead of…
jverbus Mar 10, 2026
8cf44ac
docs: update benchmarks with StandardIF, ExtendedIF_0, and ExtendedIF…
jverbus Mar 10, 2026
301f643
docs: update benchmark table with reference Python results for all 13…
jverbus Mar 10, 2026
529e856
fix(EIF): persist resolved extensionLevel on trained model
jverbus Mar 10, 2026
60677a0
test(EIF): add pre-merge tests for zero-size leaves and ext=0 axis-al…
jverbus Mar 10, 2026
b5b145e
docs: soften benchmark claims and clarify EIF_0 vs StandardIF wording
jverbus Mar 10, 2026
afc614e
test(EIF): enable saved model tree structure regression test
jverbus Mar 10, 2026
8a5a216
What was done: Extracted the duplicated validateAndResolveParams meth…
jverbus Mar 11, 2026
58f0d47
refactor: extract duplicated transformSchema into Utils.validateAndTr…
jverbus Mar 11, 2026
a2f2319
chore(EIF): remove unused import, fix docstring, and align threshold …
jverbus Mar 11, 2026
52b4695
test(EIF): add tests for L2-normalized normals, invalid extensionLeve…
jverbus Mar 11, 2026
9e93557
chore(EIF): fix redundant import and stale docstrings in ExtendedIsol…
jverbus Mar 11, 2026
1d211ec
fix: address EIF review findings and harden model edge cases
jverbus Mar 11, 2026
71c5630
docs: refresh README for EIF and current build defaults
jverbus Mar 11, 2026
cd7811d
feat: add extended isolation forest with sparse hyperplane persistence
jverbus Mar 11, 2026
9ea62de
Updated readme.
jverbus Mar 11, 2026
8ae91c8
docs: update README benchmark table and references
jverbus Mar 12, 2026
522a6dc
Added scroll to results table.
jverbus Mar 12, 2026
efa2a11
updated readme
jverbus Mar 12, 2026
b6c3fee
Updated readme.
jverbus Mar 12, 2026
17a07f5
fix(EIF): use float-precision hyperplane weights for Spark 4.x Avro c…
jverbus Mar 12, 2026
9df6ed3
docs: address Copilot review feedback on PR #79
jverbus Mar 12, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
234 changes: 191 additions & 43 deletions README.md

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -5,16 +5,16 @@ import com.linkedin.relevance.isolationforest.core.SharedTrainLogic.{
computeAndSetModelThreshold,
createSampledPartitionedDataset,
trainIsolationTrees,
validateAndResolveParams,
}
import com.linkedin.relevance.isolationforest.core.Utils.{DataPoint, ResolvedParams}
import com.linkedin.relevance.isolationforest.core.Utils.{DataPoint, validateAndTransformSchema}
import org.apache.spark.internal.Logging
import org.apache.spark.ml.Estimator
import org.apache.spark.ml.linalg.SQLDataTypes.VectorType
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable}
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.types.{DoubleType, StructField, StructType}
import org.apache.spark.sql.types.StructType

/**
* Used to train an isolation forest model. It extends the spark.ml Estimator class.
Expand All @@ -28,7 +28,7 @@ class IsolationForest(override val uid: String)
with DefaultParamsWritable
with Logging {

def this() = this(Identifiable.randomUID("standard-isolation-forest"))
def this() = this(Identifiable.randomUID("isolation-forest"))

override def copy(extra: ParamMap): IsolationForest =

Expand All @@ -55,7 +55,7 @@ class IsolationForest(override val uid: String)

// Validate $(maxFeatures) and $(maxSamples) against input dataset and determine the values
// actually used to train the model: numFeatures and numSamples
val resolvedParams = validateAndResolveParams(dataset)
val resolvedParams = validateAndResolveParams(dataset, $(maxFeatures), $(maxSamples))

// Bag and flatten the data, then repartition it so that each partition corresponds to one
// isolation tree.
Expand Down Expand Up @@ -86,6 +86,7 @@ class IsolationForest(override val uid: String)
isolationTrees,
resolvedParams.numSamples,
resolvedParams.numFeatures,
resolvedParams.totalNumFeatures,
)
.setParent(this),
)
Expand All @@ -103,103 +104,8 @@ class IsolationForest(override val uid: String)
isolationForestModel
}

/**
 * Private helper to validate parameters and figure out how many features and samples we'll use.
 *
 * A maxFeatures / maxSamples value greater than 1.0 is treated as an absolute count; a value in
 * (0, 1] is treated as a fraction of what the input dataset provides.
 *
 * @param dataset
 *   The input dataset.
 * @return
 *   A ResolvedParams instance containing the resolved values.
 */
private def validateAndResolveParams(dataset: Dataset[DataPoint]): ResolvedParams = {

  val maxFeaturesParam = $(maxFeatures)
  val maxSamplesParam = $(maxSamples)

  // Resolve the per-tree feature count against the width of the input feature vectors.
  val totalNumFeatures = dataset.head().features.length
  val numFeatures =
    if (maxFeaturesParam > 1.0) math.floor(maxFeaturesParam).toInt
    else math.floor(maxFeaturesParam * totalNumFeatures).toInt
  logInfo(
    s"User specified number of features used to train each tree over total number of" +
      s" features: ${numFeatures} / ${totalNumFeatures}",
  )
  require(
    numFeatures > 0,
    s"parameter maxFeatures given invalid value ${maxFeaturesParam}" +
      s" specifying the use of ${numFeatures} features, but >0 features are required.",
  )
  require(
    numFeatures <= totalNumFeatures,
    s"parameter maxFeatures given invalid value" +
      s" ${maxFeaturesParam} specifying the use of ${numFeatures} features, but only" +
      s" ${totalNumFeatures} features are available.",
  )

  // Resolve the per-tree sample count against the size of the input dataset.
  val totalNumSamples = dataset.count()
  val numSamples =
    if (maxSamplesParam > 1.0) math.floor(maxSamplesParam).toInt
    else math.floor(maxSamplesParam * totalNumSamples).toInt
  logInfo(
    s"User specified number of samples used to train each tree over total number of" +
      s" samples: ${numSamples} / ${totalNumSamples}",
  )
  require(
    numSamples > 0,
    s"parameter maxSamples given invalid value ${maxSamplesParam}" +
      s" specifying the use of ${numSamples} samples, but >0 samples are required.",
  )
  require(
    numSamples <= totalNumSamples,
    s"parameter maxSamples given invalid value" +
      s" ${maxSamplesParam} specifying the use of ${numSamples} samples, but only" +
      s" ${totalNumSamples} samples are in the input dataset.",
  )

  ResolvedParams(numFeatures, totalNumFeatures, numSamples, totalNumSamples)
}

/**
 * Validates the input schema and returns it extended with the output columns. The input
 * DataFrame must carry a $(featuresCol) of vector type and must not already contain the
 * $(predictionCol) or $(scoreCol) columns, because those are created during fitting.
 *
 * @param schema
 *   The schema of the DataFrame containing the data to be fit.
 * @return
 *   The input schema with the additional $(predictionCol) and $(scoreCol) columns appended.
 */
override def transformSchema(schema: StructType): StructType = {

  val fieldNames = schema.fieldNames

  // The features column must exist and hold ML vectors.
  require(
    fieldNames.contains($(featuresCol)),
    s"Input column ${$(featuresCol)} does not exist.",
  )
  require(
    schema($(featuresCol)).dataType == VectorType,
    s"Input column ${$(featuresCol)} is not of required type ${VectorType}",
  )

  // The output columns are created by this estimator, so pre-existing ones are a caller error.
  require(
    !fieldNames.contains($(predictionCol)),
    s"Output column ${$(predictionCol)} already exists.",
  )
  require(
    !fieldNames.contains($(scoreCol)),
    s"Output column ${$(scoreCol)} already exists.",
  )

  val appendedFields = Seq(
    StructField($(predictionCol), DoubleType, nullable = false),
    StructField($(scoreCol), DoubleType, nullable = false),
  )
  StructType(schema.fields ++ appendedFields)
}
// Schema validation/augmentation is shared with the model; delegate to the core.Utils helper.
override def transformSchema(schema: StructType): StructType =
  validateAndTransformSchema(schema, $(featuresCol), $(predictionCol), $(scoreCol))
}

/**
Expand Down
Original file line number Diff line number Diff line change
@@ -1,17 +1,18 @@
package com.linkedin.relevance.isolationforest

import com.linkedin.relevance.isolationforest.core.Utils.{DataPoint, avgPathLength}
import com.linkedin.relevance.isolationforest.core.{
IsolationForestModelReadWrite,
IsolationForestParamsBase,
import com.linkedin.relevance.isolationforest.core.Utils.{
DataPoint,
avgPathLength,
validateFeatureVectorSize,
validateAndTransformSchema,
}
import com.linkedin.relevance.isolationforest.core.IsolationForestParamsBase
import org.apache.spark.ml.Model
import org.apache.spark.ml.linalg.SQLDataTypes.VectorType
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.util.{MLReadable, MLReader, MLWritable, MLWriter}
import org.apache.spark.sql.functions.{col, lit, udf}
import org.apache.spark.sql.types.{DoubleType, StructField, StructType}
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, Dataset}

/**
Expand All @@ -28,16 +29,35 @@ import org.apache.spark.sql.{DataFrame, Dataset}
* cases, a given isolation tree may not have any nodes using some of these features, e.g., a
* shallow tree where the number of features in the training data exceeds the number of nodes in
* the tree.
* @param totalNumFeatures
* The total number of input features seen during training, or
* [[IsolationForestModel.UnknownTotalNumFeatures]] for legacy loaded models that predate this
* metadata.
*/
class IsolationForestModel(
class IsolationForestModel private[isolationforest] (
override val uid: String,
val isolationTrees: Array[IsolationTree],
private val numSamples: Int,
private val numFeatures: Int,
private val totalNumFeatures: Int,
) extends Model[IsolationForestModel]
with IsolationForestParamsBase
with MLWritable {

/**
 * Backward-compatible auxiliary constructor for callers that predate the totalNumFeatures
 * field; it records [[IsolationForestModel.UnknownTotalNumFeatures]] as a sentinel value.
 */
def this(
  uid: String,
  isolationTrees: Array[IsolationTree],
  numSamples: Int,
  numFeatures: Int,
) =
  this(
    uid,
    isolationTrees,
    numSamples,
    numFeatures,
    IsolationForestModel.UnknownTotalNumFeatures,
  )

require(numSamples > 0, s"parameter numSamples must be >0, but given invalid value ${numSamples}")
final def getNumSamples: Int = numSamples

Expand All @@ -47,6 +67,19 @@ class IsolationForestModel(
)
final def getNumFeatures: Int = numFeatures

require(
totalNumFeatures == IsolationForestModel.UnknownTotalNumFeatures || totalNumFeatures > 0,
s"parameter totalNumFeatures must be >0 or UnknownTotalNumFeatures, but given invalid value ${totalNumFeatures}",
)
require(
totalNumFeatures == IsolationForestModel.UnknownTotalNumFeatures || numFeatures <= totalNumFeatures,
s"parameter numFeatures must be <= totalNumFeatures, but given invalid values" +
s" numFeatures=${numFeatures}, totalNumFeatures=${totalNumFeatures}",
)
final def getTotalNumFeatures: Int = totalNumFeatures
final def hasKnownTotalNumFeatures: Boolean =
totalNumFeatures != IsolationForestModel.UnknownTotalNumFeatures

// The outlierScoreThreshold needs to be a mutable variable because it is not known when an
// IsolationForestModel instance is created.
private var outlierScoreThreshold: Double = -1
Expand All @@ -64,8 +97,9 @@ class IsolationForestModel(

override def copy(extra: ParamMap): IsolationForestModel = {

val isolationForestCopy = new IsolationForestModel(uid, isolationTrees, numSamples, numFeatures)
.setParent(this.parent)
val isolationForestCopy =
new IsolationForestModel(uid, isolationTrees, numSamples, numFeatures, totalNumFeatures)
.setParent(this.parent)
isolationForestCopy.setOutlierScoreThreshold(outlierScoreThreshold)
copyValues(isolationForestCopy, extra)
}
Expand All @@ -81,15 +115,26 @@ class IsolationForestModel(
*/
override def transform(data: Dataset[_]): DataFrame = {

require(
numSamples >= 2,
s"Cannot score with numSamples=$numSamples; expected numSamples >= 2.",
)
require(
isolationTrees.nonEmpty,
"Cannot score with an empty IsolationForestModel.",
)
transformSchema(data.schema, logging = true)

val avgPath = avgPathLength(numSamples)
val broadcastIsolationTrees = data.sparkSession.sparkContext.broadcast(isolationTrees)

val calculatePathLength = (features: Vector) => {
if (hasKnownTotalNumFeatures) {
validateFeatureVectorSize(features, totalNumFeatures)
}
val pathLength = broadcastIsolationTrees.value
.map(y => y.calculatePathLength(DataPoint(features.toArray.map(x => x.toFloat))))
.sum / $(numEstimators)
.sum / broadcastIsolationTrees.value.length
Math.pow(2, -pathLength / avgPath)
}
val transformUDF = udf(calculatePathLength)
Expand All @@ -105,44 +150,8 @@ class IsolationForestModel(
dataWithScoresAndPrediction
}

/**
 * Checks that an incoming schema can be scored and produces the schema of the scored output.
 * The schema must contain a vector-typed $(featuresCol) and must not yet define the
 * $(predictionCol) or $(scoreCol) columns, which this model appends.
 *
 * @param schema
 *   The schema of the DataFrame containing the data to be fit.
 * @return
 *   The input schema with the additional $(predictionCol) and $(scoreCol) columns appended.
 */
override def transformSchema(schema: StructType): StructType = {

  // Feature column: must be present and of ML vector type.
  require(
    schema.fieldNames.contains($(featuresCol)),
    s"Input column ${$(featuresCol)} does not exist.",
  )
  require(
    schema($(featuresCol)).dataType == VectorType,
    s"Input column ${$(featuresCol)} is not of required type ${VectorType}",
  )

  // Output columns: must not collide with anything already in the schema.
  require(
    !schema.fieldNames.contains($(predictionCol)),
    s"Output column ${$(predictionCol)} already exists.",
  )
  require(
    !schema.fieldNames.contains($(scoreCol)),
    s"Output column ${$(scoreCol)} already exists.",
  )

  // StructType.add returns a new StructType with the field appended.
  schema
    .add(StructField($(predictionCol), DoubleType, nullable = false))
    .add(StructField($(scoreCol), DoubleType, nullable = false))
}
// Delegates to the shared core.Utils helper so model and estimator validate schemas identically.
override def transformSchema(schema: StructType): StructType =
  validateAndTransformSchema(schema, $(featuresCol), $(predictionCol), $(scoreCol))

/**
* Returns an IsolationForestModelWriter instance that can be used to write the isolation forest
Expand All @@ -159,6 +168,8 @@ class IsolationForestModel(
*/
case object IsolationForestModel extends MLReadable[IsolationForestModel] {

private[isolationforest] val UnknownTotalNumFeatures: Int = -1

/**
* Returns an IsolationForestModelReader instance that can be used to read a saved isolation
* forest from disk.
Expand Down
Loading
Loading