From bd9ed50e493aee23812bcc4b43a9cbf92d3d5a99 Mon Sep 17 00:00:00 2001
From: James Verbus <jverbus@linkedin.com>
Date: Tue, 17 Dec 2024 01:10:26 -0800
Subject: [PATCH] Made minor edits to match the proposed Estimator
 transformSchema method to the existing Model transformSchema method. (#61)

---
 .../isolationforest/IsolationForest.scala     | 26 +++++++++----------
 .../IsolationForestModel.scala                |  6 ++---
 2 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/isolation-forest/src/main/scala/com/linkedin/relevance/isolationforest/IsolationForest.scala b/isolation-forest/src/main/scala/com/linkedin/relevance/isolationforest/IsolationForest.scala
index 1864926..42864da 100644
--- a/isolation-forest/src/main/scala/com/linkedin/relevance/isolationforest/IsolationForest.scala
+++ b/isolation-forest/src/main/scala/com/linkedin/relevance/isolationforest/IsolationForest.scala
@@ -187,11 +187,13 @@ class IsolationForest(override val uid: String) extends Estimator[IsolationFores
 
   /**
     * Validates the input schema and transforms it into the output schema. It validates that the
-    * input DataFrame has a $(featuresCol) of the correct type. In this case, the output schema appends
-    * the output columns to the input schema.
+    * input DataFrame has a $(featuresCol) of the correct type and appends the output columns to
+    * the input schema. It also ensures that the input DataFrame does not already have
+    * $(predictionCol) or $(scoreCol) columns, as they will be created during the fitting process.
     *
     * @param schema The schema of the DataFrame containing the data to be fit.
-    * @return The schema of the DataFrame containing the data to be fit.
+    * @return The schema of the DataFrame containing the data to be fit, with the additional
+    *         $(predictionCol) and $(scoreCol) columns added.
     */
   override def transformSchema(schema: StructType): StructType = {
 
@@ -200,16 +202,14 @@ class IsolationForest(override val uid: String) extends Estimator[IsolationFores
     require(schema($(featuresCol)).dataType == VectorType,
       s"Input column ${$(featuresCol)} is not of required type ${VectorType}")
 
-    val outputFields: Array[StructField] = schema.fields ++ Array(
-      StructField(
-        name = s"$predictionCol",
-        dataType = DoubleType
-      ),
-      StructField(
-        name = s"$scoreCol",
-        dataType = DoubleType
-      )
-    )
+    require(!schema.fieldNames.contains($(predictionCol)),
+      s"Output column ${$(predictionCol)} already exists.")
+    require(!schema.fieldNames.contains($(scoreCol)),
+      s"Output column ${$(scoreCol)} already exists.")
+
+    val outputFields = schema.fields :+
+      StructField($(predictionCol), DoubleType, nullable = false) :+
+      StructField($(scoreCol), DoubleType, nullable = false)
 
     StructType(outputFields)
   }
diff --git a/isolation-forest/src/main/scala/com/linkedin/relevance/isolationforest/IsolationForestModel.scala b/isolation-forest/src/main/scala/com/linkedin/relevance/isolationforest/IsolationForestModel.scala
index 30c8c29..11b7518 100644
--- a/isolation-forest/src/main/scala/com/linkedin/relevance/isolationforest/IsolationForestModel.scala
+++ b/isolation-forest/src/main/scala/com/linkedin/relevance/isolationforest/IsolationForestModel.scala
@@ -89,9 +89,9 @@ class IsolationForestModel(
 
   /**
     * Validates the input schema and transforms it into the output schema. It validates that the
-    * input DataFrame has a $(featuresCol) of the correct type.  It also ensures that the input
-    * DataFrame does not already have $(predictionCol) or $(scoreCol) columns, as they will be
-    * created during the fitting process.
+    * input DataFrame has a $(featuresCol) of the correct type and appends the output columns to
+    * the input schema. It also ensures that the input DataFrame does not already have
+    * $(predictionCol) or $(scoreCol) columns, as they will be created during the fitting process.
     *
     * @param schema The schema of the DataFrame containing the data to be fit.
     * @return The schema of the DataFrame containing the data to be fit, with the additional