[jvm-packages] [breaking] rework xgboost4j-spark and xgboost4j-spark-gpu #10639

Merged
merged 17 commits into from
Sep 11, 2024
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/jvm/index.rst
@@ -38,6 +38,7 @@ Contents
XGBoost4J-Spark-GPU Tutorial <xgboost4j_spark_gpu_tutorial>
Code Examples <https://github.com/dmlc/xgboost/tree/master/jvm-packages/xgboost4j-example>
API docs <api>
How to migrate to XGBoost Spark 3.x <xgboost_spark_migration>

.. note::

162 changes: 162 additions & 0 deletions doc/jvm/xgboost_spark_migration.rst
@@ -0,0 +1,162 @@
####################################################
Migration Guide: How to migrate to XGBoost Spark 3.x
####################################################

XGBoost Spark underwent significant changes in version 3.0,
which may break existing user code.

This guide walks you through updating your code so that it is
compatible with XGBoost Spark 3.0 and later versions.

**********************
XGBoost Spark Packages
**********************

XGBoost Spark 3.0 introduced a single uber package named xgboost-spark_2.12-3.0.0.jar, which bundles
both xgboost4j and xgboost4j-spark. This means you can now simply use ``xgboost-spark`` for your application.

* For CPU

.. code-block:: xml

<dependency>
<groupId>ml.dmlc</groupId>
<artifactId>xgboost-spark_${scala.binary.version}</artifactId>
<version>3.0.0</version>
</dependency>

* For GPU

.. code-block:: xml

<dependency>
<groupId>ml.dmlc</groupId>
<artifactId>xgboost-spark-gpu_${scala.binary.version}</artifactId>
trivialfis marked this conversation as resolved.
Show resolved Hide resolved
<version>3.0.0</version>
</dependency>


When submitting the XGBoost application to the Spark cluster, you only need to specify the single uber package (``xgboost-spark`` for CPU, ``xgboost-spark-gpu`` for GPU).

* For CPU

.. code-block:: bash

spark-submit \
--jars xgboost-spark_2.12-3.0.0.jar \
... \


* For GPU

.. code-block:: bash

spark-submit \
--jars xgboost-spark-gpu_2.12-3.0.0.jar \
... \

***************
XGBoost Ranking
***************

Learning to rank with ``XGBoostRegressor`` (via a ranking objective) has been replaced by a dedicated
``XGBoostRanker``, which is specifically designed to support ranking algorithms.

.. code-block:: scala

// before 3.0
val regressor = new XGBoostRegressor().setObjective("rank:ndcg")

// after 3.0
val ranker = new XGBoostRanker()
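
Below is a minimal sketch of wiring up the new ranker, assuming the dataset has a
query/group column and a ``setGroupCol`` setter for it (the column names here are
illustrative, not part of the API):

.. code-block:: scala

val ranker = new XGBoostRanker()
.setFeaturesCol("features")
.setLabelCol("label")
.setGroupCol("group") // rows sharing a group id belong to the same query
.setNumRound(100)
val model = ranker.fit(trainDf)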

******************************
XGBoost Constructor Parameters
******************************

XGBoost Spark now categorizes parameters into two groups: XGBoost-Spark parameters and XGBoost parameters.
When constructing an XGBoost estimator, only XGBoost-specific parameters are permitted. XGBoost-Spark specific
parameters must be configured using the estimator's setter methods. It's worth noting that
`XGBoost Parameters <https://xgboost.readthedocs.io/en/stable/parameter.html>`_
can be set both during construction and through the estimator's setter methods.

.. code-block:: scala

// before 3.0
val xgboostParams = Map(
"eta" -> "1",
"max_depth" -> "6",
"objective" -> "binary:logistic",
"num_round" -> 5,
"num_workers" -> 1,
"features" -> "feature_column",
"label" -> "label_column"
)
val classifier = new XGBoostClassifier(xgboostParams)


// after 3.0
val xgboostParams = Map(
"eta" -> "1",
"max_depth" -> "6",
"objective" -> "binary:logistic"
)
val classifier = new XGBoostClassifier(xgboostParams)
.setNumRound(5)
.setNumWorkers(1)
.setFeaturesCol("feature_column")
.setLabelCol("label_column")

// Or you can use setters to set all parameters
val classifier = new XGBoostClassifier()
.setNumRound(5)
.setNumWorkers(1)
.setFeaturesCol("feature_column")
.setLabelCol("label_column")
.setEta(1)
.setMaxDepth(6)
.setObjective("binary:logistic")

******************
Removed Parameters
******************

Starting from 3.0, the following parameters have been removed.

- cacheTrainingSet

If you wish to cache the training dataset, you can do so in your own code
before fitting the estimator.

.. code-block:: scala

val df = input.cache()
val model = new XGBoostClassifier().fit(df)
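
If memory is tight, the cached dataset can be released once training has
finished; this is standard Spark API, nothing XGBoost-specific:

.. code-block:: scala

df.unpersist() // release the cached training data after fit() returns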

- trainTestRatio

Instead, you can split the dataset yourself and pass the held-out portion as the evaluation dataset.

.. code-block:: scala

val Array(train, eval) = trainDf.randomSplit(Array(0.7, 0.3))
val classifier = new XGBoostClassifier().setEvalDataset(eval)
val model = classifier.fit(train)

- tracker_conf

The RabitTracker can instead be configured through dedicated setter methods.

.. code-block:: scala

val classifier = new XGBoostClassifier()
.setRabitTrackerTimeout(100)
.setRabitTrackerHostIp("192.168.0.2")
.setRabitTrackerPort(19203)

- rabitRingReduceThreshold
- rabitTimeout
- rabitConnectRetry
- singlePrecisionHistogram
- lambdaBias
- objectiveType
@@ -55,9 +55,9 @@ public static CudfColumn from(ColumnVector cv) {
DType dType = cv.getType();
String typeStr = "";
if (dType == DType.FLOAT32 || dType == DType.FLOAT64 ||
dType == DType.TIMESTAMP_DAYS || dType == DType.TIMESTAMP_MICROSECONDS ||
dType == DType.TIMESTAMP_MILLISECONDS || dType == DType.TIMESTAMP_NANOSECONDS ||
dType == DType.TIMESTAMP_SECONDS) {
typeStr = "<f" + dType.getSizeInBytes();
} else if (dType == DType.BOOL8 || dType == DType.INT8 || dType == DType.INT16 ||
dType == DType.INT32 || dType == DType.INT64) {
@@ -35,11 +35,39 @@ public QuantileDMatrix(
float missing,
int maxBin,
int nthread) throws XGBoostError {
this(iter, null, missing, maxBin, nthread);
}

/**
* Create QuantileDMatrix from iterator based on the cuda array interface
*
* @param iter the XGBoost ColumnBatch batch to provide the corresponding cuda array
* interface
* @param refDMatrix The training dataset that provides quantile information, needed when
* creating validation/test dataset with QuantileDMatrix. Supplying the
* training DMatrix as a reference means that the same quantisation
* applied to the training data is applied to the validation/test data
* @param missing the missing value
* @param maxBin the max bin
* @param nthread the parallelism
* @throws XGBoostError
*/
public QuantileDMatrix(
Iterator<ColumnBatch> iter,
QuantileDMatrix refDMatrix,
float missing,
int maxBin,
int nthread) throws XGBoostError {
super(0);
long[] out = new long[1];
String conf = getConfig(missing, maxBin, nthread);
long[] ref = null;
if (refDMatrix != null) {
ref = new long[1];
ref[0] = refDMatrix.getHandle();
}
XGBoostJNI.checkCall(XGBoostJNI.XGQuantileDMatrixCreateFromCallback(
iter, ref, conf, out));
handle = out[0];
}
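
// Illustrative usage (not part of this diff): build the training matrix first,
// then pass it as the reference so the validation matrix reuses the same
// quantile cuts. `trainBatches` and `evalBatches` stand in for caller-supplied
// Iterator<ColumnBatch> instances; NaN missing value, 256 bins, and a single
// thread are example settings only.
//
// QuantileDMatrix train = new QuantileDMatrix(trainBatches, Float.NaN, 256, 1);
// QuantileDMatrix eval = new QuantileDMatrix(evalBatches, train, Float.NaN, 256, 1);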

@@ -85,6 +113,7 @@ public void setGroup(int[] group) throws XGBoostError {

private String getConfig(float missing, int maxBin, int nthread) {
return String.format("{\"missing\":%f,\"max_bin\":%d,\"nthread\":%d}",
missing, maxBin, nthread);
}

}

This file was deleted.

This file was deleted.

@@ -0,0 +1 @@
ml.dmlc.xgboost4j.scala.spark.GpuXGBoostPlugin
@@ -1,5 +1,5 @@
/*
Copyright (c) 2021-2024 by Contributors

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@@ -16,17 +16,17 @@

package ml.dmlc.xgboost4j.scala

import _root_.scala.collection.JavaConverters._

import ml.dmlc.xgboost4j.java.{Column, ColumnBatch, XGBoostError, QuantileDMatrix => JQuantileDMatrix}

class QuantileDMatrix private[scala](
private[scala] override val jDMatrix: JQuantileDMatrix) extends DMatrix(jDMatrix) {

/**
* Create QuantileDMatrix from iterator based on the array interface
*
* @param iter the XGBoost ColumnBatch batch to provide the corresponding array interface
* @param missing the missing value
* @param maxBin the max bin
* @param nthread the parallelism
@@ -36,6 +36,27 @@ class QuantileDMatrix private[scala](
this(new JQuantileDMatrix(iter.asJava, missing, maxBin, nthread))
}

/**
* Create QuantileDMatrix from iterator based on the array interface
*
* @param iter the XGBoost ColumnBatch batch to provide the corresponding array interface
* @param ref The training dataset that provides quantile information, needed
* when creating validation/test dataset with QuantileDMatrix. Supplying the
* training DMatrix as a reference means that the same quantisation applied
* to the training data is applied to the validation/test data
* @param missing the missing value
* @param maxBin the max bin
* @param nthread the parallelism
* @throws XGBoostError
*/
def this(iter: Iterator[ColumnBatch],
ref: QuantileDMatrix,
missing: Float,
maxBin: Int,
nthread: Int) {
this(new JQuantileDMatrix(iter.asJava, ref.jDMatrix, missing, maxBin, nthread))
}
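
// Illustrative usage (not part of this diff): reuse the training matrix's
// quantile cuts when constructing the evaluation matrix. `trainBatches` and
// `evalBatches` stand in for caller-supplied Iterator[ColumnBatch] values.
//
// val train = new QuantileDMatrix(trainBatches, Float.NaN, 256, 1)
// val eval = new QuantileDMatrix(evalBatches, train, Float.NaN, 256, 1)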

/**
* set label of dmatrix
*
@@ -84,7 +105,7 @@ class QuantileDMatrix private[scala](
throw new XGBoostError("QuantileDMatrix does not support setGroup.")

/**
* Set label of DMatrix from array interface
*/
@throws(classOf[XGBoostError])
override def setLabel(column: Column): Unit =
@@ -104,4 +125,9 @@ class QuantileDMatrix private[scala](
override def setBaseMargin(column: Column): Unit =
throw new XGBoostError("QuantileDMatrix does not support setBaseMargin.")

@throws(classOf[XGBoostError])
override def setQueryId(column: Column): Unit = {
throw new XGBoostError("QuantileDMatrix does not support setQueryId.")
}

}