rapidsai · rapids-bot · Jul 10, 2024 · May 19, 2024 · May 28, 2024 · May 28, 2024
@@ -94,8 +94,10 @@ struct TSNEParams {
   // verbosity level for logging messages during execution
   int verbosity = CUML_LEVEL_INFO;
 
-  // Whether to overwrite the current Y vector with random noise.
-  bool initialize_embeddings = true;
+  // Embedding initializer algorithm
+  // 0 = random layout
+  // 1 = pca layout
+  int init = 1;
 
   // When this is set to true, the distances from the knn graph will
   // always be squared before computing conditional probabilities, even if

@@ -126,12 +126,9 @@ value_t Barnes_Hut(value_t* VAL,
   RAFT_CUDA_TRY(cudaMemsetAsync(old_forces.data(), 0, sizeof(value_t) * n * 2, stream));
 
   rmm::device_uvector<value_t> YY((nnodes + 1) * 2, stream);
-  if (params.initialize_embeddings) {
-    random_vector(YY.data(), -0.0001f, 0.0001f, (nnodes + 1) * 2, stream, params.random_state);
-  } else {
-    raft::copy(YY.data(), Y, n, stream);
-    raft::copy(YY.data() + nnodes + 1, Y + n, n, stream);
-  }
+
+  raft::copy(YY.data(), Y, n, stream);
+  raft::copy(YY.data() + nnodes + 1, Y + n, n, stream);
 
   rmm::device_uvector<value_t> tmp(NNZ, stream);
   value_t* Qs      = tmp.data();

@@ -54,9 +54,6 @@ value_t Exact_TSNE(value_t* VAL,
   value_t kl_div      = 0;
   const value_idx dim = params.dim;
 
-  if (params.initialize_embeddings)
-    random_vector(Y, -0.0001f, 0.0001f, n * dim, stream, params.random_state);
-
   // Allocate space
   //---------------------------------------------------
   CUML_LOG_DEBUG("Now allocating memory for TSNE.");

@@ -340,10 +340,6 @@ value_t FFT_TSNE(value_t* VAL,
   value_t learning_rate = params.pre_learning_rate;
   value_t exaggeration  = params.early_exaggeration;
 
-  if (params.initialize_embeddings) {
-    random_vector(Y, 0.0000f, 0.0001f, n * 2, stream, params.random_state);
-  }
-
   value_t kl_div = 0;
   for (int iter = 0; iter < params.max_iter; iter++) {
     // Compute charges Q_ij

@@ -33,8 +33,16 @@
 
 #include <thrust/transform.h>
 
+#include <pca/pca.cuh>
+
 namespace ML {
 
+template <class T, template <class> class U>
+inline constexpr bool is_instance_of = std::false_type{};  // primary template: false unless the partial specialization below matches
+
+template <template <class> class U, class V>
+inline constexpr bool is_instance_of<U<V>, U> = std::true_type{};  // chosen when the first argument is U<V>; U must take exactly one type parameter
+
 template <typename tsne_input, typename value_idx, typename value_t>
 class TSNE_runner {
  public:
@@ -78,6 +86,44 @@ class TSNE_runner {
       CUML_LOG_WARN(
         "# of Nearest Neighbors should be at least 3 * perplexity. Your results"
         " might be a bit strange...");
+
+    auto stream         = handle_.get_stream();
+    const value_idx dim = params.dim;
+
+    if (params.init == 0) {
+      random_vector(Y, -0.0001f, 0.0001f, n * dim, stream, params.random_state);
+    } else if (params.init == 1) {
+      rmm::device_uvector<float> components(p * dim, stream);
+      rmm::device_uvector<float> explained_var(dim, stream);
+      rmm::device_uvector<float> explained_var_ratio(dim, stream);
+      rmm::device_uvector<float> singular_vals(dim, stream);
+      rmm::device_uvector<float> mu(p, stream);
+      rmm::device_scalar<float> noise_vars(stream);
+
+      paramsPCA prms;
+      prms.n_cols       = p;
+      prms.n_rows       = n;
+      prms.n_components = dim;
+      prms.whiten       = true;
+      prms.algorithm    = solver::COV_EIG_DQ;
+
+      if constexpr (!is_instance_of<tsne_input, manifold_dense_inputs_t>) {
+        throw std::runtime_error("The TSNE input must be of type manifold_dense_inputs_t");
+      } else {
+        pcaFitTransform(handle,
+                        input.X,
+                        Y,
+                        components.data(),
+                        explained_var.data(),
+                        explained_var_ratio.data(),
+                        singular_vals.data(),
+                        mu.data(),
+                        noise_vars.data(),
+                        prms,
+                        stream);
+        handle.sync_stream(stream);
+      }
+    }
   }
 
   value_t run()

@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2023, NVIDIA CORPORATION.
+# Copyright (c) 2019-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -76,7 +76,7 @@ cdef extern from "cuml/manifold/tsne.h" namespace "ML":
         float post_momentum,
         long long random_state,
         int verbosity,
-        bool initialize_embeddings,
+        int init,
         bool square_distances,
         DistanceType metric,
         float p,
@@ -156,8 +156,8 @@ class TSNE(Base,
         Distance metric to use. Supported distances are ['l1, 'cityblock',
         'manhattan', 'euclidean', 'l2', 'sqeuclidean', 'minkowski',
         'chebyshev', 'cosine', 'correlation']
-    init : str 'random' (default 'random')
-        Currently supports random initialization.
+    init : str 'random' or 'pca' (default 'pca')
+        Currently supports random or pca initialization.
     verbose : int or boolean, default=False
         Sets logging level. It must be one of `cuml.common.logger.level_*`.
         See :ref:`verbosity-levels` for more info.
@@ -317,11 +317,9 @@ class TSNE(Base,
         if n_iter <= 100:
             warnings.warn("n_iter = {} might cause TSNE to output wrong "
                           "results. Set it higher.".format(n_iter))
-        if init.lower() != 'random':
-            # TODO https://github.com/rapidsai/cuml/issues/3458
-            warnings.warn("TSNE does not support {} but only random "
-                          "initialization.".format(init))
-            init = 'random'
+        if init.lower() != 'random' and init.lower() != 'pca':
+            raise ValueError("TSNE does not support {} but only random and pca "
+                            "initialization.".format(init))
         if angle < 0 or angle > 1:
             raise ValueError("angle = {} should be ≥ 0 and ≤ 1".format(angle))
         if n_neighbors < 0:
@@ -599,10 +597,16 @@ class TSNE(Base,
         params.post_momentum = <float> self.post_momentum
         params.random_state = <long long> seed
         params.verbosity = <int> self.verbose
-        params.initialize_embeddings = <bool> True
         params.square_distances = <bool> self.square_distances
         params.algorithm = algo
 
+        init_parsing = {
+            "random": 0,
+            "pca": 1
+        }
+
+        params.init = <int> init_parsing[self.init.lower()]
+
         # metric
         metric_parsing = {
             "l2": DistanceType.L2SqrtExpanded,

@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2023, NVIDIA CORPORATION.
+# Copyright (c) 2019-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -205,6 +205,7 @@ def test_tsne(test_datasets, method):
         method=method,
         min_grad_norm=1e-12,
         perplexity=DEFAULT_PERPLEXITY,
+        init="pca",
     )
 
     Y = tsne.fit_transform(X)