Skip to content

Commit

Permalink
add pca init for tsne
Browse files Browse the repository at this point in the history
  • Loading branch information
aamijar committed May 19, 2024
1 parent 4dec229 commit 21c453a
Show file tree
Hide file tree
Showing 7 changed files with 69 additions and 26 deletions.
6 changes: 4 additions & 2 deletions cpp/include/cuml/manifold/tsne.h
Original file line number Diff line number Diff line change
Expand Up @@ -94,8 +94,10 @@ struct TSNEParams {
// verbosity level for logging messages during execution
int verbosity = CUML_LEVEL_INFO;

// Whether to overwrite the current Y vector with random noise.
bool initialize_embeddings = true;
// Embedding initialization method:
// 0 = random layout
// 1 = PCA layout (requires dense input)
int init = 1;

// When this is set to true, the distances from the knn graph will
// always be squared before computing conditional probabilities, even if
Expand Down
9 changes: 3 additions & 6 deletions cpp/src/tsne/barnes_hut_tsne.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -126,12 +126,9 @@ value_t Barnes_Hut(value_t* VAL,
RAFT_CUDA_TRY(cudaMemsetAsync(old_forces.data(), 0, sizeof(value_t) * n * 2, stream));

rmm::device_uvector<value_t> YY((nnodes + 1) * 2, stream);
if (params.initialize_embeddings) {
random_vector(YY.data(), -0.0001f, 0.0001f, (nnodes + 1) * 2, stream, params.random_state);
} else {
raft::copy(YY.data(), Y, n, stream);
raft::copy(YY.data() + nnodes + 1, Y + n, n, stream);
}

raft::copy(YY.data(), Y, n, stream);
raft::copy(YY.data() + nnodes + 1, Y + n, n, stream);

rmm::device_uvector<value_t> tmp(NNZ, stream);
value_t* Qs = tmp.data();
Expand Down
3 changes: 0 additions & 3 deletions cpp/src/tsne/exact_tsne.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -54,9 +54,6 @@ value_t Exact_TSNE(value_t* VAL,
value_t kl_div = 0;
const value_idx dim = params.dim;

if (params.initialize_embeddings)
random_vector(Y, -0.0001f, 0.0001f, n * dim, stream, params.random_state);

// Allocate space
//---------------------------------------------------
CUML_LOG_DEBUG("Now allocating memory for TSNE.");
Expand Down
4 changes: 0 additions & 4 deletions cpp/src/tsne/fft_tsne.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -340,10 +340,6 @@ value_t FFT_TSNE(value_t* VAL,
value_t learning_rate = params.pre_learning_rate;
value_t exaggeration = params.early_exaggeration;

if (params.initialize_embeddings) {
random_vector(Y, 0.0000f, 0.0001f, n * 2, stream, params.random_state);
}

value_t kl_div = 0;
for (int iter = 0; iter < params.max_iter; iter++) {
// Compute charges Q_ij
Expand Down
46 changes: 46 additions & 0 deletions cpp/src/tsne/tsne_runner.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,16 @@

#include <thrust/transform.h>

#include <pca/pca.cuh>

namespace ML {

/**
 * @brief Compile-time check for whether a type is a specialization of a given class template.
 *
 * The primary template evaluates to `false`. The partial specialization below
 * evaluates to `true` when `T` has the form `U<V>`.
 *
 * `U` is declared as `template <class> class`, so this trait is meant for class
 * templates instantiated with a single type argument. cv-qualified or reference
 * types (e.g. `const U<V>`, `U<V>&`) do not match the specialization.
 *
 * @tparam T type to inspect
 * @tparam U class template to test `T` against
 */
template <class T, template <class> class U>
inline constexpr bool is_instance_of = false;

/** @brief Selected when the inspected type is exactly `U<V>` for some `V`. */
template <template <class> class U, class V>
inline constexpr bool is_instance_of<U<V>, U> = true;

template <typename tsne_input, typename value_idx, typename value_t>
class TSNE_runner {
public:
Expand Down Expand Up @@ -78,6 +86,44 @@ class TSNE_runner {
CUML_LOG_WARN(
"# of Nearest Neighbors should be at least 3 * perplexity. Your results"
" might be a bit strange...");

auto stream = handle_.get_stream();
const value_idx dim = params.dim;

if (params.init == 0) {
random_vector(Y, -0.0001f, 0.0001f, n * dim, stream, params.random_state);
} else if (params.init == 1) {
rmm::device_uvector<float> components(p * dim, stream);
rmm::device_uvector<float> explained_var(dim, stream);
rmm::device_uvector<float> explained_var_ratio(dim, stream);
rmm::device_uvector<float> singular_vals(dim, stream);
rmm::device_uvector<float> mu(p, stream);
rmm::device_scalar<float> noise_vars(stream);

paramsPCA prms;
prms.n_cols = p;
prms.n_rows = n;
prms.n_components = dim;
prms.whiten = true;
prms.algorithm = solver::COV_EIG_DQ;

if constexpr (!is_instance_of<tsne_input, manifold_dense_inputs_t>) {
throw std::runtime_error("The TSNE input must be of type manifold_dense_inputs_t");
} else {
pcaFitTransform(handle,
input.X,
Y,
components.data(),
explained_var.data(),
explained_var_ratio.data(),
singular_vals.data(),
mu.data(),
noise_vars.data(),
prms,
stream);
handle.sync_stream(stream);
}
}
}

value_t run()
Expand Down
24 changes: 14 additions & 10 deletions python/cuml/manifold/t_sne.pyx
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2019-2023, NVIDIA CORPORATION.
# Copyright (c) 2019-2024, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -76,7 +76,7 @@ cdef extern from "cuml/manifold/tsne.h" namespace "ML":
float post_momentum,
long long random_state,
int verbosity,
bool initialize_embeddings,
int init,
bool square_distances,
DistanceType metric,
float p,
Expand Down Expand Up @@ -156,8 +156,8 @@ class TSNE(Base,
Distance metric to use. Supported distances are ['l1, 'cityblock',
'manhattan', 'euclidean', 'l2', 'sqeuclidean', 'minkowski',
'chebyshev', 'cosine', 'correlation']
init : str 'random' (default 'random')
Currently supports random initialization.
init : str 'random' or 'pca' (default 'pca')
Currently supports random or pca initialization.
verbose : int or boolean, default=False
Sets logging level. It must be one of `cuml.common.logger.level_*`.
See :ref:`verbosity-levels` for more info.
Expand Down Expand Up @@ -317,11 +317,9 @@ class TSNE(Base,
if n_iter <= 100:
warnings.warn("n_iter = {} might cause TSNE to output wrong "
"results. Set it higher.".format(n_iter))
if init.lower() != 'random':
# TODO https://github.com/rapidsai/cuml/issues/3458
warnings.warn("TSNE does not support {} but only random "
"initialization.".format(init))
init = 'random'
if init.lower() != 'random' and init.lower() != 'pca':
raise ValueError("TSNE does not support {} but only random and pca "
"initialization.".format(init))
if angle < 0 or angle > 1:
raise ValueError("angle = {} should be ≥ 0 and ≤ 1".format(angle))
if n_neighbors < 0:
Expand Down Expand Up @@ -599,10 +597,16 @@ class TSNE(Base,
params.post_momentum = <float> self.post_momentum
params.random_state = <long long> seed
params.verbosity = <int> self.verbose
params.initialize_embeddings = <bool> True
params.square_distances = <bool> self.square_distances
params.algorithm = algo

init_parsing = {
"random": 0,
"pca": 1
}

params.init = <int> init_parsing[self.init.lower()]

# metric
metric_parsing = {
"l2": DistanceType.L2SqrtExpanded,
Expand Down
3 changes: 2 additions & 1 deletion python/cuml/tests/test_tsne.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2019-2023, NVIDIA CORPORATION.
# Copyright (c) 2019-2024, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -205,6 +205,7 @@ def test_tsne(test_datasets, method):
method=method,
min_grad_norm=1e-12,
perplexity=DEFAULT_PERPLEXITY,
init="pca",
)

Y = tsne.fit_transform(X)
Expand Down

0 comments on commit 21c453a

Please sign in to comment.