From 60820e1d1a673660f57e7d531a9a930e496b36f3 Mon Sep 17 00:00:00 2001 From: Alexander Guzhva Date: Thu, 5 Sep 2024 06:24:58 -0400 Subject: [PATCH] introduce options for reducing the overhead for a clustering procedure (#790) Signed-off-by: Alexandr Guzhva --- thirdparty/faiss/faiss/Clustering.cpp | 39 +++++++++++++++++++--- thirdparty/faiss/faiss/Clustering.h | 11 ++++++- thirdparty/faiss/faiss/utils/random.cpp | 43 +++++++++++++++++++++++++ thirdparty/faiss/faiss/utils/random.h | 25 ++++++++++++++ 4 files changed, 113 insertions(+), 5 deletions(-) diff --git a/thirdparty/faiss/faiss/Clustering.cpp b/thirdparty/faiss/faiss/Clustering.cpp index 43c712f7f..f306ed240 100644 --- a/thirdparty/faiss/faiss/Clustering.cpp +++ b/thirdparty/faiss/faiss/Clustering.cpp @@ -11,6 +11,7 @@ #include #include +#include #include #include #include @@ -75,6 +76,14 @@ void Clustering::train( namespace { +uint64_t get_actual_rng_seed(const int seed) { + return (seed >= 0) + ? seed + : static_cast(std::chrono::high_resolution_clock::now() + .time_since_epoch() + .count()); +} + idx_t subsample_training_set( const Clustering& clus, idx_t nx, @@ -88,11 +97,30 @@ idx_t subsample_training_set( clus.k * clus.max_points_per_centroid, nx); } - std::vector perm(nx); - rand_perm(perm.data(), nx, clus.seed); + + const uint64_t actual_seed = get_actual_rng_seed(clus.seed); + + std::vector perm; + if (clus.use_faster_subsampling) { + // use subsampling with splitmix64 rng + SplitMix64RandomGenerator rng(actual_seed); + + const idx_t new_nx = clus.k * clus.max_points_per_centroid; + perm.resize(new_nx); + for (idx_t i = 0; i < new_nx; i++) { + perm[i] = rng.rand_int(nx); + } + } else { + // use subsampling with a default std rng + perm.resize(nx); + rand_perm(perm.data(), nx, actual_seed); + } + nx = clus.k * clus.max_points_per_centroid; uint8_t* x_new = new uint8_t[nx * line_size]; *x_out = x_new; + + // might be worth omp-ing as well for (idx_t i = 0; i < nx; i++) { memcpy(x_new + i * line_size, 
x + perm[i] * line_size, line_size); } @@ -398,7 +426,7 @@ void Clustering::train_encoded( double t0 = getmillisecs(); - if (!codec) { + if (!codec && check_input_data_for_NaNs) { // Check for NaNs in input data. Normally it is the user's // responsibility, but it may spare us some hard-to-debug // reports. @@ -501,6 +529,9 @@ void Clustering::train_encoded( } t0 = getmillisecs(); + // initialize seed + const uint64_t actual_seed = get_actual_rng_seed(seed); + // temporary buffer to decode vectors during the optimization std::vector decode_buffer(codec ? d * decode_block_size : 0); @@ -510,7 +541,7 @@ void Clustering::train_encoded( } { - int64_t random_seed = seed + 1 + redo * 15486557L; + int64_t random_seed = actual_seed + 1 + redo * 15486557L; std::vector centroids_index(nx); if (ClusteringType::K_MEANS == clustering_type) { diff --git a/thirdparty/faiss/faiss/Clustering.h b/thirdparty/faiss/faiss/Clustering.h index 1c171b18c..fdc493e3f 100644 --- a/thirdparty/faiss/faiss/Clustering.h +++ b/thirdparty/faiss/faiss/Clustering.h @@ -59,11 +59,20 @@ struct ClusteringParameters { int min_points_per_centroid = 39; /// to limit size of dataset, otherwise the training set is subsampled int max_points_per_centroid = 256; - /// seed for the random number generator + /// seed for the random number generator. + /// negative values lead to seeding an internal rng with + /// std::chrono::high_resolution_clock. int seed = 1234; /// when the training set is encoded, batch size of the codec decoder size_t decode_block_size = 32768; + + /// whether to check the input data for NaNs + bool check_input_data_for_NaNs = true; + + /// Whether to use splitmix64-based random number generator for subsampling, + /// which is faster, but may pick duplicate points. 
+ bool use_faster_subsampling = false; }; struct ClusteringIterationStats { diff --git a/thirdparty/faiss/faiss/utils/random.cpp b/thirdparty/faiss/faiss/utils/random.cpp index 9ab8d0adb..877a7c252 100644 --- a/thirdparty/faiss/faiss/utils/random.cpp +++ b/thirdparty/faiss/faiss/utils/random.cpp @@ -54,6 +54,37 @@ double RandomGenerator::rand_double() { return mt() / double(mt.max()); } +SplitMix64RandomGenerator::SplitMix64RandomGenerator(int64_t seed) + : state{static_cast(seed)} {} + +int SplitMix64RandomGenerator::rand_int() { + return next() & 0x7fffffff; +} + +int64_t SplitMix64RandomGenerator::rand_int64() { + uint64_t value = next(); + return static_cast(value & 0x7fffffffffffffffULL); +} + +int SplitMix64RandomGenerator::rand_int(int max) { + return next() % max; +} + +float SplitMix64RandomGenerator::rand_float() { + return next() / float(std::numeric_limits::max()); +} + +double SplitMix64RandomGenerator::rand_double() { + return next() / double(std::numeric_limits::max()); +} + +uint64_t SplitMix64RandomGenerator::next() { + uint64_t z = (state += 0x9e3779b97f4a7c15ULL); + z = (z ^ (z >> 30)) * 0xbf58476d1ce4e5b9ULL; + z = (z ^ (z >> 27)) * 0x94d049bb133111ebULL; + return z ^ (z >> 31); +} + /*********************************************************************** * Random functions in this C file only exist because Torch * counterparts are slow and not multi-threaded. Typical use is for @@ -162,6 +193,18 @@ void rand_perm(int* perm, size_t n, int64_t seed) { } } +void rand_perm_splitmix64(int* perm, size_t n, int64_t seed) { + for (size_t i = 0; i < n; i++) + perm[i] = i; + + SplitMix64RandomGenerator rng(seed); + + for (size_t i = 0; i + 1 < n; i++) { + int i2 = i + rng.rand_int(n - i); + std::swap(perm[i], perm[i2]); + } +} + void byte_rand(uint8_t* x, size_t n, int64_t seed) { // only try to parallelize on large enough arrays const size_t nblock = n < 1024 ? 
1 : 1024; diff --git a/thirdparty/faiss/faiss/utils/random.h b/thirdparty/faiss/faiss/utils/random.h index 8b4286894..ac985d69b 100644 --- a/thirdparty/faiss/faiss/utils/random.h +++ b/thirdparty/faiss/faiss/utils/random.h @@ -43,6 +43,30 @@ struct RandomGenerator { explicit RandomGenerator(int64_t seed = 1234); }; +/// fast random generator that cannot be used in multithreaded contexts. +/// based on https://prng.di.unimi.it/ +struct SplitMix64RandomGenerator { + uint64_t state; + + /// random non-negative integer (31 bits) + int rand_int(); + + /// random non-negative int64_t (63 bits) + int64_t rand_int64(); + + /// generate random integer between 0 and max-1 + int rand_int(int max); + + /// between 0 and 1 + float rand_float(); + + double rand_double(); + + explicit SplitMix64RandomGenerator(int64_t seed = 1234); + + uint64_t next(); +}; + /* Generate an array of uniform random floats / multi-threaded implementation */ void float_rand(float* x, size_t n, int64_t seed); void float_randn(float* x, size_t n, int64_t seed); @@ -53,6 +77,7 @@ void int64_rand_max(int64_t* x, size_t n, uint64_t max, int64_t seed); /* random permutation */ void rand_perm(int* perm, size_t n, int64_t seed); +void rand_perm_splitmix64(int* perm, size_t n, int64_t seed); /* Random set of vectors with intrinsic dimensionality 10 that is harder to * index than a subspace of dim 10 but easier than uniform data in dimension d