From 8539794216cb31aa98cd88b672caf5d761df548a Mon Sep 17 00:00:00 2001 From: borg323 <39573933+borg323@users.noreply.github.com> Date: Tue, 14 Nov 2023 23:06:41 +0200 Subject: [PATCH] allow backends to suggest minibatch size (#1877) * allow backends to suggest minibatch size * simple cuda heuristic --- appveyor.yml | 4 ++-- src/mcts/params.cc | 18 +++++++----------- src/mcts/params.h | 8 +++++--- src/mcts/search.cc | 14 +++++++------- src/mcts/search.h | 11 ++++++++++- src/neural/blas/network_blas.cc | 6 +++++- src/neural/cuda/network_cuda.cc | 7 +++++++ src/neural/network.h | 3 ++- src/neural/onednn/network_onednn.cc | 6 +++++- src/neural/onnx/network_onnx.cc | 6 +++++- 10 files changed, 55 insertions(+), 28 deletions(-) diff --git a/appveyor.yml b/appveyor.yml index e9783ca98a..dc0445dac8 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -109,8 +109,8 @@ cache: - C:\ndk\android-ndk-r19c\toolchains\llvm\prebuilt\windows-x86_64 before_build: - cmd: git submodule update --init --recursive -- cmd: IF %BLAS%==true (echo.#define DEFAULT_MINIBATCH_SIZE 7 & echo.#define DEFAULT_MAX_PREFETCH 0 & echo.#define DEFAULT_TASK_WORKERS 0) > params_override.h -- cmd: IF %ANDROID%==true (echo.#define DEFAULT_MINIBATCH_SIZE 7 & echo.#define DEFAULT_MAX_PREFETCH 0 & echo.#define DEFAULT_TASK_WORKERS 0) > params_override.h +- cmd: IF %BLAS%==true (echo.#define DEFAULT_MAX_PREFETCH 0 & echo.#define DEFAULT_TASK_WORKERS 0) > params_override.h +- cmd: IF %ANDROID%==true (echo.#define DEFAULT_MAX_PREFETCH 0 & echo.#define DEFAULT_TASK_WORKERS 0) > params_override.h - cmd: SET BUILD_BLAS=%BLAS% - cmd: IF %OPENCL%==true SET BUILD_BLAS=true - cmd: IF %DX%==true SET BUILD_BLAS=true diff --git a/src/mcts/params.cc b/src/mcts/params.cc index 373fb91802..d13baea737 100644 --- a/src/mcts/params.cc +++ b/src/mcts/params.cc @@ -1,6 +1,6 @@ /* This file is part of Leela Chess Zero. 
- Copyright (C) 2018-2019 The LCZero Authors + Copyright (C) 2018-2023 The LCZero Authors Leela Chess is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -38,9 +38,6 @@ #include "params_override.h" #endif -#ifndef DEFAULT_MINIBATCH_SIZE -#define DEFAULT_MINIBATCH_SIZE 256 -#endif #ifndef DEFAULT_MAX_PREFETCH #define DEFAULT_MAX_PREFETCH 32 #endif @@ -156,7 +153,7 @@ const OptionId SearchParams::kMiniBatchSizeId{ "minibatch-size", "MinibatchSize", "How many positions the engine tries to batch together for parallel NN " "computation. Larger batches may reduce strength a bit, especially with a " - "small number of playouts."}; + "small number of playouts. Set to 0 to use a backend suggested value."}; const OptionId SearchParams::kMaxPrefetchBatchId{ "max-prefetch", "MaxPrefetch", "When the engine cannot gather a large enough batch for immediate use, try " @@ -287,7 +284,7 @@ const OptionId SearchParams::kOutOfOrderEvalId{ "in the cache or is terminal, evaluate it right away without sending the " "batch to the NN. When off, this may only happen with the very first node " "of a batch; when on, this can happen with any node."}; -const OptionId SearchParams::kMaxOutOfOrderEvalsId{ +const OptionId SearchParams::kMaxOutOfOrderEvalsFactorId{ "max-out-of-order-evals-factor", "MaxOutOfOrderEvalsFactor", "Maximum number of out of order evals during gathering of a batch is " "calculated by multiplying the maximum batch size by this number."}; @@ -459,7 +456,7 @@ const OptionId SearchParams::kSearchSpinBackoffId{ void SearchParams::Populate(OptionsParser* options) { // Here the uci optimized defaults" are set. // Many of them are overridden with training specific values in tournament.cc. 
- options->Add(kMiniBatchSizeId, 1, 1024) = DEFAULT_MINIBATCH_SIZE; + options->Add(kMiniBatchSizeId, 0, 1024) = 0; options->Add(kMaxPrefetchBatchId, 0, 1024) = DEFAULT_MAX_PREFETCH; options->Add(kCpuctId, 0.0f, 100.0f) = 1.745f; options->Add(kCpuctAtRootId, 0.0f, 100.0f) = 1.745f; @@ -497,7 +494,7 @@ void SearchParams::Populate(OptionsParser* options) { options->Add(kMaxCollisionVisitsScalingPowerId, 0.01, 100) = 1.25; options->Add(kOutOfOrderEvalId) = true; - options->Add(kMaxOutOfOrderEvalsId, 0.0f, 100.0f) = 2.4f; + options->Add(kMaxOutOfOrderEvalsFactorId, 0.0f, 100.0f) = 2.4f; options->Add(kStickyEndgamesId) = true; options->Add(kSyzygyFastPlayId) = false; options->Add(kMultiPvId, 1, 500) = 1; @@ -637,9 +634,8 @@ SearchParams::SearchParams(const OptionsDict& options) options.Get(kContemptMaxValueId), options.Get(kWDLContemptAttenuationId))), kWDLEvalObjectivity(options.Get(kWDLEvalObjectivityId)), - kMaxOutOfOrderEvals(std::max( - 1, static_cast(options.Get(kMaxOutOfOrderEvalsId) * - options.Get(kMiniBatchSizeId)))), + kMaxOutOfOrderEvalsFactor( + options.Get(kMaxOutOfOrderEvalsFactorId)), kNpsLimit(options.Get(kNpsLimitId)), kSolidTreeThreshold(options.Get(kSolidTreeThresholdId)), kTaskWorkersPerSearchWorker( diff --git a/src/mcts/params.h b/src/mcts/params.h index 630402d2ec..a3a8ffdf84 100644 --- a/src/mcts/params.h +++ b/src/mcts/params.h @@ -128,7 +128,9 @@ class SearchParams { float GetWDLRescaleRatio() const { return kWDLRescaleParams.ratio; } float GetWDLRescaleDiff() const { return kWDLRescaleParams.diff; } float GetWDLEvalObjectivity() const { return kWDLEvalObjectivity; } - int GetMaxOutOfOrderEvals() const { return kMaxOutOfOrderEvals; } + float GetMaxOutOfOrderEvalsFactor() const { + return kMaxOutOfOrderEvalsFactor; + } float GetNpsLimit() const { return kNpsLimit; } int GetSolidTreeThreshold() const { return kSolidTreeThreshold; } @@ -215,7 +217,7 @@ class SearchParams { static const OptionId kWDLDrawRateTargetId; static const OptionId 
kWDLDrawRateReferenceId; static const OptionId kWDLBookExitBiasId; - static const OptionId kMaxOutOfOrderEvalsId; + static const OptionId kMaxOutOfOrderEvalsFactorId; static const OptionId kNpsLimitId; static const OptionId kSolidTreeThresholdId; static const OptionId kTaskWorkersPerSearchWorkerId; @@ -274,7 +276,7 @@ class SearchParams { const float kContempt; const WDLRescaleParams kWDLRescaleParams; const float kWDLEvalObjectivity; - const int kMaxOutOfOrderEvals; + const float kMaxOutOfOrderEvalsFactor; const float kNpsLimit; const int kSolidTreeThreshold; const int kTaskWorkersPerSearchWorker; diff --git a/src/mcts/search.cc b/src/mcts/search.cc index e700648f95..4616a88c6b 100644 --- a/src/mcts/search.cc +++ b/src/mcts/search.cc @@ -1,6 +1,6 @@ /* This file is part of Leela Chess Zero. - Copyright (C) 2018-2019 The LCZero Authors + Copyright (C) 2018-2023 The LCZero Authors Leela Chess is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -1246,9 +1246,9 @@ void SearchWorker::InitializeIteration( std::unique_ptr computation) { computation_ = std::make_unique(std::move(computation), search_->cache_); - computation_->Reserve(params_.GetMiniBatchSize()); + computation_->Reserve(target_minibatch_size_); minibatch_.clear(); - minibatch_.reserve(2 * params_.GetMiniBatchSize()); + minibatch_.reserve(2 * target_minibatch_size_); } // 2. Gather minibatch. @@ -1299,8 +1299,8 @@ void SearchWorker::GatherMinibatch() { // Gather nodes to process in the current batch. // If we had too many nodes out of order, also interrupt the iteration so // that search can exit. - while (minibatch_size < params_.GetMiniBatchSize() && - number_out_of_order_ < params_.GetMaxOutOfOrderEvals()) { + while (minibatch_size < target_minibatch_size_ && + number_out_of_order_ < max_out_of_order_) { // If there's something to process without touching slow neural net, do it. 
if (minibatch_size > 0 && computation_->GetCacheMisses() == 0) return; @@ -1322,8 +1322,8 @@ void SearchWorker::GatherMinibatch() { int new_start = static_cast<int>(minibatch_.size()); PickNodesToExtend( - std::min({collisions_left, params_.GetMiniBatchSize() - minibatch_size, - params_.GetMaxOutOfOrderEvals() - number_out_of_order_})); + std::min({collisions_left, target_minibatch_size_ - minibatch_size, + max_out_of_order_ - number_out_of_order_})); // Count the non-collisions. int non_collisions = 0; diff --git a/src/mcts/search.h b/src/mcts/search.h index ead8ed2682..1cb97aac20 100644 --- a/src/mcts/search.h +++ b/src/mcts/search.h @@ -1,6 +1,6 @@ /* This file is part of Leela Chess Zero. - Copyright (C) 2018 The LCZero Authors + Copyright (C) 2018-2023 The LCZero Authors Leela Chess is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -222,6 +222,13 @@ class SearchWorker { this->RunTasks(i); }); } + target_minibatch_size_ = params_.GetMiniBatchSize(); + if (target_minibatch_size_ == 0) { + target_minibatch_size_ = search_->network_->GetMiniBatchSize(); + } + max_out_of_order_ = + std::max(1, static_cast<int>(params_.GetMaxOutOfOrderEvalsFactor() * + target_minibatch_size_)); } ~SearchWorker() { @@ -452,6 +459,8 @@ class SearchWorker { // List of nodes to process. std::vector minibatch_; std::unique_ptr computation_; + int target_minibatch_size_; + int max_out_of_order_; // History is reset and extended by PickNodeToExtend(). PositionHistory history_; int number_out_of_order_ = 0; diff --git a/src/neural/blas/network_blas.cc b/src/neural/blas/network_blas.cc index a9667206da..f1b5a1a17b 100644 --- a/src/neural/blas/network_blas.cc +++ b/src/neural/blas/network_blas.cc @@ -1,6 +1,6 @@ /* This file is part of Leela Chess Zero. 
- Copyright (C) 2018-2022 The LCZero Authors + Copyright (C) 2018-2023 The LCZero Authors Leela Chess is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -167,6 +167,10 @@ class BlasNetwork : public Network { return capabilities_; } + int GetMiniBatchSize() const override { + return 7; + } + void InitThread(int id) override { Numa::BindThread(id); } std::unique_ptr GetBuffers() { diff --git a/src/neural/cuda/network_cuda.cc b/src/neural/cuda/network_cuda.cc index 275a332e6e..d0e56358de 100644 --- a/src/neural/cuda/network_cuda.cc +++ b/src/neural/cuda/network_cuda.cc @@ -217,6 +217,7 @@ class CudaNetwork : public Network { showDeviceInfo(deviceProp); l2_cache_size_ = deviceProp.l2CacheSize; + sm_count_ = deviceProp.multiProcessorCount; allow_cache_opt_ = options.GetOrDefault("cache_opt", false); @@ -895,6 +896,11 @@ class CudaNetwork : public Network { return capabilities_; } + int GetMiniBatchSize() const override { + // Simple heuristic that seems to work for a wide range of GPUs. + return 2 * sm_count_; + } + std::unique_ptr NewComputation() override { // Set correct gpu id for this computation (as it might have been called // from a different thread). @@ -931,6 +937,7 @@ class CudaNetwork : public Network { const NetworkCapabilities capabilities_; int gpu_id_; int l2_cache_size_; + int sm_count_; int max_batch_size_; bool wdl_; bool moves_left_; diff --git a/src/neural/network.h b/src/neural/network.h index 054b2ebd33..f95319220f 100644 --- a/src/neural/network.h +++ b/src/neural/network.h @@ -1,6 +1,6 @@ /* This file is part of Leela Chess Zero. 
- Copyright (C) 2018 The LCZero Authors + Copyright (C) 2018-2023 The LCZero Authors Leela Chess is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -107,6 +107,7 @@ class Network { virtual const NetworkCapabilities& GetCapabilities() const = 0; virtual std::unique_ptr NewComputation() = 0; virtual void InitThread(int /*id*/) {} + virtual int GetMiniBatchSize() const { return 256; } virtual ~Network() = default; }; diff --git a/src/neural/onednn/network_onednn.cc b/src/neural/onednn/network_onednn.cc index 8587e12982..e79bf093b2 100644 --- a/src/neural/onednn/network_onednn.cc +++ b/src/neural/onednn/network_onednn.cc @@ -1,6 +1,6 @@ /* This file is part of Leela Chess Zero. - Copyright (C) 2021-2022 The LCZero Authors + Copyright (C) 2021-2023 The LCZero Authors Leela Chess is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -801,6 +801,10 @@ class OnednnNetwork : public Network { return capabilities_; } + int GetMiniBatchSize() const override { + return batch_size_ * steps_; + } + std::unique_ptr NewComputation() override { return std::make_unique(this, wdl_, moves_left_); } diff --git a/src/neural/onnx/network_onnx.cc b/src/neural/onnx/network_onnx.cc index d83ac37cf0..e5eb8e0443 100644 --- a/src/neural/onnx/network_onnx.cc +++ b/src/neural/onnx/network_onnx.cc @@ -1,6 +1,6 @@ /* This file is part of Leela Chess Zero. - Copyright (C) 2021 The LCZero Authors + Copyright (C) 2021-2023 The LCZero Authors Leela Chess is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -94,6 +94,10 @@ class OnnxNetwork : public Network { const NetworkCapabilities& GetCapabilities() const override { return capabilities_; } + int GetMiniBatchSize() const override { + return batch_size_ == -1 ? 
Network::GetMiniBatchSize() + : batch_size_ * steps_; + } Ort::Env onnx_env_; // Prepare sessions for this many multiples of the batch size;