MAINT: Quick fix for the one pool world (#11)

This is just Wonchan's old PR with small comments added. I plan to open an issue about actually adding at least some estimates about memory usage. This may not matter for many simple pipelines, but also could matter a lot for some. (Memory use is not easy to bound, e.g. for joins or csv reading, although even a very high overestimate may be useful if we read a small csv for example.) --------- Signed-off-by: Sebastian Berg <[email protected]> Co-authored-by: Wonchan Lee <[email protected]>
rapidsai · Jan 10, 2025 · 59b04a5 · 59b04a5
1 parent c3581e0
commit 59b04a5
Show file tree

Hide file tree

Showing 9 changed files with 71 additions and 26 deletions.
diff --git a/.dockerignore b/.dockerignore
@@ -1,4 +1,5 @@
 cpp/build/
+python/build/
 ci/manual-ci-test.py
 Docker
 python/_skbuild

diff --git a/Dockerfile b/Dockerfile
@@ -27,9 +27,9 @@ RUN mkdir -p /opt/legate-dataframe/conda-env-file
 COPY ./conda/environments/*.yaml /opt/legate-dataframe/conda-env-file/
 
 # To ensure we find the GPU version of legate in the docker build.
-ARG CONDA_OVERRIDE_CUDA=$CUDA_VERSION
+ARG CONDA_OVERRIDE_CUDA=12.4
 RUN /bin/bash -c '/opt/conda/bin/mamba env create --name legate-dev --file \
-  /opt/legate-dataframe/conda-env-file/all_cuda-$(cut --output-delimiter="" -d "." -f 1,2 <<< ${CUDA_VERSION})_arch-x86_64.yaml'
+  /opt/legate-dataframe/conda-env-file/all_cuda-$(cut --output-delimiter="" -d "." -f 1,2 <<< ${CONDA_OVERRIDE_CUDA})_arch-x86_64.yaml'
 
 # Build and install legate-dataframe
 WORKDIR /opt/legate-dataframe

diff --git a/ci/manual-ci-test.py b/ci/manual-ci-test.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 
-# Copyright 2024 NVIDIA Corporation
+# Copyright (c) 2024-2025, NVIDIA CORPORATION
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -30,25 +30,15 @@
         "--no-capture-output",
         "-n",
         "legate-dev",
-        # TODO: Should not need legate startup,
-        # see https://github.com/nv-legate/legate.core.internal/issues/1304
-        "legate",
-        "--gpus=2",
-        "cpp/build/gtests/cpp_tests",
+        "ci/run_ctests.sh",
     ],
     "py": [
         "conda",
         "run",
         "--no-capture-output",
         "-n",
         "legate-dev",
-        # TODO: Should not need legate startup,
-        # see https://github.com/nv-legate/legate.core.internal/issues/1304
-        "legate",
-        "--gpus=2",
-        "--module",
-        "pytest",
-        "python/tests",
+        "ci/run_pytests.sh",
     ],
 }
 

diff --git a/ci/run_pytests.sh b/ci/run_pytests.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright (c) 2024, NVIDIA CORPORATION.
+# Copyright (c) 2024-2025, NVIDIA CORPORATION.
 
 # [description]
 #
@@ -26,6 +26,7 @@ cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../python/tests/
 LEGATE_TEST=${LEGATE_TEST:-1} \
 legate \
     --gpus "$(nvidia-smi -L | wc -l)"\
+    --fbmem=4000 \
     --module pytest \
     . \
     -sv \

diff --git a/cpp/src/core/library.cpp b/cpp/src/core/library.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -69,6 +69,46 @@ class Mapper : public legate::mapping::Mapper {
     return mappings;
   }
 
+  std::optional<std::size_t> allocation_pool_size(const legate::mapping::Task& task,
+                                                  legate::mapping::StoreTarget memory_kind)
+  {
+    const auto task_id = static_cast<int>(task.task_id());
+
+    if (memory_kind == legate::mapping::StoreTarget::ZCMEM) {
+      switch (task_id) {
+        case legate::dataframe::task::OpCode::Join:
+          // TODO: Join is identical to GroupBy, but we would have to look at
+          // both input tables to ge the maximum column number.
+          return std::nullopt;
+        case legate::dataframe::task::OpCode::GroupByAggregation: {
+          // Aggregation use repartitioning which uses ZCMEM for NCCL.
+          // This depends on the number of columns (first scalar when storing
+          // the first table is the number of columns).
+          if (task.is_single_task()) {
+            // No need for repartitioning, so no need for ZCMEM
+            return 0;
+          }
+          auto num_cols = task.scalars().at(0).value<int32_t>();
+          auto nrank    = task.get_launch_domain().get_volume();
+
+          // Space for the exchange buffers containing size chunks:
+          size_t size_exchange_nbytes = nrank * nrank * 2 * sizeof(std::size_t);
+          // Space for column packing metadata exchange
+          size_t sizeof_serialized_column = 6 * 8;  // not public, rough upper bound
+          // num_cols * 2 for the string column child and + 1 for the number of cols
+          size_t metadata_nbytes = nrank * (num_cols * 2 + 1) * sizeof_serialized_column;
+
+          return size_exchange_nbytes + metadata_nbytes;
+        }
+        default: return 0;
+      }
+    }
+
+    // TODO: Returning nullopt prevents other parallel task launches so it would be
+    //       good to provide estimated usage for most tasks here.
+    return std::nullopt;
+  }
+
   legate::Scalar tunable_value(legate::TunableID tunable_id) override { return legate::Scalar{0}; }
 
  private:
@@ -81,8 +121,15 @@ legate::Library create_and_registrate_library()
   if (env == nullptr || std::string{env} == "0") {
     GlobalMemoryResource::set_as_default_mmr_resource();
   }
-  auto context = legate::Runtime::get_runtime()->find_or_create_library(
-    library_name, legate::ResourceConfig{}, std::make_unique<Mapper>());
+  // Set with_has_allocations globally since currently all tasks allocate (and libcudf may also)
+  auto options = legate::VariantOptions{}.with_has_allocations(true);
+  auto context =
+    legate::Runtime::get_runtime()->find_or_create_library(library_name,
+                                                           legate::ResourceConfig{},
+                                                           std::make_unique<Mapper>(),
+                                                           {{legate::VariantCode::CPU, options},
+                                                            {legate::VariantCode::GPU, options},
+                                                            {legate::VariantCode::OMP, options}});
   task::Registry::get_registrar().register_all_tasks(context);
   return legate::Runtime::get_runtime()->find_library(legate::dataframe::library_name);
 }

diff --git a/cpp/src/core/repartition_by_hash.cu b/cpp/src/core/repartition_by_hash.cu
@@ -63,6 +63,7 @@ class ExchangedSizes {
     : _ctx(ctx), stream(ctx.tmp_stream())
   {
     assert(columns.size() == ctx.nranks - 1);
+    // Note: Size of this buffer is taken into account in the mapper:
     _all_sizes =
       legate::create_buffer<std::size_t>(ctx.nranks * ctx.nranks * 2, Memory::Kind::Z_COPY_MEM);
 
@@ -151,6 +152,7 @@ std::pair<std::vector<cudf::table_view>, std::map<int, rmm::device_buffer>> shuf
     std::size_t nbytes = sizes.metadata(peer, ctx.rank);
     if (nbytes > 0) {
       assert(peer != ctx.rank);
+      // Note: Size of this buffer is taken into account in the mapper:
       recv_metadata.insert(
         {peer, legate::create_buffer<uint8_t>(nbytes, Memory::Kind::Z_COPY_MEM)});
     }

diff --git a/cpp/tests/test_cudf_interop.cpp b/cpp/tests/test_cudf_interop.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -39,7 +39,8 @@ namespace {
 static const char* library_name = "test.cudf_interop";
 
 struct RoundTripTableTask : public legate::LegateTask<RoundTripTableTask> {
-  static constexpr auto TASK_ID = legate::LocalTaskID{0};
+  static constexpr auto TASK_ID             = legate::LocalTaskID{0};
+  static constexpr auto GPU_VARIANT_OPTIONS = legate::VariantOptions{}.with_has_allocations(true);
 
   static void gpu_variant(legate::TaskContext context)
   {

diff --git a/cpp/tests/test_repartition_by_hash.cpp b/cpp/tests/test_repartition_by_hash.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -38,7 +38,8 @@ namespace {
 static const char* library_name = "test.repartition_by_hash";
 
 struct CheckHash : public legate::LegateTask<CheckHash> {
-  static constexpr auto TASK_ID = legate::LocalTaskID{0};
+  static constexpr auto TASK_ID             = legate::LocalTaskID{0};
+  static constexpr auto GPU_VARIANT_OPTIONS = legate::VariantOptions{}.with_has_allocations(true);
 
   static void gpu_variant(legate::TaskContext context)
   {

diff --git a/cpp/tests/test_task.cpp b/cpp/tests/test_task.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024, NVIDIA CORPORATION.
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -29,7 +29,8 @@ namespace {
 static const char* library_name = "test.global_row_offset";
 
 struct GlobalRowOffsetTask : public legate::LegateTask<GlobalRowOffsetTask> {
-  static constexpr auto TASK_ID = legate::LocalTaskID{0};
+  static constexpr auto TASK_ID             = legate::LocalTaskID{0};
+  static constexpr auto GPU_VARIANT_OPTIONS = legate::VariantOptions{}.with_has_allocations(true);
 
   static void gpu_variant(legate::TaskContext context)
   {
@@ -54,7 +55,8 @@ struct GlobalRowOffsetTask : public legate::LegateTask<GlobalRowOffsetTask> {
 };
 
 struct TaskArgumentMix : public legate::LegateTask<TaskArgumentMix> {
-  static constexpr auto TASK_ID = legate::LocalTaskID{1};
+  static constexpr auto TASK_ID             = legate::LocalTaskID{1};
+  static constexpr auto GPU_VARIANT_OPTIONS = legate::VariantOptions{}.with_has_allocations(true);
 
   static void gpu_variant(legate::TaskContext context)
   {