Remove StreamExecutor::UnifiedMemoryAllocate and ::UnifiedMemoryDeallocate.

All callers are migrated to StreamExecutor::CreateMemoryAllocator(MemoryType::kUnified).

PiperOrigin-RevId: 719391711
klucke authored and Google-ML-Automation committed Jan 24, 2025
1 parent 8ef2572 commit 21391be
Showing 7 changed files with 45 additions and 104 deletions.
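
For context, a minimal sketch of the caller-side migration the commit message describes — hedged: `executor`, `allocator`, and `allocation` are illustrative names, and the sketch assumes `MemoryAllocator::Allocate` and `MemoryAllocation::opaque()` keep their usual StreamExecutor signatures; only `CreateMemoryAllocator` and `MemoryType::kUnified` are taken directly from the diffs below:

    // Before: raw pointer out, manual deallocation required.
    //   void* ptr = executor->UnifiedMemoryAllocate(size);
    //   ...
    //   executor->UnifiedMemoryDeallocate(ptr);

    // After: RAII allocations from a per-memory-type allocator; the memory
    // is freed by the allocation's deleter when `allocation` is destroyed.
    TF_ASSIGN_OR_RETURN(std::unique_ptr<MemoryAllocator> allocator,
                        executor->CreateMemoryAllocator(MemoryType::kUnified));
    TF_ASSIGN_OR_RETURN(std::unique_ptr<MemoryAllocation> allocation,
                        allocator->Allocate(size));
    void* ptr = allocation->opaque();  // valid until `allocation` is destroyed

The deleter baked into each GenericMemoryAllocation (see cuda_executor.cc and rocm_executor.cc below) is what replaces the explicit UnifiedMemoryDeallocate call.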
56 changes: 20 additions & 36 deletions xla/stream_executor/cuda/cuda_executor.cc
@@ -567,50 +567,34 @@ CudaExecutor::CreateMemoryAllocator(MemoryType type) {
     return std::make_unique<GenericMemoryAllocator>(
         [this](uint64_t size)
             -> absl::StatusOr<std::unique_ptr<MemoryAllocation>> {
-          void* ptr = UnifiedMemoryAllocate(size);
-          if (ptr == nullptr) {
-            return absl::InternalError("Failed to allocate unified memory");
-          }
+          std::unique_ptr<ActivateContext> activation = Activate();
+          CUdeviceptr result = 0;
+          // "Portable" memory is visible to all CUDA contexts. Safe for our use
+          // model.
+          TF_RETURN_IF_ERROR(cuda::ToStatus(
+              cuMemAllocManaged(&result, size, CU_MEM_ATTACH_GLOBAL)));
+          void* ptr = reinterpret_cast<void*>(result);
+          VLOG(2) << "allocated " << ptr << " for context " << cuda_context_
+                  << " of " << size << " bytes in unified memory";
           return std::make_unique<GenericMemoryAllocation>(
-              ptr, size, [this](void* ptr, uint64_t size) {
-                UnifiedMemoryDeallocate(ptr);
+              ptr, size, [this](void* location, uint64_t size) {
+                std::unique_ptr<ActivateContext> activation = Activate();
+                CUdeviceptr pointer = absl::bit_cast<CUdeviceptr>(location);
+                auto status = cuda::ToStatus(cuMemFree(pointer));
+                if (!status.ok()) {
+                  LOG(ERROR) << "failed to free unified memory at " << location
+                             << "; result: " << status;
+                } else {
+                  VLOG(2) << "deallocated unified memory at " << location
+                          << " for context " << cuda_context_;
+                }
               });
         });
   }
   return absl::UnimplementedError(
       absl::StrFormat("Unsupported memory type %d", type));
 }
 
-void CudaExecutor::UnifiedMemoryDeallocate(void* location) {
-  std::unique_ptr<ActivateContext> activation = Activate();
-  CUdeviceptr pointer = absl::bit_cast<CUdeviceptr>(location);
-  auto status = cuda::ToStatus(cuMemFree(pointer));
-  if (!status.ok()) {
-    LOG(ERROR) << "failed to free unified memory at " << location
-               << "; result: " << status;
-  } else {
-    VLOG(2) << "deallocated unified memory at " << location << " for context "
-            << cuda_context_;
-  }
-}
-
-void* CudaExecutor::UnifiedMemoryAllocate(uint64_t size) {
-  std::unique_ptr<ActivateContext> activation = Activate();
-  CUdeviceptr result = 0;
-  // "Portable" memory is visible to all CUDA contexts. Safe for our use model.
-  auto status =
-      cuda::ToStatus(cuMemAllocManaged(&result, size, CU_MEM_ATTACH_GLOBAL));
-  if (!status.ok()) {
-    LOG(ERROR) << "failed to alloc " << size
-               << " bytes unified memory; result: " << status;
-    return nullptr;
-  }
-  void* ptr = reinterpret_cast<void*>(result);
-  VLOG(2) << "allocated " << ptr << " for context " << cuda_context_ << " of "
-          << size << " bytes in unified memory";
-  return ptr;
-}
-
 absl::Status CudaExecutor::Init() {
   TF_ASSIGN_OR_RETURN(device_, GetDevice(device_ordinal()));
   TF_ASSIGN_OR_RETURN(CudaContext * context,
2 changes: 0 additions & 2 deletions xla/stream_executor/cuda/cuda_executor.h
@@ -116,8 +116,6 @@ class CudaExecutor : public GpuExecutor {
       const override {
     return CudaExecutor::CreateDeviceDescription(device_ordinal());
   }
-  void* UnifiedMemoryAllocate(uint64_t size) override;
-  void UnifiedMemoryDeallocate(void* location) override;
   absl::StatusOr<std::unique_ptr<MemoryAllocation>> HostMemoryAllocate(
       uint64_t size) override;
 
20 changes: 4 additions & 16 deletions xla/stream_executor/integrations/device_mem_allocator.h
@@ -33,21 +33,13 @@ class DeviceMemAllocator : public tsl::SubAllocator {
   // Note: stream_exec cannot be null.
   DeviceMemAllocator(StreamExecutor* stream_exec,
                      tsl::PlatformDeviceId device_id, MemoryType memory_type,
-                     const std::vector<Visitor>& alloc_visitors)
+                     const std::vector<Visitor>& alloc_visitors = {})
       : SubAllocator(alloc_visitors, {}),
         stream_exec_(stream_exec),
         device_id_(device_id),
         memory_type_(memory_type) {
     CHECK(stream_exec_ != nullptr);
-  }
-
-  DeviceMemAllocator(StreamExecutor* stream_exec,
-                     tsl::PlatformDeviceId device_id, MemoryType memory_type)
-      : SubAllocator({}, {}),
-        stream_exec_(stream_exec),
-        device_id_(device_id),
-        memory_type_(memory_type) {
-    CHECK(stream_exec_ != nullptr);
+    CHECK(memory_type_ != MemoryType::kUnified);
   }
 
   ~DeviceMemAllocator() override = default;
@@ -59,9 +51,7 @@ class DeviceMemAllocator : public tsl::SubAllocator {
     void* ptr = nullptr;
     *bytes_received = num_bytes;
     if (num_bytes > 0) {
-      if (memory_type_ == MemoryType::kUnified) {
-        ptr = stream_exec_->UnifiedMemoryAllocate(num_bytes);
-      } else if (memory_type_ == MemoryType::kCollective) {
+      if (memory_type_ == MemoryType::kCollective) {
         auto status_or = stream_exec_->CollectiveMemoryAllocate(num_bytes);
         CHECK(status_or.ok()) << status_or.status().message();
         ptr = status_or.value();
@@ -83,9 +73,7 @@ class DeviceMemAllocator : public tsl::SubAllocator {
 
     if (ptr != nullptr) {
       VisitFree(ptr, device_id_.value(), num_bytes);
-      if (memory_type_ == MemoryType::kUnified) {
-        stream_exec_->UnifiedMemoryDeallocate(ptr);
-      } else if (memory_type_ == MemoryType::kCollective) {
+      if (memory_type_ == MemoryType::kCollective) {
         auto status = stream_exec_->CollectiveMemoryDeallocate(ptr);
         CHECK(status.ok()) << status.message();
       } else if (memory_type_ == MemoryType::kHost) {
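
Net effect in this file: DeviceMemAllocator no longer serves unified memory at all — the new CHECK rejects MemoryType::kUnified at construction time rather than at allocation time. A hedged sketch of the resulting contract (the device ID and variable names are illustrative, not from the diff; `MemoryType::kDevice` is assumed to be the default device-memory enumerator):

    // Still supported: device, collective, and host memory; alloc_visitors
    // now defaults to {}, so the three-argument form remains valid.
    DeviceMemAllocator ok(stream_exec, tsl::PlatformDeviceId(0),
                          MemoryType::kDevice);

    // CHECK-fails at construction; unified memory now comes from
    // stream_exec->CreateMemoryAllocator(MemoryType::kUnified) instead.
    DeviceMemAllocator crashes(stream_exec, tsl::PlatformDeviceId(0),
                               MemoryType::kUnified);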
2 changes: 0 additions & 2 deletions xla/stream_executor/mock_stream_executor.h
@@ -65,8 +65,6 @@ class MockStreamExecutor : public StreamExecutor {
   MOCK_METHOD(DeviceMemoryBase, Allocate, (uint64_t size, int64_t memory_space),
               (override));
   MOCK_METHOD(void, Deallocate, (DeviceMemoryBase * mem), (override));
-  MOCK_METHOD(void*, UnifiedMemoryAllocate, (uint64_t size), (override));
-  MOCK_METHOD(void, UnifiedMemoryDeallocate, (void* mem), (override));
   MOCK_METHOD(absl::StatusOr<void*>, CollectiveMemoryAllocate, (uint64_t size),
               (override));
   MOCK_METHOD(absl::Status, CollectiveMemoryDeallocate, (void* mem),
56 changes: 21 additions & 35 deletions xla/stream_executor/rocm/rocm_executor.cc
@@ -754,49 +754,35 @@ RocmExecutor::CreateMemoryAllocator(MemoryType type) {
     return std::make_unique<GenericMemoryAllocator>(
         [this](uint64_t size)
             -> absl::StatusOr<std::unique_ptr<MemoryAllocation>> {
-          void* ptr = UnifiedMemoryAllocate(size);
-          if (ptr == nullptr) {
-            return absl::InternalError("Failed to allocate unified memory");
-          }
+          std::unique_ptr<ActivateContext> activation = Activate();
+          hipDeviceptr_t result = 0;
+          // "managed" memory is visible to both CPU and GPU.
+          TF_RETURN_IF_ERROR(ToStatus(
+              wrap::hipMallocManaged(&result, size, hipMemAttachGlobal),
+              "Failed to allocate managed memory"));
+          void* ptr = reinterpret_cast<void*>(result);
+          VLOG(2) << "allocated " << ptr << " for context " << rocm_context_
+                  << " of " << size << " bytes in unified memory";
           return std::make_unique<GenericMemoryAllocation>(
-              ptr, size, [this](void* ptr, uint64_t size) {
-                UnifiedMemoryDeallocate(ptr);
+              ptr, size, [this](void* location, uint64_t size) {
+                std::unique_ptr<ActivateContext> activation = Activate();
+                hipDeviceptr_t pointer =
+                    absl::bit_cast<hipDeviceptr_t>(location);
+                hipError_t res = wrap::hipFree(pointer);
+                if (res != hipSuccess) {
+                  LOG(ERROR) << "failed to free unified memory at " << location
+                             << "; result: " << ToString(res);
+                } else {
+                  VLOG(2) << "deallocated unified memory at " << location
+                          << " for context " << rocm_context_;
+                }
               });
         });
   }
   return absl::UnimplementedError(
       absl::StrFormat("Unsupported memory type %d", type));
 }
 
-void* RocmExecutor::UnifiedMemoryAllocate(uint64_t size) {
-  std::unique_ptr<ActivateContext> activation = Activate();
-  hipDeviceptr_t result = 0;
-  // "managed" memory is visible to both CPU and GPU.
-  hipError_t res = wrap::hipMallocManaged(&result, size, hipMemAttachGlobal);
-  if (res != hipSuccess) {
-    LOG(ERROR) << "failed to alloc " << size
-               << " bytes unified memory; result: " << ToString(res);
-    return nullptr;
-  }
-  void* ptr = reinterpret_cast<void*>(result);
-  VLOG(2) << "allocated " << ptr << " for context " << rocm_context_ << " of "
-          << size << " bytes in unified memory";
-  return ptr;
-}
-
-void RocmExecutor::UnifiedMemoryDeallocate(void* location) {
-  std::unique_ptr<ActivateContext> activation = Activate();
-  hipDeviceptr_t pointer = absl::bit_cast<hipDeviceptr_t>(location);
-  hipError_t res = wrap::hipFree(pointer);
-  if (res != hipSuccess) {
-    LOG(ERROR) << "failed to free unified memory at " << location
-               << "; result: " << ToString(res);
-  } else {
-    VLOG(2) << "deallocated unified memory at " << location << " for context "
-            << rocm_context_;
-  }
-}
-
 bool RocmExecutor::SynchronizeAllActivity() {
   return rocm_context_->Synchronize().ok();
 }
3 changes: 0 additions & 3 deletions xla/stream_executor/rocm/rocm_executor.h
@@ -107,9 +107,6 @@ class RocmExecutor : public GpuExecutor {
       const override {
     return RocmExecutor::CreateDeviceDescription(device_ordinal());
   }
-  void* UnifiedMemoryAllocate(uint64_t size) override;
-
-  void UnifiedMemoryDeallocate(void* location) override;
   absl::StatusOr<std::unique_ptr<MemoryAllocation>> HostMemoryAllocate(
       uint64_t size) override;
   void HostMemoryDeallocate(void* location) override;
10 changes: 0 additions & 10 deletions xla/stream_executor/stream_executor.h
@@ -156,16 +156,6 @@ class StreamExecutor {
   // Deallocation of a nullptr-representative value is permitted.
   virtual void Deallocate(DeviceMemoryBase* mem) = 0;
 
-  // Allocates unified memory space of the given size, if supported.
-  // See
-  // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#um-unified-memory-programming-hd
-  // for more details on unified memory.
-  virtual void* UnifiedMemoryAllocate(uint64_t size) { return nullptr; }
-
-  // Deallocates unified memory space previously allocated with
-  // UnifiedMemoryAllocate.
-  virtual void UnifiedMemoryDeallocate(void* mem) {}
-
   // Allocates collective device memory using ncclMemAlloc.
   // See
   // https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/usage/bufferreg.html
