Remove StreamExecutor::UnifiedMemoryAllocate and ::UnifiedMemoryDeallocate.

All callers are migrated to StreamExecutor::CreateMemoryAllocator(MemoryType::kUnified).

PiperOrigin-RevId: 719391711
klucke authored and Google-ML-Automation committed Jan 24, 2025
1 parent 8ef2572 commit 21391be
Showing 7 changed files with 45 additions and 104 deletions.
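
For context, a minimal sketch of the caller-side migration the commit message describes — hedged: `executor`, `allocator`, and `allocation` are illustrative names, and the sketch assumes `MemoryAllocator::Allocate` and `MemoryAllocation::opaque()` keep their usual StreamExecutor signatures; only `CreateMemoryAllocator` and `MemoryType::kUnified` are taken directly from the diffs below:

    // Before: raw pointer out, manual deallocation required.
    //   void* ptr = executor->UnifiedMemoryAllocate(size);
    //   ...
    //   executor->UnifiedMemoryDeallocate(ptr);

    // After: RAII allocations from a per-memory-type allocator; the memory
    // is freed by the allocation's deleter when `allocation` is destroyed.
    TF_ASSIGN_OR_RETURN(std::unique_ptr<MemoryAllocator> allocator,
                        executor->CreateMemoryAllocator(MemoryType::kUnified));
    TF_ASSIGN_OR_RETURN(std::unique_ptr<MemoryAllocation> allocation,
                        allocator->Allocate(size));
    void* ptr = allocation->opaque();  // valid until `allocation` is destroyed

The deleter baked into each GenericMemoryAllocation (see cuda_executor.cc and rocm_executor.cc below) is what replaces the explicit UnifiedMemoryDeallocate call.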
56 changes: 20 additions & 36 deletions xla/stream_executor/cuda/cuda_executor.cc
@@ -567,50 +567,34 @@ CudaExecutor::CreateMemoryAllocator(MemoryType type) {
     return std::make_unique<GenericMemoryAllocator>(
         [this](uint64_t size)
             -> absl::StatusOr<std::unique_ptr<MemoryAllocation>> {
-          void* ptr = UnifiedMemoryAllocate(size);
-          if (ptr == nullptr) {
-            return absl::InternalError("Failed to allocate unified memory");
-          }
+          std::unique_ptr<ActivateContext> activation = Activate();
+          CUdeviceptr result = 0;
+          // "Portable" memory is visible to all CUDA contexts. Safe for our use
+          // model.
+          TF_RETURN_IF_ERROR(cuda::ToStatus(
+              cuMemAllocManaged(&result, size, CU_MEM_ATTACH_GLOBAL)));
+          void* ptr = reinterpret_cast<void*>(result);
+          VLOG(2) << "allocated " << ptr << " for context " << cuda_context_
+                  << " of " << size << " bytes in unified memory";
           return std::make_unique<GenericMemoryAllocation>(
-              ptr, size, [this](void* ptr, uint64_t size) {
-                UnifiedMemoryDeallocate(ptr);
+              ptr, size, [this](void* location, uint64_t size) {
+                std::unique_ptr<ActivateContext> activation = Activate();
+                CUdeviceptr pointer = absl::bit_cast<CUdeviceptr>(location);
+                auto status = cuda::ToStatus(cuMemFree(pointer));
+                if (!status.ok()) {
+                  LOG(ERROR) << "failed to free unified memory at " << location
+                             << "; result: " << status;
+                } else {
+                  VLOG(2) << "deallocated unified memory at " << location
+                          << " for context " << cuda_context_;
+                }
               });
         });
   }
   return absl::UnimplementedError(
       absl::StrFormat("Unsupported memory type %d", type));
 }
 
-void CudaExecutor::UnifiedMemoryDeallocate(void* location) {
-  std::unique_ptr<ActivateContext> activation = Activate();
-  CUdeviceptr pointer = absl::bit_cast<CUdeviceptr>(location);
-  auto status = cuda::ToStatus(cuMemFree(pointer));
-  if (!status.ok()) {
-    LOG(ERROR) << "failed to free unified memory at " << location
-               << "; result: " << status;
-  } else {
-    VLOG(2) << "deallocated unified memory at " << location << " for context "
-            << cuda_context_;
-  }
-}
-
-void* CudaExecutor::UnifiedMemoryAllocate(uint64_t size) {
-  std::unique_ptr<ActivateContext> activation = Activate();
-  CUdeviceptr result = 0;
-  // "Portable" memory is visible to all CUDA contexts. Safe for our use model.
-  auto status =
-      cuda::ToStatus(cuMemAllocManaged(&result, size, CU_MEM_ATTACH_GLOBAL));
-  if (!status.ok()) {
-    LOG(ERROR) << "failed to alloc " << size
-               << " bytes unified memory; result: " << status;
-    return nullptr;
-  }
-  void* ptr = reinterpret_cast<void*>(result);
-  VLOG(2) << "allocated " << ptr << " for context " << cuda_context_ << " of "
-          << size << " bytes in unified memory";
-  return ptr;
-}
-
 absl::Status CudaExecutor::Init() {
   TF_ASSIGN_OR_RETURN(device_, GetDevice(device_ordinal()));
   TF_ASSIGN_OR_RETURN(CudaContext * context,
2 changes: 0 additions & 2 deletions xla/stream_executor/cuda/cuda_executor.h
@@ -116,8 +116,6 @@ class CudaExecutor : public GpuExecutor {
       const override {
     return CudaExecutor::CreateDeviceDescription(device_ordinal());
   }
-  void* UnifiedMemoryAllocate(uint64_t size) override;
-  void UnifiedMemoryDeallocate(void* location) override;
   absl::StatusOr<std::unique_ptr<MemoryAllocation>> HostMemoryAllocate(
       uint64_t size) override;
 
20 changes: 4 additions & 16 deletions xla/stream_executor/integrations/device_mem_allocator.h
@@ -33,21 +33,13 @@ class DeviceMemAllocator : public tsl::SubAllocator {
   // Note: stream_exec cannot be null.
   DeviceMemAllocator(StreamExecutor* stream_exec,
                      tsl::PlatformDeviceId device_id, MemoryType memory_type,
-                     const std::vector<Visitor>& alloc_visitors)
+                     const std::vector<Visitor>& alloc_visitors = {})
       : SubAllocator(alloc_visitors, {}),
         stream_exec_(stream_exec),
         device_id_(device_id),
         memory_type_(memory_type) {
     CHECK(stream_exec_ != nullptr);
-  }
-
-  DeviceMemAllocator(StreamExecutor* stream_exec,
-                     tsl::PlatformDeviceId device_id, MemoryType memory_type)
-      : SubAllocator({}, {}),
-        stream_exec_(stream_exec),
-        device_id_(device_id),
-        memory_type_(memory_type) {
-    CHECK(stream_exec_ != nullptr);
+    CHECK(memory_type_ != MemoryType::kUnified);
   }
 
   ~DeviceMemAllocator() override = default;
@@ -59,9 +51,7 @@ class DeviceMemAllocator : public tsl::SubAllocator {
     void* ptr = nullptr;
     *bytes_received = num_bytes;
     if (num_bytes > 0) {
-      if (memory_type_ == MemoryType::kUnified) {
-        ptr = stream_exec_->UnifiedMemoryAllocate(num_bytes);
-      } else if (memory_type_ == MemoryType::kCollective) {
+      if (memory_type_ == MemoryType::kCollective) {
         auto status_or = stream_exec_->CollectiveMemoryAllocate(num_bytes);
         CHECK(status_or.ok()) << status_or.status().message();
         ptr = status_or.value();
@@ -83,9 +73,7 @@ class DeviceMemAllocator : public tsl::SubAllocator {
 
     if (ptr != nullptr) {
       VisitFree(ptr, device_id_.value(), num_bytes);
-      if (memory_type_ == MemoryType::kUnified) {
-        stream_exec_->UnifiedMemoryDeallocate(ptr);
-      } else if (memory_type_ == MemoryType::kCollective) {
+      if (memory_type_ == MemoryType::kCollective) {
         auto status = stream_exec_->CollectiveMemoryDeallocate(ptr);
         CHECK(status.ok()) << status.message();
       } else if (memory_type_ == MemoryType::kHost) {
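
Net effect in this file: DeviceMemAllocator no longer serves unified memory at all — the new CHECK rejects MemoryType::kUnified at construction time rather than at allocation time. A hedged sketch of the resulting contract (the device ID and variable names are illustrative, not from the diff; `MemoryType::kDevice` is assumed to be the default device-memory enumerator):

    // Still supported: device, collective, and host memory; alloc_visitors
    // now defaults to {}, so the three-argument form remains valid.
    DeviceMemAllocator ok(stream_exec, tsl::PlatformDeviceId(0),
                          MemoryType::kDevice);

    // CHECK-fails at construction; unified memory now comes from
    // stream_exec->CreateMemoryAllocator(MemoryType::kUnified) instead.
    DeviceMemAllocator crashes(stream_exec, tsl::PlatformDeviceId(0),
                               MemoryType::kUnified);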
2 changes: 0 additions & 2 deletions xla/stream_executor/mock_stream_executor.h
@@ -65,8 +65,6 @@ class MockStreamExecutor : public StreamExecutor {
   MOCK_METHOD(DeviceMemoryBase, Allocate, (uint64_t size, int64_t memory_space),
               (override));
   MOCK_METHOD(void, Deallocate, (DeviceMemoryBase * mem), (override));
-  MOCK_METHOD(void*, UnifiedMemoryAllocate, (uint64_t size), (override));
-  MOCK_METHOD(void, UnifiedMemoryDeallocate, (void* mem), (override));
   MOCK_METHOD(absl::StatusOr<void*>, CollectiveMemoryAllocate, (uint64_t size),
               (override));
   MOCK_METHOD(absl::Status, CollectiveMemoryDeallocate, (void* mem),
56 changes: 21 additions & 35 deletions xla/stream_executor/rocm/rocm_executor.cc
@@ -754,49 +754,35 @@ RocmExecutor::CreateMemoryAllocator(MemoryType type) {
     return std::make_unique<GenericMemoryAllocator>(
         [this](uint64_t size)
             -> absl::StatusOr<std::unique_ptr<MemoryAllocation>> {
-          void* ptr = UnifiedMemoryAllocate(size);
-          if (ptr == nullptr) {
-            return absl::InternalError("Failed to allocate unified memory");
-          }
+          std::unique_ptr<ActivateContext> activation = Activate();
+          hipDeviceptr_t result = 0;
+          // "managed" memory is visible to both CPU and GPU.
+          TF_RETURN_IF_ERROR(ToStatus(
+              wrap::hipMallocManaged(&result, size, hipMemAttachGlobal),
+              "Failed to allocate managed memory"));
+          void* ptr = reinterpret_cast<void*>(result);
+          VLOG(2) << "allocated " << ptr << " for context " << rocm_context_
+                  << " of " << size << " bytes in unified memory";
           return std::make_unique<GenericMemoryAllocation>(
-              ptr, size, [this](void* ptr, uint64_t size) {
-                UnifiedMemoryDeallocate(ptr);
+              ptr, size, [this](void* location, uint64_t size) {
+                std::unique_ptr<ActivateContext> activation = Activate();
+                hipDeviceptr_t pointer =
+                    absl::bit_cast<hipDeviceptr_t>(location);
+                hipError_t res = wrap::hipFree(pointer);
+                if (res != hipSuccess) {
+                  LOG(ERROR) << "failed to free unified memory at " << location
+                             << "; result: " << ToString(res);
+                } else {
+                  VLOG(2) << "deallocated unified memory at " << location
+                          << " for context " << rocm_context_;
+                }
               });
         });
   }
   return absl::UnimplementedError(
       absl::StrFormat("Unsupported memory type %d", type));
 }
 
-void* RocmExecutor::UnifiedMemoryAllocate(uint64_t size) {
-  std::unique_ptr<ActivateContext> activation = Activate();
-  hipDeviceptr_t result = 0;
-  // "managed" memory is visible to both CPU and GPU.
-  hipError_t res = wrap::hipMallocManaged(&result, size, hipMemAttachGlobal);
-  if (res != hipSuccess) {
-    LOG(ERROR) << "failed to alloc " << size
-               << " bytes unified memory; result: " << ToString(res);
-    return nullptr;
-  }
-  void* ptr = reinterpret_cast<void*>(result);
-  VLOG(2) << "allocated " << ptr << " for context " << rocm_context_ << " of "
-          << size << " bytes in unified memory";
-  return ptr;
-}
-
-void RocmExecutor::UnifiedMemoryDeallocate(void* location) {
-  std::unique_ptr<ActivateContext> activation = Activate();
-  hipDeviceptr_t pointer = absl::bit_cast<hipDeviceptr_t>(location);
-  hipError_t res = wrap::hipFree(pointer);
-  if (res != hipSuccess) {
-    LOG(ERROR) << "failed to free unified memory at " << location
-               << "; result: " << ToString(res);
-  } else {
-    VLOG(2) << "deallocated unified memory at " << location << " for context "
-            << rocm_context_;
-  }
-}
-
 bool RocmExecutor::SynchronizeAllActivity() {
   return rocm_context_->Synchronize().ok();
 }
3 changes: 0 additions & 3 deletions xla/stream_executor/rocm/rocm_executor.h
@@ -107,9 +107,6 @@ class RocmExecutor : public GpuExecutor {
       const override {
     return RocmExecutor::CreateDeviceDescription(device_ordinal());
   }
-  void* UnifiedMemoryAllocate(uint64_t size) override;
-
-  void UnifiedMemoryDeallocate(void* location) override;
   absl::StatusOr<std::unique_ptr<MemoryAllocation>> HostMemoryAllocate(
       uint64_t size) override;
   void HostMemoryDeallocate(void* location) override;
10 changes: 0 additions & 10 deletions xla/stream_executor/stream_executor.h
@@ -156,16 +156,6 @@ class StreamExecutor {
   // Deallocation of a nullptr-representative value is permitted.
   virtual void Deallocate(DeviceMemoryBase* mem) = 0;
 
-  // Allocates unified memory space of the given size, if supported.
-  // See
-  // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#um-unified-memory-programming-hd
-  // for more details on unified memory.
-  virtual void* UnifiedMemoryAllocate(uint64_t size) { return nullptr; }
-
-  // Deallocates unified memory space previously allocated with
-  // UnifiedMemoryAllocate.
-  virtual void UnifiedMemoryDeallocate(void* mem) {}
-
   // Allocates collective device memory using ncclMemAlloc.
   // See
   // https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/usage/bufferreg.html
