From 7cc8f4bcb1b8147cdd9fcb87286678e3e43759c3 Mon Sep 17 00:00:00 2001 From: Adel Johar Date: Tue, 28 Jan 2025 17:13:25 +0100 Subject: [PATCH] Docs: Add xnack to unified memory page --- .../memory_management/unified_memory.rst | 117 ++++++++++++++++++ 1 file changed, 117 insertions(+) diff --git a/docs/how-to/hip_runtime_api/memory_management/unified_memory.rst b/docs/how-to/hip_runtime_api/memory_management/unified_memory.rst index c253416928..13fddae8df 100644 --- a/docs/how-to/hip_runtime_api/memory_management/unified_memory.rst +++ b/docs/how-to/hip_runtime_api/memory_management/unified_memory.rst @@ -231,6 +231,123 @@ functions on ROCm and CUDA, both with and without HMM support. making an explicit copy, like a normal memory access, hence the term "zero copy". +.. _xnack: + +XNACK +----- + +On a subset of GPUs, such as the MI200, there is an option to automatically migrate pages +of memory between host and device. This is important for managed memory, where the locality +of the data is important for performance. +Depending on the system, page migration may be disabled by default in which case managed +memory will act like pinned host memory and suffer degraded performance. + +**XNACK** describes the GPUs ability to retry memory accesses that failed due a page fault +(which normally would lead to a memory access error), and instead retrieve the missing page. + +This also affects memory allocated by the system as indicated by the previous table. + +Below is a small example that demonstrates an explicit page fault and how **XNACK** affects +the page fault behavior. + +.. code-block:: cpp + + #include + #include + + #define HIP_CHECK(expression) \ + { \ + const hipError_t err = expression; \ + if(err != hipSuccess){ \ + std::cerr << "HIP error: " \ + << hipGetErrorString(err) \ + << " at " << __LINE__ << "\n"; \ + } \ + } + + __global__ void write_to_memory(int* data, int size) + { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < size) + { + // Writing to memory that may not have been allocated in GPU memory + data[idx] = idx * 2; // Triggers a page fault if not resident + } + } + + int main() + { + const int N = 1024; // 1K elements + int* data; + + // Allocate unified memory + HIP_CHECK(hipMallocManaged(&data, N * sizeof(int))); + + // Intentionally don't initialize or prefetch any part of the data + // No initialization: data is uninitialized but accessible + + // Launch kernel that writes to all elements + hipLaunchKernelGGL(write_to_memory, dim3(4), dim3(256), 0, 0, data, N); + + // Synchronize to ensure kernel execution and fault resolution + HIP_CHECK(hipDeviceSynchronize()); + + // Print a few results + for (int i = 0; i < 10; ++i) + { + std::cout << "data[" << i << "] = " << data[i] << std::endl; + } + + // Free memory + HIP_CHECK(hipFree(data)); + return 0; + } + + +The key behaviors in the example above are as follows:- + +#. | No Prefetch or Initialization: The memory is allocated using hipMallocManaged, but + | it's not initialized or explicitly prefetched to the GPU. + +#. | Kernel Write: The kernel writes to the entire array, including memory locations + | that haven't been allocated in GPU memory yet. This triggers page faults for pages + | not currently mapped to the GPU. + +#. | If **XNACK** is enabled, page faults are handled gracefully: the runtime allocates + | or fetches the missing pages as needed, ensuring correct execution. If **XNACK** is + | disabled, the GPU would not handle the page faults, leading to undefined behavior. + +To check if page migration is available on a platform, use ``rocminfo``: + +.. code-block:: bash + + $ rocminfo | grep xnack + Name: amdgcn-amd-amdhsa--gfx90a:sramecc+:xnack- + +Here, ``xnack-`` means that XNACK is available but is disabled by default. +Turning on XNACK by setting the environment variable ``HSA_XNACK=1`` and gives +the expected result, ``xnack+``: + +.. code-block:: bash + + $ HSA_XNACK=1 rocminfo | grep xnack + Name: amdgcn-amd-amdhsa--gfx90a:sramecc+:xnack+ + +``hipcc`` by default will generate code that runs correctly with both XNACK enabled or disabled. +Setting the ``--offload-arch=``-option with ``xnack+`` or ``xnack-`` forces code to +be only run with XNACK enabled or disabled respectively. + +.. code-block:: bash + + # Compiled kernels will run regardless if XNACK is enabled or is disabled. + hipcc --offload-arch=gfx90a + + # Compiled kernels will only be run if XNACK is enabled with XNACK=1. + hipcc --offload-arch=gfx90a:xnack+ + + # Compiled kernels will only be run if XNACK is disabled with XNACK=0. + hipcc --offload-arch=gfx90a:xnack- + .. _checking unified memory support: Checking unified memory support