From 7cc8f4bcb1b8147cdd9fcb87286678e3e43759c3 Mon Sep 17 00:00:00 2001
From: Adel Johar <adel.johar@amd.com>
Date: Tue, 28 Jan 2025 17:13:25 +0100
Subject: [PATCH] Docs: Add xnack to unified memory page

---
 .../memory_management/unified_memory.rst      | 117 ++++++++++++++++++
 1 file changed, 117 insertions(+)
diff --git a/docs/how-to/hip_runtime_api/memory_management/unified_memory.rst b/docs/how-to/hip_runtime_api/memory_management/unified_memory.rst
index c253416928..13fddae8df 100644
--- a/docs/how-to/hip_runtime_api/memory_management/unified_memory.rst
+++ b/docs/how-to/hip_runtime_api/memory_management/unified_memory.rst
@@ -231,6 +231,123 @@ functions on ROCm and CUDA, both with and without HMM support.
         making an explicit copy, like a normal memory access, hence the term
         "zero copy".
 
+.. _xnack:
+
+XNACK
+-----
+
+On a subset of GPUs, such as the MI200, there is an option to automatically migrate pages
+of memory between host and device. This is important for managed memory, where the locality
+of the data is important for performance.
+Depending on the system, page migration may be disabled by default in which case managed
+memory will act like pinned host memory and suffer degraded performance.
+
+**XNACK** describes the GPUs ability to retry memory accesses that failed due a page fault
+(which normally would lead to a memory access error), and instead retrieve the missing page.
+
+This also affects memory allocated by the system as indicated by the previous table.
+
+Below is a small example that demonstrates an explicit page fault and how **XNACK** affects
+the page fault behavior.
+
+.. code-block:: cpp
+
+    #include <hip/hip_runtime.h>
+    #include <iostream>
+
+    #define HIP_CHECK(expression)              \
+    {                                          \
+        const hipError_t err = expression;     \
+        if(err != hipSuccess){                 \
+            std::cerr << "HIP error: "         \
+                << hipGetErrorString(err)      \
+                << " at " << __LINE__ << "\n"; \
+        }                                      \
+    }
+
+    __global__ void write_to_memory(int* data, int size)
+    {
+        int idx = blockIdx.x * blockDim.x + threadIdx.x;
+        if (idx < size)
+        {
+            // Writing to memory that may not have been allocated in GPU memory
+            data[idx] = idx * 2; // Triggers a page fault if not resident
+        }
+    }
+
+    int main()
+    {
+        const int N = 1024; // 1K elements
+        int* data;
+
+        // Allocate unified memory
+        HIP_CHECK(hipMallocManaged(&data, N * sizeof(int)));
+
+        // Intentionally don't initialize or prefetch any part of the data
+        // No initialization: data is uninitialized but accessible
+
+        // Launch kernel that writes to all elements
+        hipLaunchKernelGGL(write_to_memory, dim3(4), dim3(256), 0, 0, data, N);
+
+        // Synchronize to ensure kernel execution and fault resolution
+        HIP_CHECK(hipDeviceSynchronize());
+
+        // Print a few results
+        for (int i = 0; i < 10; ++i)
+        {
+            std::cout << "data[" << i << "] = " << data[i] << std::endl;
+        }
+
+        // Free memory
+        HIP_CHECK(hipFree(data));
+        return 0;
+    }
+
+
+The key behaviors in the example above are as follows:-
+
+#. | No Prefetch or Initialization: The memory is allocated using hipMallocManaged, but
+   | it's not initialized or explicitly prefetched to the GPU.
+
+#. | Kernel Write: The kernel writes to the entire array, including memory locations
+   | that haven't been allocated in GPU memory yet. This triggers page faults for pages
+   | not currently mapped to the GPU.
+
+#. | If **XNACK** is enabled, page faults are handled gracefully: the runtime allocates
+   | or fetches the missing pages as needed, ensuring correct execution. If **XNACK** is
+   | disabled, the GPU would not handle the page faults, leading to undefined behavior.
+
+To check if page migration is available on a platform, use ``rocminfo``:
+
+.. code-block:: bash
+
+    $ rocminfo | grep xnack
+    Name:                    amdgcn-amd-amdhsa--gfx90a:sramecc+:xnack-
+
+Here, ``xnack-`` means that XNACK is available but is disabled by default.
+Turning on XNACK by setting the environment variable ``HSA_XNACK=1`` and gives
+the expected result, ``xnack+``:
+
+.. code-block:: bash
+
+    $ HSA_XNACK=1 rocminfo | grep xnack
+    Name:                    amdgcn-amd-amdhsa--gfx90a:sramecc+:xnack+
+
+``hipcc`` by default will generate code that runs correctly with both XNACK enabled or disabled.
+Setting the ``--offload-arch=``-option with ``xnack+`` or ``xnack-`` forces code to
+be only run with XNACK enabled or disabled respectively.
+
+.. code-block:: bash
+
+    # Compiled kernels will run regardless if XNACK is enabled or is disabled.
+    hipcc --offload-arch=gfx90a
+
+    # Compiled kernels will only be run if XNACK is enabled with XNACK=1.
+    hipcc --offload-arch=gfx90a:xnack+
+
+    # Compiled kernels will only be run if XNACK is disabled with XNACK=0.
+    hipcc --offload-arch=gfx90a:xnack-
+
 .. _checking unified memory support:
 
 Checking unified memory support