diff --git a/.github/workflows/integration-test-backup.yml b/.github/workflows/integration-test-backup.yml index ffc5fe3af..13eb10f0f 100644 --- a/.github/workflows/integration-test-backup.yml +++ b/.github/workflows/integration-test-backup.yml @@ -5,17 +5,20 @@ on: workflow_dispatch jobs: IntegrationTest: runs-on: self-hosted + defaults: + run: + shell: bash strategy: matrix: - container-image: [ghcr.io/microsoft/mscclpp/mscclpp:base-cuda11.8, ghcr.io/microsoft/mscclpp/mscclpp:base-cuda12.1] + cuda: [ cuda11.8, cuda12.1 ] container: - image: ${{ matrix.container-image }} + image: "ghcr.io/microsoft/mscclpp/mscclpp:base-${{ matrix.cuda }}" options: --privileged --ipc=host --gpus=all --ulimit memlock=-1:-1 steps: - name: Checkout - uses: actions/checkout@v2 + uses: actions/checkout@v4 - name: Install CMake run: | @@ -39,35 +42,35 @@ jobs: run: | set -e export PATH=/usr/local/mpi/bin:$PATH - mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -o output.jsonl - mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl - mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl - mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl + mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -o output.jsonl + mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl + mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl + mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN 
./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl - name: Run mscclpp SendRecv test run: | set -e export PATH=/usr/local/mpi/bin:$PATH - mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/sendrecv_test_perf -b 1K -e 1G -f 2 -o output.jsonl + mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/sendrecv_test_perf -b 1K -e 1G -f 2 -o output.jsonl - name: Run mscclpp AllReduce test run: | set -e export PATH=/usr/local/mpi/bin:$PATH - mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -o output.jsonl - mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl - mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl - mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl - mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 4 -o output.jsonl - mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 12M -e 48M -i 3145728 2 -k 5 -o output.jsonl - mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 24K -e 768K -i 24576 -k 6 -w 100 -n 100 -o output.jsonl + mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -o output.jsonl + mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl + mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl + mpirun 
--allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl + mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 4 -o output.jsonl + mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 12M -e 48M -i 3145728 -k 5 -o output.jsonl + mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 24K -e 768K -i 24576 -k 6 -w 100 -n 100 -o output.jsonl - name: Run mscclpp AllToAll test run: | set -e export PATH=/usr/local/mpi/bin:$PATH - mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -o output.jsonl - mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl + mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -o output.jsonl + mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl - name: Check collective primitives performance run: | diff --git a/.github/workflows/ut-backup.yml b/.github/workflows/ut-backup.yml index 736c800e7..9bcbf53b2 100644 --- a/.github/workflows/ut-backup.yml +++ b/.github/workflows/ut-backup.yml @@ -5,18 +5,21 @@ on: workflow_dispatch jobs: UnitTest: runs-on: self-hosted + defaults: + run: + shell: bash timeout-minutes: 30 strategy: matrix: - container-image: [ghcr.io/microsoft/mscclpp/mscclpp:base-cuda11.8, ghcr.io/microsoft/mscclpp/mscclpp:base-cuda12.1] + cuda: [ cuda11.8, cuda12.1 ] container: - image: ${{ matrix.container-image }} + image: "ghcr.io/microsoft/mscclpp/mscclpp:base-${{ matrix.cuda }}" options: --privileged --ipc=host 
--gpus=all --ulimit memlock=-1:-1 steps: - name: Checkout - uses: actions/checkout@v2 + uses: actions/checkout@v4 - name: Build run: | @@ -44,9 +47,9 @@ jobs: run: | set -e export PATH=/usr/local/mpi/bin:$PATH - mpirun -tag-output -np 2 ./build/test/mp_unit_tests - mpirun -tag-output -np 4 ./build/test/mp_unit_tests - mpirun -tag-output -np 8 ./build/test/mp_unit_tests + mpirun --allow-run-as-root -tag-output -np 2 ./build/test/mp_unit_tests + mpirun --allow-run-as-root -tag-output -np 4 ./build/test/mp_unit_tests + mpirun --allow-run-as-root -tag-output -np 8 ./build/test/mp_unit_tests working-directory: ${{ github.workspace }} - name: PyTests @@ -54,10 +57,10 @@ jobs: set -e export PATH=/usr/local/mpi/bin:$PATH cd build && make pylib-copy - if [[ '${{ matrix.container-image }}' == *'cuda11'* ]]; then - pip3 install -r ../python/test/requirements_cu11.txt + if [[ '${{ matrix.cuda }}' == 'cuda11'* ]]; then + python3 -m pip install -r ../python/test/requirements_cu11.txt else - pip3 install -r ../python/test/requirements_cu12.txt + python3 -m pip install -r ../python/test/requirements_cu12.txt fi - mpirun -tag-output -np 8 ~/.local/bin/pytest ../python/test/test_mscclpp.py -x + mpirun --allow-run-as-root -tag-output -np 8 $(which pytest) ../python/test/test_mscclpp.py -x working-directory: ${{ github.workspace }} diff --git a/include/mscclpp/packet.hpp b/include/mscclpp/packet.hpp index d9accd1d4..6f95bb092 100644 --- a/include/mscclpp/packet.hpp +++ b/include/mscclpp/packet.hpp @@ -48,7 +48,7 @@ union LLPacket { /// @param flag The flag to read. /// @param data The 8-byte data read. /// @return True if the flag is not equal to the given flag. 
- __forceinline__ __device__ bool readOnce(uint32_t flag, uint2& data) { + __forceinline__ __device__ bool readOnce(uint32_t flag, uint2& data) const { uint32_t flag1, flag2; asm volatile("ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(data.x), "=r"(flag1), "=r"(data.y), "=r"(flag2) @@ -60,7 +60,7 @@ union LLPacket { /// @param flag The flag to read. /// @param maxSpinCount The maximum number of spin counts before asserting. Never assert if negative. /// @return The 8-byte data read. - __forceinline__ __device__ uint2 read(uint32_t flag, int64_t maxSpinCount = 100000000) { + __forceinline__ __device__ uint2 read(uint32_t flag, int64_t maxSpinCount = 100000000) const { uint2 data; POLL_MAYBE_JAILBREAK(readOnce(flag, data), maxSpinCount); return data; @@ -75,28 +75,31 @@ union LLPacket { }; #ifdef __CUDACC__ -__forceinline__ __device__ void putPackets(void* dst, uint64_t dstOffset, void* src, uint64_t srcOffset, - uint64_t srcBytes, uint32_t threadId, uint32_t numThreads, uint32_t flag) { +/// Read from the origin and write to the target buffer. 
+__forceinline__ __device__ void putPackets(void* targetPtr, uint64_t targetOffset, const void* originPtr, + uint64_t originOffset, uint64_t originBytes, uint32_t threadId, + uint32_t numThreads, uint32_t flag) { // Offsets should be aligned to 8 bytes & size should be a multiple of 8 bytes - uint32_t* srcBase = (uint32_t*)((char*)src + srcOffset); - LLPacket* dstBase = (LLPacket*)((char*)dst + dstOffset); - size_t nElem = srcBytes / sizeof(uint64_t); + const uint32_t* originBase = (const uint32_t*)((const char*)originPtr + originOffset); + LLPacket* targetBase = (LLPacket*)((char*)targetPtr + targetOffset); + size_t nElem = originBytes / sizeof(uint64_t); for (size_t i = threadId; i < nElem; i += numThreads) { - LLPacket* pkt = &dstBase[i]; - pkt->write(srcBase[2 * i], srcBase[2 * i + 1], flag); + LLPacket* pkt = &targetBase[i]; + pkt->write(originBase[2 * i], originBase[2 * i + 1], flag); } } -__forceinline__ __device__ void getPackets(void* dst, uint64_t dstOffset, void* src, uint64_t srcOffset, - uint64_t dstBytes, uint32_t threadId, uint32_t numThreads, uint32_t flag) { +/// Read from the target buffer and write to the origin. +__forceinline__ __device__ void getPackets(const void* targetPtr, uint64_t targetOffset, void* originPtr, + uint64_t originOffset, uint64_t originBytes, uint32_t threadId, + uint32_t numThreads, uint32_t flag) { // Offsets should be aligned to 8 bytes & size should be a multiple of 8 bytes - // TODO(saemal): this is not matching sm_channel get method. 
- LLPacket* srcBase = (LLPacket*)((char*)src + srcOffset); - uint2* dstBase = (uint2*)((char*)dst + dstOffset); - size_t nElem = dstBytes / sizeof(uint2); + const LLPacket* targetBase = (const LLPacket*)((const char*)targetPtr + targetOffset); + uint2* originBase = (uint2*)((char*)originPtr + originOffset); + size_t nElem = originBytes / sizeof(uint2); for (size_t i = threadId; i < nElem; i += numThreads) { - LLPacket* pkt = &srcBase[i]; - dstBase[i] = pkt->read(flag); + const LLPacket* pkt = &targetBase[i]; + originBase[i] = pkt->read(flag); } } #endif // __CUDACC__ diff --git a/include/mscclpp/sm_channel_device.hpp b/include/mscclpp/sm_channel_device.hpp index e6d947d4b..30c69631c 100644 --- a/include/mscclpp/sm_channel_device.hpp +++ b/include/mscclpp/sm_channel_device.hpp @@ -196,48 +196,50 @@ struct SmChannelDeviceHandle { } } - /// Copy data from the local memory to the remote memory. + /// Copy data from the local memory (origin) to the remote memory (target). /// /// This function is intended to be collectively called by multiple threads. Each thread copies a part of data. /// /// @tparam Alignment The alignment of the source and destination addresses. Should be 4, 8, or a multiple of 16. /// @tparam CopyRemainder Whether to copy remainder bytes when the number of bytes is not a multiple of @p /// Alignment. - /// @param dstOffset The offset in bytes of the remote address. Should be a multiple of @p Alignment. - /// @param srcOffset The offset in bytes of the local address. Should be a multiple of @p Alignment. - /// @param bytes Bytes of the data to be copied. Should be a multiple of @p Alignment. + /// @param targetOffset The offset in bytes of the remote address. Should be a multiple of @p Alignment. + /// @param originOffset The offset in bytes of the local address. Should be a multiple of @p Alignment. + /// @param originBytes Bytes of the origin to be copied. Should be a multiple of @p Alignment. 
/// @param threadId The index of the current thread among all threads running this function. This is different from /// the `threadIdx` in CUDA. /// @param numThreads The total number of threads that run this function. /// template - __forceinline__ __device__ void put(uint64_t dstOffset, uint64_t srcOffset, uint64_t bytes, uint32_t threadId, - uint32_t numThreads) { - copy((char*)dst_ + dstOffset, (char*)src_ + srcOffset, bytes, threadId, numThreads); + __forceinline__ __device__ void put(uint64_t targetOffset, uint64_t originOffset, uint64_t originBytes, + uint32_t threadId, uint32_t numThreads) { + copy((char*)dst_ + targetOffset, (char*)src_ + originOffset, originBytes, threadId, + numThreads); } - /// Copy data from the remote memory to the local memory. + /// Copy data from the remote memory (target) to the local memory (origin). /// /// This function is intended to be collectively called by multiple threads. Each thread copies a part of data. /// /// @tparam Alignment The alignment of the source and destination addresses. Should be 4, 8, or a multiple of 16. /// @tparam CopyRemainder Whether to copy remainder bytes when the number of bytes is not a multiple of @p /// Alignment. - /// @param dstOffset The offset in bytes of the remote address. Should be a multiple of @p Alignment. - /// @param srcOffset The offset in bytes of the local address. Should be a multiple of @p Alignment. - /// @param bytes Bytes of the data to be copied. Should be a multiple of @p Alignment. + /// @param targetOffset The offset in bytes of the remote address. Should be a multiple of @p Alignment. + /// @param originOffset The offset in bytes of the local address. Should be a multiple of @p Alignment. + /// @param originBytes Bytes of the origin to be copied. Should be a multiple of @p Alignment. /// @param threadId The index of the current thread among all threads running this function. This is different from /// the `threadIdx` in CUDA. 
/// @param numThreads The total number of threads that run this function. /// template - __forceinline__ __device__ void get(uint64_t dstOffset, uint64_t srcOffset, uint64_t bytes, uint32_t threadId, - uint32_t numThreads) { + __forceinline__ __device__ void get(uint64_t targetOffset, uint64_t originOffset, uint64_t originBytes, + uint32_t threadId, uint32_t numThreads) { // Note that `dst` and `src` are swapped for `get()`. - copy((char*)src_ + srcOffset, (char*)dst_ + dstOffset, bytes, threadId, numThreads); + copy((char*)src_ + originOffset, (char*)dst_ + targetOffset, originBytes, threadId, + numThreads); } - /// Copy data from the local memory to the remote memory. + /// Copy data from the local memory (origin) to the remote memory (target). /// /// This function is intended to be collectively called by multiple threads. Each thread copies a part of data. /// @@ -251,11 +253,11 @@ struct SmChannelDeviceHandle { /// @param numThreads The total number of threads that run this function. /// template - __forceinline__ __device__ void put(uint64_t offset, uint64_t size, uint32_t threadId, uint32_t numThreads) { - put(offset, offset, size, threadId, numThreads); + __forceinline__ __device__ void put(uint64_t offset, uint64_t bytes, uint32_t threadId, uint32_t numThreads) { + put(offset, offset, bytes, threadId, numThreads); } - /// Copy data from the remote memory to the local memory. + /// Copy data from the remote memory (target) to the local memory (origin). /// /// This function is intended to be collectively called by multiple threads. Each thread copies a part of data. /// @@ -269,40 +271,41 @@ struct SmChannelDeviceHandle { /// @param numThreads The total number of threads that run this function. 
/// template - __forceinline__ __device__ void get(uint64_t offset, uint64_t size, uint32_t threadId, uint32_t numThreads) { - get(offset, offset, size, threadId, numThreads); + __forceinline__ __device__ void get(uint64_t offset, uint64_t bytes, uint32_t threadId, uint32_t numThreads) { + get(offset, offset, bytes, threadId, numThreads); } - /// Construct @ref LLPacket from the data in the local memory and write it on the remote memory. + /// Construct @ref LLPacket from the data in the local memory (origin) and write it on the remote packet buffer + /// (target). /// /// This function is intended to be collectively called by multiple threads. Each thread copies a part of packets. /// - /// @param dstOffset The offset in bytes of the remote address. - /// @param srcOffset The offset in bytes of the local address. - /// @param bytes Bytes of the data to be copied. + /// @param targetOffset The offset in bytes of the remote packet buffer. + /// @param originOffset The offset in bytes of the local data. + /// @param originBytes Bytes of the origin to be copied. /// @param threadId The index of the current thread among all threads running this function. This is different from /// the `threadIdx` in CUDA. /// @param numThreads The total number of threads that run this function. /// - __forceinline__ __device__ void putPackets(uint64_t dstOffset, uint64_t srcOffset, uint64_t bytes, uint32_t threadId, - uint32_t numThreads, uint32_t flag) { - mscclpp::putPackets(dst_, dstOffset, src_, srcOffset, bytes, threadId, numThreads, flag); + __forceinline__ __device__ void putPackets(uint64_t targetOffset, uint64_t originOffset, uint64_t originBytes, + uint32_t threadId, uint32_t numThreads, uint32_t flag) { + mscclpp::putPackets(dst_, targetOffset, src_, originOffset, originBytes, threadId, numThreads, flag); } - /// Retrieve data from @ref LLPacket in the local packet buffer and write it on the local memory. 
+ /// Retrieve data from @ref LLPacket in the local packet buffer (target) and write it on the local data (origin). /// /// This function is intended to be collectively called by multiple threads. Each thread copies a part of data. /// - /// @param dstOffset The offset in bytes of the local memory. - /// @param srcOffset The offset in bytes of the local packet buffer. - /// @param bytes Bytes of the data to be copied. + /// @param targetOffset The offset in bytes of the local packet buffer. + /// @param originOffset The offset in bytes of the local data. + /// @param originBytes Bytes of the origin to be copied. /// @param threadId The index of the current thread among all threads running this function. This is different from /// the `threadIdx` in CUDA. /// @param numThreads The total number of threads that run this function. /// - __forceinline__ __device__ void getPackets(uint64_t dstOffset, uint64_t srcOffset, uint64_t bytes, uint32_t threadId, - uint32_t numThreads, uint32_t flag) { - mscclpp::getPackets(src_, dstOffset, getPacketBuffer_, srcOffset, bytes, threadId, numThreads, flag); + __forceinline__ __device__ void getPackets(uint64_t targetOffset, uint64_t originOffset, uint64_t originBytes, + uint32_t threadId, uint32_t numThreads, uint32_t flag) { + mscclpp::getPackets(getPacketBuffer_, targetOffset, src_, originOffset, originBytes, threadId, numThreads, flag); } /// Signal the remote semaphore. 
diff --git a/python/test/simple_proxy_channel_test.cu b/python/test/simple_proxy_channel_test.cu index 3a9c5566f..b8cd51d2d 100644 --- a/python/test/simple_proxy_channel_test.cu +++ b/python/test/simple_proxy_channel_test.cu @@ -26,7 +26,7 @@ extern "C" __global__ void __launch_bounds__(1024, 1) channels[tid].put(2 * my_offset, 2 * my_offset, 2 * size_per_rank); } if (my_nghr != my_rank && my_nghr < nranks) - mscclpp::getPackets(data, my_nghr_offset, scratch, 2 * my_nghr_offset, size_per_rank, tid % nthreads_per_rank, + mscclpp::getPackets(scratch, 2 * my_nghr_offset, data, my_nghr_offset, size_per_rank, tid % nthreads_per_rank, nthreads_per_rank, flag); } else { if (tid < nranks && tid != my_rank) { diff --git a/python/test/sm_channel_test.cu b/python/test/sm_channel_test.cu index 4b1ad8e07..a97217522 100644 --- a/python/test/sm_channel_test.cu +++ b/python/test/sm_channel_test.cu @@ -17,7 +17,7 @@ extern "C" __global__ void __launch_bounds__(1024, 1) if (bid < nranks && bid != my_rank) { if (use_packet) { channels[bid].putPackets(2 * my_offset, my_offset, size_per_rank, tid, blockDim.x, flag); - channels[bid].getPackets(my_nghr_offset, 2 * my_nghr_offset, size_per_rank, tid, blockDim.x, flag); + channels[bid].getPackets(2 * my_nghr_offset, my_nghr_offset, size_per_rank, tid, blockDim.x, flag); } else { channels[bid].put(my_offset, my_offset, size_per_rank, tid, blockDim.x); __syncthreads(); diff --git a/test/mp_unit/proxy_channel_tests.cu b/test/mp_unit/proxy_channel_tests.cu index 102b96c64..ae0ea4c68 100644 --- a/test/mp_unit/proxy_channel_tests.cu +++ b/test/mp_unit/proxy_channel_tests.cu @@ -238,7 +238,7 @@ __global__ void kernelProxyLLPingPong(int* buff, mscclpp::LLPacket* putPktBuf, m flusher = 0; } } else { - mscclpp::getPackets(buff, 0, getPktBuf, 0, nElem * sizeof(int), threadId, numThreads, flag); + mscclpp::getPackets(getPktBuf, 0, buff, 0, nElem * sizeof(int), threadId, numThreads, flag); if (CheckCorrectness) { // If each thread reads 8 bytes at 
once, we don't need a barrier after getPackets(). // __syncthreads();