Skip to content

Commit

Permalink
Collective Builder API for PVC (codeplaysoftware#122)
Browse files Browse the repository at this point in the history
* initial changes for pvc builder API support

* fix a few compilation issues

* fix remaining  compilation errors

* change from Unpredicated to 3 stage pipeline, bug fix when defining Stride B in gemm collective builder

* change KernelSingleStage to KernelPVC

* Remove question about the C and D type to be the same

Co-authored-by: Mehdi Goli <[email protected]>

* remove the #ifndef check

* removed comments about alignment, check for nvidia target, and remove header included in packed_stride.hpp

* change name from pvc_mma_builder to pvc_mma_builder

* Remove comment about large margin error as it does not hold true anymore

Co-authored-by: Muhammad Tanvir <[email protected]>

* Shorten comment

Co-authored-by: Mehdi Goli <[email protected]>

* Remove comment about cluster in PVC

Co-authored-by: Mehdi Goli <[email protected]>

* Update include/cutlass/gemm/collective/builders/pvc_mma_builder.inl

Co-authored-by: Mehdi Goli <[email protected]>

* Update include/cutlass/gemm/collective/builders/pvc_mma_builder.inl

Co-authored-by: Mehdi Goli <[email protected]>

* change PVC to Intel

Co-authored-by: Mehdi Goli <[email protected]>

* remove other mentions of PVC and change them to Intel

* Update include/cutlass/gemm/collective/builders/pvc_mma_builder.inl

Co-authored-by: Mehdi Goli <[email protected]>

* Update include/cutlass/gemm/collective/builders/pvc_mma_builder.inl

Co-authored-by: Mehdi Goli <[email protected]>

* Update include/cutlass/epilogue/collective/builders/pvc_builder.inl

Co-authored-by: Mehdi Goli <[email protected]>

* remove extra include from sm90_builder.inl

* remove extra underscore

---------

Co-authored-by: Mehdi Goli <[email protected]>
Co-authored-by: Muhammad Tanvir <[email protected]>
  • Loading branch information
3 people authored Aug 18, 2024
1 parent c9c7e78 commit 64acac8
Show file tree
Hide file tree
Showing 8 changed files with 657 additions and 4 deletions.
5 changes: 5 additions & 0 deletions examples/sycl/pvc/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -36,3 +36,8 @@ cutlass_example_add_executable(
pvc_gemm_with_epilogue_relu
pvc_gemm_with_epilogue_relu.cpp
)

cutlass_example_add_executable(
pvc_collective_builder
pvc_collective_builder.cpp
)
390 changes: 390 additions & 0 deletions examples/sycl/pvc/pvc_collective_builder.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,390 @@
/***************************************************************************************************
* Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/


#include "cutlass/gemm/device/gemm_universal.h"
#include "cutlass/gemm/device/gemm_universal_adapter.h"

#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "cutlass/kernel_hardware_info.h"

#include "cutlass/util/command_line.h"
#include "cutlass/util/device_memory.h"
#include "cutlass/util/packed_stride.hpp"
#include "cutlass/util/reference/device/gemm_complex.h"
#include "cutlass/util/reference/device/tensor_compare.h"
#include "cutlass/util/GPU_Clock.hpp"

#include "cutlass/util/reference/device/tensor_relu.h"
#include "cutlass/tensor_view.h"
#include "cutlass/coord.h"

#include <cute/tensor.hpp>
#include <random>



template <typename T>
static void fill_matrix(std::vector<T> &vector)
{
std::generate(std::begin(vector), std::end(vector), [&] {
return static_cast<T>( (rand() / double(RAND_MAX)) );
});
}

using namespace cute;

///////////////////////////////////////////////////////////////////////////////////////////////////

// Command line options parsing
struct Options {

bool help;
bool error;

int m, n, k, l, iterations;
float alpha, beta;

Options():
help(false),
error(false),
m(4096), n(4096), k(4096), l(1), iterations(100),
alpha(1.f), beta(0.f)
{ }

// Parses the command line
void parse(int argc, char const **args) {
cutlass::CommandLine cmd(argc, args);

if (cmd.check_cmd_line_flag("help")) {
help = true;
return;
}

cmd.get_cmd_line_argument("m", m, 4096);
cmd.get_cmd_line_argument("n", n, 4096);
cmd.get_cmd_line_argument("k", k, 4096);
cmd.get_cmd_line_argument("l", l, 1);
cmd.get_cmd_line_argument("alpha", alpha, 1.f);
cmd.get_cmd_line_argument("beta", beta, 0.f);
cmd.get_cmd_line_argument("iterations", iterations, 100);
}

/// Prints the usage statement.
std::ostream & print_usage(std::ostream &out) const {

out << "PVC GEMM Example\n\n"
<< "Options:\n\n"
<< " --help If specified, displays this usage statement\n\n"
<< " --m=<int> Sets the M extent of the GEMM\n"
<< " --n=<int> Sets the N extent of the GEMM\n"
<< " --k=<int> Sets the K extent of the GEMM\n"
<< " --l=<int> Sets the L extent (batch count) of the GEMM\n"
<< " --alpha=<s32> Epilogue scalar alpha\n"
<< " --beta=<s32> Epilogue scalar beta\n\n"
<< " --iterations=<int> Iterations\n\n";

return out;
}
};

///////////////////////////////////////////////////////////////////////////////////////////////////

template <
class Gemm
>
struct ExampleRunner {

using StrideA = typename Gemm::GemmKernel::StrideA;
using StrideB = typename Gemm::GemmKernel::StrideB;
using StrideC = typename Gemm::GemmKernel::StrideC;
using StrideD = typename Gemm::GemmKernel::StrideD;

using LayoutA = typename Gemm::LayoutA;
using LayoutB = typename Gemm::LayoutB;
using LayoutC = typename Gemm::LayoutC;
using LayoutD = typename Gemm::LayoutD;

using ElementA = typename Gemm::ElementA;
using ElementB = typename Gemm::ElementB;
using ElementAcc = typename Gemm::ElementAccumulator;

using CollectiveEpilogue = typename Gemm::CollectiveEpilogue;
using ElementC = typename Gemm::ElementC;
using ElementOutput = typename CollectiveEpilogue::ElementOutput;
using ElementCompute = typename CollectiveEpilogue::ElementCompute;
using ElementAccumulator = typename CollectiveEpilogue::ElementAccumulator;

using ProblemShapeType = typename Gemm::GemmKernel::ProblemShape;

//
// Data members
//

/// Initialization
StrideA stride_A;
StrideB stride_B;
StrideC stride_C;
StrideD stride_D;

cutlass::DeviceAllocation<ElementA> block_A;
cutlass::DeviceAllocation<ElementB> block_B;
cutlass::DeviceAllocation<ElementC> block_C;
cutlass::DeviceAllocation<ElementOutput> block_D;
cutlass::DeviceAllocation<ElementOutput> block_ref_D;

//
// Methods
//

bool verify(const ProblemShapeType& problem_size, ElementCompute alpha, ElementCompute beta) {
auto [M, N, K, L] = problem_size;

cutlass::TensorRef ref_A(block_A.get(), LayoutA::packed({M, K}));
cutlass::TensorRef ref_B(block_B.get(), LayoutB::packed({K, N}));
cutlass::TensorRef ref_C(block_C.get(), LayoutC::packed({M, N}));
cutlass::TensorRef ref_D(block_ref_D.get(), LayoutD::packed({M, N}));

cutlass::reference::device::GemmComplex(
{M, N, K},
alpha,
ref_A,
cutlass::ComplexTransform::kNone,
ref_B,
cutlass::ComplexTransform::kNone,
beta,
ref_C,
ref_D,
ElementAccumulator(0),
L, // batch_count
M * K, // batch_stride_A
K * N, // batch_stride_B
M * N, // batch_stride_C
M * N // batch_stride_D
);

syclcompat::wait();

using TensorView = cutlass::TensorView<ElementOutput, LayoutD>;
cutlass::reference::device::TensorReLu(TensorView(block_ref_D.get(), LayoutD::packed({M, N}),
cutlass::make_Coord(M, N)));

syclcompat::wait();

// Check if output from CUTLASS kernel and reference kernel are relatively equal or not
auto epsilon = static_cast<ElementOutput>(0.1f);
auto nonzero_floor = static_cast<ElementOutput>(0.1f);

bool passed = cutlass::reference::device::BlockCompareRelativelyEqual(
block_ref_D.get(), block_D.get(), block_D.size(),
epsilon, nonzero_floor);

return passed;
}

/// Initialize operands to be used in the GEMM and reference GEMM
void initialize(const ProblemShapeType& problem_size) {
auto problem_shape_MNKL = cute::append<4>(problem_size, 1);
auto [M, N, K, L] = problem_shape_MNKL;

stride_A = cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(M, K, L));
stride_B = cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(N, K, L));
stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(M, N, L));
stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(M, N, L));

block_A.reset(M * K * L);
block_B.reset(K * N * L);
block_C.reset(M * N * L);
block_D.reset(M * N * L);
block_ref_D.reset(M * N * L);

// TODO: Enable initialization on device directly once RNG is
// available through SYCL.
std::vector<ElementA> a(K * M * L);
std::vector<ElementB> b(K * N * L);
std::vector<ElementC> c(M * N * L);
std::vector<ElementC> d(M * N * L, ElementC{0});

fill_matrix(a);
fill_matrix(b);
fill_matrix(c);

syclcompat::memcpy(block_A.get(), a.data(), a.size() * sizeof(ElementA));
syclcompat::memcpy(block_B.get(), b.data(), b.size() * sizeof(ElementB));
syclcompat::memcpy(block_C.get(), c.data(), c.size() * sizeof(ElementC));
syclcompat::memcpy(block_D.get(), d.data(), d.size() * sizeof(ElementC));
}

void run(const Options& options, const cutlass::KernelHardwareInfo& hw_info) {
ProblemShapeType problem_size = ProblemShapeType{options.m, options.n, options.k, options.l};

initialize(problem_size);

typename Gemm::GemmKernel::Arguments arguments{
cutlass::gemm::GemmUniversalMode::kGemm,
problem_size,
{block_A.get(), stride_A, block_B.get(), stride_B},
{{options.alpha, options.beta}, block_C.get(), stride_C, block_D.get(), stride_D},
hw_info
};

Gemm gemm_op;

size_t workspace_size = Gemm::get_workspace_size(arguments);
cutlass::device_memory::allocation<uint8_t> workspace(workspace_size);

gemm_op.can_implement(arguments);

gemm_op.initialize(arguments, workspace.get());

// Run the GEMM
gemm_op.run();

syclcompat::wait();

// Verify that the result is correct
bool passed = verify(problem_size, options.alpha, options.beta);
std::cout << "Disposition: " << (passed ? "Passed" : "Failed") << std::endl;

if (passed && options.iterations > 0) {
GPU_Clock timer;
timer.start();
for (int i = 0; i < options.iterations; ++i) {
gemm_op.run();
}
syclcompat::wait();

float cute_time = timer.seconds() / options.iterations;
double tflops = (2.0 * options.m * options.n * options.k * options.l) * 1e-12;
std::cout << "Problem Size: " << options.m << 'x' << options.n << 'x' << options.k << 'x' << options.l << std::endl;
printf("Cutlass GEMM Performance: [%4.3f]TFlop/s (%6.4f)ms\n", tflops / cute_time, cute_time*1000);
}

return;
}

};

int main(int argc, const char** argv)
{
//
// Parse options
//

Options options;

options.parse(argc, argv);

if (options.help) {
options.print_usage(std::cout) << std::endl;
return 0;
}

if (options.error) {
std::cerr << "Aborting execution." << std::endl;
return -1;
}

//
// Run examples
//

// The KernelHardwareInfo struct holds the number of EUs on the GPU with a given device ID. This
// information is used by the underlying kernel.
cutlass::KernelHardwareInfo hw_info;

// Change device_id to another value if you are running on a machine with multiple GPUs and wish
// to use a GPU other than that with device ID 0.
hw_info.sm_count = cutlass::KernelHardwareInfo::query_device_multiprocessor_count(hw_info.device_id);

bool passed;

// The code section below describes datatype for input, output matrices and computation between
// elements in input matrices.
using ElementAccumulator = float; // <- data type of accumulator
using ElementComputeEpilogue = float; // <- data type of epilogue operations
using ElementInputA = bfloat16_t; // <- data type of elements in input matrix A
using ElementInputB = bfloat16_t; // <- data type of elements in input matrix B
using ElementOutput = float; // <- data type of elements in output matrix D

constexpr int AlignmentA = sizeof(ElementInputA);
constexpr int AlignmentB = sizeof(ElementInputB);
constexpr int AlignmentC = sizeof(ElementAccumulator);
constexpr int AlignmentD = sizeof(ElementOutput);

using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::RowMajor;
using LayoutC = cutlass::layout::RowMajor;
using LayoutD = cutlass::layout::RowMajor;

// Workgroup-level tile
using TileShape = Shape<_256, _256, _32>;

using CollectiveMainloop = cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::IntelPVC, cutlass::arch::OpClassTensorOp,
ElementInputA, LayoutA, AlignmentA,
ElementInputB, LayoutB, AlignmentB,
ElementAccumulator,
TileShape, Shape<_1, _1, _1>,
cutlass::gemm::collective::StageCountAuto,
cutlass::gemm::collective::KernelScheduleAuto
>::CollectiveOp;

using EpilogueOp = cutlass::epilogue::fusion::LinCombEltAct<cutlass::epilogue::thread::ReLu,
ElementOutput, ElementComputeEpilogue, ElementAccumulator,
ElementAccumulator, cutlass::FloatRoundStyle::round_to_nearest>;

using CollectiveEpilogue = cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::IntelPVC, cutlass::arch::OpClassTensorOp,
TileShape, Shape<_1, _1, _1>,
cutlass::epilogue::collective::EpilogueTileAuto, ElementComputeEpilogue,
ElementAccumulator,
ElementAccumulator, LayoutC, AlignmentC,
ElementOutput, LayoutD, AlignmentD,
cutlass::epilogue::collective::EpilogueScheduleAuto,
EpilogueOp
>::CollectiveOp;

using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
Shape<int, int, int, int>,
CollectiveMainloop,
CollectiveEpilogue
>;

using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;

ExampleRunner<Gemm> runner;

runner.run(options, hw_info);

return 0;
}
Loading

0 comments on commit 64acac8

Please sign in to comment.