From a96ff4930c3e92cb0c54f33019718d7a59afdf3f Mon Sep 17 00:00:00 2001 From: Tarun Prabhu Date: Wed, 15 May 2024 17:36:26 -0600 Subject: [PATCH] Run clang-format on Kitsune-specific files. Remove trailing whitespace and excess newlines from elsewhere. --- clang/lib/Driver/ToolChain.cpp | 2 +- kitsune/CMakeLists.txt | 3 +- kitsune/runtime/CMakeLists.txt | 13 ++-- llvm/lib/Transforms/Tapir/CudaABI.cpp | 22 +++---- llvm/lib/Transforms/Tapir/HipABI.cpp | 95 +++++++++++++-------------- 5 files changed, 65 insertions(+), 70 deletions(-) diff --git a/clang/lib/Driver/ToolChain.cpp b/clang/lib/Driver/ToolChain.cpp index 71d82cd8fdc821..35ba05b2400452 100644 --- a/clang/lib/Driver/ToolChain.cpp +++ b/clang/lib/Driver/ToolChain.cpp @@ -1716,7 +1716,7 @@ void ToolChain::AddKitsuneLinkerArgs(const ArgList &Args, case llvm::TapirTargetID::Hip: CmdArgs.push_back( Args.MakeArgString(StringRef("-L") + KITSUNE_HIP_LIBRARY_DIR)); - ExtractArgsFromString("-lamdhip64", CmdArgs, Args); + ExtractArgsFromString("-lamdhip64", CmdArgs, Args); ExtractArgsFromString(KITSUNE_HIP_EXTRA_LINKER_FLAGS, CmdArgs, Args); break; diff --git a/kitsune/CMakeLists.txt b/kitsune/CMakeLists.txt index b6f150959553fa..cf707c22e00874 100644 --- a/kitsune/CMakeLists.txt +++ b/kitsune/CMakeLists.txt @@ -274,7 +274,7 @@ if (KITSUNE_HIP_ENABLE) if (DEFINED $ENV{ROCM_PATH}) message(STATUS " using ROCM_PATH environment variable.") set(ROCM_PATH $ENV{ROCM_PATH} CACHE PATH "AMD ROCM/HIP installation directory.") - else() + else() message(STATUS " selecting a common default install location.") set(ROCM_PATH "/opt/rocm" CACHE PATH "AMD ROCM/HIP installation directory.") endif() @@ -351,7 +351,6 @@ if (KITSUNE_OPENCILK_ENABLE) CACHE STRING "Additional C++ compiler flags needed to build Kokkos. These are only used when building Kitsune") - # We pass some LLVM_* variables to make Cheetah behave as if it were an # in-tree build. 
set(KITSUNE_CHEETAH_SOURCE_DIR ${KITSUNE_TARGETS_BINARY_DIR}/cheetah/cheetah) diff --git a/kitsune/runtime/CMakeLists.txt b/kitsune/runtime/CMakeLists.txt index 14d6b0e0fb1e10..1662c3a11734a0 100644 --- a/kitsune/runtime/CMakeLists.txt +++ b/kitsune/runtime/CMakeLists.txt @@ -72,7 +72,6 @@ target_include_directories(${KITRT} # set_property(TARGET ${KITRT} APPEND PROPERTY # BUILD_RPATH "$ORIGIN/../lib${LLVM_LIBDIR_SUFFIX}:${PROJECT_BINARY_DIR}/lib${LLVM_LIBDIR_SUFFIX}") - find_library(LIB_LLVM NAMES LLVM LLVM-${LLVM_VERSION_MAJOR} @@ -140,13 +139,13 @@ endif() # kitsune/CMakeLists.txt). They are passed in via CMake because the Kitsune # runtime is treated as an "ExternalProject". # -# Note on AMD HIP CMake details... When is HIP not HIP? When it is HIP... +# Note on AMD HIP CMake details... When is HIP not HIP? When it is HIP... # It is easy to confuse requirements for HIP as the HIP "language" vs. the # HIP runtime. The language has tangled dependencies within a LLVM context -# as they build around LLVM and Clang too. Using some of the 'hip::host' +# as they build around LLVM and Clang too. Using some of the 'hip::host' # targets are actually intended for the language and not runtime libraries. -# Unfortunately, these details seem to be a moving target with ecah release -# of rocm/hip... +# Unfortunately, these details seem to be a moving target with each release +# of rocm/hip... 
if (KITSUNE_HIP_ENABLE) if (NOT DEFINED ROCM_PATH) @@ -154,7 +153,7 @@ if (KITSUNE_HIP_ENABLE) if (DEFINED $ENV{ROCM_PATH}) message(STATUS " using ROCM_PATH environment variable.") set(ROCM_PATH $ENV{ROCM_PATH} CACHE PATH "AMD ROCM/HIP installation directory.") - else() + else() message(STATUS " selecting a common default install location.") set(ROCM_PATH "/opt/rocm" CACHE PATH "AMD ROCM/HIP installation directory.") endif() @@ -177,7 +176,7 @@ if (KITSUNE_HIP_ENABLE) message(STATUS "hip include dir: ${hip_INCLUDE_DIR}") message(STATUS "hip library dir: ${hip_LIB_INSTALL_DIR}") - + target_compile_definitions(${KITRT} PUBLIC KITRT_HIP_ENABLED) target_compile_definitions(${KITRT} PUBLIC __HIP_PLATFORM_AMD__) target_include_directories(${KITRT} PUBLIC ${hip_INCLUDE_DIR}) diff --git a/llvm/lib/Transforms/Tapir/CudaABI.cpp b/llvm/lib/Transforms/Tapir/CudaABI.cpp index a04e986de3aedf..7405da84212872 100644 --- a/llvm/lib/Transforms/Tapir/CudaABI.cpp +++ b/llvm/lib/Transforms/Tapir/CudaABI.cpp @@ -414,7 +414,7 @@ CudaLoop::CudaLoop(Module &M, Module &KernelModule, const std::string &KN, Int32Ty, // threads-per-block KernelInstMixTy->getPointerTo(), // instruction mix info VoidPtrTy); // opaque cuda stream - + KitCudaMemPrefetchFn = M.getOrInsertFunction("__kitcuda_mem_gpu_prefetch", VoidPtrTy, // return an opaque stream @@ -986,14 +986,14 @@ void CudaLoop::processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI, // supported) we can end up with multiple calls to the outlined // loop (which has been setup for dead code elimination) but can // cause invalid IR that trips us up when handling the GPU module - // code generation. This is a challenge in the Tapir design that - // was not geared to handle some of the nuances of GPU target - // transformations (and code gen). To address this, we need to + // code generation. This is a challenge in the Tapir design that + // was not geared to handle some of the nuances of GPU target + // transformations (and code gen). 
To address this, we need to // do some clean up to keep the IR correct (or the verifier will - // fail on us...). Specifically, we can no longer depend upon + // fail on us...). Specifically, we can no longer depend upon // DCE as it runs too late in the GPU transformation process... // - // TODO: This code can be shared between the cuda and hip targets... + // TODO: This code can be shared between the cuda and hip targets... // Function *TargetKF = KernelModule.getFunction(KernelName); std::list RemoveList; @@ -1029,7 +1029,7 @@ void CudaLoop::processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI, IRBuilder<> NewBuilder(&NewBB->front()); // TODO: There is some potential here to share this code across both - // the hip and cuda transforms... + // the hip and cuda transforms... LLVM_DEBUG(dbgs() << "\t*- code gen packing of " << OrderedInputs.size() << " kernel args.\n"); PointerType *VoidPtrTy = PointerType::getUnqual(Ctx); @@ -1049,8 +1049,8 @@ void CudaLoop::processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI, i++; if (CodeGenPrefetch && V->getType()->isPointerTy()) { - LLVM_DEBUG(dbgs() << "\t\t- code gen prefetch for kernel arg #" - << i << "\n"); + LLVM_DEBUG(dbgs() << "\t\t- code gen prefetch for kernel arg #" << i + << "\n"); Value *VoidPP = NewBuilder.CreateBitCast(V, VoidPtrTy); Value *SPtr = NewBuilder.CreateLoad(VoidPtrTy, CudaStream); Value *NewSPtr = @@ -2005,7 +2005,7 @@ CudaABIOutputFile CudaABI::generatePTX() { FunctionAnalysisManager fam; CGSCCAnalysisManager cgam; ModuleAnalysisManager mam; - + PassBuilder pb(PTXTargetMachine, pto); pb.registerModuleAnalyses(mam); pb.registerCGSCCAnalyses(cgam); @@ -2013,7 +2013,7 @@ CudaABIOutputFile CudaABI::generatePTX() { pb.registerLoopAnalyses(lam); PTXTargetMachine->registerPassBuilderCallbacks(pb, false); pb.crossRegisterProxies(lam, fam, cgam, mam); - + ModulePassManager mpm = pb.buildPerModuleDefaultPipeline(optLevel); mpm.addPass(VerifierPass()); LLVM_DEBUG(dbgs() << "\t\t* module: 
" << KernelModule.getName() << "\n"); diff --git a/llvm/lib/Transforms/Tapir/HipABI.cpp b/llvm/lib/Transforms/Tapir/HipABI.cpp index 3084124822d3bc..0b02db079d9d9c 100644 --- a/llvm/lib/Transforms/Tapir/HipABI.cpp +++ b/llvm/lib/Transforms/Tapir/HipABI.cpp @@ -113,11 +113,10 @@ #include "llvm/Transforms/Tapir/Outline.h" #include "llvm/Transforms/Tapir/TapirGPUUtils.h" #include "llvm/Transforms/Tapir/TapirLoopInfo.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Utils/TapirUtils.h" #include "llvm/Transforms/Utils/AMDGPUEmitPrintf.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Mem2Reg.h" +#include "llvm/Transforms/Utils/TapirUtils.h" using namespace llvm; @@ -249,8 +248,7 @@ cl::opt ROCmABITarget( "hipabi-rocm-abi", cl::init(ROCm_ABI_V4), cl::Hidden, cl::desc("Select the targeted ROCm ABI version."), cl::values(clEnumValN(ROCm_ABI_V4, "v4", "Target ROCm version 4 ABI."), - clEnumValN(ROCm_ABI_V5, "v5", - "Target ROCm v. 5 ABI."))); + clEnumValN(ROCm_ABI_V5, "v5", "Target ROCm v. 5 ABI."))); cl::opt Use64ElementWavefront( "hipabi-wavefront64", cl::init(true), cl::Hidden, @@ -623,7 +621,8 @@ void HipABI::transformConstants(Function *Fn) { LLVM_DEBUG(dbgs() << "\t\t\t\tnew store: " << *SI << "\n"); } else if (auto *Call = dyn_cast(U->getUser())) { unsigned argNo = Call->getArgOperandNo(U); - LLVM_DEBUG(dbgs() << "\t\tpatching callable instruction: " << *Call << "\n"); + LLVM_DEBUG(dbgs() << "\t\tpatching callable instruction: " << *Call + << "\n"); // FIXME: This is not correct! The function operand should be // checked to see what address space it expects. 
Instruction *asCast = @@ -631,7 +630,8 @@ Call->setArgOperand(argNo, asCast); LLVM_DEBUG(dbgs() << "\t\t\t\tnew call: " << *Call << "\n"); } else if (auto *GEP = dyn_cast(U->getUser())) { - LLVM_DEBUG(dbgs() << "\t\tpatching gep instruction:\n\t\t\t" << *GEP << "\n"); + LLVM_DEBUG(dbgs() << "\t\tpatching gep instruction:\n\t\t\t" << *GEP + << "\n"); Instruction *asCast = new AddrSpaceCastInst(NewGEP, OldGEP->getType(), "", GEP); GEP->setOperand(GEP->getPointerOperandIndex(), asCast); @@ -830,24 +830,25 @@ HipLoop::HipLoop(Module &M, Module &KModule, const std::string &Name, Int64Ty, // number of floating point ops. Int64Ty); // number of integer ops. - KitHipLaunchFn = M.getOrInsertFunction("__kithip_launch_kernel", - VoidPtrTy, // return an opaque stream - VoidPtrTy, // fat-binary - VoidPtrTy, // kernel name - VoidPtrTy, // arguments - Int64Ty, // trip count - Int32Ty, // threads-per-block + KitHipLaunchFn = M.getOrInsertFunction( + "__kithip_launch_kernel", + VoidPtrTy, // return an opaque stream + VoidPtrTy, // fat-binary + VoidPtrTy, // kernel name + VoidPtrTy, // arguments + Int64Ty, // trip count + Int32Ty, // threads-per-block KernelInstMixTy->getPointerTo(), // instruction mix info - VoidPtrTy); // opaque cuda stream + VoidPtrTy); // opaque hip stream - KitHipMemPrefetchFn = M.getOrInsertFunction("__kithip_mem_gpu_prefetch", - VoidPtrTy, // return an opaque stream - VoidPtrTy, // pointer to prefetch - VoidPtrTy); // use opaque stream. + KitHipMemPrefetchFn = + M.getOrInsertFunction("__kithip_mem_gpu_prefetch", + VoidPtrTy, // return an opaque stream + VoidPtrTy, // pointer to prefetch + VoidPtrTy); // use opaque stream. } -HipLoop::~HipLoop() { /* no-op */ -} +HipLoop::~HipLoop() { /* no-op */ } // TODO: Can we also transform the arguments into a different address space here // and avoid our use of 'mutate' elsewhere in the code? 
@@ -1315,14 +1316,14 @@ void HipLoop::processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI, // supported) we can end up with multiple calls to the outlined // loop (which has been setup for dead code elimination) but can // cause invalid IR that trips us up when handling the GPU module - // code generation. This is a challenge in the Tapir design that - // was not geared to handle some of the nuances of GPU target - // transformations (and code gen). To address this, we need to + // code generation. This is a challenge in the Tapir design that + // was not geared to handle some of the nuances of GPU target + // transformations (and code gen). To address this, we need to // do some clean up to keep the IR correct (or the verifier will - // fail on us...). Specifically, we can no longer depend upon + // fail on us...). Specifically, we can no longer depend upon // DCE as it runs too late in the GPU transformation process... // - // TODO: This code can be shared between the cuda and hip targets... + // TODO: This code can be shared between the cuda and hip targets... // Function *TargetKF = KernelModule.getFunction(KernelName); std::list RemoveList; @@ -1357,7 +1358,7 @@ void HipLoop::processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI, IRBuilder<> NewBuilder(&NewBB->front()); // TODO: There is some potential here to share this code across both - // the hip and cuda transforms... + // the hip and cuda transforms... 
LLVM_DEBUG(dbgs() << "\t*- code gen packing of " << OrderedInputs.size() << " kernel args.\n"); PointerType *VoidPtrTy = PointerType::getUnqual(Ctx); @@ -1377,12 +1378,12 @@ void HipLoop::processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI, i++; if (CodeGenPrefetch && V->getType()->isPointerTy()) { - LLVM_DEBUG(dbgs() << "\t\t- code gen prefetch for kernel arg #" - << i << "\n"); + LLVM_DEBUG(dbgs() << "\t\t- code gen prefetch for kernel arg #" << i + << "\n"); Value *VoidPP = NewBuilder.CreateBitCast(V, VoidPtrTy); Value *SPtr = NewBuilder.CreateLoad(VoidPtrTy, HipStream); - Value *NewSPtr = - NewBuilder.CreateCall(KitHipMemPrefetchFn, {VoidPP, SPtr}); + Value *NewSPtr = + NewBuilder.CreateCall(KitHipMemPrefetchFn, {VoidPP, SPtr}); if (not StreamAssigned) { NewBuilder.CreateStore(NewSPtr, HipStream); StreamAssigned = true; @@ -1463,13 +1464,13 @@ void HipLoop::processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI, ConstantInt::get(Int64Ty, InstMix.num_iops)); AllocaInst *AI = NewBuilder.CreateAlloca(KernelInstMixTy); - NewBuilder.CreateStore(InstructionMix, AI); + NewBuilder.CreateStore(InstructionMix, AI); LLVM_DEBUG(dbgs() << "\t*- code gen kernel launch...\n"); Value *KSPtr = NewBuilder.CreateLoad(VoidPtrTy, HipStream); - CallInst *LaunchStream = NewBuilder.CreateCall(KitHipLaunchFn, - {DummyFBPtr, KNameParam, argsPtr, CastTripCount, - TPBlockValue, AI, KSPtr}); + CallInst *LaunchStream = NewBuilder.CreateCall( + KitHipLaunchFn, {DummyFBPtr, KNameParam, argsPtr, CastTripCount, + TPBlockValue, AI, KSPtr}); NewBuilder.CreateStore(LaunchStream, HipStream); LLVM_DEBUG(dbgs() << "\t\t+- registering launch stream:\n" << "\t\t\tcall: " << *LaunchStream << "\n" @@ -1514,8 +1515,7 @@ HipABI::HipABI(Module &InputModule) Int32Ty, // host pointer Int64Ty, // device pointer Int64Ty); // number of bytes to copy - KitHipSyncFn = M.getOrInsertFunction("__kithip_sync_thread_stream", - VoidTy, + KitHipSyncFn = 
M.getOrInsertFunction("__kithip_sync_thread_stream", VoidTy, VoidPtrTy); // no return, nor parameters // Build the details we need for the AMDGPU/HIP target. std::string ArchString = "amdgcn"; @@ -1575,8 +1575,7 @@ HipABI::HipABI(Module &InputModule) ROCmModulesLoaded = false; } -HipABI::~HipABI() { /* no-op */ -} +HipABI::~HipABI() { /* no-op */ } std::unique_ptr &HipABI::getLibDeviceModule() { @@ -1704,7 +1703,7 @@ void HipABI::postProcessFunction(Function &F, bool OutliningTapirLoops) { for (Value *SR : SyncRegList) { for (Use &U : SR->uses()) { if (auto *SyncI = dyn_cast(U.getUser())) - CallInst::Create(KitHipSyncFn, {HipStream}, "", + CallInst::Create(KitHipSyncFn, {HipStream}, "", &*SyncI->getSuccessor(0)->begin()); } } @@ -1740,7 +1739,7 @@ void HipABI::finalizeLaunchCalls(Module &M, GlobalVariable *BundleBin) { AllocaInst *StreamAI = getLaunchStream(SavedLaunchCI); assert(StreamAI != nullptr && "unexpected null launch stream!"); IRBuilder<> SyncBuilder(CI); - Value *HipStream = + Value *HipStream = SyncBuilder.CreateLoad(VoidPtrTy, StreamAI, "hipstreamh"); CI->setArgOperand(0, HipStream); SavedLaunchCI = nullptr; @@ -1815,7 +1814,7 @@ HipABIOutputFile HipABI::createTargetObj(const StringRef &ObjFileName) { ObjFile->keep(); if (OptLevel > 0) { - if (OptLevel > 3) + if (OptLevel > 3) OptLevel = 3; LLVM_DEBUG(dbgs() << "\t- running kernel module optimization passes...\n"); PipelineTuningOptions pto; @@ -1831,7 +1830,7 @@ HipABIOutputFile HipABI::createTargetObj(const StringRef &ObjFileName) { OptimizationLevel::O2, OptimizationLevel::O3, }; - OptimizationLevel optLevel = optLevels[OptLevel]; + OptimizationLevel optLevel = optLevels[OptLevel]; // From the LLVM docs: Create the analysis managers. 
// These must be declared in this order so that they are destroyed in the @@ -1860,8 +1859,7 @@ HipABIOutputFile HipABI::createTargetObj(const StringRef &ObjFileName) { legacy::PassManager PassMgr; if (AMDTargetMachine->addPassesToEmitFile(PassMgr, ObjFile->os(), nullptr, - CodeGenFileType::ObjectFile, - false)) + CodeGenFileType::ObjectFile, false)) report_fatal_error("hipabi: AMDGPU target failed!"); PassMgr.run(KernelModule); @@ -1998,11 +1996,10 @@ GlobalVariable *HipABI::embedBundle(HipABIOutputFile &BundleFile) { StringRef(Bundle->getBufferStart(), Bundle->getBufferSize()), Bundle->getBufferSize(), Int8Ty); GlobalVariable *BundleGV; - BundleGV = new GlobalVariable(M, BundleArray->getType(), true, - GlobalValue::PrivateLinkage, BundleArray, - "__hip_fatbin", - nullptr, GlobalVariable::NotThreadLocal); - + BundleGV = new GlobalVariable( + M, BundleArray->getType(), true, GlobalValue::PrivateLinkage, BundleArray, + "__hip_fatbin", nullptr, GlobalVariable::NotThreadLocal); + const char *BundleSectionName = ".hip_fatbin"; BundleGV->setUnnamedAddr(GlobalValue::UnnamedAddr::None); BundleGV->setSection(BundleSectionName);