[NFC] Cleanup code #50

Merged · 1 commit · Aug 29, 2024
2 changes: 1 addition & 1 deletion clang/lib/Driver/ToolChain.cpp
@@ -1716,7 +1716,7 @@ void ToolChain::AddKitsuneLinkerArgs(const ArgList &Args,
   case llvm::TapirTargetID::Hip:
     CmdArgs.push_back(
         Args.MakeArgString(StringRef("-L") + KITSUNE_HIP_LIBRARY_DIR));
-    ExtractArgsFromString("-lamdhip64", CmdArgs, Args);
+    ExtractArgsFromString("-lamdhip64", CmdArgs, Args);
     ExtractArgsFromString(KITSUNE_HIP_EXTRA_LINKER_FLAGS, CmdArgs, Args);
     break;
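
Note on the helper used twice in this hunk: ExtractArgsFromString evidently splits a single flag string (for example KITSUNE_HIP_EXTRA_LINKER_FLAGS) into individual driver arguments. Its implementation is not part of this diff, so the sketch below is only a guess at its behavior; the lowercase name and the std::vector-based signature are ours, not Kitsune's.

#include <sstream>
#include <string>
#include <vector>

// Hypothetical stand-in for ExtractArgsFromString: split a whitespace-
// separated flag string and append each token to the argument list.
static void extractArgsFromString(const std::string &Flags,
                                  std::vector<std::string> &CmdArgs) {
  std::istringstream Stream(Flags);
  std::string Tok;
  while (Stream >> Tok)
    CmdArgs.push_back(Tok);
}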
3 changes: 1 addition & 2 deletions kitsune/CMakeLists.txt
@@ -274,7 +274,7 @@ if (KITSUNE_HIP_ENABLE)
   if (DEFINED $ENV{ROCM_PATH})
     message(STATUS "  using ROCM_PATH environment variable.")
     set(ROCM_PATH $ENV{ROCM_PATH} CACHE PATH "AMD ROCM/HIP installation directory.")
-  else()
+  else()
     message(STATUS "  selecting a common default install location.")
     set(ROCM_PATH "/opt/rocm" CACHE PATH "AMD ROCM/HIP installation directory.")
   endif()
@@ -351,7 +350,6 @@ if (KITSUNE_OPENCILK_ENABLE)
     CACHE STRING
     "Additional C++ compiler flags needed to build Kokkos. These are only used when building Kitsune")

-
 # We pass some LLVM_* variables to make Cheetah behave as if it were an
 # in-tree build.
 set(KITSUNE_CHEETAH_SOURCE_DIR ${KITSUNE_TARGETS_BINARY_DIR}/cheetah/cheetah)
13 changes: 6 additions & 7 deletions kitsune/runtime/CMakeLists.txt
@@ -72,7 +72,6 @@ target_include_directories(${KITRT}
 # set_property(TARGET ${KITRT} APPEND PROPERTY
 #   BUILD_RPATH "$ORIGIN/../lib${LLVM_LIBDIR_SUFFIX}:${PROJECT_BINARY_DIR}/lib${LLVM_LIBDIR_SUFFIX}")

-
 find_library(LIB_LLVM
   NAMES LLVM
         LLVM-${LLVM_VERSION_MAJOR}
@@ -140,21 +139,21 @@ endif()
 # kitsune/CMakeLists.txt). They are passed in via CMake because the Kitsune
 # runtime is treated as an "ExternalProject".
 #
-# Note on AMD HIP CMake details... When is HIP not HIP? When it is HIP...
+# Note on AMD HIP CMake details... When is HIP not HIP? When it is HIP...
 # It is easy to confuse requirements for HIP as the HIP "language" vs. the
 # HIP runtime. The language has tangled dependencies within a LLVM context
-# as they build around LLVM and Clang too. Using some of the 'hip::host'
+# as they build around LLVM and Clang too. Using some of the 'hip::host'
 # targets are actually intended for the language and not runtime libraries.
-# Unfortunately, these details seem to be a moving target with ecah release
-# of rocm/hip...
+# Unfortunately, these details seem to be a moving target with ecah release
+# of rocm/hip...
 if (KITSUNE_HIP_ENABLE)

   if (NOT DEFINED ROCM_PATH)
     message(STATUS "Kitsune HIP support enabled but ROCM_PATH not set.")
     if (DEFINED $ENV{ROCM_PATH})
       message(STATUS "  using ROCM_PATH environment variable.")
       set(ROCM_PATH $ENV{ROCM_PATH} CACHE PATH "AMD ROCM/HIP installation directory.")
-    else()
+    else()
       message(STATUS "  selecting a common default install location.")
       set(ROCM_PATH "/opt/rocm" CACHE PATH "AMD ROCM/HIP installation directory.")
     endif()
@@ -177,7 +176,7 @@ if (KITSUNE_HIP_ENABLE)

   message(STATUS "hip include dir: ${hip_INCLUDE_DIR}")
   message(STATUS "hip library dir: ${hip_LIB_INSTALL_DIR}")
-
+
   target_compile_definitions(${KITRT} PUBLIC KITRT_HIP_ENABLED)
   target_compile_definitions(${KITRT} PUBLIC __HIP_PLATFORM_AMD__)
   target_include_directories(${KITRT} PUBLIC ${hip_INCLUDE_DIR})
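
The comment in this file draws a distinction that is easy to miss: the Kitsune runtime consumes the HIP runtime library, not the HIP language, which is why the hunk defines __HIP_PLATFORM_AMD__ and adds plain include/library paths instead of enabling CMake's HIP language support. A minimal host-only C++ sketch of that consumption model, assuming the /opt/rocm default layout used above:

// Plain C++ against the HIP runtime: no kernels, no hipcc, no hip::device.
// Build sketch (paths assumed): clang++ -D__HIP_PLATFORM_AMD__ \
//   -I/opt/rocm/include example.cpp -L/opt/rocm/lib -lamdhip64
#include <hip/hip_runtime_api.h>
#include <cstdio>

int main() {
  int Count = 0;
  if (hipGetDeviceCount(&Count) != hipSuccess || Count == 0) {
    std::fprintf(stderr, "no HIP devices visible\n");
    return 1;
  }
  void *DevPtr = nullptr;
  if (hipMalloc(&DevPtr, 1024) == hipSuccess) { // runtime API only
    std::printf("allocated 1KB on device 0 of %d\n", Count);
    hipFree(DevPtr);
  }
  return 0;
}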
22 changes: 11 additions & 11 deletions llvm/lib/Transforms/Tapir/CudaABI.cpp
@@ -414,7 +414,7 @@ CudaLoop::CudaLoop(Module &M, Module &KernelModule, const std::string &KN,
                             Int32Ty, // threads-per-block
                             KernelInstMixTy->getPointerTo(), // instruction mix info
                             VoidPtrTy); // opaque cuda stream
-
+
   KitCudaMemPrefetchFn =
       M.getOrInsertFunction("__kitcuda_mem_gpu_prefetch",
                             VoidPtrTy, // return an opaque stream
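
For readers unfamiliar with the pattern being reflowed here: Module::getOrInsertFunction declares the runtime hook if it is not already in the module and returns a FunctionCallee that later codegen can call directly. A minimal sketch of the same declaration, assuming a recent LLVM with opaque pointers (the helper name is ours):

#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Module.h"
using namespace llvm;

// Declare (or look up) the runtime prefetch hook in module M. The name and
// signature mirror the diff above; everything else is illustrative.
FunctionCallee declarePrefetchHook(Module &M) {
  LLVMContext &Ctx = M.getContext();
  PointerType *VoidPtrTy = PointerType::getUnqual(Ctx); // opaque 'ptr'
  return M.getOrInsertFunction("__kitcuda_mem_gpu_prefetch",
                               VoidPtrTy,  // returns an opaque stream
                               VoidPtrTy,  // pointer to prefetch
                               VoidPtrTy); // stream to use
}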
@@ -986,14 +986,14 @@ void CudaLoop::processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI,
   // supported) we can end up with multiple calls to the outlined
   // loop (which has been setup for dead code elimination) but can
   // cause invalid IR that trips us up when handling the GPU module
-  // code generation. This is a challenge in the Tapir design that
-  // was not geared to handle some of the nuances of GPU target
-  // transformations (and code gen). To address this, we need to
+  // code generation. This is a challenge in the Tapir design that
+  // was not geared to handle some of the nuances of GPU target
+  // transformations (and code gen). To address this, we need to
   // do some clean up to keep the IR correct (or the verifier will
-  // fail on us...). Specifically, we can no longer depend upon
+  // fail on us...). Specifically, we can no longer depend upon
   // DCE as it runs too late in the GPU transformation process...
   //
-  // TODO: This code can be shared between the cuda and hip targets...
+  // TODO: This code can be shared between the cuda and hip targets...
   //
   Function *TargetKF = KernelModule.getFunction(KernelName);
   std::list<Instruction *> RemoveList;
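
The comment block explains why the transform must scrub leftover calls to the outlined loop itself rather than waiting for DCE, which runs too late in the GPU pipeline. A minimal sketch of that kind of clean-up, using standard LLVM iteration idioms (the helper name is ours, not the PR's):

#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
#include <list>
using namespace llvm;

// Collect every remaining call to the outlined function, then erase them.
// Collect first, erase second: erasing while iterating users() is unsafe.
static void removeStaleCalls(Function *Outlined) {
  std::list<Instruction *> RemoveList;
  for (User *U : Outlined->users())
    if (auto *CI = dyn_cast<CallInst>(U))
      RemoveList.push_back(CI);
  for (Instruction *I : RemoveList) {
    if (!I->getType()->isVoidTy())
      I->replaceAllUsesWith(PoisonValue::get(I->getType()));
    I->eraseFromParent();
  }
}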
@@ -1029,7 +1029,7 @@ void CudaLoop::processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI,
   IRBuilder<> NewBuilder(&NewBB->front());

   // TODO: There is some potential here to share this code across both
-  // the hip and cuda transforms...
+  // the hip and cuda transforms...
   LLVM_DEBUG(dbgs() << "\t*- code gen packing of " << OrderedInputs.size()
                     << " kernel args.\n");
   PointerType *VoidPtrTy = PointerType::getUnqual(Ctx);
@@ -1049,8 +1049,8 @@ void CudaLoop::processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI,
     i++;

     if (CodeGenPrefetch && V->getType()->isPointerTy()) {
-      LLVM_DEBUG(dbgs() << "\t\t- code gen prefetch for kernel arg #"
-                        << i << "\n");
+      LLVM_DEBUG(dbgs() << "\t\t- code gen prefetch for kernel arg #" << i
+                        << "\n");
       Value *VoidPP = NewBuilder.CreateBitCast(V, VoidPtrTy);
       Value *SPtr = NewBuilder.CreateLoad(VoidPtrTy, CudaStream);
       Value *NewSPtr =
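
The reflowed debug lines sit inside the prefetch codegen loop: each pointer-typed kernel argument is handed to the runtime prefetch hook along with the current stream, and the returned stream is kept. Distilled into a standalone helper (our naming, same IRBuilder calls):

#include "llvm/IR/IRBuilder.h"
using namespace llvm;

// Emit: new_stream = prefetch(bitcast(arg), load(stream_slot)).
// StreamSlot is the alloca that carries the opaque stream between calls.
static Value *emitArgPrefetch(IRBuilder<> &B, FunctionCallee PrefetchFn,
                              Value *Arg, Value *StreamSlot,
                              PointerType *VoidPtrTy) {
  Value *VoidPP = B.CreateBitCast(Arg, VoidPtrTy);
  Value *Stream = B.CreateLoad(VoidPtrTy, StreamSlot);
  return B.CreateCall(PrefetchFn, {VoidPP, Stream});
}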
@@ -2005,15 +2005,15 @@ CudaABIOutputFile CudaABI::generatePTX() {
   FunctionAnalysisManager fam;
   CGSCCAnalysisManager cgam;
   ModuleAnalysisManager mam;
-
+
   PassBuilder pb(PTXTargetMachine, pto);
   pb.registerModuleAnalyses(mam);
   pb.registerCGSCCAnalyses(cgam);
   pb.registerFunctionAnalyses(fam);
   pb.registerLoopAnalyses(lam);
   PTXTargetMachine->registerPassBuilderCallbacks(pb, false);
   pb.crossRegisterProxies(lam, fam, cgam, mam);
-
+
   ModulePassManager mpm = pb.buildPerModuleDefaultPipeline(optLevel);
   mpm.addPass(VerifierPass());
   LLVM_DEBUG(dbgs() << "\t\t* module: " << KernelModule.getName() << "\n");
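
This hunk only trims trailing whitespace, but it is a good snapshot of the new-pass-manager boilerplate: four analysis managers, registered and cross-wired, before buildPerModuleDefaultPipeline. A self-contained sketch of the same sequence (names ours; assumes an O1-O3 level, since the default pipeline builder rejects O0):

#include "llvm/Analysis/CGSCCPassManager.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Target/TargetMachine.h"
using namespace llvm;

// Run the default O-level pipeline over M, then verify the result.
// The analysis managers must all be registered and cross-wired first.
void runDefaultPipeline(Module &M, TargetMachine *TM, OptimizationLevel OL) {
  PipelineTuningOptions PTO;
  LoopAnalysisManager LAM;
  FunctionAnalysisManager FAM;
  CGSCCAnalysisManager CGAM;
  ModuleAnalysisManager MAM;

  PassBuilder PB(TM, PTO);
  PB.registerModuleAnalyses(MAM);
  PB.registerCGSCCAnalyses(CGAM);
  PB.registerFunctionAnalyses(FAM);
  PB.registerLoopAnalyses(LAM);
  PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);

  ModulePassManager MPM = PB.buildPerModuleDefaultPipeline(OL);
  MPM.addPass(VerifierPass());
  MPM.run(M, MAM);
}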
95 changes: 46 additions & 49 deletions llvm/lib/Transforms/Tapir/HipABI.cpp
@@ -113,11 +113,10 @@
 #include "llvm/Transforms/Tapir/Outline.h"
 #include "llvm/Transforms/Tapir/TapirGPUUtils.h"
 #include "llvm/Transforms/Tapir/TapirLoopInfo.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/TapirUtils.h"
 #include "llvm/Transforms/Utils/AMDGPUEmitPrintf.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/Mem2Reg.h"
+#include "llvm/Transforms/Utils/TapirUtils.h"

 using namespace llvm;
Expand Down Expand Up @@ -249,8 +248,7 @@ cl::opt<ROCmABIVersion> ROCmABITarget(
"hipabi-rocm-abi", cl::init(ROCm_ABI_V4), cl::Hidden,
cl::desc("Select the targeted ROCm ABI version."),
cl::values(clEnumValN(ROCm_ABI_V4, "v4", "Target ROCm version 4 ABI."),
clEnumValN(ROCm_ABI_V5, "v5",
"Target ROCm v. 5 ABI.")));
clEnumValN(ROCm_ABI_V5, "v5", "Target ROCm v. 5 ABI.")));

cl::opt<bool> Use64ElementWavefront(
"hipabi-wavefront64", cl::init(true), cl::Hidden,
@@ -623,15 +621,17 @@ void HipABI::transformConstants(Function *Fn) {
         LLVM_DEBUG(dbgs() << "\t\t\t\tnew store: " << *SI << "\n");
       } else if (auto *Call = dyn_cast<CallBase>(U->getUser())) {
         unsigned argNo = Call->getArgOperandNo(U);
-        LLVM_DEBUG(dbgs() << "\t\tpatching callable instruction: " << *Call << "\n");
+        LLVM_DEBUG(dbgs() << "\t\tpatching callable instruction: " << *Call
+                          << "\n");
         // FIXME: This is not correct! The function operand should be
         // checked to see what address space it expects.
         Instruction *asCast =
             new AddrSpaceCastInst(NewGEP, OldGEP->getType(), "", Call);
         Call->setArgOperand(argNo, asCast);
         LLVM_DEBUG(dbgs() << "\t\t\t\tnew call: " << *Call << "\n");
       } else if (auto *GEP = dyn_cast<GetElementPtrInst>(U->getUser())) {
-        LLVM_DEBUG(dbgs() << "\t\tpatching gep instruction:\n\t\t\t" << *GEP << "\n");
+        LLVM_DEBUG(dbgs() << "\t\tpatching gep instruction:\n\t\t\t" << *GEP
+                          << "\n");
         Instruction *asCast =
             new AddrSpaceCastInst(NewGEP, OldGEP->getType(), "", GEP);
         GEP->setOperand(GEP->getPointerOperandIndex(), asCast);
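
The FIXME above is worth keeping in mind: the patch assumes the user still wants the old (generic) address space and inserts a cast back to it. The repeated pattern, as a helper (our name, carrying the same caveat as the FIXME):

#include "llvm/IR/Instructions.h"
using namespace llvm;

// Cast NewPtr back to OldTy right before InsertBefore. Caveat (per the
// FIXME): the expected address space should really come from the consuming
// instruction, not be assumed to be the old generic one.
static Instruction *castToExpectedAS(Value *NewPtr, Type *OldTy,
                                     Instruction *InsertBefore) {
  return new AddrSpaceCastInst(NewPtr, OldTy, "ascast", InsertBefore);
}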
@@ -830,24 +830,25 @@ HipLoop::HipLoop(Module &M, Module &KModule, const std::string &Name,
                            Int64Ty, // number of floating point ops.
                            Int64Ty); // number of integer ops.

-  KitHipLaunchFn = M.getOrInsertFunction("__kithip_launch_kernel",
-                                         VoidPtrTy, // return an opaque stream
-                                         VoidPtrTy, // fat-binary
-                                         VoidPtrTy, // kernel name
-                                         VoidPtrTy, // arguments
-                                         Int64Ty, // trip count
-                                         Int32Ty, // threads-per-block
+  KitHipLaunchFn = M.getOrInsertFunction(
+      "__kithip_launch_kernel",
+      VoidPtrTy, // return an opaque stream
+      VoidPtrTy, // fat-binary
+      VoidPtrTy, // kernel name
+      VoidPtrTy, // arguments
+      Int64Ty, // trip count
+      Int32Ty, // threads-per-block
       KernelInstMixTy->getPointerTo(), // instruction mix info
-                                         VoidPtrTy); // opaque cuda stream
+      VoidPtrTy); // opaque cuda stream

-  KitHipMemPrefetchFn = M.getOrInsertFunction("__kithip_mem_gpu_prefetch",
-                                              VoidPtrTy, // return an opaque stream
-                                              VoidPtrTy, // pointer to prefetch
-                                              VoidPtrTy); // use opaque stream.
+  KitHipMemPrefetchFn =
+      M.getOrInsertFunction("__kithip_mem_gpu_prefetch",
+                            VoidPtrTy, // return an opaque stream
+                            VoidPtrTy, // pointer to prefetch
+                            VoidPtrTy); // use opaque stream.
 }

-HipLoop::~HipLoop() { /* no-op */
-}
+HipLoop::~HipLoop() { /* no-op */ }

 // TODO: Can we also transform the arguments into a different address space here
 // and avoid our use of 'mutate' elsewhere in the code?
@@ -1315,14 +1316,14 @@ void HipLoop::processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI,
   // supported) we can end up with multiple calls to the outlined
   // loop (which has been setup for dead code elimination) but can
   // cause invalid IR that trips us up when handling the GPU module
-  // code generation. This is a challenge in the Tapir design that
-  // was not geared to handle some of the nuances of GPU target
-  // transformations (and code gen). To address this, we need to
+  // code generation. This is a challenge in the Tapir design that
+  // was not geared to handle some of the nuances of GPU target
+  // transformations (and code gen). To address this, we need to
   // do some clean up to keep the IR correct (or the verifier will
-  // fail on us...). Specifically, we can no longer depend upon
+  // fail on us...). Specifically, we can no longer depend upon
   // DCE as it runs too late in the GPU transformation process...
   //
-  // TODO: This code can be shared between the cuda and hip targets...
+  // TODO: This code can be shared between the cuda and hip targets...
   //
   Function *TargetKF = KernelModule.getFunction(KernelName);
   std::list<Instruction *> RemoveList;
@@ -1357,7 +1358,7 @@ void HipLoop::processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI,
   IRBuilder<> NewBuilder(&NewBB->front());

   // TODO: There is some potential here to share this code across both
-  // the hip and cuda transforms...
+  // the hip and cuda transforms...
   LLVM_DEBUG(dbgs() << "\t*- code gen packing of " << OrderedInputs.size()
                     << " kernel args.\n");
   PointerType *VoidPtrTy = PointerType::getUnqual(Ctx);
@@ -1377,12 +1378,12 @@ void HipLoop::processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI,
     i++;

     if (CodeGenPrefetch && V->getType()->isPointerTy()) {
-      LLVM_DEBUG(dbgs() << "\t\t- code gen prefetch for kernel arg #"
-                        << i << "\n");
+      LLVM_DEBUG(dbgs() << "\t\t- code gen prefetch for kernel arg #" << i
+                        << "\n");
       Value *VoidPP = NewBuilder.CreateBitCast(V, VoidPtrTy);
       Value *SPtr = NewBuilder.CreateLoad(VoidPtrTy, HipStream);
-      Value *NewSPtr =
-          NewBuilder.CreateCall(KitHipMemPrefetchFn, {VoidPP, SPtr});
+      Value *NewSPtr =
+          NewBuilder.CreateCall(KitHipMemPrefetchFn, {VoidPP, SPtr});
       if (not StreamAssigned) {
         NewBuilder.CreateStore(NewSPtr, HipStream);
         StreamAssigned = true;
@@ -1463,13 +1464,13 @@ void HipLoop::processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI,
                         ConstantInt::get(Int64Ty, InstMix.num_iops));

   AllocaInst *AI = NewBuilder.CreateAlloca(KernelInstMixTy);
-  NewBuilder.CreateStore(InstructionMix, AI);
+  NewBuilder.CreateStore(InstructionMix, AI);

   LLVM_DEBUG(dbgs() << "\t*- code gen kernel launch...\n");
   Value *KSPtr = NewBuilder.CreateLoad(VoidPtrTy, HipStream);
-  CallInst *LaunchStream = NewBuilder.CreateCall(KitHipLaunchFn,
-      {DummyFBPtr, KNameParam, argsPtr, CastTripCount,
-       TPBlockValue, AI, KSPtr});
+  CallInst *LaunchStream = NewBuilder.CreateCall(
+      KitHipLaunchFn, {DummyFBPtr, KNameParam, argsPtr, CastTripCount,
+                       TPBlockValue, AI, KSPtr});
   NewBuilder.CreateStore(LaunchStream, HipStream);
   LLVM_DEBUG(dbgs() << "\t\t+- registering launch stream:\n"
                     << "\t\t\tcall: " << *LaunchStream << "\n"
@@ -1514,8 +1515,7 @@ HipABI::HipABI(Module &InputModule)
                            Int32Ty, // host pointer
                            Int64Ty, // device pointer
                            Int64Ty); // number of bytes to copy
-  KitHipSyncFn = M.getOrInsertFunction("__kithip_sync_thread_stream",
-                                       VoidTy,
+  KitHipSyncFn = M.getOrInsertFunction("__kithip_sync_thread_stream", VoidTy,
                                        VoidPtrTy); // no return, nor parameters
   // Build the details we need for the AMDGPU/HIP target.
   std::string ArchString = "amdgcn";
@@ -1575,8 +1575,7 @@ HipABI::HipABI(Module &InputModule)
   ROCmModulesLoaded = false;
 }

-HipABI::~HipABI() { /* no-op */
-}
+HipABI::~HipABI() { /* no-op */ }

 std::unique_ptr<Module> &HipABI::getLibDeviceModule() {

@@ -1704,7 +1703,7 @@ void HipABI::postProcessFunction(Function &F, bool OutliningTapirLoops) {
   for (Value *SR : SyncRegList) {
     for (Use &U : SR->uses()) {
       if (auto *SyncI = dyn_cast<SyncInst>(U.getUser()))
-        CallInst::Create(KitHipSyncFn, {HipStream}, "",
+        CallInst::Create(KitHipSyncFn, {HipStream}, "",
                          &*SyncI->getSuccessor(0)->begin());
     }
   }
@@ -1740,7 +1739,7 @@ void HipABI::finalizeLaunchCalls(Module &M, GlobalVariable *BundleBin) {
       AllocaInst *StreamAI = getLaunchStream(SavedLaunchCI);
       assert(StreamAI != nullptr && "unexpected null launch stream!");
       IRBuilder<> SyncBuilder(CI);
-      Value *HipStream =
+      Value *HipStream =
           SyncBuilder.CreateLoad(VoidPtrTy, StreamAI, "hipstreamh");
       CI->setArgOperand(0, HipStream);
       SavedLaunchCI = nullptr;
@@ -1815,7 +1814,7 @@ HipABIOutputFile HipABI::createTargetObj(const StringRef &ObjFileName) {
   ObjFile->keep();

   if (OptLevel > 0) {
-    if (OptLevel > 3)
+    if (OptLevel > 3)
       OptLevel = 3;
     LLVM_DEBUG(dbgs() << "\t- running kernel module optimization passes...\n");
     PipelineTuningOptions pto;
Expand All @@ -1831,7 +1830,7 @@ HipABIOutputFile HipABI::createTargetObj(const StringRef &ObjFileName) {
OptimizationLevel::O2,
OptimizationLevel::O3,
};
OptimizationLevel optLevel = optLevels[OptLevel];
OptimizationLevel optLevel = optLevels[OptLevel];

// From the LLVM docs: Create the analysis managers.
// These must be declared in this order so that they are destroyed in the
@@ -1860,8 +1859,7 @@ HipABIOutputFile HipABI::createTargetObj(const StringRef &ObjFileName) {

   legacy::PassManager PassMgr;
   if (AMDTargetMachine->addPassesToEmitFile(PassMgr, ObjFile->os(), nullptr,
-                                            CodeGenFileType::ObjectFile,
-                                            false))
+                                            CodeGenFileType::ObjectFile, false))
     report_fatal_error("hipabi: AMDGPU target failed!");

   PassMgr.run(KernelModule);
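
Even under the new pass manager, object emission still goes through the legacy pass manager, as this hunk shows. The essential sequence, as a sketch with error handling reduced to a bool (names ours):

#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
using namespace llvm;

// Lower M to an object file on OS. addPassesToEmitFile returns true when
// the target cannot emit this file type.
static bool emitObjectFile(TargetMachine &TM, Module &M,
                           raw_pwrite_stream &OS) {
  legacy::PassManager PM;
  if (TM.addPassesToEmitFile(PM, OS, /*DwoOut=*/nullptr,
                             CodeGenFileType::ObjectFile))
    return false;
  PM.run(M);
  return true;
}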
@@ -1998,11 +1996,10 @@ GlobalVariable *HipABI::embedBundle(HipABIOutputFile &BundleFile) {
       StringRef(Bundle->getBufferStart(), Bundle->getBufferSize()),
       Bundle->getBufferSize(), Int8Ty);
   GlobalVariable *BundleGV;
-  BundleGV = new GlobalVariable(M, BundleArray->getType(), true,
-                                GlobalValue::PrivateLinkage, BundleArray,
-                                "__hip_fatbin",
-                                nullptr, GlobalVariable::NotThreadLocal);
-
+  BundleGV = new GlobalVariable(
+      M, BundleArray->getType(), true, GlobalValue::PrivateLinkage, BundleArray,
+      "__hip_fatbin", nullptr, GlobalVariable::NotThreadLocal);
+
   const char *BundleSectionName = ".hip_fatbin";
   BundleGV->setUnnamedAddr(GlobalValue::UnnamedAddr::None);
   BundleGV->setSection(BundleSectionName);
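
The embedding step in this last hunk is generic enough to reuse: wrap the bundle bytes in a constant array, hang it off a private global, and pin it to the .hip_fatbin section. A compact sketch of those steps (helper name ours):

#include "llvm/IR/Constants.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Module.h"
using namespace llvm;

// Embed Bytes verbatim as a private constant global in section .hip_fatbin.
static GlobalVariable *embedFatbin(Module &M, StringRef Bytes) {
  Constant *Arr =
      ConstantDataArray::getString(M.getContext(), Bytes, /*AddNull=*/false);
  auto *GV = new GlobalVariable(M, Arr->getType(), /*isConstant=*/true,
                                GlobalValue::PrivateLinkage, Arr,
                                "__hip_fatbin");
  GV->setUnnamedAddr(GlobalValue::UnnamedAddr::None);
  GV->setSection(".hip_fatbin");
  return GV;
}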