Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support PAPI@7 #9

Open
wants to merge 6 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 15 additions & 3 deletions Project.toml
Original file line number Diff line number Diff line change
@@ -1,16 +1,28 @@
name = "PAPI"
uuid = "c3f66453-0ce3-4e3d-8601-c404262f204f"
authors = ["Tom Haber <[email protected]>"]
version = "0.3.0"
version = "0.4.0"

[deps]
JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
Libdl = "8f399da3-3557-5675-b5ff-fb832c97cbdb"
NUMA_jll = "7f51dc2b-bb24-59f8-b771-bb1490e4195d"
PAPI_jll = "062e04e5-c3d3-5549-ab66-579a72a7bc1b"
Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
Preferences = "21216c6a-2e73-6563-6e65-726566657250"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

[weakdeps]
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"

[compat]
julia = "1"
CUDA = "4"
JSON = "0.21"
NUMA_jll = "2"
PAPI_jll = "7"
Preferences = "1"
Statistics = "1"
julia = "1.6"

[extras]
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
97 changes: 0 additions & 97 deletions deps/build.jl

This file was deleted.

106 changes: 106 additions & 0 deletions examples/cuda.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
# Example: measuring GPU roofline-style metrics (memory bandwidth + FLOP rate)
# for a CUDA kernel via PAPI's `cuda:::` component.
using CUDA
import Libdl

CUDA.versioninfo()
# NOTE(review): these dlopen lines force the CUDA libraries into the global
# namespace so PAPI's cuda component can resolve them — presumably only needed
# on some setups; kept for reference. TODO confirm whether they can be removed.
# Libdl.dlopen(CUDA.CUDA_Driver_jll.libcuda, Libdl.RTLD_NOW | Libdl.RTLD_GLOBAL)
# Libdl.dlopen(CUDA.CUDA_Runtime_jll.libcudart, Libdl.RTLD_NOW | Libdl.RTLD_GLOBAL)
# Libdl.dlopen(CUDA.CUDA_Runtime_jll.libcupti, Libdl.RTLD_NOW | Libdl.RTLD_GLOBAL)

using PAPI
using Unitful

# Sketch of custom Unitful units (bits/bytes) for labelling results; unused.
# module CycleUnits
# using Unitful
# @unit nat "nat" Nat 1 false
# @unit bit "bit" Bit log(2)*u"nat" false
# @unit B "B" Byte 8bit false
# Unitful.register(@__MODULE__)
# end

# Background reading on roofline analysis with PAPI + NVIDIA counters:
# https://crd.lbl.gov/assets/Uploads/ECP22-Roofline-2-NVIDIA-and-NERSC.pdf
# https://www.exascaleproject.org/wp-content/uploads/2021/01/PAPI_BOF_Presentation-pdf.pdf

# Memory-traffic events used below, one per cache level:
# - DRAM: `cuda:::dram__bytes.sum`
# - L2: `cuda:::lts__t_bytes.sum`
# - L1: `cuda:::l1tex__t_bytes.sum`

# FLOP events used below (double precision shown; the example substitutes
# the d/f/h prefix per element type):
# - Double precision:
# - `cuda:::sm__sass_thread_inst_executed_op_dadd_pred_on.sum`
# - `cuda:::sm__sass_thread_inst_executed_op_dmul_pred_on.sum`
# - `cuda:::sm__sass_thread_inst_executed_op_dfma_pred_on.sum`

"""
    time_events!(ev_set, dev)

Register the SM cycle counters for CUDA device `dev` on `ev_set`:
elapsed cycles and cycles-per-second, from which wall time can be
derived as `cycles / cycles_per_second`.

Throws an error if an event cannot be added (e.g. the cuda component
is unavailable or the event set is full).
"""
function time_events!(ev_set, dev)
    for name in ("cuda:::sm__cycles_elapsed.avg:device=$dev",
                 "cuda:::sm__cycles_elapsed.avg.per_second:device=$dev")
        # `@assert` can be disabled at higher optimization levels, so use an
        # explicit check: a silently missing event would corrupt every
        # measurement made with this set.
        PAPI.try_add_event(ev_set, PAPI.name_to_event(name)) ||
            error("failed to add PAPI event $name")
    end
    return nothing
end

"""
    memory_events!(ev_set, dev)

Register the memory-bandwidth counters (DRAM, L2, L1, each in
bytes-per-second) for CUDA device `dev` on `ev_set`.

Throws an error if an event cannot be added (e.g. the cuda component
is unavailable or the event set is full).
"""
function memory_events!(ev_set, dev)
    for name in ("cuda:::dram__bytes.sum.per_second:device=$dev",        # DRAM
                 "cuda:::lts__t_bytes.sum.per_second:device=$dev",       # L2
                 "cuda:::l1tex__t_bytes.sum.per_second:device=$dev")     # L1
        # `@assert` can be disabled at higher optimization levels, so use an
        # explicit check for this required setup step.
        PAPI.try_add_event(ev_set, PAPI.name_to_event(name)) ||
            error("failed to add PAPI event $name")
    end
    return nothing
end

"""
    float_events!(ev_set, dev, ::Type{T}) where T<:AbstractFloat

Register the SASS FLOP instruction counters (add, mul, fma) for element
type `T` on CUDA device `dev`. `T` must be `Float16`, `Float32` or
`Float64`; the type selects the `h`/`f`/`d` event-name prefix.

Throws `ArgumentError` for an unsupported `T`, and an error if an event
cannot be added.
"""
function float_events!(ev_set, dev, ::Type{T}) where T<:AbstractFloat
    prefix = if T == Float64
        'd'
    elseif T == Float32
        'f'
    elseif T == Float16
        'h'
    else
        throw(ArgumentError("no SASS FLOP events known for $T; expected Float16, Float32 or Float64"))
    end
    for op in ("add", "mul", "fma")
        name = "cuda:::sm__sass_thread_inst_executed_op_$(prefix)$(op)_pred_on.sum:device=$dev"
        # `@assert` can be disabled at higher optimization levels, so use an
        # explicit check for this required setup step.
        PAPI.try_add_event(ev_set, PAPI.name_to_event(name)) ||
            error("failed to add PAPI event $name")
    end
    return nothing
end

"""
    measure(f, ::Type{T}) where T

Run the (GPU-launching) thunk `f` twice — once with the memory/time event
set active, once with the FLOP event set active — and return a NamedTuple
with the raw counters plus derived `time`, `flop` and `flops` values.
`T` is the floating-point element type used to pick the FLOP events.

NOTE(review): assumes both runs of `f()` perform identical work, since
bandwidth and FLOPs come from separate runs — confirm with caller.
"""
function measure(f, ::Type{T}) where T
    pcuda = PAPI.find_component("cuda")
    # Two event sets: hardware counter slots are limited, so memory/time and
    # FLOP events are collected in separate passes.
    ev_set_mem = PAPI.EventSet(pcuda)
    ev_set_flops = PAPI.EventSet(pcuda)

    dev = CUDA.device().handle
    memory_events!(ev_set_mem, dev)
    time_events!(ev_set_mem, dev)

    float_events!(ev_set_flops, dev, T)

    # Pass 1: bandwidth + timing. CUDA.@sync ensures the kernel has finished
    # before the counters are stopped.
    PAPI.start_counters(ev_set_mem)
    CUDA.@sync f()
    counters = PAPI.stop_counters(ev_set_mem)

    # dram, l2, l1 in bytes.per_second
    # Destructuring order must match the order events were added above:
    # memory_events! first (dram, l2, l1), then time_events! (cycles, rate).
    dram, l2, l1, cycles, cycles_per_second = counters
    time = cycles/cycles_per_second

    # Pass 2: FLOP instruction counts over a second, identical run of `f`.
    PAPI.start_counters(ev_set_flops)
    CUDA.@sync f()

    counters = PAPI.stop_counters(ev_set_flops)
    add, mul, fma = counters

    # An FMA performs two floating-point operations (multiply + add).
    flop = add + mul + 2fma
    flops = flop/time

    (;dram, l2, l1, cycles, cycles_per_second, time, add, mul, fma, flop, flops)
end

"""
    square_kernel!(A, B)

CUDA kernel: each thread squares one element of `B` and stores it into the
corresponding slot of `A`. Launch with as many threads as elements.
"""
function square_kernel!(A, B)
    i = threadIdx().x
    # Bounds are guaranteed by the launch configuration (threads == length).
    @inbounds A[i] = B[i]^2
    # Kernels must not return a value.
    return nothing
end

# Driver: allocate device buffers (Float32 by default for CUDA.zeros) and
# measure the square kernel.
A = CUDA.zeros(1024)
B = CUDA.zeros(1024)

# do-block is the thunk `f` passed to measure; it is launched twice there.
# 1024 threads in a single block matches the buffer length, so the kernel's
# @inbounds access is safe.
results = measure(eltype(A)) do
    @cuda threads=1024 square_kernel!(A, B)
end

@show results
2 changes: 2 additions & 0 deletions gen/LocalPreferences.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
[CUDA_Runtime_jll]
version = "none"
6 changes: 6 additions & 0 deletions gen/Project.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
[deps]
Clang = "40e3b903-d033-50b4-a0cc-940c62c95e31"
PAPI_jll = "062e04e5-c3d3-5549-ab66-579a72a7bc1b"

[extras]
CUDA_Runtime_jll = "76a88914-d11a-5bdc-97e0-2f5a05c973a2"
23 changes: 23 additions & 0 deletions gen/generator.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Generate Julia bindings for PAPI's C API with Clang.jl.
using Clang.Generators
using PAPI_jll

@assert PAPI_jll.is_available()

cd(@__DIR__)

# Header directory shipped inside the PAPI_jll artifact.
papi_include = normpath(PAPI_jll.artifact_dir, "include")

# Output path, module name, library name, etc. live in generator.toml.
gen_options = load_options(joinpath(@__DIR__, "generator.toml"))

# Compiler flags: the Clang defaults must come first, then our include path
# (e.g. extra "-DXXXXXXXXX" defines could be pushed here as well).
clang_args = get_default_args()
push!(clang_args, "-I$papi_include")

# papi.h is the single top-level header; it pulls in everything else.
# (Clang.jl also offers an experimental `detect_headers` for auto-detection.)
top_headers = [joinpath(papi_include, "papi.h")]

# Build the wrapper module.
build!(create_context(top_headers, clang_args, gen_options))
8 changes: 8 additions & 0 deletions gen/generator.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
[general]
library_name = "libpapi"
output_file_path = "./libPAPI.jl"
module_name = "API"
jll_pkg_name = "PAPI_jll"
prologue_file_path = "./prologue.jl"
print_enum_as_integer = true
print_using_CEnum = false
1 change: 1 addition & 0 deletions gen/prologue.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
PAPI_VERSION_NUMBER(maj,min,rev,inc) = (((maj)<<24) | ((min)<<16) | ((rev)<<8) | (inc))
Loading