diff --git a/util/hw_stats/run_hw.py b/util/hw_stats/run_hw.py index 22283b7c4..4c5fbbcb4 100755 --- a/util/hw_stats/run_hw.py +++ b/util/hw_stats/run_hw.py @@ -69,7 +69,7 @@ parser.add_option("-l", "--limit_kernel_number", dest="kernel_number", type=int, default=-99, help="Limits the number of profiled kernels (useful in larger applications") parser.add_option("-C", "--collect", dest="collect", default="cycles", - help="Pass what you want from the hardware. Options are: \"cycles,other_stats\"") + help="Pass what you want from the hardware. Options are: \"cycles,other_stats,full_set\"") (options, args) = parser.parse_args() @@ -147,6 +147,14 @@ ",l1tex__t_sectors_pipe_lsu_mem_global_op_st_lookup_miss.sum,idc__requests.sum,idc__requests_lookup_hit.sum," +\ "sm__sass_inst_executed_op_shared_ld.sum,sm__sass_inst_executed_op_shared_st.sum,lts__t_sectors_srcunit_tex_op_read_lookup_miss.sum,lts__t_sectors_srcunit_tex_op_write_lookup_miss.sum,sm__pipe_alu_cycles_active.sum,sm__pipe_fma_cycles_active.sum,sm__pipe_fp64_cycles_active.sum,sm__pipe_shared_cycles_active.sum,sm__pipe_tensor_cycles_active.sum,sm__pipe_tensor_op_hmma_cycles_active.sum,sm__cycles_active.sum,sm__cycles_active.avg,sm__cycles_elapsed.avg,sm__sass_thread_inst_executed_op_integer_pred_on.sum,sm__sass_thread_inst_executed_ops_dadd_dmul_dfma_pred_on.sum,sm__sass_thread_inst_executed_ops_fadd_fmul_ffma_pred_on.sum,sm__sass_thread_inst_executed_ops_hadd_hmul_hfma_pred_on.sum,sm__inst_executed_pipe_alu.sum,sm__inst_executed_pipe_fma.sum,sm__inst_executed_pipe_fp16.sum,sm__inst_executed_pipe_fp64.sum,sm__inst_executed_pipe_tensor.sum,sm__inst_executed_pipe_tex.sum,sm__inst_executed_pipe_xu.sum,sm__inst_executed_pipe_lsu.sum," +\ 
"sm__sass_thread_inst_executed_op_fp16_pred_on.sum,sm__sass_thread_inst_executed_op_fp32_pred_on.sum,sm__sass_thread_inst_executed_op_fp64_pred_on.sum,sm__sass_thread_inst_executed_op_dmul_pred_on.sum,sm__sass_thread_inst_executed_op_dfma_pred_on.sum,sm__sass_inst_executed_op_memory_128b.sum,sm__sass_inst_executed_op_memory_64b.sum,sm__sass_inst_executed_op_memory_32b.sum,sm__sass_inst_executed_op_memory_16b.sum,sm__sass_inst_executed_op_memory_8b.sum,smsp__thread_inst_executed_per_inst_executed.ratio,sm__sass_thread_inst_executed.sum" +\ + ",l1tex__data_bank_conflicts_pipe_lsu_mem_shared.sum,l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum,l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum" +\ + " --csv --page raw --target-processes all " + kernel_number +\ + " " + exec_path + " " + str(args) +\ + " | tee " + os.path.join(this_run_dir,logfile + ".nsight") + elif "full_set" in options.collect: + if options.nsight_profiler: + sh_contents += "\nexport CUDA_VERSION=\"" + cuda_version + "\"; export CUDA_VISIBLE_DEVICES=\"" + options.device_num +\ + "\" ; timeout 30m nv-nsight-cu-cli --set full" +\ " --csv --page raw --target-processes all " + kernel_number +\ " " + exec_path + " " + str(args) +\ " | tee " + os.path.join(this_run_dir,logfile + ".nsight") diff --git a/util/job_launching/apps/define-all-apps.yml b/util/job_launching/apps/define-all-apps.yml index 2b553651a..0a03ef7ef 100644 --- a/util/job_launching/apps/define-all-apps.yml +++ b/util/job_launching/apps/define-all-apps.yml @@ -92,6 +92,13 @@ GPU_Microbenchmark: - shared_lat: - args: accel-sim-mem: 1G + - shared_bank_conflicts: + ## argument 1 kernel has conflicts + - args: 1 + accel-sim-mem: 1G + ## argument 2 kernel doesn't have conflicts + - args: 2 + accel-sim-mem: 1G - MaxFlops: - args: accel-sim-mem: 1G diff --git a/util/job_launching/stats/example_stats.yml b/util/job_launching/stats/example_stats.yml index 5b3a52351..768b0680d 100644 --- a/util/job_launching/stats/example_stats.yml +++ 
b/util/job_launching/stats/example_stats.yml @@ -24,6 +24,8 @@ collect_aggregate: - 'total dram reads\s*=\s*(.*)' - 'total dram writes\s*=\s*(.*)' - 'kernel_launch_uid\s*=\s*(.*)' + - 'gpgpu_n_shmem_bkconflict\s*=\s*(.*)' + - 'gpgpu_n_l1cache_bkconflict\s*=\s*(.*)' # These stats are reset each kernel and should not be diff'd diff --git a/util/plotting/correl_mappings.py b/util/plotting/correl_mappings.py index eb44dd9ad..54a70fb61 100644 --- a/util/plotting/correl_mappings.py +++ b/util/plotting/correl_mappings.py @@ -474,6 +474,17 @@ drophwnumbelow=0, plottype="log", stattype="counter" + ), + CorrelStat(chart_name="Shared Memory Bank Conflicts", + plotfile="shmem-bank-conflict", + hw_eval="np.average(hw[\"l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum\"])\ + + np.average(hw[\"l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum\"])", + hw_error=None, + sim_eval="float(sim[\"gpgpu_n_shmem_bkconflict\s*=\s*(.*)\"])", + hw_name="all", + drophwnumbelow=0, + plottype="log", + stattype="counter" ), CorrelStat(chart_name="DRAM Reads", plotfile="dram-read-transactions",