diff --git a/util/hw_stats/run_hw.py b/util/hw_stats/run_hw.py
index 22283b7c4..4c5fbbcb4 100755
--- a/util/hw_stats/run_hw.py
+++ b/util/hw_stats/run_hw.py
@@ -69,7 +69,7 @@
 parser.add_option("-l", "--limit_kernel_number", dest="kernel_number", type=int, default=-99,
                  help="Limits the number of profiled kernels (useful in larger applications")
 parser.add_option("-C", "--collect", dest="collect", default="cycles",
-                help="Pass what you want from the hardware. Options are: \"cycles,other_stats\"")
+                help="Pass what you want from the hardware. Options are: \"cycles,other_stats,full_set\"")
 
 (options, args) = parser.parse_args()
 
@@ -147,6 +147,14 @@
                     ",l1tex__t_sectors_pipe_lsu_mem_global_op_st_lookup_miss.sum,idc__requests.sum,idc__requests_lookup_hit.sum," +\
                     "sm__sass_inst_executed_op_shared_ld.sum,sm__sass_inst_executed_op_shared_st.sum,lts__t_sectors_srcunit_tex_op_read_lookup_miss.sum,lts__t_sectors_srcunit_tex_op_write_lookup_miss.sum,sm__pipe_alu_cycles_active.sum,sm__pipe_fma_cycles_active.sum,sm__pipe_fp64_cycles_active.sum,sm__pipe_shared_cycles_active.sum,sm__pipe_tensor_cycles_active.sum,sm__pipe_tensor_op_hmma_cycles_active.sum,sm__cycles_active.sum,sm__cycles_active.avg,sm__cycles_elapsed.avg,sm__sass_thread_inst_executed_op_integer_pred_on.sum,sm__sass_thread_inst_executed_ops_dadd_dmul_dfma_pred_on.sum,sm__sass_thread_inst_executed_ops_fadd_fmul_ffma_pred_on.sum,sm__sass_thread_inst_executed_ops_hadd_hmul_hfma_pred_on.sum,sm__inst_executed_pipe_alu.sum,sm__inst_executed_pipe_fma.sum,sm__inst_executed_pipe_fp16.sum,sm__inst_executed_pipe_fp64.sum,sm__inst_executed_pipe_tensor.sum,sm__inst_executed_pipe_tex.sum,sm__inst_executed_pipe_xu.sum,sm__inst_executed_pipe_lsu.sum," +\
                     "sm__sass_thread_inst_executed_op_fp16_pred_on.sum,sm__sass_thread_inst_executed_op_fp32_pred_on.sum,sm__sass_thread_inst_executed_op_fp64_pred_on.sum,sm__sass_thread_inst_executed_op_dmul_pred_on.sum,sm__sass_thread_inst_executed_op_dfma_pred_on.sum,sm__sass_inst_executed_op_memory_128b.sum,sm__sass_inst_executed_op_memory_64b.sum,sm__sass_inst_executed_op_memory_32b.sum,sm__sass_inst_executed_op_memory_16b.sum,sm__sass_inst_executed_op_memory_8b.sum,smsp__thread_inst_executed_per_inst_executed.ratio,sm__sass_thread_inst_executed.sum" +\
+                    "l1tex__data_bank_conflicts_pipe_lsu_mem_shared.sum,l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum,l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum" +\
+                    " --csv --page raw --target-processes all " + kernel_number +\
+                    " " + exec_path + " " + str(args) +\
+                    " | tee " + os.path.join(this_run_dir,logfile + ".nsight")
+        elif "full_set" in options.collect:
+            if options.nsight_profiler:
+                sh_contents += "\nexport CUDA_VERSION=\"" + cuda_version + "\"; export CUDA_VISIBLE_DEVICES=\"" + options.device_num +\
+                    "\" ; timeout 30m nv-nsight-cu-cli --set full" +\
                     " --csv --page raw --target-processes all " + kernel_number +\
                     " " + exec_path + " " + str(args) +\
                     " | tee " + os.path.join(this_run_dir,logfile + ".nsight")
diff --git a/util/job_launching/apps/define-all-apps.yml b/util/job_launching/apps/define-all-apps.yml
index 2b553651a..0a03ef7ef 100644
--- a/util/job_launching/apps/define-all-apps.yml
+++ b/util/job_launching/apps/define-all-apps.yml
@@ -92,6 +92,13 @@ GPU_Microbenchmark:
         - shared_lat:
             - args: 
               accel-sim-mem: 1G
+        - shared_bank_conflicts:
+        ## args: 1 selects the kernel that has bank conflicts
+            - args: 1
+              accel-sim-mem: 1G
+        ## args: 2 selects the kernel that has no bank conflicts
+            - args: 2
+              accel-sim-mem: 1G
         - MaxFlops:
             - args: 
               accel-sim-mem: 1G
diff --git a/util/job_launching/stats/example_stats.yml b/util/job_launching/stats/example_stats.yml
index 5b3a52351..768b0680d 100644
--- a/util/job_launching/stats/example_stats.yml
+++ b/util/job_launching/stats/example_stats.yml
@@ -24,6 +24,8 @@ collect_aggregate:
     - 'total dram reads\s*=\s*(.*)'
     - 'total dram writes\s*=\s*(.*)'
     - 'kernel_launch_uid\s*=\s*(.*)'
+    - 'gpgpu_n_shmem_bkconflict\s*=\s*(.*)'
+    - 'gpgpu_n_l1cache_bkconflict\s*=\s*(.*)'
 
 
 # These stats are reset each kernel and should not be diff'd
diff --git a/util/plotting/correl_mappings.py b/util/plotting/correl_mappings.py
index eb44dd9ad..54a70fb61 100644
--- a/util/plotting/correl_mappings.py
+++ b/util/plotting/correl_mappings.py
@@ -474,6 +474,17 @@
         drophwnumbelow=0,
         plottype="log",
         stattype="counter"
+    ),
+    CorrelStat(chart_name="Shared Memory Bank Conflicts",
+        plotfile="shmem-bank-conflict",
+        hw_eval="np.average(hw[\"l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum\"])\
+        + np.average(hw[\"l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum\"])",
+        hw_error=None,
+        sim_eval="float(sim[\"gpgpu_n_shmem_bkconflict\s*=\s*(.*)\"])",
+        hw_name="all",
+        drophwnumbelow=0,
+        plottype="log",
+        stattype="counter"
     ),
 	CorrelStat(chart_name="DRAM Reads",
         plotfile="dram-read-transactions",