Skip to content

Commit

Permalink
ocl: improved tuning script
Browse files Browse the repository at this point in the history
* tune_multiply.py
  - Make OPENCL_LIBSMM_SMM_XF an integer upfront.
  - Fixed/improved handling of recursion (CTRL-C).
  - Ensure normalized/sorted output.
  - Properly implement interface.
  - Improved error handling.
  - Locked XF-property.
* Improved how the minimum requested WG-size is ensured.
* SMM-Kernel: kept and improved code comments.
* Updated LIBXSMM (Daint CI).
  • Loading branch information
hfp committed Oct 23, 2023
1 parent 304698b commit a42631f
Show file tree
Hide file tree
Showing 4 changed files with 136 additions and 140 deletions.
2 changes: 1 addition & 1 deletion .ci/daint.cscs.ch/ocl.build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ if [ ! -d "${HOME}/libxsmm" ]; then
fi
cd "${HOME}/libxsmm"
git fetch
git checkout 7871d611393dd2354aaba6d5558e718eb6d86d75
git checkout 54b8bb32042d204ff6fe550fa70d961f22c0b99e
make -j
cd ..

Expand Down
27 changes: 14 additions & 13 deletions src/acc/opencl/smm/kernels/multiply.cl
Original file line number Diff line number Diff line change
Expand Up @@ -290,7 +290,7 @@ FN(global T* restrict cdata, GLOBAL const T* restrict adata, GLOBAL const T* res
BARRIER(CLK_LOCAL_MEM_FENCE);
# endif
# if (WRK < SWG)
if (WRK <= idx) return;
if (WRK <= idx) return; /* WRK <= idx */
# endif
c0 = params[2] - IDXBASE;
# if defined(BSC) && (1 != BK) && (1 != UM)
Expand All @@ -309,7 +309,7 @@ FN(global T* restrict cdata, GLOBAL const T* restrict adata, GLOBAL const T* res
const int c1 = ((i + 1) < batchsize ? (params[3 * i + 5] - IDXBASE) : -1);
#else
# if (WRK < SWG)
if (WRK > idx)
if (WRK > idx) /* WRK > idx */
# endif
{
const int a0 = params[0] - IDXBASE, b0 = params[1] - IDXBASE, c0 = params[2] - IDXBASE;
Expand Down Expand Up @@ -379,7 +379,7 @@ FN(global T* restrict cdata, GLOBAL const T* restrict adata, GLOBAL const T* res
for (short bm = 0; bm < BM; ++bm) {
const int m = bm + m0;
# if (SM % BM)
if (m < SM)
if (m < SM) /* m < SM */
# endif
{
UNROLL_FORCE(SK)
Expand All @@ -390,7 +390,7 @@ FN(global T* restrict cdata, GLOBAL const T* restrict adata, GLOBAL const T* res
const int n = bn + n0;
# endif
# if (SN % BN)
if (n < SN)
if (n < SN) /* n < SN */
# endif
{
# if defined(SLM_C) && (1 < BS)
Expand Down Expand Up @@ -442,7 +442,7 @@ FN(global T* restrict cdata, GLOBAL const T* restrict adata, GLOBAL const T* res
const int n = bn + n0;
# endif
# if (SN % BN)
if (n < SN)
if (n < SN) /* n < SN */
# endif
{
# if defined(REG_B)
Expand All @@ -454,7 +454,7 @@ FN(global T* restrict cdata, GLOBAL const T* restrict adata, GLOBAL const T* res
for (short bm = 0; bm < BM; ++bm) {
const int m = bm + m0;
# if (SM % BM)
if (m < SM)
if (m < SM) /* m < SM */
# endif
{
# if defined(SLM_C) && (1 < BS)
Expand Down Expand Up @@ -492,7 +492,7 @@ FN(global T* restrict cdata, GLOBAL const T* restrict adata, GLOBAL const T* res
const int n = bn + n0;
# endif
# if (SN % BN)
if (n < SN)
if (n < SN) /* n < SN */
# endif
{
# if (1 == BS)
Expand All @@ -507,7 +507,7 @@ FN(global T* restrict cdata, GLOBAL const T* restrict adata, GLOBAL const T* res
for (short bm = 0; bm < BM; ++bm) {
const int m = bm + m0;
# if (SM % BM)
if (m < SM)
if (m < SM) /* m < SM */
# endif
{
# if defined(SLM_C) && (1 < BS)
Expand Down Expand Up @@ -563,14 +563,15 @@ FN(global T* restrict cdata, GLOBAL const T* restrict adata, GLOBAL const T* res
UNROLL_FORCE(SM)
for (short m = 0; m < SM; ++m) {
# if (200 /*CL_VERSION_2_0*/ <= __OPENCL_VERSION__) && !defined(SLM_A) && !defined(REG_A) && (WRK == SM) && \
(SM <= SGS || SM <= SWG) /* size of subgroup or size of workgroup is sufficient */
(SM <= SGS || SM <= SWG)
/* size of subgroup or size of workgroup is sufficient */
# if (SM <= SGS)
CNM(idx, m) = MAD(sub_group_broadcast(a, m), b, CNM(idx, m));
# else
CNM(idx, m) = MAD(work_group_broadcast(a, m), b, CNM(idx, m));
# endif
# else /* fallback */
CNM(idx, m) = MAD(AMK(m, k), b, CNM(idx, m));
# else
CNM(idx, m) = MAD(AMK(m, k), b, CNM(idx, m)); /* fallback */
# endif
}
# if defined(BARRIER) && (MAX(1, SGS) < SWG) && defined(SLM_A)
Expand Down Expand Up @@ -683,14 +684,14 @@ FN(global T* restrict cdata, GLOBAL const T* restrict adata, GLOBAL const T* res
for (short bn = 0; bn < BN; ++bn) {
const int n = bn + n0;
# if (SN % BN)
if (n < SN)
if (n < SN) /* n < SN */
# endif
{
UNROLL_FORCE(BM)
for (short bm = 0; bm < BM; ++bm) {
const int m = bm + m0;
# if (SM % BM)
if (m < SM)
if (m < SM) /* m < SM */
# endif
{
# if defined(SLM_C)
Expand Down
18 changes: 10 additions & 8 deletions src/acc/opencl/smm/opencl_libsmm.c
Original file line number Diff line number Diff line change
Expand Up @@ -1393,16 +1393,18 @@ int libsmm_acc_process(const int* host_param_stack, const int* dev_param_stack,
assert(1 <= bs && 0 < new_config.wgsize[kernel_idx] && 0 < wgsize_max && 0 < wgsize_prf);
/* ensure minimum requested WG-size */
while ((nbm * nbn) < new_config.ws && (nbm < m_max || nbn < n_max)) {
if (nbn < n_max) {
++nbn;
new_config.bn = (n_max + nbn - 1) / nbn;
}
else if (nbm < m_max) {
++nbm;
new_config.bm = (m_max + nbm - 1) / nbm;
}
if (nbn < n_max) ++nbn;
else if (nbm < m_max) ++nbm;
}
if ((nbm * nbn) < new_config.ws) {
new_config.bn = (n_max + nbn - 1) / nbn;
new_config.bm = (m_max + nbm - 1) / nbm;
new_config.wgsize[kernel_idx] = (2 > new_config.wg ? (nbm * nbn) : ((int)LIBXSMM_UP2POT(nbm * nbn)));
}
else { /* reset */
nbm = (m_max + new_config.bm - 1) / new_config.bm;
nbn = (n_max + new_config.bn - 1) / new_config.bn;
}
/* limit WG-size to maximum WG-size */
while (wgsize_max < new_config.wgsize[kernel_idx] && (new_config.bm < m_max || new_config.bn < n_max)) {
if (new_config.bn < n_max) {
Expand Down
Loading

0 comments on commit a42631f

Please sign in to comment.