diff --git a/.ci/daint.cscs.ch/ocl.build.sh b/.ci/daint.cscs.ch/ocl.build.sh index ef788d6a9ce..baa2ed4dc88 100755 --- a/.ci/daint.cscs.ch/ocl.build.sh +++ b/.ci/daint.cscs.ch/ocl.build.sh @@ -27,7 +27,7 @@ if [ ! -d "${HOME}/libxsmm" ]; then fi cd "${HOME}/libxsmm" git fetch -git checkout 05705477183444a82c8d9be8d7c2627efd6d67fa +git checkout 6c55e168d2053fa44f60f6985c370303bd84f9c1 make -j cd .. diff --git a/src/acc/opencl/acc_opencl.c b/src/acc/opencl/acc_opencl.c index 64756cc7d11..e595727ff80 100644 --- a/src/acc/opencl/acc_opencl.c +++ b/src/acc/opencl/acc_opencl.c @@ -1290,7 +1290,7 @@ int c_dbcsr_acc_opencl_flags_atomics(const c_dbcsr_acc_opencl_device_t* devinfo, } assert(NULL != atomic_exp); /* compose build parameters and flags */ - result = LIBXSMM_SNPRINTF(flags, flags_maxlen, "-DTAN=%i %s %s -D\"ATOMIC_ADD_GLOBAL(A,B)=%s\" %s", kind, atomic_type, + result = LIBXSMM_SNPRINTF(flags, flags_maxlen, " -DTAN=%i %s %s -D\"ATOMIC_ADD_GLOBAL(A,B)=%s\" %s", kind, atomic_type, atomic_ops, atomic_exp, barrier_expr); } } diff --git a/src/acc/opencl/smm/tune_multiply.py b/src/acc/opencl/smm/tune_multiply.py index a6fd8e64f02..8f51f17effe 100755 --- a/src/acc/opencl/smm/tune_multiply.py +++ b/src/acc/opencl/smm/tune_multiply.py @@ -15,7 +15,6 @@ from opentuner import Result from signal import signal, SIGINT import tempfile -import socket import shutil import copy import json @@ -176,16 +175,15 @@ def __init__(self, args): ): # setup database (DB) if args.database is None: # adjust DB-location envrank = os.getenv("PMI_RANK", os.getenv("OMPI_COMM_WORLD_LOCAL_RANK")) + directory = "{}-{}".format(dbdir, os.getenv("HOSTNAME")) if envrank: self.idevice = int(envrank) % self.ndevices - directory = "{}-{}.db".format(dbdir, self.idevice) - else: - directory = "{}.db".format(dbdir) + directory += ".{}".format(self.idevice) if os.path.isdir(directory): shutil.rmtree(directory) os.mkdir(directory) self.args.database = "sqlite:///" + os.path.join( - directory, "{}.db".format(socket.gethostname()) + directory, "{}.db".format(os.getpid()) ) if not self.args.label: # label for DB-session self.args.label = "{}-{}-{}-s{}".format( @@ -436,7 +434,7 @@ def merge_jsons(self, filenames): s = 0 if 0 < gflops: g = int(filename.split("-")[-1].split("g")[0]) - s = gflops / g # slowdown + s = gflops / g if 0 < g else 0 # slowdown if mtime < os.path.getmtime(filename): if 0 < s: retsld[1] = retsld[1] + math.log(s) @@ -842,6 +840,8 @@ def handle_sigint(self, signum, frame): # OPENCL_LIBSMM_SMM_xx=tune|enabled|on must be given to permit tuning) if os.getenv("OPENCL_LIBSMM_SMM_WS") not in default_enable_tune: os.environ["OPENCL_LIBSMM_SMM_WS"] = "{}".format(args.ws) + if os.getenv("OPENCL_LIBSMM_SMM_AL") not in default_enable_tune: + os.environ["OPENCL_LIBSMM_SMM_AL"] = "{}".format(args.al) # fix tunables according to level of tuning if 1 <= args.tlevel or 0 > args.tlevel: os.environ["OPENCL_LIBSMM_SMM_BM"] = "{}".format(args.bm) diff --git a/src/acc/opencl/smm/tune_multiply.sh b/src/acc/opencl/smm/tune_multiply.sh index 736e4058bcb..9516e43c1cb 100755 --- a/src/acc/opencl/smm/tune_multiply.sh +++ b/src/acc/opencl/smm/tune_multiply.sh @@ -83,39 +83,19 @@ then break;; esac done + # how to print standard vs error messages if [ ! "${HELP}" ] || [ "0" = "${HELP}" ]; then ECHO=">&2 echo" else ECHO="echo" fi - eval "${ECHO} \"Usage: $0 [options] []\"" - eval "${ECHO} \" Options must precede triplet specification\"" - eval "${ECHO} \" -w|--wait N: initial delay before auto-tuning (default: ${WAIT_DEFAULT} s)\"" - eval "${ECHO} \" -c|--continue: proceed with plan if tuning is interrupted\"" - eval "${ECHO} \" -u|--update: retune all JSONs found in directory (see -p)\"" - eval "${ECHO} \" -s|--batchsize N: Number of batched SMMs (a.k.a. stacksize)\"" - eval "${ECHO} \" -a|--tuning-level N=0..3: all, most, some, least tunables\"" - eval "${ECHO} \" -b|--backwards: tune in descending order of triplets\"" - eval "${ECHO} \" -t|--maxtime N: number of seconds spent per kernel\"" - eval "${ECHO} \" -p|--jsondir P: path to JSON-files (tuned params)\"" - eval "${ECHO} \" -i|--part N (1-based): Nth session out of nparts\"" - eval "${ECHO} \" -j|--nparts N: number of total sessions (see -i)\"" - eval "${ECHO} \" -r|--bound L U: limit L**3 < MNK <= U**3\"" - eval "${ECHO} \" -m|--limit N: limit any shape extent to N\"" - eval "${ECHO} \" -n|--triplets N: limit number of triplet\"" - eval "${ECHO} \" -k|--specid N: predefined triplets\"" - eval "${ECHO} \" 0-10: older to newer (larger), e.g.,\"" - eval "${ECHO} \" 0: 201 kernels\"" - eval "${ECHO} \" 10: 1266 kernels\"" - eval "${ECHO} \" , e.g., 134 kernels\"" - eval "${ECHO} \" 23, 5 32 13 24 26, 4 9\"" - eval "${ECHO}" - # default settings + # default/basic settings if [ ! "${BATCHSIZE}" ]; then BATCHSIZE=0; fi if [ ! "${JSONDIR}" ]; then JSONDIR=.; fi if [ ! "${TLEVEL}" ]; then TLEVEL=-1; fi - if [ ! "${NPARTS}" ]; then NPARTS=1; fi - if [ ! "${PART}" ]; then PART=1; fi + if [ ! "${NPARTS}" ]; then NPARTS=${PMI_SIZE:-1}; fi + if [ ! "${PART}" ]; then PART=${PMI_RANK:-0}; PART=$((PART+1)); fi + if [ ! "${WAIT}" ] && [ "1" = "${NPARTS}" ]; then WAIT=0; fi # sanity checks if [ "0" != "$((NPARTS&2 echo "ERROR: part-number ${PART} is larger than the requested ${NPARTS} parts!" @@ -131,7 +111,6 @@ then exit 1 elif [ ! "${HELP}" ] || [ "0" = "${HELP}" ]; then if [ "${UPDATE}" ] && [ "0" != "${UPDATE}" ]; then - if [ ! "${TLEVEL}" ] || [ "0" != "$((0>TLEVEL))" ]; then TLEVEL=1; fi MNKS=$(${SED} -n "s/.*tune_multiply-..*-\(..*x..*x.[^-]*\)-..*gflops\.json/\1/p" <<<"${JSONS}" \ | ${SORT} -u -n -tx -k1,1 -k2,2 -k3,3) elif [ "${SPECID}" ]; then @@ -142,6 +121,30 @@ then else exit 0 fi + if [ ! "${WAIT}" ]; then + eval "${ECHO} \"Usage: $0 [options] []\"" + eval "${ECHO} \" Options must precede triplet specification\"" + eval "${ECHO} \" -w|--wait N: initial delay before auto-tuning (default: ${WAIT_DEFAULT} s)\"" + eval "${ECHO} \" -c|--continue: proceed with plan if tuning is interrupted\"" + eval "${ECHO} \" -u|--update: retune all JSONs found in directory (see -p)\"" + eval "${ECHO} \" -s|--batchsize N: Number of batched SMMs (a.k.a. stacksize)\"" + eval "${ECHO} \" -a|--tuning-level N=0..3: all, most, some, least tunables\"" + eval "${ECHO} \" -b|--backwards: tune in descending order of triplets\"" + eval "${ECHO} \" -t|--maxtime N: number of seconds spent per kernel\"" + eval "${ECHO} \" -p|--jsondir P: path to JSON-files (tuned params)\"" + eval "${ECHO} \" -i|--part N (1-based): Nth session out of nparts\"" + eval "${ECHO} \" -j|--nparts N: number of total sessions (see -i)\"" + eval "${ECHO} \" -r|--bound L U: limit L**3 < MNK <= U**3\"" + eval "${ECHO} \" -m|--limit N: limit any shape extent to N\"" + eval "${ECHO} \" -n|--triplets N: limit number of triplet\"" + eval "${ECHO} \" -k|--specid N: predefined triplets\"" + eval "${ECHO} \" 0-10: older to newer (larger), e.g.,\"" + eval "${ECHO} \" 0: 201 kernels\"" + eval "${ECHO} \" 10: 1266 kernels\"" + eval "${ECHO} \" , e.g., 134 kernels\"" + eval "${ECHO} \" 23, 5 32 13 24 26, 4 9\"" + eval "${ECHO}" + fi if [ "${MNKS}" ]; then if [ "${BOUNDL}" ] || [ "${BOUNDU}" ]; then if [ ! "${BOUNDL}" ]; then BOUNDL=0; elif [ ! "${BOUNDU}" ]; then BOUNDU=0; fi @@ -187,10 +190,12 @@ then PARTSIZE=$(((NTRIPLETS+NPARTS-1)/NPARTS)) PARTOFFS=$(((PART-1)*PARTSIZE)) PARTSIZE=$((PARTSIZE<=(NTRIPLETS-PARTOFFS)?PARTSIZE:(NTRIPLETS-PARTOFFS))) - if [ "0" != "$((NPARTS<=NTRIPLETS))" ]; then - echo "Session ${PART} of ${NPARTS} part(s)." - else - echo "Session ${PART} of ${NPARTS} part(s). The problem is over-decomposed!" + if [ ! "${WAIT}" ] || [ "0" != "${WAIT}" ]; then + if [ "0" != "$((NPARTS<=NTRIPLETS))" ]; then + echo "Session ${PART} of ${NPARTS} part(s)." + else + echo "Session ${PART} of ${NPARTS} part(s). The problem is over-decomposed!" + fi fi if [ ! "${MAXTIME}" ] && [[ (! "${CONTINUE}" || \ "${CONTINUE}" = "false" || \ @@ -200,10 +205,12 @@ then MAXTIME=160 fi if [ "${MAXTIME}" ] && [ "0" != "$((0