From ab07d396e5ec8f0e7dce4736ab3368f00a736fce Mon Sep 17 00:00:00 2001 From: Hans Pabst Date: Wed, 6 Mar 2024 17:03:44 +0100 Subject: [PATCH] ocl: adjusted finalization-flow of OpenCL backend * Fixed case when ACC_OPENCL_MEM_DEVPTR is turned off at compile-time. * Ensure termination message appears one time at most (cleanup). * Introduced ACC_OPENCL_EVENT_CHAIN and ACC_OPENCL_EVENT_WAIT. * Removed compile-time option for ACC_OPENCL_MEM_CPYSYNC. * Removed compile-time setting (ACC_OPENCL_STREAM_NULL). * Introduced WA-levels to distinguish certain WAs. * Ensure final cleanup (atexit). --- src/acc/opencl/acc_opencl.c | 115 +++++++++++++++-------------- src/acc/opencl/acc_opencl.h | 22 +++--- src/acc/opencl/acc_opencl_event.c | 40 ++++++---- src/acc/opencl/acc_opencl_mem.c | 44 +---------- src/acc/opencl/acc_opencl_stream.c | 4 - 5 files changed, 100 insertions(+), 125 deletions(-) diff --git a/src/acc/opencl/acc_opencl.c b/src/acc/opencl/acc_opencl.c index b7ecece9739..b8f67f50c91 100644 --- a/src/acc/opencl/acc_opencl.c +++ b/src/acc/opencl/acc_opencl.c @@ -138,15 +138,52 @@ int c_dbcsr_acc_opencl_order_devices(const void* dev_a, const void* dev_b) { } -LIBXSMM_ATTRIBUTE_CTOR void c_dbcsr_acc_opencl_init(void) { - /* attempt to automatically initialize backend */ - ACC_OPENCL_EXPECT(EXIT_SUCCESS == c_dbcsr_acc_init()); -} +/* attempt to automatically initialize backend */ +LIBXSMM_ATTRIBUTE_CTOR void c_dbcsr_acc_opencl_init(void) { ACC_OPENCL_EXPECT(EXIT_SUCCESS == c_dbcsr_acc_init()); } +/* attempt to automatically finalize backend */ LIBXSMM_ATTRIBUTE_DTOR void c_dbcsr_acc_opencl_finalize(void) { - /* attempt to automatically finalize backend */ - ACC_OPENCL_EXPECT(EXIT_SUCCESS == c_dbcsr_acc_finalize()); + assert(c_dbcsr_acc_opencl_config.ndevices < ACC_OPENCL_MAXNDEVS); + if (0 != c_dbcsr_acc_opencl_config.ndevices) { + int i; + for (i = 0; i < ACC_OPENCL_MAXNDEVS; ++i) { + const cl_device_id device_id = c_dbcsr_acc_opencl_config.devices[i]; + if (NULL != device_id) 
{ +# if defined(CL_VERSION_1_2) && defined(_DEBUG) + ACC_OPENCL_EXPECT(EXIT_SUCCESS == clReleaseDevice(device_id)); +# endif + /* c_dbcsr_acc_opencl_create_context scans for non-NULL devices */ + c_dbcsr_acc_opencl_config.devices[i] = NULL; + } + } + if (NULL != c_dbcsr_acc_opencl_config.device.stream.queue) { /* release private stream */ + clReleaseCommandQueue(c_dbcsr_acc_opencl_config.device.stream.queue); /* ignore return code */ + } + if (NULL != c_dbcsr_acc_opencl_config.device.context) { + const cl_context context = c_dbcsr_acc_opencl_config.device.context; + c_dbcsr_acc_opencl_config.device.context = NULL; + clReleaseContext(context); /* ignore return code */ + } + for (i = 0; i < ACC_OPENCL_NLOCKS; ++i) { /* destroy locks */ + ACC_OPENCL_DESTROY((ACC_OPENCL_LOCKTYPE*)(c_dbcsr_acc_opencl_locks + ACC_OPENCL_CACHELINE * i)); + } + /* release/reset buffers */ +# if defined(ACC_OPENCL_MEM_DEVPTR) + free(c_dbcsr_acc_opencl_config.memptrs); + free(c_dbcsr_acc_opencl_config.memptr_data); +# endif + free(c_dbcsr_acc_opencl_config.streams); + free(c_dbcsr_acc_opencl_config.stream_data); + free(c_dbcsr_acc_opencl_config.events); + free(c_dbcsr_acc_opencl_config.event_data); + /* clear entire configuration structure */ + memset(&c_dbcsr_acc_opencl_config, 0, sizeof(c_dbcsr_acc_opencl_config)); +# if defined(ACC_OPENCL_CACHE_DID) + c_dbcsr_acc_opencl_active_id = 0; /* reset cached active device-ID */ +# endif + libxsmm_finalize(); + } } @@ -178,7 +215,7 @@ int c_dbcsr_acc_init(void) { const int nccs = (NULL == env_nccs ? ACC_OPENCL_NCCS : atoi(env_nccs)); # endif const char *const env_neo = getenv("NEOReadDebugKeys"), *const env_wa = getenv("ACC_OPENCL_WA"); - const int neo = (NULL == env_neo ? 1 : atoi(env_neo)), wa = neo * (NULL == env_wa ? 1 : atoi(env_wa)); + const int neo = (NULL == env_neo ? 1 : atoi(env_neo)), wa = neo * (NULL == env_wa ? 
2 : atoi(env_wa)); # if defined(ACC_OPENCL_ASYNC) const char* const env_async = (ACC_OPENCL_ASYNC); const int async_default = 3; @@ -257,15 +294,13 @@ int c_dbcsr_acc_init(void) { # endif if (0 != wa) { /* environment is populated before touching the compute runtime */ static char* key_value[] = { - "NEOReadDebugKeys=1", "DirectSubmissionOverrideBlitterSupport=0", "EnableRecoverablePageFaults=0"}; - for (i = 0; i < sizeof(key_value) / sizeof(*key_value); ++i) { - const char* const sep = strchr(key_value[i], '='); - const size_t n = (NULL != sep ? (sep - key_value[i]) : 0); - if (0 < n && n < ACC_OPENCL_BUFFERSIZE) { - memcpy(buffer, key_value[i], n); - buffer[n] = '\0'; - ACC_OPENCL_EXPECT(NULL != getenv(buffer) || 0 == LIBXSMM_PUTENV(key_value[i])); - } + "NEOReadDebugKeys=1", "EnableRecoverablePageFaults=0", "DirectSubmissionOverrideBlitterSupport=0"}; + if (NULL == env_neo) ACC_OPENCL_EXPECT(0 == LIBXSMM_PUTENV(key_value[0])); + if (NULL == getenv("EnableRecoverablePageFaults")) { + ACC_OPENCL_EXPECT(0 == LIBXSMM_PUTENV(key_value[1])); + } + if (NULL == getenv("DirectSubmissionOverrideBlitterSupport") && 2 <= wa) { + ACC_OPENCL_EXPECT(0 == LIBXSMM_PUTENV(key_value[2])); } } # if defined(ACC_OPENCL_CACHE_DIR) @@ -612,15 +647,15 @@ int c_dbcsr_acc_finalize(void) { # else int result = EXIT_SUCCESS; # endif + static void (*cleanup)(void) = c_dbcsr_acc_opencl_finalize; # if defined(__DBCSR_ACC) && defined(ACC_OPENCL_PROFILE) int routine_handle; static const char* const routine_name_ptr = LIBXSMM_FUNCNAME; static const int routine_name_len = (int)sizeof(LIBXSMM_FUNCNAME) - 1; c_dbcsr_timeset((const char**)&routine_name_ptr, &routine_name_len, &routine_handle); # endif - if (0 != c_dbcsr_acc_opencl_config.ndevices) { - int i; - assert(c_dbcsr_acc_opencl_config.ndevices < ACC_OPENCL_MAXNDEVS); + assert(c_dbcsr_acc_opencl_config.ndevices < ACC_OPENCL_MAXNDEVS); + if (0 != c_dbcsr_acc_opencl_config.ndevices && NULL != cleanup) { if (0 != 
c_dbcsr_acc_opencl_config.verbosity) { cl_device_id device = NULL; int d; @@ -642,42 +677,8 @@ int c_dbcsr_acc_finalize(void) { */ if (EXIT_SUCCESS == result) result = libsmm_acc_finalize(); # endif - libxsmm_finalize(); - for (i = 0; i < ACC_OPENCL_MAXNDEVS; ++i) { - const cl_device_id device_id = c_dbcsr_acc_opencl_config.devices[i]; - if (NULL != device_id) { -# if defined(CL_VERSION_1_2) && defined(_DEBUG) - ACC_OPENCL_CHECK(clReleaseDevice(device_id), "release device", result); -# endif - /* c_dbcsr_acc_opencl_create_context scans for non-NULL devices */ - c_dbcsr_acc_opencl_config.devices[i] = NULL; - } - } - if (NULL != c_dbcsr_acc_opencl_config.device.stream.queue) { /* release private stream */ - clReleaseCommandQueue(c_dbcsr_acc_opencl_config.device.stream.queue); /* ignore return code */ - } - if (NULL != c_dbcsr_acc_opencl_config.device.context) { - const cl_context context = c_dbcsr_acc_opencl_config.device.context; - c_dbcsr_acc_opencl_config.device.context = NULL; - clReleaseContext(context); /* ignore return code */ - } - for (i = 0; i < ACC_OPENCL_NLOCKS; ++i) { /* destroy locks */ - ACC_OPENCL_DESTROY((ACC_OPENCL_LOCKTYPE*)(c_dbcsr_acc_opencl_locks + ACC_OPENCL_CACHELINE * i)); - } - /* release/reset buffers */ -# if defined(ACC_OPENCL_MEM_DEVPTR) - free(c_dbcsr_acc_opencl_config.memptrs); - free(c_dbcsr_acc_opencl_config.memptr_data); -# endif - free(c_dbcsr_acc_opencl_config.streams); - free(c_dbcsr_acc_opencl_config.stream_data); - free(c_dbcsr_acc_opencl_config.events); - free(c_dbcsr_acc_opencl_config.event_data); - /* clear entire configuration structure */ - memset(&c_dbcsr_acc_opencl_config, 0, sizeof(c_dbcsr_acc_opencl_config)); -# if defined(ACC_OPENCL_CACHE_DID) - c_dbcsr_acc_opencl_active_id = 0; /* reset cached active device-ID */ -# endif + if (EXIT_SUCCESS == result) result = atexit(cleanup); + cleanup = NULL; } # if defined(__DBCSR_ACC) && defined(ACC_OPENCL_PROFILE) c_dbcsr_timestop(&routine_handle); @@ -997,8 +998,10 @@ int 
c_dbcsr_acc_opencl_set_active_device(ACC_OPENCL_LOCKTYPE* lock, int device_i CL_QUEUE_PROPERTIES, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, 0 /* terminator */ }; # endif +# if defined(ACC_OPENCL_MEM_DEVPTR) cl_platform_id platform = NULL; cl_bitfield bitfield = 0; +# endif c_dbcsr_acc_opencl_config.device.intel = (EXIT_SUCCESS == c_dbcsr_acc_opencl_device_vendor(active_id, "intel", 0 /*use_platform_name*/)); c_dbcsr_acc_opencl_config.device.nv = (EXIT_SUCCESS == @@ -1026,6 +1029,7 @@ int c_dbcsr_acc_opencl_set_active_device(ACC_OPENCL_LOCKTYPE* lock, int device_i { c_dbcsr_acc_opencl_config.device.unified = CL_FALSE; } +# if defined(ACC_OPENCL_MEM_DEVPTR) if (0 != (4 & c_dbcsr_acc_opencl_config.xhints) && 2 <= *c_dbcsr_acc_opencl_config.device.std_level && 0 != c_dbcsr_acc_opencl_config.device.intel && 0 == c_dbcsr_acc_opencl_config.device.unified && EXIT_SUCCESS == clGetDeviceInfo(active_id, CL_DEVICE_PLATFORM, sizeof(cl_platform_id), &platform, NULL) && @@ -1046,6 +1050,7 @@ int c_dbcsr_acc_opencl_set_active_device(ACC_OPENCL_LOCKTYPE* lock, int device_i ptr = clGetExtensionFunctionAddressForPlatform(platform, "clMemFreeINTEL"); LIBXSMM_ASSIGN127(&c_dbcsr_acc_opencl_config.device.clMemFreeINTEL, &ptr); } +# endif # if defined(ACC_OPENCL_CMDAGR) if (0 != c_dbcsr_acc_opencl_config.device.intel) { /* device vendor (above) can now be used */ int result_cmdagr = EXIT_SUCCESS; diff --git a/src/acc/opencl/acc_opencl.h b/src/acc/opencl/acc_opencl.h index c874266ada0..c109de21ef3 100644 --- a/src/acc/opencl/acc_opencl.h +++ b/src/acc/opencl/acc_opencl.h @@ -9,10 +9,6 @@ #ifndef ACC_OPENCL_H #define ACC_OPENCL_H -#if defined(__OFFLOAD_OPENCL) && !defined(__OPENCL) -# define __OPENCL -#endif - #if defined(__OPENCL) # if !defined(CL_TARGET_OPENCL_VERSION) # define CL_TARGET_OPENCL_VERSION 220 @@ -108,11 +104,7 @@ # define ACC_OPENCL_STREAM_PRIORITIES # endif #endif -/* Stream-argument (ACC-interface) can be NULL (synchronous) */ -#if !defined(ACC_OPENCL_STREAM_NULL) && 1 -# 
define ACC_OPENCL_STREAM_NULL -#endif -/* Support arithmetic for device-pointers (DBM) */ +/* Support arithmetic for device-pointers */ #if !defined(ACC_OPENCL_MEM_DEVPTR) && 1 # define ACC_OPENCL_MEM_DEVPTR #endif @@ -169,6 +161,11 @@ clCreateCommandQueue(CTX, DEV, (cl_command_queue_properties)(NULL != (PROPS) ? ((PROPS)[1]) : 0), RESULT) #endif +/* Support for other libraries, e.g., CP2K's DBM/DBT */ +#if defined(ACC_OPENCL_MEM_DEVPTR) && defined(__OFFLOAD_OPENCL) && !defined(__OPENCL) +# define __OPENCL +#endif + #if LIBXSMM_VERSION4(1, 17, 0, 0) < LIBXSMM_VERSION_NUMBER # define ACC_OPENCL_EXPECT(EXPR) LIBXSMM_EXPECT(EXPR) # define LIBXSMM_STRISTR libxsmm_stristr @@ -252,7 +249,10 @@ typedef struct c_dbcsr_acc_opencl_stream_t { typedef struct c_dbcsr_acc_opencl_device_t { /** Activated device context. */ cl_context context; - /** Stream for internal purpose. */ + /** + * Stream for internal purpose, e.g., stream-argument + * (ACC-interface) can be NULL (synchronous) + */ c_dbcsr_acc_opencl_stream_t stream; /** OpenCL compiler flag (language standard). */ char std_flag[16]; @@ -353,7 +353,7 @@ int c_dbcsr_acc_opencl_info_devptr( c_dbcsr_acc_opencl_info_memptr_t* info, const void* memory, size_t elsize, const size_t* amount, size_t* offset); /** Finds an existing stream for the given thread-ID (or NULL). */ const c_dbcsr_acc_opencl_stream_t* c_dbcsr_acc_opencl_stream(ACC_OPENCL_LOCKTYPE* lock, int thread_id); -/** Determines default-stream (see ACC_OPENCL_STREAM_NULL). */ +/** Determines default-stream (see c_dbcsr_acc_opencl_device_t::stream). */ const c_dbcsr_acc_opencl_stream_t* c_dbcsr_acc_opencl_stream_default(void); /** Like c_dbcsr_acc_memset_zero, but supporting an arbitrary value used as initialization pattern. 
*/ int c_dbcsr_acc_opencl_memset(void* dev_mem, int value, size_t offset, size_t nbytes, void* stream); diff --git a/src/acc/opencl/acc_opencl_event.c b/src/acc/opencl/acc_opencl_event.c index dc012a25fce..7fde985a04e 100644 --- a/src/acc/opencl/acc_opencl_event.c +++ b/src/acc/opencl/acc_opencl_event.c @@ -12,6 +12,12 @@ # if !defined(ACC_OPENCL_EVENT_FLUSH) && 0 # define ACC_OPENCL_EVENT_FLUSH # endif +# if !defined(ACC_OPENCL_EVENT_CHAIN) && 0 +# define ACC_OPENCL_EVENT_CHAIN +# endif +# if !defined(ACC_OPENCL_EVENT_WAIT) && 0 +# define ACC_OPENCL_EVENT_WAIT +# endif # if defined(__cplusplus) @@ -76,11 +82,7 @@ int c_dbcsr_acc_stream_wait_event(void* stream, void* event) { /* wait for an ev static const int routine_name_len = (int)sizeof(LIBXSMM_FUNCNAME) - 1; c_dbcsr_timeset((const char**)&routine_name_ptr, &routine_name_len, &routine_handle); # endif -# if defined(ACC_OPENCL_STREAM_NULL) str = (NULL != stream ? ACC_OPENCL_STREAM(stream) : c_dbcsr_acc_opencl_stream_default()); -# else - str = ACC_OPENCL_STREAM(stream); -# endif assert(NULL != str && NULL != str->queue && NULL != event); clevent = *ACC_OPENCL_EVENT(event); if (NULL != clevent) { @@ -88,9 +90,11 @@ int c_dbcsr_acc_stream_wait_event(void* stream, void* event) { /* wait for an ev cl_event clevent_result = NULL; result = clEnqueueBarrierWithWaitList(str->queue, 1, &clevent, &clevent_result); if (EXIT_SUCCESS == result) { +# if defined(ACC_OPENCL_EVENT_CHAIN) result = clReleaseEvent(clevent); assert(NULL != clevent_result); *(cl_event*)event = (EXIT_SUCCESS == result ? clevent_result : NULL); +# endif } else # else @@ -122,23 +126,29 @@ int c_dbcsr_acc_event_record(void* event, void* stream) { static const int routine_name_len = (int)sizeof(LIBXSMM_FUNCNAME) - 1; c_dbcsr_timeset((const char**)&routine_name_ptr, &routine_name_len, &routine_handle); # endif -# if defined(ACC_OPENCL_STREAM_NULL) str = (NULL != stream ? 
ACC_OPENCL_STREAM(stream) : c_dbcsr_acc_opencl_stream_default()); -# else - str = ACC_OPENCL_STREAM(stream); -# endif assert(NULL != str && NULL != str->queue && NULL != event); clevent = *ACC_OPENCL_EVENT(event); +# if defined(ACC_OPENCL_EVENT_FLUSH) + result = clFlush(str->queue); + if (EXIT_SUCCESS == result) +# endif + { # if defined(CL_VERSION_1_2) - result = (NULL == clevent ? clEnqueueMarkerWithWaitList(str->queue, 0, NULL, &clevent_result) - : clEnqueueMarkerWithWaitList(str->queue, 1, &clevent, &clevent_result)); +# if defined(ACC_OPENCL_EVENT_WAIT) + if (NULL != clevent) result = clEnqueueMarkerWithWaitList(str->queue, 1, &clevent, &clevent_result); + else +# endif + { + result = clEnqueueMarkerWithWaitList(str->queue, 0, NULL, &clevent_result); + } # else - if (NULL != clevent) result = clEnqueueWaitForEvents(str->queue, 1, &clevent); - if (EXIT_SUCCESS == result) result = clEnqueueMarker(str->queue, &clevent_result); -# endif -# if defined(ACC_OPENCL_EVENT_FLUSH) - if (EXIT_SUCCESS == result) result = clFlush(str->queue); +# if defined(ACC_OPENCL_EVENT_WAIT) + if (NULL != clevent) result = clEnqueueWaitForEvents(str->queue, 1, &clevent); +# endif + if (EXIT_SUCCESS == result) result = clEnqueueMarker(str->queue, &clevent_result); # endif + } if (NULL != clevent) { const int result_release = clReleaseEvent(clevent); if (EXIT_SUCCESS == result) result = result_release; diff --git a/src/acc/opencl/acc_opencl_mem.c b/src/acc/opencl/acc_opencl_mem.c index 640809bbbcf..5ea4846ad86 100644 --- a/src/acc/opencl/acc_opencl_mem.c +++ b/src/acc/opencl/acc_opencl_mem.c @@ -22,9 +22,6 @@ # if !defined(ACC_OPENCL_MEM_ALIGNSCALE) # define ACC_OPENCL_MEM_ALIGNSCALE 8 # endif -# if !defined(ACC_OPENCL_MEM_CPYSYNC) && 1 -# define ACC_OPENCL_MEM_CPYSYNC -# endif # if defined(__cplusplus) @@ -183,12 +180,8 @@ int c_dbcsr_acc_host_mem_allocate(void** host_mem, size_t nbytes, void* stream) assert(NULL != host_mem); memory = 
clCreateBuffer(c_dbcsr_acc_opencl_config.device.context, CL_MEM_ALLOC_HOST_PTR, nbytes, NULL /*host_ptr*/, &result); if (EXIT_SUCCESS == result) { -# if defined(ACC_OPENCL_STREAM_NULL) const c_dbcsr_acc_opencl_stream_t* const str = (NULL != stream ? ACC_OPENCL_STREAM(stream) : c_dbcsr_acc_opencl_stream_default()); -# else - const c_dbcsr_acc_opencl_stream_t* const str = ACC_OPENCL_STREAM(stream); -# endif void* const mapped = clEnqueueMapBuffer( str->queue, memory, CL_TRUE /*always block*/, CL_MAP_READ | CL_MAP_WRITE, 0 /*offset*/, nbytes, 0, NULL, NULL, &result); assert(EXIT_SUCCESS == result || NULL == mapped); @@ -230,19 +223,13 @@ int c_dbcsr_acc_host_mem_deallocate(void* host_mem, void* stream) { c_dbcsr_acc_opencl_info_memptr_t* const meminfo = c_dbcsr_acc_opencl_info_hostptr(host_mem); if (NULL != meminfo->memory) { const c_dbcsr_acc_opencl_info_memptr_t info = *meminfo; /* copy meminfo prior to unmap */ -# if defined(ACC_OPENCL_STREAM_NULL) const c_dbcsr_acc_opencl_stream_t* const str = (NULL != stream ? ACC_OPENCL_STREAM(stream) : c_dbcsr_acc_opencl_stream_default()); -# else - const c_dbcsr_acc_opencl_stream_t* const str = ACC_OPENCL_STREAM(stream); -# endif int result_release; cl_event event; assert(NULL != str && NULL != str->queue); result = clEnqueueUnmapMemObject(str->queue, info.memory, info.memptr, 0, NULL, &event); -# if defined(ACC_OPENCL_STREAM_NULL) if (NULL == stream && EXIT_SUCCESS == result) result = clWaitForEvents(1, &event); -# endif result_release = clReleaseMemObject(info.memory); if (EXIT_SUCCESS == result) result = result_release; } @@ -255,7 +242,7 @@ int c_dbcsr_acc_host_mem_deallocate(void* host_mem, void* stream) { } -/* like c_dbcsr_acc_memcpy_d2h, but accounting for some async support/workaround. */ +/* like c_dbcsr_acc_memcpy_d2h, but apply some async workaround. 
*/ int c_dbcsr_acc_opencl_memcpy_d2h( cl_mem /*dev_mem*/, void* /*host_mem*/, size_t /*offset*/, size_t /*nbytes*/, cl_command_queue /*queue*/, int /*blocking*/); int c_dbcsr_acc_opencl_memcpy_d2h( @@ -276,15 +263,14 @@ int c_dbcsr_acc_opencl_memcpy_d2h( { result = clEnqueueReadBuffer(queue, dev_mem, finish, offset, nbytes, host_mem, 0, NULL, NULL); } -# if defined(ACC_OPENCL_MEM_CPYSYNC) - if (EXIT_SUCCESS != result && !finish) { + if (EXIT_SUCCESS != result && !finish) { /* retry synchronously */ int result_sync = EXIT_SUCCESS; -# if defined(ACC_OPENCL_MEM_DEVPTR) +# if defined(ACC_OPENCL_MEM_DEVPTR) if (NULL != c_dbcsr_acc_opencl_config.device.clEnqueueMemcpyINTEL) { result_sync = c_dbcsr_acc_opencl_config.device.clEnqueueMemcpyINTEL(queue, CL_TRUE, host_mem, dev_mem, nbytes, 0, NULL, NULL); } else -# endif +# endif { result_sync = clEnqueueReadBuffer(queue, dev_mem, CL_TRUE, offset, nbytes, host_mem, 0, NULL, NULL); } @@ -296,7 +282,6 @@ int c_dbcsr_acc_opencl_memcpy_d2h( result = EXIT_SUCCESS; } } -# endif return result; } @@ -483,12 +468,8 @@ int c_dbcsr_acc_memcpy_h2d(const void* host_mem, void* dev_mem, size_t nbytes, v # endif assert((NULL != host_mem && NULL != dev_mem) || 0 == nbytes); if (NULL != host_mem && NULL != dev_mem && 0 != nbytes) { -# if defined(ACC_OPENCL_STREAM_NULL) const c_dbcsr_acc_opencl_stream_t* const str = (NULL != stream ? 
ACC_OPENCL_STREAM(stream) : c_dbcsr_acc_opencl_stream(NULL /*lock*/, ACC_OPENCL_OMP_TID())); -# else - const c_dbcsr_acc_opencl_stream_t* const str = ACC_OPENCL_STREAM(stream); -# endif # if defined(ACC_OPENCL_ASYNC) const cl_bool finish = (0 == (1 & c_dbcsr_acc_opencl_config.async) || NULL == stream || (0 != c_dbcsr_acc_opencl_config.device.nv && NULL == (ACC_OPENCL_ASYNC))); @@ -535,14 +516,9 @@ int c_dbcsr_acc_memcpy_d2h(const void* dev_mem, void* host_mem, size_t nbytes, v # endif assert((NULL != dev_mem && NULL != host_mem) || 0 == nbytes); if (NULL != host_mem && NULL != dev_mem && 0 != nbytes) { -# if defined(ACC_OPENCL_STREAM_NULL) const c_dbcsr_acc_opencl_stream_t* const str = (NULL != stream ? ACC_OPENCL_STREAM(stream) : c_dbcsr_acc_opencl_stream(NULL /*lock*/, ACC_OPENCL_OMP_TID())); const cl_bool finish = (NULL != stream ? CL_FALSE : CL_TRUE); -# else - const c_dbcsr_acc_opencl_stream_t* const str = ACC_OPENCL_STREAM(stream); - const cl_bool finish = CL_FALSE; -# endif c_dbcsr_acc_opencl_info_memptr_t info; size_t offset = 0; assert(NULL != str && NULL != str->queue); @@ -575,12 +551,8 @@ int c_dbcsr_acc_memcpy_d2d(const void* devmem_src, void* devmem_dst, size_t nbyt # endif assert((NULL != devmem_src && NULL != devmem_dst) || 0 == nbytes); if (NULL != devmem_src && NULL != devmem_dst && 0 != nbytes) { -# if defined(ACC_OPENCL_STREAM_NULL) const c_dbcsr_acc_opencl_stream_t* const str = (NULL != stream ? 
ACC_OPENCL_STREAM(stream) : c_dbcsr_acc_opencl_stream(NULL /*lock*/, ACC_OPENCL_OMP_TID())); -# else - const c_dbcsr_acc_opencl_stream_t* const str = ACC_OPENCL_STREAM(stream); -# endif cl_event event = NULL; assert(NULL != str && NULL != str->queue); # if defined(ACC_OPENCL_MEM_DEVPTR) @@ -606,9 +578,7 @@ int c_dbcsr_acc_memcpy_d2d(const void* devmem_src, void* devmem_dst, size_t nbyt ACC_OPENCL_RELEASE(c_dbcsr_acc_opencl_config.lock_memory); # endif } -# if defined(ACC_OPENCL_STREAM_NULL) if (NULL == stream && EXIT_SUCCESS == result) result = clWaitForEvents(1, &event); -# endif } # if defined(__DBCSR_ACC) && defined(ACC_OPENCL_PROFILE) c_dbcsr_timestop(&routine_handle); @@ -627,12 +597,8 @@ int c_dbcsr_acc_opencl_memset(void* dev_mem, int value, size_t offset, size_t nb # endif assert(NULL != dev_mem || 0 == nbytes); if (0 != nbytes) { -# if defined(ACC_OPENCL_STREAM_NULL) const c_dbcsr_acc_opencl_stream_t* const str = (NULL != stream ? ACC_OPENCL_STREAM(stream) : c_dbcsr_acc_opencl_stream(NULL /*lock*/, ACC_OPENCL_OMP_TID())); -# else - const c_dbcsr_acc_opencl_stream_t* const str = ACC_OPENCL_STREAM(stream); -# endif size_t size_of_value = 1; cl_event event; if (0 == LIBXSMM_MOD2(nbytes, 4)) size_of_value = 4; @@ -666,9 +632,7 @@ int c_dbcsr_acc_opencl_memset(void* dev_mem, int value, size_t offset, size_t nb ACC_OPENCL_RELEASE(c_dbcsr_acc_opencl_config.lock_memory); # endif } -# if defined(ACC_OPENCL_STREAM_NULL) if (NULL == stream && EXIT_SUCCESS == result) result = clWaitForEvents(1, &event); -# endif } # if defined(__DBCSR_ACC) && defined(ACC_OPENCL_PROFILE) c_dbcsr_timestop(&routine_handle); diff --git a/src/acc/opencl/acc_opencl_stream.c b/src/acc/opencl/acc_opencl_stream.c index 72af4ac9599..11d3159473c 100644 --- a/src/acc/opencl/acc_opencl_stream.c +++ b/src/acc/opencl/acc_opencl_stream.c @@ -251,11 +251,7 @@ int c_dbcsr_acc_stream_sync(void* stream) { static const int routine_name_len = (int)sizeof(LIBXSMM_FUNCNAME) - 1; c_dbcsr_timeset((const 
char**)&routine_name_ptr, &routine_name_len, &routine_handle); # endif -# if defined(ACC_OPENCL_STREAM_NULL) str = (NULL != stream ? ACC_OPENCL_STREAM(stream) : c_dbcsr_acc_opencl_stream_default()); -# else - str = ACC_OPENCL_STREAM(stream); -# endif assert(NULL != str && NULL != str->queue); result = clFinish(str->queue); # if defined(__DBCSR_ACC) && defined(ACC_OPENCL_PROFILE)