Skip to content

Commit

Permalink
[GPU] Stream refactor
Browse files Browse the repository at this point in the history
  • Loading branch information
vladimir-paramuzov committed Sep 24, 2024
1 parent 05b6c3d commit 6f3316c
Show file tree
Hide file tree
Showing 4 changed files with 56 additions and 69 deletions.
34 changes: 31 additions & 3 deletions src/plugins/intel_gpu/include/intel_gpu/runtime/stream.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,36 @@

namespace cldnn {

// Possible synchronization methods for kernels submitted to a stream.
enum class SyncMethods {
/* Build a dependency graph using events: each kernel creates an event object which is set as a
   dependency of its users. Currently this requires multiple retain/release calls on the event
   after each enqueueNDRange, which is less performant compared to the barriers version.
*/
events = 0,
/* Enqueue barriers between dependent kernels. For example, consider the following diamond dependency graph:
           kernel_0
          /        \
    kernel_1    kernel_2
          \        /
           kernel_3
   In that case we do the following:
   1. Enqueue kernel_0
   2. Enqueue barrier (ensures kernel_0 is completed)
   3. Enqueue kernel_1
   4. Enqueue kernel_2 (doesn't depend on kernel_1)
   5. Enqueue barrier (ensures kernel_1 and kernel_2 are completed)
   6. Enqueue kernel_3
*/
barriers = 1,
/* No explicit synchronization is needed. Applicable for in-order queues only */
none = 2
};

class stream {
public:
using ptr = std::shared_ptr<stream>;
explicit stream(QueueTypes queue_type) : queue_type(queue_type) {}
explicit stream(QueueTypes queue_type, SyncMethods sync_method) : m_queue_type(queue_type), m_sync_method(sync_method) {}
virtual ~stream() = default;

virtual void flush() const = 0;
Expand All @@ -42,16 +68,18 @@ class stream {
virtual event::ptr create_base_event() = 0;
virtual event::ptr aggregate_events(const std::vector<event::ptr>& events, bool group = false, bool is_output = false);

QueueTypes get_queue_type() const { return queue_type; }
QueueTypes get_queue_type() const { return m_queue_type; }

static QueueTypes detect_queue_type(engine_types engine_type, void* queue_handle);
static SyncMethods get_expected_sync_method(const ExecutionConfig& config);

#ifdef ENABLE_ONEDNN_FOR_GPU
virtual dnnl::stream& get_onednn_stream() = 0;
#endif

protected:
QueueTypes queue_type;
QueueTypes m_queue_type;
SyncMethods m_sync_method;
};

} // namespace cldnn
45 changes: 15 additions & 30 deletions src/plugins/intel_gpu/src/runtime/ocl/ocl_stream.cpp
Original file line number Diff line number Diff line change
@@ -1,22 +1,17 @@
// Copyright (C) 2019-2022 Intel Corporation
// Copyright (C) 2019-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "ocl_stream.hpp"
#include "intel_gpu/runtime/stream.hpp"
#include "ocl_event.hpp"
#include "ocl_user_event.hpp"
#include "ocl_command_queues_builder.hpp"
#include "intel_gpu/plugin/common_utils.hpp"
#include "intel_gpu/runtime/debug_configuration.hpp"
#include "ocl_kernel.hpp"
#include "ocl_common.hpp"

#include <cassert>
#include <iomanip>
#include <ios>

#include <fstream>
#include <thread>
#include <string>
#include <vector>
#include <memory>
Expand Down Expand Up @@ -190,28 +185,19 @@ void set_arguments_impl(ocl_kernel_type& kernel,
}
}

// Selects the kernel synchronization method for a stream from the execution config:
// profiling forces per-kernel events (so each kernel's timing can be queried); otherwise
// an out-of-order queue needs explicit barriers, while an in-order queue needs nothing.
sync_methods get_expected_sync_method(const ExecutionConfig& config) {
    const bool profiling_enabled = config.get_property(ov::enable_profiling);
    if (profiling_enabled) {
        return sync_methods::events;
    }
    const auto queue = config.get_property(ov::intel_gpu::queue_type);
    return (queue == QueueTypes::out_of_order) ? sync_methods::barriers : sync_methods::none;
}

} // namespace

ocl_stream::ocl_stream(const ocl_engine &engine, const ExecutionConfig& config)
: stream(config.get_property(ov::intel_gpu::queue_type))
, _engine(engine)
, sync_method(get_expected_sync_method(config)) {
: stream(config.get_property(ov::intel_gpu::queue_type), stream::get_expected_sync_method(config))
, _engine(engine) {
auto context = engine.get_cl_context();
auto device = engine.get_cl_device();
ocl::command_queues_builder queue_builder;
queue_builder.set_profiling(config.get_property(ov::enable_profiling));
queue_builder.set_out_of_order(queue_type == QueueTypes::out_of_order);
queue_builder.set_out_of_order(m_queue_type == QueueTypes::out_of_order);

if (sync_method == sync_methods::none && queue_type == QueueTypes::out_of_order) {
throw std::runtime_error("[CLDNN] Unexpected sync method (none) is specified for out_of_order queue");
}
OPENVINO_ASSERT(m_sync_method != SyncMethods::none || m_queue_type == QueueTypes::in_order,
"[GPU] Unexpected sync method (none) is specified for out_of_order queue");

bool priorty_extensions = engine.extension_supported("cl_khr_priority_hints") && engine.extension_supported("cl_khr_create_command_queue");
queue_builder.set_priority_mode(config.get_property(ov::intel_gpu::hint::queue_priority), priorty_extensions);
Expand All @@ -226,16 +212,15 @@ ocl_stream::ocl_stream(const ocl_engine &engine, const ExecutionConfig& config)
}

ocl_stream::ocl_stream(const ocl_engine &engine, const ExecutionConfig& config, void *handle)
: stream(ocl_stream::detect_queue_type(handle))
, _engine(engine)
, sync_method(get_expected_sync_method(config)) {
: stream(ocl_stream::detect_queue_type(handle), stream::get_expected_sync_method(config))
, _engine(engine) {
auto casted_handle = static_cast<cl_command_queue>(handle);
_command_queue = ocl_queue_type(casted_handle, true);
}

#ifdef ENABLE_ONEDNN_FOR_GPU
dnnl::stream& ocl_stream::get_onednn_stream() {
OPENVINO_ASSERT(queue_type == QueueTypes::in_order, "[GPU] Can't create onednn stream handle as onednn doesn't support out-of-order queue");
OPENVINO_ASSERT(m_queue_type == QueueTypes::in_order, "[GPU] Can't create onednn stream handle as onednn doesn't support out-of-order queue");
OPENVINO_ASSERT(_engine.get_device_info().vendor_id == INTEL_VENDOR_ID, "[GPU] Can't create onednn stream handle as for non-Intel devices");
if (!_onednn_stream) {
_onednn_stream = std::make_shared<dnnl::stream>(dnnl::ocl_interop::make_stream(_engine.get_onednn_engine(), _command_queue.get()));
Expand Down Expand Up @@ -284,21 +269,21 @@ event::ptr ocl_stream::enqueue_kernel(kernel& kernel,
auto local = toNDRange(args_desc.workGroups.local);
std::vector<cl::Event> dep_events;
std::vector<cl::Event>* dep_events_ptr = nullptr;
if (sync_method == sync_methods::events) {
if (m_sync_method == SyncMethods::events) {
for (auto& dep : deps) {
if (auto ocl_base_ev = std::dynamic_pointer_cast<ocl_base_event>(dep)) {
if (ocl_base_ev->get().get() != nullptr)
dep_events.push_back(ocl_base_ev->get());
}
}
dep_events_ptr = &dep_events;
} else if (sync_method == sync_methods::barriers) {
} else if (m_sync_method == SyncMethods::barriers) {
sync_events(deps, is_output);
}

cl::Event ret_ev;

bool set_output_event = sync_method == sync_methods::events || is_output;
bool set_output_event = m_sync_method == SyncMethods::events || is_output;

try {
_command_queue.enqueueNDRangeKernel(kern, cl::NullRange, global, local, dep_events_ptr, set_output_event ? &ret_ev : nullptr);
Expand Down Expand Up @@ -330,7 +315,7 @@ event::ptr ocl_stream::enqueue_marker(std::vector<event::ptr> const& deps, bool
return std::make_shared<ocl_event>(ret_ev);
}

if (sync_method == sync_methods::events) {
if (m_sync_method == SyncMethods::events) {
cl::Event ret_ev;
std::vector<cl::Event> dep_events;
for (auto& dep : deps) {
Expand All @@ -349,7 +334,7 @@ event::ptr ocl_stream::enqueue_marker(std::vector<event::ptr> const& deps, bool
}

return std::make_shared<ocl_event>(ret_ev, ++_queue_counter);
} else if (sync_method == sync_methods::barriers) {
} else if (m_sync_method == SyncMethods::barriers) {
sync_events(deps, is_output);
return std::make_shared<ocl_event>(_last_barrier_ev, _last_barrier);
} else {
Expand Down
39 changes: 3 additions & 36 deletions src/plugins/intel_gpu/src/runtime/ocl/ocl_stream.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,55 +10,24 @@
#include "ocl_engine.hpp"

#include <memory>
#include <chrono>
#include <thread>
#include <iostream>
#include <sstream>
#include <utility>
#include <vector>

namespace cldnn {
namespace ocl {

// Possible synchronization methods for kernels submitted to a stream.
enum class sync_methods {
/* Build a dependency graph using events: each kernel creates a proper cl_event which is set as a
   dependency of its users. Currently this requires multiple retain/release calls on the cl_event
   after each enqueueNDRange, which is less performant compared to the barriers version.
*/
events = 0,
/* Enqueue barriers between dependent kernels. For example, consider the following diamond dependency graph:
           kernel_0
          /        \
    kernel_1    kernel_2
          \        /
           kernel_3
   In that case we do the following:
   1. Enqueue kernel_0
   2. Enqueue barrier (ensures kernel_0 is completed)
   3. Enqueue kernel_1
   4. Enqueue kernel_2 (doesn't depend on kernel_1)
   5. Enqueue barrier (ensures kernel_1 and kernel_2 are completed)
   6. Enqueue kernel_3
*/
barriers = 1,
/* No explicit synchronization is needed. Applicable for in-order queues only */
none = 2
};
class ocl_stream : public stream {
public:
const ocl_queue_type& get_cl_queue() const { return _command_queue; }

explicit ocl_stream(const ocl_engine& engine, const ExecutionConfig& config);
ocl_stream(const ocl_engine& engine, const ExecutionConfig& config);
ocl_stream(const ocl_engine &engine, const ExecutionConfig& config, void *handle);
ocl_stream(ocl_stream&& other)
: stream(other.queue_type)
: stream(other.m_queue_type, other.m_sync_method)
, _engine(other._engine)
, _command_queue(other._command_queue)
, _queue_counter(other._queue_counter.load())
, _last_barrier(other._last_barrier.load())
, _last_barrier_ev(other._last_barrier_ev)
, sync_method(other.sync_method) {}
, _last_barrier_ev(other._last_barrier_ev) {}

~ocl_stream() = default;

Expand Down Expand Up @@ -96,8 +65,6 @@ class ocl_stream : public stream {
std::atomic<uint64_t> _last_barrier{0};
cl::Event _last_barrier_ev;

sync_methods sync_method;

#ifdef ENABLE_ONEDNN_FOR_GPU
std::shared_ptr<dnnl::stream> _onednn_stream = nullptr;
#endif
Expand Down
7 changes: 7 additions & 0 deletions src/plugins/intel_gpu/src/runtime/stream.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,13 @@ QueueTypes stream::detect_queue_type(engine_types engine_type, void* queue_handl
}
}

// Chooses the synchronization method for a stream based on the execution config.
// Profiling requires a per-kernel event, so it takes precedence over everything else;
// otherwise out-of-order queues use barriers and in-order queues need no explicit sync.
SyncMethods stream::get_expected_sync_method(const ExecutionConfig& config) {
    if (config.get_property(ov::enable_profiling)) {
        return SyncMethods::events;
    }
    const auto queue = config.get_property(ov::intel_gpu::queue_type);
    return (queue == QueueTypes::out_of_order) ? SyncMethods::barriers : SyncMethods::none;
}

event::ptr stream::aggregate_events(const std::vector<event::ptr>& events, bool group, bool is_output) {
if (events.size() == 1 && !is_output)
return events[0];
Expand Down

0 comments on commit 6f3316c

Please sign in to comment.