Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add the option to measure separate timers per thread #3378

Open
wants to merge 14 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ set( with-models OFF CACHE STRING "The models to include as a semicolon-separate
set( tics_per_ms "1000.0" CACHE STRING "Specify elementary unit of time [default=1000 tics per ms]." )
set( tics_per_step "100" CACHE STRING "Specify resolution [default=100 tics per step]." )
set( with-detailed-timers OFF CACHE STRING "Build with detailed internal time measurements [default=OFF]. Detailed timers can affect the performance." )
set( with-threaded-timers ON CACHE STRING "Build with one internal timer per thread [default=ON]. Multi-threaded timers can affect the performance." )
JanVogelsang marked this conversation as resolved.
Show resolved Hide resolved
set( target-bits-split "standard" CACHE STRING "Split of the 64-bit target neuron identifier type [default='standard']. 'standard' is recommended for most users. If running on more than 262144 MPI processes or more than 512 threads, change to 'hpc'." )

# generic build configuration
Expand Down Expand Up @@ -143,6 +144,7 @@ nest_process_with_gsl()
nest_process_with_openmp()
nest_process_with_mpi()
nest_process_with_detailed_timers()
nest_process_with_threaded_timers()
nest_process_with_libneurosim()
nest_process_with_music()
nest_process_with_sionlib()
Expand Down
7 changes: 7 additions & 0 deletions cmake/ConfigureSummary.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,13 @@ function( NEST_PRINT_CONFIG_SUMMARY )
message( "Detailed timers : No" )
endif ()

message( "" )
if ( THREADED_TIMERS )
message( "Threaded timers : Yes" )
else ()
message( "Threaded timers : No" )
endif ()

JanVogelsang marked this conversation as resolved.
Show resolved Hide resolved
message( "" )
if ( HAVE_MUSIC )
message( "Use MUSIC : Yes (MUSIC ${MUSIC_VERSION})" )
Expand Down
7 changes: 7 additions & 0 deletions cmake/ProcessOptions.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -462,6 +462,13 @@ function( NEST_PROCESS_WITH_DETAILED_TIMERS )
endif ()
endfunction()

# Translate the user-facing `with-threaded-timers` cache option into the
# internal THREADED_TIMERS flag, which is set in the caller's scope.
#
# THREADED_TIMERS is ON only when the option is exactly "ON"; every other
# value (including an empty option) leaves it OFF.
function( NEST_PROCESS_WITH_THREADED_TIMERS )
  set( THREADED_TIMERS OFF PARENT_SCOPE )
  # Quote the expansion: an unquoted empty value would leave if() without a
  # left operand and abort configuration, and an unquoted value could be
  # split as a list or re-dereferenced as a variable name.
  if ( "${with-threaded-timers}" STREQUAL "ON" )
    set( THREADED_TIMERS ON PARENT_SCOPE )
  endif ()
endfunction()

function( NEST_PROCESS_WITH_LIBNEUROSIM )
# Find libneurosim
set( HAVE_LIBNEUROSIM OFF PARENT_SCOPE )
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -637,12 +637,12 @@ For example, the ``stopwatch.h`` file could look like:
} /* namespace timer */
#endif /* STOPWATCH_H */

And the corresponding ``stopwatch.cpp``:
And the corresponding ``stopwatch_impl.h``:

.. code:: cpp

/*
* stopwatch.cpp
* stopwatch_impl.h
*
* This file is part of NEST.
*
Expand Down
7 changes: 5 additions & 2 deletions doc/htmldoc/installation/cmake_options.rst
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ For more details, see the :ref:`Python binding <compile_with_python>` section be
.. _performance_cmake:

Maximize performance, reduce energy consumption
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

The following options help to optimize NEST for maximal performance and thus reduced energy consumption.

Expand All @@ -126,7 +126,7 @@ The following options help to optimize NEST for maximal performance and thus red
in place.
* Using ``-march=native`` requires that you build NEST on the same CPU architecture as you will use to run it.
* For the technically minded: Even just using ``-O3`` removes some ``assert()`` statements from NEST since we
have wrapped some of them in functions, which get eliminated due to interprocedural optimization.
have wrapped some of them in functions, which get eliminated due to interprocedural optimization.



Expand Down Expand Up @@ -197,6 +197,9 @@ NEST properties
+-----------------------------------------------+----------------------------------------------------------------+
| ``-Dtics_per_step=[number]`` | Specify resolution [default=100 tics per step]. |
+-----------------------------------------------+----------------------------------------------------------------+
| ``-Dwith-threaded-timers=[OFF|ON]`` | Build with one internal timer per thread [default=ON]. |
| | Multi-threaded timers can affect the performance. |
+-----------------------------------------------+----------------------------------------------------------------+
| ``-Dwith-detailed-timers=[OFF|ON]`` | Build with detailed internal time measurements [default=OFF]. |
| | Detailed timers can affect the performance. |
+-----------------------------------------------+----------------------------------------------------------------+
Expand Down
8 changes: 4 additions & 4 deletions doc/htmldoc/nest_behavior/built-in_timers.rst
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ The following basic time measurements are available:
|Name |Explanation |
+=============================+==================================+
|``time_construction_create`` |Cumulative time NEST spent |
| |creating neurons and devices |
| |creating neurons and devices |
+-----------------------------+----------------------------------+
|``time_construction_connect``|Cumulative time NEST spent |
| |creating connections |
Expand Down Expand Up @@ -54,7 +54,7 @@ attributes are:
+-----------------------+----------------------------------+
|``local_spike_counter``|Number of spikes emitted by the |
| |neurons represented on this MPI |
| |rank during the last |
| |rank during the last |
| |``Simulate()`` |
+-----------------------+----------------------------------+

Expand Down Expand Up @@ -82,11 +82,11 @@ available as kernel attributes:
+================================+==================================+==================================+
|``time_gather_target_data`` |Cumulative time for communicating |``time_communicate_prepare`` |
| |connection information from | |
| |postsynaptic to presynaptic side | |
| |postsynaptic to presynaptic side | |
+--------------------------------+----------------------------------+----------------------------------+
|``time_communicate_target_data``|Cumulative time for core MPI |``time_gather_target_data`` |
| |communication when gathering | |
| |target data | |
| |target data | |
JanVogelsang marked this conversation as resolved.
Show resolved Hide resolved
+--------------------------------+----------------------------------+----------------------------------+
|``time_update`` |Time for neuron update |``time_simulate`` |
+--------------------------------+----------------------------------+----------------------------------+
JanVogelsang marked this conversation as resolved.
Show resolved Hide resolved
Expand Down
1 change: 0 additions & 1 deletion libnestutil/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,6 @@ set( nestutil_sources
numerics.h numerics.cpp
regula_falsi.h
sort.h
stopwatch.h stopwatch.cpp
string_utils.h
vector_util.h
)
Expand Down
3 changes: 3 additions & 0 deletions libnestutil/config.h.in
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,9 @@
/* Whether to enable detailed NEST internal timers */
#cmakedefine TIMER_DETAILED 1

/* Whether to use one NEST internal timer per thread */
#cmakedefine THREADED_TIMERS 1

/* Whether to do full logging */
#cmakedefine ENABLE_FULL_LOGGING 1

Expand Down
33 changes: 0 additions & 33 deletions libnestutil/stopwatch.cpp

This file was deleted.

1 change: 1 addition & 0 deletions nestkernel/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,7 @@ set ( nestkernel_sources
stimulation_backend.h
buffer_resize_log.h buffer_resize_log.cpp
nest_extension_interface.h
stopwatch.h stopwatch_impl.h
)


Expand Down
3 changes: 3 additions & 0 deletions nestkernel/connection_manager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@
#include "nest_names.h"
#include "node.h"
#include "sonata_connector.h"
#include "stopwatch_impl.h"
#include "target_table_devices_impl.h"
#include "vp_manager_impl.h"

Expand Down Expand Up @@ -1800,7 +1801,9 @@ nest::ConnectionManager::collect_compressed_spike_data( const size_t tid )
} // of omp single; implicit barrier

source_table_.collect_compressible_sources( tid );
kernel().get_omp_synchronization_stopwatch().start();
#pragma omp barrier
kernel().get_omp_synchronization_stopwatch().stop();
#pragma omp single
{
source_table_.fill_compressed_spike_data( compressed_spike_data_ );
Expand Down
2 changes: 1 addition & 1 deletion nestkernel/connection_manager.h
Original file line number Diff line number Diff line change
Expand Up @@ -461,7 +461,7 @@ class ConnectionManager : public ManagerInterface

// public stop watch for benchmarking purposes
// start and stop in high-level connect functions in nestmodule.cpp and nest.cpp
Stopwatch sw_construction_connect;
Stopwatch< StopwatchVerbosity::Normal, StopwatchType::MasterOnly > sw_construction_connect;

const std::vector< SpikeData >& get_compressed_spike_data( const synindex syn_id, const size_t idx );

Expand Down
66 changes: 32 additions & 34 deletions nestkernel/event_delivery_manager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,14 +34,13 @@
#include "mpi_manager_impl.h"
#include "send_buffer_position.h"
#include "source.h"
#include "stopwatch_impl.h"
#include "vp_manager.h"
#include "vp_manager_impl.h"

// Includes from sli:
#include "dictutils.h"

#include "compose.hpp"

namespace nest
{

Expand Down Expand Up @@ -191,11 +190,9 @@ EventDeliveryManager::get_status( DictionaryDatum& dict )
( *dict )[ names::spike_buffer_resize_log ] = log_events;
send_recv_buffer_resize_log_.to_dict( log_events );

#ifdef TIMER_DETAILED
def< double >( dict, names::time_collocate_spike_data, sw_collocate_spike_data_.elapsed() );
def< double >( dict, names::time_communicate_spike_data, sw_communicate_spike_data_.elapsed() );
def< double >( dict, names::time_communicate_target_data, sw_communicate_target_data_.elapsed() );
#endif
sw_collocate_spike_data_.output_timer( dict, names::time_collocate_spike_data );
sw_communicate_spike_data_.output_timer( dict, names::time_communicate_spike_data );
sw_communicate_target_data_.output_timer( dict, names::time_communicate_target_data );
}

void
Expand Down Expand Up @@ -317,18 +314,14 @@ EventDeliveryManager::reset_counters()
// Reset the stopwatch that times the target-data communication
// (sw_communicate_target_data_).
// As written here, the reset is compiled only when TIMER_DETAILED is defined.
void
EventDeliveryManager::reset_timers_for_preparation()
{
#ifdef TIMER_DETAILED
sw_communicate_target_data_.reset();
#endif
}

// Reset the stopwatches that time spike-data collocation and spike-data
// communication (sw_collocate_spike_data_, sw_communicate_spike_data_).
// As written here, both resets are compiled only when TIMER_DETAILED is defined.
void
EventDeliveryManager::reset_timers_for_dynamics()
{
#ifdef TIMER_DETAILED
sw_collocate_spike_data_.reset();
sw_communicate_spike_data_.reset();
#endif
}

void
Expand Down Expand Up @@ -400,11 +393,7 @@ EventDeliveryManager::gather_spike_data_( std::vector< SpikeDataT >& send_buffer
// Need to get new positions in case buffer size has changed
SendBufferPosition send_buffer_position;

#ifdef TIMER_DETAILED
{
sw_collocate_spike_data_.start();
}
#endif
sw_collocate_spike_data_.start();

// Set marker at end of each chunk to DEFAULT
reset_complete_marker_spike_data_( send_buffer_position, send_buffer );
Expand All @@ -426,12 +415,13 @@ EventDeliveryManager::gather_spike_data_( std::vector< SpikeDataT >& send_buffer
// as all chunk-end entries, have marker DEFAULT.
set_end_marker_( send_buffer_position, send_buffer, local_max_spikes_per_rank );

#ifdef TIMER_DETAILED
{
sw_collocate_spike_data_.stop();
sw_communicate_spike_data_.start();
}
sw_collocate_spike_data_.stop();
#if defined( HAVE_MPI ) && defined( TIMER_DETAILED )
kernel().get_mpi_synchronization_stopwatch().start();
kernel().mpi_manager.synchronize();
kernel().get_mpi_synchronization_stopwatch().stop();
#endif
sw_communicate_spike_data_.start();
JanVogelsang marked this conversation as resolved.
Show resolved Hide resolved

// Given that we templatize by plain vs offgrid, this if should not be necessary, but ...
if ( off_grid_spiking_ )
Expand All @@ -443,11 +433,7 @@ EventDeliveryManager::gather_spike_data_( std::vector< SpikeDataT >& send_buffer
kernel().mpi_manager.communicate_spike_data_Alltoall( send_buffer, recv_buffer );
}

#ifdef TIMER_DETAILED
{
sw_communicate_spike_data_.stop();
}
#endif
sw_communicate_spike_data_.stop();

global_max_spikes_per_rank_ = get_global_max_spikes_per_rank_( send_buffer_position, recv_buffer );

Expand Down Expand Up @@ -811,7 +797,9 @@ EventDeliveryManager::gather_target_data( const size_t tid )
resize_send_recv_buffers_target_data();
}
} // of omp master; (no barrier)
kernel().get_omp_synchronization_stopwatch().start();
#pragma omp barrier
kernel().get_omp_synchronization_stopwatch().stop();

kernel().connection_manager.restore_source_table_entry_point( tid );

Expand All @@ -826,18 +814,21 @@ EventDeliveryManager::gather_target_data( const size_t tid )
set_complete_marker_target_data_( assigned_ranks, send_buffer_position );
}
kernel().connection_manager.save_source_table_entry_point( tid );
kernel().get_omp_synchronization_stopwatch().start();
#pragma omp barrier
kernel().get_omp_synchronization_stopwatch().stop();
kernel().connection_manager.clean_source_table( tid );

#pragma omp master
{
#ifdef TIMER_DETAILED
sw_communicate_target_data_.start();
#if defined( HAVE_MPI ) && defined( TIMER_DETAILED )
kernel().get_mpi_synchronization_stopwatch().start();
JanVogelsang marked this conversation as resolved.
Show resolved Hide resolved
kernel().mpi_manager.synchronize();
kernel().get_mpi_synchronization_stopwatch().stop();
#endif
sw_communicate_target_data_.start();
kernel().mpi_manager.communicate_target_data_Alltoall( send_buffer_target_data_, recv_buffer_target_data_ );
#ifdef TIMER_DETAILED
sw_communicate_target_data_.stop();
#endif
} // of omp master (no barriers!)
#pragma omp barrier

Expand Down Expand Up @@ -883,7 +874,9 @@ EventDeliveryManager::gather_target_data_compressed( const size_t tid )
resize_send_recv_buffers_target_data();
}
} // of omp master; no barrier
kernel().get_omp_synchronization_stopwatch().start();
#pragma omp barrier
kernel().get_omp_synchronization_stopwatch().stop();

TargetSendBufferPosition send_buffer_position(
assigned_ranks, kernel().mpi_manager.get_send_recv_count_target_data_per_rank() );
Expand All @@ -898,17 +891,20 @@ EventDeliveryManager::gather_target_data_compressed( const size_t tid )
set_complete_marker_target_data_( assigned_ranks, send_buffer_position );
}

kernel().get_omp_synchronization_stopwatch().start();
#pragma omp barrier
kernel().get_omp_synchronization_stopwatch().stop();

#pragma omp master
{
#ifdef TIMER_DETAILED
sw_communicate_target_data_.start();
#if defined( HAVE_MPI ) && defined( TIMER_DETAILED )
kernel().get_mpi_synchronization_stopwatch().start();
kernel().mpi_manager.synchronize();
kernel().get_mpi_synchronization_stopwatch().stop();
#endif
sw_communicate_target_data_.start();
kernel().mpi_manager.communicate_target_data_Alltoall( send_buffer_target_data_, recv_buffer_target_data_ );
#ifdef TIMER_DETAILED
sw_communicate_target_data_.stop();
#endif
} // of omp master (no barrier)
#pragma omp barrier

Expand All @@ -925,7 +921,9 @@ EventDeliveryManager::gather_target_data_compressed( const size_t tid )
{
buffer_size_target_data_has_changed_ = kernel().mpi_manager.increase_buffer_size_target_data();
} // of omp master (no barrier)
kernel().get_omp_synchronization_stopwatch().start();
#pragma omp barrier
kernel().get_omp_synchronization_stopwatch().stop();
}

} // of while
Expand Down
9 changes: 4 additions & 5 deletions nestkernel/event_delivery_manager.h
Original file line number Diff line number Diff line change
Expand Up @@ -466,13 +466,12 @@ class EventDeliveryManager : public ManagerInterface

PerThreadBoolIndicator gather_completed_checker_;

#ifdef TIMER_DETAILED
// private stop watches for benchmarking purposes
// (intended for internal core developers, not for use in the public API)
Stopwatch sw_collocate_spike_data_;
Stopwatch sw_communicate_spike_data_;
Stopwatch sw_communicate_target_data_;
#endif
// TODO JV: Make sure TIMER_DETAILED is only ever used in stopwatch.h
Stopwatch< StopwatchVerbosity::Detailed, StopwatchType::MasterOnly > sw_collocate_spike_data_;
Stopwatch< StopwatchVerbosity::Detailed, StopwatchType::MasterOnly > sw_communicate_spike_data_;
Stopwatch< StopwatchVerbosity::Detailed, StopwatchType::MasterOnly > sw_communicate_target_data_;
};

inline void
Expand Down
Loading
Loading