From 1482f00275c4496ff9b2b7f348e39370c7547adc Mon Sep 17 00:00:00 2001 From: Ivo Anjo Date: Thu, 19 Dec 2024 10:54:12 +0000 Subject: [PATCH 01/17] Add unsafe api calls checker to track down issues such as #4195 This checker is used to detect accidental thread scheduling switching points happening during profiling sampling. See the bigger comment in unsafe_api_calls_check.h . I was able to check that this checker correctly triggers for the bug in #4195, and also the bug I'm going to fix next, which is the use of `rb_hash_lookup` in the otel context reading code. --- .../collectors_thread_context.c | 78 +++++++++++++++++-- .../profiling.c | 2 + .../unsafe_api_calls_check.c | 47 +++++++++++ .../unsafe_api_calls_check.h | 25 ++++++ lib/datadog/core/configuration.rb | 2 +- .../collectors/thread_context_spec.rb | 23 +++--- 6 files changed, 159 insertions(+), 18 deletions(-) create mode 100644 ext/datadog_profiling_native_extension/unsafe_api_calls_check.c create mode 100644 ext/datadog_profiling_native_extension/unsafe_api_calls_check.h diff --git a/ext/datadog_profiling_native_extension/collectors_thread_context.c b/ext/datadog_profiling_native_extension/collectors_thread_context.c index 0b56268bed5..b9606797106 100644 --- a/ext/datadog_profiling_native_extension/collectors_thread_context.c +++ b/ext/datadog_profiling_native_extension/collectors_thread_context.c @@ -9,6 +9,7 @@ #include "private_vm_api_access.h" #include "stack_recorder.h" #include "time_helpers.h" +#include "unsafe_api_calls_check.h" // Used to trigger sampling of threads, based on external "events", such as: // * periodic timer for cpu-time and wall-time @@ -203,10 +204,10 @@ static int hash_map_per_thread_context_mark(st_data_t key_thread, st_data_t _val static int hash_map_per_thread_context_free_values(st_data_t _thread, st_data_t value_per_thread_context, st_data_t _argument); static VALUE _native_new(VALUE klass); static VALUE _native_initialize(int argc, VALUE *argv, DDTRACE_UNUSED VALUE _self); -static VALUE _native_sample(VALUE self, VALUE collector_instance, VALUE profiler_overhead_stack_thread); +static VALUE _native_sample(VALUE self, VALUE collector_instance, VALUE profiler_overhead_stack_thread, VALUE allow_exception); static VALUE _native_on_gc_start(VALUE self, VALUE collector_instance); static VALUE _native_on_gc_finish(VALUE self, VALUE collector_instance); -static VALUE _native_sample_after_gc(DDTRACE_UNUSED VALUE self, VALUE collector_instance, VALUE reset_monotonic_to_system_state); +static VALUE _native_sample_after_gc(DDTRACE_UNUSED VALUE self, VALUE collector_instance, VALUE reset_monotonic_to_system_state, VALUE allow_exception); static void update_metrics_and_sample( struct thread_context_collector_state *state, VALUE thread_being_sampled, @@ -310,11 +311,11 @@ void collectors_thread_context_init(VALUE profiling_module) { rb_define_singleton_method(collectors_thread_context_class, "_native_initialize", _native_initialize, -1); rb_define_singleton_method(collectors_thread_context_class, "_native_inspect", _native_inspect, 1); rb_define_singleton_method(collectors_thread_context_class, "_native_reset_after_fork", _native_reset_after_fork, 1); - rb_define_singleton_method(testing_module, "_native_sample", _native_sample, 2); + rb_define_singleton_method(testing_module, "_native_sample", _native_sample, 3); rb_define_singleton_method(testing_module, "_native_sample_allocation", _native_sample_allocation, 3); rb_define_singleton_method(testing_module, "_native_on_gc_start", _native_on_gc_start, 1); rb_define_singleton_method(testing_module, "_native_on_gc_finish", _native_on_gc_finish, 1); - rb_define_singleton_method(testing_module, "_native_sample_after_gc", _native_sample_after_gc, 2); + rb_define_singleton_method(testing_module, "_native_sample_after_gc", _native_sample_after_gc, 3); rb_define_singleton_method(testing_module, "_native_thread_list", _native_thread_list, 0); rb_define_singleton_method(testing_module, "_native_per_thread_context", _native_per_thread_context, 1); rb_define_singleton_method(testing_module, "_native_stats", _native_stats, 1); @@ -504,31 +505,49 @@ static VALUE _native_initialize(int argc, VALUE *argv, DDTRACE_UNUSED VALUE _sel // This method exists only to enable testing Datadog::Profiling::Collectors::ThreadContext behavior using RSpec. // It SHOULD NOT be used for other purposes. -static VALUE _native_sample(DDTRACE_UNUSED VALUE _self, VALUE collector_instance, VALUE profiler_overhead_stack_thread) { +static VALUE _native_sample(DDTRACE_UNUSED VALUE _self, VALUE collector_instance, VALUE profiler_overhead_stack_thread, VALUE allow_exception) { + ENFORCE_BOOLEAN(allow_exception); + if (!is_thread_alive(profiler_overhead_stack_thread)) rb_raise(rb_eArgError, "Unexpected: profiler_overhead_stack_thread is not alive"); + if (allow_exception == Qfalse) debug_enter_unsafe_context(); + thread_context_collector_sample(collector_instance, monotonic_wall_time_now_ns(RAISE_ON_FAILURE), profiler_overhead_stack_thread); + + if (allow_exception == Qfalse) debug_leave_unsafe_context(); + return Qtrue; } // This method exists only to enable testing Datadog::Profiling::Collectors::ThreadContext behavior using RSpec. // It SHOULD NOT be used for other purposes. static VALUE _native_on_gc_start(DDTRACE_UNUSED VALUE self, VALUE collector_instance) { + debug_enter_unsafe_context(); + thread_context_collector_on_gc_start(collector_instance); + + debug_leave_unsafe_context(); + return Qtrue; } // This method exists only to enable testing Datadog::Profiling::Collectors::ThreadContext behavior using RSpec. // It SHOULD NOT be used for other purposes. static VALUE _native_on_gc_finish(DDTRACE_UNUSED VALUE self, VALUE collector_instance) { + debug_enter_unsafe_context(); + (void) !thread_context_collector_on_gc_finish(collector_instance); + + debug_leave_unsafe_context(); + return Qtrue; } // This method exists only to enable testing Datadog::Profiling::Collectors::ThreadContext behavior using RSpec. // It SHOULD NOT be used for other purposes. -static VALUE _native_sample_after_gc(DDTRACE_UNUSED VALUE self, VALUE collector_instance, VALUE reset_monotonic_to_system_state) { +static VALUE _native_sample_after_gc(DDTRACE_UNUSED VALUE self, VALUE collector_instance, VALUE reset_monotonic_to_system_state, VALUE allow_exception) { ENFORCE_BOOLEAN(reset_monotonic_to_system_state); + ENFORCE_BOOLEAN(allow_exception); struct thread_context_collector_state *state; TypedData_Get_Struct(collector_instance, struct thread_context_collector_state, &thread_context_collector_typed_data, state); @@ -537,7 +556,12 @@ static VALUE _native_sample_after_gc(DDTRACE_UNUSED VALUE self, VALUE collector_ state->time_converter_state = (monotonic_to_system_epoch_state) MONOTONIC_TO_SYSTEM_EPOCH_INITIALIZER; } + if (allow_exception == Qfalse) debug_enter_unsafe_context(); + thread_context_collector_sample_after_gc(collector_instance); + + if (allow_exception == Qfalse) debug_leave_unsafe_context(); + return Qtrue; } @@ -982,7 +1006,13 @@ static void trigger_sample_for_thread( // It SHOULD NOT be used for other purposes. static VALUE _native_thread_list(DDTRACE_UNUSED VALUE _self) { VALUE result = rb_ary_new(); + + debug_enter_unsafe_context(); + ddtrace_thread_list(result); + + debug_leave_unsafe_context(); + return result; } @@ -1501,7 +1531,12 @@ void thread_context_collector_sample_allocation(VALUE self_instance, unsigned in // This method exists only to enable testing Datadog::Profiling::Collectors::ThreadContext behavior using RSpec. // It SHOULD NOT be used for other purposes. static VALUE _native_sample_allocation(DDTRACE_UNUSED VALUE self, VALUE collector_instance, VALUE sample_weight, VALUE new_object) { + debug_enter_unsafe_context(); + thread_context_collector_sample_allocation(collector_instance, NUM2UINT(sample_weight), new_object); + + debug_leave_unsafe_context(); + return Qtrue; } @@ -1640,7 +1675,12 @@ void thread_context_collector_sample_skipped_allocation_samples(VALUE self_insta } static VALUE _native_sample_skipped_allocation_samples(DDTRACE_UNUSED VALUE self, VALUE collector_instance, VALUE skipped_samples) { + debug_enter_unsafe_context(); + thread_context_collector_sample_skipped_allocation_samples(collector_instance, NUM2UINT(skipped_samples)); + + debug_leave_unsafe_context(); + return Qtrue; } @@ -1979,31 +2019,53 @@ static uint64_t otel_span_id_to_uint(VALUE otel_span_id) { static VALUE _native_on_gvl_waiting(DDTRACE_UNUSED VALUE self, VALUE thread) { ENFORCE_THREAD(thread); + debug_enter_unsafe_context(); + thread_context_collector_on_gvl_waiting(thread_from_thread_object(thread)); + + debug_leave_unsafe_context(); + return Qnil; } static VALUE _native_gvl_waiting_at_for(DDTRACE_UNUSED VALUE self, VALUE thread) { ENFORCE_THREAD(thread); + debug_enter_unsafe_context(); + intptr_t gvl_waiting_at = gvl_profiling_state_thread_object_get(thread); + + debug_leave_unsafe_context(); + return LONG2NUM(gvl_waiting_at); } static VALUE _native_on_gvl_running(DDTRACE_UNUSED VALUE self, VALUE thread) { ENFORCE_THREAD(thread); - return thread_context_collector_on_gvl_running(thread_from_thread_object(thread)) == ON_GVL_RUNNING_SAMPLE ? Qtrue : Qfalse; + debug_enter_unsafe_context(); + + VALUE result = thread_context_collector_on_gvl_running(thread_from_thread_object(thread)) == ON_GVL_RUNNING_SAMPLE ? Qtrue : Qfalse; + + debug_leave_unsafe_context(); + + return result; } static VALUE _native_sample_after_gvl_running(DDTRACE_UNUSED VALUE self, VALUE collector_instance, VALUE thread) { ENFORCE_THREAD(thread); - return thread_context_collector_sample_after_gvl_running( + debug_enter_unsafe_context(); + + VALUE result = thread_context_collector_sample_after_gvl_running( collector_instance, thread, monotonic_wall_time_now_ns(RAISE_ON_FAILURE) ); + + debug_leave_unsafe_context(); + + return result; } static VALUE _native_apply_delta_to_cpu_time_at_previous_sample_ns(DDTRACE_UNUSED VALUE self, VALUE collector_instance, VALUE thread, VALUE delta_ns) { diff --git a/ext/datadog_profiling_native_extension/profiling.c b/ext/datadog_profiling_native_extension/profiling.c index 315ca0d2954..a7bfe0d466b 100644 --- a/ext/datadog_profiling_native_extension/profiling.c +++ b/ext/datadog_profiling_native_extension/profiling.c @@ -11,6 +11,7 @@ #include "ruby_helpers.h" #include "setup_signal_handler.h" #include "time_helpers.h" +#include "unsafe_api_calls_check.h" // Each class/module here is implemented in their separate file void collectors_cpu_and_wall_time_worker_init(VALUE profiling_module); @@ -56,6 +57,7 @@ void DDTRACE_EXPORT Init_datadog_profiling_native_extension(void) { collectors_thread_context_init(profiling_module); http_transport_init(profiling_module); stack_recorder_init(profiling_module); + unsafe_api_calls_check_init(); // Hosts methods used for testing the native code using RSpec VALUE testing_module = rb_define_module_under(native_extension_module, "Testing"); diff --git a/ext/datadog_profiling_native_extension/unsafe_api_calls_check.c b/ext/datadog_profiling_native_extension/unsafe_api_calls_check.c new file mode 100644 index 00000000000..c3c23b7f1c7 --- /dev/null +++ b/ext/datadog_profiling_native_extension/unsafe_api_calls_check.c @@ -0,0 +1,47 @@ +#include +#include +#include + +#include "datadog_ruby_common.h" +#include "unsafe_api_calls_check.h" +#include "extconf.h" + +static bool inside_unsafe_context = false; + +#ifndef NO_POSTPONED_TRIGGER + static rb_postponed_job_handle_t check_for_unsafe_api_calls_handle; +#endif + +static void check_for_unsafe_api_calls(DDTRACE_UNUSED void *_unused); + +void unsafe_api_calls_check_init(void) { + #ifndef NO_POSTPONED_TRIGGER + int unused_flags = 0; + + check_for_unsafe_api_calls_handle = rb_postponed_job_preregister(unused_flags, check_for_unsafe_api_calls, NULL); + + if (check_for_unsafe_api_calls_handle == POSTPONED_JOB_HANDLE_INVALID) { + rb_raise(rb_eRuntimeError, "Failed to register check_for_unsafe_api_calls_handle postponed job (got POSTPONED_JOB_HANDLE_INVALID)"); + } + #endif +} + +void debug_enter_unsafe_context(void) { + inside_unsafe_context = true; + + #ifndef NO_POSTPONED_TRIGGER + rb_postponed_job_trigger(check_for_unsafe_api_calls_handle); + #else + rb_postponed_job_register(0, check_for_unsafe_api_calls, NULL); + #endif +} + +void debug_leave_unsafe_context(void) { + inside_unsafe_context = false; +} + +static void check_for_unsafe_api_calls(DDTRACE_UNUSED void *_unused) { + if (inside_unsafe_context) rb_bug( + "Datadog Ruby profiler detected callback nested inside sample. Please report this at https://github.com/datadog/dd-trace-rb/blob/master/CONTRIBUTING.md#found-a-bug" + ); +} diff --git a/ext/datadog_profiling_native_extension/unsafe_api_calls_check.h b/ext/datadog_profiling_native_extension/unsafe_api_calls_check.h new file mode 100644 index 00000000000..092a59f30b3 --- /dev/null +++ b/ext/datadog_profiling_native_extension/unsafe_api_calls_check.h @@ -0,0 +1,25 @@ +#pragma once + +// This checker is used to detect accidental thread scheduling switching points happening during profiling sampling. +// +// Specifically, when the profiler is sampling, we're never supposed to call into Ruby code (e.g. methods +// implemented using Ruby code) or allocate Ruby objects. +// That's because those events introduce thread switch points, and really we don't the VM switching between threads +// in the middle of the profiler sampling. +// This includes raising exceptions, unless we're trying to stop the profiler, and even then we must be careful. +// +// The above is especially true in situations such as GC profiling or allocation/heap profiling, as in those situations +// we can even crash the Ruby VM if we switch away at the wrong time. +// +// The below APIs can be used to detect these situations. They work by relying on the following observation: +// in most (all?) thread switch points, Ruby will check for interrupts and run the postponed jobs. +// +// Thus, if we set a flag while we're sampling (inside_unsafe_context), trigger the postponed job, and then only unset +// the flag after sampling, he correct thing to happen is that the postponed job should never see the flag. +// +// If, however, we have a bug and there's a thread switch point, our postponed job will see the flag and immediately +// stop the Ruby VM before further damage happens (and hopefully giving us a stack trace clearly pointing to the culprit). + +void unsafe_api_calls_check_init(void); +void debug_enter_unsafe_context(void); +void debug_leave_unsafe_context(void); diff --git a/lib/datadog/core/configuration.rb b/lib/datadog/core/configuration.rb index 826a3bb414a..4b2b21d6ee7 100644 --- a/lib/datadog/core/configuration.rb +++ b/lib/datadog/core/configuration.rb @@ -236,7 +236,7 @@ def safely_synchronize rescue ThreadError => e logger_without_components.error( 'Detected deadlock during datadog initialization. ' \ - 'Please report this at https://github.com/DataDog/dd-trace-rb/blob/master/CONTRIBUTING.md#found-a-bug' \ + 'Please report this at https://github.com/datadog/dd-trace-rb/blob/master/CONTRIBUTING.md#found-a-bug' \ "\n\tSource:\n\t#{Array(e.backtrace).join("\n\t")}" ) nil diff --git a/spec/datadog/profiling/collectors/thread_context_spec.rb b/spec/datadog/profiling/collectors/thread_context_spec.rb index e7b1bd28ba0..7145729bcec 100644 --- a/spec/datadog/profiling/collectors/thread_context_spec.rb +++ b/spec/datadog/profiling/collectors/thread_context_spec.rb @@ -66,8 +66,8 @@ end end - def sample(profiler_overhead_stack_thread: Thread.current) - described_class::Testing._native_sample(cpu_and_wall_time_collector, profiler_overhead_stack_thread) + def sample(profiler_overhead_stack_thread: Thread.current, allow_exception: false) + described_class::Testing._native_sample(cpu_and_wall_time_collector, profiler_overhead_stack_thread, allow_exception) end def on_gc_start @@ -78,8 +78,12 @@ def on_gc_finish described_class::Testing._native_on_gc_finish(cpu_and_wall_time_collector) end - def sample_after_gc(reset_monotonic_to_system_state: false) - described_class::Testing._native_sample_after_gc(cpu_and_wall_time_collector, reset_monotonic_to_system_state) + def sample_after_gc(reset_monotonic_to_system_state: false, allow_exception: false) + described_class::Testing._native_sample_after_gc( + cpu_and_wall_time_collector, + reset_monotonic_to_system_state, + allow_exception, + ) end def sample_allocation(weight:, new_object: Object.new) @@ -782,18 +786,18 @@ def otel_span_id_to_i(span_id) ) end - context 'raises an exception' do + context 'when an exception is raised' do before { setup_failure } after { expect(ran_log).to eq [:ran_code] } it 'does not leave the exception pending' do - sample + sample(allow_exception: true) expect($!).to be nil end it 'omits the "local root span id" and "span id" labels in the sample' do - sample + sample(allow_exception: true) expect(t1_sample.labels.keys).to_not include(:"local root span id", :"span id") end @@ -1433,7 +1437,7 @@ def sample_and_check(expected_state:) context "when called before on_gc_start/on_gc_finish" do it do - expect { sample_after_gc }.to raise_error(RuntimeError, /Unexpected call to sample_after_gc/) + expect { sample_after_gc(allow_exception: true) }.to raise_error(RuntimeError, /Unexpected call to sample_after_gc/) end end @@ -1451,7 +1455,8 @@ def sample_and_check(expected_state:) it do sample_after_gc - expect { sample_after_gc }.to raise_error(RuntimeError, /Unexpected call to sample_after_gc/) + expect { sample_after_gc(allow_exception: true) } + .to raise_error(RuntimeError, /Unexpected call to sample_after_gc/) end end From 1c26fd855211760e26278406b44d06cef76fb646 Mon Sep 17 00:00:00 2001 From: Ivo Anjo Date: Thu, 19 Dec 2024 14:17:03 +0000 Subject: [PATCH 02/17] Fix going into Ruby code when looking up otel context `rb_hash_lookup` calls `#hash` on the key being looked up so it's safe to use unless during sampling. This can cause the same issue as we saw in #4195 leading to ``` [BUG] unexpected situation - recordd:1 current:0 -- C level backtrace information ------------------------------------------- ruby(rb_print_backtrace+0x11) [0x55ba03ccf90f] vm_dump.c:820 ruby(rb_vm_bugreport) vm_dump.c:1151 ruby(bug_report_end+0x0) [0x55ba03e91607] error.c:1042 ruby(rb_bug_without_die) error.c:1042 ruby(die+0x0) [0x55ba03ac0998] error.c:1050 ruby(rb_bug) error.c:1052 ruby(disallow_reentry+0x0) [0x55ba03ab6dcc] vm_sync.c:226 ruby(rb_ec_vm_lock_rec_check+0x1a) [0x55ba03cb17aa] eval_intern.h:144 ruby(rb_ec_tag_state) eval_intern.h:155 ruby(rb_vm_exec) vm.c:2484 ruby(vm_invoke_proc+0x201) [0x55ba03cb62b1] vm.c:1509 ruby(rb_vm_invoke_proc+0x33) [0x55ba03cb65d3] vm.c:1728 ruby(thread_do_start_proc+0x176) [0x55ba03c63516] thread.c:598 ruby(thread_do_start+0x12) [0x55ba03c648a2] thread.c:615 ruby(thread_start_func_2) thread.c:672 ruby(nt_start+0x107) [0x55ba03c65137] thread_pthread.c:2187 /lib/x86_64-linux-gnu/libpthread.so.0(start_thread+0xd9) [0x7ff360b66609] /lib/x86_64-linux-gnu/libc.so.6(clone+0x43) [0x7ff360a70353] ``` --- .../collectors_thread_context.c | 39 ++++++++- .../collectors/thread_context_spec.rb | 84 ++++++++++++++++++- 2 files changed, 120 insertions(+), 3 deletions(-) diff --git a/ext/datadog_profiling_native_extension/collectors_thread_context.c b/ext/datadog_profiling_native_extension/collectors_thread_context.c index b9606797106..b8e7aedfefb 100644 --- a/ext/datadog_profiling_native_extension/collectors_thread_context.c +++ b/ext/datadog_profiling_native_extension/collectors_thread_context.c @@ -291,6 +291,7 @@ static void otel_without_ddtrace_trace_identifiers_for( ); static struct otel_span otel_span_from(VALUE otel_context, VALUE otel_current_span_key); static uint64_t otel_span_id_to_uint(VALUE otel_span_id); +static VALUE safely_lookup_hash_without_going_into_ruby_code(VALUE hash, VALUE key); void collectors_thread_context_init(VALUE profiling_module) { VALUE collectors_module = rb_define_module_under(profiling_module, "Collectors"); @@ -1632,7 +1633,7 @@ static void ddtrace_otel_trace_identifiers_for( // trace and span representing it. Each ddtrace trace is then connected to the previous otel span, forming a linked // list. The local root span is going to be the trace/span we find at the end of this linked list. while (otel_values != Qnil) { - VALUE otel_span = rb_hash_lookup(otel_values, otel_current_span_key); + VALUE otel_span = safely_lookup_hash_without_going_into_ruby_code(otel_values, otel_current_span_key); if (otel_span == Qnil) break; VALUE next_trace = rb_ivar_get(otel_span, at_datadog_trace_id); if (next_trace == Qnil) break; @@ -1766,7 +1767,7 @@ static struct otel_span otel_span_from(VALUE otel_context, VALUE otel_current_sp if (context_entries == Qnil || !RB_TYPE_P(context_entries, T_HASH)) return failed; // If it exists, context_entries is expected to be a Hash[OpenTelemetry::Context::Key, OpenTelemetry::Trace::Span] - VALUE span = rb_hash_lookup(context_entries, otel_current_span_key); + VALUE span = safely_lookup_hash_without_going_into_ruby_code(context_entries, otel_current_span_key); if (span == Qnil) return failed; // If it exists, span_context is expected to be a OpenTelemetry::Trace::SpanContext (don't confuse it with OpenTelemetry::Context) @@ -2092,3 +2093,37 @@ static uint64_t otel_span_id_to_uint(VALUE otel_span_id) { DDTRACE_UNUSED long current_cpu_time_ns ) { return false; } #endif // NO_GVL_INSTRUMENTATION + +#define MAX_SAFE_LOOKUP_SIZE 16 + +typedef struct { VALUE lookup_key; VALUE result; } safe_lookup_hash_state; + +static int safe_lookup_hash_iterate(VALUE key, VALUE value, VALUE state_ptr) { + safe_lookup_hash_state *state = (safe_lookup_hash_state *) state_ptr; + + if (key == state->lookup_key) { + state->result = value; + return ST_STOP; + } + + return ST_CONTINUE; +} + +// This method exists because we need to look up a hash during sampling, but we don't want to invoke any +// Ruby code as a side effect. To be able to look up by hash, `rb_hash_lookup` calls `#hash` on the key, +// which we want to avoid. +// Thus, instead, we opt to just iterate through the hash and check if we can find what we're looking for. +// +// To avoid having too much overhead here we only iterate in hashes up to MAX_SAFE_LOOKUP_SIZE. +// Note that we don't even try to iterate if the hash is bigger -- this is to avoid flaky behavior where +// depending on the internal storage order of the hash we may or not find the key, and instead we always +// enforce the size. +static VALUE safely_lookup_hash_without_going_into_ruby_code(VALUE hash, VALUE key) { + if (!RB_TYPE_P(hash, T_HASH) || RHASH_SIZE(hash) > MAX_SAFE_LOOKUP_SIZE) return Qnil; + + safe_lookup_hash_state state = {.lookup_key = key, .result = Qnil}; + + rb_hash_foreach(hash, safe_lookup_hash_iterate, (VALUE) &state); + + return state.result; +} diff --git a/spec/datadog/profiling/collectors/thread_context_spec.rb b/spec/datadog/profiling/collectors/thread_context_spec.rb index 7145729bcec..0296be0ff72 100644 --- a/spec/datadog/profiling/collectors/thread_context_spec.rb +++ b/spec/datadog/profiling/collectors/thread_context_spec.rb @@ -588,6 +588,7 @@ def self.otel_otlp_exporter_available? false end + # When opentelemetry-sdk is on the Gemfile, but not opentelemetry-exporter-otlp context "when trace comes from otel sdk", if: otel_sdk_available? && !otel_otlp_exporter_available? do let(:otel_tracer) do require "datadog/opentelemetry" @@ -622,6 +623,31 @@ def self.otel_otlp_exporter_available? expect(t1_sample.labels).to_not include("trace endpoint": anything) end + describe 'accessing the current span' do + before do + allow(Datadog.logger).to receive(:error) + + # initialize otel context reading + sample + # clear samples + recorder.serialize! + end + + it 'does not try to hash the CURRENT_SPAN_KEY' do + inner_check_ran = false + + otel_tracer.in_span("profiler.test") do |_span| + expect(OpenTelemetry::Trace.const_get(:CURRENT_SPAN_KEY)).to_not receive(:hash) + + sample_allocation(weight: 1) + + inner_check_ran = true + end + + expect(inner_check_ran).to be true + end + end + context "when there are multiple otel spans nested" do let(:t1) do Thread.new(ready_queue, otel_tracer) do |ready_queue, otel_tracer| @@ -721,6 +747,7 @@ def self.otel_otlp_exporter_available? end end + # When opentelemetry-sdk AND opentelemetry-exporter-otlp are on the Gemfile context( "when trace comes from otel sdk and the ddtrace otel support is not loaded", if: otel_sdk_available? && otel_otlp_exporter_available? @@ -769,7 +796,7 @@ def otel_span_id_to_i(span_id) expect(t1_sample.labels).to_not include("trace endpoint": anything) end - context 'reading CURRENT_SPAN_KEY' do + describe 'reading CURRENT_SPAN_KEY into otel_current_span_key' do let!(:ran_log) { [] } let(:setup_failure) do @@ -818,6 +845,61 @@ def otel_span_id_to_i(span_id) end end + describe 'accessing the current span' do + before do + allow(OpenTelemetry.logger).to receive(:error) + + # initialize otel context reading + sample + # clear samples + recorder.serialize! + end + + it 'does not try to hash the CURRENT_SPAN_KEY' do + inner_check_ran = false + + otel_tracer.in_span("profiler.test") do |_span| + expect(OpenTelemetry::Trace.const_get(:CURRENT_SPAN_KEY)).to_not receive(:hash) + + sample_allocation(weight: 1) + + inner_check_ran = true + end + + expect(inner_check_ran).to be true + end + + context 'when there are more than MAX_SAFE_LOOKUP_SIZE entries in the otel context' do + let(:max_safe_lookup_size) { 16 } # Value of MAX_SAFE_LOOKUP_SIZE in C code + + it 'does not try to look up the context' do + otel_tracer.in_span("profiler.test") do |_span| + current_size = OpenTelemetry::Context.current.instance_variable_get(:@entries).size + + OpenTelemetry::Context.with_values( + Array.new((max_safe_lookup_size + 1 - current_size)) { |it| ["key_#{it}", it] }.to_h + ) do + sample_allocation(weight: 12) + end + + OpenTelemetry::Context.with_values( + Array.new((max_safe_lookup_size - current_size)) { |it| ["key_#{it}", it] }.to_h + ) do + sample_allocation(weight: 34) + end + end + + result = samples_for_thread(samples, Thread.current) + + expect(result.size).to be 2 + expect(result.find { |it| it.values.fetch(:"alloc-samples") == 12 }.labels.keys) + .to_not include(:"local root span id", :"span id") + expect(result.find { |it| it.values.fetch(:"alloc-samples") == 34 }.labels.keys) + .to include(:"local root span id", :"span id") + end + end + end + context 'when otel_context_enabled is false' do let(:otel_context_enabled) { false } From eeed3a95ab6774dcbfce1929eadf5b860a0670d5 Mon Sep 17 00:00:00 2001 From: Ivo Anjo Date: Thu, 19 Dec 2024 14:42:39 +0000 Subject: [PATCH 03/17] Avoid trying to sample allocations when VM is raising exception During my experiments to reproduce issues around allocation profiling, I've noted that the VM is in an especially delicate state during exception raising, so let's just decline to sample in this situation. --- .../collectors_cpu_and_wall_time_worker.c | 10 ++++++++++ .../private_vm_api_access.c | 3 +++ .../private_vm_api_access.h | 2 ++ 3 files changed, 15 insertions(+) diff --git a/ext/datadog_profiling_native_extension/collectors_cpu_and_wall_time_worker.c b/ext/datadog_profiling_native_extension/collectors_cpu_and_wall_time_worker.c index e4e9b69473e..723e2d93b33 100644 --- a/ext/datadog_profiling_native_extension/collectors_cpu_and_wall_time_worker.c +++ b/ext/datadog_profiling_native_extension/collectors_cpu_and_wall_time_worker.c @@ -1171,6 +1171,16 @@ static void on_newobj_event(DDTRACE_UNUSED VALUE unused1, DDTRACE_UNUSED void *u return; } + // If Ruby is in the middle of raising an exception, we don't want to try to sample. This is because if we accidentally + // trigger an exception inside the profiler code, bad things will happen (specifically, Ruby will try to kill off the + // thread even though we may try to catch the exception). + // + // Note that "in the middle of raising an exception" means the exception itself has already been allocated. + // What's getting allocated now is probably the backtrace objects (@ivoanjo or at least that's what I've observed) + if (is_raised_flag_set(rb_thread_current())) { + return; + } + // Hot path: Dynamic sampling rate is usually enabled and the sampling decision is usually false if (RB_LIKELY(state->dynamic_sampling_rate_enabled && !discrete_dynamic_sampler_should_sample(&state->allocation_sampler))) { state->stats.allocation_skipped++; diff --git a/ext/datadog_profiling_native_extension/private_vm_api_access.c b/ext/datadog_profiling_native_extension/private_vm_api_access.c index 59e5c644882..f7c188861d7 100644 --- a/ext/datadog_profiling_native_extension/private_vm_api_access.c +++ b/ext/datadog_profiling_native_extension/private_vm_api_access.c @@ -800,3 +800,6 @@ static inline int ddtrace_imemo_type(VALUE imemo) { return current_thread; } #endif + +// Is the VM smack in the middle of raising an exception? +bool is_raised_flag_set(VALUE thread) { return thread_struct_from_object(thread)->ec->raised_flag > 0; } diff --git a/ext/datadog_profiling_native_extension/private_vm_api_access.h b/ext/datadog_profiling_native_extension/private_vm_api_access.h index c40992274cb..3e412f51ea5 100644 --- a/ext/datadog_profiling_native_extension/private_vm_api_access.h +++ b/ext/datadog_profiling_native_extension/private_vm_api_access.h @@ -68,3 +68,5 @@ const char *imemo_kind(VALUE imemo); #define ENFORCE_THREAD(value) \ { if (RB_UNLIKELY(!rb_typeddata_is_kind_of(value, RTYPEDDATA_TYPE(rb_thread_current())))) raise_unexpected_type(value, ADD_QUOTES(value), "Thread", __FILE__, __LINE__, __func__); } + +bool is_raised_flag_set(VALUE thread); From 31997bff2d41df72089d19a4d8aa8d4ecc530904 Mon Sep 17 00:00:00 2001 From: Ivo Anjo Date: Thu, 19 Dec 2024 14:51:46 +0000 Subject: [PATCH 04/17] Update tests with new signatures for test methods --- benchmarks/profiler_gc.rb | 6 +++--- benchmarks/profiler_sample_gvl.rb | 2 +- benchmarks/profiler_sample_loop_v2.rb | 2 +- benchmarks/profiler_sample_serialize.rb | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/benchmarks/profiler_gc.rb b/benchmarks/profiler_gc.rb index e96198cdfba..401af152d84 100644 --- a/benchmarks/profiler_gc.rb +++ b/benchmarks/profiler_gc.rb @@ -14,7 +14,7 @@ def create_profiler # We take a dummy sample so that the context for the main thread is created, as otherwise the GC profiling methods do # not create it (because we don't want to do memory allocations in the middle of GC) - Datadog::Profiling::Collectors::ThreadContext::Testing._native_sample(@collector, Thread.current) + Datadog::Profiling::Collectors::ThreadContext::Testing._native_sample(@collector, Thread.current, false) end def run_benchmark @@ -29,7 +29,7 @@ def run_benchmark x.report('profiler gc') do Datadog::Profiling::Collectors::ThreadContext::Testing._native_on_gc_start(@collector) Datadog::Profiling::Collectors::ThreadContext::Testing._native_on_gc_finish(@collector) - Datadog::Profiling::Collectors::ThreadContext::Testing._native_sample_after_gc(@collector, false) + Datadog::Profiling::Collectors::ThreadContext::Testing._native_sample_after_gc(@collector, false, false) end x.save! "#{File.basename(__FILE__)}-results.json" unless VALIDATE_BENCHMARK_MODE @@ -52,7 +52,7 @@ def run_benchmark estimated_gc_per_minute.times do Datadog::Profiling::Collectors::ThreadContext::Testing._native_on_gc_start(@collector) Datadog::Profiling::Collectors::ThreadContext::Testing._native_on_gc_finish(@collector) - Datadog::Profiling::Collectors::ThreadContext::Testing._native_sample_after_gc(@collector, false) + Datadog::Profiling::Collectors::ThreadContext::Testing._native_sample_after_gc(@collector, false, false) end @recorder.serialize diff --git a/benchmarks/profiler_sample_gvl.rb b/benchmarks/profiler_sample_gvl.rb index 76e8528f417..7688730b714 100644 --- a/benchmarks/profiler_sample_gvl.rb +++ b/benchmarks/profiler_sample_gvl.rb @@ -27,7 +27,7 @@ def initialize @target_thread = thread_with_very_deep_stack # Sample once to trigger thread context creation for all threads (including @target_thread) - Datadog::Profiling::Collectors::ThreadContext::Testing._native_sample(@collector, PROFILER_OVERHEAD_STACK_THREAD) + Datadog::Profiling::Collectors::ThreadContext::Testing._native_sample(@collector, PROFILER_OVERHEAD_STACK_THREAD, false) end def create_profiler diff --git a/benchmarks/profiler_sample_loop_v2.rb b/benchmarks/profiler_sample_loop_v2.rb index e007375cf6c..624f29bad25 100644 --- a/benchmarks/profiler_sample_loop_v2.rb +++ b/benchmarks/profiler_sample_loop_v2.rb @@ -37,7 +37,7 @@ def run_benchmark ) x.report("stack collector #{ENV['CONFIG']}") do - Datadog::Profiling::Collectors::ThreadContext::Testing._native_sample(@collector, PROFILER_OVERHEAD_STACK_THREAD) + Datadog::Profiling::Collectors::ThreadContext::Testing._native_sample(@collector, PROFILER_OVERHEAD_STACK_THREAD, false) end x.save! "#{File.basename(__FILE__)}-results.json" unless VALIDATE_BENCHMARK_MODE diff --git a/benchmarks/profiler_sample_serialize.rb b/benchmarks/profiler_sample_serialize.rb index 47ae1b6854a..b90997c42d5 100644 --- a/benchmarks/profiler_sample_serialize.rb +++ b/benchmarks/profiler_sample_serialize.rb @@ -35,7 +35,7 @@ def run_benchmark simulate_seconds = 60 (samples_per_second * simulate_seconds).times do - Datadog::Profiling::Collectors::ThreadContext::Testing._native_sample(@collector, PROFILER_OVERHEAD_STACK_THREAD) + Datadog::Profiling::Collectors::ThreadContext::Testing._native_sample(@collector, PROFILER_OVERHEAD_STACK_THREAD, false) end @recorder.serialize From 54c9d05d576f0a8bde9b35703101ac4644b1619e Mon Sep 17 00:00:00 2001 From: Ivo Anjo Date: Thu, 19 Dec 2024 14:56:37 +0000 Subject: [PATCH 05/17] Check if symbol is static before calling SYM2ID on it It occurs to me that if a symbol is dynamic, we were causing it to become a static symbol (e.g. making it never be able to be garbage collected). This can be very bad! And also, we know the symbol we're looking for must be a static symbol because if nothing else, our initialization caused it to become a static symbol. Thus, if we see a dynamic symbol, we can stop there, since by definition it won't be the symbol we're looking after. This is... really awkward to add a specific unit test for, so I've just relied on our existing test coverage to show that this has not affected the correctness of our otel code. --- .../collectors_thread_context.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ext/datadog_profiling_native_extension/collectors_thread_context.c b/ext/datadog_profiling_native_extension/collectors_thread_context.c index b8e7aedfefb..4afb23c5a9e 100644 --- a/ext/datadog_profiling_native_extension/collectors_thread_context.c +++ b/ext/datadog_profiling_native_extension/collectors_thread_context.c @@ -1750,7 +1750,7 @@ static void otel_without_ddtrace_trace_identifiers_for( VALUE root_span_type = rb_ivar_get(local_root_span.span, at_kind_id /* @kind */); // We filter out spans that don't have `kind: :server` - if (root_span_type == Qnil || !RB_TYPE_P(root_span_type, T_SYMBOL) || SYM2ID(root_span_type) != server_id) return; + if (root_span_type == Qnil || !RB_TYPE_P(root_span_type, T_SYMBOL) || !RB_STATIC_SYM_P(root_span_type) || SYM2ID(root_span_type) != server_id) return; VALUE trace_resource = rb_ivar_get(local_root_span.span, at_name_id /* @name */); if (!RB_TYPE_P(trace_resource, T_STRING)) return; From 7e493241465e0a6e1a817dc0be9b4e5502bb6a9a Mon Sep 17 00:00:00 2001 From: Ivo Anjo Date: Thu, 19 Dec 2024 17:20:49 +0000 Subject: [PATCH 06/17] Document that unsafe api calls checker is only for test code --- .../unsafe_api_calls_check.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/ext/datadog_profiling_native_extension/unsafe_api_calls_check.h b/ext/datadog_profiling_native_extension/unsafe_api_calls_check.h index 092a59f30b3..c6c8dc56cf5 100644 --- a/ext/datadog_profiling_native_extension/unsafe_api_calls_check.h +++ b/ext/datadog_profiling_native_extension/unsafe_api_calls_check.h @@ -21,5 +21,11 @@ // stop the Ruby VM before further damage happens (and hopefully giving us a stack trace clearly pointing to the culprit). void unsafe_api_calls_check_init(void); + +// IMPORTANT: This method **MUST** only be called from test code, as it causes an immediate hard-crash on the Ruby VM +// when it detects a potential issue, and that's not something we want for production apps. +// +// In the future we may introduce some kind of setting (off by default) to also allow this to be safely be used +// in production code if needed. void debug_enter_unsafe_context(void); void debug_leave_unsafe_context(void); From 6fac7c0ed8ac7070268badea7d94bf6a18d0db44 Mon Sep 17 00:00:00 2001 From: Andrey Marchenko Date: Thu, 2 Jan 2025 12:07:29 +0100 Subject: [PATCH 07/17] use Ruby 3.4.1 for test-memcheck GHA --- .github/workflows/test-memory-leaks.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test-memory-leaks.yaml b/.github/workflows/test-memory-leaks.yaml index e1842cb38b6..44d7cb094bf 100644 --- a/.github/workflows/test-memory-leaks.yaml +++ b/.github/workflows/test-memory-leaks.yaml @@ -7,10 +7,10 @@ jobs: - uses: actions/checkout@v4 - uses: ruby/setup-ruby@v1 with: - ruby-version: 3.4.0-preview2 # TODO: Use stable version once 3.4 is out + ruby-version: 3.4.1 bundler-cache: true # runs 'bundle install' and caches installed gems automatically bundler: latest - cache-version: v1 # bump this to invalidate cache + cache-version: v2 # bump this to invalidate cache - run: sudo apt-get update && (sudo apt-get install -y valgrind || sleep 5 && sudo apt-get install -y valgrind) && valgrind --version - run: gem update --system 3.5.23 # TODO: This is a workaround for a buggy rubygems in 3.4.0-preview2; remove once stable version 3.4 is out - run: bundle exec rake compile spec:profiling:memcheck From f30168183204a7dc37f6a608ca1d18973dbd79c0 Mon Sep 17 00:00:00 2001 From: Ivo Anjo Date: Thu, 2 Jan 2025 11:24:52 +0000 Subject: [PATCH 08/17] Update exceptions file with another variant of thread creation memory leak Since our exceptions match on the stack, they are affected by internal naming changes, and it looks like a new `ruby_xcalloc_body` function is now showing up in the stack. --- suppressions/ruby-3.4.supp | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/suppressions/ruby-3.4.supp b/suppressions/ruby-3.4.supp index 646151fa9fb..71af0c40baa 100644 --- a/suppressions/ruby-3.4.supp +++ b/suppressions/ruby-3.4.supp @@ -77,3 +77,19 @@ obj:/usr/bin/tr ... } + +# When a Ruby process forks, it looks like Ruby doesn't clean up the memory of old threads? +{ + ruby-native-thread-memory-4 + Memcheck:Leak + fun:calloc + fun:calloc1 + fun:rb_gc_impl_calloc + fun:ruby_xcalloc_body + fun:ruby_xcalloc + fun:native_thread_alloc + fun:native_thread_create_dedicated + fun:native_thread_create + fun:thread_create_core + ... +} From 849a3221a5d248237a8f5034ff09fa1fdcb78a4d Mon Sep 17 00:00:00 2001 From: Ivo Anjo Date: Thu, 2 Jan 2025 11:33:06 +0000 Subject: [PATCH 09/17] Introduce Ruby 3.5 gemfile variant for testing with dev builds This is waaay incomplete in terms of adding support for Ruby 3.5 but should get us going for ASAN testing for now. --- ruby-3.5.gemfile | 75 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) create mode 100644 ruby-3.5.gemfile diff --git a/ruby-3.5.gemfile b/ruby-3.5.gemfile new file mode 100644 index 00000000000..73960422f46 --- /dev/null +++ b/ruby-3.5.gemfile @@ -0,0 +1,75 @@ +source 'https://rubygems.org' + +gemspec + +gem 'base64' +gem 'benchmark-ips', '~> 2.8' +gem 'benchmark-memory', '< 0.2' # V0.2 only works with 2.5+ +gem 'bigdecimal' +gem 'climate_control', '~> 0.2.0' +gem 'concurrent-ruby' + +# Optional extensions +# TODO: Move this to Appraisals? +# dogstatsd v5, but lower than 5.2, has possible memory leak with datadog. +# @see https://github.com/DataDog/dogstatsd-ruby/issues/182 +gem 'dogstatsd-ruby', '>= 3.3.0', '!= 5.0.0', '!= 5.0.1', '!= 5.1.0' + +gem 'extlz4', '~> 0.3', '>= 0.3.3' + +# Profiler testing dependencies +# NOTE: We're excluding versions 3.7.0 and 3.7.1 for the reasons documented in #1424. +# Since most of our customers won't have BUNDLE_FORCE_RUBY_PLATFORM=true, it's not something we want to add +# to our CI, so we just shortcut and exclude specific versions that were affecting our CI. +gem 'google-protobuf', ['~> 3.0', '!= 3.7.0', '!= 3.7.1'] + +gem 'json-schema', '< 3' # V3 only works with 2.5+ +gem 'memory_profiler', '~> 0.9' +gem 'mutex_m' +gem 'os', '~> 1.1' +gem 'pimpmychangelog', '>= 0.1.2' +gem 'pry' +gem 'pry-stack_explorer' +gem 'rake', '>= 10.5' +gem 'rake-compiler', '~> 1.1', '>= 1.1.1' # To compile native extensions +gem 'rspec', '~> 3.13' +gem 'rspec-collection_matchers', '~> 1.1' +gem 'rspec-wait', '~> 0' +gem 'rspec_junit_formatter', '>= 0.5.1' + +# 1.50 is the last version to support Ruby 2.6 +gem 'rubocop', '~> 1.50.0', require: false +gem 'rubocop-packaging', '~> 0.5.2', require: false +gem 'rubocop-performance', '~> 1.9', require: false +# 2.20 is the last version to support Ruby 2.6 +gem 'rubocop-rspec', ['~> 2.20', '< 2.21'], require: false + +# Merging branch coverage results does not work for old, unsupported rubies and JRuby +# We have a fix up for review, https://github.com/simplecov-ruby/simplecov/pull/972, +# but given it only affects unsupported version of Ruby, it might not get merged. +gem 'simplecov', git: 'https://github.com/DataDog/simplecov', ref: '3bb6b7ee58bf4b1954ca205f50dd44d6f41c57db' +gem 'simplecov-cobertura', '~> 2.1.0' # Used by codecov + +# Ruby 3.4 should be supported by strscan v3.1.1. However, the latest release of strscan is v3.1.0. +# Pin strscan to the latest commit sha. +# +# TODO: Remove once v3.1.1 is released. +gem 'strscan', + git: 'https://github.com/ruby/strscan', + ref: '041b15df4ccc067deabd85fd489b2c15961d0e2f' + +gem 'warning', '~> 1' # NOTE: Used in spec_helper.rb +gem 'webmock', '>= 3.10.0' +gem 'webrick', '>= 1.8.2' + +group :check do + gem 'rbs', '~> 3.7', require: false + gem 'steep', '~> 1', '>= 1.9.1', require: false + gem 'ruby_memcheck', '>= 3' + gem 'standard', require: false +end + +group :dev do + gem 'ruby-lsp', require: false + gem 'appraisal', '~> 2.4.0', require: false +end From 381f6ad6960b66bbc1b878b22c50930f8924685c Mon Sep 17 00:00:00 2001 From: Ivo Anjo Date: Thu, 2 Jan 2025 12:13:07 +0000 Subject: [PATCH 10/17] Update list of files used to compute cache checksum In practice this shouldn't make a difference, since the final lockfiles are supposed to be a superset of the root-level gemfile BUT the `Appraisals` file no longer exists anyway and "just in case" let's have it anyway as it seems more correct. --- .circleci/config.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index ed9caf20e38..6aabbeeb7f8 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -65,7 +65,7 @@ save_bundle_checksum: &save_bundle_checksum command: | if [ "$CI_BUNDLE_CACHE_HIT" != 1 ]; then # Recompute gemfiles/*.lock checksum, as those files might have changed - cat Gemfile Gemfile.lock Appraisals gemfiles/*.gemfile gemfiles/*.gemfile.lock | md5sum > .circleci/bundle_checksum + cat Gemfile Gemfile.lock ruby-*.gemfile gemfiles/*.gemfile gemfiles/*.gemfile.lock | md5sum > .circleci/bundle_checksum fi cp .circleci/bundle_checksum /usr/local/bundle/bundle_checksum step_bundle_install: &step_bundle_install @@ -96,7 +96,7 @@ step_compute_bundle_checksum: &step_compute_bundle_checksum # updating the gemset lock files produces extremely large commits. command: | bundle lock # Create Gemfile.lock - cat Gemfile Gemfile.lock Appraisals gemfiles/*.gemfile gemfiles/*.gemfile.lock | md5sum > .circleci/bundle_checksum + cat Gemfile Gemfile.lock ruby-*.gemfile gemfiles/*.gemfile gemfiles/*.gemfile.lock | md5sum > .circleci/bundle_checksum step_get_test_agent_trace_check_results: &step_get_test_agent_trace_check_results run: name: Get APM Test Agent Trace Check Results From 8ca9ba5ce3ae71db87a4eacb120e3a8562234c74 Mon Sep 17 00:00:00 2001 From: Ivo Anjo Date: Thu, 2 Jan 2025 12:58:17 +0000 Subject: [PATCH 11/17] Bump Ruby 3.4 integration image to stable version --- integration/images/ruby/3.4/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integration/images/ruby/3.4/Dockerfile b/integration/images/ruby/3.4/Dockerfile index 3cc6648d99f..96500f1c668 100644 --- a/integration/images/ruby/3.4/Dockerfile +++ b/integration/images/ruby/3.4/Dockerfile @@ -1,4 +1,4 @@ -FROM ruby:3.4.0-preview2 +FROM ruby:3.4 ENV DEBIAN_FRONTEND=noninteractive From b595d4177973583aa2f11ad9689d6147efeb2248 Mon Sep 17 00:00:00 2001 From: Ivo Anjo Date: Thu, 2 Jan 2025 16:15:19 +0000 Subject: [PATCH 12/17] Enable type checking for AgentSettingsResolver/AgentSettings Steep doesn't seem to be a big fan of Structs so I just went ahead and turned the `AgentSettings` into a regular class that's equivalent to the struct we had before. (In particular, I decided to still keep every field as optional). Ideally this would be a `Data` class, but we're far from dropping support for Rubies that don't have it. --- Steepfile | 1 - .../configuration/agent_settings_resolver.rb | 22 ++++----- .../configuration/agent_settings_resolver.rbs | 46 +++++++++++++------ sig/datadog/core/configuration/ext.rbs | 1 + 4 files changed, 44 insertions(+), 26 deletions(-) diff --git a/Steepfile b/Steepfile index 24d49808692..d2cce9c3ef6 100644 --- a/Steepfile +++ b/Steepfile @@ -80,7 +80,6 @@ target :datadog do ignore 'lib/datadog/core/buffer/thread_safe.rb' ignore 'lib/datadog/core/chunker.rb' ignore 'lib/datadog/core/configuration.rb' - ignore 'lib/datadog/core/configuration/agent_settings_resolver.rb' ignore 'lib/datadog/core/configuration/base.rb' ignore 'lib/datadog/core/configuration/components.rb' ignore 'lib/datadog/core/configuration/dependency_resolver.rb' diff --git a/lib/datadog/core/configuration/agent_settings_resolver.rb b/lib/datadog/core/configuration/agent_settings_resolver.rb index 7d7c048f553..3d407ce1afb 100644 --- a/lib/datadog/core/configuration/agent_settings_resolver.rb +++ b/lib/datadog/core/configuration/agent_settings_resolver.rb @@ -19,17 +19,17 @@ module Configuration # Whenever there is a conflict (different configurations are provided in different orders), it MUST warn the users # about it and pick a value based on the following priority: code > environment variable > defaults. class AgentSettingsResolver - AgentSettings = Struct.new( - :adapter, - :ssl, - :hostname, - :port, - :uds_path, - :timeout_seconds, - keyword_init: true - ) do - def initialize(*) - super + # Immutable container for the resulting settings + class AgentSettings + attr_reader :adapter, :ssl, :hostname, :port, :uds_path, :timeout_seconds + + def initialize(adapter: nil, ssl: nil, hostname: nil, port: nil, uds_path: nil, timeout_seconds: nil) + @adapter = adapter + @ssl = ssl + @hostname = hostname + @port = port + @uds_path = uds_path + @timeout_seconds = timeout_seconds freeze end end diff --git a/sig/datadog/core/configuration/agent_settings_resolver.rbs b/sig/datadog/core/configuration/agent_settings_resolver.rbs index 040a9aa960a..a4ea2830ddb 100644 --- a/sig/datadog/core/configuration/agent_settings_resolver.rbs +++ b/sig/datadog/core/configuration/agent_settings_resolver.rbs @@ -2,9 +2,8 @@ module Datadog module Core module Configuration class AgentSettingsResolver - class AgentSettings < ::Struct[untyped] + class AgentSettings def initialize: (adapter: untyped, ssl: untyped, hostname: untyped, port: untyped, uds_path: untyped, timeout_seconds: untyped) -> void - def merge: (**::Hash[untyped, untyped] member_values) -> AgentSettingsResolver attr_reader adapter: untyped attr_reader ssl: untyped @@ -14,6 +13,17 @@ module Datadog attr_reader timeout_seconds: untyped end + @settings: untyped + @logger: untyped + @configured_hostname: untyped + @configured_port: untyped + @configured_ssl: untyped + @configured_timeout_seconds: untyped + @configured_uds_path: untyped + @uds_fallback: untyped + @mixed_http_and_uds: untyped + @parsed_url: untyped + def self.call: (untyped settings, ?logger: untyped) -> untyped private @@ -38,33 +48,31 @@ module Datadog def configured_uds_path: () -> untyped - def try_parsing_as_integer: (value: untyped, friendly_name: untyped) -> untyped + def parsed_url_ssl?: () -> (nil | untyped) - def try_parsing_as_boolean: (value: untyped, friendly_name: untyped) -> untyped + def try_parsing_as_integer: (value: untyped, friendly_name: untyped) -> untyped - def ssl?: () -> bool + def ssl?: () -> (false | untyped) def hostname: () -> untyped def port: () -> untyped - def uds_path: () -> untyped - def timeout_seconds: () -> untyped - def uds_fallback: () -> untyped + def parsed_url_uds_path: () -> (nil | untyped) - def should_use_uds_fallback?: () -> untyped + def uds_path: () -> (nil | untyped) - def should_use_uds?: () -> bool + def uds_fallback: () -> untyped - def can_use_uds?: () -> bool + def should_use_uds?: () -> untyped - def parsed_url: () -> untyped + def mixed_http_and_uds: () -> untyped - def parsed_url_ssl?: () -> untyped + def can_use_uds?: () -> untyped - def parsed_url_uds_path: () -> untyped + def parsed_url: () -> untyped def pick_from: (*untyped configurations_in_priority_order) -> untyped @@ -72,7 +80,17 @@ module Datadog def log_warning: (untyped message) -> (untyped | nil) + def http_scheme?: (untyped uri) -> untyped + + def parsed_http_url: () -> (untyped | nil) + + def unix_scheme?: (untyped uri) -> untyped + class DetectedConfiguration + @friendly_name: untyped + + @value: untyped + attr_reader friendly_name: untyped attr_reader value: untyped diff --git a/sig/datadog/core/configuration/ext.rbs b/sig/datadog/core/configuration/ext.rbs index 71b7acc0951..77f9eae8265 100644 --- a/sig/datadog/core/configuration/ext.rbs +++ b/sig/datadog/core/configuration/ext.rbs @@ -15,6 +15,7 @@ module Datadog end module Agent + ENV_DEFAULT_HOST: 'DD_AGENT_HOST' ENV_DEFAULT_PORT: 'DD_TRACE_AGENT_PORT' ENV_DEFAULT_URL: 'DD_TRACE_AGENT_URL' ENV_DEFAULT_TIMEOUT_SECONDS: 'DD_TRACE_AGENT_TIMEOUT_SECONDS' From f88e1c1e6f51a5e41cbf47c21f9e59b6585ef1eb Mon Sep 17 00:00:00 2001 From: Ivo Anjo Date: Thu, 2 Jan 2025 16:19:26 +0000 Subject: [PATCH 13/17] Remove workaround for strscan issue This is not expected to be an issue in 3.5 (and is probably fixed for 3.4 as well, but I'll leave that for a separate PR to not affect the appraisals). --- ruby-3.5.gemfile | 8 -------- 1 file changed, 8 deletions(-) diff --git a/ruby-3.5.gemfile b/ruby-3.5.gemfile index 73960422f46..ce83d653fc6 100644 --- a/ruby-3.5.gemfile +++ b/ruby-3.5.gemfile @@ -50,14 +50,6 @@ gem 'rubocop-rspec', ['~> 2.20', '< 2.21'], require: false gem 'simplecov', git: 'https://github.com/DataDog/simplecov', ref: '3bb6b7ee58bf4b1954ca205f50dd44d6f41c57db' gem 'simplecov-cobertura', '~> 2.1.0' # Used by codecov -# Ruby 3.4 should be supported by strscan v3.1.1. However, the latest release of strscan is v3.1.0. -# Pin strscan to the latest commit sha. -# -# TODO: Remove once v3.1.1 is released. -gem 'strscan', - git: 'https://github.com/ruby/strscan', - ref: '041b15df4ccc067deabd85fd489b2c15961d0e2f' - gem 'warning', '~> 1' # NOTE: Used in spec_helper.rb gem 'webmock', '>= 3.10.0' gem 'webrick', '>= 1.8.2' From 6ada623a264d8556414b496050983cf2a286ab4b Mon Sep 17 00:00:00 2001 From: Ivo Anjo Date: Thu, 2 Jan 2025 16:23:53 +0000 Subject: [PATCH 14/17] Move url building behavior from `AgentBaseUrl` to `AgentSettings` This is preparation to also share this behavior with profiling. --- .../configuration/agent_settings_resolver.rb | 18 ++++++ .../core/crashtracking/agent_base_url.rb | 14 +---- .../configuration/agent_settings_resolver.rbs | 7 +++ .../agent_settings_resolver_spec.rb | 57 +++++++++++++++++++ .../core/crashtracking/agent_base_url_spec.rb | 25 ++------ 5 files changed, 88 insertions(+), 33 deletions(-) diff --git a/lib/datadog/core/configuration/agent_settings_resolver.rb b/lib/datadog/core/configuration/agent_settings_resolver.rb index 3d407ce1afb..424f47c331e 100644 --- a/lib/datadog/core/configuration/agent_settings_resolver.rb +++ b/lib/datadog/core/configuration/agent_settings_resolver.rb @@ -32,8 +32,26 @@ def initialize(adapter: nil, ssl: nil, hostname: nil, port: nil, uds_path: nil, @timeout_seconds = timeout_seconds freeze end + + def url + case adapter + when Datadog::Core::Configuration::Ext::Agent::HTTP::ADAPTER + hostname = self.hostname + hostname = "[#{hostname}]" if hostname =~ IPV6_REGEXP + "#{ssl ? 'https' : 'http'}://#{hostname}:#{port}/" + when Datadog::Core::Configuration::Ext::Agent::UnixSocket::ADAPTER + "unix://#{uds_path}" + else + raise ArgumentError, "Unexpected adapter: #{adapter}" + end + end end + # IPv6 regular expression from + # https://stackoverflow.com/questions/53497/regular-expression-that-matches-valid-ipv6-addresses + # Does not match IPv4 addresses. + IPV6_REGEXP = /\A(([0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,7}:|([0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,5}(:[0-9a-fA-F]{1,4}){1,2}|([0-9a-fA-F]{1,4}:){1,4}(:[0-9a-fA-F]{1,4}){1,3}|([0-9a-fA-F]{1,4}:){1,3}(:[0-9a-fA-F]{1,4}){1,4}|([0-9a-fA-F]{1,4}:){1,2}(:[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:((:[0-9a-fA-F]{1,4}){1,6})|:((:[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(:[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(ffff(:0{1,4}){0,1}:){0,1}((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])|([0-9a-fA-F]{1,4}:){1,4}:((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\z)/.freeze # rubocop:disable Layout/LineLength + def self.call(settings, logger: Datadog.logger) new(settings, logger: logger).send(:call) end diff --git a/lib/datadog/core/crashtracking/agent_base_url.rb b/lib/datadog/core/crashtracking/agent_base_url.rb index 74b59a1cda5..20a9a55129b 100644 --- a/lib/datadog/core/crashtracking/agent_base_url.rb +++ b/lib/datadog/core/crashtracking/agent_base_url.rb @@ -7,20 +7,8 @@ module Core module Crashtracking # This module provides a method to resolve the base URL of the agent module AgentBaseUrl - # IPv6 regular expression from - # https://stackoverflow.com/questions/53497/regular-expression-that-matches-valid-ipv6-addresses - # Does not match IPv4 addresses. - IPV6_REGEXP = /\A(([0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,7}:|([0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,5}(:[0-9a-fA-F]{1,4}){1,2}|([0-9a-fA-F]{1,4}:){1,4}(:[0-9a-fA-F]{1,4}){1,3}|([0-9a-fA-F]{1,4}:){1,3}(:[0-9a-fA-F]{1,4}){1,4}|([0-9a-fA-F]{1,4}:){1,2}(:[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:((:[0-9a-fA-F]{1,4}){1,6})|:((:[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(:[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(ffff(:0{1,4}){0,1}:){0,1}((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])|([0-9a-fA-F]{1,4}:){1,4}:((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\z)/.freeze # rubocop:disable Layout/LineLength - def self.resolve(agent_settings) - case agent_settings.adapter - when Datadog::Core::Configuration::Ext::Agent::HTTP::ADAPTER - hostname = agent_settings.hostname - hostname = "[#{hostname}]" if hostname =~ IPV6_REGEXP - "#{agent_settings.ssl ? 'https' : 'http'}://#{hostname}:#{agent_settings.port}/" - when Datadog::Core::Configuration::Ext::Agent::UnixSocket::ADAPTER - "unix://#{agent_settings.uds_path}" - end + agent_settings.url end end end diff --git a/sig/datadog/core/configuration/agent_settings_resolver.rbs b/sig/datadog/core/configuration/agent_settings_resolver.rbs index a4ea2830ddb..e3a2472abf4 100644 --- a/sig/datadog/core/configuration/agent_settings_resolver.rbs +++ b/sig/datadog/core/configuration/agent_settings_resolver.rbs @@ -11,6 +11,8 @@ module Datadog attr_reader port: untyped attr_reader uds_path: untyped attr_reader timeout_seconds: untyped + + def url: () -> ::String end @settings: untyped @@ -24,6 +26,11 @@ module Datadog @mixed_http_and_uds: untyped @parsed_url: untyped + # IPv6 regular expression from + # https://stackoverflow.com/questions/53497/regular-expression-that-matches-valid-ipv6-addresses + # Does not match IPv4 addresses. + IPV6_REGEXP: ::Regexp + def self.call: (untyped settings, ?logger: untyped) -> untyped private diff --git a/spec/datadog/core/configuration/agent_settings_resolver_spec.rb b/spec/datadog/core/configuration/agent_settings_resolver_spec.rb index ef2a6766ea1..c057e520739 100644 --- a/spec/datadog/core/configuration/agent_settings_resolver_spec.rb +++ b/spec/datadog/core/configuration/agent_settings_resolver_spec.rb @@ -803,4 +803,61 @@ end end end + + describe 'url' do + context 'when using HTTP adapter' do + before do + datadog_settings.agent.host = 'example.com' + datadog_settings.agent.port = 8080 + end + + context 'when SSL is enabled' do + before { datadog_settings.agent.use_ssl = true } + + it 'returns the correct base URL' do + expect(resolver.url).to eq('https://example.com:8080/') + end + end + + context 'when SSL is disabled' do + before { datadog_settings.agent.use_ssl = false } + + it 'returns the correct base URL' do + expect(resolver.url).to eq('http://example.com:8080/') + end + end + + context 'when hostname is an IPv4 address' do + before { datadog_settings.agent.host = '1.2.3.4' } + + it 'returns the correct base URL' do + expect(resolver.url).to eq('http://1.2.3.4:8080/') + end + end + + context 'when hostname is an IPv6 address' do + before { datadog_settings.agent.host = '1234:1234::1' } + + it 'returns the correct base URL' do + expect(resolver.url).to eq('http://[1234:1234::1]:8080/') + end + end + end + + context 'when using UnixSocket adapter' do + before { datadog_settings.agent.uds_path = '/var/run/datadog.sock' } + + it 'returns the correct base URL' do + expect(resolver.url).to eq('unix:///var/run/datadog.sock') + end + end + + context 'when using an unknown adapter' do + it 'raises an exception' do + agent_settings = Datadog::Core::Configuration::AgentSettingsResolver::AgentSettings.new(adapter: :unknown) + + expect { agent_settings.url }.to raise_error(ArgumentError, /Unexpected adapter/) + end + end + end end diff --git a/spec/datadog/core/crashtracking/agent_base_url_spec.rb b/spec/datadog/core/crashtracking/agent_base_url_spec.rb index 407b74daf26..4a8145208cf 100644 --- a/spec/datadog/core/crashtracking/agent_base_url_spec.rb +++ b/spec/datadog/core/crashtracking/agent_base_url_spec.rb @@ -8,8 +8,7 @@ context 'when using HTTP adapter' do context 'when SSL is enabled' do let(:agent_settings) do - instance_double( - Datadog::Core::Configuration::AgentSettingsResolver::AgentSettings, + Datadog::Core::Configuration::AgentSettingsResolver::AgentSettings.new( adapter: Datadog::Core::Configuration::Ext::Agent::HTTP::ADAPTER, ssl: true, hostname: 'example.com', @@ -24,8 +23,7 @@ context 'when SSL is disabled' do let(:agent_settings) do - instance_double( - Datadog::Core::Configuration::AgentSettingsResolver::AgentSettings, + Datadog::Core::Configuration::AgentSettingsResolver::AgentSettings.new( adapter: Datadog::Core::Configuration::Ext::Agent::HTTP::ADAPTER, ssl: false, hostname: 'example.com', @@ -40,8 +38,7 @@ context 'when hostname is an IPv4 address' do let(:agent_settings) do - instance_double( - Datadog::Core::Configuration::AgentSettingsResolver::AgentSettings, + Datadog::Core::Configuration::AgentSettingsResolver::AgentSettings.new( adapter: Datadog::Core::Configuration::Ext::Agent::HTTP::ADAPTER, ssl: false, hostname: '1.2.3.4', @@ -56,8 +53,7 @@ context 'when hostname is an IPv6 address' do let(:agent_settings) do - instance_double( - Datadog::Core::Configuration::AgentSettingsResolver::AgentSettings, + Datadog::Core::Configuration::AgentSettingsResolver::AgentSettings.new( adapter: Datadog::Core::Configuration::Ext::Agent::HTTP::ADAPTER, ssl: false, hostname: '1234:1234::1', @@ -73,8 +69,7 @@ context 'when using UnixSocket adapter' do let(:agent_settings) do - instance_double( - Datadog::Core::Configuration::AgentSettingsResolver::AgentSettings, + Datadog::Core::Configuration::AgentSettingsResolver::AgentSettings.new( adapter: Datadog::Core::Configuration::Ext::Agent::UnixSocket::ADAPTER, uds_path: '/var/run/datadog.sock' ) @@ -84,15 +79,5 @@ expect(described_class.resolve(agent_settings)).to eq('unix:///var/run/datadog.sock') end end - - context 'when using unknownm adapter' do - let(:agent_settings) do - instance_double(Datadog::Core::Configuration::AgentSettingsResolver::AgentSettings, adapter: 'unknown') - end - - it 'returns nil' do - expect(described_class.resolve(agent_settings)).to be_nil - end - end end end From 65c92d284f8eb34039d515746c9894c21a403f76 Mon Sep 17 00:00:00 2001 From: Ivo Anjo Date: Thu, 2 Jan 2025 17:28:45 +0000 Subject: [PATCH 15/17] Refactor crashtracking to use `AgentSettings#url` The behavior from the old `AgentBaseUrl` is now contained in `AgentSettings` so we can clean up the extra logic. --- .../core/crashtracking/agent_base_url.rb | 16 ---- lib/datadog/core/crashtracking/component.rb | 4 +- .../core/crashtracking/agent_base_url.rbs | 10 --- .../core/crashtracking/agent_base_url_spec.rb | 83 ------------------- .../core/crashtracking/component_spec.rb | 34 ++------ 5 files changed, 8 insertions(+), 139 deletions(-) delete mode 100644 lib/datadog/core/crashtracking/agent_base_url.rb delete mode 100644 sig/datadog/core/crashtracking/agent_base_url.rbs delete mode 100644 spec/datadog/core/crashtracking/agent_base_url_spec.rb diff --git a/lib/datadog/core/crashtracking/agent_base_url.rb b/lib/datadog/core/crashtracking/agent_base_url.rb deleted file mode 100644 index 20a9a55129b..00000000000 --- a/lib/datadog/core/crashtracking/agent_base_url.rb +++ /dev/null @@ -1,16 +0,0 @@ -# frozen_string_literal: true - -require_relative '../configuration/ext' - -module Datadog - module Core - module Crashtracking - # This module provides a method to resolve the base URL of the agent - module AgentBaseUrl - def self.resolve(agent_settings) - agent_settings.url - end - end - end - end -end diff --git a/lib/datadog/core/crashtracking/component.rb b/lib/datadog/core/crashtracking/component.rb index ee04e1c5cc8..460a7974bc5 100644 --- a/lib/datadog/core/crashtracking/component.rb +++ b/lib/datadog/core/crashtracking/component.rb @@ -3,7 +3,6 @@ require 'libdatadog' require_relative 'tag_builder' -require_relative 'agent_base_url' require_relative '../utils/only_once' require_relative '../utils/at_fork_monkey_patch' @@ -31,8 +30,7 @@ class Component def self.build(settings, agent_settings, logger:) tags = TagBuilder.call(settings) - agent_base_url = AgentBaseUrl.resolve(agent_settings) - logger.warn('Missing agent base URL; cannot enable crash tracking') unless agent_base_url + agent_base_url = agent_settings.url ld_library_path = ::Libdatadog.ld_library_path logger.warn('Missing ld_library_path; cannot enable crash tracking') unless ld_library_path diff --git a/sig/datadog/core/crashtracking/agent_base_url.rbs b/sig/datadog/core/crashtracking/agent_base_url.rbs deleted file mode 100644 index 3a2c77f8e85..00000000000 --- a/sig/datadog/core/crashtracking/agent_base_url.rbs +++ /dev/null @@ -1,10 +0,0 @@ -module Datadog - module Core - module Crashtracking - module AgentBaseUrl - IPV6_REGEXP: Regexp - def self.resolve: (Datadog::Core::Configuration::AgentSettingsResolver::AgentSettings) -> ::String? - end - end - end -end diff --git a/spec/datadog/core/crashtracking/agent_base_url_spec.rb b/spec/datadog/core/crashtracking/agent_base_url_spec.rb deleted file mode 100644 index 4a8145208cf..00000000000 --- a/spec/datadog/core/crashtracking/agent_base_url_spec.rb +++ /dev/null @@ -1,83 +0,0 @@ -# frozen_string_literal: true - -require 'spec_helper' -require 'datadog/core/crashtracking/agent_base_url' - -RSpec.describe Datadog::Core::Crashtracking::AgentBaseUrl do - describe '.resolve' do - context 'when using HTTP adapter' do - context 'when SSL is enabled' do - let(:agent_settings) do - Datadog::Core::Configuration::AgentSettingsResolver::AgentSettings.new( - adapter: Datadog::Core::Configuration::Ext::Agent::HTTP::ADAPTER, - ssl: true, - hostname: 'example.com', - port: 8080 - ) - end - - it 'returns the correct base URL' do - expect(described_class.resolve(agent_settings)).to eq('https://example.com:8080/') - end - end - - context 'when SSL is disabled' do - let(:agent_settings) do - Datadog::Core::Configuration::AgentSettingsResolver::AgentSettings.new( - adapter: Datadog::Core::Configuration::Ext::Agent::HTTP::ADAPTER, - ssl: false, - hostname: 'example.com', - port: 8080 - ) - end - - it 'returns the correct base URL' do - expect(described_class.resolve(agent_settings)).to eq('http://example.com:8080/') - end - end - - context 'when hostname is an IPv4 address' do - let(:agent_settings) do - Datadog::Core::Configuration::AgentSettingsResolver::AgentSettings.new( - adapter: Datadog::Core::Configuration::Ext::Agent::HTTP::ADAPTER, - ssl: false, - hostname: '1.2.3.4', - port: 8080 - ) - end - - it 'returns the correct base URL' do - expect(described_class.resolve(agent_settings)).to eq('http://1.2.3.4:8080/') - end - end - - context 'when hostname is an IPv6 address' do - let(:agent_settings) do - Datadog::Core::Configuration::AgentSettingsResolver::AgentSettings.new( - adapter: Datadog::Core::Configuration::Ext::Agent::HTTP::ADAPTER, - ssl: false, - hostname: '1234:1234::1', - port: 8080 - ) - end - - it 'returns the correct base URL' do - expect(described_class.resolve(agent_settings)).to eq('http://[1234:1234::1]:8080/') - end - end - end - - context 'when using UnixSocket adapter' do - let(:agent_settings) do - Datadog::Core::Configuration::AgentSettingsResolver::AgentSettings.new( - adapter: Datadog::Core::Configuration::Ext::Agent::UnixSocket::ADAPTER, - uds_path: '/var/run/datadog.sock' - ) - end - - it 'returns the correct base URL' do - expect(described_class.resolve(agent_settings)).to eq('unix:///var/run/datadog.sock') - end - end - end -end diff --git a/spec/datadog/core/crashtracking/component_spec.rb b/spec/datadog/core/crashtracking/component_spec.rb index 034bd5f441e..fcbef942d43 100644 --- a/spec/datadog/core/crashtracking/component_spec.rb +++ b/spec/datadog/core/crashtracking/component_spec.rb @@ -9,7 +9,9 @@ describe '.build' do let(:settings) { Datadog::Core::Configuration::Settings.new } - let(:agent_settings) { double('agent_settings') } + let(:agent_settings) do + instance_double(Datadog::Core::Configuration::AgentSettingsResolver::AgentSettings) + end let(:tags) { { 'tag1' => 'value1' } } let(:agent_base_url) { 'agent_base_url' } let(:ld_library_path) { 'ld_library_path' } @@ -19,8 +21,7 @@ it 'creates a new instance of Component and starts it' do expect(Datadog::Core::Crashtracking::TagBuilder).to receive(:call).with(settings) .and_return(tags) - expect(Datadog::Core::Crashtracking::AgentBaseUrl).to receive(:resolve).with(agent_settings) - .and_return(agent_base_url) + expect(agent_settings).to receive(:url).and_return(agent_base_url) expect(::Libdatadog).to receive(:ld_library_path) .and_return(ld_library_path) expect(::Libdatadog).to receive(:path_to_crashtracking_receiver_binary) @@ -42,32 +43,13 @@ end end - context 'when missing `agent_base_url`' do - let(:agent_base_url) { nil } - - it 'returns nil' do - expect(Datadog::Core::Crashtracking::TagBuilder).to receive(:call).with(settings) - .and_return(tags) - expect(Datadog::Core::Crashtracking::AgentBaseUrl).to receive(:resolve).with(agent_settings) - .and_return(agent_base_url) - expect(::Libdatadog).to receive(:ld_library_path) - .and_return(ld_library_path) - expect(::Libdatadog).to receive(:path_to_crashtracking_receiver_binary) - .and_return(path_to_crashtracking_receiver_binary) - expect(logger).to receive(:warn).with(/cannot enable crash tracking/) - - expect(described_class.build(settings, agent_settings, logger: logger)).to be_nil - end - end - context 'when missing `ld_library_path`' do let(:ld_library_path) { nil } it 'returns nil' do expect(Datadog::Core::Crashtracking::TagBuilder).to receive(:call).with(settings) .and_return(tags) - expect(Datadog::Core::Crashtracking::AgentBaseUrl).to receive(:resolve).with(agent_settings) - .and_return(agent_base_url) + expect(agent_settings).to receive(:url).and_return(agent_base_url) expect(::Libdatadog).to receive(:ld_library_path) .and_return(ld_library_path) expect(::Libdatadog).to receive(:path_to_crashtracking_receiver_binary) @@ -84,8 +66,7 @@ it 'returns nil' do expect(Datadog::Core::Crashtracking::TagBuilder).to receive(:call).with(settings) .and_return(tags) - expect(Datadog::Core::Crashtracking::AgentBaseUrl).to receive(:resolve).with(agent_settings) - .and_return(agent_base_url) + expect(agent_settings).to receive(:url).and_return(agent_base_url) expect(::Libdatadog).to receive(:ld_library_path) .and_return(ld_library_path) expect(::Libdatadog).to receive(:path_to_crashtracking_receiver_binary) @@ -102,8 +83,7 @@ it 'returns an instance of Component that failed to start' do expect(Datadog::Core::Crashtracking::TagBuilder).to receive(:call).with(settings) .and_return(tags) - expect(Datadog::Core::Crashtracking::AgentBaseUrl).to receive(:resolve).with(agent_settings) - .and_return(agent_base_url) + expect(agent_settings).to receive(:url).and_return(agent_base_url) expect(::Libdatadog).to receive(:ld_library_path) .and_return(ld_library_path) expect(::Libdatadog).to receive(:path_to_crashtracking_receiver_binary) From e4f5e19d9007907c3c17e6f2c5bb871a950e6288 Mon Sep 17 00:00:00 2001 From: Ivo Anjo Date: Thu, 2 Jan 2025 17:32:49 +0000 Subject: [PATCH 16/17] [PROF-11078] Fix profiling exception when agent url is an ipv6 address **What does this PR do?** This PR builds atop #4237 and fixes a similar-ish issue in the profiler caused by the same mishandling of ipv6 addresses. In particular, when provided with an ipv6 address in the agent url, the profiler would fail with an exception: ``` $ env DD_AGENT_HOST=2001:db8:1::2 DD_PROFILING_ENABLED=true \ bundle exec ddprofrb exec ruby -e "sleep 2" dd-trace-rb/lib/datadog/profiling/http_transport.rb:27:in `initialize': Failed to initialize transport: invalid authority (ArgumentError) ``` **Motivation:** Luckily we didn't have any customers using this, as it fails immediately and loudly, but it's still a bug on a configuration that should be supported. **Additional Notes:** Since we had similar buggy logic copy-pasted in crashtracking and profiling (crashtracking had been fixed in #4237) I chose to extract out the relevant logic into the `AgentSettings` class, so that both can reuse it. **How to test the change?** I've added unit test coverage for this issue to profiling, and the snippet above can be used to end-to-end test it's working fine. Here's how it looks on my machine now: ``` E, [2025-01-02T17:32:32.398756 #359317] ERROR -- datadog: [datadog] (dd-trace-rb/lib/datadog/profiling/http_transport.rb:68:in `export') Failed to report profiling data (agent: http://[2001:db8:1::2]:8126/): failed ddog_prof_Exporter_send: error trying to connect: tcp connect error: Network is unreachable (os error 101): tcp connect error: Network is unreachable (os error 101): Network is unreachable (os error 101) ``` E.g. we correctly try to connect to the dummy address, and fail :) (Note: The error message is a bit ugly AND repeats itself a bit. That's being tracked separately in https://github.com/DataDog/libdatadog/issues/283 ) --- lib/datadog/profiling/http_transport.rb | 27 +------------------ sig/datadog/profiling/http_transport.rbs | 4 --- spec/datadog/profiling/http_transport_spec.rb | 13 +++++++++ 3 files changed, 14 insertions(+), 30 deletions(-) diff --git a/lib/datadog/profiling/http_transport.rb b/lib/datadog/profiling/http_transport.rb index 2c89c6548b7..9de76899494 100644 --- a/lib/datadog/profiling/http_transport.rb +++ b/lib/datadog/profiling/http_transport.rb @@ -13,13 +13,11 @@ class HttpTransport def initialize(agent_settings:, site:, api_key:, upload_timeout_seconds:) @upload_timeout_milliseconds = (upload_timeout_seconds * 1_000).to_i - validate_agent_settings(agent_settings) - @exporter_configuration = if agentless?(site, api_key) [:agentless, site, api_key].freeze else - [:agent, base_url_from(agent_settings)].freeze + [:agent, agent_settings.url].freeze end status, result = validate_exporter(exporter_configuration) @@ -75,29 +73,6 @@ def export(flush) private - def base_url_from(agent_settings) - case agent_settings.adapter - when Datadog::Core::Configuration::Ext::Agent::HTTP::ADAPTER - "#{agent_settings.ssl ? "https" : "http"}://#{agent_settings.hostname}:#{agent_settings.port}/" - when Datadog::Core::Configuration::Ext::Agent::UnixSocket::ADAPTER - "unix://#{agent_settings.uds_path}" - else - raise ArgumentError, "Unexpected adapter: #{agent_settings.adapter}" - end - end - - def validate_agent_settings(agent_settings) - supported_adapters = [ - Datadog::Core::Configuration::Ext::Agent::UnixSocket::ADAPTER, - Datadog::Core::Configuration::Ext::Agent::HTTP::ADAPTER - ] - unless supported_adapters.include?(agent_settings.adapter) - raise ArgumentError, - "Unsupported transport configuration for profiling: Adapter #{agent_settings.adapter} " \ - " is not supported" - end - end - def agentless?(site, api_key) site && api_key && Core::Environment::VariableHelpers.env_to_bool(Profiling::Ext::ENV_AGENTLESS, false) end diff --git a/sig/datadog/profiling/http_transport.rbs b/sig/datadog/profiling/http_transport.rbs index 8c58ea8180e..af663a225b7 100644 --- a/sig/datadog/profiling/http_transport.rbs +++ b/sig/datadog/profiling/http_transport.rbs @@ -19,10 +19,6 @@ module Datadog private - def base_url_from: (Datadog::Core::Configuration::AgentSettingsResolver::AgentSettings agent_settings) -> ::String - - def validate_agent_settings: (Datadog::Core::Configuration::AgentSettingsResolver::AgentSettings agent_settings) -> void - def agentless?: (::String? site, ::String? api_key) -> bool def validate_exporter: (exporter_configuration_array exporter_configuration) -> [:ok | :error, ::String?] diff --git a/spec/datadog/profiling/http_transport_spec.rb b/spec/datadog/profiling/http_transport_spec.rb index 44423984fab..0cd98ba09e3 100644 --- a/spec/datadog/profiling/http_transport_spec.rb +++ b/spec/datadog/profiling/http_transport_spec.rb @@ -127,6 +127,19 @@ http_transport end end + + context "when hostname is an ipv6 address" do + let(:hostname) { "1234:1234::1" } + + it "provides the correct ipv6 address-safe url to the exporter" do + expect(described_class) + .to receive(:_native_validate_exporter) + .with([:agent, "http://[1234:1234::1]:12345/"]) + .and_return([:ok, nil]) + + http_transport + end + end end context "when additionally site and api_key are provided" do From 8aba599f784aefd64a1a7bfbec3d315820bbe285 Mon Sep 17 00:00:00 2001 From: Ivo Anjo Date: Thu, 2 Jan 2025 17:56:59 +0000 Subject: [PATCH 17/17] Implement `==` for new `AgentSettings` class Forgot this one, some of our tests relied on it! --- .../core/configuration/agent_settings_resolver.rb | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/lib/datadog/core/configuration/agent_settings_resolver.rb b/lib/datadog/core/configuration/agent_settings_resolver.rb index 424f47c331e..5ced057274a 100644 --- a/lib/datadog/core/configuration/agent_settings_resolver.rb +++ b/lib/datadog/core/configuration/agent_settings_resolver.rb @@ -45,6 +45,16 @@ def url raise ArgumentError, "Unexpected adapter: #{adapter}" end end + + def ==(other) + self.class == other.class && + adapter == other.adapter && + ssl == other.ssl && + hostname == other.hostname && + port == other.port && + uds_path == other.uds_path && + timeout_seconds == other.timeout_seconds + end end # IPv6 regular expression from