From 285037269a420c0d73c6aad39426b8a8b59aeae3 Mon Sep 17 00:00:00 2001 From: Anthony Shull Date: Tue, 7 May 2024 12:39:57 -0500 Subject: [PATCH] nebulex and req stats being sent --- lib/dotcom/application.ex | 23 +--- lib/dotcom/cache/telemetry.ex | 44 +++--- lib/dotcom/cache/telemetry/reporter.ex | 126 ------------------ lib/dotcom/stats.ex | 0 lib/dotcom/telemetry.ex | 0 lib/dotcom_web/stats.ex | 0 lib/dotcom_web/telemetry.ex | 0 .../behaviours => lib}/req/behaviour.ex | 0 lib/{mbta/api => req}/stats.ex | 43 +++--- lib/req/telemetry.ex | 47 +++++++ mix.exs | 1 - 11 files changed, 86 insertions(+), 198 deletions(-) delete mode 100644 lib/dotcom/cache/telemetry/reporter.ex create mode 100644 lib/dotcom/stats.ex create mode 100644 lib/dotcom/telemetry.ex create mode 100644 lib/dotcom_web/stats.ex create mode 100644 lib/dotcom_web/telemetry.ex rename {test/support/behaviours => lib}/req/behaviour.ex (100%) rename lib/{mbta/api => req}/stats.ex (53%) create mode 100644 lib/req/telemetry.ex diff --git a/lib/dotcom/application.ex b/lib/dotcom/application.ex index 734d55ab5b..ce022aae94 100644 --- a/lib/dotcom/application.ex +++ b/lib/dotcom/application.ex @@ -13,8 +13,6 @@ defmodule Dotcom.Application do # See http://elixir-lang.org/docs/stable/elixir/Application.html # for more information on OTP Applications def start(_type, _args) do - telemetry_metrics_splunk_config = Application.get_env(:dotcom, :telemetry_metrics_splunk) - Application.put_env( :dotcom, :allow_indexing, @@ -31,26 +29,7 @@ defmodule Dotcom.Application do children = [ {Application.get_env(:dotcom, :cache, Dotcom.Cache.Multilevel), []}, - { - TelemetryMetricsSplunk, - [ - metrics: [ - Metrics.last_value("mbta_api.request.count"), - Metrics.last_value("mbta_api.request.avg") - ], - token: telemetry_metrics_splunk_config[:token], - url: telemetry_metrics_splunk_config[:url] - ] - }, - {MBTA.Api.Stats, %{}}, - { - :telemetry_poller, - measurements: [ - {MBTA.Api.Stats, :dispatch_stats, []} - ], - period: :timer.seconds(5), - init_delay: :timer.seconds(5) - } + {Req.Telemetry, []} ] ++ if Application.get_env(:dotcom, :env) != :test do [ diff --git a/lib/dotcom/cache/telemetry.ex b/lib/dotcom/cache/telemetry.ex index f9ad31f9e0..9e296d1211 100644 --- a/lib/dotcom/cache/telemetry.ex +++ b/lib/dotcom/cache/telemetry.ex @@ -1,16 +1,8 @@ defmodule Dotcom.Cache.Telemetry do @moduledoc """ - This supervisor establishes a connection between the telemetry_poller and our telemetry reporters. + This supervisor establishes a connection between the telemetry_poller and the `TelemetryMetricsSplunk` reporter. Cache stats are emitted by every level of `Dotcom.Cache.Multilevel`. We poll for them every minute. - - Currently, they are passed to two reporters: - - The Statsd reporter will eventually be hooked up to Splunk metrics. - For now, it does no harm to emit them even though nothing is listening. - - The custom reporter logs in a format that can be picked up in Splunk logs. - Eventually, this should be removed. """ use Supervisor @@ -22,27 +14,35 @@ defmodule Dotcom.Cache.Telemetry do end def init(_arg) do + telemetry_metrics_splunk_config = Application.get_env(:dotcom, :telemetry_metrics_splunk) + children = [ { - :telemetry_poller, - measurements: periodic_measurements(), period: 60_000, init_delay: 5_000 + TelemetryMetricsSplunk, + [ + metrics: metrics(), + token: telemetry_metrics_splunk_config[:token], + url: telemetry_metrics_splunk_config[:url] + ] }, - {Dotcom.Cache.Telemetry.Reporter, metrics: reporter_metrics()}, - {TelemetryMetricsStatsd, metrics: statsd_metrics()} + { + :telemetry_poller, + measurements: measurements(), period: :timer.seconds(60), init_delay: :timer.seconds(5) + } ] Supervisor.init(children, strategy: :one_for_one) end - defp reporter_metrics do + defp measurements do [ - Metrics.last_value("dotcom.cache.multilevel.l1.stats.updates"), - Metrics.last_value("dotcom.cache.multilevel.l2.stats.updates"), - Metrics.last_value("dotcom.cache.multilevel.l3.stats.updates") + {Dotcom.Cache.Multilevel.Local, :dispatch_stats, []}, + {Dotcom.Cache.Multilevel.Publisher, :dispatch_stats, []}, + {Dotcom.Cache.Multilevel.Redis, :dispatch_stats, []} ] end - defp statsd_metrics do + defp metrics do [ Metrics.last_value("dotcom.cache.multilevel.l1.stats.hits"), Metrics.last_value("dotcom.cache.multilevel.l1.stats.misses"), @@ -51,12 +51,4 @@ defmodule Dotcom.Cache.Telemetry do Metrics.last_value("dotcom.cache.multilevel.l3.stats.evictions") ] end - - defp periodic_measurements do - [ - {Dotcom.Cache.Multilevel.Local, :dispatch_stats, []}, - {Dotcom.Cache.Multilevel.Publisher, :dispatch_stats, []}, - {Dotcom.Cache.Multilevel.Redis, :dispatch_stats, []} - ] - end end diff --git a/lib/dotcom/cache/telemetry/reporter.ex b/lib/dotcom/cache/telemetry/reporter.ex deleted file mode 100644 index bf5ff6ea79..0000000000 --- a/lib/dotcom/cache/telemetry/reporter.ex +++ /dev/null @@ -1,126 +0,0 @@ -defmodule Dotcom.Cache.Telemetry.Reporter do - @moduledoc """ - This custom Telemetry Reporter logs hit rate information for `Dotom.Cache.Multilevel`. - It also logs exceptions for `Redix.ConnectionError`, `Redix.Error`, and `ArgumentError`. - And, finally, it logs evictions for `Dotcom.Cache.Multilevel.Publisher`. - - See https://blog.miguelcoba.com/telemetry-and-metrics-in-elixir#heading-customreporter for more on writing custom reporters. - """ - - use GenServer - - require Logger - - alias Telemetry.Metrics - - def start_link(metrics: metrics) do - GenServer.start_link(__MODULE__, metrics) - end - - @impl true - def init(metrics) do - Process.flag(:trap_exit, true) - - groups = Enum.group_by(metrics, & &1.event_name) - - for {event, metrics} <- groups do - :telemetry.attach({__MODULE__, event, self()}, event, &__MODULE__.handle_event/4, metrics) - end - - :telemetry.attach( - "cache-command-exception", - [:dotcom, :cache, :multilevel, :l2, :command, :exception], - &__MODULE__.handle_exception/4, - nil - ) - - {:ok, Map.keys(groups)} - end - - @impl true - def terminate(_, events) do - for event <- events do - :telemetry.detach({__MODULE__, event, self()}) - end - - :ok - end - - def handle_event(_event_name, measurements, metadata, metrics) do - metrics - |> Enum.map(&handle_metric(&1, measurements, metadata)) - end - - def handle_exception( - event_name, - _measurements, - %{kind: kind, reason: %Redix.ConnectionError{reason: reason}}, - _config - ) do - name = event_name(event_name) - - Logger.warning("#{name} kind=#{kind} reason=#{reason}") - end - - def handle_exception( - event_name, - _measurements, - %{kind: kind, reason: %Redix.Error{message: message}}, - _config - ) do - name = event_name(event_name) - - Logger.warning("#{name} kind=#{kind} reason=#{message}") - end - - def handle_exception( - event_name, - _measurements, - %{kind: kind, reason: %ArgumentError{message: message}}, - _config - ) do - name = event_name(event_name) - - Logger.warning("#{name} kind=#{kind} reason=#{message}") - end - - defp handle_metric(%Metrics.LastValue{}, %{evictions: evictions}, %{ - cache: Dotcom.Cache.Multilevel.Publisher - }) do - name = module_name(Dotcom.Cache.Multilevel.Publisher) - - Logger.notice("#{name}.stats evictions=#{evictions}") - end - - defp handle_metric(%Metrics.LastValue{}, %{hits: hits, misses: misses}, metadata) do - name = module_name(metadata.cache) - - total = hits + misses - rate = if total > 0, do: Float.ceil(hits / total, 4), else: 0 - - if rate > 0 do - Logger.notice("#{name}.stats hits=#{hits} misses=#{misses} total=#{total} hit_rate=#{rate}") - end - end - - defp handle_metric(metric, _measurements, metadata) do - name = module_name(metadata.cache) - - Logger.warning("#{name}.unsupported_metric metric=#{metric.__struct__}") - end - - defp event_name(event) do - event - |> Enum.map_join(".", &Atom.to_string/1) - |> String.downcase() - end - - defp module_name(module) do - module - |> Kernel.to_string() - |> String.split(".") - |> (fn [_ | tail] -> tail end).() - |> Enum.join(".") - |> String.downcase() - end -end diff --git a/lib/dotcom/stats.ex b/lib/dotcom/stats.ex new file mode 100644 index 0000000000..e69de29bb2 diff --git a/lib/dotcom/telemetry.ex b/lib/dotcom/telemetry.ex new file mode 100644 index 0000000000..e69de29bb2 diff --git a/lib/dotcom_web/stats.ex b/lib/dotcom_web/stats.ex new file mode 100644 index 0000000000..e69de29bb2 diff --git a/lib/dotcom_web/telemetry.ex b/lib/dotcom_web/telemetry.ex new file mode 100644 index 0000000000..e69de29bb2 diff --git a/test/support/behaviours/req/behaviour.ex b/lib/req/behaviour.ex similarity index 100% rename from test/support/behaviours/req/behaviour.ex rename to lib/req/behaviour.ex diff --git a/lib/mbta/api/stats.ex b/lib/req/stats.ex similarity index 53% rename from lib/mbta/api/stats.ex rename to lib/req/stats.ex index 008e8ae90c..fb3080cadb 100644 --- a/lib/mbta/api/stats.ex +++ b/lib/req/stats.ex @@ -1,4 +1,4 @@ -defmodule MBTA.Api.Stats do +defmodule Req.Stats do @moduledoc """ This Agent attaches to telemetry events emitted by Finch and aggregates them by path and status. """ @@ -18,37 +18,40 @@ defmodule MBTA.Api.Stats do Handles telemetry events and aggregates them by path and status. """ def handle_event(_name, measurement, metadata, _config) do - path = path_to_atom(metadata.request.path) - status = status_to_atom(metadata.status) + host = metadata.request.host + path = strip_filename(metadata.request.path) + status = metadata.status duration = measurement[:duration] Agent.update(__MODULE__, fn state -> - if Kernel.get_in(state, [path, status]) do - Kernel.update_in(state, [path, status], &(&1 ++ [duration])) + if Kernel.get_in(state, [host, path, status]) do + Kernel.update_in(state, [host, path, status], &(&1 ++ [duration])) else - Kernel.put_in(state, [Access.key(path, %{}), status], [duration]) + Kernel.put_in(state, [Access.key(host, %{}), Access.key(path, %{}), status], [duration]) end end) end @doc """ - Dispatches the aggregated stats to the `[:mbta_api, :request]` telemetry event. + Dispatches the aggregated stats to the `[:req, :request]` telemetry event. Resets the Agent state after dispatching the stats. """ def dispatch_stats() do - Enum.each(Agent.get(__MODULE__, & &1), &dispatch_path/1) + Enum.each(Agent.get(__MODULE__, & &1), &dispatch_host/1) Agent.update(__MODULE__, fn _ -> %{} end) end - defp dispatch_path({path, stats}) do - Enum.each(stats, fn {status, durations} -> - dispatch_stat(path, status, durations) + defp dispatch_host({host, stats}) do + Enum.each(stats, fn {path, statuses} -> + Enum.each(statuses, fn {status, durations} -> + dispatch_stat(host, path, status, durations) + end) end) end - defp dispatch_stat(path, status, durations) do + defp dispatch_stat(host, path, status, durations) do count = Enum.count(durations) avg = @@ -57,22 +60,16 @@ defmodule MBTA.Api.Stats do |> Kernel.div(count) |> System.convert_time_unit(:native, :millisecond) - :telemetry.execute([:mbta_api, :request], %{count: count, avg: avg}, %{ + :telemetry.execute([:req, :request], %{count: count, avg: avg}, %{ + host: host, path: path, status: status }) end - defp path_to_atom(path) do + defp strip_filename(path) do path - |> String.replace(~r{^/|/$}, "") - |> String.replace(~r{/}, "_") - |> String.to_atom() - end - - defp status_to_atom(status) do - status - |> Integer.to_string() - |> String.to_atom() + |> (&Regex.replace(~r/\/$/, &1, "")).() + |> (&Regex.replace(~r/[\w|-]+\.\w+/, &1, "")).() end end diff --git a/lib/req/telemetry.ex b/lib/req/telemetry.ex new file mode 100644 index 0000000000..9bae902aed --- /dev/null +++ b/lib/req/telemetry.ex @@ -0,0 +1,47 @@ +defmodule Req.Telemetry do + @moduledoc """ + """ + + use Supervisor + + alias Telemetry.Metrics + + def start_link(arg) do + Supervisor.start_link(__MODULE__, arg, name: __MODULE__) + end + + def init(_arg) do + telemetry_metrics_splunk_config = Application.get_env(:dotcom, :telemetry_metrics_splunk) + + children = [ + { + TelemetryMetricsSplunk, + [ + metrics: metrics(), + token: telemetry_metrics_splunk_config[:token], + url: telemetry_metrics_splunk_config[:url] + ] + }, + { + :telemetry_poller, + measurements: measurements(), period: :timer.seconds(60), init_delay: :timer.seconds(5) + }, + {Req.Stats, %{}} + ] + + Supervisor.init(children, strategy: :one_for_one) + end + + defp measurements do + [ + {Req.Stats, :dispatch_stats, []} + ] + end + + defp metrics do + [ + Metrics.last_value("req.request.count"), + Metrics.last_value("req.request.avg") + ] + end +end diff --git a/mix.exs b/mix.exs index a65f715d32..b34f3cafa2 100644 --- a/mix.exs +++ b/mix.exs @@ -144,7 +144,6 @@ defmodule DotCom.Mixfile do {:telemetry, "1.2.1", override: true}, {:telemetry_metrics, "1.0.0", override: true}, {:telemetry_metrics_splunk, "0.0.1-alpha"}, - {:telemetry_metrics_statsd, "0.7.0"}, {:telemetry_poller, "1.1.0"}, {:telemetry_test, "0.1.2", only: [:test]}, # latest version is 3.7.11; cannot upgrade because tests fail