From e6ce5aa76d430f96b1050e1b6a595d6e27352d39 Mon Sep 17 00:00:00 2001 From: Will Liu Date: Mon, 30 Sep 2024 14:26:45 -0400 Subject: [PATCH 1/2] remove dashboards/monitors --- indexer/indexer_monitors.tf | 16 - indexer_dashboards/indexer_dashboards.tf | 4 - indexer_dashboards/providers.tf | 6 - indexer_dashboards/terraform.tfvars | 51 - indexer_dashboards/variables.tf | 40 - indexer_dashboards/versions.tf | 18 - .../indexer_dashboards/comlink_dashboard.tf | 822 ---- modules/indexer_dashboards/ender_dashboard.tf | 3354 ----------------- .../indexer_dashboards/full_node_dashboard.tf | 544 --- modules/indexer_dashboards/locals.tf | 69 - .../indexer_dashboards/postgres_dashboard.tf | 66 - .../roundtable_dashboard.tf | 952 ----- modules/indexer_dashboards/socks_dashboard.tf | 1526 -------- modules/indexer_dashboards/variables.tf | 22 - modules/indexer_dashboards/versions.tf | 10 - .../indexer_dashboards/vulcan_dashboard.tf | 1813 --------- modules/indexer_monitors/locals.tf | 39 - modules/indexer_monitors/monitors.tf | 373 -- .../precautionary_monitors.tf | 148 - .../indexer_monitors/roundtable_monitors.tf | 51 - .../indexer_monitors/synthetic_monitors.tf | 368 -- modules/indexer_monitors/variables.tf | 54 - modules/indexer_monitors/versions.tf | 10 - 23 files changed, 10356 deletions(-) delete mode 100644 indexer/indexer_monitors.tf delete mode 100644 indexer_dashboards/indexer_dashboards.tf delete mode 100644 indexer_dashboards/providers.tf delete mode 100644 indexer_dashboards/terraform.tfvars delete mode 100644 indexer_dashboards/variables.tf delete mode 100644 indexer_dashboards/versions.tf delete mode 100644 modules/indexer_dashboards/comlink_dashboard.tf delete mode 100644 modules/indexer_dashboards/ender_dashboard.tf delete mode 100644 modules/indexer_dashboards/full_node_dashboard.tf delete mode 100644 modules/indexer_dashboards/locals.tf delete mode 100644 modules/indexer_dashboards/postgres_dashboard.tf delete mode 100644 
modules/indexer_dashboards/roundtable_dashboard.tf delete mode 100644 modules/indexer_dashboards/socks_dashboard.tf delete mode 100644 modules/indexer_dashboards/variables.tf delete mode 100644 modules/indexer_dashboards/versions.tf delete mode 100644 modules/indexer_dashboards/vulcan_dashboard.tf delete mode 100644 modules/indexer_monitors/locals.tf delete mode 100644 modules/indexer_monitors/monitors.tf delete mode 100644 modules/indexer_monitors/precautionary_monitors.tf delete mode 100644 modules/indexer_monitors/roundtable_monitors.tf delete mode 100644 modules/indexer_monitors/synthetic_monitors.tf delete mode 100644 modules/indexer_monitors/variables.tf delete mode 100644 modules/indexer_monitors/versions.tf diff --git a/indexer/indexer_monitors.tf b/indexer/indexer_monitors.tf deleted file mode 100644 index 021bac14..00000000 --- a/indexer/indexer_monitors.tf +++ /dev/null @@ -1,16 +0,0 @@ -module "indexer_monitors" { - count = var.enable_monitoring ? 1 : 0 - - source = "../modules/indexer_monitors" - env_tag = "v4-${var.environment}" - environment = var.environment - slack_channel = var.monitoring_slack_channel - pagerduty_tag = var.monitoring_pagerduty_tag - secondary_pagerduty_tag = var.secondary_monitoring_pagerduty_tag - ecs_cluster_name = var.full_node_name - msk_cluster_name = aws_msk_cluster.main.cluster_name - team = var.monitoring_team - url = var.indexer_url - enable_precautionary_monitors = var.enable_precautionary_monitors - aws_account_id = var.monitoring_aws_account_id -} diff --git a/indexer_dashboards/indexer_dashboards.tf b/indexer_dashboards/indexer_dashboards.tf deleted file mode 100644 index 031fed71..00000000 --- a/indexer_dashboards/indexer_dashboards.tf +++ /dev/null @@ -1,4 +0,0 @@ -module "indexer_dashboards" { - source = "../modules/indexer_dashboards" - indexer_services_variable_mapping = var.indexer_services_variable_mapping -} diff --git a/indexer_dashboards/providers.tf b/indexer_dashboards/providers.tf deleted file mode 
100644 index 8d83a778..00000000 --- a/indexer_dashboards/providers.tf +++ /dev/null @@ -1,6 +0,0 @@ -# Default provider. -provider "datadog" { - api_key = var.datadog_api_key - app_key = var.datadog_app_key - api_url = var.datadog_api_url -} diff --git a/indexer_dashboards/terraform.tfvars b/indexer_dashboards/terraform.tfvars deleted file mode 100644 index d83fce12..00000000 --- a/indexer_dashboards/terraform.tfvars +++ /dev/null @@ -1,51 +0,0 @@ -indexer_services_variable_mapping = { - "dev" : { - "environment" : "dev", - "service" : "indexer", - "cluster_name" : "dev-indexer-apne1-cluster", - "ecs_cluster_name" : "dev-indexer-full-node-cluster", - "msk_cluster_name" : "dev-indexer-apne1-msk-cluster" - }, - "dev2" : { - "environment" : "dev2", - "service" : "indexer", - "cluster_name" : "dev2-indexer-apne1-cluster", - "ecs_cluster_name" : "dev2-indexer-full-node-cluster", - "msk_cluster_name" : "dev2-indexer-apne1-msk-cluster" - }, - "dev3" : { - "environment" : "dev3", - "service" : "indexer", - "cluster_name" : "dev3-indexer-apne1-cluster", - "ecs_cluster_name" : "dev3-indexer-full-node-cluster", - "msk_cluster_name" : "dev3-indexer-apne1-msk-cluster" - }, - "dev4" : { - "environment" : "dev4", - "service" : "indexer", - "cluster_name" : "dev4-indexer-apne1-cluster", - "ecs_cluster_name" : "dev4-indexer-full-node-cluster", - "msk_cluster_name" : "dev4-indexer-apne1-msk-cluster" - }, - "dev5" : { - "environment" : "dev5", - "service" : "indexer", - "cluster_name" : "dev5-indexer-apne1-cluster", - "ecs_cluster_name" : "dev5-indexer-full-node-cluster", - "msk_cluster_name" : "dev5-indexer-apne1-msk-cluster" - }, - "staging" : { - "environment" : "staging", - "service" : "indexer", - "cluster_name" : "staging-indexer-apne1-cluster", - "ecs_cluster_name" : "staging-indexer-full-node-cluster", - "msk_cluster_name" : "staging-indexer-apne1-msk-cluster" - }, - "testnet2" : { - "environment" : "testnet2", - "service" : "indexer", - "cluster_name" : 
"testnet2-indexer-apne1-cluster", - "ecs_cluster_name" : "testnet2-indexer-full-node-cluster", - "msk_cluster_name" : "testnet2-indexer-apne1-msk-cluster" - }, -} diff --git a/indexer_dashboards/variables.tf b/indexer_dashboards/variables.tf deleted file mode 100644 index 1f9bd1fd..00000000 --- a/indexer_dashboards/variables.tf +++ /dev/null @@ -1,40 +0,0 @@ -variable "indexer_services_variable_mapping" { - type = map( - object({ - # Environment - environment = string - - # Service name - service = string - - # Cluster name for the indexer services - cluster_name = string - - # ECS cluster name for the full node - ecs_cluster_name = string - - # MSK cluster name - msk_cluster_name = string - }) - ) - - description = "Map of variable name to preset values of variables used in indexer services." -} - -variable "datadog_api_key" { - type = string - description = "Datadog API key" - sensitive = true -} - -variable "datadog_app_key" { - type = string - description = "Datadog app key" - sensitive = true -} - -variable "datadog_api_url" { - type = string - description = "The datadog api url" - default = "https://api.datadoghq.com/" -} diff --git a/indexer_dashboards/versions.tf b/indexer_dashboards/versions.tf deleted file mode 100644 index 401824f8..00000000 --- a/indexer_dashboards/versions.tf +++ /dev/null @@ -1,18 +0,0 @@ -terraform { - cloud { - organization = "dydxprotocol" - - workspaces { - tags = ["indexer-dashboards"] - } - } - - required_providers { - datadog = { - source = "DataDog/datadog" - version = "~> 3.29" - } - } - - required_version = "~> 1.3.2" -} diff --git a/modules/indexer_dashboards/comlink_dashboard.tf b/modules/indexer_dashboards/comlink_dashboard.tf deleted file mode 100644 index c3c7b154..00000000 --- a/modules/indexer_dashboards/comlink_dashboard.tf +++ /dev/null @@ -1,822 +0,0 @@ -resource "datadog_dashboard_json" "comlink" { - dashboard = < 1000", - "message": "Max Kafka Offset is > 1000 for at least 1 socks instance. 
This means delayed notifications for all websocket messages.\n\n${local.monitor_suffix_literal}", - "tags": [ - "team:${var.team}", - "env:${var.env_tag}" - ], - "options": { - "thresholds": { - "critical": 1000 - }, - "notify_audit": false, - "require_full_window": false, - "notify_no_data": false, - "renotify_interval": 0, - "notify_by": [ - "*" - ], - "include_tags": false, - "evaluation_delay": 900, - "new_group_delay": 60, - "silenced": {} - }, - "priority": null, - "restricted_roles": null -} -EOF -} - -resource "datadog_monitor_json" "orderbook_crossed" { - count = var.enable_precautionary_monitors ? 1 : 0 - - monitor = < 10 blocks behind latest block", - "type": "query alert", - "query": "min(last_30m):max:dydxprotocol.cometbft_consensus_height{env:${var.environment}} - max:ender.processing_block_height{env:${var.environment},service:indexer} > 10", - "message": "${local.critical_monitor_suffix_literal}", - "tags": [ - "team:${var.team}", - "env:${var.env_tag}" - ], - "options": { - "thresholds": { - "critical": 10 - }, - "notify_audit": false, - "require_full_window": false, - "notify_no_data": true, - "renotify_interval": 0, - "include_tags": false, - "no_data_timeframe": 60, - "new_host_delay": 300, - "silenced": {} - }, - "priority": null, - "restricted_roles": null -} -EOF -} - -resource "datadog_monitor_json" "last_processed_block_last_10min" { - monitor = < 100 blocks behind latest block", - "type": "query alert", - "query": "min(last_10m):max:dydxprotocol.cometbft_consensus_height{env:${var.environment}} - max:ender.processing_block_height{env:${var.environment},service:indexer} > 100", - "message": "${local.critical_monitor_suffix_literal}", - "tags": [ - "team:${var.team}", - "env:${var.env_tag}" - ], - "options": { - "thresholds": { - "critical": 100 - }, - "notify_audit": false, - "require_full_window": false, - "notify_no_data": true, - "renotify_interval": 0, - "include_tags": false, - "no_data_timeframe": 60, - "new_host_delay": 300, - 
"silenced": {} - }, - "priority": null, - "restricted_roles": null -} -EOF -} - -resource "datadog_monitor_json" "on_chain_kafka_offset" { - monitor = < 10", - "message": "Max. offset lag for the `to-ender` Kafka topic is > 10 meaning on-chain updates are delayed.\n\nResolution:\n- investigate why `ender` task running in ECS is not consuming from Kafka topic\n\n${local.critical_monitor_suffix_literal}", - "tags": [ - "team:${var.team}", - "env:${var.env_tag}" - ], - "options": { - "thresholds": { - "critical": 10 - }, - "notify_audit": false, - "require_full_window": false, - "notify_no_data": false, - "renotify_interval": 0, - "include_tags": false, - "evaluation_delay": 900, - "new_group_delay": 0, - "silenced": {} - }, - "priority": null, - "restricted_roles": null -} -EOF -} - -resource "datadog_monitor_json" "off_chain_kafka_offset" { - monitor = < 100", - "message": "Max. offset lag for the `to-vulcan` Kafka topic is > 100 meaning order OPEN / CANCEL and order book updates are delayed.\n\nResolution:\n- increase the number of `vulcan` tasks running in ECS\n\n${local.monitor_suffix_literal}", - "tags": [ - "team:${var.team}", - "env:${var.env_tag}" - ], - "options": { - "thresholds": { - "critical": 100 - }, - "notify_audit": false, - "require_full_window": false, - "notify_no_data": false, - "renotify_interval": 0, - "include_tags": false, - "evaluation_delay": 900, - "new_group_delay": 0, - "silenced": {} - }, - "priority": null, - "restricted_roles": null -} -EOF -} - -resource "datadog_monitor_json" "fast_sync_snapshots" { - monitor = < 0.01", - "message": "Elevated Internal Server Errors from Comlink. 
Check Comlink logs/RDS for any issues.\n\n${local.monitor_suffix_literal}", - "tags": [ - "team:${var.team}", - "env:${var.env_tag}" - ], - "options": { - "thresholds": { - "critical": 0.01 - }, - "notify_audit": false, - "include_tags": false, - "notify_no_data": false, - "silenced": {} - }, - "priority": null, - "restricted_roles": null -} -EOF -} diff --git a/modules/indexer_monitors/precautionary_monitors.tf b/modules/indexer_monitors/precautionary_monitors.tf deleted file mode 100644 index 837a8cc8..00000000 --- a/modules/indexer_monitors/precautionary_monitors.tf +++ /dev/null @@ -1,148 +0,0 @@ -resource "datadog_monitor_json" "average_block_processing_rate" { - monitor = < 0.5", - "message": "This is not an actionable alert. When this alert fires, that means that the Indexer is processing blocks slow and more time should be invested in improving Ender latency. Please notify Trading if this alert fires.\n\n${local.monitor_suffix_literal}", - "tags": [ - "team:${var.team}", - "env:${var.env_tag}" - ], - "options": { - "thresholds": { - "critical": 0.5 - }, - "notify_audit": false, - "require_full_window": false, - "notify_no_data": true, - "renotify_interval": 0, - "include_tags": false, - "no_data_timeframe": 60, - "new_host_delay": 300, - "silenced": {} - }, - "priority": null, - "restricted_roles": null -} -EOF -} - -resource "datadog_monitor_json" "p95_block_processing_rate" { - monitor = < 0.75", - "message": "This is not an actionable alert. When this alert fires, that means that the Indexer is processing blocks slow and more time should be invested in improving Ender latency. 
Please notify Trading if this alert fires.\n\n${local.monitor_suffix_literal}", - "tags": [ - "team:${var.team}", - "env:${var.env_tag}" - ], - "options": { - "thresholds": { - "critical": 0.75 - }, - "notify_audit": false, - "require_full_window": false, - "notify_no_data": true, - "renotify_interval": 0, - "include_tags": false, - "no_data_timeframe": 60, - "new_host_delay": 300, - "silenced": {} - }, - "priority": null, - "restricted_roles": null -} -EOF -} - -resource "datadog_monitor_json" "rds_read_replica_lag" { - monitor = < 2", - "message": "This is not an actionable alert. When this alert fires, that means that the RDS read replica lag is high.\n\n${local.monitor_suffix_literal}", - "tags": [ - "team:${var.team}", - "env:${var.env_tag}" - ], - "options": { - "thresholds": { - "critical": 1 - }, - "notify_audit": false, - "require_full_window": false, - "notify_no_data": true, - "renotify_interval": 0, - "include_tags": false, - "no_data_timeframe": 60, - "evaluation_delay": 900, - "new_host_delay": 300, - "silenced": {} - }, - "priority": null, - "restricted_roles": null -} -EOF -} - -resource "datadog_monitor_json" "websocket_stream_destroyed" { - monitor = < 500", - "message": "Underlying socket was destroyed, leading to lost messages. Check if there are any CPU/memory spikes on any specific tasks. This should auto-recover.\n\n${local.monitor_suffix_literal}", - "tags": [ - "team:${var.team}", - "env:${var.env_tag}" - ], - "options": { - "thresholds": { - "critical": 500 - }, - "notify_audit": false, - "include_tags": false, - "notify_no_data": false, - "silenced": {} - }, - "priority": null, - "restricted_roles": null -} -EOF -} - -resource "datadog_monitor_json" "stale_compliance_data" { - count = var.environment == "mainnet" ? 1 : 0 - monitor = < 1000", - "message": "Addresses have stale compliance data. Check the two metrics to determine if active or inactive addresses are stale. 
update-compliance-data.ts roundtable is responsible for updating compliance data.", - "tags": [ - "team:${var.team}", - "env:${var.env_tag}" - ], - "options": { - "thresholds": { - "critical": 1000 - }, - "notify_audit": false, - "include_tags": false, - "notify_no_data": false, - "silenced": {} - }, - "priority": null, - "restricted_roles": null -} -EOF -} \ No newline at end of file diff --git a/modules/indexer_monitors/roundtable_monitors.tf b/modules/indexer_monitors/roundtable_monitors.tf deleted file mode 100644 index b16972aa..00000000 --- a/modules/indexer_monitors/roundtable_monitors.tf +++ /dev/null @@ -1,51 +0,0 @@ -resource "datadog_monitor_json" "roundtable_update_affiliate_info_persistent_cache_stale" { - monitor = < 600", - "message": "persistentCache.affiliateInfoUpdateTime is more than 10 minutes in the past. This indicates that update-affiliate-info roundtable has not run successfully in past 10 min -> affiliate_info table is stale.", - "tags": [ - "team:${var.team}", - "env:${var.env_tag}" - ], - "options": { - "thresholds": { - "critical": 600 - }, - "notify_audit": false, - "include_tags": false, - "notify_no_data": false, - "silenced": {} - }, - "priority": null, - "restricted_roles": null -} -EOF -} - -resource "datadog_monitor_json" "roundtable_update_wallet_total_volume_persistent_cache_stale" { - monitor = < 600", - "message": "persistentCache.totalVolumeUpdateTime is more than 10 minutes in the past. 
This indicates that update-wallet-total-volume roundtable has not run successfully in past 10 min -> totalVolume column of wallets table is stale.", - "tags": [ - "team:${var.team}", - "env:${var.env_tag}" - ], - "options": { - "thresholds": { - "critical": 600 - }, - "notify_audit": false, - "include_tags": false, - "notify_no_data": false, - "silenced": {} - }, - "priority": null, - "restricted_roles": null -} -EOF -} \ No newline at end of file diff --git a/modules/indexer_monitors/synthetic_monitors.tf b/modules/indexer_monitors/synthetic_monitors.tf deleted file mode 100644 index 64696ce3..00000000 --- a/modules/indexer_monitors/synthetic_monitors.tf +++ /dev/null @@ -1,368 +0,0 @@ -resource "datadog_synthetics_test" "socks" { - type = "api" - subtype = "websocket" - status = "live" - - request_definition { - url = local.wss_url - } - - request_headers = { - Content-Type = "application/json" - } - - assertion { - type = "responseTime" - operator = "lessThan" - target = "3000" - } - - assertion { - type = "receivedMessage" - operator = "contains" - target = "connected" - } - - name = "[${var.environment}] Indexer Websocket connection test" - message = "Websocket connections cannot be established with the Indexer.\n\n Impact:\nFE/API wil be degraded from being unable to connect to websockets.\n\nResolution:\nCheck `socks` logs in AWS / Datadog to see why the endpoint is erroring.\n\n${local.monitor_suffix}" - tags = ["team:${var.team}", "env:${var.env_tag}"] - locations = [ - "aws:ap-east-1", - "aws:ap-northeast-1", - "aws:eu-central-1", - "aws:eu-west-1", - ] - - options_list { - monitor_name = "[${var.environment}] Indexer Socks is down" - monitor_options { - renotify_interval = 0 - } - retry { - count = 3 - interval = local.retry_interval - } - tick_every = local.tick_frequency - } -} - -resource "datadog_synthetics_test" "api_http_synthetic_monitors" { - for_each = local.api_http_synthetic_monitor_configurations - - type = "api" - subtype = "http" - status = 
"live" - - request_definition { - method = "GET" - url = each.value.url - } - - request_headers = { - Content-Type = "application/json" - } - - assertion { - operator = "lessThan" - type = "responseTime" - target = "3000" - } - - assertion { - operator = "is" - type = "statusCode" - target = "200" - } - - assertion { - operator = "validatesJSONPath" - type = "body" - targetjsonpath { - jsonpath = each.value.targetjsonpath.jsonpath - operator = each.value.targetjsonpath.operator - targetvalue = each.value.targetjsonpath.targetvalue - } - } - - name = each.value.name - message = each.value.message - tags = ["team:${var.team}", "env:${var.env_tag}"] - locations = [ - "aws:ap-east-1", - "aws:ap-northeast-1", - "aws:eu-central-1", - "aws:eu-west-1", - ] - - options_list { - monitor_name = each.value.monitor_name - monitor_options { - renotify_interval = 0 - } - retry { - count = local.retry_count - interval = local.retry_interval - } - tick_every = local.tick_frequency - } -} - -resource "datadog_synthetics_test" "comlink_trades" { - type = "api" - subtype = "multi" - status = "live" - - api_step { - name = "BTC trades" - subtype = "http" - - request_definition { - method = "GET" - url = "${local.https_url}/trades/perpetualMarket/BTC-USD" - } - - request_headers = { - Content-Type = "application/json" - } - - retry { - count = local.retry_count - interval = local.tick_frequency - } - - assertion { - operator = "lessThan" - type = "responseTime" - target = "5000" - } - - assertion { - operator = "is" - type = "statusCode" - target = "200" - } - - assertion { - operator = "is" - property = "content-type" - type = "header" - target = "application/json; charset=utf-8" - } - - assertion { - type = "body" - operator = "validatesJSONPath" - targetjsonpath { - jsonpath = "trades.length" - operator = "moreThan" - targetvalue = "-1" - } - } - } - - api_step { - name = "ETH trades" - subtype = "http" - - request_definition { - method = "GET" - url = 
"${local.https_url}/trades/perpetualMarket/ETH-USD" - } - - request_headers = { - Content-Type = "application/json" - } - - retry { - count = local.retry_count - interval = local.tick_frequency - } - - assertion { - operator = "lessThan" - type = "responseTime" - target = "5000" - } - - assertion { - operator = "is" - type = "statusCode" - target = "200" - } - - assertion { - operator = "is" - property = "content-type" - type = "header" - target = "application/json; charset=utf-8" - } - - assertion { - type = "body" - operator = "validatesJSONPath" - targetjsonpath { - jsonpath = "trades.length" - operator = "moreThan" - targetvalue = "-1" - } - } - } - - name = "[${var.environment}] Indexer Comlink /trades endpoint" - message = "/trades endpoint on Comlink is down\n \n Impact:\nFE/API wil be degraded from lack of trades.\n\nResolution:\nCheck `comlink` logs in AWS / Datadog to see why the endpoint is erroring.\n\n${local.monitor_suffix}" - tags = ["team:${var.team}", "env:${var.env_tag}"] - locations = [ - "aws:ap-east-1", - "aws:ap-northeast-1", - "aws:eu-central-1", - "aws:eu-west-1", - ] - - options_list { - monitor_name = "[${var.environment}] Indexer Comlink /trades endpoint is down" - monitor_options { - renotify_interval = 0 - } - tick_every = local.tick_frequency - } -} - -resource "datadog_synthetics_test" "comlink_orderbook" { - type = "api" - subtype = "multi" - status = "live" - - api_step { - name = "BTC orderbook" - subtype = "http" - - request_definition { - method = "GET" - url = "${local.https_url}/orderbooks/perpetualMarket/BTC-USD" - } - - request_headers = { - Content-Type = "application/json" - } - - retry { - count = local.retry_count - interval = local.retry_interval - } - - assertion { - operator = "lessThan" - type = "responseTime" - target = "3000" - } - - assertion { - operator = "is" - type = "statusCode" - target = "200" - } - - assertion { - operator = "is" - property = "content-type" - type = "header" - target = "application/json; 
charset=utf-8" - } - - assertion { - type = "body" - operator = "validatesJSONPath" - targetjsonpath { - jsonpath = "asks.length" - operator = "moreThan" - targetvalue = "-1" - } - } - - assertion { - type = "body" - operator = "validatesJSONPath" - targetjsonpath { - jsonpath = "bids.length" - operator = "moreThan" - targetvalue = "-1" - } - } - } - - api_step { - name = "ETH orderbook" - subtype = "http" - - request_definition { - method = "GET" - url = "${local.https_url}/orderbooks/perpetualMarket/ETH-USD" - } - - request_headers = { - Content-Type = "application/json" - } - - retry { - count = local.retry_count - interval = local.retry_interval - } - - assertion { - operator = "lessThan" - type = "responseTime" - target = "3000" - } - - assertion { - operator = "is" - type = "statusCode" - target = "200" - } - - assertion { - operator = "is" - property = "content-type" - type = "header" - target = "application/json; charset=utf-8" - } - - assertion { - type = "body" - operator = "validatesJSONPath" - targetjsonpath { - jsonpath = "asks.length" - operator = "moreThan" - targetvalue = "-1" - } - } - - assertion { - type = "body" - operator = "validatesJSONPath" - targetjsonpath { - jsonpath = "bids.length" - operator = "moreThan" - targetvalue = "-1" - } - } - } - - name = "[${var.environment}] Indexer Comlink /orderbook endpoint" - message = "/orderbooks endpoint is down.\n\nImpact:\nFE / API experience degraded as no orderbooks can be fetched.\n\nResolution:\nCheck `comlink` logs in AWS / Datadog to determine why the endpoint is erroring.\n\n${local.monitor_suffix}" - tags = ["team:${var.team}", "env:${var.env_tag}"] - locations = [ - "aws:ap-east-1", - "aws:ap-northeast-1", - "aws:eu-central-1", - "aws:eu-west-1", - ] - - options_list { - monitor_name = "[${var.environment}] Indexer Comlink /orderbook endpoint is down" - monitor_options { - renotify_interval = 0 - } - tick_every = local.tick_frequency - } -} diff --git a/modules/indexer_monitors/variables.tf 
b/modules/indexer_monitors/variables.tf deleted file mode 100644 index 2911d2c8..00000000 --- a/modules/indexer_monitors/variables.tf +++ /dev/null @@ -1,54 +0,0 @@ -variable "env_tag" { - type = string - description = "Env tag to add to all monitors" -} - -variable "environment" { - type = string - description = "Environment that all metrics for monitors reside in. All Indexer service metrics should use the env tag." -} - -variable "slack_channel" { - type = string - description = "Slack channel to publish all alerts to. If \"\", then no slack channel will be used. Should be prepended with @ such as '@dydx-alerts'" -} - -variable "pagerduty_tag" { - type = string - description = "PagerDuty tag to add to all monitors. If \"\", then no PagerDuty tag will be used. Should be prepended with @ such as '@pagerduty-indexer'" -} - -variable "secondary_pagerduty_tag" { - type = string - description = "PagerDuty tag to add to critical monitors. This will be in addition to pagerduty_tag above. Should be prepended with @ such as '@pagerduty-indexer'" -} - -variable "ecs_cluster_name" { - type = string - description = "ECS cluster name for the full node" -} - -variable "msk_cluster_name" { - type = string - description = "MSK cluster name" -} - -variable "team" { - type = string - description = "Team tag to add to all monitors" -} - -variable "url" { - type = string - description = "Indexer URL to monitor, should not include https:// or www. 
Should be something like `indexer.dydx.exchange`" -} - -variable "enable_precautionary_monitors" { - type = bool - description = "Whether to enable precautionary monitors" -} - -variable "aws_account_id" { - type = string - description = "Account ID for the AWS account" -} diff --git a/modules/indexer_monitors/versions.tf b/modules/indexer_monitors/versions.tf deleted file mode 100644 index d41867e6..00000000 --- a/modules/indexer_monitors/versions.tf +++ /dev/null @@ -1,10 +0,0 @@ -terraform { - required_providers { - datadog = { - source = "DataDog/datadog" - version = "~> 3.29" - } - } - - required_version = "~> 1.3.2" -} From 7f9928df6d4f833366cf42524713c4979062cc6d Mon Sep 17 00:00:00 2001 From: Will Liu Date: Mon, 30 Sep 2024 14:32:34 -0400 Subject: [PATCH 2/2] add back monitors since they r part of indexer proj --- indexer/indexer_monitors.tf | 16 + modules/indexer_monitors/locals.tf | 39 ++ modules/indexer_monitors/monitors.tf | 373 ++++++++++++++++++ .../precautionary_monitors.tf | 148 +++++++ .../indexer_monitors/roundtable_monitors.tf | 51 +++ .../indexer_monitors/synthetic_monitors.tf | 368 +++++++++++++++++ modules/indexer_monitors/variables.tf | 54 +++ modules/indexer_monitors/versions.tf | 10 + 8 files changed, 1059 insertions(+) create mode 100644 indexer/indexer_monitors.tf create mode 100644 modules/indexer_monitors/locals.tf create mode 100644 modules/indexer_monitors/monitors.tf create mode 100644 modules/indexer_monitors/precautionary_monitors.tf create mode 100644 modules/indexer_monitors/roundtable_monitors.tf create mode 100644 modules/indexer_monitors/synthetic_monitors.tf create mode 100644 modules/indexer_monitors/variables.tf create mode 100644 modules/indexer_monitors/versions.tf diff --git a/indexer/indexer_monitors.tf b/indexer/indexer_monitors.tf new file mode 100644 index 00000000..021bac14 --- /dev/null +++ b/indexer/indexer_monitors.tf @@ -0,0 +1,16 @@ +module "indexer_monitors" { + count = var.enable_monitoring ? 
1 : 0 + + source = "../modules/indexer_monitors" + env_tag = "v4-${var.environment}" + environment = var.environment + slack_channel = var.monitoring_slack_channel + pagerduty_tag = var.monitoring_pagerduty_tag + secondary_pagerduty_tag = var.secondary_monitoring_pagerduty_tag + ecs_cluster_name = var.full_node_name + msk_cluster_name = aws_msk_cluster.main.cluster_name + team = var.monitoring_team + url = var.indexer_url + enable_precautionary_monitors = var.enable_precautionary_monitors + aws_account_id = var.monitoring_aws_account_id +} diff --git a/modules/indexer_monitors/locals.tf b/modules/indexer_monitors/locals.tf new file mode 100644 index 00000000..7eb22972 --- /dev/null +++ b/modules/indexer_monitors/locals.tf @@ -0,0 +1,39 @@ +locals { + monitor_suffix_literal = "{{#is_alert}}\\n${var.pagerduty_tag}\\n{{/is_alert}}\\n\\n{{#is_recovery}}\\n${var.pagerduty_tag}\\n{{/is_recovery}}\\n\\n${var.slack_channel}\\n\\n" + monitor_suffix = "{{#is_alert}}\n${var.pagerduty_tag}\n{{/is_alert}}\n\n{{#is_recovery}}\n${var.pagerduty_tag}\n{{/is_recovery}}\n\n${var.slack_channel}\n\n" + critical_monitor_suffix_literal = "{{#is_alert}}\\n${var.pagerduty_tag} ${var.secondary_pagerduty_tag}\\n{{/is_alert}}\\n\\n{{#is_recovery}}\\n${var.pagerduty_tag} ${var.secondary_pagerduty_tag}\\n{{/is_recovery}}\\n\\n${var.slack_channel}\\n\\n" + critical_monitor_suffix = "{{#is_alert}}\n${var.pagerduty_tag} ${var.secondary_pagerduty_tag}\n{{/is_alert}}\n\n{{#is_recovery}}\n${var.pagerduty_tag} ${var.secondary_pagerduty_tag}\n{{/is_recovery}}\n\n${var.slack_channel}\n\n" + monitor_no_data_suffix_literal = "{{#is_no_data}}\\n${var.pagerduty_tag}\\n{{/is_no_data}}\\n\\n{{#is_no_data_recovery}}\\n${var.pagerduty_tag}\\n{{/is_no_data_recovery}}\\n\\n" + monitor_no_data_suffix = "{{#is_no_data}}\n${var.pagerduty_tag}\n{{/is_no_data}}\n\n{{#is_no_data_recovery}}\n${var.pagerduty_tag}\n{{/is_no_data_recovery}}\n\n" + wss_url = "wss://${var.url}/v4/ws" + https_url = "https://${var.url}/v4" + 
tick_frequency = 300 # 5 minutes + retry_interval = 3000 # 3 seconds in milliseconds + retry_count = 3 # 3 retries + snapshot_bucket_prefix = var.aws_account_id == "" ? var.environment : "${var.aws_account_id}-${var.environment}" + + api_http_synthetic_monitor_configurations = { + "height" : { + url = "${local.https_url}/height" + targetjsonpath = { + jsonpath = "height" + operator = "moreThan" + targetvalue = "0" + } + name = "[${var.environment}] Indexer Comlink /height endpoint" + message = "/height endpoint on Comlink is down\n \n Impact:\nFE/API is unable to determine height.\n\nResolution:\nCheck `comlink` logs in AWS / Datadog to see why the endpoint is erroring.\n\n${local.monitor_suffix}" + monitor_name = "[${var.environment}] Indexer Comlink /height endpoint is down" + }, + "perpetualMarkets" : { + url = "${local.https_url}/perpetualMarkets" + targetjsonpath = { + jsonpath = "markets['LINK-USD'].openInterest" + operator = "moreThan" + targetvalue = "0" + } + name = "[${var.environment}] Indexer Comlink /perpetualMarkets endpoint" + message = "/perpetualMarkets endpoint on Comlink is down\n \n Impact:\nFE/API wil be degraded from inability to pull perpetual markets.\n\nResolution:\nCheck `comlink` logs in AWS / Datadog to see why the endpoint is erroring.\n\n${local.monitor_suffix}" + monitor_name = "[${var.environment}] Indexer Comlink /perpetualMarkets endpoint is down" + } + } +} diff --git a/modules/indexer_monitors/monitors.tf b/modules/indexer_monitors/monitors.tf new file mode 100644 index 00000000..9ad1f84d --- /dev/null +++ b/modules/indexer_monitors/monitors.tf @@ -0,0 +1,373 @@ +resource "datadog_monitor_json" "socks_kafka_offset" { + count = var.enable_precautionary_monitors ? 1 : 0 + + monitor = < 1000", + "message": "Max Kafka Offset is > 1000 for at least 1 socks instance. 
This means delayed notifications for all websocket messages.\n\n${local.monitor_suffix_literal}", + "tags": [ + "team:${var.team}", + "env:${var.env_tag}" + ], + "options": { + "thresholds": { + "critical": 1000 + }, + "notify_audit": false, + "require_full_window": false, + "notify_no_data": false, + "renotify_interval": 0, + "notify_by": [ + "*" + ], + "include_tags": false, + "evaluation_delay": 900, + "new_group_delay": 60, + "silenced": {} + }, + "priority": null, + "restricted_roles": null +} +EOF +} + +resource "datadog_monitor_json" "orderbook_crossed" { + count = var.enable_precautionary_monitors ? 1 : 0 + + monitor = < 10 blocks behind latest block", + "type": "query alert", + "query": "min(last_30m):max:dydxprotocol.cometbft_consensus_height{env:${var.environment}} - max:ender.processing_block_height{env:${var.environment},service:indexer} > 10", + "message": "${local.critical_monitor_suffix_literal}", + "tags": [ + "team:${var.team}", + "env:${var.env_tag}" + ], + "options": { + "thresholds": { + "critical": 10 + }, + "notify_audit": false, + "require_full_window": false, + "notify_no_data": true, + "renotify_interval": 0, + "include_tags": false, + "no_data_timeframe": 60, + "new_host_delay": 300, + "silenced": {} + }, + "priority": null, + "restricted_roles": null +} +EOF +} + +resource "datadog_monitor_json" "last_processed_block_last_10min" { + monitor = < 100 blocks behind latest block", + "type": "query alert", + "query": "min(last_10m):max:dydxprotocol.cometbft_consensus_height{env:${var.environment}} - max:ender.processing_block_height{env:${var.environment},service:indexer} > 100", + "message": "${local.critical_monitor_suffix_literal}", + "tags": [ + "team:${var.team}", + "env:${var.env_tag}" + ], + "options": { + "thresholds": { + "critical": 100 + }, + "notify_audit": false, + "require_full_window": false, + "notify_no_data": true, + "renotify_interval": 0, + "include_tags": false, + "no_data_timeframe": 60, + "new_host_delay": 300, + 
"silenced": {} + }, + "priority": null, + "restricted_roles": null +} +EOF +} + +resource "datadog_monitor_json" "on_chain_kafka_offset" { + monitor = < 10", + "message": "Max. offset lag for the `to-ender` Kafka topic is > 10 meaning on-chain updates are delayed.\n\nResolution:\n- investigate why `ender` task running in ECS is not consuming from Kafka topic\n\n${local.critical_monitor_suffix_literal}", + "tags": [ + "team:${var.team}", + "env:${var.env_tag}" + ], + "options": { + "thresholds": { + "critical": 10 + }, + "notify_audit": false, + "require_full_window": false, + "notify_no_data": false, + "renotify_interval": 0, + "include_tags": false, + "evaluation_delay": 900, + "new_group_delay": 0, + "silenced": {} + }, + "priority": null, + "restricted_roles": null +} +EOF +} + +resource "datadog_monitor_json" "off_chain_kafka_offset" { + monitor = < 100", + "message": "Max. offset lag for the `to-vulcan` Kafka topic is > 100 meaning order OPEN / CANCEL and order book updates are delayed.\n\nResolution:\n- increase the number of `vulcan` tasks running in ECS\n\n${local.monitor_suffix_literal}", + "tags": [ + "team:${var.team}", + "env:${var.env_tag}" + ], + "options": { + "thresholds": { + "critical": 100 + }, + "notify_audit": false, + "require_full_window": false, + "notify_no_data": false, + "renotify_interval": 0, + "include_tags": false, + "evaluation_delay": 900, + "new_group_delay": 0, + "silenced": {} + }, + "priority": null, + "restricted_roles": null +} +EOF +} + +resource "datadog_monitor_json" "fast_sync_snapshots" { + monitor = < 0.01", + "message": "Elevated Internal Server Errors from Comlink. 
Check Comlink logs/RDS for any issues.\n\n${local.monitor_suffix_literal}", + "tags": [ + "team:${var.team}", + "env:${var.env_tag}" + ], + "options": { + "thresholds": { + "critical": 0.01 + }, + "notify_audit": false, + "include_tags": false, + "notify_no_data": false, + "silenced": {} + }, + "priority": null, + "restricted_roles": null +} +EOF +} diff --git a/modules/indexer_monitors/precautionary_monitors.tf b/modules/indexer_monitors/precautionary_monitors.tf new file mode 100644 index 00000000..837a8cc8 --- /dev/null +++ b/modules/indexer_monitors/precautionary_monitors.tf @@ -0,0 +1,148 @@ +resource "datadog_monitor_json" "average_block_processing_rate" { + monitor = < 0.5", + "message": "This is not an actionable alert. When this alert fires, that means that the Indexer is processing blocks slow and more time should be invested in improving Ender latency. Please notify Trading if this alert fires.\n\n${local.monitor_suffix_literal}", + "tags": [ + "team:${var.team}", + "env:${var.env_tag}" + ], + "options": { + "thresholds": { + "critical": 0.5 + }, + "notify_audit": false, + "require_full_window": false, + "notify_no_data": true, + "renotify_interval": 0, + "include_tags": false, + "no_data_timeframe": 60, + "new_host_delay": 300, + "silenced": {} + }, + "priority": null, + "restricted_roles": null +} +EOF +} + +resource "datadog_monitor_json" "p95_block_processing_rate" { + monitor = < 0.75", + "message": "This is not an actionable alert. When this alert fires, that means that the Indexer is processing blocks slow and more time should be invested in improving Ender latency. 
Please notify Trading if this alert fires.\n\n${local.monitor_suffix_literal}", + "tags": [ + "team:${var.team}", + "env:${var.env_tag}" + ], + "options": { + "thresholds": { + "critical": 0.75 + }, + "notify_audit": false, + "require_full_window": false, + "notify_no_data": true, + "renotify_interval": 0, + "include_tags": false, + "no_data_timeframe": 60, + "new_host_delay": 300, + "silenced": {} + }, + "priority": null, + "restricted_roles": null +} +EOF +} + +resource "datadog_monitor_json" "rds_read_replica_lag" { + monitor = < 2", + "message": "This is not an actionable alert. When this alert fires, that means that the RDS read replica lag is high.\n\n${local.monitor_suffix_literal}", + "tags": [ + "team:${var.team}", + "env:${var.env_tag}" + ], + "options": { + "thresholds": { + "critical": 2 + }, + "notify_audit": false, + "require_full_window": false, + "notify_no_data": true, + "renotify_interval": 0, + "include_tags": false, + "no_data_timeframe": 60, + "evaluation_delay": 900, + "new_host_delay": 300, + "silenced": {} + }, + "priority": null, + "restricted_roles": null +} +EOF +} + +resource "datadog_monitor_json" "websocket_stream_destroyed" { + monitor = < 500", + "message": "Underlying socket was destroyed, leading to lost messages. Check if there are any CPU/memory spikes on any specific tasks. This should auto-recover.\n\n${local.monitor_suffix_literal}", + "tags": [ + "team:${var.team}", + "env:${var.env_tag}" + ], + "options": { + "thresholds": { + "critical": 500 + }, + "notify_audit": false, + "include_tags": false, + "notify_no_data": false, + "silenced": {} + }, + "priority": null, + "restricted_roles": null +} +EOF +} + +resource "datadog_monitor_json" "stale_compliance_data" { + count = var.environment == "mainnet" ? 1 : 0 + monitor = < 1000", + "message": "Addresses have stale compliance data. Check the two metrics to determine if active or inactive addresses are stale.
update-compliance-data.ts roundtable is responsible for updating compliance data.", + "tags": [ + "team:${var.team}", + "env:${var.env_tag}" + ], + "options": { + "thresholds": { + "critical": 1000 + }, + "notify_audit": false, + "include_tags": false, + "notify_no_data": false, + "silenced": {} + }, + "priority": null, + "restricted_roles": null +} +EOF +} \ No newline at end of file diff --git a/modules/indexer_monitors/roundtable_monitors.tf b/modules/indexer_monitors/roundtable_monitors.tf new file mode 100644 index 00000000..b16972aa --- /dev/null +++ b/modules/indexer_monitors/roundtable_monitors.tf @@ -0,0 +1,51 @@ +resource "datadog_monitor_json" "roundtable_update_affiliate_info_persistent_cache_stale" { + monitor = < 600", + "message": "persistentCache.affiliateInfoUpdateTime is more than 10 minutes in the past. This indicates that update-affiliate-info roundtable has not run successfully in past 10 min -> affiliate_info table is stale.", + "tags": [ + "team:${var.team}", + "env:${var.env_tag}" + ], + "options": { + "thresholds": { + "critical": 600 + }, + "notify_audit": false, + "include_tags": false, + "notify_no_data": false, + "silenced": {} + }, + "priority": null, + "restricted_roles": null +} +EOF +} + +resource "datadog_monitor_json" "roundtable_update_wallet_total_volume_persistent_cache_stale" { + monitor = < 600", + "message": "persistentCache.totalVolumeUpdateTime is more than 10 minutes in the past. 
This indicates that update-wallet-total-volume roundtable has not run successfully in past 10 min -> totalVolume column of wallets table is stale.", + "tags": [ + "team:${var.team}", + "env:${var.env_tag}" + ], + "options": { + "thresholds": { + "critical": 600 + }, + "notify_audit": false, + "include_tags": false, + "notify_no_data": false, + "silenced": {} + }, + "priority": null, + "restricted_roles": null +} +EOF +} \ No newline at end of file diff --git a/modules/indexer_monitors/synthetic_monitors.tf b/modules/indexer_monitors/synthetic_monitors.tf new file mode 100644 index 00000000..64696ce3 --- /dev/null +++ b/modules/indexer_monitors/synthetic_monitors.tf @@ -0,0 +1,368 @@ +resource "datadog_synthetics_test" "socks" { + type = "api" + subtype = "websocket" + status = "live" + + request_definition { + url = local.wss_url + } + + request_headers = { + Content-Type = "application/json" + } + + assertion { + type = "responseTime" + operator = "lessThan" + target = "3000" + } + + assertion { + type = "receivedMessage" + operator = "contains" + target = "connected" + } + + name = "[${var.environment}] Indexer Websocket connection test" + message = "Websocket connections cannot be established with the Indexer.\n\n Impact:\nFE/API wil be degraded from being unable to connect to websockets.\n\nResolution:\nCheck `socks` logs in AWS / Datadog to see why the endpoint is erroring.\n\n${local.monitor_suffix}" + tags = ["team:${var.team}", "env:${var.env_tag}"] + locations = [ + "aws:ap-east-1", + "aws:ap-northeast-1", + "aws:eu-central-1", + "aws:eu-west-1", + ] + + options_list { + monitor_name = "[${var.environment}] Indexer Socks is down" + monitor_options { + renotify_interval = 0 + } + retry { + count = local.retry_count + interval = local.retry_interval + } + tick_every = local.tick_frequency + } +} + +resource "datadog_synthetics_test" "api_http_synthetic_monitors" { + for_each = local.api_http_synthetic_monitor_configurations + + type = "api" + subtype = "http" + status =
"live" + + request_definition { + method = "GET" + url = each.value.url + } + + request_headers = { + Content-Type = "application/json" + } + + assertion { + operator = "lessThan" + type = "responseTime" + target = "3000" + } + + assertion { + operator = "is" + type = "statusCode" + target = "200" + } + + assertion { + operator = "validatesJSONPath" + type = "body" + targetjsonpath { + jsonpath = each.value.targetjsonpath.jsonpath + operator = each.value.targetjsonpath.operator + targetvalue = each.value.targetjsonpath.targetvalue + } + } + + name = each.value.name + message = each.value.message + tags = ["team:${var.team}", "env:${var.env_tag}"] + locations = [ + "aws:ap-east-1", + "aws:ap-northeast-1", + "aws:eu-central-1", + "aws:eu-west-1", + ] + + options_list { + monitor_name = each.value.monitor_name + monitor_options { + renotify_interval = 0 + } + retry { + count = local.retry_count + interval = local.retry_interval + } + tick_every = local.tick_frequency + } +} + +resource "datadog_synthetics_test" "comlink_trades" { + type = "api" + subtype = "multi" + status = "live" + + api_step { + name = "BTC trades" + subtype = "http" + + request_definition { + method = "GET" + url = "${local.https_url}/trades/perpetualMarket/BTC-USD" + } + + request_headers = { + Content-Type = "application/json" + } + + retry { + count = local.retry_count + interval = local.retry_interval + } + + assertion { + operator = "lessThan" + type = "responseTime" + target = "5000" + } + + assertion { + operator = "is" + type = "statusCode" + target = "200" + } + + assertion { + operator = "is" + property = "content-type" + type = "header" + target = "application/json; charset=utf-8" + } + + assertion { + type = "body" + operator = "validatesJSONPath" + targetjsonpath { + jsonpath = "trades.length" + operator = "moreThan" + targetvalue = "-1" + } + } + } + + api_step { + name = "ETH trades" + subtype = "http" + + request_definition { + method = "GET" + url =
"${local.https_url}/trades/perpetualMarket/ETH-USD" + } + + request_headers = { + Content-Type = "application/json" + } + + retry { + count = local.retry_count + interval = local.retry_interval + } + + assertion { + operator = "lessThan" + type = "responseTime" + target = "5000" + } + + assertion { + operator = "is" + type = "statusCode" + target = "200" + } + + assertion { + operator = "is" + property = "content-type" + type = "header" + target = "application/json; charset=utf-8" + } + + assertion { + type = "body" + operator = "validatesJSONPath" + targetjsonpath { + jsonpath = "trades.length" + operator = "moreThan" + targetvalue = "-1" + } + } + } + + name = "[${var.environment}] Indexer Comlink /trades endpoint" + message = "/trades endpoint on Comlink is down\n \n Impact:\nFE/API wil be degraded from lack of trades.\n\nResolution:\nCheck `comlink` logs in AWS / Datadog to see why the endpoint is erroring.\n\n${local.monitor_suffix}" + tags = ["team:${var.team}", "env:${var.env_tag}"] + locations = [ + "aws:ap-east-1", + "aws:ap-northeast-1", + "aws:eu-central-1", + "aws:eu-west-1", + ] + + options_list { + monitor_name = "[${var.environment}] Indexer Comlink /trades endpoint is down" + monitor_options { + renotify_interval = 0 + } + tick_every = local.tick_frequency + } +} + +resource "datadog_synthetics_test" "comlink_orderbook" { + type = "api" + subtype = "multi" + status = "live" + + api_step { + name = "BTC orderbook" + subtype = "http" + + request_definition { + method = "GET" + url = "${local.https_url}/orderbooks/perpetualMarket/BTC-USD" + } + + request_headers = { + Content-Type = "application/json" + } + + retry { + count = local.retry_count + interval = local.retry_interval + } + + assertion { + operator = "lessThan" + type = "responseTime" + target = "3000" + } + + assertion { + operator = "is" + type = "statusCode" + target = "200" + } + + assertion { + operator = "is" + property = "content-type" + type = "header" + target = "application/json;
charset=utf-8" + } + + assertion { + type = "body" + operator = "validatesJSONPath" + targetjsonpath { + jsonpath = "asks.length" + operator = "moreThan" + targetvalue = "-1" + } + } + + assertion { + type = "body" + operator = "validatesJSONPath" + targetjsonpath { + jsonpath = "bids.length" + operator = "moreThan" + targetvalue = "-1" + } + } + } + + api_step { + name = "ETH orderbook" + subtype = "http" + + request_definition { + method = "GET" + url = "${local.https_url}/orderbooks/perpetualMarket/ETH-USD" + } + + request_headers = { + Content-Type = "application/json" + } + + retry { + count = local.retry_count + interval = local.retry_interval + } + + assertion { + operator = "lessThan" + type = "responseTime" + target = "3000" + } + + assertion { + operator = "is" + type = "statusCode" + target = "200" + } + + assertion { + operator = "is" + property = "content-type" + type = "header" + target = "application/json; charset=utf-8" + } + + assertion { + type = "body" + operator = "validatesJSONPath" + targetjsonpath { + jsonpath = "asks.length" + operator = "moreThan" + targetvalue = "-1" + } + } + + assertion { + type = "body" + operator = "validatesJSONPath" + targetjsonpath { + jsonpath = "bids.length" + operator = "moreThan" + targetvalue = "-1" + } + } + } + + name = "[${var.environment}] Indexer Comlink /orderbook endpoint" + message = "/orderbooks endpoint is down.\n\nImpact:\nFE / API experience degraded as no orderbooks can be fetched.\n\nResolution:\nCheck `comlink` logs in AWS / Datadog to determine why the endpoint is erroring.\n\n${local.monitor_suffix}" + tags = ["team:${var.team}", "env:${var.env_tag}"] + locations = [ + "aws:ap-east-1", + "aws:ap-northeast-1", + "aws:eu-central-1", + "aws:eu-west-1", + ] + + options_list { + monitor_name = "[${var.environment}] Indexer Comlink /orderbook endpoint is down" + monitor_options { + renotify_interval = 0 + } + tick_every = local.tick_frequency + } +} diff --git a/modules/indexer_monitors/variables.tf 
b/modules/indexer_monitors/variables.tf new file mode 100644 index 00000000..2911d2c8 --- /dev/null +++ b/modules/indexer_monitors/variables.tf @@ -0,0 +1,54 @@ +variable "env_tag" { + type = string + description = "Env tag to add to all monitors" +} + +variable "environment" { + type = string + description = "Environment that all metrics for monitors reside in. All Indexer service metrics should use the env tag." +} + +variable "slack_channel" { + type = string + description = "Slack channel to publish all alerts to. If \"\", then no slack channel will be used. Should be prepended with @ such as '@dydx-alerts'" +} + +variable "pagerduty_tag" { + type = string + description = "PagerDuty tag to add to all monitors. If \"\", then no PagerDuty tag will be used. Should be prepended with @ such as '@pagerduty-indexer'" +} + +variable "secondary_pagerduty_tag" { + type = string + description = "PagerDuty tag to add to critical monitors. This will be in addition to pagerduty_tag above. Should be prepended with @ such as '@pagerduty-indexer'" +} + +variable "ecs_cluster_name" { + type = string + description = "ECS cluster name for the full node" +} + +variable "msk_cluster_name" { + type = string + description = "MSK cluster name" +} + +variable "team" { + type = string + description = "Team tag to add to all monitors" +} + +variable "url" { + type = string + description = "Indexer URL to monitor, should not include https:// or www. 
Should be something like `indexer.dydx.exchange`" +} + +variable "enable_precautionary_monitors" { + type = bool + description = "Whether to enable precautionary monitors" +} + +variable "aws_account_id" { + type = string + description = "Account ID for the AWS account" +} diff --git a/modules/indexer_monitors/versions.tf b/modules/indexer_monitors/versions.tf new file mode 100644 index 00000000..d41867e6 --- /dev/null +++ b/modules/indexer_monitors/versions.tf @@ -0,0 +1,10 @@ +terraform { + required_providers { + datadog = { + source = "DataDog/datadog" + version = "~> 3.29" + } + } + + required_version = "~> 1.3.2" +}