From 8a7cf16612144d8b0ddae4779b3901741039ef8a Mon Sep 17 00:00:00 2001
From: Matthieu Bourgain
Date: Fri, 15 Nov 2024 15:40:31 +0100
Subject: [PATCH 1/2] add snapshot monitors for elasticsearch

---
 database/elasticsearch/README.md            |  44 +++++---
 database/elasticsearch/inputs.tf            |  77 +++++++++++++
 .../elasticsearch/monitors-elasticsearch.tf | 105 ++++++++++++++----
 database/elasticsearch/outputs.tf           |  10 ++
 4 files changed, 201 insertions(+), 35 deletions(-)

diff --git a/database/elasticsearch/README.md b/database/elasticsearch/README.md
index bea1e44c..c8d8dccc 100644
--- a/database/elasticsearch/README.md
+++ b/database/elasticsearch/README.md
@@ -23,22 +23,24 @@ Creates DataDog monitors with the following checks:
 - Elasticsearch average search fetch latency
 - Elasticsearch average search query latency
 - Elasticsearch average Young-generation garbage collections latency
-- Elasticsearch change alert on the average time spent by tasks in the queue
-- Elasticsearch change alert on the number of currently active queries
-- Elasticsearch change alert on the number of query cache evictions
-- Elasticsearch change alert on the number of request cache evictions
-- Elasticsearch change alert on the number of search fetches currently running
-- Elasticsearch change alert on the total number of evictions from the fielddata cache
-- ElasticSearch Cluster has unassigned shards
-- ElasticSearch Cluster is initializing shards
-- ElasticSearch Cluster is relocating shards
-- ElasticSearch Cluster status not green
-- ElasticSearch does not respond
-- ElasticSearch free space < 10%
+- Elasticsearch change alert on the average time spent by tasks in the queue on {{cluster_name}}
+- Elasticsearch change alert on the number of currently active queries on {{cluster_name}}
+- Elasticsearch change alert on the number of query cache evictions on {{node_name}}
+- Elasticsearch change alert on the number of request cache evictions on {{node_name}}
+- Elasticsearch change alert on the number of search fetches currently running on {{cluster_name}}
+- Elasticsearch change alert on the total number of evictions from the fielddata cache on {{node_name}}
+- ElasticSearch Cluster has unassigned shards on {{cluster_name}}
+- ElasticSearch Cluster is initializing shards on {{cluster_name}}
+- ElasticSearch Cluster is relocating shards on {{cluster_name}}
+- ElasticSearch Cluster status not green on {{cluster_name}}
+- ElasticSearch does not respond on {{server}}:{{port}}
+- ElasticSearch free space < 10% on {{node_name}}
 - Elasticsearch JVM HEAP memory usage
 - Elasticsearch JVM memory Old usage
 - Elasticsearch JVM memory Young usage
-- Elasticsearch number of current open HTTP connections anomaly detected
+- Elasticsearch number of current open HTTP connections anomaly detected on {{node_name}}
+- Elasticsearch {{policy}} snapshot deletion failure on {{cluster_name}}
+- Elasticsearch {{policy}} snapshot failed on {{cluster_name}}
 
 ## Requirements
 
@@ -85,6 +87,8 @@ Creates DataDog monitors with the following checks:
 | [datadog_monitor.request_cache_evictions_change](https://registry.terraform.io/providers/DataDog/datadog/latest/docs/resources/monitor) | resource |
 | [datadog_monitor.search_query_change](https://registry.terraform.io/providers/DataDog/datadog/latest/docs/resources/monitor) | resource |
 | [datadog_monitor.search_query_latency](https://registry.terraform.io/providers/DataDog/datadog/latest/docs/resources/monitor) | resource |
+| [datadog_monitor.slm_snapshot_deletion_failures](https://registry.terraform.io/providers/DataDog/datadog/latest/docs/resources/monitor) | resource |
+| [datadog_monitor.slm_snapshots_failed](https://registry.terraform.io/providers/DataDog/datadog/latest/docs/resources/monitor) | resource |
 | [datadog_monitor.task_time_in_queue_change](https://registry.terraform.io/providers/DataDog/datadog/latest/docs/resources/monitor) | resource |
 
 ## Inputs
@@ -258,6 +262,18 @@ Creates DataDog monitors with the following checks:
 | [search\_query\_latency\_threshold\_warning](#input\_search\_query\_latency\_threshold\_warning) | Cluster Status warning threshold | `string` | `10` | no |
 | [search\_query\_latency\_time\_aggregator](#input\_search\_query\_latency\_time\_aggregator) | Time aggregator for the Cluster Status monitor | `string` | `"avg"` | no |
 | [search\_query\_latency\_timeframe](#input\_search\_query\_latency\_timeframe) | Timeframe for the Cluster Status monitor | `string` | `"last_15m"` | no |
+| [slm\_snapshot\_deletion\_failures\_enabled](#input\_slm\_snapshot\_deletion\_failures\_enabled) | Flag to enable SLM Snapshot deletion failures monitor | `string` | `"true"` | no |
+| [slm\_snapshot\_deletion\_failures\_extra\_tags](#input\_slm\_snapshot\_deletion\_failures\_extra\_tags) | Extra tags for SLM Snapshot deletion failures monitor | `list(string)` | `[]` | no |
+| [slm\_snapshot\_deletion\_failures\_message](#input\_slm\_snapshot\_deletion\_failures\_message) | Custom message for SLM Snapshot deletion failures monitor | `string` | `""` | no |
+| [slm\_snapshot\_deletion\_failures\_threshold\_critical](#input\_slm\_snapshot\_deletion\_failures\_threshold\_critical) | SLM Snapshot deletion failures critical threshold | `string` | `1` | no |
+| [slm\_snapshot\_deletion\_failures\_time\_aggregator](#input\_slm\_snapshot\_deletion\_failures\_time\_aggregator) | Time aggregator for SLM Snapshot deletion failures monitor | `string` | n/a | yes |
+| [slm\_snapshot\_deletion\_failures\_timeframe](#input\_slm\_snapshot\_deletion\_failures\_timeframe) | SLM Snapshot deletion failures timeframe | `string` | `"last_5m"` | no |
+| [slm\_snapshots\_failed\_enabled](#input\_slm\_snapshots\_failed\_enabled) | Flag to enable SLM Snapshots Failed monitor | `string` | `"true"` | no |
+| [slm\_snapshots\_failed\_extra\_tags](#input\_slm\_snapshots\_failed\_extra\_tags) | Extra tags for SLM Snapshots Failed monitor | `list(string)` | `[]` | no |
+| [slm\_snapshots\_failed\_message](#input\_slm\_snapshots\_failed\_message) | Custom message for SLM Snapshots Failed monitor | `string` | `""` | no |
+| [slm\_snapshots\_failed\_threshold\_critical](#input\_slm\_snapshots\_failed\_threshold\_critical) | SLM Snapshots Failed critical threshold | `string` | `1` | no |
+| [slm\_snapshots\_failed\_time\_aggregator](#input\_slm\_snapshots\_failed\_time\_aggregator) | Time aggregator for SLM Snapshots Failed monitor | `string` | n/a | yes |
+| [slm\_snapshots\_failed\_timeframe](#input\_slm\_snapshots\_failed\_timeframe) | SLM Snapshots Failed timeframe | `string` | `"last_5m"` | no |
 | [tags](#input\_tags) | Global variables | `list(string)` | <pre>[<br>  "type:database",<br>  "provider:elasticsearch",<br>  "resource:elasticsearch"<br>]</pre> | no |
 | [task\_time\_in\_queue\_change\_enabled](#input\_task\_time\_in\_queue\_change\_enabled) | Flag to enable Cluster Status monitor | `string` | `"true"` | no |
 | [task\_time\_in\_queue\_change\_extra\_tags](#input\_task\_time\_in\_queue\_change\_extra\_tags) | Extra tags for Cluster Status monitor | `list(string)` | `[]` | no |
@@ -295,6 +311,8 @@ Creates DataDog monitors with the following checks:
 | [request\_cache\_evictions\_change\_id](#output\_request\_cache\_evictions\_change\_id) | id for monitor request\_cache\_evictions\_change |
 | [search\_query\_change\_id](#output\_search\_query\_change\_id) | id for monitor search\_query\_change |
 | [search\_query\_latency\_id](#output\_search\_query\_latency\_id) | id for monitor search\_query\_latency |
+| [slm\_snapshot\_deletion\_failures\_id](#output\_slm\_snapshot\_deletion\_failures\_id) | id for monitor slm\_snapshot\_deletion\_failures |
+| [slm\_snapshots\_failed\_id](#output\_slm\_snapshots\_failed\_id) | id for monitor slm\_snapshots\_failed |
 | [task\_time\_in\_queue\_change\_id](#output\_task\_time\_in\_queue\_change\_id) | id for monitor task\_time\_in\_queue\_change |
 
 ## Related documentation
diff --git a/database/elasticsearch/inputs.tf b/database/elasticsearch/inputs.tf
index 5a0390b1..64a3dd71 100644
--- a/database/elasticsearch/inputs.tf
+++ b/database/elasticsearch/inputs.tf
@@ -1110,3 +1110,80 @@ variable "not_responding_extra_tags" {
   default = []
 }
 
+#
+# SLM - Snapshots Failed
+#
+
+variable "slm_snapshots_failed_enabled" {
+  description = "Flag to enable SLM Snapshots Failed monitor"
+  type        = string
+  default     = "true"
+}
+
+variable "slm_snapshots_failed_message" {
+  description = "Custom message for SLM Snapshots Failed monitor"
+  type        = string
+  default     = ""
+}
+
+variable "slm_snapshots_failed_time_aggregator" {
+  description = "Time aggregator for SLM Snapshots Failed monitor"
+  type        = string
+}
+
+variable "slm_snapshots_failed_threshold_critical" {
+  description = "SLM Snapshots Failed critical threshold"
+  type        = string
+  default     = 1
+}
+
+variable "slm_snapshots_failed_timeframe" {
+  description = "SLM Snapshots Failed timeframe"
+  type        = string
+  default     = "last_5m"
+}
+
+variable "slm_snapshots_failed_extra_tags" {
+  description = "Extra tags for SLM Snapshots Failed monitor"
+  type        = list(string)
+  default     = []
+}
+
+#
+# SLM - Snapshot deletion failures
+#
+
+variable "slm_snapshot_deletion_failures_enabled" {
+  description = "Flag to enable SLM Snapshot deletion failures monitor"
+  type        = string
+  default     = "true"
+}
+
+variable "slm_snapshot_deletion_failures_message" {
+  description = "Custom message for SLM Snapshot deletion failures monitor"
+  type        = string
+  default     = ""
+}
+
+variable "slm_snapshot_deletion_failures_time_aggregator" {
+  description = "Time aggregator for SLM Snapshot deletion failures monitor"
+  type        = string
+}
+
+variable "slm_snapshot_deletion_failures_threshold_critical" {
+  description = "SLM Snapshot deletion failures critical threshold"
+  type        = string
+  default     = 1
+}
+
+variable "slm_snapshot_deletion_failures_timeframe" {
+  description = "SLM Snapshot deletion failures timeframe"
+  type        = string
+  default     = "last_5m"
+}
+
+variable "slm_snapshot_deletion_failures_extra_tags" {
+  description = "Extra tags for SLM Snapshot deletion failures monitor"
+  type        = list(string)
+  default     = []
+}
diff --git a/database/elasticsearch/monitors-elasticsearch.tf b/database/elasticsearch/monitors-elasticsearch.tf
index ef2c6b13..f71fa325 100644
--- a/database/elasticsearch/monitors-elasticsearch.tf
+++ b/database/elasticsearch/monitors-elasticsearch.tf
@@ -3,7 +3,7 @@
 #
 resource "datadog_monitor" "not_responding" {
   count   = var.not_responding_enabled == "true" ? 1 : 0
-  name    = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] ElasticSearch does not respond"
+  name    = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] ElasticSearch does not respond on {{server}}:{{port}}"
   message = coalesce(var.not_responding_message, var.message)
   type    = "service check"
 
@@ -34,7 +34,7 @@ EOQ
 #
 resource "datadog_monitor" "cluster_status_not_green" {
   count   = var.cluster_status_not_green_enabled == "true" ? 1 : 0
-  name    = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] ElasticSearch Cluster status not green"
+  name    = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] ElasticSearch Cluster status not green on {{cluster_name}}"
   message = coalesce(var.cluster_status_not_green_message, var.message)
   type    = "metric alert"
 
@@ -66,7 +66,7 @@ EOQ
 #
 resource "datadog_monitor" "cluster_initializing_shards" {
   count   = var.cluster_initializing_shards_enabled == "true" ? 1 : 0
-  name    = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] ElasticSearch Cluster is initializing shards"
+  name    = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] ElasticSearch Cluster is initializing shards on {{cluster_name}}"
   message = coalesce(var.cluster_initializing_shards_message, var.message)
   type    = "metric alert"
 
@@ -97,7 +97,7 @@ EOQ
 #
 resource "datadog_monitor" "cluster_relocating_shards" {
   count   = var.cluster_relocating_shards_enabled == "true" ? 1 : 0
-  name    = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] ElasticSearch Cluster is relocating shards"
+  name    = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] ElasticSearch Cluster is relocating shards on {{cluster_name}}"
   message = coalesce(var.cluster_relocating_shards_message, var.message)
   type    = "metric alert"
 
@@ -128,7 +128,7 @@ EOQ
 #
 resource "datadog_monitor" "cluster_unassigned_shards" {
   count   = var.cluster_unassigned_shards_enabled == "true" ? 1 : 0
-  name    = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] ElasticSearch Cluster has unassigned shards"
+  name    = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] ElasticSearch Cluster has unassigned shards on {{cluster_name}}"
   message = coalesce(var.cluster_unassigned_shards_message, var.message)
   type    = "metric alert"
 
@@ -159,7 +159,7 @@ EOQ
 #
 resource "datadog_monitor" "node_free_space" {
   count   = var.node_free_space_enabled == "true" ? 1 : 0
-  name    = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] ElasticSearch free space < 10%"
+  name    = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] ElasticSearch free space < 10% on {{node_name}}"
   message = coalesce(var.node_free_space_message, var.message)
   type    = "query alert"
 
@@ -194,7 +194,7 @@ EOQ
 #
 resource "datadog_monitor" "jvm_heap_memory_usage" {
   count   = var.jvm_heap_memory_usage_enabled == "true" ? 1 : 0
-  name    = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Elasticsearch JVM HEAP memory usage {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
+  name    = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Elasticsearch JVM HEAP memory usage {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}} on {{node_name}}"
   message = coalesce(var.jvm_heap_memory_usage_message, var.message)
   type    = "query alert"
 
@@ -225,7 +225,7 @@ EOQ
 #
 resource "datadog_monitor" "jvm_memory_young_usage" {
   count   = var.jvm_memory_young_usage_enabled == "true" ? 1 : 0
-  name    = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Elasticsearch JVM memory Young usage {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
+  name    = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Elasticsearch JVM memory Young usage {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}} on {{node_name}}"
   message = coalesce(var.jvm_memory_young_usage_message, var.message)
   type    = "query alert"
 
@@ -256,7 +256,7 @@ EOQ
 #
 resource "datadog_monitor" "jvm_memory_old_usage" {
   count   = var.jvm_memory_old_usage_enabled == "true" ? 1 : 0
-  name    = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Elasticsearch JVM memory Old usage {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
+  name    = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Elasticsearch JVM memory Old usage {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}} on {{node_name}}"
   message = coalesce(var.jvm_memory_old_usage_message, var.message)
   type    = "query alert"
 
@@ -287,7 +287,7 @@ EOQ
 #
 resource "datadog_monitor" "jvm_gc_old_collection_latency" {
   count   = var.jvm_gc_old_collection_latency_enabled == "true" ? 1 : 0
-  name    = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Elasticsearch average Old-generation garbage collections latency {{#is_alert}}{{{comparator}}} {{threshold}}ms ({{value}}ms){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}ms ({{value}}ms){{/is_warning}}"
+  name    = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Elasticsearch average Old-generation garbage collections latency {{#is_alert}}{{{comparator}}} {{threshold}}ms ({{value}}ms){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}ms ({{value}}ms){{/is_warning}} on {{node_name}}"
   message = coalesce(var.jvm_gc_old_collection_latency_message, var.message)
   type    = "query alert"
 
@@ -321,7 +321,7 @@ EOQ
 #
 resource "datadog_monitor" "jvm_gc_young_collection_latency" {
   count   = var.jvm_gc_young_collection_latency_enabled == "true" ? 1 : 0
-  name    = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Elasticsearch average Young-generation garbage collections latency {{#is_alert}}{{{comparator}}} {{threshold}}ms ({{value}}ms){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}ms ({{value}}ms){{/is_warning}}"
+  name    = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Elasticsearch average Young-generation garbage collections latency {{#is_alert}}{{{comparator}}} {{threshold}}ms ({{value}}ms){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}ms ({{value}}ms){{/is_warning}} on {{node_name}}"
   message = coalesce(var.jvm_gc_young_collection_latency_message, var.message)
   type    = "query alert"
 
@@ -355,7 +355,7 @@ EOQ
 #
 resource "datadog_monitor" "indexing_latency" {
   count   = var.indexing_latency_enabled == "true" ? 1 : 0
-  name    = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Elasticsearch average indexing latency by document {{#is_alert}}{{{comparator}}} {{threshold}}ms ({{value}}ms){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}ms ({{value}}ms){{/is_warning}}"
+  name    = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Elasticsearch average indexing latency by document {{#is_alert}}{{{comparator}}} {{threshold}}ms ({{value}}ms){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}ms ({{value}}ms){{/is_warning}} on {{node_name}}"
   message = coalesce(var.indexing_latency_message, var.message)
   type    = "query alert"
 
@@ -390,7 +390,7 @@ EOQ
 #
 resource "datadog_monitor" "flush_latency" {
   count   = var.flush_latency_enabled == "true" ? 1 : 0
-  name    = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Elasticsearch average index flushing to disk latency {{#is_alert}}{{{comparator}}} {{threshold}}ms ({{value}}ms){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}ms ({{value}}ms){{/is_warning}}"
+  name    = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Elasticsearch average index flushing to disk latency {{#is_alert}}{{{comparator}}} {{threshold}}ms ({{value}}ms){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}ms ({{value}}ms){{/is_warning}} on {{node_name}}"
   message = coalesce(var.flush_latency_message, var.message)
   type    = "query alert"
 
@@ -425,7 +425,7 @@ EOQ
 #
 resource "datadog_monitor" "http_connections_anomaly" {
   count   = var.http_connections_anomaly_enabled == "true" ? 1 : 0
-  name    = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Elasticsearch number of current open HTTP connections anomaly detected"
+  name    = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Elasticsearch number of current open HTTP connections anomaly detected on {{node_name}}"
   message = coalesce(var.http_connections_anomaly_message, var.message)
   type    = "query alert"
 
@@ -469,7 +469,7 @@ EOQ
 #
 resource "datadog_monitor" "search_query_latency" {
   count   = var.search_query_latency_enabled == "true" ? 1 : 0
-  name    = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Elasticsearch average search query latency {{#is_alert}}{{{comparator}}} {{threshold}}ms ({{value}}ms){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}ms ({{value}}ms){{/is_warning}}"
+  name    = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Elasticsearch average search query latency {{#is_alert}}{{{comparator}}} {{threshold}}ms ({{value}}ms){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}ms ({{value}}ms){{/is_warning}} on {{node_name}}"
   message = coalesce(var.search_query_latency_message, var.message)
   type    = "query alert"
 
@@ -504,7 +504,7 @@ EOQ
 #
 resource "datadog_monitor" "fetch_latency" {
   count   = var.fetch_latency_enabled == "true" ? 1 : 0
-  name    = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Elasticsearch average search fetch latency {{#is_alert}}{{{comparator}}} {{threshold}}ms ({{value}}ms){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}ms ({{value}}ms){{/is_warning}}"
+  name    = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Elasticsearch average search fetch latency {{#is_alert}}{{{comparator}}} {{threshold}}ms ({{value}}ms){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}ms ({{value}}ms){{/is_warning}} on {{node_name}}"
   message = coalesce(var.fetch_latency_message, var.message)
   type    = "query alert"
 
@@ -539,7 +539,7 @@ EOQ
 #
 resource "datadog_monitor" "search_query_change" {
   count   = var.search_query_change_enabled == "true" ? 1 : 0
-  name    = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Elasticsearch change alert on the number of currently active queries"
+  name    = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Elasticsearch change alert on the number of currently active queries on {{cluster_name}}"
   message = coalesce(var.search_query_change_message, var.message)
   type    = "query alert"
 
@@ -570,7 +570,7 @@ EOQ
 #
 resource "datadog_monitor" "fetch_change" {
   count   = var.fetch_change_enabled == "true" ? 1 : 0
-  name    = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Elasticsearch change alert on the number of search fetches currently running"
+  name    = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Elasticsearch change alert on the number of search fetches currently running on {{cluster_name}}"
   message = coalesce(var.fetch_change_message, var.message)
   type    = "query alert"
 
@@ -601,7 +601,7 @@ EOQ
 #
 resource "datadog_monitor" "field_data_evictions_change" {
   count   = var.field_data_evictions_change_enabled == "true" ? 1 : 0
-  name    = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Elasticsearch change alert on the total number of evictions from the fielddata cache"
+  name    = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Elasticsearch change alert on the total number of evictions from the fielddata cache on {{node_name}}"
   message = coalesce(var.field_data_evictions_change_message, var.message)
   type    = "query alert"
 
@@ -633,7 +633,7 @@ EOQ
 #
 resource "datadog_monitor" "query_cache_evictions_change" {
   count   = var.query_cache_evictions_change_enabled == "true" ? 1 : 0
-  name    = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Elasticsearch change alert on the number of query cache evictions"
+  name    = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Elasticsearch change alert on the number of query cache evictions on {{node_name}}"
   message = coalesce(var.query_cache_evictions_change_message, var.message)
   type    = "query alert"
 
@@ -665,7 +665,7 @@ EOQ
 #
 resource "datadog_monitor" "request_cache_evictions_change" {
   count   = var.request_cache_evictions_change_enabled == "true" ? 1 : 0
-  name    = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Elasticsearch change alert on the number of request cache evictions"
+  name    = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Elasticsearch change alert on the number of request cache evictions on {{node_name}}"
"" : "[${var.prefix_slug}]"}[${var.environment}] Elasticsearch change alert on the number of request cache evictions on {{node_name}}" message = coalesce(var.request_cache_evictions_change_message, var.message) type = "query alert" @@ -697,7 +697,7 @@ EOQ # resource "datadog_monitor" "task_time_in_queue_change" { count = var.task_time_in_queue_change_enabled == "true" ? 1 : 0 - name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Elasticsearch change alert on the average time spent by tasks in the queue" + name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Elasticsearch change alert on the average time spent by tasks in the queue on {{cluster_name}}" message = coalesce(var.task_time_in_queue_change_message, var.message) type = "query alert" @@ -723,3 +723,64 @@ EOQ tags = concat(local.common_tags, var.tags, var.task_time_in_queue_change_extra_tags) } +# +# SLM - Snapshots Failed +# + +resource "datadog_monitor" "slm_snapshots_failed" { + count = var.slm_snapshots_failed_enabled == "true" ? 1 : 0 + name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Elasticsearch {{policy}} snapshot failed on {{cluster_name}}" + message = coalesce(var.slm_snapshots_failed_message, var.message) + type = "query alert" + + query = <= ${var.slm_snapshots_failed_threshold_critical} +EOQ + + monitor_thresholds { + critical = var.slm_snapshots_failed_threshold_critical + } + + evaluation_delay = var.evaluation_delay + new_host_delay = var.new_host_delay + new_group_delay = var.new_group_delay + notify_audit = false + include_tags = true + require_full_window = true + notify_no_data = false + + tags = concat(local.common_tags, var.tags, var.slm_snapshots_failed_extra_tags) +} + +# +# SLM - Snapshot deletion failure +# + +resource "datadog_monitor" "slm_snapshot_deletion_failures" { + count = var.slm_snapshot_deletion_failures_enabled == "true" ? 1 : 0 + name = "${var.prefix_slug == "" ? 
"" : "[${var.prefix_slug}]"}[${var.environment}] Elasticsearch {{policy}} snapshot deletion failure on {{cluster_name}}" + message = coalesce(var.slm_snapshot_deletion_failures_message, var.message) + type = "query alert" + + query = <= ${var.slm_snapshot_deletion_failures_threshold_critical} +EOQ + + monitor_thresholds { + critical = var.slm_snapshot_deletion_failures_threshold_critical + } + + evaluation_delay = var.evaluation_delay + new_host_delay = var.new_host_delay + new_group_delay = var.new_group_delay + notify_audit = false + include_tags = true + require_full_window = true + notify_no_data = false + + tags = concat(local.common_tags, var.tags, var.slm_snapshot_deletion_failures_extra_tags) +} diff --git a/database/elasticsearch/outputs.tf b/database/elasticsearch/outputs.tf index b70b9cbb..ab22e3cf 100644 --- a/database/elasticsearch/outputs.tf +++ b/database/elasticsearch/outputs.tf @@ -103,6 +103,16 @@ output "search_query_latency_id" { value = datadog_monitor.search_query_latency.*.id } +output "slm_snapshot_deletion_failures_id" { + description = "id for monitor slm_snapshot_deletion_failures" + value = datadog_monitor.slm_snapshot_deletion_failures.*.id +} + +output "slm_snapshots_failed_id" { + description = "id for monitor slm_snapshots_failed" + value = datadog_monitor.slm_snapshots_failed.*.id +} + output "task_time_in_queue_change_id" { description = "id for monitor task_time_in_queue_change" value = datadog_monitor.task_time_in_queue_change.*.id From 513a0220257814ad30e83e8ea9e0160f0a6ee5ae Mon Sep 17 00:00:00 2001 From: Matthieu Bourgain Date: Fri, 15 Nov 2024 15:47:19 +0100 Subject: [PATCH 2/2] fix missing default on variables --- database/elasticsearch/README.md | 4 ++-- database/elasticsearch/inputs.tf | 26 ++++++++++++++------------ 2 files changed, 16 insertions(+), 14 deletions(-) diff --git a/database/elasticsearch/README.md b/database/elasticsearch/README.md index c8d8dccc..cb0d29de 100644 --- a/database/elasticsearch/README.md +++ b/database/elasticsearch/README.md @@ -266,13 +266,13 @@ Creates DataDog monitors with the following checks: | [slm\_snapshot\_deletion\_failures\_extra\_tags](#input\_slm\_snapshot\_deletion\_failures\_extra\_tags) | Extra tags for SLM Snapshot deletion failures monitor | `list(string)` | `[]` | no | | [slm\_snapshot\_deletion\_failures\_message](#input\_slm\_snapshot\_deletion\_failures\_message) | Custom message for SLM Snapshot deletion failures monitor | `string` | `""` | no | | [slm\_snapshot\_deletion\_failures\_threshold\_critical](#input\_slm\_snapshot\_deletion\_failures\_threshold\_critical) | SLM Snapshot deletion failures critical threshold | `string` | `1` | no | -| [slm\_snapshot\_deletion\_failures\_time\_aggregator](#input\_slm\_snapshot\_deletion\_failures\_time\_aggregator) | Time aggregator for SLM Snapshot deletion failures monitor | `string` | n/a | yes | +| [slm\_snapshot\_deletion\_failures\_time\_aggregator](#input\_slm\_snapshot\_deletion\_failures\_time\_aggregator) | Time aggregator for SLM Snapshot deletion failures monitor | `string` | `"avg"` | no | | [slm\_snapshot\_deletion\_failures\_timeframe](#input\_slm\_snapshot\_deletion\_failures\_timeframe) | SLM Snapshot deletion failures timeframe | `string` | `"last_5m"` | no | | [slm\_snapshots\_failed\_enabled](#input\_slm\_snapshots\_failed\_enabled) | Flag to enable SLM Snapshots Failed monitor | `string` | `"true"` | no | | [slm\_snapshots\_failed\_extra\_tags](#input\_slm\_snapshots\_failed\_extra\_tags) | Extra tags for SLM Snapshots Failed monitor 
| `list(string)` | `[]` | no | | [slm\_snapshots\_failed\_message](#input\_slm\_snapshots\_failed\_message) | Custom message for SLM Snapshots Failed monitor | `string` | `""` | no | | [slm\_snapshots\_failed\_threshold\_critical](#input\_slm\_snapshots\_failed\_threshold\_critical) | SLM Snapshots Failed critical threshold | `string` | `1` | no | -| [slm\_snapshots\_failed\_time\_aggregator](#input\_slm\_snapshots\_failed\_time\_aggregator) | Time aggregator for SLM Snapshots Failed monitor | `string` | n/a | yes | +| [slm\_snapshots\_failed\_time\_aggregator](#input\_slm\_snapshots\_failed\_time\_aggregator) | Time aggregator for SLM Snapshots Failed monitor | `string` | `"avg"` | no | | [slm\_snapshots\_failed\_timeframe](#input\_slm\_snapshots\_failed\_timeframe) | SLM Snapshots Failed timeframe | `string` | `"last_5m"` | no | | [tags](#input\_tags) | Global variables | `list(string)` |
[
"type:database",
"provider:elasticsearch",
"resource:elasticsearch"
]
| no | | [task\_time\_in\_queue\_change\_enabled](#input\_task\_time\_in\_queue\_change\_enabled) | Flag to enable Cluster Status monitor | `string` | `"true"` | no | diff --git a/database/elasticsearch/inputs.tf b/database/elasticsearch/inputs.tf index 64a3dd71..8efb9000 100644 --- a/database/elasticsearch/inputs.tf +++ b/database/elasticsearch/inputs.tf @@ -1129,12 +1129,7 @@ variable "slm_snapshots_failed_message" { variable "slm_snapshots_failed_time_aggregator" { description = "Time aggregator for SLM Snapshots Failed monitor" type = string -} - -variable "slm_snapshots_failed_threshold_critical" { - description = "SLM Snapshots Failed critical threshold" - type = string - default = 1 + default = "avg" } variable "slm_snapshots_failed_timeframe" { @@ -1143,6 +1138,12 @@ variable "slm_snapshots_failed_timeframe" { default = "last_5m" } +variable "slm_snapshots_failed_threshold_critical" { + description = "SLM Snapshots Failed critical threshold" + type = string + default = 1 +} + variable "slm_snapshots_failed_extra_tags" { description = "Extra tags for SLM Snapshots Failed monitor" type = list(string) @@ -1168,12 +1169,7 @@ variable "slm_snapshot_deletion_failures_message" { variable "slm_snapshot_deletion_failures_time_aggregator" { description = "Time aggregator for SLM Snapshot deletion failures monitor" type = string -} - -variable "slm_snapshot_deletion_failures_threshold_critical" { - description = "SLM Snapshot deletion failures critical threshold" - type = string - default = 1 + default = "avg" } variable "slm_snapshot_deletion_failures_timeframe" { @@ -1182,6 +1178,12 @@ variable "slm_snapshot_deletion_failures_timeframe" { default = "last_5m" } +variable "slm_snapshot_deletion_failures_threshold_critical" { + description = "SLM Snapshot deletion failures critical threshold" + type = string + default = 1 +} + variable "slm_snapshot_deletion_failures_extra_tags" { description = "Extra tags for SLM Snapshot deletion failures monitor" type = list(string)
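For reviewers who want to exercise the new SLM snapshot monitors, the sketch below shows one way the inputs introduced by these two commits could be wired up. It is a hypothetical example and not part of the patch: the module `source` address, the `environment` and `message` values, and the extra tag are placeholder assumptions, and any other required inputs of the module are omitted for brevity.

```hcl
# Hypothetical usage sketch for the new slm_* inputs.
# The source path, environment, message and example tag are assumptions,
# not values taken from the patch series above.
module "datadog_elasticsearch_monitors" {
  source      = "./database/elasticsearch" # assumed local checkout of this module
  environment = "prod"                     # placeholder environment slug
  message     = "@slack-monitoring"        # placeholder notification target

  # SLM snapshot monitors added by PATCH 1/2 (values shown are the defaults).
  slm_snapshots_failed_enabled            = "true"
  slm_snapshots_failed_time_aggregator    = "avg"  # only a default after PATCH 2/2
  slm_snapshots_failed_timeframe          = "last_5m"
  slm_snapshots_failed_threshold_critical = 1
  slm_snapshots_failed_extra_tags         = ["team:storage"] # example extra tag

  slm_snapshot_deletion_failures_enabled            = "true"
  slm_snapshot_deletion_failures_time_aggregator    = "avg"  # only a default after PATCH 2/2
  slm_snapshot_deletion_failures_timeframe          = "last_5m"
  slm_snapshot_deletion_failures_threshold_critical = 1
}
```

With only the first commit applied, the two `*_time_aggregator` inputs have no default and must be set explicitly; the second commit gives them an `"avg"` default, so every `slm_*` line above can be dropped and the monitors fall back to `avg(last_5m)` with a critical threshold of `1`.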