From 8a7cf16612144d8b0ddae4779b3901741039ef8a Mon Sep 17 00:00:00 2001
From: Matthieu Bourgain
Date: Fri, 15 Nov 2024 15:40:31 +0100
Subject: [PATCH 1/2] add snapshot monitors for elasticsearch

---
 database/elasticsearch/README.md            |  44 +++++---
 database/elasticsearch/inputs.tf            |  77 +++++++++++++
 .../elasticsearch/monitors-elasticsearch.tf | 105 ++++++++++++++----
 database/elasticsearch/outputs.tf           |  10 ++
 4 files changed, 201 insertions(+), 35 deletions(-)

diff --git a/database/elasticsearch/README.md b/database/elasticsearch/README.md
index bea1e44c..c8d8dccc 100644
--- a/database/elasticsearch/README.md
+++ b/database/elasticsearch/README.md
@@ -23,22 +23,24 @@ Creates DataDog monitors with the following checks:
 - Elasticsearch average search fetch latency
 - Elasticsearch average search query latency
 - Elasticsearch average Young-generation garbage collections latency
-- Elasticsearch change alert on the average time spent by tasks in the queue
-- Elasticsearch change alert on the number of currently active queries
-- Elasticsearch change alert on the number of query cache evictions
-- Elasticsearch change alert on the number of request cache evictions
-- Elasticsearch change alert on the number of search fetches currently running
-- Elasticsearch change alert on the total number of evictions from the fielddata cache
-- ElasticSearch Cluster has unassigned shards
-- ElasticSearch Cluster is initializing shards
-- ElasticSearch Cluster is relocating shards
-- ElasticSearch Cluster status not green
-- ElasticSearch does not respond
-- ElasticSearch free space < 10%
+- Elasticsearch change alert on the average time spent by tasks in the queue on {{cluster_name}}
+- Elasticsearch change alert on the number of currently active queries on {{cluster_name}}
+- Elasticsearch change alert on the number of query cache evictions on {{node_name}}
+- Elasticsearch change alert on the number of request cache evictions on {{node_name}}
+- Elasticsearch change alert on the number of search fetches currently running on {{cluster_name}}
+- Elasticsearch change alert on the total number of evictions from the fielddata cache on {{node_name}}
+- ElasticSearch Cluster has unassigned shards on {{cluster_name}}
+- ElasticSearch Cluster is initializing shards on {{cluster_name}}
+- ElasticSearch Cluster is relocating shards on {{cluster_name}}
+- ElasticSearch Cluster status not green on {{cluster_name}}
+- ElasticSearch does not respond on {{server}}:{{port}}
+- ElasticSearch free space < 10% on {{node_name}}
 - Elasticsearch JVM HEAP memory usage
 - Elasticsearch JVM memory Old usage
 - Elasticsearch JVM memory Young usage
-- Elasticsearch number of current open HTTP connections anomaly detected
+- Elasticsearch number of current open HTTP connections anomaly detected on {{node_name}}
+- Elasticsearch {{policy}} snapshot deletion failure on {{cluster_name}}
+- Elasticsearch {{policy}} snapshot failed on {{cluster_name}}
 
 ## Requirements
 
@@ -85,6 +87,8 @@ Creates DataDog monitors with the following checks:
 | [datadog_monitor.request_cache_evictions_change](https://registry.terraform.io/providers/DataDog/datadog/latest/docs/resources/monitor) | resource |
 | [datadog_monitor.search_query_change](https://registry.terraform.io/providers/DataDog/datadog/latest/docs/resources/monitor) | resource |
 | [datadog_monitor.search_query_latency](https://registry.terraform.io/providers/DataDog/datadog/latest/docs/resources/monitor) | resource |
+| [datadog_monitor.slm_snapshot_deletion_failures](https://registry.terraform.io/providers/DataDog/datadog/latest/docs/resources/monitor) | resource |
+| [datadog_monitor.slm_snapshots_failed](https://registry.terraform.io/providers/DataDog/datadog/latest/docs/resources/monitor) | resource |
 | [datadog_monitor.task_time_in_queue_change](https://registry.terraform.io/providers/DataDog/datadog/latest/docs/resources/monitor) | resource |
 
 ## Inputs
@@ -258,6 +262,18 @@ Creates DataDog monitors with the following checks:
 | [search\_query\_latency\_threshold\_warning](#input\_search\_query\_latency\_threshold\_warning) | Cluster Status warning threshold | `string` | `10` | no |
 | [search\_query\_latency\_time\_aggregator](#input\_search\_query\_latency\_time\_aggregator) | Time aggregator for the Cluster Status monitor | `string` | `"avg"` | no |
 | [search\_query\_latency\_timeframe](#input\_search\_query\_latency\_timeframe) | Timeframe for the Cluster Status monitor | `string` | `"last_15m"` | no |
+| [slm\_snapshot\_deletion\_failures\_enabled](#input\_slm\_snapshot\_deletion\_failures\_enabled) | Flag to enable SLM Snapshot deletion failures monitor | `string` | `"true"` | no |
+| [slm\_snapshot\_deletion\_failures\_extra\_tags](#input\_slm\_snapshot\_deletion\_failures\_extra\_tags) | Extra tags for SLM Snapshot deletion failures monitor | `list(string)` | `[]` | no |
+| [slm\_snapshot\_deletion\_failures\_message](#input\_slm\_snapshot\_deletion\_failures\_message) | Custom message for SLM Snapshot deletion failures monitor | `string` | `""` | no |
+| [slm\_snapshot\_deletion\_failures\_threshold\_critical](#input\_slm\_snapshot\_deletion\_failures\_threshold\_critical) | SLM Snapshot deletion failures critical threshold | `string` | `1` | no |
+| [slm\_snapshot\_deletion\_failures\_time\_aggregator](#input\_slm\_snapshot\_deletion\_failures\_time\_aggregator) | Time aggregator for SLM Snapshot deletion failures monitor | `string` | n/a | yes |
+| [slm\_snapshot\_deletion\_failures\_timeframe](#input\_slm\_snapshot\_deletion\_failures\_timeframe) | SLM Snapshot deletion failures timeframe | `string` | `"last_5m"` | no |
+| [slm\_snapshots\_failed\_enabled](#input\_slm\_snapshots\_failed\_enabled) | Flag to enable SLM Snapshots Failed monitor | `string` | `"true"` | no |
+| [slm\_snapshots\_failed\_extra\_tags](#input\_slm\_snapshots\_failed\_extra\_tags) | Extra tags for SLM Snapshots Failed monitor | `list(string)` | `[]` | no |
+| [slm\_snapshots\_failed\_message](#input\_slm\_snapshots\_failed\_message) | Custom message for SLM Snapshots Failed monitor | `string` | `""` | no |
+| [slm\_snapshots\_failed\_threshold\_critical](#input\_slm\_snapshots\_failed\_threshold\_critical) | SLM Snapshots Failed critical threshold | `string` | `1` | no |
+| [slm\_snapshots\_failed\_time\_aggregator](#input\_slm\_snapshots\_failed\_time\_aggregator) | Time aggregator for SLM Snapshots Failed monitor | `string` | n/a | yes |
+| [slm\_snapshots\_failed\_timeframe](#input\_slm\_snapshots\_failed\_timeframe) | SLM Snapshots Failed timeframe | `string` | `"last_5m"` | no |
 | [tags](#input\_tags) | Global variables | `list(string)` | <pre>[<br>  "type:database",<br>  "provider:elasticsearch",<br>  "resource:elasticsearch"<br>]</pre> | no |
 | [task\_time\_in\_queue\_change\_enabled](#input\_task\_time\_in\_queue\_change\_enabled) | Flag to enable Cluster Status monitor | `string` | `"true"` | no |
 | [task\_time\_in\_queue\_change\_extra\_tags](#input\_task\_time\_in\_queue\_change\_extra\_tags) | Extra tags for Cluster Status monitor | `list(string)` | `[]` | no |
@@ -295,6 +311,8 @@ Creates DataDog monitors with the following checks:
 | [request\_cache\_evictions\_change\_id](#output\_request\_cache\_evictions\_change\_id) | id for monitor request\_cache\_evictions\_change |
 | [search\_query\_change\_id](#output\_search\_query\_change\_id) | id for monitor search\_query\_change |
 | [search\_query\_latency\_id](#output\_search\_query\_latency\_id) | id for monitor search\_query\_latency |
+| [slm\_snapshot\_deletion\_failures\_id](#output\_slm\_snapshot\_deletion\_failures\_id) | id for monitor slm\_snapshot\_deletion\_failures |
+| [slm\_snapshots\_failed\_id](#output\_slm\_snapshots\_failed\_id) | id for monitor slm\_snapshots\_failed |
 | [task\_time\_in\_queue\_change\_id](#output\_task\_time\_in\_queue\_change\_id) | id for monitor task\_time\_in\_queue\_change |
 
 ## Related documentation
diff --git a/database/elasticsearch/inputs.tf b/database/elasticsearch/inputs.tf
index 5a0390b1..64a3dd71 100644
--- a/database/elasticsearch/inputs.tf
+++ b/database/elasticsearch/inputs.tf
@@ -1110,3 +1110,80 @@ variable "not_responding_extra_tags" {
   default = []
 }
 
+#
+# SLM - Snapshots Failed
+#
+
+variable "slm_snapshots_failed_enabled" {
+  description = "Flag to enable SLM Snapshots Failed monitor"
+  type        = string
+  default     = "true"
+}
+
+variable "slm_snapshots_failed_message" {
+  description = "Custom message for SLM Snapshots Failed monitor"
+  type        = string
+  default     = ""
+}
+
+variable "slm_snapshots_failed_time_aggregator" {
+  description = "Time aggregator for SLM Snapshots Failed monitor"
+  type        = string
+}
+
+variable "slm_snapshots_failed_threshold_critical" {
+  description = "SLM Snapshots Failed critical threshold"
+  type        = string
+  default     = 1
+}
+
+variable "slm_snapshots_failed_timeframe" {
+  description = "SLM Snapshots Failed timeframe"
+  type        = string
+  default     = "last_5m"
+}
+
+variable "slm_snapshots_failed_extra_tags" {
+  description = "Extra tags for SLM Snapshots Failed monitor"
+  type        = list(string)
+  default     = []
+}
+
+#
+# SLM - Snapshot deletion failures
+#
+
+variable "slm_snapshot_deletion_failures_enabled" {
+  description = "Flag to enable SLM Snapshot deletion failures monitor"
+  type        = string
+  default     = "true"
+}
+
+variable "slm_snapshot_deletion_failures_message" {
+  description = "Custom message for SLM Snapshot deletion failures monitor"
+  type        = string
+  default     = ""
+}
+
+variable "slm_snapshot_deletion_failures_time_aggregator" {
+  description = "Time aggregator for SLM Snapshot deletion failures monitor"
+  type        = string
+}
+
+variable "slm_snapshot_deletion_failures_threshold_critical" {
+  description = "SLM Snapshot deletion failures critical threshold"
+  type        = string
+  default     = 1
+}
+
+variable "slm_snapshot_deletion_failures_timeframe" {
+  description = "SLM Snapshot deletion failures timeframe"
+  type        = string
+  default     = "last_5m"
+}
+
+variable "slm_snapshot_deletion_failures_extra_tags" {
+  description = "Extra tags for SLM Snapshot deletion failures monitor"
+  type        = list(string)
+  default     = []
+}
diff --git a/database/elasticsearch/monitors-elasticsearch.tf b/database/elasticsearch/monitors-elasticsearch.tf
index ef2c6b13..f71fa325 100644
--- a/database/elasticsearch/monitors-elasticsearch.tf
+++ b/database/elasticsearch/monitors-elasticsearch.tf
@@ -3,7 +3,7 @@
 #
 resource "datadog_monitor" "not_responding" {
   count   = var.not_responding_enabled == "true" ? 1 : 0
-  name    = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] ElasticSearch does not respond"
+  name    = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] ElasticSearch does not respond on {{server}}:{{port}}"
   message = coalesce(var.not_responding_message, var.message)
   type    = "service check"
 
@@ -34,7 +34,7 @@ EOQ
 #
 resource "datadog_monitor" "cluster_status_not_green" {
   count   = var.cluster_status_not_green_enabled == "true" ? 1 : 0
-  name    = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] ElasticSearch Cluster status not green"
+  name    = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] ElasticSearch Cluster status not green on {{cluster_name}}"
   message = coalesce(var.cluster_status_not_green_message, var.message)
   type    = "metric alert"
 
@@ -66,7 +66,7 @@ EOQ
 #
 resource "datadog_monitor" "cluster_initializing_shards" {
   count   = var.cluster_initializing_shards_enabled == "true" ? 1 : 0
-  name    = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] ElasticSearch Cluster is initializing shards"
+  name    = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] ElasticSearch Cluster is initializing shards on {{cluster_name}}"
   message = coalesce(var.cluster_initializing_shards_message, var.message)
   type    = "metric alert"
 
@@ -97,7 +97,7 @@ EOQ
 #
 resource "datadog_monitor" "cluster_relocating_shards" {
   count   = var.cluster_relocating_shards_enabled == "true" ? 1 : 0
-  name    = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] ElasticSearch Cluster is relocating shards"
+  name    = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] ElasticSearch Cluster is relocating shards on {{cluster_name}}"
   message = coalesce(var.cluster_relocating_shards_message, var.message)
   type    = "metric alert"
 
@@ -128,7 +128,7 @@ EOQ
 #
 resource "datadog_monitor" "cluster_unassigned_shards" {
   count   = var.cluster_unassigned_shards_enabled == "true" ? 1 : 0
-  name    = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] ElasticSearch Cluster has unassigned shards"
+  name    = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] ElasticSearch Cluster has unassigned shards on {{cluster_name}}"
   message = coalesce(var.cluster_unassigned_shards_message, var.message)
   type    = "metric alert"
 
@@ -159,7 +159,7 @@ EOQ
 #
 resource "datadog_monitor" "node_free_space" {
   count   = var.node_free_space_enabled == "true" ? 1 : 0
-  name    = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] ElasticSearch free space < 10%"
+  name    = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] ElasticSearch free space < 10% on {{node_name}}"
   message = coalesce(var.node_free_space_message, var.message)
   type    = "query alert"
 
@@ -194,7 +194,7 @@ EOQ
 #
 resource "datadog_monitor" "jvm_heap_memory_usage" {
   count   = var.jvm_heap_memory_usage_enabled == "true" ? 1 : 0
-  name    = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Elasticsearch JVM HEAP memory usage {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
+  name    = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Elasticsearch JVM HEAP memory usage {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}} on {{node_name}}"
   message = coalesce(var.jvm_heap_memory_usage_message, var.message)
   type    = "query alert"
 
@@ -225,7 +225,7 @@ EOQ
 #
 resource "datadog_monitor" "jvm_memory_young_usage" {
   count   = var.jvm_memory_young_usage_enabled == "true" ? 1 : 0
-  name    = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Elasticsearch JVM memory Young usage {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
+  name    = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Elasticsearch JVM memory Young usage {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}} on {{node_name}}"
   message = coalesce(var.jvm_memory_young_usage_message, var.message)
   type    = "query alert"
 
@@ -256,7 +256,7 @@ EOQ
 #
 resource "datadog_monitor" "jvm_memory_old_usage" {
   count   = var.jvm_memory_old_usage_enabled == "true" ? 1 : 0
-  name    = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Elasticsearch JVM memory Old usage {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
+  name    = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Elasticsearch JVM memory Old usage {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}} on {{node_name}}"
   message = coalesce(var.jvm_memory_old_usage_message, var.message)
   type    = "query alert"
 
@@ -287,7 +287,7 @@ EOQ
 #
 resource "datadog_monitor" "jvm_gc_old_collection_latency" {
   count   = var.jvm_gc_old_collection_latency_enabled == "true" ? 1 : 0
-  name    = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Elasticsearch average Old-generation garbage collections latency {{#is_alert}}{{{comparator}}} {{threshold}}ms ({{value}}ms){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}ms ({{value}}ms){{/is_warning}}"
+  name    = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Elasticsearch average Old-generation garbage collections latency {{#is_alert}}{{{comparator}}} {{threshold}}ms ({{value}}ms){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}ms ({{value}}ms){{/is_warning}} on {{node_name}}"
   message = coalesce(var.jvm_gc_old_collection_latency_message, var.message)
   type    = "query alert"
 
@@ -321,7 +321,7 @@ EOQ
 #
 resource "datadog_monitor" "jvm_gc_young_collection_latency" {
   count   = var.jvm_gc_young_collection_latency_enabled == "true" ? 1 : 0
-  name    = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Elasticsearch average Young-generation garbage collections latency {{#is_alert}}{{{comparator}}} {{threshold}}ms ({{value}}ms){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}ms ({{value}}ms){{/is_warning}}"
+  name    = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Elasticsearch average Young-generation garbage collections latency {{#is_alert}}{{{comparator}}} {{threshold}}ms ({{value}}ms){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}ms ({{value}}ms){{/is_warning}} on {{node_name}}"
   message = coalesce(var.jvm_gc_young_collection_latency_message, var.message)
   type    = "query alert"
 
@@ -355,7 +355,7 @@ EOQ
 #
 resource "datadog_monitor" "indexing_latency" {
   count   = var.indexing_latency_enabled == "true" ? 1 : 0
-  name    = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Elasticsearch average indexing latency by document {{#is_alert}}{{{comparator}}} {{threshold}}ms ({{value}}ms){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}ms ({{value}}ms){{/is_warning}}"
+  name    = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Elasticsearch average indexing latency by document {{#is_alert}}{{{comparator}}} {{threshold}}ms ({{value}}ms){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}ms ({{value}}ms){{/is_warning}} on {{node_name}}"
   message = coalesce(var.indexing_latency_message, var.message)
   type    = "query alert"
 
@@ -390,7 +390,7 @@ EOQ
 #
 resource "datadog_monitor" "flush_latency" {
   count   = var.flush_latency_enabled == "true" ? 1 : 0
-  name    = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Elasticsearch average index flushing to disk latency {{#is_alert}}{{{comparator}}} {{threshold}}ms ({{value}}ms){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}ms ({{value}}ms){{/is_warning}}"
+  name    = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Elasticsearch average index flushing to disk latency {{#is_alert}}{{{comparator}}} {{threshold}}ms ({{value}}ms){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}ms ({{value}}ms){{/is_warning}} on {{node_name}}"
   message = coalesce(var.flush_latency_message, var.message)
   type    = "query alert"
 
@@ -425,7 +425,7 @@ EOQ
 #
 resource "datadog_monitor" "http_connections_anomaly" {
   count   = var.http_connections_anomaly_enabled == "true" ? 1 : 0
-  name    = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Elasticsearch number of current open HTTP connections anomaly detected"
+  name    = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Elasticsearch number of current open HTTP connections anomaly detected on {{node_name}}"
   message = coalesce(var.http_connections_anomaly_message, var.message)
   type    = "query alert"
 
@@ -469,7 +469,7 @@ EOQ
 #
 resource "datadog_monitor" "search_query_latency" {
   count   = var.search_query_latency_enabled == "true" ? 1 : 0
-  name    = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Elasticsearch average search query latency {{#is_alert}}{{{comparator}}} {{threshold}}ms ({{value}}ms){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}ms ({{value}}ms){{/is_warning}}"
+  name    = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Elasticsearch average search query latency {{#is_alert}}{{{comparator}}} {{threshold}}ms ({{value}}ms){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}ms ({{value}}ms){{/is_warning}} on {{node_name}}"
   message = coalesce(var.search_query_latency_message, var.message)
   type    = "query alert"
 
@@ -504,7 +504,7 @@ EOQ
 #
 resource "datadog_monitor" "fetch_latency" {
   count   = var.fetch_latency_enabled == "true" ? 1 : 0
-  name    = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Elasticsearch average search fetch latency {{#is_alert}}{{{comparator}}} {{threshold}}ms ({{value}}ms){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}ms ({{value}}ms){{/is_warning}}"
+  name    = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Elasticsearch average search fetch latency {{#is_alert}}{{{comparator}}} {{threshold}}ms ({{value}}ms){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}ms ({{value}}ms){{/is_warning}} on {{node_name}}"
   message = coalesce(var.fetch_latency_message, var.message)
   type    = "query alert"
 
@@ -539,7 +539,7 @@ EOQ
 #
 resource "datadog_monitor" "search_query_change" {
   count   = var.search_query_change_enabled == "true" ? 1 : 0
-  name    = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Elasticsearch change alert on the number of currently active queries"
+  name    = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Elasticsearch change alert on the number of currently active queries on {{cluster_name}}"
   message = coalesce(var.search_query_change_message, var.message)
   type    = "query alert"
 
@@ -570,7 +570,7 @@ EOQ
 #
 resource "datadog_monitor" "fetch_change" {
   count   = var.fetch_change_enabled == "true" ? 1 : 0
-  name    = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Elasticsearch change alert on the number of search fetches currently running"
+  name    = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Elasticsearch change alert on the number of search fetches currently running on {{cluster_name}}"
   message = coalesce(var.fetch_change_message, var.message)
   type    = "query alert"
 
@@ -601,7 +601,7 @@ EOQ
 #
 resource "datadog_monitor" "field_data_evictions_change" {
   count   = var.field_data_evictions_change_enabled == "true" ? 1 : 0
-  name    = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Elasticsearch change alert on the total number of evictions from the fielddata cache"
+  name    = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Elasticsearch change alert on the total number of evictions from the fielddata cache on {{node_name}}"
   message = coalesce(var.field_data_evictions_change_message, var.message)
   type    = "query alert"
 
@@ -633,7 +633,7 @@ EOQ
 #
 resource "datadog_monitor" "query_cache_evictions_change" {
   count   = var.query_cache_evictions_change_enabled == "true" ? 1 : 0
-  name    = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Elasticsearch change alert on the number of query cache evictions"
+  name    = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Elasticsearch change alert on the number of query cache evictions on {{node_name}}"
   message = coalesce(var.query_cache_evictions_change_message, var.message)
   type    = "query alert"
 
@@ -665,7 +665,7 @@ EOQ
 #
 resource "datadog_monitor" "request_cache_evictions_change" {
   count   = var.request_cache_evictions_change_enabled == "true" ? 1 : 0
-  name    = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Elasticsearch change alert on the number of request cache evictions"
+  name    = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Elasticsearch change alert on the number of request cache evictions on {{node_name}}"
"" : "[${var.prefix_slug}]"}[${var.environment}] Elasticsearch change alert on the number of request cache evictions on {{node_name}}" message = coalesce(var.request_cache_evictions_change_message, var.message) type = "query alert" @@ -697,7 +697,7 @@ EOQ # resource "datadog_monitor" "task_time_in_queue_change" { count = var.task_time_in_queue_change_enabled == "true" ? 1 : 0 - name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Elasticsearch change alert on the average time spent by tasks in the queue" + name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Elasticsearch change alert on the average time spent by tasks in the queue on {{cluster_name}}" message = coalesce(var.task_time_in_queue_change_message, var.message) type = "query alert" @@ -723,3 +723,64 @@ EOQ tags = concat(local.common_tags, var.tags, var.task_time_in_queue_change_extra_tags) } +# +# SLM - Snapshots Failed +# + +resource "datadog_monitor" "slm_snapshots_failed" { + count = var.slm_snapshots_failed_enabled == "true" ? 1 : 0 + name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Elasticsearch {{policy}} snapshot failed on {{cluster_name}}" + message = coalesce(var.slm_snapshots_failed_message, var.message) + type = "query alert" + + query = <= ${var.slm_snapshots_failed_threshold_critical} +EOQ + + monitor_thresholds { + critical = var.slm_snapshots_failed_threshold_critical + } + + evaluation_delay = var.evaluation_delay + new_host_delay = var.new_host_delay + new_group_delay = var.new_group_delay + notify_audit = false + include_tags = true + require_full_window = true + notify_no_data = false + + tags = concat(local.common_tags, var.tags, var.slm_snapshots_failed_extra_tags) +} + +# +# SLM - Snapshot deletion failure +# + +resource "datadog_monitor" "slm_snapshot_deletion_failures" { + count = var.slm_snapshot_deletion_failures_enabled == "true" ? 1 : 0 + name = "${var.prefix_slug == "" ? 
"" : "[${var.prefix_slug}]"}[${var.environment}] Elasticsearch {{policy}} snapshot deletion failure on {{cluster_name}}" + message = coalesce(var.slm_snapshot_deletion_failures_message, var.message) + type = "query alert" + + query = <= ${var.slm_snapshot_deletion_failures_threshold_critical} +EOQ + + monitor_thresholds { + critical = var.slm_snapshot_deletion_failures_threshold_critical + } + + evaluation_delay = var.evaluation_delay + new_host_delay = var.new_host_delay + new_group_delay = var.new_group_delay + notify_audit = false + include_tags = true + require_full_window = true + notify_no_data = false + + tags = concat(local.common_tags, var.tags, var.slm_snapshot_deletion_failures_extra_tags) +} diff --git a/database/elasticsearch/outputs.tf b/database/elasticsearch/outputs.tf index b70b9cbb..ab22e3cf 100644 --- a/database/elasticsearch/outputs.tf +++ b/database/elasticsearch/outputs.tf @@ -103,6 +103,16 @@ output "search_query_latency_id" { value = datadog_monitor.search_query_latency.*.id } +output "slm_snapshot_deletion_failures_id" { + description = "id for monitor slm_snapshot_deletion_failures" + value = datadog_monitor.slm_snapshot_deletion_failures.*.id +} + +output "slm_snapshots_failed_id" { + description = "id for monitor slm_snapshots_failed" + value = datadog_monitor.slm_snapshots_failed.*.id +} + output "task_time_in_queue_change_id" { description = "id for monitor task_time_in_queue_change" value = datadog_monitor.task_time_in_queue_change.*.id From 513a0220257814ad30e83e8ea9e0160f0a6ee5ae Mon Sep 17 00:00:00 2001 From: Matthieu Bourgain Date: Fri, 15 Nov 2024 15:47:19 +0100 Subject: [PATCH 2/2] fix missing default on variables --- database/elasticsearch/README.md | 4 ++-- database/elasticsearch/inputs.tf | 26 ++++++++++++++------------ 2 files changed, 16 insertions(+), 14 deletions(-) diff --git a/database/elasticsearch/README.md b/database/elasticsearch/README.md index c8d8dccc..cb0d29de 100644 --- a/database/elasticsearch/README.md +++ b/database/elasticsearch/README.md @@ -266,13 +266,13 @@ Creates DataDog monitors with the following checks: | [slm\_snapshot\_deletion\_failures\_extra\_tags](#input\_slm\_snapshot\_deletion\_failures\_extra\_tags) | Extra tags for SLM Snapshot deletion failures monitor | `list(string)` | `[]` | no | | [slm\_snapshot\_deletion\_failures\_message](#input\_slm\_snapshot\_deletion\_failures\_message) | Custom message for SLM Snapshot deletion failures monitor | `string` | `""` | no | | [slm\_snapshot\_deletion\_failures\_threshold\_critical](#input\_slm\_snapshot\_deletion\_failures\_threshold\_critical) | SLM Snapshot deletion failures critical threshold | `string` | `1` | no | -| [slm\_snapshot\_deletion\_failures\_time\_aggregator](#input\_slm\_snapshot\_deletion\_failures\_time\_aggregator) | Time aggregator for SLM Snapshot deletion failures monitor | `string` | n/a | yes | +| [slm\_snapshot\_deletion\_failures\_time\_aggregator](#input\_slm\_snapshot\_deletion\_failures\_time\_aggregator) | Time aggregator for SLM Snapshot deletion failures monitor | `string` | `"avg"` | no | | [slm\_snapshot\_deletion\_failures\_timeframe](#input\_slm\_snapshot\_deletion\_failures\_timeframe) | SLM Snapshot deletion failures timeframe | `string` | `"last_5m"` | no | | [slm\_snapshots\_failed\_enabled](#input\_slm\_snapshots\_failed\_enabled) | Flag to enable SLM Snapshots Failed monitor | `string` | `"true"` | no | | [slm\_snapshots\_failed\_extra\_tags](#input\_slm\_snapshots\_failed\_extra\_tags) | Extra tags for SLM Snapshots Failed monitor 
| `list(string)` | `[]` | no | | [slm\_snapshots\_failed\_message](#input\_slm\_snapshots\_failed\_message) | Custom message for SLM Snapshots Failed monitor | `string` | `""` | no | | [slm\_snapshots\_failed\_threshold\_critical](#input\_slm\_snapshots\_failed\_threshold\_critical) | SLM Snapshots Failed critical threshold | `string` | `1` | no | -| [slm\_snapshots\_failed\_time\_aggregator](#input\_slm\_snapshots\_failed\_time\_aggregator) | Time aggregator for SLM Snapshots Failed monitor | `string` | n/a | yes | +| [slm\_snapshots\_failed\_time\_aggregator](#input\_slm\_snapshots\_failed\_time\_aggregator) | Time aggregator for SLM Snapshots Failed monitor | `string` | `"avg"` | no | | [slm\_snapshots\_failed\_timeframe](#input\_slm\_snapshots\_failed\_timeframe) | SLM Snapshots Failed timeframe | `string` | `"last_5m"` | no | | [tags](#input\_tags) | Global variables | `list(string)` |
[
"type:database",
"provider:elasticsearch",
"resource:elasticsearch"
]
| no | | [task\_time\_in\_queue\_change\_enabled](#input\_task\_time\_in\_queue\_change\_enabled) | Flag to enable Cluster Status monitor | `string` | `"true"` | no | diff --git a/database/elasticsearch/inputs.tf b/database/elasticsearch/inputs.tf index 64a3dd71..8efb9000 100644 --- a/database/elasticsearch/inputs.tf +++ b/database/elasticsearch/inputs.tf @@ -1129,12 +1129,7 @@ variable "slm_snapshots_failed_message" { variable "slm_snapshots_failed_time_aggregator" { description = "Time aggregator for SLM Snapshots Failed monitor" type = string -} - -variable "slm_snapshots_failed_threshold_critical" { - description = "SLM Snapshots Failed critical threshold" - type = string - default = 1 + default = "avg" } variable "slm_snapshots_failed_timeframe" { @@ -1143,6 +1138,12 @@ variable "slm_snapshots_failed_timeframe" { default = "last_5m" } +variable "slm_snapshots_failed_threshold_critical" { + description = "SLM Snapshots Failed critical threshold" + type = string + default = 1 +} + variable "slm_snapshots_failed_extra_tags" { description = "Extra tags for SLM Snapshots Failed monitor" type = list(string) @@ -1168,12 +1169,7 @@ variable "slm_snapshot_deletion_failures_message" { variable "slm_snapshot_deletion_failures_time_aggregator" { description = "Time aggregator for SLM Snapshot deletion failures monitor" type = string -} - -variable "slm_snapshot_deletion_failures_threshold_critical" { - description = "SLM Snapshot deletion failures critical threshold" - type = string - default = 1 + default = "avg" } variable "slm_snapshot_deletion_failures_timeframe" { @@ -1182,6 +1178,12 @@ variable "slm_snapshot_deletion_failures_timeframe" { default = "last_5m" } +variable "slm_snapshot_deletion_failures_threshold_critical" { + description = "SLM Snapshot deletion failures critical threshold" + type = string + default = 1 +} + variable "slm_snapshot_deletion_failures_extra_tags" { description = "Extra tags for SLM Snapshot deletion failures monitor" type = list(string)
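For reviewers who want to exercise the new SLM snapshot monitors, the sketch below shows one way the inputs introduced by these two commits could be wired up. It is a hypothetical example and not part of the patch: the module `source` address, the `environment` and `message` values, and the extra tag are placeholder assumptions, and any other required inputs of the module are omitted for brevity.

```hcl
# Hypothetical usage sketch for the new slm_* inputs.
# The source path, environment, message and example tag are assumptions,
# not values taken from the patch series above.
module "datadog_elasticsearch_monitors" {
  source      = "./database/elasticsearch" # assumed local checkout of this module
  environment = "prod"                     # placeholder environment slug
  message     = "@slack-monitoring"        # placeholder notification target

  # SLM snapshot monitors added by PATCH 1/2 (values shown are the defaults).
  slm_snapshots_failed_enabled            = "true"
  slm_snapshots_failed_time_aggregator    = "avg"  # only a default after PATCH 2/2
  slm_snapshots_failed_timeframe          = "last_5m"
  slm_snapshots_failed_threshold_critical = 1
  slm_snapshots_failed_extra_tags         = ["team:storage"] # example extra tag

  slm_snapshot_deletion_failures_enabled            = "true"
  slm_snapshot_deletion_failures_time_aggregator    = "avg"  # only a default after PATCH 2/2
  slm_snapshot_deletion_failures_timeframe          = "last_5m"
  slm_snapshot_deletion_failures_threshold_critical = 1
}
```

With only the first commit applied, the two `*_time_aggregator` inputs have no default and must be set explicitly; the second commit gives them an `"avg"` default, so every `slm_*` line above can be dropped and the monitors fall back to `avg(last_5m)` with a critical threshold of `1`.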