Skip to content

Commit

Permalink
Make Elasticsearch restarts always rolling
Browse files Browse the repository at this point in the history
fixes #343
  • Loading branch information
widhalmt committed Oct 24, 2024
1 parent 65dfa4f commit b57009e
Show file tree
Hide file tree
Showing 5 changed files with 178 additions and 131 deletions.
7 changes: 3 additions & 4 deletions roles/elasticsearch/handlers/main.yml
Original file line number Diff line number Diff line change
@@ -1,11 +1,10 @@
---
# handlers file for elasticsearch
- name: Restart Elasticsearch
ansible.builtin.service:
name: elasticsearch
state: restarted
daemon_reload: yes
ansible.builtin.include_tasks: restart_elasticsearch.yml
with_items: "{{ groups[elasticstack_elasticsearch_group_name] }}"
when:
- "hostvars[item].inventory_hostname == inventory_hostname"
- elasticsearch_enable | bool
- not elasticsearch_freshstart.changed | bool
- not elasticsearch_freshstart_security.changed | bool
Expand Down
12 changes: 12 additions & 0 deletions roles/elasticsearch/handlers/restart_elasticsearch.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
---

- name: Check for running Elasticsearch service
ansible.builtin.systemd:
name: elasticsearch
register: elasticsearch_running

- name: Include rolling stop
ansible.builtin.include_tasks: tasks/elasticsearch-rolling-stop.yml

- name: Include rolling start
ansible.builtin.include_tasks: tasks/elasticsearch-rolling-start.yml
83 changes: 83 additions & 0 deletions roles/elasticsearch/tasks/elasticsearch-rolling-start.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
# Ansible
#
# Rolling Upgrade of Elasticsearch with security on
# Source from: author: Jeff Steinmetz, @jeffsteinmetz; Bin Li, @holysoros
# Modifications: author: Daniel Neuberger @netways.de
# More modifications: NETWAYS Professional Services GmbH
# latest tested with Ansible 2.9 and later

---

# For now we support upgrade only for clusters with security enabled
# If you positively need support for safely upgrading clusters without security,
# feel free to open an issue at https://github.com/NETWAYS/ansible-collection-elasticstack/issues

- name: Start elasticsearch
ansible.builtin.service:
name: elasticsearch
enabled: yes
state: started
when:
- elasticsearch_running.status.ActiveState == "active"
- not elasticsearch_unsafe_upgrade_restart | bool

- name: Restart elasticsearch (fast, for non-prod)
ansible.builtin.service:
name: elasticsearch
enabled: yes
state: restarted
when:
- elasticsearch_running.status.ActiveState == "active"
- elasticsearch_unsafe_upgrade_restart | bool

- name: Wait for elasticsearch node to come back up if it was stopped
ansible.builtin.wait_for:
host: "{{ elasticsearch_api_host }}"
port: "{{ elasticstack_elasticsearch_http_port }}"
delay: 30

- name: Confirm the node joins the cluster # noqa: risky-shell-pipe
ansible.builtin.shell: >
if test -n "$(ps -p $$ | grep bash)"; then set -o pipefail; fi;
curl
-k
-u elastic:{{ elasticstack_password.stdout }}
-s
-m 2
'{{ elasticsearch_http_protocol }}://{{ elasticsearch_api_host }}:{{ elasticstack_elasticsearch_http_port }}/_cat/nodes?h=name'
| grep
-E
'^{{ elasticsearch_nodename }}$'
register: result
until: result.rc == 0
retries: 200
delay: 3
changed_when: false

- name: Enable shard allocation for the cluster
ansible.builtin.uri:
url: "{{ elasticsearch_http_protocol }}://{{ elasticsearch_api_host }}:{{ elasticstack_elasticsearch_http_port }}/_cluster/settings"
method: PUT
body: '{ "persistent": { "cluster.routing.allocation.enable": null }}'
body_format: json
user: elastic
password: "{{ elasticstack_password.stdout }}"
validate_certs: no
register: response
# next line is boolean not string, so no quotes around true
# use python truthiness
until: "response.json.acknowledged == true"
retries: 5
delay: 30

- name: Wait for cluster health to return to yellow or green
ansible.builtin.uri:
url: "{{ elasticsearch_http_protocol }}://{{ elasticsearch_api_host }}:{{ elasticstack_elasticsearch_http_port }}/_cluster/health"
method: GET
user: elastic
password: "{{ elasticstack_password.stdout }}"
validate_certs: no
register: response
until: "response.json.status == 'yellow' or response.json.status == 'green'"
retries: 5
delay: 30
76 changes: 76 additions & 0 deletions roles/elasticsearch/tasks/elasticsearch-rolling-stop.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
# Ansible
#
# Rolling Upgrade of Elasticsearch with security on
# Source from: author: Jeff Steinmetz, @jeffsteinmetz; Bin Li, @holysoros
# Modifications: author: Daniel Neuberger @netways.de
# More modifications: NETWAYS Professional Services GmbH
# latest tested with Ansible 2.9 and later

---

# For now we support upgrade only for clusters with security enabled
# If you positively need support for safely upgrading clusters without security,
# feel free to open an issue at https://github.com/NETWAYS/ansible-collection-elasticstack/issues
- name: Set connection protocol to https
ansible.builtin.set_fact:
elasticsearch_http_protocol: "https"

# Usually we should not need this step. It's only there to recover from broken upgrade plays
# Without this step the cluster would never recover and the play would always fail
- name: Enable shard allocation for the cluster
ansible.builtin.uri:
url: "{{ elasticsearch_http_protocol }}://{{ elasticsearch_api_host }}:{{ elasticstack_elasticsearch_http_port }}/_cluster/settings"
method: PUT
body: '{ "persistent": { "cluster.routing.allocation.enable": null }}'
body_format: json
user: elastic
password: "{{ elasticstack_password.stdout }}"
validate_certs: no
register: response
# next line is boolean not string, so no quotes around true
# use python truthiness
until: "response.json.acknowledged == true"
retries: 5
delay: 30

# this step is key!!! Don't restart more nodes
# until all shards have completed recovery
- name: Wait for cluster health to return to green
ansible.builtin.uri:
url: "{{ elasticsearch_http_protocol }}://{{ elasticsearch_api_host }}:{{ elasticstack_elasticsearch_http_port }}/_cluster/health"
method: GET
user: elastic
password: "{{ elasticstack_password.stdout }}"
validate_certs: no
register: response
until: "response.json.status == 'green'"
retries: 50
delay: 30

# Disabling shard allocation right after enabling it seems redundant. Please see above for details.
- name: Disable shard allocation for the cluster
ansible.builtin.uri:
url: "{{ elasticsearch_http_protocol }}://{{ elasticsearch_api_host }}:{{ elasticstack_elasticsearch_http_port }}/_cluster/settings"
method: PUT
body: '{ "persistent": { "cluster.routing.allocation.enable": "none" }}'
body_format: json
user: elastic
password: "{{ elasticstack_password.stdout }}"
validate_certs: no

- name: Stop non essential indexing to speed up shard recovery
ansible.builtin.uri:
url: "{{ elasticsearch_http_protocol }}://{{ elasticsearch_api_host }}:{{ elasticstack_elasticsearch_http_port }}/_flush"
method: POST
user: elastic
password: "{{ elasticstack_password.stdout }}"
validate_certs: no
failed_when: false

- name: Shutdown elasticsearch service
ansible.builtin.service:
name: elasticsearch
enabled: yes
state: stopped
when:
- not elasticsearch_unsafe_upgrade_restart | bool
131 changes: 4 additions & 127 deletions roles/elasticsearch/tasks/elasticsearch-rolling-upgrade.yml
Original file line number Diff line number Diff line change
Expand Up @@ -71,65 +71,8 @@
- groups[elasticstack_elasticsearch_group_name] | length > 1
block:

# Usually we should not need this step. It's only there to recover from broken upgrade plays
# Without this step the cluster would never recover and the play would always fail
- name: Enable shard allocation for the cluster
ansible.builtin.uri:
url: "{{ elasticsearch_http_protocol }}://{{ elasticsearch_api_host }}:{{ elasticstack_elasticsearch_http_port }}/_cluster/settings"
method: PUT
body: '{ "persistent": { "cluster.routing.allocation.enable": null }}'
body_format: json
user: elastic
password: "{{ elasticstack_password.stdout }}"
validate_certs: no
register: response
# next line is boolean not string, so no quotes around true
# use python truthiness
until: "response.json.acknowledged == true"
retries: 5
delay: 30

# this step is key!!! Don't restart more nodes
# until all shards have completed recovery
- name: Wait for cluster health to return to green
ansible.builtin.uri:
url: "{{ elasticsearch_http_protocol }}://{{ elasticsearch_api_host }}:{{ elasticstack_elasticsearch_http_port }}/_cluster/health"
method: GET
user: elastic
password: "{{ elasticstack_password.stdout }}"
validate_certs: no
register: response
until: "response.json.status == 'green'"
retries: 50
delay: 30

# Disabling shard allocation right after enabling it seems redundant. Please see above for details.
- name: Disable shard allocation for the cluster
ansible.builtin.uri:
url: "{{ elasticsearch_http_protocol }}://{{ elasticsearch_api_host }}:{{ elasticstack_elasticsearch_http_port }}/_cluster/settings"
method: PUT
body: '{ "persistent": { "cluster.routing.allocation.enable": "none" }}'
body_format: json
user: elastic
password: "{{ elasticstack_password.stdout }}"
validate_certs: no

- name: Stop non essential indexing to speed up shard recovery
ansible.builtin.uri:
url: "{{ elasticsearch_http_protocol }}://{{ elasticsearch_api_host }}:{{ elasticstack_elasticsearch_http_port }}/_flush"
method: POST
user: elastic
password: "{{ elasticstack_password.stdout }}"
validate_certs: no
failed_when: false

- name: Shutdown elasticsearch service
ansible.builtin.service:
name: elasticsearch
enabled: yes
state: stopped
when:
- not elasticsearch_unsafe_upgrade_restart | bool
- name: Include rolling stop
ansible.builtin.include_tasks: elasticsearch-rolling-stop.yml

- name: Update Elasticsearch - rpm with managed repositories
ansible.builtin.package:
Expand All @@ -147,72 +90,6 @@
- ansible_os_family == "Debian" or
not elasticstack_full_stack | bool

- name: Start elasticsearch
ansible.builtin.service:
name: elasticsearch
enabled: yes
state: started
when:
- elasticsearch_running.status.ActiveState == "active"
- not elasticsearch_unsafe_upgrade_restart | bool

- name: Restart elasticsearch (fast, for non-prod)
ansible.builtin.service:
name: elasticsearch
enabled: yes
state: restarted
when:
- elasticsearch_running.status.ActiveState == "active"
- elasticsearch_unsafe_upgrade_restart | bool

- name: Wait for elasticsearch node to come back up if it was stopped
ansible.builtin.wait_for:
host: "{{ elasticsearch_api_host }}"
port: "{{ elasticstack_elasticsearch_http_port }}"
delay: 30

- name: Confirm the node joins the cluster # noqa: risky-shell-pipe
ansible.builtin.shell: >
if test -n "$(ps -p $$ | grep bash)"; then set -o pipefail; fi;
curl
-k
-u elastic:{{ elasticstack_password.stdout }}
-s
-m 2
'{{ elasticsearch_http_protocol }}://{{ elasticsearch_api_host }}:{{ elasticstack_elasticsearch_http_port }}/_cat/nodes?h=name'
| grep
-E
'^{{ elasticsearch_nodename }}$'
register: result
until: result.rc == 0
retries: 200
delay: 3
changed_when: false

- name: Enable shard allocation for the cluster
ansible.builtin.uri:
url: "{{ elasticsearch_http_protocol }}://{{ elasticsearch_api_host }}:{{ elasticstack_elasticsearch_http_port }}/_cluster/settings"
method: PUT
body: '{ "persistent": { "cluster.routing.allocation.enable": null }}'
body_format: json
user: elastic
password: "{{ elasticstack_password.stdout }}"
validate_certs: no
register: response
# next line is boolean not string, so no quotes around true
# use python truthiness
until: "response.json.acknowledged == true"
retries: 5
delay: 30
- name: Include rolling start
ansible.builtin.include_tasks: elasticsearch-rolling-start.yml

Check warning on line 95 in roles/elasticsearch/tasks/elasticsearch-rolling-upgrade.yml

View workflow job for this annotation

GitHub Actions / lint

95:1 [empty-lines] too many blank lines (1 > 0)

Check warning on line 95 in roles/elasticsearch/tasks/elasticsearch-rolling-upgrade.yml

View workflow job for this annotation

GitHub Actions / lint_full / lint

95:1 [empty-lines] too many blank lines (1 > 0)
- name: Wait for cluster health to return to yellow or green
ansible.builtin.uri:
url: "{{ elasticsearch_http_protocol }}://{{ elasticsearch_api_host }}:{{ elasticstack_elasticsearch_http_port }}/_cluster/health"
method: GET
user: elastic
password: "{{ elasticstack_password.stdout }}"
validate_certs: no
register: response
until: "response.json.status == 'yellow' or response.json.status == 'green'"
retries: 5
delay: 30

0 comments on commit b57009e

Please sign in to comment.