diff --git a/.gitignore b/.gitignore index 364e3315..d2a60eff 100644 --- a/.gitignore +++ b/.gitignore @@ -8,4 +8,12 @@ __pycache__/ *.tar.gz venv/ tests/integration/ +!tests/integration/ +tests/integration/* +!tests/integration/elasticsearch_cluster_settings_contract.yml +!tests/integration/elasticsearch_template_contract.yml +!tests/integration/elasticsearch_template_inventory.ini +!tests/integration/elasticsearch_upgrade_detection_contract.yml +!tests/integration/rolling_restart_contract.yml +!tests/integration/rolling_restart_inventory.ini site/ diff --git a/molecule/elasticsearch_default/side_effect.yml b/molecule/elasticsearch_default/side_effect.yml new file mode 100644 index 00000000..f5676449 --- /dev/null +++ b/molecule/elasticsearch_default/side_effect.yml @@ -0,0 +1,79 @@ +--- +- name: Trigger config-change rolling restart + hosts: all + vars: + elasticstack_full_stack: false + elasticstack_release: "{{ lookup('env', 'ELASTIC_RELEASE') | default('9', true) | int }}" + elasticsearch_security: true + elasticsearch_http_protocol: https + elasticsearch_heap: "1" + elasticstack_no_log: false + elasticsearch_elastic_password: "TestPassword123!" 
+ elasticsearch_jvm_custom_parameters: + - "-Des.config.restart.marker=true" + elasticsearch_config_restart_health_retries: 20 + elasticsearch_config_restart_health_delay: 3 + elasticsearch_cluster_settings: + action.destructive_requires_name: "true" + tasks: + - name: Reset shared role guard for config restart test + ansible.builtin.set_fact: + _elasticstack_role_imported: false + + - name: Record ES PID before config change + ansible.builtin.command: pgrep -f 'org.elasticsearch.bootstrap.Elasticsearch' + register: _es_pid_before_config_restart + changed_when: false + + - name: Include Elasticsearch with changed JVM config + ansible.builtin.include_role: + name: oddly.elasticstack.elasticsearch + + - name: Record ES PID after config change + ansible.builtin.command: pgrep -f 'org.elasticsearch.bootstrap.Elasticsearch' + register: _es_pid_after_config_restart + changed_when: false + + - name: Verify config change restarted Elasticsearch + ansible.builtin.assert: + that: + - _es_pid_before_config_restart.stdout != _es_pid_after_config_restart.stdout + fail_msg: >- + Elasticsearch was not restarted after config change on {{ inventory_hostname }}. + PID remained {{ _es_pid_after_config_restart.stdout }}. + success_msg: >- + Elasticsearch restarted after config change on {{ inventory_hostname }}. 
+ + - name: Build config restart rolling timeline # noqa: run-once[task] + ansible.builtin.set_fact: + _es_first_restart_host: "{{ groups['elasticsearch'][0] }}" + _es_second_restart_host: "{{ groups['elasticsearch'][1] }}" + _es_first_restart_started_usec: "{{ hostvars[groups['elasticsearch'][0]]._elasticsearch_config_restart_started_usec | default(0) | int }}" # default(0): a missing marker fails the assert below with its message instead of an undefined-variable error here + _es_first_restart_health_complete_usec: "{{ hostvars[groups['elasticsearch'][0]]._elasticsearch_config_restart_health_complete_usec | default(0) | int }}" + _es_second_restart_started_usec: "{{ hostvars[groups['elasticsearch'][1]]._elasticsearch_config_restart_started_usec | default(0) | int }}" + _es_second_restart_health_complete_usec: "{{ hostvars[groups['elasticsearch'][1]]._elasticsearch_config_restart_health_complete_usec | default(0) | int }}" + run_once: true + + - name: Verify config restart was rolling # noqa: run-once[task] + ansible.builtin.assert: + that: + - groups['elasticsearch'] | length == 2 + - _es_first_restart_started_usec | int > 0 + - _es_first_restart_health_complete_usec | int > 0 + - _es_second_restart_started_usec | int > 0 + - _es_second_restart_health_complete_usec | int > 0 + - _es_first_restart_started_usec | int <= _es_first_restart_health_complete_usec | int + - _es_second_restart_started_usec | int <= _es_second_restart_health_complete_usec | int + - _es_first_restart_health_complete_usec | int <= _es_second_restart_started_usec | int + fail_msg: >- + Elasticsearch config restart did not wait for the first node's + post-restart health gate before starting the next node. + {{ _es_first_restart_host }} started={{ _es_first_restart_started_usec }} + health_complete={{ _es_first_restart_health_complete_usec }}; + {{ _es_second_restart_host }} started={{ _es_second_restart_started_usec }} + health_complete={{ _es_second_restart_health_complete_usec }}. + success_msg: >- + Elasticsearch config restart was rolling and health-gated. + {{ _es_first_restart_host }} completed health before + {{ _es_second_restart_host }} started. 
+ run_once: true diff --git a/roles/elasticsearch/defaults/main.yml b/roles/elasticsearch/defaults/main.yml index 72583188..c733ea37 100644 --- a/roles/elasticsearch/defaults/main.yml +++ b/roles/elasticsearch/defaults/main.yml @@ -78,6 +78,20 @@ elasticsearch_conf_dir: "/etc/elasticsearch/" elasticsearch_group: elasticsearch # @var elasticsearch_api_host:description: Hostname or IP used for Elasticsearch API health checks elasticsearch_api_host: localhost +# @var elasticsearch_config_restart_strategy:description: Strategy for config-triggered Elasticsearch restarts. Use rolling for multi-node clusters, direct for legacy all-host handler behavior +elasticsearch_config_restart_strategy: rolling +# @var elasticsearch_config_restart_flush:description: Flush indices before each node restart during config-triggered rolling restarts +elasticsearch_config_restart_flush: true +# @var elasticsearch_config_restart_wait_status:description: Minimum cluster health status before and after each config-triggered rolling restart. Use yellow or green +elasticsearch_config_restart_wait_status: green +# @var elasticsearch_config_restart_health_retries:description: Number of cluster health polling attempts during config-triggered rolling restarts +elasticsearch_config_restart_health_retries: 50 +# @var elasticsearch_config_restart_health_delay:description: Delay in seconds between cluster health polling attempts during config-triggered rolling restarts +elasticsearch_config_restart_health_delay: 30 +# @var elasticsearch_config_restart_node_retries:description: Number of attempts to confirm the restarted node has rejoined the cluster +elasticsearch_config_restart_node_retries: 200 +# @var elasticsearch_config_restart_node_delay:description: Delay in seconds between restarted node join checks +elasticsearch_config_restart_node_delay: 3 # @var elasticsearch_jvm_custom_parameters:description: Additional JVM parameters appended to jvm.options.d. 
Use for GC tuning, debug flags, etc # @var elasticsearch_jvm_custom_parameters:example: > diff --git a/roles/elasticsearch/handlers/main.yml b/roles/elasticsearch/handlers/main.yml index fa191524..1ac3d3f9 100644 --- a/roles/elasticsearch/handlers/main.yml +++ b/roles/elasticsearch/handlers/main.yml @@ -1,12 +1,54 @@ --- # handlers file for elasticsearch -- name: Restart Elasticsearch +- name: Mark Elasticsearch restart requested + ansible.builtin.set_fact: + _elasticsearch_restart_requested: true + listen: Restart Elasticsearch + when: + - not ansible_check_mode + - elasticsearch_enable | bool + - not (_elasticsearch_freshstart | default({'changed': false})).changed | bool + - not (_elasticsearch_freshstart_security | default({'changed': false})).changed | bool + - not _elasticsearch_rolling_upgrade_performed | default(false) | bool + +- name: Restart Elasticsearch directly ansible.builtin.include_tasks: restart_and_verify_elasticsearch.yml + listen: Restart Elasticsearch + when: + - not ansible_check_mode + - elasticsearch_enable | bool + - > + elasticsearch_config_restart_strategy == 'direct' + or groups[elasticstack_elasticsearch_group_name] | default([]) | length <= 1 + - not (_elasticsearch_freshstart | default({'changed': false})).changed | bool + - not (_elasticsearch_freshstart_security | default({'changed': false})).changed | bool + - not _elasticsearch_rolling_upgrade_performed | default(false) | bool + +- name: Clear direct Elasticsearch restart request + ansible.builtin.set_fact: + _elasticsearch_restart_requested: false + listen: Restart Elasticsearch + when: + - not ansible_check_mode + - elasticsearch_enable | bool + - > + elasticsearch_config_restart_strategy == 'direct' + or groups[elasticstack_elasticsearch_group_name] | default([]) | length <= 1 + - not (_elasticsearch_freshstart | default({'changed': false})).changed | bool + - not (_elasticsearch_freshstart_security | default({'changed': false})).changed | bool + - not 
_elasticsearch_rolling_upgrade_performed | default(false) | bool + +- name: Restart Elasticsearch rolling + ansible.builtin.include_tasks: restart_and_verify_elasticsearch_rolling.yml + listen: Restart Elasticsearch + run_once: true when: - not ansible_check_mode - elasticsearch_enable | bool - - not _elasticsearch_freshstart.changed | bool - - not _elasticsearch_freshstart_security.changed | bool + - elasticsearch_config_restart_strategy != 'direct' # any non-direct value enters the rolling include, whose validation assert rejects unknown strategies instead of silently skipping the restart + - groups[elasticstack_elasticsearch_group_name] | default([]) | length > 1 + - not (_elasticsearch_freshstart | default({'changed': false})).changed | bool + - not (_elasticsearch_freshstart_security | default({'changed': false})).changed | bool - not _elasticsearch_rolling_upgrade_performed | default(false) | bool - name: Restart kibana if available for elasticsearch certificates diff --git a/roles/elasticsearch/tasks/elasticsearch-cluster-settings.yml b/roles/elasticsearch/tasks/elasticsearch-cluster-settings.yml new file mode 100644 index 00000000..30336d7e --- /dev/null +++ b/roles/elasticsearch/tasks/elasticsearch-cluster-settings.yml @@ -0,0 +1,58 @@ +--- + +- name: elasticsearch-cluster-settings | Build effective cluster settings + ansible.builtin.set_fact: + _elasticsearch_effective_cluster_settings: >- + {{ (elasticsearch_logsdb | default(false) | bool) + | ternary({'cluster.logsdb.enabled': 'true'}, {}) + | combine(elasticsearch_cluster_settings | default({})) }} + +- name: elasticsearch-cluster-settings | Apply persistent cluster settings # noqa: run-once[task] + when: + - _elasticsearch_effective_cluster_settings | length > 0 + - elasticsearch_security | bool | ternary(elasticstack_password is defined and (elasticstack_password.stdout | default('') | length > 0), true) + - not ansible_check_mode + run_once: true + delegate_to: "{{ elasticstack_ca_host | default(inventory_hostname) }}" + block: + - name: elasticsearch-cluster-settings | Read current persistent cluster settings + ansible.builtin.uri: + 
url: "{{ elasticsearch_http_protocol }}://{{ elasticsearch_api_host }}:{{ elasticstack_elasticsearch_http_port }}/_cluster/settings?flat_settings=true" + method: GET + user: "{{ 'elastic' if elasticsearch_security | bool else omit }}" + password: "{{ elasticstack_password.stdout if elasticsearch_security | bool else omit }}" + force_basic_auth: "{{ elasticsearch_security | bool }}" + validate_certs: "{{ elasticsearch_validate_api_certs }}" + return_content: true + register: _elasticsearch_current_cluster_settings + no_log: "{{ elasticstack_no_log }}" + + - name: elasticsearch-cluster-settings | Check if settings already match + ansible.builtin.set_fact: + _elasticsearch_cluster_settings_changed: "{{ _needs_update | trim }}" + vars: + _current: "{{ _elasticsearch_current_cluster_settings.json.persistent }}" + _needs_update: >- + {% set ns = namespace(changed=false) %} + {% for key, value in _elasticsearch_effective_cluster_settings.items() %} + {% if _current.get(key) is none or _current[key] | string != value | string %} + {% set ns.changed = true %} + {% endif %} + {% endfor %} + {{ ns.changed }} + + - name: elasticsearch-cluster-settings | Apply cluster settings + ansible.builtin.uri: + url: "{{ elasticsearch_http_protocol }}://{{ elasticsearch_api_host }}:{{ elasticstack_elasticsearch_http_port }}/_cluster/settings" + method: PUT + body_format: json + body: + persistent: "{{ _elasticsearch_effective_cluster_settings }}" + user: "{{ 'elastic' if elasticsearch_security | bool else omit }}" + password: "{{ elasticstack_password.stdout if elasticsearch_security | bool else omit }}" + force_basic_auth: "{{ elasticsearch_security | bool }}" + validate_certs: "{{ elasticsearch_validate_api_certs }}" + status_code: 200 + no_log: "{{ elasticstack_no_log }}" + when: _elasticsearch_cluster_settings_changed | bool + changed_when: true diff --git a/roles/elasticsearch/tasks/elasticsearch-upgrade-detection.yml b/roles/elasticsearch/tasks/elasticsearch-upgrade-detection.yml 
new file mode 100644 index 00000000..40acb932 --- /dev/null +++ b/roles/elasticsearch/tasks/elasticsearch-upgrade-detection.yml @@ -0,0 +1,36 @@ +--- + +# Detect whether a rolling upgrade is needed. Any version change — major, +# minor, or patch — should restart nodes one at a time with shard allocation +# management rather than restarting all nodes simultaneously. +# +# Pre-install detection covers cases where we know the target version: +# 1. Pinned version higher than installed (elasticstack_version: "9.2.0") +# 2. Major version change (elasticstack_release differs from installed) +# +# For "latest" mode, we can't know pre-install whether a newer version is +# available. The normal package tasks handle installation with state: latest, +# and if the package changed, a rolling restart is triggered post-install. +- name: elasticsearch-upgrade-detection | Detect if rolling upgrade is needed + ansible.builtin.set_fact: + _elasticsearch_needs_rolling_upgrade: >- + {{ ansible_facts.packages['elasticsearch'] is defined and + ansible_facts.packages['elasticsearch'][0].version is defined and + ((elasticstack_version is defined and + elasticstack_version != 'latest' and + elasticstack_version is version(ansible_facts.packages['elasticsearch'][0].version, '>')) + or + (elasticstack_release | default(8) | int != (ansible_facts.packages['elasticsearch'][0].version.split('.')[0] | int))) }} + +- name: elasticsearch-upgrade-detection | Check upgrade path requirement for ES 9.x + ansible.builtin.fail: + msg: | + UPGRADE PATH VIOLATION: Elasticsearch 9.x requires 8.19.x first. + Current version: {{ ansible_facts.packages['elasticsearch'][0].version }} + You must upgrade to 8.19.x before upgrading to 9.x. 
+ See: https://www.elastic.co/docs/deploy-manage/upgrade/deployment-or-cluster + when: + - elasticstack_release | default(8) | int >= 9 + - ansible_facts.packages['elasticsearch'] is defined + - ansible_facts.packages['elasticsearch'][0].version is version('8.0.0', '>=') + - ansible_facts.packages['elasticsearch'][0].version is version('8.19.0', '<') diff --git a/roles/elasticsearch/tasks/main.yml b/roles/elasticsearch/tasks/main.yml index d3d7c2b5..8102d7ff 100644 --- a/roles/elasticsearch/tasks/main.yml +++ b/roles/elasticsearch/tasks/main.yml @@ -4,40 +4,8 @@ ansible.builtin.package_facts: manager: auto -# Detect whether a rolling upgrade is needed. Any version change — major, -# minor, or patch — should restart nodes one at a time with shard allocation -# management rather than restarting all nodes simultaneously. -# -# Pre-install detection covers cases where we know the target version: -# 1. Pinned version higher than installed (elasticstack_version: "9.2.0") -# 2. Major version change (elasticstack_release differs from installed) -# -# For "latest" mode, we can't know pre-install whether a newer version is -# available. The normal package tasks handle installation with state: latest, -# and if the package changed, a rolling restart is triggered post-install. - name: Detect if rolling upgrade is needed (pre-install) - ansible.builtin.set_fact: - _elasticsearch_needs_rolling_upgrade: >- - {{ ansible_facts.packages['elasticsearch'] is defined and - ansible_facts.packages['elasticsearch'][0].version is defined and - ((elasticstack_version is defined and - elasticstack_version != 'latest' and - elasticstack_version is version(ansible_facts.packages['elasticsearch'][0].version, '>')) - or - (elasticstack_release | int != (ansible_facts.packages['elasticsearch'][0].version.split('.')[0] | int))) }} - -- name: Check upgrade path requirement for ES 9.x - ansible.builtin.fail: - msg: | - UPGRADE PATH VIOLATION: Elasticsearch 9.x requires 8.19.x first. 
- Current version: {{ ansible_facts.packages['elasticsearch'][0].version }} - You must upgrade to 8.19.x before upgrading to 9.x. - See: https://www.elastic.co/docs/deploy-manage/upgrade/deployment-or-cluster - when: - - elasticstack_release | int >= 9 - - ansible_facts.packages['elasticsearch'] is defined - - ansible_facts.packages['elasticsearch'][0].version is version('8.0.0', '>=') - - ansible_facts.packages['elasticsearch'][0].version is version('8.19.0', '<') + ansible.builtin.include_tasks: elasticsearch-upgrade-detection.yml - name: Include global role ansible.builtin.import_role: @@ -655,59 +623,5 @@ # -- Persistent cluster settings via _cluster/settings API -- -- name: Build effective cluster settings - ansible.builtin.set_fact: - _elasticsearch_effective_cluster_settings: >- - {{ (elasticsearch_logsdb | bool) - | ternary({'cluster.logsdb.enabled': 'true'}, {}) - | combine(elasticsearch_cluster_settings | default({})) }} - -- name: Apply persistent cluster settings # noqa: run-once[task] - when: - - _elasticsearch_effective_cluster_settings | length > 0 - - elasticsearch_security | bool | ternary(elasticstack_password is defined and (elasticstack_password.stdout | default('') | length > 0), true) - - not ansible_check_mode - run_once: true - delegate_to: "{{ elasticstack_ca_host | default(inventory_hostname) }}" - block: - - name: Read current persistent cluster settings - ansible.builtin.uri: - url: "{{ elasticsearch_http_protocol }}://{{ elasticsearch_api_host }}:{{ elasticstack_elasticsearch_http_port }}/_cluster/settings?flat_settings=true" - method: GET - user: "{{ 'elastic' if elasticsearch_security | bool else omit }}" - password: "{{ elasticstack_password.stdout if elasticsearch_security | bool else omit }}" - force_basic_auth: "{{ elasticsearch_security | bool }}" - validate_certs: "{{ elasticsearch_validate_api_certs }}" - return_content: true - register: _elasticsearch_current_cluster_settings - no_log: "{{ elasticstack_no_log }}" - - - name: 
Check if settings already match - ansible.builtin.set_fact: - _elasticsearch_cluster_settings_changed: "{{ _needs_update | trim }}" - vars: - _current: "{{ _elasticsearch_current_cluster_settings.json.persistent }}" - _needs_update: >- - {% set ns = namespace(changed=false) %} - {% for key, value in _elasticsearch_effective_cluster_settings.items() %} - {% if _current.get(key) is none or _current[key] | string != value | string %} - {% set ns.changed = true %} - {% endif %} - {% endfor %} - {{ ns.changed }} - - - name: Apply cluster settings - ansible.builtin.uri: - url: "{{ elasticsearch_http_protocol }}://{{ elasticsearch_api_host }}:{{ elasticstack_elasticsearch_http_port }}/_cluster/settings" - method: PUT - body_format: json - body: - persistent: "{{ _elasticsearch_effective_cluster_settings }}" - user: "{{ 'elastic' if elasticsearch_security | bool else omit }}" - password: "{{ elasticstack_password.stdout if elasticsearch_security | bool else omit }}" - force_basic_auth: "{{ elasticsearch_security | bool }}" - validate_certs: "{{ elasticsearch_validate_api_certs }}" - status_code: 200 - no_log: "{{ elasticstack_no_log }}" - when: _elasticsearch_cluster_settings_changed | bool - changed_when: true +- name: Apply persistent cluster settings + ansible.builtin.include_tasks: elasticsearch-cluster-settings.yml diff --git a/roles/elasticsearch/tasks/restart_and_verify_elasticsearch_rolling.yml b/roles/elasticsearch/tasks/restart_and_verify_elasticsearch_rolling.yml new file mode 100644 index 00000000..fc4bc3f2 --- /dev/null +++ b/roles/elasticsearch/tasks/restart_and_verify_elasticsearch_rolling.yml @@ -0,0 +1,39 @@ +--- + +- name: restart_and_verify_elasticsearch_rolling | Build restart host list + ansible.builtin.set_fact: + _elasticsearch_restart_hosts: [] + +- name: restart_and_verify_elasticsearch_rolling | Add requested hosts to restart list + ansible.builtin.set_fact: + _elasticsearch_restart_hosts: "{{ _elasticsearch_restart_hosts + [item] }}" + loop: "{{ 
groups[elasticstack_elasticsearch_group_name] | default([]) }}" + when: hostvars[item]._elasticsearch_restart_requested | default(false) | bool + +- name: restart_and_verify_elasticsearch_rolling | Validate restart strategy inputs + ansible.builtin.assert: + that: + - elasticsearch_config_restart_strategy in ['rolling', 'direct'] + - elasticsearch_config_restart_wait_status in ['yellow', 'green'] + fail_msg: >- + elasticsearch_config_restart_strategy must be rolling or direct, and + elasticsearch_config_restart_wait_status must be yellow or green. + +- name: restart_and_verify_elasticsearch_rolling | Validate credentials for secured rolling restart + ansible.builtin.assert: + that: + - elasticstack_password is defined + - elasticstack_password.stdout | default('') | length > 0 + fail_msg: >- + Cannot perform a safe rolling Elasticsearch restart because security is + enabled but the elastic user password is unavailable. + when: + - elasticsearch_security | bool + - _elasticsearch_restart_hosts | length > 0 + +- name: restart_and_verify_elasticsearch_rolling | Restart requested nodes one at a time + ansible.builtin.include_tasks: restart_and_verify_elasticsearch_rolling_node.yml + loop: "{{ _elasticsearch_restart_hosts }}" + loop_control: + loop_var: _elasticsearch_restart_host + when: _elasticsearch_restart_hosts | length > 0 diff --git a/roles/elasticsearch/tasks/restart_and_verify_elasticsearch_rolling_node.yml b/roles/elasticsearch/tasks/restart_and_verify_elasticsearch_rolling_node.yml new file mode 100644 index 00000000..c95b2673 --- /dev/null +++ b/roles/elasticsearch/tasks/restart_and_verify_elasticsearch_rolling_node.yml @@ -0,0 +1,301 @@ +--- + +- name: restart_and_verify_elasticsearch_rolling_node | Prepare host-specific values + ansible.builtin.set_fact: + _elasticsearch_allocation_disabled: false + _elasticsearch_restart_api_host: "{{ hostvars[_elasticsearch_restart_host].elasticsearch_api_host | default(elasticsearch_api_host) }}" + 
_elasticsearch_restart_nodename: >- + {{ hostvars[_elasticsearch_restart_host].elasticsearch_nodename + | default(hostvars[_elasticsearch_restart_host].ansible_facts.hostname | default(_elasticsearch_restart_host)) }} + _elasticsearch_restart_protocol: >- + {{ 'https' + if (elasticsearch_security | bool) + else hostvars[_elasticsearch_restart_host].elasticsearch_http_protocol + | default(elasticsearch_http_protocol | default('http')) }} + _elasticsearch_restart_port: "{{ hostvars[_elasticsearch_restart_host].elasticstack_elasticsearch_http_port | default(elasticstack_elasticsearch_http_port) }}" + _elasticsearch_health_statuses: "{{ ['green'] if elasticsearch_config_restart_wait_status == 'green' else ['green', 'yellow'] }}" + _elasticsearch_cleanup_host: >- + {{ ((groups[elasticstack_elasticsearch_group_name] | default([])) + | difference([_elasticsearch_restart_host]) + | first) + | default(_elasticsearch_restart_host, true) }} + +- name: restart_and_verify_elasticsearch_rolling_node | Prepare cleanup delegate values + ansible.builtin.set_fact: + _elasticsearch_cleanup_api_host: "{{ hostvars[_elasticsearch_cleanup_host].elasticsearch_api_host | default(elasticsearch_api_host) }}" + _elasticsearch_cleanup_protocol: >- + {{ 'https' + if (elasticsearch_security | bool) + else hostvars[_elasticsearch_cleanup_host].elasticsearch_http_protocol + | default(elasticsearch_http_protocol | default('http')) }} + _elasticsearch_cleanup_port: "{{ hostvars[_elasticsearch_cleanup_host].elasticstack_elasticsearch_http_port | default(elasticstack_elasticsearch_http_port) }}" + +- name: "restart_and_verify_elasticsearch_rolling_node | Restart {{ _elasticsearch_restart_host }}" + block: + - name: restart_and_verify_elasticsearch_rolling_node | Wait for cluster health before restart + ansible.builtin.uri: + url: "{{ _elasticsearch_restart_protocol }}://{{ _elasticsearch_restart_api_host }}:{{ _elasticsearch_restart_port }}/_cluster/health" + method: GET + user: "{{ 'elastic' if 
elasticsearch_security | bool else omit }}" + password: "{{ elasticstack_password.stdout if elasticsearch_security | bool else omit }}" + force_basic_auth: "{{ elasticsearch_security | bool }}" + validate_certs: "{{ elasticsearch_validate_api_certs }}" + register: _elasticsearch_pre_restart_health + until: + - (_elasticsearch_pre_restart_health.json.status | default('')) in _elasticsearch_health_statuses + - (_elasticsearch_pre_restart_health.json.relocating_shards | default(0) | int) == 0 + - (_elasticsearch_pre_restart_health.json.initializing_shards | default(0) | int) == 0 + retries: "{{ elasticsearch_config_restart_health_retries }}" + delay: "{{ elasticsearch_config_restart_health_delay }}" + changed_when: false + no_log: "{{ elasticstack_no_log }}" + delegate_to: "{{ _elasticsearch_restart_host }}" + + - name: restart_and_verify_elasticsearch_rolling_node | Disable replica allocation + ansible.builtin.uri: + url: "{{ _elasticsearch_restart_protocol }}://{{ _elasticsearch_restart_api_host }}:{{ _elasticsearch_restart_port }}/_cluster/settings" + method: PUT + body: '{ "persistent": { "cluster.routing.allocation.enable": "primaries" }}' + body_format: json + user: "{{ 'elastic' if elasticsearch_security | bool else omit }}" + password: "{{ elasticstack_password.stdout if elasticsearch_security | bool else omit }}" + force_basic_auth: "{{ elasticsearch_security | bool }}" + validate_certs: "{{ elasticsearch_validate_api_certs }}" + register: _elasticsearch_disable_allocation + until: (_elasticsearch_disable_allocation.json | default({})).acknowledged | default(false) + retries: 5 + delay: 10 + no_log: "{{ elasticstack_no_log }}" + delegate_to: "{{ _elasticsearch_restart_host }}" + + - name: restart_and_verify_elasticsearch_rolling_node | Record disabled allocation + ansible.builtin.set_fact: + _elasticsearch_allocation_disabled: true + + - name: restart_and_verify_elasticsearch_rolling_node | Flush indices before restart + ansible.builtin.uri: + url: "{{ 
_elasticsearch_restart_protocol }}://{{ _elasticsearch_restart_api_host }}:{{ _elasticsearch_restart_port }}/_flush" + method: POST + user: "{{ 'elastic' if elasticsearch_security | bool else omit }}" + password: "{{ elasticstack_password.stdout if elasticsearch_security | bool else omit }}" + force_basic_auth: "{{ elasticsearch_security | bool }}" + validate_certs: "{{ elasticsearch_validate_api_certs }}" + status_code: [200, 409, 503] + register: _elasticsearch_restart_flush + failed_when: false + changed_when: false + no_log: "{{ elasticstack_no_log }}" + delegate_to: "{{ _elasticsearch_restart_host }}" + when: elasticsearch_config_restart_flush | bool + + - name: restart_and_verify_elasticsearch_rolling_node | Warn if flush failed + ansible.builtin.debug: + msg: >- + Flush returned status {{ _elasticsearch_restart_flush.status | default('unknown') }}: + {{ _elasticsearch_restart_flush.msg | default('') }} + when: + - elasticsearch_config_restart_flush | bool + - (_elasticsearch_restart_flush.status | default(0)) != 200 + + - name: restart_and_verify_elasticsearch_rolling_node | Record restart start marker + ansible.builtin.command: + cmd: date +%s%6N + register: _elasticsearch_restart_started_at + changed_when: false + delegate_to: "{{ _elasticsearch_restart_host }}" + + - name: restart_and_verify_elasticsearch_rolling_node | Store restart start marker + ansible.builtin.set_fact: + _elasticsearch_config_restart_started_usec: "{{ _elasticsearch_restart_started_at.stdout | int }}" + delegate_to: "{{ _elasticsearch_restart_host }}" + delegate_facts: true + + - name: restart_and_verify_elasticsearch_rolling_node | Reset Elasticsearch failed state before restart + ansible.builtin.command: + cmd: systemctl reset-failed elasticsearch.service + changed_when: false + failed_when: false + delegate_to: "{{ _elasticsearch_restart_host }}" + when: not _elasticsearch_restart_test_mode | default(false) | bool + + - name: restart_and_verify_elasticsearch_rolling_node | Restart 
Elasticsearch service + ansible.builtin.service: + name: elasticsearch + state: restarted + daemon_reload: true + delegate_to: "{{ _elasticsearch_restart_host }}" + when: not _elasticsearch_restart_test_mode | default(false) | bool + + - name: restart_and_verify_elasticsearch_rolling_node | Verify Elasticsearch service is running + ansible.builtin.systemd: + name: elasticsearch + register: _elasticsearch_service_state + until: _elasticsearch_service_state.status.ActiveState == 'active' + retries: 5 + delay: 3 + delegate_to: "{{ _elasticsearch_restart_host }}" + when: not _elasticsearch_restart_test_mode | default(false) | bool + + - name: restart_and_verify_elasticsearch_rolling_node | Wait for Elasticsearch HTTP port + ansible.builtin.wait_for: + host: "{{ _elasticsearch_restart_api_host }}" + port: "{{ _elasticsearch_restart_port }}" + delay: "{{ 0 if _elasticsearch_restart_test_mode | default(false) | bool else 10 }}" + timeout: 600 + delegate_to: "{{ _elasticsearch_restart_host }}" + + - name: restart_and_verify_elasticsearch_rolling_node | Confirm node has rejoined the cluster + ansible.builtin.uri: + url: "{{ _elasticsearch_restart_protocol }}://{{ _elasticsearch_restart_api_host }}:{{ _elasticsearch_restart_port }}/_cat/nodes?h=name" + method: GET + user: "{{ 'elastic' if elasticsearch_security | bool else omit }}" + password: "{{ elasticstack_password.stdout if elasticsearch_security | bool else omit }}" + force_basic_auth: "{{ elasticsearch_security | bool }}" + validate_certs: "{{ elasticsearch_validate_api_certs }}" + return_content: true + register: _elasticsearch_nodes + until: _elasticsearch_restart_nodename in (_elasticsearch_nodes.content | default('')).split() + retries: "{{ elasticsearch_config_restart_node_retries }}" + delay: "{{ elasticsearch_config_restart_node_delay }}" + changed_when: false + no_log: "{{ elasticstack_no_log }}" + delegate_to: "{{ _elasticsearch_restart_host }}" + + - name: restart_and_verify_elasticsearch_rolling_node | 
Re-enable shard allocation + ansible.builtin.uri: + url: "{{ _elasticsearch_restart_protocol }}://{{ _elasticsearch_restart_api_host }}:{{ _elasticsearch_restart_port }}/_cluster/settings" + method: PUT + body: '{ "persistent": { "cluster.routing.allocation.enable": null }}' + body_format: json + user: "{{ 'elastic' if elasticsearch_security | bool else omit }}" + password: "{{ elasticstack_password.stdout if elasticsearch_security | bool else omit }}" + force_basic_auth: "{{ elasticsearch_security | bool }}" + validate_certs: "{{ elasticsearch_validate_api_certs }}" + register: _elasticsearch_enable_allocation + until: (_elasticsearch_enable_allocation.json | default({})).acknowledged | default(false) + retries: 5 + delay: 30 + no_log: "{{ elasticstack_no_log }}" + delegate_to: "{{ _elasticsearch_restart_host }}" + + - name: restart_and_verify_elasticsearch_rolling_node | Wait for cluster health after restart + ansible.builtin.uri: + url: "{{ _elasticsearch_restart_protocol }}://{{ _elasticsearch_restart_api_host }}:{{ _elasticsearch_restart_port }}/_cluster/health" + method: GET + user: "{{ 'elastic' if elasticsearch_security | bool else omit }}" + password: "{{ elasticstack_password.stdout if elasticsearch_security | bool else omit }}" + force_basic_auth: "{{ elasticsearch_security | bool }}" + validate_certs: "{{ elasticsearch_validate_api_certs }}" + register: _elasticsearch_post_restart_health + until: + - (_elasticsearch_post_restart_health.json.status | default('')) in _elasticsearch_health_statuses + - (_elasticsearch_post_restart_health.json.relocating_shards | default(0) | int) == 0 + - (_elasticsearch_post_restart_health.json.initializing_shards | default(0) | int) == 0 + retries: "{{ elasticsearch_config_restart_health_retries }}" + delay: "{{ elasticsearch_config_restart_health_delay }}" + changed_when: false + no_log: "{{ elasticstack_no_log }}" + delegate_to: "{{ _elasticsearch_restart_host }}" + + - name: 
restart_and_verify_elasticsearch_rolling_node | Record post-health completion marker + ansible.builtin.command: + cmd: date +%s%6N + register: _elasticsearch_restart_health_completed_at + changed_when: false + delegate_to: "{{ _elasticsearch_restart_host }}" + + - name: restart_and_verify_elasticsearch_rolling_node | Store post-health completion marker + ansible.builtin.set_fact: + _elasticsearch_config_restart_health_complete_usec: "{{ _elasticsearch_restart_health_completed_at.stdout | int }}" + delegate_to: "{{ _elasticsearch_restart_host }}" + delegate_facts: true + + - name: restart_and_verify_elasticsearch_rolling_node | Clear allocation state + ansible.builtin.set_fact: + _elasticsearch_allocation_disabled: false + + - name: restart_and_verify_elasticsearch_rolling_node | Clear restart request + ansible.builtin.set_fact: + _elasticsearch_restart_requested: false + delegate_to: "{{ _elasticsearch_restart_host }}" + delegate_facts: true + + rescue: + - name: restart_and_verify_elasticsearch_rolling_node | Get recent Elasticsearch journal output + ansible.builtin.command: + cmd: journalctl -u elasticsearch --no-pager -n 50 + register: _elasticsearch_restart_journal + changed_when: false + delegate_to: "{{ _elasticsearch_restart_host }}" + + - name: restart_and_verify_elasticsearch_rolling_node | Fail with startup diagnostics + ansible.builtin.fail: + msg: | + Elasticsearch failed to restart on {{ _elasticsearch_restart_host }}. 
+ + Recent log output: + {{ _elasticsearch_restart_journal.stdout }} + + always: + - name: restart_and_verify_elasticsearch_rolling_node | Re-enable shard allocation via surviving peer + ansible.builtin.uri: + url: "{{ _elasticsearch_cleanup_protocol }}://{{ _elasticsearch_cleanup_api_host }}:{{ _elasticsearch_cleanup_port }}/_cluster/settings" + method: PUT + body: '{ "persistent": { "cluster.routing.allocation.enable": null }}' + body_format: json + user: "{{ 'elastic' if elasticsearch_security | bool else omit }}" + password: "{{ elasticstack_password.stdout if elasticsearch_security | bool else omit }}" + force_basic_auth: "{{ elasticsearch_security | bool }}" + validate_certs: "{{ elasticsearch_validate_api_certs }}" + register: _elasticsearch_enable_allocation + until: (_elasticsearch_enable_allocation.json | default({})).acknowledged | default(false) + retries: 5 + delay: 30 + no_log: "{{ elasticstack_no_log }}" + delegate_to: "{{ _elasticsearch_cleanup_host }}" + when: _elasticsearch_allocation_disabled | bool + + - name: restart_and_verify_elasticsearch_rolling_node | Wait for cluster health after restart via surviving peer + ansible.builtin.uri: + url: "{{ _elasticsearch_cleanup_protocol }}://{{ _elasticsearch_cleanup_api_host }}:{{ _elasticsearch_cleanup_port }}/_cluster/health" + method: GET + user: "{{ 'elastic' if elasticsearch_security | bool else omit }}" + password: "{{ elasticstack_password.stdout if elasticsearch_security | bool else omit }}" + force_basic_auth: "{{ elasticsearch_security | bool }}" + validate_certs: "{{ elasticsearch_validate_api_certs }}" + register: _elasticsearch_post_restart_health + until: + - (_elasticsearch_post_restart_health.json.status | default('')) in _elasticsearch_health_statuses + - (_elasticsearch_post_restart_health.json.relocating_shards | default(0) | int) == 0 + - (_elasticsearch_post_restart_health.json.initializing_shards | default(0) | int) == 0 + retries: "{{ elasticsearch_config_restart_health_retries 
#!/usr/bin/env python3
"""Small fake Elasticsearch API used by the rolling restart contract test.

One process serves one HTTP listener per port given in ``--ports``. All
listeners share a single :class:`State`, so a setting written through one
port is visible through every other port. Every request is appended to the
``--log`` file as one JSON object per line (keys sorted), which is what the
contract playbooks grep to verify which calls were made.

Endpoints:
    GET  /_cluster/settings  -> {"persistent": {...}}
    GET  /_cluster/health    -> always green, no moving shards
    GET  /_cat/nodes         -> node names, one per line (503 on failing ports)
    POST /_flush             -> {"_shards": {"failed": 0}}
    PUT  /_cluster/settings  -> merges "persistent"; a null value removes a key
Anything else answers 404 with a JSON error body.
"""

from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
import argparse
import json
import threading


class State:
    """Mutable state shared by every listener of the fake cluster.

    Attributes:
        nodes: node names returned by ``/_cat/nodes``.
        log_path: file that receives one JSON line per request.
        persistent_settings: dict mutated in place by ``PUT /_cluster/settings``.
        fail_nodes_on: ports whose ``/_cat/nodes`` endpoint answers 503.
        lock: guards both ``persistent_settings`` and writes to the log file.
    """

    def __init__(self, nodes, log_path, persistent_settings, fail_nodes_on):
        self.nodes = nodes
        self.log_path = log_path
        self.persistent_settings = persistent_settings
        self.fail_nodes_on = fail_nodes_on
        self.lock = threading.Lock()

    def log(self, port, method, path, body):
        """Append one request record to the log file as a JSON line."""
        record = json.dumps(
            {
                "port": port,
                "method": method,
                "path": path,
                "body": body,
            },
            sort_keys=True,
        )
        # Serialize writers so lines from concurrent handler threads never interleave.
        with self.lock:
            with open(self.log_path, "a", encoding="utf-8") as log_file:
                log_file.write(record + "\n")


def _render_setting(value):
    """Return a setting value as the string the settings API echoes back.

    Strings pass through untouched. Everything else is rendered as JSON so a
    JSON ``true`` becomes ``"true"`` rather than Python's ``"True"``.
    NOTE(review): mirrors how Elasticsearch reports settings values as
    strings -- confirm against the cluster settings API reference.
    """
    if isinstance(value, str):
        return value
    return json.dumps(value)


def handler(state):
    """Build a request handler class bound to the given shared ``state``."""

    class FakeElasticsearchHandler(BaseHTTPRequestHandler):
        def _send_json(self, payload, status=200):
            """Send ``payload`` as a JSON response with the given status code."""
            body = json.dumps(payload).encode("utf-8")
            self.send_response(status)
            self.send_header("Content-Type", "application/json")
            self.send_header("Content-Length", str(len(body)))
            self.end_headers()
            self.wfile.write(body)

        def _read_body(self):
            """Return the request body as text, or "" when there is none.

            A missing, malformed or non-positive Content-Length is treated as
            "no body": ``int()`` must not crash the handler thread, and a
            negative length must not turn into a read-until-EOF that blocks.
            """
            try:
                length = int(self.headers.get("Content-Length", "0"))
            except (TypeError, ValueError):
                length = 0
            if length <= 0:
                return ""
            return self.rfile.read(length).decode("utf-8", errors="replace")

        def do_GET(self):
            state.log(self.server.server_port, "GET", self.path, "")
            if self.path.startswith("/_cluster/settings"):
                # Copy under the lock so a concurrent PUT cannot mutate the
                # dict while it is being serialized.
                with state.lock:
                    persistent_settings = dict(state.persistent_settings)
                self._send_json({"persistent": persistent_settings})
                return
            if self.path.startswith("/_cluster/health"):
                self._send_json(
                    {
                        "status": "green",
                        "relocating_shards": 0,
                        "initializing_shards": 0,
                    }
                )
                return
            if self.path.startswith("/_cat/nodes"):
                if self.server.server_port in state.fail_nodes_on:
                    self._send_json({"error": "simulated rejoin failure"}, status=503)
                    return
                body = ("\n".join(state.nodes) + "\n").encode("utf-8")
                self.send_response(200)
                self.send_header("Content-Type", "text/plain")
                self.send_header("Content-Length", str(len(body)))
                self.end_headers()
                self.wfile.write(body)
                return
            self._send_json({"error": "not found"}, status=404)

        def do_POST(self):
            body = self._read_body()
            state.log(self.server.server_port, "POST", self.path, body)
            if self.path.startswith("/_flush"):
                self._send_json({"_shards": {"failed": 0}})
                return
            self._send_json({"error": "not found"}, status=404)

        def do_PUT(self):
            body = self._read_body()
            state.log(self.server.server_port, "PUT", self.path, body)
            if self.path.startswith("/_cluster/settings"):
                try:
                    payload = json.loads(body)
                except json.JSONDecodeError:
                    payload = {}
                # Valid JSON is not necessarily an object ("[]", "null", ...)
                # and "persistent" may itself be null; treat both as "nothing
                # to change" instead of raising AttributeError mid-request.
                updates = payload.get("persistent") if isinstance(payload, dict) else None
                if not isinstance(updates, dict):
                    updates = {}
                with state.lock:
                    for key, value in updates.items():
                        if value is None:
                            # null resets the setting, i.e. removes the key.
                            state.persistent_settings.pop(key, None)
                        else:
                            state.persistent_settings[key] = _render_setting(value)
                self._send_json({"acknowledged": True})
                return
            self._send_json({"error": "not found"}, status=404)

        def log_message(self, _format, *args):
            # Silence the default per-request stderr logging; requests are
            # recorded through State.log instead.
            return

    return FakeElasticsearchHandler


def _bind(port, state):
    """Create (and bind) a loopback-only server for ``port`` without serving yet."""
    return ThreadingHTTPServer(("127.0.0.1", port), handler(state))


def serve(port, state):
    """Bind ``port`` on 127.0.0.1 and serve requests until the process exits."""
    _bind(port, state).serve_forever()


def _parse_ports(raw):
    """Parse a comma-separated port list, ignoring empty items and padding."""
    return [int(item) for item in raw.split(",") if item.strip()]


def main():
    """Parse the command line, start one listener per port, then block forever."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--ports", required=True)
    parser.add_argument("--nodes", required=True)
    parser.add_argument("--log", required=True)
    parser.add_argument("--persistent-settings", default="{}")
    parser.add_argument(
        "--fail-nodes-on",
        default="",
        help="Comma-separated ports whose /_cat/nodes endpoint returns 503",
    )
    args = parser.parse_args()

    try:
        ports = _parse_ports(args.ports)
        fail_nodes_on = set(_parse_ports(args.fail_nodes_on))
    except ValueError as exc:
        parser.error(f"ports must be comma-separated integers: {exc}")
    if not ports:
        parser.error("--ports must name at least one port")

    try:
        persistent_settings = json.loads(args.persistent_settings)
    except json.JSONDecodeError as exc:
        parser.error(f"--persistent-settings is not valid JSON: {exc}")
    if not isinstance(persistent_settings, dict):
        parser.error("--persistent-settings must be a JSON object")

    state = State(
        args.nodes.split(","),
        args.log,
        persistent_settings,
        fail_nodes_on,
    )

    # Bind every port here, in the main thread, before serving any of them.
    # A bind failure (for example the port is still in use) then aborts the
    # whole process with a traceback instead of killing one daemon thread
    # while the process idles forever looking healthy.
    servers = [_bind(port, state) for port in ports]
    for httpd in servers:
        threading.Thread(target=httpd.serve_forever, daemon=True).start()

    # Daemon threads do the work; park the main thread until the process is killed.
    threading.Event().wait()


if __name__ == "__main__":
    main()
/tmp/elasticstack-cluster-settings-api.jsonl + tasks: + - name: Stop previous fake API if present + ansible.builtin.shell: | + set -o pipefail + if [ -f /tmp/elasticstack-cluster-settings-api.pid ]; then + kill "$(cat /tmp/elasticstack-cluster-settings-api.pid)" 2>/dev/null || true + fi + args: + executable: /bin/bash + changed_when: false + + - name: Reset fake API log + ansible.builtin.copy: + dest: "{{ _fake_es_log }}" + content: "" + mode: "0644" + + - name: Start fake API server + ansible.builtin.shell: | + set -o pipefail + nohup python3 {{ playbook_dir }}/../fakes/fake_es_rolling_api.py \ + --ports 19210 \ + --nodes es1 \ + --log {{ _fake_es_log }} >/tmp/elasticstack-cluster-settings-api.out 2>&1 & + echo $! + args: + executable: /bin/bash + register: _fake_es_api + changed_when: true + + - name: Store fake API PID + ansible.builtin.copy: + dest: /tmp/elasticstack-cluster-settings-api.pid + content: "{{ _fake_es_api.stdout }}" + mode: "0644" + + - name: Wait for fake API + ansible.builtin.uri: + url: http://127.0.0.1:19210/_cluster/settings?flat_settings=true + method: GET + register: _fake_api_settings + until: _fake_api_settings.status | default(0) == 200 + retries: 10 + delay: 1 + +- name: Cluster settings contract + hosts: localhost + gather_facts: false + vars: + elasticsearch_logsdb: true + elasticsearch_cluster_settings: + action.destructive_requires_name: "true" + elasticsearch_security: false + elasticstack_no_log: false + elasticsearch_http_protocol: http + elasticsearch_api_host: 127.0.0.1 + elasticstack_elasticsearch_http_port: 19210 + elasticsearch_validate_api_certs: false + elasticstack_ca_host: localhost + tasks: + - name: Apply cluster settings first time + ansible.builtin.include_tasks: ../../roles/elasticsearch/tasks/elasticsearch-cluster-settings.yml + + - name: Read fake API log after first apply + ansible.builtin.command: + cmd: cat /tmp/elasticstack-cluster-settings-api.jsonl + register: _fake_api_log + changed_when: false + + - name: 
Assert first apply sent expected settings + ansible.builtin.assert: + that: + - '_fake_api_log.stdout is search(''"method": "PUT"'')' + - "_fake_api_log.stdout is search('cluster.logsdb.enabled')" + - "_fake_api_log.stdout is search('action.destructive_requires_name')" + + - name: Reset fake API log + ansible.builtin.copy: + dest: /tmp/elasticstack-cluster-settings-api.jsonl + content: "" + mode: "0644" + + - name: Apply matching cluster settings again + ansible.builtin.include_tasks: ../../roles/elasticsearch/tasks/elasticsearch-cluster-settings.yml + + - name: Read fake API log after idempotent apply + ansible.builtin.command: + cmd: cat /tmp/elasticstack-cluster-settings-api.jsonl + register: _fake_api_log + changed_when: false + + - name: Assert matching settings did not call PUT + ansible.builtin.assert: + that: + - '_fake_api_log.stdout is search(''"method": "GET"'')' + - '_fake_api_log.stdout is not search(''"method": "PUT"'')' + + - name: Reset fake API log before missing credentials case + ansible.builtin.copy: + dest: /tmp/elasticstack-cluster-settings-api.jsonl + content: "" + mode: "0644" + + - name: Skip secured settings without password + ansible.builtin.include_tasks: ../../roles/elasticsearch/tasks/elasticsearch-cluster-settings.yml + vars: + elasticsearch_security: true + elasticsearch_cluster_settings: + indices.recovery.max_bytes_per_sec: "80mb" + + - name: Read fake API log after skipped secured apply + ansible.builtin.command: + cmd: cat /tmp/elasticstack-cluster-settings-api.jsonl + register: _fake_api_log + changed_when: false + + - name: Assert missing credentials skipped all API calls + ansible.builtin.assert: + that: + - _fake_api_log.stdout | length == 0 + +- name: Stop fake Elasticsearch API for cluster settings + hosts: localhost + gather_facts: false + tasks: + - name: Stop fake API server + ansible.builtin.shell: | + set -o pipefail + if [ -f /tmp/elasticstack-cluster-settings-api.pid ]; then + kill "$(cat 
/tmp/elasticstack-cluster-settings-api.pid)" 2>/dev/null || true + fi + args: + executable: /bin/bash + changed_when: false diff --git a/tests/integration/elasticsearch_template_contract.yml b/tests/integration/elasticsearch_template_contract.yml new file mode 100644 index 00000000..07c773b9 --- /dev/null +++ b/tests/integration/elasticsearch_template_contract.yml @@ -0,0 +1,92 @@ +--- +- name: Elasticsearch template contract - single node + hosts: single_es + gather_facts: false + vars: + ansible_managed: "Ansible managed" + elasticstack_elasticsearch_group_name: single_es + elasticsearch_datapath: /var/lib/elasticsearch + elasticsearch_logpath: /var/log/elasticsearch + elasticsearch_clustername: template-single + elasticsearch_cluster_set_up: false + elasticsearch_ml_enabled: false + elasticsearch_security: false + elasticsearch_memory_lock: false + elasticsearch_extra_config: + cluster.name: should-be-filtered + thread_pool.write.queue_size: 500 + tasks: + - name: Include Elasticsearch role vars + ansible.builtin.include_vars: ../../roles/elasticsearch/vars/main.yml + + - name: Render single-node elasticsearch.yml + ansible.builtin.template: + src: ../../roles/elasticsearch/templates/elasticsearch.yml.j2 + dest: /tmp/elasticsearch-template-single.yml + mode: "0644" + + - name: Read single-node elasticsearch.yml + ansible.builtin.slurp: + src: /tmp/elasticsearch-template-single.yml + register: _single_template + + - name: Assert single-node template contract + ansible.builtin.assert: + that: + - "'cluster.name: \"template-single\"' in (_single_template.content | b64decode)" + - "'discovery.type: single-node' in (_single_template.content | b64decode)" + - "'xpack.security.enabled: false' in (_single_template.content | b64decode)" + - "'cluster.name: should-be-filtered' not in (_single_template.content | b64decode)" + - "'thread_pool.write.queue_size: 500' in (_single_template.content | b64decode)" + +- name: Elasticsearch template contract - multi node external 
certs + hosts: elasticsearch + gather_facts: false + vars: + ansible_managed: "Ansible managed" + elasticstack_elasticsearch_group_name: elasticsearch + elasticsearch_datapath: /var/lib/elasticsearch + elasticsearch_logpath: /var/log/elasticsearch + elasticsearch_clustername: template-multi + elasticsearch_cluster_set_up: false + elasticsearch_ml_enabled: true + elasticsearch_security: true + elasticsearch_http_security: true + elasticsearch_security_enrollment: false + elasticsearch_ssl_verification_mode: certificate + elasticsearch_cert_source: external + _elasticsearch_transport_cert_format: pem + _elasticsearch_http_cert_format: pem + _elasticsearch_external_has_ca: true + elasticsearch_memory_lock: true + elasticsearch_fs_repo: + - /mnt/snapshots + tasks: + - name: Include Elasticsearch role vars + ansible.builtin.include_vars: ../../roles/elasticsearch/vars/main.yml + + - name: Render multi-node elasticsearch.yml + ansible.builtin.template: + src: ../../roles/elasticsearch/templates/elasticsearch.yml.j2 + dest: "/tmp/elasticsearch-template-{{ inventory_hostname }}.yml" + mode: "0644" + + - name: Read multi-node elasticsearch.yml + ansible.builtin.slurp: + src: "/tmp/elasticsearch-template-{{ inventory_hostname }}.yml" + register: _multi_template + + - name: Assert multi-node template contract + ansible.builtin.assert: + that: + - "'cluster.name: \"template-multi\"' in (_multi_template.content | b64decode)" + - "'discovery.type: single-node' not in (_multi_template.content | b64decode)" + - "'10.10.10.1' in (_multi_template.content | b64decode)" + - "'10.10.10.2' in (_multi_template.content | b64decode)" + - "'es-node-1' in (_multi_template.content | b64decode)" + - "'es-node-2' in (_multi_template.content | b64decode)" + - "'xpack.security.transport.ssl.certificate: certs/' ~ inventory_hostname ~ '-transport.crt' in (_multi_template.content | b64decode)" + - "'xpack.security.http.ssl.certificate: certs/' ~ inventory_hostname ~ '-http.crt' in 
---
# Contract test for roles/elasticsearch/tasks/elasticsearch-upgrade-detection.yml.
#
# Every scenario below follows the same three steps:
#   1. fake the installed-package facts with set_fact,
#   2. include the upgrade-detection task file,
#   3. assert on _elasticsearch_needs_rolling_upgrade (expected to be set by
#      the included task file) or, for the last scenario, on the failure
#      message.
- name: Elasticsearch upgrade detection contract
  hosts: localhost
  gather_facts: false
  vars:
    # Play-level default; individual scenarios override it where needed.
    elasticstack_version: "latest"
  tasks:
    # Scenario 1: installed 9.1.0, target release 9, version "latest".
    - name: Detect no upgrade when installed major matches release
      block:
        - name: Set matching installed package facts
          ansible.builtin.set_fact:
            ansible_facts:
              packages:
                elasticsearch:
                  - version: "9.1.0"

        - name: Run upgrade detection
          ansible.builtin.include_tasks: ../../roles/elasticsearch/tasks/elasticsearch-upgrade-detection.yml

        - name: Assert matching release does not need rolling upgrade
          ansible.builtin.assert:
            that:
              - "not _elasticsearch_needs_rolling_upgrade | bool"
      vars:
        elasticstack_release: 9

    # Scenario 2: installed 9.1.0, explicitly pinned to 9.2.0.
    - name: Detect pinned version upgrade
      block:
        - name: Set older installed package facts
          ansible.builtin.set_fact:
            ansible_facts:
              packages:
                elasticsearch:
                  - version: "9.1.0"

        - name: Run upgrade detection
          ansible.builtin.include_tasks: ../../roles/elasticsearch/tasks/elasticsearch-upgrade-detection.yml

        - name: Assert pinned newer version needs rolling upgrade
          ansible.builtin.assert:
            that:
              - "_elasticsearch_needs_rolling_upgrade | bool"
      vars:
        elasticstack_release: 9
        elasticstack_version: "9.2.0"

    # Scenario 3: installed 8.19.6, target release 9.
    - name: Detect major release upgrade from supported 8.19
      block:
        - name: Set supported 8.19 package facts
          ansible.builtin.set_fact:
            ansible_facts:
              packages:
                elasticsearch:
                  - version: "8.19.6"

        - name: Run upgrade detection
          ansible.builtin.include_tasks: ../../roles/elasticsearch/tasks/elasticsearch-upgrade-detection.yml

        - name: Assert supported major upgrade needs rolling upgrade
          ansible.builtin.assert:
            that:
              - "_elasticsearch_needs_rolling_upgrade | bool"
      vars:
        elasticstack_release: 9
        elasticstack_version: "latest"

    # Scenario 4: no elasticsearch package in the package facts at all.
    - name: Detect absent package as no pre-install rolling upgrade
      block:
        - name: Set absent package facts
          ansible.builtin.set_fact:
            ansible_facts:
              packages: {}

        - name: Run upgrade detection
          ansible.builtin.include_tasks: ../../roles/elasticsearch/tasks/elasticsearch-upgrade-detection.yml

        - name: Assert absent package does not need pre-install rolling upgrade
          ansible.builtin.assert:
            that:
              - "not _elasticsearch_needs_rolling_upgrade | bool"
      vars:
        elasticstack_release: 9
        elasticstack_version: "latest"

    # Scenario 5: installed 8.18.2, target release 9. The detection is
    # expected to fail; the explicit fail task makes the block fail even when
    # detection unexpectedly succeeds, and the rescue assertion then rejects
    # that case because the message does not match.
    - name: Reject unsupported 8.x to 9.x upgrade path
      block:
        - name: Set unsupported 8.x package facts
          ansible.builtin.set_fact:
            ansible_facts:
              packages:
                elasticsearch:
                  - version: "8.18.2"

        - name: Run upgrade detection expecting failure
          ansible.builtin.include_tasks: ../../roles/elasticsearch/tasks/elasticsearch-upgrade-detection.yml

        - name: Unsupported upgrade path should fail
          ansible.builtin.fail:
            msg: "Unsupported 8.x to 9.x upgrade path did not fail."

      rescue:
        - name: Assert unsupported upgrade failure is clear
          ansible.builtin.assert:
            that:
              - "ansible_failed_result.msg is search('Elasticsearch 9.x requires 8.19.x first')"
      vars:
        elasticstack_release: 9
        elasticstack_version: "latest"
+ args: + executable: /bin/bash + register: _fake_es_api + changed_when: true + + - name: Store fake API PID + ansible.builtin.copy: + dest: /tmp/elasticstack-rolling-restart-api.pid + content: "{{ _fake_es_api.stdout }}" + mode: "0644" + + - name: Wait for fake API + ansible.builtin.uri: + url: http://127.0.0.1:19200/_cluster/health + method: GET + register: _fake_api_health + until: _fake_api_health.status | default(0) == 200 + retries: 10 + delay: 1 + +- name: Rolling restart contract - flush enabled + hosts: elasticsearch + gather_facts: false + vars: + _elasticsearch_restart_test_mode: true + elasticsearch_security: false + elasticstack_no_log: false + elasticstack_elasticsearch_group_name: elasticsearch + elasticsearch_http_protocol: http + elasticsearch_validate_api_certs: false + elasticsearch_config_restart_strategy: rolling + elasticsearch_config_restart_flush: true + elasticsearch_config_restart_wait_status: yellow + elasticsearch_config_restart_health_retries: 1 + elasticsearch_config_restart_health_delay: 0 + elasticsearch_config_restart_node_retries: 1 + elasticsearch_config_restart_node_delay: 0 + tasks: + - name: Mark all hosts for restart + ansible.builtin.set_fact: + _elasticsearch_restart_requested: true + + - name: Include rolling restart task # noqa: run-once[task] + ansible.builtin.include_tasks: ../../roles/elasticsearch/tasks/restart_and_verify_elasticsearch_rolling.yml + run_once: true + +- name: Verify flush-enabled contract + hosts: localhost + gather_facts: false + tasks: + - name: Read fake API log + ansible.builtin.command: + cmd: cat /tmp/elasticstack-rolling-restart-api.jsonl + register: _fake_api_log + changed_when: false + + - name: Assert both fake nodes were processed with flush + ansible.builtin.assert: + that: + - '_fake_api_log.stdout is search(''"path": "/_flush", "port": 19200'')' + - '_fake_api_log.stdout is search(''"path": "/_flush", "port": 19201'')' + - "_fake_api_log.stdout is search('primaries')" + - 
"_fake_api_log.stdout is search('null')" + + - name: Reset fake API log + ansible.builtin.copy: + dest: /tmp/elasticstack-rolling-restart-api.jsonl + content: "" + mode: "0644" + +- name: Rolling restart contract - flush disabled and subset restart + hosts: elasticsearch + gather_facts: false + vars: + _elasticsearch_restart_test_mode: true + elasticsearch_security: false + elasticstack_no_log: false + elasticstack_elasticsearch_group_name: elasticsearch + elasticsearch_http_protocol: http + elasticsearch_validate_api_certs: false + elasticsearch_config_restart_strategy: rolling + elasticsearch_config_restart_flush: false + elasticsearch_config_restart_wait_status: green + elasticsearch_config_restart_health_retries: 1 + elasticsearch_config_restart_health_delay: 0 + elasticsearch_config_restart_node_retries: 1 + elasticsearch_config_restart_node_delay: 0 + tasks: + - name: Mark only es1 for restart + ansible.builtin.set_fact: + _elasticsearch_restart_requested: "{{ inventory_hostname == 'es1' }}" + + - name: Include rolling restart task # noqa: run-once[task] + ansible.builtin.include_tasks: ../../roles/elasticsearch/tasks/restart_and_verify_elasticsearch_rolling.yml + run_once: true + +- name: Verify flush-disabled subset contract + hosts: localhost + gather_facts: false + tasks: + - name: Read fake API log + ansible.builtin.command: + cmd: cat /tmp/elasticstack-rolling-restart-api.jsonl + register: _fake_api_log + changed_when: false + + - name: Assert only es1 was processed and flush was skipped + ansible.builtin.assert: + that: + - '_fake_api_log.stdout is search(''"port": 19200'')' + - '_fake_api_log.stdout is not search(''"port": 19201'')' + - '_fake_api_log.stdout is not search(''"path": "/_flush"'')' + + - name: Reset fake API log + ansible.builtin.copy: + dest: /tmp/elasticstack-rolling-restart-api.jsonl + content: "" + mode: "0644" + +- name: Rolling restart contract - missing secured credentials fails + hosts: elasticsearch[0] + gather_facts: false + 
vars: + _elasticsearch_restart_test_mode: true + elasticsearch_security: true + elasticstack_no_log: false + elasticstack_elasticsearch_group_name: elasticsearch + elasticsearch_config_restart_strategy: rolling + elasticsearch_config_restart_flush: false + elasticsearch_config_restart_wait_status: yellow + tasks: + - name: Mark one host for restart + ansible.builtin.set_fact: + _elasticsearch_restart_requested: "{{ inventory_hostname == 'es1' }}" + + - name: Verify missing credentials fail before restart + block: + - name: Include rolling restart task without credentials # noqa: run-once[task] + ansible.builtin.include_tasks: ../../roles/elasticsearch/tasks/restart_and_verify_elasticsearch_rolling.yml + run_once: true + + - name: Missing credentials should fail # noqa: run-once[task] + ansible.builtin.fail: + msg: Missing credentials did not fail. + run_once: true + + rescue: + - name: Assert missing credentials error is clear # noqa: run-once[task] + ansible.builtin.assert: + that: + - "ansible_failed_result.msg is search('elastic user password is unavailable')" + run_once: true + +- name: Restart fake API with simulated rejoin failure on es1 + hosts: localhost + gather_facts: false + tasks: + - name: Stop previous fake API + ansible.builtin.shell: | + set -o pipefail + if [ -f /tmp/elasticstack-rolling-restart-api.pid ]; then + kill "$(cat /tmp/elasticstack-rolling-restart-api.pid)" 2>/dev/null || true + fi + args: + executable: /bin/bash + changed_when: false + + - name: Reset fake API log + ansible.builtin.copy: + dest: /tmp/elasticstack-rolling-restart-api.jsonl + content: "" + mode: "0644" + + - name: Start fake API with es1 rejoin failure + ansible.builtin.shell: | + set -o pipefail + nohup python3 {{ playbook_dir }}/../fakes/fake_es_rolling_api.py \ + --ports 19200,19201 \ + --nodes es1,es2 \ + --fail-nodes-on 19200 \ + --log /tmp/elasticstack-rolling-restart-api.jsonl \ + >/tmp/elasticstack-rolling-restart-api.out 2>&1 & + echo $! 
+ args: + executable: /bin/bash + register: _fake_es_api + changed_when: true + + - name: Store fake API PID + ansible.builtin.copy: + dest: /tmp/elasticstack-rolling-restart-api.pid + content: "{{ _fake_es_api.stdout }}" + mode: "0644" + + - name: Wait for fake API + ansible.builtin.uri: + url: http://127.0.0.1:19201/_cluster/health + method: GET + register: _fake_api_health + until: _fake_api_health.status | default(0) == 200 + retries: 10 + delay: 1 + +- name: Rolling restart contract - cleanup via surviving peer on failure + hosts: elasticsearch + gather_facts: false + vars: + _elasticsearch_restart_test_mode: true + elasticsearch_security: false + elasticstack_no_log: false + elasticstack_elasticsearch_group_name: elasticsearch + elasticsearch_http_protocol: http + elasticsearch_validate_api_certs: false + elasticsearch_config_restart_strategy: rolling + elasticsearch_config_restart_flush: false + elasticsearch_config_restart_wait_status: green + elasticsearch_config_restart_health_retries: 1 + elasticsearch_config_restart_health_delay: 0 + elasticsearch_config_restart_node_retries: 1 + elasticsearch_config_restart_node_delay: 0 + tasks: + - name: Mark only es1 for restart + ansible.builtin.set_fact: + _elasticsearch_restart_requested: "{{ inventory_hostname == 'es1' }}" + + - name: Expect rolling restart to fail but cleanup via peer + block: + - name: Include rolling restart task # noqa: run-once[task] + ansible.builtin.include_tasks: ../../roles/elasticsearch/tasks/restart_and_verify_elasticsearch_rolling.yml + run_once: true + + - name: Rejoin failure should have raised # noqa: run-once[task] + ansible.builtin.fail: + msg: Simulated rejoin failure did not raise. + run_once: true + + rescue: + - name: Absorb expected failure # noqa: run-once[task] + ansible.builtin.debug: + msg: Rolling restart failed as expected; cleanup should have run via es2. 
+ run_once: true + +- name: Verify cleanup used surviving peer + hosts: localhost + gather_facts: false + tasks: + - name: Read fake API log + ansible.builtin.command: + cmd: cat /tmp/elasticstack-rolling-restart-api.jsonl + register: _fake_api_log + changed_when: false + + - name: Assert cleanup PUT with null landed on port 19201 (surviving peer) + ansible.builtin.assert: + that: + - '_fake_api_log.stdout is search(''"method": "PUT", "path": "/_cluster/settings", "port": 19201'')' + - "_fake_api_log.stdout is search('null')" + + - name: Reset fake API log + ansible.builtin.copy: + dest: /tmp/elasticstack-rolling-restart-api.jsonl + content: "" + mode: "0644" + +- name: Stop fake Elasticsearch APIs + hosts: localhost + gather_facts: false + tasks: + - name: Stop fake API server + ansible.builtin.shell: | + set -o pipefail + if [ -f /tmp/elasticstack-rolling-restart-api.pid ]; then + kill "$(cat /tmp/elasticstack-rolling-restart-api.pid)" 2>/dev/null || true + fi + args: + executable: /bin/bash + changed_when: false diff --git a/tests/integration/rolling_restart_inventory.ini b/tests/integration/rolling_restart_inventory.ini new file mode 100644 index 00000000..4430fa6c --- /dev/null +++ b/tests/integration/rolling_restart_inventory.ini @@ -0,0 +1,3 @@ +[elasticsearch] +es1 ansible_connection=local elasticsearch_api_host=127.0.0.1 elasticstack_elasticsearch_http_port=19200 elasticsearch_nodename=es1 +es2 ansible_connection=local elasticsearch_api_host=127.0.0.1 elasticstack_elasticsearch_http_port=19201 elasticsearch_nodename=es2