Update rejoin role to make it more automatic (needs testing)

Need to expand variables
metno · Jan 23, 2025 · aa9d7ff · aa9d7ff
1 parent fd5448d
commit aa9d7ff
Show file tree

Hide file tree

Showing 6 changed files with 98 additions and 18 deletions.
diff --git a/ansible/README.md b/ansible/README.md
@@ -182,7 +182,11 @@ repmgr standby switchover -f /etc/repmgr.conf --siblings-follow
 This is used in the case where the primary has gone down (e.g. unplanned downtime of a data room).
 Make sure you know which one you want to promote!
 
-TODO: can (should?) this be automated?
+```terminal
+ansible-playbook -i inventory.yml -e primary=lard-a -e standby=lard-b rejoin.yml
+```
+
+This can also be done manually by following these steps:
 
 #### A. Promote standby node to primary
 
@@ -194,7 +198,7 @@ TODO: can (should?) this be automated?
    postgres@lard-b:~$ repmgr -f /etc/repmgr.conf cluster show
    ```
 
-   The primary should say it's **uncreachable**.
+   The primary should say it's **unreachable**.
 
 1. Then promote the standby to primary:
 
@@ -207,6 +211,12 @@ TODO: can (should?) this be automated?
 1. Then move the IP in the OpenStack GUI (`Network → Floating IPs`, dissasociate
    it then associated it with the ipalias port on the other VM).
 
+1. Restart the LARD ingestion service on the new primary:
+
+   ```terminal
+   ubuntu@lard-b:~$ sudo systemctl start lard_ingestion.service
+   ```
+
 #### B. Rejoin old primary
 
 The cluster will be in a slightly confused state, because this VM still thinks
@@ -234,12 +244,10 @@ be no data loss.
 1. With a **playbook**:
 
    ```terminal
-   ansible-playbook -i inventory.yml -e rejoin=lard-a -e primary=lard-b rejoin.yml 
+   ansible-playbook -i inventory.yml -e primary=lard-a -e standby=lard-b rejoin.yml --skip-tags promote
    ```
 
-   where `rejoin` is the host name of the primary node that has been down and should now be a standby.
-
-If you want to do this **manually** you can follow the steps in the `rejoin` role tasks.
+   where `primary` is the host name of the old primary node that has been down and should now be a standby, and `standby` is the host name of the old standby that was promoted to primary.
 
 #### Testing
 

diff --git a/ansible/rejoin.yml b/ansible/rejoin.yml
@@ -1,14 +1,46 @@
 ---
 - name: Rejoin
-  hosts: "{{ rejoin }}"
+  hosts: localhost
   remote_user: ubuntu
   vars:
     # Old primary host that went down
-    rejoin: # provide via cmd
-    # New primary host after it was promoted
     primary: # provide via cmd
+    # Old standby that will be promoted to primary
+    standby: # provide via cmd
 
-  roles:
-    - role: rejoin
+  tasks:
+    - name: Promote standby
+      ansible.builtin.include_role:
+        name: rejoin
+        tasks_from: promote.yml
+        apply:
+          delegate_to: "{{ standby }}"
+      tags: "promote"
+
+    - name: Perform IP switchover
+      ansible.builtin.include_role:
+        name: ostack
+        tasks_from: move_floating_ip.yml
+      vars:
+        ostack_primary: "{{ primary }}"
+        ostack_standby: "{{ standby }}"
+      tags: "promote"
+
+    # TODO: should this happen before or after rejoining the old primary
+    - name: Restart LARD ingestion service
+      ansible.builtin.systemd_service:
+        name: lard_ingestion
+        state: restarted
+      become: true
+      delegate_to: "{{ standby }}"
+      tags: "promote"
+
+    - name: Rejoin old primary
+      ansible.builtin.include_role:
+        name: rejoin
+        tasks_from: rejoin.yml
+        apply:
+          delegate_to: "{{ primary }}"
       vars:
-        rejoin_primary_ip: "{{ hostvars[primary].ansible_host }}"
+        # TODO: this should be done via DNS once we have those set up
+        rejoin_ip: "{{ hostvars[standby].ansible_host }}"  # must match the `rejoin_ip` variable read by the rejoin role tasks
diff --git a/ansible/roles/rejoin/default/main.yml b/ansible/roles/rejoin/default/main.yml
@@ -1,2 +1,2 @@
 ---
-rejoin_primary_ip:
+rejoin_ip:
diff --git a/ansible/roles/rejoin/tasks/promote.yml b/ansible/roles/rejoin/tasks/promote.yml
@@ -0,0 +1,40 @@
+---
+- name: Check cluster
+  ansible.builtin.command: repmgr -f /etc/repmgr.conf cluster show
+  become: true
+  become_user: postgres
+  register: cluster_status
+  changed_when: false
+
+  # TODO: check that primary says "unreachable"?
+- name: Print cluster status
+  ansible.builtin.debug:
+    msg: "{{ cluster_status }}"
+
+- name: Dry run of standby promotion
+  ansible.builtin.command: repmgr -f /etc/repmgr.conf standby promote --dry-run
+  become: true
+  become_user: postgres
+  changed_when: false
+  register: dry_run_promote
+
+- name: Print result of dry-run
+  ansible.builtin.debug:
+    msg: "{{ dry_run_promote }}"
+
+# TODO: should postgres service be restarted?
+# TODO: check that primary says "failed"?
+- name: Promote standby
+  ansible.builtin.command: repmgr -f /etc/repmgr.conf standby promote
+  become: true
+  become_user: postgres
+  changed_when: true
+  # TODO: this will keep crashing until the IP alias is moved to the standby
+  # So probably best to restart after the IP switch
+  # - name: Start LARD ingestion service
+  #   ansible.builtin.systemd_service:
+  #     daemon_reload: true
+  #     name: lard_ingestion
+  #     state: restarted
+  #     enabled: true
+  #   become: true
diff --git a/ansible/roles/rejoin/tasks/main.yml → ansible/roles/rejoin/tasks/rejoin.yml b/ansible/roles/rejoin/tasks/main.yml → ansible/roles/rejoin/tasks/rejoin.yml
@@ -8,7 +8,7 @@
 - name: Dry run of rejoin
   ansible.builtin.command: >
     repmgr node rejoin
-    -f /etc/repmgr.conf -d 'host='{{ rejoin_primary_ip }}' user=repmgr dbname=repmgr connect_timeout=2'
+    -f /etc/repmgr.conf -d 'host='{{ rejoin_ip }}' user=repmgr dbname=repmgr connect_timeout=2'
     --force-rewind=/usr/lib/postgresql/16/bin/pg_rewind --verbose --dry-run
   become: true
   become_user: postgres
@@ -24,7 +24,7 @@
 - name: Rejoin old primary as standby
   ansible.builtin.command: >
     repmgr node rejoin
-    -f /etc/repmgr.conf -d 'host='{{ rejoin_primary_ip }}' user=repmgr dbname=repmgr connect_timeout=2'
+    -f /etc/repmgr.conf -d 'host='{{ rejoin_ip }}' user=repmgr dbname=repmgr connect_timeout=2'
     --force-rewind=/usr/lib/postgresql/16/bin/pg_rewind --verbose
   become: true
   become_user: postgres

diff --git a/ansible/switchover.yml b/ansible/switchover.yml
@@ -16,13 +16,13 @@
         name: postgresql
         state: restarted
       become: true
-      delegate_to: primary
+      delegate_to: "{{ primary }}"
 
     - name: Perform Postgres switchover
       ansible.builtin.include_role:
         name: switchover
         apply:
-          delegate_to: standby
+          delegate_to: "{{ standby }}"
 
     - name: Perform IP switchover
       ansible.builtin.include_role:
@@ -37,4 +37,4 @@
         name: lard_ingestion
         state: restarted
       become: true
-      delegate_to: standby
+      delegate_to: "{{ standby }}"