diff --git a/build/ansible-runner/Dockerfile b/build/ansible-runner/Dockerfile index 6411eb40be3..830a00394b4 100644 --- a/build/ansible-runner/Dockerfile +++ b/build/ansible-runner/Dockerfile @@ -8,8 +8,9 @@ RUN apt-get clean && \ apt-get install -y --no-install-recommends python-minimal python-pip netcat iproute2 jq sshpass \ curl openssh-client python-setuptools && rm -rf /var/lib/apt/lists/* +RUN pip install --upgrade pip #Installing ansible and dependencies for k8s module -RUN pip install ansible==2.7.3 openshift jmespath +RUN pip install ansible==2.7.3 openshift jmespath boto boto3 RUN touch /mnt/parameters.yml /mnt/cloud_config.yml @@ -21,8 +22,7 @@ RUN gcloud --version #Installing Kubectl ENV KUBE_LATEST_VERSION="v1.12.0" RUN curl -L https://storage.googleapis.com/kubernetes-release/release/${KUBE_LATEST_VERSION}/bin/linux/amd64/kubectl -o /usr/local/bin/kubectl && \ - chmod +x /usr/local/bin/kubectl && \ - curl -o /usr/local/bin/aws-iam-authenticator https://amazon-eks.s3-us-west-2.amazonaws.com/1.10.3/2018-07-26/bin/linux/amd64/aws-iam-authenticator && \chmod +x /usr/local/bin/aws-iam-authenticator + chmod +x /usr/local/bin/kubectl #Adding hosts entries and making ansible folders RUN mkdir /etc/ansible/ /ansible && \ diff --git a/ceps/0001-cep-template.md b/ceps/0001-cep-template.md new file mode 100644 index 00000000000..6b10588fb1e --- /dev/null +++ b/ceps/0001-cep-template.md @@ -0,0 +1,173 @@ +--- +cep-number: 0 +title: My CEP +authors: +- "@ksatchit" +owners: + +- TBD + +- "@ksatchit" +editor: TBD +creation-date: yyyy-mm-dd +last-updated: yyyy-mm-dd +status: provisional/implementable/implemented/deferred/rejected/withdrawn/replaced +see-also: + +- CEP-1 + +- CEP-2 + +replaces: + +- CEP-3 + +superseded-by: + +- CEP-100 +--- + +# Title + +This is the title of the Chaos Enhancement Proposal (CEP). +Keep it simple and descriptive. +A good title can help communicate what the CEP is and should be considered as part of any review. 
+ +The title should be lowercased and spaces/punctuation should be replaced with `-`. + +To get started with this template: +1. **Make a copy of this template.** + Name it `YYYYMMDD-my-title.md`. +1. **Fill out the "overview" sections.** + This includes the Summary and Motivation sections. +1. **Create a PR.** + Name it `[CEP NUMBER] Title`, e.g. `[CEP 20191014] Initial work on Chaos Operator`. + Assign it to owner(s) that are working on these features. +1. **Merge early.** + Avoid getting hung up on specific details and instead aim to get the goal of the CEP merged quickly. + The best way to do this is to just start with the "Overview" sections and fill out details incrementally in follow-on PRs. + View anything marked as `provisional` as a working document that is subject to change. + Aim for single-topic PRs to keep discussions focused. + If you disagree with what is already in a document, open a new PR with suggested changes. + +The canonical place for the latest set of instructions (and the likely source of this file) is [here](/ceps/0001-cep-template.md). + +The `Metadata` section above is intended to support the creation of tooling around the CEP process. + +## Table of Contents + +A table of contents is helpful for quickly jumping to sections of a CEP and for highlighting any additional information provided beyond the standard CEP template. +[Tools for generating](https://github.com/ekalinin/github-markdown-toc) a table of contents from markdown are available. 
+ +- [Table of Contents](#table-of-contents) + +- [Summary](#summary) + +- [Motivation](#motivation) + + - [Goals](#goals) + - [Non-Goals](#non-goals) + +- [Proposal](#proposal) + + - [User Stories](#user-stories-optional) + - [Story 1](#story-1) + - [Story 2](#story-2) + - [Implementation Details/Notes/Constraints](#implementation-detailsnotesconstraints-optional) + - [Risks and Mitigations](#risks-and-mitigations) + +- [Graduation Criteria](#graduation-criteria) + +- [Implementation History](#implementation-history) + +- [Drawbacks](#drawbacks-optional) + +- [Alternatives](#alternatives-optional) + +- [Infrastructure Needed [optional]](#infrastructure-needed-optional) + +## Summary + +The `Summary` section is incredibly important for producing high quality user focused documentation such as release notes +or a development road map. It should be possible to collect this information before implementation begins in order to avoid +requiring implementors to split their attention between writing release notes and implementing the feature itself. +CEP editors should help to ensure that the tone and content of the `Summary` section is useful for a wide audience. + +A good summary is probably at least a paragraph in length. + +## Motivation + +This section is for explicitly listing the motivation, goals and non-goals of this CEP. +Describe why the change is important and the benefits to users. +The motivation section can optionally provide links to [experience reports](https://github.com/golang/go/wiki/ExperienceReports) to demonstrate the interest in a CEP +within the wider Litmus community. + +### Goals + +List the specific goals of the CEP. +How will we know that this has succeeded? + +### Non-Goals + +What is out of scope for this CEP? +Listing non-goals helps to focus discussion and make progress. + +## Proposal + +This is where we get down to the nitty gritty of what the proposal actually is. + +### User Stories (optional) + +Detail the things that people will be able to do if this CEP is implemented. 
+Include as much detail as possible so that people can understand the "how" of the system. +The goal here is to make this feel real for users without getting bogged down. + +#### Story 1 + +#### Story 2 + +### Implementation Details/Notes/Constraints (optional) + +What are the caveats to the implementation? +What are some important details that didn't come across above? +Go into as much detail as necessary here. +This might be a good place to talk about core concepts and how they relate. + +### Risks and Mitigations + +What are the risks of this proposal and how do we mitigate them? +Think broadly. +For example, consider both security and how this will impact the larger Kubernetes ecosystem. + +## Graduation Criteria + +How will we know that this has succeeded? +Gathering user feedback is crucial for building high quality experiences and owners have the important responsibility +of setting milestones for stability and completeness. + +## Implementation History + +Major milestones in the life cycle of a CEP should be tracked in `Implementation History`. +Major milestones might include the following. + +- the `Summary` and `Motivation` sections being merged signaling owner acceptance +- the `Proposal` section being merged signaling agreement on a proposed design +- the date implementation started +- the first Litmus release where an initial version of the CEP was available +- the version of Litmus where the CEP graduated to general availability +- when the CEP was retired or superseded + +## Drawbacks (optional) + +Why should this CEP _not_ be implemented? + +## Alternatives (optional) + +Similar to the `Drawbacks` section the `Alternatives` section is used to highlight and record other possible approaches +to delivering the value proposed by a CEP. + +## Infrastructure Needed (optional) + +Use this section if you need things from the project/owner. +Examples include a new subproject, repos requested, github details. 
+Listing these here allows an owner to get the process for these resources started right away. diff --git a/ceps/README.md b/ceps/README.md new file mode 100644 index 00000000000..1fed4d37ceb --- /dev/null +++ b/ceps/README.md @@ -0,0 +1,50 @@ +# Chaos Enhancement Proposals (CEPs) + +A Chaos Enhancement Proposal (CEP) is a way to propose, communicate and coordinate on new efforts for the LitmusChaos project. +You can read the full details of the project in [CEP-1](0001-chaos-enhancement-proposal-process.md). + +This process is still in _alpha_ state and is mandatory for all major features beginning with release 0.9. + +## Quick start for the CEP process + +- Socialize an idea with the Litmus contributors. Make sure that others think the work is worth taking up and will help review the CEP and any code changes required. +- Follow the process outlined in the [CEP template](0001-cep-template.md) + +## FAQs + +### Do I have to use the CEP process + +No... but we hope that you will. +Over time having a rich set of CEPs in one place will make it easier for people to track what is going on in the community +and find a structured historic record. + +CEPs are required when the changes are wide-ranging & are feature-level items. +These changes are usually coordinated through Litmus maintainers. + +### Why would I want to use the CEP process + +Our aim with CEPs is to clearly communicate new efforts to the Litmus Chaos contributor community. +As such, we want to build a well curated set of clear proposals in a common format with useful metadata. + +We are inspired by KEPs, i.e., [Kubernetes Enhancement Proposals](https://github.com/kubernetes/enhancements/tree/master/keps) + +### Do I put my CEP in the root CEP directory or a SIG subdirectory + +If the CEP is mainly restricted to one SIG's purview then it should be in a CEP directory for that SIG. +If the CEP is widely impacting much of Litmus, it should be put at the root of this directory. 
+ +### What will it take for CEPs to "graduate" out of "beta" + +Things we'd like to see happen to consider CEPs well on their way. + +- A set of CEPs that show healthy process around describing an effort and recording decisions in a reasonable amount of time. +- CEPs exposed on a searchable and indexable web site. +- Presubmit checks for CEPs around metadata format and markdown validity. + +Even so, the process can evolve. As we find new techniques we can improve our processes. + +### My FAQ isn't answered here + +The CEP process is still evolving! +If something is missing or not answered here feel free to reach out to [LitmusChaos Community](https://kubernetes.slack.com/messages/CNXNB0ZTN). +If you want to propose a change to the CEP process you can open a PR on [CEP-1](0001-cep-template.md) with your proposal.
diff --git a/chaoslib/litmus/container_kill/containerd_chaos/containerd.j2 b/chaoslib/litmus/container_kill/containerd_chaos/containerd.j2
new file mode 100644
index 00000000000..66dea5eb74a
--- /dev/null
+++ b/chaoslib/litmus/container_kill/containerd_chaos/containerd.j2
@@ -0,0 +1,35 @@
+# Helper DaemonSet for containerd chaos: its pods mount the node's containerd
+# socket and crictl config so the chaos tasks can `kubectl exec ... crictl` in them.
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+  name: containerd-chaos
+spec:
+  # apps/v1 requires an explicit selector matching the pod template labels.
+  selector:
+    matchLabels:
+      app: crictl
+  template:
+    metadata:
+      labels:
+        app: crictl
+        name: containerd-chaos
+    spec:
+      containers:
+        - image: "{{ containerd_image }}"
+          imagePullPolicy: Always
+          name: containerd-chaos
+          # The container only idles (sleep 1800 s); crictl is run in it via kubectl exec.
+          command: ['sh', '-c', 'echo Hello! && sleep 1800']
+          volumeMounts:
+            - name: cri-socket
+              mountPath: /run/containerd/containerd.sock
+            - name: cri-config
+              mountPath: /etc/crictl.yaml
+      volumes:
+        - hostPath:
+            path: /run/containerd/containerd.sock
+          name: cri-socket
+        - hostPath:
+            path: /etc/crictl.yaml
+          name: cri-config
diff --git a/chaoslib/litmus/container_kill/containerd_chaos/crictl-chaos.yml b/chaoslib/litmus/container_kill/containerd_chaos/crictl-chaos.yml
new file mode 100644
index 00000000000..47ff963dbe7
--- /dev/null
+++ b/chaoslib/litmus/container_kill/containerd_chaos/crictl-chaos.yml
@@ -0,0 +1,163 @@
+---
+# Container-kill chaos for containerd nodes, driven through crictl.
+# Vars read here: lib_image, namespace, label, action; app_pod and
+# app_container are looked up from the cluster only when undefined.
+#   action == "killapp"           : deploy the helper DaemonSet, stop the target container
+#   action == "delete-containerd" : remove the helper DaemonSet
+- name: Patch the chaoslib image
+  template:
+    src: /chaoslib/litmus/container_kill/containerd_chaos/containerd.j2
+    dest: /chaoslib/litmus/container_kill/containerd_chaos/containerd-chaos-ds.yml
+  vars:
+    containerd_image: "{{ lib_image }}"
+
+- block:
+
+    - name: Setup containerd chaos infrastructure.
+      shell: >
+        kubectl apply -f /chaoslib/litmus/container_kill/containerd_chaos/containerd-chaos-ds.yml
+        -n {{ namespace }}
+      args:
+        executable: /bin/bash
+      register: result
+
+    - name: Confirm that the containerd-chaos ds is running on all nodes.
+      shell: >
+        kubectl get pod -l app=crictl
+        --no-headers -o custom-columns=:status.phase
+        -n {{ namespace }} | sort | uniq
+      args:
+        executable: /bin/bash
+      register: result
+      until: "result.stdout == 'Running'"
+      delay: 3
+      retries: 60
+      ignore_errors: true
+
+    - block:
+        - name: Select the app pod
+          shell: >
+            kubectl get pod -l {{ label }} -n {{ namespace }}
+            -o=custom-columns=NAME:".metadata.name" --no-headers
+            | shuf | head -1
+          args:
+            executable: /bin/bash
+          register: pod_name
+
+        - name: Record application pod name
+          set_fact:
+            app_pod: "{{ pod_name.stdout }}"
+      when: app_pod is undefined
+
+    - name: Identify the node where application is running
+      shell: >
+        kubectl get pod {{ app_pod }} -n {{ namespace }}
+        --no-headers -o custom-columns=:spec.nodeName
+      args:
+        executable: /bin/bash
+      register: result
+      failed_when: result is failed
+
+    - name: Record the application node name
+      set_fact:
+        app_node: "{{ result.stdout }}"
+
+    - name: Record the containerd-chaos pod on app node
+      shell: >
+        kubectl get pod -l app=crictl -o wide
+        -n {{ namespace }} | grep {{ app_node }}
+        | awk '{print $1}'
+      args:
+        executable: /bin/bash
+      register: chaos_pod
+      failed_when: chaos_pod is failed
+
+    - block:
+
+        - name: Record the application container
+          shell: >
+            kubectl get pods -l {{ label }} -n {{ namespace }} -o jsonpath='{.items[0].spec.containers[0].name}'
+          args:
+            executable: /bin/bash
+          register: container
+
+        - name: Record the app_container
+          set_fact:
+            app_container: "{{ container.stdout }}"
+
+      when: app_container is undefined
+
+    - name: Obtain the pod ID through Pod name
+      shell: >
+        kubectl exec {{ chaos_pod.stdout}} -n {{ namespace }} --
+        crictl pods | grep "{{ app_pod }}" | awk '{print $1}'
+      args:
+        executable: /bin/bash
+      register: pod_id
+      failed_when: pod_id is failed
+
+    - name: Obtain the container ID using pod name and container name
+      shell: >
+        kubectl exec {{ chaos_pod.stdout}} -n {{ namespace }} --
+        crictl ps | grep {{ pod_id.stdout }} | grep {{ app_container }} | awk '{print $1}'
+      args:
+        executable: /bin/bash
+      register: container_id
+      failed_when: container_id is failed
+
+    - name: Kill the container
+      shell: >
+        kubectl exec {{ chaos_pod.stdout}} -n {{ namespace }} --
+        crictl stop "{{ container_id.stdout }}"
+      args:
+        executable: /bin/bash
+      register: result
+      failed_when: result is failed
+
+    # Repeat the lookup until crictl ps lists a container for this pod again.
+    - name: Obtain the container ID using pod name and container name
+      shell: >
+        kubectl exec {{ chaos_pod.stdout}} -n {{ namespace }} --
+        crictl ps | grep {{ pod_id.stdout }} | grep {{ app_container }} | awk '{print $1}'
+      args:
+        executable: /bin/bash
+      register: new_container_id
+      until: "new_container_id.stdout != ''"
+      delay: 5
+      retries: 20
+
+    - name: Check if the new container is running.
+      shell: >
+        kubectl exec {{ chaos_pod.stdout}} -n {{ namespace }} --
+        crictl ps | grep {{ new_container_id.stdout }}
+      args:
+        executable: /bin/bash
+      register: status
+      until: "'Running' in status.stdout"
+      delay: 3
+      retries: 30
+
+  when: action == "killapp"
+
+- block:
+
+    - name: Delete the crictl-chaos daemonset
+      shell: >
+        kubectl delete -f /chaoslib/litmus/container_kill/containerd_chaos/containerd-chaos-ds.yml
+        -n {{ namespace }}
+      args:
+        executable: /bin/bash
+      register: result
+
+    - name: Confirm that the containerd-chaos pod is deleted successfully
+      shell: >
+        kubectl get pod -l app=crictl
+        --no-headers -n {{ namespace }}
+      args:
+        executable: /bin/bash
+      register: result
+      until: "result.stdout == ''"
+      delay: 3
+      retries: 50
+
+  when: action == "delete-containerd"
diff --git a/chaoslib/litmus/disk_fill/disk_fill_by_litmus.yml b/chaoslib/litmus/disk_fill/disk_fill_by_litmus.yml index 200f3adab3f..ac8121cac19 100644 --- a/chaoslib/litmus/disk_fill/disk_fill_by_litmus.yml +++ b/chaoslib/litmus/disk_fill/disk_fill_by_litmus.yml @@ -35,11 +35,11 @@ - name: Fetch the value of Request ephemeral-storage Unit in KB set_fact: - req_storage_KB: "{{ req_value_storage.stdout }}" + req_storage_KB: "{{ 
lookup('vars', req_value_storage.stdout) }}" - name: Fetch the value of Limit ephemeral-storage Unit in KB set_fact: - limit_storage_KB: "{{ limit_value_storage.stdout }}" + limit_storage_KB: "{{ lookup('vars', limit_value_storage.stdout) }}" - include_tasks: /chaoslib/litmus/disk_fill/convert_fill_percentage.yml @@ -105,11 +105,6 @@ - include_tasks: /chaoslib/litmus/disk_fill/file_creation.yml -- include_tasks: /utils/common/status_app_pod.yml - vars: - delay: 1 - retries: 60 - - name: Waiting for Chaos Duration wait_for: timeout: "{{ c_duration }}" @@ -121,10 +116,22 @@ executable: /bin/bash register: pod_status +- name: Check if the pod is Evicted + shell: > + kubectl get pod {{ pod_name.stdout }} -n {{ a_ns }} -o jsonpath='{.status.reason}' + args: + executable: /bin/bash + register: eviction_status + +- name: Delete Pod if evicted + shell: > + kubectl delete pod {{ pod_name.stdout }} -n {{ a_ns }} + when: eviction_status.stdout == "Evicted" + - name: If Pod is not evicted / running shell: > kubectl exec -it {{ disk_fill_pod.stdout }} -n {{ a_ns }} -- sh -c "rm -rf /diskfill/{{ containerID.stdout }}/diskfill" - when: pod_status.stdout != "Evicted" + when: pod_status.stdout == "Running" - name: Delete DaemonSet disk-fill shell: > diff --git a/chaoslib/litmus/kill_random_pod.yml b/chaoslib/litmus/kill_random_pod.yml index 56cf73977e2..2c561a3bf0a 100644 --- a/chaoslib/litmus/kill_random_pod.yml +++ b/chaoslib/litmus/kill_random_pod.yml @@ -1,21 +1,31 @@ -- name: Get a list of all pods from given namespace - k8s_facts: - kind: Pod - namespace: "{{ a_ns }}" - label_selectors: - - "{{a_label}}" - register: pod_list - -- name: Select a random pod to kill - set_fact: - a_pod_to_kill: "{{ pod_list.resources | random | json_query('metadata.name') }}" +- block: + - name: Get a list of all pods from given namespace + k8s_facts: + kind: Pod + namespace: "{{ app_ns }}" + label_selectors: + - "{{ app_label }}" + register: pod_list + + - name: Select a random pod to kill + 
set_fact: + app_pod: "{{ pod_list.resources | random | json_query('metadata.name') }}" + + when: app_pod_name is undefined or app_pod_name == '' + +- block: + - name: Record app pod + set_fact: + app_pod: "{{ app_pod_name }}" + + when: app_pod_name is defined and app_pod_name != '' - debug: - msg: "Killing pod {{a_pod_to_kill}}" + msg: "Killing pod {{ app_pod }}" - name: Force Kill application pod shell: | - kubectl delete pod -n {{ a_ns }} --force --grace-period=0 --wait=false {{a_pod_to_kill}} + kubectl delete pod -n {{ app_ns }} --force --grace-period=0 --wait=false {{ app_pod }} args: executable: /bin/bash register: result @@ -23,7 +33,7 @@ - name: Kill application pod shell: | - kubectl delete pod -n {{ a_ns }} --grace-period=0 --wait=false {{a_pod_to_kill}} + kubectl delete pod -n {{ app_ns }} --grace-period=0 --wait=false {{ app_pod }} args: executable: /bin/bash register: result @@ -31,4 +41,4 @@ - name: Wait for the interval timer pause: - seconds: "{{c_interval}}" \ No newline at end of file + seconds: "{{ c_interval }}" diff --git a/chaoslib/litmus/platform/aws/disk_loss.yml b/chaoslib/litmus/platform/aws/disk_loss.yml new file mode 100644 index 00000000000..de0b30a50b2 --- /dev/null +++ b/chaoslib/litmus/platform/aws/disk_loss.yml @@ -0,0 +1,21 @@ +- name: Detaching the disk + ec2_vol: + id: "{{ disk_name }}" + instance: None + region: "{{ zone_name }}" + +- name: chaos injection for {{ c_duration }}s + wait_for: + timeout: "{{ c_duration }}" + +- name: Verify that the disk is connected to node (post) + include_tasks: "/utils/cloud/aws/status_disk.yml" + +- block: + - name: If disk is not attached, it will attach manually + ec2_vol: + instance: "{{ node_name }}" + id: "{{ disk_name }}" + device_name: "{{ device_name }}" + region: "{{ zone_name }}" + when: "inuse == false" diff --git a/chaoslib/litmus/platform/gke/disk_loss.yml b/chaoslib/litmus/platform/gke/disk_loss.yml index 3eef6e8a6c4..c97ad31a34b 100644 --- 
a/chaoslib/litmus/platform/gke/disk_loss.yml +++ b/chaoslib/litmus/platform/gke/disk_loss.yml @@ -1,5 +1,5 @@ - name: Detaching the disk - shell: gcloud compute instances detach-disk {{ node_name }} --device-name {{ disk_name }} --zone {{ zone_name }} + shell: gcloud compute instances detach-disk {{ node_name }} --disk {{ disk_name }} --zone {{ zone_name }} - name: chaos injection for {{ c_duration }}s wait_for: diff --git a/chaoslib/litmus/pod_failure_by_litmus.yml b/chaoslib/litmus/pod_failure_by_litmus.yml index 95715966d3a..17e6296d597 100644 --- a/chaoslib/litmus/pod_failure_by_litmus.yml +++ b/chaoslib/litmus/pod_failure_by_litmus.yml @@ -1,13 +1,19 @@ - name: Derive chaos iterations set_fact: - chaos_iterations: "{{ (c_duration|int / c_interval|int)|int }}" + c_iterations: "{{ (c_duration|int / c_interval|int)|int }}" + when: c_iterations is undefined + +- name: Derive chaos interval + set_fact: + c_interval: "{{ (c_duration|int / c_iterations|int)|int }}" + when: c_interval is undefined - name: Set min chaos count to 1 if interval > duration set_fact: - chaos_iterations: 1 - when: "chaos_iterations == '0'" + c_iterations: 1 + when: "c_iterations == '0'" - name: Kill random pod include: kill_random_pod.yml - with_sequence: start=1 end={{ chaos_iterations }} + with_sequence: start=1 end={{ c_iterations }} \ No newline at end of file diff --git a/chaoslib/pumba/network_chaos/induce_latency.yml b/chaoslib/pumba/network_chaos/induce_latency.yml index 8a86a3f2dfd..5498c837eb0 100644 --- a/chaoslib/pumba/network_chaos/induce_latency.yml +++ b/chaoslib/pumba/network_chaos/induce_latency.yml @@ -2,6 +2,6 @@ shell: > kubectl exec {{ pumba_pod.stdout }} -n {{ a_ns }} -- pumba netem --interface {{ n_interface }} --duration {{ c_duration }}ms delay - --time {{ n_latency }} re2:k8s_{{ c_container }}_{{ app_pod.stdout }} + --time {{ n_latency }} re2:k8s_{{ c_container }}_{{ app_pod }} args: executable: /bin/bash diff --git 
a/chaoslib/pumba/network_chaos/induce_packet_loss.yml b/chaoslib/pumba/network_chaos/induce_packet_loss.yml index 5fa8c979c26..5367ace8ec3 100644 --- a/chaoslib/pumba/network_chaos/induce_packet_loss.yml +++ b/chaoslib/pumba/network_chaos/induce_packet_loss.yml @@ -2,6 +2,6 @@ shell: > kubectl exec {{ pumba_pod.stdout }} -n {{ a_ns }} -- pumba netem --interface {{ n_interface }} --duration {{ c_duration }}ms - loss --percent {{ n_packet_loss }} re2:k8s_{{ c_container }}_{{ app_pod.stdout }} + loss --percent {{ n_packet_loss }} re2:k8s_{{ c_container }}_{{ app_pod }} args: executable: /bin/bash diff --git a/chaoslib/pumba/network_chaos/network_chaos.yml b/chaoslib/pumba/network_chaos/network_chaos.yml index f4f4831322f..f8762025435 100644 --- a/chaoslib/pumba/network_chaos/network_chaos.yml +++ b/chaoslib/pumba/network_chaos/network_chaos.yml @@ -27,18 +27,28 @@ retries: 60 ignore_errors: true - - name: Select the app pod - shell: > - kubectl get pod -l {{ a_label }} -n {{ a_ns }} - -o=custom-columns=:metadata.name --no-headers - | shuf | head -1 - args: - executable: /bin/bash - register: app_pod + - block: + - name: Select the app pod + shell: > + kubectl get pod -l {{ a_label }} -n {{ a_ns }} + -o=custom-columns=:metadata.name --no-headers + | shuf | head -1 + args: + executable: /bin/bash + register: app_pod_name + + - name: Record app pod name + set_fact: + app_pod: "{{ app_pod_name.stdout }}" + when: "app_pod is undefined" + + # here app_ns is the namespace of pod on which we are performing network loss/delay + # in genric experiments app_ns is same as a_ns + # in openebs experiments app_ns is the namespace where openebs is installed i.e, openebs - name: Identify the application node shell: > - kubectl get pod {{ app_pod.stdout }} -n {{ a_ns }} + kubectl get pod {{ app_pod }} -n {{ app_ns }} --no-headers -o custom-columns=:spec.nodeName args: executable: /bin/bash @@ -60,7 +70,7 @@ - name: Tear down pumba infrastructure shell: > - kubectl delete -f 
/chaoslib/pumba/pumba_kube.yaml -n {{ a_ns }} + kubectl delete -f /chaoslib/pumba/pumba_kube.yml -n {{ a_ns }} args: executable: /bin/bash @@ -94,6 +104,4 @@ delay: 20 retries: 15 - when: "pumb_deploy_result.rc == 0" - - + when: "pumb_deploy_result.rc == 0" \ No newline at end of file diff --git a/executor/README.md b/executor/README.md index 7ddf841d206..7054c1741d9 100644 --- a/executor/README.md +++ b/executor/README.md @@ -132,3 +132,7 @@ - The name of file which contains data for configmap in experimentCR should be parameters.yml - The configmap is mounted in this default directory: /mnt/ + +- Executor is currently unable to parse more than one secret. + +- The secret is mounted in this default directory: /tmp/ diff --git a/executor/executor.yml b/executor/executor.yml index 924cbf8f1d3..4d5207840cc 100644 --- a/executor/executor.yml +++ b/executor/executor.yml @@ -65,13 +65,25 @@ executable: /bin/bash register: c_job_args -- name: Fetching data for the configmap +- name: Check Availability of configmaps shell: > kubectl get chaosexperiment -n {{ c_app_ns }} -o jsonpath='{.items[?(@.metadata.name=="{{ c_experiment_name }}")].spec.definition.configmaps[0].data.parameters\.yml}' args: executable: /bin/bash register: configMap_available +- name: Check Availability of secrets + shell: > + kubectl get chaosexperiment -n {{ c_app_ns }} -o jsonpath='{.items[?(@.metadata.name=="{{ c_experiment_name }}")].spec.definition.secrets[0].name}' + args: + executable: /bin/bash + register: secret_available + +- name: Record availability of configmaps and secrets + set_fact: + configMap_available: "{{ configMap_available.stdout }}" + secret_available: "{{ secret_available.stdout }}" + - include: experiment_env_getter.yml with_sequence: start=0 count="{{c_env_length.stdout | int}}" @@ -138,10 +150,10 @@ --labels={{c_job_labels.stdout}} {{c_env_list}} --command -- /bin/bash {{c_job_args.stdout}} args: executable: /bin/bash - when: configMap_available.stdout == '' + when: 
configMap_available == '' and secret_available == '' - include: experiment_configmap.yml - when: configMap_available.stdout != '' + when: configMap_available != '' or secret_available != '' - name: Monitoring the litmus chaos job for completion shell: > diff --git a/executor/experiment_configmap.yml b/executor/experiment_configmap.yml index e972bce203b..c1315e6d374 100644 --- a/executor/experiment_configmap.yml +++ b/executor/experiment_configmap.yml @@ -1,23 +1,44 @@ -- name: Fetching data for the configmap - shell: > - kubectl get chaosexperiment -n {{ c_app_ns }} -o jsonpath='{.items[?(@.metadata.name=="{{ c_experiment_name }}")].spec.definition.configmaps[0].data.parameters\.yml}' > parameters.yml - args: - executable: /bin/bash +- block: + - name: Fetching data for the configmap + shell: > + kubectl get chaosexperiment -n {{ c_app_ns }} -o jsonpath='{.items[?(@.metadata.name=="{{ c_experiment_name }}")].spec.definition.configmaps[0].data.parameters\.yml}' > parameters.yml + args: + executable: /bin/bash -- name: Fetching name of configmap - shell: > - kubectl get chaosexperiment -n {{ c_app_ns }} -o jsonpath='{.items[?(@.metadata.name=="{{ c_experiment_name }}")].spec.definition.configmaps[0].name}' - args: - executable: /bin/bash - register: c_map_name + - name: Fetching name of configmap + shell: > + kubectl get chaosexperiment -n {{ c_app_ns }} -o jsonpath='{.items[?(@.metadata.name=="{{ c_experiment_name }}")].spec.definition.configmaps[0].name}' + args: + executable: /bin/bash + register: c_map_name -- name: Creating configmap - shell: - kubectl create configmap {{c_map_name.stdout}} --from-file=parameters.yml -n {{c_app_ns}} - args: - executable: /bin/bash + - name: Creating configmap + shell: + kubectl create configmap {{c_map_name.stdout}} --from-file=parameters.yml -n {{c_app_ns}} + args: + executable: /bin/bash + + when: configMap_available != '' + +- block: + + - name: Fetching name of secret + shell: > + kubectl get chaosexperiment -n {{ c_app_ns 
}} -o jsonpath='{.items[?(@.metadata.name=="{{ c_experiment_name }}")].spec.definition.secrets[0].name}' + args: + executable: /bin/bash + register: c_secret_name + + - name: Fetching mount path for secret + shell: > + kubectl get chaosexperiment -n {{ c_app_ns }} -o jsonpath='{.items[?(@.metadata.name=="{{ c_experiment_name }}")].spec.definition.secrets[0].mountPath}' + args: + executable: /bin/bash + register: c_mount_path + + when: secret_available != '' -- name: Run the chaos experiment job +- name: Get the job yaml shell: kubectl run {{ c_experiment_name }}-{{random_string.stdout}} --restart=OnFailure --image={{c_image.stdout}} --namespace={{c_app_ns}} --serviceaccount={{ c_svc_acc }} --image-pull-policy=Always @@ -25,32 +46,94 @@ args: executable: /bin/bash -- name: Include the volumeMounts in jobYml - lineinfile: - dest: cjob.yml - insertafter: "resources: {}" - state: present - line: ' {{item}}' - with_items: - - " mountPath: /mnt/" - - "- name: parameters" - - "volumeMounts:" - -- name: Include the volumes in jobYml - lineinfile: - dest: cjob.yml - insertafter: "serviceAccountName" - state: present - line: ' {{item}}' - with_items: - - " name: {{c_map_name.stdout}}" - - " configMap:" - - "- name: parameters" - - "volumes:" +- block: + + - name: Include the volumeMounts in jobYml - configmap only + lineinfile: + dest: cjob.yml + insertafter: "resources: {}" + state: present + line: ' {{item}}' + with_items: + - " mountPath: /mnt/" + - "- name: parameters" + - "volumeMounts:" + + - name: Include the volumes in jobYml - configmap only + lineinfile: + dest: cjob.yml + insertafter: "serviceAccountName" + state: present + line: ' {{item}}' + with_items: + - " name: {{c_map_name.stdout}}" + - " configMap:" + - "- name: parameters" + - "volumes:" + + when: configMap_available != '' and secret_available == '' + +- block: + + - name: Include the volumeMounts in jobYml - secret only + lineinfile: + dest: cjob.yml + insertafter: "resources: {}" + state: present + 
line: ' {{item}}' + with_items: + - " mountPath: {{ c_mount_path.stdout }}" + - "- name: cloud-config" + - "volumeMounts:" + + - name: Include the volumes in jobYml + lineinfile: + dest: cjob.yml + insertafter: "serviceAccountName" + state: present + line: ' {{item}}' + with_items: + - " secretName: {{ c_secret_name.stdout }}" + - " secret:" + - "- name: cloud-config" + - "volumes:" + + when: configMap_available == '' and secret_available != '' + +- block: + + - name: Include the volumeMounts in jobYml - secret and configmap both + lineinfile: + dest: cjob.yml + insertafter: "resources: {}" + state: present + line: ' {{item}}' + with_items: + - " mountPath: {{ c_mount_path.stdout }}" + - "- name: cloud-config" + - " mountPath: /mnt/" + - "- name: parameters" + - "volumeMounts:" + + - name: Include the volumes in jobYml + lineinfile: + dest: cjob.yml + insertafter: "serviceAccountName" + state: present + line: ' {{item}}' + with_items: + - " secretName: {{ c_secret_name.stdout }}" + - " secret:" + - "- name: cloud-config" + - " name: {{c_map_name.stdout}}" + - " configMap:" + - "- name: parameters" + - "volumes:" + + when: configMap_available != '' and secret_available != '' - name: create job shell: kubectl create -f cjob.yml -n {{c_app_ns}} args: executable: /bin/bash - diff --git a/experiments/generic/container_kill/container_kill_k8s_job.yml b/experiments/generic/container_kill/container_kill_k8s_job.yml index 5507fa32bb7..3d22aeede01 100644 --- a/experiments/generic/container_kill/container_kill_k8s_job.yml +++ b/experiments/generic/container_kill/container_kill_k8s_job.yml @@ -35,18 +35,6 @@ spec: - name: TARGET_CONTAINER value: '' - # provide application namespace - - name: APP_NAMESPACE - value: '' - - # provide application labels - - name: APP_LABEL - value: ''>>>>>>> master - - # provide target container - - name: TARGET_CONTAINER - value: '' - # provide chaosengine name - name: CHAOSENGINE value: '' diff --git a/experiments/generic/disk_loss/README.md 
b/experiments/generic/disk_loss/README.md index 9f1ef357d0f..16f85e2961b 100644 --- a/experiments/generic/disk_loss/README.md +++ b/experiments/generic/disk_loss/README.md @@ -76,35 +76,42 @@ Cloud Platform name Mandatory + CLOUD_NAMESPACE This is a chaos namespace which will create all infra chaos resources in that namespace Mandatory - + PROJECT_ID GCP project ID Mandatory - + NODE_NAME Node name of the cluster Mandatory + DISK_NAME Disk Name of the node, it must be an external disk. Mandatory + + DEVICE_NAME + Enter the device name which you wanted to mount only for AWS. + Mandatory - + ZONE_NAME Zone Name of the node Mandatory - + CHAOSENGINE ChaosEngine CR name associated with the experiment instance Mandatory + CHAOS_SERVICE_ACCOUNT Service account used by the litmus Mandatory diff --git a/experiments/generic/disk_loss/chaosutil.j2 b/experiments/generic/disk_loss/chaosutil.j2 index 0fb487d966f..ce1781bf7d3 100644 --- a/experiments/generic/disk_loss/chaosutil.j2 +++ b/experiments/generic/disk_loss/chaosutil.j2 @@ -1,6 +1,8 @@ # All code here is not indented because j2 is space sensitive # checks if cloud_platform is set or not -{% if cloud_platform is defined and cloud_platform == 'GCP' or cloud_platform == 'AWS' %} +{% if cloud_platform is defined and cloud_platform == 'GCP' %} c_util: /chaoslib/litmus/platform/gke/disk_loss.yml +{% elif cloud_platform is defined and cloud_platform == 'AWS' %} +c_util: /chaoslib/litmus/platform/aws/disk_loss.yml {% endif %} diff --git a/experiments/generic/disk_loss/disk_loss_ansible_logic.yml b/experiments/generic/disk_loss/disk_loss_ansible_logic.yml index deec1e164cb..dc2d6e1b2f7 100644 --- a/experiments/generic/disk_loss/disk_loss_ansible_logic.yml +++ b/experiments/generic/disk_loss/disk_loss_ansible_logic.yml @@ -10,6 +10,7 @@ cloud_platform: "{{ lookup('env','CLOUD_PLATFORM') }}" c_ns: "{{ lookup('env','CHAOS_NAMESPACE') }}" disk_name: "{{ lookup('env','DISK_NAME') }}" + device_name: "{{ lookup('env', 'DEVICE_NAME') 
}}" node_name: "{{ lookup('env','NODE_NAME') }}" project_id: "{{ lookup('env','PROJECT_ID') }}" zone_name: "{{ lookup('env','ZONE_NAME') }}" @@ -47,16 +48,30 @@ - name: Gcloud authentication include_tasks: "/utils/cloud/gcp/gcloud_configure.yml" when: "cloud_platform == 'GCP'" + + # AWS authentication + - name: AWS authentication + include_tasks: "/utils/cloud/aws/aws_configure.yml" + when: "cloud_platform == 'AWS'" ## PRE-CHAOS DISK LIVENESS CHECK - name: Verify that the disk is connected to node (pre) include_tasks: "/utils/cloud/gcp/status_disk.yml" + when: "cloud_platform == 'GCP'" + + - name: Verify that the disk is connected to node (pre) + include_tasks: "/utils/cloud/aws/status_disk.yml" + when: "cloud_platform == 'AWS'" # Checking disk is attached to node - debug: - msg: echo "disk attached" + msg: "specified disk is attached to node" when: "inuse == true" + - fail: + msg: "specified disk not attached to node" + when: "inuse == false" + ## INJECTING CHAOS - name: Injecting the chaos include_tasks: "{{ c_util }}" @@ -72,12 +87,21 @@ ## POST-CHAOS DISK LIVENESS CHECK - name: Verify that the disk is connected to node (post) include_tasks: "/utils/cloud/gcp/status_disk.yml" - + when: "cloud_platform == 'GCP'" + + - name: Verify that the disk is connected to node (post) + include_tasks: "/utils/cloud/aws/status_disk.yml" + when: "cloud_platform == 'AWS'" + # Checking disk is attached to node - debug: - msg: echo "disk attached" + msg: "specified disk is attached to node" when: "inuse == true" + - fail: + msg: "specified disk not re-attached to node" + when: "inuse == false" + - set_fact: flag: "Pass" @@ -92,4 +116,4 @@ - include_tasks: "/utils/runtime/update_chaos_result_resource.yml" vars: status: 'EOT' - namespace: "{{ c_ns }}" \ No newline at end of file + namespace: "{{ c_ns }}" diff --git a/experiments/generic/disk_loss/disk_loss_k8s_job.yml b/experiments/generic/disk_loss/disk_loss_k8s_job.yml index 68d544ba5a2..8a5b25993c0 100644 --- 
a/experiments/generic/disk_loss/disk_loss_k8s_job.yml +++ b/experiments/generic/disk_loss/disk_loss_k8s_job.yml @@ -57,6 +57,10 @@ spec: # Enter the disk name - name: DISK_NAME value: '' + + # Enter the device name + - name: DEVICE_NAME + value: '' # Enter the zone name - name: ZONE_NAME @@ -70,7 +74,7 @@ spec: args: ['-c', 'ansible-playbook ./experiments/generic/disk_loss/disk_loss_ansible_logic.yml -i /etc/ansible/hosts -vv; exit 0'] volumeMounts: - name: parameters - mountPath: /mnt/ + mountPath: /tmp/ volumes: - name: parameters # Enter the secret name of the service account, you want to mount diff --git a/experiments/generic/disk_loss/disk_status_check.j2 b/experiments/generic/disk_loss/disk_status_check.j2 index 1f197caa8fd..8a5e2044252 100644 --- a/experiments/generic/disk_loss/disk_status_check.j2 +++ b/experiments/generic/disk_loss/disk_status_check.j2 @@ -1,6 +1,8 @@ # All code here is not indented because j2 is space sensitive # Initially, it "inuse" set to false {% set disk = namespace(inuse=false) %} +# For GCP +{% if cloud_platform is defined and cloud_platform == 'GCP' %} {% set expect_user = 'https://www.googleapis.com/compute/v1/projects/' + project_id + '/zones/' + zone_name + '/instances/' + node_name %} # loop through all the disk users and checks if current_user is equal to expect_user {% for current_user in disk_users.stdout_lines %} @@ -9,8 +11,23 @@ {% set disk.inuse = true %} {% endif %} {% endfor %} + +# For AWS +{% elif cloud_platform is defined and cloud_platform == 'AWS' %} +{% set expect_user = node_name %} +# loop through all the disk users and checks if current_user is equal to expect_user +{% for current_user in disk_users.volumes %} +{% if current_user.attachment_set.instance_id == expect_user and current_user.attachment_set.status == "attached" %} +# If the condition is true, then set "inuse" to true +{% set disk.inuse = true %} +{% endif %} +{% endfor %} +{% endif %} + +# This will append inuse: true/false {% if disk.inuse == 
true %} inuse: true {% else %} inuse: false -{% endif %} \ No newline at end of file +{% endif %} + diff --git a/experiments/generic/pod_delete/pod_delete_ansible_logic.yml b/experiments/generic/pod_delete/pod_delete_ansible_logic.yml index 0ac273739bc..f320bd2eaf0 100644 --- a/experiments/generic/pod_delete/pod_delete_ansible_logic.yml +++ b/experiments/generic/pod_delete/pod_delete_ansible_logic.yml @@ -48,7 +48,8 @@ - include_tasks: "{{ c_util }}" vars: - c_svc_acc: "{{ lookup('env','CHAOS_SERVICE_ACCOUNT') }}" + app_ns: "{{ a_ns }}" + app_label: "{{ a_label }}" ## POST-CHAOS APPLICATION LIVENESS CHECK diff --git a/experiments/generic/pod_network_latency/pod_network_latency_ansible_logic.yml b/experiments/generic/pod_network_latency/pod_network_latency_ansible_logic.yml index 07736e4b8ba..dd92f88ac55 100644 --- a/experiments/generic/pod_network_latency/pod_network_latency_ansible_logic.yml +++ b/experiments/generic/pod_network_latency/pod_network_latency_ansible_logic.yml @@ -45,6 +45,8 @@ ## FAULT INJECTION - include_tasks: "{{ c_util }}" + vars: + app_ns: "{{ a_ns }}" ## POST-CHAOS APPLICATION LIVENESS CHECK - name: Verify AUT liveness post fault-injection diff --git a/experiments/generic/pod_network_latency/pod_network_latency_k8s_job.yml b/experiments/generic/pod_network_latency/pod_network_latency_k8s_job.yml index 7bb009965d1..3ae8866bca9 100644 --- a/experiments/generic/pod_network_latency/pod_network_latency_k8s_job.yml +++ b/experiments/generic/pod_network_latency/pod_network_latency_k8s_job.yml @@ -33,14 +33,6 @@ spec: - name: APP_KIND value: '' - # provide application labels - - name: APP_LABEL - value: '' - - # provide application kind - - name: APP_KIND - value: '' - - name: TARGET_CONTAINER value: '' diff --git a/experiments/generic/pod_network_loss/pod_network_loss_ansible_logic.yml b/experiments/generic/pod_network_loss/pod_network_loss_ansible_logic.yml index 8b7e1934bb9..85d3947a7b1 100644 --- 
a/experiments/generic/pod_network_loss/pod_network_loss_ansible_logic.yml +++ b/experiments/generic/pod_network_loss/pod_network_loss_ansible_logic.yml @@ -45,6 +45,8 @@ ## FAULT INJECTION - include_tasks: "{{ c_util }}" + vars: + app_ns: "{{ a_ns }}" ## POST-CHAOS APPLICATION LIVENESS CHECK - name: Verify AUT liveness post fault-injection diff --git a/experiments/generic/pod_network_loss/pod_network_loss_k8s_job.yml b/experiments/generic/pod_network_loss/pod_network_loss_k8s_job.yml index f41903ad9ee..9930760b63d 100644 --- a/experiments/generic/pod_network_loss/pod_network_loss_k8s_job.yml +++ b/experiments/generic/pod_network_loss/pod_network_loss_k8s_job.yml @@ -33,18 +33,6 @@ spec: - name: APP_KIND value: '' - # provide application namespace - - name: APP_NAMESPACE - value: '' - - # provide application labels - - name: APP_LABEL - value: '' - - # provide application kind - - name: APP_KIND - value: '' - # provide target conatiner - name: TARGET_CONTAINER value: '' diff --git a/experiments/kafka/kafka-broker-disk-failure/chaosutil.j2 b/experiments/kafka/kafka-broker-disk-failure/chaosutil.j2 new file mode 100644 index 00000000000..3cf087ab8de --- /dev/null +++ b/experiments/kafka/kafka-broker-disk-failure/chaosutil.j2 @@ -0,0 +1,6 @@ +# checks if cloud_platform is set or not +{% if cloud_platform is defined and cloud_platform == 'GCP' %} + c_util: /chaoslib/litmus/platform/gke/disk_loss.yml +{% elif cloud_platform is defined and cloud_platform == 'AWS' %} + c_util: /chaoslib/litmus/platform/aws/disk_loss.yml +{% endif %} diff --git a/experiments/kafka/kafka-broker-disk-failure/disk_status_check.j2 b/experiments/kafka/kafka-broker-disk-failure/disk_status_check.j2 new file mode 100644 index 00000000000..8a5e2044252 --- /dev/null +++ b/experiments/kafka/kafka-broker-disk-failure/disk_status_check.j2 @@ -0,0 +1,33 @@ +# All code here is not indented because j2 is space sensitive +# Initially, it "inuse" set to false +{% set disk = namespace(inuse=false) %} +# For 
GCP +{% if cloud_platform is defined and cloud_platform == 'GCP' %} +{% set expect_user = 'https://www.googleapis.com/compute/v1/projects/' + project_id + '/zones/' + zone_name + '/instances/' + node_name %} +# loop through all the disk users and checks if current_user is equal to expect_user +{% for current_user in disk_users.stdout_lines %} +{% if current_user == expect_user %} +# If the condition is true, then set "inuse" to true +{% set disk.inuse = true %} +{% endif %} +{% endfor %} + +# For AWS +{% elif cloud_platform is defined and cloud_platform == 'AWS' %} +{% set expect_user = node_name %} +# loop through all the disk users and checks if current_user is equal to expect_user +{% for current_user in disk_users.volumes %} +{% if current_user.attachment_set.instance_id == expect_user and current_user.attachment_set.status == "attached" %} +# If the condition is true, then set "inuse" to true +{% set disk.inuse = true %} +{% endif %} +{% endfor %} +{% endif %} + +# This will append inuse: true/false +{% if disk.inuse == true %} +inuse: true +{% else %} +inuse: false +{% endif %} + diff --git a/experiments/kafka/kafka-broker-disk-failure/kafka-broker-disk-failure-ansible-logic.yml b/experiments/kafka/kafka-broker-disk-failure/kafka-broker-disk-failure-ansible-logic.yml new file mode 100644 index 00000000000..b5060a3c0f0 --- /dev/null +++ b/experiments/kafka/kafka-broker-disk-failure/kafka-broker-disk-failure-ansible-logic.yml @@ -0,0 +1,157 @@ +--- +- hosts: localhost + connection: local + + vars: + c_experiment: "kafka-broker-disk-failure" + c_duration: "{{ lookup('env','TOTAL_CHAOS_DURATION') }}" + cloud_platform: "{{ lookup('env','CLOUD_PLATFORM') }}" + disk_name: "{{ lookup('env','DISK_NAME') }}" + project_id: "{{ lookup('env','PROJECT_ID') }}" + zone_name: "{{ lookup('env','ZONE_NAME') }}" + kafka_ns: "{{ lookup('env','KAFKA_NAMESPACE') }}" + kafka_label: "{{ lookup('env','KAFKA_LABEL') }}" + kafka_kind: "{{ lookup('env','KAFKA_KIND') }}" + kafka_broker: 
"{{ lookup('env','KAFKA_BROKER') }}" + kafka_stream: "{{ lookup('env','KAFKA_LIVENESS_STREAM') }}" + kafka_service: "{{ lookup('env','KAFKA_SERVICE') }}" + kafka_port: "{{ lookup('env','KAFKA_PORT') }}" + kafka_replication_factor: "{{ lookup('env','KAFKA_REPLICATION_FACTOR') }}" + zk_ns: "{{ lookup('env','ZOOKEEPER_NAMESPACE') }}" + zk_label: "{{ lookup('env','ZOOKEEPER_LABEL') }}" + zk_service: "{{ lookup('env','ZOOKEEPER_SERVICE') }}" + zk_port: "{{ lookup('env','ZOOKEEPER_PORT') }}" + + tasks: + - block: + + - include: kafka-broker-disk-failure-ansible-prerequisites.yml + + - include_vars: + file: chaosutil.yml + + ## GENERATE EXP RESULT NAME + - block: + + - name: Construct chaos result name (experiment_name) + set_fact: + c_experiment: "{{ lookup('env','CHAOSENGINE') }}-{{ c_experiment }}" + + when: lookup('env','CHAOSENGINE') + + ## RECORD START-OF-EXPERIMENT IN LITMUSCHAOS RESULT CR + - include_tasks: /utils/runtime/update_chaos_result_resource.yml + vars: + status: 'SOT' + namespace: "{{ kafka_ns }}" + + - name: Verify mandatory Kafka broker and disk information + debug: + msg: "kafka-broker-pod: {{ kafka_broker }}; kafka-broker-disk: {{ disk_name }}" + failed_when: (kafka_broker is not defined or not kafka_broker) or (disk_name is not defined or not disk_name) + + ## PERFORM GCLOUD PLATFORM CONFIGURATION STEPS + + - name: Perform gcloud authentication + include_tasks: "/utils/cloud/gcp/gcloud_configure.yml" + when: "cloud_platform == 'GCP'" + + ## PRE-CHAOS APPLICATION LIVENESS CHECK + + - name: Verify that the Kafka cluster is healthy + include_tasks: "/utils/apps/kafka/kafka_cluster_health.yml" + vars: + delay: 1 + retries: 60 + + - name: Derive the kafka-broker node name + shell: + kubectl get pod {{ kafka_broker }} -n {{ kafka_ns }} --no-headers -o custom-columns=:spec.nodeName + args: + executable: /bin/bash + register: node + + - set_fact: + node_name: "{{ node.stdout }}" + + - name: Verify that the specified disk is connected to node + 
include_tasks: "/utils/cloud/gcp/status_disk.yml" + when: "cloud_platform == 'GCP'" + + - debug: + msg: "specified disk is attached to node" + when: "inuse == true" + + - fail: + msg: "specified disk not attached to node" + when: "inuse == false" + + ## SETUP KAFKA CHAOS INFRA (LIVENESS CLIENT) + + - include_tasks: "/utils/apps/kafka/kafka_liveness_stream.yml" + when: kafka_stream is defined and kafka_stream != '' + + ## FAULT INJECTION + + - include_tasks: "{{ c_util }}" + + ## POST-CHAOS APPLICATION LIVENESS CHECK + + ## NOTE: This is disabled at present as the recovery post re-attach (in case of mounted disks) + ## is still manual + + #- name: Verify that the Kafka cluster is healthy + # include_tasks: "/utils/apps/kafka/kafka_cluster_health.yml" + # vars: + # delay: 1 + # retries: 60 + + ## CHECK FOR KAFKA LIVENESS & CLEANUP + + - block: + + - name: Verify that the Kafka liveness pod (pub-sub) is uninterrupted + include_tasks: "/utils/common/status_app_pod.yml" + vars: + a_ns: "{{ kafka_ns }}" + a_label: "name=kafka-liveness" + delay: 1 + retries: 60 + + - include_tasks: "/utils/apps/kafka/kafka_liveness_cleanup.yml" + + when: kafka_stream is defined and kafka_stream != '' + + ## POST-CHAOS DISK LIVENESS CHECK + + - name: Verify that the disk is connected to node (post) + include_tasks: "/utils/cloud/gcp/status_disk.yml" + when: "cloud_platform == 'GCP'" + + - debug: + msg: "specified disk is attached to node" + when: "inuse == true" + + - fail: + msg: "specified disk not re-attached to kafka-broker node" + when: "inuse == false" + + - set_fact: + flag: "pass" + + rescue: + - set_fact: + flag: "fail" + + - name: Cleanup kafka liveness pods if present + include_tasks: "/utils/apps/kafka/kafka_liveness_cleanup.yml" + ignore_errors: true + + always: + + ## RECORD END-OF-TEST IN LITMUSCHAOS RESULT CR + - include_tasks: /utils/runtime/update_chaos_result_resource.yml + vars: + status: 'EOT' + namespace: "{{ kafka_ns }}" + diff --git 
a/experiments/kafka/kafka-broker-disk-failure/kafka-broker-disk-failure-ansible-prerequisites.yml b/experiments/kafka/kafka-broker-disk-failure/kafka-broker-disk-failure-ansible-prerequisites.yml new file mode 100644 index 00000000000..1d7e14250f9 --- /dev/null +++ b/experiments/kafka/kafka-broker-disk-failure/kafka-broker-disk-failure-ansible-prerequisites.yml @@ -0,0 +1,7 @@ +- name: Identify the chaos util to be invoked + template: + src: chaosutil.j2 + dest: chaosutil.yml + + + diff --git a/experiments/kafka/kafka-broker-disk-failure/kafka-broker-disk-failure-k8s-job.yml b/experiments/kafka/kafka-broker-disk-failure/kafka-broker-disk-failure-k8s-job.yml new file mode 100644 index 00000000000..261ae947889 --- /dev/null +++ b/experiments/kafka/kafka-broker-disk-failure/kafka-broker-disk-failure-k8s-job.yml @@ -0,0 +1,93 @@ +--- +apiVersion: batch/v1 +kind: Job +metadata: + generateName: kafka-broker-disk-failure- +spec: + template: + metadata: + labels: + experiment: kafka-broker-disk-failure + spec: + # Placeholder that is updated by the executor for automated runs + # Provide appropriate SA (with desired permissions) if executed manually + serviceAccountName: %CHAOS_SERVICE_ACCOUNT% + restartPolicy: Never + containers: + - name: ansibletest + image: litmuschaos/ansible-runner:ci + imagePullPolicy: Always + env: + - name: ANSIBLE_STDOUT_CALLBACK + value: 'default' + + # provide application kind + - name: KAFKA_KIND + value: 'statefulset' + + - name: KAFKA_LIVENESS_STREAM + value: 'enabled' + + - name: TOTAL_CHAOS_DURATION + value: '30' + + - name: CLOUD_PLATFORM + value: 'GCP' + + - name: PROJECT_ID + value: '' + + - name: DISK_NAME + value: '' + + - name: ZONE_NAME + value: '' + + - name: KAFKA_NAMESPACE + value: '' + + - name: KAFKA_LABEL + value: '' + + - name: KAFKA_BROKER + value: '' + + - name: KAFKA_REPLICATION_FACTOR + value: '' + + - name: KAFKA_SERVICE + value: '' + + - name: KAFKA_PORT + value: '' + + - name: ZOOKEEPER_NAMESPACE + value: '' + + - 
name: ZOOKEEPER_LABEL + value: '' + + - name: ZOOKEEPER_SERVICE + value: '' + + - name: ZOOKEEPER_PORT + value: '' + + - name: CHAOSENGINE + value: '' + + - name: CHAOS_SERVICE_ACCOUNT + valueFrom: + fieldRef: + fieldPath: spec.serviceAccountName + + command: ["/bin/bash"] + args: ["-c", "ansible-playbook ./experiments/kafka/kafka-broker-disk-failure/kafka-broker-disk-failure-ansible-logic.yml -vv -i /etc/ansible/hosts; exit 0"] + volumeMounts: + - name: parameters + mountPath: /mnt/ + volumes: + - name: parameters + secret: + secretName: %SECRET_NAME% + diff --git a/experiments/kafka/kafka-broker-pod-failure/README.md b/experiments/kafka/kafka-broker-pod-failure/README.md new file mode 100644 index 00000000000..61dbf98dc36 --- /dev/null +++ b/experiments/kafka/kafka-broker-pod-failure/README.md @@ -0,0 +1,55 @@ +### Sample ChaosEngine manifest to execute kafka broker kill experiment + +- To override experiment defaults, add the ENV variables in `spec.components` of the experiment. + + ```yml + apiVersion: litmuschaos.io/v1alpha1 + kind: ChaosEngine + metadata: + name: kafka-chaos + namespace: default + spec: + appinfo: + appns: default + applabel: 'app=cp-kafka' + appkind: statefulset + chaosServiceAccount: kafka-sa + monitoring: false + experiments: + - name: kafka-broker-pod-failure + spec: + components: + # choose based on available kafka broker replicas + - name: KAFKA_REPLICATION_FACTOR + value: '3' + + # get via "kubectl get pods --show-labels -n " + - name: KAFKA_LABEL + value: 'app=cp-kafka' + + - name: KAFKA_NAMESPACE + value: 'default' + + # get via "kubectl get svc -n " + - name: KAFKA_SERVICE + value: 'kafka-cp-kafka-headless' + + # get via "kubectl get svc -n + - name: KAFKA_PORT + value: '9092' + + - name: ZOOKEEPER_NAMESPACE + value: 'default' + + # get via "kubectl get pods --show-labels -n " + - name: ZOOKEEPER_LABEL + value: 'app=cp-zookeeper' + + # get via "kubectl get svc -n + - name: ZOOKEEPER_SERVICE + value: 'kafka-cp-zookeeper-headless' + + 
# get via "kubectl get svc -n + - name: ZOOKEEPER_PORT + value: '2181' + ``` \ No newline at end of file diff --git a/experiments/kafka/kafka-broker-pod-failure/chaosutil.j2 b/experiments/kafka/kafka-broker-pod-failure/chaosutil.j2 new file mode 100644 index 00000000000..cbf537e7f41 --- /dev/null +++ b/experiments/kafka/kafka-broker-pod-failure/chaosutil.j2 @@ -0,0 +1,7 @@ +{% if c_lib is defined and c_lib == 'chaoskube' %} + c_util: /chaoslib/chaoskube/pod_failure_by_chaoskube.yml +{% elif c_lib is defined and c_lib == 'powerfulseal' %} + c_util: /chaoslib/powerfulseal/pod_failure_by_powerfulseal.yml +{% else %} + c_util: /chaoslib/litmus/pod_failure_by_litmus.yml +{% endif %} diff --git a/experiments/kafka/kafka-broker-pod-failure/kafka-broker-pod-failure-ansible-logic.yml b/experiments/kafka/kafka-broker-pod-failure/kafka-broker-pod-failure-ansible-logic.yml new file mode 100644 index 00000000000..f33b4b4bf7f --- /dev/null +++ b/experiments/kafka/kafka-broker-pod-failure/kafka-broker-pod-failure-ansible-logic.yml @@ -0,0 +1,113 @@ +--- +- hosts: localhost + connection: local + + vars: + c_experiment: "kafka-broker-pod-failure" + c_duration: "{{ lookup('env','TOTAL_CHAOS_DURATION') }}" + c_interval: "{{ lookup('env','CHAOS_INTERVAL') }}" + c_force: "{{ lookup('env','FORCE') }}" + c_lib: "{{ lookup('env','LIB') }}" + kafka_instance: "{{ lookup('env','KAFKA_INSTANCE_NAME') }}" + kafka_ns: "{{ lookup('env','KAFKA_NAMESPACE') }}" + kafka_label: "{{ lookup('env','KAFKA_LABEL') }}" + kafka_kind: "{{ lookup('env','KAFKA_KIND') }}" + kafka_broker: "{{ lookup('env','KAFKA_BROKER') }}" + kafka_stream: "{{ lookup('env','KAFKA_LIVENESS_STREAM') }}" + kafka_service: "{{ lookup('env','KAFKA_SERVICE') }}" + kafka_port: "{{ lookup('env','KAFKA_PORT') }}" + kafka_replication_factor: "{{ lookup('env','KAFKA_REPLICATION_FACTOR') }}" + zk_ns: "{{ lookup('env','ZOOKEEPER_NAMESPACE') }}" + zk_label: "{{ lookup('env','ZOOKEEPER_LABEL') }}" + zk_service: "{{ 
lookup('env','ZOOKEEPER_SERVICE') }}" + zk_port: "{{ lookup('env','ZOOKEEPER_PORT') }}" + + tasks: + - block: + + - include: kafka-broker-pod-failure-ansible-prerequisites.yml + + - include_vars: + file: chaosutil.yml + + ## GENERATE EXP RESULT NAME + - block: + + - name: Construct chaos result name (experiment_name) + set_fact: + c_experiment: "{{ lookup('env','CHAOSENGINE') }}-{{ c_experiment }}" + + when: lookup('env','CHAOSENGINE') + + ## RECORD START-OF-EXPERIMENT IN LITMUSCHAOS RESULT CR + - include_tasks: /utils/runtime/update_chaos_result_resource.yml + vars: + status: 'SOT' + namespace: "{{ kafka_ns }}" + + ## PRE-CHAOS APPLICATION LIVENESS CHECK + + - name: Verify that the Kafka cluster is healthy + include_tasks: "/utils/apps/kafka/kafka_cluster_health.yml" + vars: + delay: 1 + retries: 60 + + ## SETUP KAFKA CHAOS INFRA AND DERIVE BROKERS UNDER TEST + + - include_tasks: "{{ kafka_broker_util }}" + + ## FAULT INJECTION + + - include_tasks: "{{ c_util }}" + vars: + app_ns: "{{ kafka_ns }}" + app_label: "{{ kafka_label }}" + + # derived from the 'kafka_broker_util' task + app_pod_name: "{{ kafka_broker }}" + + ## POST-CHAOS APPLICATION LIVENESS CHECK + + - name: Verify that the Kafka cluster is healthy + include_tasks: "/utils/apps/kafka/kafka_cluster_health.yml" + vars: + delay: 1 + retries: 60 + + ## CHECK FOR KAFKA LIVENESS & CLEANUP + + - block: + + - name: Verify that the Kafka liveness pod (pub-sub) is uninterrupted + include_tasks: "/utils/common/status_app_pod.yml" + vars: + a_ns: "{{ kafka_ns }}" + a_label: "name=kafka-liveness" + delay: 1 + retries: 60 + + - include_tasks: "/utils/apps/kafka/kafka_liveness_cleanup.yml" + + when: kafka_stream is defined and kafka_stream != '' + + - set_fact: + flag: "pass" + + + rescue: + - set_fact: + flag: "fail" + + - name: Cleanup kafka liveness pods if present + include_tasks: "/utils/apps/kafka/kafka_liveness_cleanup.yml" + ignore_errors: true + + always: + + ## RECORD END-OF-TEST IN LITMUSCHAOS RESULT CR + - 
include_tasks: /utils/runtime/update_chaos_result_resource.yml + vars: + status: 'EOT' + namespace: "{{ kafka_ns }}" + diff --git a/experiments/kafka/kafka-broker-pod-failure/kafka-broker-pod-failure-ansible-prerequisites.yml b/experiments/kafka/kafka-broker-pod-failure/kafka-broker-pod-failure-ansible-prerequisites.yml new file mode 100644 index 00000000000..10293ba0a95 --- /dev/null +++ b/experiments/kafka/kafka-broker-pod-failure/kafka-broker-pod-failure-ansible-prerequisites.yml @@ -0,0 +1,31 @@ +- name: Identify the chaos util to be invoked + template: + src: chaosutil.j2 + dest: chaosutil.yml + +- block: + + - set_fact: + kafka_broker_util: "/utils/apps/kafka/kafka_liveness_stream.yml" + when: kafka_stream is defined and kafka_stream != '' + + - set_fact: + kafka_broker_util: "/utils/apps/kafka/display_kafka_broker_info.yml" + when: kafka_stream is not defined or kafka_stream == '' + + when: kafka_broker is defined and kafka_broker != '' + +- block: + + - set_fact: + kafka_broker_util: "/utils/apps/kafka/kafka_launch_stream_derive_leader_broker.yml" + when: kafka_stream is defined and kafka_stream != '' + + - set_fact: + kafka_broker_util: "/utils/apps/kafka/kafka_select_broker.yml" + when: kafka_stream is not defined or kafka_stream == '' + + when: kafka_broker is not defined or kafka_broker == '' + + + diff --git a/experiments/kafka/kafka-broker-pod-failure/kafka-broker-pod-failure-k8s-job.yml b/experiments/kafka/kafka-broker-pod-failure/kafka-broker-pod-failure-k8s-job.yml new file mode 100644 index 00000000000..d739d78d57f --- /dev/null +++ b/experiments/kafka/kafka-broker-pod-failure/kafka-broker-pod-failure-k8s-job.yml @@ -0,0 +1,87 @@ +--- +apiVersion: batch/v1 +kind: Job +metadata: + generateName: kafka-broker-pod-failure- +spec: + template: + metadata: + labels: + experiment: kafka-broker-pod-failure + spec: + # Placeholder that is updated by the executor for automated runs + # Provide appropriate SA (with desired permissions) if executed manually + 
serviceAccountName: %CHAOS_SERVICE_ACCOUNT% + restartPolicy: Never + containers: + - name: ansibletest + image: litmuschaos/ansible-runner:ci + imagePullPolicy: Always + env: + - name: ANSIBLE_STDOUT_CALLBACK + value: 'default' + + - name: KAFKA_KIND + value: 'statefulset' + + - name: KAFKA_LIVENESS_STREAM + value: 'enabled' + + - name: TOTAL_CHAOS_DURATION + value: '15' + + - name: CHAOS_INTERVAL + value: '5' + + - name: FORCE + value: 'true' + + - name: KAFKA_INSTANCE_NAME + value: '' + + - name: KAFKA_NAMESPACE + value: '' + + - name: KAFKA_LABEL + value: '' + + - name: KAFKA_BROKER + value: '' + + - name: KAFKA_REPLICATION_FACTOR + value: '' + + - name: KAFKA_SERVICE + value: '' + + - name: KAFKA_PORT + value: '' + + - name: ZOOKEEPER_NAMESPACE + value: '' + + - name: ZOOKEEPER_LABEL + value: '' + + - name: ZOOKEEPER_SERVICE + value: '' + + - name: ZOOKEEPER_PORT + value: '' + + ## env var that describes the library used to execute the chaos + ## default: litmus. Supported values: litmus, powerfulseal, chaoskube + - name: LIB + value: '' + + - name: CHAOSENGINE + value: '' + + - name: CHAOS_SERVICE_ACCOUNT + valueFrom: + fieldRef: + fieldPath: spec.serviceAccountName + + command: ["/bin/bash"] + args: ["-c", "ansible-playbook ./experiments/kafka/kafka-broker-pod-failure/kafka-broker-pod-failure-ansible-logic.yml -vv -i /etc/ansible/hosts; exit 0"] + diff --git a/experiments/openebs/openebs-pool-container-failure/openebs_pool_kill_ansible_logic.yml b/experiments/openebs/openebs-pool-container-failure/openebs_pool_container_failure_ansible_logic.yml similarity index 95% rename from experiments/openebs/openebs-pool-container-failure/openebs_pool_kill_ansible_logic.yml rename to experiments/openebs/openebs-pool-container-failure/openebs_pool_container_failure_ansible_logic.yml index 2cf5450ce05..32ce62c69ea 100644 --- a/experiments/openebs/openebs-pool-container-failure/openebs_pool_kill_ansible_logic.yml +++ 
b/experiments/openebs/openebs-pool-container-failure/openebs_pool_container_failure_ansible_logic.yml @@ -11,6 +11,7 @@ chaos_duration: 600 chaos_iterations: "{{ lookup('env','CHAOS_ITERATIONS') }}" data_persistence: "{{ lookup('env','DATA_PERSISTENCE') }}" + lib_image: "{{ lookup('env','LIB_IMAGE') }}" liveness_label: "{{ lookup('env','LIVENESS_APP_LABEL') }}" liveness_namespace: "{{ lookup('env','LIVENESS_APP_NAMESPACE') }}" openebs_ns: "{{ lookup('env','OPENEBS_NS') }}" @@ -92,7 +93,7 @@ vars: status: 'LOAD' ns: "{{ a_ns }}" - app_pod_name: "{{ app_pod_name.stdout }}" + pod_name: "{{ app_pod_name.stdout }}" when: data_persistence != '' ## STORAGE FAULT INJECTION @@ -113,7 +114,7 @@ vars: status: 'VERIFY' ns: "{{ a_ns }}" - app_pod_name: "{{ app_pod_name.stdout }}" + pod_name: "{{ app_pod_name.stdout }}" when: data_persistence != '' - name: Get application pod name @@ -129,7 +130,7 @@ vars: status: 'DELETE' ns: "{{ a_ns }}" - app_pod_name: "{{ new_app_pod.stdout }}" + pod_name: "{{ new_app_pod.stdout }}" when: data_persistence != '' # Check application liveness post chaos diff --git a/experiments/openebs/openebs-pool-container-failure/openebs_pool_kill_k8s_job.yml b/experiments/openebs/openebs-pool-container-failure/openebs_pool_container_failure_k8s_job.yml similarity index 82% rename from experiments/openebs/openebs-pool-container-failure/openebs_pool_kill_k8s_job.yml rename to experiments/openebs/openebs-pool-container-failure/openebs_pool_container_failure_k8s_job.yml index 2ed4546fe87..c34ce43df9b 100644 --- a/experiments/openebs/openebs-pool-container-failure/openebs_pool_kill_k8s_job.yml +++ b/experiments/openebs/openebs-pool-container-failure/openebs_pool_container_failure_k8s_job.yml @@ -2,7 +2,7 @@ apiVersion: v1 kind: ConfigMap metadata: - name: data-persistence-configmap + name: openebs-pool-container-failure data: parameters.yml: | @@ -24,11 +24,12 @@ spec: image: litmuschaos/ansible-runner:ci imagePullPolicy: Always env: - - name: OPENEBS_NS - 
value: openebs - - name: ANSIBLE_STDOUT_CALLBACK value: 'default' + + #provide openebs namespace + - name: OPENEBS_NS + value: 'openebs' # provide application namespace - name: APP_NAMESPACE @@ -42,6 +43,11 @@ spec: - name: APP_PVC value: '' + # only pumba supported + # For pumba image use : gaiaadm/pumba:0.4.8 + - name: LIB_IMAGE + value: 'gaiaadm/pumba:0.4.8' + - name: LIVENESS_APP_LABEL value: '' @@ -66,4 +72,4 @@ spec: volumes: - name: parameters configMap: - name: data-persistence-configmap + name: openebs-pool-container-failure diff --git a/experiments/openebs/openebs-pool-pod-failure/README.md b/experiments/openebs/openebs-pool-pod-failure/README.md new file mode 100644 index 00000000000..5df32a2a921 --- /dev/null +++ b/experiments/openebs/openebs-pool-pod-failure/README.md @@ -0,0 +1,121 @@ +## Experiment Metadata + + + + + + + + + + + + + + +
Type Description Storage K8s Platform
Chaos Kill the pool pod and check if it gets scheduled again OPENEBS Any
+ +## Entry-Criteria + +- Application services are accessible & pods are healthy +- Application writes are successful + +## Exit-Criteria + +- Application services are accessible & pods are healthy +- Data written prior to chaos is successfully retrieved/read +- Database consistency is maintained as per db integrity check utils +- Storage target pods are healthy + +## Notes + +- Typically used as a disruptive test, to cause loss of access to storage pool by killing it. +- The pool pod should start again and it should be healthy. + +## Associated Utils + +- [cstor_pool_delete.yml](/experiments/openebs/openebs-pool-container-failure/cstor_pool_delete.yml) +- [cstor_pool_health_check.yml](/experiments/openebs/openebs-pool-container-failure/cstor_pool_health_check.yml) +- [cstor_verify_pool_provisioning.yml](/experiments/openebs/openebs-pool-container-failure/cstor_verify_pool_provisioning.yml) +- [cstor_delete_and_verify_pool_deployment.yml](/experiments/openebs/openebs-pool-container-failure/cstor_delete_and_verify_pool_deployment.yml) + +### Procedure + +This scenario validates the behaviour of the application and OpenEBS persistent volumes amidst chaos induced on the storage pool. The litmus experiment fails the specified pool, thereby losing access to the volumes created on it. + +After injecting the chaos into the component specified via environment variable, the litmus experiment observes the behaviour of the corresponding OpenEBS PV and the application which consumes the volume. + +Based on the value of env DATA_PERSISTENCE, the corresponding data consistency util will be executed. At present only busybox and percona-mysql are supported. 
Along with specifying env in the litmus experiment, user needs to pass name for configmap and the data consistency specific parameters required via configmap in the format as follows: + + parameters.yml: | + blocksize: 4k + blockcount: 1024 + testfile: difiletest + +It is recommended to pass test-name for configmap and mount the corresponding configmap as volume in the litmus pod. The above snippet holds the parameters required for validation data consistency in busybox application. + +For percona-mysql, the following parameters are to be injected into configmap. + + parameters.yml: | + dbuser: root + dbpassword: k8sDem0 + dbname: tdb + +The configmap data will be utilised by litmus experiments as its variables while executing the scenario. Based on the data provided, litmus checks if the data is consistent after recovering from induced chaos. + +## Litmusbook Environment Variables + +### Application + + + + + + + + + + + + + + + + + +
Parameter + Description
APP_NAMESPACE Namespace in which application pods are deployed
APP_LABEL Unique Labels in `key=value` format of application deployment
APP_PVC Name of persistent volume claim used for app's volume mounts
+ +### Chaos + + + + + + + + + + +
Parameter Description
CHAOS_ITERATIONS The number of chaos iterations
+ +### Health Checks + + + + + + + + + + + + + + + + + +
Parameter + Description
LIVENESS_APP_NAMESPACE Namespace in which external liveness pods are deployed, if any
LIVENESS_APP_LABEL Unique Labels in `key=value` format for external liveness pod, if any
DATA_PERSISTENCE Data accessibility & integrity verification post recovery. To check against busybox set value: "busybox" and for percona, set value: "mysql"
\ No newline at end of file diff --git a/experiments/openebs/openebs-pool-pod-failure/cstor_delete_and_verify_pool_deployment.yml b/experiments/openebs/openebs-pool-pod-failure/cstor_delete_and_verify_pool_deployment.yml new file mode 100644 index 00000000000..f1c79df5c0c --- /dev/null +++ b/experiments/openebs/openebs-pool-pod-failure/cstor_delete_and_verify_pool_deployment.yml @@ -0,0 +1,61 @@ +- name: Randomly select the pool deployment from cvr + shell: > + kubectl get cvr -n {{ openebs_ns }} + -l openebs.io/persistent-volume={{ pv.stdout }} --no-headers + -o=jsonpath='{range .items[*]}{.metadata.labels.cstorpool\.openebs\.io\/name}{"\n"}{end}' | + shuf -n1 | awk '{print $1}' + args: + executable: /bin/bash + register: pool_deployment + +- name: Get the resourceVersion of pool deployment + shell: > + kubectl get deployment {{ pool_deployment.stdout }} + -n {{ openebs_ns }} -o=jsonpath='{.metadata.resourceVersion}' + args: + executable: /bin/bash + register: pool_deployment_revisionSource_before + +- name: Get the pod of pool deployment + shell: > + kubectl get pods -n {{ openebs_ns }} | + grep {{ pool_deployment.stdout }} | grep -w "Running" | awk '{print $1}' + args: + executable: /bin/bash + register: cstor_pool_pod + +# including chaoslib kill-random-pod +- name: Delete the cstor pool pod for reschedule + include_tasks: /chaoslib/litmus/kill_random_pod.yml + vars: + app_ns: "{{ openebs_ns }}" + app_pod_name: "{{ cstor_pool_pod.stdout }}" + +- name: Check for pool pod in running state + shell: > + kubectl get pods -n {{ openebs_ns }} | + grep {{ pool_deployment.stdout }} | grep -v {{ cstor_pool_pod.stdout }} | + grep -w "Running" | wc -l + args: + executable: /bin/bash + register: cstor_pool_pod_cnt + until: "cstor_pool_pod_cnt.stdout == \"1\"" + delay: 30 + retries: 10 + +- name: Get resourceVersion after pod delete + shell: > + kubectl get deployment {{ pool_deployment.stdout }} + -n {{ openebs_ns }} -o=jsonpath='{.metadata.resourceVersion}' + args: + 
executable: /bin/bash + register: pool_deployment_revisionSource_after + +- name: Compare resourceVersions + debug: + msg: + - "Verified pool pods were restarted by fault injection" + - "Before: {{ pool_deployment_revisionSource_before.stdout }}" + - "After: {{ pool_deployment_revisionSource_after.stdout }}" + failed_when: "pool_deployment_revisionSource_before.stdout|int == pool_deployment_revisionSource_after.stdout|int" + diff --git a/experiments/openebs/openebs-pool-pod-failure/cstor_pool_delete.yml b/experiments/openebs/openebs-pool-pod-failure/cstor_pool_delete.yml new file mode 100644 index 00000000000..2941195bcb9 --- /dev/null +++ b/experiments/openebs/openebs-pool-pod-failure/cstor_pool_delete.yml @@ -0,0 +1,6 @@ +- include: cstor_verify_pool_provisioning.yml + +- include: cstor_delete_and_verify_pool_deployment.yml + loop: "{{ range(0, c_iterations|int, 1)|list }}" + +- include: cstor_pool_health_check.yml diff --git a/experiments/openebs/openebs-pool-pod-failure/cstor_pool_health_check.yml b/experiments/openebs/openebs-pool-pod-failure/cstor_pool_health_check.yml new file mode 100644 index 00000000000..c3885ca8598 --- /dev/null +++ b/experiments/openebs/openebs-pool-pod-failure/cstor_pool_health_check.yml @@ -0,0 +1,16 @@ +- name: Wait (soak) for I/O on pools + wait_for: + timeout: "{{ post_chaos_soak_time }}" + +- name: Fetch the pool pod name from cvr + include_tasks: "/utils/apps/openebs/fetch_podname_from_cvr.yaml" + +- name: Verify logs of pool pods for error strings + shell: > + kubectl logs {{ item }} -n {{ openebs_ns }} + -c cstor-pool | egrep '{{ error_messages }}' + args: + executable: /bin/bash + register: result + with_items: "{{ pool_pod_named_list }}" + failed_when: result.rc == 0 diff --git a/experiments/openebs/openebs-pool-pod-failure/cstor_verify_pool_provisioning.yml b/experiments/openebs/openebs-pool-pod-failure/cstor_verify_pool_provisioning.yml new file mode 100644 index 00000000000..4a953408659 --- /dev/null +++ 
b/experiments/openebs/openebs-pool-pod-failure/cstor_verify_pool_provisioning.yml @@ -0,0 +1,28 @@ + +- name: Fetch the replica count from storage class + include_tasks: "/utils/apps/openebs/fetch_replica_count_from_sc.yml" + +- name: Fetch the CVR count from pv + include_tasks: "/utils/apps/openebs/fetch_cvr_count_from_pv.yml" + +- name: Compare ReplicaCount and cvr_count to verify provisioning + debug: + msg: + - "replicacnt: {{ replicacnt }}" + - "cvr_count: {{ cvr_count| int }}" + failed_when: "replicacnt|int != cvr_count|int" + +- name: Get CVR status list from pv + shell: > + kubectl get cvr -n {{ openebs_ns }} + -l openebs.io/persistent-volume={{ pv.stdout }} --no-headers + -o jsonpath='{range .items[*]}{.status.phase}{"\n"}{end}' + args: + executable: /bin/bash + register: cvr_status_phase + +- name: Check status of cvr + command: echo "{{ item }}" + failed_when: "item != \"Offline\" and item != \"Degraded\" and item != \"Rebuilding\" and item != \"Healthy\"" + with_items: + - "{{ cvr_status_phase.stdout_lines }}" diff --git a/experiments/openebs/openebs-pool-pod-failure/data_persistence.j2 b/experiments/openebs/openebs-pool-pod-failure/data_persistence.j2 new file mode 100644 index 00000000000..405497dde21 --- /dev/null +++ b/experiments/openebs/openebs-pool-pod-failure/data_persistence.j2 @@ -0,0 +1,5 @@ +{% if data_persistence is defined and data_persistence == 'mysql' %} + consistencyutil: /utils/apps/mysql/mysql_data_persistence.yml + {% elif data_persistence is defined and data_persistence == 'busybox' %} + consistencyutil: /utils/apps/busybox/busybox_data_persistence.yml +{% endif %} \ No newline at end of file diff --git a/experiments/openebs/openebs-pool-pod-failure/openebs_pool_pod_failure_ansible_logic.yml b/experiments/openebs/openebs-pool-pod-failure/openebs_pool_pod_failure_ansible_logic.yml new file mode 100644 index 00000000000..8fe0d926e2a --- /dev/null +++ 
b/experiments/openebs/openebs-pool-pod-failure/openebs_pool_pod_failure_ansible_logic.yml @@ -0,0 +1,157 @@ +--- +- hosts: localhost + connection: local + + vars: + a_label: "{{ lookup('env','APP_LABEL') }}" + a_ns: "{{ lookup('env','APP_NAMESPACE') }}" + a_pvc: "{{ lookup('env','APP_PVC') }}" + c_duration: 600 + c_interval: 5 + c_engine: "{{ lookup('env','CHAOSENGINE') }}" + c_experiment: openebs-pool-pod-failure + c_force: "{{ lookup('env','FORCE') }}" + c_iterations: "{{ lookup('env','CHAOS_ITERATIONS') }}" + data_persistence: "{{ lookup('env','DATA_PERSISTENCE') }}" + liveness_label: "{{ lookup('env','LIVENESS_APP_LABEL') }}" + liveness_namespace: "{{ lookup('env','LIVENESS_APP_NAMESPACE') }}" + openebs_ns: "{{ lookup('env','OPENEBS_NS') | default('openebs', true) }}" # OPENEBS_NS from the job env; 'openebs' when unset or empty + pool_debug_msg: 'uncorrectable I/O failure|suspended|ERROR ZFS event' + + vars_files: + - /mnt/parameters.yml + - /experiments/openebs/openebs_components.yml + + tasks: + - block: + + ## PRE-CHAOS APPLICATION LIVENESS CHECK + + - include_tasks: /utils/common/application_liveness_check.yml + when: liveness_label != '' + + - name: Identify the data consistency util to be invoked + template: + src: data_persistence.j2 + dest: data_persistence.yml + + - include_vars: + file: data_persistence.yml + + - name: Record the chaos util path + set_fact: + chaos_util_path: "/experiments/openebs/openebs-pool-pod-failure/cstor_pool_delete.yml" + + - name: Record the data consistency util path + set_fact: + data_consistency_util_path: "{{ consistencyutil }}" + when: data_persistence != '' + + ## GENERATE EXP RESULT NAME + - block: + + - name: Construct chaos result name (experiment_name) + set_fact: + c_result: "{{ c_engine }}-{{ c_experiment }}" + + when: c_engine != '' + + ## RECORD START-OF-TEST IN LITMUS RESULT CR + + - include_tasks: /utils/runtime/create_testname.yml + + - include_tasks: /utils/runtime/update_chaos_result_resource.yml + vars: + status: 'SOT' + namespace: "{{ a_ns }}" + + ## DISPLAY APP INFORMATION + + - name: Display the app information 
passed via the test job + debug: + msg: + - "The application info is as follows:" + - "Namespace : {{ a_ns }}" + - "Label : {{ a_label }}" + - "PVC : {{ a_pvc }}" + + ## PRE-CHAOS APPLICATION LIVENESS CHECK + - name: Get application pod name + shell: > + kubectl get pods -n {{ a_ns }} -l {{ a_label }} --no-headers + -o=custom-columns=NAME:".metadata.name" + args: + executable: /bin/bash + register: app_pod_name + + - name: Verify that the AUT (Application Under Test) is running + include_tasks: "/utils/common/status_app_pod.yml" + vars: + application_name: "{{ app_pod_name.stdout }}" + delay: 5 + retries: 60 + + - name: Create some test data + include: "{{ data_consistency_util_path }}" + vars: + status: 'LOAD' + ns: "{{ a_ns }}" + pod_name: "{{ app_pod_name.stdout }}" + when: data_persistence != '' + + ## STORAGE FAULT INJECTION + + - include: "{{ chaos_util_path }}" + error_messages: "{{ pool_debug_msg }}" + post_chaos_soak_time : "{{ c_duration }}" + + ## POST-CHAOS APPLICATION LIVENESS CHECK + + - name: Verify AUT liveness post fault-injection + include_tasks: "/utils/common/status_app_pod.yml" + vars: + application_name: "{{ app_pod_name.stdout }}" + delay: 5 + retries: 60 + + - name: Verify application data persistence + include: "{{ data_consistency_util_path }}" + vars: + status: 'VERIFY' + ns: "{{ a_ns }}" + pod_name: "{{ app_pod_name.stdout }}" + when: data_persistence != '' + + - name: Get application pod name + shell: > + kubectl get pods -n {{ a_ns }} -l {{ a_label }} --no-headers + -o=custom-columns=NAME:".metadata.name" + args: + executable: /bin/bash + register: new_app_pod + + - name: Verify successful database delete + include: "{{ data_consistency_util_path }}" + vars: + status: 'DELETE' + ns: "{{ a_ns }}" + pod_name: "{{ new_app_pod.stdout }}" + when: data_persistence != '' + + # Check application liveness post chaos + - include_tasks: /utils/common/application_liveness_check.yml + when: liveness_label != '' + + - set_fact: + flag: "Pass" + + 
rescue: + - set_fact: + flag: "Fail" + + always: + ## RECORD END-OF-TEST IN LITMUS RESULT CR + - include_tasks: /utils/runtime/update_chaos_result_resource.yml + vars: + status: 'EOT' + namespace: "{{ a_ns }}" diff --git a/experiments/openebs/openebs-pool-pod-failure/openebs_pool_pod_failure_k8s_job.yml b/experiments/openebs/openebs-pool-pod-failure/openebs_pool_pod_failure_k8s_job.yml new file mode 100644 index 00000000000..ceea91fb857 --- /dev/null +++ b/experiments/openebs/openebs-pool-pod-failure/openebs_pool_pod_failure_k8s_job.yml @@ -0,0 +1,71 @@ +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: openebs-pool-pod-failure +data: + parameters.yml: | + +--- +apiVersion: batch/v1 +kind: Job +metadata: + generateName: openebs-pool-pod-failure- +spec: + template: + metadata: + labels: + name: openebs-pool-pod-failure + spec: + serviceAccountName: %CHAOS_SERVICE_ACCOUNT% + restartPolicy: Never + + containers: + - name: ansibletest + image: litmuschaos/ansible-runner:ci + imagePullPolicy: Always + env: + - name: ANSIBLE_STDOUT_CALLBACK + value: 'default' + + #provide openebs namespace + - name: OPENEBS_NS + value: 'openebs' + + #provide application namespace + - name: APP_NAMESPACE + value: '' + + #provide application labels + - name: APP_LABEL + value: '' + + #provide application pvc + - name: APP_PVC + value: '' + + - name: FORCE + value: 'true' + + - name: LIVENESS_APP_LABEL + value: '' + + - name: LIVENESS_APP_NAMESPACE + value: '' + + - name: DATA_PERSISTENCE + value: '' + + - name: CHAOS_ITERATIONS + value: '2' + + command: ["/bin/bash"] + args: ["-c", "ansible-playbook ./experiments/openebs/openebs-pool-pod-failure/openebs_pool_pod_failure_ansible_logic.yml -i /etc/ansible/hosts -vv; exit 0"] + + volumeMounts: + - name: parameters + mountPath: /mnt/ + volumes: + - name: parameters + configMap: + name: openebs-pool-pod-failure diff --git a/experiments/openebs/openebs-target-container-failure/README.md 
b/experiments/openebs/openebs-target-container-failure/README.md new file mode 100644 index 00000000000..7ea0b556461 --- /dev/null +++ b/experiments/openebs/openebs-target-container-failure/README.md @@ -0,0 +1,109 @@ +## Experiment Metadata + + + + + + + + + + + + + + +
Type Description Storage K8s Platform
Chaos Kill the cstor target/Jiva controller container and check if gets created again OPENEBS Any
+ +## Entry-Criteria + +- Application services are accessible & pods are healthy +- Application writes are successful + +## Exit-Criteria + +- Application services are accessible & pods are healthy +- Data written prior to chaos is successfully retrieved/read +- Database consistency is maintained as per db integrity check utils +- Storage target pods are healthy + +## Notes + +- Typically used as a disruptive test, to cause loss of access to storage target by killing the containers. +- The container should be created again and it should be healthy. + +## Associated Utils +- [cstor_target_container_kill.yml](/experiments/openebs/openebs-target-container-failure/cstor_target_container_kill.yml) +- [jiva_controller_container_kill.yml](/experiments/openebs/openebs-target-container-failure/jiva_controller_container_kill.yml) +- [fetch_sc_and_provisioner.yml](/utils/apps/openebs/fetch_sc_and_provisioner.yml) +- [target_affinity_check.yml](/utils/apps/openebs/target_affinity_check.yml) + +## Litmus experiment Environment Variables + +### Application + +<table> <tr>
Parameter + Description
APP_NAMESPACE Namespace in which application pods are deployed
APP_LABEL Unique Labels in `key=value` format of application deployment
APP_PVC Name of persistent volume claim used for app's volume mounts
DATA_PERSISTENCE Specify the application name against which data consistency has to be ensured. Example: busybox
+ +### Chaos + + + + + + + + + + +
CHAOS_TYPE The type of chaos to be induced.
TARGET_CONTAINER The container against which chaos has to be induced.
+ +### Procedure + +This scenario validates the behaviour of application and OpenEBS persistent volumes in the amidst of chaos induced on OpenEBS data plane and control plane components. + +After injecting the chaos into the component specified via environmental variable, litmus experiment observes the behaviour of corresponding OpenEBS PV and the application which consumes the volume. + +Based on the value of env `DATA_PERSISTENCE`, the corresponding data consistency util will be executed. At present only busybox and percona-mysql are supported. Along with specifying env in the litmus experiment, user needs to pass name for configmap and the data consistency specific parameters required via configmap in the format as follows: + +```yml + parameters.yml: | + blocksize: 4k + blockcount: 1024 + testfile: difiletest +``` + +It is recommended to pass test-name for configmap and mount the corresponding configmap as volume in the litmus pod. The above snippet holds the parameters required for validation data consistency in busybox application. + +For percona-mysql, the following parameters are to be injected into configmap. + +```yml + parameters.yml: | + dbuser: root + dbpassword: k8sDemo + dbname: tbd +``` + +The configmap data will be utilised by litmus experiments as its variables while executing the scenario. + +Based on the data provided, litmus checks if the data is consistent after recovering from induced chaos. 
diff --git a/experiments/openebs/openebs-target-container-failure/chaosutil.j2 b/experiments/openebs/openebs-target-container-failure/chaosutil.j2 new file mode 100644 index 00000000000..dd0c274cc55 --- /dev/null +++ b/experiments/openebs/openebs-target-container-failure/chaosutil.j2 @@ -0,0 +1,7 @@ +{% if stg_prov is defined and stg_prov == 'openebs.io/provisioner-iscsi' %} + {% if stg_engine is defined and stg_engine == 'cstor' %} + chaosutil: /experiments/openebs/openebs-target-container-failure/cstor_target_container_kill.yml + {% else %} + chaosutil: /experiments/openebs/openebs-target-container-failure/jiva_controller_container_kill.yml + {% endif %} +{% endif %} diff --git a/experiments/openebs/openebs-target-container-failure/cstor_target_container_kill.yml b/experiments/openebs/openebs-target-container-failure/cstor_target_container_kill.yml new file mode 100644 index 00000000000..2bae19b7bc6 --- /dev/null +++ b/experiments/openebs/openebs-target-container-failure/cstor_target_container_kill.yml @@ -0,0 +1,67 @@ +--- +- name: Pick the cstor target pod + include_tasks: /utils/apps/openebs/fetch_cstor_target_pod.yml + +- name: Get the restartCount of cstor-istgt container + shell: > + kubectl get pod {{ cstor_target_pod.stdout }} -n {{ openebs_ns }} + -o=jsonpath='{.status.containerStatuses[?(@.name==''"{{ target_container }}"'')].restartCount}' + args: + executable: /bin/bash + register: restartCount_before + +# including pumba chaoslib -> pod-failure-by-sigkill +- include_tasks: /chaoslib/pumba/pod_failure_by_sigkill.yaml + vars: + action: "killapp" + namespace: "{{ openebs_ns }}" + app_pod: "{{ cstor_target_pod.stdout }}" + app_container: "{{ target_container }}" + when: cri == 'docker' + +- include_tasks: /chaoslib/litmus/container_kill/containerd_chaos/crictl-chaos.yml + vars: + action: "killapp" + namespace: "{{ openebs_ns }}" + app_pod: "{{ cstor_target_pod.stdout }}" + app_container: "{{ target_container }}" + when: cri == 'containerd' + +- name: 
Check for target pod in running state + shell: > + kubectl get pod {{ cstor_target_pod.stdout }} -n {{ openebs_ns }} | + grep -w "Running" | wc -l + args: + executable: /bin/bash + register: cstor_target_pod_cnt + until: "cstor_target_pod_cnt.stdout == \"1\"" + delay: 30 + retries: 10 + +- name: Get the runningStatus of target pod + shell: > + kubectl get pod {{ cstor_target_pod.stdout }} -n {{ openebs_ns }} + -o=jsonpath='{range .status.containerStatuses[*]}{.state}{"\n"}{end}' | + grep -w running | wc -l + args: + executable: /bin/bash + register: runningStatusCount + until: "runningStatusCount.stdout == \"3\"" + delay: 30 + retries: 10 + +- name: Get the restartCount of cstor-istgt container + shell: > + kubectl get pod {{ cstor_target_pod.stdout }} -n {{ openebs_ns }} + -o=jsonpath='{.status.containerStatuses[?(@.name==''"{{target_container}}"'')].restartCount}' + args: + executable: /bin/bash + register: restartCount_after + +- name: Compare restartCounts # fails unless the killed container restarted exactly once + debug: + msg: + - "Verified target container was restarted by fault injection" + - "Before: {{ restartCount_before.stdout }}" + - "After: {{ restartCount_after.stdout }}" + failed_when: "restartCount_after.stdout|int != restartCount_before.stdout|int + 1" diff --git a/experiments/openebs/openebs-target-container-failure/data_persistence.j2 b/experiments/openebs/openebs-target-container-failure/data_persistence.j2 new file mode 100644 index 00000000000..8b0e7e500cc --- /dev/null +++ b/experiments/openebs/openebs-target-container-failure/data_persistence.j2 @@ -0,0 +1,5 @@ +{% if data_persistence is defined and data_persistence == 'mysql' %} + consistencyutil: /utils/apps/mysql/mysql_data_persistence.yml + {% elif data_persistence is defined and data_persistence == 'busybox' %} + consistencyutil: /utils/apps/busybox/busybox_data_persistence.yml +{% endif %} diff --git a/experiments/openebs/openebs-target-container-failure/jiva_controller_container_kill.yml 
b/experiments/openebs/openebs-target-container-failure/jiva_controller_container_kill.yml new file mode 100644 index 00000000000..b6e11f6aefa --- /dev/null +++ b/experiments/openebs/openebs-target-container-failure/jiva_controller_container_kill.yml @@ -0,0 +1,71 @@ +--- +- name: Pick the jiva controller pod + include_tasks: /utils/apps/openebs/fetch_jiva_controller_pod.yml + +- name: Record jiva controller container name + set_fact: + ctrl_container: "{{ pv.stdout }}-{{ jiva_controller_pod_suffix }}-{{ jiva_controller_container_suffix }}" + +- name: Get the restartCount of ctrl-con container + shell: > + kubectl get pods {{ jiva_controller_pod.stdout }} -n {{ a_ns }} + -o=jsonpath='{.status.containerStatuses[?(@.name==''"{{ctrl_container}}"'')].restartCount}' + args: + executable: /bin/bash + register: restartCount_before + +# including pumba chaoslib - pod-failure-by-sigkill +- include_tasks: /chaoslib/pumba/pod_failure_by_sigkill.yaml + vars: + action: "killapp" + app_pod: "{{ jiva_controller_pod.stdout }}" + namespace: "{{ a_ns }}" + app_container: "{{ ctrl_container }}" + when: cri == 'docker' + +- include_tasks: /chaoslib/litmus/container_kill/containerd_chaos/crictl-chaos.yml + vars: + action: "killapp" + app_pod: "{{ jiva_controller_pod.stdout }}" + namespace: "{{ a_ns }}" + app_container: "{{ ctrl_container }}" + when: cri == 'containerd' + +- name: Check if the controller pod is running + shell: > + kubectl get pod {{ jiva_controller_pod.stdout }} -n {{ a_ns }} --no-headers + -o custom-columns=:.status.phase + args: + executable: /bin/bash + register: result + until: "result.stdout == 'Running'" + delay: 5 + retries: 45 + +- name: Check for controller container status + shell: > + kubectl get pod {{ jiva_controller_pod.stdout }} -n {{ a_ns }} + -o=jsonpath='{range .status.containerStatuses[*]}{.state}{"\n"}{end}' | + grep -w running | wc -l + args: + executable: /bin/bash + register: runningStatusCount + until: "runningStatusCount.stdout == \"2\"" + 
delay: 30 + retries: 10 + +- name: Get the restartCount of ctrl-con container + shell: > + kubectl get pods {{ jiva_controller_pod.stdout }} -n {{ a_ns }} + -o=jsonpath='{.status.containerStatuses[?(@.name==''"{{ctrl_container}}"'')].restartCount}' + args: + executable: /bin/bash + register: restartCount_after + +- name: Compare restartCounts + debug: + msg: + - "Verified containers restartcounts after fault injection" + - "Before: {{ restartCount_before.stdout }}" + - "After: {{ restartCount_after.stdout }}" + failed_when: "{{ restartCount_after.stdout|int }} != {{ restartCount_before.stdout|int + 1 }}" diff --git a/experiments/openebs/openebs-target-container-failure/openebs_target_container_failure_ansible_logic.yml b/experiments/openebs/openebs-target-container-failure/openebs_target_container_failure_ansible_logic.yml new file mode 100644 index 00000000000..867e4dc02b4 --- /dev/null +++ b/experiments/openebs/openebs-target-container-failure/openebs_target_container_failure_ansible_logic.yml @@ -0,0 +1,163 @@ +--- +- hosts: localhost + connection: local + + vars: + a_label: "{{ lookup('env','APP_LABEL') }}" + a_ns: "{{ lookup('env','APP_NAMESPACE') }}" + a_pvc: "{{ lookup('env','APP_PVC') }}" + c_experiment: openebs-target-failure + c_force: "{{ lookup('env','FORCE') }}" + c_interval: "{{ lookup('env','CHAOS_INTERVAL') }}" + chaos_duration: "{{ lookup('env','CHAOS_DURATION') }}" + cri: "{{ lookup('env','CONTAINER_RUNTIME') }}" + data_persistence: "{{ lookup('env','DATA_PERSISTENCE') }}" + deploy_type: "{{ lookup('env','DEPLOY_TYPE') }}" + lib_image: "{{ lookup('env','LIB_IMAGE') }}" + liveness_label: "{{ lookup('env','LIVENESS_APP_LABEL') }}" + liveness_namespace: "{{ lookup('env','LIVENESS_APP_NAMESPACE') }}" + openebs_ns: "{{ lookup('env','OPENEBS_NAMESPACE') }}" + target_container: "{{ lookup('env','TARGET_CONTAINER') }}" + + vars_files: + - /mnt/parameters.yml + - /experiments/openebs/openebs_components.yml + + tasks: + - block: + + ## PRE-CHAOS APPLICATION 
LIVENESS CHECK + - include_tasks: /utils/common/application_liveness_check.yml + when: liveness_label != '' + + - include: test_prerequisites.yml + + - include_vars: + file: data_persistence.yml + + - include_vars: + file: chaosutil.yml + + - name: Record the chaos util path + set_fact: + chaos_util_path: "{{ chaosutil }}" + + - name: Record the data consistency util path + set_fact: + data_consistency_util_path: "{{ consistencyutil }}" + when: data_persistence != '' + + - include_tasks: /utils/runtime/create_testname.yml + + ## GENERATE EXP RESULT NAME + - block: + + - name: Construct chaos result name (experiment_name) + set_fact: + c_experiment: "{{ lookup('env','CHAOSENGINE') }}-{{ c_experiment }}" + + when: lookup('env','CHAOSENGINE') + + ## RECORD START-OF-TEST IN CHAOS RESULT CR + - include_tasks: /utils/runtime/update_chaos_result_resource.yml + vars: + status: 'SOT' + namespace: "{{ a_ns }}" + + ## DISPLAY APP INFORMATION + + - name: Display the app information passed via the test job + debug: + msg: + - "The application info is as follows:" + - "Namespace : {{ a_ns }}" + - "Target Namespace : {{ openebs_ns }}" + - "Label : {{ a_label }}" + - "PVC : {{ a_pvc }}" + - "StorageClass : {{ sc }}" + + ## PRE-CHAOS APPLICATION LIVENESS CHECK + + - name: Verify that the AUT (Application Under Test) is running + include_tasks: "/utils/common/status_app_pod.yml" + vars: + application_name: "{{ app_pod_name.stdout }}" + delay: 5 + retries: 60 + + - name: Get application pod name + shell: > + kubectl get pods -n {{ a_ns }} -l {{ a_label }} --no-headers + -o=custom-columns=NAME:".metadata.name" + args: + executable: /bin/bash + register: app_pod_name + + - name: Create some test data + include: "{{ data_consistency_util_path }}" + vars: + status: 'LOAD' + ns: "{{ a_ns }}" + pod_name: "{{ app_pod_name.stdout }}" + when: data_persistence != '' + + ## STORAGE FAULT INJECTION + + - include: "{{ chaos_util_path }}" + + ## POST-CHAOS APPLICATION LIVENESS CHECK + + - name: 
Wait (soak) for I/O on pools + wait_for: + timeout: "{{ chaos_duration }}" + + - name: Verify AUT liveness post fault-injection + include_tasks: "/utils/common/status_app_pod.yml" + vars: + application_name: "{{ app_pod_name.stdout }}" + delay: 5 + retries: 60 + + ## POST-CHAOS APPLICATION LIVENESS CHECK + - include_tasks: /utils/common/application_liveness_check.yml + when: liveness_label != '' + + - name: Get application pod name + shell: > + kubectl get pods -n {{ a_ns }} -l {{ a_label }} --no-headers + -o=custom-columns=NAME:".metadata.name" + args: + executable: /bin/bash + register: rescheduled_app_pod + + - name: Verify application data persistence + include: "{{ data_consistency_util_path }}" + vars: + status: 'VERIFY' + ns: "{{ a_ns }}" + pod_name: "{{ rescheduled_app_pod.stdout }}" + when: data_persistence != '' + + ## Check application-target pod affinity + - include_tasks: /utils/apps/openebs/target_affinity_check.yml + when: deploy_type == 'deployment' + + ## Check statefulset application-target pod affinity + - include_tasks: /utils/apps/openebs/sts_target_affinity_check.yml + when: deploy_type == 'statefulset' + + - set_fact: + flag: "Pass" + + rescue: + - set_fact: + flag: "Fail" + + always: + + ## RECORD END-OF-TEST IN CHAOS RESULT CR + + - include_tasks: /utils/runtime/update_chaos_result_resource.yml + vars: + status: 'EOT' + namespace: "{{ a_ns }}" diff --git a/experiments/openebs/openebs-target-container-failure/openebs_target_container_failure_k8s_job.yml b/experiments/openebs/openebs-target-container-failure/openebs_target_container_failure_k8s_job.yml new file mode 100644 index 00000000000..978b5257fb9 --- /dev/null +++ b/experiments/openebs/openebs-target-container-failure/openebs_target_container_failure_k8s_job.yml @@ -0,0 +1,95 @@ +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: openebs-target-container-failure +data: + parameters.yml: | + +--- +apiVersion: batch/v1 +kind: Job +metadata: + generateName: 
openebs-target-container-failure- +spec: + template: + metadata: + labels: + name: openebs-target-container-failure + spec: + serviceAccountName: %CHAOS_SERVICE_ACCOUNT% + restartPolicy: Never + containers: + - name: ansibletest + image: litmuschaos/ansible-runner:ci + imagePullPolicy: Always + env: + - name: ANSIBLE_STDOUT_CALLBACK + value: 'default' + + # provide application namespace + - name: APP_NAMESPACE + value: '' + + # provide openebs namespace + - name: OPENEBS_NAMESPACE + value: 'openebs' + + # provide application label + - name: APP_LABEL + value: '' + + # provide application pvc + - name: APP_PVC + value: '' + + # it can be true or false, depending upon scenario - allowed force deletion or not + - name: FORCE + value: 'true' + + - name: LIVENESS_APP_LABEL + value: '' + + # LIB_IMAGE can be - gaiaadm/pumba:0.4.8, gprasath/crictl:ci + # For pumba image use : gaiaadm/pumba:0.4.8 + # For containerd image use : gprasath/crictl:ci + - name: LIB_IMAGE + value: 'gaiaadm/pumba:0.4.8' + + - name: LIVENESS_APP_NAMESPACE + value: '' + + - name: DATA_PERSISTENCE + value: '' + + - name: CHAOS_INTERVAL + value: '5' + + - name: CHAOS_DURATION + value: '120' + + # Specify the container runtime used , to pick the relevant chaos util + - name: CONTAINER_RUNTIME + value: 'docker' + + # TARGET_CONTAINER values: cstor-volume-mgmt , cstor-istgt + # For cstor-volume-istgt container kill use : cstor-istgt + # For volume-mgmt-kill container use : cstor-volume-mgmt + + - name: TARGET_CONTAINER + value: 'cstor-volume-mgmt' + + # DEPLOY_TYPE values: deployment, statefulset + - name: DEPLOY_TYPE + value: 'deployment' + + command: ["/bin/bash"] + args: ["-c", "ansible-playbook ./experiments/openebs/openebs-target-container-failure/openebs_target_container_failure_ansible_logic.yml -i /etc/ansible/hosts -vv; exit 0"] + + volumeMounts: + - name: parameters + mountPath: /mnt/ + volumes: + - name: parameters + configMap: + name: openebs-target-container-failure diff --git 
a/experiments/openebs/openebs-target-container-failure/test_prerequisites.yml b/experiments/openebs/openebs-target-container-failure/test_prerequisites.yml new file mode 100644 index 00000000000..16dbe18619b --- /dev/null +++ b/experiments/openebs/openebs-target-container-failure/test_prerequisites.yml @@ -0,0 +1,36 @@ +--- +- name: Fetch sc and provisioner + include_tasks: /utils/apps/openebs/fetch_sc_and_provisioner.yml + +- block: + - name: Derive PV name from PVC to query storage engine type (openebs) + shell: > + kubectl get pvc {{ a_pvc }} -n {{ a_ns }} + --no-headers -o custom-columns=:spec.volumeName + args: + executable: /bin/bash + register: pv + + - name: Check for presence & value of cas type annotation + shell: > + kubectl get pv {{ pv.stdout }} --no-headers + -o jsonpath="{.metadata.annotations.openebs\\.io/cas-type}" + args: + executable: /bin/bash + register: openebs_stg_engine + + - name: Record the storage engine name + set_fact: + stg_engine: "{{ openebs_stg_engine.stdout }}" + when: stg_prov == "openebs.io/provisioner-iscsi" + +- name: Identify the chaos util to be invoked + template: + src: chaosutil.j2 + dest: chaosutil.yml + +- name: Identify the data consistency util to be invoked + template: + src: data_persistence.j2 + dest: data_persistence.yml + diff --git a/experiments/openebs/openebs-target-network-delay/README.md b/experiments/openebs/openebs-target-network-delay/README.md new file mode 100644 index 00000000000..b359ad3de42 --- /dev/null +++ b/experiments/openebs/openebs-target-network-delay/README.md @@ -0,0 +1,128 @@ +## Experiment Metadata + + + + + + + + + + + + + + +
Type Description Storage K8s Platform
Chaos Inject delay in storage target and verify the application availability OPENEBS Any
+ +## Entry-Criteria + +- Application services are accessible & pods are healthy +- Application writes are successful + +## Exit-Criteria + +- Application services are accessible & pods are healthy +- Data written prior to chaos is successfully retrieved/read +- Database consistency is maintained as per db integrity check utils +- Storage target pods are healthy + +## Notes + +- Typically used as a disruptive test, to cause loss of access to storage target by injecting network delay using pumba. +- The application pod should be healthy once it gets recovered. + +## Associated Utils + +- [cstor_target_network_delay.yaml](/experiments/openebs/openebs-target-network-delay/cstor_target_network_delay.yaml) +- [jiva_controller_network_delay.yaml](/experiments/openebs/openebs-target-network-delay/jiva_controller_network_delay.yaml) +- [fetch_sc_and_provisioner.yml](/utils/apps/openebs/fetch_sc_and_provisioner.yml) + +## Litmusbook Environment Variables + +### Application + + + + + + + + + + + + + + + + + +
Parameter + Description
APP_NAMESPACE Namespace in which application pods are deployed
APP_LABEL Unique Labels in `key=value` format of application deployment
APP_PVC Name of persistent volume claim used for app's volume mounts
+ +### Chaos + + + + + + + + + + + + + + +
Parameter Description
NETWORK_DELAY Egress network delay (in milliseconds) injected on the storage target
CHAOS_DURATION Duration (in milliseconds) for which the chaos is injected
+ +### Health Checks + + + + + + + + + + + + + + + + + +
Parameter + Description
LIVENESS_APP_NAMESPACE Namespace in which external liveness pods are deployed, if any
LIVENESS_APP_LABEL Unique Labels in `key=value` format for external liveness pod, if any
DATA_PERSISTENCE Application whose data consistency util verifies data accessibility & integrity post recovery (mysql, busybox); leave empty to skip
+ +### Procedure + +This scenario validates the behaviour of the application and OpenEBS persistent volumes amid chaos induced on OpenEBS data plane and control plane components. + +After injecting the chaos into the component specified via environmental variable, the litmus experiment observes the behaviour of the corresponding OpenEBS PV and the application which consumes the volume. + +Based on the value of env DATA_PERSISTENCE, the corresponding data consistency util will be executed. At present only busybox and percona-mysql are supported. Along with specifying env in the litmus experiment, the user needs to pass the name of the configmap and the data consistency specific parameters required via configmap in the format as follows: + +```yml + parameters.yml: | + blocksize: 4k + blockcount: 1024 + testfile: difiletest +``` + +It is recommended to pass test-name for configmap and mount the corresponding configmap as volume in the litmus pod. The above snippet holds the parameters required for validating data consistency in the busybox application. + +For percona-mysql, the following parameters are to be injected into configmap. + +```yml + parameters.yml: | + dbuser: root + dbpassword: k8sDem0 + dbname: tdb +``` + +The configmap data will be utilised by litmus experiments as its variables while executing the scenario. Based on the data provided, litmus checks if the data is consistent after recovering from induced chaos. 
\ No newline at end of file diff --git a/experiments/openebs/openebs-target-network-delay/chaosutil.j2 b/experiments/openebs/openebs-target-network-delay/chaosutil.j2 new file mode 100644 index 00000000000..5637a89b8b6 --- /dev/null +++ b/experiments/openebs/openebs-target-network-delay/chaosutil.j2 @@ -0,0 +1,7 @@ +{% if stg_prov is defined and stg_prov == 'openebs.io/provisioner-iscsi' %} + {% if stg_engine is defined and stg_engine == 'cstor' %} + chaosutil: /experiments/openebs/openebs-target-network-delay/cstor_target_network_delay.yaml + {% else %} + chaosutil: /experiments/openebs/openebs-target-network-delay/jiva_controller_network_delay.yaml + {% endif %} +{% endif %} diff --git a/experiments/openebs/openebs-target-network-delay/cstor_target_network_delay.yaml b/experiments/openebs/openebs-target-network-delay/cstor_target_network_delay.yaml new file mode 100644 index 00000000000..098d3285a85 --- /dev/null +++ b/experiments/openebs/openebs-target-network-delay/cstor_target_network_delay.yaml @@ -0,0 +1,18 @@ +--- +- name: Pick a cStor target pod belonging to the PV + shell: > + kubectl get pods -l {{ cstor_target_pod_label }} + -n {{ openebs_ns }} -o jsonpath='{.items[?(@.metadata.labels.openebs\.io/persistent-volume=="{{ pv_name }}")].metadata.name}' + args: + executable: /bin/bash + register: cstor_target_pod + +# including pumba lib -> network_chaos (n_delay and c_duration come from the experiment playbook vars) +- name: Inject egress delay of {{ n_delay }}ms on cstor target for {{ c_duration }}ms + include_tasks: /chaoslib/pumba/network_chaos/network_chaos.yml + vars: + n_interface: "eth0" + n_latency: "{{ n_delay }}" + c_container: "cstor-istgt" + app_pod: "{{ cstor_target_pod.stdout }}" + app_ns: "{{ openebs_ns }}" diff --git a/experiments/openebs/openebs-target-network-delay/data_persistence.j2 b/experiments/openebs/openebs-target-network-delay/data_persistence.j2 new file mode 100644 index 00000000000..8b0e7e500cc --- /dev/null +++ b/experiments/openebs/openebs-target-network-delay/data_persistence.j2 
@@ -0,0 +1,5 @@ +{% if data_persistence is defined and data_persistence == 'mysql' %} + consistencyutil: /utils/apps/mysql/mysql_data_persistence.yml + {% elif data_persistence is defined and data_persistence == 'busybox' %} + consistencyutil: /utils/apps/busybox/busybox_data_persistence.yml +{% endif %} diff --git a/experiments/openebs/openebs-target-network-delay/jiva_controller_network_delay.yaml b/experiments/openebs/openebs-target-network-delay/jiva_controller_network_delay.yaml new file mode 100644 index 00000000000..c203acb0773 --- /dev/null +++ b/experiments/openebs/openebs-target-network-delay/jiva_controller_network_delay.yaml @@ -0,0 +1,69 @@ +--- +- name: Identify the jiva controller pod belonging to the PV + shell: > + kubectl get pods -l {{ jiva_controller_pod_label }} + -n {{ a_ns }} -o jsonpath='{.items[?(@.metadata.labels.openebs\.io/persistent-volume=="{{ pv_name }}")].metadata.name}' + args: + executable: /bin/bash + register: jiva_controller_pod + +- name: Record the jiva controller pod and container name + set_fact: + jiva_controller_container_name: "{{ pv.stdout }}-{{ jiva_controller_pod_suffix }}-{{ jiva_controller_container_suffix }}" + jiva_controller_pod_name: "{{ jiva_controller_pod.stdout }}" + +- name: Get controller svc + shell: > + kubectl get svc -l {{ jiva_controller_svc_label }} + -n {{ a_ns }} -o=jsonpath='{.items[0].spec.clusterIP}' + args: + executable: /bin/bash + register: controller_svc + failed_when: controller_svc.stdout == "" + +- name: Install jq package inside a controller container + shell: > + kubectl exec -it {{ jiva_controller_pod.stdout }} -n {{ a_ns }} -c {{ jiva_controller_container_name }} + -- bash -c "apt-get update && apt-get install -y jq && apt-get install -y iproute2" + args: + executable: /bin/bash + +- name: Getting the ReplicaCount before injecting delay + shell: > + kubectl exec -it {{ jiva_controller_pod.stdout }} -n {{ a_ns }} + -c {{ jiva_controller_container_name }} curl 
http://"{{controller_svc.stdout}}":9501/v1/volumes | jq -r '.data[].replicaCount' + args: + executable: /bin/bash + register: rcount_before + +# including pumba lib -> network_chaos +- name: Inject egress delay of {{ n_delay }}ms on jiva controller for {{ c_duration }}ms + include_tasks: /chaoslib/pumba/network_chaos/network_chaos.yml + vars: + n_interface: "eth0" + n_latency: "{{ n_delay }}" + c_container: "{{ jiva_controller_container_name }}" + app_pod: "{{ jiva_controller_pod_name }}" + app_ns: "{{ a_ns }}" + +- name: Verifying the Replica getting disconnected + shell: > + kubectl exec -it {{ jiva_controller_pod.stdout }} -n {{ a_ns }} + -c {{ jiva_controller_container_name }} curl http://"{{controller_svc.stdout}}":9501/v1/volumes | jq -r '.data[].replicaCount' + args: + executable: /bin/bash + register: resp + until: resp.stdout != rcount_before.stdout + retries: 10 + delay: 15 + +- name: Verifying the replicas post network recovery + shell: > + kubectl exec -it {{ jiva_controller_pod.stdout }} -n {{ a_ns }} + -c {{ jiva_controller_container_name }} curl http://"{{controller_svc.stdout}}":9501/v1/volumes | jq -r '.data[].replicaCount' + args: + executable: /bin/bash + register: replica + until: replica.stdout == rcount_before.stdout + retries: 10 + delay: 15 diff --git a/experiments/openebs/openebs-target-network-delay/openebs_target_network_delay_ansible_logic.yml b/experiments/openebs/openebs-target-network-delay/openebs_target_network_delay_ansible_logic.yml new file mode 100644 index 00000000000..98007b43c6c --- /dev/null +++ b/experiments/openebs/openebs-target-network-delay/openebs_target_network_delay_ansible_logic.yml @@ -0,0 +1,137 @@ +--- +- hosts: localhost + connection: local + + vars: + a_label: "{{ lookup('env','APP_LABEL') }}" + a_ns: "{{ lookup('env','APP_NAMESPACE') }}" + a_pvc: "{{ lookup('env','APP_PVC') }}" + c_duration: "{{ lookup('env','CHAOS_DURATION') }}" + c_experiment: openebs-target-network-delay + lib_image: "{{ 
lookup('env','LIB_IMAGE') }}" + data_persistence: "{{ lookup('env','DATA_PERSISTENCE') }}" + liveness_label: "{{ lookup('env','LIVENESS_APP_LABEL') }}" + liveness_namespace: "{{ lookup('env','LIVENESS_APP_NAMESPACE') }}" + n_delay: "{{ lookup('env','NETWORK_DELAY') }}" + openebs_ns: "{{ lookup('env','OPENEBS_NAMESPACE') }}" + + vars_files: + - /mnt/parameters.yml + - /experiments/openebs/openebs_components.yml + + tasks: + - block: + + ## PRE-CHAOS APPLICATION LIVENESS CHECK + - include_tasks: /utils/common/application_liveness_check.yml + when: liveness_label != '' + + # Create test name append with run_id + - include_tasks: /utils/runtime/create_testname.yml + + - include: test_prerequisites.yml + + - include_vars: + file: data_persistence.yml + + - include_vars: + file: chaosutil.yml + + - name: Record the chaos util path + set_fact: + chaos_util_path: "{{ chaosutil }}" + + - name: Record the data consistency util path + set_fact: + data_consistency_util_path: "{{ consistencyutil }}" + when: data_persistence != '' + + ## GENERATE EXP RESULT NAME + - block: + + - name: Construct chaos result name (experiment_name) + set_fact: + c_experiment: "{{ lookup('env','CHAOSENGINE') }}-{{ c_experiment }}" + + when: lookup('env','CHAOSENGINE') + + ## RECORD START-OF-TEST IN CHAOS RESULT CR + + - include_tasks: /utils/runtime/update_chaos_result_resource.yml + vars: + status: 'SOT' + namespace: "{{ a_ns }}" + + ## DISPLAY APP INFORMATION + + - name: Display the app information passed via the test job + debug: + msg: + - "The application info is as follows:" + - "Namespace : {{ a_ns }}" + - "Label : {{ a_label }}" + - "PVC : {{ a_pvc }}" + - "StorageClass : {{ sc }}" + + ## PRE-CHAOS APPLICATION LIVENESS CHECK + + - name: Verify that the AUT (Application Under Test) is running + include_tasks: "/utils/common/status_app_pod.yml" + vars: + delay: 5 + retries: 60 + + - name: Get application pod name + shell: > + kubectl get pods -n {{ a_ns }} -l {{ a_label }} --no-headers + 
-o=custom-columns=NAME:".metadata.name" + args: + executable: /bin/bash + register: app_pod_name + + - name: Create some test data + include: "{{ data_consistency_util_path }}" + vars: + status: 'LOAD' + ns: "{{ a_ns }}" + pod_name: "{{ app_pod_name.stdout }}" + when: data_persistence != '' + + ## STORAGE FAULT INJECTION + + - include: "{{ chaos_util_path }}" + + ## POST-CHAOS APPLICATION LIVENESS CHECK + + - name: Verify AUT liveness post fault-injection + include_tasks: "/utils/common/status_app_pod.yml" + vars: + delay: 5 + retries: 60 + + - include_tasks: /utils/common/application_liveness_check.yml + when: liveness_label != '' + + - name: Verify application data persistence + include: "{{ data_consistency_util_path }}" + vars: + status: 'VERIFY' + ns: "{{ a_ns }}" + pod_name: "{{ app_pod_name.stdout }}" + when: data_persistence != '' + + - set_fact: + flag: "Pass" + + rescue: + - set_fact: + flag: "Fail" + + always: + + ## RECORD END-OF-TEST IN CHAOS RESULT CR + + - include_tasks: /utils/runtime/update_chaos_result_resource.yml + vars: + status: 'EOT' + namespace: "{{ a_ns }}" diff --git a/experiments/openebs/openebs-target-network-delay/openebs_target_network_delay_k8s_job.yml b/experiments/openebs/openebs-target-network-delay/openebs_target_network_delay_k8s_job.yml new file mode 100644 index 00000000000..eabd32bcad9 --- /dev/null +++ b/experiments/openebs/openebs-target-network-delay/openebs_target_network_delay_k8s_job.yml @@ -0,0 +1,76 @@ +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: openebs-target-network-delay +data: + parameters.yml: | + +--- +apiVersion: batch/v1 +kind: Job +metadata: + generateName: openebs-target-network-delay- +spec: + template: + metadata: + labels: + name: openebs-target-network-delay + spec: + serviceAccountName: %CHAOS_SERVICE_ACCOUNT% + restartPolicy: Never + containers: + - name: ansibletest + image: litmuschaos/ansible-runner:ci + imagePullPolicy: Always + env: + - name: ANSIBLE_STDOUT_CALLBACK + value: 'default' 
+ + # provide openebs namespace + - name: OPENEBS_NAMESPACE + value: 'openebs' + + # provide application namespace + - name: APP_NAMESPACE + value: '' + + # provide application label + - name: APP_LABEL + value: '' + + # provide application pvc + - name: APP_PVC + value: '' + + # only pumba supported + # For pumba image use : gaiaadm/pumba:0.4.8 + - name: LIB_IMAGE + value: 'gaiaadm/pumba:0.4.8' + + - name: NETWORK_DELAY + value: '60000' # in milliseconds + + - name: CHAOS_DURATION + value: '60000' # in milliseconds + + - name: LIVENESS_APP_LABEL + value: '' + + - name: LIVENESS_APP_NAMESPACE + value: '' + + - name: DATA_PERSISTENCE + value: '' + + command: ["/bin/bash"] + args: ["-c", "ansible-playbook ./experiments/openebs/openebs-target-network-delay/openebs_target_network_delay_ansible_logic.yml -i /etc/ansible/hosts -vv; exit 0"] + + volumeMounts: + - name: parameters + mountPath: /mnt/ + volumes: + - name: parameters + configMap: + name: openebs-target-network-delay + \ No newline at end of file diff --git a/experiments/openebs/openebs-target-network-delay/test_prerequisites.yml b/experiments/openebs/openebs-target-network-delay/test_prerequisites.yml new file mode 100644 index 00000000000..589ebafdd41 --- /dev/null +++ b/experiments/openebs/openebs-target-network-delay/test_prerequisites.yml @@ -0,0 +1,39 @@ +--- +- name: Fetch sc and provisioner + include_tasks: /utils/apps/openebs/fetch_sc_and_provisioner.yml + +- block: + - name: Derive PV name from PVC to query storage engine type (openebs) + shell: > + kubectl get pvc {{ a_pvc }} -n {{ a_ns }} + --no-headers -o custom-columns=:spec.volumeName + args: + executable: /bin/bash + register: pv + + - name: Record pv name + set_fact: + pv_name: "{{ pv.stdout }}" + + - name: Check for presence & value of cas type annotation + shell: > + kubectl get pv {{ pv_name }} --no-headers + -o jsonpath="{.metadata.annotations.openebs\\.io/cas-type}" + args: + executable: /bin/bash + register: openebs_stg_engine + + - 
name: Record the storage engine name + set_fact: + stg_engine: "{{ openebs_stg_engine.stdout }}" + when: stg_prov == "openebs.io/provisioner-iscsi" + +- name: Identify the chaos util to be invoked + template: + src: chaosutil.j2 + dest: chaosutil.yml + +- name: Identify the data consistency util to be invoked + template: + src: data_persistence.j2 + dest: data_persistence.yml \ No newline at end of file diff --git a/experiments/openebs/openebs-target-network-loss/README.md b/experiments/openebs/openebs-target-network-loss/README.md new file mode 100644 index 00000000000..bccde806998 --- /dev/null +++ b/experiments/openebs/openebs-target-network-loss/README.md @@ -0,0 +1,128 @@ +## Experiment Metadata + + + + + + + + + + + + + + + + +
Type Description Storage Application K8s Platform
Chaos Inject n/w packet loss on storage target/controller OPENEBS Percona MySQL Any
+ +## Entry-Criteria + +- Application services are accessible & pods are healthy +- Application writes are successful + +## Exit-Criteria + +- Application services are accessible & pods should not be in running state +- Storage target pods are healthy + +## Notes + +- Typically used as a disruptive test, to cause loss of access to storage by injecting prolonged network packet loss +- Tests Recovery workflows for the PV & data integrity post recovery + +## Associated Utils + +- [cstor_target_network_delay.yaml](/experiments/openebs/openebs-target-network-loss/cstor_target_network_delay.yaml) +- [jiva_controller_network_delay.yaml](/experiments/openebs/openebs-target-network-loss/jiva_controller_network_delay.yaml) +- [fetch_sc_and_provisioner.yml](/utils/apps/openebs/fetch_sc_and_provisioner.yml) + +## Litmus experiment Environment Variables + +### Application + + + + + + + + + + + + + + + + + +
Parameter + Description
APP_NAMESPACE Namespace in which application pods are deployed
APP_LABEL Unique Labels in `key=value` format of application deployment
APP_PVC Name of persistent volume claim used for app's volume mounts
+ +### Chaos + + + + + + + + + + + + + + +
Parameter Description
NETWORK_PACKET_LOSS_PERCENTAGE Egress packet loss (in percent) on the target pod
CHAOS_DURATION Period (in msec) for which the induced packet loss is maintained
+ +### Health Checks + + + + + + + + + + + + + + + + + +
Parameter + Description
LIVENESS_APP_NAMESPACE Namespace in which external liveness pods are deployed, if any
LIVENESS_APP_LABEL Unique Labels in `key=value` format for external liveness pod, if any
DATA_PERSISTENCE Application whose data consistency util verifies data accessibility & integrity post recovery (mysql, busybox); leave empty to skip
+ +### Procedure + +This scenario validates the behaviour of the application and OpenEBS persistent volumes amid chaos induced on OpenEBS data plane and control plane components. + +After injecting the chaos into the component specified via environmental variable, the litmus experiment observes the behaviour of the corresponding OpenEBS PV and the application which consumes the volume. + +Based on the value of env DATA_PERSISTENCE, the corresponding data consistency util will be executed. At present only busybox and percona-mysql are supported. Along with specifying env in the litmus experiment, the user needs to pass the name of the configmap and the data consistency specific parameters required via configmap in the format as follows: + +```yml + parameters.yml: | + blocksize: 4k + blockcount: 1024 + testfile: difiletest +``` + +It is recommended to pass test-name for configmap and mount the corresponding configmap as volume in the litmus pod. The above snippet holds the parameters required for validating data consistency in the busybox application. + +For percona-mysql, the following parameters are to be injected into configmap. + +```yml + parameters.yml: | + dbuser: root + dbpassword: k8sDem0 + dbname: tdb +``` + +The configmap data will be utilised by litmus experiments as its variables while executing the scenario. Based on the data provided, litmus checks if the data is consistent after recovering from induced chaos. 
diff --git a/experiments/openebs/openebs-target-network-loss/chaosutil.j2 b/experiments/openebs/openebs-target-network-loss/chaosutil.j2 new file mode 100644 index 00000000000..8ca7686917f --- /dev/null +++ b/experiments/openebs/openebs-target-network-loss/chaosutil.j2 @@ -0,0 +1,7 @@ +{% if stg_prov is defined and stg_prov == 'openebs.io/provisioner-iscsi' %} + {% if stg_engine is defined and stg_engine == 'cstor' %} + chaosutil: /experiments/openebs/openebs-target-network-loss/cstor_target_network_delay.yaml + {% else %} + chaosutil: /experiments/openebs/openebs-target-network-loss/jiva_controller_network_delay.yaml + {% endif %} +{% endif %} diff --git a/experiments/openebs/openebs-target-network-loss/cstor_target_network_delay.yaml b/experiments/openebs/openebs-target-network-loss/cstor_target_network_delay.yaml new file mode 100644 index 00000000000..13979d1d2ca --- /dev/null +++ b/experiments/openebs/openebs-target-network-loss/cstor_target_network_delay.yaml @@ -0,0 +1,19 @@ +--- +- name: Pick a cStor target pod belonging to the PV + shell: > + kubectl get pods -l {{ cstor_target_pod_label }} + -n {{ openebs_ns }} -o jsonpath='{.items[?(@.metadata.labels.openebs\.io/persistent-volume=="{{ pv_name }}")].metadata.name}' + args: + executable: /bin/bash + register: cstor_target_pod + +# including pumba lib -> network_chaos (packet_loss_perc and c_duration come from the experiment playbook vars) +- name: Inject {{ packet_loss_perc }}% egress packet loss on cstor target for {{ c_duration }}ms + include_tasks: /chaoslib/pumba/network_chaos/network_chaos.yml + vars: + n_interface: "eth0" + n_packet_loss: "{{ packet_loss_perc }}" + c_container: "cstor-istgt" + app_pod: "{{ cstor_target_pod.stdout }}" + app_ns: "{{ openebs_ns }}" + \ No newline at end of file diff --git a/experiments/openebs/openebs-target-network-loss/data_persistence.j2 b/experiments/openebs/openebs-target-network-loss/data_persistence.j2 new file mode 100644 index 00000000000..8b0e7e500cc --- /dev/null +++ 
b/experiments/openebs/openebs-target-network-loss/data_persistence.j2 @@ -0,0 +1,5 @@ +{% if data_persistence is defined and data_persistence == 'mysql' %} + consistencyutil: /utils/apps/mysql/mysql_data_persistence.yml + {% elif data_persistence is defined and data_persistence == 'busybox' %} + consistencyutil: /utils/apps/busybox/busybox_data_persistence.yml +{% endif %} diff --git a/experiments/openebs/openebs-target-network-loss/jiva_controller_network_delay.yaml b/experiments/openebs/openebs-target-network-loss/jiva_controller_network_delay.yaml new file mode 100644 index 00000000000..733a5facf3a --- /dev/null +++ b/experiments/openebs/openebs-target-network-loss/jiva_controller_network_delay.yaml @@ -0,0 +1,70 @@ +--- +- name: Identify the jiva controller pod belonging to the PV + shell: > + kubectl get pods -l {{ jiva_controller_pod_label }} + -n {{ a_ns }} -o jsonpath='{.items[?(@.metadata.labels.openebs\.io/persistent-volume=="{{ pv_name }}")].metadata.name}' + args: + executable: /bin/bash + register: jiva_controller_pod + +- name: Record the jiva controller pod and container name + set_fact: + jiva_controller_container_name: "{{ pv.stdout }}-{{ jiva_controller_pod_suffix }}-{{ jiva_controller_container_suffix }}" + jiva_controller_pod_name: "{{ jiva_controller_pod.stdout }}" + +- name: Get controller svc + shell: > + kubectl get svc -l {{ jiva_controller_svc_label }} + -n {{ a_ns }} -o=jsonpath='{.items[0].spec.clusterIP}' + args: + executable: /bin/bash + register: controller_svc + failed_when: controller_svc.stdout == "" + +- name: Install jq package inside a controller container + shell: > + kubectl exec -it {{ jiva_controller_pod.stdout }} -n {{ a_ns }} -c {{ jiva_controller_container_name }} + -- bash -c "apt-get update && apt-get install -y jq && apt-get install -y iproute2" + args: + executable: /bin/bash + +- name: Getting the ReplicaCount before injecting delay + shell: > + kubectl exec -it {{ jiva_controller_pod.stdout }} -n {{ a_ns }} + -c {{ 
jiva_controller_container_name }} curl http://"{{controller_svc.stdout}}":9501/v1/volumes | jq -r '.data[].replicaCount' + args: + executable: /bin/bash + register: rcount_before + +# including pumba lib -> network_chaos (packet_loss_perc and c_duration come from the experiment playbook vars) +- name: Inject {{ packet_loss_perc }}% egress packet loss on jiva controller for {{ c_duration }}ms + include_tasks: /chaoslib/pumba/network_chaos/network_chaos.yml + vars: + n_interface: "eth0" + n_packet_loss: "{{ packet_loss_perc }}" + c_container: "{{ jiva_controller_container_name }}" + app_pod: "{{ jiva_controller_pod_name }}" + app_ns: "{{ a_ns }}" + +- name: Verifying the Replica getting disconnected + shell: > + kubectl exec -it {{ jiva_controller_pod.stdout }} -n {{ a_ns }} + -c {{ jiva_controller_container_name }} curl http://"{{controller_svc.stdout}}":9501/v1/volumes | jq -r '.data[].replicaCount' + args: + executable: /bin/bash + register: resp + until: resp.stdout != rcount_before.stdout + retries: 10 + delay: 15 + +- name: Verifying the replicas post network recovery + shell: > + kubectl exec -it {{ jiva_controller_pod.stdout }} -n {{ a_ns }} + -c {{ jiva_controller_container_name }} curl http://"{{controller_svc.stdout}}":9501/v1/volumes | jq -r '.data[].replicaCount' + args: + executable: /bin/bash + register: replica + until: replica.stdout == rcount_before.stdout + retries: 10 + delay: 15 + \ No newline at end of file diff --git a/experiments/openebs/openebs-target-network-loss/openebs_target_network_loss_ansible_logic.yml b/experiments/openebs/openebs-target-network-loss/openebs_target_network_loss_ansible_logic.yml new file mode 100644 index 00000000000..3edb78bfe3b --- /dev/null +++ b/experiments/openebs/openebs-target-network-loss/openebs_target_network_loss_ansible_logic.yml @@ -0,0 +1,153 @@ +--- +- hosts: localhost + connection: local + + vars: + a_label: "{{ lookup('env','APP_LABEL') }}" + a_ns: "{{ lookup('env','APP_NAMESPACE') }}" + a_pvc: "{{ lookup('env','APP_PVC') }}" + c_duration: "{{ lookup('env','CHAOS_DURATION') }}" + 
c_experiment: "openebs-target-network-loss" + c_force: "{{ lookup('env','FORCE') }}" + c_interval: "5" + data_persistence: "{{ lookup('env','DATA_PERSISTENCE') }}" + lib_image: "{{ lookup('env','LIB_IMAGE') }}" + liveness_label: "{{ lookup('env','LIVENESS_APP_LABEL') }}" + liveness_namespace: "{{ lookup('env','LIVENESS_APP_NAMESPACE') }}" + packet_loss_perc: "{{ lookup('env','NETWORK_PACKET_LOSS_PERCENTAGE') }}" + openebs_ns: "{{ lookup('env','OPENEBS_NAMESPACE') }}" + + vars_files: + - /mnt/parameters.yml + - /experiments/openebs/openebs_components.yml + + tasks: + - block: + + - include_tasks: /utils/common/application_liveness_check.yml + when: liveness_label != '' + + ## DERIVE THE APP STORAGE CLASS AND CHAOS UTIL TO USE + + - include: test_prerequisites.yml + + - include_vars: + file: data_persistence.yml + + - include_vars: + file: chaosutil.yml + + - name: Record the chaos util path + set_fact: + chaos_util_path: "{{ chaosutil }}" + + - name: Record the data consistency util path + set_fact: + data_consistency_util_path: "{{ consistencyutil }}" + when: data_persistence != '' + + ## RECORD START-OF-TEST IN CHAOS RESULT CR + + - include_tasks: /utils/runtime/create_testname.yml + + ## GENERATE EXP RESULT NAME + - block: + + - name: Construct chaos result name (experiment_name) + set_fact: + c_experiment: "{{ lookup('env','CHAOSENGINE') }}-{{ c_experiment }}" + + when: lookup('env','CHAOSENGINE') + + - include_tasks: /utils/runtime/update_chaos_result_resource.yml + vars: + status: 'SOT' + namespace: "{{ a_ns }}" + + ## DISPLAY APP INFORMATION + + - name: Display the app information passed via the test job + debug: + msg: + - "The application info is as follows:" + - "Namespace : {{ a_ns }}" + - "Label : {{ a_label }}" + - "PVC : {{ a_pvc }}" + - "StorageClass : {{ sc }}" + + ## PRE-CHAOS APPLICATION LIVENESS CHECK + - name: Verify that the AUT (Application Under Test) is running + include_tasks: "/utils/common/status_app_pod.yml" + vars: + delay: 5 + retries: 
60 + + - name: Get application pod name + shell: > + kubectl get pods -n {{ a_ns }} -l {{ a_label }} --no-headers + -o=custom-columns=NAME:".metadata.name" + args: + executable: /bin/bash + register: app_pod + + - name: Create some test data + include: "{{ data_consistency_util_path }}" + vars: + status: 'LOAD' + ns: "{{ a_ns }}" + pod_name: "{{ app_pod.stdout }}" + when: data_persistence != '' + + ## STORAGE FAULT INJECTION + + - include: "{{ chaos_util_path }}" + + ## POST-CHAOS APPLICATION LIVENESS CHECK + + # including chaoslib kill-random-pod + - name: Kill the application pod + include_tasks: /chaoslib/litmus/kill_random_pod.yml + vars: + app_ns: "{{ a_ns }}" + app_pod_name: "{{ app_pod.stdout }}" + + - name: Verify if the application pod is deleted + shell: > + kubectl get pods -n {{ a_ns }} + args: + executable: /bin/bash + register: podstatus + until: '"{{ app_pod.stdout }}" not in podstatus.stdout' + retries: 2 + delay: 150 + + - name: Obtain the newly created pod name for application + shell: > + kubectl get pods -n {{ a_ns }} -l {{ a_label }} -o jsonpath='{.items[].metadata.name}' + args: + executable: /bin/bash + register: newpod_name + + - name: Checking application pod is not in running state + shell: kubectl get pods -n {{ a_ns }} -o jsonpath='{.items[?(@.metadata.name=="{{ newpod_name.stdout }}")].status.containerStatuses[*].state.waiting.reason}' + register: result + until: "((result.stdout.split()|unique)|length) == 1 and 'Running' not in result.stdout" + delay: 2 + retries: 150 + + - set_fact: + flag: "Pass" + + rescue: + - set_fact: + flag: "Fail" + + always: + + ## RECORD END-OF-TEST IN CHAOS RESULT CR + + - include_tasks: /utils/runtime/update_chaos_result_resource.yml + vars: + status: 'EOT' + namespace: "{{ a_ns }}" + \ No newline at end of file diff --git a/experiments/openebs/openebs-target-network-loss/openebs_target_network_loss_k8s_job.yml b/experiments/openebs/openebs-target-network-loss/openebs_target_network_loss_k8s_job.yml new 
file mode 100644 index 00000000000..bd8fd7f4cd8 --- /dev/null +++ b/experiments/openebs/openebs-target-network-loss/openebs_target_network_loss_k8s_job.yml @@ -0,0 +1,79 @@ +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: openebs-target-network-loss +data: + parameters.yml: | + +--- +apiVersion: batch/v1 +kind: Job +metadata: + generateName: openebs-target-network-loss- +spec: + template: + metadata: + labels: + name: openebs-target-network-loss + spec: + serviceAccountName: %CHAOS_SERVICE_ACCOUNT% + restartPolicy: Never + containers: + - name: ansibletest + image: litmuschaos/ansible-runner:ci + imagePullPolicy: Always + env: + - name: ANSIBLE_STDOUT_CALLBACK + value: 'default' + + # provide openebs namespace + - name: OPENEBS_NAMESPACE + value: 'openebs' + + # provide application namespace + - name: APP_NAMESPACE + value: '' + + # provide application label + - name: APP_LABEL + value: '' + + - name: FORCE + value: 'true' + + # provide application pvc + - name: APP_PVC + value: '' + + # only pumba supported + # For pumba image use : gaiaadm/pumba:0.4.8 + - name: LIB_IMAGE + value: 'gaiaadm/pumba:0.4.8' + + - name: NETWORK_PACKET_LOSS_PERCENTAGE + value: '100' # in percentage + + - name: CHAOS_DURATION + value: '240000' # in milliseconds + + - name: LIVENESS_APP_LABEL + value: '' + + - name: LIVENESS_APP_NAMESPACE + value: '' + + - name: DATA_PERSISTENCE + value: '' + + command: ["/bin/bash"] + args: ["-c", "ansible-playbook ./experiments/openebs/openebs-target-network-loss/openebs_target_network_loss_ansible_logic.yml -i /etc/ansible/hosts -vv; exit 0"] + + volumeMounts: + - name: parameters + mountPath: /mnt/ + volumes: + - name: parameters + configMap: + name: openebs-target-network-loss + \ No newline at end of file diff --git a/experiments/openebs/openebs-target-network-loss/test_prerequisites.yml b/experiments/openebs/openebs-target-network-loss/test_prerequisites.yml new file mode 100644 index 00000000000..589ebafdd41 --- /dev/null +++ 
b/experiments/openebs/openebs-target-network-loss/test_prerequisites.yml @@ -0,0 +1,39 @@ +--- +- name: Fetch sc and provisioner + include_tasks: /utils/apps/openebs/fetch_sc_and_provisioner.yml + +- block: + - name: Derive PV name from PVC to query storage engine type (openebs) + shell: > + kubectl get pvc {{ a_pvc }} -n {{ a_ns }} + --no-headers -o custom-columns=:spec.volumeName + args: + executable: /bin/bash + register: pv + + - name: Record pv name + set_fact: + pv_name: "{{ pv.stdout }}" + + - name: Check for presence & value of cas type annotation + shell: > + kubectl get pv {{ pv_name }} --no-headers + -o jsonpath="{.metadata.annotations.openebs\\.io/cas-type}" + args: + executable: /bin/bash + register: openebs_stg_engine + + - name: Record the storage engine name + set_fact: + stg_engine: "{{ openebs_stg_engine.stdout }}" + when: stg_prov == "openebs.io/provisioner-iscsi" + +- name: Identify the chaos util to be invoked + template: + src: chaosutil.j2 + dest: chaosutil.yml + +- name: Identify the data consistency util to be invoked + template: + src: data_persistence.j2 + dest: data_persistence.yml \ No newline at end of file diff --git a/experiments/openebs/openebs-target-pod-failure/README.md b/experiments/openebs/openebs-target-pod-failure/README.md new file mode 100644 index 00000000000..318116c8113 --- /dev/null +++ b/experiments/openebs/openebs-target-pod-failure/README.md @@ -0,0 +1,98 @@ +## Experiment Metadata + + + + + + + + + + + + + + +
Type Description Storage K8s Platform
Chaos Kill the cstor/jiva target/controller pod and check if it gets created again OPENEBS Any
+ +## Entry-Criteria + +- Application services are accessible & pods are healthy +- Application writes are successful + +## Exit-Criteria + +- Application services are accessible & pods are healthy +- Data written prior to chaos is successfully retrieved/read +- Database consistency is maintained as per db integrity check utils +- Storage target pods are healthy + +### Notes + +- Typically used as a disruptive test, to cause loss of access to storage target by killing the containers. +- The container should be created again and it should be healthy. + +## Associated Utils +- [cstor_target_failure.yaml](/experiments/openebs/openebs-target-pod-failure/cstor_target_failure.yaml) +- [jiva_controller_pod_failure.yaml](/experiments/openebs/openebs-target-pod-failure/jiva_controller_pod_failure.yaml) +- [fetch_cstor_target_pod.yml](/utils/apps/openebs/fetch_cstor_target_pod.yml) +- [fetch_jiva_controller_pod.yml](/utils/apps/openebs/fetch_jiva_controller_pod.yml) +- [fetch_sc_and_provisioner.yml](/utils/apps/openebs/fetch_sc_and_provisioner.yml) +- [target_affinity_check.yml](/utils/apps/openebs/target_affinity_check.yml) + +## Litmus experiment Environment Variables + +### Application + + + + + + + + + + + + + + + + + + + + + +
Parameter + Description
APP_NAMESPACE Namespace in which application pods are deployed
APP_LABEL Unique Labels in `key=value` format of application deployment
APP_PVC Name of persistent volume claim used for app's volume mounts
DATA_PERSISTENCE Specify the application name against which data consistency has to be ensured. Example: busybox
+ +### Procedure + +This scenario validates the behaviour of application and OpenEBS persistent volumes amidst chaos induced on OpenEBS data plane and control plane components. + +After injecting the chaos into the component specified via environment variable, litmus experiment observes the behaviour of corresponding OpenEBS PV and the application which consumes the volume. + +Based on the value of env `DATA_PERSISTENCE`, the corresponding data consistency util will be executed. At present only busybox and percona-mysql are supported. Along with specifying env in the litmus experiment, user needs to pass name for configmap and the data consistency specific parameters required via configmap in the format as follows: + +```yml + parameters.yml: | + blocksize: 4k + blockcount: 1024 + testfile: difiletest +``` + +It is recommended to pass test-name for configmap and mount the corresponding configmap as volume in the litmus pod. The above snippet holds the parameters required for validating data consistency in the busybox application. + +For percona-mysql, the following parameters are to be injected into configmap. + +```yml + parameters.yml: | + dbuser: root + dbpassword: k8sDemo + dbname: tbd +``` + +The configmap data will be utilised by litmus experiments as its variables while executing the scenario. + +Based on the data provided, litmus checks if the data is consistent after recovering from induced chaos. 
diff --git a/experiments/openebs/openebs-target-pod-failure/chaosutil.j2 b/experiments/openebs/openebs-target-pod-failure/chaosutil.j2 new file mode 100644 index 00000000000..6b179e7cd90 --- /dev/null +++ b/experiments/openebs/openebs-target-pod-failure/chaosutil.j2 @@ -0,0 +1,7 @@ +{% if stg_prov is defined and stg_prov == 'openebs.io/provisioner-iscsi' %} + {% if stg_engine is defined and stg_engine == 'cstor' %} + chaosutil: /experiments/openebs/openebs-target-pod-failure/cstor_target_failure.yaml + {% else %} + chaosutil: /experiments/openebs/openebs-target-pod-failure/jiva_controller_pod_failure.yaml + {% endif %} +{% endif %} diff --git a/experiments/openebs/openebs-target-pod-failure/cstor_target_failure.yaml b/experiments/openebs/openebs-target-pod-failure/cstor_target_failure.yaml new file mode 100644 index 00000000000..b9a7934fe3a --- /dev/null +++ b/experiments/openebs/openebs-target-pod-failure/cstor_target_failure.yaml @@ -0,0 +1,39 @@ +--- +- name: Pick the cstor target pod + include_tasks: /utils/apps/openebs/fetch_cstor_target_pod.yml + +- name: Record the cstor target deployment of the PV + set_fact: + cstor_target_deploy: "{{ pv.stdout }}-{{ cstor_target_pod_suffix }}" + +- name: Get the resourceVersion of the target deploy before fault injection + shell: > + kubectl get deployment {{ cstor_target_deploy }} + -n {{ openebs_ns }} -o=jsonpath='{.metadata.resourceVersion}' + args: + executable: /bin/bash + register: rv_bef + +# including litmus chaoslib -> kill-random-pod +- name: Kill the cstor target pod + include_tasks: /chaoslib/litmus/kill_random_pod.yml + vars: + app_ns: "{{ openebs_ns }}" + app_pod_name: "{{ cstor_target_pod.stdout }}" + +- name: Wait for 10s post fault injection + wait_for: + timeout: 10 + +- name: Get the resourceVersion of the target deploy after fault injection + shell: > + kubectl get deployment {{ cstor_target_deploy }} + -n {{ openebs_ns }} -o=jsonpath='{.metadata.resourceVersion}' + args: + executable: /bin/bash + 
register: rv_aft + +- name: Compare resourceVersions of target deployment + debug: + msg: "Verified target pods were restarted by fault injection" + failed_when: "rv_bef.stdout | int == rv_aft.stdout | int" diff --git a/experiments/openebs/openebs-target-pod-failure/data_persistence.j2 b/experiments/openebs/openebs-target-pod-failure/data_persistence.j2 new file mode 100644 index 00000000000..8b0e7e500cc --- /dev/null +++ b/experiments/openebs/openebs-target-pod-failure/data_persistence.j2 @@ -0,0 +1,5 @@ +{% if data_persistence is defined and data_persistence == 'mysql' %} + consistencyutil: /utils/apps/mysql/mysql_data_persistence.yml + {% elif data_persistence is defined and data_persistence == 'busybox' %} + consistencyutil: /utils/apps/busybox/busybox_data_persistence.yml +{% endif %} diff --git a/experiments/openebs/openebs-target-pod-failure/jiva_controller_pod_failure.yaml b/experiments/openebs/openebs-target-pod-failure/jiva_controller_pod_failure.yaml new file mode 100644 index 00000000000..c47dfb9b976 --- /dev/null +++ b/experiments/openebs/openebs-target-pod-failure/jiva_controller_pod_failure.yaml @@ -0,0 +1,86 @@ +--- +- name: Pick the jiva controller pod + include_tasks: /utils/apps/openebs/fetch_jiva_controller_pod.yml + +- name: Record the jiva controller deployment and container name + set_fact: + jiva_controller_deploy: "{{ pv.stdout }}-{{ jiva_controller_pod_suffix }}" + jiva_controller_name: "{{ pv.stdout }}-{{ jiva_controller_pod_suffix }}-{{ jiva_controller_container_suffix }}" + +- name: Get the resourceVersion of the target deploy before fault injection + shell: > + kubectl get deploy {{ jiva_controller_deploy }} -n {{ a_ns }} + -o=custom-columns=NAME:".metadata.resourceVersion" --no-headers + args: + executable: /bin/bash + register: rv_bef + +- name: Get controller svc + shell: > + kubectl get svc -l {{ jiva_controller_svc_label }} + -n {{ a_ns }} -o=jsonpath='{.items[0].spec.clusterIP}' + args: + executable: /bin/bash + register: 
controller_svc + failed_when: controller_svc.stdout == "" + +- name: Install jq package inside a controller container + shell: > + kubectl exec -it {{ jiva_controller_pod.stdout }} -n {{ a_ns }} -c {{ jiva_controller_name }} + -- bash -c "apt-get update && apt-get install -y jq" + args: + executable: /bin/bash + +- name: Getting the Replicastatus before killing controller + shell: > + kubectl exec -it {{ jiva_controller_pod.stdout }} -n {{ a_ns }} + -c {{ jiva_controller_name }} curl http://"{{controller_svc.stdout}}":9501/v1/replicas | jq -r '.data[].mode' + args: + executable: /bin/bash + register: rstatus_before + +# including litmus chaoslib -> kill-random-pod +- name: Kill the jiva controller pod + include_tasks: /chaoslib/litmus/kill_random_pod.yml + vars: + app_ns: "{{ a_ns }}" + app_pod_name: "{{ jiva_controller_pod.stdout }}" + +- name: Get jiva controller pod belonging to the PV + shell: > + kubectl get pods --no-headers -l {{ jiva_controller_pod_label }} -n {{ a_ns }} + -o jsonpath="{.items[?(@.metadata.labels.openebs\\.io/persistent-volume==\"{{pv.stdout}}\")].metadata.name}" + args: + executable: /bin/bash + register: jctrl_pod_after + +- name: Install jq package inside a controller container + shell: > + kubectl exec -it {{ jctrl_pod_after.stdout }} -n {{ a_ns }} -c {{ jiva_controller_name }} + -- bash -c "apt-get update && apt-get install -y jq" + args: + executable: /bin/bash + +- name: Getting the Replicastatus after killing the controller + shell: > + kubectl exec -it {{ jctrl_pod_after.stdout }} -n {{ a_ns }} + -c {{ jiva_controller_name }} curl http://"{{controller_svc.stdout}}":9501/v1/replicas | jq -r '.data[].mode' + args: + executable: /bin/bash + register: rstatus_after + until: "rstatus_after.stdout_lines == rstatus_before.stdout_lines and 'RW' in rstatus_after.stdout" + retries: 30 + delay: 10 + +- name: Get the resourceVersion of the target deploy after fault injection + shell: > + kubectl get deploy {{ jiva_controller_deploy }} -n {{ 
a_ns }} + -o=custom-columns=NAME:".metadata.resourceVersion" --no-headers + args: + executable: /bin/bash + register: rv_aft + +- name: Compare resourceVersions of target deployment + debug: + msg: "Verified target pods were restarted by fault injection" + failed_when: "rv_bef.stdout | int == rv_aft.stdout | int" diff --git a/experiments/openebs/openebs-target-pod-failure/openebs_target_pod_failure_ansible_logic.yml b/experiments/openebs/openebs-target-pod-failure/openebs_target_pod_failure_ansible_logic.yml new file mode 100644 index 00000000000..a225f807d9f --- /dev/null +++ b/experiments/openebs/openebs-target-pod-failure/openebs_target_pod_failure_ansible_logic.yml @@ -0,0 +1,160 @@ +--- +- hosts: localhost + connection: local + + vars: + a_label: "{{ lookup('env','APP_LABEL') }}" + a_ns: "{{ lookup('env','APP_NAMESPACE') }}" + a_pvc: "{{ lookup('env','APP_PVC') }}" + c_experiment: openebs-target-failure + c_force: "{{ lookup('env','FORCE') }}" + c_interval: "{{ lookup('env','CHAOS_INTERVAL') }}" + chaos_duration: "{{ lookup('env','CHAOS_DURATION') }}" + data_persistence: "{{ lookup('env','DATA_PERSISTENCE') }}" + deploy_type: "{{ lookup('env','DEPLOY_TYPE') }}" + liveness_label: "{{ lookup('env','LIVENESS_APP_LABEL') }}" + liveness_namespace: "{{ lookup('env','LIVENESS_APP_NAMESPACE') }}" + openebs_ns: "{{ lookup('env','OPENEBS_NAMESPACE') }}" + + vars_files: + - /mnt/parameters.yml + - /experiments/openebs/openebs_components.yml + + tasks: + - block: + + ## PRE-CHAOS APPLICATION LIVENESS CHECK + - include_tasks: /utils/common/application_liveness_check.yml + when: liveness_label != '' + + - include: test_prerequisites.yml + + - include_vars: + file: data_persistence.yml + + - include_vars: + file: chaosutil.yml + + - name: Record the chaos util path + set_fact: + chaos_util_path: "{{ chaosutil }}" + + - name: Record the data consistency util path + set_fact: + data_consistency_util_path: "{{ consistencyutil }}" + when: data_persistence != '' + + - 
include_tasks: /utils/runtime/create_testname.yml + + ## GENERATE EXP RESULT NAME + - block: + + - name: Construct chaos result name (experiment_name) + set_fact: + c_experiment: "{{ lookup('env','CHAOSENGINE') }}-{{ c_experiment }}" + + when: lookup('env','CHAOSENGINE') + + ## RECORD START-OF-TEST IN CHAOS RESULT CR + - include_tasks: /utils/runtime/update_chaos_result_resource.yml + vars: + status: 'SOT' + namespace: "{{ a_ns }}" + + ## DISPLAY APP INFORMATION + + - name: Display the app information passed via the test job + debug: + msg: + - "The application info is as follows:" + - "Namespace : {{ a_ns }}" + - "Target Namespace : {{ openebs_ns }}" + - "Label : {{ a_label }}" + - "PVC : {{ a_pvc }}" + - "StorageClass : {{ sc }}" + + ## PRE-CHAOS APPLICATION LIVENESS CHECK + + - name: Verify that the AUT (Application Under Test) is running + include_tasks: "/utils/common/status_app_pod.yml" + vars: + application_name: "{{ app_pod_name.stdout }}" + delay: 5 + retries: 60 + + - name: Get application pod name + shell: > + kubectl get pods -n {{ a_ns }} -l {{ a_label }} --no-headers + -o=custom-columns=NAME:".metadata.name" + args: + executable: /bin/bash + register: app_pod_name + + - name: Create some test data + include: "{{ data_consistency_util_path }}" + vars: + status: 'LOAD' + ns: "{{ a_ns }}" + pod_name: "{{ app_pod_name.stdout }}" + when: data_persistence != '' + + ## STORAGE FAULT INJECTION + + - include: "{{ chaos_util_path }}" + + ## POST-CHAOS APPLICATION LIVENESS CHECK + + - name: Wait (soak) for I/O on pools + wait_for: + timeout: "{{ chaos_duration }}" + + - name: Verify AUT liveness post fault-injection + include_tasks: "/utils/common/status_app_pod.yml" + vars: + application_name: "{{ app_pod_name.stdout }}" + delay: 5 + retries: 60 + + ## POST-CHAOS APPLICATION LIVENESS CHECK + - include_tasks: /utils/common/application_liveness_check.yml + when: liveness_label != '' + + - name: Get application pod name + shell: > + kubectl get pods -n {{ a_ns }} 
-l {{ a_label }} --no-headers + -o=custom-columns=NAME:".metadata.name" + args: + executable: /bin/bash + register: rescheduled_app_pod + + - name: Verify application data persistence + include: "{{ data_consistency_util_path }}" + vars: + status: 'VERIFY' + ns: "{{ a_ns }}" + pod_name: "{{ rescheduled_app_pod.stdout }}" + when: data_persistence != '' + + ## Check application-target pod affinity + - include_tasks: /utils/apps/openebs/target_affinity_check.yml + when: deploy_type == 'deployment' + + ## Check statefulset application-target pod affinity + - include_tasks: /utils/apps/openebs/sts_target_affinity_check.yml + when: deploy_type == 'statefulset' + + - set_fact: + flag: "Pass" + + rescue: + - set_fact: + flag: "Fail" + + always: + + ## RECORD END-OF-TEST IN CHAOS RESULT CR + + - include_tasks: /utils/runtime/update_chaos_result_resource.yml + vars: + status: 'EOT' + namespace: "{{ a_ns }}" diff --git a/experiments/openebs/openebs-target-pod-failure/openebs_target_pod_failure_k8s_job.yml b/experiments/openebs/openebs-target-pod-failure/openebs_target_pod_failure_k8s_job.yml new file mode 100644 index 00000000000..f7dbc1627d0 --- /dev/null +++ b/experiments/openebs/openebs-target-pod-failure/openebs_target_pod_failure_k8s_job.yml @@ -0,0 +1,78 @@ +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: openebs-target-pod-failure +data: + parameters.yml: | + +--- +apiVersion: batch/v1 +kind: Job +metadata: + generateName: openebs-target-pod-failure- +spec: + template: + metadata: + labels: + name: openebs-target-pod-failure + spec: + serviceAccountName: %CHAOS_SERVICE_ACCOUNT% + restartPolicy: Never + containers: + - name: ansibletest + image: litmuschaos/ansible-runner:ci + imagePullPolicy: Always + env: + - name: ANSIBLE_STDOUT_CALLBACK + value: 'default' + + # provide application namespace + - name: APP_NAMESPACE + value: '' + + # provide openebs namespace + - name: OPENEBS_NAMESPACE + value: 'openebs' + + # provide application label + - name: APP_LABEL + 
value: '' + + # provide application pvc + - name: APP_PVC + value: '' + + # it can be true or false, depending upon scenario - allowed force deletion or not + - name: FORCE + value: 'true' + + - name: LIVENESS_APP_LABEL + value: '' + + - name: LIVENESS_APP_NAMESPACE + value: '' + + - name: DATA_PERSISTENCE + value: '' + + - name: CHAOS_INTERVAL + value: '5' + + - name: CHAOS_DURATION + value: '120' + + # DEPLOY_TYPE values: deployment, statefulset + - name: DEPLOY_TYPE + value: 'deployment' + + command: ["/bin/bash"] + args: ["-c", "ansible-playbook ./experiments/openebs/openebs-target-pod-failure/openebs_target_pod_failure_ansible_logic.yml -i /etc/ansible/hosts -vv; exit 0"] + + volumeMounts: + - name: parameters + mountPath: /mnt/ + volumes: + - name: parameters + configMap: + name: openebs-target-pod-failure diff --git a/experiments/openebs/openebs-target-pod-failure/test_prerequisites.yml b/experiments/openebs/openebs-target-pod-failure/test_prerequisites.yml new file mode 100644 index 00000000000..16dbe18619b --- /dev/null +++ b/experiments/openebs/openebs-target-pod-failure/test_prerequisites.yml @@ -0,0 +1,36 @@ +--- +- name: Fetch sc and provisioner + include_tasks: /utils/apps/openebs/fetch_sc_and_provisioner.yml + +- block: + - name: Derive PV name from PVC to query storage engine type (openebs) + shell: > + kubectl get pvc {{ a_pvc }} -n {{ a_ns }} + --no-headers -o custom-columns=:spec.volumeName + args: + executable: /bin/bash + register: pv + + - name: Check for presence & value of cas type annotation + shell: > + kubectl get pv {{ pv.stdout }} --no-headers + -o jsonpath="{.metadata.annotations.openebs\\.io/cas-type}" + args: + executable: /bin/bash + register: openebs_stg_engine + + - name: Record the storage engine name + set_fact: + stg_engine: "{{ openebs_stg_engine.stdout }}" + when: stg_prov == "openebs.io/provisioner-iscsi" + +- name: Identify the chaos util to be invoked + template: + src: chaosutil.j2 + dest: chaosutil.yml + +- name: 
Identify the data consistency util to be invoked + template: + src: data_persistence.j2 + dest: data_persistence.yml + diff --git a/experiments/openebs/openebs_components.yml b/experiments/openebs/openebs_components.yml index 143aca30ac2..68b67a189ce 100644 --- a/experiments/openebs/openebs_components.yml +++ b/experiments/openebs/openebs_components.yml @@ -1 +1,7 @@ pool_label: "app=cstor-pool" +cstor_target_pod_suffix: "target" +cstor_target_pod_label: "openebs.io/target=cstor-target" +jiva_controller_pod_suffix: "ctrl" +jiva_controller_container_suffix: "con" +jiva_controller_pod_label: "openebs.io/controller=jiva-controller" +jiva_controller_svc_label: "openebs.io/controller-service=jiva-controller-svc" diff --git a/utils/apps/kafka/display_kafka_broker_info.yml b/utils/apps/kafka/display_kafka_broker_info.yml new file mode 100644 index 00000000000..94b89fe438d --- /dev/null +++ b/utils/apps/kafka/display_kafka_broker_info.yml @@ -0,0 +1,7 @@ +- debug: + msg: "{{ kafka_broker }}" + when: kafka_broker != '' + +- debug: + msg: "kafka broker will be selected randomly across the cluster" + when: kafka_broker == '' diff --git a/utils/apps/kafka/kafka_cluster_health.yml b/utils/apps/kafka/kafka_cluster_health.yml new file mode 100644 index 00000000000..9d6e3b7de28 --- /dev/null +++ b/utils/apps/kafka/kafka_cluster_health.yml @@ -0,0 +1,16 @@ +--- +- name: Verify that all kafka pods are running + include_tasks: "/utils/common/status_app_pod.yml" + vars: + delay: 1 + retries: 60 + a_ns: "{{ kafka_ns }}" + a_label: "{{ kafka_label }}" + +- name: Verify that all zookeeper pods are running + include_tasks: "/utils/common/status_app_pod.yml" + vars: + delay: 1 + retries: 60 + a_ns: "{{ zk_ns }}" + a_label: "{{ zk_label }}" diff --git a/utils/apps/kafka/kafka_launch_stream_derive_leader_broker.yml b/utils/apps/kafka/kafka_launch_stream_derive_leader_broker.yml new file mode 100644 index 00000000000..3df02d783e3 --- /dev/null +++ 
b/utils/apps/kafka/kafka_launch_stream_derive_leader_broker.yml @@ -0,0 +1,3 @@ +- include_tasks: "/utils/apps/kafka/kafka_liveness_stream.yml" +- include_tasks: "/utils/apps/kafka/kafka_select_broker.yml" +- include_tasks: "/utils/apps/kafka/display_kafka_broker_info.yml" diff --git a/utils/apps/kafka/kafka_liveness.j2 b/utils/apps/kafka/kafka_liveness.j2 new file mode 100644 index 00000000000..18261e0462c --- /dev/null +++ b/utils/apps/kafka/kafka_liveness.j2 @@ -0,0 +1,56 @@ +--- +apiVersion: v1 +kind: Pod +metadata: + name: kafka-liveness + labels: + name: kafka-liveness +spec: + initContainers: + - name: kafka-topic-creator + image: litmuschaos/kafka-client:ci + imagePullPolicy: Always + env: + - name: TOPIC_NAME + value: {{ kafka_topic }} + - name: KAFKA_INSTANCE_NAME + value: {{ kafka_instance }} + - name: ZOOKEEPER_SERVICE + value: {{ zk_service }} + - name: ZOOKEEPER_PORT + value: "{{ zk_port }}" + - name: REPLICATION_FACTOR + value: "{{ kafka_replication_factor }}" + command: + - sh + - -c + - "./topic.sh" + containers: + - name: kafka-producer + image: litmuschaos/kafka-client:ci + imagePullPolicy: Always + env: + - name: TOPIC_NAME + value: {{ kafka_topic }} + - name: KAFKA_SERVICE + value: {{ kafka_service }} + - name: KAFKA_PORT + value: "{{ kafka_port }}" + command: + - sh + - -c + - "./producer.sh" + - name: kafka-consumer + image: litmuschaos/kafka-client:ci + imagePullPolicy: Always + env: + - name: TOPIC_NAME + value: {{ kafka_topic }} + - name: KAFKA_SERVICE + value: {{ kafka_service }} + - name: KAFKA_PORT + value: "{{ kafka_port }}" + command: + - sh + - -c + - "./consumer.sh" diff --git a/utils/apps/kafka/kafka_liveness_cleanup.yml b/utils/apps/kafka/kafka_liveness_cleanup.yml new file mode 100644 index 00000000000..cc35015fc65 --- /dev/null +++ b/utils/apps/kafka/kafka_liveness_cleanup.yml @@ -0,0 +1,16 @@ +- name: Remove the Kafka liveness pod + shell: + kubectl delete -f kafka_liveness.yml -n {{ kafka_ns }} + args: + executable: /bin/bash 
+ register: result + +- name: Confirm that the Kafka liveness pod is deleted successfully + shell: + kubectl get pod -l name=kafka-liveness --no-headers -n {{ kafka_ns }} + args: + executable: /bin/bash + register: result + until: "'Running' not in result.stdout" + delay: 1 + retries: 120 diff --git a/utils/apps/kafka/kafka_liveness_stream.yml b/utils/apps/kafka/kafka_liveness_stream.yml new file mode 100644 index 00000000000..50e1faf666a --- /dev/null +++ b/utils/apps/kafka/kafka_liveness_stream.yml @@ -0,0 +1,85 @@ +- name: Generate a random strint as suffix to topic name + shell: echo $(mktemp) | cut -d '.' -f 2 + args: + executable: /bin/bash + register: uniqstr + +- name: Set the kafka topic name to a variable + set_fact: + kafka_topic: "topic-{{ uniqstr.stdout }}" + +- name: Generate the kafka liveness spec from template + template: + src: /utils/apps/kafka/kafka_liveness.j2 + dest: kafka_liveness.yml + +- name: Apply the pub-sub kafka liveness applicaton + shell: + kubectl apply -f kafka_liveness.yml -n {{ kafka_ns }} + args: + executable: /bin/bash + register: result + failed_when: "result.rc != 0" + +- name: Confirm that the kafka liveness pod is running + shell: + kubectl get pod -l name=kafka-liveness --no-headers -n {{ kafka_ns }} + args: + executable: /bin/bash + register: result + until: "'Running' in result.stdout" + delay: 1 + retries: 120 + +- name: Fetch the kafka-liveness pod name + shell: + kubectl get pods -n {{ kafka_ns }} -l name=kafka-liveness -o jsonpath='{.items[0].metadata.name}' + register: kafka_liveness_pod + +- block: + + - name: Obtain the leader broker ordinality for the topic (partition) created by kafka-liveness + shell: > + kubectl exec {{ kafka_liveness_pod.stdout }} -n {{ kafka_ns }} -c kafka-consumer + -- kafka-topics --topic {{ kafka_topic }} --describe --zookeeper {{ zk_service }}:{{ zk_port }} + | grep -o 'Leader: [^[:space:]]*' | awk '{print $2}' + args: + executable: /bin/bash + register: ordinality_non_instance + 
failed_when: "ordinality_non_instance.rc != 0" + + - set_fact: + ordinality: "{{ ordinality_non_instance.stdout }}" + + when: kafka_instance is not defined or kafka_instance == '' + +- block: + + - name: Obtain the leader broker ordinality for the topic (partition) created by kafka-liveness + shell: > + kubectl exec {{ kafka_liveness_pod.stdout }} -n {{ kafka_ns }} -c kafka-consumer + -- kafka-topics --topic {{ kafka_topic }} --describe --zookeeper {{ zk_service }}:{{ zk_port }}/{{ kafka_instance }} + | grep -o 'Leader: [^[:space:]]*' | awk '{print $2}' + args: + executable: /bin/bash + register: ordinality_instance + failed_when: "ordinality_instance.rc != 0" + + - set_fact: + ordinality: "{{ ordinality_instance.stdout }}" + + when: kafka_instance is defined and kafka_instance != '' + + +- name: Determine the leader broker pod name + shell: + kubectl get pods -l {{ kafka_label }} -n {{ kafka_ns }} --no-headers -o custom-columns=:metadata.name | grep '^.*-{{ ordinality }}$' + args: + executable: /bin/bash + register: leader_broker + failed_when: "leader_broker.rc != 0" + +- name: Set the kafka broker to be subjected to chaos + set_fact: + liveness_topic_leader: "{{ leader_broker.stdout }}" + + diff --git a/utils/apps/kafka/kafka_select_broker.yml b/utils/apps/kafka/kafka_select_broker.yml new file mode 100644 index 00000000000..507fb72154e --- /dev/null +++ b/utils/apps/kafka/kafka_select_broker.yml @@ -0,0 +1,11 @@ +- name: select leader broker as per the liveness topic (partition) + set_fact: + kafka_broker: "{{ liveness_topic_leader }}" + when: kafka_stream is defined and kafka_stream != '' + +- name: allow random pod selection by chaosutil + set_fact: + kafka_broker: '' + when: kafka_stream is undefined or kafka_stream == '' + + diff --git a/utils/apps/openebs/fetch_cstor_target_pod.yml b/utils/apps/openebs/fetch_cstor_target_pod.yml new file mode 100644 index 00000000000..537c068a8bf --- /dev/null +++ b/utils/apps/openebs/fetch_cstor_target_pod.yml @@ -0,0 +1,18 @@ +--- +- name: 
Derive PV from application PVC + shell: > + kubectl get pvc {{ a_pvc }} + -o custom-columns=:spec.volumeName -n {{ a_ns }} + --no-headers + args: + executable: /bin/bash + register: pv + +- name: Pick a cStor target pod belonging to the PV + shell: > + kubectl get pods -l {{ cstor_target_pod_label }} + -n {{ openebs_ns }} --no-headers | grep {{ pv.stdout }} + | shuf -n1 | awk '{print $1}' + args: + executable: /bin/bash + register: cstor_target_pod diff --git a/utils/apps/openebs/fetch_cvr_count_from_pv.yml b/utils/apps/openebs/fetch_cvr_count_from_pv.yml new file mode 100644 index 00000000000..30b3f49bff7 --- /dev/null +++ b/utils/apps/openebs/fetch_cvr_count_from_pv.yml @@ -0,0 +1,20 @@ +--- +- name: Derive PV from application PVC + shell: > + kubectl get pvc {{ a_pvc }} + -o custom-columns=:spec.volumeName -n {{ a_ns }} + --no-headers + args: + executable: /bin/bash + register: pv + +- name: Get CVR count from pv + shell: > + kubectl get cvr -n {{ openebs_ns }} + -l openebs.io/persistent-volume={{ pv.stdout }} --no-headers | wc -l + args: + executable: /bin/bash + register: cvr_count + +- set_fact: + cvr_count: "{{ cvr_count.stdout }}" diff --git a/utils/apps/openebs/fetch_jiva_controller_pod.yml b/utils/apps/openebs/fetch_jiva_controller_pod.yml new file mode 100644 index 00000000000..b97bdffd279 --- /dev/null +++ b/utils/apps/openebs/fetch_jiva_controller_pod.yml @@ -0,0 +1,18 @@ +--- +- name: Derive PV from application PVC + shell: > + kubectl get pvc {{ a_pvc }} + -o custom-columns=:spec.volumeName -n {{ a_ns }} + --no-headers + args: + executable: /bin/bash + register: pv + +- name: Get jiva controller pod belonging to the PV + shell: > + kubectl get pods --no-headers -l {{ jiva_controller_pod_label }} -n {{ a_ns }} + -o jsonpath="{.items[?(@.metadata.labels.openebs\\.io/persistent-volume==\"{{pv.stdout}}\")].metadata.name}" + args: + executable: /bin/bash + register: jiva_controller_pod + \ No newline at end of file diff --git 
a/utils/apps/openebs/fetch_replica_count_from_sc.yml b/utils/apps/openebs/fetch_replica_count_from_sc.yml new file mode 100644 index 00000000000..c6a1c607c07 --- /dev/null +++ b/utils/apps/openebs/fetch_replica_count_from_sc.yml @@ -0,0 +1,28 @@ +--- +- name: Derive SC from application PVC + shell: > + kubectl get pvc {{ a_pvc }} + -o custom-columns=:spec.storageClassName -n {{ a_ns }} + --no-headers + args: + executable: /bin/bash + register: sc + +- name: Derive ReplicaCount from SC + shell: > + kubectl get sc {{ sc.stdout }} -n {{ openebs_ns }} --no-headers + -o jsonpath="{.metadata.annotations.cas\\.openebs\\.io\/config}" + | grep -A1 "ReplicaCount" | grep -i value | awk '{print $2}' | tr -d '"' + args: + executable: /bin/bash + register: replicacount + +- name: Set default value for replicacount if it is empty + set_fact: + replicacnt: "3" + when: "replicacount.stdout == \"\"" + +- name: Set default value for replicacount if it is non-empty + set_fact: + replicacnt: "{{ replicacount.stdout }}" + when: "replicacount.stdout != \"\"" diff --git a/utils/apps/openebs/fetch_sc_and_provisioner.yml b/utils/apps/openebs/fetch_sc_and_provisioner.yml new file mode 100644 index 00000000000..111d8d82186 --- /dev/null +++ b/utils/apps/openebs/fetch_sc_and_provisioner.yml @@ -0,0 +1,24 @@ +--- +- name: Identify the storage class used by the PVC + shell: > + kubectl get pvc {{ a_pvc }} -n {{ a_ns }} + --no-headers -o custom-columns=:spec.storageClassName + args: + executable: /bin/bash + register: storage_class + +- name: Identify the storage provisioner used by the SC + shell: > + kubectl get sc {{ storage_class.stdout }} + --no-headers -o custom-columns=:provisioner + args: + executable: /bin/bash + register: provisioner + +- name: Record the storage class name + set_fact: + sc: "{{ storage_class.stdout }}" + +- name: Record the storage provisioner name + set_fact: + stg_prov: "{{ provisioner.stdout }}" diff --git a/utils/apps/openebs/target_affinity_check.yml 
b/utils/apps/openebs/target_affinity_check.yml new file mode 100644 index 00000000000..b99d0547aff --- /dev/null +++ b/utils/apps/openebs/target_affinity_check.yml @@ -0,0 +1,61 @@ +- name: Obtain node where app pod resides + k8s_facts: + kind: Pod + label_selectors: + - "{{ a_label }}" + namespace: "{{ a_ns }}" + register: app_node + failed_when: app_node.resources | length < 1 + +- debug: + msg: "{{ app_node | json_query('resources[*].spec.nodeName')}}" + + +- name: Derive PV from application PVC + k8s_facts: + kind: PersistentVolumeClaim + name: "{{ a_pvc }}" + namespace: "{{ a_ns }}" + register: pv + failed_when: pv.resources | length < 1 + +- debug: + msg: "{{ pv | json_query('resources[*].spec.volumeName')}}" + +- name: Derive storage engine from PV + k8s_facts: + kind: PersistentVolume + name: "{{ pv | json_query('resources[0].spec.volumeName')}}" + register: stg_engine + +- debug: + msg: "{{ item.metadata.annotations['openebs.io/cas-type'] }}" + with_items: "{{ stg_engine.resources }}" + +- set_fact: + target_ns: "{{ a_ns }}" + target_label: "openebs.io/controller=jiva-controller" + when: stg_engine.resources.0.metadata.annotations['openebs.io/cas-type'] == 'jiva' + + +## TODO: Account for the case where cstor target can reside in app_ns +## For future: Leave a bool var called {{ target_in_app_ns }} as undefined + +- set_fact: + target_ns: "{{ openebs_ns }}" + target_label: "openebs.io/target=cstor-target" + when: stg_engine.resources.0.metadata.annotations['openebs.io/cas-type'] == 'cstor' and target_in_app_ns is undefined + +- name: Obtain the node where PV target pod resides + k8s_facts: + kind: Pod + namespace: "{{ target_ns }}" + label_selectors: + - "{{ target_label }}" + - "openebs.io/persistent-volume={{ pv.resources.0.spec.volumeName }}" + register: target_node + +- name: Verify whether the app & target pod co-exist on same node + debug: + msg: "App and Target affinity is maintained" + failed_when: target_node.resources.0.spec.nodeName != 
app_node.resources.0.spec.nodeName diff --git a/utils/cloud/aws/aws_configure.yml b/utils/cloud/aws/aws_configure.yml new file mode 100644 index 00000000000..14c155d0a61 --- /dev/null +++ b/utils/cloud/aws/aws_configure.yml @@ -0,0 +1,14 @@ +- name: Creates directory for aws configuration + file: + path: /root/.aws + state: directory + +- name: Creating credential file in aws directory + file: + path: /root/.aws/credentials + state: touch + +- name: Copying aws credentials from cloud_config + copy: + src: /mnt/cloud_config.yml + dest: /root/.aws/credentials \ No newline at end of file diff --git a/utils/cloud/aws/status_disk.yml b/utils/cloud/aws/status_disk.yml new file mode 100644 index 00000000000..84a007558d1 --- /dev/null +++ b/utils/cloud/aws/status_disk.yml @@ -0,0 +1,15 @@ +# ec2_vol_facts is deprecated, once python2 is upgraded to python3 in ansible runner +# we can change ec2_vol_facts to ec2_vol_info. +- name: Getting disk users + ec2_vol_facts: + filters: + volume-id: "{{ disk_name }}" + register: disk_users + +- name: Disk status check + template: + src: disk_status_check.j2 + dest: disk_status_check.yml + +- include_vars: + file: disk_status_check.yml \ No newline at end of file diff --git a/utils/cloud/gcp/gcloud_configure.yml b/utils/cloud/gcp/gcloud_configure.yml index 042a098aa60..afd8678c7bd 100644 --- a/utils/cloud/gcp/gcloud_configure.yml +++ b/utils/cloud/gcp/gcloud_configure.yml @@ -1,5 +1,5 @@ - name: authenticate gcloud service account - shell: gcloud auth activate-service-account --key-file=/mnt/cloud_config.yml + shell: gcloud auth activate-service-account --key-file=/tmp/cloud_config.yml - name: Gcloud project setting shell: gcloud config set project {{ project_id }} diff --git a/utils/common/status_app_pod.yml b/utils/common/status_app_pod.yml index e855ddee90e..5a9183a6a85 100644 --- a/utils/common/status_app_pod.yml +++ b/utils/common/status_app_pod.yml @@ -1,5 +1,5 @@ --- -- name: Checking {{ application_name }} pod is in running 
state +- name: Checking whether application pods are in running state shell: kubectl get pods -n {{ a_ns }} -l {{ a_label }} -o custom-columns=:.status.phase --no-headers register: result until: "((result.stdout.split()|unique)|length) == 1 and 'Running' in result.stdout"