#!/usr/bin/env bash
# This is a basic debug script for Kubernetes clusters
# Please use this script to collect a log bundle when opening a support request or asking for debug assistance
# Ideally this is run out of the DeepOps repo used to deploy the cluster
# However, this script will also work on a best-effort basis for any K8s cluster, DeepOps or otherwise
# This script requires a working "kubectl" and, ideally, a working "helm"
# Optionally, a working "ansible" with a config/inventory file that defines the Kubernetes nodes in a kube-node group
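
# Optional sanity check (a minimal sketch, not part of the original flow):
# warn up front if any of the tools listed above are missing, rather than
# failing on each later command. kubectl is required; helm and ansible are best-effort.
for tool in kubectl helm ansible; do
    if ! command -v "${tool}" >/dev/null 2>&1; then
        echo "WARNING: ${tool} not found in PATH; related log collection may be skipped or fail" >&2
    fi
done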

# Source common libraries and env variables
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
ROOT_DIR="${SCRIPT_DIR}/../.."
source "${ROOT_DIR}/scripts/common.sh"

timestamp=$(date +%s)
logdir=config/log_${timestamp}
mkdir -p ${logdir}

# Provisioner configuration (specific to DeepOps deployments)
cp config/inventory ${logdir}
git branch > ${logdir}/git-branch.log
git status > ${logdir}/git-status.log
git diff > ${logdir}/git-diff.log
git log --pretty=oneline | head -n 20 > ${logdir}/git-log.log
ansible --version > ${logdir}/ansible-version.log

# GPU configuration
ansible kube-node -ba "nvidia-smi" -vv > ${logdir}/nvidia-smi.log
ansible kube-node -ba "cat /etc/nvidia/gridd.conf" -vv > ${logdir}/vgpu-gridd.conf.log

# Docker configuration
ansible kube-node -ba "docker info" -vv > ${logdir}/docker-info.log
ansible kube-node -ba "cat /etc/docker/daemon.json" -vv > ${logdir}/docker-daemon.log

# Kubectl (Generic for any Kubernetes cluster)
kubectl version > ${logdir}/kubectl-version.log
kubectl get pvc -A > ${logdir}/get-pvc.log
kubectl get pv -A > ${logdir}/get-pv.log
kubectl get pods -A > ${logdir}/get-pods.log
kubectl get daemonset -A > ${logdir}/get-daemons.log
kubectl get nodes > ${logdir}/get-nodes.log
kubectl describe nodes > ${logdir}/describe-nodes.log
kubectl get storageclass > ${logdir}/get-storageclass.log
kubectl get events -A > ${logdir}/get-events.log
kubectl get svc -A > ${logdir}/get-svc.log
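
# Optional (a sketch, not in the original script): collect recent logs from any
# pod that is not Running or Completed, since failing pods are usually the first
# thing support will ask about. Column 4 of "kubectl get pods -A" is STATUS.
kubectl get pods -A --no-headers 2>/dev/null | \
    awk '$4 != "Running" && $4 != "Completed" {print $1, $2}' | \
    while read -r ns pod; do
        kubectl -n "${ns}" logs "${pod}" --all-containers=true --tail=200 \
            > "${logdir}/unhealthy-pod-${ns}-${pod}.log" 2>&1
    done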

# Kubectl / GPU Operator (Generic for any Kubernetes cluster)
for pod in $(kubectl get pods -n gpu-operator | grep nvidia-device-plugin | awk '{print $1}'); do
kubectl -n gpu-operator logs ${pod} > ${logdir}/get-plugin-logs-${pod}.log
done
for pod in $(kubectl get pods -n gpu-operator | grep gpu-feature-discovery | awk '{print $1}'); do
kubectl -n gpu-operator logs ${pod} > ${logdir}/get-plugin-logs-${pod}.log
done
for pod in $(kubectl get pods -n gpu-operator | grep nvidia-operator-validator | awk '{print $1}'); do
kubectl -n gpu-operator logs ${pod} > ${logdir}/get-plugin-logs-${pod}.log
done
for pod in $(kubectl get pods -n gpu-operator | grep driver | awk '{print $1}'); do
kubectl -n gpu-operator logs ${pod} > ${logdir}/get-plugin-logs-${pod}.log
done
for pod in $(kubectl get pods -n gpu-operator | grep mig | awk '{print $1}'); do
kubectl -n gpu-operator logs ${pod} > ${logdir}/get-plugin-logs-${pod}.log
done
kubectl describe pods -n gpu-operator > ${logdir}/describe-gpu-operator-pods.log
kubectl describe configmap -n gpu-operator default-mig-parted-config > ${logdir}/default-mig-parted-config.log
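
# Optional (a sketch, assuming the GPU Operator's ClusterPolicy CRD is present):
# dump the ClusterPolicy resource, which captures the operator's configuration.
kubectl get clusterpolicy -o yaml > "${logdir}/gpu-operator-clusterpolicy.yaml" 2>&1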

# Helm
helm list -aA > ${logdir}/helm-list.log
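
# Optional (a sketch, assuming Helm 3 "helm list -aA" column order of
# NAME NAMESPACE ...): also capture the user-supplied values for each release,
# which is often needed to reproduce chart-level issues.
helm list -aA 2>/dev/null | tail -n +2 | while read -r release namespace _; do
    helm get values "${release}" -n "${namespace}" > "${logdir}/helm-values-${namespace}-${release}.log" 2>&1
done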

# DCGM example output / metrics
# Collect metrics from all nodes for debug
ansible kube-node -vv -bm raw -a "curl http://127.0.0.1:9400/metrics" > ${logdir}/dcgm-metrics.log

# Packaging
name="config/k8s-debug-${timestamp}.tgz"
tar -zcf ${name} ${logdir}
echo "A Kubernetes/Docker log bundle has been created at ${name}"