Last active 1767947155

esunar revised this gist 1767947153. Go to revision

1 file changed, 638 insertions

health-check.sh(file created)

@@ -0,0 +1,638 @@
#!/usr/bin/env bash
set -euo pipefail
# k8s-health.sh — single-context Kubernetes health report (Markdown)
# Focus: core cluster + Calico + Istio + common apps (Redis, RabbitMQ, MinIO)
# Deps: kubectl, jq, awk, sed, grep, base64, openssl; (optional) gdate
# ----------------------------- CLI & Globals -----------------------------
# OUT_DIR is filled in by --out, or defaulted later to a timestamped directory.
OUT_DIR=""
# Per-request timeout handed to kubectl via --request-timeout.
REQUEST_TIMEOUT="${REQUEST_TIMEOUT:-8s}"
DNS_TEST="${DNS_TEST:-true}" # set false to skip ephemeral DNS test
# Certificates expiring within these many days are reported as WARN / ERROR.
TLS_WARN_DAYS="${TLS_WARN_DAYS:-14}"
TLS_ERR_DAYS="${TLS_ERR_DAYS:-7}"

# The thresholds are later compared with `[ ... -le ... ]`; a non-integer value
# would make that test error out and silently leave the severity at INFO, so
# reject bad overrides up front.
for _threshold in TLS_WARN_DAYS TLS_ERR_DAYS; do
  if ! [[ "${!_threshold}" =~ ^[0-9]+$ ]]; then
    echo "${_threshold} must be a non-negative integer (got '${!_threshold}')" >&2
    exit 3
  fi
done
unset _threshold
# usage — print the command-line help to stdout.
# The text lists every flag handled by the argument parser and every
# environment override read in the globals section (including DNS_TEST).
usage() {
  cat <<EOF
Usage: $0 [--context NAME] [--namespace NS] [--out DIR] [--no-dns]
Options:
  --context NAME   Use a specific kubeconfig context (default: current)
  --namespace NS   Limit app checks to a namespace (default: all)
  --out DIR        Output directory (default: ./k8s-health-<timestamp>)
  --no-dns         Skip ephemeral DNS resolution test
Env overrides:
  REQUEST_TIMEOUT (default: 8s) | TLS_WARN_DAYS (14) | TLS_ERR_DAYS (7) | DNS_TEST (true)
EOF
}
# Extra kubectl arguments selecting the context (empty = current context).
CTX_ARGS=()
# Namespace given with --namespace (empty = all namespaces).
APP_NS=""

# parse_args ARGS...
# Consumes the command line and fills CTX_ARGS, APP_NS, OUT_DIR and DNS_TEST.
# Exits 0 after printing help, exits 3 on an unknown flag or when a flag that
# takes a value is given without one (previously this surfaced as an opaque
# "unbound variable" error from `set -u`).
parse_args() {
  while (( $# > 0 )); do
    case "$1" in
      --context|--namespace|--out)
        if (( $# < 2 )) || [[ -z "$2" ]]; then
          echo "Missing value for $1 (see --help)" >&2
          exit 3
        fi
        case "$1" in
          --context)   CTX_ARGS+=(--context "$2") ;;
          --namespace) APP_NS="$2" ;;
          --out)       OUT_DIR="$2" ;;
        esac
        shift # drop the flag; the common shift below drops its value
        ;;
      --no-dns)
        DNS_TEST=false
        ;;
      -h|--help)
        usage
        exit 0
        ;;
      *)
        echo "Unknown arg: $1" >&2
        usage
        exit 3
        ;;
    esac
    shift
  done
}
parse_args "$@"
# ts_now — print the current time as an ISO-8601 timestamp.
# `date -Is` is not available in every date implementation; when it fails,
# fall back to a plain strftime format in UTC.
ts_now() {
  date -Is 2>/dev/null || date -u +%Y-%m-%dT%H:%M:%SZ
}
# to_ts DATE_STRING
# Print DATE_STRING converted to epoch seconds, or 0 when it cannot be parsed.
# Parsers are tried in order: gdate (if installed), `date -d`, and finally a
# `date -j -f` form for date implementations without -d.
to_ts() {
  local d="${1:-}" ts=""
  # An empty string is "unparsable": some `date -d ""` implementations would
  # otherwise happily return today's date.
  if [[ -z "$d" ]]; then
    echo 0
    return 0
  fi
  if command -v gdate >/dev/null 2>&1; then
    ts="$(gdate -d "$d" +%s 2>/dev/null)" || ts=""
  fi
  if [[ -z "$ts" ]]; then
    ts="$(date -d "$d" +%s 2>/dev/null)" || ts=""
  fi
  if [[ -z "$ts" ]]; then
    # NOTE(review): format matches openssl's "notAfter" layout
    # (e.g. "Jan  1 00:00:00 2030 GMT"); confirm on a BSD/macOS host.
    ts="$(date -j -u -f '%b %d %T %Y %Z' "$d" +%s 2>/dev/null)" || ts=""
  fi
  printf '%s\n' "${ts:-0}"
}
# days_until DATE_STRING
# Print the number of whole days from now until DATE_STRING (parsed by to_ts).
# The result is floored, so a date that is already in the past — even by less
# than a day — yields a negative number instead of 0.
days_until() {
  local end_ts now_ts delta days
  end_ts="$(to_ts "$1")"
  now_ts="$(date +%s)"
  delta=$(( end_ts - now_ts ))
  # Shell division truncates toward zero; step down one more day when a
  # negative remainder was discarded.
  days=$(( delta / 86400 ))
  if (( delta % 86400 < 0 )); then
    days=$(( days - 1 ))
  fi
  echo "$days"
}
# to_int VALUE
# Normalize a possibly multi-line / non-numeric value to a single integer.
# Only the first whitespace-delimited token is considered, and the first run
# of digits in it (with an optional leading minus) is printed. Prints 0 when
# no integer is found. Examples: "0\n0" -> 0, "28+" -> 28, "3.7" -> 3,
# "  5" -> 5, "null" -> 0.
to_int() {
  local v="${1:-}"
  local int_re='-?[0-9]+'
  # Drop leading whitespace, then everything from the next whitespace on.
  v="${v#"${v%%[![:space:]]*}"}"
  v="${v%%[[:space:]]*}"
  if [[ "$v" =~ $int_re ]]; then
    printf '%s' "${BASH_REMATCH[0]}"
  else
    printf '0'
  fi
}
# Hard requirements: every section of the report needs these two.
for _cmd in kubectl jq; do
  if ! command -v "$_cmd" >/dev/null 2>&1; then
    echo "$_cmd not found" >&2
    exit 3
  fi
done
# Soft requirements: only the TLS expiry section decodes certificates.
for _cmd in base64 openssl; do
  if ! command -v "$_cmd" >/dev/null 2>&1; then
    echo "warning: $_cmd not found; TLS certificate expiry checks cannot run" >&2
  fi
done
unset _cmd

if [[ -z "$OUT_DIR" ]]; then
  OUT_DIR="./k8s-health-$(date +%Y%m%d-%H%M%S)"
fi
ART="$OUT_DIR/artifacts"
if ! mkdir -p "$ART"; then
  echo "Cannot create output directory: $ART" >&2
  exit 3
fi
# Artifacts are raw dumps of cluster state; keep them private to the caller.
chmod 700 "$ART" 2>/dev/null || true
REPORT="$OUT_DIR/report.md"
JSONL="$OUT_DIR/summary.jsonl"
# Truncate rather than touch: when --out points at a directory from an earlier
# run, leftover findings in summary.jsonl would otherwise be counted again by
# the final rollup and skew the exit code.
: > "$REPORT"
: > "$JSONL"
# ----------------------------- Emit Helpers ------------------------------
# emit_json LEVEL AREA CHECK MESSAGE [HINT]
# Append one finding to $JSONL as a single compact JSON object with the keys
# ts, level, area, check, message, hint (in that order). All fields are passed
# to jq as string arguments, so quotes, backslashes and newlines are escaped
# and no stray trailing newline is added to MESSAGE or HINT. The compact form
# keeps `"level":"ERROR"` greppable, which the final rollup relies on.
emit_json() {
  jq -cn \
    --arg ts "$(ts_now)" \
    --arg level "$1" \
    --arg area "$2" \
    --arg check "$3" \
    --arg message "${4:-}" \
    --arg hint "${5:-}" \
    '{ts: $ts, level: $level, area: $area, check: $check, message: $message, hint: $hint}' \
    >> "$JSONL"
}
# Markdown writers; each one appends to $REPORT.
# Headings and key/value bullets print their arguments verbatim (no backslash
# interpretation), so names containing "\" are not mangled.
emit_md_h1() { printf '# %s\n\n' "$1" >> "$REPORT"; }
emit_md_h2() { printf '## %s\n\n' "$1" >> "$REPORT"; }
emit_md_h3() { printf '### %s\n\n' "$1" >> "$REPORT"; }
emit_md_kv() { printf '%s\n' "- **$1:** $2" >> "$REPORT"; }
# emit_md_code TEXT — fenced code block. Backslash escapes in TEXT are
# expanded on purpose: callers build multi-line bodies with a literal "\n".
emit_md_code() { printf '\n```\n%b\n```\n\n' "$1" >> "$REPORT"; }
# ----------------------------- Prefetch Cache ----------------------------
# Snapshot cluster state into $ART once; later sections read these files
# instead of querying the API server again.
echo "Collecting cluster state..."
# The ${arr[@]+...} form keeps an empty CTX_ARGS from tripping `set -u` on
# older bash releases.
KUBECTL=(kubectl ${CTX_ARGS[@]+"${CTX_ARGS[@]}"})

# Reachability probe: use a lightweight raw call instead of 'kubectl version',
# which can fail for reasons unrelated to reachability; fall back to a simple
# resource list in case the /version endpoint is blocked by a proxy.
if ! "${KUBECTL[@]}" --request-timeout="$REQUEST_TIMEOUT" get --raw='/version' >/dev/null 2>&1 \
  && ! "${KUBECTL[@]}" --request-timeout="$REQUEST_TIMEOUT" get nodes -o name >/dev/null 2>&1; then
  echo "Cannot reach cluster with kubectl" >&2
  exit 3
fi

# Best effort: a failing 'kubectl version' must not abort the run under set -e;
# the report header copes with an empty file.
"${KUBECTL[@]}" version -o json > "$ART/version.json" 2>/dev/null || true
"${KUBECTL[@]}" api-resources > "$ART/apiresources.txt" || true

# Mandatory snapshots: without these the report is meaningless, so a failure
# here stops the script (kubectl's own error message is left visible).
"${KUBECTL[@]}" get nodes -o json > "$ART/nodes.json"
if [[ -n "$APP_NS" ]]; then
  ns_arg=(-n "$APP_NS")
else
  ns_arg=(--all-namespaces)
fi
"${KUBECTL[@]}" get pods "${ns_arg[@]}" -o json > "$ART/pods.json"
"${KUBECTL[@]}" get ns -o json > "$ART/namespaces.json"
"${KUBECTL[@]}" get events --all-namespaces --sort-by=.lastTimestamp -o json \
  --request-timeout="$REQUEST_TIMEOUT" > "$ART/events.json" || true
"${KUBECTL[@]}" get svc --all-namespaces -o json > "$ART/svc.json"
"${KUBECTL[@]}" get endpoints --all-namespaces -o json > "$ART/endpoints.json"

# Optional snapshots (resource kinds may be absent or forbidden).
"${KUBECTL[@]}" get endpointslices.discovery.k8s.io --all-namespaces -o json > "$ART/epslices.json" 2>/dev/null || true
"${KUBECTL[@]}" get deploy,ds,sts,job,cronjob,hpa,pdb --all-namespaces -o json > "$ART/workloads.json" 2>/dev/null || true
"${KUBECTL[@]}" get pvc --all-namespaces -o json > "$ART/pvc.json" 2>/dev/null || true
"${KUBECTL[@]}" get pv -o json > "$ART/pv.json" 2>/dev/null || true
"${KUBECTL[@]}" get storageclasses.storage.k8s.io -o json > "$ART/sc.json" 2>/dev/null || true
# Secrets: the report only needs namespace, name, type and the public
# certificate of TLS secrets. Fetch just that type and keep an explicit
# allow-list of fields so private keys, tokens and other secret payloads are
# never written to disk.
{
  "${KUBECTL[@]}" get secrets --all-namespaces \
    --field-selector type=kubernetes.io/tls -o json \
    | jq '{items: [.items[]? | {
            type,
            metadata: {namespace: .metadata.namespace, name: .metadata.name},
            data: {"tls.crt": (.data["tls.crt"] // "")}
          }]}'
} > "$ART/secrets.json" 2>/dev/null || true
"${KUBECTL[@]}" get csidrivers.storage.k8s.io,csinodes.storage.k8s.io -o json > "$ART/csi.json" 2>/dev/null || true

# Istio + Calico artifacts (best effort)
"${KUBECTL[@]}" -n istio-system get deploy,ds,pods,svc -o wide > "$ART/istio_ls.txt" 2>/dev/null || true
"${KUBECTL[@]}" -n calico-system get deploy,ds,pods -o wide > "$ART/calico_ls.txt" 2>/dev/null || true
"${KUBECTL[@]}" get crd tenants.minio.min.io rabbitmqclusters.rabbitmq.com 2>/dev/null | sed '1d' > "$ART/app_crds.txt" || true
# Cache Istio Gateways (best effort)
"${KUBECTL[@]}" get gateway.networking.istio.io --all-namespaces -o json > "$ART/istio_gateways.json" 2>/dev/null || true
# ----------------------------- Report Header -----------------------------
# Server/client versions come from the cached version.json. An empty file or
# missing fields yield "unknown" (jq exits 0 with no output on an empty file,
# so a plain `|| echo unknown` would not catch that case).
cluster_server="$(jq -r '
  (.serverVersion // {}) as $s
  | if ($s.gitVersion // "") == "" then "unknown"
    else $s.gitVersion + " (" + ($s.platform // "?") + ")"
    end' "$ART/version.json" 2>/dev/null || true)"
cluster_server="${cluster_server:-unknown}"
client_ver="$(jq -r '.clientVersion.gitVersion // "unknown"' "$ART/version.json" 2>/dev/null || true)"
client_ver="${client_ver:-unknown}"

# When --context was given, the report is about that context; only ask
# kubeconfig for its current-context when no override is in effect.
if (( ${#CTX_ARGS[@]} >= 2 )); then
  ctx_name="${CTX_ARGS[${#CTX_ARGS[@]}-1]}"
else
  ctx_name="$("${KUBECTL[@]}" config current-context 2>/dev/null || echo "current")"
fi

emit_md_h1 "Kubernetes Health Report — ${ctx_name}"
emit_md_kv "Generated" "$(ts_now)"
emit_md_kv "kubectl client" "$client_ver"
emit_md_kv "APIServer" "$cluster_server"
emit_md_kv "Namespace scope (apps)" "${APP_NS:-all}"
echo "" >> "$REPORT"
# ----------------------------- Versions & Skew ----------------------------
emit_md_h2 "Cluster Versions & Skew"
node_versions=$(jq -r '.items[]?.status.nodeInfo.kubeletVersion' "$ART/nodes.json" | sort | uniq -c | sed 's/^/ /')
emit_md_code "Kubelet versions:\n${node_versions}"

# Server minor: take the leading digits only, so values with a suffix such as
# "28+" still parse. Empty output means "unknown" (e.g. empty version.json).
server_minor="$(jq -r '
  ((.serverVersion.minor // "") | tostring | capture("(?<m>[0-9]+)").m) // ""
' "$ART/version.json" 2>/dev/null || true)"

# Lowest and highest kubelet minor across all nodes, as "MIN MAX"
# (empty when no node reports a parsable version).
kubelet_range="$(jq -r '
  [ .items[]?.status.nodeInfo.kubeletVersion // ""
    | capture("v(?<maj>[0-9]+)\\.(?<min>[0-9]+)").min
    | tonumber ]
  | if length == 0 then "" else "\(min) \(max)" end
' "$ART/nodes.json" 2>/dev/null || true)"
first_kubelet_minor=""
last_kubelet_minor=""
read -r first_kubelet_minor last_kubelet_minor <<<"$kubelet_range" || true

if [[ -z "$server_minor" || -z "$kubelet_range" ]]; then
  # Do not compare against a made-up 0: that would raise a bogus skew error.
  echo "- **Version Skew:** ⚠️ could not determine (server minor='${server_minor:-?}', kubelet min/max='${kubelet_range:-?}')" >> "$REPORT"
else
  sm="$(to_int "$server_minor")"
  fm="$(to_int "$first_kubelet_minor")"
  lm="$(to_int "$last_kubelet_minor")"
  # Distance of both the oldest and the newest kubelet from the API server;
  # the larger of the two is the skew that gets judged.
  skew_hi=$(( lm - sm )); skew_hi="${skew_hi#-}"
  skew_lo=$(( fm - sm )); skew_lo="${skew_lo#-}"
  abs_skew="$skew_hi"
  if [ "$skew_lo" -gt "$abs_skew" ]; then abs_skew="$skew_lo"; fi
  if [ "$abs_skew" -gt 1 ]; then
    emit_json "ERROR" "version" "skew" "Kubelet/APIServer minor skew > 1 (server minor ${sm}, kubelet min/max ${fm}/${lm})" "Align versions per K8s skew policy."
    echo "- **Version Skew:** ❌ kubelet/APIServer minor skew > 1 (server=${sm}, kubelet min/max=${fm}/${lm})" >> "$REPORT"
  else
    echo "- **Version Skew:** ✅ within supported range (server=${sm}, kubelet min/max=${fm}/${lm})" >> "$REPORT"
  fi
fi
echo "" >> "$REPORT"
# ----------------------------- API Server Health --------------------------
emit_md_h2 "API Server Ready/Liveness"
readyz="$ART/readyz.txt"
livez="$ART/livez.txt"
# Capture the exit codes without tripping set -e; the request timeout keeps a
# stalled API server from hanging the report.
r_rc=0
"${KUBECTL[@]}" --request-timeout="$REQUEST_TIMEOUT" get --raw='/readyz?verbose' >"$readyz" 2>&1 || r_rc=$?
l_rc=0
"${KUBECTL[@]}" --request-timeout="$REQUEST_TIMEOUT" get --raw='/livez?verbose' >"$livez" 2>&1 || l_rc=$?
emit_md_code "readyz:\n$(cat "$readyz" 2>/dev/null || true)"
emit_md_code "livez:\n$(cat "$livez" 2>/dev/null || true)"

# In the verbose output each check is one line; failed checks are the lines
# starting with "[-]". Counting those avoids matching the word "fail" inside
# unrelated text.
fail_cnt=$(grep -c '^\[-\]' "$readyz" 2>/dev/null || true)
fail_cnt="$(to_int "${fail_cnt:-0}")"
if [ "$r_rc" -ne 0 ] || [ "$fail_cnt" -gt 0 ]; then
  emit_json "ERROR" "control-plane" "readyz" "APIServer readyz reports failures" "Check control-plane component health."
  echo "- **APIServer readyz:** ❌ failures detected" >> "$REPORT"
else
  echo "- **APIServer readyz:** ✅ ok" >> "$REPORT"
fi
if [ "$l_rc" -ne 0 ]; then
  emit_json "ERROR" "control-plane" "livez" "APIServer livez not reachable" ""
  echo "- **APIServer livez:** ❌ unreachable" >> "$REPORT"
else
  echo "- **APIServer livez:** ✅ ok" >> "$REPORT"
fi
echo "" >> "$REPORT"
# ----------------------------- Nodes -------------------------------------
# All three checks read the cached nodes.json. The jq filters tolerate nodes
# without a .status.conditions array, so one odd node cannot abort the script.
emit_md_h2 "Node Health"
nodes_json="$ART/nodes.json"

# Nodes whose Ready condition is anything other than "True" (or missing).
not_ready=$(jq -r '
  .items[]?
  | select(([.status.conditions[]? | select(.type == "Ready")][0].status) != "True")
  | .metadata.name' "$nodes_json")
if [[ -n "$not_ready" ]]; then
  emit_json "ERROR" "nodes" "ready" "Some nodes NotReady" "$not_ready"
  echo "- **Ready:** ❌ NotReady nodes present:" >> "$REPORT"
  echo "$not_ready" | sed 's/^/ - /' >> "$REPORT"
else
  echo "- **Ready:** ✅ all nodes Ready" >> "$REPORT"
fi

# One line per active pressure condition: node <TAB> type <TAB> message.
pressures=$(jq -r '
  .items[]? as $n
  | ($n.status.conditions // [])[]
  | select((.type == "DiskPressure" or .type == "MemoryPressure" or .type == "PIDPressure") and .status == "True")
  | "\($n.metadata.name)\t\(.type)\t\(.message)"' "$nodes_json")
if [[ -n "$pressures" ]]; then
  emit_json "WARN" "nodes" "pressure" "Node pressure conditions detected" "$pressures"
  echo "- **Pressure:** ⚠️" >> "$REPORT"
  echo "$pressures" | sed 's/^/ - /' >> "$REPORT"
else
  echo "- **Pressure:** ✅ none" >> "$REPORT"
fi

# Nodes with spec.unschedulable set.
unsched=$(jq -r '.items[]? | select(.spec.unschedulable == true) | .metadata.name' "$nodes_json")
if [[ -n "$unsched" ]]; then
  emit_json "WARN" "nodes" "unschedulable" "Unschedulable nodes present" "$unsched"
  echo "- **Unschedulable:** ⚠️ $(echo "$unsched" | tr '\n' ' ')" >> "$REPORT"
else
  echo "- **Unschedulable:** ✅ none" >> "$REPORT"
fi
echo "" >> "$REPORT"
# ----------------------------- Networking (DNS + Calico) ------------------
emit_md_h2 "Networking & DNS"
# CoreDNS deployment status. A comma in a label selector means AND, so the two
# well-known labels are tried one after the other instead of in one selector.
core_dns=""
dn_found=0
for dns_selector in "k8s-app=kube-dns" "app.kubernetes.io/name=coredns"; do
  core_dns=$("${KUBECTL[@]}" -n kube-system get deploy -l "$dns_selector" -o json 2>/dev/null || true)
  dn_found="$(to_int "$(jq -r '.items | length' <<<"$core_dns" 2>/dev/null || true)")"
  if [ "$dn_found" -gt 0 ]; then break; fi
done
if [ "$dn_found" -eq 0 ]; then
  echo "- **CoreDNS:** (deployment not found in kube-system)" >> "$REPORT"
else
  dn_unavail=$(jq -r '([.items[]? | .status.unavailableReplicas // 0] | add) // 0' <<<"$core_dns" 2>/dev/null || echo 0)
  dn_unavail="$(to_int "$dn_unavail")"
  if [ "$dn_unavail" -gt 0 ]; then
    emit_json "ERROR" "networking" "coredns" "CoreDNS has unavailable replicas" ""
    echo "- **CoreDNS:** ❌ unavailable replicas: $dn_unavail" >> "$REPORT"
  else
    echo "- **CoreDNS:** ✅ deployment healthy" >> "$REPORT"
  fi
fi

# Optional ephemeral DNS nslookup test
if [[ "$DNS_TEST" == "true" ]]; then
  echo "- **DNS test:** running ephemeral busybox nslookup ..." >> "$REPORT"
  dns_pod="dnscheck-$$"
  run_rc=0
  # All kubectl flags must come before `--`; anything after it is handed to
  # the container. --attach together with --restart=Never makes kubectl wait
  # for the pod and return the container's exit code, so run_rc reflects the
  # nslookup result rather than just "pod object created".
  "${KUBECTL[@]}" run "$dns_pod" --image=busybox:1.36 --restart=Never \
    --image-pull-policy=IfNotPresent --attach --quiet --pod-running-timeout=60s \
    --command -- nslookup kubernetes.default.svc.cluster.local \
    >/dev/null 2>&1 </dev/null || run_rc=$?
  "${KUBECTL[@]}" delete pod "$dns_pod" --now --wait=false >/dev/null 2>&1 || true
  if [ "$run_rc" -ne 0 ]; then
    emit_json "ERROR" "networking" "dns" "In-pod DNS resolution failed" "Check CoreDNS, network policies, kube-dns Service."
    echo " ❌ DNS resolution failed" >> "$REPORT"
  else
    echo " ✅ DNS resolution ok" >> "$REPORT"
  fi
else
  echo "- **DNS test:** (skipped)" >> "$REPORT"
fi
echo "" >> "$REPORT"
# Calico basic health
emit_md_h3 "Calico"
# Look for the calico-node DaemonSet in calico-system first and in kube-system
# second. NOTE(review): kube-system is assumed to be where non-operator
# installs place it — confirm for the clusters this runs against.
calico_ns="calico-system"
calico_ds=""
for cand_ns in calico-system kube-system; do
  calico_ds=$("${KUBECTL[@]}" -n "$cand_ns" get ds calico-node -o json 2>/dev/null || true)
  if [[ -n "$calico_ds" ]]; then
    calico_ns="$cand_ns"
    break
  fi
done
if [[ -n "$calico_ds" ]]; then
  desire=$(jq -r '.status.desiredNumberScheduled // 0' <<<"$calico_ds")
  ready=$(jq -r '.status.numberReady // 0' <<<"$calico_ds")
  desire="$(to_int "$desire")"; ready="$(to_int "$ready")"
  if [ "$ready" -lt "$desire" ]; then
    emit_json "ERROR" "calico" "daemonset" "calico-node not fully Ready ($ready/$desire)" "Check calico-node pods and CNI errors."
    echo "- **calico-node:** ❌ $ready/$desire Ready" >> "$REPORT"
  else
    echo "- **calico-node:** ✅ $ready/$desire Ready" >> "$REPORT"
  fi
else
  echo "- **calico-node:** (DaemonSet not found)" >> "$REPORT"
fi

# Typha is optional. An empty list must not be reported as "healthy".
typha=$("${KUBECTL[@]}" -n "$calico_ns" get deploy -l k8s-app=calico-typha -o json 2>/dev/null || true)
typha_count="$(to_int "$(jq -r '.items | length' <<<"$typha" 2>/dev/null || true)")"
if [ "$typha_count" -gt 0 ]; then
  unavail=$(jq -r '[.items[]? | .status.unavailableReplicas // 0] | add // 0' <<<"$typha")
  unavail="$(to_int "$unavail")"
  if [ "$unavail" -gt 0 ]; then
    emit_json "WARN" "calico" "typha" "Calico Typha unavailable replicas: $unavail" ""
    echo "- **calico-typha:** ⚠️ unavailable: $unavail" >> "$REPORT"
  else
    echo "- **calico-typha:** ✅ healthy" >> "$REPORT"
  fi
else
  echo "- **calico-typha:** (not deployed)" >> "$REPORT"
fi
echo "" >> "$REPORT"
# ----------------------------- Storage & CSI ------------------------------
emit_md_h2 "Storage"
sc_json="$ART/sc.json"
if [[ -s "$sc_json" ]]; then
  # StorageClasses flagged as default through either the current or the beta
  # annotation key.
  defaults=$(jq -r '
    .items[]?
    | (.metadata.annotations // {}) as $a
    | select($a["storageclass.kubernetes.io/is-default-class"] == "true"
             or $a["storageclass.beta.kubernetes.io/is-default-class"] == "true")
    | .metadata.name' "$sc_json")
  if [[ -z "$defaults" ]]; then
    emit_json "WARN" "storage" "default-sc" "No default StorageClass set" "Annotate one SC as default."
    echo "- **Default StorageClass:** ⚠️ none set" >> "$REPORT"
  else
    default_count=$(grep -c '' <<<"$defaults" || true)
    default_list="${defaults//$'\n'/, }"
    if [ "$(to_int "$default_count")" -gt 1 ]; then
      # More than one default makes the class chosen for a PVC ambiguous.
      emit_json "WARN" "storage" "default-sc" "Multiple default StorageClasses set" "$defaults"
      echo "- **Default StorageClass:** ⚠️ multiple defaults: $default_list" >> "$REPORT"
    else
      echo "- **Default StorageClass:** ✅ $default_list" >> "$REPORT"
    fi
  fi
fi

pvc_pending=$(jq -r '.items[]? | select(.status.phase == "Pending") | .metadata.namespace + "/" + .metadata.name' "$ART/pvc.json" 2>/dev/null || true)
if [[ -n "$pvc_pending" ]]; then
  emit_json "ERROR" "storage" "pvc" "Pending PVCs detected" "$pvc_pending"
  # Header and list are written separately: plain echo does not expand "\n".
  echo "- **PVCs:** ❌ Pending:" >> "$REPORT"
  echo "$pvc_pending" | sed 's/^/ - /' >> "$REPORT"
else
  echo "- **PVCs:** ✅ none Pending" >> "$REPORT"
fi
echo "" >> "$REPORT"
# ----------------------------- Workloads ---------------------------------
emit_md_h2 "Workloads"

# Pending pods >5m, with the PodScheduled reason when one is recorded.
pending=$(jq -r '
  .items[]?
  | select(.status.phase == "Pending")
  | select((now - (.metadata.creationTimestamp | fromdate)) > 300)
  | .metadata.namespace + "/" + .metadata.name + " — "
    + ((.status.conditions // [] | map(select(.type == "PodScheduled"))[0].reason) // "Pending")
' "$ART/pods.json")
if [[ -n "$pending" ]]; then
  emit_json "ERROR" "workloads" "pending" "Pending pods >5m" "$pending"
  echo "- **Pending Pods (>5m):** ❌" >> "$REPORT"
  echo "$pending" | sed 's/^/ - /' >> "$REPORT"
else
  echo "- **Pending Pods (>5m):** ✅ none" >> "$REPORT"
fi

# CrashLoop / high restarts: any container restarted three times or more.
crash=$(jq -r '
  .items[]? as $p
  | ($p.status.containerStatuses // [])[]
  | select((.restartCount // 0) >= 3)
  | "\($p.metadata.namespace)/\($p.metadata.name) — \(.name) restarts=\(.restartCount) lastState=\(.lastState|tojson)"
' "$ART/pods.json")
if [[ -n "$crash" ]]; then
  emit_json "WARN" "workloads" "restarts" "Containers with >=3 restarts" "$crash"
  echo "- **High Restarts (>=3):** ⚠️" >> "$REPORT"
  echo "$crash" | sed 's/^/ - /' >> "$REPORT"
else
  echo "- **High Restarts (>=3):** ✅ none" >> "$REPORT"
fi

# Deployments with unavailable replicas (from the best-effort workloads dump).
unavail=$(jq -r '
  .items[]?
  | select(.kind == "Deployment")
  | select((.status.unavailableReplicas // 0) > 0)
  | .metadata.namespace + "/" + .metadata.name + " — unavailable=" + (.status.unavailableReplicas | tostring)
' "$ART/workloads.json" 2>/dev/null || true)
if [[ -n "$unavail" ]]; then
  emit_json "ERROR" "workloads" "deploy-unavailable" "Deployments with unavailable replicas" "$unavail"
  # Header and list are written separately: plain echo does not expand "\n".
  echo "- **Deployments:** ❌ unavailable replicas:" >> "$REPORT"
  echo "$unavail" | sed 's/^/ - /' >> "$REPORT"
else
  echo "- **Deployments:** ✅ all available" >> "$REPORT"
fi
echo "" >> "$REPORT"
# ----------------------------- Services & Endpoints -----------------------
emit_md_h2 "Services & Endpoints"
# Services that have a selector but whose Endpoints object is missing or has
# no subsets. Selector-less Services (e.g. ExternalName) are skipped.
# Done in a single jq pass: the Endpoints dump is loaded once and turned into
# a "namespace/name" -> subset-count lookup, instead of re-parsing the whole
# file for every Service.
svc_zero=$(jq -r --slurpfile eps "$ART/endpoints.json" '
  ( ($eps[0].items // [])
    | map({ key: (.metadata.namespace + "/" + .metadata.name),
            value: ((.subsets // []) | length) })
    | from_entries ) as $subset_count
  | .items[]?
  | select(.spec.selector != null)
  | (.metadata.namespace + "/" + .metadata.name) as $key
  | select(($subset_count[$key] // 0) == 0)
  | $key
' "$ART/svc.json" 2>/dev/null || true)
if [[ -n "$svc_zero" ]]; then
  emit_json "ERROR" "networking" "svc-no-endpoints" "Services with zero Endpoints" "$svc_zero"
  echo "- **Services with 0 endpoints:** ❌" >> "$REPORT"
  echo "$svc_zero" | sed 's/^/ - /' >> "$REPORT"
else
  echo "- **Services with 0 endpoints:** ✅ none" >> "$REPORT"
fi
echo "" >> "$REPORT"
# ----------------------------- TLS Secret Expiry -------------------------
emit_md_h2 "TLS Certificates (Secrets)"
# Build a set of TLS secrets actually referenced by Istio Gateways
# (credentialName), one "gateway-namespace<TAB>secret-name" row per reference.
ISTIO_GW_SECRETS_FILE="$ART/istio_gateway_tls_secrets.tsv"
: > "$ISTIO_GW_SECRETS_FILE"
if [[ -s "$ART/istio_gateways.json" ]]; then
  jq -r '
    .items[]?
    | .metadata.namespace as $ns
    | (.spec.servers // [])[]
    | select(.tls.credentialName != null)
    | [$ns, .tls.credentialName]
    | @tsv
  ' "$ART/istio_gateways.json" 2>/dev/null | sort -u > "$ISTIO_GW_SECRETS_FILE" || true
fi

tls_list=$(jq -r '.items[]?|select(.type=="kubernetes.io/tls")|.metadata.namespace + "\t" + .metadata.name + "\t" + (.data["tls.crt"]//"")' "$ART/secrets.json" 2>/dev/null || true)
if [[ -n "$tls_list" ]]; then
  exp_rows_inuse=""
  exp_rows_unused=""
  while IFS=$'\t' read -r ns name b64; do
    if [[ -z "$b64" ]]; then continue; fi
    crt="$ART/${ns}_${name}.crt"
    printf '%s' "$b64" | base64 -d > "$crt" 2>/dev/null || continue
    # A certificate openssl cannot read is skipped. The `|| true` matters:
    # without it a single bad secret would end the whole script under set -e.
    end="$(openssl x509 -enddate -noout -in "$crt" 2>/dev/null || true)"
    end="${end#*=}" # "notAfter=<date>" -> "<date>"
    if [[ -z "$end" ]]; then continue; fi
    days=$(days_until "$end"); days="$(to_int "$days")"

    # In use when a Gateway in the same namespace names this secret, or when
    # the secret lives in istio-system and any Gateway names it.
    # NOTE(review): the second rule assumes the ingress gateway workload (the
    # thing that actually loads credentialName) runs in istio-system, as the
    # Istio section of this script does — confirm for custom gateway namespaces.
    # awk does an exact string compare (no regex, no GNU-only grep -P).
    in_use="no"
    if awk -F'\t' -v ns="$ns" -v n="$name" '
         $2 == n && ($1 == ns || ns == "istio-system") { found = 1; exit }
         END { exit !found }' "$ISTIO_GW_SECRETS_FILE" 2>/dev/null; then
      in_use="yes"
    fi

    if [ "$in_use" = "yes" ]; then
      # Severity: only for IN-USE secrets
      level="INFO"
      if [ "$days" -le "$TLS_WARN_DAYS" ]; then level="WARN"; fi
      if [ "$days" -le "$TLS_ERR_DAYS" ]; then level="ERROR"; fi
      exp_rows_inuse+="$ns/$name — expires in ${days}d (${level}) [IN-USE]"$'\n'
      if [ "$level" = "ERROR" ]; then
        emit_json "ERROR" "security" "tls-expiry" "$ns/$name expiring in ${days}d [IN-USE]" "Referenced by an Istio Gateway; renew certificate."
      elif [ "$level" = "WARN" ]; then
        emit_json "WARN" "security" "tls-expiry" "$ns/$name expiring in ${days}d [IN-USE]" "Plan renewal."
      fi
    else
      # UNUSED secrets: do NOT alert; just list under an informational subheader
      exp_rows_unused+="$ns/$name — expires in ${days}d [unused]"$'\n'
    fi
  done <<< "$tls_list"

  # The row buffers already end in a newline; printf '%s' avoids the extra
  # empty bullet that echo would append.
  if [[ -n "$exp_rows_inuse" ]]; then
    echo "- **TLS expiries (in-use secrets):**" >> "$REPORT"
    printf '%s' "$exp_rows_inuse" | sed 's/^/ - /' >> "$REPORT"
  else
    echo "- **TLS expiries (in-use secrets):** none" >> "$REPORT"
  fi
  # Print UNUSED secrets as information only
  emit_md_h3 "Unused Secrets"
  if [[ -n "$exp_rows_unused" ]]; then
    printf '%s' "$exp_rows_unused" | sed 's/^/ - /' >> "$REPORT"
  else
    echo " - none" >> "$REPORT"
  fi
else
  echo "- **TLS expiries (in-use secrets):** (no kubernetes.io/tls secrets found)" >> "$REPORT"
  emit_md_h3 "Unused Secrets"
  echo " - none" >> "$REPORT"
fi
echo "" >> "$REPORT"
# ----------------------------- Istio Checks ------------------------------
emit_md_h2 "Istio"
# istiod deployment
istiod=$("${KUBECTL[@]}" -n istio-system get deploy istiod -o json 2>/dev/null || true)
if [[ -n "$istiod" ]]; then
  un=$(jq -r '.status.unavailableReplicas // 0' <<<"$istiod")
  un="$(to_int "$un")"
  if [ "$un" -gt 0 ]; then
    emit_json "ERROR" "istio" "istiod" "istiod has unavailable replicas: $un" ""
    echo "- **istiod:** ❌ unavailable=$un" >> "$REPORT"
  else
    echo "- **istiod:** ✅ healthy" >> "$REPORT"
  fi
else
  echo "- **istiod:** (not found)" >> "$REPORT"
fi

# ingress gateway (classic). A label query returns an empty list rather than
# an error when nothing matches, so count the items before calling it healthy.
igw=$("${KUBECTL[@]}" -n istio-system get deploy -l app=istio-ingressgateway -o json 2>/dev/null || true)
igw_count="$(to_int "$(jq -r '.items | length' <<<"$igw" 2>/dev/null || true)")"
if [ "$igw_count" -gt 0 ]; then
  un=$(jq -r '[.items[]? | .status.unavailableReplicas // 0] | add // 0' <<<"$igw")
  un="$(to_int "$un")"
  if [ "$un" -gt 0 ]; then
    emit_json "WARN" "istio" "ingressgateway" "IngressGateway unavailable: $un" ""
    echo "- **IngressGateway:** ⚠️ unavailable=$un" >> "$REPORT"
  else
    echo "- **IngressGateway:** ✅ healthy" >> "$REPORT"
  fi
else
  echo "- **IngressGateway:** (not found in istio-system)" >> "$REPORT"
fi

# namespaces with auto-injection enabled but pods missing sidecar
emit_md_h3 "Sidecar Injection Coverage"
# One jq pass over the cached pods, with the namespace dump loaded alongside.
# A namespace counts as injection-enabled through the legacy label or a
# revision label. A Running/Pending pod there is reported unless:
#   - it has an istio-proxy container, regular or init (NOTE(review): init is
#     where Istio's native-sidecar mode is understood to place it — confirm),
#   - it opts out with sidecar.istio.io/inject=false (label or annotation),
#   - it uses hostNetwork (NOTE(review): assumed never injected — confirm).
missing_list=$(jq -r --slurpfile nsdoc "$ART/namespaces.json" '
  [ $nsdoc[0].items[]?
    | select((.metadata.labels // {}) as $l
             | $l["istio-injection"] == "enabled" or $l["istio.io/rev"] != null)
    | .metadata.name ] as $inj
  | .items[]?
  | select(.metadata.namespace as $n | any($inj[]; . == $n))
  | select(.status.phase == "Running" or .status.phase == "Pending")
  | select(.spec.hostNetwork != true)
  | select(((.spec.containers // []) + (.spec.initContainers // []))
           | any(.name == "istio-proxy") | not)
  | select((((.metadata.labels // {})["sidecar.istio.io/inject"])
            // ((.metadata.annotations // {})["sidecar.istio.io/inject"])
            // "") != "false")
  | .metadata.namespace + "/" + .metadata.name
' "$ART/pods.json")
if [[ -n "$missing_list" ]]; then
  emit_json "WARN" "istio" "sidecar-missing" "Pods missing istio-proxy in injection-enabled namespaces" "$missing_list"
  echo "- **Missing sidecars (in injection-enabled ns):** ⚠️" >> "$REPORT"
  printf '%s\n' "$missing_list" | sed 's/^/ - /' >> "$REPORT"
else
  echo "- **Missing sidecars:** ✅ none (or no injection-enabled namespaces)" >> "$REPORT"
fi
echo "" >> "$REPORT"
# ----------------------------- App Discovery: Redis / RabbitMQ / MinIO ----
emit_md_h2 "App Health — Redis / RabbitMQ / MinIO"

# One Deployment/StatefulSet snapshot shared by all three apps. It honours the
# --namespace scope (ns_arg) that the usage text promises for app checks.
APP_WORKLOADS_JSON="$ART/app_workloads.json"
"${KUBECTL[@]}" get deploy,sts "${ns_arg[@]}" -o json > "$APP_WORKLOADS_JSON" 2>/dev/null || true

# report_app_workloads AREA LABEL NAME_RE [HINT] [pvc]
# Finds workloads whose `app` or `app.kubernetes.io/name` label equals LABEL,
# or whose name matches NAME_RE (labels are OR-ed: a comma-joined kubectl
# selector would require both at once). Writes one report bullet per workload
# and emits an ERROR finding under AREA when ready < desired replicas.
# With "pvc" as the fifth argument, volumeClaimTemplate names are listed too.
# Returns 1 when nothing was discovered, 0 otherwise.
report_app_workloads() {
  local area="$1" label="$2" name_re="$3" hint="${4:-}" show_pvc="${5:-no}"
  local rows ns kind name desired ready claims marker cn
  local -a claim_arr
  rows="$(jq -r --arg app "$label" --arg re "$name_re" '
    .items[]?
    | (.metadata.labels // {}) as $l
    | select($l["app"] == $app
             or $l["app.kubernetes.io/name"] == $app
             or (.metadata.name | test($re)))
    | [ .metadata.namespace, .kind, .metadata.name,
        (.spec.replicas // 1), (.status.readyReplicas // 0),
        ([.spec.volumeClaimTemplates[]?.metadata.name] | join(",")) ]
    | @tsv' "$APP_WORKLOADS_JSON" 2>/dev/null || true)"
  if [[ -z "$rows" ]]; then
    return 1
  fi
  while IFS=$'\t' read -r ns kind name desired ready claims; do
    desired="$(to_int "$desired")"; ready="$(to_int "$ready")"
    marker="✅"
    if [ "$ready" -lt "$desired" ]; then
      marker="❌"
      emit_json "ERROR" "$area" "$kind" "$ns/$name unavailable ($ready/$desired)" "$hint"
    fi
    echo "- **$ns/$name ($kind):** $marker $ready/$desired ready" >> "$REPORT"
    if [[ "$show_pvc" == "pvc" && -n "$claims" ]]; then
      IFS=',' read -r -a claim_arr <<<"$claims"
      for cn in "${claim_arr[@]}"; do
        echo " - PVC template: $cn" >> "$REPORT"
      done
    fi
  done <<<"$rows"
  return 0
}

# ----------------------------- Redis -------------------------------------
emit_md_h3 "Redis"
if report_app_workloads "apps.redis" "redis" "redis" "Check pod logs and PVCs."; then
  # Endpoints of Redis-looking Services (same label/name rule, selector-backed
  # only), resolved from the cached svc/endpoints dumps in one jq pass.
  redis_svcs="$(jq -r --slurpfile eps "$ART/endpoints.json" --arg scope "$APP_NS" '
    ( ($eps[0].items // [])
      | map({ key: (.metadata.namespace + "/" + .metadata.name),
              value: ((.subsets // []) | length) })
      | from_entries ) as $subset_count
    | .items[]?
    | select($scope == "" or .metadata.namespace == $scope)
    | select(.spec.selector != null)
    | (.metadata.labels // {}) as $l
    | select($l["app"] == "redis"
             or $l["app.kubernetes.io/name"] == "redis"
             or (.metadata.name | test("redis")))
    | (.metadata.namespace + "/" + .metadata.name) as $key
    | [.metadata.namespace, .metadata.name, ($subset_count[$key] // 0)]
    | @tsv' "$ART/svc.json" 2>/dev/null || true)"
  while IFS=$'\t' read -r ns sname eps; do
    if [[ -z "$ns" ]]; then continue; fi
    eps="$(to_int "$eps")"
    echo " - $ns/svc/$sname endpoints: $eps" >> "$REPORT"
    if [ "$eps" -eq 0 ]; then
      emit_json "ERROR" "apps.redis" "endpoints" "$ns/svc/$sname has 0 endpoints" ""
    fi
  done <<<"$redis_svcs"
else
  echo "- (no Redis discovered)" >> "$REPORT"
fi
echo "" >> "$REPORT"

# ----------------------------- RabbitMQ ----------------------------------
emit_md_h3 "RabbitMQ"
# grep -q instead of `grep -c ... || echo 0`, which yields "0<newline>0" when
# the CRD is absent and then breaks the arithmetic test.
if grep -q 'rabbitmqclusters.rabbitmq.com' "$ART/app_crds.txt" 2>/dev/null; then
  # Operator CRD health (best effort)
  "${KUBECTL[@]}" get rabbitmqclusters.rabbitmq.com "${ns_arg[@]}" -o json > "$ART/rabbit_cr.json" 2>/dev/null || true
  if [[ -s "$ART/rabbit_cr.json" ]]; then
    # The value read is a condition *status* ("True"/"False"/"Unknown"), so it
    # is compared with "True". A cluster without a matching condition is still
    # listed, as Unknown.
    # NOTE(review): "AllReplicasReady" is assumed to be the readiness condition
    # published by the RabbitMQ cluster operator; "Ready" is kept as a
    # fallback — confirm against the installed operator version.
    while IFS=$'\t' read -r ns name cond; do
      if [[ -z "$ns" ]]; then continue; fi
      marker="✅"
      if [[ "$cond" != "True" ]]; then
        marker="❌"
        emit_json "ERROR" "apps.rabbitmq" "cluster" "$ns/$name not ready (status=$cond)" "Check operator and pods."
      fi
      echo "- **$ns/$name (RabbitmqCluster):** $marker ready=$cond" >> "$REPORT"
    done < <(jq -r '
      .items[]?
      | [ .metadata.namespace, .metadata.name,
          ( [ .status.conditions[]?
              | select(.type == "AllReplicasReady" or .type == "Ready")
              | .status ][0] // "Unknown" ) ]
      | @tsv' "$ART/rabbit_cr.json" 2>/dev/null || true)
  fi
fi
# Deploy/STS labelled or named like RabbitMQ
if ! report_app_workloads "apps.rabbitmq" "rabbitmq" "rabbit" ""; then
  echo "- (no RabbitMQ discovered)" >> "$REPORT"
fi
echo "" >> "$REPORT"

# ----------------------------- MinIO -------------------------------------
emit_md_h3 "MinIO"
if grep -q 'tenants.minio.min.io' "$ART/app_crds.txt" 2>/dev/null; then
  "${KUBECTL[@]}" get tenants.minio.min.io "${ns_arg[@]}" -o json > "$ART/minio_tenants.json" 2>/dev/null || true
  if [[ -s "$ART/minio_tenants.json" ]]; then
    # Readiness is derived from, in order: an "Available" condition, then
    # .status.healthStatus ("green" = ready), then .status.currentState
    # ("Initialized" = ready); otherwise Unknown.
    # NOTE(review): healthStatus/currentState are assumed to be the fields the
    # MinIO Operator populates on Tenants — confirm for the installed version.
    while IFS=$'\t' read -r ns name ready; do
      if [[ -z "$ns" ]]; then continue; fi
      marker="✅"
      if [[ "$ready" != "True" ]]; then
        marker="❌"
        emit_json "ERROR" "apps.minio" "tenant" "$ns/$name not Ready" ""
      fi
      echo "- **$ns/$name (Tenant):** $marker Ready=$ready" >> "$REPORT"
    done < <(jq -r '
      .items[]?
      | ([.status.conditions[]? | select(.type == "Available") | .status][0]) as $cond
      | [ .metadata.namespace, .metadata.name,
          ( if $cond != null then $cond
            elif .status.healthStatus != null then
              (if .status.healthStatus == "green" then "True" else "False" end)
            elif .status.currentState != null then
              (if .status.currentState == "Initialized" then "True" else "False" end)
            else "Unknown" end ) ]
      | @tsv' "$ART/minio_tenants.json" 2>/dev/null || true)
  fi
fi
# Deploy/STS labelled or named like MinIO, with their PVC templates
if ! report_app_workloads "apps.minio" "minio" "minio" "" "pvc"; then
  echo "- (no MinIO discovered)" >> "$REPORT"
fi
echo "" >> "$REPORT"
# ----------------------------- Events Snapshot ---------------------------
emit_md_h2 "Recent Warning/Error Events (top 30)"
# The last 30 matching rows of the cached (time-sorted) events dump.
# - Missing/empty fields become "-": the reader below splits on tabs, and
#   consecutive tabs would otherwise collapse and shift the table columns.
# - Null reason/message no longer abort the jq program.
# - Tabs/newlines in messages are flattened and "|" is written as an HTML
#   entity so it cannot break the Markdown table.
events_tsv=$(jq -r '
  def cell: if . == null or . == "" then "-" else tostring end;
  .items[]?
  | select(.type == "Warning" or ((.reason // "") | test("BackOff|Failed|Error")))
  | [ ((.lastTimestamp // .eventTime // .metadata.creationTimestamp) | cell),
      (.involvedObject.namespace | cell),
      (.involvedObject.kind | cell),
      (.involvedObject.name | cell),
      (.reason | cell),
      ((.message // "") | gsub("[\r\n\t]+"; " ") | gsub("[|]"; "&#124;") | cell) ]
  | @tsv' "$ART/events.json" 2>/dev/null | tail -n 30 || true)
if [[ -n "$events_tsv" ]]; then
  echo -e "\n| Time | NS | Kind | Name | Reason | Message |" >> "$REPORT"
  echo "|---|---|---|---|---|---|" >> "$REPORT"
  while IFS=$'\t' read -r t ns k n r m; do
    echo "| $t | ${ns:-} | ${k:-} | ${n:-} | ${r:-} | ${m:-} |" >> "$REPORT"
  done <<< "$events_tsv"
else
  echo "- No recent warnings/errors." >> "$REPORT"
fi
# Blank line so the next heading is not glued to the table/list above.
echo "" >> "$REPORT"
# ----------------------------- Rollup & Exit -----------------------------
emit_md_h2 "Summary & Exit Code"
# Count findings per level straight from the JSONL log (one object per line).
err_cnt=$(grep -c '"level":"ERROR"' "$JSONL" 2>/dev/null || true)
warn_cnt=$(grep -c '"level":"WARN"' "$JSONL" 2>/dev/null || true)
err_cnt="${err_cnt:-0}"
warn_cnt="${warn_cnt:-0}"

# Exit code contract: 2 = at least one ERROR, 1 = warnings only, 0 = clean.
LEVEL="OK"
exit_code=0
if [ "$err_cnt" -gt 0 ]; then
  LEVEL="ERROR"
  exit_code=2
elif [ "$warn_cnt" -gt 0 ]; then
  LEVEL="WARN"
  exit_code=1
fi
echo "- **Overall:** ${LEVEL}" >> "$REPORT"
echo "- **Findings:** ${err_cnt} error(s), ${warn_cnt} warning(s)" >> "$REPORT"
echo "- **Exit code:** ${exit_code}" >> "$REPORT"

# finalize JSON summary array
jq -s '.' "$JSONL" > "$OUT_DIR/summary.json" 2>/dev/null || echo "[]" > "$OUT_DIR/summary.json"
echo
echo "Report written to: $REPORT"
echo "Artifacts in: $ART"
exit "$exit_code"
Newer Older