Last active 1767947155

Revision 23f4ba08706771070d085f0918a9b72ba551cd43

health-check.sh Raw
#!/usr/bin/env bash
set -euo pipefail
# k8s-health.sh — single-context Kubernetes health report (Markdown)
# Focus: core cluster + Calico + Istio + common apps (Redis, RabbitMQ, MinIO)
# Deps: kubectl, jq, awk, sed, grep, base64, openssl; (optional) gdate
# ----------------------------- CLI & Globals -----------------------------
# Output directory; left empty here so a timestamped default can be chosen
# after the command line has been parsed.
OUT_DIR=""
# Per-request timeout handed to kubectl via --request-timeout.
REQUEST_TIMEOUT="${REQUEST_TIMEOUT:-8s}"
DNS_TEST="${DNS_TEST:-true}" # set false to skip ephemeral DNS test
# Certificate-expiry thresholds in days: at or below WARN -> WARN, at or below ERR -> ERROR.
TLS_WARN_DAYS="${TLS_WARN_DAYS:-14}"
TLS_ERR_DAYS="${TLS_ERR_DAYS:-7}"
# The thresholds are compared numerically later on; a non-numeric override
# would make those comparisons fail silently and suppress every expiry alert,
# so reject it up front.
for _threshold in TLS_WARN_DAYS TLS_ERR_DAYS; do
  if [[ ! "${!_threshold}" =~ ^-?[0-9]+$ ]]; then
    echo "${_threshold} must be an integer number of days (got '${!_threshold}')" >&2
    exit 3
  fi
done
unset _threshold
# usage — print the command-line help text to stdout.
usage() {
  printf '%s\n' \
    "Usage: $0 [--context NAME] [--namespace NS] [--out DIR] [--no-dns]" \
    'Options:' \
    ' --context NAME Use a specific kubeconfig context (default: current)' \
    ' --namespace NS Limit app checks to a namespace (default: all)' \
    ' --out DIR Output directory (default: ./k8s-health-<timestamp>)' \
    ' --no-dns Skip ephemeral DNS resolution test' \
    'Env overrides:' \
    ' REQUEST_TIMEOUT (default: 8s) | TLS_WARN_DAYS (14) | TLS_ERR_DAYS (7)'
}
CTX_ARGS=()
APP_NS=""

# parse_args ARG...
# Parses the command line into the globals CTX_ARGS (extra kubectl flags),
# APP_NS, OUT_DIR and DNS_TEST.
# Exits 0 after printing help for -h/--help, and 3 for an unknown option or an
# option that is missing its value (previously the latter died with an opaque
# "unbound variable" error under `set -u`).
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --context|--namespace|--out)
        if [[ $# -lt 2 ]]; then
          echo "Missing value for $1" >&2
          usage
          exit 3
        fi
        case "$1" in
          --context) CTX_ARGS+=(--context "$2");;
          --namespace) APP_NS="$2";;
          --out) OUT_DIR="$2";;
        esac
        shift # consume the value; the option itself is shifted below
        ;;
      --no-dns) DNS_TEST=false;;
      -h|--help) usage; exit 0;;
      *) echo "Unknown arg: $1" >&2; usage; exit 3;;
    esac
    shift
  done
}
parse_args "$@"
# ts_now — print the current local time as an ISO-8601 timestamp (seconds precision).
# `date -Is` is a GNU extension; when it is unavailable, try gdate and finally a
# plain strftime format (which prints the UTC offset without a colon).
ts_now() {
  date -Is 2>/dev/null && return 0
  if command -v gdate >/dev/null 2>&1; then
    gdate -Is 2>/dev/null && return 0
  fi
  date '+%Y-%m-%dT%H:%M:%S%z'
}
# to_ts DATE_STRING
# Prints DATE_STRING converted to epoch seconds, or 0 when it cannot be parsed.
# Parsers are tried in order: gdate (when installed), otherwise GNU `date -d`,
# then the BSD `date -j -f` form.
to_ts() {
  local d="$1" ts=""
  if command -v gdate >/dev/null 2>&1; then
    ts=$(gdate -d "$d" +%s 2>/dev/null) || ts=""
  else
    ts=$(date -d "$d" +%s 2>/dev/null) || ts=""
    if [[ -z "$ts" ]]; then
      # BSD date has no free-form -d parser; this format is meant to match
      # the "Jan  1 00:00:00 2030 GMT" strings this script reads from openssl.
      ts=$(date -j -f '%b %d %T %Y %Z' "$d" +%s 2>/dev/null) || ts=""
    fi
  fi
  [[ "$ts" =~ ^-?[0-9]+$ ]] || ts=0
  printf '%s\n' "$ts"
}
# days_until DATE_STRING
# Prints the whole number of days from now until DATE_STRING; the value is
# negative once the date has passed. Date parsing is delegated to to_ts.
days_until() {
  local expiry_epoch current_epoch
  expiry_epoch=$(to_ts "$1")
  current_epoch=$(date +%s)
  printf '%s\n' "$(( (expiry_epoch - current_epoch) / 86400 ))"
}
# to_int VALUE
# Normalizes a possibly multi-line / padded / decorated value to one integer.
#   - only the first whitespace-separated token is considered
#   - every character other than digits and '-' is dropped ("28+" -> 28)
#   - leading zeros are removed so the result is safe in arithmetic contexts
#   - a leading '-' is preserved (callers rely on negative day counts)
#   - anything that is still not an integer becomes 0
to_int() {
  local v="${1-}"
  # Treat newlines and tabs as token separators.
  v="${v//$'\n'/ }"; v="${v//$'\t'/ }"
  # Drop leading blanks first, otherwise the "first token" would be empty.
  v="${v#"${v%%[! ]*}"}"
  v="${v%% *}"
  # Pure parameter expansion: no external process per call.
  v="${v//[^0-9-]/}"
  if [[ "$v" =~ ^(-?)0*([0-9]+)$ ]]; then
    v="${BASH_REMATCH[1]}${BASH_REMATCH[2]}"
  else
    v=0
  fi
  printf '%s' "$v"
}
# Hard requirements: nothing useful can be produced without these.
for _tool in kubectl jq; do
  if ! command -v "$_tool" >/dev/null 2>&1; then echo "$_tool not found" >&2; exit 3; fi
done
# Soft requirements: the TLS section decodes certificates with base64 and
# openssl and skips any certificate it cannot decode, so warn rather than abort.
for _tool in base64 openssl; do
  if ! command -v "$_tool" >/dev/null 2>&1; then
    echo "warning: $_tool not found; TLS certificate expiry checks will be skipped" >&2
  fi
done
unset _tool
if [[ -z "$OUT_DIR" ]]; then
  OUT_DIR="./k8s-health-$(date +%Y%m%d-%H%M%S)"
fi
ART="$OUT_DIR/artifacts"
mkdir -p "$ART"
REPORT="$OUT_DIR/report.md"
JSONL="$OUT_DIR/summary.jsonl"
# Truncate instead of `touch`: both files are appended to for the rest of the
# run, and the exit code is derived from JSONL, so findings left over from an
# earlier run into the same --out directory must not survive.
: > "$REPORT"
: > "$JSONL"
# ----------------------------- Emit Helpers ------------------------------
# emit_json LEVEL AREA CHECK MESSAGE [HINT]
# Appends one compact JSON object (keys: ts, level, area, check, message, hint)
# as a single line to $JSONL. Every field is passed through jq --arg so it is
# JSON-escaped, and values are stored exactly as given (feeding jq through a
# here-string used to append a newline to every message and hint).
emit_json() {
  jq -cn \
    --arg ts "$(ts_now)" \
    --arg level "${1-}" \
    --arg area "${2-}" \
    --arg check "${3-}" \
    --arg message "${4-}" \
    --arg hint "${5-}" \
    '{ts: $ts, level: $level, area: $area, check: $check, message: $message, hint: $hint}' \
    >> "$JSONL"
}
# Markdown emitters: each appends to $REPORT.
# _md_append_e TEXT — append TEXT plus a newline, with backslash escapes in
# TEXT interpreted (callers embed "\n" in their arguments).
_md_append_e() { echo -e "$1" >> "$REPORT"; }
# Headings, levels 1-3, each followed by a blank line.
emit_md_h1() { _md_append_e "# $1\n"; }
emit_md_h2() { _md_append_e "## $1\n"; }
emit_md_h3() { _md_append_e "### $1\n"; }
# emit_md_kv KEY VALUE — one "- **KEY:** VALUE" bullet, written verbatim.
emit_md_kv() { printf '%s\n' "- **$1:** $2" >> "$REPORT"; }
# emit_md_code TEXT — TEXT inside a fenced block, blank line before and after.
emit_md_code() { _md_append_e "\n\`\`\`\n$1\n\`\`\`\n"; }
# ----------------------------- Prefetch Cache ----------------------------
echo "Collecting cluster state..."
# Base kubectl invocation reused by every later call. The ${arr[@]+...} form
# expands to nothing when no --context was given; a plain "${CTX_ARGS[@]}" on
# an empty array aborts under `set -u` on bash older than 4.4.
KUBECTL=(kubectl ${CTX_ARGS[@]+"${CTX_ARGS[@]}"})
# Use a lightweight API call instead of 'kubectl version' which can fail for reasons unrelated to reachability.
# Fall back to a simple resource list in case the /version endpoint is blocked by a proxy.
# Both probes are `if` conditions, so a failure cannot trip `set -e`.
if ! "${KUBECTL[@]}" --request-timeout="$REQUEST_TIMEOUT" get --raw='/version' >/dev/null 2>&1 \
  && ! "${KUBECTL[@]}" --request-timeout="$REQUEST_TIMEOUT" get nodes -o name >/dev/null 2>&1; then
  echo "Cannot reach cluster with kubectl" >&2
  exit 3
fi
set -e
# Snapshot cluster state into $ART so later sections work from files.
# Commands without `|| true` are mandatory: their failure aborts the run.
# 'kubectl version' is informational only and may fail for reasons unrelated
# to reachability, so it must not abort the run (stderr is already hidden).
"${KUBECTL[@]}" version -o json > "$ART/version.json" 2>/dev/null || true
"${KUBECTL[@]}" api-resources > "$ART/apiresources.txt" || true
"${KUBECTL[@]}" get nodes -o json > "$ART/nodes.json"
# Pod scope follows --namespace; everything else is collected cluster-wide.
if [[ -n "$APP_NS" ]]; then
  ns_arg=(-n "$APP_NS")
else
  ns_arg=(--all-namespaces)
fi
"${KUBECTL[@]}" get pods "${ns_arg[@]}" -o json > "$ART/pods.json"
"${KUBECTL[@]}" get ns -o json > "$ART/namespaces.json"
"${KUBECTL[@]}" get events --all-namespaces --sort-by=.lastTimestamp -o json --request-timeout="$REQUEST_TIMEOUT" > "$ART/events.json" || true
"${KUBECTL[@]}" get svc --all-namespaces -o json > "$ART/svc.json"
"${KUBECTL[@]}" get endpoints --all-namespaces -o json > "$ART/endpoints.json"
"${KUBECTL[@]}" get endpointslices.discovery.k8s.io --all-namespaces -o json > "$ART/epslices.json" 2>/dev/null || true
"${KUBECTL[@]}" get deploy,ds,sts,job,cronjob,hpa,pdb --all-namespaces -o json > "$ART/workloads.json" 2>/dev/null || true
"${KUBECTL[@]}" get pvc --all-namespaces -o json > "$ART/pvc.json" 2>/dev/null || true
"${KUBECTL[@]}" get pv -o json > "$ART/pv.json" 2>/dev/null || true
"${KUBECTL[@]}" get storageclasses.storage.k8s.io -o json > "$ART/sc.json" 2>/dev/null || true
# Only TLS certificates are inspected later, so never write other Secret data
# (private keys, tokens, passwords) to disk: ask the API for TLS-typed Secrets
# only and keep just the public certificate of each.
"${KUBECTL[@]}" get secrets --all-namespaces --field-selector type=kubernetes.io/tls -o json 2>/dev/null \
  | jq '{items: [.items[]? | {type, metadata: {namespace: .metadata.namespace, name: .metadata.name}, data: {"tls.crt": (.data["tls.crt"] // "")}}]}' \
  > "$ART/secrets.json" 2>/dev/null || true
"${KUBECTL[@]}" get csidrivers.storage.k8s.io,csinodes.storage.k8s.io -o json > "$ART/csi.json" 2>/dev/null || true
# Istio + Calico artifacts (best effort)
"${KUBECTL[@]}" -n istio-system get deploy,ds,pods,svc -o wide > "$ART/istio_ls.txt" 2>/dev/null || true
"${KUBECTL[@]}" -n calico-system get deploy,ds,pods -o wide > "$ART/calico_ls.txt" 2>/dev/null || true
# Header row removed; the remaining lines name the app CRDs that exist.
"${KUBECTL[@]}" get crd tenants.minio.min.io rabbitmqclusters.rabbitmq.com 2>/dev/null | sed '1d' > "$ART/app_crds.txt" || true
# Cache Istio Gateways (best effort)
"${KUBECTL[@]}" get gateway.networking.istio.io --all-namespaces -o json > "$ART/istio_gateways.json" 2>/dev/null || true
# ----------------------------- Report Header -----------------------------
# jq exits 0 and prints "null"/nothing for missing fields or an empty file, so
# the "unknown" fallback has to live inside the filter and on the empty result.
cluster_server=$(
  jq -r 'if .serverVersion.gitVersion
         then .serverVersion.gitVersion + " (" + (.serverVersion.platform // "unknown") + ")"
         else "unknown" end' "$ART/version.json" 2>/dev/null || true)
cluster_server="${cluster_server:-unknown}"
client_ver=$(jq -r '.clientVersion.gitVersion // "unknown"' "$ART/version.json" 2>/dev/null || true)
client_ver="${client_ver:-unknown}"
# `kubectl config current-context` reports the kubeconfig default even when a
# --context flag is passed, so prefer the context this run actually targets
# (CTX_ARGS holds "--context NAME" pairs; the last NAME wins).
if (( ${#CTX_ARGS[@]} > 0 )); then
  ctx_name="${CTX_ARGS[${#CTX_ARGS[@]}-1]}"
else
  ctx_name=$("${KUBECTL[@]}" config current-context 2>/dev/null || echo "current")
fi
emit_md_h1 "Kubernetes Health Report — ${ctx_name}"
emit_md_kv "Generated" "$(ts_now)"
emit_md_kv "kubectl client" "$client_ver"
emit_md_kv "APIServer" "$cluster_server"
emit_md_kv "Namespace scope (apps)" "${APP_NS:-all}"
echo "" >> "$REPORT"
# ----------------------------- Versions & Skew ----------------------------
emit_md_h2 "Cluster Versions & Skew"
node_versions=$(jq -r '.items[]?.status.nodeInfo.kubeletVersion' "$ART/nodes.json" | sort | uniq -c | sed 's/^/ /')
emit_md_code "Kubelet versions:\n${node_versions}"
# The server minor can carry a suffix such as "28+". Take it raw and let
# to_int strip the decoration; jq's tonumber would reject it and the resulting
# 0 would be reported as a large skew.
server_minor=$(jq -r '.serverVersion.minor // "0"' "$ART/version.json" 2>/dev/null || echo 0)
# Lowest and highest kubelet minor across all nodes, as "MIN MAX".
kubelet_minors=$(jq -r '
  [.items[]?.status.nodeInfo.kubeletVersion
   | strings
   | capture("v(?<maj>\\d+)\\.(?<min>\\d+)")
   | .min
   | tonumber]
  | "\(min // 0) \(max // 0)"' "$ART/nodes.json" 2>/dev/null || echo "0 0")
read -r fm lm <<<"$kubelet_minors"
sm="$(to_int "${server_minor:-0}")"
fm="$(to_int "${fm:-0}")"
lm="$(to_int "${lm:-0}")"
# Worst-case skew: both the oldest and the newest kubelet are compared with
# the server, so one outdated node is not hidden by an up-to-date one.
abs_skew="$(awk -v s="$sm" -v lo="$fm" -v hi="$lm" \
  'BEGIN{a=hi-s; if (a<0) a=-a; b=lo-s; if (b<0) b=-b; print (a>b?a:b)+0}' 2>/dev/null)"
abs_skew="$(to_int "$abs_skew")"
if [ "$sm" -eq 0 ] || [ "$lm" -eq 0 ]; then
  # A 0 here means a version could not be read; do not present that as skew.
  emit_json "WARN" "version" "skew" "Could not determine kubelet/APIServer version skew (server minor ${sm}, kubelet min/max ${fm}/${lm})" "Check 'kubectl version' and node status."
  echo "- **Version Skew:** ⚠️ could not be determined (server=${sm}, kubelet min/max=${fm}/${lm})" >> "$REPORT"
elif [ "$abs_skew" -gt 1 ]; then
  emit_json "ERROR" "version" "skew" "Kubelet/APIServer minor skew > 1 (server minor ${sm}, kubelet min/max ${fm}/${lm})" "Align versions per K8s skew policy."
  echo "- **Version Skew:** ❌ kubelet/APIServer minor skew > 1 (server=${sm}, kubelet min/max=${fm}/${lm})" >> "$REPORT"
else
  echo "- **Version Skew:** ✅ within supported range (server=${sm}, kubelet min/max=${fm}/${lm})" >> "$REPORT"
fi
echo "" >> "$REPORT"
# ----------------------------- API Server Health --------------------------
emit_md_h2 "API Server Ready/Liveness"
readyz="$ART/readyz.txt"
livez="$ART/livez.txt"
# Capture each probe's exit status without letting a failure stop the script.
r_rc=0
"${KUBECTL[@]}" get --raw='/readyz?verbose' >"$readyz" 2>&1 || r_rc=$?
l_rc=0
"${KUBECTL[@]}" get --raw='/livez?verbose' >"$livez" 2>&1 || l_rc=$?
# Raw probe output goes into the report verbatim.
readyz_out=$(cat "$readyz" 2>/dev/null || true)
livez_out=$(cat "$livez" 2>/dev/null || true)
emit_md_code "readyz:\n${readyz_out}"
emit_md_code "livez:\n${livez_out}"
# Count readyz output lines that mention a failure.
fail_cnt=$(grep -c 'fail' "$readyz" 2>/dev/null || true)
fail_cnt="$(to_int "${fail_cnt:-0}")"
if [ "$r_rc" -eq 0 ] && [ "$fail_cnt" -le 0 ]; then
  printf '%s\n' "- **APIServer readyz:** ✅ ok" >> "$REPORT"
else
  emit_json "ERROR" "control-plane" "readyz" "APIServer readyz reports failures" "Check control-plane component health."
  printf '%s\n' "- **APIServer readyz:** ❌ failures detected" >> "$REPORT"
fi
if [[ $l_rc -eq 0 ]]; then
  printf '%s\n' "- **APIServer livez:** ✅ ok" >> "$REPORT"
else
  emit_json "ERROR" "control-plane" "livez" "APIServer livez not reachable" ""
  printf '%s\n' "- **APIServer livez:** ❌ unreachable" >> "$REPORT"
fi
printf '\n' >> "$REPORT"
# ----------------------------- Nodes -------------------------------------
emit_md_h2 "Node Health"
nodes_json="$ART/nodes.json"

# Nodes whose Ready condition is anything other than "True".
not_ready=$(jq -r '.items[] | select([.status.conditions[]?|select(.type=="Ready")][0].status!="True") | .metadata.name' "$nodes_json")
if [[ -z "$not_ready" ]]; then
  printf '%s\n' "- **Ready:** ✅ all nodes Ready" >> "$REPORT"
else
  emit_json "ERROR" "nodes" "ready" "Some nodes NotReady" "$not_ready"
  {
    printf '%s\n' "- **Ready:** ❌ NotReady nodes present:"
    sed 's/^/ - /' <<<"$not_ready"
  } >> "$REPORT"
fi

# One "node<TAB>type<TAB>message" line per active Disk/Memory/PID pressure condition.
pressures=$(jq -r '
 .items[] as $n
 | ($n.status.conditions[] | select((.type=="DiskPressure" or .type=="MemoryPressure" or .type=="PIDPressure") and .status=="True")) as $p
 | "\($n.metadata.name)\t\($p.type)\t\($p.message)"' "$nodes_json")
if [[ -z "$pressures" ]]; then
  printf '%s\n' "- **Pressure:** ✅ none" >> "$REPORT"
else
  emit_json "WARN" "nodes" "pressure" "Node pressure conditions detected" "$pressures"
  {
    printf '%s\n' "- **Pressure:** ⚠️"
    sed 's/^/ - /' <<<"$pressures"
  } >> "$REPORT"
fi

# Nodes with spec.unschedulable set (cordoned).
unsched=$(jq -r '.items[] | select(.spec.unschedulable==true) | .metadata.name' "$nodes_json")
if [[ -z "$unsched" ]]; then
  printf '%s\n' "- **Unschedulable:** ✅ none" >> "$REPORT"
else
  emit_json "WARN" "nodes" "unschedulable" "Unschedulable nodes present" "$unsched"
  # Names are joined with single spaces; the list keeps its trailing space.
  printf '%s\n' "- **Unschedulable:** ⚠️ ${unsched//$'\n'/ } " >> "$REPORT"
fi
printf '\n' >> "$REPORT"
# ----------------------------- Networking (DNS + Calico) ------------------
emit_md_h2 "Networking & DNS"
# CoreDNS deployment status. A comma in a label selector means AND, and a
# CoreDNS Deployment typically carries only one of these two labels, so each
# is tried on its own.
core_dns=""
dns_found=0
for dns_selector in k8s-app=kube-dns app.kubernetes.io/name=coredns; do
  core_dns=$("${KUBECTL[@]}" -n kube-system get deploy -l "$dns_selector" -o json 2>/dev/null || true)
  dns_found="$(to_int "$(jq -r '.items | length' <<<"$core_dns" 2>/dev/null || true)")"
  if [ "$dns_found" -gt 0 ]; then break; fi
done
dn_unavail=$(jq -r '([.items[]?|.status.unavailableReplicas // 0] | add) // 0' <<<"$core_dns" 2>/dev/null || echo 0)
dn_unavail="$(to_int "$dn_unavail")"
if [ "$dns_found" -eq 0 ]; then
  echo "- **CoreDNS:** (deployment not found in kube-system)" >> "$REPORT"
elif [ "$dn_unavail" -gt 0 ]; then
  emit_json "ERROR" "networking" "coredns" "CoreDNS has unavailable replicas" ""
  echo "- **CoreDNS:** ❌ unavailable replicas: $dn_unavail" >> "$REPORT"
else
  echo "- **CoreDNS:** ✅ deployment healthy" >> "$REPORT"
fi
# Optional ephemeral DNS nslookup test
if [[ "$DNS_TEST" == "true" ]]; then
  echo "- **DNS test:** running ephemeral busybox nslookup ..." >> "$REPORT"
  dns_pod="dnscheck-$$"
  run_rc=1
  # All kubectl flags must come before `--`; anything after it is handed to
  # the container as its command line. `kubectl run` without --attach returns
  # once the pod is created, so the result is read from the pod's final phase.
  if "${KUBECTL[@]}" run "$dns_pod" --image=busybox:1.36 --restart=Never \
      --image-pull-policy=IfNotPresent --quiet \
      --command -- /bin/sh -c 'nslookup kubernetes.default.svc.cluster.local >/dev/null' \
      >/dev/null 2>&1; then
    # Poll for up to ~30s: Succeeded = nslookup exited 0, Failed = non-zero.
    dns_tries=0
    while (( dns_tries < 15 )); do
      dns_phase=$("${KUBECTL[@]}" get pod "$dns_pod" -o jsonpath='{.status.phase}' 2>/dev/null || true)
      case "$dns_phase" in
        Succeeded) run_rc=0; break;;
        Failed) break;;
      esac
      sleep 2
      dns_tries=$(( dns_tries + 1 ))
    done
  fi
  "${KUBECTL[@]}" delete pod "$dns_pod" --now --wait=false >/dev/null 2>&1 || true
  if [[ $run_rc -ne 0 ]]; then
    emit_json "ERROR" "networking" "dns" "In-pod DNS resolution failed" "Check CoreDNS, network policies, kube-dns Service."
    echo " ❌ DNS resolution failed" >> "$REPORT"
  else
    echo " ✅ DNS resolution ok" >> "$REPORT"
  fi
else
  echo "- **DNS test:** (skipped)" >> "$REPORT"
fi
echo "" >> "$REPORT"
# Calico basic health
emit_md_h3 "Calico"
# Operator-based installs use the calico-system namespace, manifest-based
# installs use kube-system; take the first one that has the DaemonSet.
calico_ns=""
calico_ds=""
for calico_try_ns in calico-system kube-system; do
  calico_ds=$("${KUBECTL[@]}" -n "$calico_try_ns" get ds calico-node -o json 2>/dev/null || true)
  if [[ -n "$calico_ds" ]]; then calico_ns="$calico_try_ns"; break; fi
done
if [[ -n "$calico_ds" ]]; then
  desire=$(jq -r '.status.desiredNumberScheduled // 0' <<<"$calico_ds")
  ready=$(jq -r '.status.numberReady // 0' <<<"$calico_ds")
  desire="$(to_int "$desire")"; ready="$(to_int "$ready")"
  if [ "$ready" -lt "$desire" ]; then
    emit_json "ERROR" "calico" "daemonset" "calico-node not fully Ready ($ready/$desire)" "Check calico-node pods and CNI errors."
    echo "- **calico-node:** ❌ $ready/$desire Ready" >> "$REPORT"
  else
    echo "- **calico-node:** ✅ $ready/$desire Ready" >> "$REPORT"
  fi
else
  echo "- **calico-node:** (DaemonSet not found)" >> "$REPORT"
fi
# Typha is optional. A label query returns an empty list (not an error) when
# nothing matches, so count the items before calling anything healthy.
typha=$("${KUBECTL[@]}" -n "${calico_ns:-calico-system}" get deploy -l k8s-app=calico-typha -o json 2>/dev/null || true)
typha_count="$(to_int "$(jq -r '.items | length' <<<"$typha" 2>/dev/null || true)")"
if [ "$typha_count" -gt 0 ]; then
  unavail=$(jq -r '[.items[]?|.status.unavailableReplicas // 0] | add // 0' <<<"$typha")
  unavail="$(to_int "$unavail")"
  if [ "$unavail" -gt 0 ]; then
    emit_json "WARN" "calico" "typha" "Calico Typha unavailable replicas: $unavail" ""
    echo "- **calico-typha:** ⚠️ unavailable: $unavail" >> "$REPORT"
  else
    echo "- **calico-typha:** ✅ healthy" >> "$REPORT"
  fi
else
  echo "- **calico-typha:** (not deployed)" >> "$REPORT"
fi
echo "" >> "$REPORT"
# ----------------------------- Storage & CSI ------------------------------
emit_md_h2 "Storage"
sc_json="$ART/sc.json"
# sc.json is collected best-effort; skip the check when it is missing or empty.
if [[ -s "$sc_json" ]]; then
  defaults=$(jq -r '.items[]?|select(.metadata.annotations["storageclass.kubernetes.io/is-default-class"]=="true")|.metadata.name' "$sc_json" 2>/dev/null || true)
  if [[ -z "$defaults" ]]; then
    emit_json "WARN" "storage" "default-sc" "No default StorageClass set" "Annotate one SC as default."
    echo "- **Default StorageClass:** ⚠️ none set" >> "$REPORT"
  else
    # Keep the bullet on one line even if several classes are marked default.
    echo "- **Default StorageClass:** ✅ ${defaults//$'\n'/, }" >> "$REPORT"
  fi
fi
pvc_pending=$(jq -r '.items[]|select(.status.phase=="Pending")|.metadata.namespace + "/" + .metadata.name' "$ART/pvc.json" 2>/dev/null || true)
if [[ -n "$pvc_pending" ]]; then
  emit_json "ERROR" "storage" "pvc" "Pending PVCs detected" "$pvc_pending"
  # Heading and list are written separately: plain `echo` does not expand
  # "\n", which used to end up in the report as literal text.
  echo "- **PVCs:** ❌ Pending:" >> "$REPORT"
  echo "$pvc_pending" | sed 's/^/ - /' >> "$REPORT"
else
  echo "- **PVCs:** ✅ none Pending" >> "$REPORT"
fi
echo "" >> "$REPORT"
# ----------------------------- Workloads ---------------------------------
emit_md_h2 "Workloads"
# Pending pods >5m (age measured from creationTimestamp); the reason comes
# from the PodScheduled condition when present.
pending=$(jq -r '
 .items[]
 | select(.status.phase=="Pending")
 | select((now - (.metadata.creationTimestamp|fromdate)) > 300)
 | .metadata.namespace + "/" + .metadata.name + " — " + ((.status.conditions // [] | map(select(.type=="PodScheduled"))[0].reason) // "Pending")
' "$ART/pods.json")
if [[ -n "$pending" ]]; then
  emit_json "ERROR" "workloads" "pending" "Pending pods >5m" "$pending"
  echo "- **Pending Pods (>5m):** ❌" >> "$REPORT"; echo "$pending" | sed 's/^/ - /' >> "$REPORT"
else
  echo "- **Pending Pods (>5m):** ✅ none" >> "$REPORT"
fi
# CrashLoop / high restarts: any container with 3 or more restarts.
crash=$(jq -r '
 .items[] as $p
 | ($p.status.containerStatuses // [])[]
 | select((.restartCount // 0) >= 3)
 | "\($p.metadata.namespace)/\($p.metadata.name) — \(.name) restarts=\(.restartCount) lastState=\(.lastState|tojson)"
' "$ART/pods.json")
if [[ -n "$crash" ]]; then
  emit_json "WARN" "workloads" "restarts" "Containers with >=3 restarts" "$crash"
  echo "- **High Restarts (>=3):** ⚠️" >> "$REPORT"; echo "$crash" | sed 's/^/ - /' >> "$REPORT"
else
  echo "- **High Restarts (>=3):** ✅ none" >> "$REPORT"
fi
# Deployments with unavailable replicas (workloads.json is best-effort).
unavail=$(jq -r '
 .items[]?|select(.kind=="Deployment")|select((.status.unavailableReplicas // 0) > 0)
 | .metadata.namespace + "/" + .metadata.name + " — unavailable=" + ((.status.unavailableReplicas|tostring))
' "$ART/workloads.json" 2>/dev/null || true)
if [[ -n "$unavail" ]]; then
  emit_json "ERROR" "workloads" "deploy-unavailable" "Deployments with unavailable replicas" "$unavail"
  # Heading and list are written separately: plain `echo` does not expand
  # "\n", which used to end up in the report as literal text.
  echo "- **Deployments:** ❌ unavailable replicas:" >> "$REPORT"
  echo "$unavail" | sed 's/^/ - /' >> "$REPORT"
else
  echo "- **Deployments:** ✅ all available" >> "$REPORT"
fi
echo "" >> "$REPORT"
# ----------------------------- Services & Endpoints -----------------------
# services_without_endpoints SVC_JSON ENDPOINTS_JSON
# Prints "namespace/name" for every Service that has a selector but whose
# Endpoints object is missing or has no subsets. Services without a selector
# are skipped. Runs one jq pass over both files instead of one jq process per
# Service.
services_without_endpoints() {
  jq -rn --slurpfile svc "$1" --slurpfile eps "$2" '
    # Map "ns/name" -> number of subsets in the matching Endpoints object.
    ([$eps[0].items[]?
      | {key: "\(.metadata.namespace)/\(.metadata.name)", value: (.subsets | length)}]
     | from_entries) as $subsets
    | $svc[0].items[]?
    | select(.spec.selector != null)
    | "\(.metadata.namespace)/\(.metadata.name)" as $key
    | select(($subsets[$key] // 0) == 0)
    | $key'
}
emit_md_h2 "Services & Endpoints"
svc_zero=$(services_without_endpoints "$ART/svc.json" "$ART/endpoints.json")
if [[ -n "$svc_zero" ]]; then
  emit_json "ERROR" "networking" "svc-no-endpoints" "Services with zero Endpoints" "$svc_zero"
  echo "- **Services with 0 endpoints:** ❌" >> "$REPORT"; echo "$svc_zero" | sed 's/^/ - /' >> "$REPORT"
else
  echo "- **Services with 0 endpoints:** ✅ none" >> "$REPORT"
fi
echo "" >> "$REPORT"
# ----------------------------- TLS Secret Expiry -------------------------
emit_md_h2 "TLS Certificates (Secrets)"
# Build a set of TLS secrets actually referenced by Istio Gateways (credentialName)
# as "namespace<TAB>secretName" lines, using the Gateway's own namespace.
# NOTE(review): Istio may resolve credentialName in the ingress gateway
# workload's namespace rather than the Gateway resource's — confirm for this mesh.
ISTIO_GW_SECRETS_FILE="$ART/istio_gateway_tls_secrets.tsv"
: > "$ISTIO_GW_SECRETS_FILE"
if [[ -s "$ART/istio_gateways.json" ]]; then
  jq -r '
 .items[]
 | .metadata.namespace as $ns
 | (.spec.servers // [])
 | map(select(.tls.credentialName != null) | [$ns, .tls.credentialName])
 | .[]
 | @tsv
 ' "$ART/istio_gateways.json" 2>/dev/null | sort -u > "$ISTIO_GW_SECRETS_FILE" || true
fi
tls_list=$(jq -r '.items[]|select(.type=="kubernetes.io/tls")|.metadata.namespace + "\t" + .metadata.name + "\t" + (.data["tls.crt"]//"")' "$ART/secrets.json" 2>/dev/null || true)
exp_rows_inuse=""
exp_rows_unused=""
if [[ -n "$tls_list" ]]; then
  while IFS=$'\t' read -r ns name b64; do
    [[ -z "$b64" ]] && continue
    crt="$ART/${ns}_${name}.crt"
    printf '%s' "$b64" | base64 -d > "$crt" 2>/dev/null || continue
    end=$(openssl x509 -enddate -noout -in "$crt" 2>/dev/null | cut -d= -f2)
    [[ -z "$end" ]] && continue
    # to_ts yields 0 for a date it cannot parse; skip such certificates
    # instead of reporting them as expired long ago.
    end_ts="$(to_int "$(to_ts "$end")")"
    if [ "$end_ts" -le 0 ]; then
      echo "warning: cannot parse certificate expiry '$end' for $ns/$name; skipping" >&2
      continue
    fi
    days=$(days_until "$end"); days="$(to_int "$days")"
    # Is this secret referenced by any Istio Gateway in the same namespace?
    # Whole-line fixed-string match: portable (no GNU-only `grep -P`) and
    # dots in secret names are not treated as regex wildcards.
    in_use="no"
    if grep -qxF -- "${ns}"$'\t'"${name}" "$ISTIO_GW_SECRETS_FILE" 2>/dev/null; then
      in_use="yes"
    fi
    if [ "$in_use" = "yes" ]; then
      # Severity: only for IN-USE secrets
      level="INFO"
      if [ "$days" -le "$TLS_WARN_DAYS" ]; then level="WARN"; fi
      if [ "$days" -le "$TLS_ERR_DAYS" ]; then level="ERROR"; fi
      exp_rows_inuse+="$ns/$name — expires in ${days}d (${level}) [IN-USE]"$'\n'
      if [ "$level" = "ERROR" ]; then
        emit_json "ERROR" "security" "tls-expiry" "$ns/$name expiring in ${days}d [IN-USE]" "Referenced by an Istio Gateway; renew certificate."
      elif [ "$level" = "WARN" ]; then
        emit_json "WARN" "security" "tls-expiry" "$ns/$name expiring in ${days}d [IN-USE]" "Plan renewal."
      fi
    else
      # UNUSED secrets: do NOT alert; just list under an informational subheader
      exp_rows_unused+="$ns/$name — expires in ${days}d [unused]"$'\n'
    fi
  done <<< "$tls_list"
  # Print IN-USE expiries (with levels). The row buffers already end with a
  # newline, so printf '%s' avoids an extra empty bullet at the end.
  if [[ -n "$exp_rows_inuse" ]]; then
    echo "- **TLS expiries (in-use secrets):**" >> "$REPORT"
    printf '%s' "$exp_rows_inuse" | sed 's/^/ - /' >> "$REPORT"
  else
    echo "- **TLS expiries (in-use secrets):** none" >> "$REPORT"
  fi
else
  echo "- **TLS expiries (in-use secrets):** (no kubernetes.io/tls secrets found)" >> "$REPORT"
fi
# Print UNUSED secrets as information only
emit_md_h3 "Unused Secrets"
if [[ -n "$exp_rows_unused" ]]; then
  printf '%s' "$exp_rows_unused" | sed 's/^/ - /' >> "$REPORT"
else
  echo " - none" >> "$REPORT"
fi
echo "" >> "$REPORT"
# ----------------------------- Istio Checks ------------------------------
emit_md_h2 "Istio"
# istiod deployment
istiod=$("${KUBECTL[@]}" -n istio-system get deploy istiod -o json 2>/dev/null || true)
if [[ -n "$istiod" ]]; then
  un=$(jq -r '.status.unavailableReplicas // 0' <<<"$istiod")
  un="$(to_int "$un")"
  if [ "$un" -gt 0 ]; then
    emit_json "ERROR" "istio" "istiod" "istiod has unavailable replicas: $un" ""
    echo "- **istiod:** ❌ unavailable=$un" >> "$REPORT"
  else
    echo "- **istiod:** ✅ healthy" >> "$REPORT"
  fi
else
  echo "- **istiod:** (not found)" >> "$REPORT"
fi
# ingress gateway (classic). A label query yields an empty list (not an
# error) when nothing matches, so count the items before reporting "healthy".
igw=$("${KUBECTL[@]}" -n istio-system get deploy -l app=istio-ingressgateway -o json 2>/dev/null || true)
igw_count="$(to_int "$(jq -r '.items | length' <<<"$igw" 2>/dev/null || true)")"
if [ "$igw_count" -gt 0 ]; then
  un=$(jq -r '[.items[]?|.status.unavailableReplicas // 0] | add // 0' <<<"$igw")
  un="$(to_int "$un")"
  if [ "$un" -gt 0 ]; then
    emit_json "WARN" "istio" "ingressgateway" "IngressGateway unavailable: $un" ""
    echo "- **IngressGateway:** ⚠️ unavailable=$un" >> "$REPORT"
  else
    echo "- **IngressGateway:** ✅ healthy" >> "$REPORT"
  fi
fi
# namespaces with auto-injection enabled but pods missing sidecar
emit_md_h3 "Sidecar Injection Coverage"
# One jq pass over the cached namespaces and pods:
#  - injection-enabled namespaces carry the legacy label or a revision label
#  - only Running/Pending pods are considered
#  - istio-proxy counts whether it is a regular container or an initContainer
#  - pods that opt out via sidecar.istio.io/inject=false (label or annotation)
#    are not flagged
missing_list=$(jq -rn --slurpfile ns "$ART/namespaces.json" --slurpfile pods "$ART/pods.json" '
  [$ns[0].items[]?
   | select(.metadata.labels["istio-injection"]=="enabled" or (.metadata.labels["istio.io/rev"] != null))
   | .metadata.name] as $inj
  | $pods[0].items[]?
  | .metadata.namespace as $pod_ns
  | select(any($inj[]; . == $pod_ns))
  | select(.status.phase=="Running" or .status.phase=="Pending")
  | select((((.spec.containers // []) + (.spec.initContainers // [])) | any(.name=="istio-proxy")) | not)
  | select((.metadata.labels["sidecar.istio.io/inject"] // .metadata.annotations["sidecar.istio.io/inject"] // "") != "false")
  | "\(.metadata.namespace)/\(.metadata.name)"')
if [[ -n "$missing_list" ]]; then
  emit_json "WARN" "istio" "sidecar-missing" "Pods missing istio-proxy in injection-enabled namespaces" "$missing_list"
  echo "- **Missing sidecars (in injection-enabled ns):** ⚠️" >> "$REPORT"; echo "$missing_list" | sed 's/^/ - /' >> "$REPORT"
else
  echo "- **Missing sidecars:** ✅ none (or no injection-enabled namespaces)" >> "$REPORT"
fi
echo "" >> "$REPORT"
# ----------------------------- App Discovery: Redis -----------------------
emit_md_h2 "App Health — Redis / RabbitMQ / MinIO"
emit_md_h3 "Redis"
# Detect by common labels. A comma in a label selector means AND, so each
# label is queried on its own and the results are merged and de-duplicated.
# ns_arg carries the --namespace scope (or --all-namespaces).
redis_labels=(app=redis app.kubernetes.io/name=redis)
redis_objs=$(
  for redis_sel in "${redis_labels[@]}"; do
    "${KUBECTL[@]}" get deploy,sts "${ns_arg[@]}" -l "$redis_sel" -o json 2>/dev/null || true
  done | jq -s '{items: ([.[].items[]?] | unique_by([.metadata.namespace, .kind, .metadata.name]))}' 2>/dev/null || true
)
redis_count="$(to_int "$(jq -r '.items | length' <<<"$redis_objs" 2>/dev/null || true)")"
if [ "$redis_count" -eq 0 ]; then
  # fallback: name contains redis
  redis_objs=$("${KUBECTL[@]}" get deploy,sts "${ns_arg[@]}" -o json 2>/dev/null | jq '{items:[.items[]|select(.metadata.name|test("redis"))]}' 2>/dev/null || true)
  redis_count="$(to_int "$(jq -r '.items | length' <<<"$redis_objs" 2>/dev/null || true)")"
fi
if [ "$redis_count" -gt 0 ]; then
  # Replica counts are read from the objects already fetched, so a workload
  # disappearing mid-run can no longer abort the script.
  while IFS=$'\t' read -r ns kind name desired ready; do
    desired="$(to_int "$desired")"; ready="$(to_int "$ready")"
    marker="✅"
    if [ "$ready" -lt "$desired" ]; then
      marker="❌"
      emit_json "ERROR" "apps.redis" "$kind" "$ns/$name unavailable ($ready/$desired)" "Check pod logs and PVCs."
    fi
    echo "- **$ns/$name ($kind):** $marker $ready/$desired ready" >> "$REPORT"
    # Endpoints of Redis-labelled Services in the same namespace
    svc_names=$(
      for redis_sel in "${redis_labels[@]}"; do
        "${KUBECTL[@]}" -n "$ns" get svc -l "$redis_sel" -o json 2>/dev/null </dev/null || true
      done | jq -rs '[.[].items[]?.metadata.name] | unique | .[]' 2>/dev/null || true
    )
    while IFS= read -r sname; do
      [[ -n "$sname" ]] || continue
      eps=$(jq -r --arg ns "$ns" --arg s "$sname" '[.items[]?|select(.metadata.namespace==$ns and .metadata.name==$s)|(.subsets|length)] | add // 0' "$ART/endpoints.json" 2>/dev/null || echo 0)
      eps="$(to_int "$eps")"
      echo " - svc/$sname endpoints: $eps" >> "$REPORT"
      if [ "$eps" -eq 0 ]; then emit_json "ERROR" "apps.redis" "endpoints" "$ns/svc/$sname has 0 endpoints" ""; fi
    done <<<"$svc_names"
  done < <(jq -r '.items[] | [.metadata.namespace, .kind, .metadata.name, (.spec.replicas // 1), (.status.readyReplicas // 0)] | @tsv' <<<"$redis_objs")
else
  echo "- (no Redis discovered)" >> "$REPORT"
fi
echo "" >> "$REPORT"
# ----------------------------- App Discovery: RabbitMQ --------------------
emit_md_h3 "RabbitMQ"
# `grep -c ... || echo 0` produced "0<newline>0" when nothing matched (grep
# prints 0 and exits 1), which is not a valid number; test presence instead.
rabbit_crd=0
if grep -q 'rabbitmqclusters.rabbitmq.com' "$ART/app_crds.txt" 2>/dev/null; then rabbit_crd=1; fi
if (( rabbit_crd > 0 )); then
  # Operator CRD health (best effort)
  "${KUBECTL[@]}" get rabbitmqclusters.rabbitmq.com "${ns_arg[@]}" -o json > "$ART/rabbit_cr.json" 2>/dev/null || true
  if [[ -s "$ART/rabbit_cr.json" ]]; then
    # The filter yields a condition *status* (True/False/Unknown), so compare
    # against "True" rather than phase names.
    # NOTE(review): the RabbitMQ cluster operator is believed to publish
    # AllReplicasReady/ClusterAvailable conditions rather than "Ready" —
    # confirm against the operator version in use.
    while read -r ns name all_ready; do
      marker="✅"
      if [[ "$all_ready" != "True" ]]; then
        marker="❌"
        emit_json "ERROR" "apps.rabbitmq" "cluster" "$ns/$name AllReplicasReady=$all_ready" "Check operator and pods."
      fi
      echo "- **$ns/$name (RabbitmqCluster):** $marker AllReplicasReady=$all_ready" >> "$REPORT"
    done < <(jq -r '.items[]? | .metadata.namespace + " " + .metadata.name + " " + (([.status.conditions[]? | select(.type=="AllReplicasReady") | .status] | first) // "Unknown")' "$ART/rabbit_cr.json" 2>/dev/null || true)
  fi
fi
# Fallback to Deploy/STS labelled or named rabbit. A comma in a label selector
# means AND, so each label is queried on its own and the results merged.
rabbit_objs=$(
  for rabbit_sel in app.kubernetes.io/name=rabbitmq app=rabbitmq; do
    "${KUBECTL[@]}" get deploy,sts "${ns_arg[@]}" -l "$rabbit_sel" -o json 2>/dev/null || true
  done | jq -s '{items: ([.[].items[]?] | unique_by([.metadata.namespace, .kind, .metadata.name]))}' 2>/dev/null || true
)
rabbit_count="$(to_int "$(jq -r '.items | length' <<<"$rabbit_objs" 2>/dev/null || true)")"
if [ "$rabbit_count" -eq 0 ]; then
  rabbit_objs=$("${KUBECTL[@]}" get deploy,sts "${ns_arg[@]}" -o json 2>/dev/null | jq '{items:[.items[]|select(.metadata.name|test("rabbit"))]}' 2>/dev/null || true)
  rabbit_count="$(to_int "$(jq -r '.items | length' <<<"$rabbit_objs" 2>/dev/null || true)")"
fi
if [ "$rabbit_count" -gt 0 ]; then
  # Replica counts come from the objects already fetched (no per-object refetch).
  while IFS=$'\t' read -r ns kind name desired ready; do
    desired="$(to_int "$desired")"; ready="$(to_int "$ready")"
    marker="✅"
    if [ "$ready" -lt "$desired" ]; then
      marker="❌"
      emit_json "ERROR" "apps.rabbitmq" "$kind" "$ns/$name unavailable ($ready/$desired)" ""
    fi
    echo "- **$ns/$name ($kind):** $marker $ready/$desired ready" >> "$REPORT"
  done < <(jq -r '.items[] | [.metadata.namespace, .kind, .metadata.name, (.spec.replicas // 1), (.status.readyReplicas // 0)] | @tsv' <<<"$rabbit_objs")
else
  echo "- (no RabbitMQ discovered)" >> "$REPORT"
fi
echo "" >> "$REPORT"
# ----------------------------- App Discovery: MinIO -----------------------
emit_md_h3 "MinIO"
# `grep -c ... || echo 0` produced "0<newline>0" when nothing matched (grep
# prints 0 and exits 1), which is not a valid number; test presence instead.
minio_tenants_crd=0
if grep -q 'tenants.minio.min.io' "$ART/app_crds.txt" 2>/dev/null; then minio_tenants_crd=1; fi
if (( minio_tenants_crd > 0 )); then
  "${KUBECTL[@]}" get tenants.minio.min.io "${ns_arg[@]}" -o json > "$ART/minio_tenants.json" 2>/dev/null || true
  if [[ -s "$ART/minio_tenants.json" ]]; then
    # Readiness: an "Available" condition wins when present; otherwise fall
    # back to status.healthStatus, then status.currentState.
    # NOTE(review): healthStatus=="green" / currentState=="Initialized" as the
    # healthy values are assumed from the MinIO Operator Tenant CRD — confirm
    # against the operator version in use.
    # "-" placeholders keep tab-separated fields from collapsing when empty.
    while IFS=$'\t' read -r ns name ready health state; do
      marker="✅"
      if [[ "$ready" != "True" ]]; then
        marker="❌"
        emit_json "ERROR" "apps.minio" "tenant" "$ns/$name not Ready (health=$health, state=$state)" ""
      fi
      echo "- **$ns/$name (Tenant):** $marker Ready=$ready (health=$health, state=$state)" >> "$REPORT"
    done < <(jq -r '
      .items[]?
      | ([.status.conditions[]? | select(.type=="Available") | .status] | first) as $cond
      | (.status.healthStatus // "") as $health
      | (.status.currentState // "") as $state
      | (if $cond != null then $cond
         elif $health != "" then (if $health == "green" then "True" else "False" end)
         elif $state != "" then (if $state == "Initialized" then "True" else "False" end)
         else "Unknown" end) as $ready
      | [.metadata.namespace, .metadata.name, $ready,
         (if $health != "" then $health else "-" end),
         (if $state != "" then $state else "-" end)]
      | @tsv' "$ART/minio_tenants.json" 2>/dev/null || true)
  fi
fi
# Fallback: Deploy/STS named/labeled minio. A comma in a label selector means
# AND, so each label is queried on its own and the results merged.
minio_objs=$(
  for minio_sel in app=minio app.kubernetes.io/name=minio; do
    "${KUBECTL[@]}" get deploy,sts "${ns_arg[@]}" -l "$minio_sel" -o json 2>/dev/null || true
  done | jq -s '{items: ([.[].items[]?] | unique_by([.metadata.namespace, .kind, .metadata.name]))}' 2>/dev/null || true
)
minio_count="$(to_int "$(jq -r '.items | length' <<<"$minio_objs" 2>/dev/null || true)")"
if [ "$minio_count" -eq 0 ]; then
  minio_objs=$("${KUBECTL[@]}" get deploy,sts "${ns_arg[@]}" -o json 2>/dev/null | jq '{items:[.items[]|select(.metadata.name|test("minio"))]}' 2>/dev/null || true)
  minio_count="$(to_int "$(jq -r '.items | length' <<<"$minio_objs" 2>/dev/null || true)")"
fi
if [ "$minio_count" -gt 0 ]; then
  # Replica counts and claim templates come from the objects already fetched.
  while IFS=$'\t' read -r ns kind name desired ready claims; do
    desired="$(to_int "$desired")"; ready="$(to_int "$ready")"
    marker="✅"
    if [ "$ready" -lt "$desired" ]; then
      marker="❌"
      emit_json "ERROR" "apps.minio" "$kind" "$ns/$name unavailable ($ready/$desired)" ""
    fi
    echo "- **$ns/$name ($kind):** $marker $ready/$desired ready" >> "$REPORT"
    # List volumeClaimTemplates (names only; binding state is not checked here).
    if [[ "$claims" != "-" ]]; then
      while IFS= read -r cn; do
        [[ -n "$cn" ]] || continue
        echo " - PVC template: $cn" >> "$REPORT"
      done <<<"${claims//,/$'\n'}"
    fi
  done < <(jq -r '
    .items[]
    | [.metadata.namespace, .kind, .metadata.name,
       (.spec.replicas // 1), (.status.readyReplicas // 0),
       (([.spec.volumeClaimTemplates[]?.metadata.name] | join(",")) as $c | if $c == "" then "-" else $c end)]
    | @tsv' <<<"$minio_objs")
else
  echo "- (no MinIO discovered)" >> "$REPORT"
fi
echo "" >> "$REPORT"
# ----------------------------- Events Snapshot ---------------------------
# warning_event_rows EVENTS_JSON
# Prints one Markdown table row per event that is a Warning or whose reason
# contains BackOff/Failed/Error. Rows are built inside jq so that:
#   - empty fields (e.g. the namespace of a Node event) keep their column;
#     a tab-split `read` collapses consecutive tabs and shifts the columns
#   - a null reason/message no longer aborts the filter
#   - "|" and line breaks inside a value cannot break the table
warning_event_rows() {
  jq -r '
    def cell: (. // "") | tostring | gsub("[\\n\\r\\t]+"; " ") | gsub("\\|"; "\\|");
    .items[]?
    | select(.type=="Warning" or ((.reason // "") | test("BackOff|Failed|Error")))
    | "| \((.lastTimestamp // .eventTime) | cell) | \(.involvedObject.namespace | cell) | \(.involvedObject.kind | cell) | \(.involvedObject.name | cell) | \(.reason | cell) | \(.message | cell) |"
  ' "$1"
}
emit_md_h2 "Recent Warning/Error Events (top 30)"
# events.json is collected best-effort and sorted by lastTimestamp, so the
# last 30 rows are the most recent ones.
event_rows=$(warning_event_rows "$ART/events.json" 2>/dev/null | tail -n 30 || true)
if [[ -n "$event_rows" ]]; then
  printf '\n%s\n' "| Time | NS | Kind | Name | Reason | Message |" >> "$REPORT"
  echo "|---|---|---|---|---|---|" >> "$REPORT"
  printf '%s\n' "$event_rows" >> "$REPORT"
else
  echo "- No recent warnings/errors." >> "$REPORT"
fi
# ----------------------------- Rollup & Exit -----------------------------
emit_md_h2 "Summary & Exit Code"
# Overall level = most severe level recorded in the JSONL stream (ERROR
# outranks WARN); OK when neither is present.
LEVEL="OK"
for severity in ERROR WARN; do
  if grep -q "\"level\":\"${severity}\"" "$JSONL" 2>/dev/null; then
    LEVEL="$severity"
    break
  fi
done
printf '%s\n' "- **Overall:** ${LEVEL}" >> "$REPORT"
# Collect the per-line JSON records into one array; fall back to an empty
# array if jq cannot read them.
if ! jq -s '.' "$JSONL" > "$OUT_DIR/summary.json" 2>/dev/null; then
  echo "[]">"$OUT_DIR/summary.json"
fi
printf '\n'
printf '%s\n' "Report written to: $REPORT"
printf '%s\n' "Artifacts in: $ART"
# Exit status mirrors the overall level: 2 = ERROR, 1 = WARN, 0 = OK.
if [[ "$LEVEL" == "ERROR" ]]; then exit 2; fi
if [[ "$LEVEL" == "WARN" ]]; then exit 1; fi
exit 0