#!/usr/bin/env bash
set -euo pipefail
# k8s-health.sh — single-context Kubernetes health report (Markdown)
# Focus: core cluster + Calico + Istio + common apps (Redis, RabbitMQ, MinIO)
# Deps: kubectl, jq, awk, sed, grep, base64, openssl; (optional) gdate
# ----------------------------- CLI & Globals -----------------------------
# Output directory; empty means a timestamped default is generated later.
OUT_DIR=""
# Per-request kubectl timeout (passed to --request-timeout, e.g. "8s").
REQUEST_TIMEOUT="${REQUEST_TIMEOUT:-8s}"
DNS_TEST="${DNS_TEST:-true}" # set false to skip ephemeral DNS test
# TLS certificate expiry thresholds, in days: WARN at <=14, ERROR at <=7.
TLS_WARN_DAYS="${TLS_WARN_DAYS:-14}"
TLS_ERR_DAYS="${TLS_ERR_DAYS:-7}"
# usage: print CLI help to stdout (the heredoc expands $0 only).
usage() {
cat <<EOF
Usage: $0 [--context NAME] [--namespace NS] [--out DIR] [--no-dns]
Options:
--context NAME Use a specific kubeconfig context (default: current)
--namespace NS Limit app checks to a namespace (default: all)
--out DIR Output directory (default: ./k8s-health-<timestamp>)
--no-dns Skip ephemeral DNS resolution test
Env overrides:
REQUEST_TIMEOUT (default: 8s) | TLS_WARN_DAYS (14) | TLS_ERR_DAYS (7)
EOF
}
CTX_ARGS=()   # extra kubectl args selecting the kubeconfig context
APP_NS=""     # namespace scope for app checks ("" = all namespaces)
# require_opt_arg OPTION REMAINING_ARGC
# Fail fast with a clear message when an option that takes a value is the
# last token on the command line. Without this, the unconditional `shift`
# left $1 unset and `set -u` aborted with an opaque "unbound variable" error.
require_opt_arg() {
  if [[ "$2" -lt 2 ]]; then
    echo "Option $1 requires an argument" >&2
    usage
    exit 3
  fi
}
while [[ $# -gt 0 ]]; do
case "$1" in
--context) require_opt_arg "$1" $#; shift; CTX_ARGS+=(--context "$1");;
--namespace) require_opt_arg "$1" $#; shift; APP_NS="$1";;
--out) require_opt_arg "$1" $#; shift; OUT_DIR="$1";;
--no-dns) DNS_TEST=false;;
-h|--help) usage; exit 0;;
*) echo "Unknown arg: $1" >&2; usage; exit 3;;
esac
shift
done
# ts_now: current time as an ISO-8601 timestamp (used in JSONL records).
ts_now() { date -Is; }
# to_ts: best-effort conversion of an RFC/openssl-style date string to epoch
# seconds. Prefers GNU `gdate` when installed (macOS/coreutils), otherwise
# falls back to `date -d`; prints 0 when parsing fails (errors silenced).
to_ts() {
  local when="$1" date_bin=date
  command -v gdate >/dev/null 2>&1 && date_bin=gdate
  "$date_bin" -d "$when" +%s 2>/dev/null || echo 0
}
# days_until: whole days from now until the given date string (negative when
# the date is in the past; integer division truncates toward zero).
days_until() {
  local target="$1" target_ts now_ts
  target_ts=$(to_ts "$target")
  now_ts=$(date +%s)
  echo $(( (target_ts - now_ts) / 86400 ))
}
# to_int: coerce an arbitrary (possibly multi-line / non-numeric) string to a
# single integer token, defaulting to 0 when nothing numeric survives.
# Note: negative values are preserved ('-' is kept by the filter).
to_int() {
  local tok="$1"
  # Flatten whitespace: newlines/tabs become spaces, then keep the first token.
  tok="${tok//$'\n'/ }"
  tok="${tok//$'\t'/ }"
  tok="${tok%% *}"
  # Drop every character except digits and '-'.
  tok="$(printf '%s' "$tok" | sed -E 's/[^0-9-]//g')"
  # Anything that is still not a plain (optionally signed) integer becomes 0.
  if [[ "$tok" =~ ^-?[0-9]+$ ]]; then
    printf '%s' "$tok"
  else
    printf '%s' 0
  fi
}
# Hard dependency checks: everything downstream needs kubectl and jq.
if ! command -v kubectl >/dev/null; then echo "kubectl not found" >&2; exit 3; fi
if ! command -v jq >/dev/null; then echo "jq not found" >&2; exit 3; fi
# Default output directory: timestamped folder under the current directory.
if [[ -z "$OUT_DIR" ]]; then
OUT_DIR="./k8s-health-$(date +%Y%m%d-%H%M%S)"
fi
# Layout: $ART holds raw kubectl/jq artifacts; report.md is the human-readable
# summary; summary.jsonl accumulates one machine-readable finding per line.
ART="$OUT_DIR/artifacts"
mkdir -p "$ART"
REPORT="$OUT_DIR/report.md"
JSONL="$OUT_DIR/summary.jsonl"
touch "$REPORT" "$JSONL"
# ----------------------------- Emit Helpers ------------------------------
# emit_json LEVEL AREA CHECK MESSAGE HINT
# Append one finding to $JSONL. MESSAGE and HINT are JSON-encoded via
# `jq -Rs` so embedded newlines/quotes survive; HINT defaults to "".
emit_json() {
# emit_json LEVEL AREA CHECK MESSAGE HINT
printf '{"ts":"%s","level":"%s","area":"%s","check":"%s","message":%s,"hint":%s}\n' \
"$(ts_now)" "$1" "$2" "$3" "$(jq -Rs . <<<"$4")" "$(jq -Rs . <<<"${5:-}")" >> "$JSONL"
}
# Markdown helpers appending to $REPORT. NOTE: these use `echo -e`, so
# backslash escapes (e.g. "\n") inside the argument ARE expanded — callers
# rely on that to embed blank lines and line breaks.
emit_md_h1() { echo -e "# $1\n" >> "$REPORT"; }
emit_md_h2() { echo -e "## $1\n" >> "$REPORT"; }
emit_md_h3() { echo -e "### $1\n" >> "$REPORT"; }
emit_md_kv() { echo "- **$1:** $2" >> "$REPORT"; }
emit_md_code() { echo -e "\n\`\`\`\n$1\n\`\`\`\n" >> "$REPORT"; }
# ----------------------------- Prefetch Cache ----------------------------
# Snapshot cluster state into $ART once; later sections parse these local
# files instead of re-querying the API server.
echo "Collecting cluster state..."
set +e
KUBECTL=(kubectl "${CTX_ARGS[@]}")
# Use a lightweight API call instead of 'kubectl version' which can fail for reasons unrelated to reachability.
if ! "${KUBECTL[@]}" --request-timeout="$REQUEST_TIMEOUT" get --raw='/version' >/dev/null 2>&1; then
# Fallback to a simple resource list in case /version endpoint is blocked by a proxy.
if ! "${KUBECTL[@]}" --request-timeout="$REQUEST_TIMEOUT" get nodes -o name >/dev/null 2>&1; then
echo "Cannot reach cluster with kubectl" >&2; exit 3
fi
fi
set -e
# '|| true' is required: 'kubectl version' can exit non-zero for reasons
# unrelated to reachability (as noted above), and under 'set -e' that would
# abort the whole run. The report header falls back to "unknown" if the
# resulting file is empty or unparsable.
"${KUBECTL[@]}" version -o json > "$ART/version.json" 2>/dev/null || true
"${KUBECTL[@]}" api-resources > "$ART/apiresources.txt" || true
# Core inventories — hard failure here is intentional (nothing below works
# without nodes/pods/services).
"${KUBECTL[@]}" get nodes -o json > "$ART/nodes.json"
if [[ -n "$APP_NS" ]]; then
ns_arg=(-n "$APP_NS")
else
ns_arg=(--all-namespaces)
fi
"${KUBECTL[@]}" get pods "${ns_arg[@]}" -o json > "$ART/pods.json"
"${KUBECTL[@]}" get ns -o json > "$ART/namespaces.json"
"${KUBECTL[@]}" get events --all-namespaces --sort-by=.lastTimestamp -o json --request-timeout="$REQUEST_TIMEOUT" > "$ART/events.json" || true
"${KUBECTL[@]}" get svc --all-namespaces -o json > "$ART/svc.json"
"${KUBECTL[@]}" get endpoints --all-namespaces -o json > "$ART/endpoints.json"
# Optional / cluster-dependent resources: best effort.
"${KUBECTL[@]}" get endpointslices.discovery.k8s.io --all-namespaces -o json > "$ART/epslices.json" 2>/dev/null || true
"${KUBECTL[@]}" get deploy,ds,sts,job,cronjob,hpa,pdb --all-namespaces -o json > "$ART/workloads.json" 2>/dev/null || true
"${KUBECTL[@]}" get pvc --all-namespaces -o json > "$ART/pvc.json" 2>/dev/null || true
"${KUBECTL[@]}" get pv -o json > "$ART/pv.json" 2>/dev/null || true
"${KUBECTL[@]}" get storageclasses.storage.k8s.io -o json > "$ART/sc.json" 2>/dev/null || true
"${KUBECTL[@]}" get secrets --all-namespaces -o json > "$ART/secrets.json" 2>/dev/null || true
"${KUBECTL[@]}" get csidrivers.storage.k8s.io,csinodes.storage.k8s.io -o json > "$ART/csi.json" 2>/dev/null || true
# Istio + Calico artifacts (best effort)
"${KUBECTL[@]}" -n istio-system get deploy,ds,pods,svc -o wide > "$ART/istio_ls.txt" 2>/dev/null || true
"${KUBECTL[@]}" -n calico-system get deploy,ds,pods -o wide > "$ART/calico_ls.txt" 2>/dev/null || true
"${KUBECTL[@]}" get crd tenants.minio.min.io rabbitmqclusters.rabbitmq.com 2>/dev/null | sed '1d' > "$ART/app_crds.txt" || true
# Cache Istio Gateways (best effort)
"${KUBECTL[@]}" get gateway.networking.istio.io --all-namespaces -o json > "$ART/istio_gateways.json" 2>/dev/null || true
# ----------------------------- Report Header -----------------------------
# Server/client versions come from the cached version.json; fall back to
# "unknown" when the file is missing or unparsable.
cluster_server=$(
jq -r '.serverVersion.gitVersion + " (" + .serverVersion.platform + ")"' "$ART/version.json" 2>/dev/null \
|| echo "unknown")
client_ver=$(
jq -r '.clientVersion.gitVersion' "$ART/version.json" 2>/dev/null \
|| echo "unknown")
ctx_name=$("${KUBECTL[@]}" config current-context 2>/dev/null || echo "current")
emit_md_h1 "Kubernetes Health Report — ${ctx_name}"
emit_md_kv "Generated" "$(ts_now)"
emit_md_kv "kubectl client" "$client_ver"
emit_md_kv "APIServer" "$cluster_server"
emit_md_kv "Namespace scope (apps)" "${APP_NS:-all}"
echo "" >> "$REPORT"
# ----------------------------- Versions & Skew ----------------------------
emit_md_h2 "Cluster Versions & Skew"
# Histogram of kubelet versions across the fleet (count per version).
node_versions=$(jq -r '.items[]?.status.nodeInfo.kubeletVersion' "$ART/nodes.json" | sort | uniq -c | sed 's/^/ /')
emit_md_code "Kubelet versions:\n${node_versions}"
# Minor versions: the API server plus the oldest/newest kubelet in the fleet.
server_minor=$(jq -r 'try (.serverVersion.minor|tonumber) catch 0' "$ART/version.json" 2>/dev/null || echo 0)
first_kubelet_minor=$(jq -r '[.items[]?.status.nodeInfo.kubeletVersion
| capture("v(?<maj>\\d+)\\.(?<min>\\d+)")
| .min
| tonumber]
| (min // 0)' "$ART/nodes.json" 2>/dev/null || echo 0)
last_kubelet_minor=$(jq -r '[.items[]?.status.nodeInfo.kubeletVersion
| capture("v(?<maj>\\d+)\\.(?<min>\\d+)")
| .min
| tonumber]
| (max // 0)' "$ART/nodes.json" 2>/dev/null || echo 0)
# Normalize to single integer tokens before any arithmetic.
sm="$(to_int "$server_minor")"
fm="$(to_int "$first_kubelet_minor")"
lm="$(to_int "$last_kubelet_minor")"
# Worst-case skew must consider BOTH the oldest and the newest kubelet:
# comparing only the max (as before) let server=28 with kubelets 25..28
# report "within supported range" despite a skew of 3.
abs_skew="$(awk -v lo="$fm" -v hi="$lm" -v s="$sm" \
'BEGIN{a=lo-s; if (a<0) a=-a; b=hi-s; if (b<0) b=-b; print (a>b?a:b)+0}' 2>/dev/null)"
abs_skew="$(to_int "$abs_skew")"
if [ "$abs_skew" -gt 1 ]; then
emit_json "ERROR" "version" "skew" "Kubelet/APIServer minor skew > 1 (server minor ${sm}, kubelet min/max ${fm}/${lm})" "Align versions per K8s skew policy."
echo "- **Version Skew:** ❌ kubelet/APIServer minor skew > 1 (server=${sm}, kubelet min/max=${fm}/${lm})" >> "$REPORT"
else
echo "- **Version Skew:** ✅ within supported range (server=${sm}, kubelet min/max=${fm}/${lm})" >> "$REPORT"
fi
echo "" >> "$REPORT"
# ----------------------------- API Server Health --------------------------
emit_md_h2 "API Server Ready/Liveness"
readyz="$ART/readyz.txt"
livez="$ART/livez.txt"
# Probe the aggregated health endpoints, capturing output and exit codes
# (set +e so a failed probe doesn't abort the script).
set +e
"${KUBECTL[@]}" get --raw='/readyz?verbose' >"$readyz" 2>&1
r_rc=$?
"${KUBECTL[@]}" get --raw='/livez?verbose' >"$livez" 2>&1
l_rc=$?
set -e
emit_md_code "readyz:\n$(cat "$readyz" 2>/dev/null || true)"
emit_md_code "livez:\n$(cat "$livez" 2>/dev/null || true)"
# Verbose /readyz lists one check per line; failing checks contain "fail".
# NOTE(review): if the probe itself failed, $readyz holds kubectl's error
# text instead, but that case is already covered by r_rc below.
fail_cnt=$(grep -c 'fail' "$readyz" 2>/dev/null || true)
fail_cnt="$(to_int "${fail_cnt:-0}")"
if [ "$r_rc" -ne 0 ] || [ "$fail_cnt" -gt 0 ]; then
emit_json "ERROR" "control-plane" "readyz" "APIServer readyz reports failures" "Check control-plane component health."
echo "- **APIServer readyz:** ❌ failures detected" >> "$REPORT"
else
echo "- **APIServer readyz:** ✅ ok" >> "$REPORT"
fi
if [[ $l_rc -ne 0 ]]; then
emit_json "ERROR" "control-plane" "livez" "APIServer livez not reachable" ""
echo "- **APIServer livez:** ❌ unreachable" >> "$REPORT"
else
echo "- **APIServer livez:** ✅ ok" >> "$REPORT"
fi
echo "" >> "$REPORT"
# ----------------------------- Nodes -------------------------------------
emit_md_h2 "Node Health"
nodes_json="$ART/nodes.json"
# Nodes whose Ready condition is anything other than "True".
not_ready=$(jq -r '.items[] | select([.status.conditions[]?|select(.type=="Ready")][0].status!="True") | .metadata.name' "$nodes_json")
if [[ -n "$not_ready" ]]; then
emit_json "ERROR" "nodes" "ready" "Some nodes NotReady" "$not_ready"
echo "- **Ready:** ❌ NotReady nodes present:" >> "$REPORT"; echo "$not_ready" | sed 's/^/ - /' >> "$REPORT"
else
echo "- **Ready:** ✅ all nodes Ready" >> "$REPORT"
fi
# Disk/Memory/PID pressure conditions that are currently "True" (warning only).
pressures=$(jq -r '
.items[] as $n
| ($n.status.conditions[] | select((.type=="DiskPressure" or .type=="MemoryPressure" or .type=="PIDPressure") and .status=="True")) as $p
| "\($n.metadata.name)\t\($p.type)\t\($p.message)"' "$nodes_json")
if [[ -n "$pressures" ]]; then
emit_json "WARN" "nodes" "pressure" "Node pressure conditions detected" "$pressures"
echo "- **Pressure:** ⚠️" >> "$REPORT"; echo "$pressures" | sed 's/^/ - /' >> "$REPORT"
else
echo "- **Pressure:** ✅ none" >> "$REPORT"
fi
# Cordoned nodes (spec.unschedulable=true) — warning only.
unsched=$(jq -r '.items[] | select(.spec.unschedulable==true) | .metadata.name' "$nodes_json")
if [[ -n "$unsched" ]]; then
emit_json "WARN" "nodes" "unschedulable" "Unschedulable nodes present" "$unsched"
echo "- **Unschedulable:** ⚠️ $(echo "$unsched" | tr '\n' ' ')" >> "$REPORT"
else
echo "- **Unschedulable:** ✅ none" >> "$REPORT"
fi
echo "" >> "$REPORT"
# ----------------------------- Networking (DNS + Calico) ------------------
emit_md_h2 "Networking & DNS"
# CoreDNS deployment status. A comma-joined label selector is an AND; almost
# no install carries BOTH k8s-app=kube-dns AND app.kubernetes.io/name=coredns,
# so the combined selector matched nothing — try each label in turn instead.
core_dns=$("${KUBECTL[@]}" -n kube-system get deploy -l k8s-app=kube-dns -o json 2>/dev/null || true)
if [[ -z "$core_dns" || "$(jq '.items|length' <<<"$core_dns" 2>/dev/null || echo 0)" -eq 0 ]]; then
core_dns=$("${KUBECTL[@]}" -n kube-system get deploy -l app.kubernetes.io/name=coredns -o json 2>/dev/null || true)
fi
dn_unavail=$(jq -r '([.items[]?|.status.unavailableReplicas // 0] | add) // 0' <<<"$core_dns" 2>/dev/null || echo 0)
dn_unavail="$(to_int "$dn_unavail")"
if [ "$dn_unavail" -gt 0 ]; then
emit_json "ERROR" "networking" "coredns" "CoreDNS has unavailable replicas" ""
echo "- **CoreDNS:** ❌ unavailable replicas: $dn_unavail" >> "$REPORT"
else
echo "- **CoreDNS:** ✅ deployment healthy or not found" >> "$REPORT"
fi
# Optional ephemeral DNS nslookup test
if [[ "$DNS_TEST" == "true" ]]; then
echo "- **DNS test:** running ephemeral busybox nslookup ..." >> "$REPORT"
set +e
# --attach --rm: block until the pod finishes and propagate nslookup's exit
# code. The previous fire-and-forget 'kubectl run' returned as soon as the
# pod object was *created*, so run_rc never reflected the lookup result and
# the check could not fail on broken DNS. (Also, --timeout is not a
# 'kubectl run' flag; --pod-running-timeout is.)
"${KUBECTL[@]}" run "dnscheck-$$" --image=busybox:1.36 --restart=Never \
--image-pull-policy=IfNotPresent --attach --rm --quiet --pod-running-timeout=60s \
--command -- /bin/sh -c 'nslookup kubernetes.default.svc.cluster.local >/dev/null' \
1>/dev/null 2>&1
run_rc=$?
# Best-effort cleanup in case --rm could not delete (e.g. attach failure).
"${KUBECTL[@]}" delete pod "dnscheck-$$" --now --wait=false --ignore-not-found 1>/dev/null 2>&1
set -e
if [[ $run_rc -ne 0 ]]; then
emit_json "ERROR" "networking" "dns" "In-pod DNS resolution failed" "Check CoreDNS, network policies, kube-dns Service."
echo " ❌ DNS resolution failed" >> "$REPORT"
else
echo " ✅ DNS resolution ok" >> "$REPORT"
fi
else
echo "- **DNS test:** (skipped)" >> "$REPORT"
fi
echo "" >> "$REPORT"
# Calico basic health
emit_md_h3 "Calico"
# calico-node DaemonSet: every scheduled pod should be Ready.
calico_ds=$("${KUBECTL[@]}" -n calico-system get ds calico-node -o json 2>/dev/null || true)
if [[ -n "$calico_ds" ]]; then
desire=$(jq -r '.status.desiredNumberScheduled // 0' <<<"$calico_ds")
ready=$(jq -r '.status.numberReady // 0' <<<"$calico_ds")
desire="$(to_int "$desire")"; ready="$(to_int "$ready")"
if [ "$ready" -lt "$desire" ]; then
emit_json "ERROR" "calico" "daemonset" "calico-node not fully Ready ($ready/$desire)" "Check calico-node pods and CNI errors."
echo "- **calico-node:** ❌ $ready/$desire Ready" >> "$REPORT"
else
echo "- **calico-node:** ✅ $ready/$desire Ready" >> "$REPORT"
fi
else
echo "- **calico-node:** (DaemonSet not found)" >> "$REPORT"
fi
# Typha (optional scale component): unavailable replicas are a warning only.
typha=$("${KUBECTL[@]}" -n calico-system get deploy -l k8s-app=calico-typha -o json 2>/dev/null || true)
if [[ -n "$typha" ]]; then
unavail=$(jq -r '[.items[]?|.status.unavailableReplicas // 0] | add' <<<"$typha")
unavail="$(to_int "$unavail")"
if [ "$unavail" -gt 0 ]; then
emit_json "WARN" "calico" "typha" "Calico Typha unavailable replicas: $unavail" ""
echo "- **calico-typha:** ⚠️ unavailable: $unavail" >> "$REPORT"
else
echo "- **calico-typha:** ✅ healthy" >> "$REPORT"
fi
fi
echo "" >> "$REPORT"
# ----------------------------- Storage & CSI ------------------------------
emit_md_h2 "Storage"
sc_json="$ART/sc.json"
if [[ -s "$sc_json" ]]; then
# Exactly one StorageClass should carry the default-class annotation.
defaults=$(jq -r '.items[]|select(.metadata.annotations["storageclass.kubernetes.io/is-default-class"]=="true")|.metadata.name' "$sc_json")
if [[ -z "$defaults" ]]; then
emit_json "WARN" "storage" "default-sc" "No default StorageClass set" "Annotate one SC as default."
echo "- **Default StorageClass:** ⚠️ none set" >> "$REPORT"
else
echo "- **Default StorageClass:** ✅ $defaults" >> "$REPORT"
fi
fi
# PVCs stuck in Pending (no bound PV / provisioner problem).
pvc_pending=$(jq -r '.items[]|select(.status.phase=="Pending")|.metadata.namespace + "/" + .metadata.name' "$ART/pvc.json" 2>/dev/null || true)
if [[ -n "$pvc_pending" ]]; then
emit_json "ERROR" "storage" "pvc" "Pending PVCs detected" "$pvc_pending"
# Plain 'echo' (without -e) does not expand '\n' — the old single-echo form
# wrote a literal "\n" into the report. Emit header and list separately.
echo "- **PVCs:** ❌ Pending:" >> "$REPORT"
echo "$pvc_pending" | sed 's/^/ - /' >> "$REPORT"
else
echo "- **PVCs:** ✅ none Pending" >> "$REPORT"
fi
echo "" >> "$REPORT"
# ----------------------------- Workloads ---------------------------------
emit_md_h2 "Workloads"
# Pods stuck in Pending for more than 5 minutes, with the scheduler's reason.
pending=$(jq -r '
.items[]
| select(.status.phase=="Pending")
| select((now - (.metadata.creationTimestamp|fromdate)) > 300)
| .metadata.namespace + "/" + .metadata.name + " — " + ((.status.conditions // [] | map(select(.type=="PodScheduled"))[0].reason) // "Pending")
' "$ART/pods.json")
if [[ -n "$pending" ]]; then
emit_json "ERROR" "workloads" "pending" "Pending pods >5m" "$pending"
echo "- **Pending Pods (>5m):** ❌" >> "$REPORT"; echo "$pending" | sed 's/^/ - /' >> "$REPORT"
else
echo "- **Pending Pods (>5m):** ✅ none" >> "$REPORT"
fi
# Containers restarting repeatedly (>=3) — likely crash-looping.
crash=$(jq -r '
.items[] as $p
| ($p.status.containerStatuses // [])[]
| select((.restartCount // 0) >= 3)
| "\($p.metadata.namespace)/\($p.metadata.name) — \(.name) restarts=\(.restartCount) lastState=\(.lastState|tojson)"
' "$ART/pods.json")
if [[ -n "$crash" ]]; then
emit_json "WARN" "workloads" "restarts" "Containers with >=3 restarts" "$crash"
echo "- **High Restarts (>=3):** ⚠️" >> "$REPORT"; echo "$crash" | sed 's/^/ - /' >> "$REPORT"
else
echo "- **High Restarts (>=3):** ✅ none" >> "$REPORT"
fi
# Deployments with unavailable replicas
unavail=$(jq -r '
.items[]?|select(.kind=="Deployment")|select((.status.unavailableReplicas // 0) > 0)
| .metadata.namespace + "/" + .metadata.name + " — unavailable=" + ((.status.unavailableReplicas|tostring))
' "$ART/workloads.json" 2>/dev/null || true)
if [[ -n "$unavail" ]]; then
emit_json "ERROR" "workloads" "deploy-unavailable" "Deployments with unavailable replicas" "$unavail"
# Plain 'echo' does not expand '\n' — the old single-echo form wrote a
# literal "\n" into the report. Emit header and list separately, matching
# the style of the other sections.
echo "- **Deployments:** ❌ unavailable replicas:" >> "$REPORT"
echo "$unavail" | sed 's/^/ - /' >> "$REPORT"
else
echo "- **Deployments:** ✅ all available" >> "$REPORT"
fi
echo "" >> "$REPORT"
# ----------------------------- Services & Endpoints -----------------------
emit_md_h2 "Services & Endpoints"
# Flag selector-backed Services whose Endpoints object has no subsets.
# Single jq pass joining svc.json with endpoints.json. (The previous version
# also computed an unused, broken 'svc_0ep' value and then forked one jq per
# Service — O(n^2) on large clusters.) Services without a selector (headless
# externally-managed, ExternalName) are skipped, as before.
svc_zero=$(jq -r -n \
--slurpfile svcs "$ART/svc.json" \
--slurpfile eps "$ART/endpoints.json" '
($eps[0].items // []
 | map({key: (.metadata.namespace + "/" + .metadata.name),
        value: ((.subsets // []) | length)})
 | from_entries) as $subsets
| $svcs[0].items[]
| select(.spec.selector != null)
| (.metadata.namespace + "/" + .metadata.name) as $key
| select(($subsets[$key] // 0) == 0)
| $key
' 2>/dev/null || true)
if [[ -n "$svc_zero" ]]; then
emit_json "ERROR" "networking" "svc-no-endpoints" "Services with zero Endpoints" "$svc_zero"
echo "- **Services with 0 endpoints:** ❌" >> "$REPORT"; echo "$svc_zero" | sed 's/^/ - /' >> "$REPORT"
else
echo "- **Services with 0 endpoints:** ✅ none" >> "$REPORT"
fi
echo "" >> "$REPORT"
# ----------------------------- TLS Secret Expiry -------------------------
emit_md_h2 "TLS Certificates (Secrets)"
# Build a set of TLS secrets actually referenced by Istio Gateways (credentialName)
ISTIO_GW_SECRETS_FILE="$ART/istio_gateway_tls_secrets.tsv"
: > "$ISTIO_GW_SECRETS_FILE"
if [[ -s "$ART/istio_gateways.json" ]]; then
jq -r '
.items[]
| .metadata.namespace as $ns
| (.spec.servers // [])
| map(select(.tls.credentialName != null) | [$ns, .tls.credentialName])
| .[]
| @tsv
' "$ART/istio_gateways.json" 2>/dev/null | sort -u > "$ISTIO_GW_SECRETS_FILE" || true
fi
# Every kubernetes.io/tls Secret as "<ns>\t<name>\t<base64 tls.crt>".
tls_list=$(jq -r '.items[]|select(.type=="kubernetes.io/tls")|.metadata.namespace + "\t" + .metadata.name + "\t" + (.data["tls.crt"]//"")' "$ART/secrets.json" 2>/dev/null || true)
if [[ -n "$tls_list" ]]; then
exp_rows_inuse=""
exp_rows_unused=""
while IFS=$'\t' read -r ns name b64; do
[[ -z "$b64" ]] && continue
crt="$ART/${ns}_${name}.crt"
echo "$b64" | base64 -d > "$crt" 2>/dev/null || continue
# notAfter of the (leaf) certificate, converted to "days from now".
end=$(openssl x509 -enddate -noout -in "$crt" 2>/dev/null | cut -d= -f2)
[[ -z "$end" ]] && continue
days=$(days_until "$end"); days="$(to_int "$days")"
# Is this secret referenced by any Istio Gateway in the same namespace?
# Exact-line fixed-string match with a literal TAB: 'grep -P' is a GNU-only
# extension, so the old check silently never matched (in_use always "no")
# on BSD/BusyBox grep; -x/-F are POSIX.
in_use="no"
if grep -qxF "${ns}"$'\t'"${name}" "$ISTIO_GW_SECRETS_FILE" 2>/dev/null; then
in_use="yes"
fi
if [ "$in_use" = "yes" ]; then
# Severity: only for IN-USE secrets
level="INFO"
if [ "$days" -le "$TLS_WARN_DAYS" ]; then level="WARN"; fi
if [ "$days" -le "$TLS_ERR_DAYS" ]; then level="ERROR"; fi
exp_rows_inuse+="$ns/$name — expires in ${days}d (${level}) [IN-USE]"$'\n'
if [ "$level" = "ERROR" ]; then
emit_json "ERROR" "security" "tls-expiry" "$ns/$name expiring in ${days}d [IN-USE]" "Referenced by an Istio Gateway; renew certificate."
elif [ "$level" = "WARN" ]; then
emit_json "WARN" "security" "tls-expiry" "$ns/$name expiring in ${days}d [IN-USE]" "Plan renewal."
fi
else
# UNUSED secrets: do NOT alert; just list under an informational subheader
exp_rows_unused+="$ns/$name — expires in ${days}d [unused]"$'\n'
fi
done <<< "$tls_list"
# Print IN-USE expiries (with levels)
if [[ -n "$exp_rows_inuse" ]]; then
echo "- **TLS expiries (in-use secrets):**" >> "$REPORT"
echo "$exp_rows_inuse" | sed 's/^/ - /' >> "$REPORT"
else
echo "- **TLS expiries (in-use secrets):** none" >> "$REPORT"
fi
# Print UNUSED secrets as information only
if [[ -n "$exp_rows_unused" ]]; then
emit_md_h3 "Unused Secrets"
echo "$exp_rows_unused" | sed 's/^/ - /' >> "$REPORT"
else
emit_md_h3 "Unused Secrets"
echo " - none" >> "$REPORT"
fi
else
echo "- **TLS expiries (in-use secrets):** (no kubernetes.io/tls secrets found)" >> "$REPORT"
emit_md_h3 "Unused Secrets"
echo " - none" >> "$REPORT"
fi
echo "" >> "$REPORT"
# ----------------------------- Istio Checks ------------------------------
emit_md_h2 "Istio"
# istiod deployment (control plane)
istiod=$("${KUBECTL[@]}" -n istio-system get deploy istiod -o json 2>/dev/null || true)
if [[ -n "$istiod" ]]; then
un=$(jq -r '.status.unavailableReplicas // 0' <<<"$istiod")
un="$(to_int "$un")"
if [ "$un" -gt 0 ]; then
emit_json "ERROR" "istio" "istiod" "istiod has unavailable replicas: $un" ""
echo "- **istiod:** ❌ unavailable=$un" >> "$REPORT"
else
echo "- **istiod:** ✅ healthy" >> "$REPORT"
fi
else
echo "- **istiod:** (not found)" >> "$REPORT"
fi
# ingress gateway (classic)
igw=$("${KUBECTL[@]}" -n istio-system get deploy -l app=istio-ingressgateway -o json 2>/dev/null || true)
if [[ -n "$igw" ]]; then
un=$(jq -r '[.items[]?|.status.unavailableReplicas // 0] | add' <<<"$igw")
un="$(to_int "$un")"
if [ "$un" -gt 0 ]; then
emit_json "WARN" "istio" "ingressgateway" "IngressGateway unavailable: $un" ""
echo "- **IngressGateway:** ⚠️ unavailable=$un" >> "$REPORT"
else
echo "- **IngressGateway:** ✅ healthy" >> "$REPORT"
fi
fi
# namespaces with auto-injection enabled but pods missing sidecar
emit_md_h3 "Sidecar Injection Coverage"
# Detect namespaces with auto-injection enabled either by legacy label or revision label
inj_ns=$(jq -r '.items[]
| select(.metadata.labels["istio-injection"]=="enabled" or (.metadata.labels["istio.io/rev"] != null))
| .metadata.name' "$ART/namespaces.json")
missing_list=""
if [[ -n "$inj_ns" ]]; then
while IFS= read -r ns; do
# For each Running/Pending pod: name, whether an istio-proxy container
# exists, and the pod-level injection override annotation. istio-proxy
# lives in .spec.initContainers when native sidecars are enabled, so
# search both lists (containers-only produced false "missing" warnings).
pods=$(jq -r --arg ns "$ns" '
.items[]
| select(.metadata.namespace==$ns and (.status.phase=="Running" or .status.phase=="Pending"))
| .metadata.name as $n
| (((.spec.containers // []) + (.spec.initContainers // [])) | any(.name=="istio-proxy")) as $has
| (.metadata.annotations["sidecar.istio.io/inject"] // "") as $inject
| [$n, ($has|tostring), $inject] | @tsv
' "$ART/pods.json")
while IFS=$'\t' read -r pn has inject; do
[[ -z "$pn" ]] && continue
# If a pod explicitly disables injection, don't flag it as missing.
if [[ "$has" != "true" && "$inject" != "false" ]]; then
missing_list+="$ns/$pn"$'\n'
fi
done <<< "$pods"
done <<< "$inj_ns"
fi
if [[ -n "$missing_list" ]]; then
emit_json "WARN" "istio" "sidecar-missing" "Pods missing istio-proxy in injection-enabled namespaces" "$missing_list"
echo "- **Missing sidecars (in injection-enabled ns):** ⚠️" >> "$REPORT"; echo "$missing_list" | sed 's/^/ - /' >> "$REPORT"
else
echo "- **Missing sidecars:** ✅ none (or no injection-enabled namespaces)" >> "$REPORT"
fi
echo "" >> "$REPORT"
# ----------------------------- App Discovery: Redis -----------------------
emit_md_h2 "App Health — Redis / RabbitMQ / MinIO"
emit_md_h3 "Redis"
# detect by common labels & names
# NOTE(review): a comma-joined selector is an AND — this only matches objects
# carrying BOTH labels; the name-based fallback below covers the rest.
redis_objs=$("${KUBECTL[@]}" get deploy,sts --all-namespaces -l app=redis,app.kubernetes.io/name=redis -o json 2>/dev/null || true)
if [[ -z "$redis_objs" || "$(jq '.items|length' <<<"$redis_objs")" -eq 0 ]]; then
# fallback: name contains redis
redis_objs=$("${KUBECTL[@]}" get deploy,sts --all-namespaces -o json 2>/dev/null | jq '{items:[.items[]|select(.metadata.name|test("redis"))]}' || true)
fi
if [[ "$(jq '.items|length' <<<"$redis_objs" 2>/dev/null)" -gt 0 ]]; then
# One line per discovered object: "<ns> <Kind> <name>"; re-fetch each for
# live replica status.
while read -r line; do
ns=$(cut -d' ' -f1 <<<"$line"); kind=$(cut -d' ' -f2 <<<"$line"); name=$(cut -d' ' -f3- <<<"$line")
obj=$("${KUBECTL[@]}" -n "$ns" get "$kind" "$name" -o json)
desired=$(jq -r '.spec.replicas // 1' <<<"$obj"); ready=$(jq -r '.status.readyReplicas // 0' <<<"$obj")
desired="$(to_int "$desired")"; ready="$(to_int "$ready")"
status="ok"; marker="✅"
if [ "$ready" -lt "$desired" ]; then status="unavailable"; marker="❌"; emit_json "ERROR" "apps.redis" "$kind" "$ns/$name unavailable ($ready/$desired)" "Check pod logs and PVCs."; fi
echo "- **$ns/$name ($kind):** $marker $ready/$desired ready" >> "$REPORT"
# Endpoints
svc=$("${KUBECTL[@]}" -n "$ns" get svc -l "app=redis,app.kubernetes.io/name=redis" -o json 2>/dev/null || true)
if [[ -n "$svc" && "$(jq '.items|length' <<<"$svc")" -gt 0 ]]; then
while read -r sname; do
# Subset count from the cached endpoints; 0 means nothing is backing the svc.
eps=$(jq -r --arg ns "$ns" --arg s "$sname" '.items[]|select(.metadata.namespace==$ns and .metadata.name==$s)|(.subsets|length)' "$ART/endpoints.json")
eps=${eps:-0}
eps="$(to_int "$eps")"
echo " - svc/$sname endpoints: $eps" >> "$REPORT"
if [ "$eps" -eq 0 ]; then emit_json "ERROR" "apps.redis" "endpoints" "$ns/svc/$sname has 0 endpoints" ""; fi
done < <(jq -r '.items[].metadata.name' <<<"$svc")
fi
done < <(jq -r '.items[]|.metadata.namespace+" "+.kind+" "+.metadata.name' <<<"$redis_objs")
else
echo "- (no Redis discovered)" >> "$REPORT"
fi
echo "" >> "$REPORT"
# ----------------------------- App Discovery: RabbitMQ --------------------
emit_md_h3 "RabbitMQ"
# NB: 'grep -c' prints "0" itself (and exits 1) when the file has no match,
# so 'grep -c … || echo 0' used to emit "0\n0" — garbage for arithmetic.
# Route the result through to_int to get one clean integer.
rabbit_crd="$(to_int "$(grep -c rabbitmqclusters.rabbitmq.com "$ART/app_crds.txt" 2>/dev/null || true)")"
if (( rabbit_crd > 0 )); then
# Operator CRD health (best effort)
"${KUBECTL[@]}" get rabbitmqclusters.rabbitmq.com --all-namespaces -o json > "$ART/rabbit_cr.json" 2>/dev/null || true
if [[ -s "$ART/rabbit_cr.json" ]]; then
# The jq below yields the Ready *condition status* ("True"/"False"/"Unknown"),
# not a phase name — the old comparison against "Running"/"Ready" flagged
# perfectly healthy clusters as ERROR. Parentheses around the '//' default
# ensure items without a Ready condition report "Unknown" instead of being
# silently dropped.
while read -r ns name ready; do
marker="✅"; lvl="INFO"
if [[ "$ready" != "True" ]]; then marker="❌"; lvl="ERROR"; fi
echo "- **$ns/$name (RabbitmqCluster):** $marker Ready=$ready" >> "$REPORT"
[[ "$lvl" == "ERROR" ]] && emit_json "ERROR" "apps.rabbitmq" "cluster" "$ns/$name Ready=$ready" "Check operator and pods."
done < <(jq -r '.items[]|.metadata.namespace+" "+.metadata.name+" "+((.status.conditions[]?|select(.type=="Ready")|.status) // "Unknown")' "$ART/rabbit_cr.json" 2>/dev/null || true)
fi
fi
# Fallback to Deploy/STS named rabbit
rabbit_objs=$("${KUBECTL[@]}" get deploy,sts --all-namespaces -l app.kubernetes.io/name=rabbitmq,app=rabbitmq -o json 2>/dev/null || true)
if [[ -z "$rabbit_objs" || "$(jq '.items|length' <<<"$rabbit_objs")" -eq 0 ]]; then
rabbit_objs=$("${KUBECTL[@]}" get deploy,sts --all-namespaces -o json 2>/dev/null | jq '{items:[.items[]|select(.metadata.name|test("rabbit"))]}' || true)
fi
if [[ "$(jq '.items|length' <<<"$rabbit_objs" 2>/dev/null)" -gt 0 ]]; then
while read -r line; do
ns=$(cut -d' ' -f1 <<<"$line"); kind=$(cut -d' ' -f2 <<<"$line"); name=$(cut -d' ' -f3- <<<"$line")
obj=$("${KUBECTL[@]}" -n "$ns" get "$kind" "$name" -o json)
desired=$(jq -r '.spec.replicas // 1' <<<"$obj"); ready=$(jq -r '.status.readyReplicas // 0' <<<"$obj")
desired="$(to_int "$desired")"; ready="$(to_int "$ready")"
marker="✅"; if [ "$ready" -lt "$desired" ]; then marker="❌"; emit_json "ERROR" "apps.rabbitmq" "$kind" "$ns/$name unavailable ($ready/$desired)" ""; fi
echo "- **$ns/$name ($kind):** $marker $ready/$desired ready" >> "$REPORT"
done < <(jq -r '.items[]|.metadata.namespace+" "+.kind+" "+.metadata.name' <<<"$rabbit_objs")
else
echo "- (no RabbitMQ discovered)" >> "$REPORT"
fi
echo "" >> "$REPORT"
# ----------------------------- App Discovery: MinIO -----------------------
emit_md_h3 "MinIO"
# NB: 'grep -c' prints "0" itself (and exits 1) on no-match, so the old
# 'grep -c … || echo 0' emitted "0\n0"; normalize through to_int instead.
minio_tenants_crd="$(to_int "$(grep -c tenants.minio.min.io "$ART/app_crds.txt" 2>/dev/null || true)")"
if (( minio_tenants_crd > 0 )); then
"${KUBECTL[@]}" get tenants.minio.min.io --all-namespaces -o json > "$ART/minio_tenants.json" 2>/dev/null || true
if [[ -s "$ART/minio_tenants.json" ]]; then
# NOTE(review): 'ready' is the status of the "Available" condition
# ("True"/"False"/"Unknown"), reported here as Ready= — confirm the
# operator version exposes that condition type.
while read -r ns name ready; do
marker="✅"
[[ "$ready" != "True" ]] && marker="❌" && emit_json "ERROR" "apps.minio" "tenant" "$ns/$name not Ready" ""
echo "- **$ns/$name (Tenant):** $marker Ready=$ready" >> "$REPORT"
done < <(jq -r '.items[]|.metadata.namespace+" "+.metadata.name+" "+((.status.conditions[]?|select(.type=="Available")|.status)//"Unknown")' "$ART/minio_tenants.json")
fi
fi
# Fallback: Deploy/STS named/labeled minio
minio_objs=$("${KUBECTL[@]}" get deploy,sts --all-namespaces -l app=minio,app.kubernetes.io/name=minio -o json 2>/dev/null || true)
if [[ -z "$minio_objs" || "$(jq '.items|length' <<<"$minio_objs")" -eq 0 ]]; then
minio_objs=$("${KUBECTL[@]}" get deploy,sts --all-namespaces -o json 2>/dev/null | jq '{items:[.items[]|select(.metadata.name|test("minio"))]}' || true)
fi
if [[ "$(jq '.items|length' <<<"$minio_objs" 2>/dev/null)" -gt 0 ]]; then
while read -r line; do
ns=$(cut -d' ' -f1 <<<"$line"); kind=$(cut -d' ' -f2 <<<"$line"); name=$(cut -d' ' -f3- <<<"$line")
obj=$("${KUBECTL[@]}" -n "$ns" get "$kind" "$name" -o json)
desired=$(jq -r '.spec.replicas // 1' <<<"$obj"); ready=$(jq -r '.status.readyReplicas // 0' <<<"$obj")
desired="$(to_int "$desired")"; ready="$(to_int "$ready")"
marker="✅"; if [ "$ready" -lt "$desired" ]; then marker="❌"; emit_json "ERROR" "apps.minio" "$kind" "$ns/$name unavailable ($ready/$desired)" ""; fi
echo "- **$ns/$name ($kind):** $marker $ready/$desired ready" >> "$REPORT"
# PVCs bound?
claim_names=$(jq -r '.spec.volumeClaimTemplates[]?.metadata.name' <<<"$obj" 2>/dev/null || true)
if [[ -n "$claim_names" ]]; then
for cn in $claim_names; do
# StatefulSets name-ordinal claim pattern
echo " - PVC template: $cn" >> "$REPORT"
done
fi
done < <(jq -r '.items[]|.metadata.namespace+" "+.kind+" "+.metadata.name' <<<"$minio_objs")
else
echo "- (no MinIO discovered)" >> "$REPORT"
fi
echo "" >> "$REPORT"
# ----------------------------- Events Snapshot ---------------------------
emit_md_h2 "Recent Warning/Error Events (top 30)"
# Null-guard .reason/.message: jq's 'test' and 'gsub' raise an error on null,
# which previously aborted the whole filter and silently emptied this table
# whenever any single event lacked a reason. lastTimestamp may also be null
# for new-style events, which populate eventTime instead.
events_tsv=$(jq -r '
.items[]
| select(.type=="Warning" or ((.reason // "")|test("BackOff|Failed|Error")))
| [(.lastTimestamp // .eventTime // ""), .involvedObject.namespace, .involvedObject.kind, .involvedObject.name, (.reason // ""), ((.message // "")|gsub("\n"; " "))]
| @tsv' "$ART/events.json" 2>/dev/null | tail -n 30 || true)
if [[ -n "$events_tsv" ]]; then
echo -e "\n| Time | NS | Kind | Name | Reason | Message |" >> "$REPORT"
echo "|---|---|---|---|---|---|" >> "$REPORT"
while IFS=$'\t' read -r t ns k n r m; do
echo "| $t | ${ns:-} | ${k:-} | ${n:-} | ${r:-} | ${m:-} |" >> "$REPORT"
done <<< "$events_tsv"
else
echo "- No recent warnings/errors." >> "$REPORT"
fi
# ----------------------------- Rollup & Exit -----------------------------
emit_md_h2 "Summary & Exit Code"
# produce a compact rollup
# Overall level = worst level present in the JSONL findings.
LEVEL="OK"
if grep -q '"level":"ERROR"' "$JSONL" 2>/dev/null; then LEVEL="ERROR"
elif grep -q '"level":"WARN"' "$JSONL" 2>/dev/null; then LEVEL="WARN"
fi
echo "- **Overall:** ${LEVEL}" >> "$REPORT"
# finalize JSON summary array
jq -s '.' "$JSONL" > "$OUT_DIR/summary.json" 2>/dev/null || echo "[]">"$OUT_DIR/summary.json"
echo
echo "Report written to: $REPORT"
echo "Artifacts in: $ART"
# Exit-code contract: 0 = healthy, 1 = warnings, 2 = errors (3 = usage/deps).
case "$LEVEL" in
ERROR) exit 2;;
WARN) exit 1;;
*) exit 0;;
esac
| 1 | #!/usr/bin/env bash |
| 2 | set -euo pipefail |
| 3 | # k8s-health.sh — single-context Kubernetes health report (Markdown) |
| 4 | # Focus: core cluster + Calico + Istio + common apps (Redis, RabbitMQ, MinIO) |
| 5 | # Deps: kubectl, jq, awk, sed, grep, base64, openssl; (optional) gdate |
| 6 | # ----------------------------- CLI & Globals ----------------------------- |
| 7 | OUT_DIR="" |
| 8 | REQUEST_TIMEOUT="${REQUEST_TIMEOUT:-8s}" |
| 9 | DNS_TEST="${DNS_TEST:-true}" # set false to skip ephemeral DNS test |
| 10 | TLS_WARN_DAYS="${TLS_WARN_DAYS:-14}" |
| 11 | TLS_ERR_DAYS="${TLS_ERR_DAYS:-7}" |
| 12 | usage() { |
| 13 | cat <<EOF |
| 14 | Usage: $0 [--context NAME] [--namespace NS] [--out DIR] [--no-dns] |
| 15 | Options: |
| 16 | --context NAME Use a specific kubeconfig context (default: current) |
| 17 | --namespace NS Limit app checks to a namespace (default: all) |
| 18 | --out DIR Output directory (default: ./k8s-health-<timestamp>) |
| 19 | --no-dns Skip ephemeral DNS resolution test |
| 20 | Env overrides: |
| 21 | REQUEST_TIMEOUT (default: 8s) | TLS_WARN_DAYS (14) | TLS_ERR_DAYS (7) |
| 22 | EOF |
| 23 | } |
| 24 | CTX_ARGS=() |
| 25 | APP_NS="" |
| 26 | while [[ $# -gt 0 ]]; do |
| 27 | case "$1" in |
| 28 | --context) shift; CTX_ARGS+=(--context "$1");; |
| 29 | --namespace) shift; APP_NS="$1";; |
| 30 | --out) shift; OUT_DIR="$1";; |
| 31 | --no-dns) DNS_TEST=false;; |
| 32 | -h|--help) usage; exit 0;; |
| 33 | *) echo "Unknown arg: $1" >&2; usage; exit 3;; |
| 34 | esac |
| 35 | shift |
| 36 | done |
| 37 | ts_now() { date -Is; } |
| 38 | to_ts() { |
| 39 | # portable timestamp from RFC date string (uses gdate if available) |
| 40 | local d="$1" |
| 41 | if command -v gdate >/dev/null 2>&1; then gdate -d "$d" +%s; else date -d "$d" +%s; fi 2>/dev/null || echo 0 |
| 42 | } |
# Whole days from now until the date string $1 (negative when in the past;
# 0 days also covers unparseable dates since to_ts returns 0 for those).
days_until() {
  local end="$1"; local end_ts; end_ts=$(to_ts "$end")
  local now_ts; now_ts=$(date +%s)
  echo $(( (end_ts - now_ts) / 86400 ))
}
# Normalize possibly multi-line/non-numeric values to a single integer (default 0).
# Used throughout to harden bash arithmetic against messy kubectl/jq/grep output.
to_int() {
  local v="$1"
  # replace newlines/tabs with spaces, take first token
  v="${v//$'\n'/ }"; v="${v//$'\t'/ }"; v="${v%% *}"
  # strip everything except digits and '-'
  v="$(printf '%s' "$v" | sed -E 's/[^0-9-]//g')"
  [[ "$v" =~ ^-?[0-9]+$ ]] || v=0
  printf '%s' "$v"
}
# Hard requirements: kubectl and jq. Exit 3 (usage/setup error) when missing.
if ! command -v kubectl >/dev/null; then echo "kubectl not found" >&2; exit 3; fi
if ! command -v jq >/dev/null; then echo "jq not found" >&2; exit 3; fi
# Default output directory is timestamped to avoid clobbering earlier runs.
if [[ -z "$OUT_DIR" ]]; then
  OUT_DIR="./k8s-health-$(date +%Y%m%d-%H%M%S)"
fi
ART="$OUT_DIR/artifacts"    # raw kubectl dumps live here
mkdir -p "$ART"
REPORT="$OUT_DIR/report.md" # human-readable Markdown report
JSONL="$OUT_DIR/summary.jsonl" # machine-readable findings, one JSON per line
touch "$REPORT" "$JSONL"
# ----------------------------- Emit Helpers ------------------------------
# emit_json LEVEL AREA CHECK MESSAGE [HINT] — append one finding to $JSONL.
# MESSAGE/HINT are JSON-escaped via 'jq -Rs'; LEVEL/AREA/CHECK are trusted
# literals supplied by this script (never user input).
emit_json() {
  printf '{"ts":"%s","level":"%s","area":"%s","check":"%s","message":%s,"hint":%s}\n' \
    "$(ts_now)" "$1" "$2" "$3" "$(jq -Rs . <<<"$4")" "$(jq -Rs . <<<"${5:-}")" >> "$JSONL"
}
# Markdown emitters — all append to $REPORT. 'echo -e' expands the trailing
# \n so headings are followed by a blank line.
emit_md_h1() { echo -e "# $1\n" >> "$REPORT"; }
emit_md_h2() { echo -e "## $1\n" >> "$REPORT"; }
emit_md_h3() { echo -e "### $1\n" >> "$REPORT"; }
emit_md_kv() { echo "- **$1:** $2" >> "$REPORT"; }
emit_md_code() { echo -e "\n\`\`\`\n$1\n\`\`\`\n" >> "$REPORT"; }
# ----------------------------- Prefetch Cache ----------------------------
# Dump cluster state once; later sections read these artifacts instead of
# re-querying the API server. '|| true' marks best-effort fetches.
echo "Collecting cluster state..."
set +e
KUBECTL=(kubectl "${CTX_ARGS[@]}")
# Use a lightweight API call instead of 'kubectl version', which can fail for
# reasons unrelated to reachability; fall back to a plain resource list in
# case the /version endpoint is blocked by a proxy.
if ! "${KUBECTL[@]}" --request-timeout="$REQUEST_TIMEOUT" get --raw='/version' >/dev/null 2>&1; then
  if ! "${KUBECTL[@]}" --request-timeout="$REQUEST_TIMEOUT" get nodes -o name >/dev/null 2>&1; then
    echo "Cannot reach cluster with kubectl" >&2; exit 3
  fi
fi
set -e
# FIX: '|| true' added — 'kubectl version' can exit non-zero (e.g. server
# version unavailable) and would otherwise kill the script under 'set -e';
# the report header already tolerates a missing/empty version.json.
"${KUBECTL[@]}" version -o json > "$ART/version.json" 2>/dev/null || true
"${KUBECTL[@]}" api-resources > "$ART/apiresources.txt" || true
"${KUBECTL[@]}" get nodes -o json > "$ART/nodes.json"
# App-level checks can be scoped to one namespace; cluster checks stay global.
if [[ -n "$APP_NS" ]]; then
  ns_arg=(-n "$APP_NS")
else
  ns_arg=(--all-namespaces)
fi
"${KUBECTL[@]}" get pods "${ns_arg[@]}" -o json > "$ART/pods.json"
"${KUBECTL[@]}" get ns -o json > "$ART/namespaces.json"
"${KUBECTL[@]}" get events --all-namespaces --sort-by=.lastTimestamp -o json --request-timeout="$REQUEST_TIMEOUT" > "$ART/events.json" || true
"${KUBECTL[@]}" get svc --all-namespaces -o json > "$ART/svc.json"
"${KUBECTL[@]}" get endpoints --all-namespaces -o json > "$ART/endpoints.json"
"${KUBECTL[@]}" get endpointslices.discovery.k8s.io --all-namespaces -o json > "$ART/epslices.json" 2>/dev/null || true
"${KUBECTL[@]}" get deploy,ds,sts,job,cronjob,hpa,pdb --all-namespaces -o json > "$ART/workloads.json" 2>/dev/null || true
"${KUBECTL[@]}" get pvc --all-namespaces -o json > "$ART/pvc.json" 2>/dev/null || true
"${KUBECTL[@]}" get pv -o json > "$ART/pv.json" 2>/dev/null || true
"${KUBECTL[@]}" get storageclasses.storage.k8s.io -o json > "$ART/sc.json" 2>/dev/null || true
"${KUBECTL[@]}" get secrets --all-namespaces -o json > "$ART/secrets.json" 2>/dev/null || true
"${KUBECTL[@]}" get csidrivers.storage.k8s.io,csinodes.storage.k8s.io -o json > "$ART/csi.json" 2>/dev/null || true
# Istio + Calico artifacts (best effort)
"${KUBECTL[@]}" -n istio-system get deploy,ds,pods,svc -o wide > "$ART/istio_ls.txt" 2>/dev/null || true
"${KUBECTL[@]}" -n calico-system get deploy,ds,pods -o wide > "$ART/calico_ls.txt" 2>/dev/null || true
# Presence of these CRDs gates the RabbitMQ/MinIO operator checks below.
"${KUBECTL[@]}" get crd tenants.minio.min.io rabbitmqclusters.rabbitmq.com 2>/dev/null | sed '1d' > "$ART/app_crds.txt" || true
# Cache Istio Gateways (best effort)
"${KUBECTL[@]}" get gateway.networking.istio.io --all-namespaces -o json > "$ART/istio_gateways.json" 2>/dev/null || true
# ----------------------------- Report Header -----------------------------
# Both jq calls fall back to "unknown" when version.json is missing/invalid.
cluster_server=$(
  jq -r '.serverVersion.gitVersion + " (" + .serverVersion.platform + ")"' "$ART/version.json" 2>/dev/null \
    || echo "unknown")
client_ver=$(
  jq -r '.clientVersion.gitVersion' "$ART/version.json" 2>/dev/null \
    || echo "unknown")
ctx_name=$("${KUBECTL[@]}" config current-context 2>/dev/null || echo "current")
emit_md_h1 "Kubernetes Health Report — ${ctx_name}"
emit_md_kv "Generated" "$(ts_now)"
emit_md_kv "kubectl client" "$client_ver"
emit_md_kv "APIServer" "$cluster_server"
emit_md_kv "Namespace scope (apps)" "${APP_NS:-all}"
echo "" >> "$REPORT"
# ----------------------------- Versions & Skew ----------------------------
emit_md_h2 "Cluster Versions & Skew"
node_versions=$(jq -r '.items[]?.status.nodeInfo.kubeletVersion' "$ART/nodes.json" | sort | uniq -c | sed 's/^/ /')
emit_md_code "Kubelet versions:\n${node_versions}"
# Server minor may be non-numeric (e.g. "28+" on managed clusters); to_int below handles that.
server_minor=$(jq -r 'try (.serverVersion.minor|tonumber) catch 0' "$ART/version.json" 2>/dev/null || echo 0)
first_kubelet_minor=$(jq -r '[.items[]?.status.nodeInfo.kubeletVersion
  | capture("v(?<maj>\\d+)\\.(?<min>\\d+)")
  | .min
  | tonumber]
  | (min // 0)' "$ART/nodes.json" 2>/dev/null || echo 0)
last_kubelet_minor=$(jq -r '[.items[]?.status.nodeInfo.kubeletVersion
  | capture("v(?<maj>\\d+)\\.(?<min>\\d+)")
  | .min
  | tonumber]
  | (max // 0)' "$ART/nodes.json" 2>/dev/null || echo 0)
: "${server_minor:=0}"
: "${first_kubelet_minor:=0}"
: "${last_kubelet_minor:=0}"
# Normalize to single-token integers before any arithmetic.
server_minor="$(to_int "$server_minor")"
first_kubelet_minor="$(to_int "$first_kubelet_minor")"
last_kubelet_minor="$(to_int "$last_kubelet_minor")"
lm="$(to_int "$last_kubelet_minor")"
sm="$(to_int "$server_minor")"
fm="$(to_int "$first_kubelet_minor")"
# Absolute skew via awk only (robust even if inputs are "0").
abs_skew="$(awk -v a="$lm" -v b="$sm" 'BEGIN{d=a-b; if (d<0) d=-d; print d+0}' 2>/dev/null)"
abs_skew="$(to_int "$abs_skew")"
if [ "$abs_skew" -gt 1 ]; then
  emit_json "ERROR" "version" "skew" "Kubelet/APIServer minor skew > 1 (server minor ${sm}, kubelet min/max ${fm}/${lm})" "Align versions per K8s skew policy."
  echo "- **Version Skew:** ❌ kubelet/APIServer minor skew > 1 (server=${sm}, kubelet min/max=${fm}/${lm})" >> "$REPORT"
else
  echo "- **Version Skew:** ✅ within supported range (server=${sm}, kubelet min/max=${fm}/${lm})" >> "$REPORT"
fi
echo "" >> "$REPORT"
# ----------------------------- API Server Health --------------------------
emit_md_h2 "API Server Ready/Liveness"
readyz="$ART/readyz.txt"
livez="$ART/livez.txt"
set +e  # capture the exit codes instead of dying under set -e
"${KUBECTL[@]}" get --raw='/readyz?verbose' >"$readyz" 2>&1
r_rc=$?
"${KUBECTL[@]}" get --raw='/livez?verbose' >"$livez" 2>&1
l_rc=$?
set -e
emit_md_code "readyz:\n$(cat "$readyz" 2>/dev/null || true)"
emit_md_code "livez:\n$(cat "$livez" 2>/dev/null || true)"
# 'grep -c' exits non-zero on zero matches while still printing "0"; the
# '|| true' plus to_int keep this safe under set -e.
fail_cnt=$(grep -c 'fail' "$readyz" 2>/dev/null || true)
fail_cnt="$(to_int "${fail_cnt:-0}")"
if [ "$r_rc" -ne 0 ] || [ "$fail_cnt" -gt 0 ]; then
  emit_json "ERROR" "control-plane" "readyz" "APIServer readyz reports failures" "Check control-plane component health."
  echo "- **APIServer readyz:** ❌ failures detected" >> "$REPORT"
else
  echo "- **APIServer readyz:** ✅ ok" >> "$REPORT"
fi
if [[ $l_rc -ne 0 ]]; then
  emit_json "ERROR" "control-plane" "livez" "APIServer livez not reachable" ""
  echo "- **APIServer livez:** ❌ unreachable" >> "$REPORT"
else
  echo "- **APIServer livez:** ✅ ok" >> "$REPORT"
fi
echo "" >> "$REPORT"
# ----------------------------- Nodes -------------------------------------
emit_md_h2 "Node Health"
nodes_json="$ART/nodes.json"
# A node is NotReady when its Ready condition status is anything but "True".
not_ready=$(jq -r '.items[] | select([.status.conditions[]?|select(.type=="Ready")][0].status!="True") | .metadata.name' "$nodes_json")
if [[ -n "$not_ready" ]]; then
  emit_json "ERROR" "nodes" "ready" "Some nodes NotReady" "$not_ready"
  echo "- **Ready:** ❌ NotReady nodes present:" >> "$REPORT"; echo "$not_ready" | sed 's/^/ - /' >> "$REPORT"
else
  echo "- **Ready:** ✅ all nodes Ready" >> "$REPORT"
fi
# Disk/Memory/PID pressure conditions that are currently True.
pressures=$(jq -r '
  .items[] as $n
  | ($n.status.conditions[] | select((.type=="DiskPressure" or .type=="MemoryPressure" or .type=="PIDPressure") and .status=="True")) as $p
  | "\($n.metadata.name)\t\($p.type)\t\($p.message)"' "$nodes_json")
if [[ -n "$pressures" ]]; then
  emit_json "WARN" "nodes" "pressure" "Node pressure conditions detected" "$pressures"
  echo "- **Pressure:** ⚠️" >> "$REPORT"; echo "$pressures" | sed 's/^/ - /' >> "$REPORT"
else
  echo "- **Pressure:** ✅ none" >> "$REPORT"
fi
unsched=$(jq -r '.items[] | select(.spec.unschedulable==true) | .metadata.name' "$nodes_json")
if [[ -n "$unsched" ]]; then
  emit_json "WARN" "nodes" "unschedulable" "Unschedulable nodes present" "$unsched"
  echo "- **Unschedulable:** ⚠️ $(echo "$unsched" | tr '\n' ' ')" >> "$REPORT"
else
  echo "- **Unschedulable:** ✅ none" >> "$REPORT"
fi
echo "" >> "$REPORT"
# ----------------------------- Networking (DNS + Calico) ------------------
emit_md_h2 "Networking & DNS"
# CoreDNS Deployment status.
# NOTE(review): a comma-joined selector is an AND of both labels; many CoreDNS
# installs carry only one of them — confirm this matches the target clusters.
core_dns=$("${KUBECTL[@]}" -n kube-system get deploy -l k8s-app=kube-dns,app.kubernetes.io/name=coredns -o json 2>/dev/null || true)
dn_unavail=$(jq -r '([.items[]?|.status.unavailableReplicas // 0] | add) // 0' <<<"$core_dns" 2>/dev/null || echo 0)
dn_unavail="$(to_int "$dn_unavail")"
if [ "$dn_unavail" -gt 0 ]; then
  emit_json "ERROR" "networking" "coredns" "CoreDNS has unavailable replicas" ""
  echo "- **CoreDNS:** ❌ unavailable replicas: $dn_unavail" >> "$REPORT"
else
  echo "- **CoreDNS:** ✅ deployment healthy or not found" >> "$REPORT"
fi
# Optional ephemeral DNS nslookup test
if [[ "$DNS_TEST" == "true" ]]; then
  echo "- **DNS test:** running ephemeral busybox nslookup ..." >> "$REPORT"
  set +e
  # FIX: kubectl flags must precede '--'; previously --image-pull-policy,
  # --quiet and the timeout were placed after '--' and passed to the container
  # command (silently ignored). '--timeout' is not a 'kubectl run' flag;
  # --pod-running-timeout is the intended knob — TODO confirm on target kubectl.
  "${KUBECTL[@]}" run "dnscheck-$$" --image=busybox:1.36 --restart=Never \
    --image-pull-policy=IfNotPresent --quiet --pod-running-timeout=30s \
    --command -- /bin/sh -c 'nslookup kubernetes.default.svc.cluster.local >/dev/null' 1>/dev/null 2>&1
  run_rc=$?
  "${KUBECTL[@]}" delete pod "dnscheck-$$" --now --wait=false 1>/dev/null 2>&1
  set -e
  if [[ $run_rc -ne 0 ]]; then
    emit_json "ERROR" "networking" "dns" "In-pod DNS resolution failed" "Check CoreDNS, network policies, kube-dns Service."
    echo " ❌ DNS resolution failed" >> "$REPORT"
  else
    echo " ✅ DNS resolution ok" >> "$REPORT"
  fi
else
  echo "- **DNS test:** (skipped)" >> "$REPORT"
fi
echo "" >> "$REPORT"
# Calico basic health
emit_md_h3 "Calico"
calico_ds=$("${KUBECTL[@]}" -n calico-system get ds calico-node -o json 2>/dev/null || true)
if [[ -n "$calico_ds" ]]; then
  desire=$(jq -r '.status.desiredNumberScheduled // 0' <<<"$calico_ds")
  ready=$(jq -r '.status.numberReady // 0' <<<"$calico_ds")
  desire="$(to_int "$desire")"; ready="$(to_int "$ready")"
  if [ "$ready" -lt "$desire" ]; then
    emit_json "ERROR" "calico" "daemonset" "calico-node not fully Ready ($ready/$desire)" "Check calico-node pods and CNI errors."
    echo "- **calico-node:** ❌ $ready/$desire Ready" >> "$REPORT"
  else
    echo "- **calico-node:** ✅ $ready/$desire Ready" >> "$REPORT"
  fi
else
  echo "- **calico-node:** (DaemonSet not found)" >> "$REPORT"
fi
typha=$("${KUBECTL[@]}" -n calico-system get deploy -l k8s-app=calico-typha -o json 2>/dev/null || true)
if [[ -n "$typha" ]]; then
  unavail=$(jq -r '[.items[]?|.status.unavailableReplicas // 0] | add' <<<"$typha")
  unavail="$(to_int "$unavail")"
  if [ "$unavail" -gt 0 ]; then
    emit_json "WARN" "calico" "typha" "Calico Typha unavailable replicas: $unavail" ""
    echo "- **calico-typha:** ⚠️ unavailable: $unavail" >> "$REPORT"
  else
    echo "- **calico-typha:** ✅ healthy" >> "$REPORT"
  fi
fi
echo "" >> "$REPORT"
# ----------------------------- Storage & CSI ------------------------------
emit_md_h2 "Storage"
sc_json="$ART/sc.json"
if [[ -s "$sc_json" ]]; then
  defaults=$(jq -r '.items[]|select(.metadata.annotations["storageclass.kubernetes.io/is-default-class"]=="true")|.metadata.name' "$sc_json")
  if [[ -z "$defaults" ]]; then
    emit_json "WARN" "storage" "default-sc" "No default StorageClass set" "Annotate one SC as default."
    echo "- **Default StorageClass:** ⚠️ none set" >> "$REPORT"
  else
    echo "- **Default StorageClass:** ✅ $defaults" >> "$REPORT"
  fi
fi
pvc_pending=$(jq -r '.items[]|select(.status.phase=="Pending")|.metadata.namespace + "/" + .metadata.name' "$ART/pvc.json" 2>/dev/null || true)
if [[ -n "$pvc_pending" ]]; then
  emit_json "ERROR" "storage" "pvc" "Pending PVCs detected" "$pvc_pending"
  # FIX: plain 'echo' does not expand \n (wrote a literal backslash-n into the
  # report); emit the header and the indented list as separate lines instead.
  echo "- **PVCs:** ❌ Pending:" >> "$REPORT"
  echo "$pvc_pending" | sed 's/^/ - /' >> "$REPORT"
else
  echo "- **PVCs:** ✅ none Pending" >> "$REPORT"
fi
echo "" >> "$REPORT"
# ----------------------------- Workloads ---------------------------------
emit_md_h2 "Workloads"
# Pods Pending for more than 5 minutes (creationTimestamp vs jq's now).
pending=$(jq -r '
  .items[]
  | select(.status.phase=="Pending")
  | select((now - (.metadata.creationTimestamp|fromdate)) > 300)
  | .metadata.namespace + "/" + .metadata.name + " — " + ((.status.conditions // [] | map(select(.type=="PodScheduled"))[0].reason) // "Pending")
  ' "$ART/pods.json")
if [[ -n "$pending" ]]; then
  emit_json "ERROR" "workloads" "pending" "Pending pods >5m" "$pending"
  echo "- **Pending Pods (>5m):** ❌" >> "$REPORT"; echo "$pending" | sed 's/^/ - /' >> "$REPORT"
else
  echo "- **Pending Pods (>5m):** ✅ none" >> "$REPORT"
fi
# CrashLoop / high restarts (>=3 restarts on any container)
crash=$(jq -r '
  .items[] as $p
  | ($p.status.containerStatuses // [])[]
  | select((.restartCount // 0) >= 3)
  | "\($p.metadata.namespace)/\($p.metadata.name) — \(.name) restarts=\(.restartCount) lastState=\(.lastState|tojson)"
  ' "$ART/pods.json")
if [[ -n "$crash" ]]; then
  emit_json "WARN" "workloads" "restarts" "Containers with >=3 restarts" "$crash"
  echo "- **High Restarts (>=3):** ⚠️" >> "$REPORT"; echo "$crash" | sed 's/^/ - /' >> "$REPORT"
else
  echo "- **High Restarts (>=3):** ✅ none" >> "$REPORT"
fi
# Deployments with unavailable replicas
unavail=$(jq -r '
  .items[]?|select(.kind=="Deployment")|select((.status.unavailableReplicas // 0) > 0)
  | .metadata.namespace + "/" + .metadata.name + " — unavailable=" + ((.status.unavailableReplicas|tostring))
  ' "$ART/workloads.json" 2>/dev/null || true)
if [[ -n "$unavail" ]]; then
  emit_json "ERROR" "workloads" "deploy-unavailable" "Deployments with unavailable replicas" "$unavail"
  # FIX: plain 'echo' does not expand \n; split header and list onto real lines.
  echo "- **Deployments:** ❌ unavailable replicas:" >> "$REPORT"
  echo "$unavail" | sed 's/^/ - /' >> "$REPORT"
else
  echo "- **Deployments:** ✅ all available" >> "$REPORT"
fi
echo "" >> "$REPORT"
# ----------------------------- Services & Endpoints -----------------------
emit_md_h2 "Services & Endpoints"
# (A previous 'svc_0ep' pre-computation was removed: its result was never
# used and its jq program produced no meaningful pairing of the two files.)
# A selector-backed Service with zero Endpoints subsets means nothing is
# serving it; headless/ExternalName Services (selector type "null") are skipped.
svc_zero=$(
  jq -r '
    .items[] | [.metadata.namespace,.metadata.name, (.spec.selector|type)] | @tsv' "$ART/svc.json" \
    | while IFS=$'\t' read -r ns name seltype; do
        subsets=$(jq -r --arg ns "$ns" --arg name "$name" \
          '.items[]|select(.metadata.namespace==$ns and .metadata.name==$name)|(.subsets|length)' "$ART/endpoints.json" 2>/dev/null | head -n1)
        subsets=${subsets:-0}
        subsets="$(to_int "$subsets")"
        if [[ "$seltype" != "null" && "$subsets" -eq 0 ]]; then
          echo "$ns/$name"
        fi
      done
)
if [[ -n "$svc_zero" ]]; then
  emit_json "ERROR" "networking" "svc-no-endpoints" "Services with zero Endpoints" "$svc_zero"
  echo "- **Services with 0 endpoints:** ❌" >> "$REPORT"; echo "$svc_zero" | sed 's/^/ - /' >> "$REPORT"
else
  echo "- **Services with 0 endpoints:** ✅ none" >> "$REPORT"
fi
echo "" >> "$REPORT"
# ----------------------------- TLS Secret Expiry -------------------------
emit_md_h2 "TLS Certificates (Secrets)"
# Build the set of TLS secrets actually referenced by Istio Gateways
# (credentialName); only those drive WARN/ERROR severity.
ISTIO_GW_SECRETS_FILE="$ART/istio_gateway_tls_secrets.tsv"
: > "$ISTIO_GW_SECRETS_FILE"
if [[ -s "$ART/istio_gateways.json" ]]; then
  jq -r '
    .items[]
    | .metadata.namespace as $ns
    | (.spec.servers // [])
    | map(select(.tls.credentialName != null) | [$ns, .tls.credentialName])
    | .[]
    | @tsv
  ' "$ART/istio_gateways.json" 2>/dev/null | sort -u > "$ISTIO_GW_SECRETS_FILE" || true
fi
tls_list=$(jq -r '.items[]|select(.type=="kubernetes.io/tls")|.metadata.namespace + "\t" + .metadata.name + "\t" + (.data["tls.crt"]//"")' "$ART/secrets.json" 2>/dev/null || true)
if [[ -n "$tls_list" ]]; then
  exp_rows_inuse=""
  exp_rows_unused=""
  while IFS=$'\t' read -r ns name b64; do
    [[ -z "$b64" ]] && continue
    crt="$ART/${ns}_${name}.crt"
    echo "$b64" | base64 -d > "$crt" 2>/dev/null || continue
    end=$(openssl x509 -enddate -noout -in "$crt" 2>/dev/null | cut -d= -f2)
    [[ -z "$end" ]] && continue
    days=$(days_until "$end"); days="$(to_int "$days")"
    # Is this secret referenced by any Istio Gateway in the same namespace?
    in_use="no"
    # FIX: fixed-string whole-line match (-x -F) replaces GNU-only 'grep -P';
    # secret names may contain regex metacharacters, and -P is non-portable.
    if grep -qxF "${ns}"$'\t'"${name}" "$ISTIO_GW_SECRETS_FILE" 2>/dev/null; then
      in_use="yes"
    fi
    if [ "$in_use" = "yes" ]; then
      # Severity applies only to IN-USE secrets.
      level="INFO"
      if [ "$days" -le "$TLS_WARN_DAYS" ]; then level="WARN"; fi
      if [ "$days" -le "$TLS_ERR_DAYS" ]; then level="ERROR"; fi
      exp_rows_inuse+="$ns/$name — expires in ${days}d (${level}) [IN-USE]"$'\n'
      if [ "$level" = "ERROR" ]; then
        emit_json "ERROR" "security" "tls-expiry" "$ns/$name expiring in ${days}d [IN-USE]" "Referenced by an Istio Gateway; renew certificate."
      elif [ "$level" = "WARN" ]; then
        emit_json "WARN" "security" "tls-expiry" "$ns/$name expiring in ${days}d [IN-USE]" "Plan renewal."
      fi
    else
      # UNUSED secrets: never alert; listed informationally below.
      exp_rows_unused+="$ns/$name — expires in ${days}d [unused]"$'\n'
    fi
  done <<< "$tls_list"
  if [[ -n "$exp_rows_inuse" ]]; then
    echo "- **TLS expiries (in-use secrets):**" >> "$REPORT"
    echo "$exp_rows_inuse" | sed 's/^/ - /' >> "$REPORT"
  else
    echo "- **TLS expiries (in-use secrets):** none" >> "$REPORT"
  fi
  if [[ -n "$exp_rows_unused" ]]; then
    emit_md_h3 "Unused Secrets"
    echo "$exp_rows_unused" | sed 's/^/ - /' >> "$REPORT"
  else
    emit_md_h3 "Unused Secrets"
    echo " - none" >> "$REPORT"
  fi
else
  echo "- **TLS expiries (in-use secrets):** (no kubernetes.io/tls secrets found)" >> "$REPORT"
  emit_md_h3 "Unused Secrets"
  echo " - none" >> "$REPORT"
fi
echo "" >> "$REPORT"
# ----------------------------- Istio Checks ------------------------------
emit_md_h2 "Istio"
# istiod deployment
istiod=$("${KUBECTL[@]}" -n istio-system get deploy istiod -o json 2>/dev/null || true)
if [[ -n "$istiod" ]]; then
  un=$(jq -r '.status.unavailableReplicas // 0' <<<"$istiod")
  un="$(to_int "$un")"
  if [ "$un" -gt 0 ]; then
    emit_json "ERROR" "istio" "istiod" "istiod has unavailable replicas: $un" ""
    echo "- **istiod:** ❌ unavailable=$un" >> "$REPORT"
  else
    echo "- **istiod:** ✅ healthy" >> "$REPORT"
  fi
else
  echo "- **istiod:** (not found)" >> "$REPORT"
fi
# ingress gateway (classic deployment model)
igw=$("${KUBECTL[@]}" -n istio-system get deploy -l app=istio-ingressgateway -o json 2>/dev/null || true)
if [[ -n "$igw" ]]; then
  un=$(jq -r '[.items[]?|.status.unavailableReplicas // 0] | add' <<<"$igw")
  un="$(to_int "$un")"
  if [ "$un" -gt 0 ]; then
    emit_json "WARN" "istio" "ingressgateway" "IngressGateway unavailable: $un" ""
    echo "- **IngressGateway:** ⚠️ unavailable=$un" >> "$REPORT"
  else
    echo "- **IngressGateway:** ✅ healthy" >> "$REPORT"
  fi
fi
emit_md_h3 "Sidecar Injection Coverage"
# Namespaces with auto-injection enabled via the legacy label or a revision label.
inj_ns=$(jq -r '.items[]
  | select(.metadata.labels["istio-injection"]=="enabled" or (.metadata.labels["istio.io/rev"] != null))
  | .metadata.name' "$ART/namespaces.json")
missing_list=""
if [[ -n "$inj_ns" ]]; then
  while IFS= read -r ns; do
    pods=$(jq -r --arg ns "$ns" '
      .items[]
      | select(.metadata.namespace==$ns and (.status.phase=="Running" or .status.phase=="Pending"))
      | .metadata.name as $n
      | ((.spec.containers // []) | any(.name=="istio-proxy")) as $has
      | (.metadata.annotations["sidecar.istio.io/inject"] // "") as $inject
      | [$n, ($has|tostring), $inject] | @tsv
    ' "$ART/pods.json")
    while IFS=$'\t' read -r pn has inject; do
      [[ -z "$pn" ]] && continue
      # A pod that explicitly opts out of injection is not "missing" a sidecar.
      if [[ "$has" != "true" && "$inject" != "false" ]]; then
        missing_list+="$ns/$pn"$'\n'
      fi
    done <<< "$pods"
  done <<< "$inj_ns"
fi
if [[ -n "$missing_list" ]]; then
  emit_json "WARN" "istio" "sidecar-missing" "Pods missing istio-proxy in injection-enabled namespaces" "$missing_list"
  echo "- **Missing sidecars (in injection-enabled ns):** ⚠️" >> "$REPORT"; echo "$missing_list" | sed 's/^/ - /' >> "$REPORT"
else
  echo "- **Missing sidecars:** ✅ none (or no injection-enabled namespaces)" >> "$REPORT"
fi
echo "" >> "$REPORT"
# ----------------------------- App Discovery: Redis -----------------------
emit_md_h2 "App Health — Redis / RabbitMQ / MinIO"
emit_md_h3 "Redis"
# NOTE(review): a comma-joined selector requires BOTH labels; the name-based
# fallback below covers charts that set only one of them.
redis_objs=$("${KUBECTL[@]}" get deploy,sts --all-namespaces -l app=redis,app.kubernetes.io/name=redis -o json 2>/dev/null || true)
if [[ -z "$redis_objs" || "$(jq '.items|length' <<<"$redis_objs")" -eq 0 ]]; then
  # fallback: any Deployment/StatefulSet whose name contains "redis"
  redis_objs=$("${KUBECTL[@]}" get deploy,sts --all-namespaces -o json 2>/dev/null | jq '{items:[.items[]|select(.metadata.name|test("redis"))]}' || true)
fi
if [[ "$(jq '.items|length' <<<"$redis_objs" 2>/dev/null)" -gt 0 ]]; then
  while read -r line; do
    ns=$(cut -d' ' -f1 <<<"$line"); kind=$(cut -d' ' -f2 <<<"$line"); name=$(cut -d' ' -f3- <<<"$line")
    obj=$("${KUBECTL[@]}" -n "$ns" get "$kind" "$name" -o json)
    desired=$(jq -r '.spec.replicas // 1' <<<"$obj"); ready=$(jq -r '.status.readyReplicas // 0' <<<"$obj")
    desired="$(to_int "$desired")"; ready="$(to_int "$ready")"
    status="ok"; marker="✅"
    if [ "$ready" -lt "$desired" ]; then status="unavailable"; marker="❌"; emit_json "ERROR" "apps.redis" "$kind" "$ns/$name unavailable ($ready/$desired)" "Check pod logs and PVCs."; fi
    echo "- **$ns/$name ($kind):** $marker $ready/$desired ready" >> "$REPORT"
    # Endpoints for matching Services in the same namespace
    svc=$("${KUBECTL[@]}" -n "$ns" get svc -l "app=redis,app.kubernetes.io/name=redis" -o json 2>/dev/null || true)
    if [[ -n "$svc" && "$(jq '.items|length' <<<"$svc")" -gt 0 ]]; then
      while read -r sname; do
        eps=$(jq -r --arg ns "$ns" --arg s "$sname" '.items[]|select(.metadata.namespace==$ns and .metadata.name==$s)|(.subsets|length)' "$ART/endpoints.json")
        eps=${eps:-0}
        eps="$(to_int "$eps")"
        echo " - svc/$sname endpoints: $eps" >> "$REPORT"
        if [ "$eps" -eq 0 ]; then emit_json "ERROR" "apps.redis" "endpoints" "$ns/svc/$sname has 0 endpoints" ""; fi
      done < <(jq -r '.items[].metadata.name' <<<"$svc")
    fi
  done < <(jq -r '.items[]|.metadata.namespace+" "+.kind+" "+.metadata.name' <<<"$redis_objs")
else
  echo "- (no Redis discovered)" >> "$REPORT"
fi
echo "" >> "$REPORT"
# ----------------------------- App Discovery: RabbitMQ --------------------
emit_md_h3 "RabbitMQ"
# FIX: 'grep -c' prints "0" AND exits 1 on zero matches, so the old
# '|| echo 0' produced a two-line value that broke the (( )) arithmetic.
rabbit_crd="$(to_int "$(grep -c rabbitmqclusters.rabbitmq.com "$ART/app_crds.txt" 2>/dev/null || true)")"
if (( rabbit_crd > 0 )); then
  # Operator CR health (best effort)
  "${KUBECTL[@]}" get rabbitmqclusters.rabbitmq.com --all-namespaces -o json > "$ART/rabbit_cr.json" 2>/dev/null || true
  if [[ -s "$ART/rabbit_cr.json" ]]; then
    while read -r ns name ready; do
      marker="✅"; lvl="INFO"
      # FIX: the jq below extracts the Ready *condition status*
      # ("True"/"False"/"Unknown"), not a phase; the old comparison against
      # "Running"/"Ready" flagged every healthy cluster as an error.
      if [[ "$ready" != "True" ]]; then marker="❌"; lvl="ERROR"; fi
      echo "- **$ns/$name (RabbitmqCluster):** $marker Ready=$ready" >> "$REPORT"
      [[ "$lvl" == "ERROR" ]] && emit_json "ERROR" "apps.rabbitmq" "cluster" "$ns/$name Ready=$ready" "Check operator and pods."
    done < <(jq -r '.items[]|.metadata.namespace+" "+.metadata.name+" "+((.status.conditions[]?|select(.type=="Ready")|.status) // "Unknown")' "$ART/rabbit_cr.json" 2>/dev/null || true)
  fi
fi
# Fallback: Deploy/STS labelled or named like rabbitmq
rabbit_objs=$("${KUBECTL[@]}" get deploy,sts --all-namespaces -l app.kubernetes.io/name=rabbitmq,app=rabbitmq -o json 2>/dev/null || true)
if [[ -z "$rabbit_objs" || "$(jq '.items|length' <<<"$rabbit_objs")" -eq 0 ]]; then
  rabbit_objs=$("${KUBECTL[@]}" get deploy,sts --all-namespaces -o json 2>/dev/null | jq '{items:[.items[]|select(.metadata.name|test("rabbit"))]}' || true)
fi
if [[ "$(jq '.items|length' <<<"$rabbit_objs" 2>/dev/null)" -gt 0 ]]; then
  while read -r line; do
    ns=$(cut -d' ' -f1 <<<"$line"); kind=$(cut -d' ' -f2 <<<"$line"); name=$(cut -d' ' -f3- <<<"$line")
    obj=$("${KUBECTL[@]}" -n "$ns" get "$kind" "$name" -o json)
    desired=$(jq -r '.spec.replicas // 1' <<<"$obj"); ready=$(jq -r '.status.readyReplicas // 0' <<<"$obj")
    desired="$(to_int "$desired")"; ready="$(to_int "$ready")"
    marker="✅"; if [ "$ready" -lt "$desired" ]; then marker="❌"; emit_json "ERROR" "apps.rabbitmq" "$kind" "$ns/$name unavailable ($ready/$desired)" ""; fi
    echo "- **$ns/$name ($kind):** $marker $ready/$desired ready" >> "$REPORT"
  done < <(jq -r '.items[]|.metadata.namespace+" "+.kind+" "+.metadata.name' <<<"$rabbit_objs")
else
  echo "- (no RabbitMQ discovered)" >> "$REPORT"
fi
echo "" >> "$REPORT"
# ----------------------------- App Discovery: MinIO -----------------------
emit_md_h3 "MinIO"
# FIX: 'grep -c' prints "0" AND exits 1 on zero matches; '|| echo 0' produced
# a two-line value that broke the (( )) arithmetic. Normalize via to_int.
minio_tenants_crd="$(to_int "$(grep -c tenants.minio.min.io "$ART/app_crds.txt" 2>/dev/null || true)")"
if (( minio_tenants_crd > 0 )); then
  "${KUBECTL[@]}" get tenants.minio.min.io --all-namespaces -o json > "$ART/minio_tenants.json" 2>/dev/null || true
  if [[ -s "$ART/minio_tenants.json" ]]; then
    while read -r ns name ready; do
      marker="✅"
      [[ "$ready" != "True" ]] && marker="❌" && emit_json "ERROR" "apps.minio" "tenant" "$ns/$name not Ready" ""
      echo "- **$ns/$name (Tenant):** $marker Ready=$ready" >> "$REPORT"
    done < <(jq -r '.items[]|.metadata.namespace+" "+.metadata.name+" "+((.status.conditions[]?|select(.type=="Available")|.status)//"Unknown")' "$ART/minio_tenants.json")
  fi
fi
# Fallback: Deploy/STS named/labeled minio
minio_objs=$("${KUBECTL[@]}" get deploy,sts --all-namespaces -l app=minio,app.kubernetes.io/name=minio -o json 2>/dev/null || true)
if [[ -z "$minio_objs" || "$(jq '.items|length' <<<"$minio_objs")" -eq 0 ]]; then
  minio_objs=$("${KUBECTL[@]}" get deploy,sts --all-namespaces -o json 2>/dev/null | jq '{items:[.items[]|select(.metadata.name|test("minio"))]}' || true)
fi
if [[ "$(jq '.items|length' <<<"$minio_objs" 2>/dev/null)" -gt 0 ]]; then
  while read -r line; do
    ns=$(cut -d' ' -f1 <<<"$line"); kind=$(cut -d' ' -f2 <<<"$line"); name=$(cut -d' ' -f3- <<<"$line")
    obj=$("${KUBECTL[@]}" -n "$ns" get "$kind" "$name" -o json)
    desired=$(jq -r '.spec.replicas // 1' <<<"$obj"); ready=$(jq -r '.status.readyReplicas // 0' <<<"$obj")
    desired="$(to_int "$desired")"; ready="$(to_int "$ready")"
    marker="✅"; if [ "$ready" -lt "$desired" ]; then marker="❌"; emit_json "ERROR" "apps.minio" "$kind" "$ns/$name unavailable ($ready/$desired)" ""; fi
    echo "- **$ns/$name ($kind):** $marker $ready/$desired ready" >> "$REPORT"
    # List volumeClaimTemplates (StatefulSets create name-ordinal PVCs from these).
    claim_names=$(jq -r '.spec.volumeClaimTemplates[]?.metadata.name' <<<"$obj" 2>/dev/null || true)
    if [[ -n "$claim_names" ]]; then
      for cn in $claim_names; do
        echo " - PVC template: $cn" >> "$REPORT"
      done
    fi
  done < <(jq -r '.items[]|.metadata.namespace+" "+.kind+" "+.metadata.name' <<<"$minio_objs")
else
  echo "- (no MinIO discovered)" >> "$REPORT"
fi
echo "" >> "$REPORT"
# ----------------------------- Events Snapshot ---------------------------
emit_md_h2 "Recent Warning/Error Events (top 30)"
# Notes on the jq filter:
#  - '.reason // ""' guards test(): 'null | test(...)' is a jq error, and the
#    old form silently emptied the whole snapshot via 2>/dev/null.
#  - '|' in messages is escaped to '\|' so it cannot break the Markdown table.
#  - lastTimestamp can be null (events.k8s.io uses eventTime) — fall back.
events_tsv=$(jq -r '
  .items[]
  | select(.type=="Warning" or ((.reason // "")|test("BackOff|Failed|Error")))
  | [(.lastTimestamp // .eventTime // ""), .involvedObject.namespace, .involvedObject.kind, .involvedObject.name, (.reason // ""), (.message // "" | gsub("\n"; " ") | gsub("\\|"; "\\\\|"))]
  | @tsv' "$ART/events.json" 2>/dev/null | tail -n 30 || true)
if [[ -n "$events_tsv" ]]; then
  # printf instead of 'echo -e' — portable handling of the leading newline.
  printf '\n| Time | NS | Kind | Name | Reason | Message |\n' >> "$REPORT"
  echo "|---|---|---|---|---|---|" >> "$REPORT"
  while IFS=$'\t' read -r t ns k n r m; do
    echo "| $t | ${ns:-} | ${k:-} | ${n:-} | ${r:-} | ${m:-} |" >> "$REPORT"
  done <<< "$events_tsv"
else
  echo "- No recent warnings/errors." >> "$REPORT"
fi
# ----------------------------- Rollup & Exit -----------------------------
emit_md_h2 "Summary & Exit Code"
# Collapse the per-check JSONL stream into one overall severity:
# any ERROR entry wins, otherwise any WARN, otherwise OK.
LEVEL="OK"
if grep -q '"level":"ERROR"' "$JSONL" 2>/dev/null; then
  LEVEL="ERROR"
elif grep -q '"level":"WARN"' "$JSONL" 2>/dev/null; then
  LEVEL="WARN"
fi
printf -- '- **Overall:** %s\n' "${LEVEL}" >> "$REPORT"
# Materialize the JSONL entries as a single JSON array; write an empty
# array if the slurp fails (e.g. no entries were ever emitted).
jq -s '.' "$JSONL" > "$OUT_DIR/summary.json" 2>/dev/null || echo "[]" > "$OUT_DIR/summary.json"
echo
echo "Report written to: $REPORT"
echo "Artifacts in: $ART"
# Exit status mirrors severity: 0 = OK, 1 = WARN, 2 = ERROR.
if [[ "$LEVEL" == "ERROR" ]]; then
  exit 2
elif [[ "$LEVEL" == "WARN" ]]; then
  exit 1
fi
exit 0