[K8s] Grafana Alert list
[Critical] Master Node
Master Kubelet Ready
Metrics browser > sum(up{job="kubelet",metrics_path="/metrics/probes",node=~"master0.p"})
when min() of query(A,5m,now) is below 3
MasterNode Ready
sum(kube_node_status_condition{condition="Ready",node=~"master01p|master02p|master03p",status="true"})
when min() of query(A,5m,now) is below 3
MasterNode Unreachable
sum(kube_node_status_condition{job="kube-state-metrics",status="true",condition="NetworkUnavailable",node=~"lgestgbee0.p"})
when max() of query(A,5m,now) is above 0
[Critical] Kubernetes System Pod
kube-scheduler
sum(kube_pod_status_ready{condition="true",pod=~"kube-scheduler-master0.*"})
when min() of query(A,5m,now) is below 3
kube-apiserver
sum(kube_pod_status_ready{namespace="kube-system",pod=~"kube-apiserver-master0.*",condition="true"})
when min() of query(A,5m,now) is below 3
kube-controller-manager
sum(kube_pod_status_ready{namespace="kube-system",pod=~"kube-controller-manager-master0.*",condition="true"})
when min() of query(A,5m,now) is below 3
[Critical] etcd
ETCD Ready
sum(kube_pod_status_ready{namespace="kube-system",pod=~"etcd-master0.*",condition="true"})
when min() of query(A,5m,now) is below 3
ETCD No Leader
etcd_server_has_leader{job="kube-etcd"}
when min() of query(A,5m,now) is below 1
[Critical] 서비스
PersistentVolume Error Count
sum(kube_persistentvolume_status_phase{phase=~"Pending|Failed"})
when max() of query(A,5m,now) is above 0
Trident CSI Ready Count
sum(kube_pod_status_ready{namespace="trident",pod=~"trident-csi.*",condition="true"})
when min() of query(A,5m,now) is below 43 *노드 수
Ingress Controller Ready
kube_deployment_status_replicas_ready{deployment="ingress-nginx-controller"}
when min() of query(A,5m,now) is below 1
[Warning] kubernetes
WorkerNode Ready
sum(kube_node_status_condition{condition="Ready",node=~"worker0.*|bworker.*",status="true"})
when min() of query(A,5m,now) is below 43 *노드 수
kube-proxy Ready
avg(kube_pod_status_ready{namespace="kube-system",pod=~"kube-proxy-.*",condition="true"})
when min() of query(A,5m,now) is below 1
Kube Memory Pressure
kube_node_status_condition{condition="MemoryPressure",status="true"}
when avg() of query(A,5m,now) is above 0
Kube CPU Overcommit
kube_node_status_allocatable{resource="cpu"}
when max() of query(A,5m,now) is above 64
KubeClientCertificateExpiration
kubelet_certificate_manager_client_expiration_renew_errors{job="kubelet",metrics_path="/metrics"}
when avg() of query(A,5m,now) is above 0