Kubernetes

[K8s] Grafana Alarm list

밝은숲 2022. 10. 24. 16:10

[Critical] Master Node
Master Kubelet Ready
Metrics browser > sum(up{job="kubelet",metrics_path="/metrics/probes",node=~"master0.p"})
when min() of query(A,5m,now) is below 3

MasterNode Ready
sum(kube_node_status_condition{condition="Ready",node=~"master01p|master02p|master03p",status="true"})
when min() of query(A,5m,now) is below 3

MasterNode Unreachable
sum(kube_node_status_condition{job="kube-state-metrics",status="true",condition="NetworkUnavailable",node=~"lgestgbee0.p"})
when max() of query(A,5m,now) is above 0



[Critical] Kubernetes System Pod
kube-scheduler
sum(kube_pod_status_ready{condition="true",pod=~"kube-scheduler-master0.*"})
when min() of query(A,5m,now) is below 3

kube-apiserver
sum(kube_pod_status_ready{namespace="kube-system",pod=~"kube-apiserver-master0.*",condition="true"})
when min() of query(A,5m,now) is below 3

kube-controller-manager
sum(kube_pod_status_ready{namespace="kube-system",pod=~"kube-controller-manager-master0.*",condition="true"})
when min() of query(A,5m,now) is below 3



[Critical] etcd
ETCD Ready
sum(kube_pod_status_ready{namespace="kube-system",pod=~"etcd-master0.*",condition="true"})
when min() of query(A,5m,now) is below 3

ETCD No Leader
etcd_server_has_leader{job="kube-etcd"}
when min() of query(A,5m,now) is below 1



[Critical] 서비스
PersistentVolume Error Count
sum(kube_persistentvolume_status_phase{phase=~"Pending|Failed"})
when max() of query(A,5m,now) is above 0

Trident CSI Ready Count
sum(kube_pod_status_ready{namespace="trident",pod=~"trident-csi.*",condition="true"})
when min() of query(A,5m,now) is below 43 (※ 43 = 노드 수, 환경에 맞게 변경)

Ingress Controller Ready
kube_deployment_status_replicas_ready{deployment="ingress-nginx-controller"}
when min() of query(A,5m,now) is below 1



[Warning] kubernetes
WorkerNode Ready
sum(kube_node_status_condition{condition="Ready",node=~"worker0.*|bworker.*",status="true"})
when min() of query(A,5m,now) is below 43 (※ 43 = 노드 수, 환경에 맞게 변경)

kube-proxy Ready
avg(kube_pod_status_ready{namespace="kube-system",pod=~"kube-proxy-.*",condition="true"})
when min() of query(A,5m,now) is below 1

Kube Memory Pressure
kube_node_status_condition{condition="MemoryPressure",status="true"}
when avg() of query(A,5m,now) is above 0

Kube CPU Overcommit
kube_node_status_allocatable{resource="cpu"}
when max() of query(A,5m,now) is above 64

KubeClientCertificateExpiration
kubelet_certificate_manager_client_expiration_renew_errors{job="kubelet",metrics_path="/metrics"}
when avg() of query(A,5m,now) is above 0