74 lines
2.4 KiB
Plaintext
74 lines
2.4 KiB
Plaintext
groups:
# Placeholder group: no rules are listed under it before the next group begins.
# The empty list is written explicitly because a bare `rules:` key parses as
# null rather than as an empty sequence.
- name: example
  rules: []
# Scrape-target availability.
- name: targets
  rules:
  # Fires for every series where `up == 0` has held for 30 seconds.
  - alert: monitor_service_down
    expr: up == 0
    for: 30s
    labels:
      severity: critical
    annotations:
      summary: "Monitor service non-operational"
      description: "Service {{ $labels.instance }} is down."
# Host-level resource alerts; each condition must hold for 30 seconds.
- name: host
  rules:
  # 1-minute load average (node_load1) above 1.5.
  - alert: high_cpu_load
    expr: node_load1 > 1.5
    for: 30s
    labels:
      severity: warning
    annotations:
      summary: "Server under high load"
      description: "Docker host is under high load, the avg load 1m is at {{ $value}}. Reported by instance {{ $labels.instance }} of job {{ $labels.job }}."

  # Memory in use (total minus free, buffers and cached) above 85% of total.
  # The aggregation keeps `instance` and `job`: a bare sum() drops every label,
  # which would leave the {{ $labels.instance }} / {{ $labels.job }} references
  # in the description empty.
  - alert: high_memory_load
    expr: >-
      (sum by (instance, job) (node_memory_MemTotal_bytes) -
      sum by (instance, job) (node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes)) /
      sum by (instance, job) (node_memory_MemTotal_bytes) * 100 > 85
    for: 30s
    labels:
      severity: warning
    annotations:
      summary: "Server memory is almost full"
      description: "Docker host memory usage is {{ humanize $value}}%. Reported by instance {{ $labels.instance }} of job {{ $labels.job }}."

  # Used space above 85% on filesystems whose fstype label is "aufs".
  # NOTE(review): filesystems with any other fstype never match this selector;
  # confirm the monitored hosts actually expose aufs mounts.
  - alert: high_storage_load
    expr: (node_filesystem_size_bytes{fstype="aufs"} - node_filesystem_free_bytes{fstype="aufs"}) / node_filesystem_size_bytes{fstype="aufs"} * 100 > 85
    for: 30s
    labels:
      severity: warning
    annotations:
      summary: "Server storage is almost full"
      description: "Docker host storage usage is {{ humanize $value}}%. Reported by instance {{ $labels.instance }} of job {{ $labels.job }}."
# Container check, followed by two instance-level alerts.
- name: containers
  rules:
  # Fires when no container_memory_usage_bytes series with name="jenkins"
  # has existed for 30 seconds.
  - alert: jenkins_down
    expr: absent(container_memory_usage_bytes{name="jenkins"})
    for: 30s
    labels:
      severity: critical
    annotations:
      summary: "Jenkins down"
      description: "Jenkins container is down for more than 30 seconds."

  # Alert for any instance that is unreachable for >2 minutes.
  # NOTE(review): same expression as monitor_service_down in the targets
  # group, with a longer hold time and a different severity; confirm both
  # alerts are wanted.
  - alert: service_down
    expr: up == 0
    for: 2m
    labels:
      severity: page
    annotations:
      summary: "Instance {{ $labels.instance }} down"
      description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 2 minutes."

  # Fires when node_load1 stays above 0.5 for 2 minutes.
  # NOTE(review): overlaps with high_cpu_load in the host group, which uses
  # a 1.5 threshold on the same metric; confirm the lower threshold is
  # intentional.
  - alert: high_load
    expr: node_load1 > 0.5
    for: 2m
    labels:
      severity: page
    annotations:
      summary: "Instance {{ $labels.instance }} under high load"
      description: "{{ $labels.instance }} of job {{ $labels.job }} is under high load."