groups: - name: example rules: - name: targets rules: - alert: monitor_service_down expr: up == 0 for: 30s labels: severity: critical annotations: summary: "Monitor service non-operational" description: "Service {{ $labels.instance }} is down." - name: host rules: - alert: high_cpu_load expr: node_load1 > 1.5 for: 30s labels: severity: warning annotations: summary: "Server under high load" description: "Docker host is under high load, the avg load 1m is at {{ $value}}. Reported by instance {{ $labels.instance }} of job {{ $labels.job }}." - alert: high_memory_load expr: (sum(node_memory_MemTotal_bytes) - sum(node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes) ) / sum(node_memory_MemTotal_bytes) * 100 > 85 for: 30s labels: severity: warning annotations: summary: "Server memory is almost full" description: "Docker host memory usage is {{ humanize $value}}%. Reported by instance {{ $labels.instance }} of job {{ $labels.job }}." - alert: high_storage_load expr: (node_filesystem_size_bytes{fstype="aufs"} - node_filesystem_free_bytes{fstype="aufs"}) / node_filesystem_size_bytes{fstype="aufs"} * 100 > 85 for: 30s labels: severity: warning annotations: summary: "Server storage is almost full" description: "Docker host storage usage is {{ humanize $value}}%. Reported by instance {{ $labels.instance }} of job {{ $labels.job }}." - name: containers rules: - alert: jenkins_down expr: absent(container_memory_usage_bytes{name="jenkins"}) for: 30s labels: severity: critical annotations: summary: "Jenkins down" description: "Jenkins container is down for more than 30 seconds." # Alert for any instance that is unreachable for >2 minutes. - alert: service_down expr: up == 0 for: 2m labels: severity: page annotations: summary: "Instance {{ $labels.instance }} down" description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 2 minutes." - alert: high_load expr: node_load1 > 0.5 for: 2m labels: severity: page annotations: summary: "Instance {{ $labels.instance }} under high load" description: "{{ $labels.instance }} of job {{ $labels.job }} is under high load."