This commit is contained in:
Hubert Cornet 2024-03-24 12:10:53 +01:00
parent a6b5e452fb
commit d89afeb61b
10 changed files with 3208 additions and 14 deletions

10
alertmanager/config.yml Normal file
View File

@ -0,0 +1,10 @@
# Alertmanager routing: the root route sends every alert to the 'slack' receiver.
route:
  receiver: 'slack'

receivers:
  - name: 'slack'
    # NOTE(review): no notifier is configured for this receiver while the block
    # below stays commented out, so routed alerts are not forwarded anywhere.
    # Uncomment and fill in the placeholders to enable Slack notifications.
    # slack_configs:
    #   - send_resolved: true
    #     username: '<username>'
    #     channel: '#<channel-name>'
    #     api_url: '<incoming-webhook-url>'

View File

@ -1,6 +1,6 @@
version: '3.8'
#### networks
#### NETWORKS
networks:
docker-traefik_front_network:
external: true
@ -8,12 +8,10 @@ networks:
driver: bridge
attachable: true
volumes:
prometheus_data: {}
grafana_data: {}
#### services
#### SERVICES
services:
### prometheus
prometheus:
image: prom/prometheus
restart: always
@ -38,18 +36,19 @@ services:
labels:
- "traefik.enable=true"
- "traefik.docker.network=docker-traefik_front_network"
# HTTP
## HTTP
- "traefik.http.routers.prometheus-http.rule=Host(`prometheus.10.0.4.29.traefik.me`)"
- "traefik.http.routers.prometheus-http.entrypoints=http"
# HTTPS
## HTTPS
- "traefik.http.routers.prometheus-https.rule=Host(`prometheus.10.0.4.29.traefik.me`)"
- "traefik.http.routers.prometheus-https.entrypoints=https"
- "traefik.http.routers.prometheus-https.tls=true"
- "traefik.http.routers.prometheus-https.service=prometheus-service"
# Middleware
# Service
## Middleware
## Service
- "traefik.http.services.prometheus-service.loadbalancer.server.port=9090"
### node-exporter
node-exporter:
image: prom/node-exporter
volumes:
@ -69,6 +68,7 @@ services:
deploy:
mode: global
### alertmanager
alertmanager:
image: prom/alertmanager
restart: always
@ -82,6 +82,7 @@ services:
- '--config.file=/etc/alertmanager/config.yml'
- '--storage.path=/alertmanager'
### cadvisor
cadvisor:
image: gcr.io/cadvisor/cadvisor
volumes:
@ -97,6 +98,7 @@ services:
deploy:
mode: global
### grafana
grafana:
image: grafana/grafana
user: '472'
@ -118,14 +120,19 @@ services:
labels:
- "traefik.enable=true"
- "traefik.docker.network=interne"
# HTTP
## HTTP
- "traefik.http.routers.grafana-http.rule=Host(`grafana.10.0.4.29.traefik.me`)"
- "traefik.http.routers.grafana-http.entrypoints=http"
# HTTPS
## HTTPS
- "traefik.http.routers.grafana-https.rule=Host(`grafana.10.0.4.29.traefik.me`)"
- "traefik.http.routers.grafana-https.entrypoints=https"
- "traefik.http.routers.grafana-https.tls=true"
- "traefik.http.routers.grafana-https.service=grafana-service"
# Middleware
# Service
## Middleware
## Service
- "traefik.http.services.grafana-service.loadbalancer.server.port=3000"
#### VOLUMES
volumes:
prometheus_data: {}
grafana_data: {}

View File

@ -0,0 +1,3 @@
# Grafana settings passed as environment variables (GF_<SECTION>_<KEY> form).
# NOTE(review): presumably loaded by the grafana service via env_file — confirm.
# NOTE(review): 'foobar' is a weak password committed in plain text; replace it
# (or inject it from a secret store) before exposing Grafana.
GF_SECURITY_ADMIN_USER=admin
GF_SECURITY_ADMIN_PASSWORD=foobar
# Turn off self-service account sign-up.
GF_USERS_ALLOW_SIGN_UP=false

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,11 @@
# Grafana dashboard provisioning: declares where dashboard files are read from.
apiVersion: 1

providers:
  # Single file-based provider, registered for organisation 1.
  - name: 'Prometheus'
    orgId: 1
    # Empty string: no named folder is assigned to the provisioned dashboards.
    folder: ''
    type: file
    disableDeletion: false
    editable: true
    options:
      # Directory scanned for dashboard definitions.
      path: /etc/grafana/provisioning/dashboards

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,50 @@
# config file version
apiVersion: 1

# list of datasources that should be deleted from the database
deleteDatasources:
  - name: Prometheus
    orgId: 1
# list of datasources to insert/update, depending on
# what's available in the database
datasources:
  # <string, required> name of the datasource. Required
  - name: Prometheus
    # <string, required> datasource type. Required
    type: prometheus
    # <string, required> access mode. direct or proxy. Required
    access: proxy
    # <int> org id. will default to orgId 1 if not specified
    orgId: 1
    # <string> url
    url: http://prometheus:9090
    # <string> database password, if used
    password:
    # <string> database user, if used
    user:
    # <string> database name, if used
    database:
    # <bool> enable/disable basic auth
    basicAuth: false
    # <string> basic auth username, if used
    basicAuthUser:
    # <string> basic auth password, if used
    basicAuthPassword:
    # <bool> enable/disable with credentials headers
    withCredentials:
    # <bool> mark as default datasource. Max one per org
    isDefault: true
    # <map> fields that will be converted to json and stored in json_data
    jsonData:
      # NOTE(review): graphiteVersion looks Graphite-specific on a
      # prometheus-type datasource — confirm it is needed.
      graphiteVersion: "1.1"
      tlsAuth: false
      tlsAuthWithCACert: false
    # <string> json object of data that will be encrypted.
    # NOTE(review): the "..." values are placeholders, not certificates; TLS
    # auth is disabled above, so presumably they are unused — confirm or remove.
    secureJsonData:
      tlsCACert: "..."
      tlsClientCert: "..."
      tlsClientKey: "..."
    version: 1
    # <bool> allow users to edit datasources from the UI.
    editable: true

16
prometheus.yml Normal file
View File

@ -0,0 +1,16 @@
# Defaults applied to every scrape job unless the job overrides them.
global:
  scrape_interval: 15s
  evaluation_interval: 15s

# No rule files are loaded; the entries below are commented-out examples.
rule_files:
  # - "first.rules"
  # - "second.rules"

scrape_configs:
  # Prometheus scraping its own metrics endpoint.
  - job_name: prometheus
    static_configs:
      - targets: ['localhost:9090']

  # Application target; overrides the global interval with 5s.
  - job_name: app
    scrape_interval: 5s
    static_configs:
      - targets: ['host.docker.internal:10088']

22
prometheus/alert.rules Normal file
View File

@ -0,0 +1,22 @@
# Alerting rules: one group with an availability alert and a load alert.
groups:
  - name: example
    rules:
      # Alert for any instance that is unreachable for >2 minutes.
      - alert: service_down
        expr: up == 0
        for: 2m
        labels:
          severity: page
        annotations:
          summary: "Instance {{ $labels.instance }} down"
          description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 2 minutes."

      # Alert when the 1-minute load average stays above 0.5 for 2 minutes.
      - alert: high_load
        expr: node_load1 > 0.5
        for: 2m
        labels:
          severity: page
        annotations:
          summary: "Instance {{ $labels.instance }} under high load"
          description: "{{ $labels.instance }} of job {{ $labels.job }} is under high load."

82
prometheus/prometheus.yml Normal file
View File

@ -0,0 +1,82 @@
# my global config
global:
  scrape_interval: 15s  # By default, scrape targets every 15 seconds.
  evaluation_interval: 15s  # By default, evaluate rules every 15 seconds.
  # scrape_timeout is set to the global default (10s).

  # Attach these labels to any time series or alerts when communicating with
  # external systems (federation, remote storage, Alertmanager).
  external_labels:
    monitor: 'my-project'
# Rule files to load; their rules are evaluated once per
# 'evaluation_interval'.
rule_files:
  - 'alert.rules'
  # - "first.rules"
  # - "second.rules"
# Alertmanager instances that alerts are sent to.
alerting:
  alertmanagers:
    - scheme: http
      static_configs:
        - targets:
            - "alertmanager:9093"
# Scrape configurations: one job per monitored component (the application,
# Prometheus itself, cAdvisor, node-exporter and Traefik).
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  - job_name: app
    scrape_interval: 5s
    static_configs:
      - targets: ['host.docker.internal:8000']

  - job_name: 'prometheus'
    # Override the global default and scrape targets from this job every 5 seconds.
    scrape_interval: 5s
    static_configs:
      - targets: ['localhost:9090']

  - job_name: 'cadvisor'
    # Override the global default and scrape targets from this job every 5 seconds.
    scrape_interval: 5s
    # Targets are discovered by resolving the DNS A records of 'tasks.cadvisor'.
    dns_sd_configs:
      - names:
          - 'tasks.cadvisor'
        type: 'A'
        port: 8080
    # Static alternative to the DNS discovery above:
    # static_configs:
    #   - targets: ['cadvisor:8080']

  - job_name: 'node-exporter'
    # Override the global default and scrape targets from this job every 5 seconds.
    scrape_interval: 5s
    # Targets are discovered by resolving the DNS A records of 'tasks.node-exporter'.
    dns_sd_configs:
      - names:
          - 'tasks.node-exporter'
        type: 'A'
        port: 9100
    # Static alternative to the DNS discovery above:
    # static_configs:
    #   - targets: ['node-exporter:9100']

  # Disabled job, kept as a template for a pushgateway.
  # - job_name: 'pushgateway'
  #   scrape_interval: 10s
  #   dns_sd_configs:
  #     - names:
  #         - 'tasks.pushgateway'
  #       type: 'A'
  #       port: 9091

  # NOTE(review): hard-coded IP address — confirm it is stable in this environment.
  - job_name: 'traefik-app'
    scrape_interval: 5s
    static_configs:
      - targets: ['10.12.1.14:8181']