Add alert settings.
This commit is contained in:
parent
2c70f62509
commit
6f0f7ff472
|
@ -0,0 +1,119 @@
|
||||||
|
groups:
|
||||||
|
- name: Node
|
||||||
|
rules:
|
||||||
|
# Fires when available memory stays below 10% of total memory for 5 minutes.
- alert: OutOfMemory
  # MemAvailable / MemTotal scaled to a percentage, compared against 10.
  expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: "Out of memory (instance {{ $labels.instance }})"
    description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
|
||||||
|
|
||||||
|
# Fires when the per-instance receive rate stays above 100 for 5 minutes.
- alert: UnusualNetworkThroughputIn
  # Received bytes/s summed per instance, divided by 1024 twice before the
  # comparison (the description reports the unit as MB/s).
  expr: sum by (instance) (irate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: "Unusual network throughput in (instance {{ $labels.instance }})"
    description: "Host network interfaces are probably receiving too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
|
||||||
|
|
||||||
|
# Fires when the per-instance transmit rate stays above 100 for 5 minutes.
- alert: UnusualNetworkThroughputOut
  # Transmitted bytes/s summed per instance, divided by 1024 twice before the
  # comparison (the description reports the unit as MB/s).
  expr: sum by (instance) (irate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: "Unusual network throughput out (instance {{ $labels.instance }})"
    description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
|
||||||
|
|
||||||
|
# Fires when the per-instance disk read rate stays above 50 for 5 minutes.
- alert: UnusualDiskReadRate
  # Read bytes/s summed per instance, divided by 1024 twice before the
  # comparison (the description reports the unit as MB/s).
  expr: sum by (instance) (irate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: "Unusual disk read rate (instance {{ $labels.instance }})"
    description: "Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
|
||||||
|
|
||||||
|
# Fires when the per-instance disk write rate stays above 50 for 5 minutes.
- alert: UnusualDiskWriteRate
  # Written bytes/s summed per instance, divided by 1024 twice before the
  # comparison (the description reports the unit as MB/s).
  expr: sum by (instance) (irate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: "Unusual disk write rate (instance {{ $labels.instance }})"
    description: "Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
|
||||||
|
|
||||||
|
# Fires when free space on the "/" mountpoint stays below 10% of its size
# for 5 minutes.
- alert: OutOfDiskSpace
  # NOTE(review): this uses node_filesystem_free_bytes; confirm whether
  # node_filesystem_avail_bytes (space usable by non-root users) is the
  # intended metric.
  expr: node_filesystem_free_bytes{mountpoint ="/"} / node_filesystem_size_bytes{mountpoint ="/"} * 100 < 10
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: "Out of disk space (instance {{ $labels.instance }})"
    description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
|
||||||
|
|
||||||
|
# Fires when free inodes on the "/" mountpoint stay below 10% of the total
# for 5 minutes.
- alert: OutOfInodes
  # Free inode count over total inode count, scaled to a percentage.
  expr: node_filesystem_files_free{mountpoint ="/"} / node_filesystem_files{mountpoint ="/"} * 100 < 10
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: "Out of inodes (instance {{ $labels.instance }})"
    description: "Disk is almost running out of available inodes (< 10% left)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
|
||||||
|
|
||||||
|
# Fires when the average time per completed read stays above 100 ms for
# 5 minutes.
- alert: UnusualDiskReadLatency
  # Seconds spent reading divided by reads completed gives seconds per read.
  # The threshold is therefore 0.1 (100 ms, as the description states); the
  # previous value of 100 meant 100 seconds per read and would never fire in
  # practice.
  expr: rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: "Unusual disk read latency (instance {{ $labels.instance }})"
    description: "Disk latency is growing (read operations > 100ms)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
|
||||||
|
|
||||||
|
# Fires when the average time per completed write stays above 100 ms for
# 5 minutes.
- alert: UnusualDiskWriteLatency
  # Seconds spent writing divided by writes completed gives seconds per
  # write. The threshold is therefore 0.1 (100 ms, as the description
  # states); the previous value of 100 meant 100 seconds per write and would
  # never fire in practice.
  expr: rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: "Unusual disk write latency (instance {{ $labels.instance }})"
    description: "Disk latency is growing (write operations > 100ms)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
|
||||||
|
|
||||||
|
# Fires when non-idle CPU time stays above 80% for 5 minutes.
- alert: HighCpuLoad
  # 100 minus the per-instance average idle-mode rate (as a percentage)
  # yields the busy percentage.
  expr: 100 - (avg by(instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: "High CPU load (instance {{ $labels.instance }})"
    description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
|
||||||
|
|
||||||
|
# Fires when used swap stays above 80% of total swap for 5 minutes.
- alert: SwapIsFillingUp
  # (1 - SwapFree / SwapTotal) is the used fraction, scaled to a percentage.
  expr: (1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: "Swap is filling up (instance {{ $labels.instance }})"
    description: "Swap is filling up (>80%)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
|
||||||
|
|
||||||
|
# Fires when a scrape target has reported up == 0 for 1 minute.
- alert: InstanceDown
  expr: up == 0
  for: 1m
  labels:
    severity: critical
  annotations:
    summary: "Instance {{ $labels.instance }} down"
    # The duration in this text must match `for:` above; it previously
    # claimed 5 minutes while the rule fires after 1 minute.
    description: "{{ $labels.instance }} has been down for more than 1 minute."
|
||||||
|
|
||||||
|
# Fires when the 1-minute load average stays above 2 for 1 minute.
- alert: LoadAvg
  # NOTE(review): the threshold is an absolute load value and is not scaled
  # by CPU count; confirm that 2 suits every monitored host.
  expr: node_load1 > 2
  for: 1m
  labels:
    severity: warning
  annotations:
    summary: Load Average (1min) Alert
    # node_load1 is a load average, not a percentage, so the value is
    # reported without a trailing "%".
    description: 'High Load Avg (1min) detected for instance {{ $labels.instance }}, the load is currently: {{ $value }}'
|
|
@ -0,0 +1,29 @@
|
||||||
|
groups:
|
||||||
|
- name: Prometheus
|
||||||
|
rules:
|
||||||
|
# Fires when a scrape target has reported up == 0 for 1 minute.
# NOTE(review): the InstanceDown rule in the Node group uses the same
# expression, so both alerts fire for each down target; confirm that the
# duplication is intended.
- alert: ExporterDown
  expr: up == 0
  for: 1m
  labels:
    severity: critical
  annotations:
    summary: "Instance {{ $labels.instance }} down"
    # The duration in this text must match `for:` above; it previously
    # claimed 5 minutes while the rule fires after 1 minute.
    description: "{{ $labels.instance }} has been down for more than 1 minute."
|
||||||
|
|
||||||
|
# Fires when prometheus_config_last_reload_successful has been anything
# other than 1 for 5 minutes.
- alert: PrometheusConfigurationReload
  expr: prometheus_config_last_reload_successful != 1
  for: 5m
  labels:
    # NOTE(review): other rules use "warning" or "critical"; confirm that
    # Alertmanager routing handles the "error" severity.
    severity: error
  annotations:
    summary: "Prometheus configuration reload (instance {{ $labels.instance }})"
    description: "Prometheus configuration reload error\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
|
||||||
|
|
||||||
|
# Fires when alertmanager_config_last_reload_successful has been anything
# other than 1 for 5 minutes.
- alert: AlertmanagerConfigurationReload
  expr: alertmanager_config_last_reload_successful != 1
  for: 5m
  labels:
    # NOTE(review): other rules use "warning" or "critical"; confirm that
    # Alertmanager routing handles the "error" severity.
    severity: error
  annotations:
    summary: "AlertManager configuration reload (instance {{ $labels.instance }})"
    description: "AlertManager configuration reload error\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
|
Loading…
Reference in New Issue