itamae/cookbooks/prometheus/files/etc/prometheus.d/alerts/node_exporter.yml

# Alerting rules, all collected in a single rule group named "Node".
# Most rules read node_* metrics; InstanceDown reads the `up` series.
groups:
- name: Node
  rules:
  # Fires when available memory has stayed below 10% of total memory
  # for 5 minutes.
  - alert: OutOfMemory
    expr: >-
      node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100
      < 10
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'Out of memory (instance {{ $labels.instance }})'
      description: |-
        Node memory is filling up (< 10% left)
          VALUE = {{ $value }}
          LABELS: {{ $labels }}

  # Fires when the per-instance sum of receive rates, divided by 1024 twice,
  # has stayed above 100 for 5 minutes.
  - alert: UnusualNetworkThroughputIn
    expr: >-
      sum by (instance) (irate(node_network_receive_bytes_total[2m]))
      / 1024 / 1024 > 100
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'Unusual network throughput in (instance {{ $labels.instance }})'
      description: |-
        Host network interfaces are probably receiving too much data (> 100 MB/s)
          VALUE = {{ $value }}
          LABELS: {{ $labels }}

  # Fires when the per-instance sum of transmit rates, divided by 1024 twice,
  # has stayed above 100 for 5 minutes.
  - alert: UnusualNetworkThroughputOut
    expr: >-
      sum by (instance) (irate(node_network_transmit_bytes_total[2m]))
      / 1024 / 1024 > 100
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'Unusual network throughput out (instance {{ $labels.instance }})'
      description: |-
        Host network interfaces are probably sending too much data (> 100 MB/s)
          VALUE = {{ $value }}
          LABELS: {{ $labels }}

  # Fires when the per-instance sum of disk read rates, divided by 1024 twice,
  # has stayed above 50 for 5 minutes.
  - alert: UnusualDiskReadRate
    expr: >-
      sum by (instance) (irate(node_disk_read_bytes_total[2m]))
      / 1024 / 1024 > 50
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'Unusual disk read rate (instance {{ $labels.instance }})'
      description: |-
        Disk is probably reading too much data (> 50 MB/s)
          VALUE = {{ $value }}
          LABELS: {{ $labels }}

  # Fires when the per-instance sum of disk write rates, divided by 1024
  # twice, has stayed above 50 for 5 minutes.
  - alert: UnusualDiskWriteRate
    expr: >-
      sum by (instance) (irate(node_disk_written_bytes_total[2m]))
      / 1024 / 1024 > 50
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'Unusual disk write rate (instance {{ $labels.instance }})'
      description: |-
        Disk is probably writing too much data (> 50 MB/s)
          VALUE = {{ $value }}
          LABELS: {{ $labels }}

  # Fires when free bytes on the filesystem mounted at "/" have stayed below
  # 10% of that filesystem's size for 5 minutes.
  - alert: OutOfDiskSpace
    expr: >-
      node_filesystem_free_bytes{mountpoint ="/"}
      / node_filesystem_size_bytes{mountpoint ="/"} * 100 < 10
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'Out of disk space (instance {{ $labels.instance }})'
      description: |-
        Disk is almost full (< 10% left)
          VALUE = {{ $value }}
          LABELS: {{ $labels }}

  # Fires when free inodes on the filesystem mounted at "/" have stayed below
  # 10% of that filesystem's inode count for 5 minutes.
  - alert: OutOfInodes
    expr: >-
      node_filesystem_files_free{mountpoint ="/"}
      / node_filesystem_files{mountpoint ="/"} * 100 < 10
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'Out of inodes (instance {{ $labels.instance }})'
      description: |-
        Disk is almost running out of available inodes (< 10% left)
          VALUE = {{ $value }}
          LABELS: {{ $labels }}

  # Fires when the average time per completed read has stayed above 100 ms
  # for 5 minutes.
  # The numerator is a *_seconds_total counter, so the ratio is in seconds
  # per read: the 100 ms limit stated in the description is written as 0.1.
  # (The previous threshold of 100 meant 100 seconds and never matched the
  # description.)
  - alert: UnusualDiskReadLatency
    expr: rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Unusual disk read latency (instance {{ $labels.instance }})"
      description: "Disk latency is growing (read operations > 100ms)\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  # Fires when the average time per completed write has stayed above 100 ms
  # for 5 minutes.
  # The numerator is a *_seconds_total counter, so the ratio is in seconds
  # per write: the 100 ms limit stated in the description is written as 0.1.
  # (The previous threshold of 100 meant 100 seconds and never matched the
  # description.)
  - alert: UnusualDiskWriteLatency
    expr: rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Unusual disk write latency (instance {{ $labels.instance }})"
      description: "Disk latency is growing (write operations > 100ms)\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  # Fires when 100 minus the per-instance average idle-mode CPU rate (as a
  # percentage) has stayed above 80 for 5 minutes.
  - alert: HighCpuLoad
    expr: >-
      100 - (avg by(instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)
      > 80
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'High CPU load (instance {{ $labels.instance }})'
      description: |-
        CPU load is > 80%
          VALUE = {{ $value }}
          LABELS: {{ $labels }}

  # Fires when the used share of swap (1 - free/total, as a percentage) has
  # stayed above 80 for 5 minutes.
  - alert: SwapIsFillingUp
    expr: >-
      (1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100
      > 80
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'Swap is filling up (instance {{ $labels.instance }})'
      description: |-
        Swap is filling up (>80%)
          VALUE = {{ $value }}
          LABELS: {{ $labels }}

  # Fires when a target's `up` series has been 0 for 1 minute.
  # The description states the same 1-minute window as `for:` below; keep the
  # two in sync if the duration is ever changed.
  - alert: InstanceDown
    expr: up == 0
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: "Instance {{ $labels.instance }} down"
      description: "{{ $labels.instance }} has been down for more than 1 minute."

  # Fires when the 1-minute load average (node_load1) has stayed above 2 for
  # 1 minute.
  # node_load1 is a plain load-average figure, not a percentage, so the
  # description reports the raw value without a "%" suffix.
  # NOTE(review): the threshold of 2 is not scaled by CPU count — confirm it
  # suits every monitored host.
  - alert: LoadAvg
    expr: node_load1 > 2
    for: 1m
    labels:
      severity: warning
    annotations:
      description: 'High Load Avg (1min) detected for instance {{ $labels.instance }}, the load average is currently: {{ $value }}'
      summary: Load Average (1min) Alert
Add alert settings. 2020-07-18 10:58:15 +00:00			`groups:`
			`- name: Node`
			`rules:`
			`- alert: OutOfMemory`
			`expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10`
			`for: 5m`
			`labels:`
			`severity: warning`
			`annotations:`
			`summary: "Out of memory (instance {{ $labels.instance }})"`
			`description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"`

			`- alert: UnusualNetworkThroughputIn`
			`expr: sum by (instance) (irate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100`
			`for: 5m`
			`labels:`
			`severity: warning`
			`annotations:`
			`summary: "Unusual network throughput in (instance {{ $labels.instance }})"`
			`description: "Host network interfaces are probably receiving too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"`

			`- alert: UnusualNetworkThroughputOut`
			`expr: sum by (instance) (irate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100`
			`for: 5m`
			`labels:`
			`severity: warning`
			`annotations:`
			`summary: "Unusual network throughput out (instance {{ $labels.instance }})"`
			`description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"`

			`- alert: UnusualDiskReadRate`
			`expr: sum by (instance) (irate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50`
			`for: 5m`
			`labels:`
			`severity: warning`
			`annotations:`
			`summary: "Unusual disk read rate (instance {{ $labels.instance }})"`
			`description: "Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"`

			`- alert: UnusualDiskWriteRate`
			`expr: sum by (instance) (irate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50`
			`for: 5m`
			`labels:`
			`severity: warning`
			`annotations:`
			`summary: "Unusual disk write rate (instance {{ $labels.instance }})"`
			`description: "Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"`

			`- alert: OutOfDiskSpace`
			`expr: node_filesystem_free_bytes{mountpoint ="/"} / node_filesystem_size_bytes{mountpoint ="/"} * 100 < 10`
			`for: 5m`
			`labels:`
			`severity: warning`
			`annotations:`
			`summary: "Out of disk space (instance {{ $labels.instance }})"`
			`description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"`

			`- alert: OutOfInodes`
			`expr: node_filesystem_files_free{mountpoint ="/"} / node_filesystem_files{mountpoint ="/"} * 100 < 10`
			`for: 5m`
			`labels:`
			`severity: warning`
			`annotations:`
			`summary: "Out of inodes (instance {{ $labels.instance }})"`
			`description: "Disk is almost running out of available inodes (< 10% left)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"`

			`- alert: UnusualDiskReadLatency`
			`expr: rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 100`
			`for: 5m`
			`labels:`
			`severity: warning`
			`annotations:`
			`summary: "Unusual disk read latency (instance {{ $labels.instance }})"`
			`description: "Disk latency is growing (read operations > 100ms)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"`

			`- alert: UnusualDiskWriteLatency`
			`expr: rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 100`
			`for: 5m`
			`labels:`
			`severity: warning`
			`annotations:`
			`summary: "Unusual disk write latency (instance {{ $labels.instance }})"`
			`description: "Disk latency is growing (write operations > 100ms)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"`

			`- alert: HighCpuLoad`
			`expr: 100 - (avg by(instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80`
			`for: 5m`
			`labels:`
			`severity: warning`
			`annotations:`
			`summary: "High CPU load (instance {{ $labels.instance }})"`
			`description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"`

			`- alert: SwapIsFillingUp`
			`expr: (1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80`
			`for: 5m`
			`labels:`
			`severity: warning`
			`annotations:`
			`summary: "Swap is filling up (instance {{ $labels.instance }})"`
			`description: "Swap is filling up (>80%)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"`

			`- alert: InstanceDown`
			`expr: up == 0`
			`for: 1m`
			`labels:`
			`severity: critical`
			`annotations:`
			`summary: "Instance {{ $labels.instance }} down"`
			`description: "{{ $labels.instance }} has been down for more than 5 minutes."`

			`- alert: LoadAvg`
			`expr: node_load1 > 2`
			`for: 1m`
			`labels:`
			`severity: warning`
			`annotations:`
			`description: 'High Load Avg (1min) detected for instance {{ $labels.instance }}, the utilisation is currently: {{ $value }}%'`
			`summary: Load Average (1min) Alert`