Merge branch 'alertmanager-install' of kazu634/itamae into master
This commit is contained in:
commit
c3323eb4d7
|
@ -0,0 +1,57 @@
|
|||
# Variables shared between the version lookup below and the install steps:
alertmanager_url = ''
alertmanager_bin = ''

vtag = ''
tag = ''

# Calculate the download URL: resolve GitHub's "latest release" page and
# extract the version tag (e.g. "v0.21.0") from the redirect response body.
begin
  require 'net/http'
  require 'timeout' # Timeout.timeout is used explicitly; require it explicitly.

  uri = URI.parse('https://github.com/prometheus/alertmanager/releases/latest')

  Timeout.timeout(3) do
    response = Net::HTTP.get_response(uri)

    # The response body contains a link like ".../tag/v0.21.0".
    vtag = Regexp.last_match(1) if response.body =~ %r{tag/(v\d+\.\d+\.\d+)}

    # Fail fast when no version tag could be extracted — otherwise we would
    # silently build a bogus download URL from empty strings below.
    raise 'could not extract the latest version tag from the response' if vtag.empty?

    tag = vtag.sub(/^v/, '')

    alertmanager_bin = "#{node['alertmanager']['prefix']}#{tag}#{node['alertmanager']['postfix']}"

    alertmanager_url = "#{node['alertmanager']['url']}/#{vtag}/#{alertmanager_bin}"
  end
rescue => e
  # Abort the provisioning run, preserving the underlying cause for debugging
  # (the previous message also named the wrong scheme, http vs https):
  raise "Cannot determine the latest alertmanager release from https://github.com: #{e.message}"
end

# Check the installed version and only (re)install when it differs:
result = run_command("alertmanager --version 2>&1 | grep #{tag}", error: false)
if result.exit_status != 0
  # Download:
  TMP = "/tmp/#{alertmanager_bin}"

  execute "wget #{alertmanager_url} -O #{TMP}"

  # Install:
  directory node['alertmanager']['storage'] do
    owner 'root'
    group 'root'
    mode '755'
  end

  # The tarball has a single top-level directory; strip it on extraction.
  execute "tar zxf #{TMP} -C #{node['alertmanager']['storage']} --strip-components 1"

  # Change owner and permissions of the extracted binary:
  file "#{node['alertmanager']['storage']}alertmanager" do
    owner 'root'
    group 'root'
    mode '755'
  end

  # Create a symlink on the PATH:
  link "#{node['alertmanager']['location']}alertmanager" do
    to "#{node['alertmanager']['storage']}alertmanager"
  end
end
|
|
@ -0,0 +1,49 @@
|
|||
# Create `/etc/prometheus.d/alerts`:
%w(/etc/prometheus.d/alerts).each do |d|
  directory d do
    owner 'root'
    group 'root'
    mode '0755'
  end
end

# Deploy the `alertmanager.yml` configuration file:
remote_file '/etc/prometheus.d/alertmanager.yml' do
  owner 'root'
  group 'root'
  mode '644'

  notifies :restart, 'service[supervisor]'
end

# Deploy the alert-rule setting file:
remote_file '/etc/prometheus.d/alerts/resource.yml' do
  owner 'root'
  group 'root'
  mode '644'

  notifies :restart, 'service[supervisor]'
end

# Restart the `supervisor` only when notified:
service 'supervisor' do
  action :nothing
end

# Firewall settings here:
%w( 9093/tcp ).each do |p|
  execute "ufw allow #{p}" do
    user 'root'

    # Skip when the rule already exists. Fixed `LANG=c` -> `LANG=C`:
    # lowercase "c" is not a valid locale name, while `LANG=C` guarantees
    # untranslated ufw output (consistent with the command further below).
    not_if "LANG=C ufw status | grep #{p}"

    notifies :run, 'execute[ufw reload-or-enable]'
  end
end

execute 'ufw reload-or-enable' do
  user 'root'
  # Reload ufw; if it was inactive ("skipping"), force-enable it instead.
  command 'LANG=C ufw reload | grep skipping && ufw --force enable || exit 0'

  action :nothing
end
|
|
@ -23,5 +23,12 @@ node.reverse_merge!({
|
|||
'postfix' => '.linux-amd64.tar.gz',
|
||||
'storage' => '/opt/blackbox_exporter/bin/',
|
||||
'location' => '/usr/local/bin/'
|
||||
}
|
||||
},
|
||||
'alertmanager' => {
|
||||
'url' => 'https://github.com/prometheus/alertmanager/releases/download/',
|
||||
'prefix' => 'alertmanager-',
|
||||
'postfix' => '.linux-amd64.tar.gz',
|
||||
'storage' => '/opt/prometheus/',
|
||||
'location' => '/usr/local/bin/'
|
||||
},
|
||||
})
|
||||
|
|
|
@ -3,8 +3,11 @@ include_recipe './attributes.rb'
|
|||
|
||||
# Install the Prometheus manager stack (only on manager nodes):
if node['prometheus']['manager']
  %w(
    ./install.rb
    ./setup.rb
    ./prometheus_install.rb
    ./prometheus_setup.rb
    ./alertmanager_install.rb
    ./alertmanager_setup.rb
  ).each { |recipe| include_recipe recipe }
end
|
||||
|
||||
# Install the node_exporter here:
|
||||
|
|
|
@ -0,0 +1,17 @@
|
|||
# Alertmanager configuration: route every alert to Slack.
global:
  # NOTE(review): a live Slack webhook committed in plain text — it should be
  # rotated and injected from a secret store / template variable instead.
  slack_api_url: 'https://hooks.slack.com/services/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'

# Single top-level route: everything goes to the 'test-route' receiver,
# grouped by alert name.
route:
  receiver: 'test-route'
  group_by: [alertname]
  group_wait: 10s      # wait before sending the first notification of a group
  group_interval: 1m   # wait before sending about new alerts added to a group
  repeat_interval: 6h  # re-notify interval for still-firing alerts

receivers:
- name: 'test-route'
  slack_configs:
  - channel: '#ops'
    # Render one line per alert from its summary/description annotations.
    title: "{{ range .Alerts }}{{ .Annotations.summary }}\n{{ end }}"
    text: "{{ range .Alerts }}{{ .Annotations.description }}\n{{ end }}"
    send_resolved: true
|
@ -0,0 +1,119 @@
|
|||
# Prometheus alerting rules for node_exporter metrics.
groups:
- name: Node
  rules:
  - alert: OutOfMemory
    expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Out of memory (instance {{ $labels.instance }})"
      description: "Node memory is filling up (< 10% left)\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: UnusualNetworkThroughputIn
    expr: sum by (instance) (irate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Unusual network throughput in (instance {{ $labels.instance }})"
      description: "Host network interfaces are probably receiving too much data (> 100 MB/s)\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: UnusualNetworkThroughputOut
    expr: sum by (instance) (irate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Unusual network throughput out (instance {{ $labels.instance }})"
      description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: UnusualDiskReadRate
    expr: sum by (instance) (irate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Unusual disk read rate (instance {{ $labels.instance }})"
      description: "Disk is probably reading too much data (> 50 MB/s)\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: UnusualDiskWriteRate
    expr: sum by (instance) (irate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Unusual disk write rate (instance {{ $labels.instance }})"
      description: "Disk is probably writing too much data (> 50 MB/s)\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: OutOfDiskSpace
    expr: node_filesystem_free_bytes{mountpoint ="/"} / node_filesystem_size_bytes{mountpoint ="/"} * 100 < 10
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Out of disk space (instance {{ $labels.instance }})"
      description: "Disk is almost full (< 10% left)\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: OutOfInodes
    expr: node_filesystem_files_free{mountpoint ="/"} / node_filesystem_files{mountpoint ="/"} * 100 < 10
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Out of inodes (instance {{ $labels.instance }})"
      description: "Disk is almost running out of available inodes (< 10% left)\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  # The ratio below is seconds per operation; 100ms is 0.1 s.
  # The previous threshold of `> 100` (i.e. 100 s/op) could never match the
  # documented "> 100ms" condition and would effectively never fire.
  - alert: UnusualDiskReadLatency
    expr: rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Unusual disk read latency (instance {{ $labels.instance }})"
      description: "Disk latency is growing (read operations > 100ms)\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  # Same unit fix as UnusualDiskReadLatency: 100ms == 0.1 s/op.
  - alert: UnusualDiskWriteLatency
    expr: rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Unusual disk write latency (instance {{ $labels.instance }})"
      description: "Disk latency is growing (write operations > 100ms)\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: HighCpuLoad
    expr: 100 - (avg by(instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "High CPU load (instance {{ $labels.instance }})"
      description: "CPU load is > 80%\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: SwapIsFillingUp
    expr: (1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Swap is filling up (instance {{ $labels.instance }})"
      description: "Swap is filling up (>80%)\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: InstanceDown
    expr: up == 0
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: "Instance {{ $labels.instance }} down"
      # Description aligned with `for: 1m` (previously claimed 5 minutes).
      description: "{{ $labels.instance }} has been down for more than 1 minute."

  - alert: LoadAvg
    expr: node_load1 > 2
    for: 1m
    labels:
      severity: warning
    annotations:
      description: 'High Load Avg (1min) detected for instance {{ $labels.instance }}, the utilisation is currently: {{ $value }}%'
      summary: Load Average (1min) Alert
|
@ -0,0 +1,29 @@
|
|||
# Prometheus alerting rules for the monitoring stack itself.
groups:
- name: Prometheus
  rules:
  # NOTE(review): this duplicates the `up == 0` InstanceDown rule in the
  # node_exporter rule file — both will fire for the same outage. Confirm
  # whether both are intended.
  - alert: ExporterDown
    expr: up == 0
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: "Instance {{ $labels.instance }} down"
      # Description aligned with `for: 1m` (previously claimed 5 minutes).
      description: "{{ $labels.instance }} has been down for more than 1 minute."

  - alert: PrometheusConfigurationReload
    expr: prometheus_config_last_reload_successful != 1
    for: 5m
    labels:
      # NOTE(review): "error" is not used elsewhere in this repo (other rules
      # use warning/critical) — confirm the routing tree handles it.
      severity: error
    annotations:
      summary: "Prometheus configuration reload (instance {{ $labels.instance }})"
      description: "Prometheus configuration reload error\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: AlertmanagerConfigurationReload
    expr: alertmanager_config_last_reload_successful != 1
    for: 5m
    labels:
      severity: error
    annotations:
      summary: "AlertManager configuration reload (instance {{ $labels.instance }})"
      description: "AlertManager configuration reload error\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
|
|
@ -13,6 +13,8 @@ global:
|
|||
rule_files:
|
||||
# - "first.rules"
|
||||
# - "second.rules"
|
||||
- '/etc/prometheus.d/alerts/node_exporter.yml'
|
||||
- '/etc/prometheus.d/alerts/prometheus.yml'
|
||||
|
||||
# A scrape configuration containing exactly one endpoint to scrape:
|
||||
# Here it's Prometheus itself.
|
||||
|
@ -41,3 +43,9 @@ scrape_configs:
|
|||
bearer_token: 'e351393306ea245de5f9588cbe8627c74db007c6'
|
||||
static_configs:
|
||||
- targets: ['10.0.1.234:9099']
|
||||
alerting:
|
||||
alertmanagers:
|
||||
- static_configs:
|
||||
- targets:
|
||||
- "localhost:9093"
|
||||
|
||||
|
|
Loading…
Reference in New Issue