Merge branch 'alertmanager-install' of kazu634/itamae into master
This commit is contained in:
commit
c3323eb4d7
|
@ -0,0 +1,57 @@
|
||||||
|
# Install the latest Alertmanager release published on GitHub.
#
# Flow:
#   1. Ask GitHub for the latest release and extract its tag (e.g. `v0.21.0`).
#   2. Build the tarball name and download URL from the `alertmanager` node
#      attributes (`prefix`, `postfix`, `url`).
#   3. If the installed `alertmanager` does not already report that version,
#      download the tarball, unpack it into `storage`, and link the binary
#      into `location`.

require 'net/http'
require 'timeout'

# Calculate the Download URL:
release_page = URI.parse('https://github.com/prometheus/alertmanager/releases/latest')

response =
  begin
    Timeout.timeout(3) { Net::HTTP.get_response(release_page) }
  rescue StandardError
    # Abort the run. Only the HTTP request is guarded here, so this message
    # is raised for fetch failures and timeouts, not for unrelated errors.
    raise 'Cannot connect to http://github.com.'
  end

# Net::HTTP.get_response does not follow redirects. `releases/latest` is
# expected to answer with a redirect to `.../releases/tag/vX.Y.Z`, so the tag
# may be in the Location header, in the body, or in both: search both.
release_hint = [response['location'], response.body].compact.join("\n")
found = release_hint.match(%r{tag/(v\d+\.\d+\.\d+)})

# Without a tag the download URL and the version check below would be built
# from empty strings, so stop here with an explicit error instead.
raise 'Cannot determine the latest alertmanager release tag.' unless found

vtag = found[1]
tag  = vtag.sub(/^v/, '')

alertmanager_bin = "#{node['alertmanager']['prefix']}#{tag}#{node['alertmanager']['postfix']}"

# The configured base URL may end with `/`; strip it so the joined URL does
# not contain `//`.
alertmanager_url = "#{node['alertmanager']['url'].chomp('/')}/#{vtag}/#{alertmanager_bin}"

# Check the installed version to decide whether an update is needed:
result = run_command("alertmanager --version 2>&1 | grep #{tag}", error: false)

unless result.exit_status.zero?
  # Download:
  # (a local variable rather than a constant: a recipe-level constant would be
  # re-initialised if another recipe in the same run used the same name)
  tarball = "/tmp/#{alertmanager_bin}"

  execute "wget #{alertmanager_url} -O #{tarball}"

  # Install:
  directory node['alertmanager']['storage'] do
    owner 'root'
    group 'root'
    mode '755'
  end

  execute "tar zxf #{tarball} -C #{node['alertmanager']['storage']} --strip-components 1"

  # Change Owner and Permissions:
  file "#{node['alertmanager']['storage']}alertmanager" do
    owner 'root'
    group 'root'
    mode '755'
  end

  # Create Link
  link "#{node['alertmanager']['location']}alertmanager" do
    to "#{node['alertmanager']['storage']}alertmanager"
  end
end
|
|
@ -0,0 +1,49 @@
|
||||||
|
# Configure Alertmanager: create the alert-rule directory, deploy the
# Alertmanager configuration and the alert rule file, and open the
# Alertmanager port in ufw.

# Create `/etc/prometheus.d/alerts`:
%w(/etc/prometheus.d/alerts).each do |d|
  directory d do
    owner 'root'
    group 'root'
    mode '0755'
  end
end

# Deploy `alertmanager` file:
remote_file '/etc/prometheus.d/alertmanager.yml' do
  owner 'root'
  group 'root'
  mode '644'

  notifies :restart, 'service[supervisor]'
end

# Deploy alert setting file:
# NOTE(review): the Prometheus configuration changed in the same commit lists
# `/etc/prometheus.d/alerts/node_exporter.yml` and
# `/etc/prometheus.d/alerts/prometheus.yml` under `rule_files`, while only
# `resource.yml` is deployed here -- confirm the file names line up.
remote_file '/etc/prometheus.d/alerts/resource.yml' do
  owner 'root'
  group 'root'
  mode '644'

  notifies :restart, 'service[supervisor]'
end

# Restart the `supervisor`:
# (declared with `:nothing` so it only acts when notified by the resources above)
service 'supervisor' do
  action :nothing
end

# Firewall settings here:
%w( 9093/tcp ).each do |p|
  execute "ufw allow #{p}" do
    user 'root'

    # Skip when the port already appears in `ufw status`. `LANG=C` (upper
    # case, matching the reload command below) is used for both ufw calls.
    not_if "LANG=C ufw status | grep #{p}"

    notifies :run, 'execute[ufw reload-or-enable]'
  end
end

execute 'ufw reload-or-enable' do
  user 'root'
  # If the reload output contains "skipping", force-enable ufw; the trailing
  # `|| exit 0` makes the command exit successfully either way.
  command 'LANG=C ufw reload | grep skipping && ufw --force enable || exit 0'

  action :nothing
end
|
|
@ -23,5 +23,12 @@ node.reverse_merge!({
|
||||||
'postfix' => '.linux-amd64.tar.gz',
|
'postfix' => '.linux-amd64.tar.gz',
|
||||||
'storage' => '/opt/blackbox_exporter/bin/',
|
'storage' => '/opt/blackbox_exporter/bin/',
|
||||||
'location' => '/usr/local/bin/'
|
'location' => '/usr/local/bin/'
|
||||||
}
|
},
|
||||||
|
'alertmanager' => {
|
||||||
|
'url' => 'https://github.com/prometheus/alertmanager/releases/download/',
|
||||||
|
'prefix' => 'alertmanager-',
|
||||||
|
'postfix' => '.linux-amd64.tar.gz',
|
||||||
|
'storage' => '/opt/prometheus/',
|
||||||
|
'location' => '/usr/local/bin/'
|
||||||
|
},
|
||||||
})
|
})
|
||||||
|
|
|
@ -3,8 +3,11 @@ include_recipe './attributes.rb'
|
||||||
|
|
||||||
# Install the Prometheus manager:
|
# Install the Prometheus manager:
|
||||||
if node['prometheus']['manager']
|
if node['prometheus']['manager']
|
||||||
include_recipe './install.rb'
|
include_recipe './prometheus_install.rb'
|
||||||
include_recipe './setup.rb'
|
include_recipe './prometheus_setup.rb'
|
||||||
|
|
||||||
|
include_recipe './alertmanager_install.rb'
|
||||||
|
include_recipe './alertmanager_setup.rb'
|
||||||
end
|
end
|
||||||
|
|
||||||
# Install the node_exporter here:
|
# Install the node_exporter here:
|
||||||
|
|
|
@ -0,0 +1,17 @@
|
||||||
|
# Alertmanager configuration: every alert is routed to a single Slack receiver.

global:
  # NOTE(review): this webhook URL is a credential committed in plain text.
  # Consider rotating it and supplying it from outside version control.
  slack_api_url: 'https://hooks.slack.com/services/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'

# Single top-level route: alerts are grouped by alert name.
route:
  receiver: 'test-route'
  group_by:
    - alertname
  group_wait: 10s
  group_interval: 1m
  repeat_interval: 6h

receivers:
  - name: 'test-route'
    slack_configs:
      - channel: '#ops'
        # One line per alert in the group, taken from the rule annotations.
        title: "{{ range .Alerts }}{{ .Annotations.summary }}\n{{ end }}"
        text: "{{ range .Alerts }}{{ .Annotations.description }}\n{{ end }}"
        send_resolved: true
|
|
@ -0,0 +1,119 @@
|
||||||
|
# Alerting rules for host-level metrics (node_exporter metric names).
# Each rule attaches a `severity` label plus `summary` / `description`
# annotations.
groups:
- name: Node
  rules:
  # Less than 10% of memory available.
  - alert: OutOfMemory
    expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Out of memory (instance {{ $labels.instance }})"
      description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

  # Receive rate summed per instance, converted from bytes/s to MiB/s.
  - alert: UnusualNetworkThroughputIn
    expr: sum by (instance) (irate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Unusual network throughput in (instance {{ $labels.instance }})"
      description: "Host network interfaces are probably receiving too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

  # Transmit rate summed per instance, converted from bytes/s to MiB/s.
  - alert: UnusualNetworkThroughputOut
    expr: sum by (instance) (irate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Unusual network throughput out (instance {{ $labels.instance }})"
      description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

  - alert: UnusualDiskReadRate
    expr: sum by (instance) (irate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Unusual disk read rate (instance {{ $labels.instance }})"
      description: "Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

  - alert: UnusualDiskWriteRate
    expr: sum by (instance) (irate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Unusual disk write rate (instance {{ $labels.instance }})"
      description: "Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

  # Only the root filesystem is checked.
  - alert: OutOfDiskSpace
    expr: node_filesystem_free_bytes{mountpoint ="/"} / node_filesystem_size_bytes{mountpoint ="/"} * 100 < 10
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Out of disk space (instance {{ $labels.instance }})"
      description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

  # Only the root filesystem is checked.
  - alert: OutOfInodes
    expr: node_filesystem_files_free{mountpoint ="/"} / node_filesystem_files{mountpoint ="/"} * 100 < 10
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Out of inodes (instance {{ $labels.instance }})"
      description: "Disk is almost running out of available inodes (< 10% left)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

  # The ratio is seconds spent per completed read, so the 100ms threshold
  # named in the description is 0.1 (a threshold of 100 would mean 100 s).
  - alert: UnusualDiskReadLatency
    expr: rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Unusual disk read latency (instance {{ $labels.instance }})"
      description: "Disk latency is growing (read operations > 100ms)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

  # Same unit reasoning as the read-latency rule: seconds per completed write.
  - alert: UnusualDiskWriteLatency
    expr: rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Unusual disk write latency (instance {{ $labels.instance }})"
      description: "Disk latency is growing (write operations > 100ms)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

  # 100 minus the average idle percentage across an instance's CPUs.
  - alert: HighCpuLoad
    expr: 100 - (avg by(instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "High CPU load (instance {{ $labels.instance }})"
      description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

  - alert: SwapIsFillingUp
    expr: (1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Swap is filling up (instance {{ $labels.instance }})"
      description: "Swap is filling up (>80%)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

  # Fires after the target has been down for 1 minute (`for: 1m`); the
  # description states the same duration.
  # NOTE(review): the `Prometheus` rule group added in the same commit has an
  # `ExporterDown` rule with the same `up == 0` expression -- confirm both
  # are wanted.
  - alert: InstanceDown
    expr: up == 0
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: "Instance {{ $labels.instance }} down"
      description: "{{ $labels.instance }} has been down for more than 1 minute."

  # node_load1 is a load average, not a percentage, so no `%` suffix is shown.
  - alert: LoadAvg
    expr: node_load1 > 2
    for: 1m
    labels:
      severity: warning
    annotations:
      description: 'High Load Avg (1min) detected for instance {{ $labels.instance }}, the load is currently: {{ $value }}'
      summary: Load Average (1min) Alert
|
|
@ -0,0 +1,29 @@
|
||||||
|
# Alerting rules for the monitoring stack itself: scrape target availability
# and configuration reload status of Prometheus and Alertmanager.
groups:
- name: Prometheus
  rules:
  # Fires after a scrape target has been down for 1 minute (`for: 1m`); the
  # description states the same duration.
  - alert: ExporterDown
    expr: up == 0
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: "Instance {{ $labels.instance }} down"
      description: "{{ $labels.instance }} has been down for more than 1 minute."

  # The last configuration reload did not succeed.
  - alert: PrometheusConfigurationReload
    expr: prometheus_config_last_reload_successful != 1
    for: 5m
    labels:
      severity: error
    annotations:
      summary: "Prometheus configuration reload (instance {{ $labels.instance }})"
      description: "Prometheus configuration reload error\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

  # The last configuration reload did not succeed.
  - alert: AlertmanagerConfigurationReload
    expr: alertmanager_config_last_reload_successful != 1
    for: 5m
    labels:
      severity: error
    annotations:
      summary: "AlertManager configuration reload (instance {{ $labels.instance }})"
      description: "AlertManager configuration reload error\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
|
|
@ -13,6 +13,8 @@ global:
|
||||||
rule_files:
|
rule_files:
|
||||||
# - "first.rules"
|
# - "first.rules"
|
||||||
# - "second.rules"
|
# - "second.rules"
|
||||||
|
- '/etc/prometheus.d/alerts/node_exporter.yml'
|
||||||
|
- '/etc/prometheus.d/alerts/prometheus.yml'
|
||||||
|
|
||||||
# A scrape configuration containing exactly one endpoint to scrape:
|
# A scrape configuration containing exactly one endpoint to scrape:
|
||||||
# Here it's Prometheus itself.
|
# Here it's Prometheus itself.
|
||||||
|
@ -41,3 +43,9 @@ scrape_configs:
|
||||||
bearer_token: 'e351393306ea245de5f9588cbe8627c74db007c6'
|
bearer_token: 'e351393306ea245de5f9588cbe8627c74db007c6'
|
||||||
static_configs:
|
static_configs:
|
||||||
- targets: ['10.0.1.234:9099']
|
- targets: ['10.0.1.234:9099']
|
||||||
|
alerting:
|
||||||
|
alertmanagers:
|
||||||
|
- static_configs:
|
||||||
|
- targets:
|
||||||
|
- "localhost:9093"
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue