diff --git a/cookbooks/prometheus/alertmanager_install.rb b/cookbooks/prometheus/alertmanager_install.rb new file mode 100644 index 0000000..b182f28 --- /dev/null +++ b/cookbooks/prometheus/alertmanager_install.rb @@ -0,0 +1,59 @@ +alertmanager_url = '' +alertmanager_bin = '' + +vtag = '' +tag = '' + +# Calculate the Download URL: +begin + require 'net/http' + + uri = URI.parse('https://github.com/prometheus/alertmanager/releases/latest') + + Timeout.timeout(3) do + response = Net::HTTP.get_response(uri) + + vtag = $1 if response.body =~ %r{tag\/(v\d+\.\d+\.\d+)} + tag = vtag.sub(/^v/, '') + + alertmanager_bin = "#{node['alertmanager']['prefix']}#{tag}#{node['alertmanager']['postfix']}" + + alertmanager_url = "#{node['alertmanager']['url'].chomp('/')}/#{vtag}/#{alertmanager_bin}" + end +rescue + # Abort the chef client process: + raise 'Cannot connect to http://github.com.' +end + +# An empty tag means the release page did not contain a `tag/vX.Y.Z` link; stop instead of building a bogus URL: +raise 'Cannot detect the latest alertmanager release tag.' if tag.empty? + +# Check the installed version to decide whether an update is needed: +result = run_command("alertmanager --version 2>&1 | grep #{tag}", error: false) +if result.exit_status != 0 + # Download: + tmp = "/tmp/#{alertmanager_bin}" + + execute "wget #{alertmanager_url} -O #{tmp}" + + # Install: + directory node['alertmanager']['storage'] do + owner 'root' + group 'root' + mode '755' + end + + execute "tar zxf #{tmp} -C #{node['alertmanager']['storage']} --strip-components 1" + + # Change Owner and Permissions: + file "#{node['alertmanager']['storage']}alertmanager" do + owner 'root' + group 'root' + mode '755' + end + + # Create Link + link "#{node['alertmanager']['location']}alertmanager" do + to "#{node['alertmanager']['storage']}alertmanager" + end +end diff --git a/cookbooks/prometheus/alertmanager_setup.rb b/cookbooks/prometheus/alertmanager_setup.rb new file mode 100644 index 0000000..6965083 --- /dev/null +++ b/cookbooks/prometheus/alertmanager_setup.rb @@ -0,0 +1,49 @@ +# Create `/etc/prometheus.d/alerts`: +%w(/etc/prometheus.d/alerts).each do |d| + directory d do + owner 'root' + group 'root' + mode '0755' + end +end + +# Deploy 
`alertmanager` file: +remote_file '/etc/prometheus.d/alertmanager.yml' do + owner 'root' + group 'root' + mode '644' + + notifies :restart, 'service[supervisor]' +end + +# Deploy the alert rule files listed under `rule_files` in prometheus.yml: +%w(node_exporter prometheus).each do |rule| + remote_file "/etc/prometheus.d/alerts/#{rule}.yml" do + owner 'root' + group 'root' + mode '644' + notifies :restart, 'service[supervisor]' + end +end +# Restart the `supervisor`: +service 'supervisor' do + action :nothing +end + +# Firewall settings here: +%w( 9093/tcp ).each do |p| + execute "ufw allow #{p}" do + user 'root' + + not_if "LANG=C ufw status | grep #{p}" + + notifies :run, 'execute[ufw reload-or-enable]' + end +end + +execute 'ufw reload-or-enable' do + user 'root' + command 'LANG=C ufw reload | grep skipping && ufw --force enable || exit 0' + + action :nothing +end diff --git a/cookbooks/prometheus/attributes.rb b/cookbooks/prometheus/attributes.rb index 356c0b7..5676d8d 100644 --- a/cookbooks/prometheus/attributes.rb +++ b/cookbooks/prometheus/attributes.rb @@ -23,5 +23,12 @@ node.reverse_merge!({ 'postfix' => '.linux-amd64.tar.gz', 'storage' => '/opt/blackbox_exporter/bin/', 'location' => '/usr/local/bin/' - } + }, + 'alertmanager' => { + 'url' => 'https://github.com/prometheus/alertmanager/releases/download/', + 'prefix' => 'alertmanager-', + 'postfix' => '.linux-amd64.tar.gz', + 'storage' => '/opt/prometheus/', + 'location' => '/usr/local/bin/' + }, }) diff --git a/cookbooks/prometheus/default.rb b/cookbooks/prometheus/default.rb index 23fbf05..6cfdbbc 100644 --- a/cookbooks/prometheus/default.rb +++ b/cookbooks/prometheus/default.rb @@ -3,8 +3,11 @@ include_recipe './attributes.rb' # Install the Prometheus manager: if node['prometheus']['manager'] - include_recipe './install.rb' - include_recipe './setup.rb' + include_recipe './prometheus_install.rb' + include_recipe './prometheus_setup.rb' + + include_recipe './alertmanager_install.rb' + include_recipe './alertmanager_setup.rb' end # Install the node_exporter here: diff --git 
a/cookbooks/prometheus/files/etc/prometheus.d/alertmanager.yml b/cookbooks/prometheus/files/etc/prometheus.d/alertmanager.yml new file mode 100644 index 0000000..d95b147 --- /dev/null +++ b/cookbooks/prometheus/files/etc/prometheus.d/alertmanager.yml @@ -0,0 +1,18 @@ +global: + # FIXME(security): the Slack webhook URL below is a credential committed in plain text; rotate it and supply it at deploy time instead of keeping it in the repository. + slack_api_url: 'https://hooks.slack.com/services/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' + +route: + receiver: 'test-route' + group_by: [alertname] + group_wait: 10s + group_interval: 1m + repeat_interval: 6h + +receivers: +- name: 'test-route' + slack_configs: + - channel: '#ops' + title: "{{ range .Alerts }}{{ .Annotations.summary }}\n{{ end }}" + text: "{{ range .Alerts }}{{ .Annotations.description }}\n{{ end }}" + send_resolved: true diff --git a/cookbooks/prometheus/files/etc/prometheus.d/alerts/node_exporter.yml b/cookbooks/prometheus/files/etc/prometheus.d/alerts/node_exporter.yml new file mode 100644 index 0000000..6ed118b --- /dev/null +++ b/cookbooks/prometheus/files/etc/prometheus.d/alerts/node_exporter.yml @@ -0,0 +1,119 @@ +groups: +- name: Node + rules: + - alert: OutOfMemory + expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10 + for: 5m + labels: + severity: warning + annotations: + summary: "Out of memory (instance {{ $labels.instance }})" + description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" + + - alert: UnusualNetworkThroughputIn + expr: sum by (instance) (irate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100 + for: 5m + labels: + severity: warning + annotations: + summary: "Unusual network throughput in (instance {{ $labels.instance }})" + description: "Host network interfaces are probably receiving too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" + + - alert: UnusualNetworkThroughputOut + expr: sum by (instance) (irate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100 + for: 5m + labels: + severity: warning + annotations: + summary: "Unusual 
network throughput out (instance {{ $labels.instance }})" + description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" + + - alert: UnusualDiskReadRate + expr: sum by (instance) (irate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50 + for: 5m + labels: + severity: warning + annotations: + summary: "Unusual disk read rate (instance {{ $labels.instance }})" + description: "Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" + + - alert: UnusualDiskWriteRate + expr: sum by (instance) (irate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50 + for: 5m + labels: + severity: warning + annotations: + summary: "Unusual disk write rate (instance {{ $labels.instance }})" + description: "Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" + + - alert: OutOfDiskSpace + expr: node_filesystem_free_bytes{mountpoint ="/"} / node_filesystem_size_bytes{mountpoint ="/"} * 100 < 10 + for: 5m + labels: + severity: warning + annotations: + summary: "Out of disk space (instance {{ $labels.instance }})" + description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" + + - alert: OutOfInodes + expr: node_filesystem_files_free{mountpoint ="/"} / node_filesystem_files{mountpoint ="/"} * 100 < 10 + for: 5m + labels: + severity: warning + annotations: + summary: "Out of inodes (instance {{ $labels.instance }})" + description: "Disk is almost running out of available inodes (< 10% left)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" + + - alert: UnusualDiskReadLatency + expr: rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 + for: 5m + labels: + severity: warning + annotations: + summary: "Unusual disk read latency (instance {{ $labels.instance }})" + description: "Disk latency is growing (read operations > 100ms)\n VALUE = {{ $value }}\n 
LABELS: {{ $labels }}" + + - alert: UnusualDiskWriteLatency + expr: rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 + for: 5m + labels: + severity: warning + annotations: + summary: "Unusual disk write latency (instance {{ $labels.instance }})" + description: "Disk latency is growing (write operations > 100ms)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" + + - alert: HighCpuLoad + expr: 100 - (avg by(instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80 + for: 5m + labels: + severity: warning + annotations: + summary: "High CPU load (instance {{ $labels.instance }})" + description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" + + - alert: SwapIsFillingUp + expr: (1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80 + for: 5m + labels: + severity: warning + annotations: + summary: "Swap is filling up (instance {{ $labels.instance }})" + description: "Swap is filling up (>80%)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" + + - alert: InstanceDown + expr: up == 0 + for: 1m + labels: + severity: critical + annotations: + summary: "Instance {{ $labels.instance }} down" + description: "{{ $labels.instance }} has been down for more than 1 minute." 
+ + - alert: LoadAvg + expr: node_load1 > 2 + for: 1m + labels: + severity: warning + annotations: + description: 'High Load Avg (1min) detected for instance {{ $labels.instance }}, the load average is currently: {{ $value }}' + summary: Load Average (1min) Alert diff --git a/cookbooks/prometheus/files/etc/prometheus.d/alerts/prometheus.yml b/cookbooks/prometheus/files/etc/prometheus.d/alerts/prometheus.yml new file mode 100644 index 0000000..a509f1e --- /dev/null +++ b/cookbooks/prometheus/files/etc/prometheus.d/alerts/prometheus.yml @@ -0,0 +1,29 @@ +groups: +- name: Prometheus + rules: + - alert: ExporterDown + expr: up == 0 + for: 1m + labels: + severity: critical + annotations: + summary: "Instance {{ $labels.instance }} down" + description: "{{ $labels.instance }} has been down for more than 1 minute." + + - alert: PrometheusConfigurationReload + expr: prometheus_config_last_reload_successful != 1 + for: 5m + labels: + severity: error + annotations: + summary: "Prometheus configuration reload (instance {{ $labels.instance }})" + description: "Prometheus configuration reload error\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" + + - alert: AlertmanagerConfigurationReload + expr: alertmanager_config_last_reload_successful != 1 + for: 5m + labels: + severity: error + annotations: + summary: "AlertManager configuration reload (instance {{ $labels.instance }})" + description: "AlertManager configuration reload error\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" diff --git a/cookbooks/prometheus/files/etc/prometheus.d/prometheus.yml b/cookbooks/prometheus/files/etc/prometheus.d/prometheus.yml index 53fd5a4..215e5e9 100644 --- a/cookbooks/prometheus/files/etc/prometheus.d/prometheus.yml +++ b/cookbooks/prometheus/files/etc/prometheus.d/prometheus.yml @@ -13,6 +13,8 @@ global: rule_files: # - "first.rules" # - "second.rules" + - '/etc/prometheus.d/alerts/node_exporter.yml' + - '/etc/prometheus.d/alerts/prometheus.yml' # A scrape configuration containing exactly 
one endpoint to scrape: # Here it's Prometheus itself. @@ -41,3 +43,9 @@ scrape_configs: bearer_token: 'e351393306ea245de5f9588cbe8627c74db007c6' static_configs: - targets: ['10.0.1.234:9099'] +alerting: + alertmanagers: + - static_configs: + - targets: + - "localhost:9093" + diff --git a/cookbooks/prometheus/install.rb b/cookbooks/prometheus/prometheus_install.rb similarity index 100% rename from cookbooks/prometheus/install.rb rename to cookbooks/prometheus/prometheus_install.rb diff --git a/cookbooks/prometheus/setup.rb b/cookbooks/prometheus/prometheus_setup.rb similarity index 100% rename from cookbooks/prometheus/setup.rb rename to cookbooks/prometheus/prometheus_setup.rb