From e509c531ae37f7e23902be7d4fc9f89398046dbb Mon Sep 17 00:00:00 2001
From: Kazuhiro MUSASHI
Date: Sun, 5 Feb 2023 13:40:53 +0900
Subject: [PATCH] Add alert rules for `systemd` services.

---
Reviewer notes (this region is not part of the commit message or the diff):

* cookbooks/prometheus/alertmanager_setup.rb: the list of alert rule files
  deployed with remote_file to /etc/prometheus.d/alerts/<name>.yml grows
  from (node_exporter prometheus filestat) to also include `services` and
  `snmp`.
* cookbooks/prometheus/files/etc/prometheus.d/alerts/services.yml: new rule
  group `services` with a single alert `Digdag`, raised with severity
  `error` once
  node_systemd_unit_state{name="digdag.service", state="active"} != 1
  has held for 5m.
* NOTE(review): `snmp` is added to the deploy list, but this patch only
  creates services.yml. Presumably alerts/snmp.yml already exists in the
  cookbook's files directory -- confirm, otherwise the remote_file resource
  for it has no source to copy.
* NOTE(review): the `!= 1` comparison can only match series that exist. If
  the digdag.service series is missing altogether, this rule presumably
  stays silent -- confirm whether an absent()-style rule is wanted as well.
* NOTE(review): the indentation inside the services.yml hunk was rebuilt
  from a whitespace-collapsed copy of this patch; the nesting follows the
  usual Prometheus rule-file layout -- confirm the exact indent widths
  against the original commit.

 cookbooks/prometheus/alertmanager_setup.rb           |  2 +-
 .../files/etc/prometheus.d/alerts/services.yml       | 12 ++++++++++++
 2 files changed, 13 insertions(+), 1 deletion(-)
 create mode 100644 cookbooks/prometheus/files/etc/prometheus.d/alerts/services.yml

diff --git a/cookbooks/prometheus/alertmanager_setup.rb b/cookbooks/prometheus/alertmanager_setup.rb
index 9e10b89..b74a3bf 100644
--- a/cookbooks/prometheus/alertmanager_setup.rb
+++ b/cookbooks/prometheus/alertmanager_setup.rb
@@ -20,7 +20,7 @@ encrypted_remote_file '/etc/prometheus.d/alertmanager.yml' do
 end
 
 # Deploy alert setting file:
-%w(node_exporter prometheus filestat).each do |conf|
+%w(node_exporter prometheus filestat services snmp).each do |conf|
   remote_file "/etc/prometheus.d/alerts/#{conf}.yml" do
     owner 'root'
     group 'root'
diff --git a/cookbooks/prometheus/files/etc/prometheus.d/alerts/services.yml b/cookbooks/prometheus/files/etc/prometheus.d/alerts/services.yml
new file mode 100644
index 0000000..20488a6
--- /dev/null
+++ b/cookbooks/prometheus/files/etc/prometheus.d/alerts/services.yml
@@ -0,0 +1,12 @@
+groups:
+- name: services
+  rules:
+  - alert: Digdag
+    expr: node_systemd_unit_state{name="digdag.service", state="active"} != 1
+    for: 5m
+    labels:
+      severity: error
+    annotations:
+      summary: "Digdag is not running: {{ $labels.instance }}."
+      description: "Digdag is not running: {{ $labels.instance }}."
+