Author: 博同学
参考:
https://gitee.com/slcnx/generic_architecture/blob/master/playbooks/prometheus/01-deploy-node-exporter.yml
成果图：此处报表基于大佬的模板稍作修改，新增了一个 Row，放在 node-exporter 报表里。
1、在原有node-exporter 添加启动参数
https://gitee.com/slcnx/generic_architecture/blob/master/playbooks/prometheus/01-deploy-node-exporter.yml
# Open the node-exporter systemd unit for editing.
vim /usr/lib/systemd/system/node-exporter.service
# In the unit, set ExecStart so that node-exporter listens on :9100, enables the
# supervisord collector, and points that collector at the supervisor unix socket.
ExecStart=/opt/node-exporter --web.listen-address=:9100 --collector.supervisord --collector.supervisord.url=unix:///var/run/supervisor/supervisor.sock
# Make systemd re-read the edited unit file.
systemctl daemon-reload
# Restart the service so the new ExecStart flags take effect.
systemctl restart node-exporter
2、验证 node-exporter 是否已采集到 supervisor 指标
# Fetch the local node-exporter metrics page and show the first lines that contain "super".
curl -s localhost:9100/metrics | grep super | head
3、告警规则
# Prometheus alerting rules for supervisord tasks, based on the
# node_supervisord_* metrics exposed by node-exporter's supervisord collector.
# Every rule uses `for: 0m`, so it fires on the first evaluation that matches.
groups:
  - name: supervisor_alerts
    rules:
      # The task start time changed more than twice within the last 15 minutes.
      - alert: SupervisordTooManyRestarts
        expr: changes(node_supervisord_start_time_seconds[15m]) > 2
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: "supervisord too many restarts (实例 {{ $labels.instance }})"
          description: "supervisord has restarted more than twice in the last 15 minutes. It might be crashlooping.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # node_supervisord_up reports 0 for a task.
      - alert: supervisordDown
        expr: node_supervisord_up == 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: "supervisord task down (实例 {{ $labels.instance }} , 任务 {{ $labels.name}})"
          description: "supervisord task is down on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # A task reports a non-zero exit status; the description carries the value.
      - alert: supervisordError
        expr: node_supervisord_exit_status !=0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: "supervisord task 失败 (实例 {{ $labels.instance }} , 任务 {{ $labels.name}})"
          description: "任务退出码为 {{ $value }}"
4、报表
https://gitee.com/slcnx/generic_architecture#supervisor-%E9%85%8D%E7%BD%AE%E5%8F%8A%E7%9B%91%E6%8E%A7
# Prometheus alerting rules for supervisord tasks, based on the
# node_supervisord_* metrics exposed by node-exporter's supervisord collector.
# Every rule uses `for: 0m`, so it fires on the first evaluation that matches.
# NOTE(review): this group repeats the `supervisor_alerts` group listed under
# step 3 of this note. Load only one copy into Prometheus — group names must be
# unique within a rule file.
groups:
  - name: supervisor_alerts
    rules:
      # The task start time changed more than twice within the last 15 minutes.
      - alert: SupervisordTooManyRestarts
        expr: changes(node_supervisord_start_time_seconds[15m]) > 2
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: "supervisord too many restarts (实例 {{ $labels.instance }})"
          description: "supervisord has restarted more than twice in the last 15 minutes. It might be crashlooping.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # node_supervisord_up reports 0 for a task.
      - alert: supervisordDown
        expr: node_supervisord_up == 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: "supervisord task down (实例 {{ $labels.instance }} , 任务 {{ $labels.name}})"
          description: "supervisord task is down on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # A task reports a non-zero exit status; the description carries the value.
      - alert: supervisordError
        expr: node_supervisord_exit_status !=0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: "supervisord task 失败 (实例 {{ $labels.instance }} , 任务 {{ $labels.name}})"
          description: "任务退出码为 {{ $value }}"