一、安装Alertmanager
step 1.下载
wget https://github.com/prometheus/alertmanager/releases/download/v0.25.1/alertmanager-0.25.1.linux-amd64.tar.gz
step 2.安装
tar -xzf alertmanager-0.25.1.linux-amd64.tar.gz -C /opt/
cd /opt
mv alertmanager-0.25.1.linux-amd64/ alertmanager
chown -R prometheus:prometheus /opt/alertmanager/
step 3.创建服务脚本systemd管理
(如果/opt/alertmanager/data目录不存在,需要先手动创建,并保证prometheus用户对该目录有写权限)
vim /usr/lib/systemd/system/alertmanager.service
[Unit]
Description=Alertmanager for Prometheus
Wants=network-online.target
After=network-online.target
[Service]
User=prometheus
Group=prometheus
Type=simple
ExecStart=/opt/alertmanager/alertmanager \
--config.file=/opt/alertmanager/alertmanager.yml \
--storage.path=/opt/alertmanager/data
Restart=always
[Install]
WantedBy=multi-user.target
step 4.启动
systemctl daemon-reload
systemctl enable alertmanager
systemctl start alertmanager
systemctl status alertmanager
手动启动命令(可做测试),这里不需要
./alertmanager --config.file=alertmanager.yml
在浏览器输入 http://192.168.242.150:9093 即可访问web页面
二、钉钉告警
1、在钉钉创建群聊,添加机器人–>自定义机器人
2、设置机器人名称,安全设置选择“加签”,记录Webhook地址和加签密钥
3、修改prometheus配置文件
vim /opt/prometheus/prometheus.yml
# my global config
global:
scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
# scrape_timeout is set to the global default (10s).
# Alertmanager configuration
alerting:
alertmanagers:
- static_configs:
- targets: ['localhost:9093']
# - alertmanager:9093
#指定规则文件
rule_files:
- "/opt/prometheus/rules/*.yml"
# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
#rule_files:
# - "first_rules.yml"
# - "second_rules.yml"
# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
# The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
- job_name: "prometheus"
# metrics_path defaults to '/metrics'
# scheme defaults to 'http'.
static_configs:
- targets: ["localhost:9090"]
- job_name: 'node'
static_configs:
- targets: ['192.168.242.151:9101']
4、创建规则目录(mkdir -p /opt/prometheus/rules),并编写规则yml文件
vim /opt/prometheus/rules/node.yml
groups:
# 报警组组名称
- name: alters
#报警组规则
rules:
#告警名称,需唯一
- alert: cpu使用率大于75%
#promQL表达式
expr: sum(avg without (cpu)(irate(node_cpu_seconds_total{mode!='idle'}[5m]))) by (instance) > 0.75
#满足此表达式持续时间超过for规定的时间才会触发此报警
for: 1m
labels:
#严重级别
severity: warning
annotations:
#发出的告警标题
summary: "实例 {{ $labels.instance }} CPU 使用率过高"
#发出的告警内容
description: "实例{{ $labels.instance }} CPU 使用率超过 75% (当前值为: {{ $value }})"
- alert: 内存使用率大于90%
expr: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes)/node_memory_MemTotal_bytes > 0.90
for: 1m
labels:
severity: warning
annotations:
summary: "实例 {{ $labels.instance }} 内存使用率过高"
description: "实例 {{ $labels.instance }} 内存使用率超过 90% (当前值为: {{ $value }})"
vim /opt/prometheus/rules/node_alters.yml
groups:
- name: Alerthost
rules:
- alert: 服务器宕机
expr: avg by (instance) (up{job="node"}) == 0
for: 15s #控制在触发告警之前,测试表达式的值必须为true的时长
labels:
severity: '突发事件'
annotations:
description: "实例 {{ $labels.instance }} 服务器已宕机,请进行检查."
summary: "{{ $labels.instance }} 机器已经宕机超过15秒"
- alert: 磁盘使用率大于80%
expr: 100 - (node_filesystem_free_bytes{mountpoint="/",fstype=~"ext4|xfs"} / node_filesystem_size_bytes{fstype=~"ext4|xfs"} * 100) > 80
for: 2m
labels:
severity: warning
annotations:
description: "{{ $labels.instance }} : {{ $labels.job }} :{{ $labels.mountpoint }} 这个分区使用率大于80% (当前值:{{ $value }})"
summary: "Instance {{ $labels.instance }} :{{ $labels.mountpoint }} 分区使用率过高"
5、访问Prometheus页面(Status -> Rules),查看规则是否添加成功(需先完成下一步重启Prometheus后才会生效)
6、重启Prometheus
pkill prometheus
cd /usr/local/prometheus
nohup ./prometheus --config.file=/opt/prometheus/prometheus.yml > prometheus.log 2>&1 &
7、安装钉钉告警插件
cd /opt
wget https://github.com/timonwong/prometheus-webhook-dingtalk/releases/download/v2.1.0/prometheus-webhook-dingtalk-2.1.0.linux-amd64.tar.gz
tar xvf prometheus-webhook-dingtalk-*.linux-amd64.tar.gz
mv prometheus-webhook-dingtalk-*.linux-amd64 /usr/local/prometheus-webhook-dingtalk
8、配置prometheus-webhook-dingtalk
cd /usr/local/prometheus-webhook-dingtalk
cp config.example.yml config.yml
url和secret分别填写刚才记录的钉钉机器人的Webhook地址和加签密钥
vim config.yml
## Request timeout
# timeout: 5s
## Uncomment following line in order to write template from scratch (be careful!)
#no_builtin_template: true
## Customizable templates path
#templates:
# - contrib/templates/legacy/template.tmpl
## You can also override default template using `default_message`
## The following example to use the 'legacy' template from v0.3.0
#default_message:
# title: '{{ template "legacy.title" . }}'
# text: '{{ template "legacy.content" . }}'
## Targets, previously was known as "profiles"
targets:
webhook1:
url: https://oapi.dingtalk.com/robot/send?access_token=xxxxxxxx
# secret for signature
secret: xxxxxxxx
9、启动钉钉告警插件
nohup ./prometheus-webhook-dingtalk --config.file=config.yml > webhook.log 2>&1 &
10、检查插件是否正常运行
ps aux | grep prometheus-webhook-dingtalk
正常日志示例
11、修改alertmanager配置文件
vim /opt/alertmanager/alertmanager.yml
把日志里面的钉钉webhook地址添加进来
global:
resolve_timeout: 5m #处理超时时间
route:
group_by: ['alertname']
group_wait: 10s #等待时间
group_interval: 10s #相同的Group之间发送告警通知的时间间隔
repeat_interval: 5m #重复报警的间隔时长
receiver: 'dingding'
receivers:
- name: 'dingding'
webhook_configs:
- url: 'http://localhost:8060/dingtalk/webhook1/send'
send_resolved: true # 发送恢复通知
inhibit_rules:
- source_match:
severity: 'critical'
target_match:
severity: 'warning'
equal: ['alertname', 'dev', 'instance']
12、重启alertmanager
#alertmanager已由systemd管理(Restart=always),直接通过systemctl重启即可;
#若用pkill后再nohup手动启动,systemd会自动拉起一个实例,导致端口冲突
systemctl restart alertmanager
systemctl status alertmanager
三、验证钉钉告警
方式一、使用curl命令模拟alertmanager发送告警
curl -H "Content-Type: application/json" -d '{
"receiver": "dingding",
"status": "firing",
"alerts": [{
"labels": { "alertname": "TestAlert", "instance": "localhost" },
"annotations": { "summary": "测试告警", "description": "这是一个手动触发的测试告警" }
}]
}' http://localhost:8060/dingtalk/webhook1/send
方式二、在192.168.242.151机器上用stress命令模拟CPU负载,会让CPU使用率拉到100%
stress --cpu 4 --timeout 2m