/etc/prometheus/alert_loadbalancing.yml > lowpref
|
|
/etc/prometheus/alerts/blackbox-exporter.yml > BlackboxExporter
|
|
|
|
|
|
|
|
/etc/prometheus/alerts/node-exporter.yml > NodeExporter
|
Labels |
State |
Active Since |
Value |
alertname="HostNetworkInterfaceSaturated"
device="bb11"
instance="gw01n03"
job="node_gateways"
nodename="gw01n03"
severity="warning"
|
firing |
2025-04-03 09:44:17.079934382 +0000 UTC |
0.9783954844444445 |
Annotations |
- description
- The network interface "bb11" on "gw01n03" is getting overloaded.
VALUE = 0.9783954844444445
LABELS = map[device:bb11 instance:gw01n03 job:node_gateways nodename:gw01n03]
- summary
- Host Network Interface Saturated (instance gw01n03)
|
|
Labels |
State |
Active Since |
Value |
alertname="HostOutOfDiskSpace"
device="zp_pve/subvol-3226-disk-0"
fstype="zfs"
instance="ffs11"
job="node_gateways"
mountpoint="/zp_pve/subvol-3226-disk-0"
nodename="ffs11"
severity="warning"
|
firing |
2025-03-31 20:03:47.079934382 +0000 UTC |
0 |
Annotations |
- description
- Disk is almost full (< 10% left)
VALUE = 0
LABELS = map[device:zp_pve/subvol-3226-disk-0 fstype:zfs instance:ffs11 job:node_gateways mountpoint:/zp_pve/subvol-3226-disk-0 nodename:ffs11]
- summary
- Host out of disk space (instance ffs11)
|
alertname="HostOutOfDiskSpace"
device="zp_pve/subvol-3226-disk-0"
fstype="zfs"
instance="ffs05"
job="node_gateways"
mountpoint="/zp_pve/subvol-3226-disk-0"
nodename="ffs05"
severity="warning"
|
firing |
2025-03-31 20:03:47.079934382 +0000 UTC |
0.008544921875 |
Annotations |
- description
- Disk is almost full (< 10% left)
VALUE = 0.008544921875
LABELS = map[device:zp_pve/subvol-3226-disk-0 fstype:zfs instance:ffs05 job:node_gateways mountpoint:/zp_pve/subvol-3226-disk-0 nodename:ffs05]
- summary
- Host out of disk space (instance ffs05)
|
alertname="HostOutOfDiskSpace"
device="rpool/data/subvol-8194-disk-0"
fstype="zfs"
instance="ffs13"
job="node_gateways"
mountpoint="/rpool/data/subvol-8194-disk-0"
nodename="ffs13"
severity="warning"
|
firing |
2025-03-31 04:00:32.079934382 +0000 UTC |
7.9193115234375 |
Annotations |
- description
- Disk is almost full (< 10% left)
VALUE = 7.9193115234375
LABELS = map[device:rpool/data/subvol-8194-disk-0 fstype:zfs instance:ffs13 job:node_gateways mountpoint:/rpool/data/subvol-8194-disk-0 nodename:ffs13]
- summary
- Host out of disk space (instance ffs13)
|
alertname="HostOutOfDiskSpace"
device="rpool/data/subvol-8194-disk-1"
fstype="zfs"
instance="ffs13"
job="node_gateways"
mountpoint="/rpool/data/subvol-8194-disk-1"
nodename="ffs13"
severity="warning"
|
firing |
2025-03-15 03:04:02.079934382 +0000 UTC |
4.9297707297585225 |
Annotations |
- description
- Disk is almost full (< 10% left)
VALUE = 4.9297707297585225
LABELS = map[device:rpool/data/subvol-8194-disk-1 fstype:zfs instance:ffs13 job:node_gateways mountpoint:/rpool/data/subvol-8194-disk-1 nodename:ffs13]
- summary
- Host out of disk space (instance ffs13)
|
alertname="HostOutOfDiskSpace"
device="rpool/data/subvol-8195-disk-0"
fstype="zfs"
instance="ffs13"
job="node_gateways"
mountpoint="/rpool/data/subvol-8195-disk-0"
nodename="ffs13"
severity="warning"
|
firing |
2025-03-16 04:43:32.079934382 +0000 UTC |
2.0660923549107144 |
Annotations |
- description
- Disk is almost full (< 10% left)
VALUE = 2.0660923549107144
LABELS = map[device:rpool/data/subvol-8195-disk-0 fstype:zfs instance:ffs13 job:node_gateways mountpoint:/rpool/data/subvol-8195-disk-0 nodename:ffs13]
- summary
- Host out of disk space (instance ffs13)
|
|
Labels |
State |
Active Since |
Value |
alertname="HostOutOfInodes"
device="zp_pve/subvol-3226-disk-0"
fstype="zfs"
instance="ffs11"
job="node_gateways"
mountpoint="/zp_pve/subvol-3226-disk-0"
nodename="ffs11"
severity="warning"
|
firing |
2025-03-31 20:03:47.079934382 +0000 UTC |
0 |
Annotations |
- description
- Disk is almost running out of available inodes (< 10% left)
VALUE = 0
LABELS = map[device:zp_pve/subvol-3226-disk-0 fstype:zfs instance:ffs11 job:node_gateways mountpoint:/zp_pve/subvol-3226-disk-0 nodename:ffs11]
- summary
- Host out of inodes (instance ffs11)
|
alertname="HostOutOfInodes"
device="zp_pve/subvol-3226-disk-0"
fstype="zfs"
instance="ffs05"
job="node_gateways"
mountpoint="/zp_pve/subvol-3226-disk-0"
nodename="ffs05"
severity="warning"
|
firing |
2025-03-31 20:03:47.079934382 +0000 UTC |
0.17443491157225793 |
Annotations |
- description
- Disk is almost running out of available inodes (< 10% left)
VALUE = 0.17443491157225793
LABELS = map[device:zp_pve/subvol-3226-disk-0 fstype:zfs instance:ffs05 job:node_gateways mountpoint:/zp_pve/subvol-3226-disk-0 nodename:ffs05]
- summary
- Host out of inodes (instance ffs05)
|
|
Labels |
State |
Active Since |
Value |
alertname="HostRequiresReboot"
instance="monitor01"
job="node"
nodename="monitor01"
severity="info"
|
firing |
2025-03-16 05:52:47.079934382 +0000 UTC |
1 |
Annotations |
- description
- monitor01 requires a reboot.
VALUE = 1
LABELS = map[instance:monitor01 job:node nodename:monitor01]
- summary
- Host requires reboot (instance monitor01)
|
alertname="HostRequiresReboot"
instance="ffs08"
job="node_gateways"
nodename="ffs08"
severity="info"
|
firing |
2025-01-16 18:11:47 +0000 UTC |
1 |
Annotations |
- description
- ffs08 requires a reboot.
VALUE = 1
LABELS = map[instance:ffs08 job:node_gateways nodename:ffs08]
- summary
- Host requires reboot (instance ffs08)
|
alertname="HostRequiresReboot"
instance="ffs13"
job="node_gateways"
nodename="ffs13"
severity="info"
|
firing |
2025-01-13 18:50:17 +0000 UTC |
1 |
Annotations |
- description
- ffs13 requires a reboot.
VALUE = 1
LABELS = map[instance:ffs13 job:node_gateways nodename:ffs13]
- summary
- Host requires reboot (instance ffs13)
|
alertname="HostRequiresReboot"
instance="gw09n04"
job="node_gateways"
nodename="gw09n04"
severity="info"
|
firing |
2025-03-25 06:40:32.079934382 +0000 UTC |
1 |
Annotations |
- description
- gw09n04 requires a reboot.
VALUE = 1
LABELS = map[instance:gw09n04 job:node_gateways nodename:gw09n04]
- summary
- Host requires reboot (instance gw09n04)
|
alertname="HostRequiresReboot"
instance="ffs05"
job="node_gateways"
nodename="ffs05"
severity="info"
|
firing |
2025-03-31 20:03:47.079934382 +0000 UTC |
1 |
Annotations |
- description
- ffs05 requires a reboot.
VALUE = 1
LABELS = map[instance:ffs05 job:node_gateways nodename:ffs05]
- summary
- Host requires reboot (instance ffs05)
|
|
Labels |
State |
Active Since |
Value |
alertname="HostUnusualDiskIo"
device="sdf"
instance="gw05n02"
job="node_gateways"
nodename="gw05n02"
severity="warning"
|
firing |
2025-04-03 09:35:32.079934382 +0000 UTC |
0.545422222222098 |
Annotations |
- description
- Time spent in IO is too high on gw05n02. Check storage for issues.
VALUE = 0.545422222222098
LABELS = map[device:sdf instance:gw05n02 job:node_gateways nodename:gw05n02]
- summary
- Host unusual disk IO (instance gw05n02)
|
alertname="HostUnusualDiskIo"
device="sdi"
instance="gw05n02"
job="node_gateways"
nodename="gw05n02"
severity="warning"
|
firing |
2025-04-03 09:39:47.079934382 +0000 UTC |
0.5374444444430991 |
Annotations |
- description
- Time spent in IO is too high on gw05n02. Check storage for issues.
VALUE = 0.5374444444430991
LABELS = map[device:sdi instance:gw05n02 job:node_gateways nodename:gw05n02]
- summary
- Host unusual disk IO (instance gw05n02)
|
alertname="HostUnusualDiskIo"
device="sdq"
instance="gw05n02"
job="node_gateways"
nodename="gw05n02"
severity="warning"
|
firing |
2025-04-03 09:39:47.079934382 +0000 UTC |
0.5530222222208976 |
Annotations |
- description
- Time spent in IO is too high on gw05n02. Check storage for issues.
VALUE = 0.5530222222208976
LABELS = map[device:sdq instance:gw05n02 job:node_gateways nodename:gw05n02]
- summary
- Host unusual disk IO (instance gw05n02)
|
alertname="HostUnusualDiskIo"
device="sdm"
instance="gw05n02"
job="node_gateways"
nodename="gw05n02"
severity="warning"
|
firing |
2025-04-03 09:39:47.079934382 +0000 UTC |
0.5399333333337886 |
Annotations |
- description
- Time spent in IO is too high on gw05n02. Check storage for issues.
VALUE = 0.5399333333337886
LABELS = map[device:sdm instance:gw05n02 job:node_gateways nodename:gw05n02]
- summary
- Host unusual disk IO (instance gw05n02)
|
|
Labels |
State |
Active Since |
Value |
alertname="HostUnusualNetworkThroughputIn"
instance="ffs13"
nodename="ffs13"
severity="warning"
|
firing |
2025-04-03 05:19:32.079934382 +0000 UTC |
234.45386663164405 |
Annotations |
- description
- Host network interfaces are probably receiving too much data (> 100 MB/s)
VALUE = 234.45386663164405
LABELS = map[instance:ffs13 nodename:ffs13]
- summary
- Host unusual network throughput in (instance ffs13)
|
alertname="HostUnusualNetworkThroughputIn"
instance="ffs08"
nodename="ffs08"
severity="warning"
|
firing |
2025-02-04 15:35:17.079934382 +0000 UTC |
557.418238721575 |
Annotations |
- description
- Host network interfaces are probably receiving too much data (> 100 MB/s)
VALUE = 557.418238721575
LABELS = map[instance:ffs08 nodename:ffs08]
- summary
- Host unusual network throughput in (instance ffs08)
|
alertname="HostUnusualNetworkThroughputIn"
instance="core02-z10a"
nodename="core02-z10a"
severity="warning"
|
firing |
2025-04-03 06:56:47.079934382 +0000 UTC |
131.98826498551801 |
Annotations |
- description
- Host network interfaces are probably receiving too much data (> 100 MB/s)
VALUE = 131.98826498551801
LABELS = map[instance:core02-z10a nodename:core02-z10a]
- summary
- Host unusual network throughput in (instance core02-z10a)
|
alertname="HostUnusualNetworkThroughputIn"
instance="core01-z10a"
nodename="core01-z10a"
severity="warning"
|
firing |
2025-03-16 05:21:17.079934382 +0000 UTC |
431.5261819673621 |
Annotations |
- description
- Host network interfaces are probably receiving too much data (> 100 MB/s)
VALUE = 431.5261819673621
LABELS = map[instance:core01-z10a nodename:core01-z10a]
- summary
- Host unusual network throughput in (instance core01-z10a)
|
alertname="HostUnusualNetworkThroughputIn"
instance="gw09n04"
nodename="gw09n04"
severity="warning"
|
firing |
2025-04-03 06:17:02.079934382 +0000 UTC |
135.635642914545 |
Annotations |
- description
- Host network interfaces are probably receiving too much data (> 100 MB/s)
VALUE = 135.635642914545
LABELS = map[instance:gw09n04 nodename:gw09n04]
- summary
- Host unusual network throughput in (instance gw09n04)
|
|
Labels |
State |
Active Since |
Value |
alertname="HostUnusualNetworkThroughputOut"
instance="core01-z10a"
nodename="core01-z10a"
severity="warning"
|
firing |
2025-04-03 05:00:47.079934382 +0000 UTC |
217.7182387973951 |
Annotations |
- description
- Host network interfaces are probably sending too much data (> 100 MB/s)
VALUE = 217.7182387973951
LABELS = map[instance:core01-z10a nodename:core01-z10a]
- summary
- Host unusual network throughput out (instance core01-z10a)
|
alertname="HostUnusualNetworkThroughputOut"
instance="ffs08"
nodename="ffs08"
severity="warning"
|
firing |
2025-02-04 15:35:17.079934382 +0000 UTC |
563.9806868598575 |
Annotations |
- description
- Host network interfaces are probably sending too much data (> 100 MB/s)
VALUE = 563.9806868598575
LABELS = map[instance:ffs08 nodename:ffs08]
- summary
- Host unusual network throughput out (instance ffs08)
|
alertname="HostUnusualNetworkThroughputOut"
instance="gw09n04"
nodename="gw09n04"
severity="warning"
|
firing |
2025-04-03 04:09:32.079934382 +0000 UTC |
385.26989159356975 |
Annotations |
- description
- Host network interfaces are probably sending too much data (> 100 MB/s)
VALUE = 385.26989159356975
LABELS = map[instance:gw09n04 nodename:gw09n04]
- summary
- Host unusual network throughput out (instance gw09n04)
|
alertname="HostUnusualNetworkThroughputOut"
instance="ffs13"
nodename="ffs13"
severity="warning"
|
firing |
2025-04-03 05:19:32.079934382 +0000 UTC |
247.77873088291716 |
Annotations |
- description
- Host network interfaces are probably sending too much data (> 100 MB/s)
VALUE = 247.77873088291716
LABELS = map[instance:ffs13 nodename:ffs13]
- summary
- Host unusual network throughput out (instance ffs13)
|
|
|
|
|
|
|
|
|
# Alerting rule: becomes firing once node_filesystem_device_error == 1 has
# held continuously for 2 minutes. Carries the label severity=critical.
# NOTE(review): indentation was lost in this dump and has been restored
# conventionally; confirm the relative indentation of the description lines
# against the rule file on disk.
alert: HostFilesystemDeviceError
expr: node_filesystem_device_error == 1
for: 2m
labels:
  severity: critical
annotations:
  # Literal block scalar (|-): line breaks are kept, the final newline is
  # stripped. The {{ ... }} placeholders are expanded per alert instance.
  description: |-
    {{ $labels.instance }}: Device error with the {{ $labels.mountpoint }} filesystem
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Host filesystem device error (instance {{ $labels.instance }})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/etc/prometheus/alerts/smartctl-exporter.yml > SmartctlExporter
|
# Alerting rule: becomes firing once smartctl_device_critical_warning > 0 has
# held continuously for 15 minutes. Carries the label severity=critical.
# NOTE(review): indentation was lost in this dump and has been restored
# conventionally; confirm the relative indentation of the description lines
# against the rule file on disk.
alert: SmartCriticalWarning
expr: smartctl_device_critical_warning > 0
for: 15m
labels:
  severity: critical
annotations:
  # Literal block scalar (|-): line breaks are kept, the final newline is
  # stripped. The {{ ... }} placeholders are expanded per alert instance.
  description: |-
    device has critical warning (instance {{ $labels.instance }})
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Smart critical warning (instance {{ $labels.instance }})
|
# Alerting rule: becomes firing once smartctl_device_temperature > 80 has
# held continuously for 2 minutes. Carries the label severity=critical.
# NOTE(review): the unit of the threshold is not visible here; confirm it
# against the exporter's metric documentation.
# NOTE(review): indentation was lost in this dump and has been restored
# conventionally; confirm the relative indentation of the description lines
# against the rule file on disk.
alert: SmartDeviceTemperatureCritical
expr: smartctl_device_temperature > 80
for: 2m
labels:
  severity: critical
annotations:
  # Literal block scalar (|-): line breaks are kept, the final newline is
  # stripped. The {{ ... }} placeholders are expanded per alert instance.
  description: |-
    Device temperature critical (instance {{ $labels.instance }})
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Smart device temperature critical (instance {{ $labels.instance }})
|
# Alerting rule: becomes firing once smartctl_device_temperature > 60 has
# held continuously for 2 minutes. Carries the label severity=warning.
# NOTE(review): the expression has no upper bound, so any series above 80
# also satisfies it and this rule fires together with
# SmartDeviceTemperatureCritical; confirm that is intended or handled by
# inhibition elsewhere.
# NOTE(review): indentation was lost in this dump and has been restored
# conventionally; confirm the relative indentation of the description lines
# against the rule file on disk.
alert: SmartDeviceTemperatureWarning
expr: smartctl_device_temperature > 60
for: 2m
labels:
  severity: warning
annotations:
  # Literal block scalar (|-): line breaks are kept, the final newline is
  # stripped. The {{ ... }} placeholders are expanded per alert instance.
  description: |-
    Device temperature warning (instance {{ $labels.instance }})
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Smart device temperature warning (instance {{ $labels.instance }})
|
# Alerting rule: becomes firing once smartctl_device_media_errors > 0 has
# held continuously for 15 minutes. Carries the label severity=critical.
# NOTE(review): indentation was lost in this dump and has been restored
# conventionally; confirm the relative indentation of the description lines
# against the rule file on disk.
alert: SmartMediaErrors
expr: smartctl_device_media_errors > 0
for: 15m
labels:
  severity: critical
annotations:
  # Literal block scalar (|-): line breaks are kept, the final newline is
  # stripped. The {{ ... }} placeholders are expanded per alert instance.
  description: |-
    device has media errors (instance {{ $labels.instance }})
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Smart media errors (instance {{ $labels.instance }})
|
|