/etc/prometheus/alerts/alert_healthchecks.yml > Selfmonitoring
|
Labels |
State |
Active Since |
Value |
alertname="SelfMonitoringAlwaysFiring"
application="leonard_healthchecks"
severity="info"
|
firing |
2025-07-20 19:10:16.797458603 +0000 UTC |
20 |
|
/etc/prometheus/alerts/alert_loadbalancing.yml > lowpref
|
|
/etc/prometheus/alerts/blackbox-exporter.yml > BlackboxExporter
|
|
|
|
|
|
|
|
/etc/prometheus/alerts/general.yml > probe_success
|
|
/etc/prometheus/alerts/general.yml > reload_success
|
|
|
/etc/prometheus/alerts/general.yml > up_success
|
|
/etc/prometheus/alerts/node-exporter.yml > NodeExporter
|
Labels |
State |
Active Since |
Value |
alertname="HostOutOfDiskSpace"
device="rpool/data/subvol-8194-disk-1"
fstype="zfs"
instance="ffs13"
job="node"
mountpoint="/rpool/data/subvol-8194-disk-1"
nodename="ffs13"
severity="warning"
|
firing |
2025-07-25 02:07:02.079934382 +0000 UTC |
9.516910807291667 |
Annotations |
- description
- Disk is almost full (< 10% left)
VALUE = 9.516910807291667
LABELS = map[device:rpool/data/subvol-8194-disk-1 fstype:zfs instance:ffs13 job:node mountpoint:/rpool/data/subvol-8194-disk-1 nodename:ffs13]
- summary
- Host out of disk space (instance ffs13)
|
alertname="HostOutOfDiskSpace"
device="rpool/data/subvol-8194-disk-1"
fstype="zfs"
instance="pbs01"
job="node"
mountpoint="/srv"
nodename="pbs01"
severity="warning"
|
firing |
2025-07-25 02:07:02.079934382 +0000 UTC |
9.516910807291667 |
Annotations |
- description
- Disk is almost full (< 10% left)
VALUE = 9.516910807291667
LABELS = map[device:rpool/data/subvol-8194-disk-1 fstype:zfs instance:pbs01 job:node mountpoint:/srv nodename:pbs01]
- summary
- Host out of disk space (instance pbs01)
|
|
Labels |
State |
Active Since |
Value |
alertname="HostRequiresReboot"
instance="ffs08"
job="node"
nodename="ffs08"
severity="info"
|
firing |
2025-07-19 04:42:02 +0000 UTC |
1 |
Annotations |
- description
- ffs08 requires a reboot.
VALUE = 1
LABELS = map[instance:ffs08 job:node nodename:ffs08]
- summary
- Host requires reboot (instance ffs08)
|
alertname="HostRequiresReboot"
instance="ffs10"
job="node"
nodename="ffs10"
severity="info"
|
firing |
2025-07-19 04:21:02 +0000 UTC |
1 |
Annotations |
- description
- ffs10 requires a reboot.
VALUE = 1
LABELS = map[instance:ffs10 job:node nodename:ffs10]
- summary
- Host requires reboot (instance ffs10)
|
alertname="HostRequiresReboot"
instance="ffs11"
job="node"
nodename="ffs11"
severity="info"
|
firing |
2025-07-19 06:56:02 +0000 UTC |
1 |
Annotations |
- description
- ffs11 requires a reboot.
VALUE = 1
LABELS = map[instance:ffs11 job:node nodename:ffs11]
- summary
- Host requires reboot (instance ffs11)
|
alertname="HostRequiresReboot"
instance="ffs13"
job="node"
nodename="ffs13"
severity="info"
|
firing |
2025-07-19 04:44:47 +0000 UTC |
1 |
Annotations |
- description
- ffs13 requires a reboot.
VALUE = 1
LABELS = map[instance:ffs13 job:node nodename:ffs13]
- summary
- Host requires reboot (instance ffs13)
|
alertname="HostRequiresReboot"
instance="ffs05"
job="node"
nodename="ffs05"
severity="info"
|
firing |
2025-07-19 04:34:02 +0000 UTC |
1 |
Annotations |
- description
- ffs05 requires a reboot.
VALUE = 1
LABELS = map[instance:ffs05 job:node nodename:ffs05]
- summary
- Host requires reboot (instance ffs05)
|
|
|
|
|
|
|
|
|
|
# Alert rule: matches series where node_filesystem_device_error equals 1.
# `for: 2m` requires the condition to hold for 2 minutes before the alert fires.
alert: HostFilesystemDeviceError
expr: node_filesystem_device_error == 1
for: 2m
labels:
  severity: critical
annotations:
  # Literal block scalar (|-): line breaks are kept, the trailing newline is
  # stripped. The body must be indented so the leading "{{" is not parsed as a
  # YAML flow mapping.
  description: |-
    {{ $labels.instance }}: Device error with the {{ $labels.mountpoint }} filesystem
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Host filesystem device error (instance {{ $labels.instance }})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/etc/prometheus/alerts/smartctl-exporter.yml > SmartctlExporter
|
# Alert rule: matches series where smartctl_device_critical_warning is above 0.
# `for: 15m` requires the condition to hold for 15 minutes before the alert fires.
alert: SmartCriticalWarning
expr: smartctl_device_critical_warning > 0
for: 15m
labels:
  severity: critical
annotations:
  # Literal block scalar (|-): line breaks are kept, the trailing newline is
  # stripped. The template lines must be indented to belong to this value.
  description: |-
    device has critical warning (instance {{ $labels.instance }})
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Smart critical warning (instance {{ $labels.instance }})
|
# Alert rule: matches series where smartctl_device_temperature is above 80
# (unit is whatever the exporter reports; presumably degrees Celsius - confirm).
# `for: 2m` requires the condition to hold for 2 minutes before the alert fires.
alert: SmartDeviceTemperatureCritical
expr: smartctl_device_temperature > 80
for: 2m
labels:
  severity: critical
annotations:
  # Literal block scalar (|-): line breaks are kept, the trailing newline is
  # stripped. The template lines must be indented to belong to this value.
  description: |-
    Device temperature critical (instance {{ $labels.instance }})
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Smart device temperature critical (instance {{ $labels.instance }})
|
# Alert rule: matches series where smartctl_device_temperature is above 60
# (unit is whatever the exporter reports; presumably degrees Celsius - confirm).
# `for: 2m` requires the condition to hold for 2 minutes before the alert fires.
alert: SmartDeviceTemperatureWarning
expr: smartctl_device_temperature > 60
for: 2m
labels:
  severity: warning
annotations:
  # Literal block scalar (|-): line breaks are kept, the trailing newline is
  # stripped. The template lines must be indented to belong to this value.
  description: |-
    Device temperature warning (instance {{ $labels.instance }})
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Smart device temperature warning (instance {{ $labels.instance }})
|
# Alert rule: matches series where smartctl_device_media_errors is above 0.
# `for: 15m` requires the condition to hold for 15 minutes before the alert fires.
alert: SmartMediaErrors
expr: smartctl_device_media_errors > 0
for: 15m
labels:
  severity: critical
annotations:
  # Literal block scalar (|-): line breaks are kept, the trailing newline is
  # stripped. The template lines must be indented to belong to this value.
  description: |-
    device has media errors (instance {{ $labels.instance }})
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Smart media errors (instance {{ $labels.instance }})
|
|