Alerts

/etc/prometheus/alerts.rules.yml > major_alerting_rules
CPU (3 active)
# Fires when an instance's overall CPU utilisation, averaged across all of its
# cores over the last 5 minutes, is at or above 90%.
alert: CPU
# Utilisation is 100 minus the per-core average idle percentage, so the result
# always stays within 0-100. The previous expression summed per-core
# mode="system" rates, which scales with the number of cores and produced
# values above 100 that cannot be compared against a percentage threshold;
# it also ignored every non-system CPU mode.
expr: >-
  100 - (avg by (instance, job) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)
  >= 90
labels:
  severity: major
annotations:
  cluster: '{{ $labels.job | toUpper }}'
  # $value is a true percentage now, so it carries a % sign like the
  # descriptions of the other percentage-based rules.
  description: >-
    has high CPU usage ({{ $value | printf "%.2f" }}%) over last 5 minutes.
  summary: CPU average
Labels State Active Since Value
alertname="CPU" instance="b2b-db-local-1.tct.travel:9100" job="db" severity="major" firing 2025-12-07 13:10:29.882319426 +0000 UTC 167.51929824569328
alertname="CPU" instance="tb3.tct.travel:9100" job="php" severity="major" firing 2025-12-07 13:14:44.882319426 +0000 UTC 194.1676190476199
alertname="CPU" instance="b2b-db-local-2.tct.travel:9100" job="db" severity="major" firing 2025-12-07 13:09:44.882319426 +0000 UTC 217.35438596467444
Load Average (11 active)
# Fires when the 15-minute load average per CPU core is at or above 1, i.e.
# the run queue has been at least as long as the number of cores.
alert: Load Average
# The raw node_load15 value is divided by the number of CPUs of the same
# instance/job (one mode="idle" series exists per core). Comparing the raw
# load against 1 ignores core count and fires on multi-core hosts that are
# only lightly loaded.
expr: >-
  node_load15
  / on (instance, job) count by (instance, job) (node_cpu_seconds_total{mode="idle"})
  >= 1
labels:
  severity: major
annotations:
  cluster: '{{ $labels.job | toUpper }}'
  # $value is the per-core ratio produced by the expression above.
  description: >-
    has high per-core load average ({{ $value | printf "%.2f" }}) over last 15 minutes.
  summary: high load average
Labels State Active Since Value
alertname="Load Average" instance="api1.tct.travel:9100" job="php" severity="major" firing 2025-12-07 13:13:29.882319426 +0000 UTC 1.57
alertname="Load Average" instance="live8.tct.travel:9100" job="php" severity="major" firing 2025-12-07 13:18:44.882319426 +0000 UTC 1.02
alertname="Load Average" instance="b2b-db-local-2.tct.travel:9100" job="db" severity="major" firing 2025-12-07 13:09:29.882319426 +0000 UTC 4.01
alertname="Load Average" instance="live10.tct.travel:9100" job="php" severity="major" firing 2025-12-07 13:12:14.882319426 +0000 UTC 1.71
alertname="Load Average" instance="b2b-db-main.tct.travel:9100" job="db" severity="major" firing 2025-12-07 04:02:14.882319426 +0000 UTC 2.7
alertname="Load Average" instance="dbl6.tct.travel:9100" job="db" severity="major" firing 2025-12-07 13:05:59.882319426 +0000 UTC 3.31
alertname="Load Average" instance="b2b-db-local-1.tct.travel:9100" job="db" severity="major" firing 2025-12-07 13:09:44.882319426 +0000 UTC 3.27
alertname="Load Average" instance="live6.tct.travel:9100" job="php" severity="major" firing 2025-12-07 13:11:29.882319426 +0000 UTC 1.87
alertname="Load Average" instance="stage-b2b.tct.travel:9100" job="php" severity="major" firing 2025-11-19 03:46:14.882319426 +0000 UTC 60.69
alertname="Load Average" instance="live7.tct.travel:9100" job="php" severity="major" firing 2025-12-07 13:11:44.882319426 +0000 UTC 1.82
alertname="Load Average" instance="live9.tct.travel:9100" job="php" severity="major" firing 2025-12-07 13:12:29.882319426 +0000 UTC 1.52
Space Usage (4 active)
# Root filesystem usage rule: fires when the used share of the "/" mountpoint
# is 85% or more.
alert: 'Space Usage'
# Used percentage = 100 minus (available bytes * 100 / total bytes).
expr: >-
  100 - ((node_filesystem_avail_bytes{mountpoint="/"} * 100)
  / node_filesystem_size_bytes{mountpoint="/"})
  >= 85
labels:
  severity: 'major'
annotations:
  # Upper-cased job label of the firing series.
  cluster: "{{ $labels.job | toUpper }}"
  description: 'has high / usage ({{ $value | printf "%.2f" }}%).'
  summary: 'Space Usage'
Labels State Active Since Value
alertname="Space Usage" device="/dev/md2" fstype="ext4" instance="extranet.tct.travel:9100" job="extranet" mountpoint="/" severity="major" firing 2025-10-13 16:35:59.882319426 +0000 UTC 99.42625780893587
alertname="Space Usage" device="/dev/md2" fstype="ext4" instance="live2.tct.travel:9100" job="php" mountpoint="/" severity="major" firing 2025-11-26 21:10:29.882319426 +0000 UTC 94.75086407244413
alertname="Space Usage" device="/dev/md2" fstype="ext4" instance="extranet2.tct.travel:9100" job="extranet" mountpoint="/" severity="major" firing 2025-11-12 17:34:14.882319426 +0000 UTC 89.72809040904944
alertname="Space Usage" device="/dev/md2" fstype="ext4" instance="db1.tct.travel:9100" job="db" mountpoint="/" severity="major" firing 2025-11-15 17:51:44.882319426 +0000 UTC 99.09432763419717
Used RAM Memory (2 active)
# Memory usage rule: fires when the share of RAM that is not available
# is 75% or more.
alert: 'Used RAM Memory'
# Used percentage = 100 minus (MemAvailable * 100 / MemTotal).
expr: >-
  100 - ((node_memory_MemAvailable_bytes * 100)
  / node_memory_MemTotal_bytes)
  >= 75
labels:
  severity: 'major'
annotations:
  # Upper-cased job label of the firing series.
  cluster: "{{ $labels.job | toUpper }}"
  description: 'has high RAM usage ({{ $value | printf "%.2f" }}%).'
  summary: 'Used RAM Memory'
Labels State Active Since Value
alertname="Used RAM Memory" instance="live8.tct.travel:9100" job="php" severity="major" firing 2025-12-07 13:11:44.882319426 +0000 UTC 76.28320038296732
alertname="Used RAM Memory" instance="extranet3.tct.travel:9100" job="extranet" severity="major" firing 2025-12-04 14:23:14.882319426 +0000 UTC 90.64428365409191