add uptime check and four alert policies

- uptime check probes https://forge.kontain.me/api/v1/version every 60s
- alert: Forgejo is down (uptime fails 5+ min)
- alert: TLS cert expiring (<7 days remaining)
- alert: data disk >80% full (hourly disk-check.timer writes DISK_HIGH to journal; log-based metric)
- alert: container logs match panic/FATAL (cos_containers log-based metric)
- all alerts email admin_email via the existing notification channel

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-07 21:35:43 -04:00 · 2026-05-07 21:35:43 -04:00 · af57959480
commit af57959480
parent 15ea287728
2 changed files with 202 additions and 0 deletions
--- a/cloud-init/user-data.yaml.tpl
+++ b/cloud-init/user-data.yaml.tpl
@ -123,6 +123,38 @@ write_files:
      [Install]
      WantedBy=timers.target

+  # Disk-fullness probe executed by forgejo-disk-check.service.
+  # Note: this file is a template; "$$" followed by "{" renders as a literal
+  # shell variable expansion, so keep that escape on braced variables.
+  - path: /var/lib/google/forgejo/disk-check.sh
+    permissions: '0755'
+    content: |
+      #!/bin/bash
+      # Writes one DISK_HIGH line to the journal (tag: forgejo-disk) when the
+      # Forgejo data disk is more than 80 percent used; otherwise stays silent.
+      set -euo pipefail
+      data_mount=/mnt/disks/forgejo-data
+      # Last line of df output is the usage figure; keep only its digits.
+      pct_used=$(df --output=pcent "$data_mount" | tail -n 1 | tr -cd '0-9')
+      # At or below the threshold (or no usable number): nothing to report.
+      [ "$pct_used" -gt 80 ] || exit 0
+      logger -t forgejo-disk "DISK_HIGH: $data_mount at $${pct_used}% used"
+
+  # Oneshot unit: each activation runs disk-check.sh once and then exits.
+  # It has no [Install] section, so it is not enabled on its own; it is meant
+  # to be started by the same-named forgejo-disk-check.timer.
+  - path: /etc/systemd/system/forgejo-disk-check.service
+    content: |
+      [Unit]
+      Description=Emit a log line if Forgejo data disk is >80% full
+
+      [Service]
+      Type=oneshot
+      ExecStart=/var/lib/google/forgejo/disk-check.sh
+
+  # Timer for forgejo-disk-check.service (matched by unit name): first run
+  # 5 minutes after boot, then 1 hour after each activation of the service.
+  # Persistent= is intentionally not set: systemd only honours it on
+  # OnCalendar= timers, so it has no effect on these monotonic triggers.
+  # The OnBootSec= run already covers the "check soon after a restart" case.
+  - path: /etc/systemd/system/forgejo-disk-check.timer
+    content: |
+      [Unit]
+      Description=Hourly disk-fullness check
+
+      [Timer]
+      OnBootSec=5min
+      OnUnitActiveSec=1h
+
+      [Install]
+      WantedBy=timers.target
+
  - path: /etc/systemd/system/forgejo-reboot.service
    content: |
      [Unit]
@ -152,3 +184,4 @@ runcmd:
  - systemctl enable --now forgejo-stack.service
  - systemctl enable --now forgejo-backup.timer
  - systemctl enable --now forgejo-reboot.timer
+  - systemctl enable --now forgejo-disk-check.timer