1
0
Fork 0

add uptime check and four alert policies

- uptime check probes https://forge.kontain.me/api/v1/version every 60s
- alert: Forgejo is down (uptime fails 5+ min)
- alert: TLS cert expiring (<7 days remaining)
- alert: data disk >80% full (hourly disk-check.timer writes DISK_HIGH to journal; log-based metric)
- alert: container logs match panic/FATAL (cos_containers log-based metric)
- all alerts email admin_email via the existing notification channel

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Jason Hall 2026-05-07 21:35:43 -04:00
parent 15ea287728
commit af57959480
2 changed files with 202 additions and 0 deletions

View file

@@ -123,6 +123,38 @@ write_files:
[Install]
WantedBy=timers.target
- path: /var/lib/google/forgejo/disk-check.sh
permissions: '0755'
content: |
#!/bin/bash
set -euo pipefail
USE=$(df --output=pcent /mnt/disks/forgejo-data | tail -1 | tr -dc '0-9')
if [ "$USE" -gt 80 ]; then
logger -t forgejo-disk "DISK_HIGH: /mnt/disks/forgejo-data at $${USE}% used"
fi
- path: /etc/systemd/system/forgejo-disk-check.service
content: |
[Unit]
Description=Emit a log line if Forgejo data disk is >80% full
[Service]
Type=oneshot
ExecStart=/var/lib/google/forgejo/disk-check.sh
- path: /etc/systemd/system/forgejo-disk-check.timer
content: |
[Unit]
Description=Hourly disk-fullness check
[Timer]
OnBootSec=5min
OnUnitActiveSec=1h
Persistent=true
[Install]
WantedBy=timers.target
- path: /etc/systemd/system/forgejo-reboot.service
content: |
[Unit]
@@ -152,3 +184,4 @@ runcmd:
- systemctl enable --now forgejo-stack.service
- systemctl enable --now forgejo-backup.timer
- systemctl enable --now forgejo-reboot.timer
- systemctl enable --now forgejo-disk-check.timer

169
terraform/monitoring.tf Normal file
View file

@@ -0,0 +1,169 @@
# Probe Forgejo's version endpoint over HTTPS once a minute.
# validate_ssl stays enabled: besides verifying the chain, it makes this
# check emit the time_until_ssl_cert_expires metric that the TLS-expiry
# alert policy in this file consumes.
resource "google_monitoring_uptime_check_config" "forgejo" {
  display_name = "Forgejo /api/v1/version"
  period       = "60s"
  timeout      = "10s"

  http_check {
    request_method = "GET"
    use_ssl        = true
    validate_ssl   = true
    port           = "443"
    path           = "/api/v1/version"
  }

  monitored_resource {
    type = "uptime_url"
    labels = {
      host       = var.domain
      project_id = var.project_id
    }
  }
}
# Alert when the uptime check has been failing for five minutes.
# The aggregation is Google's recommended shape for uptime alerts:
# per 20-minute window, count the check results that came back "false"
# (REDUCE_COUNT_FALSE) and fire once that count exceeds 1 for 300s.
resource "google_monitoring_alert_policy" "forgejo_down" {
  display_name          = "Forgejo is down"
  combiner              = "OR"
  notification_channels = [google_monitoring_notification_channel.email.id]

  conditions {
    display_name = "Uptime check failing for 5+ min"

    condition_threshold {
      filter          = "resource.type=\"uptime_url\" AND metric.type=\"monitoring.googleapis.com/uptime_check/check_passed\" AND metric.label.\"check_id\"=\"${google_monitoring_uptime_check_config.forgejo.uptime_check_id}\""
      comparison      = "COMPARISON_GT"
      threshold_value = 1
      duration        = "300s"

      aggregations {
        alignment_period     = "1200s"
        per_series_aligner   = "ALIGN_NEXT_OLDER"
        cross_series_reducer = "REDUCE_COUNT_FALSE"
        group_by_fields      = ["resource.label.host"]
      }

      trigger {
        count = 1
      }
    }
  }

  # Close the incident automatically 30 minutes after checks recover.
  alert_strategy {
    auto_close = "1800s"
  }
}
# Warn before the site certificate lapses. time_until_ssl_cert_expires
# is reported in days, so a threshold of 7 with COMPARISON_LT means
# "under one week of validity remaining".
resource "google_monitoring_alert_policy" "tls_cert_expiry" {
  display_name          = "Forgejo TLS cert expiring soon"
  combiner              = "OR"
  notification_channels = [google_monitoring_notification_channel.email.id]

  conditions {
    display_name = "Cert expires in <7 days"

    condition_threshold {
      filter          = "resource.type=\"uptime_url\" AND metric.type=\"monitoring.googleapis.com/uptime_check/time_until_ssl_cert_expires\" AND metric.label.\"check_id\"=\"${google_monitoring_uptime_check_config.forgejo.uptime_check_id}\""
      comparison      = "COMPARISON_LT"
      threshold_value = 7
      duration        = "600s"

      aggregations {
        alignment_period   = "1200s"
        per_series_aligner = "ALIGN_NEXT_OLDER"
      }
    }
  }

  # Expiry only resolves on renewal; keep the incident open at most a day.
  alert_strategy {
    auto_close = "86400s"
  }
}
# Log-based counter of journal messages beginning with "DISK_HIGH:" —
# the marker the VM's hourly disk-check timer emits via `logger` when
# the data disk crosses 80% usage.
resource "google_logging_metric" "forgejo_disk_high" {
  name   = "forgejo_disk_high"
  filter = "resource.type=\"gce_instance\" AND jsonPayload.MESSAGE=~\"^DISK_HIGH:\""

  metric_descriptor {
    value_type  = "INT64"
    metric_kind = "DELTA"
  }
}
# Fire when any DISK_HIGH log line lands: count matching entries per
# hourly window and alert the moment the sum is non-zero.
resource "google_monitoring_alert_policy" "disk_full" {
  display_name          = "Forgejo data disk >80% full"
  combiner              = "OR"
  notification_channels = [google_monitoring_notification_channel.email.id]

  # The log-based metric must exist before this policy can reference it.
  depends_on = [google_logging_metric.forgejo_disk_high]

  conditions {
    display_name = "DISK_HIGH log line emitted"

    condition_threshold {
      filter          = "resource.type=\"gce_instance\" AND metric.type=\"logging.googleapis.com/user/forgejo_disk_high\""
      comparison      = "COMPARISON_GT"
      threshold_value = 0
      duration        = "0s" # trigger on the first matching datapoint

      aggregations {
        alignment_period     = "3600s"
        per_series_aligner   = "ALIGN_COUNT"
        cross_series_reducer = "REDUCE_SUM"
      }
    }
  }

  alert_strategy {
    auto_close = "86400s"
  }
}
# Log-based counter of panic/FATAL lines in the cos_containers log
# (container stdout/stderr on Container-Optimized OS). The filter checks
# both textPayload and jsonPayload.message so either payload form matches.
resource "google_logging_metric" "forgejo_panic" {
  name   = "forgejo_panic"
  filter = "resource.type=\"gce_instance\" AND logName=~\"projects/.*/logs/cos_containers\" AND (textPayload=~\"panic\" OR textPayload=~\"FATAL\" OR jsonPayload.message=~\"panic\" OR jsonPayload.message=~\"FATAL\")"

  metric_descriptor {
    value_type  = "INT64"
    metric_kind = "DELTA"
  }
}
# Notify on any panic/FATAL container log line: count matches per
# 5-minute window and fire as soon as the sum is non-zero.
resource "google_monitoring_alert_policy" "forgejo_panic" {
  display_name          = "Forgejo container panic/FATAL"
  combiner              = "OR"
  notification_channels = [google_monitoring_notification_channel.email.id]

  # The log-based metric must exist before this policy can reference it.
  depends_on = [google_logging_metric.forgejo_panic]

  conditions {
    display_name = "panic or FATAL in container logs"

    condition_threshold {
      filter          = "resource.type=\"gce_instance\" AND metric.type=\"logging.googleapis.com/user/forgejo_panic\""
      comparison      = "COMPARISON_GT"
      threshold_value = 0
      duration        = "0s" # trigger on the first matching datapoint

      aggregations {
        alignment_period     = "300s"
        per_series_aligner   = "ALIGN_COUNT"
        cross_series_reducer = "REDUCE_SUM"
      }
    }
  }

  alert_strategy {
    auto_close = "3600s"
  }
}