diff --git a/cloud-init/user-data.yaml.tpl b/cloud-init/user-data.yaml.tpl
index 73ef206..f3a00a3 100644
--- a/cloud-init/user-data.yaml.tpl
+++ b/cloud-init/user-data.yaml.tpl
@@ -123,6 +123,38 @@ write_files:
       [Install]
       WantedBy=timers.target
 
+  - path: /var/lib/google/forgejo/disk-check.sh
+    permissions: '0755'
+    content: |
+      #!/bin/bash
+      # Emit a DISK_HIGH syslog line when the data disk is more than 80% used.
+      set -euo pipefail
+      USE=$(df --output=pcent /mnt/disks/forgejo-data | tail -1 | tr -dc '0-9')
+      if [ "$USE" -gt 80 ]; then
+        logger -t forgejo-disk "DISK_HIGH: /mnt/disks/forgejo-data at $${USE}% used"
+      fi
+
+  - path: /etc/systemd/system/forgejo-disk-check.service
+    content: |
+      [Unit]
+      Description=Emit a log line if Forgejo data disk is >80% full
+
+      [Service]
+      Type=oneshot
+      ExecStart=/var/lib/google/forgejo/disk-check.sh
+
+  - path: /etc/systemd/system/forgejo-disk-check.timer
+    content: |
+      [Unit]
+      Description=Hourly disk-fullness check
+
+      [Timer]
+      OnBootSec=5min
+      OnUnitActiveSec=1h
+
+      [Install]
+      WantedBy=timers.target
+
   - path: /etc/systemd/system/forgejo-reboot.service
     content: |
       [Unit]
@@ -152,3 +184,4 @@ runcmd:
   - systemctl enable --now forgejo-stack.service
   - systemctl enable --now forgejo-backup.timer
   - systemctl enable --now forgejo-reboot.timer
+  - systemctl enable --now forgejo-disk-check.timer
diff --git a/terraform/monitoring.tf b/terraform/monitoring.tf
new file mode 100644
index 0000000..af4936c
--- /dev/null
+++ b/terraform/monitoring.tf
@@ -0,0 +1,180 @@
+# HTTPS uptime check: GET /api/v1/version on var.domain every 60s.
+resource "google_monitoring_uptime_check_config" "forgejo" {
+  display_name = "Forgejo /api/v1/version"
+  timeout      = "10s"
+  period       = "60s"
+
+  http_check {
+    path           = "/api/v1/version"
+    port           = 443
+    use_ssl        = true
+    validate_ssl   = true
+    request_method = "GET"
+  }
+
+  monitored_resource {
+    type = "uptime_url"
+    labels = {
+      project_id = var.project_id
+      host       = var.domain
+    }
+  }
+}
+
+# Notifies the email channel when the uptime check above is failing.
+resource "google_monitoring_alert_policy" "forgejo_down" {
+  display_name = "Forgejo is down"
+  combiner     = "OR"
+
+  conditions {
+    display_name = "Uptime check failing for 5+ min"
+    condition_threshold {
+      filter = join(" AND ", [
+        "resource.type=\"uptime_url\"",
+        "metric.type=\"monitoring.googleapis.com/uptime_check/check_passed\"",
+        "metric.label.\"check_id\"=\"${google_monitoring_uptime_check_config.forgejo.uptime_check_id}\"",
+      ])
+      duration        = "300s"
+      comparison      = "COMPARISON_GT"
+      threshold_value = 1
+
+      aggregations {
+        alignment_period   = "1200s"
+        per_series_aligner = "ALIGN_NEXT_OLDER"
+        # Counts false check_passed values per host; the condition needs more than 1.
+        cross_series_reducer = "REDUCE_COUNT_FALSE"
+        group_by_fields      = ["resource.label.host"]
+      }
+
+      trigger {
+        count = 1
+      }
+    }
+  }
+
+  notification_channels = [google_monitoring_notification_channel.email.id]
+
+  alert_strategy {
+    auto_close = "1800s"
+  }
+}
+
+# Notifies the email channel when the certificate seen by the uptime check nears expiry.
+resource "google_monitoring_alert_policy" "tls_cert_expiry" {
+  display_name = "Forgejo TLS cert expiring soon"
+  combiner     = "OR"
+
+  conditions {
+    display_name = "Cert expires in <7 days"
+    condition_threshold {
+      filter = join(" AND ", [
+        "resource.type=\"uptime_url\"",
+        "metric.type=\"monitoring.googleapis.com/uptime_check/time_until_ssl_cert_expires\"",
+        "metric.label.\"check_id\"=\"${google_monitoring_uptime_check_config.forgejo.uptime_check_id}\"",
+      ])
+      duration        = "600s"
+      comparison      = "COMPARISON_LT"
+      threshold_value = 7
+
+      aggregations {
+        alignment_period   = "1200s"
+        per_series_aligner = "ALIGN_NEXT_OLDER"
+      }
+    }
+  }
+
+  notification_channels = [google_monitoring_notification_channel.email.id]
+
+  alert_strategy {
+    auto_close = "86400s"
+  }
+}
+
+# Counter metric: GCE instance log entries whose MESSAGE starts with "DISK_HIGH:" (logged by disk-check.sh).
+resource "google_logging_metric" "forgejo_disk_high" {
+  name   = "forgejo_disk_high"
+  filter = "resource.type=\"gce_instance\" AND jsonPayload.MESSAGE=~\"^DISK_HIGH:\""
+
+  metric_descriptor {
+    metric_kind = "DELTA"
+    value_type  = "INT64"
+  }
+}
+
+# Notifies the email channel as soon as any DISK_HIGH entry is counted.
+resource "google_monitoring_alert_policy" "disk_full" {
+  display_name = "Forgejo data disk >80% full"
+  combiner     = "OR"
+
+  conditions {
+    display_name = "DISK_HIGH log line emitted"
+    condition_threshold {
+      filter          = "resource.type=\"gce_instance\" AND metric.type=\"logging.googleapis.com/user/forgejo_disk_high\""
+      duration        = "0s"
+      comparison      = "COMPARISON_GT"
+      threshold_value = 0
+
+      aggregations {
+        alignment_period = "3600s"
+        # ALIGN_SUM adds up the counted log entries; ALIGN_COUNT would count data
+        # points instead, so zero-valued points could trip the "> 0" threshold.
+        per_series_aligner   = "ALIGN_SUM"
+        cross_series_reducer = "REDUCE_SUM"
+      }
+    }
+  }
+
+  notification_channels = [google_monitoring_notification_channel.email.id]
+
+  alert_strategy {
+    auto_close = "86400s"
+  }
+
+  depends_on = [google_logging_metric.forgejo_disk_high]
+}
+
+# Counter metric: cos_containers log entries whose payload matches "panic" or "FATAL".
+resource "google_logging_metric" "forgejo_panic" {
+  name = "forgejo_panic"
+  filter = join(" AND ", [
+    "resource.type=\"gce_instance\"",
+    "logName=~\"projects/.*/logs/cos_containers\"",
+    "(textPayload=~\"panic\" OR textPayload=~\"FATAL\" OR jsonPayload.message=~\"panic\" OR jsonPayload.message=~\"FATAL\")",
+  ])
+
+  metric_descriptor {
+    metric_kind = "DELTA"
+    value_type  = "INT64"
+  }
+}
+
+# Notifies the email channel as soon as any panic/FATAL entry is counted.
+resource "google_monitoring_alert_policy" "forgejo_panic" {
+  display_name = "Forgejo container panic/FATAL"
+  combiner     = "OR"
+
+  conditions {
+    display_name = "panic or FATAL in container logs"
+    condition_threshold {
+      filter          = "resource.type=\"gce_instance\" AND metric.type=\"logging.googleapis.com/user/forgejo_panic\""
+      duration        = "0s"
+      comparison      = "COMPARISON_GT"
+      threshold_value = 0
+
+      aggregations {
+        alignment_period = "300s"
+        # ALIGN_SUM, not ALIGN_COUNT: sum the counter values rather than count data points.
+        per_series_aligner   = "ALIGN_SUM"
+        cross_series_reducer = "REDUCE_SUM"
+      }
+    }
+  }
+
+  notification_channels = [google_monitoring_notification_channel.email.id]
+
+  alert_strategy {
+    auto_close = "3600s"
+  }
+
+  depends_on = [google_logging_metric.forgejo_panic]
+}