1
0
Fork 0

add uptime check and four alert policies

- uptime check probes https://forge.kontain.me/api/v1/version every 60s
- alert: Forgejo is down (uptime fails 5+ min)
- alert: TLS cert expiring (<7 days remaining)
- alert: data disk >80% full (hourly disk-check.timer writes DISK_HIGH to journal; log-based metric)
- alert: container logs match panic/FATAL (cos_containers log-based metric)
- all alerts email admin_email via the existing notification channel

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Jason Hall 2026-05-07 21:35:43 -04:00
parent 15ea287728
commit af57959480
2 changed files with 202 additions and 0 deletions

View file

@@ -123,6 +123,38 @@ write_files:
[Install]
WantedBy=timers.target
- path: /var/lib/google/forgejo/disk-check.sh
permissions: '0755'
content: |
#!/bin/bash
set -euo pipefail
USE=$(df --output=pcent /mnt/disks/forgejo-data | tail -1 | tr -dc '0-9')
if [ "$USE" -gt 80 ]; then
logger -t forgejo-disk "DISK_HIGH: /mnt/disks/forgejo-data at $${USE}% used"
fi
- path: /etc/systemd/system/forgejo-disk-check.service
content: |
[Unit]
Description=Emit a log line if Forgejo data disk is >80% full
[Service]
Type=oneshot
ExecStart=/var/lib/google/forgejo/disk-check.sh
- path: /etc/systemd/system/forgejo-disk-check.timer
content: |
[Unit]
Description=Hourly disk-fullness check
[Timer]
OnBootSec=5min
OnUnitActiveSec=1h
Persistent=true
[Install]
WantedBy=timers.target
- path: /etc/systemd/system/forgejo-reboot.service
content: |
[Unit]
@@ -152,3 +184,4 @@ runcmd:
- systemctl enable --now forgejo-stack.service
- systemctl enable --now forgejo-backup.timer
- systemctl enable --now forgejo-reboot.timer
- systemctl enable --now forgejo-disk-check.timer

169
terraform/monitoring.tf Normal file
View file

@@ -0,0 +1,169 @@
# Probe Forgejo's version endpoint over HTTPS once a minute.
# validate_ssl stays enabled: besides verifying the chain, it makes this
# check emit the time_until_ssl_cert_expires metric that the TLS-expiry
# alert policy in this file consumes.
resource "google_monitoring_uptime_check_config" "forgejo" {
  display_name = "Forgejo /api/v1/version"
  period       = "60s"
  timeout      = "10s"

  http_check {
    request_method = "GET"
    use_ssl        = true
    validate_ssl   = true
    port           = "443"
    path           = "/api/v1/version"
  }

  monitored_resource {
    type = "uptime_url"
    labels = {
      host       = var.domain
      project_id = var.project_id
    }
  }
}
# Alert when the uptime check has been failing for five minutes.
# The aggregation is Google's recommended shape for uptime alerts:
# per 20-minute window, count the check results that came back "false"
# (REDUCE_COUNT_FALSE) and fire once that count exceeds 1 for 300s.
resource "google_monitoring_alert_policy" "forgejo_down" {
  display_name          = "Forgejo is down"
  combiner              = "OR"
  notification_channels = [google_monitoring_notification_channel.email.id]

  conditions {
    display_name = "Uptime check failing for 5+ min"

    condition_threshold {
      filter          = "resource.type=\"uptime_url\" AND metric.type=\"monitoring.googleapis.com/uptime_check/check_passed\" AND metric.label.\"check_id\"=\"${google_monitoring_uptime_check_config.forgejo.uptime_check_id}\""
      comparison      = "COMPARISON_GT"
      threshold_value = 1
      duration        = "300s"

      aggregations {
        alignment_period     = "1200s"
        per_series_aligner   = "ALIGN_NEXT_OLDER"
        cross_series_reducer = "REDUCE_COUNT_FALSE"
        group_by_fields      = ["resource.label.host"]
      }

      trigger {
        count = 1
      }
    }
  }

  # Close the incident automatically 30 minutes after checks recover.
  alert_strategy {
    auto_close = "1800s"
  }
}
# Warn before the site certificate lapses. time_until_ssl_cert_expires
# is reported in days, so a threshold of 7 with COMPARISON_LT means
# "under one week of validity remaining".
resource "google_monitoring_alert_policy" "tls_cert_expiry" {
  display_name          = "Forgejo TLS cert expiring soon"
  combiner              = "OR"
  notification_channels = [google_monitoring_notification_channel.email.id]

  conditions {
    display_name = "Cert expires in <7 days"

    condition_threshold {
      filter          = "resource.type=\"uptime_url\" AND metric.type=\"monitoring.googleapis.com/uptime_check/time_until_ssl_cert_expires\" AND metric.label.\"check_id\"=\"${google_monitoring_uptime_check_config.forgejo.uptime_check_id}\""
      comparison      = "COMPARISON_LT"
      threshold_value = 7
      duration        = "600s"

      aggregations {
        alignment_period   = "1200s"
        per_series_aligner = "ALIGN_NEXT_OLDER"
      }
    }
  }

  # Expiry only resolves on renewal; keep the incident open at most a day.
  alert_strategy {
    auto_close = "86400s"
  }
}
# Log-based counter of journal messages beginning with "DISK_HIGH:" —
# the marker the VM's hourly disk-check timer emits via `logger` when
# the data disk crosses 80% usage.
resource "google_logging_metric" "forgejo_disk_high" {
  name   = "forgejo_disk_high"
  filter = "resource.type=\"gce_instance\" AND jsonPayload.MESSAGE=~\"^DISK_HIGH:\""

  metric_descriptor {
    value_type  = "INT64"
    metric_kind = "DELTA"
  }
}
# Fire when any DISK_HIGH log line lands: count matching entries per
# hourly window and alert the moment the sum is non-zero.
resource "google_monitoring_alert_policy" "disk_full" {
  display_name          = "Forgejo data disk >80% full"
  combiner              = "OR"
  notification_channels = [google_monitoring_notification_channel.email.id]

  # The log-based metric must exist before this policy can reference it.
  depends_on = [google_logging_metric.forgejo_disk_high]

  conditions {
    display_name = "DISK_HIGH log line emitted"

    condition_threshold {
      filter          = "resource.type=\"gce_instance\" AND metric.type=\"logging.googleapis.com/user/forgejo_disk_high\""
      comparison      = "COMPARISON_GT"
      threshold_value = 0
      duration        = "0s" # trigger on the first matching datapoint

      aggregations {
        alignment_period     = "3600s"
        per_series_aligner   = "ALIGN_COUNT"
        cross_series_reducer = "REDUCE_SUM"
      }
    }
  }

  alert_strategy {
    auto_close = "86400s"
  }
}
# Log-based counter of panic/FATAL lines in the cos_containers log
# (container stdout/stderr on Container-Optimized OS). The filter checks
# both textPayload and jsonPayload.message so either payload form matches.
resource "google_logging_metric" "forgejo_panic" {
  name   = "forgejo_panic"
  filter = "resource.type=\"gce_instance\" AND logName=~\"projects/.*/logs/cos_containers\" AND (textPayload=~\"panic\" OR textPayload=~\"FATAL\" OR jsonPayload.message=~\"panic\" OR jsonPayload.message=~\"FATAL\")"

  metric_descriptor {
    value_type  = "INT64"
    metric_kind = "DELTA"
  }
}
# Notify on any panic/FATAL container log line: count matches per
# 5-minute window and fire as soon as the sum is non-zero.
resource "google_monitoring_alert_policy" "forgejo_panic" {
  display_name          = "Forgejo container panic/FATAL"
  combiner              = "OR"
  notification_channels = [google_monitoring_notification_channel.email.id]

  # The log-based metric must exist before this policy can reference it.
  depends_on = [google_logging_metric.forgejo_panic]

  conditions {
    display_name = "panic or FATAL in container logs"

    condition_threshold {
      filter          = "resource.type=\"gce_instance\" AND metric.type=\"logging.googleapis.com/user/forgejo_panic\""
      comparison      = "COMPARISON_GT"
      threshold_value = 0
      duration        = "0s" # trigger on the first matching datapoint

      aggregations {
        alignment_period     = "300s"
        per_series_aligner   = "ALIGN_COUNT"
        cross_series_reducer = "REDUCE_SUM"
      }
    }
  }

  alert_strategy {
    auto_close = "3600s"
  }
}