Files
Charon/backend/internal/services/uptime_service.go
GitHub Actions 697ef6d200 feat: implement comprehensive test optimization
- Add gotestsum for real-time test progress visibility
- Parallelize 174 tests across 14 files for faster execution
- Add -short mode support skipping 21 heavy integration tests
- Create testutil/db.go helper for future transaction rollbacks
- Fix data race in notification_service_test.go
- Fix 4 CrowdSec LAPI test failures with permissive validator

Performance improvements:
- Tests now run in parallel (174 tests with t.Parallel())
- Quick feedback loop via -short mode
- Zero race conditions detected
- Coverage maintained at 87.7%

Closes test optimization initiative
2026-01-03 19:42:53 +00:00

1106 lines
31 KiB
Go

package services
import (
"context"
"encoding/json"
"errors"
"fmt"
"net"
"net/http"
"net/url"
"strings"
"sync"
"time"
"github.com/Wikid82/charon/backend/internal/logger"
"github.com/Wikid82/charon/backend/internal/models"
"github.com/Wikid82/charon/backend/internal/network"
"github.com/Wikid82/charon/backend/internal/security"
"github.com/Wikid82/charon/backend/internal/util"
"gorm.io/gorm"
)
// UptimeService monitors the availability of proxy hosts and remote servers,
// recording heartbeats and dispatching up/down notifications (batched per
// host for DOWN events).
type UptimeService struct {
	DB                  *gorm.DB
	NotificationService *NotificationService
	// Batching: DOWN notifications are collected per host and flushed
	// together once batchWindow elapses (see queueDownNotification).
	pendingNotifications map[string]*pendingHostNotification
	notificationMutex    sync.Mutex // guards pendingNotifications
	batchWindow          time.Duration
	// Host-specific mutexes serialize database updates for the same
	// UptimeHost; hostMutexLock guards the hostMutexes map itself.
	hostMutexes   map[string]*sync.Mutex
	hostMutexLock sync.Mutex
	// Configuration of timeouts and thresholds used by the check routines.
	config UptimeConfig
}
// UptimeConfig holds configurable timeouts and thresholds for uptime checks.
type UptimeConfig struct {
	TCPTimeout       time.Duration // dial timeout for each host-level TCP probe
	MaxRetries       int           // extra attempts after a failed host TCP round
	FailureThreshold int           // consecutive failed rounds before a host is marked down
	CheckTimeout     time.Duration // overall deadline for one round of host checks
	StaggerDelay     time.Duration // delay between launching successive host checks
}
// pendingHostNotification accumulates down-monitor details for one host
// while the batch-window timer is running.
type pendingHostNotification struct {
	hostID       string
	hostName     string
	downMonitors []monitorDownInfo
	timer        *time.Timer // fires flushPendingNotification after batchWindow
	createdAt    time.Time
}
// monitorDownInfo captures the details of a monitor that transitioned to
// down, used to build consolidated notification messages.
type monitorDownInfo struct {
	ID             string
	Name           string
	URL            string
	Message        string // failure reason shown in the notification body
	PreviousUptime string // pre-formatted duration the service was up; may be empty
}
// NewUptimeService constructs an UptimeService wired to the given database
// and notification service, with default timeouts, retry limits, and a
// 30-second DOWN-notification batching window.
func NewUptimeService(db *gorm.DB, ns *NotificationService) *UptimeService {
	defaults := UptimeConfig{
		TCPTimeout:       10 * time.Second,
		MaxRetries:       2,
		FailureThreshold: 2,
		CheckTimeout:     60 * time.Second,
		StaggerDelay:     100 * time.Millisecond,
	}
	return &UptimeService{
		DB:                   db,
		NotificationService:  ns,
		pendingNotifications: map[string]*pendingHostNotification{},
		batchWindow:          30 * time.Second, // Wait 30 seconds to batch notifications
		hostMutexes:          map[string]*sync.Mutex{},
		config:               defaults,
	}
}
// extractPort returns the port component of urlStr, which may be either a
// full URL (with an optional explicit port) or a bare "host:port" pair.
// For http/https URLs without an explicit port the scheme default (80/443)
// is returned; anything else yields the empty string.
func extractPort(urlStr string) string {
	// Attempt URL parsing first; bare host:port inputs typically either fail
	// to parse or produce an empty Host and fall through below.
	if parsed, err := url.Parse(urlStr); err == nil && parsed.Host != "" {
		if p := parsed.Port(); p != "" {
			return p
		}
		// No explicit port: fall back to well-known scheme defaults.
		switch parsed.Scheme {
		case "https":
			return "443"
		case "http":
			return "80"
		}
	}
	// Interpret the input as a plain host:port pair (IPv6-aware).
	if _, p, err := net.SplitHostPort(urlStr); err == nil {
		return p
	}
	return ""
}
// formatDuration renders d (rounded to whole seconds) in a compact
// human-readable form, showing at most three units: "Nd Nh Nm", "Nh Nm Ns",
// "Nm Ns", or "Ns" depending on magnitude.
func formatDuration(d time.Duration) string {
	d = d.Round(time.Second)
	total := int(d.Seconds())
	days := total / 86400
	hours := (total / 3600) % 24
	minutes := (total / 60) % 60
	seconds := total % 60
	switch {
	case days > 0:
		return fmt.Sprintf("%dd %dh %dm", days, hours, minutes)
	case hours > 0:
		return fmt.Sprintf("%dh %dm %ds", hours, minutes, seconds)
	case minutes > 0:
		return fmt.Sprintf("%dm %ds", minutes, seconds)
	default:
		return fmt.Sprintf("%ds", seconds)
	}
}
// SyncMonitors ensures every ProxyHost and RemoteServer has a corresponding
// UptimeMonitor and that UptimeHosts exist for grouping monitors by upstream.
// Existing monitors are updated in place when the source record changes.
//
// Fixes vs. previous revision: sentinel errors are matched with errors.Is
// (consistent with SyncMonitorForHost), Save/lookup errors are logged instead
// of being silently dropped, and the redundant `== "" ||` upstream check is
// simplified.
func (s *UptimeService) SyncMonitors() error {
	var hosts []models.ProxyHost
	if err := s.DB.Find(&hosts).Error; err != nil {
		return err
	}
	for _, host := range hosts {
		var monitor models.UptimeMonitor
		err := s.DB.Where("proxy_host_id = ?", host.ID).First(&monitor).Error

		domains := strings.Split(host.DomainNames, ",")
		firstDomain := ""
		if len(domains) > 0 {
			firstDomain = strings.TrimSpace(domains[0])
		}
		// Construct the public URL used for HTTP checks.
		scheme := "http"
		if host.SSLForced {
			scheme = "https"
		}
		publicURL := fmt.Sprintf("%s://%s", scheme, firstDomain)
		internalURL := fmt.Sprintf("%s:%d", host.ForwardHost, host.ForwardPort)
		// The upstream host used for grouping is the ForwardHost.
		upstreamHost := host.ForwardHost

		switch {
		case errors.Is(err, gorm.ErrRecordNotFound):
			// Create a new monitor for this proxy host.
			name := host.Name
			if name == "" {
				name = firstDomain
			}
			// Find or create the grouping UptimeHost.
			uptimeHostID := s.ensureUptimeHost(upstreamHost, name)
			monitor = models.UptimeMonitor{
				ProxyHostID:  &host.ID,
				UptimeHostID: &uptimeHostID,
				Name:         name,
				Type:         "http", // Check public access
				URL:          publicURL,
				UpstreamHost: upstreamHost,
				Interval:     60,
				Enabled:      true,
				Status:       "pending",
			}
			if err := s.DB.Create(&monitor).Error; err != nil {
				logger.Log().WithError(err).WithField("host_id", host.ID).Error("Failed to create monitor")
			}
		case err == nil:
			// Always sync the name from the proxy host.
			newName := host.Name
			if newName == "" {
				newName = firstDomain
			}
			needsSave := false
			if monitor.Name != newName {
				monitor.Name = newName
				needsSave = true
			}
			// Ensure the upstream host is set for grouping.
			if monitor.UpstreamHost != upstreamHost {
				monitor.UpstreamHost = upstreamHost
				needsSave = true
			}
			// Ensure the UptimeHost link exists.
			if monitor.UptimeHostID == nil {
				uptimeHostID := s.ensureUptimeHost(upstreamHost, newName)
				monitor.UptimeHostID = &uptimeHostID
				needsSave = true
			}
			// Migrate monitors still using the old default (TCP to internal upstream).
			if monitor.Type == "tcp" && monitor.URL == internalURL {
				monitor.Type = "http"
				monitor.URL = publicURL
				needsSave = true
				logger.Log().WithField("host_id", host.ID).Infof("Migrated monitor for host %d to check public URL: %s", host.ID, publicURL)
			}
			// Upgrade to HTTPS if SSL is forced and we are still checking HTTP.
			if host.SSLForced && strings.HasPrefix(monitor.URL, "http://") {
				monitor.URL = strings.Replace(monitor.URL, "http://", "https://", 1)
				needsSave = true
				logger.Log().WithField("host_id", host.ID).Infof("Upgraded monitor for host %d to HTTPS: %s", host.ID, monitor.URL)
			}
			if needsSave {
				if err := s.DB.Save(&monitor).Error; err != nil {
					logger.Log().WithError(err).WithField("host_id", host.ID).Error("Failed to update monitor")
				}
			}
		default:
			// Unexpected lookup error — previously swallowed silently.
			logger.Log().WithError(err).WithField("host_id", host.ID).Error("Failed to query monitor for proxy host")
		}
	}

	// Sync remote servers.
	var remoteServers []models.RemoteServer
	if err := s.DB.Find(&remoteServers).Error; err != nil {
		return err
	}
	for _, server := range remoteServers {
		var monitor models.UptimeMonitor
		err := s.DB.Where("remote_server_id = ?", server.ID).First(&monitor).Error

		// Remote servers default to a TCP check unless an HTTP scheme is set.
		targetType := "tcp"
		targetURL := fmt.Sprintf("%s:%d", server.Host, server.Port)
		if server.Scheme == "http" || server.Scheme == "https" {
			targetType = server.Scheme
			targetURL = fmt.Sprintf("%s://%s:%d", server.Scheme, server.Host, server.Port)
		}
		// The upstream host used for grouping.
		upstreamHost := server.Host

		switch {
		case errors.Is(err, gorm.ErrRecordNotFound):
			// Find or create the grouping UptimeHost, then create the monitor.
			uptimeHostID := s.ensureUptimeHost(upstreamHost, server.Name)
			monitor = models.UptimeMonitor{
				RemoteServerID: &server.ID,
				UptimeHostID:   &uptimeHostID,
				Name:           server.Name,
				Type:           targetType,
				URL:            targetURL,
				UpstreamHost:   upstreamHost,
				Interval:       60,
				Enabled:        server.Enabled,
				Status:         "pending",
			}
			if err := s.DB.Create(&monitor).Error; err != nil {
				logger.Log().WithError(err).WithField("remote_server_id", server.ID).Error("Failed to create monitor for remote server")
			}
		case err == nil:
			needsSave := false
			if monitor.Name != server.Name {
				monitor.Name = server.Name
				needsSave = true
			}
			// Ensure the upstream host is set for grouping.
			if monitor.UpstreamHost != upstreamHost {
				monitor.UpstreamHost = upstreamHost
				needsSave = true
			}
			// Ensure the UptimeHost link exists.
			if monitor.UptimeHostID == nil {
				uptimeHostID := s.ensureUptimeHost(upstreamHost, server.Name)
				monitor.UptimeHostID = &uptimeHostID
				needsSave = true
			}
			if monitor.URL != targetURL || monitor.Type != targetType {
				monitor.URL = targetURL
				monitor.Type = targetType
				needsSave = true
			}
			if monitor.Enabled != server.Enabled {
				monitor.Enabled = server.Enabled
				needsSave = true
			}
			if needsSave {
				if err := s.DB.Save(&monitor).Error; err != nil {
					logger.Log().WithError(err).WithField("remote_server_id", server.ID).Error("Failed to update monitor")
				}
			}
		default:
			// Unexpected lookup error — previously swallowed silently.
			logger.Log().WithError(err).WithField("remote_server_id", server.ID).Error("Failed to query monitor for remote server")
		}
	}
	return nil
}
// ensureUptimeHost finds or creates an UptimeHost record for the given host
// string and returns its ID. Returns "" if the lookup or the creation fails.
//
// Fixes vs. previous revision: the sentinel is matched with errors.Is, and an
// unexpected lookup error is now logged instead of silently yielding the
// zero-value ID.
func (s *UptimeService) ensureUptimeHost(host, defaultName string) string {
	var uptimeHost models.UptimeHost
	err := s.DB.Where("host = ?", host).First(&uptimeHost).Error
	switch {
	case err == nil:
		// Already exists; fall through and return its ID.
	case errors.Is(err, gorm.ErrRecordNotFound):
		uptimeHost = models.UptimeHost{
			Host:   host,
			Name:   defaultName,
			Status: "pending",
		}
		if err := s.DB.Create(&uptimeHost).Error; err != nil {
			logger.Log().WithError(err).WithField("host", util.SanitizeForLog(host)).Error("Failed to create UptimeHost")
			return ""
		}
		logger.Log().WithField("host_id", uptimeHost.ID).WithField("host", util.SanitizeForLog(uptimeHost.Host)).Info("Created UptimeHost")
	default:
		// Unexpected DB error: surface it so callers' empty-ID monitors are
		// explainable (previously this returned "" without any log).
		logger.Log().WithError(err).WithField("host", util.SanitizeForLog(host)).Error("Failed to look up UptimeHost")
		return ""
	}
	return uptimeHost.ID
}
// CheckAll runs checks for all enabled monitors with a host-level pre-check:
// hosts already marked "down" have all their monitors bulk-marked down,
// while monitors on other hosts are each checked in their own goroutine
// (fire-and-forget; completion is not awaited here).
func (s *UptimeService) CheckAll() {
	// First, refresh the up/down status of every UptimeHost via TCP checks.
	s.checkAllHosts()
	var monitors []models.UptimeMonitor
	if err := s.DB.Where("enabled = ?", true).Find(&monitors).Error; err != nil {
		logger.Log().WithError(err).Error("Failed to fetch monitors")
		return
	}
	// Group monitors by UptimeHost ID ("" for monitors without a host link).
	hostMonitors := make(map[string][]models.UptimeMonitor)
	for _, monitor := range monitors {
		hostID := ""
		if monitor.UptimeHostID != nil {
			hostID = *monitor.UptimeHostID
		}
		hostMonitors[hostID] = append(hostMonitors[hostID], monitor)
	}
	// Check each host's monitors.
	for hostID, monitors := range hostMonitors {
		// If the host is down, mark all its monitors as down without
		// performing individual checks.
		if hostID != "" {
			var uptimeHost models.UptimeHost
			if err := s.DB.First(&uptimeHost, "id = ?", hostID).Error; err == nil {
				if uptimeHost.Status == "down" {
					s.markHostMonitorsDown(monitors, &uptimeHost)
					continue
				}
			}
		}
		// Host is up (or unlinked/unknown): check individual monitors concurrently.
		for _, monitor := range monitors {
			go s.checkMonitor(monitor)
		}
	}
}
// checkAllHosts performs a TCP connectivity check on all UptimeHosts in
// parallel, staggering goroutine launches by StaggerDelay and bounding the
// whole round with CheckTimeout. Blocks until every launched check returns.
func (s *UptimeService) checkAllHosts() {
	var hosts []models.UptimeHost
	if err := s.DB.Find(&hosts).Error; err != nil {
		logger.Log().WithError(err).Error("Failed to fetch uptime hosts")
		return
	}
	if len(hosts) == 0 {
		return
	}
	logger.Log().WithField("host_count", len(hosts)).Info("Starting host checks")
	// Shared deadline for every check in this round.
	ctx, cancel := context.WithTimeout(context.Background(), s.config.CheckTimeout)
	defer cancel()
	var wg sync.WaitGroup
	for i := range hosts {
		wg.Add(1)
		// Staggered startup to reduce load spikes.
		if i > 0 {
			time.Sleep(s.config.StaggerDelay)
		}
		go func(host *models.UptimeHost) {
			defer wg.Done()
			// Skip the check entirely if the round deadline already passed.
			select {
			case <-ctx.Done():
				logger.Log().WithField("host_name", host.Name).Warn("Host check cancelled due to timeout")
				return
			default:
				s.checkHost(ctx, host)
			}
		}(&hosts[i])
	}
	wg.Wait() // Wait for all host checks to complete
	logger.Log().WithField("host_count", len(hosts)).Info("All host checks completed")
}
// checkHost performs a basic TCP connectivity check to determine whether the
// host is reachable on any of its monitors' ports, with retry and
// failure-count debouncing: a host only flips to "down" after
// FailureThreshold consecutive failed rounds, and any success resets the
// counter. The result (status, latency, failure count) is saved to the DB.
func (s *UptimeService) checkHost(ctx context.Context, host *models.UptimeHost) {
	// Acquire the host-specific mutex to serialize DB updates for this host.
	s.hostMutexLock.Lock()
	if s.hostMutexes[host.ID] == nil {
		s.hostMutexes[host.ID] = &sync.Mutex{}
	}
	mutex := s.hostMutexes[host.ID]
	s.hostMutexLock.Unlock()
	mutex.Lock()
	defer mutex.Unlock()
	start := time.Now()
	logger.Log().WithFields(map[string]any{
		"host_name": host.Name,
		"host_ip":   host.Host,
		"host_id":   host.ID,
	}).Debug("Starting TCP check for host")
	// Candidate ports come from this host's monitors.
	var monitors []models.UptimeMonitor
	s.DB.Preload("ProxyHost").Where("uptime_host_id = ?", host.ID).Find(&monitors)
	logger.Log().WithField("host_name", host.Name).WithField("monitor_count", len(monitors)).Debug("Retrieved monitors for host")
	if len(monitors) == 0 {
		return
	}
	// Try to connect to any of the monitor ports, with retry logic.
	success := false
	var msg string
	var lastErr error
	for retry := 0; retry <= s.config.MaxRetries && !success; retry++ {
		if retry > 0 {
			logger.Log().WithFields(map[string]any{
				"host_name": host.Name,
				"retry":     retry,
				"max":       s.config.MaxRetries,
			}).Info("Retrying TCP check")
			time.Sleep(2 * time.Second) // Brief delay between retries
		}
		// Abort early if the round's deadline has already passed.
		select {
		case <-ctx.Done():
			logger.Log().WithField("host_name", host.Name).Warn("TCP check cancelled")
			return
		default:
		}
		for _, monitor := range monitors {
			var port string
			// Prefer the actual backend port from the linked ProxyHost.
			if monitor.ProxyHost != nil {
				port = fmt.Sprintf("%d", monitor.ProxyHost.ForwardPort)
			} else {
				// Fallback: extract the port from the URL for standalone monitors.
				port = extractPort(monitor.URL)
			}
			if port == "" {
				continue
			}
			logger.Log().WithFields(map[string]any{
				"monitor":        monitor.Name,
				"extracted_port": extractPort(monitor.URL),
				"actual_port":    port,
				"host":           host.Host,
				"retry":          retry,
			}).Debug("TCP check port resolution")
			// Use net.JoinHostPort for IPv6 compatibility.
			addr := net.JoinHostPort(host.Host, port)
			// Dial with the configured per-connection timeout, honoring ctx.
			dialer := net.Dialer{Timeout: s.config.TCPTimeout}
			conn, err := dialer.DialContext(ctx, "tcp", addr)
			if err == nil {
				if err := conn.Close(); err != nil {
					logger.Log().WithError(err).Warn("failed to close tcp connection")
				}
				success = true
				msg = fmt.Sprintf("TCP connection to %s successful (retry %d)", addr, retry)
				logger.Log().WithFields(map[string]any{
					"host_name": host.Name,
					"addr":      addr,
					"retry":     retry,
				}).Debug("TCP connection successful")
				break
			}
			lastErr = err
			msg = fmt.Sprintf("TCP check failed: %v", err)
		}
	}
	latency := time.Since(start).Milliseconds()
	oldStatus := host.Status
	var newStatus string
	// Failure-count debouncing: only flip to "down" after FailureThreshold
	// consecutive failed rounds; any success resets the counter.
	if success {
		host.FailureCount = 0
		newStatus = "up"
	} else {
		host.FailureCount++
		if host.FailureCount >= s.config.FailureThreshold {
			newStatus = "down"
		} else {
			// Keep the current status until the threshold is reached.
			newStatus = host.Status
			logger.Log().WithFields(map[string]any{
				"host_name":     host.Name,
				"failure_count": host.FailureCount,
				"threshold":     s.config.FailureThreshold,
				"last_error":    lastErr,
			}).Warn("Host check failed, waiting for threshold")
		}
	}
	// "pending" -> anything counts as initial discovery, not a status change.
	statusChanged := oldStatus != newStatus && oldStatus != "pending"
	host.Status = newStatus
	host.LastCheck = time.Now()
	host.Latency = latency
	if statusChanged {
		host.LastStatusChange = time.Now()
		logger.Log().WithFields(map[string]any{
			"host_name": host.Name,
			"host_ip":   host.Host,
			"old":       oldStatus,
			"new":       newStatus,
			"message":   msg,
		}).Info("Host status changed")
	}
	logger.Log().WithFields(map[string]any{
		"host_name":      host.Name,
		"host_ip":        host.Host,
		"success":        success,
		"failure_count":  host.FailureCount,
		"old_status":     oldStatus,
		"new_status":     newStatus,
		"elapsed_ms":     latency,
		"status_changed": statusChanged,
	}).Debug("Host TCP check completed")
	s.DB.Save(host)
}
// markHostMonitorsDown marks every monitor of a down host as down (without
// individual checks) and sends a single consolidated notification, rate
// limited to once per 5 minutes per host. Monitors already "down" are
// skipped; "pending" monitors are updated but excluded from the notification.
func (s *UptimeService) markHostMonitorsDown(monitors []models.UptimeMonitor, host *models.UptimeHost) {
	downMonitors := []monitorDownInfo{}
	for i := range monitors {
		monitor := &monitors[i]
		oldStatus := monitor.Status
		if oldStatus == "down" {
			continue // Already down, no need to update
		}
		// Compute how long the monitor had been in its previous state.
		var durationStr string
		if !monitor.LastStatusChange.IsZero() {
			duration := time.Since(monitor.LastStatusChange)
			durationStr = formatDuration(duration)
		}
		monitor.Status = "down"
		monitor.LastCheck = time.Now()
		monitor.FailureCount = monitor.MaxRetries // Max out failure count
		if oldStatus != "pending" {
			monitor.LastStatusChange = time.Now()
		}
		// NOTE(review): presumably suppresses a duplicate per-monitor
		// notification elsewhere — confirm where NotifiedInBatch is read.
		monitor.NotifiedInBatch = true
		s.DB.Save(monitor)
		// Record a heartbeat reflecting the host-level failure.
		heartbeat := models.UptimeHeartbeat{
			MonitorID: monitor.ID,
			Status:    "down",
			Latency:   0,
			Message:   "Host unreachable",
		}
		s.DB.Create(&heartbeat)
		if oldStatus != "pending" && oldStatus != "down" {
			downMonitors = append(downMonitors, monitorDownInfo{
				ID:             monitor.ID,
				Name:           monitor.Name,
				URL:            monitor.URL,
				Message:        "Host unreachable",
				PreviousUptime: durationStr,
			})
		}
	}
	// Send one consolidated notification, at most every 5 minutes per host.
	if len(downMonitors) > 0 && time.Since(host.LastNotifiedDown) > 5*time.Minute {
		s.sendHostDownNotification(host, downMonitors)
	}
}
// sendHostDownNotification sends a single consolidated DOWN notification for
// a host: it stores an in-app notification, records an
// UptimeNotificationEvent, updates the host's notification-tracking fields
// (which drive the 5-minute rate limit in markHostMonitorsDown), and sends an
// external notification.
func (s *UptimeService) sendHostDownNotification(host *models.UptimeHost, downMonitors []monitorDownInfo) {
	title := fmt.Sprintf("🔴 Host %s is DOWN (%d services affected)", host.Name, len(downMonitors))
	var sb strings.Builder
	sb.WriteString(fmt.Sprintf("Host: %s (%s)\n", host.Name, host.Host))
	sb.WriteString(fmt.Sprintf("Time: %s\n", time.Now().Format(time.RFC1123)))
	sb.WriteString(fmt.Sprintf("Services affected: %d\n\n", len(downMonitors)))
	sb.WriteString("Impacted services:\n")
	for _, m := range downMonitors {
		if m.PreviousUptime != "" {
			sb.WriteString(fmt.Sprintf("• %s (was up %s)\n", m.Name, m.PreviousUptime))
		} else {
			sb.WriteString(fmt.Sprintf("• %s\n", m.Name))
		}
	}
	// Store the notification in the DB; errors deliberately ignored (best effort).
	_, _ = s.NotificationService.Create(
		models.NotificationTypeError,
		title,
		sb.String(),
	)
	// Collect monitor IDs for event tracking.
	monitorIDs := make([]string, len(downMonitors))
	for i, m := range downMonitors {
		monitorIDs[i] = m.ID
	}
	monitorIDsJSON, _ := json.Marshal(monitorIDs)
	// Record the notification event.
	event := models.UptimeNotificationEvent{
		HostID:     host.ID,
		EventType:  "down",
		MonitorIDs: string(monitorIDsJSON),
		Message:    sb.String(),
		SentAt:     time.Now(),
	}
	s.DB.Create(&event)
	// Update host notification tracking (used for rate limiting).
	host.LastNotifiedDown = time.Now()
	host.NotifiedServiceCount = len(downMonitors)
	s.DB.Save(host)
	// Send the external notification.
	data := map[string]any{
		"HostName":     host.Name,
		"HostIP":       host.Host,
		"Status":       "DOWN",
		"ServiceCount": len(downMonitors),
		"Services":     downMonitors,
		"Time":         time.Now().Format(time.RFC1123),
	}
	s.NotificationService.SendExternal(context.Background(), "uptime", title, sb.String(), data)
	logger.Log().WithField("host_name", host.Name).WithField("service_count", len(downMonitors)).Info("Sent consolidated DOWN notification")
}
// CheckMonitor is the exported entry point for on-demand checks of a single
// monitor; it delegates to the internal checkMonitor implementation.
func (s *UptimeService) CheckMonitor(monitor models.UptimeMonitor) {
	s.checkMonitor(monitor)
}
// checkMonitor performs one availability check for a monitor, records a raw
// heartbeat, applies retry-based status debouncing, persists the monitor,
// and on a genuine status transition queues a batched DOWN notification or
// sends a recovery notification. Supported types: "http"/"https" (GET with
// URL validation) and "tcp" (plain dial).
func (s *UptimeService) checkMonitor(monitor models.UptimeMonitor) {
	start := time.Now()
	success := false
	var msg string
	switch monitor.Type {
	case "http", "https":
		validatedURL, err := security.ValidateExternalURL(
			monitor.URL,
			// Uptime monitors are an explicit admin-configured feature and commonly
			// target loopback in local/dev setups (and in unit tests).
			security.WithAllowLocalhost(),
			security.WithAllowHTTP(),
			security.WithTimeout(3*time.Second),
		)
		if err != nil {
			msg = fmt.Sprintf("security validation failed: %s", err.Error())
			break
		}
		client := network.NewSafeHTTPClient(
			network.WithTimeout(10*time.Second),
			network.WithDialTimeout(5*time.Second),
			// Explicit redirect policy per call site: disabled.
			network.WithMaxRedirects(0),
			// Uptime monitors are an explicit admin-configured feature and commonly
			// target loopback in local/dev setups (and in unit tests).
			network.WithAllowLocalhost(),
		)
		ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
		defer cancel()
		req, err := http.NewRequestWithContext(ctx, http.MethodGet, validatedURL, http.NoBody)
		if err != nil {
			msg = err.Error()
			break
		}
		resp, err := client.Do(req)
		if err == nil {
			defer func() {
				if err := resp.Body.Close(); err != nil {
					logger.Log().WithError(err).Warn("failed to close uptime service response body")
				}
			}()
			// Accept 2xx, 3xx, and 401/403 (Unauthorized/Forbidden often means
			// the service is up but protected).
			if (resp.StatusCode >= 200 && resp.StatusCode < 400) || resp.StatusCode == 401 || resp.StatusCode == 403 {
				success = true
				msg = fmt.Sprintf("HTTP %d", resp.StatusCode)
			} else {
				msg = fmt.Sprintf("HTTP %d", resp.StatusCode)
			}
		} else {
			msg = err.Error()
		}
	case "tcp":
		conn, err := net.DialTimeout("tcp", monitor.URL, 10*time.Second)
		if err == nil {
			if err := conn.Close(); err != nil {
				logger.Log().WithError(err).Warn("failed to close tcp connection")
			}
			success = true
			msg = "Connection successful"
		} else {
			msg = err.Error()
		}
	default:
		msg = "Unknown monitor type"
	}
	latency := time.Since(start).Milliseconds()
	// Determine the new status based on success and retries.
	newStatus := monitor.Status
	if success {
		// If it was down or pending, it's now up immediately.
		if monitor.Status != "up" {
			newStatus = "up"
		}
		// Reset failure count on success.
		monitor.FailureCount = 0
	} else {
		// Increment failure count.
		monitor.FailureCount++
		// Only mark down after MaxRetries consecutive failures.
		// Default MaxRetries to 3 if 0 (legacy records).
		maxRetries := monitor.MaxRetries
		if maxRetries <= 0 {
			maxRetries = 3
		}
		if monitor.FailureCount >= maxRetries {
			newStatus = "down"
		}
	}
	// Record a heartbeat with the raw (undebounced) result.
	heartbeatStatus := "down"
	if success {
		heartbeatStatus = "up"
	}
	heartbeat := models.UptimeHeartbeat{
		MonitorID: monitor.ID,
		Status:    heartbeatStatus,
		Latency:   latency,
		Message:   msg,
	}
	_ = s.DB.Create(&heartbeat).Error
	// Update monitor status; "pending" -> X is initial discovery, not a change.
	oldStatus := monitor.Status
	statusChanged := oldStatus != newStatus && oldStatus != "pending"
	// Compute how long the previous status lasted, for notification text.
	var durationStr string
	if statusChanged && !monitor.LastStatusChange.IsZero() {
		duration := time.Since(monitor.LastStatusChange)
		durationStr = formatDuration(duration)
	}
	monitor.Status = newStatus
	monitor.LastCheck = time.Now()
	monitor.Latency = latency
	if statusChanged {
		monitor.LastStatusChange = time.Now()
	}
	s.DB.Save(&monitor)
	// Fire notifications only on a genuine status transition.
	if statusChanged {
		switch newStatus {
		case "down":
			// Queue for batched notification.
			s.queueDownNotification(monitor, msg, durationStr)
		case "up":
			// Send the recovery notification immediately.
			s.sendRecoveryNotification(monitor, durationStr)
		}
	}
}
// queueDownNotification adds a down monitor to the per-host batch queue.
// The first monitor for a host creates the batch and arms a timer that
// flushes it after batchWindow; subsequent monitors for the same host join
// the existing batch without resetting the timer.
func (s *UptimeService) queueDownNotification(monitor models.UptimeMonitor, reason, previousUptime string) {
	s.notificationMutex.Lock()
	defer s.notificationMutex.Unlock()
	hostID := ""
	if monitor.UptimeHostID != nil {
		hostID = *monitor.UptimeHostID
	}
	// Resolve a display name for the host; fall back to the upstream host.
	var uptimeHost models.UptimeHost
	hostName := monitor.UpstreamHost
	if hostID != "" {
		if err := s.DB.First(&uptimeHost, "id = ?", hostID).Error; err == nil {
			hostName = uptimeHost.Name
		}
	}
	info := monitorDownInfo{
		ID:             monitor.ID,
		Name:           monitor.Name,
		URL:            monitor.URL,
		Message:        reason,
		PreviousUptime: previousUptime,
	}
	if pending, exists := s.pendingNotifications[hostID]; exists {
		// Join the existing batch; its timer keeps its original deadline.
		pending.downMonitors = append(pending.downMonitors, info)
		logger.Log().WithField("monitor", util.SanitizeForLog(monitor.Name)).WithField("host", util.SanitizeForLog(hostName)).WithField("count", len(pending.downMonitors)).Info("Added to pending notification batch")
	} else {
		// Start a new batch and arm its flush timer.
		pending := &pendingHostNotification{
			hostID:       hostID,
			hostName:     hostName,
			downMonitors: []monitorDownInfo{info},
			createdAt:    time.Now(),
		}
		pending.timer = time.AfterFunc(s.batchWindow, func() {
			s.flushPendingNotification(hostID)
		})
		s.pendingNotifications[hostID] = pending
		logger.Log().WithField("host", util.SanitizeForLog(hostName)).WithField("monitor", util.SanitizeForLog(monitor.Name)).Info("Created pending notification batch")
	}
}
// flushPendingNotification sends the batched DOWN notification for hostID, if
// one is still pending. Safe to call from both the batch timer and
// FlushPendingNotifications: the first caller removes the entry from the map
// under the mutex, so duplicate invocations are no-ops.
func (s *UptimeService) flushPendingNotification(hostID string) {
	s.notificationMutex.Lock()
	pending, exists := s.pendingNotifications[hostID]
	if !exists {
		s.notificationMutex.Unlock()
		return
	}
	delete(s.pendingNotifications, hostID)
	s.notificationMutex.Unlock()
	// Stop the timer in case we were invoked manually before it fired.
	if pending.timer != nil {
		pending.timer.Stop()
	}
	if len(pending.downMonitors) == 0 {
		return
	}
	// Build the notification; single-service and multi-service bodies differ.
	var title string
	var sb strings.Builder
	if len(pending.downMonitors) == 1 {
		// Single service down.
		m := pending.downMonitors[0]
		title = fmt.Sprintf("🔴 %s is DOWN", m.Name)
		sb.WriteString(fmt.Sprintf("Service: %s\n", m.Name))
		sb.WriteString("Status: DOWN\n")
		sb.WriteString(fmt.Sprintf("Time: %s\n", time.Now().Format(time.RFC1123)))
		if m.PreviousUptime != "" {
			sb.WriteString(fmt.Sprintf("Previous Uptime: %s\n", m.PreviousUptime))
		}
		sb.WriteString(fmt.Sprintf("Reason: %s\n", m.Message))
	} else {
		// Multiple services down on the same host.
		title = fmt.Sprintf("🔴 %d Services DOWN on %s", len(pending.downMonitors), pending.hostName)
		sb.WriteString(fmt.Sprintf("Host: %s\n", pending.hostName))
		sb.WriteString(fmt.Sprintf("Time: %s\n", time.Now().Format(time.RFC1123)))
		sb.WriteString(fmt.Sprintf("Services affected: %d\n\n", len(pending.downMonitors)))
		sb.WriteString("Impacted services:\n")
		for _, m := range pending.downMonitors {
			if m.PreviousUptime != "" {
				sb.WriteString(fmt.Sprintf("• %s - %s (was up %s)\n", m.Name, m.Message, m.PreviousUptime))
			} else {
				sb.WriteString(fmt.Sprintf("• %s - %s\n", m.Name, m.Message))
			}
		}
	}
	// Store in the DB (best effort; errors deliberately ignored).
	_, _ = s.NotificationService.Create(
		models.NotificationTypeError,
		title,
		sb.String(),
	)
	// Send the external notification.
	data := map[string]any{
		"HostName":     pending.hostName,
		"Status":       "DOWN",
		"ServiceCount": len(pending.downMonitors),
		"Services":     pending.downMonitors,
		"Time":         time.Now().Format(time.RFC1123),
	}
	s.NotificationService.SendExternal(context.Background(), "uptime", title, sb.String(), data)
	logger.Log().WithField("count", len(pending.downMonitors)).WithField("host", util.SanitizeForLog(pending.hostName)).Info("Sent batched DOWN notification")
}
// sendRecoveryNotification records and broadcasts a notification when a
// monitored service transitions back to UP. downtime, when non-empty, is a
// pre-formatted human-readable duration the service spent down.
func (s *UptimeService) sendRecoveryNotification(monitor models.UptimeMonitor, downtime string) {
	title := fmt.Sprintf("🟢 %s is UP", monitor.Name)

	var body strings.Builder
	body.WriteString(fmt.Sprintf("Service: %s\n", monitor.Name))
	body.WriteString("Status: UP\n")
	body.WriteString(fmt.Sprintf("Time: %s\n", time.Now().Format(time.RFC1123)))
	if downtime != "" {
		body.WriteString(fmt.Sprintf("Downtime: %s\n", downtime))
	}

	// Persist the in-app notification (best effort; errors ignored).
	_, _ = s.NotificationService.Create(
		models.NotificationTypeSuccess,
		title,
		body.String(),
	)

	// Broadcast via configured external channels.
	payload := map[string]any{
		"Name":     monitor.Name,
		"Status":   "UP",
		"Downtime": downtime,
		"Time":     time.Now().Format(time.RFC1123),
		"URL":      monitor.URL,
	}
	s.NotificationService.SendExternal(context.Background(), "uptime", title, body.String(), payload)
}
// FlushPendingNotifications immediately dispatches every batched DOWN
// notification still waiting on its timer. Useful for testing and graceful
// shutdown.
func (s *UptimeService) FlushPendingNotifications() {
	// Snapshot the keys under the lock, then flush outside it so that
	// flushPendingNotification can re-acquire the mutex itself.
	s.notificationMutex.Lock()
	ids := make([]string, 0, len(s.pendingNotifications))
	for id := range s.pendingNotifications {
		ids = append(ids, id)
	}
	s.notificationMutex.Unlock()

	for _, id := range ids {
		s.flushPendingNotification(id)
	}
}
// SyncMonitorForHost updates the uptime monitor linked to a specific proxy
// host. Call this when a proxy host is edited to keep its monitor in sync.
// Returns nil when no monitor exists for the host (one is not created).
func (s *UptimeService) SyncMonitorForHost(hostID uint) error {
	var host models.ProxyHost
	if err := s.DB.First(&host, hostID).Error; err != nil {
		return err
	}

	var monitor models.UptimeMonitor
	err := s.DB.Where("proxy_host_id = ?", hostID).First(&monitor).Error
	if errors.Is(err, gorm.ErrRecordNotFound) {
		return nil // no monitor to sync
	}
	if err != nil {
		return err
	}

	// The first configured domain drives the public URL and fallback name.
	firstDomain, _, _ := strings.Cut(host.DomainNames, ",")
	firstDomain = strings.TrimSpace(firstDomain)

	scheme := "http"
	if host.SSLForced {
		scheme = "https"
	}

	name := host.Name
	if name == "" {
		name = firstDomain
	}

	monitor.Name = name
	monitor.URL = fmt.Sprintf("%s://%s", scheme, firstDomain)
	monitor.UpstreamHost = host.ForwardHost
	return s.DB.Save(&monitor).Error
}
// CRUD for Monitors

// ListMonitors returns all uptime monitors ordered alphabetically by name.
func (s *UptimeService) ListMonitors() ([]models.UptimeMonitor, error) {
	var all []models.UptimeMonitor
	res := s.DB.Order("name ASC").Find(&all)
	return all, res.Error
}
// GetMonitorByID fetches a single monitor by its primary key.
func (s *UptimeService) GetMonitorByID(id string) (*models.UptimeMonitor, error) {
	var m models.UptimeMonitor
	err := s.DB.First(&m, "id = ?", id).Error
	if err != nil {
		return nil, err
	}
	return &m, nil
}
// GetMonitorHistory returns up to limit heartbeats for a monitor, newest first.
func (s *UptimeService) GetMonitorHistory(id string, limit int) ([]models.UptimeHeartbeat, error) {
	var beats []models.UptimeHeartbeat
	res := s.DB.
		Where("monitor_id = ?", id).
		Order("created_at desc").
		Limit(limit).
		Find(&beats)
	return beats, res.Error
}
// UpdateMonitor applies a whitelisted subset of updates to a monitor and
// returns it. Only user-tunable fields may change through this path, so
// callers cannot clobber values managed by SyncMonitors.
func (s *UptimeService) UpdateMonitor(id string, updates map[string]any) (*models.UptimeMonitor, error) {
	var monitor models.UptimeMonitor
	if err := s.DB.First(&monitor, "id = ?", id).Error; err != nil {
		return nil, err
	}
	// Whitelist of fields this endpoint is allowed to modify. Extend with
	// care: other columns are owned by the SyncMonitors logic.
	allowed := make(map[string]any)
	for _, field := range []string{"max_retries", "interval", "enabled"} {
		if v, ok := updates[field]; ok {
			allowed[field] = v
		}
	}
	if err := s.DB.Model(&monitor).Updates(allowed).Error; err != nil {
		return nil, err
	}
	return &monitor, nil
}
// DeleteMonitor removes a monitor and its heartbeats. The two deletes run in
// a single transaction so a failure cannot leave orphaned heartbeats or a
// half-deleted monitor (previously they were two independent operations).
// The parent UptimeHost is intentionally NOT deleted here even if no other
// monitors reference it; host cleanup is left to a separate process/endpoint.
func (s *UptimeService) DeleteMonitor(id string) error {
	// Verify the monitor exists first so callers get a not-found error.
	var monitor models.UptimeMonitor
	if err := s.DB.First(&monitor, "id = ?", id).Error; err != nil {
		return err
	}
	return s.DB.Transaction(func(tx *gorm.DB) error {
		// Delete heartbeats first, then the monitor itself.
		if err := tx.Where("monitor_id = ?", id).Delete(&models.UptimeHeartbeat{}).Error; err != nil {
			return err
		}
		return tx.Delete(&monitor).Error
	})
}