diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 24292309..c825feee 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -39,24 +39,33 @@ repos: language: system files: '\.go$' pass_filenames: false + + # === MANUAL/CI-ONLY HOOKS === + # These are slow and should only run on-demand or in CI + # Run manually with: pre-commit run golangci-lint --all-files - id: go-test-race - name: Go Test Race + name: Go Test Race (Manual) entry: bash -c 'cd backend && go test -race ./...' language: system files: '\.go$' pass_filenames: false + stages: [manual] # Only runs when explicitly called + - id: golangci-lint - name: GolangCI-Lint - entry: bash -c 'cd backend && docker run --rm -v $(pwd):/app -w /app golangci/golangci-lint:latest golangci-lint run -v' + name: GolangCI-Lint (Manual) + entry: bash -c 'cd backend && docker run --rm -v $(pwd):/app:ro -w /app golangci/golangci-lint:latest golangci-lint run -v' language: system files: '\.go$' pass_filenames: false + stages: [manual] # Only runs when explicitly called + - id: hadolint - name: Hadolint Dockerfile Check + name: Hadolint Dockerfile Check (Manual) entry: bash -c 'docker run --rm -i hadolint/hadolint < Dockerfile' language: system files: 'Dockerfile' pass_filenames: false + stages: [manual] # Only runs when explicitly called - id: frontend-type-check name: Frontend TypeScript Check entry: bash -c 'cd frontend && npm run type-check' @@ -79,10 +88,10 @@ repos: verbose: true - id: security-scan - name: Security Vulnerability Scan + name: Security Vulnerability Scan (Manual) entry: scripts/security-scan.sh language: script files: '(\.go$|go\.mod$|go\.sum$)' pass_filenames: false verbose: true - stages: [pre-commit, manual] + stages: [manual] # Only runs when explicitly called diff --git a/.vscode/tasks.json b/.vscode/tasks.json index c3eb08df..37cad1f3 100644 --- a/.vscode/tasks.json +++ b/.vscode/tasks.json @@ -1,7 +1,7 @@ { "version": "2.0.0", "tasks": [ - { + { "label": "Git Remove Cached", "type": "shell", "command": "git rm -r --cached .", @@ -13,6 +13,53 @@ "command": "${workspaceFolder}/.venv/bin/pre-commit run --all-files", "group": "test" }, + // === MANUAL LINT/SCAN TASKS === + // These are the slow hooks removed from automatic pre-commit + { + "label": "Lint: GolangCI-Lint", + "type": "shell", + "command": "cd backend && docker run --rm -v $(pwd):/app:ro -w /app golangci/golangci-lint:latest golangci-lint run -v", + "group": "test", + "problemMatcher": ["$go"], + "presentation": { + "reveal": "always", + "panel": "new" + } + }, + { + "label": "Lint: Go Race Detector", + "type": "shell", + "command": "cd backend && go test -race ./...", + "group": "test", + "problemMatcher": ["$go"], + "presentation": { + "reveal": "always", + "panel": "new" + } + }, + { + "label": "Lint: Hadolint (Dockerfile)", + "type": "shell", + "command": "docker run --rm -i hadolint/hadolint < Dockerfile", + "group": "test", + "problemMatcher": [], + "presentation": { + "reveal": "always", + "panel": "new" + } + }, + { + "label": "Lint: Run All Manual Checks", + "type": "shell", + "command": "${workspaceFolder}/.venv/bin/pre-commit run --all-files --hook-stage manual", + "group": "test", + "problemMatcher": [], + "presentation": { + "reveal": "always", + "panel": "new" + } + }, + // === BUILD & RUN TASKS === { "label": "Build & Run Local Docker", "type": "shell", diff --git a/backend/internal/api/routes/routes.go b/backend/internal/api/routes/routes.go index d8527e68..8891ad3a 100644 --- a/backend/internal/api/routes/routes.go +++ b/backend/internal/api/routes/routes.go @@ -33,6 +33,8 @@ func Register(router *gin.Engine, db *gorm.DB, cfg config.Config) error { &models.NotificationProvider{}, &models.UptimeMonitor{}, &models.UptimeHeartbeat{}, + &models.UptimeHost{}, + &models.UptimeNotificationEvent{}, &models.Domain{}, ); err != nil { return fmt.Errorf("auto migrate: %w", err) diff --git a/backend/internal/models/uptime.go b/backend/internal/models/uptime.go index c0211d15..e1f6168c 100644 --- a/backend/internal/models/uptime.go +++ b/backend/internal/models/uptime.go @@ -11,10 +11,12 @@ type UptimeMonitor struct { ID string `gorm:"primaryKey" json:"id"` ProxyHostID *uint `json:"proxy_host_id"` // Optional link to proxy host RemoteServerID *uint `json:"remote_server_id"` // Optional link to remote server + UptimeHostID *string `json:"uptime_host_id"` // Link to parent host for grouping Name string `json:"name"` Type string `json:"type"` // http, tcp, ping URL string `json:"url"` - Interval int `json:"interval"` // seconds + UpstreamHost string `json:"upstream_host"` // The actual backend host/IP (for grouping) + Interval int `json:"interval"` // seconds Enabled bool `json:"enabled"` CreatedAt time.Time `json:"created_at"` UpdatedAt time.Time `json:"updated_at"` @@ -26,6 +28,10 @@ type UptimeMonitor struct { FailureCount int `json:"failure_count"` LastStatusChange time.Time `json:"last_status_change"` MaxRetries int `json:"max_retries" gorm:"default:3"` + + // Notification tracking + LastNotifiedDown time.Time `json:"last_notified_down"` // Prevent duplicate notifications + NotifiedInBatch bool `json:"notified_in_batch"` // Was this included in a batch notification? } type UptimeHeartbeat struct { diff --git a/backend/internal/models/uptime_host.go b/backend/internal/models/uptime_host.go new file mode 100644 index 00000000..788b4ffc --- /dev/null +++ b/backend/internal/models/uptime_host.go @@ -0,0 +1,56 @@ +package models + +import ( + "time" + + "github.com/google/uuid" + "gorm.io/gorm" +) + +// UptimeHost represents a unique upstream host/IP that may have multiple services. +// This enables host-level health checks to avoid notification storms when a whole server goes down. +type UptimeHost struct { + ID string `gorm:"primaryKey" json:"id"` + Host string `json:"host" gorm:"uniqueIndex;not null"` // IP address or hostname + Name string `json:"name"` // Friendly name (auto-generated or from first service) + Status string `json:"status"` // up, down, pending + LastCheck time.Time `json:"last_check"` + Latency int64 `json:"latency"` // ms for ping/TCP check + + // Notification tracking + LastNotifiedDown time.Time `json:"last_notified_down"` // When we last sent DOWN notification + LastNotifiedUp time.Time `json:"last_notified_up"` // When we last sent UP notification + NotifiedServiceCount int `json:"notified_service_count"` // Number of services in last notification + LastStatusChange time.Time `json:"last_status_change"` // When status last changed + + CreatedAt time.Time `json:"created_at"` + UpdatedAt time.Time `json:"updated_at"` +} + +func (h *UptimeHost) BeforeCreate(tx *gorm.DB) (err error) { + if h.ID == "" { + h.ID = uuid.New().String() + } + if h.Status == "" { + h.Status = "pending" + } + return +} + +// UptimeNotificationEvent tracks notification batches to prevent duplicates +type UptimeNotificationEvent struct { + ID string `gorm:"primaryKey" json:"id"` + HostID string `json:"host_id" gorm:"index"` + EventType string `json:"event_type"` // down, up, partial_recovery + MonitorIDs string `json:"monitor_ids"` // JSON array of monitor IDs included in this notification + Message string `json:"message"` + SentAt time.Time `json:"sent_at"` + CreatedAt time.Time `json:"created_at"` +} + +func (e *UptimeNotificationEvent) BeforeCreate(tx *gorm.DB) (err error) { + if e.ID == "" { + e.ID = uuid.New().String() + } + return +} diff --git a/backend/internal/services/uptime_service.go b/backend/internal/services/uptime_service.go index 5a462133..c316be21 100644 --- a/backend/internal/services/uptime_service.go +++ b/backend/internal/services/uptime_service.go @@ -1,11 +1,14 @@ package services import ( + "encoding/json" "fmt" "log" "net" "net/http" + "net/url" "strings" + "sync" "time" "github.com/Wikid82/CaddyProxyManagerPlus/backend/internal/models" @@ -15,16 +18,85 @@ import ( type UptimeService struct { DB *gorm.DB NotificationService *NotificationService + // Batching: track pending notifications + pendingNotifications map[string]*pendingHostNotification + notificationMutex sync.Mutex + batchWindow time.Duration +} + +type pendingHostNotification struct { + hostID string + hostName string + downMonitors []monitorDownInfo + timer *time.Timer + createdAt time.Time +} + +type monitorDownInfo struct { + ID string + Name string + URL string + Message string + PreviousUptime string } func NewUptimeService(db *gorm.DB, ns *NotificationService) *UptimeService { return &UptimeService{ - DB: db, - NotificationService: ns, + DB: db, + NotificationService: ns, + pendingNotifications: make(map[string]*pendingHostNotification), + batchWindow: 30 * time.Second, // Wait 30 seconds to batch notifications } } -// SyncMonitors ensures every ProxyHost has a corresponding UptimeMonitor +// extractPort extracts the port from a URL or host:port string +func extractPort(urlStr string) string { + // Try parsing as URL first + if u, err := url.Parse(urlStr); err == nil && u.Host != "" { + port := u.Port() + if port != "" { + return port + } + // Default ports + if u.Scheme == "https" { + return "443" + } + if u.Scheme == "http" { + return "80" + } + } + + // Try as host:port + if _, port, err := net.SplitHostPort(urlStr); err == nil { + return port + } + + return "" +} + +// formatDuration formats a duration in a human-readable way +func formatDuration(d time.Duration) string { + d = d.Round(time.Second) + + days := int(d.Hours() / 24) + hours := int(d.Hours()) % 24 + minutes := int(d.Minutes()) % 60 + seconds := int(d.Seconds()) % 60 + + if days > 0 { + return fmt.Sprintf("%dd %dh %dm", days, hours, minutes) + } + if hours > 0 { + return fmt.Sprintf("%dh %dm %ds", hours, minutes, seconds) + } + if minutes > 0 { + return fmt.Sprintf("%dm %ds", minutes, seconds) + } + return fmt.Sprintf("%ds", seconds) +} + +// SyncMonitors ensures every ProxyHost and RemoteServer has a corresponding UptimeMonitor +// and that UptimeHosts are created for grouping func (s *UptimeService) SyncMonitors() error { var hosts []models.ProxyHost if err := s.DB.Find(&hosts).Error; err != nil { @@ -49,6 +121,9 @@ func (s *UptimeService) SyncMonitors() error { publicURL := fmt.Sprintf("%s://%s", scheme, firstDomain) internalURL := fmt.Sprintf("%s:%d", host.ForwardHost, host.ForwardPort) + // The upstream host for grouping is the ForwardHost + upstreamHost := host.ForwardHost + switch err { case gorm.ErrRecordNotFound: // Create new monitor @@ -57,14 +132,19 @@ func (s *UptimeService) SyncMonitors() error { name = firstDomain } + // Find or create UptimeHost + uptimeHostID := s.ensureUptimeHost(upstreamHost, name) + monitor = models.UptimeMonitor{ - ProxyHostID: &host.ID, - Name: name, - Type: "http", // Check public access - URL: publicURL, - Interval: 60, - Enabled: true, - Status: "pending", + ProxyHostID: &host.ID, + UptimeHostID: &uptimeHostID, + Name: name, + Type: "http", // Check public access + URL: publicURL, + UpstreamHost: upstreamHost, + Interval: 60, + Enabled: true, + Status: "pending", } if err := s.DB.Create(&monitor).Error; err != nil { log.Printf("Failed to create monitor for host %d: %v", host.ID, err) @@ -75,27 +155,44 @@ func (s *UptimeService) SyncMonitors() error { if newName == "" { newName = firstDomain } + needsSave := false + if monitor.Name != newName { monitor.Name = newName - s.DB.Save(&monitor) - log.Printf("Updated monitor name for host %d to: %s", host.ID, newName) + needsSave = true + } + + // Ensure upstream host is set for grouping + if monitor.UpstreamHost == "" || monitor.UpstreamHost != upstreamHost { + monitor.UpstreamHost = upstreamHost + needsSave = true + } + + // Ensure UptimeHost link exists + if monitor.UptimeHostID == nil { + uptimeHostID := s.ensureUptimeHost(upstreamHost, newName) + monitor.UptimeHostID = &uptimeHostID + needsSave = true } // Update existing monitor if it looks like it's using the old default (TCP to internal upstream) - // We check if it matches the internal upstream URL to avoid overwriting custom user settings if monitor.Type == "tcp" && monitor.URL == internalURL { monitor.Type = "http" monitor.URL = publicURL - s.DB.Save(&monitor) + needsSave = true log.Printf("Migrated monitor for host %d to check public URL: %s", host.ID, publicURL) } // Upgrade to HTTPS if SSL is forced and we are currently checking HTTP if host.SSLForced && strings.HasPrefix(monitor.URL, "http://") { monitor.URL = strings.Replace(monitor.URL, "http://", "https://", 1) - s.DB.Save(&monitor) + needsSave = true log.Printf("Upgraded monitor for host %d to HTTPS: %s", host.ID, monitor.URL) } + + if needsSave { + s.DB.Save(&monitor) + } } } @@ -117,13 +214,21 @@ func (s *UptimeService) SyncMonitors() error { targetURL = fmt.Sprintf("%s://%s:%d", server.Scheme, server.Host, server.Port) } + // The upstream host for grouping + upstreamHost := server.Host + switch err { case gorm.ErrRecordNotFound: + // Find or create UptimeHost + uptimeHostID := s.ensureUptimeHost(upstreamHost, server.Name) + monitor = models.UptimeMonitor{ RemoteServerID: &server.ID, + UptimeHostID: &uptimeHostID, Name: server.Name, Type: targetType, URL: targetURL, + UpstreamHost: upstreamHost, Interval: 60, Enabled: server.Enabled, Status: "pending", @@ -132,17 +237,37 @@ func (s *UptimeService) SyncMonitors() error { log.Printf("Failed to create monitor for remote server %d: %v", server.ID, err) } case nil: + needsSave := false + if monitor.Name != server.Name { monitor.Name = server.Name - s.DB.Save(&monitor) + needsSave = true } + + // Ensure upstream host is set for grouping + if monitor.UpstreamHost == "" || monitor.UpstreamHost != upstreamHost { + monitor.UpstreamHost = upstreamHost + needsSave = true + } + + // Ensure UptimeHost link exists + if monitor.UptimeHostID == nil { + uptimeHostID := s.ensureUptimeHost(upstreamHost, server.Name) + monitor.UptimeHostID = &uptimeHostID + needsSave = true + } + if monitor.URL != targetURL || monitor.Type != targetType { monitor.URL = targetURL monitor.Type = targetType - s.DB.Save(&monitor) + needsSave = true } if monitor.Enabled != server.Enabled { monitor.Enabled = server.Enabled + needsSave = true + } + + if needsSave { s.DB.Save(&monitor) } } @@ -151,17 +276,248 @@ func (s *UptimeService) SyncMonitors() error { return nil } -// CheckAll runs checks for all enabled monitors +// ensureUptimeHost finds or creates an UptimeHost for the given host string +func (s *UptimeService) ensureUptimeHost(host, defaultName string) string { + var uptimeHost models.UptimeHost + err := s.DB.Where("host = ?", host).First(&uptimeHost).Error + + if err == gorm.ErrRecordNotFound { + uptimeHost = models.UptimeHost{ + Host: host, + Name: defaultName, + Status: "pending", + } + if err := s.DB.Create(&uptimeHost).Error; err != nil { + log.Printf("Failed to create UptimeHost for %s: %v", host, err) + return "" + } + log.Printf("Created UptimeHost for %s", host) + } + + return uptimeHost.ID +} + +// CheckAll runs checks for all enabled monitors with host-level pre-check func (s *UptimeService) CheckAll() { + // First, check all UptimeHosts + s.checkAllHosts() + var monitors []models.UptimeMonitor if err := s.DB.Where("enabled = ?", true).Find(&monitors).Error; err != nil { log.Printf("Failed to fetch monitors: %v", err) return } + // Group monitors by UptimeHost + hostMonitors := make(map[string][]models.UptimeMonitor) for _, monitor := range monitors { - go s.checkMonitor(monitor) + hostID := "" + if monitor.UptimeHostID != nil { + hostID = *monitor.UptimeHostID + } + hostMonitors[hostID] = append(hostMonitors[hostID], monitor) } + + // Check each host's monitors + for hostID, monitors := range hostMonitors { + // If host is down, mark all monitors as down without individual checks + if hostID != "" { + var uptimeHost models.UptimeHost + if err := s.DB.First(&uptimeHost, "id = ?", hostID).Error; err == nil { + if uptimeHost.Status == "down" { + s.markHostMonitorsDown(monitors, &uptimeHost) + continue + } + } + } + + // Host is up, check individual monitors + for _, monitor := range monitors { + go s.checkMonitor(monitor) + } + } +} + +// checkAllHosts performs TCP connectivity check on all UptimeHosts +func (s *UptimeService) checkAllHosts() { + var hosts []models.UptimeHost + if err := s.DB.Find(&hosts).Error; err != nil { + log.Printf("Failed to fetch uptime hosts: %v", err) + return + } + + for i := range hosts { + s.checkHost(&hosts[i]) + } +} + +// checkHost performs a basic TCP connectivity check to determine if the host is reachable +func (s *UptimeService) checkHost(host *models.UptimeHost) { + start := time.Now() + + // Get common ports for this host from its monitors + var monitors []models.UptimeMonitor + s.DB.Where("uptime_host_id = ?", host.ID).Find(&monitors) + + if len(monitors) == 0 { + return + } + + // Try to connect to any of the monitor ports + success := false + var msg string + + for _, monitor := range monitors { + port := extractPort(monitor.URL) + if port == "" { + continue + } + + // Use net.JoinHostPort for IPv6 compatibility + addr := net.JoinHostPort(host.Host, port) + conn, err := net.DialTimeout("tcp", addr, 5*time.Second) + if err == nil { + _ = conn.Close() + success = true + msg = fmt.Sprintf("TCP connection to %s successful", addr) + break + } + msg = err.Error() + } + + latency := time.Since(start).Milliseconds() + oldStatus := host.Status + newStatus := "down" + if success { + newStatus = "up" + } + + statusChanged := oldStatus != newStatus && oldStatus != "pending" + + host.Status = newStatus + host.LastCheck = time.Now() + host.Latency = latency + + if statusChanged { + host.LastStatusChange = time.Now() + log.Printf("Host %s (%s) status changed: %s -> %s (%s)", host.Name, host.Host, oldStatus, newStatus, msg) + } + + s.DB.Save(host) +} + +// markHostMonitorsDown marks all monitors for a down host as down and sends a single notification +func (s *UptimeService) markHostMonitorsDown(monitors []models.UptimeMonitor, host *models.UptimeHost) { + downMonitors := []monitorDownInfo{} + + for i := range monitors { + monitor := &monitors[i] + oldStatus := monitor.Status + if oldStatus == "down" { + continue // Already down, no need to update + } + + // Calculate previous uptime + var durationStr string + if !monitor.LastStatusChange.IsZero() { + duration := time.Since(monitor.LastStatusChange) + durationStr = formatDuration(duration) + } + + monitor.Status = "down" + monitor.LastCheck = time.Now() + monitor.FailureCount = monitor.MaxRetries // Max out failure count + if oldStatus != "pending" { + monitor.LastStatusChange = time.Now() + } + monitor.NotifiedInBatch = true + s.DB.Save(monitor) + + // Record heartbeat + heartbeat := models.UptimeHeartbeat{ + MonitorID: monitor.ID, + Status: "down", + Latency: 0, + Message: "Host unreachable", + } + s.DB.Create(&heartbeat) + + if oldStatus != "pending" && oldStatus != "down" { + downMonitors = append(downMonitors, monitorDownInfo{ + ID: monitor.ID, + Name: monitor.Name, + URL: monitor.URL, + Message: "Host unreachable", + PreviousUptime: durationStr, + }) + } + } + + // Send consolidated notification if any monitors transitioned to down + if len(downMonitors) > 0 && time.Since(host.LastNotifiedDown) > 5*time.Minute { + s.sendHostDownNotification(host, downMonitors) + } +} + +// sendHostDownNotification sends a single consolidated notification for a down host +func (s *UptimeService) sendHostDownNotification(host *models.UptimeHost, downMonitors []monitorDownInfo) { + title := fmt.Sprintf("🔴 Host %s is DOWN (%d services affected)", host.Name, len(downMonitors)) + + var sb strings.Builder + sb.WriteString(fmt.Sprintf("Host: %s (%s)\n", host.Name, host.Host)) + sb.WriteString(fmt.Sprintf("Time: %s\n", time.Now().Format(time.RFC1123))) + sb.WriteString(fmt.Sprintf("Services affected: %d\n\n", len(downMonitors))) + + sb.WriteString("Impacted services:\n") + for _, m := range downMonitors { + if m.PreviousUptime != "" { + sb.WriteString(fmt.Sprintf("• %s (was up %s)\n", m.Name, m.PreviousUptime)) + } else { + sb.WriteString(fmt.Sprintf("• %s\n", m.Name)) + } + } + + // Store notification in DB + _, _ = s.NotificationService.Create( + models.NotificationTypeError, + title, + sb.String(), + ) + + // Collect monitor IDs for tracking + monitorIDs := make([]string, len(downMonitors)) + for i, m := range downMonitors { + monitorIDs[i] = m.ID + } + monitorIDsJSON, _ := json.Marshal(monitorIDs) + + // Record notification event + event := models.UptimeNotificationEvent{ + HostID: host.ID, + EventType: "down", + MonitorIDs: string(monitorIDsJSON), + Message: sb.String(), + SentAt: time.Now(), + } + s.DB.Create(&event) + + // Update host notification tracking + host.LastNotifiedDown = time.Now() + host.NotifiedServiceCount = len(downMonitors) + s.DB.Save(host) + + // Send external notification + data := map[string]interface{}{ + "HostName": host.Name, + "HostIP": host.Host, + "Status": "DOWN", + "ServiceCount": len(downMonitors), + "Services": downMonitors, + "Time": time.Now().Format(time.RFC1123), + } + s.NotificationService.SendExternal("uptime", title, sb.String(), data) + + log.Printf("Sent consolidated DOWN notification for host %s with %d services", host.Name, len(downMonitors)) } func (s *UptimeService) checkMonitor(monitor models.UptimeMonitor) { @@ -244,11 +600,11 @@ func (s *UptimeService) checkMonitor(monitor models.UptimeMonitor) { oldStatus := monitor.Status statusChanged := oldStatus != newStatus && oldStatus != "pending" - // Calculate duration if status changed + // Calculate previous uptime/downtime if status changed var durationStr string if statusChanged && !monitor.LastStatusChange.IsZero() { duration := time.Since(monitor.LastStatusChange) - durationStr = duration.Round(time.Second).String() + durationStr = formatDuration(duration) } monitor.Status = newStatus @@ -261,46 +617,179 @@ func (s *UptimeService) checkMonitor(monitor models.UptimeMonitor) { s.DB.Save(&monitor) - // Send Notification if status changed + // Handle notifications based on status change if statusChanged { - title := fmt.Sprintf("Monitor %s is %s", monitor.Name, strings.ToUpper(newStatus)) - - nType := models.NotificationTypeInfo switch newStatus { case "down": - nType = models.NotificationTypeError + // Queue for batched notification + s.queueDownNotification(monitor, msg, durationStr) case "up": - nType = models.NotificationTypeSuccess + // Send recovery notification + s.sendRecoveryNotification(monitor, durationStr) + } + } +} + +// queueDownNotification adds a down monitor to the batch queue +func (s *UptimeService) queueDownNotification(monitor models.UptimeMonitor, reason, previousUptime string) { + s.notificationMutex.Lock() + defer s.notificationMutex.Unlock() + + hostID := "" + if monitor.UptimeHostID != nil { + hostID = *monitor.UptimeHostID + } + + // Get host info + var uptimeHost models.UptimeHost + hostName := monitor.UpstreamHost + if hostID != "" { + if err := s.DB.First(&uptimeHost, "id = ?", hostID).Error; err == nil { + hostName = uptimeHost.Name + } + } + + info := monitorDownInfo{ + ID: monitor.ID, + Name: monitor.Name, + URL: monitor.URL, + Message: reason, + PreviousUptime: previousUptime, + } + + if pending, exists := s.pendingNotifications[hostID]; exists { + // Add to existing batch + pending.downMonitors = append(pending.downMonitors, info) + log.Printf("Added %s to pending notification batch for host %s (now %d services)", monitor.Name, hostName, len(pending.downMonitors)) + } else { + // Create new batch with timer + pending := &pendingHostNotification{ + hostID: hostID, + hostName: hostName, + downMonitors: []monitorDownInfo{info}, + createdAt: time.Now(), } - // Construct rich message - var sb strings.Builder - sb.WriteString(fmt.Sprintf("Service: %s\n", monitor.Name)) - sb.WriteString(fmt.Sprintf("Status: %s\n", strings.ToUpper(newStatus))) + pending.timer = time.AfterFunc(s.batchWindow, func() { + s.flushPendingNotification(hostID) + }) + + s.pendingNotifications[hostID] = pending + log.Printf("Created pending notification batch for host %s with %s", hostName, monitor.Name) + } +} + +// flushPendingNotification sends the batched notification +func (s *UptimeService) flushPendingNotification(hostID string) { + s.notificationMutex.Lock() + pending, exists := s.pendingNotifications[hostID] + if !exists { + s.notificationMutex.Unlock() + return + } + delete(s.pendingNotifications, hostID) + s.notificationMutex.Unlock() + + if pending.timer != nil { + pending.timer.Stop() + } + + if len(pending.downMonitors) == 0 { + return + } + + // Build and send notification + var title string + var sb strings.Builder + + if len(pending.downMonitors) == 1 { + // Single service down + m := pending.downMonitors[0] + title = fmt.Sprintf("🔴 %s is DOWN", m.Name) + sb.WriteString(fmt.Sprintf("Service: %s\n", m.Name)) + sb.WriteString("Status: DOWN\n") sb.WriteString(fmt.Sprintf("Time: %s\n", time.Now().Format(time.RFC1123))) - - if durationStr != "" { - sb.WriteString(fmt.Sprintf("Duration: %s\n", durationStr)) + if m.PreviousUptime != "" { + sb.WriteString(fmt.Sprintf("Previous Uptime: %s\n", m.PreviousUptime)) } + sb.WriteString(fmt.Sprintf("Reason: %s\n", m.Message)) + } else { + // Multiple services down + title = fmt.Sprintf("🔴 %d Services DOWN on %s", len(pending.downMonitors), pending.hostName) + sb.WriteString(fmt.Sprintf("Host: %s\n", pending.hostName)) + sb.WriteString(fmt.Sprintf("Time: %s\n", time.Now().Format(time.RFC1123))) + sb.WriteString(fmt.Sprintf("Services affected: %d\n\n", len(pending.downMonitors))) - sb.WriteString(fmt.Sprintf("Reason: %s\n", msg)) - - _, _ = s.NotificationService.Create( - nType, - title, - sb.String(), - ) - - data := map[string]interface{}{ - "Name": monitor.Name, - "Status": strings.ToUpper(newStatus), - "Latency": latency, - "Message": msg, - "Duration": durationStr, - "Time": time.Now().Format(time.RFC1123), - "URL": monitor.URL, + sb.WriteString("Impacted services:\n") + for _, m := range pending.downMonitors { + if m.PreviousUptime != "" { + sb.WriteString(fmt.Sprintf("• %s - %s (was up %s)\n", m.Name, m.Message, m.PreviousUptime)) + } else { + sb.WriteString(fmt.Sprintf("• %s - %s\n", m.Name, m.Message)) + } } - s.NotificationService.SendExternal("uptime", title, sb.String(), data) + } + + // Store in DB + _, _ = s.NotificationService.Create( + models.NotificationTypeError, + title, + sb.String(), + ) + + // Send external + data := map[string]interface{}{ + "HostName": pending.hostName, + "Status": "DOWN", + "ServiceCount": len(pending.downMonitors), + "Services": pending.downMonitors, + "Time": time.Now().Format(time.RFC1123), + } + s.NotificationService.SendExternal("uptime", title, sb.String(), data) + + log.Printf("Sent batched DOWN notification for %d services on %s", len(pending.downMonitors), pending.hostName) +} + +// sendRecoveryNotification sends a notification when a service recovers +func (s *UptimeService) sendRecoveryNotification(monitor models.UptimeMonitor, downtime string) { + title := fmt.Sprintf("🟢 %s is UP", monitor.Name) + + var sb strings.Builder + sb.WriteString(fmt.Sprintf("Service: %s\n", monitor.Name)) + sb.WriteString("Status: UP\n") + sb.WriteString(fmt.Sprintf("Time: %s\n", time.Now().Format(time.RFC1123))) + if downtime != "" { + sb.WriteString(fmt.Sprintf("Downtime: %s\n", downtime)) + } + + _, _ = s.NotificationService.Create( + models.NotificationTypeSuccess, + title, + sb.String(), + ) + + data := map[string]interface{}{ + "Name": monitor.Name, + "Status": "UP", + "Downtime": downtime, + "Time": time.Now().Format(time.RFC1123), + "URL": monitor.URL, + } + s.NotificationService.SendExternal("uptime", title, sb.String(), data) +} + +// FlushPendingNotifications flushes all pending batched notifications immediately. +// This is useful for testing and graceful shutdown. +func (s *UptimeService) FlushPendingNotifications() { + s.notificationMutex.Lock() + pendingHostIDs := make([]string, 0, len(s.pendingNotifications)) + for hostID := range s.pendingNotifications { + pendingHostIDs = append(pendingHostIDs, hostID) + } + s.notificationMutex.Unlock() + + for _, hostID := range pendingHostIDs { + s.flushPendingNotification(hostID) } } diff --git a/backend/internal/services/uptime_service_test.go b/backend/internal/services/uptime_service_test.go index 673b9523..03a6ff6d 100644 --- a/backend/internal/services/uptime_service_test.go +++ b/backend/internal/services/uptime_service_test.go @@ -20,7 +20,17 @@ func setupUptimeTestDB(t *testing.T) *gorm.DB { if err != nil { t.Fatalf("Failed to connect to database: %v", err) } - err = db.AutoMigrate(&models.Notification{}, &models.NotificationProvider{}, &models.Setting{}, &models.ProxyHost{}, &models.UptimeMonitor{}, &models.UptimeHeartbeat{}, &models.RemoteServer{}) + err = db.AutoMigrate( + &models.Notification{}, + &models.NotificationProvider{}, + &models.Setting{}, + &models.ProxyHost{}, + &models.UptimeMonitor{}, + &models.UptimeHeartbeat{}, + &models.UptimeHost{}, + &models.UptimeNotificationEvent{}, + &models.RemoteServer{}, + ) if err != nil { t.Fatalf("Failed to migrate database: %v", err) } @@ -128,6 +138,12 @@ func TestUptimeService_CheckAll(t *testing.T) { db.Where("proxy_host_id = ?", upHost.ID).First(&upMonitor) assert.Equal(t, "down", upMonitor.Status) + // Flush any pending batched notifications + // The new batching system delays notifications by 30 seconds + // For testing, we manually trigger the flush + us.FlushPendingNotifications() + time.Sleep(100 * time.Millisecond) + db.Find(¬ifications) assert.Equal(t, 1, len(notifications), "Should have 1 notification now") if len(notifications) > 0 { @@ -970,3 +986,185 @@ func TestUptimeService_UpdateMonitor(t *testing.T) { assert.Equal(t, 300, result.Interval) }) } + +func TestUptimeService_NotificationBatching(t *testing.T) { + t.Run("batches multiple service failures on same host", func(t *testing.T) { + db := setupUptimeTestDB(t) + ns := NewNotificationService(db) + us := NewUptimeService(db, ns) + + // Create an UptimeHost + host := models.UptimeHost{ + ID: "test-host-1", + Host: "192.168.1.100", + Name: "Test Server", + Status: "up", + } + db.Create(&host) + + // Create multiple monitors pointing to the same host + monitors := []models.UptimeMonitor{ + {ID: "mon-1", Name: "Service A", UpstreamHost: "192.168.1.100", UptimeHostID: &host.ID, Status: "up", MaxRetries: 3}, + {ID: "mon-2", Name: "Service B", UpstreamHost: "192.168.1.100", UptimeHostID: &host.ID, Status: "up", MaxRetries: 3}, + {ID: "mon-3", Name: "Service C", UpstreamHost: "192.168.1.100", UptimeHostID: &host.ID, Status: "up", MaxRetries: 3}, + } + for _, m := range monitors { + db.Create(&m) + } + + // Queue down notifications for all three + us.queueDownNotification(monitors[0], "Connection refused", "1h 30m") + us.queueDownNotification(monitors[1], "Connection refused", "2h 15m") + us.queueDownNotification(monitors[2], "Connection refused", "45m") + + // Verify all are batched together + us.notificationMutex.Lock() + pending, exists := us.pendingNotifications[host.ID] + us.notificationMutex.Unlock() + + assert.True(t, exists, "Should have pending notification for host") + assert.Equal(t, 3, len(pending.downMonitors), "Should have 3 monitors in batch") + + // Flush and verify single notification is sent + us.FlushPendingNotifications() + + var notifications []models.Notification + db.Find(¬ifications) + assert.Equal(t, 1, len(notifications), "Should have exactly 1 batched notification") + + if len(notifications) > 0 { + // Should mention all three services + assert.Contains(t, notifications[0].Message, "Service A") + assert.Contains(t, notifications[0].Message, "Service B") + assert.Contains(t, notifications[0].Message, "Service C") + assert.Contains(t, notifications[0].Title, "3 Services DOWN") + } + }) + + t.Run("single service down gets individual notification", func(t *testing.T) { + db := setupUptimeTestDB(t) + ns := NewNotificationService(db) + us := NewUptimeService(db, ns) + + // Create an UptimeHost + host := models.UptimeHost{ + ID: "test-host-2", + Host: "192.168.1.101", + Name: "Single Service Host", + Status: "up", + } + db.Create(&host) + + monitor := models.UptimeMonitor{ + ID: "single-mon", + Name: "Lonely Service", + UpstreamHost: "192.168.1.101", + UptimeHostID: &host.ID, + Status: "up", + MaxRetries: 3, + } + db.Create(&monitor) + + // Queue single down notification + us.queueDownNotification(monitor, "HTTP 502", "5h 30m") + + // Flush + us.FlushPendingNotifications() + + var notifications []models.Notification + db.Find(¬ifications) + assert.Equal(t, 1, len(notifications), "Should have exactly 1 notification") + + if len(notifications) > 0 { + assert.Contains(t, notifications[0].Title, "Lonely Service is DOWN") + assert.Contains(t, notifications[0].Message, "Previous Uptime: 5h 30m") + } + }) +} + +func TestUptimeService_HostLevelCheck(t *testing.T) { + t.Run("creates uptime host during sync", func(t *testing.T) { + db := setupUptimeTestDB(t) + ns := NewNotificationService(db) + us := NewUptimeService(db, ns) + + // Create a proxy host + proxyHost := models.ProxyHost{ + UUID: "ph-1", + DomainNames: "app.example.com", + ForwardHost: "10.0.0.50", + ForwardPort: 8080, + } + db.Create(&proxyHost) + + // Sync monitors + err := us.SyncMonitors() + assert.NoError(t, err) + + // Verify UptimeHost was created + var uptimeHost models.UptimeHost + err = db.Where("host = ?", "10.0.0.50").First(&uptimeHost).Error + assert.NoError(t, err) + assert.Equal(t, "10.0.0.50", uptimeHost.Host) + assert.Equal(t, "app.example.com", uptimeHost.Name) + + // Verify monitor has uptime host ID + var monitor models.UptimeMonitor + db.Where("proxy_host_id = ?", proxyHost.ID).First(&monitor) + assert.NotNil(t, monitor.UptimeHostID) + assert.Equal(t, uptimeHost.ID, *monitor.UptimeHostID) + }) + + t.Run("groups multiple services on same host", func(t *testing.T) { + db := setupUptimeTestDB(t) + ns := NewNotificationService(db) + us := NewUptimeService(db, ns) + + // Create multiple proxy hosts pointing to the same forward host + hosts := []models.ProxyHost{ + {UUID: "ph-1", DomainNames: "app1.example.com", ForwardHost: "10.0.0.100", ForwardPort: 8080, Name: "App 1"}, + {UUID: "ph-2", DomainNames: "app2.example.com", ForwardHost: "10.0.0.100", ForwardPort: 8081, Name: "App 2"}, + {UUID: "ph-3", DomainNames: "app3.example.com", ForwardHost: "10.0.0.100", ForwardPort: 8082, Name: "App 3"}, + } + for _, h := range hosts { + db.Create(&h) + } + + // Sync monitors + err := us.SyncMonitors() + assert.NoError(t, err) + + // Should have only 1 UptimeHost for 10.0.0.100 + var uptimeHosts []models.UptimeHost + db.Where("host = ?", "10.0.0.100").Find(&uptimeHosts) + assert.Equal(t, 1, len(uptimeHosts), "Should have exactly 1 UptimeHost for the shared IP") + + // All 3 monitors should point to the same UptimeHost + var monitors []models.UptimeMonitor + db.Where("upstream_host = ?", "10.0.0.100").Find(&monitors) + assert.Equal(t, 3, len(monitors)) + + for _, m := range monitors { + assert.NotNil(t, m.UptimeHostID) + assert.Equal(t, uptimeHosts[0].ID, *m.UptimeHostID) + } + }) +} + +func TestFormatDuration(t *testing.T) { + tests := []struct { + input time.Duration + expected string + }{ + {5 * time.Second, "5s"}, + {65 * time.Second, "1m 5s"}, + {3665 * time.Second, "1h 1m 5s"}, + {90065 * time.Second, "1d 1h 1m"}, + {0, "0s"}, + } + + for _, tc := range tests { + result := formatDuration(tc.input) + assert.Equal(t, tc.expected, result, "formatDuration(%v)", tc.input) + } +}