- Add gotestsum for real-time test progress visibility - Parallelize 174 tests across 14 files for faster execution - Add -short mode support skipping 21 heavy integration tests - Create testutil/db.go helper for future transaction rollbacks - Fix data race in notification_service_test.go - Fix 4 CrowdSec LAPI test failures with permissive validator Performance improvements: - Tests now run in parallel (174 tests with t.Parallel()) - Quick feedback loop via -short mode - Zero race conditions detected - Coverage maintained at 87.7% Closes test optimization initiative
1106 lines
31 KiB
Go
1106 lines
31 KiB
Go
package services
|
|
|
|
import (
|
|
"context"
|
|
"encoding/json"
|
|
"errors"
|
|
"fmt"
|
|
"net"
|
|
"net/http"
|
|
"net/url"
|
|
"strings"
|
|
"sync"
|
|
"time"
|
|
|
|
"github.com/Wikid82/charon/backend/internal/logger"
|
|
"github.com/Wikid82/charon/backend/internal/models"
|
|
"github.com/Wikid82/charon/backend/internal/network"
|
|
"github.com/Wikid82/charon/backend/internal/security"
|
|
"github.com/Wikid82/charon/backend/internal/util"
|
|
"gorm.io/gorm"
|
|
)
|
|
|
|
// UptimeService runs uptime checks for proxy hosts and remote servers,
// records heartbeats, and emits (batched) status-change notifications.
type UptimeService struct {
	DB                  *gorm.DB
	NotificationService *NotificationService
	// Batching: track pending notifications
	pendingNotifications map[string]*pendingHostNotification // keyed by UptimeHost ID ("" for unlinked monitors)
	notificationMutex    sync.Mutex                          // guards pendingNotifications
	batchWindow          time.Duration                       // how long a batch stays open before flushing
	// Host-specific mutexes to prevent concurrent database updates
	hostMutexes   map[string]*sync.Mutex
	hostMutexLock sync.Mutex // guards hostMutexes
	// Configuration
	config UptimeConfig
}
|
|
|
|
// UptimeConfig holds configurable timeouts and thresholds
type UptimeConfig struct {
	TCPTimeout       time.Duration // per-connection dial timeout for host TCP checks
	MaxRetries       int           // extra retry rounds per host check
	FailureThreshold int           // consecutive failures before a host is marked down
	CheckTimeout     time.Duration // overall deadline for one full host-check sweep
	StaggerDelay     time.Duration // delay between launching individual host checks
}
|
|
|
|
// pendingHostNotification accumulates down-monitor info for a single host
// while the batch window is open; timer triggers the batched flush.
type pendingHostNotification struct {
	hostID       string
	hostName     string
	downMonitors []monitorDownInfo
	timer        *time.Timer // fires flushPendingNotification after batchWindow
	createdAt    time.Time
}
|
|
|
|
// monitorDownInfo captures the details of a monitor that transitioned to
// down, for inclusion in consolidated notifications.
type monitorDownInfo struct {
	ID             string
	Name           string
	URL            string
	Message        string // failure reason shown in the notification body
	PreviousUptime string // human-readable uptime before the outage, if known
}
|
|
|
|
// NewUptimeService constructs an UptimeService with the default batching
// window and check configuration.
func NewUptimeService(db *gorm.DB, ns *NotificationService) *UptimeService {
	return &UptimeService{
		DB:                   db,
		NotificationService:  ns,
		pendingNotifications: make(map[string]*pendingHostNotification),
		batchWindow:          30 * time.Second, // Wait 30 seconds to batch notifications
		hostMutexes:          make(map[string]*sync.Mutex),
		config: UptimeConfig{
			TCPTimeout:       10 * time.Second,
			MaxRetries:       2,
			FailureThreshold: 2,
			CheckTimeout:     60 * time.Second,
			StaggerDelay:     100 * time.Millisecond,
		},
	}
}
|
|
|
|
// extractPort extracts the port from a URL or host:port string.
// It returns "" when no port can be determined.
func extractPort(urlStr string) string {
	// Prefer full URL parsing; only URLs with a non-empty host qualify.
	if parsed, parseErr := url.Parse(urlStr); parseErr == nil && parsed.Host != "" {
		if p := parsed.Port(); p != "" {
			return p
		}
		// No explicit port: fall back to the scheme's well-known default.
		switch parsed.Scheme {
		case "https":
			return "443"
		case "http":
			return "80"
		}
	}

	// Bare "host:port" strings don't parse as URLs with a host; split directly.
	if _, p, splitErr := net.SplitHostPort(urlStr); splitErr == nil {
		return p
	}

	return ""
}
|
|
|
|
// formatDuration formats a duration in a human-readable way
|
|
func formatDuration(d time.Duration) string {
|
|
d = d.Round(time.Second)
|
|
|
|
days := int(d.Hours() / 24)
|
|
hours := int(d.Hours()) % 24
|
|
minutes := int(d.Minutes()) % 60
|
|
seconds := int(d.Seconds()) % 60
|
|
|
|
if days > 0 {
|
|
return fmt.Sprintf("%dd %dh %dm", days, hours, minutes)
|
|
}
|
|
if hours > 0 {
|
|
return fmt.Sprintf("%dh %dm %ds", hours, minutes, seconds)
|
|
}
|
|
if minutes > 0 {
|
|
return fmt.Sprintf("%dm %ds", minutes, seconds)
|
|
}
|
|
return fmt.Sprintf("%ds", seconds)
|
|
}
|
|
|
|
// SyncMonitors ensures every ProxyHost and RemoteServer has a corresponding UptimeMonitor
// and that UptimeHosts are created for grouping.
//
// For proxy hosts, the monitor checks the public HTTP(S) URL derived from the
// first domain; for remote servers it checks the configured scheme/host/port
// (TCP by default). Existing monitors are updated in place when names, URLs,
// or grouping links drift out of sync. Create/save failures are logged but do
// not abort the sweep; only the initial Find queries return an error.
func (s *UptimeService) SyncMonitors() error {
	var hosts []models.ProxyHost
	if err := s.DB.Find(&hosts).Error; err != nil {
		return err
	}

	for _, host := range hosts {
		var monitor models.UptimeMonitor
		err := s.DB.Where("proxy_host_id = ?", host.ID).First(&monitor).Error

		domains := strings.Split(host.DomainNames, ",")
		firstDomain := ""
		if len(domains) > 0 {
			firstDomain = strings.TrimSpace(domains[0])
		}

		// Construct the public URL
		scheme := "http"
		if host.SSLForced {
			scheme = "https"
		}
		publicURL := fmt.Sprintf("%s://%s", scheme, firstDomain)
		internalURL := fmt.Sprintf("%s:%d", host.ForwardHost, host.ForwardPort)

		// The upstream host for grouping is the ForwardHost
		upstreamHost := host.ForwardHost

		// NOTE(review): direct comparison misses wrapped errors; errors.Is
		// would be more robust here — confirm whether s.DB can wrap ErrRecordNotFound.
		switch err {
		case gorm.ErrRecordNotFound:
			// Create new monitor
			name := host.Name
			if name == "" {
				name = firstDomain
			}

			// Find or create UptimeHost
			uptimeHostID := s.ensureUptimeHost(upstreamHost, name)

			monitor = models.UptimeMonitor{
				ProxyHostID:  &host.ID,
				UptimeHostID: &uptimeHostID,
				Name:         name,
				Type:         "http", // Check public access
				URL:          publicURL,
				UpstreamHost: upstreamHost,
				Interval:     60,
				Enabled:      true,
				Status:       "pending",
			}
			if err := s.DB.Create(&monitor).Error; err != nil {
				logger.Log().WithError(err).WithField("host_id", host.ID).Error("Failed to create monitor")
			}
		case nil:
			// Always sync the name from proxy host
			newName := host.Name
			if newName == "" {
				newName = firstDomain
			}
			needsSave := false

			if monitor.Name != newName {
				monitor.Name = newName
				needsSave = true
			}

			// Ensure upstream host is set for grouping
			if monitor.UpstreamHost == "" || monitor.UpstreamHost != upstreamHost {
				monitor.UpstreamHost = upstreamHost
				needsSave = true
			}

			// Ensure UptimeHost link exists
			if monitor.UptimeHostID == nil {
				uptimeHostID := s.ensureUptimeHost(upstreamHost, newName)
				monitor.UptimeHostID = &uptimeHostID
				needsSave = true
			}

			// Update existing monitor if it looks like it's using the old default (TCP to internal upstream)
			if monitor.Type == "tcp" && monitor.URL == internalURL {
				monitor.Type = "http"
				monitor.URL = publicURL
				needsSave = true
				logger.Log().WithField("host_id", host.ID).Infof("Migrated monitor for host %d to check public URL: %s", host.ID, publicURL)
			}

			// Upgrade to HTTPS if SSL is forced and we are currently checking HTTP
			if host.SSLForced && strings.HasPrefix(monitor.URL, "http://") {
				monitor.URL = strings.Replace(monitor.URL, "http://", "https://", 1)
				needsSave = true
				logger.Log().WithField("host_id", host.ID).Infof("Upgraded monitor for host %d to HTTPS: %s", host.ID, monitor.URL)
			}

			if needsSave {
				s.DB.Save(&monitor)
			}
		}
	}

	// Sync Remote Servers
	var remoteServers []models.RemoteServer
	if err := s.DB.Find(&remoteServers).Error; err != nil {
		return err
	}

	for _, server := range remoteServers {
		var monitor models.UptimeMonitor
		err := s.DB.Where("remote_server_id = ?", server.ID).First(&monitor).Error

		// Default to a raw TCP probe; HTTP(S) servers get a URL probe instead.
		targetType := "tcp"
		targetURL := fmt.Sprintf("%s:%d", server.Host, server.Port)

		if server.Scheme == "http" || server.Scheme == "https" {
			targetType = server.Scheme
			targetURL = fmt.Sprintf("%s://%s:%d", server.Scheme, server.Host, server.Port)
		}

		// The upstream host for grouping
		upstreamHost := server.Host

		switch err {
		case gorm.ErrRecordNotFound:
			// Find or create UptimeHost
			uptimeHostID := s.ensureUptimeHost(upstreamHost, server.Name)

			monitor = models.UptimeMonitor{
				RemoteServerID: &server.ID,
				UptimeHostID:   &uptimeHostID,
				Name:           server.Name,
				Type:           targetType,
				URL:            targetURL,
				UpstreamHost:   upstreamHost,
				Interval:       60,
				Enabled:        server.Enabled,
				Status:         "pending",
			}
			if err := s.DB.Create(&monitor).Error; err != nil {
				logger.Log().WithError(err).WithField("remote_server_id", server.ID).Error("Failed to create monitor for remote server")
			}
		case nil:
			needsSave := false

			if monitor.Name != server.Name {
				monitor.Name = server.Name
				needsSave = true
			}

			// Ensure upstream host is set for grouping
			if monitor.UpstreamHost == "" || monitor.UpstreamHost != upstreamHost {
				monitor.UpstreamHost = upstreamHost
				needsSave = true
			}

			// Ensure UptimeHost link exists
			if monitor.UptimeHostID == nil {
				uptimeHostID := s.ensureUptimeHost(upstreamHost, server.Name)
				monitor.UptimeHostID = &uptimeHostID
				needsSave = true
			}

			if monitor.URL != targetURL || monitor.Type != targetType {
				monitor.URL = targetURL
				monitor.Type = targetType
				needsSave = true
			}
			if monitor.Enabled != server.Enabled {
				monitor.Enabled = server.Enabled
				needsSave = true
			}

			if needsSave {
				s.DB.Save(&monitor)
			}
		}
	}

	return nil
}
|
|
|
|
// ensureUptimeHost finds or creates an UptimeHost for the given host string
|
|
func (s *UptimeService) ensureUptimeHost(host, defaultName string) string {
|
|
var uptimeHost models.UptimeHost
|
|
err := s.DB.Where("host = ?", host).First(&uptimeHost).Error
|
|
|
|
if err == gorm.ErrRecordNotFound {
|
|
uptimeHost = models.UptimeHost{
|
|
Host: host,
|
|
Name: defaultName,
|
|
Status: "pending",
|
|
}
|
|
if err := s.DB.Create(&uptimeHost).Error; err != nil {
|
|
logger.Log().WithError(err).WithField("host", util.SanitizeForLog(host)).Error("Failed to create UptimeHost")
|
|
return ""
|
|
}
|
|
logger.Log().WithField("host_id", uptimeHost.ID).WithField("host", util.SanitizeForLog(uptimeHost.Host)).Info("Created UptimeHost")
|
|
}
|
|
|
|
return uptimeHost.ID
|
|
}
|
|
|
|
// CheckAll runs checks for all enabled monitors with host-level pre-check.
// Hosts already marked down short-circuit their monitors; otherwise each
// monitor is probed in its own goroutine (fire-and-forget — CheckAll does
// not wait for the per-monitor checks to complete).
func (s *UptimeService) CheckAll() {
	// First, check all UptimeHosts
	s.checkAllHosts()

	var monitors []models.UptimeMonitor
	if err := s.DB.Where("enabled = ?", true).Find(&monitors).Error; err != nil {
		logger.Log().WithError(err).Error("Failed to fetch monitors")
		return
	}

	// Group monitors by UptimeHost ("" groups monitors with no host link)
	hostMonitors := make(map[string][]models.UptimeMonitor)
	for _, monitor := range monitors {
		hostID := ""
		if monitor.UptimeHostID != nil {
			hostID = *monitor.UptimeHostID
		}
		hostMonitors[hostID] = append(hostMonitors[hostID], monitor)
	}

	// Check each host's monitors
	for hostID, monitors := range hostMonitors {
		// If host is down, mark all monitors as down without individual checks
		if hostID != "" {
			var uptimeHost models.UptimeHost
			if err := s.DB.First(&uptimeHost, "id = ?", hostID).Error; err == nil {
				if uptimeHost.Status == "down" {
					s.markHostMonitorsDown(monitors, &uptimeHost)
					continue
				}
			}
		}

		// Host is up, check individual monitors
		for _, monitor := range monitors {
			go s.checkMonitor(monitor)
		}
	}
}
|
|
|
|
// checkAllHosts performs TCP connectivity check on all UptimeHosts.
// Checks run concurrently (one goroutine per host) under a shared deadline
// of config.CheckTimeout; launches are staggered by config.StaggerDelay to
// avoid load spikes. Blocks until all checks finish or the deadline passes.
func (s *UptimeService) checkAllHosts() {
	var hosts []models.UptimeHost
	if err := s.DB.Find(&hosts).Error; err != nil {
		logger.Log().WithError(err).Error("Failed to fetch uptime hosts")
		return
	}

	if len(hosts) == 0 {
		return
	}

	logger.Log().WithField("host_count", len(hosts)).Info("Starting host checks")

	// Create context with timeout for all checks
	ctx, cancel := context.WithTimeout(context.Background(), s.config.CheckTimeout)
	defer cancel()

	var wg sync.WaitGroup
	for i := range hosts {
		wg.Add(1)
		// Staggered startup to reduce load spikes (sleep happens on the
		// dispatching goroutine, before the worker launches)
		if i > 0 {
			time.Sleep(s.config.StaggerDelay)
		}
		go func(host *models.UptimeHost) {
			defer wg.Done()
			// Check if context is cancelled before starting the check
			select {
			case <-ctx.Done():
				logger.Log().WithField("host_name", host.Name).Warn("Host check cancelled due to timeout")
				return
			default:
				s.checkHost(ctx, host)
			}
		}(&hosts[i])
	}
	wg.Wait() // Wait for all host checks to complete

	logger.Log().WithField("host_count", len(hosts)).Info("All host checks completed")
}
|
|
|
|
// checkHost performs a basic TCP connectivity check to determine if the host is reachable.
// It tries each monitor's port in turn (with up to config.MaxRetries retry
// rounds), applies failure-count debouncing against config.FailureThreshold,
// and persists the updated host status/latency. A per-host mutex serializes
// concurrent checks of the same host.
func (s *UptimeService) checkHost(ctx context.Context, host *models.UptimeHost) {
	// Get host-specific mutex to prevent concurrent database updates
	s.hostMutexLock.Lock()
	if s.hostMutexes[host.ID] == nil {
		s.hostMutexes[host.ID] = &sync.Mutex{}
	}
	mutex := s.hostMutexes[host.ID]
	s.hostMutexLock.Unlock()

	mutex.Lock()
	defer mutex.Unlock()

	start := time.Now()

	logger.Log().WithFields(map[string]any{
		"host_name": host.Name,
		"host_ip":   host.Host,
		"host_id":   host.ID,
	}).Debug("Starting TCP check for host")

	// Get common ports for this host from its monitors
	var monitors []models.UptimeMonitor
	s.DB.Preload("ProxyHost").Where("uptime_host_id = ?", host.ID).Find(&monitors)

	logger.Log().WithField("host_name", host.Name).WithField("monitor_count", len(monitors)).Debug("Retrieved monitors for host")

	if len(monitors) == 0 {
		return
	}

	// Try to connect to any of the monitor ports with retry logic;
	// one successful connection to any port counts as "up".
	success := false
	var msg string
	var lastErr error

	for retry := 0; retry <= s.config.MaxRetries && !success; retry++ {
		if retry > 0 {
			logger.Log().WithFields(map[string]any{
				"host_name": host.Name,
				"retry":     retry,
				"max":       s.config.MaxRetries,
			}).Info("Retrying TCP check")
			time.Sleep(2 * time.Second) // Brief delay between retries
		}

		// Check if context is cancelled
		select {
		case <-ctx.Done():
			logger.Log().WithField("host_name", host.Name).Warn("TCP check cancelled")
			return
		default:
		}

		for _, monitor := range monitors {
			var port string

			// Use actual backend port from ProxyHost if available
			if monitor.ProxyHost != nil {
				port = fmt.Sprintf("%d", monitor.ProxyHost.ForwardPort)
			} else {
				// Fallback to extracting from URL for standalone monitors
				port = extractPort(monitor.URL)
			}

			if port == "" {
				continue
			}

			logger.Log().WithFields(map[string]any{
				"monitor":        monitor.Name,
				"extracted_port": extractPort(monitor.URL),
				"actual_port":    port,
				"host":           host.Host,
				"retry":          retry,
			}).Debug("TCP check port resolution")

			// Use net.JoinHostPort for IPv6 compatibility
			addr := net.JoinHostPort(host.Host, port)

			// Create dialer with timeout from context
			dialer := net.Dialer{Timeout: s.config.TCPTimeout}
			conn, err := dialer.DialContext(ctx, "tcp", addr)
			if err == nil {
				if err := conn.Close(); err != nil {
					logger.Log().WithError(err).Warn("failed to close tcp connection")
				}
				success = true
				msg = fmt.Sprintf("TCP connection to %s successful (retry %d)", addr, retry)
				logger.Log().WithFields(map[string]any{
					"host_name": host.Name,
					"addr":      addr,
					"retry":     retry,
				}).Debug("TCP connection successful")
				break
			}
			lastErr = err
			msg = fmt.Sprintf("TCP check failed: %v", err)
		}
	}

	// NOTE: latency includes retry sleeps and all dial attempts, not just the
	// winning connection.
	latency := time.Since(start).Milliseconds()
	oldStatus := host.Status
	var newStatus string

	// Implement failure count debouncing
	if success {
		host.FailureCount = 0
		newStatus = "up"
	} else {
		host.FailureCount++
		if host.FailureCount >= s.config.FailureThreshold {
			newStatus = "down"
		} else {
			// Keep current status on first failure
			newStatus = host.Status
			logger.Log().WithFields(map[string]any{
				"host_name":     host.Name,
				"failure_count": host.FailureCount,
				"threshold":     s.config.FailureThreshold,
				"last_error":    lastErr,
			}).Warn("Host check failed, waiting for threshold")
		}
	}

	// "pending" is the initial state; transitions out of it are not treated
	// as status changes (no LastStatusChange update, no notification).
	statusChanged := oldStatus != newStatus && oldStatus != "pending"

	host.Status = newStatus
	host.LastCheck = time.Now()
	host.Latency = latency

	if statusChanged {
		host.LastStatusChange = time.Now()
		logger.Log().WithFields(map[string]any{
			"host_name": host.Name,
			"host_ip":   host.Host,
			"old":       oldStatus,
			"new":       newStatus,
			"message":   msg,
		}).Info("Host status changed")
	}

	logger.Log().WithFields(map[string]any{
		"host_name":      host.Name,
		"host_ip":        host.Host,
		"success":        success,
		"failure_count":  host.FailureCount,
		"old_status":     oldStatus,
		"new_status":     newStatus,
		"elapsed_ms":     latency,
		"status_changed": statusChanged,
	}).Debug("Host TCP check completed")

	s.DB.Save(host)
}
|
|
|
|
// markHostMonitorsDown marks all monitors for a down host as down and sends a single notification.
// Monitors already down are skipped; each transitioned monitor gets a "Host
// unreachable" heartbeat. The consolidated notification is rate-limited to
// once per 5 minutes per host via host.LastNotifiedDown.
func (s *UptimeService) markHostMonitorsDown(monitors []models.UptimeMonitor, host *models.UptimeHost) {
	downMonitors := []monitorDownInfo{}

	for i := range monitors {
		monitor := &monitors[i]
		oldStatus := monitor.Status
		if oldStatus == "down" {
			continue // Already down, no need to update
		}

		// Calculate previous uptime
		var durationStr string
		if !monitor.LastStatusChange.IsZero() {
			duration := time.Since(monitor.LastStatusChange)
			durationStr = formatDuration(duration)
		}

		monitor.Status = "down"
		monitor.LastCheck = time.Now()
		monitor.FailureCount = monitor.MaxRetries // Max out failure count
		if oldStatus != "pending" {
			monitor.LastStatusChange = time.Now()
		}
		monitor.NotifiedInBatch = true
		s.DB.Save(monitor)

		// Record heartbeat
		heartbeat := models.UptimeHeartbeat{
			MonitorID: monitor.ID,
			Status:    "down",
			Latency:   0,
			Message:   "Host unreachable",
		}
		s.DB.Create(&heartbeat)

		// Only real up->down transitions are reported; pending monitors have
		// never been up, so they are excluded from the notification.
		if oldStatus != "pending" && oldStatus != "down" {
			downMonitors = append(downMonitors, monitorDownInfo{
				ID:             monitor.ID,
				Name:           monitor.Name,
				URL:            monitor.URL,
				Message:        "Host unreachable",
				PreviousUptime: durationStr,
			})
		}
	}

	// Send consolidated notification if any monitors transitioned to down
	// (throttled to one notification per host per 5 minutes)
	if len(downMonitors) > 0 && time.Since(host.LastNotifiedDown) > 5*time.Minute {
		s.sendHostDownNotification(host, downMonitors)
	}
}
|
|
|
|
// sendHostDownNotification sends a single consolidated notification for a down host.
// It stores an in-app notification, records an UptimeNotificationEvent with
// the affected monitor IDs, updates the host's notification-tracking fields,
// and pushes the message to external notification channels.
func (s *UptimeService) sendHostDownNotification(host *models.UptimeHost, downMonitors []monitorDownInfo) {
	title := fmt.Sprintf("🔴 Host %s is DOWN (%d services affected)", host.Name, len(downMonitors))

	// Build the human-readable notification body.
	var sb strings.Builder
	sb.WriteString(fmt.Sprintf("Host: %s (%s)\n", host.Name, host.Host))
	sb.WriteString(fmt.Sprintf("Time: %s\n", time.Now().Format(time.RFC1123)))
	sb.WriteString(fmt.Sprintf("Services affected: %d\n\n", len(downMonitors)))

	sb.WriteString("Impacted services:\n")
	for _, m := range downMonitors {
		if m.PreviousUptime != "" {
			sb.WriteString(fmt.Sprintf("• %s (was up %s)\n", m.Name, m.PreviousUptime))
		} else {
			sb.WriteString(fmt.Sprintf("• %s\n", m.Name))
		}
	}

	// Store notification in DB (create errors intentionally ignored —
	// notification is best-effort)
	_, _ = s.NotificationService.Create(
		models.NotificationTypeError,
		title,
		sb.String(),
	)

	// Collect monitor IDs for tracking
	monitorIDs := make([]string, len(downMonitors))
	for i, m := range downMonitors {
		monitorIDs[i] = m.ID
	}
	monitorIDsJSON, _ := json.Marshal(monitorIDs)

	// Record notification event
	event := models.UptimeNotificationEvent{
		HostID:     host.ID,
		EventType:  "down",
		MonitorIDs: string(monitorIDsJSON),
		Message:    sb.String(),
		SentAt:     time.Now(),
	}
	s.DB.Create(&event)

	// Update host notification tracking (feeds the 5-minute throttle in
	// markHostMonitorsDown)
	host.LastNotifiedDown = time.Now()
	host.NotifiedServiceCount = len(downMonitors)
	s.DB.Save(host)

	// Send external notification
	data := map[string]any{
		"HostName":     host.Name,
		"HostIP":       host.Host,
		"Status":       "DOWN",
		"ServiceCount": len(downMonitors),
		"Services":     downMonitors,
		"Time":         time.Now().Format(time.RFC1123),
	}
	s.NotificationService.SendExternal(context.Background(), "uptime", title, sb.String(), data)

	logger.Log().WithField("host_name", host.Name).WithField("service_count", len(downMonitors)).Info("Sent consolidated DOWN notification")
}
|
|
|
|
// CheckMonitor is the exported version for on-demand checks.
// It runs synchronously; callers wanting fire-and-forget should wrap it
// in a goroutine themselves.
func (s *UptimeService) CheckMonitor(monitor models.UptimeMonitor) {
	s.checkMonitor(monitor)
}
|
|
|
|
// checkMonitor runs a single probe for the monitor (HTTP(S) GET or raw TCP
// dial), always records a heartbeat with the raw result, applies retry-based
// debouncing before flipping the status to "down", persists the monitor, and
// triggers batched down / immediate recovery notifications on transitions.
func (s *UptimeService) checkMonitor(monitor models.UptimeMonitor) {
	start := time.Now()
	success := false
	var msg string

	switch monitor.Type {
	case "http", "https":
		validatedURL, err := security.ValidateExternalURL(
			monitor.URL,
			// Uptime monitors are an explicit admin-configured feature and commonly
			// target loopback in local/dev setups (and in unit tests).
			security.WithAllowLocalhost(),
			security.WithAllowHTTP(),
			security.WithTimeout(3*time.Second),
		)
		if err != nil {
			msg = fmt.Sprintf("security validation failed: %s", err.Error())
			break
		}

		client := network.NewSafeHTTPClient(
			network.WithTimeout(10*time.Second),
			network.WithDialTimeout(5*time.Second),
			// Explicit redirect policy per call site: disable.
			network.WithMaxRedirects(0),
			// Uptime monitors are an explicit admin-configured feature and commonly
			// target loopback in local/dev setups (and in unit tests).
			network.WithAllowLocalhost(),
		)

		ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
		defer cancel()
		req, err := http.NewRequestWithContext(ctx, http.MethodGet, validatedURL, http.NoBody)
		if err != nil {
			msg = err.Error()
			break
		}

		resp, err := client.Do(req)
		if err == nil {
			defer func() {
				if err := resp.Body.Close(); err != nil {
					logger.Log().WithError(err).Warn("failed to close uptime service response body")
				}
			}()
			// Accept 2xx, 3xx, and 401/403 (Unauthorized/Forbidden often means the service is up but protected)
			if (resp.StatusCode >= 200 && resp.StatusCode < 400) || resp.StatusCode == 401 || resp.StatusCode == 403 {
				success = true
				msg = fmt.Sprintf("HTTP %d", resp.StatusCode)
			} else {
				msg = fmt.Sprintf("HTTP %d", resp.StatusCode)
			}
		} else {
			msg = err.Error()
		}
	case "tcp":
		// For TCP monitors, URL is a plain host:port target.
		conn, err := net.DialTimeout("tcp", monitor.URL, 10*time.Second)
		if err == nil {
			if err := conn.Close(); err != nil {
				logger.Log().WithError(err).Warn("failed to close tcp connection")
			}
			success = true
			msg = "Connection successful"
		} else {
			msg = err.Error()
		}
	default:
		msg = "Unknown monitor type"
	}

	latency := time.Since(start).Milliseconds()

	// Determine new status based on success and retries
	newStatus := monitor.Status

	if success {
		// If it was down or pending, it's now up immediately
		if monitor.Status != "up" {
			newStatus = "up"
		}
		// Reset failure count on success
		monitor.FailureCount = 0
	} else {
		// Increment failure count
		monitor.FailureCount++

		// Only mark as down if we exceeded max retries
		// Default MaxRetries to 3 if 0 (legacy records)
		maxRetries := monitor.MaxRetries
		if maxRetries <= 0 {
			maxRetries = 3
		}

		if monitor.FailureCount >= maxRetries {
			newStatus = "down"
		}
	}

	// Record Heartbeat (always record the raw result)
	heartbeatStatus := "down"
	if success {
		heartbeatStatus = "up"
	}

	heartbeat := models.UptimeHeartbeat{
		MonitorID: monitor.ID,
		Status:    heartbeatStatus,
		Latency:   latency,
		Message:   msg,
	}
	// Heartbeat persistence is best-effort; a failed insert must not block
	// the status update below.
	_ = s.DB.Create(&heartbeat).Error

	// Update Monitor Status ("pending" -> anything is not a "change")
	oldStatus := monitor.Status
	statusChanged := oldStatus != newStatus && oldStatus != "pending"

	// Calculate previous uptime/downtime if status changed
	var durationStr string
	if statusChanged && !monitor.LastStatusChange.IsZero() {
		duration := time.Since(monitor.LastStatusChange)
		durationStr = formatDuration(duration)
	}

	monitor.Status = newStatus
	monitor.LastCheck = time.Now()
	monitor.Latency = latency

	if statusChanged {
		monitor.LastStatusChange = time.Now()
	}

	s.DB.Save(&monitor)

	// Handle notifications based on status change
	if statusChanged {
		switch newStatus {
		case "down":
			// Queue for batched notification
			s.queueDownNotification(monitor, msg, durationStr)
		case "up":
			// Send recovery notification
			s.sendRecoveryNotification(monitor, durationStr)
		}
	}
}
|
|
|
|
// queueDownNotification adds a down monitor to the batch queue.
// The first down monitor for a host opens a batch and arms a timer that
// flushes after batchWindow; subsequent down monitors for the same host
// join the open batch so a host outage yields one consolidated message.
func (s *UptimeService) queueDownNotification(monitor models.UptimeMonitor, reason, previousUptime string) {
	s.notificationMutex.Lock()
	defer s.notificationMutex.Unlock()

	// Monitors without a host link batch under the "" key.
	hostID := ""
	if monitor.UptimeHostID != nil {
		hostID = *monitor.UptimeHostID
	}

	// Get host info (falls back to the upstream host string if the
	// UptimeHost row can't be loaded)
	var uptimeHost models.UptimeHost
	hostName := monitor.UpstreamHost
	if hostID != "" {
		if err := s.DB.First(&uptimeHost, "id = ?", hostID).Error; err == nil {
			hostName = uptimeHost.Name
		}
	}

	info := monitorDownInfo{
		ID:             monitor.ID,
		Name:           monitor.Name,
		URL:            monitor.URL,
		Message:        reason,
		PreviousUptime: previousUptime,
	}

	if pending, exists := s.pendingNotifications[hostID]; exists {
		// Add to existing batch
		pending.downMonitors = append(pending.downMonitors, info)
		logger.Log().WithField("monitor", util.SanitizeForLog(monitor.Name)).WithField("host", util.SanitizeForLog(hostName)).WithField("count", len(pending.downMonitors)).Info("Added to pending notification batch")
	} else {
		// Create new batch with timer
		pending := &pendingHostNotification{
			hostID:       hostID,
			hostName:     hostName,
			downMonitors: []monitorDownInfo{info},
			createdAt:    time.Now(),
		}

		// Timer fires outside this mutex; flushPendingNotification takes the
		// lock itself.
		pending.timer = time.AfterFunc(s.batchWindow, func() {
			s.flushPendingNotification(hostID)
		})

		s.pendingNotifications[hostID] = pending
		logger.Log().WithField("host", util.SanitizeForLog(hostName)).WithField("monitor", util.SanitizeForLog(monitor.Name)).Info("Created pending notification batch")
	}
}
|
|
|
|
// flushPendingNotification sends the batched notification for hostID.
// The pending entry is removed from the map under the lock, then the
// notification work happens outside the lock. Safe to call when no batch
// exists (no-op), so both the batch timer and FlushPendingNotifications
// can invoke it.
func (s *UptimeService) flushPendingNotification(hostID string) {
	s.notificationMutex.Lock()
	pending, exists := s.pendingNotifications[hostID]
	if !exists {
		s.notificationMutex.Unlock()
		return
	}
	delete(s.pendingNotifications, hostID)
	s.notificationMutex.Unlock()

	// Stop the timer in case we were called eagerly before it fired.
	if pending.timer != nil {
		pending.timer.Stop()
	}

	if len(pending.downMonitors) == 0 {
		return
	}

	// Build and send notification
	var title string
	var sb strings.Builder

	if len(pending.downMonitors) == 1 {
		// Single service down
		m := pending.downMonitors[0]
		title = fmt.Sprintf("🔴 %s is DOWN", m.Name)
		sb.WriteString(fmt.Sprintf("Service: %s\n", m.Name))
		sb.WriteString("Status: DOWN\n")
		sb.WriteString(fmt.Sprintf("Time: %s\n", time.Now().Format(time.RFC1123)))
		if m.PreviousUptime != "" {
			sb.WriteString(fmt.Sprintf("Previous Uptime: %s\n", m.PreviousUptime))
		}
		sb.WriteString(fmt.Sprintf("Reason: %s\n", m.Message))
	} else {
		// Multiple services down
		title = fmt.Sprintf("🔴 %d Services DOWN on %s", len(pending.downMonitors), pending.hostName)
		sb.WriteString(fmt.Sprintf("Host: %s\n", pending.hostName))
		sb.WriteString(fmt.Sprintf("Time: %s\n", time.Now().Format(time.RFC1123)))
		sb.WriteString(fmt.Sprintf("Services affected: %d\n\n", len(pending.downMonitors)))

		sb.WriteString("Impacted services:\n")
		for _, m := range pending.downMonitors {
			if m.PreviousUptime != "" {
				sb.WriteString(fmt.Sprintf("• %s - %s (was up %s)\n", m.Name, m.Message, m.PreviousUptime))
			} else {
				sb.WriteString(fmt.Sprintf("• %s - %s\n", m.Name, m.Message))
			}
		}
	}

	// Store in DB (best-effort; error intentionally ignored)
	_, _ = s.NotificationService.Create(
		models.NotificationTypeError,
		title,
		sb.String(),
	)

	// Send external
	data := map[string]any{
		"HostName":     pending.hostName,
		"Status":       "DOWN",
		"ServiceCount": len(pending.downMonitors),
		"Services":     pending.downMonitors,
		"Time":         time.Now().Format(time.RFC1123),
	}
	s.NotificationService.SendExternal(context.Background(), "uptime", title, sb.String(), data)

	logger.Log().WithField("count", len(pending.downMonitors)).WithField("host", util.SanitizeForLog(pending.hostName)).Info("Sent batched DOWN notification")
}
|
|
|
|
// sendRecoveryNotification sends a notification when a service recovers.
// Recovery notifications are sent immediately (not batched) and include the
// formatted downtime when known.
func (s *UptimeService) sendRecoveryNotification(monitor models.UptimeMonitor, downtime string) {
	title := fmt.Sprintf("🟢 %s is UP", monitor.Name)

	// Build the notification body.
	var sb strings.Builder
	sb.WriteString(fmt.Sprintf("Service: %s\n", monitor.Name))
	sb.WriteString("Status: UP\n")
	sb.WriteString(fmt.Sprintf("Time: %s\n", time.Now().Format(time.RFC1123)))
	if downtime != "" {
		sb.WriteString(fmt.Sprintf("Downtime: %s\n", downtime))
	}

	// Store in-app notification (best-effort; error intentionally ignored)
	_, _ = s.NotificationService.Create(
		models.NotificationTypeSuccess,
		title,
		sb.String(),
	)

	// Push to external notification channels.
	data := map[string]any{
		"Name":     monitor.Name,
		"Status":   "UP",
		"Downtime": downtime,
		"Time":     time.Now().Format(time.RFC1123),
		"URL":      monitor.URL,
	}
	s.NotificationService.SendExternal(context.Background(), "uptime", title, sb.String(), data)
}
|
|
|
|
// FlushPendingNotifications flushes all pending batched notifications immediately.
|
|
// This is useful for testing and graceful shutdown.
|
|
func (s *UptimeService) FlushPendingNotifications() {
|
|
s.notificationMutex.Lock()
|
|
pendingHostIDs := make([]string, 0, len(s.pendingNotifications))
|
|
for hostID := range s.pendingNotifications {
|
|
pendingHostIDs = append(pendingHostIDs, hostID)
|
|
}
|
|
s.notificationMutex.Unlock()
|
|
|
|
for _, hostID := range pendingHostIDs {
|
|
s.flushPendingNotification(hostID)
|
|
}
|
|
}
|
|
|
|
// SyncMonitorForHost updates the uptime monitor linked to a specific proxy host.
// This should be called when a proxy host is edited to keep the monitor in sync.
// Returns nil if no monitor exists for the host (does not create one).
func (s *UptimeService) SyncMonitorForHost(hostID uint) error {
	var host models.ProxyHost
	if err := s.DB.First(&host, hostID).Error; err != nil {
		return err
	}

	var monitor models.UptimeMonitor
	if err := s.DB.Where("proxy_host_id = ?", hostID).First(&monitor).Error; err != nil {
		if errors.Is(err, gorm.ErrRecordNotFound) {
			return nil // No monitor to sync
		}
		return err
	}

	// Update monitor fields based on current proxy host values:
	// the monitored URL is the public URL built from the first domain.
	domains := strings.Split(host.DomainNames, ",")
	firstDomain := ""
	if len(domains) > 0 {
		firstDomain = strings.TrimSpace(domains[0])
	}

	scheme := "http"
	if host.SSLForced {
		scheme = "https"
	}

	// Prefer the explicit host name; fall back to the first domain.
	newName := host.Name
	if newName == "" {
		newName = firstDomain
	}

	monitor.Name = newName
	monitor.URL = fmt.Sprintf("%s://%s", scheme, firstDomain)
	monitor.UpstreamHost = host.ForwardHost

	return s.DB.Save(&monitor).Error
}
|
|
|
|
// CRUD for Monitors
|
|
|
|
func (s *UptimeService) ListMonitors() ([]models.UptimeMonitor, error) {
|
|
var monitors []models.UptimeMonitor
|
|
result := s.DB.Order("name ASC").Find(&monitors)
|
|
return monitors, result.Error
|
|
}
|
|
|
|
func (s *UptimeService) GetMonitorByID(id string) (*models.UptimeMonitor, error) {
|
|
var monitor models.UptimeMonitor
|
|
if err := s.DB.First(&monitor, "id = ?", id).Error; err != nil {
|
|
return nil, err
|
|
}
|
|
return &monitor, nil
|
|
}
|
|
|
|
func (s *UptimeService) GetMonitorHistory(id string, limit int) ([]models.UptimeHeartbeat, error) {
|
|
var heartbeats []models.UptimeHeartbeat
|
|
result := s.DB.Where("monitor_id = ?", id).Order("created_at desc").Limit(limit).Find(&heartbeats)
|
|
return heartbeats, result.Error
|
|
}
|
|
|
|
func (s *UptimeService) UpdateMonitor(id string, updates map[string]any) (*models.UptimeMonitor, error) {
|
|
var monitor models.UptimeMonitor
|
|
if err := s.DB.First(&monitor, "id = ?", id).Error; err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
// Whitelist allowed fields to update
|
|
allowedUpdates := make(map[string]any)
|
|
if val, ok := updates["max_retries"]; ok {
|
|
allowedUpdates["max_retries"] = val
|
|
}
|
|
if val, ok := updates["interval"]; ok {
|
|
allowedUpdates["interval"] = val
|
|
}
|
|
if val, ok := updates["enabled"]; ok {
|
|
allowedUpdates["enabled"] = val
|
|
}
|
|
// Add other fields as needed, but be careful not to overwrite SyncMonitors logic
|
|
|
|
if err := s.DB.Model(&monitor).Updates(allowedUpdates).Error; err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
return &monitor, nil
|
|
}
|
|
|
|
// DeleteMonitor removes a monitor and its heartbeats, and optionally cleans up the parent UptimeHost.
|
|
func (s *UptimeService) DeleteMonitor(id string) error {
|
|
// Find monitor
|
|
var monitor models.UptimeMonitor
|
|
if err := s.DB.First(&monitor, "id = ?", id).Error; err != nil {
|
|
return err
|
|
}
|
|
|
|
// Delete heartbeats
|
|
if err := s.DB.Where("monitor_id = ?", id).Delete(&models.UptimeHeartbeat{}).Error; err != nil {
|
|
return err
|
|
}
|
|
|
|
// Delete the monitor
|
|
if err := s.DB.Delete(&monitor).Error; err != nil {
|
|
return err
|
|
}
|
|
|
|
// If no other monitors reference the uptime host, we don't automatically delete the host.
|
|
// Leave host cleanup to a manual process or separate endpoint.
|
|
|
|
return nil
|
|
}
|