fix: enhance CrowdSec startup logic and verification, improve error handling in Security page

This commit is contained in:
GitHub Actions
2025-12-15 04:04:02 +00:00
parent c395b9d68e
commit aa55d38a82
4 changed files with 257 additions and 56 deletions

View File

@@ -2,6 +2,9 @@ package services
import (
"context"
"os"
"path/filepath"
"strings"
"time"
"github.com/Wikid82/charon/backend/internal/logger"
@@ -21,6 +24,11 @@ type CrowdsecProcessManager interface {
// and starts it if necessary. This handles container restart scenarios where the
// user's preference was to have CrowdSec enabled.
func ReconcileCrowdSecOnStartup(db *gorm.DB, executor CrowdsecProcessManager, binPath, dataDir string) {
logger.Log().WithFields(map[string]interface{}{
"bin_path": binPath,
"data_dir": dataDir,
}).Info("CrowdSec reconciliation: starting startup check")
if db == nil || executor == nil {
logger.Log().Debug("CrowdSec reconciliation skipped: nil db or executor")
return
@@ -42,9 +50,36 @@ func ReconcileCrowdSecOnStartup(db *gorm.DB, executor CrowdsecProcessManager, bi
return
}
// Only auto-start if CrowdSecMode is "local"
if cfg.CrowdSecMode != "local" {
logger.Log().WithField("mode", cfg.CrowdSecMode).Debug("CrowdSec reconciliation skipped: mode is not 'local'")
// Also check for runtime setting override in settings table
var settingOverride struct{ Value string }
crowdSecEnabled := false
if err := db.Raw("SELECT value FROM settings WHERE key = ? LIMIT 1", "security.crowdsec.enabled").Scan(&settingOverride).Error; err == nil && settingOverride.Value != "" {
crowdSecEnabled = strings.EqualFold(settingOverride.Value, "true")
logger.Log().WithFields(map[string]interface{}{
"setting_value": settingOverride.Value,
"crowdsec_enabled": crowdSecEnabled,
}).Debug("CrowdSec reconciliation: found runtime setting override")
}
// Only auto-start if CrowdSecMode is "local" OR runtime setting is enabled
if cfg.CrowdSecMode != "local" && !crowdSecEnabled {
logger.Log().WithFields(map[string]interface{}{
"db_mode": cfg.CrowdSecMode,
"setting_enabled": crowdSecEnabled,
}).Debug("CrowdSec reconciliation skipped: mode is not 'local' and setting not enabled")
return
}
// VALIDATE: Ensure binary exists
if _, err := os.Stat(binPath); os.IsNotExist(err) {
logger.Log().WithField("path", binPath).Error("CrowdSec reconciliation: binary not found, cannot start")
return
}
// VALIDATE: Ensure config directory exists
configPath := filepath.Join(dataDir, "config")
if _, err := os.Stat(configPath); os.IsNotExist(err) {
logger.Log().WithField("path", configPath).Error("CrowdSec reconciliation: config directory not found, cannot start")
return
}
@@ -64,16 +99,46 @@ func ReconcileCrowdSecOnStartup(db *gorm.DB, executor CrowdsecProcessManager, bi
}
// CrowdSec should be running but isn't - start it
logger.Log().Info("CrowdSec reconciliation: starting CrowdSec (mode=local, not currently running)")
logger.Log().WithFields(map[string]interface{}{
"bin_path": binPath,
"data_dir": dataDir,
}).Info("CrowdSec reconciliation: starting CrowdSec (mode=local, not currently running)")
startCtx, startCancel := context.WithTimeout(context.Background(), 30*time.Second)
defer startCancel()
newPid, err := executor.Start(startCtx, binPath, dataDir)
if err != nil {
logger.Log().WithError(err).Error("CrowdSec reconciliation: failed to start CrowdSec")
logger.Log().WithError(err).WithFields(map[string]interface{}{
"bin_path": binPath,
"data_dir": dataDir,
}).Error("CrowdSec reconciliation: FAILED to start CrowdSec - check binary and config")
return
}
logger.Log().WithField("pid", newPid).Info("CrowdSec reconciliation: successfully started CrowdSec")
// VERIFY: Wait briefly and confirm process is actually running
time.Sleep(2 * time.Second)
verifyCtx, verifyCancel := context.WithTimeout(context.Background(), 5*time.Second)
defer verifyCancel()
verifyRunning, verifyPid, verifyErr := executor.Status(verifyCtx, dataDir)
if verifyErr != nil {
logger.Log().WithError(verifyErr).WithField("expected_pid", newPid).Warn("CrowdSec reconciliation: started but failed to verify status")
return
}
if !verifyRunning {
logger.Log().WithFields(map[string]interface{}{
"expected_pid": newPid,
"actual_pid": verifyPid,
"running": verifyRunning,
}).Error("CrowdSec reconciliation: process started but is no longer running - may have crashed")
return
}
logger.Log().WithFields(map[string]interface{}{
"pid": newPid,
"verified": true,
}).Info("CrowdSec reconciliation: successfully started and verified CrowdSec")
}

View File

@@ -2,6 +2,8 @@ package services
import (
"context"
"os"
"path/filepath"
"testing"
"github.com/Wikid82/charon/backend/internal/models"
@@ -37,6 +39,33 @@ func (m *mockCrowdsecExecutor) Status(ctx context.Context, configDir string) (bo
return m.running, m.pid, m.statusErr
}
// smartMockCrowdsecExecutor is a test double for the CrowdSec process manager.
// Unlike a static mock, its Status answer depends on history: it reports the
// process as running only once Start has been invoked, which lets the
// post-start verification step observe a "successful" launch.
type smartMockCrowdsecExecutor struct {
	startCalled  bool  // set to true by Start
	startErr     error // error handed back by Start
	startPid     int   // PID handed back by Start, and by Status after Start
	statusCalled bool  // set to true by Status
	statusErr    error // error handed back by every Status call
}

// Start records that it was invoked and returns the configured PID and error.
func (m *smartMockCrowdsecExecutor) Start(ctx context.Context, binPath, configDir string) (int, error) {
	m.startCalled = true
	pid, err := m.startPid, m.startErr
	return pid, err
}

// Stop is a no-op that always succeeds.
func (m *smartMockCrowdsecExecutor) Stop(ctx context.Context, configDir string) error {
	return nil
}

// Status records that it was invoked. Before Start has been called it reports
// "not running" with PID 0; afterwards it reports "running" with the configured
// PID. The configured status error is returned in both cases.
func (m *smartMockCrowdsecExecutor) Status(ctx context.Context, configDir string) (bool, int, error) {
	m.statusCalled = true
	if !m.startCalled {
		return false, 0, m.statusErr
	}
	return true, m.startPid, m.statusErr
}
func setupCrowdsecTestDB(t *testing.T) *gorm.DB {
db, err := gorm.Open(sqlite.Open(":memory:"), &gorm.Config{
Logger: gormlogger.Default.LogMode(gormlogger.Silent),
@@ -49,6 +78,36 @@ func setupCrowdsecTestDB(t *testing.T) *gorm.DB {
return db
}
// setupCrowdsecTestFixtures builds a throwaway on-disk layout for tests: a stub
// "crowdsec" binary file and a data directory containing a "config"
// subdirectory (validation checks dataDir/config).
//
// It returns the stub binary path, the data directory path (to be passed as
// dataDir), and a cleanup function. The fixture root comes from t.TempDir, so it
// is removed automatically when the test finishes, even if a setup step fails
// before cleanup can be handed back to the caller. Calling cleanup remains safe
// and simply removes the fixtures early.
func setupCrowdsecTestFixtures(t *testing.T) (binPath, dataDir string, cleanup func()) {
	t.Helper()

	// t.TempDir registers its own removal with the test, so nothing leaks if a
	// require below aborts the test part-way through setup.
	tempDir := t.TempDir()

	// Create mock binary file.
	binPath = filepath.Join(tempDir, "crowdsec")
	require.NoError(t, os.WriteFile(binPath, []byte("#!/bin/sh\nexit 0\n"), 0o755))

	// Create the data directory and its config subdirectory in one step;
	// MkdirAll creates the missing parent (dataDir) as well.
	dataDir = filepath.Join(tempDir, "data")
	require.NoError(t, os.MkdirAll(filepath.Join(dataDir, "config"), 0o755))

	cleanup = func() {
		// Best effort: t.TempDir's own cleanup removes anything left behind and
		// reports failures, so an error here is deliberately ignored.
		_ = os.RemoveAll(tempDir)
	}
	return binPath, dataDir, cleanup
}
func TestReconcileCrowdSecOnStartup_NilDB(t *testing.T) {
exec := &mockCrowdsecExecutor{}
@@ -95,6 +154,9 @@ func TestReconcileCrowdSecOnStartup_ModeDisabled(t *testing.T) {
func TestReconcileCrowdSecOnStartup_ModeLocal_AlreadyRunning(t *testing.T) {
db := setupCrowdsecTestDB(t)
binPath, dataDir, cleanup := setupCrowdsecTestFixtures(t)
defer cleanup()
exec := &mockCrowdsecExecutor{
running: true,
pid: 12345,
@@ -106,7 +168,7 @@ func TestReconcileCrowdSecOnStartup_ModeLocal_AlreadyRunning(t *testing.T) {
}
require.NoError(t, db.Create(&cfg).Error)
ReconcileCrowdSecOnStartup(db, exec, "crowdsec", "/tmp/crowdsec")
ReconcileCrowdSecOnStartup(db, exec, binPath, dataDir)
assert.True(t, exec.statusCalled)
assert.False(t, exec.startCalled, "Should not start if already running")
@@ -114,10 +176,19 @@ func TestReconcileCrowdSecOnStartup_ModeLocal_AlreadyRunning(t *testing.T) {
func TestReconcileCrowdSecOnStartup_ModeLocal_NotRunning_Starts(t *testing.T) {
db := setupCrowdsecTestDB(t)
binPath, configDir, cleanup := setupCrowdsecTestFixtures(t)
defer cleanup()
// Mock executor returns not running initially, then running after start
statusCallCount := 0
exec := &mockCrowdsecExecutor{
running: false,
startPid: 99999,
}
// Override Status to return running=true on second call (post-start verification)
originalStatus := exec.Status
_ = originalStatus // silence unused warning
exec.running = false
// Create SecurityConfig with mode=local
cfg := models.SecurityConfig{
@@ -125,14 +196,23 @@ func TestReconcileCrowdSecOnStartup_ModeLocal_NotRunning_Starts(t *testing.T) {
}
require.NoError(t, db.Create(&cfg).Error)
ReconcileCrowdSecOnStartup(db, exec, "crowdsec", "/tmp/crowdsec")
// We need a smarter mock that returns running=true after Start is called
smartExec := &smartMockCrowdsecExecutor{
startPid: 99999,
}
assert.True(t, exec.statusCalled)
assert.True(t, exec.startCalled, "Should start if mode=local and not running")
ReconcileCrowdSecOnStartup(db, smartExec, binPath, configDir)
assert.True(t, smartExec.statusCalled)
assert.True(t, smartExec.startCalled, "Should start if mode=local and not running")
_ = statusCallCount // silence unused warning
}
func TestReconcileCrowdSecOnStartup_ModeLocal_StartError(t *testing.T) {
db := setupCrowdsecTestDB(t)
binPath, dataDir, cleanup := setupCrowdsecTestFixtures(t)
defer cleanup()
exec := &mockCrowdsecExecutor{
running: false,
startErr: assert.AnError,
@@ -145,13 +225,16 @@ func TestReconcileCrowdSecOnStartup_ModeLocal_StartError(t *testing.T) {
require.NoError(t, db.Create(&cfg).Error)
// Should not panic on start error
ReconcileCrowdSecOnStartup(db, exec, "crowdsec", "/tmp/crowdsec")
ReconcileCrowdSecOnStartup(db, exec, binPath, dataDir)
assert.True(t, exec.startCalled)
}
func TestReconcileCrowdSecOnStartup_StatusError(t *testing.T) {
db := setupCrowdsecTestDB(t)
binPath, dataDir, cleanup := setupCrowdsecTestFixtures(t)
defer cleanup()
exec := &mockCrowdsecExecutor{
statusErr: assert.AnError,
}
@@ -163,7 +246,7 @@ func TestReconcileCrowdSecOnStartup_StatusError(t *testing.T) {
require.NoError(t, db.Create(&cfg).Error)
// Should not panic on status error and should not attempt start
ReconcileCrowdSecOnStartup(db, exec, "crowdsec", "/tmp/crowdsec")
ReconcileCrowdSecOnStartup(db, exec, binPath, dataDir)
assert.True(t, exec.statusCalled)
assert.False(t, exec.startCalled, "Should not start if status check fails")

View File

@@ -230,33 +230,54 @@ func (w *LogWatcher) ParseLogEntry(line string) *models.SecurityLogEntry {
// detectSecurityEvent analyzes the log entry and sets security-related fields.
func (w *LogWatcher) detectSecurityEvent(entry *models.SecurityLogEntry, caddyLog *models.CaddyAccessLog) {
// Check for WAF blocks (typically 403 with specific headers or logger)
if caddyLog.Status == 403 {
loggerLower := strings.ToLower(caddyLog.Logger)
// Check for WAF/Coraza indicators (highest priority for 403s)
if strings.Contains(loggerLower, "waf") ||
strings.Contains(loggerLower, "coraza") ||
hasHeader(caddyLog.RespHeaders, "X-Coraza-Id") ||
hasHeader(caddyLog.RespHeaders, "X-Coraza-Rule-Id") {
entry.Blocked = true
entry.Source = "waf"
entry.Level = "warn"
entry.BlockReason = "WAF rule triggered"
// Check for WAF/Coraza indicators
if caddyLog.Logger == "http.handlers.waf" ||
hasHeader(caddyLog.RespHeaders, "X-Coraza-Id") ||
strings.Contains(caddyLog.Logger, "coraza") {
entry.Source = "waf"
entry.BlockReason = "WAF rule triggered"
// Try to extract rule ID from headers
if ruleID, ok := caddyLog.RespHeaders["X-Coraza-Id"]; ok && len(ruleID) > 0 {
entry.Details["rule_id"] = ruleID[0]
}
} else if hasHeader(caddyLog.RespHeaders, "X-Crowdsec-Decision") ||
strings.Contains(caddyLog.Logger, "crowdsec") {
entry.Source = "crowdsec"
entry.BlockReason = "CrowdSec decision"
} else if hasHeader(caddyLog.Request.Headers, "X-Acl-Denied") {
entry.Source = "acl"
entry.BlockReason = "Access list denied"
} else {
entry.Source = "cerberus"
entry.BlockReason = "Access denied"
// Try to extract rule ID from headers
if ruleID, ok := caddyLog.RespHeaders["X-Coraza-Id"]; ok && len(ruleID) > 0 {
entry.Details["rule_id"] = ruleID[0]
}
if ruleID, ok := caddyLog.RespHeaders["X-Coraza-Rule-Id"]; ok && len(ruleID) > 0 {
entry.Details["rule_id"] = ruleID[0]
}
return
}
// Check for CrowdSec indicators
if strings.Contains(loggerLower, "crowdsec") ||
strings.Contains(loggerLower, "bouncer") ||
hasHeader(caddyLog.RespHeaders, "X-Crowdsec-Decision") ||
hasHeader(caddyLog.RespHeaders, "X-Crowdsec-Origin") {
entry.Blocked = true
entry.Source = "crowdsec"
entry.Level = "warn"
entry.BlockReason = "CrowdSec decision"
// Extract CrowdSec-specific headers
if origin, ok := caddyLog.RespHeaders["X-Crowdsec-Origin"]; ok && len(origin) > 0 {
entry.Details["crowdsec_origin"] = origin[0]
}
return
}
// Check for ACL blocks
if strings.Contains(loggerLower, "acl") ||
hasHeader(caddyLog.RespHeaders, "X-Acl-Denied") ||
hasHeader(caddyLog.RespHeaders, "X-Blocked-By-Acl") {
entry.Blocked = true
entry.Source = "acl"
entry.Level = "warn"
entry.BlockReason = "Access list denied"
return
}
// Check for rate limiting (429 Too Many Requests)
@@ -273,6 +294,19 @@ func (w *LogWatcher) detectSecurityEvent(entry *models.SecurityLogEntry, caddyLo
if reset, ok := caddyLog.RespHeaders["X-Ratelimit-Reset"]; ok && len(reset) > 0 {
entry.Details["ratelimit_reset"] = reset[0]
}
if limit, ok := caddyLog.RespHeaders["X-Ratelimit-Limit"]; ok && len(limit) > 0 {
entry.Details["ratelimit_limit"] = limit[0]
}
return
}
// Check for other 403s (generic security block)
if caddyLog.Status == 403 {
entry.Blocked = true
entry.Source = "cerberus"
entry.Level = "warn"
entry.BlockReason = "Access denied"
return
}
// Check for authentication failures
@@ -280,11 +314,22 @@ func (w *LogWatcher) detectSecurityEvent(entry *models.SecurityLogEntry, caddyLo
entry.Level = "warn"
entry.Source = "auth"
entry.Details["auth_failure"] = true
return
}
// Check for server errors
if caddyLog.Status >= 500 {
entry.Level = "error"
return
}
// Normal traffic - set appropriate level based on status
entry.Source = "normal"
entry.Blocked = false
if caddyLog.Status >= 400 {
entry.Level = "warn"
} else {
entry.Level = "info"
}
}

View File

@@ -84,42 +84,50 @@ export default function Security() {
const crowdsecPowerMutation = useMutation({
mutationFn: async (enabled: boolean) => {
// Update setting first
await updateSetting('security.crowdsec.enabled', enabled ? 'true' : 'false', 'security', 'bool')
if (enabled) {
toast.info('Starting CrowdSec... This may take up to 30 seconds')
const result = await startCrowdsec()
// VERIFY: Check if it actually started
const status = await statusCrowdsec()
if (!status.running) {
// Revert the setting since process didn't start
await updateSetting('security.crowdsec.enabled', 'false', 'security', 'bool')
throw new Error('CrowdSec process failed to start. Check server logs for details.')
}
return result
} else {
await stopCrowdsec()
// VERIFY: Check if it actually stopped (with brief delay for cleanup)
await new Promise(resolve => setTimeout(resolve, 500))
const status = await statusCrowdsec()
if (status.running) {
throw new Error('CrowdSec process still running. Check server logs for details.')
}
return { enabled: false }
}
},
onMutate: async (enabled: boolean) => {
await queryClient.cancelQueries({ queryKey: ['security-status'] })
const previous = queryClient.getQueryData(['security-status'])
queryClient.setQueryData(['security-status'], (old: unknown) => {
if (!old || typeof old !== 'object') return old
const copy = { ...(old as SecurityStatus) }
if (copy.crowdsec && typeof copy.crowdsec === 'object') {
copy.crowdsec = { ...copy.crowdsec, enabled } as never
}
return copy
})
setCrowdsecStatus(prev => prev ? { ...prev, running: enabled } : prev)
return { previous }
},
onError: (err: unknown, enabled: boolean, context: unknown) => {
if (context && typeof context === 'object' && 'previous' in context) {
queryClient.setQueryData(['security-status'], context.previous)
}
// NO optimistic updates - wait for actual confirmation
onError: (err: unknown, enabled: boolean) => {
const msg = err instanceof Error ? err.message : String(err)
toast.error(enabled ? `Failed to start CrowdSec: ${msg}` : `Failed to stop CrowdSec: ${msg}`)
// Force refresh status from backend to ensure UI matches reality
queryClient.invalidateQueries({ queryKey: ['security-status'] })
fetchCrowdsecStatus()
},
onSuccess: async (result: { lapi_ready?: boolean; enabled?: boolean } | boolean) => {
await fetchCrowdsecStatus()
queryClient.invalidateQueries({ queryKey: ['security-status'] })
queryClient.invalidateQueries({ queryKey: ['settings'] })
// Refresh all related queries to ensure consistency
await Promise.all([
queryClient.invalidateQueries({ queryKey: ['security-status'] }),
queryClient.invalidateQueries({ queryKey: ['settings'] }),
fetchCrowdsecStatus(),
])
if (typeof result === 'object' && result.lapi_ready === true) {
toast.success('CrowdSec started and LAPI is ready')