fix: enhance CrowdSec startup logic and verification, improve error handling in Security page
This commit is contained in:
@@ -2,6 +2,9 @@ package services
|
||||
|
||||
import (
|
||||
"context"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/Wikid82/charon/backend/internal/logger"
|
||||
@@ -21,6 +24,11 @@ type CrowdsecProcessManager interface {
|
||||
// and starts it if necessary. This handles container restart scenarios where the
|
||||
// user's preference was to have CrowdSec enabled.
|
||||
func ReconcileCrowdSecOnStartup(db *gorm.DB, executor CrowdsecProcessManager, binPath, dataDir string) {
|
||||
logger.Log().WithFields(map[string]interface{}{
|
||||
"bin_path": binPath,
|
||||
"data_dir": dataDir,
|
||||
}).Info("CrowdSec reconciliation: starting startup check")
|
||||
|
||||
if db == nil || executor == nil {
|
||||
logger.Log().Debug("CrowdSec reconciliation skipped: nil db or executor")
|
||||
return
|
||||
@@ -42,9 +50,36 @@ func ReconcileCrowdSecOnStartup(db *gorm.DB, executor CrowdsecProcessManager, bi
|
||||
return
|
||||
}
|
||||
|
||||
// Only auto-start if CrowdSecMode is "local"
|
||||
if cfg.CrowdSecMode != "local" {
|
||||
logger.Log().WithField("mode", cfg.CrowdSecMode).Debug("CrowdSec reconciliation skipped: mode is not 'local'")
|
||||
// Also check for runtime setting override in settings table
|
||||
var settingOverride struct{ Value string }
|
||||
crowdSecEnabled := false
|
||||
if err := db.Raw("SELECT value FROM settings WHERE key = ? LIMIT 1", "security.crowdsec.enabled").Scan(&settingOverride).Error; err == nil && settingOverride.Value != "" {
|
||||
crowdSecEnabled = strings.EqualFold(settingOverride.Value, "true")
|
||||
logger.Log().WithFields(map[string]interface{}{
|
||||
"setting_value": settingOverride.Value,
|
||||
"crowdsec_enabled": crowdSecEnabled,
|
||||
}).Debug("CrowdSec reconciliation: found runtime setting override")
|
||||
}
|
||||
|
||||
// Only auto-start if CrowdSecMode is "local" OR runtime setting is enabled
|
||||
if cfg.CrowdSecMode != "local" && !crowdSecEnabled {
|
||||
logger.Log().WithFields(map[string]interface{}{
|
||||
"db_mode": cfg.CrowdSecMode,
|
||||
"setting_enabled": crowdSecEnabled,
|
||||
}).Debug("CrowdSec reconciliation skipped: mode is not 'local' and setting not enabled")
|
||||
return
|
||||
}
|
||||
|
||||
// VALIDATE: Ensure binary exists
|
||||
if _, err := os.Stat(binPath); os.IsNotExist(err) {
|
||||
logger.Log().WithField("path", binPath).Error("CrowdSec reconciliation: binary not found, cannot start")
|
||||
return
|
||||
}
|
||||
|
||||
// VALIDATE: Ensure config directory exists
|
||||
configPath := filepath.Join(dataDir, "config")
|
||||
if _, err := os.Stat(configPath); os.IsNotExist(err) {
|
||||
logger.Log().WithField("path", configPath).Error("CrowdSec reconciliation: config directory not found, cannot start")
|
||||
return
|
||||
}
|
||||
|
||||
@@ -64,16 +99,46 @@ func ReconcileCrowdSecOnStartup(db *gorm.DB, executor CrowdsecProcessManager, bi
|
||||
}
|
||||
|
||||
// CrowdSec should be running but isn't - start it
|
||||
logger.Log().Info("CrowdSec reconciliation: starting CrowdSec (mode=local, not currently running)")
|
||||
logger.Log().WithFields(map[string]interface{}{
|
||||
"bin_path": binPath,
|
||||
"data_dir": dataDir,
|
||||
}).Info("CrowdSec reconciliation: starting CrowdSec (mode=local, not currently running)")
|
||||
|
||||
startCtx, startCancel := context.WithTimeout(context.Background(), 30*time.Second)
|
||||
defer startCancel()
|
||||
|
||||
newPid, err := executor.Start(startCtx, binPath, dataDir)
|
||||
if err != nil {
|
||||
logger.Log().WithError(err).Error("CrowdSec reconciliation: failed to start CrowdSec")
|
||||
logger.Log().WithError(err).WithFields(map[string]interface{}{
|
||||
"bin_path": binPath,
|
||||
"data_dir": dataDir,
|
||||
}).Error("CrowdSec reconciliation: FAILED to start CrowdSec - check binary and config")
|
||||
return
|
||||
}
|
||||
|
||||
logger.Log().WithField("pid", newPid).Info("CrowdSec reconciliation: successfully started CrowdSec")
|
||||
// VERIFY: Wait briefly and confirm process is actually running
|
||||
time.Sleep(2 * time.Second)
|
||||
|
||||
verifyCtx, verifyCancel := context.WithTimeout(context.Background(), 5*time.Second)
|
||||
defer verifyCancel()
|
||||
|
||||
verifyRunning, verifyPid, verifyErr := executor.Status(verifyCtx, dataDir)
|
||||
if verifyErr != nil {
|
||||
logger.Log().WithError(verifyErr).WithField("expected_pid", newPid).Warn("CrowdSec reconciliation: started but failed to verify status")
|
||||
return
|
||||
}
|
||||
|
||||
if !verifyRunning {
|
||||
logger.Log().WithFields(map[string]interface{}{
|
||||
"expected_pid": newPid,
|
||||
"actual_pid": verifyPid,
|
||||
"running": verifyRunning,
|
||||
}).Error("CrowdSec reconciliation: process started but is no longer running - may have crashed")
|
||||
return
|
||||
}
|
||||
|
||||
logger.Log().WithFields(map[string]interface{}{
|
||||
"pid": newPid,
|
||||
"verified": true,
|
||||
}).Info("CrowdSec reconciliation: successfully started and verified CrowdSec")
|
||||
}
|
||||
|
||||
@@ -2,6 +2,8 @@ package services
|
||||
|
||||
import (
|
||||
"context"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
|
||||
"github.com/Wikid82/charon/backend/internal/models"
|
||||
@@ -37,6 +39,33 @@ func (m *mockCrowdsecExecutor) Status(ctx context.Context, configDir string) (bo
|
||||
return m.running, m.pid, m.statusErr
|
||||
}
|
||||
|
||||
// smartMockCrowdsecExecutor is a stateful test double for the CrowdSec process
// manager. Unlike a mock with a fixed "running" value, it reports the process
// as running only after a successful Start, which lets tests exercise the
// post-start verification step.
type smartMockCrowdsecExecutor struct {
	startCalled  bool  // set when Start is invoked, regardless of outcome
	startErr     error // error returned by Start; non-nil simulates a failed launch
	startPid     int   // PID returned by Start and by Status while running
	statusCalled bool  // set when Status is invoked
	statusErr    error // error returned by every Status call
	running      bool  // true between a successful Start and the next Stop
}

// Start records the call and returns the configured PID and error. The mock
// is marked as running only when no start error is configured, so a failed
// launch is never reported as a live process by Status.
func (m *smartMockCrowdsecExecutor) Start(ctx context.Context, binPath, configDir string) (int, error) {
	m.startCalled = true
	m.running = m.startErr == nil
	return m.startPid, m.startErr
}

// Stop marks the simulated process as no longer running and always succeeds.
func (m *smartMockCrowdsecExecutor) Stop(ctx context.Context, configDir string) error {
	m.running = false
	return nil
}

// Status records the call and reports (true, startPid) while the simulated
// process is running, or (false, 0) otherwise. The configured statusErr is
// returned in both cases.
func (m *smartMockCrowdsecExecutor) Status(ctx context.Context, configDir string) (bool, int, error) {
	m.statusCalled = true
	if !m.running {
		return false, 0, m.statusErr
	}
	return true, m.startPid, m.statusErr
}
|
||||
|
||||
func setupCrowdsecTestDB(t *testing.T) *gorm.DB {
|
||||
db, err := gorm.Open(sqlite.Open(":memory:"), &gorm.Config{
|
||||
Logger: gormlogger.Default.LogMode(gormlogger.Silent),
|
||||
@@ -49,6 +78,36 @@ func setupCrowdsecTestDB(t *testing.T) *gorm.DB {
|
||||
return db
|
||||
}
|
||||
|
||||
// setupCrowdsecTestFixtures creates temporary binary and config directory for testing
|
||||
func setupCrowdsecTestFixtures(t *testing.T) (binPath, dataDir string, cleanup func()) {
|
||||
t.Helper()
|
||||
|
||||
// Create temp directory
|
||||
tempDir, err := os.MkdirTemp("", "crowdsec-test-*")
|
||||
require.NoError(t, err)
|
||||
|
||||
// Create mock binary file
|
||||
binPath = filepath.Join(tempDir, "crowdsec")
|
||||
err = os.WriteFile(binPath, []byte("#!/bin/sh\nexit 0\n"), 0o755)
|
||||
require.NoError(t, err)
|
||||
|
||||
// Create data directory (passed as dataDir to the function)
|
||||
dataDir = filepath.Join(tempDir, "data")
|
||||
err = os.MkdirAll(dataDir, 0o755)
|
||||
require.NoError(t, err)
|
||||
|
||||
// Create config directory inside data dir (validation checks dataDir/config)
|
||||
configDir := filepath.Join(dataDir, "config")
|
||||
err = os.MkdirAll(configDir, 0o755)
|
||||
require.NoError(t, err)
|
||||
|
||||
cleanup = func() {
|
||||
os.RemoveAll(tempDir)
|
||||
}
|
||||
|
||||
return binPath, dataDir, cleanup
|
||||
}
|
||||
|
||||
func TestReconcileCrowdSecOnStartup_NilDB(t *testing.T) {
|
||||
exec := &mockCrowdsecExecutor{}
|
||||
|
||||
@@ -95,6 +154,9 @@ func TestReconcileCrowdSecOnStartup_ModeDisabled(t *testing.T) {
|
||||
|
||||
func TestReconcileCrowdSecOnStartup_ModeLocal_AlreadyRunning(t *testing.T) {
|
||||
db := setupCrowdsecTestDB(t)
|
||||
binPath, dataDir, cleanup := setupCrowdsecTestFixtures(t)
|
||||
defer cleanup()
|
||||
|
||||
exec := &mockCrowdsecExecutor{
|
||||
running: true,
|
||||
pid: 12345,
|
||||
@@ -106,7 +168,7 @@ func TestReconcileCrowdSecOnStartup_ModeLocal_AlreadyRunning(t *testing.T) {
|
||||
}
|
||||
require.NoError(t, db.Create(&cfg).Error)
|
||||
|
||||
ReconcileCrowdSecOnStartup(db, exec, "crowdsec", "/tmp/crowdsec")
|
||||
ReconcileCrowdSecOnStartup(db, exec, binPath, dataDir)
|
||||
|
||||
assert.True(t, exec.statusCalled)
|
||||
assert.False(t, exec.startCalled, "Should not start if already running")
|
||||
@@ -114,10 +176,19 @@ func TestReconcileCrowdSecOnStartup_ModeLocal_AlreadyRunning(t *testing.T) {
|
||||
|
||||
func TestReconcileCrowdSecOnStartup_ModeLocal_NotRunning_Starts(t *testing.T) {
|
||||
db := setupCrowdsecTestDB(t)
|
||||
binPath, configDir, cleanup := setupCrowdsecTestFixtures(t)
|
||||
defer cleanup()
|
||||
|
||||
// Mock executor returns not running initially, then running after start
|
||||
statusCallCount := 0
|
||||
exec := &mockCrowdsecExecutor{
|
||||
running: false,
|
||||
startPid: 99999,
|
||||
}
|
||||
// Override Status to return running=true on second call (post-start verification)
|
||||
originalStatus := exec.Status
|
||||
_ = originalStatus // silence unused warning
|
||||
exec.running = false
|
||||
|
||||
// Create SecurityConfig with mode=local
|
||||
cfg := models.SecurityConfig{
|
||||
@@ -125,14 +196,23 @@ func TestReconcileCrowdSecOnStartup_ModeLocal_NotRunning_Starts(t *testing.T) {
|
||||
}
|
||||
require.NoError(t, db.Create(&cfg).Error)
|
||||
|
||||
ReconcileCrowdSecOnStartup(db, exec, "crowdsec", "/tmp/crowdsec")
|
||||
// We need a smarter mock that returns running=true after Start is called
|
||||
smartExec := &smartMockCrowdsecExecutor{
|
||||
startPid: 99999,
|
||||
}
|
||||
|
||||
assert.True(t, exec.statusCalled)
|
||||
assert.True(t, exec.startCalled, "Should start if mode=local and not running")
|
||||
ReconcileCrowdSecOnStartup(db, smartExec, binPath, configDir)
|
||||
|
||||
assert.True(t, smartExec.statusCalled)
|
||||
assert.True(t, smartExec.startCalled, "Should start if mode=local and not running")
|
||||
_ = statusCallCount // silence unused warning
|
||||
}
|
||||
|
||||
func TestReconcileCrowdSecOnStartup_ModeLocal_StartError(t *testing.T) {
|
||||
db := setupCrowdsecTestDB(t)
|
||||
binPath, dataDir, cleanup := setupCrowdsecTestFixtures(t)
|
||||
defer cleanup()
|
||||
|
||||
exec := &mockCrowdsecExecutor{
|
||||
running: false,
|
||||
startErr: assert.AnError,
|
||||
@@ -145,13 +225,16 @@ func TestReconcileCrowdSecOnStartup_ModeLocal_StartError(t *testing.T) {
|
||||
require.NoError(t, db.Create(&cfg).Error)
|
||||
|
||||
// Should not panic on start error
|
||||
ReconcileCrowdSecOnStartup(db, exec, "crowdsec", "/tmp/crowdsec")
|
||||
ReconcileCrowdSecOnStartup(db, exec, binPath, dataDir)
|
||||
|
||||
assert.True(t, exec.startCalled)
|
||||
}
|
||||
|
||||
func TestReconcileCrowdSecOnStartup_StatusError(t *testing.T) {
|
||||
db := setupCrowdsecTestDB(t)
|
||||
binPath, dataDir, cleanup := setupCrowdsecTestFixtures(t)
|
||||
defer cleanup()
|
||||
|
||||
exec := &mockCrowdsecExecutor{
|
||||
statusErr: assert.AnError,
|
||||
}
|
||||
@@ -163,7 +246,7 @@ func TestReconcileCrowdSecOnStartup_StatusError(t *testing.T) {
|
||||
require.NoError(t, db.Create(&cfg).Error)
|
||||
|
||||
// Should not panic on status error and should not attempt start
|
||||
ReconcileCrowdSecOnStartup(db, exec, "crowdsec", "/tmp/crowdsec")
|
||||
ReconcileCrowdSecOnStartup(db, exec, binPath, dataDir)
|
||||
|
||||
assert.True(t, exec.statusCalled)
|
||||
assert.False(t, exec.startCalled, "Should not start if status check fails")
|
||||
|
||||
@@ -230,33 +230,54 @@ func (w *LogWatcher) ParseLogEntry(line string) *models.SecurityLogEntry {
|
||||
|
||||
// detectSecurityEvent analyzes the log entry and sets security-related fields.
|
||||
func (w *LogWatcher) detectSecurityEvent(entry *models.SecurityLogEntry, caddyLog *models.CaddyAccessLog) {
|
||||
// Check for WAF blocks (typically 403 with specific headers or logger)
|
||||
if caddyLog.Status == 403 {
|
||||
loggerLower := strings.ToLower(caddyLog.Logger)
|
||||
|
||||
// Check for WAF/Coraza indicators (highest priority for 403s)
|
||||
if strings.Contains(loggerLower, "waf") ||
|
||||
strings.Contains(loggerLower, "coraza") ||
|
||||
hasHeader(caddyLog.RespHeaders, "X-Coraza-Id") ||
|
||||
hasHeader(caddyLog.RespHeaders, "X-Coraza-Rule-Id") {
|
||||
entry.Blocked = true
|
||||
entry.Source = "waf"
|
||||
entry.Level = "warn"
|
||||
entry.BlockReason = "WAF rule triggered"
|
||||
|
||||
// Check for WAF/Coraza indicators
|
||||
if caddyLog.Logger == "http.handlers.waf" ||
|
||||
hasHeader(caddyLog.RespHeaders, "X-Coraza-Id") ||
|
||||
strings.Contains(caddyLog.Logger, "coraza") {
|
||||
entry.Source = "waf"
|
||||
entry.BlockReason = "WAF rule triggered"
|
||||
|
||||
// Try to extract rule ID from headers
|
||||
if ruleID, ok := caddyLog.RespHeaders["X-Coraza-Id"]; ok && len(ruleID) > 0 {
|
||||
entry.Details["rule_id"] = ruleID[0]
|
||||
}
|
||||
} else if hasHeader(caddyLog.RespHeaders, "X-Crowdsec-Decision") ||
|
||||
strings.Contains(caddyLog.Logger, "crowdsec") {
|
||||
entry.Source = "crowdsec"
|
||||
entry.BlockReason = "CrowdSec decision"
|
||||
} else if hasHeader(caddyLog.Request.Headers, "X-Acl-Denied") {
|
||||
entry.Source = "acl"
|
||||
entry.BlockReason = "Access list denied"
|
||||
} else {
|
||||
entry.Source = "cerberus"
|
||||
entry.BlockReason = "Access denied"
|
||||
// Try to extract rule ID from headers
|
||||
if ruleID, ok := caddyLog.RespHeaders["X-Coraza-Id"]; ok && len(ruleID) > 0 {
|
||||
entry.Details["rule_id"] = ruleID[0]
|
||||
}
|
||||
if ruleID, ok := caddyLog.RespHeaders["X-Coraza-Rule-Id"]; ok && len(ruleID) > 0 {
|
||||
entry.Details["rule_id"] = ruleID[0]
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
// Check for CrowdSec indicators
|
||||
if strings.Contains(loggerLower, "crowdsec") ||
|
||||
strings.Contains(loggerLower, "bouncer") ||
|
||||
hasHeader(caddyLog.RespHeaders, "X-Crowdsec-Decision") ||
|
||||
hasHeader(caddyLog.RespHeaders, "X-Crowdsec-Origin") {
|
||||
entry.Blocked = true
|
||||
entry.Source = "crowdsec"
|
||||
entry.Level = "warn"
|
||||
entry.BlockReason = "CrowdSec decision"
|
||||
|
||||
// Extract CrowdSec-specific headers
|
||||
if origin, ok := caddyLog.RespHeaders["X-Crowdsec-Origin"]; ok && len(origin) > 0 {
|
||||
entry.Details["crowdsec_origin"] = origin[0]
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
// Check for ACL blocks
|
||||
if strings.Contains(loggerLower, "acl") ||
|
||||
hasHeader(caddyLog.RespHeaders, "X-Acl-Denied") ||
|
||||
hasHeader(caddyLog.RespHeaders, "X-Blocked-By-Acl") {
|
||||
entry.Blocked = true
|
||||
entry.Source = "acl"
|
||||
entry.Level = "warn"
|
||||
entry.BlockReason = "Access list denied"
|
||||
return
|
||||
}
|
||||
|
||||
// Check for rate limiting (429 Too Many Requests)
|
||||
@@ -273,6 +294,19 @@ func (w *LogWatcher) detectSecurityEvent(entry *models.SecurityLogEntry, caddyLo
|
||||
if reset, ok := caddyLog.RespHeaders["X-Ratelimit-Reset"]; ok && len(reset) > 0 {
|
||||
entry.Details["ratelimit_reset"] = reset[0]
|
||||
}
|
||||
if limit, ok := caddyLog.RespHeaders["X-Ratelimit-Limit"]; ok && len(limit) > 0 {
|
||||
entry.Details["ratelimit_limit"] = limit[0]
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
// Check for other 403s (generic security block)
|
||||
if caddyLog.Status == 403 {
|
||||
entry.Blocked = true
|
||||
entry.Source = "cerberus"
|
||||
entry.Level = "warn"
|
||||
entry.BlockReason = "Access denied"
|
||||
return
|
||||
}
|
||||
|
||||
// Check for authentication failures
|
||||
@@ -280,11 +314,22 @@ func (w *LogWatcher) detectSecurityEvent(entry *models.SecurityLogEntry, caddyLo
|
||||
entry.Level = "warn"
|
||||
entry.Source = "auth"
|
||||
entry.Details["auth_failure"] = true
|
||||
return
|
||||
}
|
||||
|
||||
// Check for server errors
|
||||
if caddyLog.Status >= 500 {
|
||||
entry.Level = "error"
|
||||
return
|
||||
}
|
||||
|
||||
// Normal traffic - set appropriate level based on status
|
||||
entry.Source = "normal"
|
||||
entry.Blocked = false
|
||||
if caddyLog.Status >= 400 {
|
||||
entry.Level = "warn"
|
||||
} else {
|
||||
entry.Level = "info"
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -84,42 +84,50 @@ export default function Security() {
|
||||
|
||||
const crowdsecPowerMutation = useMutation({
|
||||
mutationFn: async (enabled: boolean) => {
|
||||
// Update setting first
|
||||
await updateSetting('security.crowdsec.enabled', enabled ? 'true' : 'false', 'security', 'bool')
|
||||
|
||||
if (enabled) {
|
||||
toast.info('Starting CrowdSec... This may take up to 30 seconds')
|
||||
const result = await startCrowdsec()
|
||||
|
||||
// VERIFY: Check if it actually started
|
||||
const status = await statusCrowdsec()
|
||||
if (!status.running) {
|
||||
// Revert the setting since process didn't start
|
||||
await updateSetting('security.crowdsec.enabled', 'false', 'security', 'bool')
|
||||
throw new Error('CrowdSec process failed to start. Check server logs for details.')
|
||||
}
|
||||
|
||||
return result
|
||||
} else {
|
||||
await stopCrowdsec()
|
||||
|
||||
// VERIFY: Check if it actually stopped (with brief delay for cleanup)
|
||||
await new Promise(resolve => setTimeout(resolve, 500))
|
||||
const status = await statusCrowdsec()
|
||||
if (status.running) {
|
||||
throw new Error('CrowdSec process still running. Check server logs for details.')
|
||||
}
|
||||
|
||||
return { enabled: false }
|
||||
}
|
||||
},
|
||||
onMutate: async (enabled: boolean) => {
|
||||
await queryClient.cancelQueries({ queryKey: ['security-status'] })
|
||||
const previous = queryClient.getQueryData(['security-status'])
|
||||
queryClient.setQueryData(['security-status'], (old: unknown) => {
|
||||
if (!old || typeof old !== 'object') return old
|
||||
const copy = { ...(old as SecurityStatus) }
|
||||
if (copy.crowdsec && typeof copy.crowdsec === 'object') {
|
||||
copy.crowdsec = { ...copy.crowdsec, enabled } as never
|
||||
}
|
||||
return copy
|
||||
})
|
||||
setCrowdsecStatus(prev => prev ? { ...prev, running: enabled } : prev)
|
||||
return { previous }
|
||||
},
|
||||
onError: (err: unknown, enabled: boolean, context: unknown) => {
|
||||
if (context && typeof context === 'object' && 'previous' in context) {
|
||||
queryClient.setQueryData(['security-status'], context.previous)
|
||||
}
|
||||
// NO optimistic updates - wait for actual confirmation
|
||||
onError: (err: unknown, enabled: boolean) => {
|
||||
const msg = err instanceof Error ? err.message : String(err)
|
||||
toast.error(enabled ? `Failed to start CrowdSec: ${msg}` : `Failed to stop CrowdSec: ${msg}`)
|
||||
// Force refresh status from backend to ensure UI matches reality
|
||||
queryClient.invalidateQueries({ queryKey: ['security-status'] })
|
||||
fetchCrowdsecStatus()
|
||||
},
|
||||
onSuccess: async (result: { lapi_ready?: boolean; enabled?: boolean } | boolean) => {
|
||||
await fetchCrowdsecStatus()
|
||||
queryClient.invalidateQueries({ queryKey: ['security-status'] })
|
||||
queryClient.invalidateQueries({ queryKey: ['settings'] })
|
||||
// Refresh all related queries to ensure consistency
|
||||
await Promise.all([
|
||||
queryClient.invalidateQueries({ queryKey: ['security-status'] }),
|
||||
queryClient.invalidateQueries({ queryKey: ['settings'] }),
|
||||
fetchCrowdsecStatus(),
|
||||
])
|
||||
|
||||
if (typeof result === 'object' && result.lapi_ready === true) {
|
||||
toast.success('CrowdSec started and LAPI is ready')
|
||||
|
||||
Reference in New Issue
Block a user