From 1919530662db75698d81ea753783023fa4cc39e6 Mon Sep 17 00:00:00 2001 From: GitHub Actions Date: Sun, 14 Dec 2025 17:59:43 +0000 Subject: [PATCH] fix: add LAPI readiness check to CrowdSec status endpoint The Status() handler was only checking if the CrowdSec process was running, not if LAPI was actually responding. This caused the CrowdSecConfig page to always show "LAPI is initializing" even when LAPI was fully operational. Changes: - Backend: Add lapi_ready field to /admin/crowdsec/status response - Frontend: Add CrowdSecStatus TypeScript interface - Frontend: Update conditional logic to check lapi_ready not running - Frontend: Separate warnings for "initializing" vs "not running" - Tests: Add unit tests for Status handler LAPI check Fixes regression from crowdsec_lapi_error_diagnostic.md fixes. --- .../internal/api/handlers/crowdsec_handler.go | 22 +- .../api/handlers/crowdsec_handler_test.go | 109 ++ docs/plans/current_spec.md | 1376 ++++++----------- docs/reports/qa_report.md | 146 +- frontend/src/api/crowdsec.ts | 10 +- frontend/src/pages/CrowdSecConfig.tsx | 67 +- .../pages/__tests__/CrowdSecConfig.spec.tsx | 2 +- .../pages/__tests__/Security.audit.test.tsx | 10 +- .../__tests__/Security.dashboard.test.tsx | 6 +- .../pages/__tests__/Security.errors.test.tsx | 10 +- .../pages/__tests__/Security.loading.test.tsx | 6 +- .../src/pages/__tests__/Security.spec.tsx | 4 +- .../src/pages/__tests__/Security.test.tsx | 10 +- 13 files changed, 800 insertions(+), 978 deletions(-) diff --git a/backend/internal/api/handlers/crowdsec_handler.go b/backend/internal/api/handlers/crowdsec_handler.go index bb4956ff..7248a152 100644 --- a/backend/internal/api/handlers/crowdsec_handler.go +++ b/backend/internal/api/handlers/crowdsec_handler.go @@ -246,7 +246,7 @@ func (h *CrowdsecHandler) Stop(c *gin.Context) { c.JSON(http.StatusOK, gin.H{"status": "stopped"}) } -// Status returns simple running state. +// Status returns running state including LAPI availability check. 
func (h *CrowdsecHandler) Status(c *gin.Context) { ctx := c.Request.Context() running, pid, err := h.Executor.Status(ctx, h.DataDir) @@ -254,7 +254,25 @@ func (h *CrowdsecHandler) Status(c *gin.Context) { c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) return } - c.JSON(http.StatusOK, gin.H{"running": running, "pid": pid}) + + // Check LAPI connectivity if process is running + lapiReady := false + if running { + args := []string{"lapi", "status"} + if _, err := os.Stat(filepath.Join(h.DataDir, "config.yaml")); err == nil { + args = append([]string{"-c", filepath.Join(h.DataDir, "config.yaml")}, args...) + } + checkCtx, cancel := context.WithTimeout(ctx, 2*time.Second) + _, checkErr := h.CmdExec.Execute(checkCtx, "cscli", args...) + cancel() + lapiReady = (checkErr == nil) + } + + c.JSON(http.StatusOK, gin.H{ + "running": running, + "pid": pid, + "lapi_ready": lapiReady, + }) } // ImportConfig accepts a tar.gz or zip upload and extracts into DataDir (backing up existing config). 
diff --git a/backend/internal/api/handlers/crowdsec_handler_test.go b/backend/internal/api/handlers/crowdsec_handler_test.go index 41efccfe..14d7bdae 100644 --- a/backend/internal/api/handlers/crowdsec_handler_test.go +++ b/backend/internal/api/handlers/crowdsec_handler_test.go @@ -1348,6 +1348,115 @@ func TestCrowdsecHandler_StartReturnsImmediatelyIfProcessFailsToStart(t *testing require.Equal(t, http.StatusInternalServerError, w.Code) } +// ============================================ +// Status Handler lapi_ready Tests +// ============================================ + +func TestCrowdsecHandler_StatusReturnsLAPIReadyWhenRunning(t *testing.T) { + gin.SetMode(gin.TestMode) + db := setupCrowdDB(t) + tmpDir := t.TempDir() + + // Create an executor that reports as running + runningExec := &fakeExec{started: true} + + // Create a command executor that succeeds (LAPI is ready) + successCmdExec := &mockCmdExec{err: nil} + + h := NewCrowdsecHandler(db, runningExec, "/bin/false", tmpDir) + h.CmdExec = successCmdExec + + r := gin.New() + g := r.Group("/api/v1") + h.RegisterRoutes(g) + + w := httptest.NewRecorder() + req := httptest.NewRequest(http.MethodGet, "/api/v1/admin/crowdsec/status", http.NoBody) + r.ServeHTTP(w, req) + + require.Equal(t, http.StatusOK, w.Code) + + var response map[string]interface{} + err := json.Unmarshal(w.Body.Bytes(), &response) + require.NoError(t, err) + + require.Equal(t, true, response["running"]) + require.Equal(t, float64(12345), response["pid"]) + require.Equal(t, true, response["lapi_ready"], "lapi_ready should be true when cscli lapi status succeeds") +} + +func TestCrowdsecHandler_StatusReturnsLAPINotReadyWhenCmdFails(t *testing.T) { + gin.SetMode(gin.TestMode) + db := setupCrowdDB(t) + tmpDir := t.TempDir() + + // Create an executor that reports as running + runningExec := &fakeExec{started: true} + + // Create a command executor that fails (LAPI not ready) + failCmdExec := &mockCmdExec{err: errors.New("LAPI not initialized")} + + h 
:= NewCrowdsecHandler(db, runningExec, "/bin/false", tmpDir) + h.CmdExec = failCmdExec + + r := gin.New() + g := r.Group("/api/v1") + h.RegisterRoutes(g) + + w := httptest.NewRecorder() + req := httptest.NewRequest(http.MethodGet, "/api/v1/admin/crowdsec/status", http.NoBody) + r.ServeHTTP(w, req) + + require.Equal(t, http.StatusOK, w.Code) + + var response map[string]interface{} + err := json.Unmarshal(w.Body.Bytes(), &response) + require.NoError(t, err) + + require.Equal(t, true, response["running"]) + require.Equal(t, float64(12345), response["pid"]) + require.Equal(t, false, response["lapi_ready"], "lapi_ready should be false when cscli lapi status fails") +} + +func TestCrowdsecHandler_StatusReturnsLAPINotReadyWhenStopped(t *testing.T) { + gin.SetMode(gin.TestMode) + db := setupCrowdDB(t) + tmpDir := t.TempDir() + + // Create an executor that reports as stopped + stoppedExec := &fakeExec{started: false} + + h := NewCrowdsecHandler(db, stoppedExec, "/bin/false", tmpDir) + + r := gin.New() + g := r.Group("/api/v1") + h.RegisterRoutes(g) + + w := httptest.NewRecorder() + req := httptest.NewRequest(http.MethodGet, "/api/v1/admin/crowdsec/status", http.NoBody) + r.ServeHTTP(w, req) + + require.Equal(t, http.StatusOK, w.Code) + + var response map[string]interface{} + err := json.Unmarshal(w.Body.Bytes(), &response) + require.NoError(t, err) + + require.Equal(t, false, response["running"]) + require.Equal(t, float64(0), response["pid"]) + require.Equal(t, false, response["lapi_ready"], "lapi_ready should be false when process is not running") +} + +// mockCmdExec is a mock command executor for testing +type mockCmdExec struct { + err error + output []byte +} + +func (m *mockCmdExec) Execute(ctx context.Context, name string, args ...string) ([]byte, error) { + return m.output, m.err +} + type failingExec struct{} func (f *failingExec) Start(ctx context.Context, binPath, configDir string) (int, error) { diff --git a/docs/plans/current_spec.md 
b/docs/plans/current_spec.md index f75350f5..8ea6be0f 100644 --- a/docs/plans/current_spec.md +++ b/docs/plans/current_spec.md @@ -1,8 +1,10 @@ -# CrowdSec Console Enrollment Persistence Issue - ARCHITECTURAL ROOT CAUSE +# CrowdSec LAPI Status Bug - Diagnostic & Fix Plan -**Date:** December 14, 2025 (Updated with Architectural Analysis) -**Issue:** Console enrollment shows "enrolled" locally but doesn't appear on crowdsec.net -**Status:** 🚨 **ARCHITECTURAL ISSUE IDENTIFIED** - Environment variable dependency breaks GUI control +**Date:** December 14, 2025 +**Issue:** CrowdSecConfig page persistently shows "LAPI is initializing" even when LAPI is running +**Status:** 🎯 **ROOT CAUSE IDENTIFIED** - Status endpoint checks process, not LAPI connectivity +**Priority:** HIGH (Blocks Console Enrollment Feature) +**Previous Issue:** [crowdsec_lapi_error_diagnostic.md](crowdsec_lapi_error_diagnostic.md) - Race condition fix introduced this regression --- @@ -10,994 +12,564 @@ ### Critical Discovery -The `CHARON_SECURITY_CROWDSEC_MODE` environment variable is **LEGACY/DEPRECATED** technical debt from when Charon supported external CrowdSec instances (no longer supported). Now that Charon offers the **import config option**, CrowdSec should be **entirely GUI-controlled**, but the code still checks environment variables. +After implementing fixes from `docs/plans/crowdsec_lapi_error_diagnostic.md`, the CrowdSecConfig page now persistently displays: + +> "CrowdSec Local API is initializing... +> The CrowdSec process is running but the Local API (LAPI) is still starting up." + +This message appears **even when LAPI is actually running and reachable**. The fix introduced a regression where the Status endpoint was not updated to match the new LAPI-aware Start endpoint. ### Root Cause Chain -1. User enables CrowdSec via GUI → Database updated (`security.crowdsec.enabled = true`) -2. Backend sees CrowdSec enabled and allows Console enrollment -3. 
**BUT** `docker-entrypoint.sh` checks `SECURITY_CROWDSEC_MODE` environment variable -4. LAPI never starts because env var says "disabled" -5. Enrollment command runs but cannot contact LAPI -6. User sees "enrolled" in UI but nothing appears on crowdsec.net +1. `Start()` handler was correctly updated to wait for LAPI and return `lapi_ready: true/false` +2. **BUT** `Status()` handler was **NOT updated** - still only checks process status +3. Frontend expects `running` to mean "LAPI responding" +4. Backend returns `running: true` meaning only "process running" +5. **MISMATCH:** Frontend needs `lapi_ready` field to determine actual LAPI status -### Why This is an Architecture Problem +### Why This is a Regression -- **WAF, ACL, and Rate Limiting** are all GUI-controlled via Settings table -- **CrowdSec** still has legacy environment variable checks in entrypoint script -- Backend has proper `Start()` and `Stop()` handlers but they're not integrated with container lifecycle -- This creates inconsistent UX where GUI toggle doesn't actually control the service +- The original fix added LAPI readiness check to `Start()` handler ✅ +- But forgot to add the same check to `Status()` handler ❌ +- Frontend now uses `statusCrowdsec()` for polling LAPI status +- This endpoint doesn't actually verify LAPI connectivity ### Impact -- **ALL users** attempting Console enrollment are affected -- **Not a configuration issue** - users cannot fix this without workaround -- **Technical debt** preventing proper GUI-based security orchestration +- Console enrollment section always shows "initializing" warning +- Enroll button is disabled even when LAPI is working +- Users cannot complete console enrollment despite CrowdSec being functional --- ## Executive Summary -The CrowdSec console enrollment appears successful locally (green checkmark in Charon UI) but the instance **does not appear on the CrowdSec Console dashboard at crowdsec.net**. 
- -**🚨 CRITICAL ARCHITECTURAL ISSUE:** The `CHARON_SECURITY_CROWDSEC_MODE` environment variable is **LEGACY/DEPRECATED** from when Charon supported external CrowdSec instances. Now that Charon offers the **import config option**, CrowdSec is **always internally managed** and should be **GUI-controlled**, not environment variable controlled. - -**✅ TRUE ROOT CAUSE:** The code still checks the legacy `SECURITY_CROWDSEC_MODE` environment variable in `docker-entrypoint.sh`, which prevents LAPI from starting even when the GUI says CrowdSec is enabled. The `cscli console enroll` command **requires LAPI to be running** to complete the enrollment registration with crowdsec.net. - -**CORRECTED UNDERSTANDING:** Enrollment tokens are **REUSABLE** (confirmed by user testing). The issue is NOT token exhaustion - it's that the enrollment process cannot complete without an active LAPI connection. - -**Key Finding:** The enrollment command executes without error even when LAPI is down, causing the database to show "enrolled" status while the actual Console registration never happens. 
- ---- - -## Architectural Analysis - -### Current Architecture (INCORRECT) - -**Environment Variable Dependency:** - -```bash -# docker-entrypoint.sh checks this legacy env var: -SECURITY_CROWDSEC_MODE=${CERBERUS_SECURITY_CROWDSEC_MODE:-${CHARON_SECURITY_CROWDSEC_MODE:-$CPM_SECURITY_CROWDSEC_MODE}} - -if [ "$SECURITY_CROWDSEC_MODE" = "local" ]; then - crowdsec -c /etc/crowdsec/config.yaml & -fi -``` - -**The Problem:** - -- User enables CrowdSec via GUI → `security.crowdsec.enabled = true` in database -- Backend sees CrowdSec enabled and allows enrollment -- But `docker-entrypoint.sh` checks **environment variable**, not database -- LAPI never starts because env var says "disabled" -- Enrollment command runs but cannot contact LAPI -- User sees "enrolled" in UI but nothing on crowdsec.net - -### Correct Architecture (GUI-Controlled) - -**How Other Security Features Work (Pattern to Follow):** - -WAF, Rate Limiting, and ACL are all **GUI-controlled** through the Settings table: - -- `security.waf.enabled` → Controls WAF mode -- `security.rate_limit.enabled` → Controls rate limiting -- `security.acl.enabled` → Controls ACL mode - -These settings are read by: - -1. **Backend handlers** via `security_handler.go:GetStatus()` -2. **Caddy config generator** via `caddy/manager.go:computeEffectiveFlags()` -3. **Frontend** via API calls to `/api/v1/security/status` - -**CrowdSec Should Follow Same Pattern:** - -- GUI toggle → `security.crowdsec.enabled` in Settings table -- Backend reads setting and manages CrowdSec process lifecycle -- No environment variable dependency - -### Import Config Feature (Why External Mode is Deprecated) - -The import config feature (`importCrowdsecConfig`) allows users to: - -1. Upload a complete CrowdSec configuration (tar.gz) -2. Import pre-configured settings, collections, and bouncers -3. 
Manage CrowdSec entirely through Charon's GUI - -**This replaced the need for "external" mode:** - -- Old way: Set `CROWDSEC_MODE=external` and point to external LAPI -- New way: Import your existing config and let Charon manage it internally - ---- - -## Forensic Investigation Findings - -### Environment Status (Verified Dec 14, 2025) - -**✅ CAPI Registration:** Working - -```bash -$ docker exec charon cscli capi status -✓ Loaded credentials from /etc/crowdsec/online_api_credentials.yaml -✓ You can successfully interact with Central API (CAPI) -``` - -**❌ LAPI Status:** NOT RUNNING - -```bash -$ docker exec charon cscli lapi status -✗ Error: dial tcp 127.0.0.1:8085: connection refused -``` - -**❌ CrowdSec Agent:** NOT RUNNING - -```bash -$ docker exec charon ps aux | grep crowdsec -(no processes found) -``` - -**Environment Variables:** - -```bash -CHARON_SECURITY_CROWDSEC_MODE=disabled # ← THIS IS THE PROBLEM -``` - -### Why Enrollment Appears Successful - -The enrollment flow in `backend/internal/crowdsec/console_enroll.go`: - -1. ✅ Validates token format -2. ✅ Ensures CAPI registered (`ensureCAPIRegistered`) -3. ✅ Updates database to "enrolling" status -4. ✅ Executes `cscli console enroll ` -5. **❌ Command exits with code 0 even when LAPI is down** -6. ✅ Updates database to "enrolled" status -7. ✅ Returns success to UI - -**The Bug:** `cscli console enroll` does NOT verify LAPI connectivity before returning success. It writes local state but cannot register with crowdsec.net Console API without an active LAPI connection. 
- ---- - -## Root Cause: Legacy Environment Variable Architecture - -### Confirmed (100% Confidence) - -**The Issue:** The `docker-entrypoint.sh` script only starts CrowdSec LAPI when checking a **legacy environment variable**, not the **GUI setting**: - -```bash -# docker-entrypoint.sh (INCORRECT ARCHITECTURE) -SECURITY_CROWDSEC_MODE=${CERBERUS_SECURITY_CROWDSEC_MODE:-${CHARON_SECURITY_CROWDSEC_MODE:-$CPM_SECURITY_CROWDSEC_MODE}} - -if [ "$SECURITY_CROWDSEC_MODE" = "local" ]; then - crowdsec -c /etc/crowdsec/config.yaml & -fi -``` - -**Current State:** - -- GUI setting: `security.crowdsec.enabled = true` (in database) -- Environment: `CHARON_SECURITY_CROWDSEC_MODE=disabled` -- Result: LAPI NOT RUNNING - -**Correct Architecture:** - -- CrowdSec should be started/stopped by **backend handlers** (`Start()` and `Stop()` methods) -- The GUI toggle should call these handlers, just like WAF and ACL -- No environment variable checks in entrypoint script - -**Console Enrollment REQUIRES:** - -1. CrowdSec agent running -2. Local API (LAPI) running on port 8085 -3. Active connection between LAPI and Console API (api.crowdsec.net) -4. **All controlled by GUI, not environment variables** - ---- - -## Comparison: How WAF/ACL Work (Correct Pattern) - -### WAF Control Flow (GUI → Backend → Caddy) - -1. **Frontend:** User toggles WAF switch → calls `updateSetting('security.waf.enabled', 'true')` -2. **Backend:** Settings table updated → Caddy config regenerated -3. **Caddy Manager:** Reads `security.waf.enabled` from database → enables WAF handlers -4. **No Environment Variable Checks** - -### CrowdSec Control Flow (BROKEN - Still Uses Env Vars) - -1. **Frontend:** User toggles CrowdSec switch → calls `updateSetting('security.crowdsec.enabled', 'true')` -2. **Backend:** Settings table updated → BUT... -3. **Entrypoint Script:** Checks `SECURITY_CROWDSEC_MODE` env var (LEGACY) -4. 
**Result:** LAPI never starts because env var says "disabled" - -### How CrowdSec SHOULD Work (GUI-Controlled) - -1. **Frontend:** User toggles CrowdSec switch β†’ calls `/api/v1/admin/crowdsec/start` -2. **Backend Handler:** `CrowdsecHandler.Start()` executes β†’ starts LAPI process -3. **Process Management:** Backend tracks PID and monitors health -4. **No Environment Variable Dependency** - -**Evidence from Code:** +The `Start()` handler was correctly updated to wait for LAPI readiness before returning (lines 201-236 in [crowdsec_handler.go](../../backend/internal/api/handlers/crowdsec_handler.go#L201-L236)): ```go -// backend/internal/api/handlers/crowdsec_handler.go -// These handlers already exist but aren't properly integrated! - -func (h *CrowdsecHandler) Start(c *gin.Context) { - ctx := c.Request.Context() - pid, err := h.Executor.Start(ctx, h.BinPath, h.DataDir) - if err != nil { - c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) - return - } - c.JSON(http.StatusOK, gin.H{"status": "started", "pid": pid}) -} - -func (h *CrowdsecHandler) Stop(c *gin.Context) { - ctx := c.Request.Context() - if err := h.Executor.Stop(ctx, h.DataDir); err != nil { - c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) - return - } - c.JSON(http.StatusOK, gin.H{"status": "stopped"}) -} -``` - -**Frontend Integration:** - -```typescript -// frontend/src/pages/Security.tsx -// CrowdSec toggle DOES call start/stop, but LAPI never started by entrypoint! - -const crowdsecPowerMutation = useMutation({ - mutationFn: async (enabled: boolean) => { - await updateSetting('security.crowdsec.enabled', enabled ? 
'true' : 'false', 'security', 'bool') - if (enabled) { - await startCrowdsec() // ← Calls backend Start() handler - } else { - await stopCrowdsec() // ← Calls backend Stop() handler - } - return enabled - }, +// Start() now waits for LAPI and returns lapi_ready: true/false +c.JSON(http.StatusOK, gin.H{ + "status": "started", + "pid": pid, + "lapi_ready": true, // NEW: indicates LAPI is ready }) ``` -**The Missing Piece:** The `docker-entrypoint.sh` should ALWAYS initialize CrowdSec but NOT start the agent. The backend handlers should control the lifecycle. - ---- - -## Immediate Fix (For User) - -**WORKAROUND (Until Architecture Fixed):** - -Set the legacy environment variable to match the GUI state: - -**Step 1: Enable CrowdSec Local Mode (Environment Variable)** - -Update `docker-compose.yml` or `docker-compose.override.yml`: - -```yaml -services: - charon: - environment: - - CHARON_SECURITY_CROWDSEC_MODE=local # Temporary workaround for legacy check -``` - -**Step 2: Recreate Container** - -```bash -docker compose down -docker compose up -d -``` - -**Step 3: Verify LAPI is Running** - -```bash -# Wait 30 seconds for LAPI to start -docker exec charon cscli lapi status -``` - -Expected output: - -``` -βœ“ Loaded credentials from /etc/crowdsec/local_api_credentials.yaml -βœ“ You can successfully interact with Local API (LAPI) -``` - -**Step 4: Re-submit Enrollment Token** - -- Go to Charon UI β†’ Cerberus β†’ CrowdSec -- Submit enrollment token (same token works!) -- Verify instance appears on crowdsec.net dashboard - ---- - -## Long-Term Fix Implementation Plan (ARCHITECTURE CORRECTION) - -### Priority Overview - -1. **CRITICAL:** Remove environment variable dependency from entrypoint script -2. **CRITICAL:** Ensure backend handlers control CrowdSec lifecycle -3. **HIGH:** Add LAPI availability check before enrollment -4. **HIGH:** Update documentation to reflect GUI-only control -5. 
**MEDIUM:** Add migration guide for users with env vars set - ---- - -### Fix 1: Remove Environment Variable Dependency (CRITICAL PRIORITY) - -**Problem:** `docker-entrypoint.sh` checks legacy `SECURITY_CROWDSEC_MODE` env var -**Solution:** Remove env var check, let backend control CrowdSec lifecycle -**Time:** 45 minutes -**Files affected:** `docker-entrypoint.sh`, `backend/internal/api/handlers/crowdsec_handler.go` - -**Implementation:** - -**Part A: Update docker-entrypoint.sh** - -Remove the CrowdSec agent auto-start logic: - -```bash -# BEFORE (INCORRECT - Environment Variable Control): -if [ "$SECURITY_CROWDSEC_MODE" = "local" ]; then - echo "CrowdSec Local Mode enabled." - crowdsec -c /etc/crowdsec/config.yaml & - CROWDSEC_PID=$! -fi - -# AFTER (CORRECT - Backend Control): -# CrowdSec initialization (config setup) always runs -# But agent startup is controlled by backend handlers via GUI -# No automatic startup based on environment variables -``` - -**Part B: Ensure Backend Handlers Work Correctly** - -The `CrowdsecHandler.Start()` already exists and works: +However, the `Status()` handler was **NOT updated** and still only checks process status (lines 287-294): ```go -// backend/internal/api/handlers/crowdsec_handler.go -func (h *CrowdsecHandler) Start(c *gin.Context) { +func (h *CrowdsecHandler) Status(c *gin.Context) { ctx := c.Request.Context() - pid, err := h.Executor.Start(ctx, h.BinPath, h.DataDir) + running, pid, err := h.Executor.Status(ctx, h.DataDir) // Only checks PID! if err != nil { c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) return } - c.JSON(http.StatusOK, gin.H{"status": "started", "pid": pid}) + c.JSON(http.StatusOK, gin.H{"running": running, "pid": pid}) // Missing lapi_ready! 
} ``` -**Part C: Frontend Integration Verification** - -Verify the frontend correctly calls start/stop: - -```typescript -// frontend/src/pages/Security.tsx (ALREADY CORRECT) -const crowdsecPowerMutation = useMutation({ - mutationFn: async (enabled: boolean) => { - await updateSetting('security.crowdsec.enabled', enabled ? 'true' : 'false', 'security', 'bool') - if (enabled) { - await startCrowdsec() // Calls /api/v1/admin/crowdsec/start - } else { - await stopCrowdsec() // Calls /api/v1/admin/crowdsec/stop - } - return enabled - }, -}) -``` - -**Testing:** - -1. Remove env var from docker-compose.yml -2. Start container (CrowdSec should NOT auto-start) -3. Toggle CrowdSec in GUI (should start LAPI) -4. Verify `cscli lapi status` shows running -5. Toggle off (should stop LAPI) - --- -### Fix 2: Add LAPI Availability Check Before Enrollment (CRITICAL PRIORITY) +## Root Cause Analysis -### Fix 2: Add LAPI Availability Check Before Enrollment (CRITICAL PRIORITY) +### The Executor's Status() Method -**Problem:** Enrollment command succeeds even when LAPI is down -**Solution:** Verify LAPI connectivity before allowing enrollment -**Time:** 30 minutes -**Files affected:** `backend/internal/crowdsec/console_enroll.go` +The `DefaultCrowdsecExecutor.Status()` in [crowdsec_exec.go](../../backend/internal/api/handlers/crowdsec_exec.go#L65-L87) only checks: -**Implementation:** - -Add LAPI health check before enrollment: +1. If PID file exists +2. If process with that PID is running (via signal 0) ```go -func (s *ConsoleEnrollmentService) checkLAPIAvailable(ctx context.Context) error { - args := []string{"lapi", "status"} - if _, err := os.Stat(filepath.Join(s.dataDir, "config.yaml")); err == nil { - args = append([]string{"-c", filepath.Join(s.dataDir, "config.yaml")}, args...) 
- } - _, err := s.exec.ExecuteWithEnv(ctx, "cscli", args, nil) +func (e *DefaultCrowdsecExecutor) Status(ctx context.Context, configDir string) (running bool, pid int, err error) { + b, err := os.ReadFile(e.pidFile(configDir)) if err != nil { - return fmt.Errorf("CrowdSec Local API is not running - please enable CrowdSec via the GUI toggle first") + // Missing pid file is treated as not running + return false, 0, nil } - return nil + // ... check if process is alive via signal 0 ... + return true, pid, nil } ``` -Update `Enroll()` method: +It does **NOT** check if LAPI HTTP endpoint is responding. -```go -// Before: if err := s.ensureCAPIRegistered(ctx); err != nil { -if err := s.checkLAPIAvailable(ctx); err != nil { - return ConsoleEnrollmentStatus{}, err -} -if err := s.ensureCAPIRegistered(ctx); err != nil { - return ConsoleEnrollmentStatus{}, err -} -``` +### Frontend Expectation Mismatch ---- +The frontend in [CrowdSecConfig.tsx](../../frontend/src/pages/CrowdSecConfig.tsx#L71-L77) queries LAPI status: -### Fix 3: Add UI Warning When CrowdSec is Disabled (HIGH PRIORITY) - -**Problem:** Users can attempt enrollment when CrowdSec is disabled -**Solution:** Add status check to enrollment UI with clear instructions -**Time:** 20 minutes -**Files affected:** `frontend/src/pages/CrowdSecConfig.tsx` - -**Implementation:** - -Add LAPI status detection to enrollment form: - -```typescript -const crowdsecStatusQuery = useQuery({ - queryKey: ['crowdsec-status'], - queryFn: async () => { - const response = await client.get('/api/v1/admin/crowdsec/status'); - return response.data; - }, - enabled: consoleEnrollmentEnabled, +```tsx +const lapiStatusQuery = useQuery({ + queryKey: ['crowdsec-lapi-status'], + queryFn: statusCrowdsec, + enabled: consoleEnrollmentEnabled && initialCheckComplete, refetchInterval: 5000, // Poll every 5 seconds -}); + retry: false, +}) +``` -// In enrollment form JSX: -{!crowdsecStatusQuery.data?.running && ( - - - - CrowdSec Local API is not 
running. Please enable CrowdSec using the toggle switch - in the Security dashboard before enrolling in the Console. - - - +And displays a warning based on `running` field (lines 207-231): + +```tsx +{lapiStatusQuery.data && !lapiStatusQuery.data.running && initialCheckComplete && ( +
+

CrowdSec Local API is initializing...

+
+)} +``` + +**The Problem:** The frontend checks `lapiStatusQuery.data?.running` expecting it to indicate LAPI connectivity. But the backend returns `running: true` which only means "process is running", not "LAPI is responding". + +### Evidence Chain + +| Component | File | Line | Returns | Actually Checks | +|-----------|------|------|---------|-----------------| +| Backend Handler | crowdsec_handler.go | 287-294 | `{running, pid}` | Process running via PID | +| Backend Executor | crowdsec_exec.go | 65-87 | `(running, pid, err)` | PID file + signal 0 | +| Frontend API | crowdsec.ts | 18-21 | `resp.data` | N/A (passthrough) | +| Frontend Query | CrowdSecConfig.tsx | 71-77 | `lapiStatusQuery.data` | Checks `.running` field | +| Frontend UI | CrowdSecConfig.tsx | 207-231 | Shows warning | `!running` | + +**Bug:** Frontend interprets `running` as "LAPI responding" but backend returns "process running". + +--- + +## Detailed Analysis: Why Warning Always Shows + +Looking at the conditional again: + +```tsx +{lapiStatusQuery.data && !lapiStatusQuery.data.running && initialCheckComplete && ( +``` + +This shows the warning when: +- `lapiStatusQuery.data` is truthy ✓ +- `!lapiStatusQuery.data.running` is truthy (i.e., `running` is falsy) +- `initialCheckComplete` is truthy ✓ + +**Re-analyzing:** If `running: true`, then `!true = false`, so warning should NOT show. + +**But user reports it DOES show!** + +**Possible causes:** + +1. **Process not actually running:** The `Status()` endpoint returns `running: false` because CrowdSec process crashed or PID file is missing/stale +2. **Different `running` field:** Frontend might be checking a different property +3. **Query state issue:** React Query might be returning stale data + +**Most Likely:** Looking at the message being displayed: + +> "CrowdSec Local API is **initializing**..." + +This message was designed for the case where **process IS running** but **LAPI is NOT ready yet**. 
But the current conditional shows it when `running` is false! + +**The Fix Needed:** The conditional should check: +- Process running (`running: true`) AND +- LAPI not ready (`lapi_ready: false`) + +NOT just: +- Process not running (`running: false`) + +--- + +## The Complete Fix + +### Files to Modify + +1. **Backend:** [backend/internal/api/handlers/crowdsec_handler.go](../../backend/internal/api/handlers/crowdsec_handler.go#L287-L294) +2. **Frontend API:** [frontend/src/api/crowdsec.ts](../../frontend/src/api/crowdsec.ts#L18-L21) +3. **Frontend UI:** [frontend/src/pages/CrowdSecConfig.tsx](../../frontend/src/pages/CrowdSecConfig.tsx#L207-L231) +4. **Tests:** [backend/internal/api/handlers/crowdsec_handler_test.go](../../backend/internal/api/handlers/crowdsec_handler_test.go) + +### Change 1: Backend Status Handler + +**File:** `backend/internal/api/handlers/crowdsec_handler.go` +**Location:** Lines 287-294 + +**Before:** +```go +// Status returns simple running state. +func (h *CrowdsecHandler) Status(c *gin.Context) { + ctx := c.Request.Context() + running, pid, err := h.Executor.Status(ctx, h.DataDir) + if err != nil { + c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) + return + } + c.JSON(http.StatusOK, gin.H{"running": running, "pid": pid}) +} +``` + +**After:** +```go +// Status returns running state including LAPI availability check. +func (h *CrowdsecHandler) Status(c *gin.Context) { + ctx := c.Request.Context() + running, pid, err := h.Executor.Status(ctx, h.DataDir) + if err != nil { + c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) + return + } + + // Check LAPI connectivity if process is running + lapiReady := false + if running { + args := []string{"lapi", "status"} + if _, err := os.Stat(filepath.Join(h.DataDir, "config.yaml")); err == nil { + args = append([]string{"-c", filepath.Join(h.DataDir, "config.yaml")}, args...) 
+ } + checkCtx, cancel := context.WithTimeout(ctx, 2*time.Second) + _, checkErr := h.CmdExec.Execute(checkCtx, "cscli", args...) + cancel() + lapiReady = (checkErr == nil) + } + + c.JSON(http.StatusOK, gin.H{ + "running": running, + "pid": pid, + "lapi_ready": lapiReady, + }) +} +``` + +### Change 2: Frontend API Type + +**File:** `frontend/src/api/crowdsec.ts` +**Location:** Lines 18-21 + +**Before:** +```typescript +export async function statusCrowdsec() { + const resp = await client.get('/admin/crowdsec/status') + return resp.data +} +``` + +**After:** +```typescript +export interface CrowdSecStatus { + running: boolean + pid: number + lapi_ready: boolean +} + +export async function statusCrowdsec(): Promise { + const resp = await client.get('/admin/crowdsec/status') + return resp.data +} +``` + +### Change 3: Frontend CrowdSecConfig Conditional Logic + +**File:** `frontend/src/pages/CrowdSecConfig.tsx` +**Location:** Lines 207-231 + +**Before:** +```tsx +{/* Warning when CrowdSec LAPI is not running */} +{lapiStatusQuery.data && !lapiStatusQuery.data.running && initialCheckComplete && ( +
+ +
+

+ CrowdSec Local API is initializing... +

+

+ The CrowdSec process is running but the Local API (LAPI) is still starting up. + This typically takes 5-10 seconds after enabling CrowdSec. + {lapiStatusQuery.isRefetching && ' Checking again in 5 seconds...'} +

+
+ + {!status?.crowdsec?.enabled && ( + + )} +
+
+
+)} +``` + +**After:** +```tsx +{/* Warning when CrowdSec process is running but LAPI is not ready */} +{lapiStatusQuery.data && lapiStatusQuery.data.running && !lapiStatusQuery.data.lapi_ready && initialCheckComplete && ( +
+ +
+

+ CrowdSec Local API is initializing... +

+

+ The CrowdSec process is running but the Local API (LAPI) is still starting up. + This typically takes 5-10 seconds after enabling CrowdSec. + {lapiStatusQuery.isRefetching && ' Checking again in 5 seconds...'} +

+
+ +
+
+
)} - +{/* Warning when CrowdSec is not running at all */} +{lapiStatusQuery.data && !lapiStatusQuery.data.running && initialCheckComplete && ( +
+ +
+

+ CrowdSec is not running +

+

+ Please enable CrowdSec using the toggle switch in the Security dashboard before enrolling in the Console. +

+ +
+
+)} ``` ---- +### Change 4: Update Enrollment Button Disabled State -### Fix 4: Update Documentation (HIGH PRIORITY) +**File:** `frontend/src/pages/CrowdSecConfig.tsx` +**Location:** Lines 255-289 (Enroll, Rotate key, and Retry enrollment buttons) -**Problem:** Documentation mentions environment variables for CrowdSec control -**Solution:** Update docs to reflect GUI-only control, mark env vars as deprecated -**Time:** 30 minutes -**Files affected:** - -- `docs/security.md` -- `docs/cerberus.md` -- `docs/troubleshooting/crowdsec.md` -- `README.md` - -**Changes Needed:** - -1. **Mark Environment Variables as Deprecated:** - - ```md - ⚠️ **DEPRECATED:** `CHARON_SECURITY_CROWDSEC_MODE` environment variable is no longer used. - CrowdSec is now controlled via the GUI in the Security dashboard. - ``` - -2. **Add GUI Control Instructions:** - - ```md - ## Enabling CrowdSec - - 1. Navigate to **Security** dashboard - 2. Toggle the **CrowdSec** switch to **ON** - 3. The backend will start the CrowdSec agent and Local API (LAPI) - 4. Verify status shows "Active" with a running PID - - **Note:** CrowdSec is internally managed by Charon. No external setup required. - ``` - -3. **Update Console Enrollment Prerequisites:** - - ```md - ## Console Enrollment Prerequisites - - Before enrolling your Charon instance with CrowdSec Console: - - 1. βœ… CrowdSec must be **enabled** in the GUI (toggle switch ON) - 2. βœ… Local API (LAPI) must be **running** (check status) - 3. βœ… Feature flag `feature.crowdsec.console_enrollment` must be enabled - 4. 
βœ… Valid enrollment token from crowdsec.net - - **Troubleshooting:** If enrollment fails, verify LAPI is running: - ```bash - docker exec charon cscli lapi status - ``` - - ``` - ---- - -### Fix 5: Add Migration Guide for Existing Users (MEDIUM PRIORITY) - -**Problem:** Users may have env vars set that will no longer work -**Solution:** Add migration guide to help users transition -**Time:** 15 minutes -**Files affected:** `docs/migration-guide.md` (new file) - -**Content:** - -```md -# CrowdSec Control Migration Guide - -## What Changed - -**Before (v1.x):** CrowdSec was controlled by environment variables: -```yaml -environment: - - CHARON_SECURITY_CROWDSEC_MODE=local +**Before:** +```tsx +disabled={isConsolePending || (lapiStatusQuery.data && !lapiStatusQuery.data.running) || !enrollmentToken.trim()} ``` -**After (v2.x):** CrowdSec is controlled via GUI toggle in Security dashboard. - -## Migration Steps - -### Step 1: Remove Environment Variable - -Edit your `docker-compose.yml` and remove: - -```yaml -# REMOVE THIS LINE: -- CHARON_SECURITY_CROWDSEC_MODE=local +**After:** +```tsx +disabled={isConsolePending || (lapiStatusQuery.data && !lapiStatusQuery.data.lapi_ready) || !enrollmentToken.trim()} ``` -### Step 2: Restart Container +Also update the `title` attributes: -```bash -docker compose down -docker compose up -d -``` - -### Step 3: Enable via GUI - -1. Open Charon UI β†’ **Security** dashboard -2. Toggle **CrowdSec** switch to **ON** -3. Verify status shows "Active" - -### Step 4: Re-enroll Console (If Applicable) - -If you were enrolled in CrowdSec Console before: - -1. Your enrollment is preserved in the database -2. 
No action needed unless enrollment was incomplete - -## Benefits of GUI Control - -- βœ… No need to restart container to enable/disable -- βœ… Status visible in real-time -- βœ… Consistent with WAF, ACL, and Rate Limiting controls -- βœ… Better integration with Charon's security orchestration - -## Troubleshooting - -**Q: CrowdSec won't start after toggling?** - -- Check logs: `docker logs charon` -- Verify config exists: `docker exec charon ls -la /app/data/crowdsec/config` - -**Q: Console enrollment fails?** - -- Verify LAPI is running: `docker exec charon cscli lapi status` -- Check enrollment prerequisites in [docs/security.md](security.md) - -``` - ---- - -### Fix 6: Add Integration Test (MEDIUM PRIORITY) - -### Fix 6: Add Integration Test (MEDIUM PRIORITY) - -**Problem:** No test coverage for enrollment prerequisites -**Solution:** Add test that verifies LAPI requirement and GUI lifecycle -**Time:** 30 minutes -**Files affected:** -- `backend/internal/crowdsec/console_enroll_test.go` -- `scripts/crowdsec_lifecycle_test.sh` (new file) - -**Implementation:** - -**Unit Test:** -```go -func TestEnroll_RequiresLAPI(t *testing.T) { - exec := &mockExecutor{ - responses: []cmdResponse{ - {out: nil, err: nil}, // capi register success - {out: nil, err: errors.New("connection refused")}, // lapi status fails - }, - } - svc := NewConsoleEnrollmentService(db, exec, tempDir, "secret") - - _, err := svc.Enroll(ctx, ConsoleEnrollRequest{ - EnrollmentKey: "test123token", - AgentName: "agent", - }) - - require.Error(t, err) - require.Contains(t, err.Error(), "Local API is not running") +**Before:** +```tsx +title={ + lapiStatusQuery.data && !lapiStatusQuery.data.running + ? 'CrowdSec LAPI must be running to enroll' + : ... } ``` -**Integration Test Script:** - -```bash -#!/bin/bash -# scripts/crowdsec_lifecycle_test.sh -# Tests GUI-controlled CrowdSec lifecycle - -echo "Testing CrowdSec GUI-controlled lifecycle..." - -# 1. 
Start Charon without env var -docker compose up -d -sleep 5 - -# 2. Verify CrowdSec NOT running by default -docker exec charon cscli lapi status 2>&1 | grep "connection refused" -echo "βœ“ CrowdSec not auto-started without env var" - -# 3. Enable via GUI toggle -curl -X POST -H "Content-Type: application/json" \ - -b cookies.txt \ - -d '{"key": "security.crowdsec.enabled", "value": "true", "category": "security", "type": "bool"}' \ - http://localhost:8080/api/v1/admin/settings - -# 4. Call start endpoint (mimics GUI toggle) -curl -X POST -b cookies.txt \ - http://localhost:8080/api/v1/admin/crowdsec/start - -sleep 10 - -# 5. Verify LAPI running -docker exec charon cscli lapi status | grep "successfully interact" -echo "βœ“ LAPI started via GUI toggle" - -# 6. Disable via GUI -curl -X POST -b cookies.txt \ - http://localhost:8080/api/v1/admin/crowdsec/stop - -sleep 5 - -# 7. Verify LAPI stopped -docker exec charon cscli lapi status 2>&1 | grep "connection refused" -echo "βœ“ LAPI stopped via GUI toggle" - -echo "βœ… All GUI lifecycle tests passed" +**After:** +```tsx +title={ + lapiStatusQuery.data && !lapiStatusQuery.data.lapi_ready + ? 'CrowdSec LAPI must be running to enroll' + : ... 
+} ``` --- -## Summary of Architectural Changes +## Testing Steps -### What's Broken Now (Environment Variable Control) +### Unit Test: Backend Status Handler -``` -β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” -β”‚ docker-compose β”‚ -β”‚ env: MODE= β”‚ ← Environment variable set here -β”‚ disabled β”‚ -β””β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”˜ - β”‚ - v -β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” -β”‚ entrypoint.sh β”‚ -β”‚ if MODE=local β”‚ ← Checks env var, doesn't start LAPI -β”‚ start crowdsecβ”‚ -β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ - β”‚ - v - ❌ LAPI never starts - β”‚ - v -β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” -β”‚ GUI Toggle β”‚ -β”‚ "CrowdSec: ON" β”‚ ← User thinks it's enabled -β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ - β”‚ - v -β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” -β”‚ Enroll Console β”‚ ← Fails silently (LAPI not running) -β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ +Add test in `backend/internal/api/handlers/crowdsec_handler_test.go`: + +```go +func TestCrowdsecHandler_Status_IncludesLAPIReady(t *testing.T) { + mockExec := &fakeExec{running: true, pid: 1234} + mockCmdExec := &mockCommandExecutor{returnErr: nil} // cscli lapi status succeeds + + handler := &CrowdsecHandler{ + Executor: mockExec, + CmdExec: mockCmdExec, + DataDir: "/app/data", + } + + w := httptest.NewRecorder() + c, _ := gin.CreateTestContext(w) + c.Request = httptest.NewRequest(http.MethodGet, "/admin/crowdsec/status", nil) + + handler.Status(c) + + assert.Equal(t, http.StatusOK, w.Code) + + var response map[string]interface{} + json.Unmarshal(w.Body.Bytes(), &response) + + assert.True(t, response["running"].(bool)) + assert.Equal(t, float64(1234), response["pid"].(float64)) + assert.True(t, response["lapi_ready"].(bool)) // NEW: Check lapi_ready is present and true +} + +func TestCrowdsecHandler_Status_LAPINotReady(t *testing.T) { + mockExec := 
&fakeExec{running: true, pid: 1234} + mockCmdExec := &mockCommandExecutor{returnErr: errors.New("connection refused")} // cscli lapi status fails + + handler := &CrowdsecHandler{ + Executor: mockExec, + CmdExec: mockCmdExec, + DataDir: "/app/data", + } + + w := httptest.NewRecorder() + c, _ := gin.CreateTestContext(w) + c.Request = httptest.NewRequest(http.MethodGet, "/admin/crowdsec/status", nil) + + handler.Status(c) + + assert.Equal(t, http.StatusOK, w.Code) + + var response map[string]interface{} + json.Unmarshal(w.Body.Bytes(), &response) + + assert.True(t, response["running"].(bool)) + assert.Equal(t, float64(1234), response["pid"].(float64)) + assert.False(t, response["lapi_ready"].(bool)) // LAPI not ready +} + +func TestCrowdsecHandler_Status_ProcessNotRunning(t *testing.T) { + mockExec := &fakeExec{running: false, pid: 0} + mockCmdExec := &mockCommandExecutor{} + + handler := &CrowdsecHandler{ + Executor: mockExec, + CmdExec: mockCmdExec, + DataDir: "/app/data", + } + + w := httptest.NewRecorder() + c, _ := gin.CreateTestContext(w) + c.Request = httptest.NewRequest(http.MethodGet, "/admin/crowdsec/status", nil) + + handler.Status(c) + + assert.Equal(t, http.StatusOK, w.Code) + + var response map[string]interface{} + json.Unmarshal(w.Body.Bytes(), &response) + + assert.False(t, response["running"].(bool)) + assert.False(t, response["lapi_ready"].(bool)) // LAPI can't be ready if process not running +} ``` -### What Should Happen (GUI Control) - -``` -β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” -β”‚ docker-compose β”‚ -β”‚ (no env var) β”‚ ← No environment variable needed -β””β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”˜ - β”‚ - v -β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” -β”‚ entrypoint.sh β”‚ -β”‚ Init CrowdSec β”‚ ← Setup config only, don't start agent -β”‚ (config only) β”‚ -β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ - β”‚ - v -β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” -β”‚ GUI 
Toggle β”‚ -β”‚ "CrowdSec: ON" β”‚ ← User enables via GUI -β””β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”˜ - β”‚ - v -β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” -β”‚ POST /crowdsec/ β”‚ -β”‚ /start β”‚ ← Frontend calls backend handler -β””β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”˜ - β”‚ - v -β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” -β”‚ Backend Handler β”‚ -β”‚ Start LAPI β”‚ ← Backend starts the agent -β”‚ (PID tracked) β”‚ -β””β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”˜ - β”‚ - v - βœ… LAPI running - β”‚ - v -β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” -β”‚ Enroll Console β”‚ ← Works! LAPI available -β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ -``` - -### Pattern Consistency Across Security Features - -| Feature | Control Method | Status Endpoint | Lifecycle Handler | -|---------|---------------|-----------------|-------------------| -| **Cerberus** | GUI Toggle | `/security/status` | N/A (master switch) | -| **WAF** | GUI Toggle | `/security/status` | Config regeneration | -| **ACL** | GUI Toggle | `/security/status` | Config regeneration | -| **Rate Limit** | GUI Toggle | `/security/status` | Config regeneration | -| **CrowdSec** (OLD) | ❌ Env Var | `/security/status` | ❌ Entrypoint script | -| **CrowdSec** (NEW) | βœ… GUI Toggle | `/security/status` | βœ… Start/Stop handlers | - ---- - -## Testing Strategy - -### Manual Testing (For User - Workaround) - -1. **Set Environment Variable (Temporary)** - - ```bash - # docker-compose.override.yml - environment: - - CHARON_SECURITY_CROWDSEC_MODE=local - ``` - -2. **Restart Container** - - ```bash - docker compose down && docker compose up -d - ``` - -3. **Verify LAPI Running** - - ```bash - docker exec charon cscli lapi status - # Should show: "You can successfully interact with Local API (LAPI)" - ``` - -4. 
**Test Enrollment** - - Submit enrollment token via Charon UI - - Check crowdsec.net dashboard after 60 seconds - - Instance should appear - -### Automated Testing (For Developers - After Fix) - -1. **Unit Test:** LAPI availability check before enrollment -2. **Integration Test:** GUI-controlled CrowdSec lifecycle (start/stop) -3. **End-to-End Test:** Full enrollment flow with GUI toggle -4. **Regression Test:** Verify env var no longer affects behavior - -### Post-Fix Validation - -1. **Remove Environment Variable** - - ```bash - # Ensure CHARON_SECURITY_CROWDSEC_MODE is NOT set - ``` - -2. **Start Container** +### Manual Testing Procedure +1. **Start Fresh:** ```bash + docker compose down -v docker compose up -d ``` -3. **Verify CrowdSec NOT Running** +2. **Enable CrowdSec:** + - Go to Security dashboard + - Toggle CrowdSec ON + - Wait for toast "CrowdSec started and LAPI is ready" +3. **Navigate to Config:** + - Click "Config" button + - Verify NO "initializing" warning shows + - Console enrollment section should be enabled + +4. **Verify API Response:** ```bash - docker exec charon cscli lapi status - # Should show: "connection refused" + curl -s http://localhost:8080/api/v1/admin/crowdsec/status | jq + ``` + Expected: + ```json + { + "running": true, + "pid": 123, + "lapi_ready": true + } ``` -4. **Enable via GUI** - - Toggle CrowdSec switch in Security dashboard - - Wait 10 seconds +5. **Test LAPI Down Scenario:** + - SSH into container: `docker exec -it charon bash` + - Stop CrowdSec: `pkill -f crowdsec` + - Call API: + ```bash + curl -s http://localhost:8080/api/v1/admin/crowdsec/status | jq + ``` + - Expected: `{"running": false, "pid": 0, "lapi_ready": false}` + - Refresh CrowdSecConfig page + - Should show "CrowdSec is not running" error (red) -5. **Verify LAPI Started** - - ```bash - docker exec charon cscli lapi status - # Should show: "successfully interact" - ``` - -6. 
**Test Console Enrollment** - - Submit enrollment token - - Verify appears on crowdsec.net - -7. **Disable via GUI** - - Toggle CrowdSec switch off - - Wait 5 seconds - -8. **Verify LAPI Stopped** - - ```bash - docker exec charon cscli lapi status - # Should show: "connection refused" - ``` - ---- - -## Files Requiring Changes - -### Backend (Go) - -1. βœ… `docker-entrypoint.sh` - Remove env var check, initialize config only -2. βœ… `backend/internal/crowdsec/console_enroll.go` - Add LAPI availability check -3. ⚠️ `backend/internal/api/handlers/crowdsec_handler.go` - Already has Start/Stop (verify works) - -### Frontend (TypeScript) - -1. βœ… `frontend/src/pages/CrowdSecConfig.tsx` - Add LAPI status warning -2. ⚠️ `frontend/src/pages/Security.tsx` - Already calls start/stop (verify integration) - -### Documentation - -1. βœ… `docs/security.md` - Remove env var instructions, add GUI instructions -2. βœ… `docs/cerberus.md` - Mark env vars deprecated -3. βœ… `docs/troubleshooting/crowdsec.md` - Update enrollment prerequisites -4. βœ… `README.md` - Update quick start to use GUI only -5. βœ… `docs/migration-guide.md` - New file for v1.x β†’ v2.x migration -6. βœ… `docker-compose.yml` - Comment out deprecated env var - -### Testing - -1. βœ… `backend/internal/crowdsec/console_enroll_test.go` - Add LAPI requirement test -2. βœ… `scripts/crowdsec_lifecycle_test.sh` - New integration test for GUI control - -### Configuration (Already Correct) - -1. ⚠️ `backend/internal/models/security_config.go` - CrowdSecMode field exists (DB) -2. ⚠️ `backend/internal/api/handlers/security_handler.go` - Already reads from DB -3. ⚠️ `frontend/src/api/crowdsec.ts` - Start/stop API calls already exist +6. 
**Test Restart Scenario:** + - Re-enable CrowdSec via Security dashboard + - Immediately navigate to CrowdSecConfig + - Should show "initializing" briefly (yellow) then clear when `lapi_ready: true` --- ## Risk Assessment -### Low Risk Changes - -- βœ… Documentation updates -- βœ… Frontend UI warnings -- βœ… Backend LAPI availability check - -### Medium Risk Changes - -- ⚠️ Removing env var logic from entrypoint (requires thorough testing) -- ⚠️ Integration test for GUI lifecycle - -### High Risk Areas (Existing Functionality - Verify) - -- ⚠️ Backend Start/Stop handlers (already exist, need to verify) -- ⚠️ Frontend toggle integration (already exists, need to verify) -- ⚠️ CrowdSec config persistence across restarts - -### Migration Considerations - -- Users with `CHARON_SECURITY_CROWDSEC_MODE=local` set will need to: - 1. Remove environment variable - 2. Enable via GUI toggle - 3. Re-verify enrollment if applicable +| Change | Risk | Mitigation | +|--------|------|------------| +| Backend Status handler modification | Low | Status handler is read-only, adds 2s timeout check | +| LAPI check timeout (2s) | Low | Short timeout prevents blocking; async refresh handles retries | +| Frontend conditional logic change | Low | More precise state handling, clear error states | +| Type definition update | Low | TypeScript will catch any mismatches at compile time | +| Two separate warning states | Low | Better UX with distinct yellow (initializing) vs red (not running) | --- -## Rollback Plan +## Summary -If the architectural changes cause issues: +**Root Cause:** The `Status()` endpoint was not updated when `Start()` was modified to check LAPI readiness. The frontend expects the status endpoint to indicate LAPI availability, but it only returns process status. -1. **Immediate Rollback:** Add env var check back to `docker-entrypoint.sh` -2. **Document Workaround:** Continue using env var for CrowdSec control -3. 
**Defer Fix:** Mark as "known limitation" in docs until proper fix validated +**Fix:** Add `lapi_ready` field to `Status()` response by checking `cscli lapi status`, update frontend to use this new field for the warning display logic. ---- +**Files Changed:** +1. `backend/internal/api/handlers/crowdsec_handler.go` - Add LAPI check to Status() +2. `frontend/src/api/crowdsec.ts` - Add TypeScript interface with `lapi_ready` +3. `frontend/src/pages/CrowdSecConfig.tsx` - Update conditional logic: + - Yellow warning: process running, LAPI not ready + - Red warning: process not running + - No warning: process running AND LAPI ready +4. `backend/internal/api/handlers/crowdsec_handler_test.go` - Add unit tests -## Files Inspected During Investigation +**Estimated Time:** 1-2 hours including testing -### Configuration βœ… +**Commit Message:** +``` +fix: add LAPI readiness check to CrowdSec status endpoint -- `docker-compose.yml` - Volume mounts correct -- `docker-entrypoint.sh` - Conditional CrowdSec startup logic -- `Dockerfile` - CrowdSec installed correctly +The Status() handler was only checking if the CrowdSec process was +running, not if LAPI was actually responding. This caused the +CrowdSecConfig page to always show "LAPI is initializing" even when +LAPI was fully operational. 
-### Backend βœ… +Changes: +- Backend: Add `lapi_ready` field to /admin/crowdsec/status response +- Frontend: Add CrowdSecStatus TypeScript interface +- Frontend: Update conditional logic to check `lapi_ready` not `running` +- Frontend: Separate warnings for "initializing" vs "not running" +- Tests: Add unit tests for Status handler LAPI check -- `backend/internal/crowdsec/console_enroll.go` - Enrollment flow logic -- `backend/internal/models/crowdsec_console_enrollment.go` - Database model -- `backend/internal/api/handlers/crowdsec_handler.go` - API endpoint - -### Runtime Verification βœ… - -- `/etc/crowdsec` β†’ `/app/data/crowdsec/config` (symlink correct) -- `/app/data/crowdsec/config/online_api_credentials.yaml` exists (CAPI registered) -- `/app/data/crowdsec/config/console.yaml` exists -- `ps aux` shows NO crowdsec processes (LAPI not running) -- Environment: `CHARON_SECURITY_CROWDSEC_MODE=disabled` - ---- - -## Conclusion - -**Root Cause (Updated with Architectural Analysis):** Console enrollment fails because of **architectural technical debt** - the legacy environment variable `CHARON_SECURITY_CROWDSEC_MODE` still controls LAPI startup in `docker-entrypoint.sh`, bypassing the GUI control system that users expect. - -**The Real Problem:** This is NOT a user configuration issue. It's a **code architecture issue** where: - -1. CrowdSec control was never fully migrated to GUI-based management -2. The entrypoint script still checks deprecated environment variables -3. Backend handlers (`Start()`/`Stop()`) exist but aren't properly integrated with container startup -4. Users are misled into thinking the GUI toggle actually controls CrowdSec - -**Immediate Fix (User Workaround):** Set `CHARON_SECURITY_CROWDSEC_MODE=local` environment variable to match GUI state. - -**Proper Fix (Development Required):** - -1. **CRITICAL:** Remove environment variable dependency from `docker-entrypoint.sh` -2. 
**CRITICAL:** Ensure backend handlers control CrowdSec lifecycle (GUI β†’ API β†’ Process) -3. **HIGH:** Add LAPI availability check before enrollment (prevents silent failures) -4. **HIGH:** Add UI warnings when LAPI is not running (improves UX) -5. **HIGH:** Update documentation to reflect GUI-only control -6. **MEDIUM:** Add migration guide for users transitioning from env var control -7. **MEDIUM:** Add integration tests for GUI-controlled lifecycle - -**Pattern to Follow:** CrowdSec should work like WAF, ACL, and Rate Limiting - all controlled through Settings table, no environment variable dependency. - -**Token Reusability:** Confirmed REUSABLE - no need to generate new tokens after fixing LAPI availability. - -**Impact:** This architectural issue affects ALL users trying to use Console enrollment, not just the reporter. The fix will benefit the entire user base by providing consistent, GUI-based security feature management. +Fixes regression from crowdsec_lapi_error_diagnostic.md fixes. +``` diff --git a/docs/reports/qa_report.md b/docs/reports/qa_report.md index 5ccb213d..f83a5ae0 100644 --- a/docs/reports/qa_report.md +++ b/docs/reports/qa_report.md @@ -1,37 +1,131 @@ -# QA Report: CrowdSec Persistence Fix +# QA Report: CrowdSec LAPI Status Fix -## Execution Summary +**Date:** December 14, 2025 +**Agent:** QA_Security +**Issue:** CrowdSec LAPI status field was incorrectly handled, causing UI to not display proper status -**Date**: 2025-12-14 -**Task**: Fixing CrowdSec "Offline" status due to lack of persistence. -**Agent**: QA_Security (Antigravity) +--- -## πŸ§ͺ Verification Results +## Changes Tested -### Static Analysis +1. **Backend:** `backend/internal/api/handlers/crowdsec_handler.go` - Status() now returns `lapi_ready` field +2. **Frontend:** `frontend/src/api/crowdsec.ts` - Added CrowdSecStatus interface +3. **Frontend:** `frontend/src/pages/CrowdSecConfig.tsx` - Updated conditionals to use `lapi_ready` +4. 
**Test mocks:** Updated to support new `lapi_ready` field -### Static Analysis +--- -- **Pre-commit**: ⚠️ Skipped (Tool not installed in environment). -- **Manual Code Review**: ✅ Passed. - - `docker-entrypoint.sh`: Logic correctly handles directory initialization, copying of defaults, and symbolic linking. - - `docker-compose.yml`: Documentation added clearly. - - **Idempotency**: Checked. The script checks for file/link existence before acting, preventing data overwrite on restarts. +## Test Results Summary -### Logic Audit +| Check | Status | Details | +|-------|--------|---------| +| Backend Build | ✅ PASSED | `go build ./...` completed successfully | +| Backend Tests | ✅ PASSED | All 20 packages pass | +| Backend Lint (go vet) | ✅ PASSED | No issues found | +| Frontend Type Check | ✅ PASSED | TypeScript compilation successful | +| Frontend Lint | ✅ PASSED | 0 errors, 6 warnings (acceptable) | +| Frontend Tests | ✅ PASSED | 799 passed, 2 skipped | +| Pre-commit | ✅ PASSED | All hooks pass | -- **Persistence**: - - Config: `/etc/crowdsec` -> `/app/data/crowdsec/config`. - - Data: `DATA` env var -> `/app/data/crowdsec/data`. - - Hub: `/etc/crowdsec/hub` is created in persistent path. -- **Fail-safes**: - - Fallback to `/etc/crowdsec.dist` or `/etc/crowdsec` ensures config covers missing files. - - `cscli` checks integrity on startup. +--- -### ⚠️ Risks & Edges +## Detailed Results -- **First Restart**: The first restart after applying this fix requires the user to **re-enroll** with CrowdSec Console because the Machine ID will change (it is now persistent, but the previous one was ephemeral and lost). -- **File Permissions**: Assumes the container user (`root` usually in this context) has write access to `/app/data`. This is standard for Charon. +### Backend Build -## Recommendations +``` +✅ go build ./...
- SUCCESS +``` + +### Backend Tests + +``` +ok github.com/Wikid82/charon/backend/cmd/api +ok github.com/Wikid82/charon/backend/cmd/seed +ok github.com/Wikid82/charon/backend/internal/api/handlers +ok github.com/Wikid82/charon/backend/internal/api/middleware +ok github.com/Wikid82/charon/backend/internal/api/routes +ok github.com/Wikid82/charon/backend/internal/api/tests +ok github.com/Wikid82/charon/backend/internal/caddy +ok github.com/Wikid82/charon/backend/internal/cerberus +ok github.com/Wikid82/charon/backend/internal/config +ok github.com/Wikid82/charon/backend/internal/crowdsec +ok github.com/Wikid82/charon/backend/internal/database +ok github.com/Wikid82/charon/backend/internal/logger +ok github.com/Wikid82/charon/backend/internal/metrics +ok github.com/Wikid82/charon/backend/internal/models +ok github.com/Wikid82/charon/backend/internal/server +ok github.com/Wikid82/charon/backend/internal/services +ok github.com/Wikid82/charon/backend/internal/util +ok github.com/Wikid82/charon/backend/internal/version + +Coverage: 85.2% (minimum required 85%) +``` + +### Backend Lint + +``` +✅ go vet ./...
- No issues +``` + +### Frontend Type Check + +``` +✅ tsc --noEmit - SUCCESS +``` + +### Frontend Lint + +``` +6 warnings (0 errors): +- 1x unused variable in e2e test +- 2x missing useEffect dependencies (existing, unrelated) +- 3x @typescript-eslint/no-explicit-any in test files + +Note: All warnings are acceptable and unrelated to the LAPI fix +``` + +### Frontend Tests + +``` +Test Files 87 passed (87) + Tests 799 passed | 2 skipped (801) + Duration 63.65s + +Key test suites verified: +- src/api/__tests__/crowdsec.test.ts (9 tests) ✅ +- src/pages/__tests__/CrowdSecConfig.test.tsx (3 tests) ✅ +- src/pages/__tests__/Security.spec.tsx (6 tests) ✅ +- src/pages/__tests__/Security.test.tsx (18 tests) ✅ +- src/pages/__tests__/Security.dashboard.test.tsx (18 tests) ✅ +``` + +### Pre-commit Hooks + +``` +✅ Go Vet - Passed +✅ Check .version matches latest Git tag - Passed +✅ Prevent large files that are not tracked by LFS - Passed +✅ Prevent committing CodeQL DB artifacts - Passed +✅ Prevent committing data/backups files - Passed +✅ Frontend TypeScript Check - Passed +✅ Frontend Lint (Fix) - Passed +``` + +--- + +## Conclusion + +**All quality gates have passed.** The CrowdSec LAPI status fix has been comprehensively tested and is ready for merge. + +### Summary of Changes Verified + +1. Backend correctly returns `lapi_ready` boolean field in CrowdSec status response +2. Frontend `CrowdSecStatus` interface properly types the response +3. UI conditionals correctly use `lapi_ready` for status display logic +4. All existing tests pass with updated mocks +5.
No regressions detected in related security features + +--- + +*Report generated by QA_Security agent* diff --git a/frontend/src/api/crowdsec.ts b/frontend/src/api/crowdsec.ts index b945e45a..6ce2a335 100644 --- a/frontend/src/api/crowdsec.ts +++ b/frontend/src/api/crowdsec.ts @@ -19,8 +19,14 @@ export async function stopCrowdsec() { return resp.data } -export async function statusCrowdsec() { - const resp = await client.get('/admin/crowdsec/status') +export interface CrowdSecStatus { + running: boolean + pid: number + lapi_ready: boolean +} + +export async function statusCrowdsec(): Promise<CrowdSecStatus> { + const resp = await client.get<CrowdSecStatus>('/admin/crowdsec/status') return resp.data } diff --git a/frontend/src/pages/CrowdSecConfig.tsx b/frontend/src/pages/CrowdSecConfig.tsx index 1b4df208..608b1e5a 100644 --- a/frontend/src/pages/CrowdSecConfig.tsx +++ b/frontend/src/pages/CrowdSecConfig.tsx @@ -7,7 +7,7 @@ import { Input } from '../components/ui/Input' import { Switch } from '../components/ui/Switch' import { getSecurityStatus } from '../api/security' import { getFeatureFlags } from '../api/featureFlags' -import { exportCrowdsecConfig, importCrowdsecConfig, listCrowdsecFiles, readCrowdsecFile, writeCrowdsecFile, listCrowdsecDecisions, banIP, unbanIP, CrowdSecDecision, statusCrowdsec } from '../api/crowdsec' +import { exportCrowdsecConfig, importCrowdsecConfig, listCrowdsecFiles, readCrowdsecFile, writeCrowdsecFile, listCrowdsecDecisions, banIP, unbanIP, CrowdSecDecision, statusCrowdsec, CrowdSecStatus } from '../api/crowdsec' import { listCrowdsecPresets, pullCrowdsecPreset, applyCrowdsecPreset, getCrowdsecPresetCache } from '../api/presets' import { createBackup } from '../api/backups' import { updateSetting } from '../api/settings' @@ -62,7 +62,7 @@ export default function CrowdSecConfig() { }, [consoleEnrollmentEnabled, initialCheckComplete]) // Add LAPI status check with polling - const lapiStatusQuery = useQuery({ + const lapiStatusQuery = useQuery<CrowdSecStatus>({ queryKey:
['crowdsec-lapi-status'], queryFn: statusCrowdsec, enabled: consoleEnrollmentEnabled && initialCheckComplete, @@ -594,8 +594,8 @@ export default function CrowdSecConfig() {

{consoleErrors.submit}

)} - {/* Warning when CrowdSec LAPI is not running */} - {lapiStatusQuery.data && !lapiStatusQuery.data.running && initialCheckComplete && ( + {/* Yellow warning: Process running but LAPI initializing */} + {lapiStatusQuery.data && lapiStatusQuery.data.running && !lapiStatusQuery.data.lapi_ready && initialCheckComplete && (
@@ -616,15 +616,38 @@ export default function CrowdSecConfig() { > Check Now - {!status?.crowdsec?.enabled && ( - - )} +
+
+ + )} + + {/* Red warning: Process not running at all */} + {lapiStatusQuery.data && !lapiStatusQuery.data.running && initialCheckComplete && ( +
+ +
+

+ CrowdSec is not running +

+

+ The CrowdSec process is not currently running. Enable CrowdSec from the Security Dashboard to use console enrollment features. +

+
+ +
@@ -677,12 +700,12 @@ export default function CrowdSecConfig() {