diff --git a/backend/internal/api/handlers/crowdsec_handler.go b/backend/internal/api/handlers/crowdsec_handler.go index bb4956ff..7248a152 100644 --- a/backend/internal/api/handlers/crowdsec_handler.go +++ b/backend/internal/api/handlers/crowdsec_handler.go @@ -246,7 +246,7 @@ func (h *CrowdsecHandler) Stop(c *gin.Context) { c.JSON(http.StatusOK, gin.H{"status": "stopped"}) } -// Status returns simple running state. +// Status returns running state including LAPI availability check. func (h *CrowdsecHandler) Status(c *gin.Context) { ctx := c.Request.Context() running, pid, err := h.Executor.Status(ctx, h.DataDir) @@ -254,7 +254,25 @@ func (h *CrowdsecHandler) Status(c *gin.Context) { c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) return } - c.JSON(http.StatusOK, gin.H{"running": running, "pid": pid}) + + // Check LAPI connectivity if process is running + lapiReady := false + if running { + args := []string{"lapi", "status"} + if _, err := os.Stat(filepath.Join(h.DataDir, "config.yaml")); err == nil { + args = append([]string{"-c", filepath.Join(h.DataDir, "config.yaml")}, args...) + } + checkCtx, cancel := context.WithTimeout(ctx, 2*time.Second) + _, checkErr := h.CmdExec.Execute(checkCtx, "cscli", args...) + cancel() + lapiReady = (checkErr == nil) + } + + c.JSON(http.StatusOK, gin.H{ + "running": running, + "pid": pid, + "lapi_ready": lapiReady, + }) } // ImportConfig accepts a tar.gz or zip upload and extracts into DataDir (backing up existing config). 
diff --git a/backend/internal/api/handlers/crowdsec_handler_test.go b/backend/internal/api/handlers/crowdsec_handler_test.go index 41efccfe..14d7bdae 100644 --- a/backend/internal/api/handlers/crowdsec_handler_test.go +++ b/backend/internal/api/handlers/crowdsec_handler_test.go @@ -1348,6 +1348,115 @@ func TestCrowdsecHandler_StartReturnsImmediatelyIfProcessFailsToStart(t *testing require.Equal(t, http.StatusInternalServerError, w.Code) } +// ============================================ +// Status Handler lapi_ready Tests +// ============================================ + +func TestCrowdsecHandler_StatusReturnsLAPIReadyWhenRunning(t *testing.T) { + gin.SetMode(gin.TestMode) + db := setupCrowdDB(t) + tmpDir := t.TempDir() + + // Create an executor that reports as running + runningExec := &fakeExec{started: true} + + // Create a command executor that succeeds (LAPI is ready) + successCmdExec := &mockCmdExec{err: nil} + + h := NewCrowdsecHandler(db, runningExec, "/bin/false", tmpDir) + h.CmdExec = successCmdExec + + r := gin.New() + g := r.Group("/api/v1") + h.RegisterRoutes(g) + + w := httptest.NewRecorder() + req := httptest.NewRequest(http.MethodGet, "/api/v1/admin/crowdsec/status", http.NoBody) + r.ServeHTTP(w, req) + + require.Equal(t, http.StatusOK, w.Code) + + var response map[string]interface{} + err := json.Unmarshal(w.Body.Bytes(), &response) + require.NoError(t, err) + + require.Equal(t, true, response["running"]) + require.Equal(t, float64(12345), response["pid"]) + require.Equal(t, true, response["lapi_ready"], "lapi_ready should be true when cscli lapi status succeeds") +} + +func TestCrowdsecHandler_StatusReturnsLAPINotReadyWhenCmdFails(t *testing.T) { + gin.SetMode(gin.TestMode) + db := setupCrowdDB(t) + tmpDir := t.TempDir() + + // Create an executor that reports as running + runningExec := &fakeExec{started: true} + + // Create a command executor that fails (LAPI not ready) + failCmdExec := &mockCmdExec{err: errors.New("LAPI not initialized")} + + h 
:= NewCrowdsecHandler(db, runningExec, "/bin/false", tmpDir) + h.CmdExec = failCmdExec + + r := gin.New() + g := r.Group("/api/v1") + h.RegisterRoutes(g) + + w := httptest.NewRecorder() + req := httptest.NewRequest(http.MethodGet, "/api/v1/admin/crowdsec/status", http.NoBody) + r.ServeHTTP(w, req) + + require.Equal(t, http.StatusOK, w.Code) + + var response map[string]interface{} + err := json.Unmarshal(w.Body.Bytes(), &response) + require.NoError(t, err) + + require.Equal(t, true, response["running"]) + require.Equal(t, float64(12345), response["pid"]) + require.Equal(t, false, response["lapi_ready"], "lapi_ready should be false when cscli lapi status fails") +} + +func TestCrowdsecHandler_StatusReturnsLAPINotReadyWhenStopped(t *testing.T) { + gin.SetMode(gin.TestMode) + db := setupCrowdDB(t) + tmpDir := t.TempDir() + + // Create an executor that reports as stopped + stoppedExec := &fakeExec{started: false} + + h := NewCrowdsecHandler(db, stoppedExec, "/bin/false", tmpDir) + + r := gin.New() + g := r.Group("/api/v1") + h.RegisterRoutes(g) + + w := httptest.NewRecorder() + req := httptest.NewRequest(http.MethodGet, "/api/v1/admin/crowdsec/status", http.NoBody) + r.ServeHTTP(w, req) + + require.Equal(t, http.StatusOK, w.Code) + + var response map[string]interface{} + err := json.Unmarshal(w.Body.Bytes(), &response) + require.NoError(t, err) + + require.Equal(t, false, response["running"]) + require.Equal(t, float64(0), response["pid"]) + require.Equal(t, false, response["lapi_ready"], "lapi_ready should be false when process is not running") +} + +// mockCmdExec is a mock command executor for testing +type mockCmdExec struct { + err error + output []byte +} + +func (m *mockCmdExec) Execute(ctx context.Context, name string, args ...string) ([]byte, error) { + return m.output, m.err +} + type failingExec struct{} func (f *failingExec) Start(ctx context.Context, binPath, configDir string) (int, error) { diff --git a/docs/plans/current_spec.md 
b/docs/plans/current_spec.md index f75350f5..8ea6be0f 100644 --- a/docs/plans/current_spec.md +++ b/docs/plans/current_spec.md @@ -1,8 +1,10 @@ -# CrowdSec Console Enrollment Persistence Issue - ARCHITECTURAL ROOT CAUSE +# CrowdSec LAPI Status Bug - Diagnostic & Fix Plan -**Date:** December 14, 2025 (Updated with Architectural Analysis) -**Issue:** Console enrollment shows "enrolled" locally but doesn't appear on crowdsec.net -**Status:** 🚨 **ARCHITECTURAL ISSUE IDENTIFIED** - Environment variable dependency breaks GUI control +**Date:** December 14, 2025 +**Issue:** CrowdSecConfig page persistently shows "LAPI is initializing" even when LAPI is running +**Status:** 🎯 **ROOT CAUSE IDENTIFIED** - Status endpoint checks process, not LAPI connectivity +**Priority:** HIGH (Blocks Console Enrollment Feature) +**Previous Issue:** [crowdsec_lapi_error_diagnostic.md](crowdsec_lapi_error_diagnostic.md) - Race condition fix introduced this regression --- @@ -10,994 +12,564 @@ ### Critical Discovery -The `CHARON_SECURITY_CROWDSEC_MODE` environment variable is **LEGACY/DEPRECATED** technical debt from when Charon supported external CrowdSec instances (no longer supported). Now that Charon offers the **import config option**, CrowdSec should be **entirely GUI-controlled**, but the code still checks environment variables. +After implementing fixes from `docs/plans/crowdsec_lapi_error_diagnostic.md`, the CrowdSecConfig page now persistently displays: + +> "CrowdSec Local API is initializing... +> The CrowdSec process is running but the Local API (LAPI) is still starting up." + +This message appears **even when LAPI is actually running and reachable**. The fix introduced a regression where the Status endpoint was not updated to match the new LAPI-aware Start endpoint. ### Root Cause Chain -1. User enables CrowdSec via GUI → Database updated (`security.crowdsec.enabled = true`) -2. Backend sees CrowdSec enabled and allows Console enrollment -3. 
**BUT** `docker-entrypoint.sh` checks `SECURITY_CROWDSEC_MODE` environment variable -4. LAPI never starts because env var says "disabled" -5. Enrollment command runs but cannot contact LAPI -6. User sees "enrolled" in UI but nothing appears on crowdsec.net +1. `Start()` handler was correctly updated to wait for LAPI and return `lapi_ready: true/false` +2. **BUT** `Status()` handler was **NOT updated** - still only checks process status +3. Frontend expects `running` to mean "LAPI responding" +4. Backend returns `running: true` meaning only "process running" +5. **MISMATCH:** Frontend needs `lapi_ready` field to determine actual LAPI status -### Why This is an Architecture Problem +### Why This is a Regression -- **WAF, ACL, and Rate Limiting** are all GUI-controlled via Settings table -- **CrowdSec** still has legacy environment variable checks in entrypoint script -- Backend has proper `Start()` and `Stop()` handlers but they're not integrated with container lifecycle -- This creates inconsistent UX where GUI toggle doesn't actually control the service +- The original fix added LAPI readiness check to `Start()` handler ✅ +- But forgot to add the same check to `Status()` handler ❌ +- Frontend now uses `statusCrowdsec()` for polling LAPI status +- This endpoint doesn't actually verify LAPI connectivity ### Impact -- **ALL users** attempting Console enrollment are affected -- **Not a configuration issue** - users cannot fix this without workaround -- **Technical debt** preventing proper GUI-based security orchestration +- Console enrollment section always shows "initializing" warning +- Enroll button is disabled even when LAPI is working +- Users cannot complete console enrollment despite CrowdSec being functional --- ## Executive Summary -The CrowdSec console enrollment appears successful locally (green checkmark in Charon UI) but the instance **does not appear on the CrowdSec Console dashboard at crowdsec.net**. 
- -**🚨 CRITICAL ARCHITECTURAL ISSUE:** The `CHARON_SECURITY_CROWDSEC_MODE` environment variable is **LEGACY/DEPRECATED** from when Charon supported external CrowdSec instances. Now that Charon offers the **import config option**, CrowdSec is **always internally managed** and should be **GUI-controlled**, not environment variable controlled. - -**✅ TRUE ROOT CAUSE:** The code still checks the legacy `SECURITY_CROWDSEC_MODE` environment variable in `docker-entrypoint.sh`, which prevents LAPI from starting even when the GUI says CrowdSec is enabled. The `cscli console enroll` command **requires LAPI to be running** to complete the enrollment registration with crowdsec.net. - -**CORRECTED UNDERSTANDING:** Enrollment tokens are **REUSABLE** (confirmed by user testing). The issue is NOT token exhaustion - it's that the enrollment process cannot complete without an active LAPI connection. - -**Key Finding:** The enrollment command executes without error even when LAPI is down, causing the database to show "enrolled" status while the actual Console registration never happens. 
- ---- - -## Architectural Analysis - -### Current Architecture (INCORRECT) - -**Environment Variable Dependency:** - -```bash -# docker-entrypoint.sh checks this legacy env var: -SECURITY_CROWDSEC_MODE=${CERBERUS_SECURITY_CROWDSEC_MODE:-${CHARON_SECURITY_CROWDSEC_MODE:-$CPM_SECURITY_CROWDSEC_MODE}} - -if [ "$SECURITY_CROWDSEC_MODE" = "local" ]; then - crowdsec -c /etc/crowdsec/config.yaml & -fi -``` - -**The Problem:** - -- User enables CrowdSec via GUI β†’ `security.crowdsec.enabled = true` in database -- Backend sees CrowdSec enabled and allows enrollment -- But `docker-entrypoint.sh` checks **environment variable**, not database -- LAPI never starts because env var says "disabled" -- Enrollment command runs but cannot contact LAPI -- User sees "enrolled" in UI but nothing on crowdsec.net - -### Correct Architecture (GUI-Controlled) - -**How Other Security Features Work (Pattern to Follow):** - -WAF, Rate Limiting, and ACL are all **GUI-controlled** through the Settings table: - -- `security.waf.enabled` β†’ Controls WAF mode -- `security.rate_limit.enabled` β†’ Controls rate limiting -- `security.acl.enabled` β†’ Controls ACL mode - -These settings are read by: - -1. **Backend handlers** via `security_handler.go:GetStatus()` -2. **Caddy config generator** via `caddy/manager.go:computeEffectiveFlags()` -3. **Frontend** via API calls to `/api/v1/security/status` - -**CrowdSec Should Follow Same Pattern:** - -- GUI toggle β†’ `security.crowdsec.enabled` in Settings table -- Backend reads setting and manages CrowdSec process lifecycle -- No environment variable dependency - -### Import Config Feature (Why External Mode is Deprecated) - -The import config feature (`importCrowdsecConfig`) allows users to: - -1. Upload a complete CrowdSec configuration (tar.gz) -2. Import pre-configured settings, collections, and bouncers -3. 
Manage CrowdSec entirely through Charon's GUI - -**This replaced the need for "external" mode:** - -- Old way: Set `CROWDSEC_MODE=external` and point to external LAPI -- New way: Import your existing config and let Charon manage it internally - ---- - -## Forensic Investigation Findings - -### Environment Status (Verified Dec 14, 2025) - -**βœ… CAPI Registration:** Working - -```bash -$ docker exec charon cscli capi status -βœ“ Loaded credentials from /etc/crowdsec/online_api_credentials.yaml -βœ“ You can successfully interact with Central API (CAPI) -``` - -**❌ LAPI Status:** NOT RUNNING - -```bash -$ docker exec charon cscli lapi status -βœ— Error: dial tcp 127.0.0.1:8085: connection refused -``` - -**❌ CrowdSec Agent:** NOT RUNNING - -```bash -$ docker exec charon ps aux | grep crowdsec -(no processes found) -``` - -**Environment Variables:** - -```bash -CHARON_SECURITY_CROWDSEC_MODE=disabled # ← THIS IS THE PROBLEM -``` - -### Why Enrollment Appears Successful - -The enrollment flow in `backend/internal/crowdsec/console_enroll.go`: - -1. βœ… Validates token format -2. βœ… Ensures CAPI registered (`ensureCAPIRegistered`) -3. βœ… Updates database to "enrolling" status -4. βœ… Executes `cscli console enroll ` -5. **❌ Command exits with code 0 even when LAPI is down** -6. βœ… Updates database to "enrolled" status -7. βœ… Returns success to UI - -**The Bug:** `cscli console enroll` does NOT verify LAPI connectivity before returning success. It writes local state but cannot register with crowdsec.net Console API without an active LAPI connection. 
- ---- - -## Root Cause: Legacy Environment Variable Architecture - -### Confirmed (100% Confidence) - -**The Issue:** The `docker-entrypoint.sh` script only starts CrowdSec LAPI when checking a **legacy environment variable**, not the **GUI setting**: - -```bash -# docker-entrypoint.sh (INCORRECT ARCHITECTURE) -SECURITY_CROWDSEC_MODE=${CERBERUS_SECURITY_CROWDSEC_MODE:-${CHARON_SECURITY_CROWDSEC_MODE:-$CPM_SECURITY_CROWDSEC_MODE}} - -if [ "$SECURITY_CROWDSEC_MODE" = "local" ]; then - crowdsec -c /etc/crowdsec/config.yaml & -fi -``` - -**Current State:** - -- GUI setting: `security.crowdsec.enabled = true` (in database) -- Environment: `CHARON_SECURITY_CROWDSEC_MODE=disabled` -- Result: LAPI NOT RUNNING - -**Correct Architecture:** - -- CrowdSec should be started/stopped by **backend handlers** (`Start()` and `Stop()` methods) -- The GUI toggle should call these handlers, just like WAF and ACL -- No environment variable checks in entrypoint script - -**Console Enrollment REQUIRES:** - -1. CrowdSec agent running -2. Local API (LAPI) running on port 8085 -3. Active connection between LAPI and Console API (api.crowdsec.net) -4. **All controlled by GUI, not environment variables** - ---- - -## Comparison: How WAF/ACL Work (Correct Pattern) - -### WAF Control Flow (GUI β†’ Backend β†’ Caddy) - -1. **Frontend:** User toggles WAF switch β†’ calls `updateSetting('security.waf.enabled', 'true')` -2. **Backend:** Settings table updated β†’ Caddy config regenerated -3. **Caddy Manager:** Reads `security.waf.enabled` from database β†’ enables WAF handlers -4. **No Environment Variable Checks** - -### CrowdSec Control Flow (BROKEN - Still Uses Env Vars) - -1. **Frontend:** User toggles CrowdSec switch β†’ calls `updateSetting('security.crowdsec.enabled', 'true')` -2. **Backend:** Settings table updated β†’ BUT... -3. **Entrypoint Script:** Checks `SECURITY_CROWDSEC_MODE` env var (LEGACY) -4. 
**Result:** LAPI never starts because env var says "disabled" - -### How CrowdSec SHOULD Work (GUI-Controlled) - -1. **Frontend:** User toggles CrowdSec switch β†’ calls `/api/v1/admin/crowdsec/start` -2. **Backend Handler:** `CrowdsecHandler.Start()` executes β†’ starts LAPI process -3. **Process Management:** Backend tracks PID and monitors health -4. **No Environment Variable Dependency** - -**Evidence from Code:** +The `Start()` handler was correctly updated to wait for LAPI readiness before returning (lines 201-236 in [crowdsec_handler.go](../../backend/internal/api/handlers/crowdsec_handler.go#L201-L236)): ```go -// backend/internal/api/handlers/crowdsec_handler.go -// These handlers already exist but aren't properly integrated! - -func (h *CrowdsecHandler) Start(c *gin.Context) { - ctx := c.Request.Context() - pid, err := h.Executor.Start(ctx, h.BinPath, h.DataDir) - if err != nil { - c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) - return - } - c.JSON(http.StatusOK, gin.H{"status": "started", "pid": pid}) -} - -func (h *CrowdsecHandler) Stop(c *gin.Context) { - ctx := c.Request.Context() - if err := h.Executor.Stop(ctx, h.DataDir); err != nil { - c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) - return - } - c.JSON(http.StatusOK, gin.H{"status": "stopped"}) -} -``` - -**Frontend Integration:** - -```typescript -// frontend/src/pages/Security.tsx -// CrowdSec toggle DOES call start/stop, but LAPI never started by entrypoint! - -const crowdsecPowerMutation = useMutation({ - mutationFn: async (enabled: boolean) => { - await updateSetting('security.crowdsec.enabled', enabled ? 
'true' : 'false', 'security', 'bool') - if (enabled) { - await startCrowdsec() // ← Calls backend Start() handler - } else { - await stopCrowdsec() // ← Calls backend Stop() handler - } - return enabled - }, +// Start() now waits for LAPI and returns lapi_ready: true/false +c.JSON(http.StatusOK, gin.H{ + "status": "started", + "pid": pid, + "lapi_ready": true, // NEW: indicates LAPI is ready }) ``` -**The Missing Piece:** The `docker-entrypoint.sh` should ALWAYS initialize CrowdSec but NOT start the agent. The backend handlers should control the lifecycle. - ---- - -## Immediate Fix (For User) - -**WORKAROUND (Until Architecture Fixed):** - -Set the legacy environment variable to match the GUI state: - -**Step 1: Enable CrowdSec Local Mode (Environment Variable)** - -Update `docker-compose.yml` or `docker-compose.override.yml`: - -```yaml -services: - charon: - environment: - - CHARON_SECURITY_CROWDSEC_MODE=local # Temporary workaround for legacy check -``` - -**Step 2: Recreate Container** - -```bash -docker compose down -docker compose up -d -``` - -**Step 3: Verify LAPI is Running** - -```bash -# Wait 30 seconds for LAPI to start -docker exec charon cscli lapi status -``` - -Expected output: - -``` -βœ“ Loaded credentials from /etc/crowdsec/local_api_credentials.yaml -βœ“ You can successfully interact with Local API (LAPI) -``` - -**Step 4: Re-submit Enrollment Token** - -- Go to Charon UI β†’ Cerberus β†’ CrowdSec -- Submit enrollment token (same token works!) -- Verify instance appears on crowdsec.net dashboard - ---- - -## Long-Term Fix Implementation Plan (ARCHITECTURE CORRECTION) - -### Priority Overview - -1. **CRITICAL:** Remove environment variable dependency from entrypoint script -2. **CRITICAL:** Ensure backend handlers control CrowdSec lifecycle -3. **HIGH:** Add LAPI availability check before enrollment -4. **HIGH:** Update documentation to reflect GUI-only control -5. 
**MEDIUM:** Add migration guide for users with env vars set - ---- - -### Fix 1: Remove Environment Variable Dependency (CRITICAL PRIORITY) - -**Problem:** `docker-entrypoint.sh` checks legacy `SECURITY_CROWDSEC_MODE` env var -**Solution:** Remove env var check, let backend control CrowdSec lifecycle -**Time:** 45 minutes -**Files affected:** `docker-entrypoint.sh`, `backend/internal/api/handlers/crowdsec_handler.go` - -**Implementation:** - -**Part A: Update docker-entrypoint.sh** - -Remove the CrowdSec agent auto-start logic: - -```bash -# BEFORE (INCORRECT - Environment Variable Control): -if [ "$SECURITY_CROWDSEC_MODE" = "local" ]; then - echo "CrowdSec Local Mode enabled." - crowdsec -c /etc/crowdsec/config.yaml & - CROWDSEC_PID=$! -fi - -# AFTER (CORRECT - Backend Control): -# CrowdSec initialization (config setup) always runs -# But agent startup is controlled by backend handlers via GUI -# No automatic startup based on environment variables -``` - -**Part B: Ensure Backend Handlers Work Correctly** - -The `CrowdsecHandler.Start()` already exists and works: +However, the `Status()` handler was **NOT updated** and still only checks process status (lines 287-294): ```go -// backend/internal/api/handlers/crowdsec_handler.go -func (h *CrowdsecHandler) Start(c *gin.Context) { +func (h *CrowdsecHandler) Status(c *gin.Context) { ctx := c.Request.Context() - pid, err := h.Executor.Start(ctx, h.BinPath, h.DataDir) + running, pid, err := h.Executor.Status(ctx, h.DataDir) // Only checks PID! if err != nil { c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) return } - c.JSON(http.StatusOK, gin.H{"status": "started", "pid": pid}) + c.JSON(http.StatusOK, gin.H{"running": running, "pid": pid}) // Missing lapi_ready! 
} ``` -**Part C: Frontend Integration Verification** - -Verify the frontend correctly calls start/stop: - -```typescript -// frontend/src/pages/Security.tsx (ALREADY CORRECT) -const crowdsecPowerMutation = useMutation({ - mutationFn: async (enabled: boolean) => { - await updateSetting('security.crowdsec.enabled', enabled ? 'true' : 'false', 'security', 'bool') - if (enabled) { - await startCrowdsec() // Calls /api/v1/admin/crowdsec/start - } else { - await stopCrowdsec() // Calls /api/v1/admin/crowdsec/stop - } - return enabled - }, -}) -``` - -**Testing:** - -1. Remove env var from docker-compose.yml -2. Start container (CrowdSec should NOT auto-start) -3. Toggle CrowdSec in GUI (should start LAPI) -4. Verify `cscli lapi status` shows running -5. Toggle off (should stop LAPI) - --- -### Fix 2: Add LAPI Availability Check Before Enrollment (CRITICAL PRIORITY) +## Root Cause Analysis -### Fix 2: Add LAPI Availability Check Before Enrollment (CRITICAL PRIORITY) +### The Executor's Status() Method -**Problem:** Enrollment command succeeds even when LAPI is down -**Solution:** Verify LAPI connectivity before allowing enrollment -**Time:** 30 minutes -**Files affected:** `backend/internal/crowdsec/console_enroll.go` +The `DefaultCrowdsecExecutor.Status()` in [crowdsec_exec.go](../../backend/internal/api/handlers/crowdsec_exec.go#L65-L87) only checks: -**Implementation:** - -Add LAPI health check before enrollment: +1. If PID file exists +2. If process with that PID is running (via signal 0) ```go -func (s *ConsoleEnrollmentService) checkLAPIAvailable(ctx context.Context) error { - args := []string{"lapi", "status"} - if _, err := os.Stat(filepath.Join(s.dataDir, "config.yaml")); err == nil { - args = append([]string{"-c", filepath.Join(s.dataDir, "config.yaml")}, args...) 
- } - _, err := s.exec.ExecuteWithEnv(ctx, "cscli", args, nil) +func (e *DefaultCrowdsecExecutor) Status(ctx context.Context, configDir string) (running bool, pid int, err error) { + b, err := os.ReadFile(e.pidFile(configDir)) if err != nil { - return fmt.Errorf("CrowdSec Local API is not running - please enable CrowdSec via the GUI toggle first") + // Missing pid file is treated as not running + return false, 0, nil } - return nil + // ... check if process is alive via signal 0 ... + return true, pid, nil } ``` -Update `Enroll()` method: +It does **NOT** check if LAPI HTTP endpoint is responding. -```go -// Before: if err := s.ensureCAPIRegistered(ctx); err != nil { -if err := s.checkLAPIAvailable(ctx); err != nil { - return ConsoleEnrollmentStatus{}, err -} -if err := s.ensureCAPIRegistered(ctx); err != nil { - return ConsoleEnrollmentStatus{}, err -} -``` +### Frontend Expectation Mismatch ---- +The frontend in [CrowdSecConfig.tsx](../../frontend/src/pages/CrowdSecConfig.tsx#L71-L77) queries LAPI status: -### Fix 3: Add UI Warning When CrowdSec is Disabled (HIGH PRIORITY) - -**Problem:** Users can attempt enrollment when CrowdSec is disabled -**Solution:** Add status check to enrollment UI with clear instructions -**Time:** 20 minutes -**Files affected:** `frontend/src/pages/CrowdSecConfig.tsx` - -**Implementation:** - -Add LAPI status detection to enrollment form: - -```typescript -const crowdsecStatusQuery = useQuery({ - queryKey: ['crowdsec-status'], - queryFn: async () => { - const response = await client.get('/api/v1/admin/crowdsec/status'); - return response.data; - }, - enabled: consoleEnrollmentEnabled, +```tsx +const lapiStatusQuery = useQuery({ + queryKey: ['crowdsec-lapi-status'], + queryFn: statusCrowdsec, + enabled: consoleEnrollmentEnabled && initialCheckComplete, refetchInterval: 5000, // Poll every 5 seconds -}); + retry: false, +}) +``` -// In enrollment form JSX: -{!crowdsecStatusQuery.data?.running && ( - - - - CrowdSec Local API is not 
running. Please enable CrowdSec using the toggle switch - in the Security dashboard before enrolling in the Console. - - - +And displays a warning based on `running` field (lines 207-231): + +```tsx +{lapiStatusQuery.data && !lapiStatusQuery.data.running && initialCheckComplete && ( +
+

CrowdSec Local API is initializing...

+
+)} +``` + +**The Problem:** The frontend checks `lapiStatusQuery.data?.running` expecting it to indicate LAPI connectivity. But the backend returns `running: true` which only means "process is running", not "LAPI is responding". + +### Evidence Chain + +| Component | File | Line | Returns | Actually Checks | +|-----------|------|------|---------|-----------------| +| Backend Handler | crowdsec_handler.go | 287-294 | `{running, pid}` | Process running via PID | +| Backend Executor | crowdsec_exec.go | 65-87 | `(running, pid, err)` | PID file + signal 0 | +| Frontend API | crowdsec.ts | 18-21 | `resp.data` | N/A (passthrough) | +| Frontend Query | CrowdSecConfig.tsx | 71-77 | `lapiStatusQuery.data` | Checks `.running` field | +| Frontend UI | CrowdSecConfig.tsx | 207-231 | Shows warning | `!running` | + +**Bug:** Frontend interprets `running` as "LAPI responding" but backend returns "process running". + +--- + +## Detailed Analysis: Why Warning Always Shows + +Looking at the conditional again: + +```tsx +{lapiStatusQuery.data && !lapiStatusQuery.data.running && initialCheckComplete && ( +``` + +This shows the warning when: +- `lapiStatusQuery.data` is truthy ✓ +- `!lapiStatusQuery.data.running` is truthy (i.e., `running` is falsy) +- `initialCheckComplete` is truthy ✓ + +**Re-analyzing:** If `running: true`, then `!true = false`, so warning should NOT show. + +**But user reports it DOES show!** + +**Possible causes:** + +1. **Process not actually running:** The `Status()` endpoint returns `running: false` because CrowdSec process crashed or PID file is missing/stale +2. **Different `running` field:** Frontend might be checking a different property +3. **Query state issue:** React Query might be returning stale data + +**Most Likely:** Looking at the message being displayed: + +> "CrowdSec Local API is **initializing**..." + +This message was designed for the case where **process IS running** but **LAPI is NOT ready yet**. 
But the current conditional shows it when `running` is false! + +**The Fix Needed:** The conditional should check: +- Process running (`running: true`) AND +- LAPI not ready (`lapi_ready: false`) + +NOT just: +- Process not running (`running: false`) + +--- + +## The Complete Fix + +### Files to Modify + +1. **Backend:** [backend/internal/api/handlers/crowdsec_handler.go](../../backend/internal/api/handlers/crowdsec_handler.go#L287-L294) +2. **Frontend API:** [frontend/src/api/crowdsec.ts](../../frontend/src/api/crowdsec.ts#L18-L21) +3. **Frontend UI:** [frontend/src/pages/CrowdSecConfig.tsx](../../frontend/src/pages/CrowdSecConfig.tsx#L207-L231) +4. **Tests:** [backend/internal/api/handlers/crowdsec_handler_test.go](../../backend/internal/api/handlers/crowdsec_handler_test.go) + +### Change 1: Backend Status Handler + +**File:** `backend/internal/api/handlers/crowdsec_handler.go` +**Location:** Lines 287-294 + +**Before:** +```go +// Status returns simple running state. +func (h *CrowdsecHandler) Status(c *gin.Context) { + ctx := c.Request.Context() + running, pid, err := h.Executor.Status(ctx, h.DataDir) + if err != nil { + c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) + return + } + c.JSON(http.StatusOK, gin.H{"running": running, "pid": pid}) +} +``` + +**After:** +```go +// Status returns running state including LAPI availability check. +func (h *CrowdsecHandler) Status(c *gin.Context) { + ctx := c.Request.Context() + running, pid, err := h.Executor.Status(ctx, h.DataDir) + if err != nil { + c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) + return + } + + // Check LAPI connectivity if process is running + lapiReady := false + if running { + args := []string{"lapi", "status"} + if _, err := os.Stat(filepath.Join(h.DataDir, "config.yaml")); err == nil { + args = append([]string{"-c", filepath.Join(h.DataDir, "config.yaml")}, args...) 
+ } + checkCtx, cancel := context.WithTimeout(ctx, 2*time.Second) + _, checkErr := h.CmdExec.Execute(checkCtx, "cscli", args...) + cancel() + lapiReady = (checkErr == nil) + } + + c.JSON(http.StatusOK, gin.H{ + "running": running, + "pid": pid, + "lapi_ready": lapiReady, + }) +} +``` + +### Change 2: Frontend API Type + +**File:** `frontend/src/api/crowdsec.ts` +**Location:** Lines 18-21 + +**Before:** +```typescript +export async function statusCrowdsec() { + const resp = await client.get('/admin/crowdsec/status') + return resp.data +} +``` + +**After:** +```typescript +export interface CrowdSecStatus { + running: boolean + pid: number + lapi_ready: boolean +} + +export async function statusCrowdsec(): Promise<CrowdSecStatus> { + const resp = await client.get('/admin/crowdsec/status') + return resp.data +} +``` + +### Change 3: Frontend CrowdSecConfig Conditional Logic + +**File:** `frontend/src/pages/CrowdSecConfig.tsx` +**Location:** Lines 207-231 + +**Before:** +```tsx +{/* Warning when CrowdSec LAPI is not running */} +{lapiStatusQuery.data && !lapiStatusQuery.data.running && initialCheckComplete && ( 
+ +
+

+ CrowdSec Local API is initializing... +

+

+ The CrowdSec process is running but the Local API (LAPI) is still starting up. + This typically takes 5-10 seconds after enabling CrowdSec. + {lapiStatusQuery.isRefetching && ' Checking again in 5 seconds...'} +

+
+ + {!status?.crowdsec?.enabled && ( + + )} +
+
+
+)} +``` + +**After:** +```tsx +{/* Warning when CrowdSec process is running but LAPI is not ready */} +{lapiStatusQuery.data && lapiStatusQuery.data.running && !lapiStatusQuery.data.lapi_ready && initialCheckComplete && ( +
+ +
+

+ CrowdSec Local API is initializing... +

+

+ The CrowdSec process is running but the Local API (LAPI) is still starting up. + This typically takes 5-10 seconds after enabling CrowdSec. + {lapiStatusQuery.isRefetching && ' Checking again in 5 seconds...'} +

+
+ +
+
+
)} - +{/* Warning when CrowdSec is not running at all */} +{lapiStatusQuery.data && !lapiStatusQuery.data.running && initialCheckComplete && ( +
+ +
+

+ CrowdSec is not running +

+

+ Please enable CrowdSec using the toggle switch in the Security dashboard before enrolling in the Console. +

+ +
+
+)} ``` ---- +### Change 4: Update Enrollment Button Disabled State -### Fix 4: Update Documentation (HIGH PRIORITY) +**File:** `frontend/src/pages/CrowdSecConfig.tsx` +**Location:** Lines 255-289 (Enroll, Rotate key, and Retry enrollment buttons) -**Problem:** Documentation mentions environment variables for CrowdSec control -**Solution:** Update docs to reflect GUI-only control, mark env vars as deprecated -**Time:** 30 minutes -**Files affected:** - -- `docs/security.md` -- `docs/cerberus.md` -- `docs/troubleshooting/crowdsec.md` -- `README.md` - -**Changes Needed:** - -1. **Mark Environment Variables as Deprecated:** - - ```md - ⚠️ **DEPRECATED:** `CHARON_SECURITY_CROWDSEC_MODE` environment variable is no longer used. - CrowdSec is now controlled via the GUI in the Security dashboard. - ``` - -2. **Add GUI Control Instructions:** - - ```md - ## Enabling CrowdSec - - 1. Navigate to **Security** dashboard - 2. Toggle the **CrowdSec** switch to **ON** - 3. The backend will start the CrowdSec agent and Local API (LAPI) - 4. Verify status shows "Active" with a running PID - - **Note:** CrowdSec is internally managed by Charon. No external setup required. - ``` - -3. **Update Console Enrollment Prerequisites:** - - ```md - ## Console Enrollment Prerequisites - - Before enrolling your Charon instance with CrowdSec Console: - - 1. βœ… CrowdSec must be **enabled** in the GUI (toggle switch ON) - 2. βœ… Local API (LAPI) must be **running** (check status) - 3. βœ… Feature flag `feature.crowdsec.console_enrollment` must be enabled - 4. 
βœ… Valid enrollment token from crowdsec.net - - **Troubleshooting:** If enrollment fails, verify LAPI is running: - ```bash - docker exec charon cscli lapi status - ``` - - ``` - ---- - -### Fix 5: Add Migration Guide for Existing Users (MEDIUM PRIORITY) - -**Problem:** Users may have env vars set that will no longer work -**Solution:** Add migration guide to help users transition -**Time:** 15 minutes -**Files affected:** `docs/migration-guide.md` (new file) - -**Content:** - -```md -# CrowdSec Control Migration Guide - -## What Changed - -**Before (v1.x):** CrowdSec was controlled by environment variables: -```yaml -environment: - - CHARON_SECURITY_CROWDSEC_MODE=local +**Before:** +```tsx +disabled={isConsolePending || (lapiStatusQuery.data && !lapiStatusQuery.data.running) || !enrollmentToken.trim()} ``` -**After (v2.x):** CrowdSec is controlled via GUI toggle in Security dashboard. - -## Migration Steps - -### Step 1: Remove Environment Variable - -Edit your `docker-compose.yml` and remove: - -```yaml -# REMOVE THIS LINE: -- CHARON_SECURITY_CROWDSEC_MODE=local +**After:** +```tsx +disabled={isConsolePending || (lapiStatusQuery.data && !lapiStatusQuery.data.lapi_ready) || !enrollmentToken.trim()} ``` -### Step 2: Restart Container +Also update the `title` attributes: -```bash -docker compose down -docker compose up -d -``` - -### Step 3: Enable via GUI - -1. Open Charon UI β†’ **Security** dashboard -2. Toggle **CrowdSec** switch to **ON** -3. Verify status shows "Active" - -### Step 4: Re-enroll Console (If Applicable) - -If you were enrolled in CrowdSec Console before: - -1. Your enrollment is preserved in the database -2. 
No action needed unless enrollment was incomplete - -## Benefits of GUI Control - -- βœ… No need to restart container to enable/disable -- βœ… Status visible in real-time -- βœ… Consistent with WAF, ACL, and Rate Limiting controls -- βœ… Better integration with Charon's security orchestration - -## Troubleshooting - -**Q: CrowdSec won't start after toggling?** - -- Check logs: `docker logs charon` -- Verify config exists: `docker exec charon ls -la /app/data/crowdsec/config` - -**Q: Console enrollment fails?** - -- Verify LAPI is running: `docker exec charon cscli lapi status` -- Check enrollment prerequisites in [docs/security.md](security.md) - -``` - ---- - -### Fix 6: Add Integration Test (MEDIUM PRIORITY) - -### Fix 6: Add Integration Test (MEDIUM PRIORITY) - -**Problem:** No test coverage for enrollment prerequisites -**Solution:** Add test that verifies LAPI requirement and GUI lifecycle -**Time:** 30 minutes -**Files affected:** -- `backend/internal/crowdsec/console_enroll_test.go` -- `scripts/crowdsec_lifecycle_test.sh` (new file) - -**Implementation:** - -**Unit Test:** -```go -func TestEnroll_RequiresLAPI(t *testing.T) { - exec := &mockExecutor{ - responses: []cmdResponse{ - {out: nil, err: nil}, // capi register success - {out: nil, err: errors.New("connection refused")}, // lapi status fails - }, - } - svc := NewConsoleEnrollmentService(db, exec, tempDir, "secret") - - _, err := svc.Enroll(ctx, ConsoleEnrollRequest{ - EnrollmentKey: "test123token", - AgentName: "agent", - }) - - require.Error(t, err) - require.Contains(t, err.Error(), "Local API is not running") +**Before:** +```tsx +title={ + lapiStatusQuery.data && !lapiStatusQuery.data.running + ? 'CrowdSec LAPI must be running to enroll' + : ... } ``` -**Integration Test Script:** - -```bash -#!/bin/bash -# scripts/crowdsec_lifecycle_test.sh -# Tests GUI-controlled CrowdSec lifecycle - -echo "Testing CrowdSec GUI-controlled lifecycle..." - -# 1. 
Start Charon without env var -docker compose up -d -sleep 5 - -# 2. Verify CrowdSec NOT running by default -docker exec charon cscli lapi status 2>&1 | grep "connection refused" -echo "βœ“ CrowdSec not auto-started without env var" - -# 3. Enable via GUI toggle -curl -X POST -H "Content-Type: application/json" \ - -b cookies.txt \ - -d '{"key": "security.crowdsec.enabled", "value": "true", "category": "security", "type": "bool"}' \ - http://localhost:8080/api/v1/admin/settings - -# 4. Call start endpoint (mimics GUI toggle) -curl -X POST -b cookies.txt \ - http://localhost:8080/api/v1/admin/crowdsec/start - -sleep 10 - -# 5. Verify LAPI running -docker exec charon cscli lapi status | grep "successfully interact" -echo "βœ“ LAPI started via GUI toggle" - -# 6. Disable via GUI -curl -X POST -b cookies.txt \ - http://localhost:8080/api/v1/admin/crowdsec/stop - -sleep 5 - -# 7. Verify LAPI stopped -docker exec charon cscli lapi status 2>&1 | grep "connection refused" -echo "βœ“ LAPI stopped via GUI toggle" - -echo "βœ… All GUI lifecycle tests passed" +**After:** +```tsx +title={ + lapiStatusQuery.data && !lapiStatusQuery.data.lapi_ready + ? 'CrowdSec LAPI must be running to enroll' + : ... 
+} ``` --- -## Summary of Architectural Changes +## Testing Steps -### What's Broken Now (Environment Variable Control) +### Unit Test: Backend Status Handler -``` -β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” -β”‚ docker-compose β”‚ -β”‚ env: MODE= β”‚ ← Environment variable set here -β”‚ disabled β”‚ -β””β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”˜ - β”‚ - v -β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” -β”‚ entrypoint.sh β”‚ -β”‚ if MODE=local β”‚ ← Checks env var, doesn't start LAPI -β”‚ start crowdsecβ”‚ -β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ - β”‚ - v - ❌ LAPI never starts - β”‚ - v -β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” -β”‚ GUI Toggle β”‚ -β”‚ "CrowdSec: ON" β”‚ ← User thinks it's enabled -β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ - β”‚ - v -β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” -β”‚ Enroll Console β”‚ ← Fails silently (LAPI not running) -β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ +Add test in `backend/internal/api/handlers/crowdsec_handler_test.go`: + +```go +func TestCrowdsecHandler_Status_IncludesLAPIReady(t *testing.T) { + mockExec := &fakeExec{running: true, pid: 1234} + mockCmdExec := &mockCommandExecutor{returnErr: nil} // cscli lapi status succeeds + + handler := &CrowdsecHandler{ + Executor: mockExec, + CmdExec: mockCmdExec, + DataDir: "/app/data", + } + + w := httptest.NewRecorder() + c, _ := gin.CreateTestContext(w) + c.Request = httptest.NewRequest(http.MethodGet, "/admin/crowdsec/status", nil) + + handler.Status(c) + + assert.Equal(t, http.StatusOK, w.Code) + + var response map[string]interface{} + json.Unmarshal(w.Body.Bytes(), &response) + + assert.True(t, response["running"].(bool)) + assert.Equal(t, float64(1234), response["pid"].(float64)) + assert.True(t, response["lapi_ready"].(bool)) // NEW: Check lapi_ready is present and true +} + +func TestCrowdsecHandler_Status_LAPINotReady(t *testing.T) { + mockExec := 
&fakeExec{running: true, pid: 1234} + mockCmdExec := &mockCommandExecutor{returnErr: errors.New("connection refused")} // cscli lapi status fails + + handler := &CrowdsecHandler{ + Executor: mockExec, + CmdExec: mockCmdExec, + DataDir: "/app/data", + } + + w := httptest.NewRecorder() + c, _ := gin.CreateTestContext(w) + c.Request = httptest.NewRequest(http.MethodGet, "/admin/crowdsec/status", nil) + + handler.Status(c) + + assert.Equal(t, http.StatusOK, w.Code) + + var response map[string]interface{} + json.Unmarshal(w.Body.Bytes(), &response) + + assert.True(t, response["running"].(bool)) + assert.Equal(t, float64(1234), response["pid"].(float64)) + assert.False(t, response["lapi_ready"].(bool)) // LAPI not ready +} + +func TestCrowdsecHandler_Status_ProcessNotRunning(t *testing.T) { + mockExec := &fakeExec{running: false, pid: 0} + mockCmdExec := &mockCommandExecutor{} + + handler := &CrowdsecHandler{ + Executor: mockExec, + CmdExec: mockCmdExec, + DataDir: "/app/data", + } + + w := httptest.NewRecorder() + c, _ := gin.CreateTestContext(w) + c.Request = httptest.NewRequest(http.MethodGet, "/admin/crowdsec/status", nil) + + handler.Status(c) + + assert.Equal(t, http.StatusOK, w.Code) + + var response map[string]interface{} + json.Unmarshal(w.Body.Bytes(), &response) + + assert.False(t, response["running"].(bool)) + assert.False(t, response["lapi_ready"].(bool)) // LAPI can't be ready if process not running +} ``` -### What Should Happen (GUI Control) - -``` -β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” -β”‚ docker-compose β”‚ -β”‚ (no env var) β”‚ ← No environment variable needed -β””β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”˜ - β”‚ - v -β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” -β”‚ entrypoint.sh β”‚ -β”‚ Init CrowdSec β”‚ ← Setup config only, don't start agent -β”‚ (config only) β”‚ -β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ - β”‚ - v -β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” -β”‚ GUI 
Toggle β”‚ -β”‚ "CrowdSec: ON" β”‚ ← User enables via GUI -β””β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”˜ - β”‚ - v -β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” -β”‚ POST /crowdsec/ β”‚ -β”‚ /start β”‚ ← Frontend calls backend handler -β””β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”˜ - β”‚ - v -β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” -β”‚ Backend Handler β”‚ -β”‚ Start LAPI β”‚ ← Backend starts the agent -β”‚ (PID tracked) β”‚ -β””β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”˜ - β”‚ - v - βœ… LAPI running - β”‚ - v -β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” -β”‚ Enroll Console β”‚ ← Works! LAPI available -β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ -``` - -### Pattern Consistency Across Security Features - -| Feature | Control Method | Status Endpoint | Lifecycle Handler | -|---------|---------------|-----------------|-------------------| -| **Cerberus** | GUI Toggle | `/security/status` | N/A (master switch) | -| **WAF** | GUI Toggle | `/security/status` | Config regeneration | -| **ACL** | GUI Toggle | `/security/status` | Config regeneration | -| **Rate Limit** | GUI Toggle | `/security/status` | Config regeneration | -| **CrowdSec** (OLD) | ❌ Env Var | `/security/status` | ❌ Entrypoint script | -| **CrowdSec** (NEW) | βœ… GUI Toggle | `/security/status` | βœ… Start/Stop handlers | - ---- - -## Testing Strategy - -### Manual Testing (For User - Workaround) - -1. **Set Environment Variable (Temporary)** - - ```bash - # docker-compose.override.yml - environment: - - CHARON_SECURITY_CROWDSEC_MODE=local - ``` - -2. **Restart Container** - - ```bash - docker compose down && docker compose up -d - ``` - -3. **Verify LAPI Running** - - ```bash - docker exec charon cscli lapi status - # Should show: "You can successfully interact with Local API (LAPI)" - ``` - -4. 
**Test Enrollment** - - Submit enrollment token via Charon UI - - Check crowdsec.net dashboard after 60 seconds - - Instance should appear - -### Automated Testing (For Developers - After Fix) - -1. **Unit Test:** LAPI availability check before enrollment -2. **Integration Test:** GUI-controlled CrowdSec lifecycle (start/stop) -3. **End-to-End Test:** Full enrollment flow with GUI toggle -4. **Regression Test:** Verify env var no longer affects behavior - -### Post-Fix Validation - -1. **Remove Environment Variable** - - ```bash - # Ensure CHARON_SECURITY_CROWDSEC_MODE is NOT set - ``` - -2. **Start Container** +### Manual Testing Procedure +1. **Start Fresh:** ```bash + docker compose down -v docker compose up -d ``` -3. **Verify CrowdSec NOT Running** +2. **Enable CrowdSec:** + - Go to Security dashboard + - Toggle CrowdSec ON + - Wait for toast "CrowdSec started and LAPI is ready" +3. **Navigate to Config:** + - Click "Config" button + - Verify NO "initializing" warning shows + - Console enrollment section should be enabled + +4. **Verify API Response:** ```bash - docker exec charon cscli lapi status - # Should show: "connection refused" + curl -s http://localhost:8080/api/v1/admin/crowdsec/status | jq + ``` + Expected: + ```json + { + "running": true, + "pid": 123, + "lapi_ready": true + } ``` -4. **Enable via GUI** - - Toggle CrowdSec switch in Security dashboard - - Wait 10 seconds +5. **Test LAPI Down Scenario:** + - SSH into container: `docker exec -it charon bash` + - Stop CrowdSec: `pkill -f crowdsec` + - Call API: + ```bash + curl -s http://localhost:8080/api/v1/admin/crowdsec/status | jq + ``` + - Expected: `{"running": false, "pid": 0, "lapi_ready": false}` + - Refresh CrowdSecConfig page + - Should show "CrowdSec is not running" error (red) -5. **Verify LAPI Started** - - ```bash - docker exec charon cscli lapi status - # Should show: "successfully interact" - ``` - -6. 
**Test Console Enrollment** - - Submit enrollment token - - Verify appears on crowdsec.net - -7. **Disable via GUI** - - Toggle CrowdSec switch off - - Wait 5 seconds - -8. **Verify LAPI Stopped** - - ```bash - docker exec charon cscli lapi status - # Should show: "connection refused" - ``` - ---- - -## Files Requiring Changes - -### Backend (Go) - -1. βœ… `docker-entrypoint.sh` - Remove env var check, initialize config only -2. βœ… `backend/internal/crowdsec/console_enroll.go` - Add LAPI availability check -3. ⚠️ `backend/internal/api/handlers/crowdsec_handler.go` - Already has Start/Stop (verify works) - -### Frontend (TypeScript) - -1. βœ… `frontend/src/pages/CrowdSecConfig.tsx` - Add LAPI status warning -2. ⚠️ `frontend/src/pages/Security.tsx` - Already calls start/stop (verify integration) - -### Documentation - -1. βœ… `docs/security.md` - Remove env var instructions, add GUI instructions -2. βœ… `docs/cerberus.md` - Mark env vars deprecated -3. βœ… `docs/troubleshooting/crowdsec.md` - Update enrollment prerequisites -4. βœ… `README.md` - Update quick start to use GUI only -5. βœ… `docs/migration-guide.md` - New file for v1.x β†’ v2.x migration -6. βœ… `docker-compose.yml` - Comment out deprecated env var - -### Testing - -1. βœ… `backend/internal/crowdsec/console_enroll_test.go` - Add LAPI requirement test -2. βœ… `scripts/crowdsec_lifecycle_test.sh` - New integration test for GUI control - -### Configuration (Already Correct) - -1. ⚠️ `backend/internal/models/security_config.go` - CrowdSecMode field exists (DB) -2. ⚠️ `backend/internal/api/handlers/security_handler.go` - Already reads from DB -3. ⚠️ `frontend/src/api/crowdsec.ts` - Start/stop API calls already exist +6. 
**Test Restart Scenario:** + - Re-enable CrowdSec via Security dashboard + - Immediately navigate to CrowdSecConfig + - Should show "initializing" briefly (yellow) then clear when `lapi_ready: true` --- ## Risk Assessment -### Low Risk Changes - -- βœ… Documentation updates -- βœ… Frontend UI warnings -- βœ… Backend LAPI availability check - -### Medium Risk Changes - -- ⚠️ Removing env var logic from entrypoint (requires thorough testing) -- ⚠️ Integration test for GUI lifecycle - -### High Risk Areas (Existing Functionality - Verify) - -- ⚠️ Backend Start/Stop handlers (already exist, need to verify) -- ⚠️ Frontend toggle integration (already exists, need to verify) -- ⚠️ CrowdSec config persistence across restarts - -### Migration Considerations - -- Users with `CHARON_SECURITY_CROWDSEC_MODE=local` set will need to: - 1. Remove environment variable - 2. Enable via GUI toggle - 3. Re-verify enrollment if applicable +| Change | Risk | Mitigation | +|--------|------|------------| +| Backend Status handler modification | Low | Status handler is read-only, adds 2s timeout check | +| LAPI check timeout (2s) | Low | Short timeout prevents blocking; async refresh handles retries | +| Frontend conditional logic change | Low | More precise state handling, clear error states | +| Type definition update | Low | TypeScript will catch any mismatches at compile time | +| Two separate warning states | Low | Better UX with distinct yellow (initializing) vs red (not running) | --- -## Rollback Plan +## Summary -If the architectural changes cause issues: +**Root Cause:** The `Status()` endpoint was not updated when `Start()` was modified to check LAPI readiness. The frontend expects the status endpoint to indicate LAPI availability, but it only returns process status. -1. **Immediate Rollback:** Add env var check back to `docker-entrypoint.sh` -2. **Document Workaround:** Continue using env var for CrowdSec control -3. 
**Defer Fix:** Mark as "known limitation" in docs until proper fix validated +**Fix:** Add `lapi_ready` field to `Status()` response by checking `cscli lapi status`, and update the frontend to use this new field for the warning display logic. ---- +**Files Changed:** +1. `backend/internal/api/handlers/crowdsec_handler.go` - Add LAPI check to Status() +2. `frontend/src/api/crowdsec.ts` - Add TypeScript interface with `lapi_ready` +3. `frontend/src/pages/CrowdSecConfig.tsx` - Update conditional logic: + - Yellow warning: process running, LAPI not ready + - Red warning: process not running + - No warning: process running AND LAPI ready +4. `backend/internal/api/handlers/crowdsec_handler_test.go` - Add unit tests -## Files Inspected During Investigation +**Estimated Time:** 1-2 hours including testing -### Configuration ✅ +**Commit Message:** +``` +fix: add LAPI readiness check to CrowdSec status endpoint -- `docker-compose.yml` - Volume mounts correct -- `docker-entrypoint.sh` - Conditional CrowdSec startup logic -- `Dockerfile` - CrowdSec installed correctly +The Status() handler was only checking if the CrowdSec process was +running, not if LAPI was actually responding. This caused the +CrowdSecConfig page to always show "LAPI is initializing" even when +LAPI was fully operational. 
-### Backend βœ… +Changes: +- Backend: Add `lapi_ready` field to /admin/crowdsec/status response +- Frontend: Add CrowdSecStatus TypeScript interface +- Frontend: Update conditional logic to check `lapi_ready` not `running` +- Frontend: Separate warnings for "initializing" vs "not running" +- Tests: Add unit tests for Status handler LAPI check -- `backend/internal/crowdsec/console_enroll.go` - Enrollment flow logic -- `backend/internal/models/crowdsec_console_enrollment.go` - Database model -- `backend/internal/api/handlers/crowdsec_handler.go` - API endpoint - -### Runtime Verification βœ… - -- `/etc/crowdsec` β†’ `/app/data/crowdsec/config` (symlink correct) -- `/app/data/crowdsec/config/online_api_credentials.yaml` exists (CAPI registered) -- `/app/data/crowdsec/config/console.yaml` exists -- `ps aux` shows NO crowdsec processes (LAPI not running) -- Environment: `CHARON_SECURITY_CROWDSEC_MODE=disabled` - ---- - -## Conclusion - -**Root Cause (Updated with Architectural Analysis):** Console enrollment fails because of **architectural technical debt** - the legacy environment variable `CHARON_SECURITY_CROWDSEC_MODE` still controls LAPI startup in `docker-entrypoint.sh`, bypassing the GUI control system that users expect. - -**The Real Problem:** This is NOT a user configuration issue. It's a **code architecture issue** where: - -1. CrowdSec control was never fully migrated to GUI-based management -2. The entrypoint script still checks deprecated environment variables -3. Backend handlers (`Start()`/`Stop()`) exist but aren't properly integrated with container startup -4. Users are misled into thinking the GUI toggle actually controls CrowdSec - -**Immediate Fix (User Workaround):** Set `CHARON_SECURITY_CROWDSEC_MODE=local` environment variable to match GUI state. - -**Proper Fix (Development Required):** - -1. **CRITICAL:** Remove environment variable dependency from `docker-entrypoint.sh` -2. 
**CRITICAL:** Ensure backend handlers control CrowdSec lifecycle (GUI β†’ API β†’ Process) -3. **HIGH:** Add LAPI availability check before enrollment (prevents silent failures) -4. **HIGH:** Add UI warnings when LAPI is not running (improves UX) -5. **HIGH:** Update documentation to reflect GUI-only control -6. **MEDIUM:** Add migration guide for users transitioning from env var control -7. **MEDIUM:** Add integration tests for GUI-controlled lifecycle - -**Pattern to Follow:** CrowdSec should work like WAF, ACL, and Rate Limiting - all controlled through Settings table, no environment variable dependency. - -**Token Reusability:** Confirmed REUSABLE - no need to generate new tokens after fixing LAPI availability. - -**Impact:** This architectural issue affects ALL users trying to use Console enrollment, not just the reporter. The fix will benefit the entire user base by providing consistent, GUI-based security feature management. +Fixes regression from crowdsec_lapi_error_diagnostic.md fixes. +``` diff --git a/docs/reports/qa_report.md b/docs/reports/qa_report.md index 5ccb213d..f83a5ae0 100644 --- a/docs/reports/qa_report.md +++ b/docs/reports/qa_report.md @@ -1,37 +1,131 @@ -# QA Report: CrowdSec Persistence Fix +# QA Report: CrowdSec LAPI Status Fix -## Execution Summary +**Date:** December 14, 2025 +**Agent:** QA_Security +**Issue:** CrowdSec LAPI status field was incorrectly handled, causing UI to not display proper status -**Date**: 2025-12-14 -**Task**: Fixing CrowdSec "Offline" status due to lack of persistence. -**Agent**: QA_Security (Antigravity) +--- -## πŸ§ͺ Verification Results +## Changes Tested -### Static Analysis +1. **Backend:** `backend/internal/api/handlers/crowdsec_handler.go` - Status() now returns `lapi_ready` field +2. **Frontend:** `frontend/src/api/crowdsec.ts` - Added CrowdSecStatus interface +3. **Frontend:** `frontend/src/pages/CrowdSecConfig.tsx` - Updated conditionals to use `lapi_ready` +4. 
**Test mocks:** Updated to support new `lapi_ready` field -- **Pre-commit**: ⚠️ Skipped (Tool not installed in environment). -- **Manual Code Review**: ✅ Passed. - - `docker-entrypoint.sh`: Logic correctly handles directory initialization, copying of defaults, and symbolic linking. - - `docker-compose.yml`: Documentation added clearly. - - **Idempotency**: Checked. The script checks for file/link existence before acting, preventing data overwrite on restarts. +--- -### Logic Audit +## Test Results Summary -- **Persistence**: - - Config: `/etc/crowdsec` -> `/app/data/crowdsec/config`. - - Data: `DATA` env var -> `/app/data/crowdsec/data`. - - Hub: `/etc/crowdsec/hub` is created in persistent path. -- **Fail-safes**: - - Fallback to `/etc/crowdsec.dist` or `/etc/crowdsec` ensures config covers missing files. - - `cscli` checks integrity on startup. +| Check | Status | Details | +|-------|--------|---------| +| Backend Build | ✅ PASSED | `go build ./...` completed successfully | +| Backend Tests | ✅ PASSED | All 20 packages pass | +| Backend Lint (go vet) | ✅ PASSED | No issues found | +| Frontend Type Check | ✅ PASSED | TypeScript compilation successful | +| Frontend Lint | ✅ PASSED | 0 errors, 6 warnings (acceptable) | +| Frontend Tests | ✅ PASSED | 799 passed, 2 skipped | +| Pre-commit | ✅ PASSED | All hooks pass | -### ⚠️ Risks & Edges +--- -- **First Restart**: The first restart after applying this fix requires the user to **re-enroll** with CrowdSec Console because the Machine ID will change (it is now persistent, but the previous one was ephemeral and lost). -- **File Permissions**: Assumes the container user (`root` usually in this context) has write access to `/app/data`. This is standard for Charon. +## Detailed Results -## Recommendations +### Backend Build -- **Approve**. The fix addresses the root cause directly. -- **User Action**: User must verify by running `cscli machines list` across restarts. +``` +✅ go build ./... 
- SUCCESS +``` + +### Backend Tests + +``` +ok github.com/Wikid82/charon/backend/cmd/api +ok github.com/Wikid82/charon/backend/cmd/seed +ok github.com/Wikid82/charon/backend/internal/api/handlers +ok github.com/Wikid82/charon/backend/internal/api/middleware +ok github.com/Wikid82/charon/backend/internal/api/routes +ok github.com/Wikid82/charon/backend/internal/api/tests +ok github.com/Wikid82/charon/backend/internal/caddy +ok github.com/Wikid82/charon/backend/internal/cerberus +ok github.com/Wikid82/charon/backend/internal/config +ok github.com/Wikid82/charon/backend/internal/crowdsec +ok github.com/Wikid82/charon/backend/internal/database +ok github.com/Wikid82/charon/backend/internal/logger +ok github.com/Wikid82/charon/backend/internal/metrics +ok github.com/Wikid82/charon/backend/internal/models +ok github.com/Wikid82/charon/backend/internal/server +ok github.com/Wikid82/charon/backend/internal/services +ok github.com/Wikid82/charon/backend/internal/util +ok github.com/Wikid82/charon/backend/internal/version + +Coverage: 85.2% (minimum required 85%) +``` + +### Backend Lint + +``` +✅ go vet ./... 
- No issues +``` + +### Frontend Type Check + +``` +✅ tsc --noEmit - SUCCESS +``` + +### Frontend Lint + +``` +6 warnings (0 errors): +- 1x unused variable in e2e test +- 2x missing useEffect dependencies (existing, unrelated) +- 3x @typescript-eslint/no-explicit-any in test files + +Note: All warnings are acceptable and unrelated to the LAPI fix +``` + +### Frontend Tests + +``` +Test Files 87 passed (87) + Tests 799 passed | 2 skipped (801) + Duration 63.65s + +Key test suites verified: +- src/api/__tests__/crowdsec.test.ts (9 tests) ✅ +- src/pages/__tests__/CrowdSecConfig.test.tsx (3 tests) ✅ +- src/pages/__tests__/Security.spec.tsx (6 tests) ✅ +- src/pages/__tests__/Security.test.tsx (18 tests) ✅ +- src/pages/__tests__/Security.dashboard.test.tsx (18 tests) ✅ +``` + +### Pre-commit Hooks + +``` +✅ Go Vet - Passed +✅ Check .version matches latest Git tag - Passed +✅ Prevent large files that are not tracked by LFS - Passed +✅ Prevent committing CodeQL DB artifacts - Passed +✅ Prevent committing data/backups files - Passed +✅ Frontend TypeScript Check - Passed +✅ Frontend Lint (Fix) - Passed +``` + +--- + +## Conclusion + +**All quality gates have passed.** The CrowdSec LAPI status fix has been comprehensively tested and is ready for merge. + +### Summary of Changes Verified + +1. Backend correctly returns `lapi_ready` boolean field in CrowdSec status response +2. Frontend `CrowdSecStatus` interface properly types the response +3. UI conditionals correctly use `lapi_ready` for status display logic +4. All existing tests pass with updated mocks +5. 
No regressions detected in related security features + +--- + +*Report generated by QA_Security agent* diff --git a/frontend/src/api/crowdsec.ts b/frontend/src/api/crowdsec.ts index b945e45a..6ce2a335 100644 --- a/frontend/src/api/crowdsec.ts +++ b/frontend/src/api/crowdsec.ts @@ -19,8 +19,14 @@ export async function stopCrowdsec() { return resp.data } -export async function statusCrowdsec() { - const resp = await client.get('/admin/crowdsec/status') +export interface CrowdSecStatus { + running: boolean + pid: number + lapi_ready: boolean +} + +export async function statusCrowdsec(): Promise { + const resp = await client.get('/admin/crowdsec/status') return resp.data } diff --git a/frontend/src/pages/CrowdSecConfig.tsx b/frontend/src/pages/CrowdSecConfig.tsx index 1b4df208..608b1e5a 100644 --- a/frontend/src/pages/CrowdSecConfig.tsx +++ b/frontend/src/pages/CrowdSecConfig.tsx @@ -7,7 +7,7 @@ import { Input } from '../components/ui/Input' import { Switch } from '../components/ui/Switch' import { getSecurityStatus } from '../api/security' import { getFeatureFlags } from '../api/featureFlags' -import { exportCrowdsecConfig, importCrowdsecConfig, listCrowdsecFiles, readCrowdsecFile, writeCrowdsecFile, listCrowdsecDecisions, banIP, unbanIP, CrowdSecDecision, statusCrowdsec } from '../api/crowdsec' +import { exportCrowdsecConfig, importCrowdsecConfig, listCrowdsecFiles, readCrowdsecFile, writeCrowdsecFile, listCrowdsecDecisions, banIP, unbanIP, CrowdSecDecision, statusCrowdsec, CrowdSecStatus } from '../api/crowdsec' import { listCrowdsecPresets, pullCrowdsecPreset, applyCrowdsecPreset, getCrowdsecPresetCache } from '../api/presets' import { createBackup } from '../api/backups' import { updateSetting } from '../api/settings' @@ -62,7 +62,7 @@ export default function CrowdSecConfig() { }, [consoleEnrollmentEnabled, initialCheckComplete]) // Add LAPI status check with polling - const lapiStatusQuery = useQuery({ + const lapiStatusQuery = useQuery({ queryKey: 
['crowdsec-lapi-status'], queryFn: statusCrowdsec, enabled: consoleEnrollmentEnabled && initialCheckComplete, @@ -594,8 +594,8 @@ export default function CrowdSecConfig() {

{consoleErrors.submit}

)} - {/* Warning when CrowdSec LAPI is not running */} - {lapiStatusQuery.data && !lapiStatusQuery.data.running && initialCheckComplete && ( + {/* Yellow warning: Process running but LAPI initializing */} + {lapiStatusQuery.data && lapiStatusQuery.data.running && !lapiStatusQuery.data.lapi_ready && initialCheckComplete && (
@@ -616,15 +616,38 @@ export default function CrowdSecConfig() { > Check Now - {!status?.crowdsec?.enabled && ( - - )} +
+
+ + )} + + {/* Red warning: Process not running at all */} + {lapiStatusQuery.data && !lapiStatusQuery.data.running && initialCheckComplete && ( +
+ +
+

+ CrowdSec is not running +

+

+ The CrowdSec process is not currently running. Enable CrowdSec from the Security Dashboard to use console enrollment features. +

+
+ +
@@ -677,12 +700,12 @@ export default function CrowdSecConfig() {