diff --git a/backend/internal/api/handlers/crowdsec_bouncer_test.go b/backend/internal/api/handlers/crowdsec_bouncer_test.go index abfa7e12..61777e9b 100644 --- a/backend/internal/api/handlers/crowdsec_bouncer_test.go +++ b/backend/internal/api/handlers/crowdsec_bouncer_test.go @@ -7,6 +7,14 @@ import ( ) func TestGetBouncerAPIKeyFromEnv(t *testing.T) { + envKeys := []string{ + "CROWDSEC_API_KEY", + "CROWDSEC_BOUNCER_API_KEY", + "CERBERUS_SECURITY_CROWDSEC_API_KEY", + "CHARON_SECURITY_CROWDSEC_API_KEY", + "CPM_SECURITY_CROWDSEC_API_KEY", + } + tests := []struct { name string envVars map[string]string @@ -43,23 +51,18 @@ func TestGetBouncerAPIKeyFromEnv(t *testing.T) { for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - // Clear env vars - _ = os.Unsetenv("CROWDSEC_BOUNCER_API_KEY") - _ = os.Unsetenv("CROWDSEC_API_KEY") + for _, key := range envKeys { + t.Setenv(key, "") + } - // Set test env vars for k, v := range tt.envVars { - _ = os.Setenv(k, v) + t.Setenv(k, v) } key := getBouncerAPIKeyFromEnv() if key != tt.expectedKey { t.Errorf("getBouncerAPIKeyFromEnv() key = %q, want %q", key, tt.expectedKey) } - - // Cleanup - _ = os.Unsetenv("CROWDSEC_BOUNCER_API_KEY") - _ = os.Unsetenv("CROWDSEC_API_KEY") }) } } diff --git a/backend/internal/api/handlers/crowdsec_coverage_target_test.go b/backend/internal/api/handlers/crowdsec_coverage_target_test.go index e59da5ed..164cc86a 100644 --- a/backend/internal/api/handlers/crowdsec_coverage_target_test.go +++ b/backend/internal/api/handlers/crowdsec_coverage_target_test.go @@ -185,6 +185,10 @@ func TestCheckLAPIHealthRequest(t *testing.T) { // TestGetLAPIKeyFromEnv tests environment variable lookup func TestGetLAPIKeyLookup(t *testing.T) { + t.Setenv("CROWDSEC_BOUNCER_API_KEY", "") + t.Setenv("CERBERUS_SECURITY_CROWDSEC_API_KEY", "") + t.Setenv("CHARON_SECURITY_CROWDSEC_API_KEY", "") + t.Setenv("CPM_SECURITY_CROWDSEC_API_KEY", "") // Test that getLAPIKey checks multiple env vars // Set one and verify it's found t.Setenv("CROWDSEC_API_KEY", "test-key-123") @@ -195,9 +199,11 @@ func TestGetLAPIKeyLookup(t *testing.T) { // TestGetLAPIKeyEmpty tests no env vars set func TestGetLAPIKeyEmpty(t *testing.T) { - // Ensure no env vars are set - _ = os.Unsetenv("CROWDSEC_API_KEY") - _ = os.Unsetenv("CROWDSEC_BOUNCER_API_KEY") + t.Setenv("CROWDSEC_API_KEY", "") + t.Setenv("CROWDSEC_BOUNCER_API_KEY", "") + t.Setenv("CERBERUS_SECURITY_CROWDSEC_API_KEY", "") + t.Setenv("CHARON_SECURITY_CROWDSEC_API_KEY", "") + t.Setenv("CPM_SECURITY_CROWDSEC_API_KEY", "") key := getLAPIKey() require.Equal(t, "", key) @@ -205,6 +211,10 @@ func TestGetLAPIKeyEmpty(t *testing.T) { // TestGetLAPIKeyAlternative tests alternative env var func TestGetLAPIKeyAlternative(t *testing.T) { + t.Setenv("CROWDSEC_API_KEY", "") + t.Setenv("CERBERUS_SECURITY_CROWDSEC_API_KEY", "") + t.Setenv("CHARON_SECURITY_CROWDSEC_API_KEY", "") + t.Setenv("CPM_SECURITY_CROWDSEC_API_KEY", "") t.Setenv("CROWDSEC_BOUNCER_API_KEY", "bouncer-key-456") key := getLAPIKey() diff --git a/docs/reports/qa_report.md b/docs/reports/qa_report.md index 42bf305b..2d352fcd 100644 --- a/docs/reports/qa_report.md +++ b/docs/reports/qa_report.md @@ -370,3 +370,78 @@ CONDITIONAL ### Local Backend Status for PR #666 - **Overall investigation status: FAIL (reproduced backend CI-like failures locally).** + +## PR #666 CI-Only Backend Failure Deep Dive Addendum - 2026-02-17 + +### Exact CI Failure Evidence + +- Source: GitHub Actions run `22087372370`, job `63824895671` (`backend-quality`). +- Exact failing assertion extracted from job logs: + - `--- FAIL: TestFetchIndexFallbackHTTP` + - `open testdata/hub_index.json: no such file or directory` + +### CI-Parity Local Matrix Executed + +All commands were run from `/projects/Charon` or `/projects/Charon/backend` with a valid 32-byte base64 `CHARON_ENCRYPTION_KEY`. + +1. `bash scripts/go-test-coverage.sh` +2. `go test ./... -race -count=1 -shuffle=on -v` +3. `go test ./... -race -count=1 -shuffle=on -v -p 1` +4. `go test ./... -race -count=1 -shuffle=on -v -p 4` + +### Reproduction Outcomes + +- CI-specific missing fixture (`testdata/hub_index.json`) was confirmed in CI logs. +- Local targeted stress for the CI-failing test (`internal/crowdsec` `TestFetchIndexFallbackHTTP`) passed repeatedly (10/10). +- Full matrix runs repeatedly surfaced lock/closure instability outside the single CI assertion: + - `database table is locked` + - `sql: database is closed` +- Representative failing packages in parity reruns: + - `internal/api/handlers` + - `internal/config` + - `internal/services` + - `internal/caddy` (deterministic fallback-env-key test failure in local matrix) + +### Root Cause (Evidence-Based) + +Primary root cause is **test isolation breakdown under race+shuffle execution**, not encryption-key preflight: + +1. **SQLite cross-test contamination/contention** + - Shared DB state patterns caused row leakage and lock events under shuffled execution. + +2. **Process-level environment variable contamination** + - CrowdSec env-key tests depended on mutable global env without full reset, causing order-sensitive behavior. + +3. **Separate CI-only fixture-path issue** + - CI log shows missing `testdata/hub_index.json` for `TestFetchIndexFallbackHTTP`, which did not reproduce locally. + +### Low-Risk Fixes Applied During Investigation + +1. `backend/internal/api/handlers/notification_handler_test.go` + - Reworked test DB setup from shared in-memory sqlite to per-test sqlite file in `t.TempDir()` with WAL + busy timeout. + - Updated tests to call `setupNotificationTestDB(t)`. + +2. `backend/internal/api/handlers/crowdsec_bouncer_test.go` + - Hardened `TestGetBouncerAPIKeyFromEnv` to reset all supported env keys per subtest before setting case-specific values. + +3. `backend/internal/api/handlers/crowdsec_coverage_target_test.go` + - Added explicit reset of all relevant CrowdSec env keys in `TestGetLAPIKeyLookup`, `TestGetLAPIKeyEmpty`, and `TestGetLAPIKeyAlternative`. + +### Post-Fix Verification + +- Targeted suites stabilized after fixes: + - Notification handler list flake (row leakage) no longer reproduced in repeated stress loops. + - CrowdSec env-key tests remained stable in repeated shuffled runs. +- Broad matrix remained unstable with additional pre-existing failures (`sql: database is closed`/`database table is locked`) across multiple packages. + +### Final Parity Status + +- **Scoped fix validation**: PASS (targeted flaky tests stabilized). +- **Full CI-parity matrix**: FAIL (broader baseline instability remains; not fully resolved in this pass). + +### Recommended Next Fix Plan (No Sleep/Retry Band-Aids) + +1. Enforce per-test DB isolation in remaining backend test helpers still using shared sqlite state. +2. Eliminate global mutable env leakage by standardizing full-key reset in all env-sensitive tests. +3. Fix CI fixture path robustness for `TestFetchIndexFallbackHTTP` (`testdata` resolution independent of working directory). +4. Re-run parity matrix (`coverage`, `race+shuffle`, `-p 1`, `-p 4`) after each isolation patch batch.