chore: Implement manual test plan for SMTP mock server flakiness fix
- Added a new documentation file outlining the manual test plan to validate the SMTP mock server flakiness fix, ensuring improved mail test reliability without affecting production behavior. - Updated the current specification document to reflect the focus on stabilizing flaky SMTP STARTTLS+AUTH unit tests, including detailed research findings and requirements for the implementation. - Created a QA/Security validation report for the SMTP flaky test fix, confirming that changes are test-only, stable under repeated runs, and do not introduce new security risks.
This commit is contained in:
@@ -9,12 +9,15 @@ import (
|
||||
"crypto/x509"
|
||||
"crypto/x509/pkix"
|
||||
"encoding/pem"
|
||||
"errors"
|
||||
"math/big"
|
||||
"net"
|
||||
"net/mail"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
@@ -26,6 +29,61 @@ import (
|
||||
"gorm.io/gorm/logger"
|
||||
)
|
||||
|
||||
// TestMain sets up the SSL_CERT_FILE environment variable globally BEFORE any tests run.
|
||||
// This ensures x509.SystemCertPool() initializes with our test CA, which is critical for
|
||||
// parallel test execution with -race flag where cert pool initialization timing matters.
|
||||
func TestMain(m *testing.M) {
|
||||
// Initialize shared test CA and write stable cert file
|
||||
initializeTestCAForSuite()
|
||||
|
||||
// Set SSL_CERT_FILE globally so cert pool initialization uses our CA
|
||||
if err := os.Setenv("SSL_CERT_FILE", testCAFile); err != nil {
|
||||
panic("failed to set SSL_CERT_FILE: " + err.Error())
|
||||
}
|
||||
|
||||
// Run tests
|
||||
exitCode := m.Run()
|
||||
|
||||
// Cleanup (optional, OS will clean /tmp on reboot)
|
||||
_ = os.Remove(testCAFile)
|
||||
|
||||
os.Exit(exitCode)
|
||||
}
|
||||
|
||||
// initializeTestCAForSuite is called once by TestMain to set up the shared CA infrastructure.
|
||||
func initializeTestCAForSuite() {
|
||||
testCAOnce.Do(func() {
|
||||
var err error
|
||||
testCAKey, err = rsa.GenerateKey(rand.Reader, 2048)
|
||||
if err != nil {
|
||||
panic("GenerateKey failed: " + err.Error())
|
||||
}
|
||||
|
||||
testCATemplate = &x509.Certificate{
|
||||
SerialNumber: big.NewInt(1),
|
||||
Subject: pkix.Name{
|
||||
CommonName: "charon-test-ca",
|
||||
},
|
||||
NotBefore: time.Now().Add(-time.Hour),
|
||||
NotAfter: time.Now().Add(24 * 365 * time.Hour), // 24 years
|
||||
KeyUsage: x509.KeyUsageCertSign | x509.KeyUsageCRLSign,
|
||||
BasicConstraintsValid: true,
|
||||
IsCA: true,
|
||||
}
|
||||
|
||||
caDER, err := x509.CreateCertificate(rand.Reader, testCATemplate, testCATemplate, &testCAKey.PublicKey, testCAKey)
|
||||
if err != nil {
|
||||
panic("CreateCertificate failed: " + err.Error())
|
||||
}
|
||||
testCAPEM = pem.EncodeToMemory(&pem.Block{Type: "CERTIFICATE", Bytes: caDER})
|
||||
|
||||
testCAFile = filepath.Join(os.TempDir(), "charon-test-ca-mail-service.pem")
|
||||
if err := os.WriteFile(testCAFile, testCAPEM, 0o600); err != nil {
|
||||
panic("WriteFile failed: " + err.Error())
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func setupMailTestDB(t *testing.T) *gorm.DB {
|
||||
db, err := gorm.Open(sqlite.Open(":memory:"), &gorm.Config{
|
||||
Logger: logger.Default.LogMode(logger.Silent),
|
||||
@@ -774,6 +832,60 @@ func TestEncodeSubject_RejectsCRLF(t *testing.T) {
|
||||
require.ErrorIs(t, err, errEmailHeaderInjection)
|
||||
}
|
||||
|
||||
// Shared test CA infrastructure to work around Go's cert pool caching.
|
||||
// When tests run with -count=N, Go caches x509.SystemCertPool() after the first run.
|
||||
// Generating a new CA per test causes failures because cached pool references old CA.
|
||||
// Solution: Generate CA once, reuse across runs, and use stable cert file path.
|
||||
var (
|
||||
testCAOnce sync.Once
|
||||
testCAPEM []byte
|
||||
testCAKey *rsa.PrivateKey
|
||||
testCATemplate *x509.Certificate
|
||||
testCAFile string
|
||||
)
|
||||
|
||||
func initTestCA(t *testing.T) {
|
||||
t.Helper()
|
||||
// Delegate to the suite-level initialization (already called by TestMain)
|
||||
initializeTestCAForSuite()
|
||||
}
|
||||
|
||||
func newTestTLSConfigShared(t *testing.T) (*tls.Config, []byte) {
|
||||
t.Helper()
|
||||
|
||||
// Ensure shared CA is initialized
|
||||
initTestCA(t)
|
||||
|
||||
// Generate leaf certificate signed by shared CA
|
||||
leafKey, err := rsa.GenerateKey(rand.Reader, 2048)
|
||||
require.NoError(t, err)
|
||||
|
||||
leafTemplate := &x509.Certificate{
|
||||
SerialNumber: big.NewInt(time.Now().UnixNano()), // Unique serial per leaf
|
||||
Subject: pkix.Name{
|
||||
CommonName: "127.0.0.1",
|
||||
},
|
||||
NotBefore: time.Now().Add(-time.Hour),
|
||||
NotAfter: time.Now().Add(24 * time.Hour),
|
||||
KeyUsage: x509.KeyUsageDigitalSignature | x509.KeyUsageKeyEncipherment,
|
||||
ExtKeyUsage: []x509.ExtKeyUsage{x509.ExtKeyUsageServerAuth},
|
||||
BasicConstraintsValid: true,
|
||||
DNSNames: []string{"localhost"},
|
||||
IPAddresses: []net.IP{net.ParseIP("127.0.0.1")},
|
||||
}
|
||||
|
||||
leafDER, err := x509.CreateCertificate(rand.Reader, leafTemplate, testCATemplate, &leafKey.PublicKey, testCAKey)
|
||||
require.NoError(t, err)
|
||||
|
||||
leafCertPEM := pem.EncodeToMemory(&pem.Block{Type: "CERTIFICATE", Bytes: leafDER})
|
||||
leafKeyPEM := pem.EncodeToMemory(&pem.Block{Type: "RSA PRIVATE KEY", Bytes: x509.MarshalPKCS1PrivateKey(leafKey)})
|
||||
|
||||
cert, err := tls.X509KeyPair(leafCertPEM, leafKeyPEM)
|
||||
require.NoError(t, err)
|
||||
|
||||
return &tls.Config{Certificates: []tls.Certificate{cert}, MinVersion: tls.VersionTLS12}, testCAPEM
|
||||
}
|
||||
|
||||
func TestMailService_GetSMTPConfig_DBError(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
@@ -870,8 +982,10 @@ func TestMailService_sendSTARTTLS_DialFailure(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestMailService_TestConnection_StartTLSSuccessWithAuth(t *testing.T) {
|
||||
tlsConf, certPEM := newTestTLSConfig(t)
|
||||
trustTestCertificate(t, certPEM)
|
||||
t.Parallel()
|
||||
|
||||
tlsConf, _ := newTestTLSConfigShared(t)
|
||||
trustTestCertificate(t, nil)
|
||||
addr, cleanup := startMockSMTPServer(t, tlsConf, true, true)
|
||||
defer cleanup()
|
||||
|
||||
@@ -919,8 +1033,10 @@ func TestMailService_TestConnection_NoneSuccess(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestMailService_SendEmail_STARTTLSSuccess(t *testing.T) {
|
||||
tlsConf, certPEM := newTestTLSConfig(t)
|
||||
trustTestCertificate(t, certPEM)
|
||||
t.Parallel()
|
||||
|
||||
tlsConf, _ := newTestTLSConfigShared(t)
|
||||
trustTestCertificate(t, nil)
|
||||
addr, cleanup := startMockSMTPServer(t, tlsConf, true, true)
|
||||
defer cleanup()
|
||||
|
||||
@@ -940,14 +1056,16 @@ func TestMailService_SendEmail_STARTTLSSuccess(t *testing.T) {
|
||||
Encryption: "starttls",
|
||||
}))
|
||||
|
||||
// With fixed cert trust, STARTTLS connection and email send succeed
|
||||
err = svc.SendEmail("recipient@example.com", "Subject", "Body")
|
||||
require.Error(t, err)
|
||||
assert.Contains(t, err.Error(), "STARTTLS failed")
|
||||
require.NoError(t, err)
|
||||
}
|
||||
|
||||
func TestMailService_SendEmail_SSLSuccess(t *testing.T) {
|
||||
tlsConf, certPEM := newTestTLSConfig(t)
|
||||
trustTestCertificate(t, certPEM)
|
||||
t.Parallel()
|
||||
|
||||
tlsConf, _ := newTestTLSConfigShared(t)
|
||||
trustTestCertificate(t, nil)
|
||||
addr, cleanup := startMockSSLSMTPServer(t, tlsConf, true)
|
||||
defer cleanup()
|
||||
|
||||
@@ -967,9 +1085,9 @@ func TestMailService_SendEmail_SSLSuccess(t *testing.T) {
|
||||
Encryption: "ssl",
|
||||
}))
|
||||
|
||||
// With fixed cert trust, SSL connection and email send succeed
|
||||
err = svc.SendEmail("recipient@example.com", "Subject", "Body")
|
||||
require.Error(t, err)
|
||||
assert.Contains(t, err.Error(), "SSL connection failed")
|
||||
require.NoError(t, err)
|
||||
}
|
||||
|
||||
func newTestTLSConfig(t *testing.T) (*tls.Config, []byte) {
|
||||
@@ -1025,10 +1143,9 @@ func newTestTLSConfig(t *testing.T) (*tls.Config, []byte) {
|
||||
|
||||
func trustTestCertificate(t *testing.T, certPEM []byte) {
|
||||
t.Helper()
|
||||
|
||||
caFile := t.TempDir() + "/ca-cert.pem"
|
||||
require.NoError(t, os.WriteFile(caFile, certPEM, 0o600))
|
||||
t.Setenv("SSL_CERT_FILE", caFile)
|
||||
// SSL_CERT_FILE is already set globally by TestMain.
|
||||
// This function kept for API compatibility but no longer needs to set environment.
|
||||
initTestCA(t) // Ensure CA is initialized (already done by TestMain, but safe to call)
|
||||
}
|
||||
|
||||
func startMockSMTPServer(t *testing.T, tlsConf *tls.Config, supportStartTLS bool, requireAuth bool) (string, func()) {
|
||||
@@ -1038,21 +1155,60 @@ func startMockSMTPServer(t *testing.T, tlsConf *tls.Config, supportStartTLS bool
|
||||
require.NoError(t, err)
|
||||
|
||||
done := make(chan struct{})
|
||||
var wg sync.WaitGroup
|
||||
var connsMu sync.Mutex
|
||||
var conns []net.Conn
|
||||
|
||||
go func() {
|
||||
defer close(done)
|
||||
conn, acceptErr := listener.Accept()
|
||||
if acceptErr != nil {
|
||||
return
|
||||
for {
|
||||
conn, acceptErr := listener.Accept()
|
||||
if acceptErr != nil {
|
||||
// Expected shutdown path: listener closed
|
||||
if errors.Is(acceptErr, net.ErrClosed) || strings.Contains(acceptErr.Error(), "use of closed network connection") {
|
||||
return
|
||||
}
|
||||
// Unexpected accept error - signal test failure
|
||||
t.Errorf("unexpected accept error: %v", acceptErr)
|
||||
return
|
||||
}
|
||||
|
||||
connsMu.Lock()
|
||||
conns = append(conns, conn)
|
||||
connsMu.Unlock()
|
||||
|
||||
wg.Add(1)
|
||||
go func(c net.Conn) {
|
||||
defer wg.Done()
|
||||
defer func() { _ = c.Close() }()
|
||||
handleSMTPConn(c, tlsConf, supportStartTLS, requireAuth)
|
||||
}(conn)
|
||||
}
|
||||
defer func() { _ = conn.Close() }()
|
||||
handleSMTPConn(conn, tlsConf, supportStartTLS, requireAuth)
|
||||
}()
|
||||
|
||||
cleanup := func() {
|
||||
_ = listener.Close()
|
||||
|
||||
// Close all active connections to unblock handlers
|
||||
connsMu.Lock()
|
||||
for _, conn := range conns {
|
||||
_ = conn.Close()
|
||||
}
|
||||
connsMu.Unlock()
|
||||
|
||||
// Wait for accept-loop exit and active handlers with timeout
|
||||
cleanupDone := make(chan struct{})
|
||||
go func() {
|
||||
<-done
|
||||
wg.Wait()
|
||||
close(cleanupDone)
|
||||
}()
|
||||
|
||||
select {
|
||||
case <-done:
|
||||
case <-cleanupDone:
|
||||
// Success
|
||||
case <-time.After(2 * time.Second):
|
||||
t.Errorf("cleanup timeout: server did not shut down cleanly")
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1066,21 +1222,60 @@ func startMockSSLSMTPServer(t *testing.T, tlsConf *tls.Config, requireAuth bool)
|
||||
require.NoError(t, err)
|
||||
|
||||
done := make(chan struct{})
|
||||
var wg sync.WaitGroup
|
||||
var connsMu sync.Mutex
|
||||
var conns []net.Conn
|
||||
|
||||
go func() {
|
||||
defer close(done)
|
||||
conn, acceptErr := listener.Accept()
|
||||
if acceptErr != nil {
|
||||
return
|
||||
for {
|
||||
conn, acceptErr := listener.Accept()
|
||||
if acceptErr != nil {
|
||||
// Expected shutdown path: listener closed
|
||||
if errors.Is(acceptErr, net.ErrClosed) || strings.Contains(acceptErr.Error(), "use of closed network connection") {
|
||||
return
|
||||
}
|
||||
// Unexpected accept error - signal test failure
|
||||
t.Errorf("unexpected accept error: %v", acceptErr)
|
||||
return
|
||||
}
|
||||
|
||||
connsMu.Lock()
|
||||
conns = append(conns, conn)
|
||||
connsMu.Unlock()
|
||||
|
||||
wg.Add(1)
|
||||
go func(c net.Conn) {
|
||||
defer wg.Done()
|
||||
defer func() { _ = c.Close() }()
|
||||
handleSMTPConn(c, tlsConf, false, requireAuth)
|
||||
}(conn)
|
||||
}
|
||||
defer func() { _ = conn.Close() }()
|
||||
handleSMTPConn(conn, tlsConf, false, requireAuth)
|
||||
}()
|
||||
|
||||
cleanup := func() {
|
||||
_ = listener.Close()
|
||||
|
||||
// Close all active connections to unblock handlers
|
||||
connsMu.Lock()
|
||||
for _, conn := range conns {
|
||||
_ = conn.Close()
|
||||
}
|
||||
connsMu.Unlock()
|
||||
|
||||
// Wait for accept-loop exit and active handlers with timeout
|
||||
cleanupDone := make(chan struct{})
|
||||
go func() {
|
||||
<-done
|
||||
wg.Wait()
|
||||
close(cleanupDone)
|
||||
}()
|
||||
|
||||
select {
|
||||
case <-done:
|
||||
case <-cleanupDone:
|
||||
// Success
|
||||
case <-time.After(2 * time.Second):
|
||||
t.Errorf("cleanup timeout: server did not shut down cleanly")
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
48
docs/issues/manual_test_smtp_mock_flakiness_fix.md
Normal file
48
docs/issues/manual_test_smtp_mock_flakiness_fix.md
Normal file
@@ -0,0 +1,48 @@
|
||||
---
|
||||
title: Manual Test Plan - SMTP Mock Server Flakiness Fix
|
||||
status: Open
|
||||
priority: High
|
||||
assignee: QA
|
||||
labels: testing, backend, reliability
|
||||
---
|
||||
|
||||
# Test Objective
|
||||
Confirm the SMTP mock server flakiness fix improves mail test reliability without changing production mail behavior.
|
||||
|
||||
# Scope
|
||||
- In scope: test reliability for SMTP mock server flows used by backend mail tests.
|
||||
- Out of scope: production SMTP sending behavior and user-facing mail features.
|
||||
|
||||
# Prerequisites
|
||||
- Charon repository is up to date.
|
||||
- Backend test environment is available.
|
||||
- Ability to run backend tests repeatedly.
|
||||
|
||||
# Manual Scenarios
|
||||
|
||||
## 1) Target flaky test repeated run
|
||||
- [ ] Run `TestMailService_TestConnection_StartTLSSuccessWithAuth` repeatedly (at least 20 times).
|
||||
- [ ] Record pass/fail count and any intermittent errors.
|
||||
|
||||
## 2) Mail service targeted subset run
|
||||
- [ ] Run mail service connection and send test subset once.
|
||||
- [ ] Confirm no new intermittent failures appear in related tests.
|
||||
|
||||
## 3) Race-focused verification
|
||||
- [ ] Run targeted mail service tests with race detection enabled.
|
||||
- [ ] Confirm no race warnings or hangs occur.
|
||||
|
||||
## 4) Cleanup/shutdown stability check
|
||||
- [ ] Repeat targeted runs and watch for stuck test processes or timeout behavior.
|
||||
- [ ] Confirm test execution exits cleanly each run.
|
||||
|
||||
# Expected Results
|
||||
- Repeated target test runs complete with zero flaky failures.
|
||||
- Related mail service test subset remains stable.
|
||||
- No race detector findings for targeted scenarios.
|
||||
- No hangs during test cleanup/shutdown.
|
||||
|
||||
# Regression Checks (No Production Impact)
|
||||
- [ ] Confirm only test reliability behavior changed; no production mail behavior changes are required.
|
||||
- [ ] Confirm no production endpoints, settings, or user-facing mail flows are affected.
|
||||
- [ ] Confirm standard backend test workflow still completes successfully after this fix.
|
||||
@@ -1,520 +1,308 @@
|
||||
---
|
||||
post_title: "Current Spec: Discord Notify-Only Dispatch (No Legacy Fallback)"
|
||||
post_title: "Current Spec: Stabilize Flaky SMTP STARTTLS+AUTH Unit Test"
|
||||
categories:
|
||||
- actions
|
||||
- testing
|
||||
- security
|
||||
- backend
|
||||
- reliability
|
||||
tags:
|
||||
- discord
|
||||
- notifications
|
||||
- notify-rollout
|
||||
- migration
|
||||
- e2e
|
||||
summary: "Authoritative implementation plan for Discord Notify-only dispatch with explicit retirement of legacy fallback ambiguity, deterministic migration behavior, and proof-oriented testing."
|
||||
post_date: 2026-02-21
|
||||
- go
|
||||
- smtp
|
||||
- starttls
|
||||
- unit-tests
|
||||
- ci-stability
|
||||
summary: "Implementation plan to remove CI flakiness in MailService STARTTLS+AUTH tests by hardening mock SMTP server lifecycle and connection handling in test helpers only."
|
||||
post_date: 2026-02-22
|
||||
---
|
||||
|
||||
## Active Plan: Discord Notify-Only Dispatch (No Legacy Fallback)
|
||||
## Active Plan: Stabilize Flaky SMTP STARTTLS+AUTH Unit Test
|
||||
|
||||
Date: 2026-02-21
|
||||
Date: 2026-02-22
|
||||
Status: Active and authoritative
|
||||
Scope Type: Product implementation plan (frontend + backend + migration + verification)
|
||||
Scope Type: Test reliability hardening (backend unit tests only)
|
||||
|
||||
## 1) Introduction
|
||||
|
||||
This plan defines the implementation to satisfy the updated requirement direction:
|
||||
This plan addresses flakiness in backend unit test
|
||||
`TestMailService_TestConnection_StartTLSSuccessWithAuth` by improving mock SMTP
|
||||
test helpers used by `backend/internal/services/mail_service_test.go`.
|
||||
|
||||
- Do not rely on legacy fallback engine for Discord dispatch.
|
||||
- Discord payloads saved in DB must be sent by Notify engine directly.
|
||||
- Remove ambiguity where delivery success could still come through legacy path.
|
||||
- Keep rollout scope focused on Discord-only cutover without unrelated refactors.
|
||||
Root-cause hypothesis to validate and fix:
|
||||
|
||||
This plan is intentionally scoped to Discord-only dispatch/runtime behavior, fallback retirement, and auditable proof that no hidden legacy path remains.
|
||||
- Existing mock SMTP server helpers accept only one connection, then exit.
|
||||
- STARTTLS + AUTH flows in `net/smtp` are negotiation-heavy and can involve
|
||||
additional command/connection behavior under CI timing variance.
|
||||
- Single-accept test server behavior creates race-prone shutdown windows.
|
||||
|
||||
Goal:
|
||||
|
||||
- Make test helper servers robust and deterministic by accepting connections in
|
||||
a loop until cleanup, handling each connection in its own goroutine, and
|
||||
enforcing deterministic shutdown that waits for accept-loop exit and active
|
||||
handler goroutines (explicit per-connection synchronization via waitgroup).
|
||||
|
||||
Non-goal:
|
||||
|
||||
- No production mail service behavior changes.
|
||||
|
||||
## 2) Research Findings
|
||||
|
||||
### 2.1 Frontend provider-type control points (exact files)
|
||||
### 2.1 Current flaky target and helper topology
|
||||
|
||||
- `frontend/src/pages/Notifications.tsx`
|
||||
- Provider type is normalized to Discord (`DISCORD_PROVIDER_TYPE`) before submit/test/preview.
|
||||
- Provider dropdown currently renders Discord-only option, and request payloads are forced to Discord.
|
||||
- `frontend/src/api/notifications.ts`
|
||||
- `withDiscordType(...)` normalizes outbound payload type to Discord.
|
||||
- Interface remains `type: string`, requiring backend to remain authoritative for no-fallback invariants.
|
||||
- `frontend/src/pages/__tests__/Notifications.test.tsx`
|
||||
- Must verify Discord-only create/edit/test behavior and avoid implying fallback rescue semantics.
|
||||
- `tests/settings/notifications.spec.ts`
|
||||
- Includes coverage for Discord behavior and deprecated non-Discord rendering semantics.
|
||||
Primary target test:
|
||||
|
||||
### 2.2 Backend/provider type validation/default/control points (exact files)
|
||||
- `backend/internal/services/mail_service_test.go`
|
||||
- `TestMailService_TestConnection_StartTLSSuccessWithAuth`
|
||||
|
||||
- `backend/internal/api/handlers/notification_provider_handler.go`
|
||||
- Enforces Discord-only on create/update and blocks enabling deprecated non-Discord providers.
|
||||
- `backend/internal/services/notification_service.go`
|
||||
- Enforces Discord-only for provider lifecycle/test and skips non-Discord dispatch.
|
||||
- Contains explicit legacy-fallback-disabled sentinel/error path that must stay fail-closed.
|
||||
- Includes `EnsureNotifyOnlyProviderMigration(...)` for deterministic cutover state.
|
||||
- `backend/internal/services/enhanced_security_notification_service.go`
|
||||
- `SendViaProviders(...)` dispatches only to Discord providers in current rollout stage.
|
||||
- `dispatchToProvider(...)` still has non-Discord branches and is a key ambiguity-removal target.
|
||||
- `backend/internal/models/notification_provider.go`
|
||||
- `Type` comment still lists non-Discord types.
|
||||
- `Engine` and URL comments still include retired legacy-notifier URL semantics.
|
||||
- `backend/internal/notifications/engine.go`
|
||||
- Defines a retired legacy notification engine constant.
|
||||
- `backend/internal/notifications/router.go`
|
||||
- Legacy engine branch remains.
|
||||
- `backend/internal/notifications/feature_flags.go`
|
||||
- Retired fallback flag key remains in feature-flag wiring.
|
||||
- `backend/internal/api/handlers/feature_flags_handler.go`
|
||||
- Still exposes a retired legacy fallback key in defaults and update path guards.
|
||||
- `backend/internal/api/routes/routes.go`
|
||||
- Notification services are wired via both `NotificationService` and
|
||||
`EnhancedSecurityNotificationService`; migration path still references transitional behavior.
|
||||
Helper functions in same file (current behavior):
|
||||
|
||||
### 2.3 Exact backend/frontend files impacted by fallback removal
|
||||
- `startMockSMTPServer(...)`
|
||||
- Single `Accept()` call, single connection handler, then goroutine exits.
|
||||
- `startMockSSLSMTPServer(...)`
|
||||
- Single `Accept()` call, single connection handler, then goroutine exits.
|
||||
- `handleSMTPConn(...)`
|
||||
- Handles SMTP protocol conversation for each accepted connection.
|
||||
|
||||
Backend files:
|
||||
### 2.2 Flakiness vector summary
|
||||
|
||||
- `backend/internal/notifications/engine.go`
|
||||
- `backend/internal/notifications/router.go`
|
||||
- `backend/internal/notifications/feature_flags.go`
|
||||
- `backend/internal/api/handlers/feature_flags_handler.go`
|
||||
- `backend/internal/services/notification_service.go`
|
||||
- `backend/internal/services/enhanced_security_notification_service.go`
|
||||
- `backend/internal/api/handlers/notification_provider_handler.go`
|
||||
- `backend/internal/api/routes/routes.go`
|
||||
- `backend/internal/models/notification_provider.go`
|
||||
- Current single-accept model is fragile for tests where client behavior may
|
||||
include additional negotiation/timing paths.
|
||||
- Cleanup currently closes listener and waits on a done signal, but done is
|
||||
tied to a single accept goroutine rather than a full server lifecycle.
|
||||
- Under CI contention, this can surface nondeterministic failures in tests
|
||||
expecting reliable STARTTLS + AUTH handshake behavior.
|
||||
|
||||
Frontend files:
|
||||
### 2.3 Repo config review requested by user
|
||||
|
||||
- `frontend/src/pages/Notifications.tsx`
|
||||
- `frontend/src/api/notifications.ts`
|
||||
Reviewed files:
|
||||
|
||||
### 2.4 Dependency/runtime current state (legacy notifier)
|
||||
- `.gitignore`
|
||||
- `codecov.yml`
|
||||
- `.dockerignore`
|
||||
- `Dockerfile`
|
||||
|
||||
- `backend/go.mod`: no active legacy notifier dependency.
|
||||
- `backend/go.sum`: no active legacy notifier entry found.
|
||||
- Direct import/call in backend source currently not present.
|
||||
- Legacy strings/constants/tests remain in runtime code paths and tests.
|
||||
- Historical artifacts and cached module content may contain legacy notifier references but are not
|
||||
production dependencies.
|
||||
Conclusion for this scope:
|
||||
|
||||
### 2.5 Firefox failing specs context
|
||||
|
||||
- Artifact `playwright-output/manual-dns-targeted/.last-run.json` contains 16 failed test IDs only; suite attribution must come from Playwright report/traces, not this file alone.
|
||||
- Separate summary `FIREFOX_E2E_FIXES_SUMMARY.md` references prior Firefox remediation work.
|
||||
- User-reported “5 failing Firefox specs” is treated as active triage input and must be
|
||||
classified at execution time into in-scope vs out-of-scope suites (defined below).
|
||||
- No changes required for this test-only helper stabilization.
|
||||
- Existing ignore/coverage/docker config does not block this plan.
|
||||
|
||||
## 3) Requirements (EARS)
|
||||
|
||||
- R1: WHEN opening notification provider create/edit UI, THE SYSTEM SHALL offer only `discord`
|
||||
in provider type selection for this rollout.
|
||||
- R2: WHEN create/update/test notification provider API endpoints receive a non-Discord type,
|
||||
THE SYSTEM SHALL reject with deterministic validation error.
|
||||
- R3: WHILE legacy non-Discord providers exist in DB, THE SYSTEM SHALL handle them per explicit
|
||||
compatibility policy without enabling dispatch.
|
||||
- R4: WHEN verifying runtime and dependency state, THE SYSTEM SHALL produce auditable evidence
|
||||
that the retired legacy notifier is not installed, linked, or used by active runtime code paths.
|
||||
- R5: IF additional services are introduced in future, THEN THE SYSTEM SHALL enable them
|
||||
behind explicit validation gates and staged rollout controls.
|
||||
- R6: WHEN Discord notifications are delivered successfully, THE SYSTEM SHALL provide evidence that Notify path handled dispatch and not a legacy fallback path.
|
||||
- R7: IF rollback is required, THEN THE SYSTEM SHALL use explicit rollback controls without reintroducing hidden legacy fallback behavior.
|
||||
- R1: WHEN mock SMTP test helpers are started, THE SYSTEM SHALL accept
|
||||
connections in a loop until explicit cleanup closes the listener.
|
||||
- R2: WHEN a connection is accepted, THE SYSTEM SHALL handle it in its own
|
||||
goroutine so one slow session cannot block new accepts.
|
||||
- R3: WHEN cleanup is invoked, THE SYSTEM SHALL close the listener and await
|
||||
deterministic server-loop termination plus completion of active connection
|
||||
handlers (done channel + waitgroup synchronization).
|
||||
- R4: IF cleanup wait exceeds timeout, THEN THE SYSTEM SHALL report a failure
|
||||
signal and return without hanging test execution.
|
||||
- R5: WHEN this fix is implemented, THE SYSTEM SHALL keep production code
|
||||
untouched and restrict changes to test helper scope.
|
||||
- R6: WHEN targeted backend tests run, THE SYSTEM SHALL pass reliably in local
|
||||
and CI-like conditions.
|
||||
|
||||
## 4) Technical Specification
|
||||
|
||||
### 4.1 Discord-only enforcement strategy (defense in depth)
|
||||
### 4.1 Exact target files and functions (minimal diff scope)
|
||||
|
||||
#### Layer A: UI/input restriction
|
||||
Primary file to edit:
|
||||
|
||||
- `frontend/src/pages/Notifications.tsx`
|
||||
- Replace provider type dropdown options with a single option: `discord`.
|
||||
- Keep visible label semantics intact for accessibility.
|
||||
- Prevent stale-form values from preserving non-Discord types in edit mode.
|
||||
- `backend/internal/services/mail_service_test.go`
|
||||
|
||||
#### Layer B: Frontend request contract hardening
|
||||
Functions in scope:
|
||||
|
||||
- `frontend/src/api/notifications.ts`
|
||||
- Frontend request contract SHALL submit `type` as `discord` only; client must reject or normalize any stale non-discord value before submit.
|
||||
- `startMockSMTPServer(t *testing.T, tlsConf *tls.Config, supportStartTLS bool, requireAuth bool) (string, func())`
|
||||
- `startMockSSLSMTPServer(t *testing.T, tlsConf *tls.Config, requireAuth bool) (string, func())`
|
||||
|
||||
#### Layer C: API handler validation (authoritative gate)
|
||||
Related function (read-only unless strictly necessary):
|
||||
|
||||
- `backend/internal/api/handlers/notification_provider_handler.go`
|
||||
- Enforce `req.Type == "discord"` for all create/update paths (not only security-event cases).
|
||||
- Return stable error code/message for non-Discord type attempts.
|
||||
- `handleSMTPConn(conn net.Conn, tlsConf *tls.Config, supportStartTLS bool, requireAuth bool)`
|
||||
|
||||
#### Layer D: Service-layer invariant
|
||||
Out of scope:
|
||||
|
||||
- `backend/internal/services/notification_service.go`
|
||||
- Add service-level type guard for create/update/test/send paths to reject non-Discord.
|
||||
- Keep this guard even if handler validation exists (defense in depth).
|
||||
- `backend/internal/services/mail_service.go`
|
||||
- Any non-mail-service test files.
|
||||
|
||||
#### Layer E: Dispatch/runtime invariant
|
||||
### 4.2 Planned helper behavior changes
|
||||
|
||||
- `backend/internal/services/enhanced_security_notification_service.go`
|
||||
- Maintain dispatch as Discord-only.
|
||||
- Remove non-Discord dispatch branches from active runtime logic for this phase.
|
||||
For both helper server starters:
|
||||
|
||||
### 4.2 Feature-flag policy change for legacy fallback
|
||||
1. Replace single `Accept()` with accept loop:
|
||||
- Continue accepting until listener close returns an error.
|
||||
- Treat listener-close accept errors as normal shutdown path.
|
||||
2. Spawn per-connection handler goroutine:
|
||||
- Each accepted connection handled independently.
|
||||
- Connection closed within goroutine after handling.
|
||||
3. Deterministic lifecycle signaling:
|
||||
- Keep `done` channel signaling tied to server accept-loop exit.
|
||||
- Ensure exactly one close of `done` from server goroutine.
|
||||
- Track active connection-handler goroutines with explicit waitgroup sync.
|
||||
4. Cleanup contract:
|
||||
- `cleanup()` closes listener first.
|
||||
- Wait for accept-loop exit and active handler completion with bounded timeout (`2s` currently acceptable).
|
||||
- Never block indefinitely.
|
||||
- Timeout path is a test failure signal (non-hanging), not a silent pass.
|
||||
|
||||
Policy for `feature.notifications.legacy.fallback_enabled`:
|
||||
### 4.3 Concurrency and race safety notes
|
||||
|
||||
- Deprecate now.
|
||||
- Force false in reads and writes.
|
||||
- Immutable false: never true at runtime.
|
||||
- Reject `true` update attempts with deterministic API error.
|
||||
- Keep compatibility aliases read-only and forced false during transition.
|
||||
- Remove `feature.notifications.legacy.fallback_enabled` from all public/default flag responses in PR-3 of this plan, and remove all retired env aliases in the same PR. No compatibility-window extension is allowed without a new approved spec revision.
|
||||
- Accept-loop ownership:
|
||||
- One goroutine owns listener accept loop and is the only goroutine that may
|
||||
close `done`.
|
||||
- Connection handler isolation:
|
||||
- One goroutine per accepted connection; no shared mutable state required for
|
||||
protocol behavior in this helper.
|
||||
- Per-connection synchronization:
|
||||
- A waitgroup must increment before each handler goroutine starts and must
|
||||
decrement on handler exit so cleanup can deterministically wait for active
|
||||
handlers to finish.
|
||||
- Listener close semantics:
|
||||
- `listener.Close()` from cleanup is expected to break `Accept()`.
|
||||
- Exit condition should avoid noisy test failures on intentional close.
|
||||
- Cleanup timeout behavior:
|
||||
- Timeout remains defensive to prevent suite hangs under pathological CI
|
||||
resource starvation.
|
||||
- Timeout branch must report failure (e.g., `t.Errorf`/`t.Fatalf` policy) and
|
||||
return; no panic and no silent success.
|
||||
|
||||
Migration implications:
|
||||
### 4.4 Error handling policy in helpers
|
||||
|
||||
- Existing settings row for legacy fallback is normalized to false if present.
|
||||
- Transition period allows key visibility as false-only for compatibility.
|
||||
- Final state keeps this key out of runtime dispatch decisions; there is no fallback runtime path.
|
||||
- Treat only expected shutdown accept errors (listener close path) as normal.
|
||||
- Surface unexpected `Accept()` failures as test failure signals.
|
||||
- Keep helper logic simple and deterministic; avoid over-engineered retry logic.
|
||||
|
||||
Legacy fallback control surface policy: any API write attempting to set legacy fallback true SHALL return `400` with code `LEGACY_FALLBACK_REMOVED`; any legacy fallback read endpoint/field retained temporarily SHALL return hard-false only and SHALL be removed in PR-3. No hidden route or config path may alter dispatch away from Notify, and Notify failures SHALL NOT reroute to a fallback engine.
|
||||
## 5) Implementation Plan
|
||||
|
||||
### 4.3 Runtime dispatch contract (Discord-only Notify path)
|
||||
### Phase 1: Testing protocol sequencing note
|
||||
|
||||
Contract:
|
||||
- Policy remains E2E-first globally.
|
||||
- Explicit exception rationale for this change: scope is backend test-helper-only
|
||||
(`mail_service_test.go`) with no production/frontend/runtime behavior delta,
|
||||
so targeted backend test-first verification is authorized for this plan.
|
||||
- Mandatory preflight before unit/coverage steps:
|
||||
- `bash scripts/local-patch-report.sh`
|
||||
- Artifacts: `test-results/local-patch-report.md` and `test-results/local-patch-report.json`
|
||||
|
||||
- Discord dispatch source of truth is Notify engine path (`notify_v1`) only.
|
||||
- `NotificationService.SendExternal(...)` and `EnhancedSecurityNotificationService.SendViaProviders(...)` must fail closed for non-Discord delivery attempts.
|
||||
- No implicit retry/reroute to legacy engine on Notify failure.
|
||||
- Any successful Discord delivery during rollout must be attributable to Notify path execution.
|
||||
### Phase 2: Baseline confirmation and failure reproduction context
|
||||
|
||||
#### Layer F: Optional DB invariant (recommended)
|
||||
- Capture current helper behavior and flaky test target:
|
||||
- `go test ./backend/internal/services -run TestMailService_TestConnection_StartTLSSuccessWithAuth -count=1 -v`
|
||||
|
||||
- Add migration with DB-level check constraint for notification provider type to `discord`
|
||||
for newly created/updated rows in this rollout.
|
||||
- If DB constraint is deferred, service/API guards remain mandatory.
|
||||
### Phase 3: Helper lifecycle hardening
|
||||
|
||||
### 4.4 Data migration and compatibility policy for existing non-Discord providers
|
||||
- Update `startMockSMTPServer` and `startMockSSLSMTPServer` to loop accept.
|
||||
- Add per-connection goroutine handling.
|
||||
- Preserve/strengthen deterministic cleanup using listener close + done wait +
|
||||
per-connection waitgroup completion.
|
||||
|
||||
Policy decision for this rollout: **Deprecate + read-only visibility + non-dispatch**.
|
||||
### Phase 4: Targeted validation
|
||||
|
||||
- Existing non-Discord rows are preserved for audit/history but cannot be newly created
|
||||
or converted back to active dispatch in this phase.
|
||||
- Compatibility behavior:
|
||||
- `enabled` forced false for non-Discord providers during migration reconciliation.
|
||||
- `migration_state` remains failed/deprecated for non-supported providers.
|
||||
- UI list behavior: render non-Discord rows as deprecated read-only rows with a clear non-dispatch badge.
|
||||
- API contract: non-Discord existing rows are always non-dispatch, non-enable, deletable, and return deterministic validation error on enable/type mutation attempts.
|
||||
- Explicitly block enable and type mutation attempts for non-Discord rows via API with deterministic validation errors.
|
||||
- Allow deletion of deprecated rows.
|
||||
- Re-run target test repeatedly to validate stability:
|
||||
- `go test ./backend/internal/services -run TestMailService_TestConnection_StartTLSSuccessWithAuth -count=20`
|
||||
- Run mail service test subset:
|
||||
- `go test ./backend/internal/services -run TestMailService_ -count=1`
|
||||
- Run race-focused targeted validation:
|
||||
- `go test -race ./backend/internal/services -run 'TestMailService_(TestConnection|Send)' -count=1`
|
||||
|
||||
Migration safety requirements:
|
||||
### Phase 5: Backend coverage validation
|
||||
|
||||
- Migration SHALL be idempotent.
|
||||
- Migration SHALL execute within an explicit transaction boundary.
|
||||
- Migration SHALL require a pre-migration backup before mutating provider rows.
|
||||
- Migration SHALL define and document a rollback procedure.
|
||||
- Migration SHALL persist audit log fields for every mutated provider row (including provider identifier, previous values, new values, mutation timestamp, and operation identifier).
|
||||
- Run backend coverage task/script required by repo workflow:
|
||||
- Preferred script: `scripts/go-test-coverage.sh`
|
||||
- VS Code equivalent: backend coverage task if available.
|
||||
|
||||
Migration implementation candidates:
|
||||
## 6) Validation Matrix
|
||||
|
||||
- `backend/internal/services/notification_service.go` (`EnsureNotifyOnlyProviderMigration`)
|
||||
- `backend/internal/services/enhanced_security_notification_service.go`
|
||||
- new migration file under backend migration path if schema/data migration is introduced.
|
||||
| Validation Item | Command / Task | Scope | Pass Criteria |
|
||||
|---|---|---|---|
|
||||
| Targeted flaky test | `go test ./backend/internal/services -run TestMailService_TestConnection_StartTLSSuccessWithAuth -count=20` | Direct flaky test | No failures across repeated runs |
|
||||
| Mail service subset | `go test ./backend/internal/services -run TestMailService_ -count=1` | Nearby regression safety | All selected tests pass |
|
||||
| Race-focused targeted tests | `go test -race ./backend/internal/services -run 'TestMailService_(TestConnection|Send)' -count=1` | Concurrency/race safety | No race reports; tests pass |
|
||||
| Package sanity | `go test ./backend/internal/services -count=1` | Service package confidence | Package tests pass |
|
||||
| Backend coverage gate | `scripts/go-test-coverage.sh` | Repo-required backend coverage check | Meets configured minimum threshold (85% default) |
|
||||
|
||||
Notes:
|
||||
|
||||
- E2E-first protocol remains project-wide policy. This plan uses the explicit
|
||||
backend-only targeted-test exception because scope is confined to test helper
|
||||
internals with no production/UI behavior changes.
|
||||
- Local patch report preflight is required before unit/coverage gates.
|
||||
|
||||
## 7) Risk Assessment
|
||||
|
||||
Risk level: Low.
|
||||
|
||||
- Change type: test-only.
|
||||
- Production code: untouched.
|
||||
- Runtime behavior: unchanged for shipped binary.
|
||||
- Primary risk: helper lifecycle bug causing test hangs.
|
||||
- Mitigation: bounded cleanup timeout, accept-loop exit on listener close,
|
||||
focused repeated-run validation.
|
||||
|
||||
## 8) PR Slicing Strategy
|
||||
|
||||
### 4.5 Legacy notifier retirement/removal scope
|
||||
Decision: Single PR.
|
||||
|
||||
Targets to remove from active runtime/config surface:
|
||||
Rationale:
|
||||
|
||||
- `backend/internal/notifications/engine.go` legacy engine constant
|
||||
- `backend/internal/notifications/router.go` legacy engine branch
|
||||
- `backend/internal/notifications/feature_flags.go` legacy flag constant
|
||||
- `backend/internal/api/handlers/feature_flags_handler.go` retired legacy flag exposure
|
||||
- `backend/internal/models/notification_provider.go` retired legacy notifier comments/engine semantics
|
||||
- `backend/internal/services/notification_service.go` legacy naming/comments/hooks
|
||||
- `backend/internal/services/enhanced_security_notification_service.go` non-Discord dispatch branches in `dispatchToProvider`
|
||||
- `frontend/src/pages/Notifications.tsx` and `frontend/src/api/notifications.ts` to keep Discord-only UX/request contract aligned with backend fallback retirement
|
||||
- Extremely small scope (one test file, two helper functions).
|
||||
- No cross-domain dependencies.
|
||||
- Easier review and rollback.
|
||||
|
||||
Out of scope for blocking completion:
|
||||
### PR-1 (single slice)
|
||||
|
||||
- Archived docs/history artifacts under `docs/plans/archive/**` and similar archive/report folders.
|
||||
- Scope:
|
||||
- `backend/internal/services/mail_service_test.go` helper lifecycle updates.
|
||||
- Dependencies:
|
||||
- None.
|
||||
- Validation gates:
|
||||
- Validation matrix in Section 6.
|
||||
- Rollback contingency:
|
||||
- Revert single PR if instability increases.
|
||||
|
||||
## 5) Dependency and Runtime Audit Plan (verify Discord-only runtime)
|
||||
## 9) Config File Review Outcome
|
||||
|
||||
Run and record all commands in implementation evidence.
|
||||
|
||||
Task-aligned proof gates (mandatory):
|
||||
|
||||
- Run task `Security: Verify SBOM` and capture output evidence.
|
||||
- Run task `Security: Scan Docker Image (Local)` and capture output evidence.
|
||||
- Store reproducibility evidence under `test-results/notify-rollout-audit/` (task outputs, command transcripts, and generated reports).
|
||||
|
||||
### 5.1 Dependency manifests
|
||||
|
||||
- `cd /projects/Charon/backend && go mod graph | grep -i "legacy_shoutrrr|feature.notifications.legacy.fallback_enabled|EngineLegacy|legacySendFunc|shoutrrr" || true`
|
||||
- `grep -i "legacy_shoutrrr|feature.notifications.legacy.fallback_enabled|EngineLegacy|legacySendFunc|shoutrrr" /projects/Charon/backend/go.mod /projects/Charon/backend/go.sum || true`
|
||||
- `grep -i "legacy_shoutrrr|feature.notifications.legacy.fallback_enabled|EngineLegacy|legacySendFunc|shoutrrr" /projects/Charon/package-lock.json /projects/Charon/frontend/package-lock.json 2>/dev/null || true`
|
||||
|
||||
### 5.2 Source/runtime references (non-archive)
|
||||
|
||||
- `grep -RIn --exclude-dir=.git --exclude-dir=.cache --exclude-dir=docs --exclude-dir=playwright-output --exclude-dir=test-results "legacy_shoutrrr|feature.notifications.legacy.fallback_enabled|EngineLegacy|legacySendFunc|shoutrrr" /projects/Charon/backend /projects/Charon/frontend || true`
|
||||
|
||||
### 5.3 Built artifact/container runtime checks
|
||||
|
||||
- Build image: `docker build -t charon:discord-only-audit /projects/Charon`
|
||||
- Module metadata in binary:
|
||||
- `docker run --rm charon:discord-only-audit sh -lc 'go version -m /app/charon 2>/dev/null | grep -i "legacy_shoutrrr|feature.notifications.legacy.fallback_enabled|EngineLegacy|legacySendFunc|shoutrrr" || true'`
|
||||
- Runtime filesystem/strings scan:
|
||||
- `docker run --rm charon:discord-only-audit sh -lc 'find /app /usr/local/bin -maxdepth 4 -type f | xargs grep -Iin "legacy_shoutrrr|feature.notifications.legacy.fallback_enabled|EngineLegacy|legacySendFunc|shoutrrr" 2>/dev/null || true'`
|
||||
|
||||
### 5.4 Documentation references (active docs only)
|
||||
|
||||
- `grep -RIn "legacy_shoutrrr|feature.notifications.legacy.fallback_enabled|EngineLegacy|legacySendFunc|shoutrrr" /projects/Charon/README.md /projects/Charon/ARCHITECTURE.md /projects/Charon/docs/features /projects/Charon/docs/security* || true`
|
||||
|
||||
Audit pass criteria:
|
||||
|
||||
- No active dependency references.
|
||||
- No active runtime/path references.
|
||||
- No active user-facing docs claiming retired multi-service notifier behavior for this rollout.
|
||||
|
||||
## 6) Implementation Plan (phased)
|
||||
|
||||
### Phase 1: Docker E2E rebuild decision and environment prep
|
||||
|
||||
- Apply repo testing-rule decision for E2E container rebuild based on changed paths.
|
||||
- Rebuild/start E2E environment when required before any test execution.
|
||||
|
||||
### Phase 2: E2E first and failing-spec triage baseline
|
||||
|
||||
- Capture exact Firefox failures for current branch:
|
||||
- `cd /projects/Charon && npx playwright test --project=firefox`
|
||||
- Classify failing specs:
|
||||
- **In-scope**: notification provider tests (`tests/settings/notifications.spec.ts` and related).
|
||||
- **Out-of-scope**: manual DNS provider suites unless directly regressed by this change.
|
||||
- If user-reported 5 failing specs are not notifications, create/attach separate tracking item and
|
||||
do not block Discord-only implementation on unrelated suite failures.
|
||||
|
||||
### Phase 3: Local Patch Report preflight (mandatory before unit/coverage)
|
||||
|
||||
- Run local patch coverage preflight and generate required artifacts:
|
||||
- `test-results/local-patch-report.md`
|
||||
- `test-results/local-patch-report.json`
|
||||
|
||||
### Phase 4: Backend hard gate (authoritative)
|
||||
|
||||
- Implement Discord-only validation in API handlers and service layer.
|
||||
- Remove/retire active legacy engine and flag exposure from runtime path.
|
||||
- Apply migration policy for existing non-Discord rows.
|
||||
|
||||
### Phase 5: Conditional GORM security check gate
|
||||
|
||||
- Run GORM security scanner in check mode when backend model/database trigger paths are modified.
|
||||
- Gate completion on zero CRITICAL/HIGH findings for triggered changes.
|
||||
|
||||
### Phase 6: Frontend Discord-only UX
|
||||
|
||||
- Restrict dropdown/options to Discord only.
|
||||
- Align frontend type contract and submission behavior.
|
||||
- Ensure edit/create forms cannot persist stale non-Discord values.
|
||||
|
||||
### Phase 7: Legacy notifier retirement cleanup + audit evidence
|
||||
|
||||
- Remove active runtime legacy references listed in Section 4.5.
|
||||
- Execute dependency/runtime/doc audit commands in Section 5 and save evidence.
|
||||
|
||||
### Phase 8: Validation and release readiness
|
||||
|
||||
- Run validation sequence in required order (Section 7).
|
||||
- Confirm rollout note: additional providers remain disabled pending per-provider validation PRs.
|
||||
|
||||
## 7) Targeted Testing Strategy
|
||||
|
||||
Mandatory repo QA/test order for this plan:
|
||||
|
||||
1. Docker E2E rebuild decision and required rebuild/start.
|
||||
2. E2E first (Firefox baseline + targeted notification suite).
|
||||
3. Local Patch Report preflight artifact generation.
|
||||
4. Conditional GORM security check gate (`--check` semantics when trigger paths change).
|
||||
5. CodeQL CI-aligned scans (Go and JS).
|
||||
6. Coverage gates (backend/frontend coverage thresholds and patch coverage review).
|
||||
7. Frontend type-check and backend/frontend build verification.
|
||||
|
||||
### 7.1 Backend tests (in scope)
|
||||
|
||||
- `backend/internal/api/handlers/notification_provider_blocker3_test.go`
|
||||
- Expand from security-event-only gating to global Discord-only gating.
|
||||
- `backend/internal/services/notification_service_test.go`
|
||||
- Ensure non-Discord create/update/test/send rejection.
|
||||
- `backend/internal/notifications/router_test.go`
|
||||
- Prove legacy fallback decision remains disabled and cannot be toggled into active use.
|
||||
- `backend/internal/services/notification_service_discord_only_test.go`
|
||||
- Prove Discord-only dispatch behavior and no successful fallback path.
|
||||
- `backend/internal/services/security_notification_service_test.go`
|
||||
- Prove security notifications route through Discord-only provider dispatch in rollout mode.
|
||||
- Add/adjust tests for migration policy behavior on existing non-Discord rows.
|
||||
|
||||
### 7.1.1 Notify-vs-fallback evidence points (required)
|
||||
|
||||
- Unit proof SHALL assert legacy dispatch hook call-count is exactly 0 for all Discord-success and Discord-failure scenarios.
|
||||
- Unit/integration proof SHALL assert Notify dispatch hook/request path is called for Discord deliveries (including `Content-Type: application/json` and deterministic provider-type guard behavior).
|
||||
- Negative proof SHALL assert non-Discord create/update/test attempts fail with stable error codes and cannot produce a successful delivery event.
|
||||
- CI gate SHALL fail if any test demonstrates delivery success through a legacy fallback path.
|
||||
|
||||
### 7.2 Frontend unit tests (in scope)
|
||||
|
||||
- `frontend/src/pages/__tests__/Notifications.test.tsx`
|
||||
- Assert only Discord option exists in provider-type select.
|
||||
- Remove/adjust non-Discord create assumptions.
|
||||
|
||||
### 7.3 E2E tests (in scope)
|
||||
|
||||
- `tests/settings/notifications.spec.ts`
|
||||
- Keep Discord create/edit/test flows.
|
||||
- Replace or retire non-Discord CRUD expectations for this phase.
|
||||
|
||||
### 7.4 Firefox failing specs handling (required context)
|
||||
|
||||
- If the current failing 5 specs are notification-related, they are in scope and must be fixed
|
||||
within this PR.
|
||||
- If they are unrelated (for example manual-dns-provider failures), track separately and avoid
|
||||
scope creep; do not mark as fixed under this provider-type change.
|
||||
|
||||
### 7.5 Suggested verification command set
|
||||
|
||||
- `cd /projects/Charon && npx playwright test --project=firefox tests/settings/notifications.spec.ts`
|
||||
- `cd /projects/Charon && npx playwright test --project=firefox --grep "Notification Providers"`
|
||||
- `cd /projects/Charon && bash scripts/local-patch-report.sh`
|
||||
- `cd /projects/Charon && ./scripts/scan-gorm-security.sh --check` (conditional on trigger paths)
|
||||
- `cd /projects/Charon && pre-commit run codeql-go-scan --all-files`
|
||||
- `cd /projects/Charon && pre-commit run codeql-js-scan --all-files`
|
||||
- `cd /projects/Charon && go test ./backend/internal/api/handlers -run Blocker3 -v`
|
||||
- `cd /projects/Charon && go test ./backend/internal/services -run Notification -v`
|
||||
- `cd /projects/Charon && scripts/go-test-coverage.sh`
|
||||
- `cd /projects/Charon/frontend && npm test -- Notifications`
|
||||
- `cd /projects/Charon && scripts/frontend-test-coverage.sh`
|
||||
- `cd /projects/Charon/frontend && npm run type-check && npm run build`
|
||||
- `cd /projects/Charon/backend && go build ./...`
|
||||
|
||||
## 8) Review of `.gitignore`, `.dockerignore`, `codecov.yml`, `Dockerfile`
|
||||
|
||||
### Findings and required updates decision
|
||||
Reviewed for this request:
|
||||
|
||||
- `.gitignore`
|
||||
- No mandatory change required for feature behavior.
|
||||
- Optional: ensure any new audit evidence artifacts (if added) are either committed intentionally
|
||||
or ignored consistently.
|
||||
- `.dockerignore`
|
||||
- No mandatory change required for Discord-only behavior.
|
||||
- If adding migration/audit helper scripts needed at build time, ensure they are not excluded.
|
||||
- `codecov.yml`
|
||||
- No mandatory change required.
|
||||
- If new test files are added, verify they are not accidentally ignored by broad patterns.
|
||||
- `.dockerignore`
|
||||
- `Dockerfile`
|
||||
- No direct package-level retired notifier dependency is declared.
|
||||
- No mandatory Dockerfile change for Discord-only behavior unless runtime audit shows residual
|
||||
retired notifier-linked binaries or references.
|
||||
|
||||
## 9) PR Slicing Strategy
|
||||
Suggested updates:
|
||||
|
||||
Decision: **Multiple PRs (3 slices)** for safer rollout and easier rollback.
|
||||
- None required for this scope.
|
||||
- Revisit only if implementation introduces new generated artifacts or test
|
||||
output paths not currently handled (not expected here).
|
||||
|
||||
### PR-1: Backend guardrails + migration policy
|
||||
## 10) Acceptance Criteria
|
||||
|
||||
- Scope:
|
||||
- API/service Discord-only validation.
|
||||
- Existing non-Discord compatibility policy implementation.
|
||||
- Legacy flag/engine runtime retirement in backend.
|
||||
- Primary files:
|
||||
- `backend/internal/api/handlers/notification_provider_handler.go`
|
||||
- `backend/internal/services/notification_service.go`
|
||||
- `backend/internal/services/enhanced_security_notification_service.go`
|
||||
- `backend/internal/notifications/engine.go`
|
||||
- `backend/internal/notifications/router.go`
|
||||
- `backend/internal/notifications/feature_flags.go`
|
||||
- `backend/internal/api/handlers/feature_flags_handler.go`
|
||||
- `backend/internal/models/notification_provider.go`
|
||||
- Gate:
|
||||
- Backend tests pass, API rejects non-Discord, migration behavior verified.
|
||||
- Rollback:
|
||||
- Revert PR-1 if existing-provider compatibility regresses.
|
||||
- Do not reintroduce fallback by setting legacy fallback flag true or restoring legacy dispatch routing as a hotfix.
|
||||
- AC1: `startMockSMTPServer` accepts in loop until cleanup and no longer exits
|
||||
after a single connection.
|
||||
- AC2: `startMockSSLSMTPServer` accepts in loop until cleanup and no longer
|
||||
exits after a single connection.
|
||||
- AC3: Each accepted connection is handled in its own goroutine.
|
||||
- AC4: Cleanup closes listener and uses done-channel plus per-connection
|
||||
waitgroup synchronization with bounded timeout.
|
||||
- AC5: Unexpected `Accept()` failures are surfaced as test failure signals;
|
||||
expected listener-close shutdown errors are treated as normal.
|
||||
- AC6: `TestMailService_TestConnection_StartTLSSuccessWithAuth` passes reliably
|
||||
under repeated runs.
|
||||
- AC7: Targeted race validation for mail-service tests passes with `go test -race`.
|
||||
- AC8: Cleanup timeout path reports failure and returns (non-hanging), never
|
||||
silent pass.
|
||||
- AC9: Backend coverage script/task completes successfully at configured
|
||||
threshold.
|
||||
- AC10: No production file changes are included in the implementation PR.
|
||||
|
||||
### PR-2: Frontend Discord-only UX and tests
|
||||
## 11) Definition of Done
|
||||
|
||||
- Scope:
|
||||
- Dropdown/UI restriction to Discord only.
|
||||
- Frontend test updates for new provider-type policy.
|
||||
- Primary files:
|
||||
- `frontend/src/pages/Notifications.tsx`
|
||||
- `frontend/src/api/notifications.ts`
|
||||
- `frontend/src/pages/__tests__/Notifications.test.tsx`
|
||||
- `tests/settings/notifications.spec.ts`
|
||||
- Gate:
|
||||
- Dependency: PR-2 entry is blocked until PR-1 is merged.
|
||||
- Firefox notifications E2E targeted suite passes.
|
||||
- Exit: frontend request contract submits `type=discord` only and backend rejects non-Discord mutations.
|
||||
- Rollback:
|
||||
- Revert PR-2 without reverting backend safeguards.
|
||||
- No rollback action may re-enable hidden fallback behavior.
|
||||
|
||||
### PR-3: Legacy notifier retirement evidence + docs alignment
|
||||
|
||||
- Scope:
|
||||
- Execute audit commands, capture evidence, update active docs if needed.
|
||||
- Primary files:
|
||||
- `README.md`, `ARCHITECTURE.md`, relevant `docs/features/**` and `docs/security*` files (only if active references remain).
|
||||
- Gate:
|
||||
- Dependency: PR-3 entry is blocked until PR-2 is merged.
|
||||
- Entry blocker: runtime/dependency audit must pass before PR-3 can proceed.
|
||||
- Dependency/runtime/doc audit pass criteria met.
|
||||
- Exit: docs must reflect final backend/frontend state after PR-1 and PR-2 behavior.
|
||||
- Rollback:
|
||||
- Revert docs-only evidence changes without impacting runtime behavior.
|
||||
- Preserve explicit no-fallback runtime guarantees from earlier slices.
|
||||
|
||||
## 10) Rollback Strategy (No Hidden Fallback Ambiguity)
|
||||
|
||||
Allowed rollback mechanisms:
|
||||
|
||||
1. Revert entire rollout slice(s) to previous known-good commit/tag.
|
||||
2. Pause notifications by disabling delivery globally while keeping legacy fallback permanently disabled (`feature.notifications.legacy.fallback_enabled=false`, immutable). Rollback SHALL revert to a prior release tag/DB snapshot only; it SHALL NOT re-enable or simulate legacy fallback.
|
||||
3. Restore DB snapshot if migration state must be reverted.
|
||||
|
||||
Disallowed rollback mechanisms:
|
||||
|
||||
- Re-enable `feature.notifications.legacy.fallback_enabled` for runtime dispatch.
|
||||
- Reintroduce runtime `EngineLegacy` or implicit fallback retry paths.
|
||||
- Apply emergency patches that create dual-path ambiguity for Discord success attribution.
|
||||
|
||||
## 11) Risks and Mitigations
|
||||
|
||||
- Risk: Existing non-Discord providers silently break user expectations.
|
||||
- Mitigation: explicit deprecated/read-only policy and clear API errors.
|
||||
- Risk: Hidden runtime legacy paths remain while UI appears Discord-only.
|
||||
- Mitigation: backend-first enforcement and runtime audit commands.
|
||||
- Risk: Scope creep from unrelated Firefox failures.
|
||||
- Mitigation: strict in-scope/out-of-scope classification in Phase 1.
|
||||
- Risk: Future provider rollout pressure bypasses validation discipline.
|
||||
- Mitigation: staged enablement policy with per-provider validation gate.
|
||||
|
||||
## 12) Acceptance Criteria
|
||||
|
||||
1. Provider type dropdown in notifications UI exposes only Discord.
|
||||
2. API create/update/test rejects any non-Discord provider type with deterministic error.
|
||||
3. Service/runtime dispatch is Discord-only for this rollout and attributable to Notify path.
|
||||
4. Existing non-Discord DB rows follow explicit compatibility policy (deprecated read-only and non-dispatch).
|
||||
5. Retired notifier references are absent from active dependency manifests (`go.mod`, `go.sum`, package lock files).
|
||||
6. Runtime audit confirms no active retired-notifier install/link/reference in executable/runtime paths.
|
||||
7. Active docs no longer claim multi-service notifier behavior for the current rollout.
|
||||
8. Firefox failing-spec triage clearly distinguishes in-scope notification failures from separately tracked suites.
|
||||
9. `.gitignore`, `.dockerignore`, `codecov.yml`, and `Dockerfile` are reviewed and updated only if required by actual implementation deltas.
|
||||
10. Additional provider services remain disabled pending one-by-one validated rollout PRs.
|
||||
11. Legacy fallback feature-flag behavior is force-false and non-revivable through standard API operations.
|
||||
12. Rollback procedures preserve no-fallback ambiguity constraints.
|
||||
|
||||
## 13) Handoff to Supervisor
|
||||
|
||||
This spec is ready for Supervisor review and implementation assignment.
|
||||
|
||||
Review focus:
|
||||
|
||||
- Verify defense-in-depth layering is complete (UI + API + service + runtime).
|
||||
- Confirm migration policy is explicit and reversible.
|
||||
- Confirm audit command set is sufficient to prove legacy notifier retirement.
|
||||
- Confirm PR slicing minimizes risk and isolates rollback paths.
|
||||
- Planned helper changes are implemented exactly in scoped functions.
|
||||
- Cleanup deterministically waits for accept-loop exit and active handlers via
|
||||
done + waitgroup synchronization.
|
||||
- Only expected listener-close shutdown accept errors are non-fatal; unexpected
|
||||
accept errors fail tests.
|
||||
- Cleanup timeout is reported as failure signal and cannot pass silently.
|
||||
- Validation matrix passes.
|
||||
- Diff is limited to test helper scope in `mail_service_test.go`.
|
||||
- No updates to `.gitignore`, `codecov.yml`, `.dockerignore`, or `Dockerfile`
|
||||
are required or included.
|
||||
|
||||
@@ -97,3 +97,47 @@ Scope: Final focused QA/security gate for notifications/security-event UX change
|
||||
|
||||
- Manual test plan (PR-1 + PR-2): `docs/issues/manual_test_provider_security_notifications_pr1_pr2.md`
|
||||
- Existing focused QA evidence in this report remains the baseline for automated validation.
|
||||
|
||||
## QA/Security Validation Report - SMTP Flaky Test Fix (Test-Only Backend Change)
|
||||
|
||||
Date: 2026-02-22
|
||||
Repository: /projects/Charon
|
||||
Scope: Validate SMTP STARTTLS test-stability fix without production behavior change.
|
||||
|
||||
### Scope Verification
|
||||
|
||||
| Check | Status | Evidence |
|
||||
|---|---|---|
|
||||
| Changed files are test-only (no production code changes) | PASS | `git status --short` shows only `backend/internal/services/mail_service_test.go` and `docs/plans/current_spec.md` modified. |
|
||||
| Production behavior unchanged by diff scope | PASS | No non-test backend/service implementation files modified. |
|
||||
|
||||
### Required Validation Results
|
||||
|
||||
| # | Command | Status | Evidence Snippet |
|
||||
|---|---|---|---|
|
||||
| 1 | `go test ./backend/internal/services -run TestMailService_TestConnection_StartTLSSuccessWithAuth -count=20` | PASS | `ok github.com/Wikid82/charon/backend/internal/services 1.403s` |
|
||||
| 2 | `go test -race ./backend/internal/services -run 'TestMailService_(TestConnection|Send)' -count=1` | PASS | `ok github.com/Wikid82/charon/backend/internal/services 1.270s` |
|
||||
| 3 | `bash scripts/go-test-coverage.sh` | PASS | `Statement coverage: 86.1%` / `Line coverage: 86.4%` / `Coverage requirement met` |
|
||||
| 4 | `pre-commit run --all-files` | PASS | All hooks passed, including `golangci-lint (Fast Linters - BLOCKING)`, `Go Vet`, `Frontend TypeScript Check`, `Frontend Lint (Fix)`. |
|
||||
|
||||
### Additional QA Context
|
||||
|
||||
| Check | Status | Evidence |
|
||||
|---|---|---|
|
||||
| Local patch coverage preflight artifacts generated | PASS | `bash scripts/local-patch-report.sh` produced `test-results/local-patch-report.md` and `test-results/local-patch-report.json`. |
|
||||
| Patch coverage threshold warning (advisory) | WARN (non-blocking) | Report output: `WARN: Overall patch coverage 53.8% ...` and `WARN: Backend patch coverage 52.0% ...`. |
|
||||
|
||||
### Security Stance
|
||||
|
||||
| Check | Status | Notes |
|
||||
|---|---|---|
|
||||
| New secret/token exposure risk introduced by test changes | PASS | Change scope is test helper logic only; no credentials/tokens were added to production paths, logs, or API outputs. |
|
||||
| Gotify token leakage pattern introduced | PASS | No Gotify tokenized URLs or token fields were added in the changed test file. |
|
||||
|
||||
### Blockers
|
||||
|
||||
- None.
|
||||
|
||||
### Verdict
|
||||
|
||||
**PASS** — SMTP flaky test fix validates as test-only, stable under repetition/race checks, meets backend coverage gate, passes full pre-commit, and introduces no new secret/token exposure risk.
|
||||
|
||||
Reference in New Issue
Block a user