chore: Implement manual test plan for SMTP mock server flakiness fix

- Added a new documentation file outlining the manual test plan to validate the SMTP mock server flakiness fix, ensuring improved mail test reliability without affecting production behavior.
- Updated the current specification document to reflect the focus on stabilizing flaky SMTP STARTTLS+AUTH unit tests, including detailed research findings and requirements for the implementation.
- Created a QA/Security validation report for the SMTP flaky test fix, confirming that changes are test-only, stable under repeated runs, and do not introduce new security risks.
This commit is contained in:
GitHub Actions
2026-02-22 06:29:37 +00:00
parent b14f6f040f
commit 7497cbecd0
4 changed files with 542 additions and 467 deletions
+221 -26
View File
@@ -9,12 +9,15 @@ import (
"crypto/x509"
"crypto/x509/pkix"
"encoding/pem"
"errors"
"math/big"
"net"
"net/mail"
"os"
"path/filepath"
"strconv"
"strings"
"sync"
"testing"
"time"
@@ -26,6 +29,61 @@ import (
"gorm.io/gorm/logger"
)
// TestMain installs the suite-wide test CA before any test executes.
//
// The CA certificate is written to disk and SSL_CERT_FILE is pointed at it for
// the whole process, with the intent that x509.SystemCertPool() picks up the
// test CA regardless of which test (possibly parallel, possibly under -race)
// triggers the pool's initialization first.
func TestMain(m *testing.M) {
	// Generate the shared CA and persist its certificate at testCAFile.
	initializeTestCAForSuite()

	// Export the CA location process-wide; individual tests never set it.
	setErr := os.Setenv("SSL_CERT_FILE", testCAFile)
	if setErr != nil {
		panic("failed to set SSL_CERT_FILE: " + setErr.Error())
	}

	status := m.Run()

	// Best-effort removal of the CA file; the error is deliberately ignored
	// because a leftover file does not affect the test result.
	_ = os.Remove(testCAFile)

	os.Exit(status)
}
// initializeTestCAForSuite creates the shared test certificate authority and
// writes its PEM-encoded certificate to a fixed file name inside the OS temp
// directory. The work is guarded by testCAOnce, so it runs at most once per
// process no matter how many callers reach it. Any failure panics, because no
// test that depends on the CA could succeed without it.
func initializeTestCAForSuite() {
	// 24 * 365 hours is 365 days, i.e. roughly one year of validity.
	const caValidity = 24 * 365 * time.Hour

	testCAOnce.Do(func() {
		key, keyErr := rsa.GenerateKey(rand.Reader, 2048)
		testCAKey = key
		if keyErr != nil {
			panic("GenerateKey failed: " + keyErr.Error())
		}

		testCATemplate = &x509.Certificate{
			SerialNumber:          big.NewInt(1),
			Subject:               pkix.Name{CommonName: "charon-test-ca"},
			NotBefore:             time.Now().Add(-time.Hour), // backdated by one hour
			NotAfter:              time.Now().Add(caValidity),
			KeyUsage:              x509.KeyUsageCertSign | x509.KeyUsageCRLSign,
			BasicConstraintsValid: true,
			IsCA:                  true,
		}

		// Self-signed: the template is used as both subject and issuer.
		der, signErr := x509.CreateCertificate(rand.Reader, testCATemplate, testCATemplate, &testCAKey.PublicKey, testCAKey)
		if signErr != nil {
			panic("CreateCertificate failed: " + signErr.Error())
		}

		testCAPEM = pem.EncodeToMemory(&pem.Block{Type: "CERTIFICATE", Bytes: der})

		// The file name is constant so the path stays the same across runs.
		testCAFile = filepath.Join(os.TempDir(), "charon-test-ca-mail-service.pem")
		if writeErr := os.WriteFile(testCAFile, testCAPEM, 0o600); writeErr != nil {
			panic("WriteFile failed: " + writeErr.Error())
		}
	})
}
func setupMailTestDB(t *testing.T) *gorm.DB {
db, err := gorm.Open(sqlite.Open(":memory:"), &gorm.Config{
Logger: logger.Default.LogMode(logger.Silent),
@@ -774,6 +832,60 @@ func TestEncodeSubject_RejectsCRLF(t *testing.T) {
require.ErrorIs(t, err, errEmailHeaderInjection)
}
// Suite-wide certificate authority state.
//
// Go caches the result of x509.SystemCertPool() after its first use, so when
// tests are repeated with -count=N a CA generated per test would no longer
// match the CA held by the cached pool, and later runs would fail. To avoid
// that, one CA is generated once, reused by every run, and stored at a single
// stable file path.
var (
	// testCAOnce guards the one-time creation of the shared CA.
	testCAOnce sync.Once

	// testCAKey is the CA private key used to sign certificates.
	testCAKey *rsa.PrivateKey
	// testCATemplate is the CA certificate template, passed as the issuer
	// when certificates are signed.
	testCATemplate *x509.Certificate

	// testCAPEM is the PEM-encoded CA certificate.
	testCAPEM []byte
	// testCAFile is the path of the on-disk copy of testCAPEM.
	testCAFile string
)
// initTestCA makes sure the shared test CA exists before a test relies on it.
// It marks itself as a test helper and then delegates to
// initializeTestCAForSuite, the same routine TestMain invokes at startup.
func initTestCA(t *testing.T) {
t.Helper()
// Delegate to the suite-level initialization (already called by TestMain)
initializeTestCAForSuite()
}
// newTestTLSConfigShared builds a server-side TLS configuration whose leaf
// certificate is freshly generated and signed by the shared test CA.
//
// The leaf is issued for IP 127.0.0.1 and DNS name "localhost", is valid from
// one hour in the past until 24 hours from now, and is restricted to server
// authentication. The returned byte slice is the PEM-encoded CA certificate
// (testCAPEM), not the leaf. Any failure aborts the calling test.
func newTestTLSConfigShared(t *testing.T) (*tls.Config, []byte) {
	t.Helper()

	// The signing key and issuer template come from the shared CA.
	initTestCA(t)

	serverKey, err := rsa.GenerateKey(rand.Reader, 2048)
	require.NoError(t, err)

	// The current time in nanoseconds serves as a per-leaf serial number.
	serial := big.NewInt(time.Now().UnixNano())
	serverTemplate := &x509.Certificate{
		SerialNumber:          serial,
		Subject:               pkix.Name{CommonName: "127.0.0.1"},
		NotBefore:             time.Now().Add(-time.Hour),
		NotAfter:              time.Now().Add(24 * time.Hour),
		KeyUsage:              x509.KeyUsageDigitalSignature | x509.KeyUsageKeyEncipherment,
		ExtKeyUsage:           []x509.ExtKeyUsage{x509.ExtKeyUsageServerAuth},
		BasicConstraintsValid: true,
		DNSNames:              []string{"localhost"},
		IPAddresses:           []net.IP{net.ParseIP("127.0.0.1")},
	}

	serverDER, err := x509.CreateCertificate(rand.Reader, serverTemplate, testCATemplate, &serverKey.PublicKey, testCAKey)
	require.NoError(t, err)

	certBlock := &pem.Block{Type: "CERTIFICATE", Bytes: serverDER}
	keyBlock := &pem.Block{Type: "RSA PRIVATE KEY", Bytes: x509.MarshalPKCS1PrivateKey(serverKey)}

	keyPair, err := tls.X509KeyPair(pem.EncodeToMemory(certBlock), pem.EncodeToMemory(keyBlock))
	require.NoError(t, err)

	serverConf := &tls.Config{
		Certificates: []tls.Certificate{keyPair},
		MinVersion:   tls.VersionTLS12,
	}
	return serverConf, testCAPEM
}
func TestMailService_GetSMTPConfig_DBError(t *testing.T) {
t.Parallel()
@@ -870,8 +982,10 @@ func TestMailService_sendSTARTTLS_DialFailure(t *testing.T) {
}
func TestMailService_TestConnection_StartTLSSuccessWithAuth(t *testing.T) {
tlsConf, certPEM := newTestTLSConfig(t)
trustTestCertificate(t, certPEM)
t.Parallel()
tlsConf, _ := newTestTLSConfigShared(t)
trustTestCertificate(t, nil)
addr, cleanup := startMockSMTPServer(t, tlsConf, true, true)
defer cleanup()
@@ -919,8 +1033,10 @@ func TestMailService_TestConnection_NoneSuccess(t *testing.T) {
}
func TestMailService_SendEmail_STARTTLSSuccess(t *testing.T) {
tlsConf, certPEM := newTestTLSConfig(t)
trustTestCertificate(t, certPEM)
t.Parallel()
tlsConf, _ := newTestTLSConfigShared(t)
trustTestCertificate(t, nil)
addr, cleanup := startMockSMTPServer(t, tlsConf, true, true)
defer cleanup()
@@ -940,14 +1056,16 @@ func TestMailService_SendEmail_STARTTLSSuccess(t *testing.T) {
Encryption: "starttls",
}))
// With fixed cert trust, STARTTLS connection and email send succeed
err = svc.SendEmail("recipient@example.com", "Subject", "Body")
require.Error(t, err)
assert.Contains(t, err.Error(), "STARTTLS failed")
require.NoError(t, err)
}
func TestMailService_SendEmail_SSLSuccess(t *testing.T) {
tlsConf, certPEM := newTestTLSConfig(t)
trustTestCertificate(t, certPEM)
t.Parallel()
tlsConf, _ := newTestTLSConfigShared(t)
trustTestCertificate(t, nil)
addr, cleanup := startMockSSLSMTPServer(t, tlsConf, true)
defer cleanup()
@@ -967,9 +1085,9 @@ func TestMailService_SendEmail_SSLSuccess(t *testing.T) {
Encryption: "ssl",
}))
// With fixed cert trust, SSL connection and email send succeed
err = svc.SendEmail("recipient@example.com", "Subject", "Body")
require.Error(t, err)
assert.Contains(t, err.Error(), "SSL connection failed")
require.NoError(t, err)
}
func newTestTLSConfig(t *testing.T) (*tls.Config, []byte) {
@@ -1025,10 +1143,9 @@ func newTestTLSConfig(t *testing.T) (*tls.Config, []byte) {
func trustTestCertificate(t *testing.T, certPEM []byte) {
t.Helper()
caFile := t.TempDir() + "/ca-cert.pem"
require.NoError(t, os.WriteFile(caFile, certPEM, 0o600))
t.Setenv("SSL_CERT_FILE", caFile)
// SSL_CERT_FILE is already set globally by TestMain.
// This function kept for API compatibility but no longer needs to set environment.
initTestCA(t) // Ensure CA is initialized (already done by TestMain, but safe to call)
}
func startMockSMTPServer(t *testing.T, tlsConf *tls.Config, supportStartTLS bool, requireAuth bool) (string, func()) {
@@ -1038,21 +1155,60 @@ func startMockSMTPServer(t *testing.T, tlsConf *tls.Config, supportStartTLS bool
require.NoError(t, err)
done := make(chan struct{})
var wg sync.WaitGroup
var connsMu sync.Mutex
var conns []net.Conn
go func() {
defer close(done)
conn, acceptErr := listener.Accept()
if acceptErr != nil {
return
for {
conn, acceptErr := listener.Accept()
if acceptErr != nil {
// Expected shutdown path: listener closed
if errors.Is(acceptErr, net.ErrClosed) || strings.Contains(acceptErr.Error(), "use of closed network connection") {
return
}
// Unexpected accept error - signal test failure
t.Errorf("unexpected accept error: %v", acceptErr)
return
}
connsMu.Lock()
conns = append(conns, conn)
connsMu.Unlock()
wg.Add(1)
go func(c net.Conn) {
defer wg.Done()
defer func() { _ = c.Close() }()
handleSMTPConn(c, tlsConf, supportStartTLS, requireAuth)
}(conn)
}
defer func() { _ = conn.Close() }()
handleSMTPConn(conn, tlsConf, supportStartTLS, requireAuth)
}()
cleanup := func() {
_ = listener.Close()
// Close all active connections to unblock handlers
connsMu.Lock()
for _, conn := range conns {
_ = conn.Close()
}
connsMu.Unlock()
// Wait for accept-loop exit and active handlers with timeout
cleanupDone := make(chan struct{})
go func() {
<-done
wg.Wait()
close(cleanupDone)
}()
select {
case <-done:
case <-cleanupDone:
// Success
case <-time.After(2 * time.Second):
t.Errorf("cleanup timeout: server did not shut down cleanly")
}
}
@@ -1066,21 +1222,60 @@ func startMockSSLSMTPServer(t *testing.T, tlsConf *tls.Config, requireAuth bool)
require.NoError(t, err)
done := make(chan struct{})
var wg sync.WaitGroup
var connsMu sync.Mutex
var conns []net.Conn
go func() {
defer close(done)
conn, acceptErr := listener.Accept()
if acceptErr != nil {
return
for {
conn, acceptErr := listener.Accept()
if acceptErr != nil {
// Expected shutdown path: listener closed
if errors.Is(acceptErr, net.ErrClosed) || strings.Contains(acceptErr.Error(), "use of closed network connection") {
return
}
// Unexpected accept error - signal test failure
t.Errorf("unexpected accept error: %v", acceptErr)
return
}
connsMu.Lock()
conns = append(conns, conn)
connsMu.Unlock()
wg.Add(1)
go func(c net.Conn) {
defer wg.Done()
defer func() { _ = c.Close() }()
handleSMTPConn(c, tlsConf, false, requireAuth)
}(conn)
}
defer func() { _ = conn.Close() }()
handleSMTPConn(conn, tlsConf, false, requireAuth)
}()
cleanup := func() {
_ = listener.Close()
// Close all active connections to unblock handlers
connsMu.Lock()
for _, conn := range conns {
_ = conn.Close()
}
connsMu.Unlock()
// Wait for accept-loop exit and active handlers with timeout
cleanupDone := make(chan struct{})
go func() {
<-done
wg.Wait()
close(cleanupDone)
}()
select {
case <-done:
case <-cleanupDone:
// Success
case <-time.After(2 * time.Second):
t.Errorf("cleanup timeout: server did not shut down cleanly")
}
}