diff --git a/backend/internal/api/handlers/user_handler.go b/backend/internal/api/handlers/user_handler.go index ef838361..31eca210 100644 --- a/backend/internal/api/handlers/user_handler.go +++ b/backend/internal/api/handlers/user_handler.go @@ -199,10 +199,19 @@ func (h *UserHandler) Setup(c *gin.Context) { }) } +// rejectPassthrough aborts with 403 if the caller is a passthrough user. +// Returns true if the request was rejected (caller should return). +func rejectPassthrough(c *gin.Context, action string) bool { + if c.GetString("role") == string(models.RolePassthrough) { + c.JSON(http.StatusForbidden, gin.H{"error": "Passthrough users cannot " + action}) + return true + } + return false +} + // RegenerateAPIKey generates a new API key for the authenticated user. func (h *UserHandler) RegenerateAPIKey(c *gin.Context) { - if c.GetString("role") == string(models.RolePassthrough) { - c.JSON(http.StatusForbidden, gin.H{"error": "Passthrough users cannot manage API keys"}) + if rejectPassthrough(c, "manage API keys") { return } userID, exists := c.Get("userID") @@ -228,8 +237,7 @@ func (h *UserHandler) RegenerateAPIKey(c *gin.Context) { // GetProfile returns the current user's profile including API key. func (h *UserHandler) GetProfile(c *gin.Context) { - if c.GetString("role") == string(models.RolePassthrough) { - c.JSON(http.StatusForbidden, gin.H{"error": "Passthrough users cannot access profile"}) + if rejectPassthrough(c, "access profile") { return } userID, exists := c.Get("userID") @@ -262,8 +270,7 @@ type UpdateProfileRequest struct { // UpdateProfile updates the authenticated user's profile. 
func (h *UserHandler) UpdateProfile(c *gin.Context) { - if c.GetString("role") == string(models.RolePassthrough) { - c.JSON(http.StatusForbidden, gin.H{"error": "Passthrough users cannot update profile"}) + if rejectPassthrough(c, "update profile") { return } userID, exists := c.Get("userID") @@ -734,8 +741,8 @@ func (h *UserHandler) UpdateUser(c *gin.Context) { } var req UpdateUserRequest - if err := c.ShouldBindJSON(&req); err != nil { - c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()}) + if bindErr := c.ShouldBindJSON(&req); bindErr != nil { + c.JSON(http.StatusBadRequest, gin.H{"error": bindErr.Error()}) return } @@ -763,7 +770,7 @@ func (h *UserHandler) UpdateUser(c *gin.Context) { if req.Email != "" { email := strings.ToLower(req.Email) var count int64 - if err := h.DB.Model(&models.User{}).Where("email = ? AND id != ?", email, id).Count(&count).Error; err == nil && count > 0 { + if dbErr := h.DB.Model(&models.User{}).Where("email = ? AND id != ?", email, id).Count(&count).Error; dbErr == nil && count > 0 { c.JSON(http.StatusConflict, gin.H{"error": "Email already in use"}) return } @@ -786,23 +793,13 @@ func (h *UserHandler) UpdateUser(c *gin.Context) { return } - // Last-admin protection - if user.Role == models.RoleAdmin && newRole != models.RoleAdmin { - var adminCount int64 - h.DB.Model(&models.User{}).Where("role = ? 
AND enabled = ?", models.RoleAdmin, true).Count(&adminCount) - if adminCount <= 1 { - c.JSON(http.StatusForbidden, gin.H{"error": "Cannot demote the last admin"}) - return - } - } - updates["role"] = string(newRole) needsSessionInvalidation = true } } if req.Password != nil { - if err := user.SetPassword(*req.Password); err != nil { + if hashErr := user.SetPassword(*req.Password); hashErr != nil { c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to hash password"}) return } @@ -818,28 +815,70 @@ func (h *UserHandler) UpdateUser(c *gin.Context) { return } - // Last-admin protection for disabling - if user.Role == models.RoleAdmin && !*req.Enabled { - var adminCount int64 - h.DB.Model(&models.User{}).Where("role = ? AND enabled = ?", models.RoleAdmin, true).Count(&adminCount) - if adminCount <= 1 { - c.JSON(http.StatusForbidden, gin.H{"error": "Cannot disable the last admin"}) - return - } - } - updates["enabled"] = *req.Enabled if !*req.Enabled { needsSessionInvalidation = true } } - if len(updates) > 0 { - if err := h.DB.Model(&user).Updates(updates).Error; err != nil { - c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to update user"}) - return + // Wrap the last-admin checks and the actual update in a transaction to prevent + // race conditions: two concurrent requests could both read adminCount==2 + // and both proceed, leaving zero admins. + err = h.DB.Transaction(func(tx *gorm.DB) error { + // Re-fetch user inside transaction for consistent state + if txErr := tx.First(&user, id).Error; txErr != nil { + return txErr } + // Last-admin protection for role demotion + if newRoleStr, ok := updates["role"]; ok { + newRole := models.UserRole(newRoleStr.(string)) + if user.Role == models.RoleAdmin && newRole != models.RoleAdmin { + var adminCount int64 + // Policy: count only enabled admins. This is stricter than "WHERE role = ?" 
+ // because a disabled admin cannot act; treating them as non-existent + // prevents leaving the system with only disabled admins. + tx.Model(&models.User{}).Where("role = ? AND enabled = ?", models.RoleAdmin, true).Count(&adminCount) + if adminCount <= 1 { + return fmt.Errorf("cannot demote the last admin") + } + } + } + + // Last-admin protection for disabling + if enabledVal, ok := updates["enabled"]; ok { + if enabled, isBool := enabledVal.(bool); isBool && !enabled { + if user.Role == models.RoleAdmin { + var adminCount int64 + // Policy: count only enabled admins (same rationale as above). + tx.Model(&models.User{}).Where("role = ? AND enabled = ?", models.RoleAdmin, true).Count(&adminCount) + if adminCount <= 1 { + return fmt.Errorf("cannot disable the last admin") + } + } + } + } + + if len(updates) > 0 { + if txErr := tx.Model(&user).Updates(updates).Error; txErr != nil { + return txErr + } + } + + return nil + }) + + if err != nil { + errMsg := err.Error() + if errMsg == "cannot demote the last admin" || errMsg == "cannot disable the last admin" { + c.JSON(http.StatusForbidden, gin.H{"error": "Cannot" + errMsg[len("cannot"):]}) + return + } + c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to update user"}) + return + } + + if len(updates) > 0 { if needsSessionInvalidation && h.AuthService != nil { if invErr := h.AuthService.InvalidateSessions(user.ID); invErr != nil { c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to invalidate sessions"}) diff --git a/backend/internal/api/middleware/auth.go b/backend/internal/api/middleware/auth.go index 726eb65a..9eea57fb 100644 --- a/backend/internal/api/middleware/auth.go +++ b/backend/internal/api/middleware/auth.go @@ -117,7 +117,7 @@ func RequireManagementAccess() gin.HandlerFunc { return func(c *gin.Context) { role := c.GetString("role") if role == string(models.RolePassthrough) { - c.AbortWithStatusJSON(http.StatusForbidden, gin.H{"error": "Forbidden"}) + 
c.AbortWithStatusJSON(http.StatusForbidden, gin.H{"error": "Pass-through users cannot access management features"}) return } c.Next() diff --git a/docs/plans/current_spec.md b/docs/plans/current_spec.md index a69a91c1..490a4c5f 100644 --- a/docs/plans/current_spec.md +++ b/docs/plans/current_spec.md @@ -1,362 +1,981 @@ -# Uptime Monitoring Regression Investigation (Scheduled vs Manual) +# User Management Consolidation & Privilege Tier System -Date: 2026-03-01 +Date: 2026-06-25 Owner: Planning Agent -Status: Investigation Complete, Fix Plan Proposed -Severity: High (false DOWN states on automated monitoring) +Status: Proposed +Severity: Medium (UX consolidation + feature enhancement) -## 1. Executive Summary +--- -Two services (Wizarr and Charon) can flip to `DOWN` during scheduled cycles while manual checks immediately return `UP` because scheduled checks use a host-level TCP gate that can short-circuit monitor-level HTTP checks. +## 1. Introduction -The scheduled path is: -- `ticker -> CheckAll -> checkAllHosts -> (host status down) -> markHostMonitorsDown` +### 1.1 Overview -The manual path is: -- `POST /api/v1/uptime/monitors/:id/check -> CheckMonitor -> checkMonitor` +Charon currently manages users through **two separate interfaces**: -Only the scheduled path runs host precheck gating. If host precheck fails (TCP to upstream host/port), `CheckAll` skips HTTP checks and forcibly writes monitor status to `down` with heartbeat message `Host unreachable`. +1. **Admin Account** page at `/settings/account` — self-service profile management (name, email, password, API key, certificate email) +2. **Account Management** page at `/settings/account-management` — admin-only user CRUD (list, invite, delete, update roles/permissions) -This is a backend state mutation problem (not only UI rendering). +This split is confusing and redundant. A logged-in admin must navigate to two different places to manage their own account versus managing other users. 
The system also lacks a formal privilege tier model — the `Role` field on the `User` model is a raw string defaulting to `"user"`, with only `"admin"` and `"user"` exposed in the frontend selector (the model comment mentions `"viewer"` but it is entirely unused). -## 1.1 Monitoring Policy (Authoritative Behavior) +### 1.2 Objectives -Charon uptime monitoring SHALL follow URL-truth semantics for HTTP/HTTPS monitors, -matching third-party external monitor behavior (Uptime Kuma style) without requiring -any additional service. +1. **Consolidate** user management into a single **"Users"** page that lists ALL users, including the admin. +2. **Introduce a formal privilege tier system** with three tiers: **Admin**, **User**, and **Pass-through**. +3. **Self-service profile editing** occurs inline or via a detail drawer/modal on the consolidated page — no separate "Admin Account" page. +4. **Remove** the `/settings/account` route and its sidebar/tab navigation entry entirely. +5. **Update navigation** to a single "Users" entry under Settings (or as a top-level item). +6. **Write E2E Playwright tests** for the consolidated page before any implementation changes. -Policy: -- HTTP/HTTPS monitors are URL-truth based. The monitor result is authoritative based - on the configured URL check outcome (status code/timeout/TLS/connectivity from URL - perspective). -- Internal TCP reachability precheck (`ForwardHost:ForwardPort`) is - non-authoritative for HTTP/HTTPS monitor status. -- TCP monitors remain endpoint-socket checks and may rely on direct socket - reachability semantics. -- Host precheck may still be used for optimization, grouping telemetry, and operator - diagnostics, but SHALL NOT force HTTP/HTTPS monitors to DOWN. +--- ## 2. 
Research Findings -### 2.1 Execution Path Comparison (Required) +### 2.1 Current Architecture: Backend + +#### 2.1.1 User Model — `backend/internal/models/user.go` + +```go +type User struct { + ID uint `json:"-" gorm:"primaryKey"` + UUID string `json:"uuid" gorm:"uniqueIndex"` + Email string `json:"email" gorm:"uniqueIndex"` + APIKey string `json:"-" gorm:"uniqueIndex"` + PasswordHash string `json:"-"` + Name string `json:"name"` + Role string `json:"role" gorm:"default:'user'"` // "admin", "user", "viewer" + Enabled bool `json:"enabled" gorm:"default:true"` + FailedLoginAttempts int `json:"-" gorm:"default:0"` + LockedUntil *time.Time `json:"-"` + LastLogin *time.Time `json:"last_login,omitempty"` + SessionVersion uint `json:"-" gorm:"default:0"` + // Invite system fields ... + PermissionMode PermissionMode `json:"permission_mode" gorm:"default:'allow_all'"` + PermittedHosts []ProxyHost `json:"permitted_hosts,omitempty" gorm:"many2many:user_permitted_hosts;"` + CreatedAt time.Time `json:"created_at"` + UpdatedAt time.Time `json:"updated_at"` +} +``` + +**Key observations:** + +| Aspect | Current State | Issue | +|--------|--------------|-------| +| `Role` field | Raw string, default `"user"` | No Go-level enum/const; `"viewer"` mentioned in comment but unused anywhere | +| Permission logic | `CanAccessHost()` — admins bypass all checks | Sound; needs extension for pass-through tier | +| JSON serialization | `ID` is `json:"-"`, `APIKey` is `json:"-"` | Correctly hidden from API responses | +| Password hashing | bcrypt via `SetPassword()` / `CheckPassword()` | Solid | + +#### 2.1.2 Auth Service — `backend/internal/services/auth_service.go` + +- `Register()`: First user becomes admin (`role = "admin"`) +- `Login()`: Account lockout after 5 failed attempts; sets `LastLogin` +- `GenerateToken()`: JWT with claims `{UserID, Role, SessionVersion}`, 24h expiry +- `AuthenticateToken()`: Validates JWT + checks `Enabled` + matches `SessionVersion` +- `InvalidateSessions()`: 
Increments `SessionVersion` to revoke all existing tokens + +**Impact of role changes:** The `Role` string is embedded in the JWT. When a user's role is changed, their existing JWT still carries the old role until it expires or `SessionVersion` is bumped. The `InvalidateSessions()` method exists for this purpose, **but it is NOT currently called in `UpdateUser()`**. This is a security gap — a user demoted from admin to passthrough retains their admin JWT for up to 24 hours. Task 2.6 addresses this. + +#### 2.1.3 Auth Middleware — `backend/internal/api/middleware/auth.go` + +```go +func AuthMiddleware(authService *services.AuthService) gin.HandlerFunc +func RequireRole(role string) gin.HandlerFunc +``` + +- `AuthMiddleware`: Extracts token from header/cookie/query, sets `userID` and `role` in Gin context. +- `RequireRole`: Checks `role` against a **single** required role string, always allows `"admin"` as fallback. Currently used only for SMTP settings. + +**Gap:** Most admin-only endpoints in `user_handler.go` do inline `role != "admin"` checks via `requireAdmin(c)` in `permission_helpers.go` rather than using middleware. This works but is inconsistent. + +#### 2.1.4 User Handler — `backend/internal/api/handlers/user_handler.go` + +Two disjoint groups of endpoints: + +| Group | Endpoints | Purpose | Auth | +|-------|-----------|---------|------| +| **Self-service** | `GET /user/profile`, `POST /user/profile`, `POST /user/api-key` | Current user edits own profile/API key | Any authenticated user | +| **Admin CRUD** | `GET /users`, `POST /users`, `POST /users/invite`, `GET /users/:id`, `PUT /users/:id`, `DELETE /users/:id`, `PUT /users/:id/permissions`, `POST /users/:id/resend-invite`, `POST /users/preview-invite-url` | Admin manages all users | Admin only (inline check) | + +**Key handler behaviors:** + +- `ListUsers()`: Returns all users with safe fields (excludes password hash, API key). Admin only. 
+- `UpdateUser()`: Can change `name`, `email`, `password`, `role`, `enabled`. **Does NOT call `InvalidateSessions()` on role change** — a demoted user retains their old-role JWT for up to 24 hours. This is a security gap that must be fixed in this spec (see Task 2.6). +- `DeleteUser()`: Prevents self-deletion. +- `GetProfile()`: Returns `{id, email, name, role, has_api_key, api_key_masked}` for the calling user. +- `UpdateProfile()`: Requires `current_password` verification when email changes. +- `RegenerateAPIKey()`: Generates new UUID-based API key for the calling user. + +#### 2.1.5 Route Registration — `backend/internal/api/routes/routes.go` + +``` +Public: + POST /api/v1/auth/login + POST /api/v1/auth/register + GET /api/v1/auth/verify (forward auth for Caddy) + GET /api/v1/auth/status + GET /api/v1/setup/status + POST /api/v1/setup + GET /api/v1/invite/validate + POST /api/v1/invite/accept + +Protected (authMiddleware): + POST /api/v1/auth/logout + POST /api/v1/auth/refresh + GET /api/v1/auth/me + POST /api/v1/auth/change-password + GET /api/v1/user/profile ← Self-service group + POST /api/v1/user/profile ← + POST /api/v1/user/api-key ← + GET /api/v1/users ← Admin CRUD group + POST /api/v1/users ← + POST /api/v1/users/invite ← + POST /api/v1/users/preview-invite-url ← + GET /api/v1/users/:id ← + PUT /api/v1/users/:id ← + DELETE /api/v1/users/:id ← + PUT /api/v1/users/:id/permissions ← + POST /api/v1/users/:id/resend-invite ← +``` + +#### 2.1.6 Forward Auth — `backend/internal/api/handlers/auth_handler.go` + +The `Verify()` handler at `GET /api/v1/auth/verify` is the critical integration point for the pass-through tier: + +1. Extracts token from cookie/header/query +2. Authenticates user via `AuthenticateToken()` +3. Reads `X-Forwarded-Host` from Caddy +4. Checks `user.CanAccessHost(hostID)` against the permission model +5. 
Returns `200 OK` with `X-Auth-User` headers or `401`/`403` + +**Pass-through users will use this exact path** — they log in to Charon only to establish a session, then Caddy's `forward_auth` directive validates their access to proxied services. They never need to see the Charon management UI. + +### 2.2 Current Architecture: Frontend + +#### 2.2.1 Router — `frontend/src/App.tsx` + +```tsx +// Three routes serve user management: +<Route path="/users" element={<UsersPage />} /> // top-level +<Route path="/settings" element={<Settings />}> + <Route path="account" element={<Account />} /> // self-service + <Route path="account-management" element={<UsersPage />} /> // admin CRUD +</Route> +``` -
- - File: `backend/internal/services/uptime_service.go:711` - -### Key divergence -- Scheduled: host-gated (precheck can override monitor) -- Manual: direct monitor check (no host gate) +The `UsersPage` component is mounted at **two** routes: `/users` and `/settings/account-management`. The `Account` component is only at `/settings/account`. -## 3. Root Cause With Evidence +#### 2.2.2 Settings Page — `frontend/src/pages/Settings.tsx` -## 3.1 Primary Root Cause: Host Precheck Overrides HTTP Success in Scheduled Cycles - -When `UptimeHost` is marked `down`, scheduled checks do not run `checkMonitor()` for that host's monitors. Instead they call `markHostMonitorsDown()` which: -- sets each monitor `Status = "down"` -- writes `UptimeHeartbeat{Status: "down", Message: "Host unreachable"}` -- maxes failure count (`FailureCount = MaxRetries`) - -Evidence: -- Short-circuit: `backend/internal/services/uptime_service.go:381` -- Forced down write: `backend/internal/services/uptime_service.go:610` -- Forced heartbeat message: `backend/internal/services/uptime_service.go:624` - -This exactly matches symptom pattern: -1. Manual refresh sets monitor `UP` via direct HTTP check. -2. Next scheduler cycle can force it back to `DOWN` from host precheck path. - -## 3.2 Hypothesis Check: TCP precheck can fail while public URL HTTP check succeeds - -Confirmed as plausible by design: -- `checkHost()` tests upstream reachability (`ForwardHost:ForwardPort`) from Charon runtime. -- `checkMonitor()` tests monitor URL (public domain URL, often via Caddy/public routing). +Tab bar with 4 items: `System`, `Notifications`, `SMTP`, `Account`. Note that **Account Management is NOT in the tab bar** — it is only reachable via the sidebar. The `Account` tab links to `/settings/account` (the self-service profile page). 
+ +#### 2.2.3 Sidebar Navigation — `frontend/src/components/Layout.tsx` -A service can be publicly reachable by monitor URL while upstream TCP precheck fails due to network namespace/routing/DNS/hairpin differences. +Under the Settings section, two entries: -This is especially likely for: -- self-referential routes (Charon monitoring Charon via public hostname) -- host/container networking asymmetry -- services reachable through proxy path but not directly on upstream socket from current runtime context +```tsx +{ name: t('navigation.adminAccount'), path: '/settings/account', icon: '🛡️' }, +{ name: t('navigation.accountManagement'), path: '/settings/account-management', icon: '👥' }, +``` -## 3.3 Recent Change Correlation (Required) +#### 2.2.4 Account Page — `frontend/src/pages/Account.tsx` -### `SyncAndCheckForHost` (regression amplifier) -- Introduced in commit `2cd19d89` and called from proxy host create path. -- Files: - - `backend/internal/services/uptime_service.go:1195` - - `backend/internal/api/handlers/proxy_host_handler.go:418` -- Behavior: creates/syncs monitor and immediately runs `checkMonitor()`. +Four cards: +1. **Profile** — name, email (with password verification for email changes) +2. **Certificate Email** — toggle between account email and custom email for ACME +3. **Password Change** — old/new/confirm password form +4. **API Key** — masked display + regenerate button -Impact: makes monitors quickly show `UP` after create/manual, then scheduler can flip to `DOWN` if host precheck fails. This increased visibility of scheduled/manual inconsistency. +Uses `api/user.ts` (singular) — endpoints: `GET /user/profile`, `POST /user/profile`, `POST /user/api-key`. -### `CleanupStaleFailureCounts` -- Introduced in `2cd19d89`, refined in `7a12ab79`. -- File: `backend/internal/services/uptime_service.go:1277` -- It runs at startup and resets stale monitor states only; not per-cycle override logic. -- Not root cause of recurring per-cycle flip. 
+#### 2.2.5 Users Page — `frontend/src/pages/UsersPage.tsx` -### Frontend effective status changes -- Latest commit `0241de69` refactors `effectiveStatus` handling. -- File: `frontend/src/pages/Uptime.tsx`. -- Backend evidence proves this is not visual-only: scheduler writes `down` heartbeats/messages directly in DB. +~900 lines. Contains: +- `InviteModal` — email, role select (`user`/`admin`), permission mode, host selection, URL preview +- `PermissionsModal` — permission mode + host checkboxes per user +- `UsersPage` (default export) — table listing all users with columns: User (name/email), Role (badge), Status (active/pending/expired), Permissions (whitelist/blacklist), Enabled (toggle), Actions (resend/permissions/delete) + +**Current limitations:** +- Admin users cannot be disabled (switch is `disabled` when `role === 'admin'`) +- Admin users cannot be deleted (delete button is `disabled` when `role === 'admin'`) +- No permission editing for admins (gear icon hidden when `role !== 'admin'`) +- Role selector in `InviteModal` only has `user` and `admin` options +- No inline profile editing — admins can't edit their own name/email/password from this page +- The admin's own row appears in the list but has limited interactivity + +#### 2.2.6 API Client Files + +**`frontend/src/api/user.ts`** (singular — self-service): + +```tsx +interface UserProfile { id, email, name, role, has_api_key, api_key_masked } +getProfile() → GET /user/profile +updateProfile(data) → POST /user/profile +regenerateApiKey() → POST /user/api-key +``` -## 3.4 Grouping Logic Analysis (`UptimeHost`/`UpstreamHost`) +**`frontend/src/api/users.ts`** (plural — admin CRUD): -Monitors are grouped by `UptimeHostID` in `CheckAll()`. `UptimeHost` is derived from `ProxyHost.ForwardHost` in sync flows. 
- -Relevant code: -- group map by `UptimeHostID`: `backend/internal/services/uptime_service.go:367` -- host linkage in sync: `backend/internal/services/uptime_service.go:189`, `backend/internal/services/uptime_service.go:226` -- sync single-host update path: `backend/internal/services/uptime_service.go:1023` - -Risk: one host precheck failure can mark all grouped monitors down without URL-level validation. - -## 4. Technical Specification (Fix Plan) - -## 4.1 Minimal Proper Fix (First) - -Goal: eliminate false DOWN while preserving existing behavior as much as possible. - -Change `CheckAll()` host-down branch to avoid hard override for HTTP/HTTPS monitors. - -Mandatory hotfix rule: -- WHEN a host precheck is `down`, THE SYSTEM SHALL partition host monitors by type inside `CheckAll()`. -- `markHostMonitorsDown` MUST be invoked only for `tcp` monitors. -- `http`/`https` monitors MUST still run through `checkMonitor()` and MUST NOT be force-written `down` by the host precheck path. -- Host precheck outcomes MAY be recorded for optimization/telemetry/grouping, but MUST NOT be treated as final status for `http`/`https` monitors. - -Proposed rule: -1. If host is down: - - For `http`/`https` monitors: still run `checkMonitor()` (do not force down). - - For `tcp` monitors: keep current host-down fast-path (`markHostMonitorsDown`) or direct tcp check. -2. If host is not down: - - Keep existing behavior (run `checkMonitor()` for all monitors). - -Rationale: -- Aligns scheduled behavior with manual for URL-based monitors. -- Preserves reverse proxy product semantics where public URL availability is the source of truth. -- Minimal code delta in `CheckAll()` decision branch. -- Preserves optimization for true TCP-only monitors. 
- -### Exact file/function targets -- `backend/internal/services/uptime_service.go` - - `CheckAll()` - - add small helper (optional): `partitionMonitorsByType(...)` - -## 4.2 Long-Term Robust Fix (Deferred) - -Introduce host precheck as advisory signal, not authoritative override. - -Design: -1. Add `HostReachability` result to run context (not persisted as forced monitor status). -2. Always execute per-monitor checks, but use host precheck to: - - tune retries/backoff - - annotate failure reason - - optimize notification batching -3. Optionally add feature flag: - - `feature.uptime.strict_host_precheck` (default `false`) - - allows legacy strict gating in environments that want it. - -Benefits: -- Removes false DOWN caused by precheck mismatch. -- Keeps performance and batching controls. -- More explicit semantics for operators. - -## 5. API/Schema Impact - -No API contract change required for minimal fix. -No database migration required for minimal fix. - -Long-term fix may add one feature flag setting only. - -## 6. EARS Requirements - -### Ubiquitous -- THE SYSTEM SHALL evaluate HTTP/HTTPS monitor availability using URL-level checks as the authoritative signal. - -### Event-driven -- WHEN the scheduled uptime cycle runs, THE SYSTEM SHALL execute HTTP/HTTPS monitor checks regardless of internal host precheck state. -- WHEN the scheduled uptime cycle runs and host precheck is down, THE SYSTEM SHALL apply host-level forced-down logic only to TCP monitors. - -### State-driven -- WHILE a monitor type is `http` or `https`, THE SYSTEM SHALL NOT force monitor status to `down` solely from internal host precheck failure. -- WHILE a monitor type is `tcp`, THE SYSTEM SHALL evaluate status using endpoint socket reachability semantics. - -### Unwanted behavior -- IF internal host precheck is unreachable AND URL-level HTTP/HTTPS check returns success, THEN THE SYSTEM SHALL set monitor status to `up`. 
-- IF internal host precheck is reachable AND URL-level HTTP/HTTPS check fails, THEN THE SYSTEM SHALL set monitor status to `down`. - -### Optional -- WHERE host precheck telemetry is enabled, THE SYSTEM SHALL record host-level reachability for diagnostics and grouping without overriding HTTP/HTTPS monitor final state. - -## 7. Implementation Plan - -### Phase 1: Reproduction Lock-In (Tests First) -- Add backend service test proving current regression: - - host precheck fails - - monitor URL check would succeed - - scheduled `CheckAll()` currently writes down (existing behavior) -- File: `backend/internal/services/uptime_service_test.go` (new test block) - -### Phase 2: Minimal Backend Fix -- Update `CheckAll()` branch logic to run HTTP/HTTPS monitors even when host is down. -- Make monitor partitioning explicit and mandatory in `CheckAll()` host-down branch. -- Add an implementation guard before partitioning: normalize monitor type using - `strings.TrimSpace` + `strings.ToLower` to prevent `HTTP`/`HTTPS` case - regressions and whitespace-related misclassification. -- Ensure `markHostMonitorsDown` is called only for TCP monitor partitions. -- File: `backend/internal/services/uptime_service.go` - -### Phase 3: Backend Validation -- Add/adjust tests: - - scheduled path no longer forces down when HTTP succeeds - - manual and scheduled reach same final state for HTTP monitors - - internal host unreachable + public URL HTTP 200 => monitor is `UP` - - internal host reachable + public URL failure => monitor is `DOWN` - - TCP monitor behavior unchanged under host-down conditions -- Files: - - `backend/internal/services/uptime_service_test.go` - - `backend/internal/services/uptime_service_race_test.go` (if needed for concurrency side-effects) - -### Phase 4: Integration/E2E Coverage -- Add targeted API-level integration test for scheduler vs manual parity. 
-- Add Playwright scenario for: - - monitor set UP by manual check - - remains UP after scheduled cycle when URL is reachable -- Add parity scenario for: - - internal TCP precheck unreachable + URL returns 200 => `UP` - - internal TCP precheck reachable + URL failure => `DOWN` -- Files: - - `backend/internal/api/routes/routes_test.go` (or uptime handler integration suite) - - `tests/monitoring/uptime-monitoring.spec.ts` (or equivalent uptime spec file) - -Scope note: -- This hotfix plan is intentionally limited to backend behavior correction and - regression tests (unit/integration/E2E). -- Dedicated documentation-phase work is deferred and out of scope for this - hotfix PR. - -## 8. Test Plan (Unit / Integration / E2E) - -Duplicate notification definition (hotfix acceptance/testing): -- A duplicate notification means the same `(monitor_id, status, - scheduler_tick_id)` is emitted more than once within a single scheduler run. - -## Unit Tests -1. `CheckAll_HostDown_DoesNotForceDown_HTTPMonitor_WhenHTTPCheckSucceeds` -2. `CheckAll_HostDown_StillHandles_TCPMonitor_Conservatively` -3. `CheckAll_ManualAndScheduledParity_HTTPMonitor` -4. `CheckAll_InternalHostUnreachable_PublicURL200_HTTPMonitorEndsUp` (blocking) -5. `CheckAll_InternalHostReachable_PublicURLFail_HTTPMonitorEndsDown` (blocking) - -## Integration Tests -1. Scheduler endpoint (`/api/v1/system/uptime/check`) parity with monitor check endpoint. -2. Verify DB heartbeat message is real HTTP result (not `Host unreachable`) for HTTP monitors where URL is reachable. -3. Verify when host precheck is down, HTTP monitor heartbeat/notification output is derived from `checkMonitor()` (not synthetic host-path `Host unreachable`). -4. Verify no duplicate notifications are emitted from host+monitor paths for the same scheduler run, where duplicate is defined as repeated `(monitor_id, status, scheduler_tick_id)`. -5. Verify internal host precheck unreachable + public URL 200 still resolves monitor `UP`. -6. 
Verify internal host precheck reachable + public URL failure resolves monitor `DOWN`. - -## E2E Tests -1. Create/sync monitor scenario where manual refresh returns `UP`. -2. Wait one scheduler interval. -3. Assert monitor remains `UP` and latest heartbeat is not forced `Host unreachable` for reachable URL. -4. Assert scenario: internal host precheck unreachable + public URL 200 => monitor remains `UP`. -5. Assert scenario: internal host precheck reachable + public URL failure => monitor is `DOWN`. - -## Regression Guardrails -- Add a test explicitly asserting that host precheck must not unconditionally override HTTP monitor checks. -- Add explicit assertions that HTTP monitors under host-down precheck emit - check-derived heartbeat messages and do not produce duplicate notifications - under the `(monitor_id, status, scheduler_tick_id)` rule within a single - scheduler run. - -## 9. Risks and Rollback - -## Risks -1. More HTTP checks under true host outage may increase check volume. -2. Notification patterns may shift from single host-level event to monitor-level batched events. -3. Edge cases for mixed-type monitor groups (HTTP + TCP) need deterministic behavior. - -## Mitigations -1. Preserve batching (`queueDownNotification`) and existing retry thresholds. -2. Keep TCP strict path unchanged in minimal fix. -3. Add explicit log fields and targeted tests for mixed groups. - -## Rollback Plan -1. Revert the `CheckAll()` branch change only (single-file rollback). -2. Keep added tests; mark expected behavior as legacy if temporary rollback needed. -3. If necessary, introduce temporary feature toggle to switch between strict and tolerant host gating. - -## 10. 
PR Slicing Strategy - -Decision: Single focused PR (hotfix + tests) - -Trigger reasons: -- High-severity runtime behavior fix requiring minimal blast radius -- Fast review/rollback with behavior-only delta plus regression coverage -- Avoid scope creep into optional hardening/feature-flag work - -### PR-1 (Hotfix + Tests) -Scope: -- `CheckAll()` host-down branch adjustment for HTTP/HTTPS -- Unit/integration/E2E regression tests for URL-truth semantics - -Files: -- `backend/internal/services/uptime_service.go` -- `backend/internal/services/uptime_service_test.go` -- `backend/internal/api/routes/routes_test.go` (or equivalent) -- `tests/monitoring/uptime-monitoring.spec.ts` (or equivalent) - -Validation gates: -- backend unit tests pass -- targeted uptime integration tests pass -- targeted uptime E2E tests pass -- no behavior regression in existing `CheckAll` tests - -Rollback: -- single revert of PR-1 commit - -## 11. Acceptance Criteria (DoD) - -1. Scheduled and manual checks produce consistent status for HTTP/HTTPS monitors. -2. A reachable monitor URL is not forced to `DOWN` solely by host precheck failure. -3. New regression tests fail before fix and pass after fix. -4. No break in TCP monitor behavior expectations. -5. No new critical/high security findings in touched paths. -6. Blocking parity case passes: internal host precheck unreachable + public URL 200 => scheduled result is `UP`. -7. Blocking parity case passes: internal host precheck reachable + public URL failure => scheduled result is `DOWN`. -8. Under host-down precheck, HTTP monitors produce check-derived heartbeat messages (not synthetic `Host unreachable` from host path). -9. No duplicate notifications are produced by host+monitor paths within a - single scheduler run, where duplicate is defined as repeated - `(monitor_id, status, scheduler_tick_id)`. - -## 12. Implementation Risks - -1. Increased scheduler workload during host-precheck failures because HTTP/HTTPS checks continue to run. -2. 
Notification cadence may change due to check-derived monitor outcomes replacing host-forced synthetic downs. -3. Mixed monitor groups (TCP + HTTP/HTTPS) require strict ordering/partitioning to avoid regression. - -Mitigations: -- Keep change localized to `CheckAll()` host-down branch decisioning. -- Add explicit regression tests for both parity directions and mixed monitor types. -- Keep rollback path as single-commit revert. +```tsx +interface User { id, uuid, email, name, role, enabled, last_login, invite_status, ... } +type PermissionMode = 'allow_all' | 'deny_all' +listUsers() → GET /users +getUser(id) → GET /users/:id +createUser(data) → POST /users +inviteUser(data) → POST /users/invite +updateUser(id, data) → PUT /users/:id +deleteUser(id) → DELETE /users/:id +updateUserPermissions(id, d) → PUT /users/:id/permissions +validateInvite(token) → GET /invite/validate +acceptInvite(data) → POST /invite/accept +previewInviteURL(email) → POST /users/preview-invite-url +resendInvite(id) → POST /users/:id/resend-invite +``` + +#### 2.2.7 Auth Context — `frontend/src/context/AuthContextValue.ts` + +```tsx +interface User { user_id: number; role: string; name?: string; email?: string; } +interface AuthContextType { user, login, logout, changePassword, isAuthenticated, isLoading } +``` + +The `AuthProvider` in `AuthContext.tsx` fetches `/api/v1/auth/me` on mount and stores the user. Auto-logout after 15 minutes of inactivity. + +#### 2.2.8 Route Protection — `frontend/src/components/RequireAuth.tsx` + +Checks `isAuthenticated`, localStorage token, and `user !== null`. Redirects to `/login` if any check fails. **No role-based route protection exists** — all authenticated users see the same navigation and routes. 
+ +#### 2.2.9 Translation Keys — `frontend/src/locales/en/translation.json` + +Relevant keys: +```json +"navigation.adminAccount": "Admin Account", +"navigation.accountManagement": "Account Management", +"users.roleUser": "User", +"users.roleAdmin": "Admin" +``` + +No keys exist for `"roleViewer"` or `"rolePassthrough"`. + +### 2.3 Current Architecture: E2E Tests + +**No Playwright E2E tests exist** for user management or account pages. The `tests/` directory contains specs for DNS providers, modal dropdowns, and debug utilities only. + +### 2.4 Configuration Files Review + +| File | Relevance | Action Needed | +|------|-----------|---------------| +| `.gitignore` | May need entries for new test artifacts | Review during implementation | +| `codecov.yml` | Coverage thresholds may need adjustment | Review if new files change coverage ratios | +| `.dockerignore` | No impact expected | No changes | +| `Dockerfile` | No impact expected | No changes | + +--- + +## 3. Technical Specifications + +### 3.1 Privilege Tier System + +#### 3.1.1 Tier Definitions + +| Tier | Value | Charon UI Access | Forward Auth (Proxy) Access | Can Manage Others | +|------|-------|------------------|---------------------------|-------------------| +| **Admin** | `"admin"` | Full access to all pages and settings | All hosts (bypasses permission checks) | Yes — full CRUD on all users | +| **User** | `"user"` | Access to Charon UI except admin-only pages (user management, system settings) | Based on `permission_mode` + `permitted_hosts` | No | +| **Pass-through** | `"passthrough"` | Login page only — redirected away from all management pages after login | Based on `permission_mode` + `permitted_hosts` | No | + +#### 3.1.2 Behavioral Details + +**Admin:** +- Unchanged from current behavior. +- Can edit any user's role, permissions, name, email, enabled state. +- Can edit their own profile inline on the consolidated Users page. +- Cannot delete themselves. Cannot disable themselves. 
+ +**User:** +- Can log in to the Charon management UI. +- Can view proxy hosts, certificates, DNS, uptime, and other read-oriented pages. +- Cannot access: User management, System settings, SMTP settings, Encryption, Plugins. +- Can edit their own profile (name, email, password, API key) via self-service section on the Users page or a dedicated "My Account" modal. +- Forward auth access is governed by `permission_mode` + `permitted_hosts`. + +**Pass-through:** +- Can log in to Charon (to obtain a session cookie/JWT). +- Immediately after login, is redirected to `/passthrough` landing page — **no access to the management UI**. +- The session cookie is used by Caddy's `forward_auth` to grant access to proxied services based on `permission_mode` + `permitted_hosts`. +- Can access **only**: `/auth/logout`, `/auth/refresh`, `/auth/me`, `/auth/change-password`, `/auth/accessible-hosts`, and `/auth/check-host/:hostId` (matching the exempt route list in §3.3.3). +- **Cannot access**: `/user/profile`, `/user/api-key`, or any management API endpoints. +- Cannot access any Charon management UI pages. + +#### 3.1.3 Backend Role Constants + +Add typed constants to `backend/internal/models/user.go`: + +```go +type UserRole string + +const ( + RoleAdmin UserRole = "admin" + RoleUser UserRole = "user" + RolePassthrough UserRole = "passthrough" +) + +func (r UserRole) IsValid() bool { + switch r { + case RoleAdmin, RoleUser, RolePassthrough: + return true + } + return false +} +``` + +Change `User.Role` from `string` to `UserRole` (a defined type whose underlying type is still `string`, so GORM and JSON work identically — zero migration friction). + +### 3.2 Database Schema Changes + +#### 3.2.1 User Table + +**No schema migration needed.** The `Role` column is already a `TEXT` field storing string values. Changing the Go type from `string` to `UserRole` (underlying type `string`) requires no ALTER TABLE. New role values (`"passthrough"`) are immediately storable. + +#### 3.2.2 Data Migration + +A one-time migration function should: +1. 
Scan for any users with `role = "viewer"` and update them to `role = "passthrough"` (in case any were manually created via the unused viewer path). +2. Validate all existing roles are in the allowed set. + +This runs in the `AutoMigrate` block in `routes.go`. + +The migration must be in a clearly named function (e.g., `migrateViewerToPassthrough()`) and must log when it runs and how many records it affected: + +```go +func migrateViewerToPassthrough(db *gorm.DB) { + result := db.Model(&User{}).Where("role = ?", "viewer").Update("role", string(RolePassthrough)) + if result.RowsAffected > 0 { + log.Printf("[migration] Updated %d users from 'viewer' to 'passthrough' role", result.RowsAffected) + } +} +``` + +### 3.3 API Changes + +#### 3.3.1 Consolidated Self-Service Endpoints + +Merge the current `/user/profile` and `/user/api-key` endpoints into the existing `/users/:id` group by allowing users to edit their own record: + +| Current Endpoint | Action | Proposed | +|-----------------|--------|----------| +| `GET /user/profile` | Get own profile | **Keep** (convenience alias) | +| `POST /user/profile` | Update own profile | **Keep** (convenience alias) | +| `POST /user/api-key` | Regenerate API key | **Keep** (convenience alias) | +| `PUT /users/:id` | Admin updates any user | **Extend**: allow authenticated user to update their own record (name, email with password verification) | + +The self-service endpoints (`/user/*`) remain as convenient aliases that internally resolve to the caller's own user ID and delegate to the same service logic. No breaking API changes. + +#### 3.3.2 Role Validation + +The `UpdateUser()` and `CreateUser()` handlers must validate the `role` field against `UserRole.IsValid()`. The invite endpoint `InviteUser()` must accept `"passthrough"` as a valid role. 
+ +#### 3.3.3 Pass-through Route Protection + +Add middleware to block pass-through users from accessing management endpoints: + +```go +func RequireManagementAccess() gin.HandlerFunc { + return func(c *gin.Context) { + role := c.GetString("role") + if role == string(models.RolePassthrough) { + c.AbortWithStatusJSON(http.StatusForbidden, gin.H{ + "error": "pass-through users cannot access management features", + }) + return + } + c.Next() + } +} +``` + +Apply this middleware by restructuring the protected routes into two sub-groups: + +**Exempt routes** (authMiddleware only — no management check): +``` +POST /auth/logout +POST /auth/refresh +GET /auth/me +POST /auth/change-password +GET /auth/accessible-hosts +GET /auth/check-host/:hostId +GET /user/profile +POST /user/profile +POST /user/api-key +``` + +Note: the `/user/*` aliases remain in the exempt group so routing stays uniform, but their handlers individually reject pass-through callers with 403, preserving the §3.1.2 rule that pass-through users cannot access `/user/profile` or `/user/api-key`. + +**Management routes** (authMiddleware + `RequireManagementAccess`): +``` +GET /users +POST /users +POST /users/invite +POST /users/preview-invite-url +GET /users/:id +PUT /users/:id +DELETE /users/:id +PUT /users/:id/permissions +POST /users/:id/resend-invite +All /backup/* routes +All /logs/* routes +All /settings/* routes +All /system/* routes +All /security/* routes +All /features/* routes +All /proxy/* routes +All /certificates/* routes +All /dns/* routes +All /uptime/* routes +All /plugins/* routes +``` + +The route split in `routes.go` should look like: + +```go +// Self-service + auth routes — exempt from the management check; note the +// /user/* handlers below still reject passthrough callers with 403 themselves. +exempt := protected.Group("") +{ + exempt.POST("/auth/logout", ...) + exempt.POST("/auth/refresh", ...) + exempt.GET("/auth/me", ...) + exempt.POST("/auth/change-password", ...) + exempt.GET("/auth/accessible-hosts", ...) + exempt.GET("/auth/check-host/:hostId", ...) + exempt.GET("/user/profile", ...) + exempt.POST("/user/profile", ...) + exempt.POST("/user/api-key", ...) 
+} + +// Management routes — blocked for passthrough users +management := protected.Group("", middleware.RequireManagementAccess()) +{ + // All other routes registered here +} +``` + +#### 3.3.4 New Endpoint: Pass-through Landing + +No new backend endpoint needed. The forward auth flow (`GET /auth/verify`) already works for pass-through users. The minimal landing page is purely a frontend concern. + +### 3.4 Frontend Changes + +#### 3.4.1 Consolidated Users Page + +Transform `UsersPage.tsx` into the single source of truth for all user management: + +**New capabilities:** +1. **Admin's own row is fully interactive** — edit name/email/password, regenerate API key, view cert email settings. +2. **"My Profile" section** at the top (or accessible via a button) for the currently logged-in user to manage their own account, replacing the `Account.tsx` page entirely. +3. **Role selector** includes three options: `Admin`, `User`, `Pass-through`. +4. **Permission controls** are shown for `User` and `Pass-through` roles (hidden for `Admin`). +5. **Inline editing** or a detail drawer for editing individual users. + +**Components to modify:** + +| Component | File | Change | +|-----------|------|--------| +| `InviteModal` | `UsersPage.tsx` | Add `"passthrough"` to role `