Add Caddy restart detection, auto-recovery, and metrics exposure
- Implemented health monitoring service that detects Caddy restarts/crashes - Automatically reapplies configuration when Caddy restarts - Added metrics settings UI for enabling Prometheus/Grafana monitoring - Caddy metrics exposed on separate port (default: 9090) via reverse proxy - Admin API (port 2019) kept internal-only for security - Updated docker-compose.yml with metrics port documentation 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -2,11 +2,12 @@
|
||||
|
||||
import { useFormState } from "react-dom";
|
||||
import { Alert, Box, Button, Card, CardContent, Checkbox, FormControlLabel, Stack, TextField, Typography } from "@mui/material";
|
||||
import type { GeneralSettings, AuthentikSettings } from "@/src/lib/settings";
|
||||
import type { GeneralSettings, AuthentikSettings, MetricsSettings } from "@/src/lib/settings";
|
||||
import {
|
||||
updateCloudflareSettingsAction,
|
||||
updateGeneralSettingsAction,
|
||||
updateAuthentikSettingsAction
|
||||
updateAuthentikSettingsAction,
|
||||
updateMetricsSettingsAction
|
||||
} from "./actions";
|
||||
|
||||
type Props = {
|
||||
@@ -17,12 +18,14 @@ type Props = {
|
||||
accountId?: string;
|
||||
};
|
||||
authentik: AuthentikSettings | null;
|
||||
metrics: MetricsSettings | null;
|
||||
};
|
||||
|
||||
export default function SettingsClient({ general, cloudflare, authentik }: Props) {
|
||||
export default function SettingsClient({ general, cloudflare, authentik, metrics }: Props) {
|
||||
const [generalState, generalFormAction] = useFormState(updateGeneralSettingsAction, null);
|
||||
const [cloudflareState, cloudflareFormAction] = useFormState(updateCloudflareSettingsAction, null);
|
||||
const [authentikState, authentikFormAction] = useFormState(updateAuthentikSettingsAction, null);
|
||||
const [metricsState, metricsFormAction] = useFormState(updateMetricsSettingsAction, null);
|
||||
|
||||
return (
|
||||
<Stack spacing={4} sx={{ width: "100%" }}>
|
||||
@@ -158,6 +161,46 @@ export default function SettingsClient({ general, cloudflare, authentik }: Props
|
||||
</Stack>
|
||||
</CardContent>
|
||||
</Card>
|
||||
|
||||
<Card>
|
||||
<CardContent>
|
||||
<Typography variant="h6" fontWeight={600} gutterBottom>
|
||||
Metrics & Monitoring
|
||||
</Typography>
|
||||
<Typography color="text.secondary" variant="body2" sx={{ mb: 2 }}>
|
||||
Enable Caddy metrics exposure for monitoring with Prometheus, Grafana, or other observability tools.
|
||||
Metrics will be available at http://caddy:{metrics?.port ?? 9090}/metrics on a separate port (NOT the admin API port for security).
|
||||
</Typography>
|
||||
<Stack component="form" action={metricsFormAction} spacing={2}>
|
||||
{metricsState?.message && (
|
||||
<Alert severity={metricsState.success ? "success" : "warning"}>
|
||||
{metricsState.message}
|
||||
</Alert>
|
||||
)}
|
||||
<FormControlLabel
|
||||
control={<Checkbox name="enabled" defaultChecked={metrics?.enabled ?? false} />}
|
||||
label="Enable metrics endpoint"
|
||||
/>
|
||||
<TextField
|
||||
name="port"
|
||||
label="Metrics Port"
|
||||
type="number"
|
||||
defaultValue={metrics?.port ?? 9090}
|
||||
helperText="Port to expose metrics on (default: 9090, separate from admin API on 2019)"
|
||||
fullWidth
|
||||
/>
|
||||
<Alert severity="info">
|
||||
After enabling metrics, configure your monitoring tool to scrape http://caddy-proxy-manager-caddy:{metrics?.port ?? 9090}/metrics from within the Docker network.
|
||||
To expose metrics externally, add a port mapping like "{metrics?.port ?? 9090}:{metrics?.port ?? 9090}" in docker-compose.yml.
|
||||
</Alert>
|
||||
<Box sx={{ display: "flex", justifyContent: "flex-end" }}>
|
||||
<Button type="submit" variant="contained">
|
||||
Save metrics settings
|
||||
</Button>
|
||||
</Box>
|
||||
</Stack>
|
||||
</CardContent>
|
||||
</Card>
|
||||
</Stack>
|
||||
);
|
||||
}
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
import { revalidatePath } from "next/cache";
|
||||
import { requireAdmin } from "@/src/lib/auth";
|
||||
import { applyCaddyConfig } from "@/src/lib/caddy";
|
||||
import { getCloudflareSettings, saveCloudflareSettings, saveGeneralSettings, saveAuthentikSettings } from "@/src/lib/settings";
|
||||
import { getCloudflareSettings, saveCloudflareSettings, saveGeneralSettings, saveAuthentikSettings, saveMetricsSettings } from "@/src/lib/settings";
|
||||
|
||||
type ActionResult = {
|
||||
success: boolean;
|
||||
@@ -86,3 +86,35 @@ export async function updateAuthentikSettingsAction(_prevState: ActionResult | n
|
||||
return { success: false, message: error instanceof Error ? error.message : "Failed to save Authentik settings" };
|
||||
}
|
||||
}
|
||||
|
||||
/** Port used when the form field is left blank. */
const DEFAULT_METRICS_PORT = 9090;

/**
 * Ports the metrics listener must not use: 80/443 are bound by the main proxy
 * server and 2019 is the Caddy admin API that the metrics route proxies to.
 */
const RESERVED_METRICS_PORTS: readonly number[] = [80, 443, 2019];

/**
 * Parse the raw "port" form field.
 *
 * @param raw - Value returned by `formData.get("port")`.
 * @returns The default port for a blank/non-string field, the parsed port for a
 *   whole number in 1-65535 that is not reserved, or `null` when the input is invalid.
 */
function parseMetricsPort(raw: unknown): number | null {
  const text = typeof raw === "string" ? raw.trim() : "";
  if (text === "") {
    return DEFAULT_METRICS_PORT;
  }
  // Digits only: rejects signs, decimals, exponents ("1e3") and hex ("0x50"),
  // all of which Number() would otherwise happily accept.
  if (!/^\d+$/.test(text)) {
    return null;
  }
  const port = Number(text);
  if (port < 1 || port > 65535 || RESERVED_METRICS_PORTS.includes(port)) {
    return null;
  }
  return port;
}

/**
 * Server action behind the "Metrics & Monitoring" form.
 *
 * Requires an admin session, validates the submitted port, persists the
 * settings, then tries to push the resulting configuration to Caddy.
 *
 * @param _prevState - Previous action result supplied by `useFormState` (unused).
 * @param formData - Submitted form; reads the "enabled" checkbox and "port" field.
 * @returns `success: false` when validation, authorization or saving fails;
 *   `success: true` once the settings are saved, even if applying them to Caddy
 *   failed (the message then explains the apply error).
 */
export async function updateMetricsSettingsAction(_prevState: ActionResult | null, formData: FormData): Promise<ActionResult> {
  try {
    await requireAdmin();

    // An unchecked checkbox is simply absent from the form data.
    const enabled = formData.get("enabled") === "on";

    const port = parseMetricsPort(formData.get("port"));
    if (port === null) {
      // Refuse to persist a port Caddy cannot bind: a bad value would make
      // every later config apply fail, not just this one.
      return {
        success: false,
        message: `Metrics port must be a whole number between 1 and 65535 and must not be ${RESERVED_METRICS_PORTS.join(", ")}`
      };
    }

    await saveMetricsSettings({ enabled, port });

    // Apply config to enable/disable metrics
    try {
      await applyCaddyConfig();
      revalidatePath("/settings");
      return { success: true, message: "Metrics settings saved and applied successfully" };
    } catch (error) {
      console.error("Failed to apply Caddy config:", error);
      revalidatePath("/settings");
      const errorMsg = error instanceof Error ? error.message : "Unknown error";
      // The settings themselves are stored, so this still counts as a success.
      return {
        success: true,
        message: `Settings saved, but could not apply to Caddy: ${errorMsg}`
      };
    }
  } catch (error) {
    console.error("Failed to save metrics settings:", error);
    return { success: false, message: error instanceof Error ? error.message : "Failed to save metrics settings" };
  }
}
|
||||
|
||||
@@ -1,14 +1,15 @@
|
||||
import SettingsClient from "./SettingsClient";
|
||||
import { getCloudflareSettings, getGeneralSettings, getAuthentikSettings } from "@/src/lib/settings";
|
||||
import { getCloudflareSettings, getGeneralSettings, getAuthentikSettings, getMetricsSettings } from "@/src/lib/settings";
|
||||
import { requireAdmin } from "@/src/lib/auth";
|
||||
|
||||
export default async function SettingsPage() {
|
||||
await requireAdmin();
|
||||
|
||||
const [general, cloudflare, authentik] = await Promise.all([
|
||||
const [general, cloudflare, authentik, metrics] = await Promise.all([
|
||||
getGeneralSettings(),
|
||||
getCloudflareSettings(),
|
||||
getAuthentikSettings()
|
||||
getAuthentikSettings(),
|
||||
getMetricsSettings()
|
||||
]);
|
||||
|
||||
return (
|
||||
@@ -20,6 +21,7 @@ export default async function SettingsPage() {
|
||||
accountId: cloudflare?.accountId
|
||||
}}
|
||||
authentik={authentik}
|
||||
metrics={metrics}
|
||||
/>
|
||||
);
|
||||
}
|
||||
|
||||
+3
-1
@@ -59,8 +59,10 @@ services:
|
||||
ports:
|
||||
- "80:80"
|
||||
- "443:443"
|
||||
# Admin API only exposed on internal network for security
|
||||
# Admin API (port 2019) is only exposed on internal network for security
|
||||
# Web UI accesses via http://caddy:2019 internally
|
||||
# Uncomment the line below to expose metrics externally for Grafana/Prometheus
|
||||
# - "9090:9090" # Metrics available at http://localhost:9090/metrics (configure in Settings first)
|
||||
environment:
|
||||
# Primary domain for Caddy configuration
|
||||
PRIMARY_DOMAIN: ${PRIMARY_DOMAIN:-caddyproxymanager.com}
|
||||
|
||||
@@ -37,5 +37,15 @@ export async function register() {
|
||||
// Don't throw - Caddy might not be ready yet, or config might be applied later
|
||||
// This ensures proxy hosts work after container restart
|
||||
}
|
||||
|
||||
// Start Caddy health monitoring to detect restarts and auto-reapply config
|
||||
const { startCaddyMonitoring } = await import("./lib/caddy-monitor");
|
||||
try {
|
||||
startCaddyMonitoring();
|
||||
console.log("Caddy health monitoring started");
|
||||
} catch (error) {
|
||||
console.error("Failed to start Caddy health monitoring:", error);
|
||||
// Don't throw - monitoring is a nice-to-have feature
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -0,0 +1,170 @@
|
||||
/**
|
||||
* Caddy health monitoring service
|
||||
* Monitors Caddy for restarts/crashes and automatically reapplies configuration
|
||||
*/
|
||||
|
||||
import { config } from "./config";
|
||||
import { applyCaddyConfig } from "./caddy";
|
||||
import { getSetting, setSetting } from "./settings";
|
||||
|
||||
/** Bookkeeping kept by the Caddy health monitor between checks. */
type CaddyMonitorState = {
  /** Whether Caddy is currently regarded as up. */
  isHealthy: boolean;
  /** Most recently recorded config identifier, or null if none has been recorded. */
  lastConfigId: string | null;
  /** Timestamp (ms since epoch) of the most recent health check. */
  lastCheckTime: number;
  /** Number of health checks in a row that got no usable answer. */
  consecutiveFailures: number;
};

/** Milliseconds between health checks (10 seconds). */
const HEALTH_CHECK_INTERVAL = 10 * 1000;

/** Failed checks in a row before Caddy is considered unhealthy. */
const MAX_CONSECUTIVE_FAILURES = 3;

/** Milliseconds to wait after detecting a restart before reapplying (5 seconds). */
const REAPPLY_DELAY = 5 * 1000;

// Starts out "unhealthy" with nothing recorded.
let monitorState: CaddyMonitorState = {
  isHealthy: false,
  lastConfigId: null,
  lastCheckTime: 0,
  consecutiveFailures: 0
};

/** Handle of the periodic check timer, or null while no timer is installed. */
let monitorInterval: NodeJS.Timeout | null = null;

/** Whether monitoring has been started and not yet stopped. */
let isMonitoring = false;
|
||||
|
||||
/**
 * Fetch an identifier for the configuration Caddy currently has loaded.
 *
 * The body is always inspected first so that an app-less configuration is
 * reported as "empty" even when the admin API also sends an ETag; callers
 * compare against the literal "empty" to detect a restart.
 *
 * @returns
 *   - `null` when the admin API cannot be reached within 5 seconds, answers
 *     with a non-2xx status, or returns a body that is not valid JSON;
 *   - `"empty"` when the config is `null`/not an object or has no `apps` entries;
 *   - otherwise the response's ETag when present, or `"configured"`.
 */
async function getCaddyConfigId(): Promise<string | null> {
  try {
    const response = await fetch(`${config.caddyApiUrl}/config/`, {
      method: "GET",
      signal: AbortSignal.timeout(5000)
    });

    if (!response.ok) {
      return null;
    }

    // The body may legitimately be JSON `null` (no config loaded), so it must
    // be narrowed before any property access.
    const configData: unknown = await response.json();
    const apps =
      typeof configData === "object" && configData !== null
        ? (configData as { apps?: unknown }).apps
        : undefined;

    // Check if config is essentially empty (default state after restart)
    const isEmpty = typeof apps !== "object" || apps === null || Object.keys(apps).length === 0;
    if (isEmpty) {
      return "empty";
    }

    // A non-empty config is identified by its ETag when the server provides one.
    return response.headers.get("etag") || "configured";
  } catch {
    // Network error, timeout, or unparsable body
    return null;
  }
}
|
||||
|
||||
/**
 * True while a post-restart reapply is scheduled or running, so that health
 * checks arriving in the meantime do not queue duplicate reapplies.
 */
let reapplyPending = false;

/**
 * True when the last successful reapply still left Caddy reporting "empty".
 * In that case "empty" is the expected steady state and must not be treated
 * as a restart on every check.
 */
let appliedConfigIsEmpty = false;

/**
 * Run one health check against Caddy and react to the result.
 *
 * - No answer: count the failure and, after MAX_CONSECUTIVE_FAILURES in a row,
 *   mark Caddy unhealthy (logged once, on the healthy -> unhealthy transition).
 * - Answer with an "empty" config after Caddy was unhealthy, or after a
 *   config ID had already been recorded: treat it as a restart and reapply the
 *   configuration after REAPPLY_DELAY.
 * - Otherwise: record the current config ID.
 *
 * A failed reapply leaves the recorded state untouched, so a later check
 * detects the empty config again and retries.
 */
async function checkCaddyHealth(): Promise<void> {
  monitorState.lastCheckTime = Date.now();

  const currentConfigId = await getCaddyConfigId();

  if (currentConfigId === null) {
    // Caddy is not responding
    monitorState.consecutiveFailures++;

    if (monitorState.isHealthy && monitorState.consecutiveFailures >= MAX_CONSECUTIVE_FAILURES) {
      console.warn(
        `[CaddyMonitor] Caddy appears to be down (${monitorState.consecutiveFailures} consecutive failures)`
      );
      monitorState.isHealthy = false;
    }
    return;
  }

  // Caddy is responding
  const wasUnhealthy = !monitorState.isHealthy;
  monitorState.consecutiveFailures = 0;
  monitorState.isHealthy = true;

  const configLooksEmpty = currentConfigId === "empty";
  if (!configLooksEmpty) {
    // A real config is loaded again, so "empty" is no longer the expected state.
    appliedConfigIsEmpty = false;
  }

  // Coming back from an outage with an empty config always counts as a restart.
  // An empty config seen while healthy only counts when empty is not what the
  // last reapply itself produced.
  const hasRestarted =
    configLooksEmpty &&
    (wasUnhealthy || (monitorState.lastConfigId !== null && !appliedConfigIsEmpty));

  if (!hasRestarted) {
    if (monitorState.lastConfigId === null) {
      // First time seeing Caddy healthy
      console.log("[CaddyMonitor] Caddy health monitoring initialized");
    }
    monitorState.lastConfigId = currentConfigId;
    return;
  }

  if (reapplyPending) {
    // A reapply for this restart is already on its way.
    return;
  }
  reapplyPending = true;

  console.log("[CaddyMonitor] Caddy restart detected! Waiting before reapplying configuration...");

  // Wait a bit for Caddy to fully initialize
  setTimeout(async () => {
    try {
      if (!isMonitoring) {
        // Monitoring was stopped while the timer was pending.
        return;
      }

      console.log("[CaddyMonitor] Reapplying Caddy configuration after restart...");
      await applyCaddyConfig();
      console.log("[CaddyMonitor] Configuration reapplied successfully");

      // Update the config ID after successful reapplication
      const newConfigId = await getCaddyConfigId();
      monitorState.lastConfigId = newConfigId;
      appliedConfigIsEmpty = newConfigId === "empty";
    } catch (error) {
      console.error("[CaddyMonitor] Failed to reapply configuration after restart:", error);
      // Will retry on a later health check
    } finally {
      reapplyPending = false;
    }
  }, REAPPLY_DELAY);
}
|
||||
|
||||
/**
 * Begin polling Caddy's health.
 *
 * Runs one check right away and then one every HEALTH_CHECK_INTERVAL
 * milliseconds. Calling it while monitoring is already active only logs a
 * message and returns.
 */
export function startCaddyMonitoring(): void {
  if (isMonitoring) {
    console.log("[CaddyMonitor] Already monitoring");
    return;
  }

  console.log(`[CaddyMonitor] Starting Caddy health monitoring (interval: ${HEALTH_CHECK_INTERVAL}ms)`);
  isMonitoring = true;

  // Fire-and-forget a single check; a rejection is logged under the given label.
  const runCheck = (failureLabel: string): void => {
    checkCaddyHealth().catch((error) => {
      console.error(failureLabel, error);
    });
  };

  // Immediate first check, then the recurring ones.
  runCheck("[CaddyMonitor] Initial health check failed:");
  monitorInterval = setInterval(() => {
    runCheck("[CaddyMonitor] Health check failed:");
  }, HEALTH_CHECK_INTERVAL);
}
|
||||
|
||||
/**
 * Halt the periodic health checks.
 *
 * Does nothing when monitoring is not active; otherwise clears the interval
 * timer (if one is installed) and marks monitoring as stopped.
 */
export function stopCaddyMonitoring(): void {
  if (isMonitoring) {
    console.log("[CaddyMonitor] Stopping Caddy health monitoring");
    isMonitoring = false;

    if (monitorInterval !== null) {
      clearInterval(monitorInterval);
      monitorInterval = null;
    }
  }
}
|
||||
|
||||
/**
 * Return a shallow copy of the monitor's current state (handy when debugging).
 * Mutating the returned object does not affect the monitor.
 */
export function getMonitorState(): Readonly<CaddyMonitorState> {
  const snapshot: CaddyMonitorState = Object.assign({}, monitorState);
  return snapshot;
}
|
||||
+38
-15
@@ -3,7 +3,7 @@ import { join } from "node:path";
|
||||
import crypto from "node:crypto";
|
||||
import db, { nowIso } from "./db";
|
||||
import { config } from "./config";
|
||||
import { getCloudflareSettings, getGeneralSettings, setSetting } from "./settings";
|
||||
import { getCloudflareSettings, getGeneralSettings, getMetricsSettings, setSetting } from "./settings";
|
||||
import {
|
||||
accessListEntries,
|
||||
certificates,
|
||||
@@ -925,23 +925,46 @@ async function buildCaddyDocument() {
|
||||
|
||||
const hasTls = tlsConnectionPolicies.length > 0;
|
||||
|
||||
const httpApp =
|
||||
httpRoutes.length > 0
|
||||
? {
|
||||
http: {
|
||||
servers: {
|
||||
cpm: {
|
||||
listen: hasTls ? [":80", ":443"] : [":80"],
|
||||
routes: httpRoutes,
|
||||
// Only disable automatic HTTPS if we have TLS automation policies
|
||||
// This allows Caddy to handle HTTP-01 challenges for managed certificates
|
||||
...(tlsApp ? {} : { automatic_https: { disable: true } }),
|
||||
...(hasTls ? { tls_connection_policies: tlsConnectionPolicies } : {})
|
||||
// Check if metrics should be enabled
|
||||
const metricsSettings = await getMetricsSettings();
|
||||
const metricsEnabled = metricsSettings?.enabled ?? false;
|
||||
const metricsPort = metricsSettings?.port ?? 9090;
|
||||
|
||||
const servers: Record<string, unknown> = {};
|
||||
|
||||
// Main HTTP/HTTPS server for proxy hosts
|
||||
if (httpRoutes.length > 0) {
|
||||
servers.cpm = {
|
||||
listen: hasTls ? [":80", ":443"] : [":80"],
|
||||
routes: httpRoutes,
|
||||
// Only disable automatic HTTPS if we have TLS automation policies
|
||||
// This allows Caddy to handle HTTP-01 challenges for managed certificates
|
||||
...(tlsApp ? {} : { automatic_https: { disable: true } }),
|
||||
...(hasTls ? { tls_connection_policies: tlsConnectionPolicies } : {})
|
||||
};
|
||||
}
|
||||
|
||||
// Metrics server - exposes /metrics endpoint on separate port
|
||||
if (metricsEnabled) {
|
||||
servers.metrics = {
|
||||
listen: [`:${metricsPort}`],
|
||||
routes: [
|
||||
{
|
||||
handle: [
|
||||
{
|
||||
handler: "reverse_proxy",
|
||||
upstreams: [{ dial: "localhost:2019" }],
|
||||
rewrite: {
|
||||
uri: "/metrics"
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
: {};
|
||||
]
|
||||
};
|
||||
}
|
||||
|
||||
const httpApp = Object.keys(servers).length > 0 ? { http: { servers } } : {};
|
||||
|
||||
return {
|
||||
admin: {
|
||||
|
||||
@@ -21,6 +21,11 @@ export type AuthentikSettings = {
|
||||
authEndpoint?: string;
|
||||
};
|
||||
|
||||
/** Persisted configuration for exposing Caddy metrics. */
export type MetricsSettings = {
  /** Whether the metrics endpoint is switched on. */
  enabled: boolean;
  /**
   * Port to expose metrics on (default: 9090).
   * Kept separate from the admin API port.
   */
  port?: number;
};
|
||||
|
||||
export async function getSetting<T>(key: string): Promise<SettingValue<T>> {
|
||||
const setting = await db.query.settings.findFirst({
|
||||
where: (table, { eq }) => eq(table.key, key)
|
||||
@@ -81,3 +86,11 @@ export async function getAuthentikSettings(): Promise<AuthentikSettings | null>
|
||||
export async function saveAuthentikSettings(settings: AuthentikSettings): Promise<void> {
|
||||
await setSetting("authentik", settings);
|
||||
}
|
||||
|
||||
/**
 * Load the metrics settings stored under the "metrics" key.
 *
 * @returns Whatever `getSetting` yields for that key — presumably `null` when
 *   nothing has been saved yet (confirm against `getSetting`).
 */
export async function getMetricsSettings(): Promise<MetricsSettings | null> {
  const stored = await getSetting<MetricsSettings>("metrics");
  return stored;
}
|
||||
|
||||
/**
 * Persist the metrics settings under the "metrics" key.
 *
 * @param settings - Values to store.
 */
export async function saveMetricsSettings(settings: MetricsSettings): Promise<void> {
  const key = "metrics";
  await setSetting(key, settings);
}
|
||||
|
||||
Reference in New Issue
Block a user