diff --git a/.docker/docker-entrypoint.sh b/.docker/docker-entrypoint.sh index 7028d7a9..f1f4c471 100755 --- a/.docker/docker-entrypoint.sh +++ b/.docker/docker-entrypoint.sh @@ -18,6 +18,42 @@ run_as_charon() { fi } +get_group_by_gid() { + if command -v getent >/dev/null 2>&1; then + getent group "$1" 2>/dev/null || true + else + awk -F: -v gid="$1" '$3==gid {print $0}' /etc/group 2>/dev/null || true + fi +} + +create_group_with_gid() { + local gid="$1" + local name="$2" + + if command -v addgroup >/dev/null 2>&1; then + addgroup -g "$gid" "$name" 2>/dev/null || true + return + fi + + if command -v groupadd >/dev/null 2>&1; then + groupadd -g "$gid" "$name" 2>/dev/null || true + fi +} + +add_user_to_group() { + local user="$1" + local group="$2" + + if command -v addgroup >/dev/null 2>&1; then + addgroup "$user" "$group" 2>/dev/null || true + return + fi + + if command -v usermod >/dev/null 2>&1; then + usermod -aG "$group" "$user" 2>/dev/null || true + fi +} + # ============================================================================ # Volume Permission Handling for Non-Root User # ============================================================================ @@ -89,18 +125,19 @@ if [ -S "/var/run/docker.sock" ] && is_root; then DOCKER_SOCK_GID=$(stat -c '%g' /var/run/docker.sock 2>/dev/null || echo "") if [ -n "$DOCKER_SOCK_GID" ] && [ "$DOCKER_SOCK_GID" != "0" ]; then # Check if a group with this GID exists - if ! getent group "$DOCKER_SOCK_GID" >/dev/null 2>&1; then + GROUP_ENTRY=$(get_group_by_gid "$DOCKER_SOCK_GID") + if [ -z "$GROUP_ENTRY" ]; then echo "Docker socket detected (gid=$DOCKER_SOCK_GID) - creating docker group and adding charon user..." 
# Create docker group with the socket's GID - groupadd -g "$DOCKER_SOCK_GID" docker 2>/dev/null || true + create_group_with_gid "$DOCKER_SOCK_GID" docker # Add charon user to the docker group - usermod -aG docker charon 2>/dev/null || true + add_user_to_group charon docker echo "Docker integration enabled for charon user" else # Group exists, just add charon to it - GROUP_NAME=$(getent group "$DOCKER_SOCK_GID" | cut -d: -f1) + GROUP_NAME=$(echo "$GROUP_ENTRY" | cut -d: -f1) echo "Docker socket detected (gid=$DOCKER_SOCK_GID, group=$GROUP_NAME) - adding charon user..." - usermod -aG "$GROUP_NAME" charon 2>/dev/null || true + add_user_to_group charon "$GROUP_NAME" echo "Docker integration enabled for charon user" fi fi diff --git a/.github/workflows/docker-build.yml b/.github/workflows/docker-build.yml index 12b4f78b..e2c99854 100644 --- a/.github/workflows/docker-build.yml +++ b/.github/workflows/docker-build.yml @@ -112,12 +112,12 @@ jobs: - name: Set up Docker Buildx if: steps.skip.outputs.skip_build != 'true' uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # v3.12.0 - - name: Resolve Debian base image digest + - name: Resolve Alpine base image digest if: steps.skip.outputs.skip_build != 'true' id: caddy run: | - docker pull debian:trixie-slim - DIGEST=$(docker inspect --format='{{index .RepoDigests 0}}' debian:trixie-slim) + docker pull alpine:3.23.3 + DIGEST=$(docker inspect --format='{{index .RepoDigests 0}}' alpine:3.23.3) echo "image=$DIGEST" >> $GITHUB_OUTPUT - name: Log in to GitHub Container Registry @@ -744,7 +744,7 @@ jobs: -p 80:80 \ ${{ env.GHCR_REGISTRY }}/${{ env.IMAGE_NAME }}:${{ steps.tag.outputs.tag }} - # Wait for container to be healthy (max 3 minutes - Debian needs more startup time) + # Wait for container to be healthy (max 3 minutes) echo "Waiting for container to start..." 
timeout 180s bash -c 'until docker exec test-container curl -sf http://localhost:8080/api/v1/health 2>/dev/null | grep -q "status"; do echo "Waiting..."; sleep 2; done' || { echo "❌ Container failed to become healthy" diff --git a/Dockerfile b/Dockerfile index 374e4b3d..808c9f88 100644 --- a/Dockerfile +++ b/Dockerfile @@ -17,13 +17,12 @@ ARG BUILD_DEBUG=0 ## If the requested tag isn't available, fall back to a known-good v2.11.0-beta.2 build. ARG CADDY_VERSION=2.11.0-beta.2 ## When an official caddy image tag isn't available on the host, use a -## plain Debian slim base image and overwrite its caddy binary with our +## plain Alpine base image and overwrite its caddy binary with our ## xcaddy-built binary in the later COPY step. This avoids relying on ## upstream caddy image tags while still shipping a pinned caddy binary. -## Using trixie (Debian 13 testing) for faster security updates - bookworm -## packages marked "wont-fix" are actively maintained in trixie. -# renovate: datasource=docker depName=debian versioning=docker -ARG CADDY_IMAGE=debian:trixie-slim@sha256:f6e2cfac5cf956ea044b4bd75e6397b4372ad88fe00908045e9a0d21712ae3ba +## Alpine 3.23 base to reduce glibc CVE exposure and image size. 
+# renovate: datasource=docker depName=alpine versioning=docker +ARG CADDY_IMAGE=alpine:3.23.3 # ---- Cross-Compilation Helpers ---- # renovate: datasource=docker depName=tonistiigi/xx @@ -35,7 +34,7 @@ FROM --platform=$BUILDPLATFORM tonistiigi/xx:1.9.0@sha256:c64defb9ed5a91eacb37f9 # CVEs fixed: CVE-2023-24531, CVE-2023-24540, CVE-2023-29402, CVE-2023-29404, # CVE-2023-29405, CVE-2024-24790, CVE-2025-22871, and 15 more # renovate: datasource=docker depName=golang -FROM --platform=$BUILDPLATFORM golang:1.25-trixie@sha256:f6a22bdb1d575b3f71c3d11b6ab09aef8f8ca3b0f1324ad944d80c14cc3fbe96 AS gosu-builder +FROM --platform=$BUILDPLATFORM golang:1.25-alpine AS gosu-builder COPY --from=xx / / WORKDIR /tmp/gosu @@ -46,11 +45,9 @@ ARG TARGETARCH # renovate: datasource=github-releases depName=tianon/gosu ARG GOSU_VERSION=1.17 -RUN apt-get update && apt-get install -y --no-install-recommends \ - git clang lld \ - && rm -rf /var/lib/apt/lists/* +RUN apk add --no-cache git clang lld # hadolint ignore=DL3059 -RUN xx-apt install -y gcc libc6-dev +RUN xx-apk add --no-cache gcc musl-dev # Clone and build gosu from source with modern Go RUN git clone --depth 1 --branch "${GOSU_VERSION}" https://github.com/tianon/gosu.git . 
@@ -65,7 +62,7 @@ RUN --mount=type=cache,target=/root/.cache/go-build \ # ---- Frontend Builder ---- # Build the frontend using the BUILDPLATFORM to avoid arm64 musl Rollup native issues # renovate: datasource=docker depName=node -FROM --platform=$BUILDPLATFORM node:24.13.0-slim@sha256:4660b1ca8b28d6d1906fd644abe34b2ed81d15434d26d845ef0aced307cf4b6f AS frontend-builder +FROM --platform=$BUILDPLATFORM node:24.13.0-alpine AS frontend-builder WORKDIR /app/frontend # Copy frontend package files @@ -89,21 +86,19 @@ RUN --mount=type=cache,target=/app/frontend/node_modules/.cache \ # ---- Backend Builder ---- # renovate: datasource=docker depName=golang -FROM --platform=$BUILDPLATFORM golang:1.25-trixie@sha256:f6a22bdb1d575b3f71c3d11b6ab09aef8f8ca3b0f1324ad944d80c14cc3fbe96 AS backend-builder +FROM --platform=$BUILDPLATFORM golang:1.25-alpine AS backend-builder # Copy xx helpers for cross-compilation COPY --from=xx / / WORKDIR /app/backend # Install build dependencies -# xx-apt installs packages for the TARGET architecture +# xx-apk installs packages for the TARGET architecture ARG TARGETPLATFORM ARG TARGETARCH -RUN apt-get update && apt-get install -y --no-install-recommends \ - clang lld \ - && rm -rf /var/lib/apt/lists/* +RUN apk add --no-cache clang lld # hadolint ignore=DL3059 -RUN xx-apt install -y gcc libc6-dev libsqlite3-dev +RUN xx-apk add --no-cache gcc musl-dev sqlite-dev # Install Delve (cross-compile for target) # Note: xx-go install puts binaries in /go/bin/TARGETOS_TARGETARCH/dlv if cross-compiling. @@ -162,15 +157,14 @@ RUN --mount=type=cache,target=/root/.cache/go-build \ # Build Caddy from source to ensure we use the latest Go version and dependencies # This fixes vulnerabilities found in the pre-built Caddy images (e.g. 
CVE-2025-59530, stdlib issues) # renovate: datasource=docker depName=golang -FROM --platform=$BUILDPLATFORM golang:1.25-trixie@sha256:f6a22bdb1d575b3f71c3d11b6ab09aef8f8ca3b0f1324ad944d80c14cc3fbe96 AS caddy-builder +FROM --platform=$BUILDPLATFORM golang:1.25-alpine AS caddy-builder ARG TARGETOS ARG TARGETARCH ARG CADDY_VERSION # renovate: datasource=go depName=github.com/caddyserver/xcaddy ARG XCADDY_VERSION=0.4.5 -RUN apt-get update && apt-get install -y --no-install-recommends git \ - && rm -rf /var/lib/apt/lists/* +RUN apk add --no-cache git # hadolint ignore=DL3062 RUN --mount=type=cache,target=/go/pkg/mod \ go install github.com/caddyserver/xcaddy/cmd/xcaddy@v${XCADDY_VERSION} @@ -227,7 +221,7 @@ RUN --mount=type=cache,target=/root/.cache/go-build \ # Build CrowdSec from source to ensure we use Go 1.25.5+ and avoid stdlib vulnerabilities # (CVE-2025-58183, CVE-2025-58186, CVE-2025-58187, CVE-2025-61729) # renovate: datasource=docker depName=golang versioning=docker -FROM --platform=$BUILDPLATFORM golang:1.25.7-trixie@sha256:86d4bd34f4ca0536082637663aa6959c562ceb0161b289dc7592112228735272 AS crowdsec-builder +FROM --platform=$BUILDPLATFORM golang:1.25.7-alpine AS crowdsec-builder COPY --from=xx / / WORKDIR /tmp/crowdsec @@ -241,11 +235,9 @@ ARG CROWDSEC_VERSION=1.7.6 # CrowdSec fallback tarball checksum (v${CROWDSEC_VERSION}) ARG CROWDSEC_RELEASE_SHA256=704e37121e7ac215991441cef0d8732e33fa3b1a2b2b88b53a0bfe5e38f863bd -RUN apt-get update && apt-get install -y --no-install-recommends \ - git clang lld \ - && rm -rf /var/lib/apt/lists/* +RUN apk add --no-cache git clang lld # hadolint ignore=DL3059 -RUN xx-apt install -y gcc libc6-dev +RUN xx-apk add --no-cache gcc musl-dev # Clone CrowdSec source RUN git clone --depth 1 --branch "v${CROWDSEC_VERSION}" https://github.com/crowdsecurity/crowdsec.git . 
@@ -285,8 +277,8 @@ RUN mkdir -p /crowdsec-out/config && \ cp -r config/* /crowdsec-out/config/ || true # ---- CrowdSec Fallback (for architectures where build fails) ---- -# renovate: datasource=docker depName=debian -FROM debian:trixie-slim@sha256:f6e2cfac5cf956ea044b4bd75e6397b4372ad88fe00908045e9a0d21712ae3ba AS crowdsec-fallback +# renovate: datasource=docker depName=alpine versioning=docker +FROM alpine:3.23.3 AS crowdsec-fallback WORKDIR /tmp/crowdsec @@ -296,10 +288,7 @@ ARG TARGETARCH ARG CROWDSEC_VERSION=1.7.6 ARG CROWDSEC_RELEASE_SHA256=704e37121e7ac215991441cef0d8732e33fa3b1a2b2b88b53a0bfe5e38f863bd -# Note: Debian slim does NOT include tar by default - must be explicitly installed -RUN apt-get update && apt-get install -y --no-install-recommends \ - curl ca-certificates tar \ - && rm -rf /var/lib/apt/lists/* +RUN apk add --no-cache curl ca-certificates # Download static binaries as fallback (only available for amd64) # For other architectures, create empty placeholder files so COPY doesn't fail @@ -332,10 +321,9 @@ WORKDIR /app # Note: gosu is now built from source (see gosu-builder stage) to avoid CVEs from Debian's pre-compiled version # Explicitly upgrade packages to fix security vulnerabilities # binutils provides objdump for debug symbol detection in docker-entrypoint.sh -RUN apt-get update && apt-get install -y --no-install-recommends \ - bash ca-certificates libsqlite3-0 sqlite3 tzdata curl gettext-base libcap2-bin libc-ares2 binutils \ - && apt-get upgrade -y \ - && rm -rf /var/lib/apt/lists/* +RUN apk add --no-cache \ + bash ca-certificates sqlite-libs sqlite tzdata curl gettext libcap libcap-utils \ + c-ares binutils libc-utils busybox-extras # Copy gosu binary from gosu-builder (built with Go 1.25+ to avoid stdlib CVEs) COPY --from=gosu-builder /gosu-out/gosu /usr/sbin/gosu @@ -343,8 +331,8 @@ RUN chmod +x /usr/sbin/gosu # Security: Create non-root user and group for running the application # This follows the principle of least privilege 
(CIS Docker Benchmark 4.1) -RUN groupadd -g 1000 charon && \ -    useradd -u 1000 -g charon -d /app -s /usr/sbin/nologin -M charon +RUN addgroup -g 1000 -S charon && \ +    adduser -u 1000 -S -G charon -h /app -s /sbin/nologin charon # Download MaxMind GeoLite2 Country database # Note: In production, users should provide their own MaxMind license key @@ -428,7 +416,8 @@ COPY scripts/ /app/scripts/ RUN chmod +x /app/scripts/db-recovery.sh # Set default environment variables -ENV CHARON_ENV=production \ +ENV GODEBUG=netdns=go \ +    CHARON_ENV=production \ CHARON_DB_PATH=/app/data/charon.db \ CHARON_FRONTEND_DIR=/app/frontend/dist \ CHARON_CADDY_ADMIN_API=http://localhost:2019 \ diff --git a/docs/plans/current_spec.md b/docs/plans/current_spec.md index e699eaf4..a1322a31 100644 --- a/docs/plans/current_spec.md +++ b/docs/plans/current_spec.md @@ -1,326 +1,282 @@ --- -title: "CI Docker Build and Scanning Blocker (PR #666)" +title: "Migration to Alpine (Issue #631)" status: "draft" -scope: "ci/docker-build-scan" +scope: "docker/alpine-migration" +notes: "This plan has yet to be finished. You may add to it, but **DO NOT** overwrite it until completion of PR #666." --- ## 1. Introduction -This plan addresses the CI failure that blocks Docker build and scanning -for PR #666. The goal is to restore a clean, deterministic pipeline -where the image builds once, scans consistently, and security artifacts -align across workflows. The approach is minimal and evidence-driven: -collect logs, map the path, isolate the blocker, and apply the smallest -effective fix. +This plan defines the migration of the Charon Docker image base from +Debian Trixie Slim to Alpine Linux to address inherited glibc CVEs and +reduce image size (Issue #631). The plan consolidates the prior Alpine +migration research and translates it into a minimal-change, test-first +implementation path aligned with current CI and container workflows. Objectives: -- Identify the exact failing step in the build/scan chain. 
-- Trace the failure to a reproducible root cause. -- Propose minimal workflow/Dockerfile changes to restore green CI. -- Ensure all scan workflows resolve the same PR image. -- Review .gitignore, codecov.yml, .dockerignore, and Dockerfile if - needed for artifact hygiene. +- Replace Debian-based runtime with Alpine 3.23.x while maintaining + feature parity. +- Eliminate Debian glibc HIGH CVEs in the runtime image. +- Keep build stages compatible with multi-arch Buildx and existing + supply chain checks. +- Validate DNS resolution, SQLite (CGO) behavior, and security suite + functionality under musl. +- Review and update .gitignore, codecov.yml, .dockerignore, and + Dockerfile as needed. ## 2. Research Findings -CI workflow and build context (already reviewed): +### 2.1 Existing Plans and Security Context -- Docker build orchestration: .github/workflows/docker-build.yml -- Security scan for PR artifacts: .github/workflows/security-pr.yml -- Supply-chain verification for PRs: .github/workflows/supply-chain-pr.yml -- SBOM verification for non-PR builds: .github/workflows/supply-chain-verify.yml -- Dockerfile linting: .github/workflows/docker-lint.yml and - .hadolint.yaml -- Weekly rebuild and scan: .github/workflows/security-weekly-rebuild.yml -- Quality checks (non-Docker): .github/workflows/quality-checks.yml -- Build context filters: .dockerignore -- Runtime Docker build instructions: Dockerfile -- Ignored artifacts: .gitignore -- Coverage configuration: codecov.yml +- Alpine migration specification already exists and is comprehensive: + docs/plans/alpine_migration_spec.md. +- Debian CVE acceptance is temporary and explicitly tied to Alpine + migration: + docs/security/VULNERABILITY_ACCEPTANCE.md. +- Past Alpine-related issues and trade-offs are documented, including + musl DNS differences: + docs/analysis/crowdsec_integration_failure_analysis.md. 
-Observed from the public workflow summary (PR #666): +### 2.2 Current Docker and CI Touchpoints -- Job build-and-push failed in the Docker Build, Publish & Test workflow. -- Logs require GitHub authentication; obtained via gh CLI after auth. -- Evidence status: confirmed via gh CLI logs (see Results). +Primary files that must be considered for the migration: -Root cause captured from CI logs (authenticated gh CLI): +- Dockerfile (multi-stage build with Debian runtime base). +- .docker/docker-entrypoint.sh (uses user/group management and tools + that differ on Alpine). +- .docker/compose/docker-compose.yml (image tag references). +- .github/workflows/docker-build.yml (base image digest resolution and + build args). +- .github/workflows/security-pr.yml and supply-chain-pr.yml (build and + scan behaviors depend on the container layout). +- tools/dockerfile_check.sh (package manager validation). -- npm ci failed with ERESOLVE due to eslint@10 conflicting with the - @typescript-eslint peer dependency range. +### 2.3 Compatibility Summary (musl vs glibc) -Secondary/unconfirmed mismatch to verify only if remediation fails: +Based on alpine_migration_spec.md and current runtime behavior: -- PR tags are generated as pr-{number}-{short-sha} in docker-build.yml. -- Several steps reference pr-{number} (no short SHA) and use - --pull=never. -- This can cause image-not-found errors after Buildx pushes without - --load. +- Go services and Caddy/CrowdSec are Go binaries and compatible with + musl. +- SQLite is CGO-backed; ensure CGO remains enabled and libsqlite3 is + available under musl, then validate runtime CRUD behavior. +- DNS resolution differences are the primary operational risk; + mitigation is available via the GODEBUG=netdns=go environment + variable. +- Entrypoint uses Debian-specific user/group tools; Alpine requires + adduser/addgroup or the shadow package. ## 3. 
Technical Specifications -### 3.1 CI Flow Map (Build -> Scan -> Verify) +### 3.1 Target Base Image + +- Runtime base: alpine:3.23.x pinned by digest (Renovate-managed). +- Build stages: switch to alpine-based golang/node images where required + to use apk/xx-apk consistently. +- Build-stage images should be digest-pinned when feasible. If a digest + pin is not practical (e.g., multi-arch tag compatibility), document + the reason and keep the tag Renovate-managed. + +### 3.2 Dockerfile Changes (Stage-by-Stage) + +Stages and expected changes (paths and stage names are current): + +1) gosu-builder (Dockerfile): + - Replace apt-get with apk. + - Replace xx-apt with xx-apk. + - Expected packages: git, clang, lld, gcc, musl-dev. + +2) frontend-builder (Dockerfile): + - Use node:24.x-alpine. + - Keep npm_config_rollup_skip_nodejs_native settings for cross-arch + builds. + +3) backend-builder (Dockerfile): + - Replace apt-get with apk. + - Replace xx-apt with xx-apk. + - Expected packages: clang, lld, gcc, musl-dev, sqlite-dev. + +4) caddy-builder (Dockerfile): + - Replace apt-get with apk. + - Expected packages: git. + +5) crowdsec-builder (Dockerfile): + - Replace apt-get with apk. + - Replace xx-apt with xx-apk. + - Expected packages: git, clang, lld, gcc, musl-dev. + +6) crowdsec-fallback (Dockerfile): + - Replace debian:trixie-slim with alpine:3.23.x. + - Use apk add curl ca-certificates (tar is provided by busybox). + +7) final runtime stage (Dockerfile): + - Replace CADDY_IMAGE base from Debian to Alpine. + - Replace apt-get with apk add. + - Runtime packages: bash, ca-certificates, sqlite-libs, sqlite, + tzdata, curl, gettext, libcap, c-ares, binutils, libc-utils + (for getent), busybox-extras or coreutils (for timeout), + libcap-utils (for setcap). + - Add ENV GODEBUG=netdns=go to mitigate musl DNS edge cases. + +### 3.3 Entrypoint Adjustments + +File: .docker/docker-entrypoint.sh + +Functions and command usage that must be Alpine-safe: + +- is_root(): no change. 
+- run_as_charon(): no change. +- Docker socket group handling: + - Replace groupadd/usermod with addgroup/adduser if shadow tools are + not installed. + - If using getent, ensure libc-utils is installed or implement a + /etc/group parsing fallback. +- CrowdSec initialization: + - Ensure sed -i usage is compatible with busybox sed. + - Verify timeout is available (busybox provides timeout). + +### 3.4 CI and Workflow Updates + +File: .github/workflows/docker-build.yml + +- Replace "Resolve Debian base image digest" step to pull and resolve + alpine:3.23.x digest. +- Update CADDY_IMAGE build-arg to use the Alpine digest. +- Ensure buildx cache and tag logic remain unchanged. + +No changes are expected to security-pr.yml and supply-chain-pr.yml +unless the container layout changes (paths used for binary extraction +and SBOM remain consistent). + +### 3.5 Data Flow and Runtime Behavior ```mermaid flowchart LR - A[PR Push] --> B[docker-build.yml: build-and-push] - B --> C[docker-build.yml: scan-pr-image] - B --> D[security-pr.yml: Trivy binary scan] - B --> E[supply-chain-pr.yml: SBOM + Grype] - B --> F[supply-chain-verify.yml: SBOM verify (non-PR)] + A[Docker Build] --> B[Multi-stage build on Alpine] + B --> C[Runtime: alpine base + charon + caddy + crowdsec] + C --> D[Entrypoint initializes volumes, CrowdSec, Caddy] + D --> E[Charon API + UI] ``` -### 3.2 Primary Failure Hypotheses (Ordered) +### 3.6 Requirements (EARS Notation) -1) Eslint peer dependency conflict (confirmed root cause) - -- npm ci failed with ERESOLVE due to eslint@10 conflicting with the - @typescript-eslint peer dependency range. - -2) Tag mismatch between build output and verification steps - -- Build tags for PRs are pr-{number}-{short-sha} (metadata action). -- Verification steps reference pr-{number} (no SHA) and do not pull. -- This is consistent with image-not-found errors. - Status: unconfirmed secondary hypothesis. 
- -3) Buildx push without local image for verification steps - -- Build uses docker buildx build --push without --load. -- Verification steps use docker run --pull=never with local tags. -- Buildx does not allow --load with multi-arch builds; --load only - produces a single-platform image. For multi-arch, prioritize pull by - digest or publish a single-platform build output for local checks. -- If the tag is not local and not pulled, verification fails. - -4) Dockerfile stage failure during network-heavy steps - -- gosu-builder: git clone and Go build -- frontend-builder: npm ci / npm run build -- backend-builder: go mod download / xx-go build -- caddy-builder: xcaddy build and Go dependency patching -- crowdsec-builder: git clone + go get + sed patch -- GeoLite2 download and checksum verification - -Any of these can fail with network timeouts or dependency resolution -errors in CI. The eslint peer dependency conflict is confirmed; other -hypotheses remain unconfirmed. - -### 3.3 Evidence Required (Single-Request Capture) - -Evidence capture completed in a single session. The following items -were captured: - -- Full logs for the failing docker-build.yml build-and-push job -- Failing step name and exit code -- Buildx command line as executed -- Metadata tags produced by docker/metadata-action -- Dockerfile stage that failed (if build failure) - -If accessible, also capture downstream scan job logs to confirm the image -reference used. 
- -### 3.4 Specific Files and Components to Investigate - -Docker build and tagging: - -- .github/workflows/docker-build.yml - - Generate Docker metadata (tag formatting) - - Build and push Docker image (with retry) - - Verify Caddy Security Patches - - Verify CrowdSec Security Patches - - Job: scan-pr-image - -Security scanning: - -- .github/workflows/security-pr.yml - - Extract PR number from workflow_run - - Extract charon binary from container (image reference) - - Trivy scans (fs, SARIF, blocking table) - -Supply-chain verification: - -- .github/workflows/supply-chain-pr.yml - - Check for PR image artifact - - Load Docker image (artifact) - - Build Docker image (Local) - -Dockerfile stages and critical components: - -- gosu-builder: /tmp/gosu build -- frontend-builder: /app/frontend build -- backend-builder: /app/backend build -- caddy-builder: xcaddy build and Go dependency patching -- crowdsec-builder: Go build and sed patch in - pkg/exprhelpers/debugger.go -- Final runtime stage: GeoLite2 download and checksum - -### 3.5 Tag and Digest Source-of-Truth Propagation - -Source of truth for the PR image reference is the output of the metadata -and build steps in docker-build.yml. Downstream workflows must consume a -single canonical reference, defined as: - -- primary: digest from buildx outputs (immutable) -- secondary: pr-{number}-{short-sha} tag (human-friendly) - -Propagation rules: - -- docker-build.yml SHALL publish the digest and tag as job outputs. -- docker-build.yml SHALL write digest and tag to a small artifact - (e.g., pr-image-ref.txt) for downstream workflow_run consumers. -- security-pr.yml and supply-chain-pr.yml SHALL prefer the digest from - outputs or artifact, and only fall back to tag if digest is absent. -- Any step that runs a local container SHALL ensure the referenced image - is available by either --load (local) or explicit pull by digest. 
- -### 3.6 Required Outcome (EARS Requirements) - -- WHEN a pull request triggers docker-build.yml, THE SYSTEM SHALL build a - PR image tagged as pr-{number}-{short-sha} and emit its digest. -- WHEN verification steps run in docker-build.yml, THE SYSTEM SHALL - reference the same digest or tag emitted by the build step. -- WHEN security-pr.yml runs for a workflow_run, THE SYSTEM SHALL resolve - the PR image using the digest or the emitted tag. -- WHEN supply-chain-pr.yml runs, THE SYSTEM SHALL load the exact PR image - by digest or by the emitted tag without ambiguity. -- IF the image reference cannot be resolved, THEN THE SYSTEM SHALL fail - fast with a clear message that includes the expected digest and tag. - -### 3.7 Config Hygiene Review (Requested Files) - -.gitignore: - -- Ensure CI scan artifacts are ignored locally, including any new names - introduced by fixes (e.g., trivy-pr-results.sarif, - trivy-binary-results.sarif, grype-results.json, - sbom.cyclonedx.json). - -codecov.yml: - -- Confirm CI-generated security artifacts are excluded from coverage. -- Add any new artifact names if introduced by fixes. - -.dockerignore: - -- Verify required frontend/backend sources and manifests are included. - -Dockerfile: - -- Review GeoLite2 download behavior for CI reliability. -- Confirm CADDY_IMAGE build-arg naming consistency across workflows. +- WHEN the Docker image is built, THE SYSTEM SHALL use Alpine 3.23.x + as the runtime base image. +- WHEN the container starts, THE SYSTEM SHALL create the charon user + and groups using Alpine-compatible tools. +- WHEN DNS resolution is performed, THE SYSTEM SHALL use the Go DNS + resolver to avoid musl NSS limitations. +- WHEN SQLite-backed operations run, THE SYSTEM SHALL read and write + data with CGO enabled and no schema errors under musl. +- IF Alpine package CVEs reappear at HIGH or CRITICAL, THEN THE SYSTEM + SHALL fail the security gate and block release. ## 4. 
Implementation Plan (Minimal-Request Phases) -### Phase 0: Evidence Capture (Single Request) +### Phase 1: Playwright Tests (Behavior Baseline) -Status: completed. Evidence captured and root cause confirmed. +- Rebuild the E2E container when Docker build inputs change, then run + E2E smoke tests before any unit or integration tests to establish the + UI baseline (tests/). Focus on login, proxy host CRUD, security + toggles. +- Record baseline timings for key flows to compare after migration. -- Retrieve full logs for the failing docker-build.yml build-and-push job. -- Capture the exact failing step, error output, and emitted tags/digest. -- Record the buildx command output as executed. -- Capture downstream scan logs if accessible to confirm image reference. +### Phase 2: Backend Implementation (Runtime and Container) -### Phase 1: Reproducibility Pass (Single Local Build) +- Update Dockerfile stages to Alpine equivalents (see Section 3.2). +- Update .docker/docker-entrypoint.sh for Alpine user/group commands and + tool availability (see Section 3.3). +- Add ENV GODEBUG=netdns=go to Dockerfile runtime stage. +- Update tools/dockerfile_check.sh to validate apk and xx-apk usage in + Alpine-based stages, replacing any Debian-specific checks. +- Run tools/dockerfile_check.sh and capture results for apk/xx-apk + verification. +- Validate crowdsec and caddy binaries remain in the same paths: + /usr/bin/caddy, /usr/local/bin/crowdsec, /usr/local/bin/cscli. -- Run a local docker buildx build using the same arguments as - docker-build.yml. -- Capture any stage failures and map them to Dockerfile stages. -- Confirm whether Buildx produces local images or only remote tags. +### Phase 3: Frontend Implementation -### Phase 2: Root Cause Isolation +- No application-level frontend changes expected. +- Ensure frontend build stage uses node:24.x-alpine in Dockerfile. -Status: completed. Root cause identified as the eslint peer dependency -conflict in the frontend build stage. 
+### Phase 4: Integration and Testing -- If failure is tag mismatch, trace tag references across docker-build.yml, - security-pr.yml, and supply-chain-pr.yml. -- If failure is a Dockerfile stage, isolate to specific step (gosu, - frontend, backend, caddy, crowdsec, GeoLite2). -- If failure is network-related, document retries/timeout behavior and - any missing mirrors. +- Rebuild E2E container and run Playwright suite (Docker mode). +- Run targeted integration tests: + - CrowdSec integration workflows. + - WAF and rate-limit workflows. +- Validate DNS challenges for at least one provider (Cloudflare). +- Validate SQLite CGO operations using health endpoints and basic CRUD. +- Validate multi-arch Buildx output and supply-chain workflows for the + Docker image: + - .github/workflows/docker-build.yml + - .github/workflows/security-pr.yml + - .github/workflows/supply-chain-pr.yml +- Run Trivy image scan and verify no HIGH/CRITICAL findings. -### Phase 3: Targeted Remediation Plan +### Phase 5: Documentation and Deployment -Focus on validating the eslint remediation. Revisit secondary -hypotheses only if the remediation does not resolve CI. +- Update ARCHITECTURE.md to reflect Alpine base image. +- Update docs/security/VULNERABILITY_ACCEPTANCE.md to close the Debian + CVE acceptance and note Alpine status. +- Update any Docker guidance in README or .docker/README.md if it + references Debian. -Conditional options (fallbacks, unconfirmed): +## 5. Config Hygiene Review (Requested Files) -Option A (Tag alignment): +### 5.1 .gitignore -- Update verification steps to use pr-{number}-{short-sha} tag. -- Or add a secondary tag pr-{number} for compatibility. +- No new ignore patterns required for Alpine migration. +- Verify no new build artifacts are introduced (apk cache is in-image + only). -Option B (Local image availability): +### 5.2 .dockerignore -- Add --load for PR builds so verification can run locally. 
-- Or explicitly pull by digest/tag before verification and remove - --pull=never. +- No changes required; keep excluding docs and CI artifacts to minimize + build context size. -Option C (Workflow scan alignment): +### 5.3 codecov.yml -- Update security-pr.yml and supply-chain-pr.yml to consume the digest - or emitted tag from docker-build.yml outputs/artifact. -- Add fallback order: digest artifact -> emitted tag -> local build. +- No changes required; migration does not add new code paths that should + be excluded from coverage. -## 5. Results (Evidence) +### 5.4 Dockerfile (Required) -Evidence status: confirmed via gh CLI logs after authentication. - -Root cause (confirmed): - -- Align eslint with the @typescript-eslint peer range to resolve npm ci - ERESOLVE in the frontend build stage. - -### Phase 4: Validation (Minimal Jobs) - -- Rerun docker-build.yml for the PR (or workflow_dispatch). -- Confirm build-and-push succeeds and verification steps resolve the - exact digest or tag. -- Confirm security-pr.yml and supply-chain-pr.yml resolve the same - digest or tag and complete scans. -- Deterministic check: use docker buildx imagetools inspect on the - emitted tag and compare the reported digest to the recorded build - digest, or pull by digest and verify the digest of the local image - matches the build output. - -### Phase 5: Documentation and Hygiene - -- Document the final tag/digest propagation in this plan. -- Update .gitignore / .dockerignore / codecov.yml if new artifacts are - produced. +- Update base images and package manager usage per Section 3.2. +- Add GODEBUG=netdns=go in runtime stage. +- Replace useradd/groupadd with adduser/addgroup or add shadow tools if + preferred. ## 6. Acceptance Criteria -- docker-build.yml build-and-push succeeds for PR #666. -- Verification steps resolve the same digest or tag emitted by build. -- security-pr.yml and supply-chain-pr.yml consume the same digest or tag - published by docker-build.yml. 
-- A validation check confirms tag-to-digest alignment across workflows - (digest matches tag for the PR image), using buildx imagetools inspect - or an equivalent digest comparison. -- No new CI artifacts are committed to the repository. -- Root cause is documented with logs and mapped to specific steps. +- The Docker image builds on Alpine with no build-stage failures. +- Runtime container starts with non-root user and no permission errors. +- All Playwright E2E tests pass against the Alpine-based container. +- Integration tests (CrowdSec, WAF, Rate Limit) pass without regressions. +- Trivy image scan reports zero HIGH/CRITICAL CVEs in the runtime image. +- tools/dockerfile_check.sh passes with apk and xx-apk checks for all + Alpine-based stages. +- Multi-arch Buildx validation succeeds and supply-chain workflows + (docker-build.yml, security-pr.yml, supply-chain-pr.yml) complete with + no regressions. +- ARCHITECTURE.md and security acceptance docs reflect Alpine as the + runtime base. ## 7. Risks and Mitigations -- Risk: CI logs are inaccessible without login, delaying diagnosis. - - Mitigation: request logs or export them once, then reproduce locally. +- Risk: musl DNS resolver differences cause ACME or webhook failures. + - Mitigation: set GODEBUG=netdns=go and run DNS provider tests. -- Risk: Multiple workflows use divergent tag formats. - - Mitigation: define a single source of truth for PR tags and digest - propagation. +- Risk: Alpine user/group tooling mismatch breaks Docker socket handling. + - Mitigation: adjust entrypoint to use adduser/addgroup or install + shadow tools and musl-utils for getent. -- Risk: Buildx produces only remote tags, breaking local verification. - - Mitigation: add --load for PR builds or pull by digest before - verification. +- Risk: SQLite CGO builds hit musl compatibility issues. + - Mitigation: run database integrity checks and CRUD tests. ## 8. 
Confidence Score -Confidence: 88 percent +Confidence: 84 percent -Rationale: The eslint peer dependency conflict is confirmed as the -frontend build failure. Secondary tag mismatch hypotheses remain -unconfirmed and are now conditional fallbacks only. +Rationale: Alpine migration has a detailed existing spec and low code +surface change, but runtime differences (musl DNS, user/group tooling) +require careful validation. diff --git a/tools/dockerfile_check.sh b/tools/dockerfile_check.sh index 906483c4..ad66d4c3 100755 --- a/tools/dockerfile_check.sh +++ b/tools/dockerfile_check.sh @@ -13,46 +13,43 @@ fi echo "Checking $DOCKERFILE for base image / package manager mismatches..." -# Read file content -dockerfile_content=$(cat "$DOCKERFILE") +checking_stage=false +current_stage="" +stage_base="" -# Check for golang:latest or golang:1.x (Debian) with apk commands in the same stage while IFS= read -r line; do - if echo "$line" | grep -qE "^FROM\s+golang:(latest|[0-9]+(\.[0-9]+)?)\s"; then - # Found a Debian-based golang image, check the next 20 lines for apk - current_stage="$line" + if echo "$line" | grep -qE "^FROM\s+"; then checking_stage=true - elif echo "$line" | grep -qE "^FROM\s+" && [ "$checking_stage" = true ]; then - # New FROM statement, reset - checking_stage=false + current_stage="$line" + stage_base="unknown" + + if echo "$line" | grep -qi "alpine"; then + stage_base="alpine" + elif echo "$line" | grep -qiE "debian|ubuntu|trixie|bookworm|bullseye"; then + stage_base="debian" + fi fi - if [ "$checking_stage" = true ] && echo "$line" | grep -qE "RUN.*apk\s+(add|update|del)"; then - echo "❌ ERROR: Found Debian-based golang image with Alpine package manager (apk)" - echo " Stage: $current_stage" - echo " Command: $line" - echo " Fix: Use 'golang:alpine' or 'golang:1.x-alpine' instead" - exit 1 + if [ "$checking_stage" = true ] && [ "$stage_base" = "alpine" ]; then + if echo "$line" | grep -qE "(apt-get|apt)\s+(install|update)" || echo "$line" | grep -qE 
"xx-apt\s+"; then + echo "❌ ERROR: Found Alpine-based image with Debian package manager (apt/apt-get)" + echo " Stage: $current_stage" + echo " Command: $line" + echo " Fix: Use 'apk add' or 'xx-apk' instead" + exit 1 + fi + fi + + if [ "$checking_stage" = true ] && [ "$stage_base" = "debian" ]; then + if echo "$line" | grep -qE "apk\s+(add|update|del)" || echo "$line" | grep -qE "xx-apk\s+"; then + echo "❌ ERROR: Found Debian-based image with Alpine package manager (apk)" + echo " Stage: $current_stage" + echo " Command: $line" + echo " Fix: Use 'apt-get' or 'xx-apt' instead" + exit 1 + fi fi done < "$DOCKERFILE" -# Check for node:latest or node:XX (Debian) with apk commands -if echo "$dockerfile_content" | grep -E "FROM\s+node:(latest|[0-9]+)\s" > /dev/null; then - if echo "$dockerfile_content" | grep -A 10 "FROM\s\+node:(latest|[0-9]\+)\s" | grep -E "RUN.*apk\s+(add|update)" > /dev/null; then - echo "❌ ERROR: Found Debian-based node image (node:latest or node:XX) with Alpine package manager (apk)" - echo " Fix: Use 'node:alpine' or 'node:XX-alpine' instead" - exit 1 - fi -fi - -# Check for alpine images with apt/apt-get -if echo "$dockerfile_content" | grep -E "FROM\s+.*:.*alpine" > /dev/null; then - if echo "$dockerfile_content" | grep -A 10 "FROM\s\+.*:.*alpine" | grep -E "RUN.*(apt-get|apt)\s+(install|update)" > /dev/null; then - echo "❌ ERROR: Found Alpine-based image with Debian package manager (apt/apt-get)" - echo " Fix: Use 'apk add' instead of 'apt install'" - exit 1 - fi -fi - echo "✓ Dockerfile validation passed" exit 0