diff --git a/.github/workflows/ci-pipeline.yml b/.github/workflows/ci-pipeline.yml index fbbb8bde..d60390ec 100644 --- a/.github/workflows/ci-pipeline.yml +++ b/.github/workflows/ci-pipeline.yml @@ -463,7 +463,8 @@ jobs: name: E2E Tests with Coverage needs: - build-image - if: (github.event_name != 'workflow_dispatch' || inputs.run_e2e != false) && needs.build-image.result == 'success' + - integration-gate + if: always() && (github.event_name != 'workflow_dispatch' || inputs.run_e2e != false) && needs.build-image.result == 'success' && (needs.integration-gate.result == 'success' || needs.integration-gate.result == 'skipped') uses: ./.github/workflows/e2e-tests-split.yml with: browser: all @@ -478,7 +479,8 @@ jobs: runs-on: ubuntu-latest needs: - e2e - if: github.event_name != 'workflow_dispatch' || inputs.run_e2e != false + - integration-gate + if: always() && (github.event_name != 'workflow_dispatch' || inputs.run_e2e != false) && (needs.integration-gate.result == 'success' || needs.integration-gate.result == 'skipped') steps: - name: Verify E2E results run: | @@ -491,8 +493,8 @@ jobs: name: Coverage - Backend runs-on: ubuntu-latest needs: - - build-image - if: github.event_name != 'workflow_dispatch' || inputs.run_coverage != false + - integration-gate + if: always() && (github.event_name != 'workflow_dispatch' || inputs.run_coverage != false) && (needs.integration-gate.result == 'success' || needs.integration-gate.result == 'skipped') steps: - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 @@ -523,8 +525,8 @@ jobs: name: Coverage - Frontend runs-on: ubuntu-latest needs: - - build-image - if: github.event_name != 'workflow_dispatch' || inputs.run_coverage != false + - integration-gate + if: always() && (github.event_name != 'workflow_dispatch' || inputs.run_coverage != false) && (needs.integration-gate.result == 'success' || needs.integration-gate.result == 'skipped') steps: - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 @@ -560,7 +562,8 @@ jobs: needs: - coverage-backend - coverage-frontend - if: github.event_name != 'workflow_dispatch' || inputs.run_coverage != false + - integration-gate + if: always() && (github.event_name != 'workflow_dispatch' || inputs.run_coverage != false) && (needs.integration-gate.result == 'success' || needs.integration-gate.result == 'skipped') steps: - name: Evaluate coverage results run: | @@ -584,7 +587,8 @@ jobs: needs: - coverage-gate - e2e - if: github.event_name != 'workflow_dispatch' || inputs.run_coverage != false + - integration-gate + if: always() && (github.event_name != 'workflow_dispatch' || inputs.run_coverage != false) && (needs.integration-gate.result == 'success' || needs.integration-gate.result == 'skipped') steps: - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 @@ -638,7 +642,8 @@ jobs: runs-on: ubuntu-latest needs: - codecov-upload - if: (github.event_name != 'workflow_dispatch' || inputs.run_coverage != false) && needs.codecov-upload.result != 'skipped' + - integration-gate + if: always() && (github.event_name != 'workflow_dispatch' || inputs.run_coverage != false) && (needs.integration-gate.result == 'success' || needs.integration-gate.result == 'skipped') && needs.codecov-upload.result != 'skipped' steps: - name: Evaluate Codecov upload results run: | @@ -658,7 +663,9 @@ jobs: security-codeql: name: Security - CodeQL runs-on: ubuntu-latest - if: (github.event_name != 'workflow_dispatch' || inputs.run_security_scans != false) && (github.event_name != 'pull_request' || github.event.pull_request.head.repo.fork != true) + needs: + - integration-gate + if: always() && (github.event_name != 'workflow_dispatch' || inputs.run_security_scans != false) && (github.event_name != 'pull_request' || github.event.pull_request.head.repo.fork != true) && (needs.integration-gate.result == 'success' || needs.integration-gate.result == 'skipped') permissions: contents: read security-events: write @@ -698,7 +705,8 @@ jobs: runs-on: ubuntu-latest needs: - build-image - if: (github.event_name != 'workflow_dispatch' || inputs.run_security_scans != false) && (github.event_name != 'pull_request' || github.event.pull_request.head.repo.fork != true) && needs.build-image.result == 'success' + - integration-gate + if: always() && (github.event_name != 'workflow_dispatch' || inputs.run_security_scans != false) && (github.event_name != 'pull_request' || github.event.pull_request.head.repo.fork != true) && needs.build-image.result == 'success' && (needs.integration-gate.result == 'success' || needs.integration-gate.result == 'skipped') permissions: contents: read security-events: write @@ -741,7 +749,8 @@ jobs: runs-on: ubuntu-latest needs: - build-image - if: (github.event_name != 'workflow_dispatch' || inputs.run_security_scans != false) && (github.event_name != 'pull_request' || github.event.pull_request.head.repo.fork != true) && needs.build-image.result == 'success' + - integration-gate + if: always() && (github.event_name != 'workflow_dispatch' || inputs.run_security_scans != false) && (github.event_name != 'pull_request' || github.event.pull_request.head.repo.fork != true) && needs.build-image.result == 'success' && (needs.integration-gate.result == 'success' || needs.integration-gate.result == 'skipped') permissions: contents: read security-events: write @@ -774,7 +783,8 @@ jobs: - security-codeql - security-trivy - security-supply-chain - if: github.event_name != 'workflow_dispatch' || inputs.run_security_scans != false + - integration-gate + if: always() && (github.event_name != 'workflow_dispatch' || inputs.run_security_scans != false) && (needs.integration-gate.result == 'success' || needs.integration-gate.result == 'skipped') steps: - name: Verify security results run: | @@ -854,7 +864,7 @@ jobs: fi fi - e2e_enabled="${{ github.event_name != 'workflow_dispatch' || inputs.run_e2e != false }}" + e2e_enabled="${{ (github.event_name != 'workflow_dispatch' || inputs.run_e2e != false) && (needs.integration-gate.result == 'success' || needs.integration-gate.result == 'skipped') }}" if [ "$e2e_enabled" = "true" ]; then require_success_if_ran "e2e-gate" "${{ needs.e2e-gate.result }}" "true" if [ "${{ needs.e2e-gate.result }}" != "skipped" ]; then @@ -862,7 +872,7 @@ jobs: fi fi - coverage_enabled="${{ github.event_name != 'workflow_dispatch' || inputs.run_coverage != false }}" + coverage_enabled="${{ (github.event_name != 'workflow_dispatch' || inputs.run_coverage != false) && (needs.integration-gate.result == 'success' || needs.integration-gate.result == 'skipped') }}" if [ "$coverage_enabled" = "true" ]; then require_success_if_ran "coverage-gate" "${{ needs.coverage-gate.result }}" "true" require_success_if_ran "codecov-gate" "${{ needs.codecov-gate.result }}" "true" @@ -874,7 +884,7 @@ jobs: fi fi - security_enabled="${{ (github.event_name != 'workflow_dispatch' || inputs.run_security_scans != false) && (github.event_name != 'pull_request' || !github.event.pull_request.head.repo.fork) }}" + security_enabled="${{ (github.event_name != 'workflow_dispatch' || inputs.run_security_scans != false) && (needs.integration-gate.result == 'success' || needs.integration-gate.result == 'skipped') && (github.event_name != 'pull_request' || !github.event.pull_request.head.repo.fork) }}" if [ "$security_enabled" = "true" ]; then require_success_if_ran "security-gate" "${{ needs.security-gate.result }}" "true" if [ "${{ needs.security-gate.result }}" != "skipped" ]; then diff --git a/docs/plans/ci_optimization_spec.md b/docs/plans/ci_optimization_spec.md new file mode 100644 index 00000000..580ac1d5 --- /dev/null +++ b/docs/plans/ci_optimization_spec.md @@ -0,0 +1,92 @@ +# CI Pipeline Optimization Plan + +## 1. Introduction + +**Overview:** +This plan optimizes the CI pipeline dependency graph so the `e2e` job starts as early as possible, while preserving quality gates. The primary change is to decouple `lint` from `build-image`, allowing both to run in parallel after `setup` completes. + +**Objectives:** +- Start `e2e` as soon as `build-image` finishes. +- Keep `lint` as a required gate via `pipeline-gate`. +- Preserve existing security scan behavior, especially early/parallel execution of `security-codeql`. + +## 2. Research Findings + +**Existing workflow file:** +- [ci-pipeline.yml](.github/workflows/ci-pipeline.yml) + +**Current dependency graph (relevant):** +- `setup` has no needs (fast input normalization). +- `lint` has no needs. +- `build-image` needs `lint` and `setup`. +- `e2e` needs `build-image`. +- `pipeline-gate` needs `lint`, `build-image`, `integration-gate`, `e2e-gate`, `coverage-gate`, `codecov-gate`, `security-gate`. +- `security-codeql` has no needs and runs early/parallel. + +**Observation:** +- `build-image` is unnecessarily serialized behind `lint`, delaying downstream jobs (`e2e`, integrations, security image scans). +- `security-codeql` already runs independently and should remain so. + +## 3. Technical Specifications + +### 3.1 Dependency Graph Changes + +**Target behavior:** +- `lint` runs in parallel with `setup` and `build-image`. +- `build-image` depends only on `setup`. +- `e2e` continues to depend on `build-image`. +- `pipeline-gate` continues to enforce `lint` success. +- `security-codeql` remains without `needs`. + +**Proposed change:** +- Update `build-image.needs` to only include `setup`. + +### 3.2 EARS Requirements + +- WHEN the CI pipeline runs, THE SYSTEM SHALL start `build-image` after `setup` completes, without waiting for `lint`. +- WHEN `build-image` completes successfully, THE SYSTEM SHALL start `e2e` as soon as it is scheduled. +- WHEN `lint` fails, THE SYSTEM SHALL block the pipeline via `pipeline-gate` even if `e2e` or `build-image` succeed. +- WHEN security scans are enabled, THE SYSTEM SHALL run `security-codeql` in parallel with other jobs without dependency on `setup`, `lint`, or `build-image`. + +### 3.3 Error Handling and Edge Cases + +- If `setup` fails, `build-image` and its dependents must not run (existing behavior preserved). +- If `lint` fails but `build-image` and `e2e` succeed, `pipeline-gate` must still fail. +- If `security-codeql` is skipped (e.g., forked PR), `security-gate` must continue to interpret skip correctly (no change). + +### 3.4 Risks and Mitigations + +| Risk | Impact | Mitigation | +| --- | --- | --- | +| `build-image` could start before `lint` detects issues | Failing lint might occur after expensive build/test work | `pipeline-gate` still enforces `lint` success; cost is acceptable for speed | +| Misconfigured `needs` graph causes unintended skips | Downstream jobs might not run | Only remove `lint` from `build-image.needs`; do not change other gates | + +## 4. Implementation Plan + +### Phase 1: Playwright Tests (Behavioral Expectations) +- No Playwright changes are required for this CI optimization. Confirm `e2e` workflow reuse remains unchanged. + +### Phase 2: Backend Implementation +- Not applicable. + +### Phase 3: Frontend Implementation +- Not applicable. + +### Phase 4: Integration and Testing +- Validate the dependency graph in `ci-pipeline.yml` locally by reasoning and optional dry-run (no CI execution in this plan). +- Confirm `security-codeql` retains no `needs`. + +### Phase 5: Documentation and Deployment +- Update this plan only (no documentation changes elsewhere). + +## 5. Acceptance Criteria + +- DoD: CI dependency graph reflects `build-image` depending only on `setup`. +- DoD: `lint` remains a required gate in `pipeline-gate`. +- DoD: `security-codeql` continues to run early/parallel (no `needs`). +- DoD: `e2e` still depends on `build-image` only. + +## 6. Complexity and Impact + +- **Complexity:** Low +- **Impact:** Moderate CI speed-up for E2E and integration jobs diff --git a/docs/plans/ci_sequencing_spec.md b/docs/plans/ci_sequencing_spec.md new file mode 100644 index 00000000..a73b96d9 --- /dev/null +++ b/docs/plans/ci_sequencing_spec.md @@ -0,0 +1,171 @@ +--- +title: "CI Sequencing and Parallelization" +status: "draft" +scope: "ci/pipeline, ci/integration, ci/coverage, ci/security" +--- + +## 1. Introduction + +This plan reorders CI job dependencies so lint remains first, integration runs after image build, E2E depends on integration-gate, and coverage/security jobs run in parallel with E2E once integration has passed. + +Objectives: + +- Keep `lint` as the earliest gate before `build-image`. +- Ensure integration tests run after `build-image` and before `e2e`. +- Make `e2e` depend on `integration-gate`. +- Start coverage and security jobs after `integration-gate` so they complete while E2E runs. +- Preserve strict gating: if a required stage is skipped or fails, downstream gates fail. + +## 2. Research Findings + +### 2.1 Current CI Ordering + +- `lint` runs first and gates `build-image`. +- Integration jobs depend on `build-image`, then `integration-gate` aggregates results. +- `e2e` depends only on `build-image`. +- Coverage and security jobs depend on `build-image` (CodeQL has no dependencies). +- `codecov-upload` depends on `coverage-gate` and `e2e`. +- `pipeline-gate` evaluates gate jobs based on input flags, not on integration status. + +### 2.2 Impact of Current Ordering + +- E2E can run without integration tests completing. +- Coverage and security jobs start before or during integration, competing for runners. +- `security-codeql` can start immediately, cluttering early logs. + +## 3. Technical Specifications + +### 3.1 Target Dependency Graph + +``` +setup -> lint -> build-image +build-image -> integration-* -> integration-gate +integration-gate -> e2e -> e2e-gate +integration-gate -> coverage-* -> coverage-gate +integration-gate -> security-* -> security-gate +coverage-gate + e2e -> codecov-upload -> codecov-gate +pipeline-gate depends on lint, build-image, integration-gate, e2e-gate, coverage-gate, codecov-gate, security-gate +``` + +### 3.2 Job Dependency Updates + +Update `.github/workflows/ci-pipeline.yml`: + +- `e2e`: + - `needs`: `build-image`, `integration-gate`. + - `if`: require `needs.integration-gate.result == 'success'` and `needs.build-image.result == 'success'` and the existing `run_e2e` input guard. + - Keep `build-image` in `needs` for `image_ref` outputs. + +- `e2e-gate`: + - Keep `needs: e2e`. + - Update `if` guard to require integration to have run (see Section 3.4) so skips are treated consistently. + +- `coverage-backend` and `coverage-frontend`: + - `needs`: `integration-gate`. + - `if`: existing `run_coverage` input guard AND `needs.integration-gate.result == 'success'`. + - No `build-image` dependency is required for coverage tests. + +- `security-codeql`: + - Add `needs: integration-gate`. + - `if`: existing `run_security_scans` guard AND `needs.integration-gate.result == 'success'` AND the existing fork guard. + +- `security-trivy` and `security-supply-chain`: + - `needs`: `build-image`, `integration-gate`. + - `if`: existing guards AND `needs.integration-gate.result == 'success'`. + +- `security-gate`: + - Keep `needs` on all security jobs. + - `if`: same as today, plus integration-enabled guard (Section 3.4). + +### 3.3 Parallelization Strategy + +- Once `integration-gate` succeeds, start `e2e`, `coverage-*`, and `security-*` concurrently. +- This ensures non-E2E work completes during the longer E2E runtime window. + +### 3.4 Enablement and Strict Gating Logic + +Adjust enablement expressions so skip behavior is intentional and strict: + +- Define an integration-enabled expression for reuse: + - `integration_enabled = needs.build-image.outputs.run_integration == 'true'` +- Define an integration-gate pass-or-skip expression for reuse: + - `integration_gate_ok = needs.integration-gate.result == 'success' || needs.integration-gate.result == 'skipped'` + +Update gate conditions to include `integration_enabled`: + +- `e2e` and `e2e-gate`: + - Only enabled if `run_e2e` is not false. + - Add guard: `always() && integration_gate_ok` so fork PRs (integration skipped) still run. + +- `coverage-*`, `coverage-gate`, `codecov-upload`, `codecov-gate`: + - Only enabled if `run_coverage` is not false. + - Add guard: `always() && integration_gate_ok` so fork PRs (integration skipped) still run. + +- `security-*`, `security-gate`: + - Only enabled if `run_security_scans` is not false (and fork guard for CodeQL/Trivy/SBOM). + - Add guard: `always() && integration_gate_ok` so fork PRs (integration skipped) still run. + +- `pipeline-gate`: + - Update enabled checks (`e2e_enabled`, `coverage_enabled`, `security_enabled`) to use `integration_gate_ok` (not `integration_enabled`). + - Keep strict gating: when enabled, skipped results remain failures. + +### 3.5 Artifact and Output Dependencies + +- `codecov-upload` continues to depend on `e2e` for E2E coverage artifacts and on `coverage-gate` for unit coverage. +- `security-trivy` and `security-supply-chain` continue using `needs.build-image.outputs.image_ref_dockerhub`. + +### 3.6 Data Flow and Runners + +- Integration is isolated to its phase, reducing early runner contention. +- The post-integration phase allows `e2e`, `coverage-*`, and `security-*` to run in parallel. + +## 4. Implementation Plan + +### Phase 1: Playwright Tests (Behavior Baseline) + +- No UI behavior changes are expected; treat as baseline verification only. + +### Phase 2: Dependency Rewire + +- Update `e2e` to require `integration-gate` and `build-image`. +- Add `integration-gate` to `coverage-*` and `security-*` `needs`. +- Move `security-codeql` behind `integration-gate`. + +### Phase 3: Strict Gating Alignment + +- Update job `if` conditions to include `integration_enabled`. +- Update `pipeline-gate` enablement logic to match new gating rules. + +### Phase 4: Validation + +- Verify that `lint -> build-image -> integration -> e2e` is enforced. +- Confirm coverage and security jobs start only after integration succeeds. +- Confirm `codecov-upload` and `codecov-gate` still run after coverage and E2E are complete. + +## 5. Acceptance Criteria (EARS) + +- WHEN a pipeline starts, THE SYSTEM SHALL run `lint` before `build-image`. +- WHEN `build-image` succeeds and integration is enabled, THE SYSTEM SHALL run all integration tests and aggregate them in `integration-gate`. +- WHEN `integration-gate` succeeds or is skipped and E2E is enabled, THE SYSTEM SHALL start `e2e` and require it to pass before `e2e-gate` succeeds. +- WHEN `integration-gate` succeeds or is skipped and coverage is enabled, THE SYSTEM SHALL start `coverage-backend` and `coverage-frontend` in parallel with E2E. +- WHEN `integration-gate` succeeds or is skipped and security scans are enabled, THE SYSTEM SHALL start `security-codeql`, `security-trivy`, and `security-supply-chain` in parallel with E2E. +- WHEN integration is skipped for fork PRs (`run_integration=false`), THE SYSTEM SHALL still run `e2e`, `coverage-*`, and `security-*` if their respective enablement flags are true. +- IF `integration-gate` is not successful while integration is enabled, THEN THE SYSTEM SHALL skip `e2e`, `coverage-*`, and `security-*` and fail the appropriate gates. +- WHEN coverage and E2E complete successfully, THE SYSTEM SHALL run `codecov-upload` and `codecov-gate`. + +## 6. Risks and Mitigations + +- Risk: Coupling coverage/security to integration could reduce flexibility for ad-hoc runs. + Mitigation: Keep `run_integration` default true; document that disabling integration disables downstream stages. + +- Risk: CodeQL no longer starts early, increasing total elapsed time for that job. + Mitigation: CodeQL runs in parallel with E2E to keep total pipeline time stable. + +- Risk: Misaligned gate logic could mark expected skips as failures. + Mitigation: Centralize enablement logic (`integration_enabled`) and apply consistently in job `if` conditions and `pipeline-gate`. + +## 7. Confidence Score + +Confidence: 88 percent + +Rationale: The sequencing changes are localized to job `needs` and `if` expressions. The main uncertainty is ensuring gate logic stays strict while respecting the new integration-first requirement. diff --git a/docs/reports/ci_sequencing_audit.md b/docs/reports/ci_sequencing_audit.md new file mode 100644 index 00000000..3b16819b --- /dev/null +++ b/docs/reports/ci_sequencing_audit.md @@ -0,0 +1,67 @@ +# CI Sequencing Audit + +Date: 2026-02-08 + +## Scope + +Audit target: .github/workflows/ci-pipeline.yml + +Focus areas: +- YAML syntax validity +- Job `if` condition patterns for `e2e`, `coverage-*`, and `security-*` +- Job dependency sequencing (Lint -> Build -> Integration -> Gate -> E2E/Rest) +- Fork behavior (integration skipped, E2E still runs) + +## Results + +### YAML syntax + +- Visual inspection indicates valid YAML structure and indentation. +- No duplicate keys or malformed mappings detected. + +### `if` condition pattern review + +The following jobs implement `always()` and use a `success || skipped` guard on the integration gate: + +- `e2e`: `always()` plus `needs.integration-gate.result == 'success' || ... == 'skipped'`, and `needs.build-image.result == 'success'`. +- `e2e-gate`: `always()` plus `needs.integration-gate.result == 'success' || ... == 'skipped'`. +- `coverage-backend`: `always()` plus `needs.integration-gate.result == 'success' || ... == 'skipped'`. +- `coverage-frontend`: `always()` plus `needs.integration-gate.result == 'success' || ... == 'skipped'`. +- `coverage-gate`: `always()` plus `needs.integration-gate.result == 'success' || ... == 'skipped'`. +- `codecov-upload`: `always()` plus `needs.integration-gate.result == 'success' || ... == 'skipped'`. +- `codecov-gate`: `always()` plus `needs.integration-gate.result == 'success' || ... == 'skipped'` and `needs.codecov-upload.result != 'skipped'`. +- `security-codeql`: `always()` plus `needs.integration-gate.result == 'success' || ... == 'skipped'`. +- `security-trivy`: `always()` plus `needs.integration-gate.result == 'success' || ... == 'skipped'`, and `needs.build-image.result == 'success'`. +- `security-supply-chain`: `always()` plus `needs.integration-gate.result == 'success' || ... == 'skipped'`, and `needs.build-image.result == 'success'`. +- `security-gate`: `always()` plus `needs.integration-gate.result == 'success' || ... == 'skipped'`. + +### Sequencing (Lint -> Build -> Integration -> Gate -> E2E/Rest) + +- `build-image` depends on `lint`, establishing Lint -> Build. +- Integration jobs depend on `build-image`. +- `integration-gate` depends on `build-image` and all integration jobs. +- `e2e` depends on `build-image` and `integration-gate`. +- Coverage and security jobs depend on `integration-gate` (but not directly on `build-image`). +- `pipeline-gate` depends on all gates. + +### Fork logic (Integration Skip -> E2E Run) + +- Fork PRs set `push_image=false`, which makes `run_integration=false`. +- Integration jobs and `integration-gate` are skipped. +- `e2e` still runs because it allows `integration-gate` to be `skipped` and only requires `build-image` to succeed. + +## Findings + +### IMPORTANT: Coverage and security jobs can run after a skipped integration gate caused by failed build + +If `lint` or `build-image` fail, `integration-gate` is skipped. The coverage and security jobs only check `(needs.integration-gate.result == 'success' || ... == 'skipped')`, so they can run even when the build failed. This weakens the strict sequence guarantee (Lint -> Build -> Integration -> Gate -> E2E/Rest) for these jobs. + +Suggested fix: +- Add `needs.build-image.result == 'success'` to `coverage-*`, `coverage-gate`, `codecov-*`, and `security-codeql` conditions, or require `needs.build-image.result == 'success'` at the `integration-gate` level and check for `success` (not `skipped`) where strict sequencing is required. + +## Conclusion + +- YAML syntax appears valid on inspection. +- `always() && (success || skipped)` pattern is applied consistently for the targeted jobs. +- Fork logic correctly skips integration and still runs E2E. +- Sequencing is mostly correct, with the exception noted for coverage and security jobs when the integration gate is skipped due to an upstream failure.