From 6acd94672e73a2c00b37b1073fae91ddf6b9295b Mon Sep 17 00:00:00 2001 From: GitHub Actions Date: Tue, 9 Dec 2025 02:06:33 +0000 Subject: [PATCH] chore(history-rewrite): add scripts/docs for history rewrite plan --- docs/plans/current_spec.md | 32 ++++ docs/plans/history_rewrite.md | 78 ++++++++ scripts/history-rewrite/clean_history.sh | 176 ++++++++++++++++++ scripts/history-rewrite/preview_removals.sh | 58 ++++++ .../history-rewrite/validate_after_rewrite.sh | 43 +++++ 5 files changed, 387 insertions(+) create mode 100644 docs/plans/history_rewrite.md create mode 100644 scripts/history-rewrite/clean_history.sh create mode 100644 scripts/history-rewrite/preview_removals.sh create mode 100644 scripts/history-rewrite/validate_after_rewrite.sh diff --git a/docs/plans/current_spec.md b/docs/plans/current_spec.md index b911cc2e..731366e2 100644 --- a/docs/plans/current_spec.md +++ b/docs/plans/current_spec.md @@ -394,6 +394,38 @@ Notes: - History rewrite is destructive; only do after explicit approval and scheduling during a low-impact window. - If the repo has widely used forks or CI jobs referencing old commit hashes, establish a temporary redirect communication plan. +History rewrite summary (safe workflow) +------------------------------------- +For repository history cleanup to remove committed CodeQL DBs or large blobs, the repo now contains a small set of tools under `scripts/history-rewrite` to help plan and safely execute this action. They are: + +- `scripts/history-rewrite/clean_history.sh` — Preview and optionally (with `--force`) run a git-filter-repo history rewrite. Default is `--dry-run` and the script creates a timestamped backup branch named `backup/history-YYYYMMDD-HHMMSS` before any destructive operations. The script logs operations to `data/backups/history_cleanup-YYYYMMDD-HHMMSS.log` and prints next-step instructions. Do NOT run `--force` on `main` or `master` and coordinate with maintainers before force-pushing. 
+ +- `scripts/history-rewrite/preview_removals.sh` — Print commit/object lists and example large blobs relevant to the paths and strip size for verification. + +- `scripts/history-rewrite/validate_after_rewrite.sh` — Run `git fsck`, `git count-objects -vH`, `pre-commit` hooks, backend `go test ./...`, and frontend `npm run build` to verify the repository after a rewrite. + +Quick `clean_history.sh` usage examples +------------------------------------- +- Dry-run: + - `scripts/history-rewrite/clean_history.sh --dry-run --paths 'backend/codeql-db,codeql-db' --strip-size 50` + - This logs what would be removed without making any changes; review `data/backups/history_cleanup-*.log` for details. + +- Preview only: + - `scripts/history-rewrite/preview_removals.sh --paths 'backend/codeql-db,codeql-db' --strip-size 50` + +- Destructive rewrite (ONLY after approval): + - `scripts/history-rewrite/clean_history.sh --force --paths 'backend/codeql-db,codeql-db' --strip-size 50` + - It will create `backup/history-YYYYMMDD-HHMMSS`, prompt for explicit confirmation `I UNDERSTAND`, run `git filter-repo` locally, then run `git fsck` and `git gc`. + - After rewrite, do not auto-push; perform `git push --all --force` and `git push --tags --force` only after team approval. + +Warnings & notes +---------------- +- The scripts only prepare and perform the rewrite locally; they will not force-push to remote unless you do so manually. +- Avoid running `--force` on `main` or `master`. Use a feature branch or a controlled clone. +- The rewrite is destructive; maintainers must rebase or re-clone after force-push. +- Always verify with `scripts/history-rewrite/validate_after_rewrite.sh` before any force push. 
+ + ## Incident Triage: CrowdSec preset pull/apply 502/500 (feature/beta-release) - Logs to pull first: backend app/GIN logs under `/app/data/logs/charon.log` (or `data/logs/charon.log` in dev) via [backend/cmd/api/main.go](backend/cmd/api/main.go); look for warnings "crowdsec preset pull failed" / "crowdsec preset apply failed" emitted in [backend/internal/api/handlers/crowdsec_handler.go](backend/internal/api/handlers/crowdsec_handler.go). Access logs will also show 502/500 for POST `/api/v1/admin/crowdsec/presets/pull` and `/apply`. - Routes and code paths: handlers `PullPreset` and `ApplyPreset` live in [backend/internal/api/handlers/crowdsec_handler.go](backend/internal/api/handlers/crowdsec_handler.go) and delegate to `HubService.Pull/Apply` in [backend/internal/crowdsec/hub_sync.go](backend/internal/crowdsec/hub_sync.go) with cache helpers in [backend/internal/crowdsec/hub_cache.go](backend/internal/crowdsec/hub_cache.go). Data dir used is `data/crowdsec` with cache under `data/crowdsec/hub_cache` from [backend/internal/api/routes/routes.go](backend/internal/api/routes/routes.go). diff --git a/docs/plans/history_rewrite.md b/docs/plans/history_rewrite.md new file mode 100644 index 00000000..0bebdcb3 --- /dev/null +++ b/docs/plans/history_rewrite.md @@ -0,0 +1,78 @@ +History rewrite plan +==================== + +Rationale +--------- +Some committed CodeQL DB directories or large binary blobs can bloat clones, CI cache sizes, and repository size overall. This plan provides a non-destructive, auditable history-rewrite solution to remove these directories and optionally strip out huge blobs. + +Scope +----- +This plan targets CodeQL DB directories (e.g., backend/codeql-db, codeql-db, codeql-db-js, codeql-db-go) and other large blobs. Scripts are non-destructive by default and require `--force` to make destructive changes. + +Risk & Mitigation +----------------- +- Rewriting history changes commit hashes. 
We never force-push in the scripts automatically; the maintainer must coordinate before running `git push --force`. +- Always create a backup branch before rewriting; the script creates `backup/history-YYYYMMDD-HHMMSS` and pushes it to `origin`. +- Require the manual confirmation string `I UNDERSTAND` before running any destructive change. + +Overview of steps +----------------- +1. Prepare: create and checkout a non-main feature branch (do not run on `main` or `master`). +2. Dry-run and preview: run a dry-run to preview commits and blobs to remove. + - `scripts/history-rewrite/clean_history.sh --dry-run --paths 'backend/codeql-db,codeql-db' --strip-size 50` +3. Optional detailed preview: + - `scripts/history-rewrite/preview_removals.sh --paths 'backend/codeql-db,codeql-db' --strip-size 50` +4. With approval, run the destructive rewrite in a local clone or dedicated environment. + - `scripts/history-rewrite/clean_history.sh --force --paths 'backend/codeql-db,codeql-db' --strip-size 50` + - When prompted, type `I UNDERSTAND` to proceed. +5. Validation: run the validator script and ensure CI passes locally: + - `scripts/history-rewrite/validate_after_rewrite.sh` +6. Coordinate with maintainers and force-push only after consensus. + +Installation & prerequisites +---------------------------- +- git >= 2.25 +- git-filter-repo: install via package manager or pip. See https://github.com/newren/git-filter-repo. +- pre-commit (optional): installed in the repository virtual environment (`.venv`). + +Sample commands and dry-run outputs +---------------------------------- +Dry-run: +``` +scripts/history-rewrite/clean_history.sh --dry-run --paths 'backend/codeql-db,codeql-db' --strip-size 50 +``` + +Sample dry-run output (excerpt): + +--- Path: backend/codeql-db +2b7c6f8d1a... (commits touching this path) +--- Objects in paths +f6a9abcd... backend/codeql-db/project.sarif +--- Example large objects (candidate for --strip-size) +f3ae1234... 
size=104857600 + +Force-run (coordination required): +``` +scripts/history-rewrite/clean_history.sh --force --paths 'backend/codeql-db,codeql-db' --strip-size 50 +``` +Followed by verification and manual force-push: + - Check `data/backups/history_cleanup-YYYYMMDD-HHMMSS.log` + - `scripts/history-rewrite/validate_after_rewrite.sh` + - `git push --all --force` (only after maintainers approve) + +Rollback plan +------------- +If problems occur, restore from the backup branch: + + git checkout -b restore/YYYYMMDD-HHMMSS backup/history-YYYYMMDD-HHMMSS + git push origin restore/YYYYMMDD-HHMMSS + +Post rewrite maintenance +------------------------ +- Run `git gc --aggressive --prune=now` on clones and local copies. +- Run `git count-objects -vH` to confirm size improvements. +- Refresh CI caches and mirrors after the change. + +Communication & Approval +------------------------ +Open a PR with dry-run logs and `preview_removals` output, tag maintainers for approval before `--force` is used. diff --git a/scripts/history-rewrite/clean_history.sh b/scripts/history-rewrite/clean_history.sh new file mode 100644 index 00000000..a09cafa2 --- /dev/null +++ b/scripts/history-rewrite/clean_history.sh @@ -0,0 +1,176 @@ +#!/bin/sh +# POSIX shell script to safely preview and optionally run a git history rewrite +set -eu + +# Default values +DRY_RUN=1 +FORCE=0 +PATHS="backend/codeql-db,codeql-db,codeql-db-js,codeql-db-go" +STRIP_SIZE=50 + +usage() { + cat </dev/null 2>&1; then + echo "git is required but not found. Aborting." >&2 + exit 1 + fi + if ! command -v git-filter-repo >/dev/null 2>&1; then + echo "git-filter-repo not found. Please install it:" + echo " - Debian/Ubuntu: sudo apt install git-filter-repo" + echo " - Mac (Homebrew): brew install git-filter-repo" + echo " - Python pip: pip install git-filter-repo" + echo "Or see https://github.com/newren/git-filter-repo for details." 
+ exit 2 + fi +} + +timestamp() { + # POSIX-friendly timestamp + date +"%Y%m%d-%H%M%S" +} + +logdir="data/backups" +mkdir -p "$logdir" +logfile="$logdir/history_cleanup-$(timestamp).log" + +echo "Starting history cleanup tool at $(date)" | tee "$logfile" + +while [ "$#" -gt 0 ]; do + case "$1" in + --dry-run) + DRY_RUN=1; shift;; + --force) + DRY_RUN=0; FORCE=1; shift;; + --paths) + PATHS="$2"; shift 2;; + --strip-size) + STRIP_SIZE="$2"; shift 2;; + --help) + usage; exit 0;; + *) + echo "Unknown option: $1" >&2; usage; exit 1;; + esac +done + +check_requirements + +current_branch=$(git rev-parse --abbrev-ref HEAD 2>/dev/null || echo "(detached)") +if [ "$current_branch" = "main" ] || [ "$current_branch" = "master" ]; then + echo "Refusing to run on main/master branch. Switch to a feature branch and retry." | tee -a "$logfile" + exit 3 +fi + +backup_branch="backup/history-$(timestamp)" +echo "Creating backup branch: $backup_branch" | tee -a "$logfile" +git branch -f "$backup_branch" || true +git push origin "$backup_branch" || echo "Warning: push failed, ensure remote origin exists and push manually." 
| tee -a "$logfile" + +IFS=','; set -f +paths_list="" +for p in $PATHS; do + # Expand shell expansion + paths_list="$paths_list $p" +done +set +f; unset IFS + +echo "Paths targeted: $paths_list" | tee -a "$logfile" +echo "Strip blobs bigger than: ${STRIP_SIZE}M" | tee -a "$logfile" + +preview_removals() { + echo "=== Preview: commits & blobs touching specified paths ===" | tee -a "$logfile" + # List commits that touch the paths + for p in $paths_list; do + echo "--- Path: $p" | tee -a "$logfile" + git rev-list --all -- "$p" | head -n 20 | tee -a "$logfile" + done + echo "=== End of commit preview ===" | tee -a "$logfile" + + echo "=== Preview: objects in paths ===" | tee -a "$logfile" + # List objects for the given paths + git rev-list --objects --all -- $paths_list | tee -a "$logfile" | awk '{print $1, $2}' | head -n 50 | tee -a "$logfile" + + echo "=== Example large objects (candidate for --strip-size) ===" | tee -a "$logfile" + # List object sizes and show top N + git rev-list --objects --all | awk '{print $1}' | while read oid; do + size=$(git cat-file -s "$oid" 2>/dev/null || true) + if [ -n "$size" ] && [ "$size" -ge $((STRIP_SIZE * 1024 * 1024)) ]; then + echo "$oid size=$size" | tee -a "$logfile" + fi + done | head -n 30 +} + +if [ "$DRY_RUN" -eq 1 ]; then + echo "Running dry-run mode. No destructive operations will be performed." | tee -a "$logfile" + preview_removals + echo "Dry-run complete. See $logfile for details." | tee -a "$logfile" + exit 0 +fi + +if [ "$FORCE" -ne 1 ]; then + echo "To run a destructive rewrite, pass --force. Aborting." | tee -a "$logfile" + exit 1 +fi + +echo "FORCE mode enabled - performing rewrite. This is destructive and will rewrite history." | tee -a "$logfile" + +echo "Confirm operation: Type 'I UNDERSTAND' to proceed:" | tee -a "$logfile" +read -r confirmation +if [ "$confirmation" != "I UNDERSTAND" ]; then + echo "Confirmation not provided. Aborting." 
| tee -a "$logfile" + exit 1 +fi + +if [ "$current_branch" = "main" ] || [ "$current_branch" = "master" ]; then + echo "Refusing to run filter-repo on main/master. Switch to a safe branch and retry." | tee -a "$logfile" + exit 1 +fi + +# Build git-filter-repo arguments +paths_args="" +IFS=' ' +for p in $paths_list; do + paths_args="$paths_args --path $p" +done +set +f + +echo "Running git filter-repo with: $paths_args --invert-paths --strip-blobs-bigger-than ${STRIP_SIZE}M" | tee -a "$logfile" + +echo "Performing a local dry-run against a local clone before actual rewrite is strongly recommended." | tee -a "$logfile" + +git filter-repo --invert-paths $paths_args --strip-blobs-bigger-than ${STRIP_SIZE}M | tee -a "$logfile" + +echo "Rewrite complete. Running post-rewrite checks..." | tee -a "$logfile" +git count-objects -vH | tee -a "$logfile" +git fsck --full | tee -a "$logfile" +git gc --aggressive --prune=now | tee -a "$logfile" + +echo "REWRITE DONE. Next steps (manual):" | tee -a "$logfile" +cat <&2; usage; exit 1;; + esac +done + +IFS=','; set -f +paths_list="" +for p in $PATHS; do + paths_list="$paths_list $p" +done +set +f; unset IFS + +echo "Paths: $paths_list" +echo "Strip blobs larger than: ${STRIP_SIZE}M" + +echo "--- Commits touching specified paths ---" +for p in $paths_list; do + echo "Path: $p" + git rev-list --all -- "$p" | nl -ba | sed -n '1,50p' +done + +echo "--- Objects in paths ---" +git rev-list --objects --all -- $paths_list | nl -ba | sed -n '1,100p' + +echo "--- Example large objects larger than ${STRIP_SIZE}M ---" +git rev-list --objects --all | awk '{print $1}' | while read oid; do + size=$(git cat-file -s "$oid" 2>/dev/null || true) + if [ -n "$size" ] && [ "$size" -ge $((STRIP_SIZE * 1024 * 1024)) ]; then + echo "$oid size=$size" + fi +done | nl -ba | sed -n '1,50p' + +echo "Preview complete. Use clean_history.sh --dry-run to get a log file." 
+ +exit 0 diff --git a/scripts/history-rewrite/validate_after_rewrite.sh b/scripts/history-rewrite/validate_after_rewrite.sh new file mode 100644 index 00000000..9a5ab134 --- /dev/null +++ b/scripts/history-rewrite/validate_after_rewrite.sh @@ -0,0 +1,43 @@ +#!/bin/sh +# Verify repository health after a destructive history-rewrite +set -eu + +usage() { + cat <