Agent Detection Kit · James Soldier

`// agent-detection-kit`

A drop-in starter for catching unexpected Claude behavior at three tiers: CI machine detectors, in-flight hooks, and read-only subagent reviewers. Every file below is copy-pasteable; the full kit is also downloadable as a tarball.

Download: agent-detection-kit.tar.gz · README · cheatsheet

Root config

.claude/settings.json

{
  "$schema": "https://json.schemastore.org/claude-code-settings.json",

  "__comment": "Claude Code settings — wires up permissions and hooks. Keep this file short; move project conventions to CLAUDE.md, skills, and ADRs.",

  "permissions": {
    "allow": [
      "Read(*)",
      "Glob(*)",
      "Grep(*)",

      "Edit(apps/**)",
      "Edit(src/**)",
      "Edit(docs/**)",
      "Edit(tests/**)",
      "Edit(scripts/**)",
      "Edit(.claude/skills/**)",
      "Edit(.claude/agents/**)",

      "Write(apps/**)",
      "Write(src/**)",
      "Write(docs/**)",
      "Write(tests/**)",
      "Write(scripts/**)",
      "Write(.claude/skills/**)",
      "Write(.claude/agents/**)",

      "Bash(git status)",
      "Bash(git status:*)",
      "Bash(git diff)",
      "Bash(git diff:*)",
      "Bash(git log)",
      "Bash(git log:*)",
      "Bash(git show:*)",
      "Bash(git branch)",
      "Bash(git branch:*)",
      "Bash(git checkout:*)",
      "Bash(git add:*)",
      "Bash(git commit:*)",
      "Bash(git stash:*)",
      "Bash(git restore:*)",

      "Bash(ls)",
      "Bash(ls:*)",
      "Bash(cat:*)",
      "Bash(head:*)",
      "Bash(tail:*)",
      "Bash(grep:*)",
      "Bash(rg:*)",
      "Bash(find:*)",
      "Bash(wc:*)",
      "Bash(jq:*)",
      "Bash(yq:*)",

      "Bash(npm test)",
      "Bash(npm test:*)",
      "Bash(npm run:*)",
      "Bash(npm ls)",
      "Bash(npx prettier:*)",
      "Bash(npx eslint:*)",
      "Bash(npx tsc:*)",

      "Bash(pytest)",
      "Bash(pytest:*)",
      "Bash(ruff check:*)",
      "Bash(ruff format:*)",
      "Bash(mypy:*)",

      "Bash(make test)",
      "Bash(make lint)",
      "Bash(make format)"
    ],

    "ask": [
      "Bash(git push)",
      "Bash(git push:*)",
      "Bash(git merge:*)",
      "Bash(git rebase:*)",
      "Bash(git reset:*)",
      "Bash(git revert:*)",

      "Bash(npm install:*)",
      "Bash(npm uninstall:*)",
      "Bash(pip install:*)",
      "Bash(pip uninstall:*)",
      "Bash(go get:*)",
      "Bash(cargo add:*)",

      "Bash(docker:*)",
      "Bash(docker-compose:*)",

      "Write(package.json)",
      "Write(pyproject.toml)",
      "Write(go.mod)",
      "Write(Cargo.toml)",
      "Write(.claude/settings.json)",
      "Write(.claude/hooks/**)",

      "Edit(package.json)",
      "Edit(pyproject.toml)",
      "Edit(go.mod)",
      "Edit(Cargo.toml)",
      "Edit(.claude/settings.json)",
      "Edit(.claude/hooks/**)"
    ],

    "deny": [
      "Bash(rm -rf:*)",
      "Bash(rm -fr:*)",

      "Bash(git push --force:*)",
      "Bash(git push -f:*)",
      "Bash(git push --force-with-lease:main*)",
      "Bash(git push --force-with-lease:master*)",
      "Bash(git push --force-with-lease:production*)",
      "Bash(git push --force-with-lease:staging*)",

      "Bash(sudo:*)",
      "Bash(su:*)",

      "Bash(curl:* | sh*)",
      "Bash(curl:* | bash*)",
      "Bash(wget:* | sh*)",
      "Bash(wget:* | bash*)",

      "Read(.env)",
      "Read(.env.*)",
      "Read(**/.env)",
      "Read(**/.env.*)",
      "Read(**/secrets/**)",
      "Read(**/*.pem)",
      "Read(**/*.key)",

      "Write(.env)",
      "Write(.env.*)",
      "Write(**/.env)",
      "Write(**/.env.*)",

      "Edit(.env)",
      "Edit(.env.*)",
      "Edit(**/.env)",
      "Edit(**/.env.*)",

      "Write(docs/decisions/**)",
      "Edit(docs/decisions/**)",

      "Write(.github/workflows/**)",
      "Edit(.github/workflows/**)"
    ]
  },

  "hooks": {
    "PreToolUse": [
      {
        "matcher": "Bash",
        "hooks": [
          {
            "type": "command",
            "command": "$CLAUDE_PROJECT_DIR/.claude/hooks/log-bash.sh"
          },
          {
            "type": "command",
            "command": "$CLAUDE_PROJECT_DIR/.claude/hooks/block-dangerous.sh"
          }
        ]
      },
      {
        "matcher": "Write|Edit",
        "hooks": [
          {
            "type": "command",
            "command": "$CLAUDE_PROJECT_DIR/.claude/hooks/flag-sensitive.sh"
          },
          {
            "type": "command",
            "command": "$CLAUDE_PROJECT_DIR/.claude/hooks/detect-antipatterns.sh"
          }
        ]
      }
    ],

    "PostToolUse": [
      {
        "matcher": "Write|Edit",
        "hooks": [
          {
            "type": "command",
            "command": "$CLAUDE_PROJECT_DIR/.claude/hooks/auto-format.sh"
          }
        ]
      }
    ],

    "Stop": [
      {
        "hooks": [
          {
            "type": "command",
            "command": "$CLAUDE_PROJECT_DIR/.claude/hooks/session-summary.sh"
          }
        ]
      }
    ]
  }
}

Hooks

.claude/hooks/log-bash.sh

#!/usr/bin/env bash
# PreToolUse hook for Bash — log every command Claude runs.
# This creates an audit trail you can review after long sessions.
#
# Input:  the hook payload as JSON on stdin. Reads .tool_input.command,
#         .tool_input.description, .cwd and .session_id.
# Output: .claude/logs/bash.log — one line per command, timestamped.
# Output: .claude/logs/bash.jsonl — structured JSONL for programmatic analysis.
# Exit:   always 0 on success; this hook only records, it never blocks.

set -euo pipefail

INPUT=$(cat)
LOG_DIR="${CLAUDE_PROJECT_DIR:-.}/.claude/logs"
mkdir -p "$LOG_DIR"

# %3N (milliseconds) is a GNU date extension. BSD/macOS date does not fail on
# it; it leaves format letters in the output. A valid timestamp never contains
# "N", so use that to detect the unsupported case and fall back to seconds.
TIMESTAMP=$(date -u +"%Y-%m-%dT%H:%M:%S.%3NZ")
case "$TIMESTAMP" in
  *N*) TIMESTAMP=$(date -u +"%Y-%m-%dT%H:%M:%SZ") ;;
esac

CMD=$(jq -r '.tool_input.command // empty' <<<"$INPUT")
DESCRIPTION=$(jq -r '.tool_input.description // empty' <<<"$INPUT")
CWD=$(jq -r '.cwd // "."' <<<"$INPUT")
SESSION_ID=$(jq -r '.session_id // "unknown"' <<<"$INPUT")

# Human-readable log. Embedded newlines are written as a literal "\n" so a
# multi-line command (heredoc, chained script) still occupies exactly one line.
NEWLINE_MARK='\n'
CMD_ONE_LINE=${CMD//$'\n'/$NEWLINE_MARK}
printf "%s [%s] %s\n" "$TIMESTAMP" "$SESSION_ID" "$CMD_ONE_LINE" >> "$LOG_DIR/bash.log"

# Structured log for later analysis. jq does the JSON escaping, so the
# command is stored verbatim (newlines included) in the "command" field.
jq -nc \
  --arg ts "$TIMESTAMP" \
  --arg cmd "$CMD" \
  --arg desc "$DESCRIPTION" \
  --arg cwd "$CWD" \
  --arg session "$SESSION_ID" \
  '{timestamp: $ts, session: $session, command: $cmd, description: $desc, cwd: $cwd}' \
  >> "$LOG_DIR/bash.jsonl"

# Exit 0 = allow the command to proceed
exit 0

.claude/hooks/block-dangerous.sh

#!/usr/bin/env bash
# PreToolUse hook for Bash — block dangerous patterns that are hard to express
# as simple allow/deny rules in settings.json.
#
# Input: the hook payload as JSON on stdin; only .tool_input.command is inspected.
# Exit code 2 blocks the command and shows the message to Claude.
# Exit code 0 allows the command to proceed.
#
# Patterns blocked here are in addition to, not replacement for, settings.json deny rules.
# Patterns use POSIX character classes ([[:space:]]) rather than \s so they do
# not depend on grep extensions.

set -euo pipefail

INPUT=$(cat)
CMD=$(jq -r '.tool_input.command // empty' <<<"$INPUT")

# cmd_matches [GREP_OPTS] PATTERN — succeed when $CMD matches the ERE.
# Feeds grep through a here-string instead of `echo ... | grep -q`: with
# pipefail, a pipeline can report failure when grep -q exits early and the
# writer is killed by SIGPIPE, which would make a check silently fail open.
cmd_matches() {
  grep -qE "$@" <<<"$CMD"
}

# ============================================================
# Block piping internet content to a shell
# ============================================================
# This is a common supply-chain attack vector. Any script fetched and piped
# to bash/sh is essentially "run this code I haven't reviewed as root."
# The shell name must end at a non-identifier character so that
# `curl ... | sha256sum` / `| shasum` are not mistaken for `| sh`; an optional
# `sudo` in front of the shell is also caught.
if cmd_matches '(curl|wget|fetch)[^|]*\|[[:space:]]*(sudo[[:space:]]+)?(sh|bash|zsh|fish)([^[:alnum:]_-]|$)'; then
  cat >&2 <<EOF
BLOCKED: Piping internet content directly to a shell is a security risk.
If you need to install something, download the script first, review it, then run it:
  curl -o install.sh https://example.com/install.sh
  # review install.sh
  bash install.sh

Or propose a specific package from a package manager (npm, pip, apt) in a PR.
EOF
  exit 2
fi

# ============================================================
# Block force push to protected branches
# ============================================================
# Each `git push ...` segment (up to the next ; & |) is checked on its own, so
# flags of unrelated chained commands don't count. Within a segment the force
# flag and the branch may appear in either order
# (`git push -f origin main` and `git push origin main --force`).
# - FORCE_FLAG matches --force, --force-with-lease[=...] and short-flag
#   clusters containing f (-f, -uf); it requires leading whitespace so a
#   branch such as "my-feature" is not read as "-f".
# - PROTECTED_REF matches the branch as a whole ref component (main,
#   origin main, HEAD:main, refs/heads/main), not as a substring of e.g.
#   "maintenance".
FORCE_FLAG='[[:space:]](--force|--force-with-lease(=[^[:space:]]*)?|-[[:alpha:]]*f[[:alpha:]]*)([[:space:]]|$)'
PROTECTED_REF='(^|[[:space:]:/+=])(main|master|production|staging)([[:space:]:]|$)'
while IFS= read -r PUSH_CMD; do
  if grep -qE -- "$FORCE_FLAG" <<<"$PUSH_CMD" && grep -qE -- "$PROTECTED_REF" <<<"$PUSH_CMD"; then
    cat >&2 <<EOF
BLOCKED: Force pushing to protected branches (main/master/production/staging) is forbidden.
Protected branches are append-only. To revert, open a revert PR.
EOF
    exit 2
  fi
done < <(grep -oE 'git[[:space:]]+push[^;&|]*' <<<"$CMD" || true)

# ============================================================
# Block operations against production infrastructure
# ============================================================
# These patterns suggest Claude is about to touch production directly.
# "prod"/"prd" must be followed by a non-letter OR the end of the command, so
# `kubectl config use-context prod` is caught as well as `--context prod-eu`.
if cmd_matches -i '(kubectl|docker|terraform|ssh).*(production|prod([^a-z]|$)|prd([^a-z]|$))'; then
  cat >&2 <<EOF
BLOCKED: Direct operations on production infrastructure are not allowed.
Changes to production must go through the deploy pipeline:
  1. Make the change in code
  2. Open a PR
  3. Merge to main
  4. Deploy workflow promotes to production

If this is legitimate (e.g., emergency), ask the user to run it themselves.
EOF
  exit 2
fi

# ============================================================
# Block destructive database operations outside local dev
# ============================================================
# NOTE(review): the localhost exemption matches the bare substring "local",
# so a path like /usr/local/bin/psql also exempts the command — tighten this
# for your environment if that matters.
if cmd_matches -i '(DROP DATABASE|DROP SCHEMA|TRUNCATE[[:space:]]+TABLE|TRUNCATE.*--|DELETE FROM)' && \
   ! cmd_matches '(localhost|127\.0\.0\.1|local)'; then
  cat >&2 <<EOF
BLOCKED: Destructive database operations can only be run against localhost.
For real databases, write a migration (db/migrations/) and go through the standard deploy.
EOF
  exit 2
fi

# ============================================================
# Block wide-area filesystem destruction
# ============================================================
# Targets: /, ~, $HOME, $CLAUDE_PROJECT_DIR — optionally double-quoted and
# optionally followed by "/" and/or "*" (so `rm -rf /*`, `rm -rf ~/` and
# `rm -rf "$HOME"` are caught too). A deeper path such as ~/project/build
# does not match.
if cmd_matches 'rm[[:space:]]+(-rf|-fr|-r -f|-f -r)[[:space:]]+"?(/|~|\$HOME|\$CLAUDE_PROJECT_DIR)/?\*?"?([[:space:]]|$)'; then
  cat >&2 <<EOF
BLOCKED: Refusing to recursively delete a root/home/project directory.
If you need to clean up, target a specific subdirectory.
EOF
  exit 2
fi

# ============================================================
# Warn on sudo (not blocked, but logged prominently)
# ============================================================
if cmd_matches '^[[:space:]]*sudo[[:space:]]'; then
  LOG_DIR="${CLAUDE_PROJECT_DIR:-.}/.claude/logs"
  mkdir -p "$LOG_DIR"
  echo "$(date -u +%Y-%m-%dT%H:%M:%SZ) SUDO: $CMD" >> "$LOG_DIR/sudo.log"
  # Allow but log — user will be prompted anyway by system
fi

exit 0

.claude/hooks/flag-sensitive.sh

#!/usr/bin/env bash
# PreToolUse hook for Write/Edit — flag when Claude is modifying sensitive files.
# Does not block; logs and (optionally) notifies.
#
# Input:  the hook payload as JSON on stdin. Reads .tool_input.file_path
#         (falling back to .tool_input.path) and .tool_name.
# Output: one line per flagged write appended to .claude/logs/sensitive.log:
#           <timestamp> [<tool>] [<category>] <file>
# Env:    CLAUDE_NOTIFY_SENSITIVE=1 also raises a desktop notification.
# Exit:   always 0.
#
# The goal: when you glance at the session summary, you immediately see
# "Claude touched payments today" without having to read all the diffs.

set -euo pipefail

INPUT=$(cat)
LOG_DIR="${CLAUDE_PROJECT_DIR:-.}/.claude/logs"
mkdir -p "$LOG_DIR"

FILE=$(jq -r '.tool_input.file_path // .tool_input.path // empty' <<<"$INPUT")
TOOL=$(jq -r '.tool_name // empty' <<<"$INPUT")
TIMESTAMP=$(date -u +"%Y-%m-%dT%H:%M:%SZ")

# Nothing to classify without a path.
if [ -z "$FILE" ]; then
  exit 0
fi

# Sensitive path patterns — customize for your codebase.
# Each entry is a case-insensitive extended regex matched anywhere in the path.
SENSITIVE_PATTERNS=(
  "auth"
  "authentication"
  "authorization"
  "permission"
  "payment"
  "billing"
  "subscription"
  "webhook"
  "migration"
  "schema"
  "\.env"
  "secret"
  "credential"
  "token"
  "session"
  "password"
  "crypto"
  "\.github/workflows"
  "infrastructure/"
  "terraform/"
  "ansible/"
  "\.claude/settings"
  "\.claude/hooks"
  "\.claude/agents"
)

for pattern in "${SENSITIVE_PATTERNS[@]}"; do
  if grep -qiE -- "$pattern" <<<"$FILE"; then
    # Category label = the pattern with regex/path punctuation stripped.
    CATEGORY=$(sed 's|[\\.*/]||g' <<<"$pattern")
    echo "$TIMESTAMP [$TOOL] [$CATEGORY] $FILE" >> "$LOG_DIR/sensitive.log"

    # Optional: desktop notification (macOS).
    # The path is handed to AppleScript through argv instead of being
    # interpolated into the script text: a file name containing a double
    # quote would otherwise be parsed as AppleScript source.
    if command -v osascript >/dev/null 2>&1 && [ "${CLAUDE_NOTIFY_SENSITIVE:-0}" = "1" ]; then
      osascript \
        -e 'on run argv' \
        -e 'display notification ("Editing " & item 1 of argv) with title "Claude: sensitive area"' \
        -e 'end run' \
        "$FILE" 2>/dev/null || true
    fi

    # Optional: notification (Linux)
    if command -v notify-send >/dev/null 2>&1 && [ "${CLAUDE_NOTIFY_SENSITIVE:-0}" = "1" ]; then
      notify-send "Claude: sensitive area" "Editing $FILE" 2>/dev/null || true
    fi

    break  # Only log once per file even if multiple patterns match
  fi
done

exit 0

.claude/hooks/detect-antipatterns.sh

#!/usr/bin/env bash
# PreToolUse hook for Write/Edit — block writes containing known anti-patterns.
# Catches issues at source instead of in CI.
#
# Input: the hook payload as JSON on stdin. Reads .tool_input.file_path
#        (falling back to .tool_input.path) and the text being written
#        (.tool_input.new_string for Edit, .tool_input.content for Write).
# Exit code 2 blocks the write and shows Claude the message so it can fix.
# Exit code 0 allows the write.

set -euo pipefail

INPUT=$(cat)
FILE=$(jq -r '.tool_input.file_path // .tool_input.path // empty' <<<"$INPUT")
# For Edit tool, check the new_string. For Write tool, check content.
CONTENT=$(jq -r '.tool_input.new_string // .tool_input.content // empty' <<<"$INPUT")

# content_matches [GREP_OPTS] PATTERN — succeed when $CONTENT matches the ERE.
# Uses a here-string rather than `echo "$CONTENT" | grep -q`: file contents can
# be large, and under pipefail the pipeline reports failure when grep -q exits
# on an early match and the writer is killed by SIGPIPE — i.e. the check would
# silently pass exactly when it found something.
content_matches() {
  grep -qE "$@" <<<"$CONTENT"
}

# Only check source files, not configs/docs/generated
if [[ ! "$FILE" =~ \.(ts|tsx|js|jsx|py|go|rs|rb|java|kt)$ ]]; then
  exit 0
fi

# ============================================================
# Block .only() / .skip() — test modifications that silently pass CI
# ============================================================
# This check must run BEFORE the test-file early exit below: focused and
# skipped tests live in test files, so checking only non-test sources would
# never catch them.
if content_matches "(describe\.only|it\.only|test\.only|\.skip\(|xit\(|xdescribe\()"; then
  cat >&2 <<EOF
BLOCKED: Test .only() or .skip() detected.
  - .only() makes tests green by running only passing ones
  - .skip() disables tests without explanation

Either fix the test, remove it entirely (with explanation in commit msg),
or discuss with the team. Skipping/focusing is not a fix.
EOF
  exit 2
fi

# Skip test files for the remaining checks — different rules apply.
# Match only genuine test file patterns, not any file with "test" in the name.
if [[ "$FILE" =~ (\.test\.|\.spec\.|__tests__/|/tests?/) ]]; then
  exit 0
fi

# ============================================================
# Block committed TODOs (they accumulate forever)
# ============================================================
if content_matches "^[[:space:]]*(//|#|--)[[:space:]]*(TODO|FIXME|XXX|HACK):"; then
  cat >&2 <<EOF
BLOCKED: This write contains TODO/FIXME/XXX/HACK markers.
Either:
  1. Fix it now in this PR
  2. File an issue and reference the issue number: // See #1234
  3. If truly needed, add to docs/known-rough-edges.md

TODOs in code decay into permanent debt. They must not be committed.
EOF
  exit 2
fi

# ============================================================
# Block debug logs and print statements
# ============================================================
if content_matches "^[[:space:]]*(console\.log|console\.debug|print\(|println!|fmt\.Println)"; then
  # Allow if explicitly marked as intentional
  if ! content_matches "(ALLOW-DEBUG|DEBUG-INTENTIONAL)"; then
    cat >&2 <<EOF
BLOCKED: This write contains debug print/log statements.
Use the structured logger instead:
  - logger.info(msg, {context})
  - logger.error(err, {context})

If you genuinely need a debug print (e.g., local dev tool), add the comment:
  // ALLOW-DEBUG: <reason>
EOF
    exit 2
  fi
fi

# ============================================================
# Block catch/except that only swallows
# ============================================================
# Multi-line regex using perl-compatible; skipped when pcregrep is not installed.
if command -v pcregrep >/dev/null 2>&1; then
  if pcregrep -M '(catch\s*\([^)]*\)|catch)\s*\{\s*\}' <<<"$CONTENT" >/dev/null 2>&1; then
    cat >&2 <<EOF
BLOCKED: Empty catch block detected. Silent error swallowing hides bugs.
Either:
  1. Handle the error (log + recover, or rethrow)
  2. If genuinely safe to ignore, add: // SAFE-TO-IGNORE: <reason>
EOF
    exit 2
  fi

  if pcregrep -M 'except[^:]*:\s*\n\s*pass' <<<"$CONTENT" >/dev/null 2>&1; then
    cat >&2 <<EOF
BLOCKED: bare 'except: pass' detected. Silent error swallowing hides bugs.
Either:
  1. Handle the error properly
  2. Catch specific exceptions you expect
  3. If genuinely safe to ignore: # SAFE-TO-IGNORE: <reason>
EOF
    exit 2
  fi
fi

# ============================================================
# Block hardcoded secrets
# ============================================================
# These patterns match common secret shapes. Gitleaks catches more in CI,
# but this catches obvious cases before the file is even written.
SECRET_RE="(api[_-]?key|secret|password|token|bearer)[[:space:]]*[=:][[:space:]]*['\"][a-zA-Z0-9_-]{20,}['\"]"
SECRET_LINES=$(grep -iE -- "$SECRET_RE" <<<"$CONTENT" || true)
# Allow env-var references like process.env.API_KEY — but only on the line that
# looks like a secret. Testing the whole content would let one unrelated
# `config.` anywhere in the file switch this check off.
if [ -n "$SECRET_LINES" ] && \
   grep -qvE "(process\.env|os\.environ|getenv|config\.)" <<<"$SECRET_LINES"; then
  cat >&2 <<EOF
BLOCKED: This write appears to contain a hardcoded credential.
Secrets must be loaded from environment variables via the config module:
  const apiKey = config.get("API_KEY")

If this is a false positive (e.g., a test fixture), rename to make the
testing nature explicit, e.g., TEST_API_KEY_NOT_REAL.
EOF
  exit 2
fi

exit 0

.claude/hooks/auto-format.sh

#!/usr/bin/env bash
# PostToolUse hook for Write/Edit — auto-format files Claude just wrote.
# Runs after the write succeeds, so formatting issues don't block the work,
# but the committed code is always formatted consistently.
#
# Input: the hook payload as JSON on stdin. Reads .tool_input.file_path
#        (falling back to .tool_input.path).
# Exit:  always 0 — every formatter is best-effort (missing tools and
#        formatter failures are ignored).

set -euo pipefail

INPUT=$(cat)
FILE=$(jq -r '.tool_input.file_path // .tool_input.path // empty' <<<"$INPUT")

if [ -z "$FILE" ] || [ ! -f "$FILE" ]; then
  exit 0
fi

# Pick formatters by extension; anything not listed is left untouched.
case "$FILE" in
  *.ts|*.tsx|*.js|*.jsx|*.json|*.md)
    if command -v npx >/dev/null 2>&1; then
      # --no-install: only run tools the project already has installed.
      # Without it npx may fetch and execute a package from the registry from
      # inside a hook, which is both slow and a supply-chain risk.
      npx --no-install prettier --write "$FILE" 2>/dev/null || true
      # ESLint auto-fix on JS/TS only (not JSON/Markdown)
      if [[ "$FILE" =~ \.(ts|tsx|js|jsx)$ ]]; then
        npx --no-install eslint --fix "$FILE" 2>/dev/null || true
      fi
    fi
    ;;
  *.py)
    if command -v ruff >/dev/null 2>&1; then
      ruff format "$FILE" 2>/dev/null || true
      ruff check --fix "$FILE" 2>/dev/null || true
    fi
    ;;
  *.go)
    if command -v gofmt >/dev/null 2>&1; then
      gofmt -w "$FILE" 2>/dev/null || true
    fi
    if command -v goimports >/dev/null 2>&1; then
      goimports -w "$FILE" 2>/dev/null || true
    fi
    ;;
  *.rs)
    if command -v rustfmt >/dev/null 2>&1; then
      rustfmt "$FILE" 2>/dev/null || true
    fi
    ;;
esac

exit 0

.claude/hooks/session-summary.sh

#!/usr/bin/env bash
# Stop hook — generate a concise summary of what happened in the session.
# Runs when Claude finishes its last turn. Output: .claude/logs/session-summaries.md
#
# Input: the hook payload as JSON on stdin. Reads .session_id and
#        .transcript_path.
# Exit:  always 0; every section is best-effort.
#
# This is the single highest-value hook: it turns a 2-hour session into
# something you can skim in 30 seconds, so you notice unexpected behavior
# without reading transcripts.

set -euo pipefail

# Re-entrancy guard. Part 4 runs `claude -p` in this project, and that nested
# run fires this same Stop hook when it finishes. The variable is set only for
# that nested call, so the nested hook exits here instead of recursing.
if [ -n "${CLAUDE_SESSION_SUMMARY_ACTIVE:-}" ]; then
  exit 0
fi

INPUT=$(cat)
PROJECT_DIR="${CLAUDE_PROJECT_DIR:-.}"
LOG_DIR="$PROJECT_DIR/.claude/logs"
mkdir -p "$LOG_DIR"

SESSION_ID=$(jq -r '.session_id // "unknown"' <<<"$INPUT")
TRANSCRIPT=$(jq -r '.transcript_path // empty' <<<"$INPUT")
TIMESTAMP=$(date -u +"%Y-%m-%dT%H:%M:%SZ")

SUMMARY_FILE="$LOG_DIR/session-summaries.md"

# Header for this session's entry
cat >> "$SUMMARY_FILE" <<EOF

---

## Session $SESSION_ID  —  $TIMESTAMP

EOF

# =============================================================
# Part 1: Mechanical summary (always works, no Claude call needed)
# =============================================================
echo "### Changes" >> "$SUMMARY_FILE"

# `git rev-parse` instead of `[ -d .git ]`: it also works from a subdirectory
# and in worktrees/submodules, where .git is a file rather than a directory.
if git -C "$PROJECT_DIR" rev-parse --is-inside-work-tree >/dev/null 2>&1; then
  # Diff against HEAD so staged files are listed too; fall back to the plain
  # working-tree diff in a repository that has no commits yet.
  CHANGED=$(git -C "$PROJECT_DIR" diff --name-only HEAD 2>/dev/null \
    || git -C "$PROJECT_DIR" diff --name-only 2>/dev/null \
    || true)
  if [ -n "$CHANGED" ]; then
    echo "" >> "$SUMMARY_FILE"
    echo "**Uncommitted file changes:**" >> "$SUMMARY_FILE"
    sed -e 's/^/- `/' -e 's/$/`/' <<<"$CHANGED" >> "$SUMMARY_FILE"
  fi

  # Recent commits (rough session window: the last 2 hours)
  RECENT_COMMITS=$(git -C "$PROJECT_DIR" log --since="2 hours ago" --oneline 2>/dev/null || true)
  if [ -n "$RECENT_COMMITS" ]; then
    echo "" >> "$SUMMARY_FILE"
    echo "**Recent commits:**" >> "$SUMMARY_FILE"
    sed 's/^/- /' <<<"$RECENT_COMMITS" >> "$SUMMARY_FILE"
  fi
fi

# =============================================================
# Part 2: Sensitive-area touches during this session
# =============================================================
if [ -f "$LOG_DIR/sensitive.log" ]; then
  # Lines from sensitive.log within the last 2 hours (rough session window).
  # Each line starts with a fixed-width UTC ISO-8601 timestamp, so a plain
  # string comparison against a cutoff in the same format is chronological.
  # That keeps this portable: no gawk-only gensub()/mktime() and no local-time
  # vs UTC mismatch. GNU date is tried first, then the BSD/macOS form.
  CUTOFF=$(date -u -d '2 hours ago' +"%Y-%m-%dT%H:%M:%SZ" 2>/dev/null \
    || date -u -v-2H +"%Y-%m-%dT%H:%M:%SZ" 2>/dev/null \
    || true)

  RECENT_SENSITIVE=""
  if [ -n "$CUTOFF" ]; then
    # Appending "" forces awk to compare as strings.
    RECENT_SENSITIVE=$(awk -v cutoff="$CUTOFF" '($1 "") >= (cutoff "")' \
      "$LOG_DIR/sensitive.log" 2>/dev/null || true)
  fi

  if [ -n "$RECENT_SENSITIVE" ]; then
    echo "" >> "$SUMMARY_FILE"
    echo "### ⚠️ Sensitive areas touched" >> "$SUMMARY_FILE"
    echo '```' >> "$SUMMARY_FILE"
    echo "$RECENT_SENSITIVE" >> "$SUMMARY_FILE"
    echo '```' >> "$SUMMARY_FILE"
  fi
fi

# =============================================================
# Part 3: Commands run (summary, not full list)
# =============================================================
if [ -f "$LOG_DIR/bash.jsonl" ]; then
  # Extract commands from this session. Newlines inside a command are
  # flattened to spaces so that one output line == one command and the
  # line counts below are command counts.
  SESSION_CMDS=$(jq -r --arg sid "$SESSION_ID" \
    'select(.session == $sid) | .command | gsub("\n"; " ")' \
    "$LOG_DIR/bash.jsonl" 2>/dev/null || true)

  if [ -n "$SESSION_CMDS" ]; then
    # BSD wc pads its output with spaces; strip them.
    TOTAL=$(wc -l <<<"$SESSION_CMDS" | tr -d '[:space:]')
    echo "" >> "$SUMMARY_FILE"
    echo "### Commands run: $TOTAL total" >> "$SUMMARY_FILE"

    # Categorize. `grep -c` already prints 0 when nothing matches (and exits
    # 1), so the fallback must be `|| true`; `|| echo 0` would yield "0\n0".
    GIT_CMDS=$(grep -cE '^git[[:space:]]' <<<"$SESSION_CMDS" || true)
    NPM_CMDS=$(grep -cE '^(npm|pnpm|yarn|bun)[[:space:]]' <<<"$SESSION_CMDS" || true)
    TEST_CMDS=$(grep -cE '(test|jest|vitest|pytest)' <<<"$SESSION_CMDS" || true)

    echo "- git: $GIT_CMDS" >> "$SUMMARY_FILE"
    echo "- package manager: $NPM_CMDS" >> "$SUMMARY_FILE"
    echo "- test runs: $TEST_CMDS" >> "$SUMMARY_FILE"
  fi
fi

# =============================================================
# Part 4: AI-generated narrative summary (optional, requires `claude` CLI)
# =============================================================
if [ -n "$TRANSCRIPT" ] && [ -f "$TRANSCRIPT" ] && command -v claude >/dev/null 2>&1; then
  # CLAUDE_SESSION_SUMMARY_ACTIVE marks the nested run for the guard at the top.
  NARRATIVE=$(CLAUDE_SESSION_SUMMARY_ACTIVE=1 claude -p --model haiku "Summarize this session in 5 bullet points. Focus on:
- What task was accomplished
- Any tool calls that touched sensitive areas (auth, payments, migrations, infra, secrets)
- Any mistakes that were caught and corrected
- Any decisions made without explicit user approval
- Anything the user should review carefully

Be terse. Total output under 200 words." <"$TRANSCRIPT" 2>/dev/null || echo "")

  if [ -n "$NARRATIVE" ]; then
    echo "" >> "$SUMMARY_FILE"
    echo "### Narrative" >> "$SUMMARY_FILE"
    echo "$NARRATIVE" >> "$SUMMARY_FILE"
  fi
fi

exit 0

Agents

.claude/agents/pr-reviewer.md

---
name: pr-reviewer
description: Reviews a pull request end-to-end before merge. Runs structural and semantic review that CI can't catch — architectural fit, scope discipline, test quality. Read-only. Use before merging any PR of non-trivial size.
tools:
  - Read
  - Glob
  - Grep
  - Bash(git diff:*)
  - Bash(git log:*)
  - Bash(git show:*)
  - Bash(gh pr view:*)
  - Bash(gh pr diff:*)
model: sonnet
---

# PR Reviewer

You review a pull request the way a senior engineer would in a 10-minute review window. Your job is to catch the issues that CI doesn't.

## What CI already covered (don't re-check)

- Linting and formatting → already green
- Type checking → already green
- Tests pass → already green
- No hardcoded secrets → caught by Gitleaks / Semgrep
- No dependencies with known CVEs → caught by Dependency Review
- Code coverage → already enforced

Your job is the judgment calls that require human-like reasoning:

## What to check

### 1. Does the PR description accurately describe the change?

Read the PR description. Then skim the diff. Are they telling the same story? Common patterns to flag:

- Description says "fix bug" but diff also refactors unrelated code → scope creep
- Description says "add feature" but diff deletes code you didn't expect → incidental change
- Description mentions "reused X" but the diff adds a new X → the reuse claim is wrong

### 2. Is the approach the right one?

Read the code. Consider: is this the approach a seasoned engineer would take?

Specific things to catch:
- Re-implementing something that already exists in the codebase
- Wrapping existing functions in more wrappers instead of calling them directly
- Over-engineering — introducing abstractions for a single use case
- Under-engineering — copy-pasting code that should be factored out
- Wrong layer — business logic in the route handler, DB logic in the service, etc.

### 3. Are the tests meaningful?

CI verifies tests pass and coverage doesn't drop. It doesn't verify the tests actually test anything.

Check:
- Do the tests exercise edge cases, or only happy paths?
- Do assertions actually verify the right thing, or just that nothing throws?
- Are tests testing behavior or testing implementation (brittle)?
- Are mocks faithful to real dependencies, or do they paper over real issues?

Specific smell: a test that passes regardless of whether the feature works. E.g., `expect(result).toBeDefined()` — that's not a test.

### 4. What happens in the error paths?

Claude writes happy paths well. Error paths are where bugs hide.

For each new feature, trace: what happens if
- The database is down?
- An external API returns 500?
- An external API returns 200 with unexpected data?
- The user sends malformed input?
- A concurrent request modifies the same resource?
- The operation times out?

If the code doesn't handle these or acknowledge them, flag it.

### 5. Is it observable?

A feature that can't be debugged in production is incomplete.

Check that new code:
- Logs important events through the structured logger
- Emits metrics for latency, success rate, error rate where relevant
- Returns meaningful error messages (RFC 9457 Problem Details)
- Uses request IDs / trace IDs

### 6. Scope and reversibility

- Is the PR doing one thing, or many?
- If this change turns out to be wrong, how hard is it to revert?
- Does it create future maintenance burden (new patterns, new dependencies, new surfaces)?

## Output format

```markdown
# PR Review: [PR title]

**PR:** #[number]
**Reviewer:** pr-reviewer (Sonnet)
**Recommendation:** ✅ Approve / ⚠️ Request changes / ❌ Block

## Summary

[2-3 sentences on the overall shape and quality of the change]

## Findings

### Must address before merge
[Issues that would cause real problems if merged as-is]

### Should address in this PR
[Real issues, but not release-blocking]

### Consider for follow-up
[Improvements that could be a separate PR]

### Nits (optional)
[Style/preference stuff; easily ignored]

## Questions for the author
[Things that need clarification rather than findings]

## Praise
[Things done particularly well. Seriously include this. It reinforces good patterns.]
```

## Rules

- **Be specific.** `file:line` references for every finding.
- **Explain the "why."** Don't just say "this is wrong" — explain what could go wrong or what pattern is being violated.
- **Match severity to impact.** Not everything is "must fix."
- **Note good patterns.** When Claude does something right, praise it. This teaches the team and reinforces the pattern.
- **Don't rewrite the code.** Your job is to identify issues. Suggest the direction of a fix, but don't write the fix yourself.

## When done

Print the review to the conversation. If the user has `gh` CLI set up, they can post it as a PR comment.

.claude/agents/security-reviewer.md

---
name: security-reviewer
description: Reviews code changes for security vulnerabilities. Read-only. Use for any PR or diff that touches authentication, authorization, session handling, cryptography, input validation, data serialization, or external-facing endpoints. Produces a severity-rated report with specific remediation suggestions.
tools:
  - Read
  - Glob
  - Grep
  - Bash(rg:*)
  - Bash(git diff:*)
  - Bash(git log:*)
  - Bash(git show:*)
model: opus
---

# Security Reviewer

You are a security-focused code reviewer. You do not have write access and do not make changes. Your output is a report.

## Focus areas (priority order)

### 1. Authentication and session handling
- Tokens stored correctly (httpOnly cookies for web, secure storage for mobile)
- Token expiry and rotation configured
- Refresh token rotation + reuse detection
- JWT signature validation not skipped
- Session fixation prevention
- Logout actually invalidates sessions

### 2. Authorization
- Every protected endpoint has a permission check
- Permission checks happen server-side (never trust client)
- No IDOR — user can't access resources by ID that don't belong to them
- Admin actions gated by role check, not just "is logged in"
- No privilege escalation paths (e.g., user can change their own role)

### 3. Input validation
- SQL injection — parameterized queries only, no string concat
- Command injection — no `shell=True` with user input, no `eval()`
- XSS — output encoding on anything rendered to HTML
- SSRF — URL validation on any request made with user-supplied URLs
- Path traversal — filename sanitization on file operations
- Deserialization — no pickle/eval on untrusted input

### 4. Cryptography
- No weak algorithms (MD5, SHA1 for security, DES, RC4)
- No hardcoded keys or secrets
- Proper use of CSPRNG (not Math.random for security)
- Constant-time comparison for secrets
- Correct use of authenticated encryption (GCM, not CBC without HMAC)

### 5. External boundaries
- Webhooks verify signatures
- CORS configured correctly (not `*` for credentialed requests)
- Rate limiting on authentication endpoints
- CSRF tokens on state-changing requests (if session-based)

### 6. Dependency risks
- New dependencies with known CVEs
- Dependencies from unusual sources (typosquats)
- Significant version bumps that might introduce breaking changes

## Your process

1. **Determine scope.** Ask (if unclear) or default to: all files changed in the current branch vs. main.

2. **Read the diff.** Focus on added code. Modified code is usually safer because it follows the existing pattern.

3. **Check each focus area systematically.** For each one, grep for relevant patterns:
   - Auth: `requireAuth`, `getSession`, `jwt.verify`, `bcrypt`
   - Authz: permission checks, role gates
   - Input validation: `req.body`, `req.query`, `req.params` usage
   - Crypto: imports from `crypto`, `hashlib`, `bcrypt`, `argon2`
   - Boundaries: route files, webhook handlers, CORS config

4. **Cross-reference.** When you find a security-relevant change, check whether tests cover the security properties, not just happy path.

## Output format

```markdown
# Security Review

**Scope:** [what you reviewed]
**Reviewer:** security-reviewer (Opus)
**Date:** [date]

## Summary

[One paragraph: overall risk assessment. Use words like "low risk", "moderate risk", "serious concerns identified".]

## Findings

### 🔴 Critical (must fix before merge)

Each finding:
- **Issue:** [What's wrong]
- **Location:** `file:line`
- **Impact:** [What could go wrong]
- **CWE/OWASP:** [If applicable]
- **Remediation:** [Specific code change recommended]

### 🟠 High

[same format]

### 🟡 Medium  

[same format]

### 🟢 Low / Informational

[same format]

## What I did not review

[Be explicit about anything you couldn't cover due to scope limits]

## Overall recommendation

[One of: Ready to merge / Merge after critical fixes / Needs rework]
```

## Rules

- **Specificity over volume.** 3 real findings beats 20 speculative ones.
- **Severity is about real-world impact.** A bare `eval()` on user input is critical. A TODO comment is informational.
- **Reference standards.** CWE IDs, OWASP categories, SANS Top 25 — these communicate severity and give the team a way to look up context.
- **No fixes.** You write about issues; someone else writes the fix. The Remediation field describes the change you recommend — you do not apply it or produce a patch. This prevents you from introducing new bugs.
- **Doubt is a finding.** If you're unsure whether something is safe, list it under "Low / Informational", prefix the issue with "Question:", and explain what you'd want to verify.

.claude/agents/architecture-auditor.md

---
name: architecture-auditor
description: Audits recent code changes against the established architecture and ADRs. Read-only agent. Use weekly, before major releases, or after a sprint where Claude did substantial work. Does not fix issues — produces a report.
tools:
  - Read
  - Glob
  - Grep
  - Bash(git log:*)
  - Bash(git diff:*)
  - Bash(git show:*)
  - Bash(git branch:*)
  - Bash(find:*)
  - Bash(wc:*)
model: opus
---

# Architecture Auditor

You are an experienced architect reviewing code changes for architectural quality, not correctness. Correctness is covered by CI. You are looking for:

1. **Architectural drift** — changes that work but don't fit the system's design
2. **Pattern duplication** — new code that replicates existing patterns instead of reusing
3. **ADR violations** — changes that conflict with accepted architecture decisions
4. **Technical debt accumulation** — TODOs, disabled tests, scope creep

## Your process

### Step 1: Load the architectural baseline

Before reviewing anything, read:
- `CLAUDE.md` — project conventions
- Every file in `docs/decisions/` (ADRs) — what's been settled
- `docs/architecture/` if it exists — the big picture

List the ADRs and summarize each in one sentence. This becomes your reference.

### Step 2: Identify the change window

By default, audit the last 7 days of changes on the main branch:

```bash
git log --since="7 days ago" --oneline main
```

If the user specifies a different window, use that instead. If the user points at a specific PR, audit just that.

### Step 3: Systematic review

For each commit or PR in the window:

**3a. Classify the change.** Is this a bug fix, feature, refactor, or infrastructure change?

**3b. Check against ADRs.** Does this conflict with any accepted ADR? Cite the specific ADR number if so.

**3c. Check for duplication.** For any new file or new function, search the codebase for similar patterns that could have been reused:
- New service/module in `src/` — is there an existing one covering similar ground?
- New utility function — does a similar one exist elsewhere?
- New component — is there a base component that should have been extended?
- New type/interface — does a similar type exist?

Use the Grep and Glob tools (and `find`) to locate candidates; shell `grep` and `rg` are not in your tool list. Err on the side of flagging for human review rather than deciding unilaterally.

**3d. Check consistency.** Does new code follow:
- The existing error handling pattern (RFC 9457 Problem Details)?
- The existing logging pattern (structured JSON via the logger module)?
- The existing naming conventions (file names, function names)?
- The existing test structure and fixture patterns?

**3e. Check scope.** Did the change stay in scope, or did it sneak in other modifications? Look for:
- "Drive-by" refactors unrelated to the stated goal
- New dependencies added without a corresponding ADR
- Changes to config or infrastructure in a PR that's nominally about features

### Step 4: Produce the report

Output a markdown report with this structure:

```markdown
# Architecture Audit Report

**Window:** [date range]
**Commits reviewed:** [count]
**Auditor:** architecture-auditor (Opus)

## Summary

[2-3 sentences describing overall health of the change window]

## Findings

### Critical
[Things that should be fixed before next release. Each with file:line reference.]

### High
[Things that should be addressed in the next sprint]

### Medium  
[Things worth tracking but not urgent]

### Low / Informational
[Observations for context]

## ADR Compliance

[For each relevant ADR, state: complied / violated / ambiguous. Cite specific changes.]

## Patterns observed

[Any emerging patterns — good or bad — that weren't there last audit]

## Recommendations

[Concrete next steps: update ADRs, write new skill, tighten a rule, etc.]
```

## Rules for your analysis

**Severity-rate honestly.** Not everything is critical. Most findings will be medium or low. Critical findings are things that will cause incidents or significant rework if not fixed. Don't inflate severity to seem valuable.

**Cite specifics.** Every finding must include `file:line` references. "Error handling is inconsistent" is useless; "Error handling in src/api/users.ts:42 uses a bare `throw new Error('oops')` instead of the AppError pattern used in src/api/orders.ts:58" is actionable.

**Give benefit of the doubt.** If a pattern could be intentional (maybe there's context you don't have), flag it as "Question" not "Violation." The user can confirm.

**Don't propose fixes.** Your job is to identify issues, not resolve them. Fixes are a separate workflow. Keep the report to findings only.

**Don't read every file.** For a 7-day audit of an active repo, you'll encounter hundreds of files. Focus on:
- Files that are newly added (not modified — additions are where drift happens)
- Files in "sensitive" directories (auth, payments, migrations, config, infrastructure)
- Files with unusually large changes
- Files Claude touched that show up in `.claude/logs/sensitive.log` if available

If you must skip areas due to time, say so in the report.

## When done

Print the report to the conversation for the user to read. You are read-only and have no write tools, so ask the user (or the calling agent) to save it to `docs/audits/YYYY-MM-DD-audit.md` for the historical record.

Scripts

.claude/scripts/rotate-logs.sh

#!/usr/bin/env bash
# Rotate .claude/logs/ files weekly to keep them manageable.
# Run via cron or manually. Every run compresses the live logs into
# archive/<ISO year>-W<ISO week>/ and truncates them; archive directories
# older than 4 weeks are deleted.
#
# Safe to run more than once per week: later runs append to that week's
# archives instead of failing on an existing .gz file.

set -euo pipefail

LOG_DIR="${CLAUDE_PROJECT_DIR:-.}/.claude/logs"

if [ ! -d "$LOG_DIR" ]; then
  echo "No log directory at $LOG_DIR"
  exit 0
fi

cd "$LOG_DIR"

# %G is the ISO week-numbering year that belongs with %V. Using %Y mislabels
# the days around New Year (2024-12-30 is ISO week 2025-W01, not 2024-W01).
WEEK=$(date -u +%G-W%V)
ARCHIVE_DIR="archive/$WEEK"
mkdir -p "$ARCHIVE_DIR"

# Files to rotate
FILES=(bash.log bash.jsonl sensitive.log sudo.log session-summaries.md)

for file in "${FILES[@]}"; do
  if [ -f "$file" ] && [ -s "$file" ]; then
    # Append a new gzip member rather than cp + gzip: gzip refuses to overwrite
    # an existing .gz, which used to abort the script (set -e) on a second run
    # in the same week, before the live log was truncated. Concatenated gzip
    # members decompress as one continuous stream.
    gzip -c "$file" >> "$ARCHIVE_DIR/$file.gz"
    # Truncate the live log (only reached if compression succeeded)
    : > "$file"
    echo "Rotated $file -> $ARCHIVE_DIR/$file.gz"
  fi
done

# Clean up archives older than 4 weeks. -mindepth 1 keeps archive/ itself out
# of the candidate set; the cleanup stays best-effort so a failure here never
# fails the rotation that already happened.
find archive -mindepth 1 -maxdepth 1 -type d -mtime +28 -exec rm -rf {} + 2>/dev/null || true

echo "Log rotation complete."

.claude/scripts/weekly-review.sh

#!/usr/bin/env bash
# Weekly review helper — generates a single report from the past week's activity.
# Run every Friday (or whenever your review window is) before the weekly hygiene session.
#
# Output: .claude/logs/weekly-reviews/GGGG-WNN.md (ISO week-numbering year + ISO week)

set -euo pipefail

PROJECT_DIR="${CLAUDE_PROJECT_DIR:-.}"
LOG_DIR="$PROJECT_DIR/.claude/logs"
REVIEW_DIR="$LOG_DIR/weekly-reviews"
mkdir -p "$REVIEW_DIR"

# %G (not %Y) is the year that pairs with the ISO week number %V. With %Y, a
# report generated on 2024-12-30 would be named 2024-W01 and overwrite the
# report from January 2024.
WEEK=$(date -u +%G-W%V)
REPORT="$REVIEW_DIR/$WEEK.md"

# Start the report fresh; every later section appends to it.
cat > "$REPORT" <<EOF
# Weekly Review — $WEEK

Generated: $(date -u +"%Y-%m-%dT%H:%M:%SZ")

## Activity summary

EOF

# =============================================================
# Git activity
# =============================================================
# `rev-parse --verify HEAD` succeeds only inside a repository that has at least
# one commit. Unlike a `-d .git` test it also works in worktrees and submodules
# (where .git is a file), and it keeps `set -e` from aborting on an empty repo.
# `git -C` is used instead of `cd` so that a relative $PROJECT_DIR does not
# invalidate the relative $REPORT / $LOG_DIR paths used by later sections.
if git -C "$PROJECT_DIR" rev-parse --verify --quiet HEAD >/dev/null 2>&1; then
  echo "### Commits this week" >> "$REPORT"
  # rev-list --count prints a bare integer (no padding, unlike BSD `wc -l`).
  COMMITS_COUNT=$(git -C "$PROJECT_DIR" rev-list --count --since="7 days ago" HEAD)
  echo "**Total:** $COMMITS_COUNT commits" >> "$REPORT"
  echo "" >> "$REPORT"
  echo '```' >> "$REPORT"
  if [ "$COMMITS_COUNT" -gt 0 ]; then
    git -C "$PROJECT_DIR" log --since="7 days ago" --oneline >> "$REPORT" 2>&1
  else
    echo "(no commits)" >> "$REPORT"
  fi
  echo '```' >> "$REPORT"
  echo "" >> "$REPORT"

  echo "### Files most changed this week" >> "$REPORT"
  echo '```' >> "$REPORT"
  # --pretty=format: emits blank lines between commits; drop them so the top
  # entry is a file, not a count of empty lines. `awk 'NR <= 15'` reads all of
  # its input, so `sort` never gets SIGPIPE under pipefail the way `head` can.
  CHANGED_FILES=$(git -C "$PROJECT_DIR" log --since="7 days ago" --name-only --pretty=format: 2>/dev/null | \
    sed '/^$/d' | sort | uniq -c | sort -rn | awk 'NR <= 15' || true)
  if [ -n "$CHANGED_FILES" ]; then
    echo "$CHANGED_FILES" >> "$REPORT"
  else
    echo "(none)" >> "$REPORT"
  fi
  echo '```' >> "$REPORT"
  echo "" >> "$REPORT"
fi

# =============================================================
# Sensitive touches
# =============================================================
# Lists sensitive.log entries from the last 7 days. The first field of each
# line is expected to be a UTC timestamp in %Y-%m-%dT%H:%M:%SZ form; such
# timestamps sort lexicographically, so a plain string comparison is enough.
# This deliberately avoids building a `date` shell command from log content:
# the log can contain agent-influenced text, and interpolating it into a
# command line allowed command injection (and spawned one process per line).
echo "### Sensitive area touches" >> "$REPORT"
if [ -f "$LOG_DIR/sensitive.log" ]; then
  # GNU date first, BSD/macOS date as fallback.
  SINCE=$(date -u -d '7 days ago' +%Y-%m-%dT%H:%M:%SZ 2>/dev/null || date -u -v-7d +%Y-%m-%dT%H:%M:%SZ)

  # Lines whose first field is not timestamp-shaped are skipped. The `""`
  # concatenations force a string (not numeric) comparison in awk.
  RECENT=$(awk -v since="$SINCE" '
    $1 ~ /^[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]T/ && ($1 "") >= (since "")
  ' "$LOG_DIR/sensitive.log" 2>/dev/null || true)

  if [ -n "$RECENT" ]; then
    # awk prints a bare number; `wc -l` pads with spaces on BSD/macOS.
    COUNT=$(printf '%s\n' "$RECENT" | awk 'END { print NR }')
    echo "**Total:** $COUNT sensitive touches" >> "$REPORT"
    echo "" >> "$REPORT"
    echo '```' >> "$REPORT"
    echo "$RECENT" >> "$REPORT"
    echo '```' >> "$REPORT"
  else
    echo "None" >> "$REPORT"
  fi
else
  echo "(no sensitive log)" >> "$REPORT"
fi
echo "" >> "$REPORT"

# =============================================================
# Bash command volume
# =============================================================
# Summarises commands recorded in bash.jsonl (one JSON object per line with
# .timestamp and .command) over the last 7 days.
echo "### Claude bash activity" >> "$REPORT"
if [ -f "$LOG_DIR/bash.jsonl" ]; then
  # GNU date first, BSD/macOS date as fallback.
  SINCE=$(date -u -d '7 days ago' +%Y-%m-%dT%H:%M:%SZ 2>/dev/null || date -u -v-7d +%Y-%m-%dT%H:%M:%SZ)
  # Keep only the first line of each command so that a multi-line command
  # (heredoc, `\` continuation) counts once instead of once per line.
  WEEK_CMDS=$(jq -r --arg since "$SINCE" \
    'select(.timestamp > $since and .command != null) | .command | tostring | split("\n") | .[0] // ""' \
    "$LOG_DIR/bash.jsonl" 2>/dev/null || echo "")
  if [ -n "$WEEK_CMDS" ]; then
    # awk prints a bare number; `wc -l` pads with spaces on BSD/macOS.
    TOTAL=$(printf '%s\n' "$WEEK_CMDS" | awk 'END { print NR }')
    echo "**Total commands:** $TOTAL" >> "$REPORT"
    echo "" >> "$REPORT"
    echo "**Most common commands:**" >> "$REPORT"
    echo '```' >> "$REPORT"
    # First word of each command = the program that was run.
    printf '%s\n' "$WEEK_CMDS" | awk '{print $1}' | sort | uniq -c | sort -rn | awk 'NR <= 10' >> "$REPORT"
    echo '```' >> "$REPORT"
  else
    echo "(no commands logged)" >> "$REPORT"
  fi
else
  # Mirror the sensitive-log section so the heading is never left empty.
  echo "(no bash log)" >> "$REPORT"
fi
echo "" >> "$REPORT"

# =============================================================
# Blocked / dangerous attempts
# =============================================================
# Shows the tail of blocked.log. The entries are NOT filtered by date, so the
# heading says what is actually printed rather than claiming "this week".
if [ -f "$LOG_DIR/blocked.log" ]; then
  echo "### Blocked operations (last 20 entries)" >> "$REPORT"
  echo '```' >> "$REPORT"
  tail -n 20 "$LOG_DIR/blocked.log" >> "$REPORT"
  echo '```' >> "$REPORT"
  echo "" >> "$REPORT"
fi

# =============================================================
# Action items (manual)
# =============================================================
# Quoted heredoc delimiter: the checklist is written verbatim, no expansion.
cat >> "$REPORT" <<'EOF'
## Review checklist

- [ ] Any sensitive area touches that warrant a security review?
- [ ] Any recurring patterns in commands that should be allowed permanently?
- [ ] Any recurring patterns in blocked attempts that indicate a rule needs tightening?
- [ ] Any learnings from session summaries to promote to CLAUDE.md or skills?
- [ ] CLAUDE.md still under 200 lines?
- [ ] Any skills that haven't activated in a month — retire them?
- [ ] Run the standardization test on a representative task?

## Notes

<!-- Add your notes from the weekly hygiene session here -->

EOF

echo "Weekly review generated: $REPORT"
echo ""
echo "Review the report, add notes in the 'Notes' section,"
echo "and convert any learnings into rule updates."

# Open it if we can. Opening is a convenience only: on a headless machine
# (cron, SSH, CI) the opener exits non-zero, which must not turn a successfully
# generated report into a failed run under `set -e`.
if command -v open >/dev/null 2>&1; then
  open "$REPORT" || true
elif command -v xdg-open >/dev/null 2>&1; then
  xdg-open "$REPORT" || true
fi

CI checks

scripts/ci/checks/no-direct-env-access.sh

#!/usr/bin/env bash
# Check: no direct process.env or os.environ access outside the config module.
# All env var reads must go through the central config, which validates and types them.
#
# This catches a common Claude failure mode: it knows process.env works and uses it
# directly rather than extending the config module, producing config that's scattered
# across the codebase and hard to audit.
#
# Exit status: 0 when clean, 1 when at least one file violates the rule.

set -euo pipefail

VIOLATIONS=0

# report_if_match LABEL PATTERN FILE
# Prints up to five matching lines of FILE and counts the file as one violation.
report_if_match() {
  local label=$1 pattern=$2 file=$3 hits
  hits=$(grep -nE -- "$pattern" "$file" 2>/dev/null || true)
  if [ -n "$hits" ]; then
    echo "❌ Direct $label access in $file:"
    # awk reads all of its input, so the producer never gets SIGPIPE under
    # pipefail (which `head -5` can cause on files with many matches).
    printf '%s\n' "$hits" | awk 'NR <= 5'
    VIOLATIONS=$((VIOLATIONS + 1))
  fi
}

# Any use of the process.env object: process.env.FOO, process.env["FOO"],
# `const { FOO } = process.env`, passing process.env around. The trailing
# class keeps identifiers such as `process.environment` from matching.
JS_PATTERN='process\.env([^A-Za-z0-9_$]|$)'

# os.environ / os.getenv, plus `from os import environ` / `getenv`.
PY_PATTERN='os\.environ|os\.getenv|from[[:space:]]+os[[:space:]]+import[[:space:]].*(environ|getenv)'

# Check TypeScript/JavaScript files
while IFS= read -r file; do
  # Allow the config module itself
  if [[ "$file" == *"src/config/"* ]]; then
    continue
  fi
  report_if_match "process.env" "$JS_PATTERN" "$file"
done < <(find src -type f \( -name "*.ts" -o -name "*.tsx" -o -name "*.js" -o -name "*.jsx" \) 2>/dev/null || true)

# Check Python files
while IFS= read -r file; do
  # Allow files under a directory named exactly `config`. Requiring the leading
  # slash stops directories like `reconfig/` or `myconfig/` from being exempted.
  if [[ "$file" == */config/* ]]; then
    continue
  fi
  report_if_match "os.environ" "$PY_PATTERN" "$file"
done < <(find . -type f -name "*.py" -not -path "./.venv/*" -not -path "./node_modules/*" 2>/dev/null || true)

if [ "$VIOLATIONS" -gt 0 ]; then
  echo ""
  echo "Found $VIOLATIONS files with direct env var access."
  echo "All env var reads must go through the config module. See docs/decisions/ADR-XXXX-config.md"
  exit 1
fi

echo "✓ No direct env var access outside config module"

scripts/ci/checks/migrations-reversible.sh

#!/usr/bin/env bash
# Check: every database migration must have both an `up` and a `down` section.
# This is a common Claude failure mode — it writes forward-only migrations which
# can't be rolled back, violating the ADR that requires reversibility.
#
# Environment:
#   MIGRATIONS_DIR  directory to scan (default: db/migrations)
# Exit status: 0 when clean or when the directory is absent, 1 on any violation.

set -euo pipefail

VIOLATIONS=0
MIGRATIONS_DIR="${MIGRATIONS_DIR:-db/migrations}"

if [ ! -d "$MIGRATIONS_DIR" ]; then
  echo "⊘ No migrations directory at $MIGRATIONS_DIR (skipping)"
  exit 0
fi

# Accepted markers, by framework:
#   generic SQL:        -- +up / -- +down
#   dbmate:             -- migrate:up / -- migrate:down
#   goose:              -- +goose Up / -- +goose Down
#   sql-migrate:        -- +migrate Up / -- +migrate Down
#   Knex:               exports.up / exports.down, function up / function down
#   TypeORM:            async up(...) / async down(...) class methods
#   Alembic (Python):   def upgrade / def downgrade
UP_PATTERN='(-- \+up|-- migrate:up|-- \+goose Up|-- \+migrate Up|exports\.up|function up|async up[[:space:]]*\(|def upgrade)'
DOWN_PATTERN='(-- \+down|-- migrate:down|-- \+goose Down|-- \+migrate Down|exports\.down|function down|async down[[:space:]]*\(|def downgrade)'

while IFS= read -r file; do
  HAS_UP=0
  HAS_DOWN=0

  if grep -qE -- "$UP_PATTERN" "$file"; then
    HAS_UP=1
  fi

  if grep -qE -- "$DOWN_PATTERN" "$file"; then
    HAS_DOWN=1
  fi

  if [ "$HAS_UP" -eq 0 ] || [ "$HAS_DOWN" -eq 0 ]; then
    echo "❌ Migration missing up/down: $file"
    if [ "$HAS_UP" -eq 0 ]; then
      echo "   Missing: up/upgrade section"
    fi
    if [ "$HAS_DOWN" -eq 0 ]; then
      echo "   Missing: down/downgrade section"
    fi
    VIOLATIONS=$((VIOLATIONS + 1))
  fi

  # Additional check: destructive operations need an explicit approval comment.
  # NOTE(review): this scans the whole file, so the DROP TABLE in the `down`
  # half of an ordinary CREATE TABLE migration also requires the marker.
  if grep -qiE "(DROP TABLE|DROP COLUMN|TRUNCATE|DELETE FROM.*WHERE 1=1)" "$file"; then
    if ! grep -qE "DESTRUCTIVE-APPROVED:" "$file"; then
      echo "⚠️  Destructive migration without approval marker: $file"
      echo "   Destructive migrations (DROP, TRUNCATE) must include the comment:"
      echo "   -- DESTRUCTIVE-APPROVED: <reviewer username> <date>"
      VIOLATIONS=$((VIOLATIONS + 1))
    fi
  fi
done < <(find "$MIGRATIONS_DIR" -type f \( -name "*.sql" -o -name "*.ts" -o -name "*.js" -o -name "*.py" \))

if [ "$VIOLATIONS" -gt 0 ]; then
  exit 1
fi

echo "✓ All migrations are reversible"

scripts/ci/checks/routes-have-auth.sh

#!/usr/bin/env bash
# Check: every API route file must either declare an auth requirement
# or explicitly opt out with a comment.
#
# Catches the common failure where Claude adds a new endpoint and forgets
# to apply authentication middleware, silently creating a public endpoint
# that exposes data.
#
# Environment:
#   ROUTES_DIR  directory to scan (default: src/api/routes)
# Exit status: 0 when clean or when the directory is absent, 1 on any violation.

set -euo pipefail

VIOLATIONS=0
ROUTES_DIR="${ROUTES_DIR:-src/api/routes}"

if [ ! -d "$ROUTES_DIR" ]; then
  echo "⊘ No routes directory at $ROUTES_DIR (skipping)"
  exit 0
fi

# Require one of:
#   requireAuth, requireRole, authenticated(), @Auth decorator, @UseGuards,
#   or an explicit "// PUBLIC-ROUTE:" comment with justification.
# `authenticated` must not be preceded by a letter, otherwise the word
# "unauthenticated" (e.g. in a comment) would satisfy the check.
AUTH_PATTERN='(requireAuth|requireRole|(^|[^[:alpha:]])authenticated|@Auth|@UseGuards|// PUBLIC-ROUTE:)'

while IFS= read -r file; do
  # Skip route index files, type definitions and tests. Matching on the file
  # name (not the whole path) covers every scanned extension and avoids
  # skipping real route files such as `reindex.ts`.
  case "${file##*/}" in
    index.ts|index.tsx|index.js|*.d.ts|*.test.*|*.spec.*)
      continue
      ;;
  esac

  if ! grep -qE -- "$AUTH_PATTERN" "$file"; then
    echo "❌ Route file has no auth declaration: $file"
    echo "   Every route must either use auth middleware or include an explicit"
    echo "   '// PUBLIC-ROUTE: <reason>' comment explaining why it's public."
    VIOLATIONS=$((VIOLATIONS + 1))
  fi
done < <(find "$ROUTES_DIR" -type f \( -name "*.ts" -o -name "*.tsx" -o -name "*.js" \))

if [ "$VIOLATIONS" -gt 0 ]; then
  exit 1
fi

echo "✓ All routes declare auth"

scripts/ci/checks/no-silent-errors.sh

#!/usr/bin/env bash
# Check: no silent catch/except blocks.
# Claude sometimes writes error handlers that swallow errors silently,
# producing code that "works" but hides real failures from observability.
#
# Needs a multi-line PCRE matcher: pcre2grep, pcregrep, or a grep with -P.
# Exit status: 0 when clean, 1 on violations, 2 when no matcher is available
# (so a missing tool can never make the check pass without checking anything).

set -euo pipefail

VIOLATIONS=0

# Pick the matcher once, up front.
if command -v pcre2grep >/dev/null 2>&1; then
  PCRE_TOOL=pcre2grep
elif command -v pcregrep >/dev/null 2>&1; then
  PCRE_TOOL=pcregrep
elif printf 'x\n' | grep -qP 'x' 2>/dev/null; then
  PCRE_TOOL=grep
else
  echo "❌ Cannot run check: need pcre2grep, pcregrep, or a grep that supports -P" >&2
  exit 2
fi

# multiline_matches PATTERN FILE
# Prints every match of the PCRE PATTERN in FILE, letting the pattern span
# lines. Prints nothing (and still succeeds) when there is no match.
multiline_matches() {
  local pattern=$1 file=$2
  if [ "$PCRE_TOOL" = grep ]; then
    # -z treats the whole file as one record so \s and \n can cross lines;
    # matches come back NUL-terminated, hence the tr.
    grep -aPzo "$pattern" "$file" 2>/dev/null | tr '\0' '\n' || true
  else
    "$PCRE_TOOL" -M -n "$pattern" "$file" 2>/dev/null || true
  fi
}

# File lists. Dependency and build directories are excluded at any depth so
# monorepo layouts (apps/*/node_modules) are covered, and identically for
# every pass.
js_files() {
  find . -type f \( -name "*.ts" -o -name "*.tsx" -o -name "*.js" \) \
    -not -path "*/node_modules/*" -not -path "*/dist/*" 2>/dev/null || true
}

py_files() {
  find . -type f -name "*.py" \
    -not -path "*/.venv/*" -not -path "*/node_modules/*" 2>/dev/null || true
}

# Pattern 1: empty catch blocks in JS/TS
#   catch (e) { }
#   catch {}
while IFS= read -r file; do
  MATCHES=$(multiline_matches '(catch\s*\([^)]*\)|catch)\s*\{\s*\}' "$file")
  if [ -n "$MATCHES" ]; then
    echo "❌ Empty catch block in $file:"
    echo "$MATCHES"
    VIOLATIONS=$((VIOLATIONS + 1))
  fi
done < <(js_files)

# Pattern 2: bare except: pass in Python
while IFS= read -r file; do
  MATCHES=$(multiline_matches 'except[^:]*:\s*\n\s*pass' "$file")
  if [ -n "$MATCHES" ]; then
    echo "❌ except: pass in $file:"
    echo "$MATCHES"
    VIOLATIONS=$((VIOLATIONS + 1))
  fi
done < <(py_files)

# Pattern 3: catch that only console.logs and continues (a softer form of swallowing)
while IFS= read -r file; do
  # Look for catch blocks whose only statement is a console.log/error/warn
  MATCHES=$(multiline_matches '(?s)catch\s*\([^)]*\)\s*\{\s*console\.(log|error|warn)\([^)]*\);?\s*\}' "$file")
  if [ -n "$MATCHES" ]; then
    echo "⚠️  Catch block only logs in $file (use proper error handling or rethrow)"
    VIOLATIONS=$((VIOLATIONS + 1))
  fi
done < <(js_files)

if [ "$VIOLATIONS" -gt 0 ]; then
  echo ""
  echo "Found $VIOLATIONS silent error swallow(s). Either handle the error properly,"
  echo "log through the structured logger AND rethrow, or document why it's safe"
  echo "to ignore with a comment like: '// SAFE-TO-IGNORE: <reason>'"
  exit 1
fi

echo "✓ No silent error swallowing detected"

scripts/ci/checks/openapi-matches-routes.sh

#!/usr/bin/env bash
# Check: every route file has a corresponding OpenAPI spec entry.
# Catches the common failure where Claude adds endpoints but forgets to document them.
#
# Environment:
#   ROUTES_DIR    directory holding route source files (default: src/api/routes)
#   OPENAPI_FILE  OpenAPI document to compare against (default: openapi.yaml)

set -euo pipefail

VIOLATIONS=0
ROUTES_DIR="${ROUTES_DIR:-src/api/routes}"
OPENAPI_FILE="${OPENAPI_FILE:-openapi.yaml}"

if [ ! -d "$ROUTES_DIR" ]; then
  echo "⊘ No routes directory at $ROUTES_DIR (skipping)"
  exit 0
fi

if [ ! -f "$OPENAPI_FILE" ]; then
  echo "⊘ No OpenAPI file at $OPENAPI_FILE (skipping)"
  exit 0
fi

# Extract route paths declared in code
# This is a rough heuristic; customize for your framework's patterns
# Example patterns matched:
#   app.get("/users/:id"
#   router.post('/orders'
#   @Get('/items/:id')
#
# grep -o emits one line per route declaration (e.g. `app.get("/users/:id"`);
# sed then keeps only the quoted path. Every declaration is kept — a `head -1`
# here would reduce the whole check to the first route found.
CODE_ROUTES=$(grep -rhoE "((app|router|fastify)\.(get|post|put|patch|delete)|@(Get|Post|Put|Patch|Delete))[[:space:]]*\([[:space:]]*['\"][^'\"]+['\"]" "$ROUTES_DIR" 2>/dev/null | \
  sed -E "s/^[^'\"]*['\"]([^'\"]+)['\"]\$/\1/" | sort -u || true)

# Extract paths declared in OpenAPI
if command -v yq >/dev/null 2>&1; then
  SPEC_PATHS=$(yq eval '.paths | keys | .[]' "$OPENAPI_FILE" 2>/dev/null || echo "")
else
  # Fallback: grep for path-like entries
  SPEC_PATHS=$(grep -E "^\s+/" "$OPENAPI_FILE" | sed 's/://' | tr -d ' ' || echo "")
fi

# Normalize route params: /:id -> /{id}
# Converts Express-style `:name` path parameters in $1 to OpenAPI `{name}`
# form and prints the result. A parameter name is a letter or underscore
# followed by letters, digits or underscores, so `/:id2` becomes `/{id2}`
# (not `/{id}2`).
normalize() {
  printf '%s\n' "$1" | sed -E 's|/:([A-Za-z_][A-Za-z0-9_]*)|/{\1}|g'
}

# Check each code route has a spec entry.
# A route counts as documented when some spec path ENDS WITH the normalized
# route. That still tolerates a mount prefix (/api/v1 + /users), but unlike a
# substring search it no longer lets `/users` pass just because
# `/users/{id}` is documented.
MISSING=()
while IFS= read -r route; do
  if [ -z "$route" ]; then
    continue
  fi
  NORMALIZED=$(normalize "$route")
  FOUND=0
  # Here-strings instead of `echo | grep -q`: no pipeline means no spurious
  # pipefail failure when the reader stops early.
  while IFS= read -r spec_path; do
    if [[ -n "$spec_path" && "$spec_path" == *"$NORMALIZED" ]]; then
      FOUND=1
      break
    fi
  done <<< "$SPEC_PATHS"
  if [ "$FOUND" -eq 0 ]; then
    MISSING+=("$route")
  fi
done <<< "$CODE_ROUTES"

if [ ${#MISSING[@]} -gt 0 ]; then
  echo "❌ Routes missing from $OPENAPI_FILE:"
  printf '  - %s\n' "${MISSING[@]}"
  echo ""
  echo "Every route must be documented in the OpenAPI spec."
  exit 1
fi

echo "✓ All routes documented in OpenAPI spec"

GitHub Actions

.github/workflows/agent-gauntlet.yml

name: Agent Output Gauntlet

# This workflow runs on every PR. It's designed specifically to catch issues
# in AI-generated code that a human reviewer can't reliably catch by reading.
# Each job focuses on a different category of detection.
#
# The principle: fail fast and loudly. A green PR should give you high confidence
# the code is at least mechanically sound, so you can focus review on judgment calls.

on:
  pull_request:
    branches: [main, staging]
  push:
    branches: [main]

# Prevent concurrent runs on the same PR from wasting CI minutes.
# Runs are grouped per ref, so pushes to main share a group as well; those are
# never cancelled, so every commit that lands on main gets a complete result.
concurrency:
  group: gauntlet-${{ github.ref }}
  cancel-in-progress: ${{ github.event_name == 'pull_request' }}

jobs:
  # ============================================================
  # PR SIZE GATE — reject oversized PRs before running anything else
  # ============================================================
  size-gate:
    name: "Gate: PR Size"
    runs-on: ubuntu-latest
    if: github.event_name == 'pull_request'
    steps:
      # Full history is required to compute the merge base for the diff below.
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - name: Check PR diff size
        # A step may declare `env` only once; a second `env:` key on the same
        # step is a duplicate mapping key and makes the workflow file invalid.
        env:
          MAX_LINES: 600
          MAX_FILES: 40
          GH_TOKEN: ${{ github.token }}
          BASE_SHA: ${{ github.event.pull_request.base.sha }}
          HEAD_SHA: ${{ github.event.pull_request.head.sha }}
          PR_NUMBER: ${{ github.event.pull_request.number }}
        run: |
          # Three-dot diff = changes on the PR branch since it forked from base.
          # A two-dot diff would also count commits that landed on base after
          # the fork, inflating the size of any PR that is behind its base.

          # Count lines changed (excluding lockfiles, generated files, fixtures).
          # `sum+0` prints 0 instead of an empty string when nothing matches.
          LINES=$(git diff --numstat "$BASE_SHA...$HEAD_SHA" -- \
            ':!*.lock' \
            ':!*.lockb' \
            ':!package-lock.json' \
            ':!yarn.lock' \
            ':!pnpm-lock.yaml' \
            ':!poetry.lock' \
            ':!go.sum' \
            ':!Cargo.lock' \
            ':!**/fixtures/**' \
            ':!**/__generated__/**' \
            ':!**/*.generated.*' \
            | awk '{sum+=$1+$2} END {print sum+0}')

          FILES=$(git diff --name-only "$BASE_SHA...$HEAD_SHA" | wc -l)

          echo "Lines changed: $LINES (max $MAX_LINES)"
          echo "Files changed: $FILES (max $MAX_FILES)"

          if [ "$LINES" -gt "$MAX_LINES" ]; then
            echo "::error::PR too large ($LINES lines > $MAX_LINES). Split into smaller PRs."
            echo "Large PRs are impossible to review carefully. If this is genuinely a single logical change, add the label 'large-pr-approved' after getting sign-off."
            # Don't fail if the override label is present (-x: exact label name only)
            if ! gh pr view "$PR_NUMBER" --json labels -q '.labels[].name' | grep -qx 'large-pr-approved'; then
              exit 1
            fi
          fi

          if [ "$FILES" -gt "$MAX_FILES" ]; then
            echo "::error::PR touches too many files ($FILES > $MAX_FILES). Split into smaller PRs."
            exit 1
          fi

  # ============================================================
  # PR DESCRIPTION CHECK — ensure the PR includes the required sections
  # ============================================================
  pr-description:
    name: "Gate: PR Description"
    runs-on: ubuntu-latest
    if: github.event_name == 'pull_request'
    steps:
      - name: Verify PR description has required sections
        env:
          # Passed via the environment rather than interpolated into the script,
          # so text in the PR body cannot be executed as shell.
          PR_BODY: ${{ github.event.pull_request.body }}
        run: |
          REQUIRED=("What changed" "Why" "Reused" "New" "Risk areas")
          MISSING=()

          for section in "${REQUIRED[@]}"; do
            # Here-string instead of `echo "$PR_BODY"`: echo treats a body that
            # starts with -n or -e as options. -F matches the title literally;
            # -i keeps the match case-insensitive.
            if ! grep -qiF -- "$section" <<< "$PR_BODY"; then
              MISSING+=("$section")
            fi
          done

          if [ ${#MISSING[@]} -gt 0 ]; then
            echo "::error::PR description missing required sections: ${MISSING[*]}"
            echo "Every PR must include: What changed, Why, Reused (existing code extended), New (anything new introduced), Risk areas (what needs manual review)."
            exit 1
          fi

  # ============================================================
  # LINTING — style, formatting, dead code
  # ============================================================
  # Runs the JS/TS and Python linters and formatters in check-only mode.
  # The job has no `if:`, so it runs for both pull_request and push events.
  lint:
    name: "Check: Lint & Format"
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      # Configure these based on your actual stack. Examples shown for JS/TS + Python.
      - name: Setup Node
        uses: actions/setup-node@v4
        with:
          node-version: "22"
          cache: "npm"

      - name: Install JS deps
        run: npm ci

      # `npm run lint` is whatever the project's package.json defines;
      # prettier runs with --check, so it reports differences without rewriting files.
      - name: Lint JS/TS
        run: |
          npm run lint
          npx prettier --check "**/*.{ts,tsx,js,jsx,json,md}"

      - name: Setup Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.12"

      # ruff is installed unpinned, so the version used is whatever pip resolves at run time.
      - name: Lint Python
        run: |
          pip install ruff
          ruff check .
          ruff format --check .

  # ============================================================
  # TYPE CHECK — catch code that "looks right" but doesn't type
  # ============================================================
  # Type-checks TypeScript (blocking) and Python (non-blocking).
  typecheck:
    name: "Check: Types"
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-node@v4
        with:
          node-version: "22"
          cache: "npm"
      - run: npm ci
      # --noEmit: type-check only, produce no build output.
      - run: npx tsc --noEmit

      # Python type check (if applicable)
      - uses: actions/setup-python@v5
        with:
          python-version: "3.12"
      - run: pip install mypy
      # The `|| echo` makes this step succeed even when mypy reports errors;
      # findings surface as a workflow warning only.
      - run: mypy . --ignore-missing-imports || echo "::warning::mypy has findings (non-blocking)"

  # ============================================================
  # TESTS — full suite, with coverage delta check
  # ============================================================
  test:
    name: "Check: Tests & Coverage"
    runs-on: ubuntu-latest
    steps:
      # Full history/branches so the base branch's baseline file can be read.
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0
      - uses: actions/setup-node@v4
        with:
          node-version: "22"
          cache: "npm"
      - run: npm ci

      - name: Run tests with coverage
        run: npm test -- --coverage --coverageReporters=json-summary

      - name: Check coverage didn't drop
        if: github.event_name == 'pull_request'
        run: |
          CURRENT=$(jq '.total.lines.pct' coverage/coverage-summary.json)
          echo "Current coverage: $CURRENT%"

          # Read the baseline from the BASE branch, not from the checked-out
          # tree: otherwise a PR could lower docs/coverage-baseline.json and
          # pass its own gate. Skipped when the base branch has no baseline.
          if git show "origin/${GITHUB_BASE_REF}:docs/coverage-baseline.json" > /tmp/coverage-baseline.json 2>/dev/null; then
            BASELINE=$(jq '.total.lines.pct' /tmp/coverage-baseline.json)
            echo "Baseline coverage: $BASELINE%"

            # Allow up to 1% drop (noise tolerance). awk does the floating-point
            # comparison, so the step does not depend on `bc` being installed.
            if awk -v b="$BASELINE" -v c="$CURRENT" 'BEGIN { exit !((b - c) > 1) }'; then
              echo "::error::Coverage dropped by more than 1% ($BASELINE% -> $CURRENT%). Add tests for new code."
              exit 1
            fi
          fi

      - name: Check test suite runtime didn't balloon
        run: |
          # Re-runs the suite without coverage purely to time it; failures are
          # ignored here because the coverage step above already gates on them.
          START=$(date +%s)
          npm test --silent > /dev/null 2>&1 || true
          END=$(date +%s)
          DURATION=$((END - START))

          echo "Test suite runtime: ${DURATION}s"

          # Baseline is a whole number of seconds stored in the repo.
          if [ -f docs/test-runtime-baseline.txt ]; then
            BASELINE=$(cat docs/test-runtime-baseline.txt)
            # Fail if runtime more than 2x baseline
            if [ $((DURATION)) -gt $((BASELINE * 2)) ]; then
              echo "::error::Test suite runtime ${DURATION}s is >2x baseline ${BASELINE}s. Claude may have added slow tests or broken parallelization."
              exit 1
            fi
          fi

  # ============================================================
  # SECURITY — static analysis for common vulns
  # ============================================================
  security:
    name: "Check: Security (SAST)"
    runs-on: ubuntu-latest
    permissions:
      security-events: write
      actions: read
      contents: read
    steps:
      # Gitleaks scans the commits of the push/PR, so it needs real history;
      # the default shallow clone (depth 1) does not contain those commits.
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0

      # Semgrep — fast, broad, lots of rules for common vulnerabilities
      - name: Semgrep
        uses: semgrep/semgrep-action@v1
        with:
          config: |
            p/security-audit
            p/secrets
            p/owasp-top-ten
            p/javascript
            p/typescript
            p/python

      # Dedicated secret scanning
      - name: Gitleaks
        uses: gitleaks/gitleaks-action@v2
        env:
          GITHUB_TOKEN: ${{ github.token }}

      # CodeQL — deeper analysis, catches things Semgrep misses
      - name: Initialize CodeQL
        uses: github/codeql-action/init@v3
        with:
          languages: javascript,python
      - name: CodeQL Analysis
        uses: github/codeql-action/analyze@v3

  # ============================================================
  # DEPENDENCY REVIEW — flag new deps with known vulns or licensing issues
  # ============================================================
  dependency-review:
    name: "Check: Dependency Review"
    runs-on: ubuntu-latest
    if: github.event_name == 'pull_request'
    steps:
      # Full history so the base commit's package.json can be read below. With
      # the default shallow clone the base is unavailable, the fallback `{}` is
      # used, and every PR appears to add all of its dependencies at once.
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - name: Dependency Review
        uses: actions/dependency-review-action@v4
        with:
          fail-on-severity: high
          # Fail if new dependencies add GPL/AGPL licenses (adjust for your policy)
          deny-licenses: AGPL-3.0, GPL-3.0

      - name: Check dependency count delta
        env:
          # Compare against the PR's actual base (main or staging), not a hard-coded branch.
          BASE_SHA: ${{ github.event.pull_request.base.sha }}
        run: |
          # Each side is parenthesised because `|` binds looser than `+` in jq:
          # `.dependencies | length + (.devDependencies | length)` evaluates
          # .devDependencies INSIDE .dependencies and always adds 0.
          # `// {}` makes a missing section count as zero.
          COUNT_DEPS='((.dependencies // {}) | length) + ((.devDependencies // {}) | length)'

          # Count current deps
          CURRENT=$(jq "$COUNT_DEPS" package.json)
          # Count deps on the base commit (treat a missing package.json as empty)
          git show "$BASE_SHA:package.json" > /tmp/base-package.json 2>/dev/null || echo "{}" > /tmp/base-package.json
          BASE=$(jq "$COUNT_DEPS" /tmp/base-package.json)

          DELTA=$((CURRENT - BASE))
          echo "Dependency count: $BASE -> $CURRENT (delta: $DELTA)"

          # Flag any PR that adds more than 5 deps at once
          if [ "$DELTA" -gt 5 ]; then
            echo "::error::This PR adds $DELTA dependencies. Claude may be pulling in packages unnecessarily. Justify each new dependency in the PR description or split into smaller PRs."
            exit 1
          fi

  # ============================================================
  # COMPLEXITY — flag functions that got too complex
  # ============================================================
  # Advisory job: complexity findings are surfaced as warnings and never fail the build.
  complexity:
    name: "Check: Complexity"
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-node@v4
        with:
          node-version: "22"

      - name: Check JS/TS complexity
        run: |
          # NOTE(review): confirm the `complexity-report-json` package and its
          # output shape; if the tool fails, `|| true` plus the jq fallback
          # below make this step report zero findings.
          npx --yes complexity-report-json --format json 'src/**/*.{ts,tsx,js,jsx}' > /tmp/complexity.json || true

          # Warn if any function has cyclomatic complexity > 15 or length > 100 lines
          HIGH_COMPLEXITY=$(jq '[.reports[].functions[] | select(.cyclomatic > 15 or .sloc.logical > 100)] | length' /tmp/complexity.json 2>/dev/null || echo "0")

          if [ "$HIGH_COMPLEXITY" -gt "0" ]; then
            echo "::warning::$HIGH_COMPLEXITY functions exceed complexity thresholds. Consider breaking them up."
            jq '.reports[].functions[] | select(.cyclomatic > 15 or .sloc.logical > 100) | {file: .file, name: .name, complexity: .cyclomatic, lines: .sloc.logical}' /tmp/complexity.json
          fi

      - uses: actions/setup-python@v5
        with:
          python-version: "3.12"

      - name: Check Python complexity
        run: |
          pip install radon
          # `-n C` lists only blocks ranked C or worse. The warning is keyed on
          # the OUTPUT being non-empty, not on the exit status, so it is raised
          # whenever radon actually lists something.
          FINDINGS=$(radon cc -s -n C . || true)
          if [ -n "$FINDINGS" ]; then
            echo "$FINDINGS"
            echo "::warning::Python complexity findings"
          fi

  # ============================================================
  # CUSTOM CHECKS — repo-specific invariants Claude keeps violating
  # ============================================================
  custom-checks:
    name: "Check: Repo-Specific Invariants"
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - name: Run repo-specific checks
        run: |
          # Execute every script under scripts/ci/checks/. A script signals a
          # violation with a non-zero exit; all scripts run before the job's
          # overall status is decided, so one failure doesn't hide the others.
          set +e
          status=0
          for check in scripts/ci/checks/*.sh; do
            [ -f "$check" ] || continue
            echo "=== Running $check ==="
            if ! bash "$check"; then
              echo "::error::Check failed: $check"
              status=1
            fi
          done
          exit $status

  # ============================================================
  # TODO / DEBT GUARD — no new TODOs or disabled tests
  # ============================================================
  debt-guard:
    name: "Check: No New Debt"
    runs-on: ubuntu-latest
    if: github.event_name == 'pull_request'
    steps:
      # Full history is required so the merge base of base and head is available.
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - name: Check for new TODOs
        env:
          BASE_SHA: ${{ github.event.pull_request.base.sha }}
          HEAD_SHA: ${{ github.event.pull_request.head.sha }}
        run: |
          # Find lines added in this PR containing TODO/FIXME/XXX/HACK markers.
          # The three-dot range diffs HEAD against the merge base, so only changes
          # made on the PR branch count. A two-dot range compares against the base
          # tip and reports markers that were removed on the base branch after the
          # PR branched as if the PR had added them.
          # "|| true" keeps the step alive when grep finds nothing (exit status 1).
          NEW_TODOS=$(git diff --unified=0 "$BASE_SHA...$HEAD_SHA" | \
            grep -E "^\+" | \
            grep -E "(TODO|FIXME|XXX|HACK):" | \
            grep -v "^+++" || true)

          if [ -n "$NEW_TODOS" ]; then
            echo "::error::This PR adds TODO/FIXME/XXX/HACK markers. Either fix now or file issues."
            echo "$NEW_TODOS"
            exit 1
          fi

      - name: Check for disabled tests
        env:
          BASE_SHA: ${{ github.event.pull_request.base.sha }}
          HEAD_SHA: ${{ github.event.pull_request.head.sha }}
        run: |
          # Find added lines that skip or focus tests (JS .skip/.only/xit/xdescribe,
          # pytest skip marker, Rust #[ignore]). Same merge-base diff as above so that
          # only changes introduced by the PR branch are inspected.
          DISABLED=$(git diff --unified=0 "$BASE_SHA...$HEAD_SHA" | \
            grep -E "^\+" | \
            grep -E "(\.skip|\.only|xit\(|xdescribe\(|@pytest\.mark\.skip|#\[ignore\])" \
            | grep -v "^+++" || true)

          if [ -n "$DISABLED" ]; then
            echo "::error::This PR disables or focuses tests. This is not allowed."
            echo "$DISABLED"
            exit 1
          fi

  # ============================================================
  # SESSION SUMMARY — generate a readable summary of what Claude did
  # ============================================================
  session-summary:
    name: "Report: Change Summary"
    runs-on: ubuntu-latest
    if: github.event_name == 'pull_request'
    # Declaring job permissions sets every unlisted scope to "none", so the read
    # access that actions/checkout needs must be listed explicitly.
    permissions:
      contents: read
      pull-requests: write
    steps:
      # Full history is required so the merge base of base and head is available.
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - name: Generate change summary
        env:
          BASE_SHA: ${{ github.event.pull_request.base.sha }}
          HEAD_SHA: ${{ github.event.pull_request.head.sha }}
        run: |
          # Three-dot range: diff HEAD against the merge base, i.e. only what the
          # PR branch itself changed.
          RANGE="$BASE_SHA...$HEAD_SHA"

          # Reads file names on stdin and prints them as a markdown bullet list, or
          # "None" when stdin is empty. (A trailing `|| echo "None"` after a pipeline
          # ending in sed never fires, because sed succeeds on empty input.)
          bullet_list_or_none() {
            local items
            items=$(cat)
            if [ -n "$items" ]; then
              echo "$items" | sed 's/^/- /'
            else
              echo "None"
            fi
          }

          echo "## Change Summary" > /tmp/summary.md
          echo "" >> /tmp/summary.md

          echo "### New files" >> /tmp/summary.md
          git diff --name-only --diff-filter=A "$RANGE" | bullet_list_or_none >> /tmp/summary.md

          echo "" >> /tmp/summary.md
          echo "### Deleted files" >> /tmp/summary.md
          git diff --name-only --diff-filter=D "$RANGE" | bullet_list_or_none >> /tmp/summary.md

          echo "" >> /tmp/summary.md
          echo "### Modified files (top 20 by lines changed)" >> /tmp/summary.md
          # numstat emits "added<TAB>deleted<TAB>path" ("-" for binary files).
          # Prefix each row with added+deleted so the sort really is by total lines
          # changed, and split on tabs so paths containing spaces stay intact.
          git diff --numstat "$RANGE" \
            | awk -F'\t' '{ a = ($1 == "-") ? 0 : $1; d = ($2 == "-") ? 0 : $2; printf "%d\t%s\t%s\t%s\n", a + d, $1, $2, $3 }' \
            | sort -rn \
            | head -20 \
            | awk -F'\t' '{printf "- `%s` (+%s -%s)\n", $4, $2, $3}' >> /tmp/summary.md

          echo "" >> /tmp/summary.md
          echo "### Sensitive area touches" >> /tmp/summary.md
          # "|| true" keeps the step alive when grep finds nothing (exit status 1).
          SENSITIVE=$(git diff --name-only "$RANGE" | grep -E "(auth|payment|permission|migration|\.github|infrastructure|\.env)" || true)
          if [ -n "$SENSITIVE" ]; then
            echo "⚠️  **This PR touches sensitive areas:**" >> /tmp/summary.md
            echo "$SENSITIVE" | sed 's/^/- /' >> /tmp/summary.md
          else
            echo "None" >> /tmp/summary.md
          fi

          # Echo the summary into the job log as well
          cat /tmp/summary.md

      - name: Post summary as PR comment
        uses: actions/github-script@v7
        with:
          script: |
            const fs = require('fs');
            const summary = fs.readFileSync('/tmp/summary.md', 'utf8');

            // Find existing summary comment and update it, rather than spamming new ones.
            // Paginate: a single listComments call returns only the first page, so on a
            // busy PR the earlier summary would be missed and a duplicate posted.
            const comments = await github.paginate(github.rest.issues.listComments, {
              owner: context.repo.owner,
              repo: context.repo.repo,
              issue_number: context.issue.number,
              per_page: 100,
            });

            // Guard against comments without a body before matching the heading.
            const existing = comments.find(c => (c.body || '').startsWith('## Change Summary'));

            if (existing) {
              await github.rest.issues.updateComment({
                owner: context.repo.owner,
                repo: context.repo.repo,
                comment_id: existing.id,
                body: summary
              });
            } else {
              await github.rest.issues.createComment({
                owner: context.repo.owner,
                repo: context.repo.repo,
                issue_number: context.issue.number,
                body: summary
              });
            }

Gitignore for logs

.claude/logs/.gitignore

# Ignore rules for the directory containing this file.
# Runtime-generated logs (do not commit)
bash.log
bash.jsonl
sudo.log
sensitive.log
# Trailing slash: matches only a directory named "archive"
archive/

# Keep summaries and reviews in git — they're valuable history
# No pattern above ignores these paths; the negations below only take effect if a
# broader pattern from elsewhere (e.g. a parent .gitignore) would otherwise
# ignore them — presumably a defensive measure; confirm against the repo's root ignore rules.
!session-summaries.md
!weekly-reviews/
!weekly-reviews/*.md

Last updated: 2026-04-23