// agent-detection-kit
A drop-in starter for catching unexpected Claude behavior at three tiers: CI machine detectors, in-flight hooks, and read-only subagent reviewers. Every file below is copy-pasteable; the full kit is also downloadable as a tarball.
Download: agent-detection-kit.tar.gz ·
README ·
cheatsheet
Root config
.claude/settings.json
{
"$schema": "https://json.schemastore.org/claude-code-settings.json",
"__comment": "Claude Code settings — wires up permissions and hooks. Keep this file short; move project conventions to CLAUDE.md, skills, and ADRs.",
"permissions": {
"allow": [
"Read(*)",
"Glob(*)",
"Grep(*)",
"Edit(apps/**)",
"Edit(src/**)",
"Edit(docs/**)",
"Edit(tests/**)",
"Edit(scripts/**)",
"Edit(.claude/skills/**)",
"Edit(.claude/agents/**)",
"Write(apps/**)",
"Write(src/**)",
"Write(docs/**)",
"Write(tests/**)",
"Write(scripts/**)",
"Write(.claude/skills/**)",
"Write(.claude/agents/**)",
"Bash(git status)",
"Bash(git status:*)",
"Bash(git diff)",
"Bash(git diff:*)",
"Bash(git log)",
"Bash(git log:*)",
"Bash(git show:*)",
"Bash(git branch)",
"Bash(git branch:*)",
"Bash(git checkout:*)",
"Bash(git add:*)",
"Bash(git commit:*)",
"Bash(git stash:*)",
"Bash(git restore:*)",
"Bash(ls)",
"Bash(ls:*)",
"Bash(cat:*)",
"Bash(head:*)",
"Bash(tail:*)",
"Bash(grep:*)",
"Bash(rg:*)",
"Bash(find:*)",
"Bash(wc:*)",
"Bash(jq:*)",
"Bash(yq:*)",
"Bash(npm test)",
"Bash(npm test:*)",
"Bash(npm run *)",
"Bash(npm ls)",
"Bash(npx prettier:*)",
"Bash(npx eslint:*)",
"Bash(npx tsc:*)",
"Bash(pytest)",
"Bash(pytest:*)",
"Bash(ruff check:*)",
"Bash(ruff format:*)",
"Bash(mypy:*)",
"Bash(make test)",
"Bash(make lint)",
"Bash(make format)"
],
"ask": [
"Bash(git push)",
"Bash(git push:*)",
"Bash(git merge:*)",
"Bash(git rebase:*)",
"Bash(git reset:*)",
"Bash(git revert:*)",
"Bash(npm install:*)",
"Bash(npm uninstall:*)",
"Bash(pip install:*)",
"Bash(pip uninstall:*)",
"Bash(go get:*)",
"Bash(cargo add:*)",
"Bash(docker:*)",
"Bash(docker-compose:*)",
"Write(package.json)",
"Write(pyproject.toml)",
"Write(go.mod)",
"Write(Cargo.toml)",
"Write(.claude/settings.json)",
"Write(.claude/hooks/**)",
"Edit(package.json)",
"Edit(pyproject.toml)",
"Edit(.claude/settings.json)",
"Edit(.claude/hooks/**)"
],
"deny": [
"Bash(rm -rf:*)",
"Bash(rm -fr:*)",
"Bash(git push --force:*)",
"Bash(git push -f:*)",
"Bash(git push --force-with-lease:main*)",
"Bash(git push --force-with-lease:master*)",
"Bash(git push --force-with-lease:production*)",
"Bash(git push --force-with-lease:staging*)",
"Bash(sudo:*)",
"Bash(su:*)",
"Bash(curl:* | sh*)",
"Bash(curl:* | bash*)",
"Bash(wget:* | sh*)",
"Bash(wget:* | bash*)",
"Read(.env)",
"Read(.env.*)",
"Read(**/.env)",
"Read(**/.env.*)",
"Read(**/secrets/**)",
"Read(**/*.pem)",
"Read(**/*.key)",
"Write(.env)",
"Write(.env.*)",
"Write(**/.env)",
"Write(**/.env.*)",
"Write(docs/decisions/**)",
"Edit(docs/decisions/**)",
"Write(.github/workflows/**)",
"Edit(.github/workflows/**)"
]
},
"hooks": {
"PreToolUse": [
{
"matcher": "Bash",
"hooks": [
{
"type": "command",
"command": "$CLAUDE_PROJECT_DIR/.claude/hooks/log-bash.sh"
},
{
"type": "command",
"command": "$CLAUDE_PROJECT_DIR/.claude/hooks/block-dangerous.sh"
}
]
},
{
"matcher": "Write|Edit",
"hooks": [
{
"type": "command",
"command": "$CLAUDE_PROJECT_DIR/.claude/hooks/flag-sensitive.sh"
},
{
"type": "command",
"command": "$CLAUDE_PROJECT_DIR/.claude/hooks/detect-antipatterns.sh"
}
]
}
],
"PostToolUse": [
{
"matcher": "Write|Edit",
"hooks": [
{
"type": "command",
"command": "$CLAUDE_PROJECT_DIR/.claude/hooks/auto-format.sh"
}
]
}
],
"Stop": [
{
"hooks": [
{
"type": "command",
"command": "$CLAUDE_PROJECT_DIR/.claude/hooks/session-summary.sh"
}
]
}
]
}
}
Hooks
.claude/hooks/log-bash.sh
#!/usr/bin/env bash
# PreToolUse hook for Bash — log every command Claude runs.
# This creates an audit trail you can review after long sessions.
#
# Input:  hook payload as JSON on stdin. Fields read: .tool_input.command,
#         .tool_input.description, .cwd, .session_id.
# Output: .claude/logs/bash.log — one line per command, timestamped.
# Output: .claude/logs/bash.jsonl — structured JSONL for programmatic analysis.
#
# Always exits 0 on the normal path: this hook records, it never blocks.
set -euo pipefail

INPUT=$(cat)

LOG_DIR="${CLAUDE_PROJECT_DIR:-.}/.claude/logs"
mkdir -p "$LOG_DIR"

# %3N (milliseconds) is a GNU date extension. Other date implementations print
# the directive literally (e.g. "...:05.3NZ"), so fall back to whole seconds
# whenever an "N" survives in the formatted result.
TIMESTAMP=$(date -u +"%Y-%m-%dT%H:%M:%S.%3NZ")
case "$TIMESTAMP" in
  *N*) TIMESTAMP=$(date -u +"%Y-%m-%dT%H:%M:%SZ") ;;
esac

# Here-strings instead of `echo ... |` so payloads that begin with "-n"/"-e"
# are never interpreted as echo options.
CMD=$(jq -r '.tool_input.command // empty' <<<"$INPUT")
DESCRIPTION=$(jq -r '.tool_input.description // empty' <<<"$INPUT")
CWD=$(jq -r '.cwd // "."' <<<"$INPUT")
SESSION_ID=$(jq -r '.session_id // "unknown"' <<<"$INPUT")

# Human-readable log.
# Multi-line commands (heredocs, scripts) are flattened by replacing each
# newline with a literal "\n" so the file really holds one line per command.
# The JSONL record below keeps the command verbatim.
NEWLINE_ESCAPE='\n'
CMD_ONE_LINE=${CMD//$'\n'/$NEWLINE_ESCAPE}
printf '%s [%s] %s\n' "$TIMESTAMP" "$SESSION_ID" "$CMD_ONE_LINE" >> "$LOG_DIR/bash.log"

# Structured log for later analysis
jq -nc \
  --arg ts "$TIMESTAMP" \
  --arg cmd "$CMD" \
  --arg desc "$DESCRIPTION" \
  --arg cwd "$CWD" \
  --arg session "$SESSION_ID" \
  '{timestamp: $ts, session: $session, command: $cmd, description: $desc, cwd: $cwd}' \
  >> "$LOG_DIR/bash.jsonl"

# Exit 0 = allow the command to proceed
exit 0
.claude/hooks/block-dangerous.sh
#!/usr/bin/env bash
# PreToolUse hook for Bash — block dangerous patterns that are hard to express
# as simple allow/deny rules in settings.json.
#
# Input: hook payload as JSON on stdin; only .tool_input.command is inspected.
#
# Exit code 2 blocks the command and shows the message to Claude.
# Exit code 0 allows the command to proceed.
#
# Patterns blocked here are in addition to, not replacement for, settings.json deny rules.
set -euo pipefail

INPUT=$(cat)
CMD=$(jq -r '.tool_input.command // empty' <<<"$INPUT")

# cmd_matches [grep options] -e PATTERN
# True when any line of $CMD matches the extended regex. A here-string is used
# instead of `echo | grep -q` so the check cannot be turned into a false
# "no match" by a SIGPIPE on the writer while `pipefail` is active.
# POSIX [[:space:]] is used in patterns rather than the \s extension.
cmd_matches() {
  grep -qE "$@" <<<"$CMD"
}

# ============================================================
# Block piping internet content to a shell
# ============================================================
# This is a common supply-chain attack vector. Any script fetched and piped
# to bash/sh is essentially "run this code I haven't reviewed as root."
#
# The shell name must be followed by whitespace, a separator or end of line,
# so harmless pipelines such as `curl ... | shasum` are not blocked. An
# optional `sudo` and an optional path prefix (`/bin/bash`) are recognised.
if cmd_matches -e '(curl|wget|fetch)[^|]*\|[[:space:]]*(sudo[[:space:]]+)?([^[:space:]|;&]*/)?(sh|bash|zsh|fish)([[:space:];&|)]|$)'; then
  cat >&2 <<'EOF'
BLOCKED: Piping internet content directly to a shell is a security risk.
If you need to install something, download the script first, review it, then run it:
curl -o install.sh https://example.com/install.sh
# review install.sh
bash install.sh
Or propose a specific package from a package manager (npm, pip, apt) in a PR.
EOF
  exit 2
fi

# ============================================================
# Block force push to protected branches
# ============================================================
# Each `git push ...` segment (up to the next ; & |) is checked on its own.
# The force flag and the branch are matched as separate, whole words, so:
#   - `git push origin bug-fix main` is NOT a force push ("-f" inside a name)
#   - `git push origin main --force` IS caught (flag after the branch)
#   - `git push origin +main` IS caught (leading "+" forces the refspec)
FORCE_FLAG_RE='(^|[[:space:]])(-f|--force|--force-with-lease(=[^[:space:]]*)?)([[:space:]]|$)|[[:space:]]\+[^[:space:]]'
PROTECTED_REF_RE='(^|[[:space:]:+=]|refs/heads/)(main|master|production|staging)([[:space:]:]|$)'
while IFS= read -r PUSH_SEGMENT; do
  [ -n "$PUSH_SEGMENT" ] || continue
  if grep -qE -e "$FORCE_FLAG_RE" <<<"$PUSH_SEGMENT" \
     && grep -qE -e "$PROTECTED_REF_RE" <<<"$PUSH_SEGMENT"; then
    cat >&2 <<'EOF'
BLOCKED: Force pushing to protected branches (main/master/production/staging) is forbidden.
Protected branches are append-only. To revert, open a revert PR.
EOF
    exit 2
  fi
done < <(grep -oE -e 'git[[:space:]]+push([[:space:]][^;&|]*)?' <<<"$CMD" || true)

# ============================================================
# Block operations against production infrastructure
# ============================================================
# These patterns suggest Claude is about to touch production directly.
# "prod"/"prd" must be followed by a non-letter OR the end of the line, so a
# trailing `... use-context prod` is caught as well.
if cmd_matches -i -e '(kubectl|docker|terraform|ssh).*(production|prod([^a-z]|$)|prd([^a-z]|$))'; then
  cat >&2 <<'EOF'
BLOCKED: Direct operations on production infrastructure are not allowed.
Changes to production must go through the deploy pipeline:
1. Make the change in code
2. Open a PR
3. Merge to main
4. Deploy workflow promotes to production
If this is legitimate (e.g., emergency), ask the user to run it themselves.
EOF
  exit 2
fi

# ============================================================
# Block destructive database operations outside local dev
# ============================================================
# `TRUNCATE TABLE` is matched explicitly; a bare "truncate" is not, because
# that would also hit the coreutils `truncate` command.
if cmd_matches -i -e '(DROP DATABASE|DROP SCHEMA|TRUNCATE[[:space:]]+TABLE|TRUNCATE.*--|DELETE FROM)' \
   && ! cmd_matches -e '(localhost|127\.0\.0\.1|local)'; then
  cat >&2 <<'EOF'
BLOCKED: Destructive database operations can only be run against localhost.
For real databases, write a migration (db/migrations/) and go through the standard deploy.
EOF
  exit 2
fi

# ============================================================
# Block wide-area filesystem destruction
# ============================================================
# Targets: /, ~, $HOME, $CLAUDE_PROJECT_DIR (written literally, i.e. before the
# shell expands them), each optionally followed by "/" or "/*".
if cmd_matches -e 'rm[[:space:]]+(-rf|-fr|-r -f|-f -r)[[:space:]]+(/\*?|(~|\$HOME|\$CLAUDE_PROJECT_DIR)(/\*?)?)([[:space:]]|$)'; then
  cat >&2 <<'EOF'
BLOCKED: Refusing to recursively delete a root/home/project directory.
If you need to clean up, target a specific subdirectory.
EOF
  exit 2
fi

# ============================================================
# Warn on sudo (not blocked, but logged prominently)
# ============================================================
if cmd_matches -e '^[[:space:]]*sudo[[:space:]]'; then
  LOG_DIR="${CLAUDE_PROJECT_DIR:-.}/.claude/logs"
  mkdir -p "$LOG_DIR"
  echo "$(date -u +%Y-%m-%dT%H:%M:%SZ) SUDO: $CMD" >> "$LOG_DIR/sudo.log"
  # Allow but log — user will be prompted anyway by system
fi

exit 0
.claude/hooks/flag-sensitive.sh
#!/usr/bin/env bash
# PreToolUse hook for Write/Edit — flag when Claude is modifying sensitive files.
# Does not block; logs and (optionally) notifies.
#
# Input:  hook payload as JSON on stdin. Fields read: .tool_input.file_path
#         (or .tool_input.path) and .tool_name.
# Output: one line per flagged write in .claude/logs/sensitive.log, formatted
#         "<UTC timestamp> [<tool>] [<category>] <file>".
# Set CLAUDE_NOTIFY_SENSITIVE=1 to also raise a desktop notification.
#
# The goal: when you glance at the session summary, you immediately see
# "Claude touched payments today" without having to read all the diffs.
set -euo pipefail

INPUT=$(cat)

LOG_DIR="${CLAUDE_PROJECT_DIR:-.}/.claude/logs"
mkdir -p "$LOG_DIR"

FILE=$(jq -r '.tool_input.file_path // .tool_input.path // empty' <<<"$INPUT")
TOOL=$(jq -r '.tool_name // empty' <<<"$INPUT")
TIMESTAMP=$(date -u +"%Y-%m-%dT%H:%M:%SZ")

# Nothing to classify when the payload carries no path.
if [ -z "$FILE" ]; then
  exit 0
fi

# Sensitive path patterns — customize for your codebase.
# Each entry is a case-insensitive extended regex matched anywhere in the
# path; the first entry that matches wins and supplies the log category.
SENSITIVE_PATTERNS=(
  "auth"
  "authentication"
  "authorization"
  "permission"
  "payment"
  "billing"
  "subscription"
  "webhook"
  "migration"
  "schema"
  "\.env"
  "secret"
  "credential"
  "token"
  "session"
  "password"
  "crypto"
  "\.github/workflows"
  "infrastructure/"
  "terraform/"
  "ansible/"
  "\.claude/settings"
  "\.claude/hooks"
  "\.claude/agents"
)

for pattern in "${SENSITIVE_PATTERNS[@]}"; do
  if grep -qiE -e "$pattern" <<<"$FILE"; then
    # Category label = the pattern with regex punctuation (\ . * /) removed.
    CATEGORY=$(sed 's|[\\.*/]||g' <<<"$pattern")
    echo "$TIMESTAMP [$TOOL] [$CATEGORY] $FILE" >> "$LOG_DIR/sensitive.log"

    if [ "${CLAUDE_NOTIFY_SENSITIVE:-0}" = "1" ]; then
      # Optional: desktop notification (macOS).
      # The path is handed to the script as an argument instead of being
      # spliced into the AppleScript source, so quotes or backslashes in a
      # file name cannot alter the script that gets executed.
      if command -v osascript >/dev/null 2>&1; then
        osascript \
          -e 'on run argv' \
          -e 'display notification ("Editing " & (item 1 of argv)) with title "Claude: sensitive area"' \
          -e 'end run' \
          "$FILE" 2>/dev/null || true
      fi
      # Optional: notification (Linux)
      if command -v notify-send >/dev/null 2>&1; then
        notify-send "Claude: sensitive area" "Editing $FILE" 2>/dev/null || true
      fi
    fi

    break # Only log once per file even if multiple patterns match
  fi
done

exit 0
.claude/hooks/detect-antipatterns.sh
#!/usr/bin/env bash
# PreToolUse hook for Write/Edit — block writes containing known anti-patterns.
# Catches issues at source instead of in CI.
#
# Input: hook payload as JSON on stdin. Fields read: .tool_input.file_path
#        (or .tool_input.path) and .tool_input.new_string / .tool_input.content.
#
# Exit code 2 blocks the write and shows Claude the message so it can fix.
# Exit code 0 allows the write.
#
# Order of checks:
#   1. source-file filter (by extension)
#   2. .only()/.skip() — runs for test files too; that is where it matters
#   3. test-file skip — everything below applies to non-test sources only
#   4. TODO markers, debug prints, swallowed errors, hardcoded secrets
set -euo pipefail

INPUT=$(cat)
FILE=$(jq -r '.tool_input.file_path // .tool_input.path // empty' <<<"$INPUT")
# For Edit tool, check the new_string. For Write tool, check content.
CONTENT=$(jq -r '.tool_input.new_string // .tool_input.content // empty' <<<"$INPUT")

# content_matches [grep options] -e PATTERN
# True when any line of $CONTENT matches the extended regex. Uses a
# here-string rather than `echo | grep -q`: with `pipefail`, grep -q exiting
# early on a large file can kill the writer with SIGPIPE and make the whole
# pipeline look like "no match", silently letting the write through.
content_matches() {
  grep -qE "$@" <<<"$CONTENT"
}

# Only check source files, not configs/docs/generated
if [[ ! "$FILE" =~ \.(ts|tsx|js|jsx|py|go|rs|rb|java|kt)$ ]]; then
  exit 0
fi

# ============================================================
# Block .only() / .skip() — test modifications that silently pass CI
# ============================================================
# This check must come BEFORE the test-file skip below; otherwise it would
# never see the files in which focused/skipped tests are actually written.
if content_matches -e "(describe\.only|it\.only|test\.only|\.skip\(|xit\(|xdescribe\()"; then
  cat >&2 <<'EOF'
BLOCKED: Test .only() or .skip() detected.
- .only() makes tests green by running only passing ones
- .skip() disables tests without explanation
Either fix the test, remove it entirely (with explanation in commit msg),
or discuss with the team. Skipping/focusing is not a fix.
EOF
  exit 2
fi

# Skip test files — different rules apply.
# Match only genuine test file patterns, not any file with "test" in the name.
if [[ "$FILE" =~ (\.test\.|\.spec\.|__tests__/|/tests?/) ]]; then
  exit 0
fi

# ============================================================
# Block committed TODOs (they accumulate forever)
# ============================================================
if content_matches -e "^\s*(//|#|--)\s*(TODO|FIXME|XXX|HACK):"; then
  cat >&2 <<'EOF'
BLOCKED: This write contains TODO/FIXME/XXX/HACK markers.
Either:
1. Fix it now in this PR
2. File an issue and reference the issue number: // See #1234
3. If truly needed, add to docs/known-rough-edges.md
TODOs in code decay into permanent debt. They must not be committed.
EOF
  exit 2
fi

# ============================================================
# Block debug logs and print statements
# ============================================================
if content_matches -e "^\s*(console\.log|console\.debug|print\(|println!|fmt\.Println)"; then
  # Allow if explicitly marked as intentional
  if ! content_matches -e "(ALLOW-DEBUG|DEBUG-INTENTIONAL)"; then
    cat >&2 <<'EOF'
BLOCKED: This write contains debug print/log statements.
Use the structured logger instead:
- logger.info(msg, {context})
- logger.error(err, {context})
If you genuinely need a debug print (e.g., local dev tool), add the comment:
// ALLOW-DEBUG: <reason>
EOF
    exit 2
  fi
fi

# ============================================================
# Block catch/except that only swallows
# ============================================================
# Multi-line regex using perl-compatible matching. These two checks are
# skipped entirely when pcregrep is not installed.
if command -v pcregrep >/dev/null 2>&1; then
  # An empty catch body. A body that contains a SAFE-TO-IGNORE comment is not
  # empty and therefore does not match.
  if pcregrep -M '(catch\s*\([^)]*\)|catch)\s*\{\s*\}' <<<"$CONTENT" >/dev/null 2>&1; then
    cat >&2 <<'EOF'
BLOCKED: Empty catch block detected. Silent error swallowing hides bugs.
Either:
1. Handle the error (log + recover, or rethrow)
2. If genuinely safe to ignore, add: // SAFE-TO-IGNORE: <reason>
EOF
    exit 2
  fi
  # `except ...:` followed only by `pass`. The negative lookahead honours the
  # SAFE-TO-IGNORE marker the message advertises when it is written on the
  # `pass` line; \b keeps identifiers such as `passthrough` from matching.
  if pcregrep -M 'except[^:]*:\s*\n\s*pass\b(?![^\n]*SAFE-TO-IGNORE)' <<<"$CONTENT" >/dev/null 2>&1; then
    cat >&2 <<'EOF'
BLOCKED: bare 'except: pass' detected. Silent error swallowing hides bugs.
Either:
1. Handle the error properly
2. Catch specific exceptions you expect
3. If genuinely safe to ignore: # SAFE-TO-IGNORE: <reason>
EOF
    exit 2
  fi
fi

# ============================================================
# Block hardcoded secrets
# ============================================================
# These patterns match common secret shapes. Gitleaks catches more in CI,
# but this catches obvious cases before the file is even written.
SECRET_RE="(api[_-]?key|secret|password|token|bearer)\s*[=:]\s*['\"][a-zA-Z0-9_\-]{20,}['\"]"
# Allow env-var references like process.env.API_KEY
ENV_REF_RE="(process\.env|os\.environ|getenv|config\.)"

# The env-var exemption is applied per suspicious line, not per file: an
# unrelated `config.` elsewhere in the file must not switch the check off.
SECRET_LINES=$(grep -iE -e "$SECRET_RE" <<<"$CONTENT" || true)
if [ -n "$SECRET_LINES" ] && grep -qvE -e "$ENV_REF_RE" <<<"$SECRET_LINES"; then
  cat >&2 <<'EOF'
BLOCKED: This write appears to contain a hardcoded credential.
Secrets must be loaded from environment variables via the config module:
const apiKey = config.get("API_KEY")
If this is a false positive (e.g., a test fixture), rename to make the
testing nature explicit, e.g., TEST_API_KEY_NOT_REAL.
EOF
  exit 2
fi

exit 0
.claude/hooks/auto-format.sh
#!/usr/bin/env bash
# PostToolUse hook for Write/Edit — auto-format files Claude just wrote.
# Runs after the write succeeds, so formatting issues don't block the work,
# but the committed code is always formatted consistently.
#
# Input: hook payload as JSON on stdin; only .tool_input.file_path
#        (or .tool_input.path) is read.
# Every formatter is best-effort: a missing tool or a formatter error is
# ignored and the hook still exits 0.
set -euo pipefail

INPUT=$(cat)
FILE=$(jq -r '.tool_input.file_path // .tool_input.path // empty' <<<"$INPUT")

if [ -z "$FILE" ] || [ ! -f "$FILE" ]; then
  exit 0
fi

# have TOOL — true when TOOL is on PATH.
have() {
  command -v "$1" >/dev/null 2>&1
}

# Pick formatters by file extension; anything not listed is left untouched.
case "$FILE" in
  *.ts|*.tsx|*.js|*.jsx|*.json|*.md)
    if have npx; then
      # --no-install: run only a prettier/eslint that is already available
      # to the project. Without it, npx may fetch the package from the
      # registry and execute it whenever the tool is not installed locally.
      npx --no-install prettier --write "$FILE" 2>/dev/null || true
      # ESLint auto-fix on JS/TS
      if [[ "$FILE" =~ \.(ts|tsx|js|jsx)$ ]]; then
        npx --no-install eslint --fix "$FILE" 2>/dev/null || true
      fi
    fi
    ;;
  *.py)
    if have ruff; then
      ruff format "$FILE" 2>/dev/null || true
      ruff check --fix "$FILE" 2>/dev/null || true
    fi
    ;;
  *.go)
    if have gofmt; then
      gofmt -w "$FILE" 2>/dev/null || true
    fi
    if have goimports; then
      goimports -w "$FILE" 2>/dev/null || true
    fi
    ;;
  *.rs)
    if have rustfmt; then
      rustfmt "$FILE" 2>/dev/null || true
    fi
    ;;
esac

exit 0
.claude/hooks/session-summary.sh
#!/usr/bin/env bash
# Stop hook — generate a concise summary of what happened in the session.
# Runs when Claude finishes its last turn. Output: .claude/logs/session-summaries.md
#
# Input: hook payload as JSON on stdin. Fields read: .session_id and
#        .transcript_path.
# Also reads .claude/logs/sensitive.log and .claude/logs/bash.jsonl when they
# exist (see the flag-sensitive and log-bash hooks).
#
# This is the single highest-value hook: it turns a 2-hour session into
# something you can skim in 30 seconds, so you notice unexpected behavior
# without reading transcripts.
set -euo pipefail

# Re-entrancy guard. Part 4 launches `claude -p`; presumably that nested run
# loads the same project settings and would fire this Stop hook again
# (NOTE(review): confirm against your Claude Code version). The variable is
# exported only to that child, so a nested invocation exits here immediately.
if [ -n "${CLAUDE_SESSION_SUMMARY_ACTIVE:-}" ]; then
  exit 0
fi

INPUT=$(cat)

PROJECT_DIR="${CLAUDE_PROJECT_DIR:-.}"
LOG_DIR="$PROJECT_DIR/.claude/logs"
mkdir -p "$LOG_DIR"

SESSION_ID=$(jq -r '.session_id // "unknown"' <<<"$INPUT")
TRANSCRIPT=$(jq -r '.transcript_path // empty' <<<"$INPUT")
TIMESTAMP=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
SUMMARY_FILE="$LOG_DIR/session-summaries.md"

# Header for this session's entry
cat >> "$SUMMARY_FILE" <<EOF
---
## Session $SESSION_ID — $TIMESTAMP
EOF

# =============================================================
# Part 1: Mechanical summary (always works, no Claude call needed)
# =============================================================
echo "### Changes" >> "$SUMMARY_FILE"

# Ask git about the project directory explicitly instead of testing for
# ./.git: the hook's working directory is not guaranteed to be the repo root,
# and in a linked worktree .git is a file, not a directory.
if git -C "$PROJECT_DIR" rev-parse --is-inside-work-tree >/dev/null 2>&1; then
  # Uncommitted = staged + unstaged changes to tracked files (diff against
  # HEAD) plus untracked, non-ignored files. A plain `git diff` would hide
  # anything already staged with `git add` and every newly created file.
  CHANGED=$(
    {
      git -C "$PROJECT_DIR" diff --name-only HEAD 2>/dev/null || true
      git -C "$PROJECT_DIR" ls-files --others --exclude-standard 2>/dev/null || true
    } | sort -u
  )
  if [ -n "$CHANGED" ]; then
    echo "" >> "$SUMMARY_FILE"
    echo "**Uncommitted file changes:**" >> "$SUMMARY_FILE"
    sed 's/.*/- `&`/' <<<"$CHANGED" >> "$SUMMARY_FILE"
  fi

  # Recent commits (rough session window: the last 2 hours)
  RECENT_COMMITS=$(git -C "$PROJECT_DIR" log --since="2 hours ago" --oneline 2>/dev/null || true)
  if [ -n "$RECENT_COMMITS" ]; then
    echo "" >> "$SUMMARY_FILE"
    echo "**Recent commits:**" >> "$SUMMARY_FILE"
    sed 's/^/- /' <<<"$RECENT_COMMITS" >> "$SUMMARY_FILE"
  fi
fi

# =============================================================
# Part 2: Sensitive-area touches during this session
# =============================================================
if [ -f "$LOG_DIR/sensitive.log" ]; then
  # Show lines from sensitive.log within the last 2 hours (rough session window).
  # Each log line starts with a UTC ISO-8601 stamp, and such stamps sort
  # chronologically as plain strings. Comparing against a cutoff in the same
  # format works with any awk (no gawk-only mktime/gensub) and avoids
  # reading UTC stamps as local time.
  # GNU date takes -d; the -v-2H form is the fallback the original script used.
  CUTOFF=$(date -u -d '2 hours ago' +"%Y-%m-%dT%H:%M:%SZ" 2>/dev/null \
    || date -u -v-2H +"%Y-%m-%dT%H:%M:%SZ" 2>/dev/null \
    || true)
  RECENT_SENSITIVE=""
  if [ -n "$CUTOFF" ]; then
    # Appending "" forces a string (not numeric) comparison in awk.
    RECENT_SENSITIVE=$(awk -v cutoff="$CUTOFF" '($1 "") >= (cutoff "")' \
      "$LOG_DIR/sensitive.log" 2>/dev/null || true)
  fi
  if [ -n "$RECENT_SENSITIVE" ]; then
    echo "" >> "$SUMMARY_FILE"
    echo "### ⚠️ Sensitive areas touched" >> "$SUMMARY_FILE"
    echo "\`\`\`" >> "$SUMMARY_FILE"
    echo "$RECENT_SENSITIVE" >> "$SUMMARY_FILE"
    echo "\`\`\`" >> "$SUMMARY_FILE"
  fi
fi

# =============================================================
# Part 3: Commands run (summary, not full list)
# =============================================================
if [ -f "$LOG_DIR/bash.jsonl" ]; then
  # Extract commands from this session. Only the first line of each command
  # is kept so that a multi-line command counts once, not once per line.
  SESSION_CMDS=$(jq -r --arg sid "$SESSION_ID" \
    'select(.session == $sid) | (.command // "") | (split("\n")[0] // "")' \
    "$LOG_DIR/bash.jsonl" 2>/dev/null || true)
  if [ -n "$SESSION_CMDS" ]; then
    # grep -c '' counts lines without the leading padding some `wc -l`
    # implementations print.
    TOTAL=$(grep -c '' <<<"$SESSION_CMDS" || true)
    echo "" >> "$SUMMARY_FILE"
    echo "### Commands run: $TOTAL total" >> "$SUMMARY_FILE"

    # Categorize. `grep -c` already prints "0" when nothing matches (while
    # exiting 1), so the fallback must be `|| true`; `|| echo "0"` would
    # yield the two-line value "0\n0".
    GIT_CMDS=$(grep -cE "^git\s" <<<"$SESSION_CMDS" || true)
    NPM_CMDS=$(grep -cE "^(npm|pnpm|yarn|bun)\s" <<<"$SESSION_CMDS" || true)
    TEST_CMDS=$(grep -cE "(test|jest|vitest|pytest)" <<<"$SESSION_CMDS" || true)
    echo "- git: $GIT_CMDS" >> "$SUMMARY_FILE"
    echo "- package manager: $NPM_CMDS" >> "$SUMMARY_FILE"
    echo "- test runs: $TEST_CMDS" >> "$SUMMARY_FILE"
  fi
fi

# =============================================================
# Part 4: AI-generated narrative summary (optional, requires `claude` CLI)
# =============================================================
if [ -n "$TRANSCRIPT" ] && [ -f "$TRANSCRIPT" ] && command -v claude >/dev/null 2>&1; then
  # The transcript is fed on stdin; CLAUDE_SESSION_SUMMARY_ACTIVE marks the
  # child so the guard at the top of this script stops any recursion.
  # Best-effort: any failure simply leaves the narrative out.
  NARRATIVE=$(CLAUDE_SESSION_SUMMARY_ACTIVE=1 claude -p --model haiku "Summarize this session in 5 bullet points. Focus on:
- What task was accomplished
- Any tool calls that touched sensitive areas (auth, payments, migrations, infra, secrets)
- Any mistakes that were caught and corrected
- Any decisions made without explicit user approval
- Anything the user should review carefully
Be terse. Total output under 200 words." < "$TRANSCRIPT" 2>/dev/null || true)
  if [ -n "$NARRATIVE" ]; then
    echo "" >> "$SUMMARY_FILE"
    echo "### Narrative" >> "$SUMMARY_FILE"
    echo "$NARRATIVE" >> "$SUMMARY_FILE"
  fi
fi

exit 0
Agents
.claude/agents/pr-reviewer.md
---
name: pr-reviewer
description: Reviews a pull request end-to-end before merge. Runs structural and semantic review that CI can't catch — architectural fit, scope discipline, test quality. Read-only. Use before merging any PR of non-trivial size.
tools:
- Read
- Glob
- Grep
- Bash(git diff:*)
- Bash(git log:*)
- Bash(git show:*)
- Bash(gh pr view:*)
- Bash(gh pr diff:*)
model: sonnet
---
# PR Reviewer
You review a pull request the way a senior engineer would in a 10-minute review window. Your job is to catch the issues that CI doesn't.
## What CI already covered (don't re-check)
- Linting and formatting → already green
- Type checking → already green
- Tests pass → already green
- No hardcoded secrets → Gitleaks / Semgrep caught
- No dependencies with known CVEs → Dependency Review caught
- Code coverage → already enforced
Your job is the judgment calls that require human-like reasoning:
## What to check
### 1. Does the PR description accurately describe the change?
Read the PR description. Then skim the diff. Are they telling the same story? Common patterns to flag:
- Description says "fix bug" but diff also refactors unrelated code → scope creep
- Description says "add feature" but diff deletes code you didn't expect → incidental change
- Description mentions "reused X" but the diff adds a new X → the reuse claim is wrong
### 2. Is the approach the right one?
Read the code. Consider: is this the approach a seasoned engineer would take?
Specific things to catch:
- Re-implementing something that already exists in the codebase
- Wrapping existing functions in more wrappers instead of calling them directly
- Over-engineering — introducing abstractions for a single use case
- Under-engineering — copy-pasting code that should be factored out
- Wrong layer — business logic in the route handler, DB logic in the service, etc.
### 3. Are the tests meaningful?
CI verifies tests pass and coverage doesn't drop. It doesn't verify the tests actually test anything.
Check:
- Do the tests exercise edge cases, or only happy paths?
- Do assertions actually verify the right thing, or just that nothing throws?
- Are tests testing behavior or testing implementation (brittle)?
- Are mocks faithful to real dependencies, or do they paper over real issues?
Specific smell: a test that passes regardless of whether the feature works. E.g., `expect(result).toBeDefined()` — that's not a test.
### 4. What happens in the error paths?
Claude writes happy paths well. Error paths are where bugs hide.
For each new feature, trace: what happens if
- The database is down?
- An external API returns 500?
- An external API returns 200 with unexpected data?
- The user sends malformed input?
- A concurrent request modifies the same resource?
- The operation times out?
If the code doesn't handle these or acknowledge them, flag it.
### 5. Is it observable?
A feature that can't be debugged in production is incomplete.
Check that new code:
- Logs important events through the structured logger
- Emits metrics for latency, success rate, error rate where relevant
- Returns meaningful error messages (RFC 9457 Problem Details)
- Uses request IDs / trace IDs
### 6. Scope and reversibility
- Is the PR doing one thing, or many?
- If this change turns out to be wrong, how hard is it to revert?
- Does it create future maintenance burden (new patterns, new dependencies, new surfaces)?
## Output format
```markdown
# PR Review: [PR title]
**PR:** #[number]
**Reviewer:** pr-reviewer (Sonnet)
**Recommendation:** ✅ Approve / ⚠️ Request changes / ❌ Block
## Summary
[2-3 sentences on the overall shape and quality of the change]
## Findings
### Must address before merge
[Issues that would cause real problems if merged as-is]
### Should address in this PR
[Real issues, but not release-blocking]
### Consider for follow-up
[Improvements that could be a separate PR]
### Nits (optional)
[Style/preference stuff; easily ignored]
## Questions for the author
[Things that need clarification rather than findings]
## Praise
[Things done particularly well. Seriously include this. It reinforces good patterns.]
```
## Rules
- **Be specific.** `file:line` references for every finding.
- **Explain the "why."** Don't just say "this is wrong" — explain what could go wrong or what pattern is being violated.
- **Match severity to impact.** Not everything is "must fix."
- **Note good patterns.** When Claude does something right, praise it. This teaches the team and reinforces the pattern.
- **Don't rewrite the code.** Your job is to identify issues. Suggest the direction of a fix, but don't write the fix yourself.
## When done
Print the review to the conversation. If the user has `gh` CLI set up, they can post it as a PR comment.
.claude/agents/security-reviewer.md
---
name: security-reviewer
description: Reviews code changes for security vulnerabilities. Read-only. Use for any PR or diff that touches authentication, authorization, session handling, cryptography, input validation, data serialization, or external-facing endpoints. Produces a severity-rated report with specific remediation suggestions.
tools:
- Read
- Glob
- Grep
- Bash(rg:*)
- Bash(git diff:*)
- Bash(git log:*)
- Bash(git show:*)
model: opus
---
# Security Reviewer
You are a security-focused code reviewer. You do not have write access and do not make changes. Your output is a report.
## Focus areas (priority order)
### 1. Authentication and session handling
- Tokens stored correctly (httpOnly cookies for web, secure storage for mobile)
- Token expiry and rotation configured
- Refresh token rotation + reuse detection
- JWT signature validation not skipped
- Session fixation prevention
- Logout actually invalidates sessions
### 2. Authorization
- Every protected endpoint has a permission check
- Permission checks happen server-side (never trust client)
- No IDOR — user can't access resources by ID that don't belong to them
- Admin actions gated by role check, not just "is logged in"
- No privilege escalation paths (e.g., user can change their own role)
### 3. Input validation
- SQL injection — parameterized queries only, no string concat
- Command injection — no `shell=True` with user input, no `eval()`
- XSS — output encoding on anything rendered to HTML
- SSRF — URL validation on any request made with user-supplied URLs
- Path traversal — filename sanitization on file operations
- Deserialization — no pickle/eval on untrusted input
### 4. Cryptography
- No weak algorithms (MD5, SHA1 for security, DES, RC4)
- No hardcoded keys or secrets
- Proper use of CSPRNG (not Math.random for security)
- Constant-time comparison for secrets
- Correct use of authenticated encryption (GCM, not CBC without HMAC)
### 5. External boundaries
- Webhooks verify signatures
- CORS configured correctly (not `*` for credentialed requests)
- Rate limiting on authentication endpoints
- CSRF tokens on state-changing requests (if session-based)
### 6. Dependency risks
- New dependencies with known CVEs
- Dependencies from unusual sources (typosquats)
- Significant version bumps that might introduce breaking changes
## Your process
1. **Determine scope.** Ask (if unclear) or default to: all files changed in the current branch vs. main.
2. **Read the diff.** Focus on added code. Modified code is usually safer because it follows the existing pattern.
3. **Check each focus area systematically.** For each one, grep for relevant patterns:
- Auth: `requireAuth`, `getSession`, `jwt.verify`, `bcrypt`
- Authz: permission checks, role gates
- Input validation: `req.body`, `req.query`, `req.params` usage
- Crypto: imports from `crypto`, `hashlib`, `bcrypt`, `argon2`
- Boundaries: route files, webhook handlers, CORS config
4. **Cross-reference.** When you find a security-relevant change, check whether tests cover the security properties, not just happy path.
## Output format
```markdown
# Security Review
**Scope:** [what you reviewed]
**Reviewer:** security-reviewer (Opus)
**Date:** [date]
## Summary
[One paragraph: overall risk assessment. Use words like "low risk", "moderate risk", "serious concerns identified".]
## Findings
### 🔴 Critical (must fix before merge)
Each finding:
- **Issue:** [What's wrong]
- **Location:** `file:line`
- **Impact:** [What could go wrong]
- **CWE/OWASP:** [If applicable]
- **Remediation:** [Specific code change recommended]
### 🟠 High
[same format]
### 🟡 Medium
[same format]
### 🟢 Low / Informational
[same format]
## What I did not review
[Be explicit about anything you couldn't cover due to scope limits]
## Overall recommendation
[One of: Ready to merge / Merge after critical fixes / Needs rework]
```
## Rules
- **Specificity over volume.** 3 real findings beats 20 speculative ones.
- **Severity is about real-world impact.** A bare `eval()` on user input is critical. A TODO comment is informational.
- **Reference standards.** CWE IDs, OWASP categories, SANS Top 25 — these communicate severity and give the team a way to look up context.
- **No fixes applied.** You describe each issue and recommend a specific remediation in the report; someone else implements the change. This prevents you from introducing new bugs.
- **Doubt is a finding.** If you're unsure whether something is safe, list it under "Low / Informational", label it "Question", and explain what you'd want to verify.
.claude/agents/architecture-auditor.md
---
name: architecture-auditor
description: Audits recent code changes against the established architecture and ADRs. Read-only agent. Use weekly, before major releases, or after a sprint where Claude did substantial work. Does not fix issues — produces a report.
tools:
- Read
- Glob
- Grep
- Bash(git log:*)
- Bash(git diff:*)
- Bash(git show:*)
- Bash(git branch:*)
- Bash(find:*)
- Bash(wc:*)
model: opus
---
# Architecture Auditor
You are an experienced architect reviewing code changes for architectural quality, not correctness. Correctness is covered by CI. You are looking for:
1. **Architectural drift** — changes that work but don't fit the system's design
2. **Pattern duplication** — new code that replicates existing patterns instead of reusing
3. **ADR violations** — changes that conflict with accepted architecture decisions
4. **Technical debt accumulation** — TODOs, disabled tests, scope creep
## Your process
### Step 1: Load the architectural baseline
Before reviewing anything, read:
- `CLAUDE.md` — project conventions
- Every file in `docs/decisions/` (ADRs) — what's been settled
- `docs/architecture/` if it exists — the big picture
List the ADRs and summarize each in one sentence. This becomes your reference.
### Step 2: Identify the change window
By default, audit the last 7 days of changes on the main branch:
```bash
git log --since="7 days ago" --oneline main
```
If the user specifies a different window, use that instead. If the user points at a specific PR, audit just that.
### Step 3: Systematic review
For each commit or PR in the window:
**3a. Classify the change.** Is this a bug fix, feature, refactor, or infrastructure change?
**3b. Check against ADRs.** Does this conflict with any accepted ADR? Cite the specific ADR number if so.
**3c. Check for duplication.** For any new file or new function, search the codebase for similar patterns that could have been reused:
- New service/module in `src/` — is there an existing one covering similar ground?
- New utility function — does a similar one exist elsewhere?
- New component — is there a base component that should have been extended?
- New type/interface — does a similar type exist?
Use the Grep and Glob tools (or `find`) to locate candidates — this agent has no shell access to `grep` or `rg`. Err on the side of flagging for human review rather than deciding unilaterally.
**3d. Check consistency.** Does new code follow:
- The existing error handling pattern (RFC 9457 Problem Details)?
- The existing logging pattern (structured JSON via the logger module)?
- The existing naming conventions (file names, function names)?
- The existing test structure and fixture patterns?
**3e. Check scope.** Did the change stay in scope, or did it sneak in other modifications? Look for:
- "Drive-by" refactors unrelated to the stated goal
- New dependencies added without a corresponding ADR
- Changes to config or infrastructure in a PR that's nominally about features
### Step 4: Produce the report
Output a markdown report with this structure:
```markdown
# Architecture Audit Report
**Window:** [date range]
**Commits reviewed:** [count]
**Auditor:** architecture-auditor (Opus)
## Summary
[2-3 sentences describing overall health of the change window]
## Findings
### Critical
[Things that should be fixed before next release. Each with file:line reference.]
### High
[Things that should be addressed in the next sprint]
### Medium
[Things worth tracking but not urgent]
### Low / Informational
[Observations for context]
## ADR Compliance
[For each relevant ADR, state: complied / violated / ambiguous. Cite specific changes.]
## Patterns observed
[Any emerging patterns — good or bad — that weren't there last audit]
## Recommendations
[Concrete next steps: update ADRs, write new skill, tighten a rule, etc.]
```
## Rules for your analysis
**Severity-rate honestly.** Not everything is critical. Most findings will be medium or low. Critical findings are things that will cause incidents or significant rework if not fixed. Don't inflate severity to seem valuable.
**Cite specifics.** Every finding must include `file:line` references. "Error handling is inconsistent" is useless; "Error handling in src/api/users.ts:42 uses a bare `throw new Error('oops')` instead of the AppError pattern used in src/api/orders.ts:58" is actionable.
**Give benefit of the doubt.** If a pattern could be intentional (maybe there's context you don't have), flag it as "Question" not "Violation." The user can confirm.
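**Don't propose code fixes.** Your job is to identify issues, not resolve them. Fixes are a separate workflow. Keep the Findings sections to findings only; the Recommendations section is for process-level next steps (update an ADR, write a new skill, tighten a rule), not patches.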
**Don't read every file.** For a 7-day audit of an active repo, you'll encounter hundreds of files. Focus on:
- Files that are newly added (not modified — additions are where drift happens)
- Files in "sensitive" directories (auth, payments, migrations, config, infrastructure)
- Files with unusually large changes
- Files Claude touched that show up in `.claude/logs/sensitive.log` if available
If you must skip areas due to time, say so in the report.
## When done
Print the report to the conversation for the user to read. If your tool permissions allow writing, also save it to `docs/audits/YYYY-MM-DD-audit.md` for historical record; if you are running read-only, ask the user to save it there.
Scripts
.claude/scripts/rotate-logs.sh
#!/usr/bin/env bash
# Rotate .claude/logs/ files to keep the live logs manageable.
# Run weekly via cron, or manually. Each run gzips the current contents of the
# live logs into archive/<ISO week>/ and truncates them; archive directories
# whose mtime is older than 4 weeks are deleted.
set -euo pipefail

LOG_DIR="${CLAUDE_PROJECT_DIR:-.}/.claude/logs"
if [ ! -d "$LOG_DIR" ]; then
  echo "No log directory at $LOG_DIR"
  exit 0
fi
cd "$LOG_DIR"

# %G is the ISO week-based year that pairs with the ISO week number %V.
# Using %Y here mislabels the days around New Year (e.g. 2024-12-30 is ISO
# week 1 of 2025, which %Y-W%V would name "2024-W01").
WEEK=$(date -u +%G-W%V)
ARCHIVE_DIR="archive/$WEEK"
mkdir -p "$ARCHIVE_DIR"

# Files to rotate
FILES=(bash.log bash.jsonl sensitive.log sudo.log session-summaries.md)
for file in "${FILES[@]}"; do
  if [ -f "$file" ] && [ -s "$file" ]; then
    # Append a new gzip member instead of cp + gzip: gzip refuses to overwrite
    # an existing .gz, which aborted a second run in the same week. Concatenated
    # gzip members decompress as one stream, so earlier archives are preserved.
    gzip -c "$file" >> "$ARCHIVE_DIR/$file.gz"
    # Truncate the live log
    : > "$file"
    echo "Rotated $file -> $ARCHIVE_DIR/$file.gz"
  fi
done

# Clean up archives older than 4 weeks.
# -mindepth 1 keeps find from ever selecting archive/ itself.
find archive/ -mindepth 1 -maxdepth 1 -type d -mtime +28 -exec rm -rf {} + 2>/dev/null || true
echo "Log rotation complete."
.claude/scripts/weekly-review.sh
#!/usr/bin/env bash
# Weekly review helper — generates a single report from the past week's activity.
# Run every Friday (or whenever your review window is) before the weekly hygiene session.
#
# Output: .claude/logs/weekly-reviews/YYYY-WNN.md
set -euo pipefail

PROJECT_DIR="${CLAUDE_PROJECT_DIR:-.}"
LOG_DIR="$PROJECT_DIR/.claude/logs"
REVIEW_DIR="$LOG_DIR/weekly-reviews"
mkdir -p "$REVIEW_DIR"

# %G is the ISO week-based year that pairs with %V. With %Y the days around
# New Year get the wrong year and the report would overwrite an older one.
WEEK=$(date -u +%G-W%V)
REPORT="$REVIEW_DIR/$WEEK.md"

# Start of the review window as an ISO-8601 UTC timestamp.
# GNU date first, BSD/macOS date as the fallback.
SINCE=$(date -u -d '7 days ago' +%Y-%m-%dT%H:%M:%SZ 2>/dev/null || date -u -v-7d +%Y-%m-%dT%H:%M:%SZ)

# Print the number of lines in $1 without the padding BSD `wc -l` adds.
count_lines() {
  printf '%s\n' "$1" | wc -l | tr -d '[:space:]'
}

cat > "$REPORT" <<EOF
# Weekly Review — $WEEK
Generated: $(date -u +"%Y-%m-%dT%H:%M:%SZ")
## Activity summary
EOF

# =============================================================
# Git activity
# =============================================================
# `git -C` is used instead of `cd` so that a relative CLAUDE_PROJECT_DIR keeps
# $LOG_DIR and $REPORT valid for the rest of the script. rev-parse also
# recognises worktrees and submodules, where .git is a file, not a directory.
if git -C "$PROJECT_DIR" rev-parse --git-dir >/dev/null 2>&1; then
  echo "### Commits this week" >> "$REPORT"
  COMMITS=$(git -C "$PROJECT_DIR" log --since="7 days ago" --oneline 2>/dev/null || true)
  if [ -n "$COMMITS" ]; then
    COMMITS_COUNT=$(count_lines "$COMMITS")
  else
    COMMITS_COUNT=0
  fi
  echo "**Total:** $COMMITS_COUNT commits" >> "$REPORT"
  echo "" >> "$REPORT"
  echo '```' >> "$REPORT"
  # git log exits 0 with empty output when the window has no commits, so the
  # placeholder has to be chosen on the output, not on the exit status.
  if [ -n "$COMMITS" ]; then
    printf '%s\n' "$COMMITS" >> "$REPORT"
  else
    echo "(no commits)" >> "$REPORT"
  fi
  echo '```' >> "$REPORT"
  echo "" >> "$REPORT"

  echo "### Files most changed this week" >> "$REPORT"
  echo '```' >> "$REPORT"
  # --pretty=format: emits an empty line per commit; drop those so the blank
  # "file" does not end up at the top of the ranking.
  CHANGED=$(git -C "$PROJECT_DIR" log --since="7 days ago" --name-only --pretty=format: 2>/dev/null | \
    sed '/^$/d' | sort | uniq -c | sort -rn | head -15 || true)
  if [ -n "$CHANGED" ]; then
    printf '%s\n' "$CHANGED" >> "$REPORT"
  else
    echo "(none)" >> "$REPORT"
  fi
  echo '```' >> "$REPORT"
  echo "" >> "$REPORT"
fi

# =============================================================
# Sensitive touches
# =============================================================
echo "### Sensitive area touches" >> "$REPORT"
if [ -f "$LOG_DIR/sensitive.log" ]; then
  # Assumes each line starts with a %Y-%m-%dT%H:%M:%SZ timestamp (the format the
  # previous BSD-date fallback parsed) — TODO confirm against the hook that
  # writes this log. Such timestamps sort lexicographically, so a plain string
  # comparison replaces spawning `date` once per line with log content
  # interpolated into a shell command.
  RECENT=$(awk -v since="$SINCE" '($1 "") >= (since "")' "$LOG_DIR/sensitive.log" 2>/dev/null || true)
  if [ -n "$RECENT" ]; then
    COUNT=$(count_lines "$RECENT")
    echo "**Total:** $COUNT sensitive touches" >> "$REPORT"
    echo "" >> "$REPORT"
    echo '```' >> "$REPORT"
    echo "$RECENT" >> "$REPORT"
    echo '```' >> "$REPORT"
  else
    echo "None" >> "$REPORT"
  fi
else
  echo "(no sensitive log)" >> "$REPORT"
fi
echo "" >> "$REPORT"

# =============================================================
# Bash command volume
# =============================================================
echo "### Claude bash activity" >> "$REPORT"
if [ -f "$LOG_DIR/bash.jsonl" ]; then
  WEEK_CMDS=$(jq -r --arg since "$SINCE" \
    'select(.timestamp > $since) | .command' "$LOG_DIR/bash.jsonl" 2>/dev/null || echo "")
  if [ -n "$WEEK_CMDS" ]; then
    TOTAL=$(count_lines "$WEEK_CMDS")
    echo "**Total commands:** $TOTAL" >> "$REPORT"
    echo "" >> "$REPORT"
    echo "**Most common commands:**" >> "$REPORT"
    echo '```' >> "$REPORT"
    # Rank by the first word of each command (the executable name).
    MOST_COMMON=$(printf '%s\n' "$WEEK_CMDS" | awk '{print $1}' | sort | uniq -c | sort -rn | head -10 || true)
    printf '%s\n' "$MOST_COMMON" >> "$REPORT"
    echo '```' >> "$REPORT"
  else
    echo "(no commands logged)" >> "$REPORT"
  fi
else
  # Mirrors the sensitive-log section so the heading is never left empty.
  echo "(no bash log)" >> "$REPORT"
fi
echo "" >> "$REPORT"

# =============================================================
# Blocked / dangerous attempts
# =============================================================
if [ -f "$LOG_DIR/blocked.log" ]; then
  # The log is not filtered by date here, so the heading says what is shown:
  # the last 20 entries, however old they are.
  echo "### Recent blocked operations (last 20)" >> "$REPORT"
  echo '```' >> "$REPORT"
  tail -20 "$LOG_DIR/blocked.log" >> "$REPORT"
  echo '```' >> "$REPORT"
  echo "" >> "$REPORT"
fi

# =============================================================
# Action items (manual)
# =============================================================
cat >> "$REPORT" <<'EOF'
## Review checklist
- [ ] Any sensitive area touches that warrant a security review?
- [ ] Any recurring patterns in commands that should be allowed permanently?
- [ ] Any recurring patterns in blocked attempts that indicate a rule needs tightening?
- [ ] Any learnings from session summaries to promote to CLAUDE.md or skills?
- [ ] CLAUDE.md still under 200 lines?
- [ ] Any skills that haven't activated in a month — retire them?
- [ ] Run the standardization test on a representative task?
## Notes
<!-- Add your notes from the weekly hygiene session here -->
EOF

echo "Weekly review generated: $REPORT"
echo ""
echo "Review the report, add notes in the 'Notes' section,"
echo "and convert any learnings into rule updates."

# Open it if we can. Opening is best-effort: under cron or on a headless box
# the opener fails, and that must not turn a successful run into an error.
if command -v open >/dev/null 2>&1; then
  open "$REPORT" || true
elif command -v xdg-open >/dev/null 2>&1; then
  xdg-open "$REPORT" || true
fi
CI checks
scripts/ci/checks/no-direct-env-access.sh
#!/usr/bin/env bash
# Check: no direct process.env or os.environ access outside the config module.
# All env var reads must go through the central config, which validates and types them.
#
# This catches a common Claude failure mode: it knows process.env works and uses it
# directly rather than extending the config module, producing config that's scattered
# across the codebase and hard to audit.
set -euo pipefail
VIOLATIONS=0

# report_hits LABEL PATTERN FILE
# Prints up to five matching lines of FILE under a "Direct LABEL access" heading
# and counts the file as one violation. grep runs once per file; -m 5 caps the
# output without a `head` pipe.
report_hits() {
  local label="$1" pattern="$2" file="$3" hits
  hits=$(grep -nE -m 5 "$pattern" "$file" 2>/dev/null || true)
  if [ -n "$hits" ]; then
    echo "❌ Direct $label access in $file:"
    printf '%s\n' "$hits"
    VIOLATIONS=$((VIOLATIONS + 1))
  fi
}

# Matches process.env followed by anything that is not an identifier character,
# so dotted access (process.env.FOO), bracket access (process.env["FOO"]) and
# whole-object use (const { FOO } = process.env) are all caught, while
# identifiers such as process.environment are not.
JS_ENV_PATTERN='process\.env([^A-Za-z0-9_$]|$)'
PY_ENV_PATTERN='(os\.environ|os\.getenv)'

# Check TypeScript/JavaScript files
while IFS= read -r file; do
  # Allow the config module itself
  if [[ "$file" == *"src/config/"* ]]; then
    continue
  fi
  report_hits "process.env" "$JS_ENV_PATTERN" "$file"
done < <(find src -type f \( -name "*.ts" -o -name "*.tsx" -o -name "*.js" -o -name "*.jsx" -o -name "*.mjs" -o -name "*.cjs" \) 2>/dev/null || true)

# Check Python files
while IFS= read -r file; do
  # Any path with a config/ component is treated as the config module.
  if [[ "$file" == *"config/"* ]]; then
    continue
  fi
  report_hits "os.environ" "$PY_ENV_PATTERN" "$file"
  # "*/node_modules/*" also excludes nested node_modules (monorepos), which can
  # contain vendored Python build scripts.
done < <(find . -type f -name "*.py" -not -path "./.venv/*" -not -path "*/node_modules/*" 2>/dev/null || true)

if [ "$VIOLATIONS" -gt 0 ]; then
  echo ""
  echo "Found $VIOLATIONS files with direct env var access."
  echo "All env var reads must go through the config module. See docs/decisions/ADR-XXXX-config.md"
  exit 1
fi
echo "✓ No direct env var access outside config module"
scripts/ci/checks/migrations-reversible.sh
#!/usr/bin/env bash
# Check: every database migration must have both an `up` and a `down` section.
# This is a common Claude failure mode — it writes forward-only migrations which
# can't be rolled back, violating the ADR that requires reversibility.
set -euo pipefail
VIOLATIONS=0
MIGRATIONS_DIR="${MIGRATIONS_DIR:-db/migrations}"
if [ ! -d "$MIGRATIONS_DIR" ]; then
  echo "⊘ No migrations directory at $MIGRATIONS_DIR (skipping)"
  exit 0
fi

# Accept common migration framework conventions:
#   goose:             -- +goose Up      / -- +goose Down
#   sql-migrate:       -- +migrate Up    / -- +migrate Down
#   generic SQL:       -- +up            / -- +down
#   dbmate:            -- migrate:up     / -- migrate:down
#   Knex:              exports.up / exports.down, function up / function down
#   TypeORM (methods): async up(...)     / async down(...)
#   Alembic (Python):  def upgrade       / def downgrade
# The method form requires a non-identifier character before the name so that
# calls like setup( or shutdown( do not count as a section.
UP_PATTERN='(-- \+goose Up|-- \+migrate Up|-- \+up|-- migrate:up|exports\.up|function up|(^|[^A-Za-z0-9_.])up[[:space:]]*\(|def upgrade)'
DOWN_PATTERN='(-- \+goose Down|-- \+migrate Down|-- \+down|-- migrate:down|exports\.down|function down|(^|[^A-Za-z0-9_.])down[[:space:]]*\(|def downgrade)'

while IFS= read -r file; do
  HAS_UP=0
  HAS_DOWN=0
  if grep -qE "$UP_PATTERN" "$file"; then
    HAS_UP=1
  fi
  if grep -qE "$DOWN_PATTERN" "$file"; then
    HAS_DOWN=1
  fi
  if [ "$HAS_UP" -eq 0 ] || [ "$HAS_DOWN" -eq 0 ]; then
    echo "❌ Migration missing up/down: $file"
    if [ "$HAS_UP" -eq 0 ]; then
      echo " Missing: up/upgrade section"
    fi
    if [ "$HAS_DOWN" -eq 0 ]; then
      echo " Missing: down/downgrade section"
    fi
    VIOLATIONS=$((VIOLATIONS + 1))
  fi

  # Additional check: destructive operations need an explicit approval comment.
  # This counts as a violation (it fails the check), not just a warning.
  # The scan covers the whole file, so a DROP TABLE in the down section of a
  # create-table migration needs the marker as well.
  if grep -qiE "(DROP TABLE|DROP COLUMN|TRUNCATE|DELETE FROM.*WHERE 1=1)" "$file"; then
    if ! grep -qE "DESTRUCTIVE-APPROVED:" "$file"; then
      echo "⚠️ Destructive migration without approval marker: $file"
      echo " Destructive migrations (DROP, TRUNCATE) must include the comment:"
      echo " -- DESTRUCTIVE-APPROVED: <reviewer username> <date>"
      VIOLATIONS=$((VIOLATIONS + 1))
    fi
  fi
done < <(find "$MIGRATIONS_DIR" -type f \( -name "*.sql" -o -name "*.ts" -o -name "*.js" -o -name "*.py" \))

if [ "$VIOLATIONS" -gt 0 ]; then
  exit 1
fi
echo "✓ All migrations are reversible"
scripts/ci/checks/routes-have-auth.sh
#!/usr/bin/env bash
# Check: every API route file must either declare an auth requirement
# or explicitly opt out with a comment.
#
# Catches the common failure where Claude adds a new endpoint and forgets
# to apply authentication middleware, silently creating a public endpoint
# that exposes data.
set -euo pipefail
VIOLATIONS=0
ROUTES_DIR="${ROUTES_DIR:-src/api/routes}"
if [ ! -d "$ROUTES_DIR" ]; then
  echo "⊘ No routes directory at $ROUTES_DIR (skipping)"
  exit 0
fi

# Require one of:
#   requireAuth, requireRole, authenticated(), @Auth decorator, @UseGuards,
#   or an explicit "// PUBLIC-ROUTE:" comment with justification.
# "authenticated" must not be preceded by a letter, so the word
# "unauthenticated" in a comment or identifier does not satisfy the check.
AUTH_PATTERN='(requireAuth|requireRole|(^|[^A-Za-z])authenticated|@Auth|@UseGuards|// PUBLIC-ROUTE:)'

while IFS= read -r file; do
  # Skip route index files, type definitions and tests. Matching on the
  # basename keeps files such as reindex.ts from being skipped by accident.
  case "${file##*/}" in
    index.ts|index.tsx|index.js|*.d.ts|*.test.*|*.spec.*)
      continue
      ;;
  esac
  if ! grep -qE "$AUTH_PATTERN" "$file"; then
    echo "❌ Route file has no auth declaration: $file"
    echo " Every route must either use auth middleware or include an explicit"
    echo " '// PUBLIC-ROUTE: <reason>' comment explaining why it's public."
    VIOLATIONS=$((VIOLATIONS + 1))
  fi
done < <(find "$ROUTES_DIR" -type f \( -name "*.ts" -o -name "*.tsx" -o -name "*.js" \))

if [ "$VIOLATIONS" -gt 0 ]; then
  exit 1
fi
echo "✓ All routes declare auth"
scripts/ci/checks/no-silent-errors.sh
#!/usr/bin/env bash
# Check: no silent catch/except blocks.
# Claude sometimes writes error handlers that swallow errors silently,
# producing code that "works" but hides real failures from observability.
set -euo pipefail
VIOLATIONS=0

# All three patterns span lines, so a multi-line PCRE matcher is required.
# Pick one up front. If none is available the check fails loudly instead of
# skipping the scans and reporting success.
if command -v pcregrep >/dev/null 2>&1; then
  ML_GREP=(pcregrep -M -n)
elif command -v pcre2grep >/dev/null 2>&1; then
  ML_GREP=(pcre2grep -M -n)
elif printf 'a\nb' | grep -Pzo 'a\nb' >/dev/null 2>&1; then
  # GNU grep: -z treats the whole file as one record, -o prints NUL-terminated
  # matches (no line numbers in this mode).
  ML_GREP=(grep -Pzo)
else
  echo "❌ Cannot scan for silent error handling: need pcregrep, pcre2grep, or a grep with -P support." >&2
  exit 2
fi

# ml_matches PATTERN FILE — print every multi-line match of PATTERN in FILE.
# Prints nothing when there is no match.
ml_matches() {
  "${ML_GREP[@]}" "$1" "$2" 2>/dev/null | tr '\0' '\n' || true
}

# JS/TS sources, excluding dependencies and build output for every pattern.
js_files() {
  find . -type f \( -name "*.ts" -o -name "*.tsx" -o -name "*.js" -o -name "*.jsx" \) \
    -not -path "*/node_modules/*" -not -path "./dist/*" 2>/dev/null || true
}

# Pattern 1: empty catch blocks in JS/TS
#   catch (e) { }
#   catch {}
while IFS= read -r file; do
  MATCHES=$(ml_matches '(catch\s*\([^)]*\)|catch)\s*\{\s*\}' "$file")
  if [ -n "$MATCHES" ]; then
    echo "❌ Empty catch block in $file:"
    echo "$MATCHES"
    VIOLATIONS=$((VIOLATIONS + 1))
  fi
done < <(js_files)

# Pattern 2: bare except: pass in Python
while IFS= read -r file; do
  MATCHES=$(ml_matches 'except[^:]*:\s*\n\s*pass' "$file")
  if [ -n "$MATCHES" ]; then
    echo "❌ except: pass in $file:"
    echo "$MATCHES"
    VIOLATIONS=$((VIOLATIONS + 1))
  fi
  # node_modules can contain vendored Python build scripts; skip it like .venv.
done < <(find . -type f -name "*.py" -not -path "./.venv/*" -not -path "*/node_modules/*" 2>/dev/null || true)

# Pattern 3: catch that only console.logs and continues (a softer form of swallowing)
while IFS= read -r file; do
  # Look for catch blocks whose only statement is a console.log/error/warn
  MATCHES=$(ml_matches '(?s)catch\s*\([^)]*\)\s*\{\s*console\.(log|error|warn)\([^)]*\);?\s*\}' "$file")
  if [ -n "$MATCHES" ]; then
    echo "⚠️ Catch block only logs in $file (use proper error handling or rethrow)"
    VIOLATIONS=$((VIOLATIONS + 1))
  fi
done < <(js_files)

if [ "$VIOLATIONS" -gt 0 ]; then
  echo ""
  echo "Found $VIOLATIONS silent error swallow(s). Either handle the error properly,"
  echo "log through the structured logger AND rethrow, or document why it's safe"
  echo "to ignore with a comment like: '// SAFE-TO-IGNORE: <reason>'"
  exit 1
fi
echo "✓ No silent error swallowing detected"
scripts/ci/checks/openapi-matches-routes.sh
#!/usr/bin/env bash
# Check: every route declared in code has a corresponding OpenAPI spec entry.
# Catches the common failure where Claude adds endpoints but forgets to document them.
set -euo pipefail
ROUTES_DIR="${ROUTES_DIR:-src/api/routes}"
OPENAPI_FILE="${OPENAPI_FILE:-openapi.yaml}"
if [ ! -d "$ROUTES_DIR" ]; then
  echo "⊘ No routes directory at $ROUTES_DIR (skipping)"
  exit 0
fi
if [ ! -f "$OPENAPI_FILE" ]; then
  echo "⊘ No OpenAPI file at $OPENAPI_FILE (skipping)"
  exit 0
fi

# Extract route paths declared in code.
# This is a rough heuristic; customize for your framework's patterns.
# Example patterns matched:
#   app.get("/users/:id"
#   router.post('/orders'
#   fastify.put("/items/:id"
# Decorator-style routes (e.g. @Get('/items/:id')) are NOT matched; extend
# ROUTE_RE if your framework uses them.
#
# grep -o emits each "<obj>.<verb>('<path>'" fragment on its own line, so every
# declaration is captured (not just the first one in the tree) and later string
# arguments on the same line are ignored. sed then strips everything up to and
# including the opening quote, plus the closing quote.
ROUTE_RE="(app|router|fastify)\.(get|post|put|patch|delete)[[:space:]]*\([[:space:]]*['\"][^'\"]+['\"]"
CODE_ROUTES=$(grep -rhoE "$ROUTE_RE" "$ROUTES_DIR" 2>/dev/null | \
  sed -E "s/^[^'\"]*['\"]//; s/['\"]\$//" | sort -u || true)

# Extract paths declared in OpenAPI
SPEC_PATHS=""
if command -v yq >/dev/null 2>&1; then
  SPEC_PATHS=$(yq eval '.paths | keys | .[]' "$OPENAPI_FILE" 2>/dev/null || echo "")
fi
if [ -z "$SPEC_PATHS" ]; then
  # Fallback (no yq, or a yq that does not understand `eval`): grep for
  # path-like keys, then strip indentation, the trailing colon and any quotes.
  SPEC_PATHS=$(grep -E "^[[:space:]]+['\"]?/" "$OPENAPI_FILE" | \
    sed -E "s/^[[:space:]]+//; s/:[[:space:]]*\$//; s/^['\"]//; s/['\"]\$//" || echo "")
fi

# Normalize route params: /:id -> /{id}
normalize() {
  echo "$1" | sed -E 's|/:([a-zA-Z_][a-zA-Z0-9_]*)|/{\1}|g'
}

# spec_has_route ROUTE — succeed when some spec path equals ROUTE or ends with
# it. The suffix form tolerates routers mounted under a prefix (/api/v1/users
# documents /users). A plain substring match is too loose: /users would be
# "documented" by /users/{id}.
# The here-string avoids an `echo | reader` pipeline, where an early exit of
# the reader can fail the whole pipeline under pipefail.
spec_has_route() {
  awk -v r="$1" '
    { n = length($0); m = length(r) }
    n >= m && substr($0, n - m + 1) == r { found = 1; exit }
    END { exit (found ? 0 : 1) }
  ' <<< "$SPEC_PATHS"
}

# Check each code route has a spec entry
MISSING=()
while IFS= read -r route; do
  [ -z "$route" ] && continue
  NORMALIZED=$(normalize "$route")
  if ! spec_has_route "$NORMALIZED"; then
    MISSING+=("$route")
  fi
done <<< "$CODE_ROUTES"

if [ ${#MISSING[@]} -gt 0 ]; then
  echo "❌ Routes missing from $OPENAPI_FILE:"
  printf ' - %s\n' "${MISSING[@]}"
  echo ""
  echo "Every route must be documented in the OpenAPI spec."
  exit 1
fi
echo "✓ All routes documented in OpenAPI spec"
GitHub Actions
.github/workflows/agent-gauntlet.yml
name: Agent Output Gauntlet

# Runs on every PR. The jobs below target problems in AI-generated code that a
# human reviewer can't reliably catch by reading; each job covers a different
# category of detection.
#
# The principle: fail fast and loudly. A green PR should give you high confidence
# the code is at least mechanically sound, so review can focus on judgment calls.
on:
  pull_request:
    branches:
      - main
      - staging
  push:
    branches:
      - main

# One run per ref: a newer run cancels the one still in progress, so repeated
# pushes to the same PR don't waste CI minutes.
concurrency:
  group: gauntlet-${{ github.ref }}
  cancel-in-progress: true

jobs:
# ============================================================
# PR SIZE GATE — reject oversized PRs before running anything else
# ============================================================
  size-gate:
    name: "Gate: PR Size"
    runs-on: ubuntu-latest
    if: github.event_name == 'pull_request'
    steps:
      - uses: actions/checkout@v4
        with:
          # Full history so the merge base of base and head is available.
          fetch-depth: 0
      - name: Check PR diff size
        # A step may declare `env` only once; a second `env:` key in the same
        # step is a duplicate mapping key and makes the workflow file invalid.
        env:
          MAX_LINES: 600
          MAX_FILES: 40
          GH_TOKEN: ${{ github.token }}
          BASE_SHA: ${{ github.event.pull_request.base.sha }}
          HEAD_SHA: ${{ github.event.pull_request.head.sha }}
          PR_NUMBER: ${{ github.event.pull_request.number }}
        run: |
          # Three-dot diff: changes on the PR branch since its merge base.
          # A two-dot diff also counts (in reverse) whatever landed on the base
          # branch after the PR branched off, inflating the numbers.
          # Count lines changed (excluding lockfiles, generated files, fixtures).
          # `sum+0` prints 0 instead of an empty string when nothing matches.
          LINES=$(git diff --numstat "$BASE_SHA...$HEAD_SHA" -- \
            ':!*.lock' \
            ':!*.lockb' \
            ':!package-lock.json' \
            ':!yarn.lock' \
            ':!pnpm-lock.yaml' \
            ':!poetry.lock' \
            ':!go.sum' \
            ':!Cargo.lock' \
            ':!**/fixtures/**' \
            ':!**/__generated__/**' \
            ':!**/*.generated.*' \
            | awk '{sum+=$1+$2} END {print sum+0}')
          FILES=$(git diff --name-only "$BASE_SHA...$HEAD_SHA" | wc -l)
          echo "Lines changed: $LINES (max $MAX_LINES)"
          echo "Files changed: $FILES (max $MAX_FILES)"
          if [ "$LINES" -gt "$MAX_LINES" ]; then
            echo "::error::PR too large ($LINES lines > $MAX_LINES). Split into smaller PRs."
            echo "Large PRs are impossible to review carefully. If this is genuinely a single logical change, add the label 'large-pr-approved' after getting sign-off."
            # Don't fail if the override label is present (exact label match)
            if ! gh pr view "$PR_NUMBER" --json labels -q '.labels[].name' | grep -qx 'large-pr-approved'; then
              exit 1
            fi
          fi
          if [ "$FILES" -gt "$MAX_FILES" ]; then
            echo "::error::PR touches too many files ($FILES > $MAX_FILES). Split into smaller PRs."
            exit 1
          fi
# ============================================================
# PR DESCRIPTION CHECK — ensure the PR includes the required sections
# ============================================================
  pr-description:
    name: "Gate: PR Description"
    runs-on: ubuntu-latest
    if: github.event_name == 'pull_request'
    steps:
      - name: Verify PR description has required sections
        env:
          # The body is passed through the environment rather than interpolated
          # into the script text.
          PR_BODY: ${{ github.event.pull_request.body }}
        run: |
          # Each entry must appear somewhere in the PR body (case-insensitive).
          REQUIRED=("What changed" "Why" "Reused" "New" "Risk areas")
          MISSING=()
          for heading in "${REQUIRED[@]}"; do
            echo "$PR_BODY" | grep -iq "$heading" || MISSING+=("$heading")
          done
          if [ "${#MISSING[@]}" -eq 0 ]; then
            exit 0
          fi
          echo "::error::PR description missing required sections: ${MISSING[*]}"
          echo "Every PR must include: What changed, Why, Reused (existing code extended), New (anything new introduced), Risk areas (what needs manual review)."
          exit 1
# ============================================================
# LINTING — style, formatting, dead code
# ============================================================
  lint:
    name: "Check: Lint & Format"
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      # Adjust the toolchains below to your actual stack; JS/TS and Python are
      # shown as examples.

      # --- JavaScript / TypeScript ---
      - name: Setup Node
        uses: actions/setup-node@v4
        with:
          node-version: "22"
          cache: "npm"
      - name: Install JS deps
        run: npm ci
      - name: Lint JS/TS
        # The project's own lint script first, then a formatting check.
        run: |
          npm run lint
          npx prettier --check "**/*.{ts,tsx,js,jsx,json,md}"

      # --- Python ---
      - name: Setup Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.12"
      - name: Lint Python
        # ruff covers both linting and the formatting check.
        run: |
          pip install ruff
          ruff check .
          ruff format --check .
# ============================================================
# TYPE CHECK — catch code that "looks right" but doesn't type
# ============================================================
  typecheck:
    name: "Check: Types"
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      # TypeScript: a compile pass with no output files. Blocking.
      - uses: actions/setup-node@v4
        with:
          node-version: "22"
          cache: "npm"
      - run: npm ci
      - run: npx tsc --noEmit

      # Python type check (if applicable). Non-blocking: mypy findings are
      # surfaced as a workflow warning instead of failing the job.
      - uses: actions/setup-python@v5
        with:
          python-version: "3.12"
      - run: pip install mypy
      - run: mypy . --ignore-missing-imports || echo "::warning::mypy has findings (non-blocking)"
# ============================================================
# TESTS — full suite, with coverage delta check
# ============================================================
  test:
    name: "Check: Tests & Coverage"
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0
      - uses: actions/setup-node@v4
        with:
          node-version: "22"
          cache: "npm"
      - run: npm ci
      - name: Run tests with coverage
        id: tests
        run: |
          # Time this (the only) test run and publish the duration as a step
          # output, so the runtime check below doesn't have to run the whole
          # suite a second time with its failures discarded.
          START=$(date +%s)
          npm test -- --coverage --coverageReporters=json-summary
          END=$(date +%s)
          echo "duration=$((END - START))" >> "$GITHUB_OUTPUT"
      - name: Check coverage didn't drop
        if: github.event_name == 'pull_request'
        run: |
          CURRENT=$(jq '.total.lines.pct' coverage/coverage-summary.json)
          echo "Current coverage: $CURRENT%"
          # Compare against the baseline stored in the repo at docs/coverage-baseline.json
          if [ -f docs/coverage-baseline.json ]; then
            BASELINE=$(jq '.total.lines.pct' docs/coverage-baseline.json)
            echo "Baseline coverage: $BASELINE%"
            # Allow up to 1% drop (noise tolerance)
            DROP=$(echo "$BASELINE - $CURRENT" | bc)
            if (( $(echo "$DROP > 1" | bc -l) )); then
              echo "::error::Coverage dropped by more than 1% ($BASELINE% -> $CURRENT%). Add tests for new code."
              exit 1
            fi
          fi
      - name: Check test suite runtime didn't balloon
        env:
          DURATION: ${{ steps.tests.outputs.duration }}
        run: |
          echo "Test suite runtime: ${DURATION}s"
          # docs/test-runtime-baseline.txt holds the expected runtime in whole
          # seconds. The measured run includes coverage instrumentation, so
          # record the baseline from a coverage run as well.
          if [ -f docs/test-runtime-baseline.txt ]; then
            BASELINE=$(tr -d '[:space:]' < docs/test-runtime-baseline.txt)
            # Fail if runtime more than 2x baseline
            if [ "$DURATION" -gt $((BASELINE * 2)) ]; then
              echo "::error::Test suite runtime ${DURATION}s is >2x baseline ${BASELINE}s. Claude may have added slow tests or broken parallelization."
              exit 1
            fi
          fi
# ============================================================
# SECURITY — static analysis for common vulns
# ============================================================
  security:
    name: "Check: Security (SAST)"
    runs-on: ubuntu-latest
    permissions:
      security-events: write
      actions: read
      contents: read
    steps:
      - uses: actions/checkout@v4
        with:
          # Gitleaks scans commits, not just the working tree; the default
          # shallow clone does not contain the commit range it needs.
          fetch-depth: 0
      # Semgrep — fast, broad, lots of rules for common vulnerabilities
      - name: Semgrep
        uses: semgrep/semgrep-action@v1
        with:
          config: |
            p/security-audit
            p/secrets
            p/owasp-top-ten
            p/javascript
            p/typescript
            p/python
      # Dedicated secret scanning
      # NOTE(review): gitleaks-action may also need a GITLEAKS_LICENSE secret
      # for organization-owned repos — confirm against the action's README.
      - name: Gitleaks
        uses: gitleaks/gitleaks-action@v2
        env:
          GITHUB_TOKEN: ${{ github.token }}
      # CodeQL — deeper analysis, catches things Semgrep misses
      - name: Initialize CodeQL
        uses: github/codeql-action/init@v3
        with:
          languages: javascript,python
      - name: CodeQL Analysis
        uses: github/codeql-action/analyze@v3
# ============================================================
# DEPENDENCY REVIEW — flag new deps with known vulns or licensing issues
# ============================================================
  dependency-review:
    name: "Check: Dependency Review"
    runs-on: ubuntu-latest
    if: github.event_name == 'pull_request'
    steps:
      - uses: actions/checkout@v4
      - name: Dependency Review
        uses: actions/dependency-review-action@v4
        with:
          fail-on-severity: high
          # Fail if new dependencies add GPL/AGPL licenses (adjust for your policy)
          deny-licenses: AGPL-3.0, GPL-3.0
      - name: Check dependency count delta
        env:
          BASE_SHA: ${{ github.event.pull_request.base.sha }}
        run: |
          # dependencies + devDependencies of a package.json file.
          # The parentheses matter: jq's `|` binds loosest, so the unparenthesized
          # `.dependencies | length + (.devDependencies | length)` evaluates
          # .devDependencies *inside* the dependencies object and always adds 0.
          # `// {}` makes a missing section count as empty.
          count_deps() {
            jq '((.dependencies // {}) | length) + ((.devDependencies // {}) | length)' "$1"
          }
          # Count current deps
          CURRENT=$(count_deps package.json)
          # Count deps on the PR's base commit. The default checkout is shallow
          # and has no origin/main ref, so fetch the base commit explicitly;
          # this also handles PRs that target a branch other than main.
          git fetch --no-tags --depth=1 origin "$BASE_SHA"
          git show "$BASE_SHA:package.json" > /tmp/base-package.json 2>/dev/null || echo "{}" > /tmp/base-package.json
          BASE=$(count_deps /tmp/base-package.json)
          DELTA=$((CURRENT - BASE))
          echo "Dependency count: $BASE -> $CURRENT (delta: $DELTA)"
          # Flag any PR that adds more than 5 deps at once
          if [ "$DELTA" -gt 5 ]; then
            echo "::error::This PR adds $DELTA dependencies. Claude may be pulling in packages unnecessarily. Justify each new dependency in the PR description or split into smaller PRs."
            exit 1
          fi
# ============================================================
# COMPLEXITY — flag functions that got too complex (warnings only)
# ============================================================
  complexity:
    name: "Check: Complexity"
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-node@v4
        with:
          node-version: "22"
      - name: Check JS/TS complexity
        run: |
          npx --yes complexity-report-json --format json 'src/**/*.{ts,tsx,js,jsx}' > /tmp/complexity.json || true
          # Warn (non-blocking) if any function has cyclomatic complexity > 15 or length > 100 lines
          HIGH_COMPLEXITY=$(jq '[.reports[].functions[] | select(.cyclomatic > 15 or .sloc.logical > 100)] | length' /tmp/complexity.json 2>/dev/null || echo "0")
          # An empty report file makes jq print nothing; treat that as 0 so the
          # integer comparison below stays valid.
          HIGH_COMPLEXITY="${HIGH_COMPLEXITY:-0}"
          if [ "$HIGH_COMPLEXITY" -gt "0" ]; then
            echo "::warning::$HIGH_COMPLEXITY functions exceed complexity thresholds. Consider breaking them up."
            jq '.reports[].functions[] | select(.cyclomatic > 15 or .sloc.logical > 100) | {file: .file, name: .name, complexity: .cyclomatic, lines: .sloc.logical}' /tmp/complexity.json
          fi
      - uses: actions/setup-python@v5
        with:
          python-version: "3.12"
      - name: Check Python complexity
        run: |
          pip install radon
          # Warn (non-blocking) on any function with complexity grade C or worse.
          # `radon cc` reports findings on stdout and does not signal them via
          # its exit status, so the warning is keyed on the output being non-empty.
          RADON_OUT=$(radon cc -s -n C . || true)
          if [ -n "$RADON_OUT" ]; then
            echo "::warning::Python complexity findings"
            echo "$RADON_OUT"
          fi
# ============================================================
# CUSTOM CHECKS — repo-specific invariants Claude keeps violating
# ============================================================
  custom-checks:
    name: "Check: Repo-Specific Invariants"
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Run repo-specific checks
        run: |
          # Execute every script under scripts/ci/checks/. A non-zero exit from
          # any of them fails the step, but the remaining scripts still run so
          # one PR run reports every violated invariant.
          set +e
          status=0
          for check in scripts/ci/checks/*.sh; do
            [ -f "$check" ] || continue
            echo "=== Running $check ==="
            if ! bash "$check"; then
              echo "::error::Check failed: $check"
              status=1
            fi
          done
          exit $status
# ============================================================
# TODO / DEBT GUARD — no new TODOs or disabled tests
# ============================================================
  debt-guard:
    name: "Check: No New Debt"
    runs-on: ubuntu-latest
    if: github.event_name == 'pull_request'
    steps:
      - uses: actions/checkout@v4
        with:
          # Full history so the merge base of base and head is available.
          fetch-depth: 0
      - name: Check for new TODOs
        env:
          BASE_SHA: ${{ github.event.pull_request.base.sha }}
          HEAD_SHA: ${{ github.event.pull_request.head.sha }}
        run: |
          # Find lines added in this PR containing TODO/FIXME/XXX/HACK markers.
          # Three-dot diff: only changes made on the PR branch since its merge
          # base. With a two-dot diff, a marker removed on the base branch after
          # the PR branched off shows up as "added" by the PR.
          # The optional "(owner)" group also catches the TODO(name): form.
          NEW_TODOS=$(git diff "$BASE_SHA...$HEAD_SHA" --unified=0 | \
            grep -E "^\+" | \
            grep -v "^+++" | \
            grep -E "(TODO|FIXME|XXX|HACK)(\([^)]*\))?:" || true)
          if [ -n "$NEW_TODOS" ]; then
            echo "::error::This PR adds TODO/FIXME/XXX/HACK markers. Either fix now or file issues."
            echo "$NEW_TODOS"
            exit 1
          fi
      - name: Check for disabled tests
        env:
          BASE_SHA: ${{ github.event.pull_request.base.sha }}
          HEAD_SHA: ${{ github.event.pull_request.head.sha }}
        run: |
          # Added lines that skip or focus tests (JS test runners, pytest, Rust).
          DISABLED=$(git diff "$BASE_SHA...$HEAD_SHA" --unified=0 | \
            grep -E "^\+" | \
            grep -v "^+++" | \
            grep -E "(\.skip|\.only|xit\(|xdescribe\(|@pytest\.mark\.skip|#\[ignore\])" || true)
          if [ -n "$DISABLED" ]; then
            echo "::error::This PR disables or focuses tests. This is not allowed."
            echo "$DISABLED"
            exit 1
          fi
# ============================================================
# SESSION SUMMARY — generate a readable summary of what Claude did
# ============================================================
  session-summary:
    name: "Report: Change Summary"
    runs-on: ubuntu-latest
    if: github.event_name == 'pull_request'
    permissions:
      pull-requests: write
    steps:
      - uses: actions/checkout@v4
        with:
          # Full history so the merge base of base and head is available.
          fetch-depth: 0
      - name: Generate change summary
        env:
          BASE_SHA: ${{ github.event.pull_request.base.sha }}
          HEAD_SHA: ${{ github.event.pull_request.head.sha }}
        run: |
          # Three-dot range: changes made on the PR branch since its merge base.
          RANGE="$BASE_SHA...$HEAD_SHA"
          OUT=/tmp/summary.md

          # Append $1 as a markdown bullet list, or "None" when it is empty.
          # (`cmd | sed ... || echo None` never prints "None": sed exits 0 on
          # empty input.)
          bullets_or_none() {
            if [ -n "$1" ]; then
              echo "$1" | sed 's/^/- /' >> "$OUT"
            else
              echo "None" >> "$OUT"
            fi
          }

          echo "## Change Summary" > "$OUT"
          echo "" >> "$OUT"
          echo "### New files" >> "$OUT"
          bullets_or_none "$(git diff --name-only --diff-filter=A "$RANGE")"
          echo "" >> "$OUT"
          echo "### Deleted files" >> "$OUT"
          bullets_or_none "$(git diff --name-only --diff-filter=D "$RANGE")"
          echo "" >> "$OUT"
          echo "### Modified files (top 20 by lines changed)" >> "$OUT"
          # Rank by added + deleted (binary files report "-" and count as 0).
          # numstat is tab-separated; splitting on tabs keeps paths that
          # contain spaces intact.
          git diff --numstat "$RANGE" \
            | awk -F'\t' '{ printf "%d\t%s\t%s\t%s\n", $1 + $2, $1, $2, $3 }' \
            | sort -rn | head -20 \
            | awk -F'\t' '{ printf "- `%s` (+%s -%s)\n", $4, $2, $3 }' >> "$OUT"
          echo "" >> "$OUT"
          echo "### Sensitive area touches" >> "$OUT"
          SENSITIVE=$(git diff --name-only "$RANGE" | grep -E "(auth|payment|permission|migration|\.github|infrastructure|\.env)" || true)
          if [ -n "$SENSITIVE" ]; then
            echo "⚠️ **This PR touches sensitive areas:**" >> "$OUT"
            echo "$SENSITIVE" | sed 's/^/- /' >> "$OUT"
          else
            echo "None" >> "$OUT"
          fi
          cat "$OUT"
      - name: Post summary as PR comment
        uses: actions/github-script@v7
        with:
          script: |
            const fs = require('fs');
            const summary = fs.readFileSync('/tmp/summary.md', 'utf8');
            const { owner, repo } = context.repo;
            const issue_number = context.issue.number;

            // Find the existing summary comment and update it, rather than
            // spamming new ones. listComments is paginated; walk every page so
            // the earlier summary is still found on PRs with many comments.
            const comments = await github.paginate(github.rest.issues.listComments, {
              owner,
              repo,
              issue_number,
              per_page: 100,
            });
            const existing = comments.find((c) => c.body?.startsWith('## Change Summary'));

            if (existing) {
              await github.rest.issues.updateComment({
                owner,
                repo,
                comment_id: existing.id,
                body: summary,
              });
            } else {
              await github.rest.issues.createComment({
                owner,
                repo,
                issue_number,
                body: summary,
              });
            }
Gitignore for logs
.claude/logs/.gitignore
# Runtime-generated logs (do not commit)
bash.log
bash.jsonl
sudo.log
sensitive.log
# Read by weekly-review.sh; listed so it is not committed with the other logs
blocked.log
archive/
# Keep summaries and reviews in git — they're valuable history
!session-summaries.md
!weekly-reviews/
!weekly-reviews/*.md