From 9ee1429604c73e02fb759f4b575fda026bf20a12 Mon Sep 17 00:00:00 2001 From: openhands Date: Sun, 15 Mar 2026 16:30:54 +0000 Subject: [PATCH] fix: feat: red-team sweep should seed each candidate with cross-candidate attack patterns (#822) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - red-team-sweep.sh: after each candidate completes, extract all memory entries into /tmp/red-team-cross-patterns.jsonl (append), then clear the raw memory file so the next candidate starts with a fresh state - red-team.sh: define CROSS_PATTERNS_FILE; before building the prompt, read the cross-patterns file and generate a "Cross-Candidate Intelligence" section grouped by abstract op pattern — universal patterns (broke 2+ candidates), candidate-specific wins, and patterns that held everywhere — each annotated with optimizer profiles - The new section is injected into the Claude prompt above the existing Previous Findings block, satisfying all acceptance criteria Co-Authored-By: Claude Sonnet 4.6 --- scripts/harb-evaluator/red-team-sweep.sh | 35 ++++++++++ scripts/harb-evaluator/red-team.sh | 86 ++++++++++++++++++++++++ 2 files changed, 121 insertions(+) diff --git a/scripts/harb-evaluator/red-team-sweep.sh b/scripts/harb-evaluator/red-team-sweep.sh index 339b18f..a392518 100755 --- a/scripts/harb-evaluator/red-team-sweep.sh +++ b/scripts/harb-evaluator/red-team-sweep.sh @@ -10,6 +10,8 @@ SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" INJECT="$REPO_ROOT/tools/push3-transpiler/inject.sh" ATTACKS_OUT="$REPO_ROOT/onchain/script/backtesting/attacks" PROGRESS_FILE="/tmp/red-team-sweep-progress.json" +MEMORY_FILE="$REPO_ROOT/tmp/red-team-memory.jsonl" +CROSS_PATTERNS_FILE="/tmp/red-team-cross-patterns.jsonl" OPT_SOL="$REPO_ROOT/onchain/src/OptimizerV3.sol" TIMEOUT_PER="${1:-3600}" @@ -114,6 +116,39 @@ PYEOF fi fi + # 4b. Extract abstract patterns into cross-candidate file, then clear raw memory + if [[ -f "$MEMORY_FILE" && -s "$MEMORY_FILE" ]]; then + python3 - "$MEMORY_FILE" "$CROSS_PATTERNS_FILE" <<'PYEOF' +import json, sys + +mem_file = sys.argv[1] +cross_file = sys.argv[2] + +new_entries = [] +with open(mem_file) as f: + for line in f: + line = line.strip() + if line: + try: + new_entries.append(json.loads(line)) + except Exception: + pass + +if not new_entries: + print(" No memory entries to extract") + sys.exit(0) + +with open(cross_file, 'a') as f: + for e in new_entries: + f.write(json.dumps(e) + '\n') + +print(f" Extracted {len(new_entries)} entr{'y' if len(new_entries)==1 else 'ies'} to cross-patterns file") +PYEOF + # Clear raw memory so the next candidate starts with a fresh tactical state + > "$MEMORY_FILE" + log "Cleared raw memory for next candidate" + fi + # 5. Save progress completed+=("$seed_name") jq -n --argjson arr "$(printf '%s\n' "${completed[@]}" | jq -R . | jq -s .)" \ diff --git a/scripts/harb-evaluator/red-team.sh b/scripts/harb-evaluator/red-team.sh index ff581bf..a3a1add 100755 --- a/scripts/harb-evaluator/red-team.sh +++ b/scripts/harb-evaluator/red-team.sh @@ -26,6 +26,7 @@ REPORT_DIR="$REPO_ROOT/tmp" REPORT="$REPORT_DIR/red-team-report.txt" STREAM_LOG="$REPORT_DIR/red-team-stream.jsonl" MEMORY_FILE="$REPO_ROOT/tmp/red-team-memory.jsonl" +CROSS_PATTERNS_FILE="/tmp/red-team-cross-patterns.jsonl" ATTACK_EXPORT="$REPORT_DIR/red-team-attacks.jsonl" ATTACK_SNAPSHOTS="$REPORT_DIR/red-team-snapshots.jsonl" DEPLOYMENTS="$REPO_ROOT/onchain/deployments-local.json" @@ -462,6 +463,89 @@ PYEOF ) fi +# Build Cross-Candidate Intelligence section from the cross-patterns file +CROSS_CANDIDATE_SECTION="" +if [[ -f "$CROSS_PATTERNS_FILE" && -s "$CROSS_PATTERNS_FILE" ]]; then + CROSS_CANDIDATE_SECTION=$(python3 - "$CROSS_PATTERNS_FILE" "$CANDIDATE_NAME" <<'PYEOF' +import json, sys +from collections import defaultdict + +cross_file = sys.argv[1] +current_candidate = sys.argv[2] if len(sys.argv) > 2 else "" + +entries = [] +with open(cross_file) as f: + for line in f: + line = line.strip() + if line: + try: + entries.append(json.loads(line)) + except Exception: + pass + +if not entries: + sys.exit(0) + +# Group by abstract pattern; track worked/failed per candidate +by_pattern = defaultdict(lambda: {"worked": {}, "failed": {}, "insight": ""}) +for e in entries: + pat = e.get("pattern", "") or e.get("strategy", "")[:80] + cand = e.get("candidate", "unknown") + prof = e.get("optimizer_profile", "unknown") + result = e.get("result", "HELD") + insight = e.get("insight", "") + if result == "DECREASED": + by_pattern[pat]["worked"][cand] = prof + else: + by_pattern[pat]["failed"][cand] = prof + if insight and not by_pattern[pat]["insight"]: + by_pattern[pat]["insight"] = insight + +universal = [(p, d) for p, d in by_pattern.items() if len(d["worked"]) > 1] +candidate_specific = [(p, d) for p, d in by_pattern.items() if len(d["worked"]) == 1] +failed_all = [(p, d) for p, d in by_pattern.items() if not d["worked"] and d["failed"]] + +print("## Cross-Candidate Intelligence") +print() +print("Attack patterns learned across all previously tested candidates.") +print("Exploit successes. Avoid repeating patterns that universally failed.") +print() + +def fmt_cand(cand, prof): + return f"{cand} ({prof})" if prof and prof not in ("", "unknown") else cand + +if universal: + print("### Universal Patterns (succeeded on 2+ candidates)") + for pat, d in sorted(universal, key=lambda x: -len(x[1]["worked"])): + worked_str = ", ".join(fmt_cand(c, p) for c, p in sorted(d["worked"].items())) + print(f"- `{pat}` — **BROKE** on: {worked_str}") + if d["failed"]: + failed_str = ", ".join(d["failed"]) + print(f" Held on: {failed_str}") + if d["insight"]: + print(f" Insight: {d['insight']}") + print() + +if candidate_specific: + print("### Candidate-Specific Patterns (broke exactly one candidate)") + for pat, d in candidate_specific: + worked_cand, worked_prof = next(iter(d["worked"].items())) + print(f"- `{pat}` — **BROKE** on: {fmt_cand(worked_cand, worked_prof)}") + if d["failed"]: + print(f" Held on: {', '.join(d['failed'])}") + if d["insight"]: + print(f" Insight: {d['insight']}") + print() + +if failed_all: + print("### Patterns That Held Across All Candidates Tried") + for pat, d in failed_all: + print(f"- `{pat}` — held on: {', '.join(d['failed'])}") + print() +PYEOF + ) +fi + PROMPT=$(cat <