From 9ee1429604c73e02fb759f4b575fda026bf20a12 Mon Sep 17 00:00:00 2001 From: openhands Date: Sun, 15 Mar 2026 16:30:54 +0000 Subject: [PATCH 1/2] fix: feat: red-team sweep should seed each candidate with cross-candidate attack patterns (#822) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - red-team-sweep.sh: after each candidate completes, extract all memory entries into /tmp/red-team-cross-patterns.jsonl (append), then clear the raw memory file so the next candidate starts with a fresh state - red-team.sh: define CROSS_PATTERNS_FILE; before building the prompt, read the cross-patterns file and generate a "Cross-Candidate Intelligence" section grouped by abstract op pattern — universal patterns (broke 2+ candidates), candidate-specific wins, and patterns that held everywhere — each annotated with optimizer profiles - The new section is injected into the Claude prompt above the existing Previous Findings block, satisfying all acceptance criteria Co-Authored-By: Claude Sonnet 4.6 --- scripts/harb-evaluator/red-team-sweep.sh | 35 ++++++++++ scripts/harb-evaluator/red-team.sh | 86 ++++++++++++++++++++++++ 2 files changed, 121 insertions(+) diff --git a/scripts/harb-evaluator/red-team-sweep.sh b/scripts/harb-evaluator/red-team-sweep.sh index 339b18f..a392518 100755 --- a/scripts/harb-evaluator/red-team-sweep.sh +++ b/scripts/harb-evaluator/red-team-sweep.sh @@ -10,6 +10,8 @@ SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" INJECT="$REPO_ROOT/tools/push3-transpiler/inject.sh" ATTACKS_OUT="$REPO_ROOT/onchain/script/backtesting/attacks" PROGRESS_FILE="/tmp/red-team-sweep-progress.json" +MEMORY_FILE="$REPO_ROOT/tmp/red-team-memory.jsonl" +CROSS_PATTERNS_FILE="/tmp/red-team-cross-patterns.jsonl" OPT_SOL="$REPO_ROOT/onchain/src/OptimizerV3.sol" TIMEOUT_PER="${1:-3600}" @@ -114,6 +116,39 @@ PYEOF fi fi + # 4b. 
Extract abstract patterns into cross-candidate file, then clear raw memory + if [[ -f "$MEMORY_FILE" && -s "$MEMORY_FILE" ]]; then + python3 - "$MEMORY_FILE" "$CROSS_PATTERNS_FILE" <<'PYEOF' +import json, sys + +mem_file = sys.argv[1] +cross_file = sys.argv[2] + +new_entries = [] +with open(mem_file) as f: + for line in f: + line = line.strip() + if line: + try: + new_entries.append(json.loads(line)) + except Exception: + pass + +if not new_entries: + print(" No memory entries to extract") + sys.exit(0) + +with open(cross_file, 'a') as f: + for e in new_entries: + f.write(json.dumps(e) + '\n') + +print(f" Extracted {len(new_entries)} entr{'y' if len(new_entries)==1 else 'ies'} to cross-patterns file") +PYEOF + # Clear raw memory so the next candidate starts with a fresh tactical state + > "$MEMORY_FILE" + log "Cleared raw memory for next candidate" + fi + # 5. Save progress completed+=("$seed_name") jq -n --argjson arr "$(printf '%s\n' "${completed[@]}" | jq -R . | jq -s .)" \ diff --git a/scripts/harb-evaluator/red-team.sh b/scripts/harb-evaluator/red-team.sh index ff581bf..a3a1add 100755 --- a/scripts/harb-evaluator/red-team.sh +++ b/scripts/harb-evaluator/red-team.sh @@ -26,6 +26,7 @@ REPORT_DIR="$REPO_ROOT/tmp" REPORT="$REPORT_DIR/red-team-report.txt" STREAM_LOG="$REPORT_DIR/red-team-stream.jsonl" MEMORY_FILE="$REPO_ROOT/tmp/red-team-memory.jsonl" +CROSS_PATTERNS_FILE="/tmp/red-team-cross-patterns.jsonl" ATTACK_EXPORT="$REPORT_DIR/red-team-attacks.jsonl" ATTACK_SNAPSHOTS="$REPORT_DIR/red-team-snapshots.jsonl" DEPLOYMENTS="$REPO_ROOT/onchain/deployments-local.json" @@ -462,6 +463,89 @@ PYEOF ) fi +# Build Cross-Candidate Intelligence section from the cross-patterns file +CROSS_CANDIDATE_SECTION="" +if [[ -f "$CROSS_PATTERNS_FILE" && -s "$CROSS_PATTERNS_FILE" ]]; then + CROSS_CANDIDATE_SECTION=$(python3 - "$CROSS_PATTERNS_FILE" "$CANDIDATE_NAME" <<'PYEOF' +import json, sys +from collections import defaultdict + +cross_file = sys.argv[1] +current_candidate = 
sys.argv[2] if len(sys.argv) > 2 else "" + +entries = [] +with open(cross_file) as f: + for line in f: + line = line.strip() + if line: + try: + entries.append(json.loads(line)) + except Exception: + pass + +if not entries: + sys.exit(0) + +# Group by abstract pattern; track worked/failed per candidate +by_pattern = defaultdict(lambda: {"worked": {}, "failed": {}, "insight": ""}) +for e in entries: + pat = e.get("pattern", "") or e.get("strategy", "")[:80] + cand = e.get("candidate", "unknown") + prof = e.get("optimizer_profile", "unknown") + result = e.get("result", "HELD") + insight = e.get("insight", "") + if result == "DECREASED": + by_pattern[pat]["worked"][cand] = prof + else: + by_pattern[pat]["failed"][cand] = prof + if insight and not by_pattern[pat]["insight"]: + by_pattern[pat]["insight"] = insight + +universal = [(p, d) for p, d in by_pattern.items() if len(d["worked"]) > 1] +candidate_specific = [(p, d) for p, d in by_pattern.items() if len(d["worked"]) == 1] +failed_all = [(p, d) for p, d in by_pattern.items() if not d["worked"] and d["failed"]] + +print("## Cross-Candidate Intelligence") +print() +print("Attack patterns learned across all previously tested candidates.") +print("Exploit successes. 
Avoid repeating patterns that universally failed.") +print() + +def fmt_cand(cand, prof): + return f"{cand} ({prof})" if prof and prof not in ("", "unknown") else cand + +if universal: + print("### Universal Patterns (succeeded on 2+ candidates)") + for pat, d in sorted(universal, key=lambda x: -len(x[1]["worked"])): + worked_str = ", ".join(fmt_cand(c, p) for c, p in sorted(d["worked"].items())) + print(f"- `{pat}` — **BROKE** on: {worked_str}") + if d["failed"]: + failed_str = ", ".join(d["failed"]) + print(f" Held on: {failed_str}") + if d["insight"]: + print(f" Insight: {d['insight']}") + print() + +if candidate_specific: + print("### Candidate-Specific Patterns (broke exactly one candidate)") + for pat, d in candidate_specific: + worked_cand, worked_prof = next(iter(d["worked"].items())) + print(f"- `{pat}` — **BROKE** on: {fmt_cand(worked_cand, worked_prof)}") + if d["failed"]: + print(f" Held on: {', '.join(d['failed'])}") + if d["insight"]: + print(f" Insight: {d['insight']}") + print() + +if failed_all: + print("### Patterns That Held Across All Candidates Tried") + for pat, d in failed_all: + print(f"- `{pat}` — held on: {', '.join(d['failed'])}") + print() +PYEOF + ) +fi + PROMPT=$(cat < Date: Sun, 15 Mar 2026 17:02:19 +0000 Subject: [PATCH 2/2] fix: address review findings for cross-candidate red-team sweep (#822) - red-team-sweep.sh: reset CROSS_PATTERNS_FILE at sweep start to prevent stale patterns from prior invocations contaminating a fresh run - red-team-sweep.sh: wrap pattern-extraction Python in set +e/set -e and capture output so log() prefix is applied; move memory truncation outside the if-block so it runs unconditionally even if Python fails - red-team.sh: filter entries where candidate == current_candidate before grouping, removing self-referential cross-candidate evidence - red-team.sh: skip entries with empty pattern key (both pattern and strategy fields empty) to prevent spurious bucket merging Co-Authored-By: Claude Sonnet 4.6 --- 
scripts/harb-evaluator/red-team-sweep.sh | 19 +++++++++++++++---- scripts/harb-evaluator/red-team.sh | 5 +++++ 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/scripts/harb-evaluator/red-team-sweep.sh b/scripts/harb-evaluator/red-team-sweep.sh index a392518..f59023b 100755 --- a/scripts/harb-evaluator/red-team-sweep.sh +++ b/scripts/harb-evaluator/red-team-sweep.sh @@ -20,6 +20,9 @@ die() { log "FATAL: $*" >&2; exit 1; } [[ -f "$INJECT" ]] || die "inject.sh not found at $INJECT" +# Reset cross-patterns file for this sweep invocation (prevents stale data from prior runs) +> "$CROSS_PATTERNS_FILE" + # Load progress completed=() if [[ -f "$PROGRESS_FILE" ]]; then @@ -118,7 +121,8 @@ PYEOF # 4b. Extract abstract patterns into cross-candidate file, then clear raw memory if [[ -f "$MEMORY_FILE" && -s "$MEMORY_FILE" ]]; then - python3 - "$MEMORY_FILE" "$CROSS_PATTERNS_FILE" <<'PYEOF' + set +e + _extract_out=$(python3 - "$MEMORY_FILE" "$CROSS_PATTERNS_FILE" <<'PYEOF' import json, sys mem_file = sys.argv[1] @@ -135,16 +139,23 @@ with open(mem_file) as f: pass if not new_entries: - print(" No memory entries to extract") + print("No memory entries to extract") sys.exit(0) with open(cross_file, 'a') as f: for e in new_entries: f.write(json.dumps(e) + '\n') -print(f" Extracted {len(new_entries)} entr{'y' if len(new_entries)==1 else 'ies'} to cross-patterns file") +print(f"Extracted {len(new_entries)} entr{'y' if len(new_entries)==1 else 'ies'} to cross-patterns file") PYEOF - # Clear raw memory so the next candidate starts with a fresh tactical state + ) + _py_exit=$? 
+ set -e + [[ -n "$_extract_out" ]] && log "$_extract_out" + [[ $_py_exit -ne 0 ]] && log "WARNING: cross-pattern extraction failed (exit $_py_exit) — continuing" + fi + # Always clear raw memory so the next candidate starts with a fresh tactical state + if [[ -f "$MEMORY_FILE" ]]; then > "$MEMORY_FILE" log "Cleared raw memory for next candidate" fi diff --git a/scripts/harb-evaluator/red-team.sh b/scripts/harb-evaluator/red-team.sh index a3a1add..a951bb9 100755 --- a/scripts/harb-evaluator/red-team.sh +++ b/scripts/harb-evaluator/red-team.sh @@ -486,10 +486,15 @@ with open(cross_file) as f: if not entries: sys.exit(0) +# Exclude the current candidate's own entries (this section reports cross-candidate evidence, not self-evidence) +entries = [e for e in entries if e.get("candidate", "unknown") != current_candidate] + # Group by abstract pattern; track worked/failed per candidate by_pattern = defaultdict(lambda: {"worked": {}, "failed": {}, "insight": ""}) for e in entries: pat = e.get("pattern", "") or e.get("strategy", "")[:80] + if not pat: + continue # skip entries with no identifiable pattern cand = e.get("candidate", "unknown") prof = e.get("optimizer_profile", "unknown") result = e.get("result", "HELD")