fix: Red-team memory: persistent cross-run learning for adversarial agent (#528)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-09 09:23:37 +00:00 · 2026-03-09 09:23:37 +00:00 · c1db4cb93e
commit c1db4cb93e
parent 28568dbcfd
1 changed files with 192 additions and 2 deletions
--- a/scripts/harb-evaluator/red-team.sh
+++ b/scripts/harb-evaluator/red-team.sh
@ -23,6 +23,8 @@ CLAUDE_TIMEOUT="${CLAUDE_TIMEOUT:-7200}"
 REPO_ROOT="$(cd "$(dirname "$0")/../.." && pwd)"
 REPORT_DIR="$REPO_ROOT/tmp"
 REPORT="$REPORT_DIR/red-team-report.txt"
+STREAM_LOG="$REPORT_DIR/red-team-stream.jsonl"  # raw stream-json output (stdout+stderr) of the agent run
+MEMORY_FILE="$REPORT_DIR/red-team-memory.jsonl"  # strategy findings kept across runs, one JSON object per line
 DEPLOYMENTS="$REPO_ROOT/onchain/deployments-local.json"

 # ── Anvil accounts ─────────────────────────────────────────────────────────────
@ -154,12 +156,173 @@ print(0 if adj <= 0 else (e + w) * 10**18 // adj)
 PYEOF
 }

+# ── Helper: extract strategy findings from stream-json and append to memory ────
+extract_memory() {
+  local stream_file="$1"
+  local run_num memory_file="$MEMORY_FILE"
+
+  # Determine run number (rough: count existing entries)
+  if [[ -f "$memory_file" ]]; then
+    run_num=$(wc -l < "$memory_file")
+    run_num=$((run_num / 3 + 1))
+  else
+    run_num=1
+  fi
+
+  python3 - "$stream_file" "$memory_file" "$run_num" "$FLOOR_BEFORE" <<'PYEOF'
+import json, sys, re
+from datetime import datetime, timezone
+
+stream_file = sys.argv[1]
+memory_file = sys.argv[2]
+run_num = int(sys.argv[3])
+floor_before = int(sys.argv[4])
+
+texts = []
+with open(stream_file) as f:
+    for line in f:
+        line = line.strip()
+        if not line:
+            continue
+        try:
+            obj = json.loads(line)
+            if obj.get("type") == "assistant":
+                for block in obj.get("message", {}).get("content", []):
+                    if block.get("type") == "text":
+                        texts.append(block["text"])
+        except:
+            pass
+
+# Parse strategies from agent text
+strategies = []
+current = None
+for text in texts:
+    # Detect strategy headers
+    strat_match = re.search(r"##\s*Strategy\s*\d+[^:]*:\s*(.+)", text)
+    if strat_match:
+        if current:
+            strategies.append(current)
+        current = {
+            "strategy": strat_match.group(1).strip(),
+            "steps": "",
+            "floor_after": None,
+            "insight": ""
+        }
+
+    if current:
+        # Capture floor readings
+        floor_match = re.search(r"(?:floor|ethPerToken)[^\d]*?(\d{4,})\s*(?:wei)?", text, re.IGNORECASE)
+        if floor_match:
+            current["floor_after"] = int(floor_match.group(1))
+
+        # Capture insights
+        for pattern in [r"[Kk]ey [Ii]nsight:\s*(.+)", r"[Ii]nsight:\s*(.+)", r"(?:discovered|learned|realized)\s+(?:that\s+)?(.+)"]:
+            insight_match = re.search(pattern, text)
+            if insight_match and len(insight_match.group(1)) > 20:
+                current["insight"] = insight_match.group(1).strip()[:300]
+
+        # Capture step summaries
+        if any(word in text.lower() for word in ["wrap", "buy", "sell", "stake", "recenter", "mint", "approve"]):
+            if len(text) < 200:
+                current["steps"] += text.strip() + "; "
+
+if current:
+    strategies.append(current)
+
+# Write to memory file
+ts = datetime.now(timezone.utc).isoformat()
+with open(memory_file, "a") as f:
+    for s in strategies:
+        fa = s.get("floor_after") or floor_before
+        delta_bps = (fa - floor_before) * 10000 // floor_before if floor_before else 0
+        if fa < floor_before:
+            result = "DECREASED"
+        elif fa > floor_before:
+            result = "INCREASED"
+        else:
+            result = "HELD"
+
+        entry = {
+            "run": run_num,
+            "ts": ts,
+            "strategy": s["strategy"][:100],
+            "steps": s["steps"][:300].rstrip("; "),
+            "floor_before": floor_before,
+            "floor_after": fa,
+            "delta_bps": delta_bps,
+            "result": result,
+            "insight": s["insight"][:300]
+        }
+        f.write(json.dumps(entry) + "\n")
+        print(f"  Recorded: {entry['strategy']} → {result} ({delta_bps:+d} bps)")
+
+if not strategies:
+    print("  No strategies detected in stream output")
+
+# Trim memory file: keep 10 most recent + all DECREASED entries (cap at 50)
+with open(memory_file) as f:
+    all_entries = [json.loads(l) for l in f if l.strip()]
+
+if len(all_entries) > 50:
+    decreased = [e for e in all_entries if e.get("result") == "DECREASED"]
+    recent = all_entries[-10:]
+    kept = {id(e): e for e in decreased + recent}.values()
+    # Preserve insertion order: filter all_entries keeping only kept ids
+    kept_set = set(id(e) for e in kept)
+    # Rebuild from original list preserving order
+    trimmed = [e for e in all_entries if e.get("result") == "DECREASED"] + all_entries[-10:]
+    # Deduplicate preserving order
+    seen = set()
+    deduped = []
+    for e in trimmed:
+        key = (e.get("run"), e.get("ts"), e.get("strategy"))
+        if key not in seen:
+            seen.add(key)
+            deduped.append(e)
+    with open(memory_file, "w") as f:
+        for e in deduped:
+            f.write(json.dumps(e) + "\n")
+    print(f"  Trimmed memory to {len(deduped)} entries")
+PYEOF
+}
+
 # ── 5. Read floor_before ───────────────────────────────────────────────────────
 log "Reading floor before agent run..."
 FLOOR_BEFORE=$(compute_eth_per_token)
 log "  floor_before = $FLOOR_BEFORE wei/token"

 # ── 6. Build agent prompt ──────────────────────────────────────────────────────
+
+# Build Previous Findings section from memory file
+MEMORY_SECTION=""
+if [[ -f "$MEMORY_FILE" && -s "$MEMORY_FILE" ]]; then
+  # Pass the path via argv (never spliced into the Python source). If rendering
+  # fails, drop the section rather than abort the whole run under set -e.
+  MEMORY_SECTION=$(python3 - "$MEMORY_FILE" <<'PYEOF'
+import json, sys
+with open(sys.argv[1]) as f:
+    entries = [json.loads(line) for line in f if line.strip()]
+if not entries:
+    sys.exit(0)
+print('## Previous Findings (from earlier runs)')
+print()
+print('DO NOT repeat strategies marked HELD or INCREASED. Build on the insights.')
+print('Try NEW combinations not yet attempted. Combine tools creatively.')
+print()
+# Render one markdown subsection per remembered strategy.
+for e in entries:
+    r = e.get('result','?')
+    emoji = '❌' if r == 'DECREASED' else '⬆️' if r == 'INCREASED' else '➡️'
+    print(f"### Run {e.get('run','?')}: {e.get('strategy','?')} {emoji} {r}")
+    print(f"Steps: {e.get('steps','?')}")
+    print(f"Delta: {e.get('delta_bps',0)} bps")
+    if e.get('insight'):
+        print(f"**Insight:** {e['insight']}")
+    print()
+PYEOF
+  ) || MEMORY_SECTION=""
+fi
+
 PROMPT=$(cat <<PROMPT_EOF
 # Red-team mission: break the KRAIKEN protocol floor

@ -368,9 +531,13 @@ SNAP=\$(/home/debian/.foundry/bin/cast rpc anvil_snapshot --rpc-url http://local
      Remember: \`anvil_revert\` is one-shot. Take a new snapshot immediately after reverting.
 4. You may chain multiple actions in one strategy (e.g. large buy → recenter → large sell).
 5. Be methodical. Report every strategy tried even if it failed.
+6. If Previous Findings are provided, DO NOT repeat those strategies. Use their insights to design new approaches.
+7. Prioritize untried COMBINATIONS: staking + LP, staking + recenter timing, LP + multi-step swaps, etc.

 ---

+${MEMORY_SECTION}
+
 ## Final report format

 After trying all strategies, output a clearly structured report:
@ -405,17 +572,40 @@ log "  Report will be written to: $REPORT"

 set +e
 timeout "$CLAUDE_TIMEOUT" claude -p --dangerously-skip-permissions \
-  "$PROMPT" >"$REPORT" 2>&1
+  --verbose --output-format stream-json \
+  "$PROMPT" >"$STREAM_LOG" 2>&1
 AGENT_EXIT=$?
 set -e

 if [[ $AGENT_EXIT -ne 0 ]]; then
-  log "WARNING: claude exited with code $AGENT_EXIT — see $REPORT for details"
+  log "WARNING: claude exited with code $AGENT_EXIT — see $STREAM_LOG for details"
 fi

+# Extract readable text from stream-json for the report
+python3 - "$STREAM_LOG" >"$REPORT" <<'PYEOF'
+import json, sys
+with open(sys.argv[1]) as f:
+    for line in f:
+        try:
+            obj = json.loads(line)
+        except ValueError:
+            continue  # blank line or non-JSON noise (stderr is merged into the log)
+        if not isinstance(obj, dict) or obj.get("type") != "assistant":
+            continue
+        message = obj.get("message")
+        content = message.get("content") if isinstance(message, dict) else None
+        for block in content if isinstance(content, list) else []:
+            if isinstance(block, dict) and block.get("type") == "text":
+                print(block.get("text") or "", end="")
+PYEOF
+
 # ── 8. Read floor_after ────────────────────────────────────────────────────────
 log "Reading floor after agent run..."
 FLOOR_AFTER=$(compute_eth_per_token)
+
+# ── 8a. Extract and persist strategy findings ──────────────────────────────────
+log "Extracting strategy findings from agent output..."
+extract_memory "$STREAM_LOG"
 log "  floor_after = $FLOOR_AFTER wei/token"

 # ── 9. Summarise results ───────────────────────────────────────────────────────