fix: Red-team memory: persistent cross-run learning for adversarial agent (#528)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
openhands 2026-03-09 09:23:37 +00:00
parent 28568dbcfd
commit c1db4cb93e

View file

@ -23,6 +23,8 @@ CLAUDE_TIMEOUT="${CLAUDE_TIMEOUT:-7200}"
# Repository root: two directory levels above the directory holding this script.
REPO_ROOT="$(cd "$(dirname "$0")/../.." && pwd)"
# All run artefacts live under <repo>/tmp.
REPORT_DIR="${REPO_ROOT}/tmp"
# Human-readable report rendered from the agent's output.
REPORT="${REPORT_DIR}/red-team-report.txt"
# Raw stream-json output captured from the agent.
STREAM_LOG="${REPORT_DIR}/red-team-stream.jsonl"
# Cross-run memory: one JSON object per recorded strategy.
MEMORY_FILE="${REPORT_DIR}/red-team-memory.jsonl"
DEPLOYMENTS="${REPO_ROOT}/onchain/deployments-local.json"
# ── Anvil accounts ─────────────────────────────────────────────────────────────
@ -154,12 +156,173 @@ print(0 if adj <= 0 else (e + w) * 10**18 // adj)
PYEOF
}
# ── Helper: extract strategy findings from stream-json and append to memory ────
# Scans the assistant text blocks of the agent's stream-json log for
# "## Strategy N: <name>" headers and appends one JSON line per strategy to
# the cross-run memory file.
#
# Globals:   MEMORY_FILE  (appended to; rewritten when it grows past the cap)
#            FLOOR_BEFORE (read; floor before the agent ran, integer)
# Arguments: $1 - path to the stream-json log
# Outputs:   one progress line per recorded strategy on stdout
# Returns:   exit status of the embedded python3 program
extract_memory() {
  local stream_file="$1"
  local memory_file="$MEMORY_FILE"

  # PYTHONIOENCODING keeps the non-ASCII arrow in the progress output from
  # raising UnicodeEncodeError when stdout is not a UTF-8 terminal.
  PYTHONIOENCODING=utf-8 python3 - "$stream_file" "$memory_file" "${FLOOR_BEFORE:-}" <<'PYEOF'
import json, os, re, sys
from datetime import datetime, timezone

stream_file, memory_file = sys.argv[1], sys.argv[2]
try:
    floor_before = int(sys.argv[3])
except ValueError:
    sys.exit(f"extract_memory: FLOOR_BEFORE is not an integer: {sys.argv[3]!r}")

MAX_ENTRIES = 50   # hard cap on the memory file
KEEP_RECENT = 10   # newest entries always survive a trim
STEP_WORDS = ("wrap", "buy", "sell", "stake", "recenter", "mint", "approve")
# Ordered from most to least specific; the first acceptable match wins.
INSIGHT_PATTERNS = (
    r"[Kk]ey [Ii]nsight:\s*(.+)",
    r"[Ii]nsight:\s*(.+)",
    r"(?:discovered|learned|realized)\s+(?:that\s+)?(.+)",
)


def load_jsonl(path):
    """Return the JSON objects stored one per line in `path`.

    Best effort: a missing file yields []; blank lines, lines that are not
    valid JSON and JSON values that are not objects are skipped.
    """
    objects = []
    if not os.path.exists(path):
        return objects
    with open(path, encoding="utf-8", errors="replace") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                obj = json.loads(line)
            except ValueError:
                continue
            if isinstance(obj, dict):
                objects.append(obj)
    return objects


# Collect every text block emitted by the assistant, in order.
texts = []
for obj in load_jsonl(stream_file):
    if obj.get("type") != "assistant":
        continue
    message = obj.get("message")
    content = message.get("content") if isinstance(message, dict) else None
    if not isinstance(content, list):
        continue
    for block in content:
        if (isinstance(block, dict) and block.get("type") == "text"
                and isinstance(block.get("text"), str)):
            texts.append(block["text"])

# Parse strategies from agent text. A header opens a new strategy; text that
# follows (until the next header) contributes floor readings, insights, steps.
strategies = []
current = None
for text in texts:
    header = re.search(r"##\s*Strategy\s*\d+[^:]*:\s*(.+)", text)
    if header:
        if current is not None:
            strategies.append(current)
        current = {
            "strategy": header.group(1).strip(),
            "steps": "",
            "floor_after": None,
            "insight": "",
        }
    if current is None:
        continue
    # Latest floor reading mentioned for this strategy wins.
    floor_match = re.search(
        r"(?:floor|ethPerToken)[^\d]*?(\d{4,})\s*(?:wei)?", text, re.IGNORECASE)
    if floor_match:
        current["floor_after"] = int(floor_match.group(1))
    for pattern in INSIGHT_PATTERNS:
        insight_match = re.search(pattern, text)
        if insight_match and len(insight_match.group(1)) > 20:
            current["insight"] = insight_match.group(1).strip()[:300]
            break  # do not let a vaguer pattern overwrite a specific one
    # Short texts mentioning an action verb are treated as step summaries.
    if len(text) < 200 and any(word in text.lower() for word in STEP_WORDS):
        current["steps"] += text.strip() + "; "
if current is not None:
    strategies.append(current)

# Run number = highest run already recorded + 1. Counting lines is unreliable:
# runs record a variable number of strategies and the file gets trimmed.
existing = load_jsonl(memory_file)
run_num = 1 + max(
    (e["run"] for e in existing if isinstance(e.get("run"), int)), default=0)

ts = datetime.now(timezone.utc).isoformat()
new_entries = []
for s in strategies:
    # `is None`, not truthiness: a parsed floor of 0 is a real reading.
    fa = floor_before if s["floor_after"] is None else s["floor_after"]
    delta_bps = (fa - floor_before) * 10000 // floor_before if floor_before else 0
    if fa < floor_before:
        result = "DECREASED"
    elif fa > floor_before:
        result = "INCREASED"
    else:
        result = "HELD"
    new_entries.append({
        "run": run_num,
        "ts": ts,
        "strategy": s["strategy"][:100],
        "steps": s["steps"][:300].rstrip("; "),
        "floor_before": floor_before,
        "floor_after": fa,
        "delta_bps": delta_bps,
        "result": result,
        "insight": s["insight"][:300],
    })

# Write to memory file (created even when nothing was detected).
memory_dir = os.path.dirname(memory_file)
if memory_dir:
    os.makedirs(memory_dir, exist_ok=True)
with open(memory_file, "a", encoding="utf-8") as f:
    for entry in new_entries:
        f.write(json.dumps(entry) + "\n")
        print(f" Recorded: {entry['strategy']} → {entry['result']} ({entry['delta_bps']:+d} bps)")
if not new_entries:
    print(" No strategies detected in stream output")

# Trim memory file: keep the 10 most recent + all DECREASED entries, in their
# original order, and never more than MAX_ENTRIES (oldest dropped first).
all_entries = existing + new_entries
if len(all_entries) > MAX_ENTRIES:
    recent_start = len(all_entries) - KEEP_RECENT
    kept = [
        e for i, e in enumerate(all_entries)
        if i >= recent_start or e.get("result") == "DECREASED"
    ]
    kept = kept[-MAX_ENTRIES:]
    # Write to a sibling temp file and rename so an interrupted run cannot
    # leave a half-written memory file behind.
    tmp_path = memory_file + ".tmp"
    with open(tmp_path, "w", encoding="utf-8") as f:
        for e in kept:
            f.write(json.dumps(e) + "\n")
    os.replace(tmp_path, memory_file)
    print(f" Trimmed memory to {len(kept)} entries")
PYEOF
}
# ── 5. Read floor_before ───────────────────────────────────────────────────────
# Baseline floor captured before the agent acts; extract_memory later compares
# per-strategy floor readings against this value.
log "Reading floor before agent run..."
FLOOR_BEFORE="$(compute_eth_per_token)"
log " floor_before = ${FLOOR_BEFORE} wei/token"
# ── 6. Build agent prompt ──────────────────────────────────────────────────────
# Build Previous Findings section from memory file

# Render the JSONL memory file as a Markdown "Previous Findings" section.
#
# Arguments: $1 - path to the memory file (one JSON object per line)
# Outputs:   the Markdown section on stdout; nothing when the file is missing,
#            empty, or holds no valid entries
# Returns:   0 when there is nothing to render, else python3's exit status
build_memory_section() {
  local memory_file="$1"
  if [[ ! -f "$memory_file" || ! -s "$memory_file" ]]; then
    return 0
  fi
  # The path is passed as argv (never spliced into the Python source), so
  # quotes or backslashes in it cannot break or inject code.
  # PYTHONIOENCODING keeps the emoji from raising UnicodeEncodeError when the
  # locale is not UTF-8.
  PYTHONIOENCODING=utf-8 python3 - "$memory_file" <<'PYEOF'
import json, sys

entries = []
with open(sys.argv[1], encoding="utf-8", errors="replace") as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        # One corrupt line must not discard the rest of the memory.
        try:
            entry = json.loads(line)
        except ValueError:
            continue
        if isinstance(entry, dict):
            entries.append(entry)
if not entries:
    sys.exit(0)

print('## Previous Findings (from earlier runs)')
print()
print('DO NOT repeat strategies marked HELD or INCREASED. Build on the insights.')
print('Try NEW combinations not yet attempted. Combine tools creatively.')
print()
for e in entries:
    r = e.get('result', '?')
    emoji = '❌' if r == 'DECREASED' else '⬆️' if r == 'INCREASED' else '➡️'
    print(f"### Run {e.get('run','?')}: {e.get('strategy','?')} {emoji} {r}")
    print(f"Steps: {e.get('steps','?')}")
    print(f"Delta: {e.get('delta_bps',0)} bps")
    if e.get('insight'):
        print(f"**Insight:** {e['insight']}")
    print()
PYEOF
}

# Memory is an optional hint for the agent: if rendering fails, run without it
# rather than aborting the whole red-team run.
MEMORY_SECTION="$(build_memory_section "${MEMORY_FILE:-}")" || MEMORY_SECTION=""
PROMPT=$(cat <<PROMPT_EOF
# Red-team mission: break the KRAIKEN protocol floor
@ -368,9 +531,13 @@ SNAP=\$(/home/debian/.foundry/bin/cast rpc anvil_snapshot --rpc-url http://local
Remember: \`anvil_revert\` is one-shot. Take a new snapshot immediately after reverting.
4. You may chain multiple actions in one strategy (e.g. large buy → recenter → large sell).
5. Be methodical. Report every strategy tried even if it failed.
6. If Previous Findings are provided, DO NOT repeat those strategies. Use their insights to design new approaches.
7. Prioritize untried COMBINATIONS: staking + LP, staking + recenter timing, LP + multi-step swaps, etc.
---
${MEMORY_SECTION}
## Final report format
After trying all strategies, output a clearly structured report:
@ -405,17 +572,40 @@ log " Report will be written to: $REPORT"
# Run the agent with errexit suspended so a non-zero exit (including a
# timeout) is reported as a warning instead of aborting the script.
set +e
# Raw stream-json (stdout and stderr) goes to $STREAM_LOG; the readable report
# is derived from it afterwards. The invocation must be a single command:
# every line but the last ends with a continuation backslash, and the old
# `"$PROMPT" >"$REPORT"` form is gone so AGENT_EXIT reflects claude itself.
timeout "$CLAUDE_TIMEOUT" claude -p --dangerously-skip-permissions \
  --verbose --output-format stream-json \
  "$PROMPT" >"$STREAM_LOG" 2>&1
AGENT_EXIT=$?
set -e
if [[ $AGENT_EXIT -ne 0 ]]; then
  # Point at the stream log: that is where the agent's raw output now lives.
  log "WARNING: claude exited with code $AGENT_EXIT — see $STREAM_LOG for details"
fi
# Extract readable text from stream-json for the report

# Print the concatenated assistant text blocks of a stream-json log.
#
# Arguments: $1 - path to the stream-json log
# Outputs:   the text of every assistant "text" block, in order, with no
#            separator added between blocks
# Returns:   python3's exit status (non-zero if the log cannot be opened)
extract_report_text() {
  PYTHONIOENCODING=utf-8 python3 - "$1" <<'PYEOF'
import json, sys

with open(sys.argv[1], encoding="utf-8", errors="replace") as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        # The log also carries stderr noise; skip anything that is not JSON.
        try:
            obj = json.loads(line)
        except ValueError:
            continue
        # Explicit shape checks instead of a bare `except:` so only malformed
        # records are skipped and real errors / Ctrl-C still surface.
        if not isinstance(obj, dict) or obj.get("type") != "assistant":
            continue
        message = obj.get("message")
        content = message.get("content") if isinstance(message, dict) else None
        if not isinstance(content, list):
            continue
        for block in content:
            if (isinstance(block, dict) and block.get("type") == "text"
                    and isinstance(block.get("text"), str)):
                print(block["text"], end="")
PYEOF
}

extract_report_text "$STREAM_LOG" >"$REPORT"
# ── 8. Read floor_after ────────────────────────────────────────────────────────
log "Reading floor after agent run..."
FLOOR_AFTER=$(compute_eth_per_token)
# Log the reading right away so it is not separated from its header by the
# memory-extraction output.
log " floor_after = $FLOOR_AFTER wei/token"
# ── 8a. Extract and persist strategy findings ──────────────────────────────────
log "Extracting strategy findings from agent output..."
# Persisting memory is auxiliary. Testing the call in a condition keeps
# errexit from aborting the script before the results are summarised.
if ! extract_memory "$STREAM_LOG"; then
  log "WARNING: failed to update $MEMORY_FILE — continuing without persisting findings"
fi
# ── 9. Summarise results ───────────────────────────────────────────────────────