diff --git a/scripts/harb-evaluator/red-team.sh b/scripts/harb-evaluator/red-team.sh
index 082f220..bcc4b04 100755
--- a/scripts/harb-evaluator/red-team.sh
+++ b/scripts/harb-evaluator/red-team.sh
@@ -23,6 +23,8 @@ CLAUDE_TIMEOUT="${CLAUDE_TIMEOUT:-7200}"
 REPO_ROOT="$(cd "$(dirname "$0")/../.." && pwd)"
 REPORT_DIR="$REPO_ROOT/tmp"
 REPORT="$REPORT_DIR/red-team-report.txt"
+STREAM_LOG="$REPORT_DIR/red-team-stream.jsonl"
+MEMORY_FILE="$REPORT_DIR/red-team-memory.jsonl"
 DEPLOYMENTS="$REPO_ROOT/onchain/deployments-local.json"
 
 # ── Anvil accounts ─────────────────────────────────────────────────────────────
@@ -154,12 +156,221 @@ print(0 if adj <= 0 else (e + w) * 10**18 // adj)
 PYEOF
 }
 
+# ── Helper: extract strategy findings from stream-json and append to memory ────
+# Usage: extract_memory <stream-json-file>
+# Scans the assistant text in the stream log for "Strategy N: <name>" headers and
+# appends one JSONL record per detected strategy to $MEMORY_FILE, classifying each
+# against $FLOOR_BEFORE. All records written by one call share a run number. The
+# memory file is trimmed once it holds more than 50 records.
+extract_memory() {
+  local stream_file="$1"
+  local memory_file="$MEMORY_FILE"
+
+  python3 - "$stream_file" "$memory_file" "$FLOOR_BEFORE" <<'PYEOF'
+import json, sys, re
+from datetime import datetime, timezone
+
+MAX_ENTRIES = 50  # trim the memory file once it holds more records than this
+KEEP_RECENT = 10  # newest records that always survive a trim
+
+# Detect strategy headers: matches "## Strategy 1: name" and "STRATEGY 1: name"
+STRATEGY_RE = re.compile(r"(?:##\s*)?[Ss][Tt][Rr][Aa][Tt][Ee][Gg][Yy]\s*\d+[^:]*:\s*(.+)")
+FLOOR_RE = re.compile(r"(?:floor|ethPerToken)[^\d]*?(\d{4,})\s*(?:wei)?", re.IGNORECASE)
+# Ordered most specific first; the first pattern that yields an insight wins
+INSIGHT_RES = [
+    re.compile(r"[Kk]ey [Ii]nsight:\s*(.+)"),
+    re.compile(r"[Ii]nsight:\s*(.+)"),
+    re.compile(r"(?:discovered|learned|realized)\s+(?:that\s+)?(.+)"),
+]
+STEP_WORDS = ("wrap", "buy", "sell", "stake", "recenter", "mint", "approve")
+
+
+def load_entries(path):
+    """Return the JSON objects stored one per line in *path*.
+
+    A missing file yields an empty list. Blank, malformed, or non-object
+    lines are skipped so one corrupt record cannot abort the run.
+    """
+    entries = []
+    try:
+        with open(path, encoding="utf-8") as f:
+            for raw in f:
+                raw = raw.strip()
+                if not raw:
+                    continue
+                try:
+                    obj = json.loads(raw)
+                except ValueError:
+                    continue
+                if isinstance(obj, dict):
+                    entries.append(obj)
+    except FileNotFoundError:
+        pass
+    return entries
+
+
+stream_file = sys.argv[1]
+memory_file = sys.argv[2]
+try:
+    floor_before = int(sys.argv[3])
+except (ValueError, IndexError):
+    print(" extract_memory: invalid floor_before value, skipping", file=sys.stderr)
+    sys.exit(0)
+
+# Each run can append several records (one per strategy) and trimming removes
+# lines, so derive the run number from the stored "run" values rather than
+# from the line count.
+run_num = 1 + max(
+    (e["run"] for e in load_entries(memory_file) if isinstance(e.get("run"), int)),
+    default=0,
+)
+
+# Collect assistant text blocks. The log also receives the agent's stderr, so
+# lines that are not JSON (or not shaped like an assistant message) are skipped.
+texts = []
+with open(stream_file, encoding="utf-8", errors="replace") as f:
+    for line in f:
+        line = line.strip()
+        if not line:
+            continue
+        try:
+            obj = json.loads(line)
+            if obj.get("type") == "assistant":
+                for block in obj.get("message", {}).get("content", []):
+                    if block.get("type") == "text":
+                        texts.append(block["text"])
+        except (ValueError, AttributeError, KeyError, TypeError):
+            continue
+
+# Parse strategies from agent text
+strategies = []
+current = None
+for text in texts:
+    strat_match = STRATEGY_RE.search(text)
+    if strat_match:
+        if current:
+            strategies.append(current)
+        current = {
+            "strategy": strat_match.group(1).strip(),
+            "steps": "",
+            "floor_after": None,
+            "insight": "",
+        }
+
+    if not current:
+        continue
+
+    # Capture floor readings — take the last match in the block (most recent value)
+    floor_matches = FLOOR_RE.findall(text)
+    if floor_matches:
+        current["floor_after"] = int(floor_matches[-1])
+
+    # Capture insights
+    for insight_re in INSIGHT_RES:
+        insight_match = insight_re.search(text)
+        if insight_match and len(insight_match.group(1)) > 20:
+            current["insight"] = insight_match.group(1).strip()[:300]
+            break
+
+    # Capture step summaries: only short blocks that mention an action keyword
+    lowered = text.lower()
+    if len(text) < 200 and any(word in lowered for word in STEP_WORDS):
+        current["steps"] += text.strip() + "; "
+
+if current:
+    strategies.append(current)
+
+# Write to memory file
+ts = datetime.now(timezone.utc).isoformat()
+with open(memory_file, "a", encoding="utf-8") as f:
+    for s in strategies:
+        fa = s["floor_after"] if s["floor_after"] is not None else floor_before
+        delta_bps = round((fa - floor_before) * 10000 / floor_before) if floor_before else 0
+        if fa < floor_before:
+            result = "DECREASED"
+        elif fa > floor_before:
+            result = "INCREASED"
+        else:
+            result = "HELD"
+
+        entry = {
+            "run": run_num,
+            "ts": ts,
+            "strategy": s["strategy"][:100],
+            "steps": s["steps"][:300].rstrip("; "),
+            "floor_before": floor_before,
+            "floor_after": fa,
+            "delta_bps": delta_bps,
+            "result": result,
+            "insight": s["insight"][:300],
+        }
+        f.write(json.dumps(entry) + "\n")
+        print(f" Recorded: {entry['strategy']} → {result} ({delta_bps:+d} bps)")
+
+if not strategies:
+    print(" No strategies detected in stream output")
+
+# Trim memory file: once past MAX_ENTRIES, keep every DECREASED entry plus the
+# KEEP_RECENT newest ones, preserving the order in which they were recorded
+all_entries = load_entries(memory_file)
+if len(all_entries) > MAX_ENTRIES:
+    recent_start = len(all_entries) - KEEP_RECENT
+    kept = [
+        e for i, e in enumerate(all_entries)
+        if i >= recent_start or e.get("result") == "DECREASED"
+    ]
+    with open(memory_file, "w", encoding="utf-8") as f:
+        for e in kept:
+            f.write(json.dumps(e) + "\n")
+    print(f" Trimmed memory to {len(kept)} entries")
+PYEOF
+}
+
 # ── 5. Read floor_before ───────────────────────────────────────────────────────
 log "Reading floor before agent run..."
 FLOOR_BEFORE=$(compute_eth_per_token)
 log " floor_before = $FLOOR_BEFORE wei/token"
 
 # ── 6. Build agent prompt ──────────────────────────────────────────────────────
+
+# Build Previous Findings section from memory file
+MEMORY_SECTION=""
+if [[ -f "$MEMORY_FILE" && -s "$MEMORY_FILE" ]]; then
+  MEMORY_SECTION=$(python3 - "$MEMORY_FILE" <<'PYEOF'
+import json, sys
+# Load prior findings; corrupt or non-object lines are skipped, not fatal
+entries = []
+with open(sys.argv[1], encoding="utf-8") as f:
+    for line in f:
+        line = line.strip()
+        if not line:
+            continue
+        try:
+            entry = json.loads(line)
+        except ValueError:
+            continue
+        if isinstance(entry, dict):
+            entries.append(entry)
+if not entries:
+    sys.exit(0)
+print('## Previous Findings (from earlier runs)')
+print()
+print('DO NOT repeat strategies marked HELD or INCREASED. Build on the insights.')
+print('Try NEW combinations not yet attempted. Combine tools creatively.')
+print()
+for e in entries:
+    r = e.get('result', '?')
+    emoji = '❌' if r == 'DECREASED' else '⬆️' if r == 'INCREASED' else '➡️'
+    print(f"### Run {e.get('run','?')}: {e.get('strategy','?')} {emoji} {r}")
+    print(f"Steps: {e.get('steps','?')}")
+    print(f"Delta: {e.get('delta_bps',0)} bps")
+    if e.get('insight'):
+        print(f"**Insight:** {e['insight']}")
+    print()
+PYEOF
+)
+fi
+
 PROMPT=$(cat <"$REPORT" 2>&1
+  --verbose --output-format stream-json \
+  "$PROMPT" >"$STREAM_LOG" 2>&1
 AGENT_EXIT=$?
 set -e
 if [[ $AGENT_EXIT -ne 0 ]]; then
-  log "WARNING: claude exited with code $AGENT_EXIT — see $REPORT for details"
+  log "WARNING: claude exited with code $AGENT_EXIT — see $STREAM_LOG for details"
+fi
+
+# Extract readable text from stream-json for the report.
+# Assistant text blocks are written back-to-back; lines that are not JSON or
+# not shaped like an assistant message (e.g. stderr noise) are skipped.
+python3 - "$STREAM_LOG" >"$REPORT" <<'PYEOF'
+import json, sys
+with open(sys.argv[1], encoding="utf-8", errors="replace") as f:
+    for line in f:
+        line = line.strip()
+        if not line:
+            continue
+        try:
+            obj = json.loads(line)
+            if obj.get("type") == "assistant":
+                for block in obj.get("message", {}).get("content", []):
+                    if block.get("type") == "text":
+                        print(block["text"], end="")
+        except (ValueError, AttributeError, KeyError, TypeError):
+            continue
+PYEOF
+
+# If the agent crashed and produced no readable output, treat as an infra error
+# rather than silently reporting FLOOR HELD (a false pass).
+if [[ $AGENT_EXIT -ne 0 && ! -s "$REPORT" ]]; then
+  die "claude agent failed (exit $AGENT_EXIT) with no readable output — see $STREAM_LOG"
 fi
 
 # ── 8. Read floor_after ────────────────────────────────────────────────────────
 log "Reading floor after agent run..."
 FLOOR_AFTER=$(compute_eth_per_token)
 log " floor_after = $FLOOR_AFTER wei/token"
+
+# ── 8a. Extract and persist strategy findings ──────────────────────────────────
+# Persisting findings is auxiliary: a failure here must not prevent the result
+# summary below from being reported.
+log "Extracting strategy findings from agent output..."
+extract_memory "$STREAM_LOG" || log "WARNING: could not persist strategy findings to $MEMORY_FILE"
 
 # ── 9. Summarise results ───────────────────────────────────────────────────────