From b2073ab3b19b89701a4d69d978dae77511431527 Mon Sep 17 00:00:00 2001 From: johba Date: Mon, 23 Mar 2026 00:38:56 +0000 Subject: [PATCH] fix: Formula AGENTS.md missing (#1079) Add formulas/AGENTS.md documenting sense vs act type distinction, cron conventions, step ID naming rules, TOML structure skeleton, and a how-to-add-a-new-formula walkthrough. Add scripts/harb-evaluator/AGENTS.md covering the evaluator runtime: directory layout, exit code convention, stack lifecycle, evidence output, and how to add a new evaluator script. Update root AGENTS.md directory map to link both new files. Co-Authored-By: Claude Opus 4.6 (1M context) --- AGENTS.md | 3 +- formulas/AGENTS.md | 139 +++++++++++++++++++++++++++++++ scripts/harb-evaluator/AGENTS.md | 61 ++++++++++++++ 3 files changed, 202 insertions(+), 1 deletion(-) create mode 100644 formulas/AGENTS.md create mode 100644 scripts/harb-evaluator/AGENTS.md diff --git a/AGENTS.md b/AGENTS.md index 5c5e59c..d8bd9ba 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -18,7 +18,8 @@ KRAIKEN couples Harberger-tax staking with a dominant Uniswap V3 liquidity manag | `web-app/` | Staking UI | [web-app/AGENTS.md](web-app/AGENTS.md) | | `kraiken-lib/` | Shared TypeScript helpers for clients and bots | [kraiken-lib/AGENTS.md](kraiken-lib/AGENTS.md) | | `services/txnBot/` | Automation bot for `recenter()` and `payTax()` upkeep | [services/txnBot/AGENTS.md](services/txnBot/AGENTS.md) | -| `scripts/` | `dev.sh`, bootstrap, build helpers; `harb-evaluator/` red-team agent | — | +| `formulas/` | TOML pipeline definitions (sense/act) for the evaluator | [formulas/AGENTS.md](formulas/AGENTS.md) | +| `scripts/` | `dev.sh`, bootstrap, build helpers; `harb-evaluator/` red-team agent | [scripts/harb-evaluator/AGENTS.md](scripts/harb-evaluator/AGENTS.md) | | `tests/e2e/` | Playwright end-to-end tests | — | | `docs/` | Architecture, product truth, environment, ops guides | — | diff --git a/formulas/AGENTS.md b/formulas/AGENTS.md new file mode 100644 index 
0000000..8547fbd --- /dev/null +++ b/formulas/AGENTS.md @@ -0,0 +1,139 @@ + +# Agent Brief: Formulas + +Formulas are TOML files that declare automated pipeline jobs for the harb evaluator. +Each formula describes **what** to run, **when**, and **what it produces** — the +orchestrator reads the TOML and dispatches execution to the scripts referenced in +`[execution]`. + +## Sense vs Act + +Every formula has a `type` field. Getting this wrong breaks orchestrator scheduling +and evidence routing. + +| Type | Meaning | Side-effects | Examples | +|------|---------|-------------|----------| +| `sense` | Read-only observation. Produces metrics / evidence only. | No PRs, no code changes, no contract deployments. | `run-holdout`, `run-protocol`, `run-resources`, `run-user-test` | +| `act` | Produces git artifacts: PRs, new files committed to main, contract upgrades. | Opens PRs, commits evidence + champion files, promotes attack vectors. | `run-evolution`, `run-red-team` | + +**Rule of thumb:** if the formula's `deliver` step opens a PR or commits anything besides an evidence JSON, +it is `act`. If it only commits an evidence JSON to main, it is `sense`. 
+ +## Current Formulas + +| ID | Type | Script | Cron | Purpose | +|----|------|--------|------|---------| +| `run-evolution` | act | `tools/push3-evolution/evolve.sh` | — | Evolve Push3 optimizer candidates, admit champions to seed pool via PR | +| `run-holdout` | sense | `scripts/harb-evaluator/evaluate.sh` | — | Deploy PR branch, run blind holdout scenarios, report pass/fail | +| `run-protocol` | sense | `scripts/harb-evaluator/run-protocol.sh` | `0 7 * * *` | On-chain health snapshot (TVL, fees, positions, rebalances) | +| `run-red-team` | act | `scripts/harb-evaluator/red-team.sh` | — | Adversarial agent attacks the optimizer; promotes novel attack vectors via PR | +| `run-resources` | sense | `scripts/harb-evaluator/run-resources.sh` | `0 6 * * *` | Infrastructure snapshot (disk, RAM, API budget, CI queue) | +| `run-user-test` | sense | `scripts/run-usertest.sh` | — | Persona-based Playwright UX evaluation | + +## Cron Conventions + +- Schedules use standard 5-field cron syntax in `[cron] schedule`. +- Stagger by at least 1 hour to avoid resource contention (`run-resources` at 06:00, `run-protocol` at 07:00). +- Only `sense` formulas should be cron-scheduled. An `act` formula on a timer risks unattended PRs. + +## Step ID Naming + +Steps are declared as a `[[steps]]` array of tables, one entry per step. Each step must have an `id` field. + +**Conventions:** +- Use lowercase kebab-case: `stack-up`, `run-scenarios`, `collect-tvl`. +- Prefix collection steps with `collect-` followed by the metric dimension: `collect-disk`, `collect-ram`, `collect-fees`. +- Every formula must include a `collect` step (assembles the evidence JSON) and a `deliver` step (commits + posts comment). +- Infrastructure lifecycle steps: `stack-up` / `stack-down` (or `boot-stack` / `teardown`). +- Use descriptive verbs: `run-attack-suite`, `evaluate-seeds`, `export-vectors`. 
+ +## TOML Structure + +A formula file follows this skeleton: + +```toml +# formulas/run-{name}.toml +# +# One-line description of what this formula does. +# +# Type: sense | act +# Cron: (schedule if applicable, or "—") + +[formula] +id = "run-{name}" +name = "Human-Readable Name" +description = "What it does in one sentence." +type = "sense" # or "act" + +# [cron] # optional — only for scheduled formulas +# schedule = "0 6 * * *" + +[inputs.example_input] +type = "string" # string | integer | number +required = true +description = "What this input controls." + +[execution] +script = "path/to/script.sh" +invocation = "ENV_VAR={example_input} bash path/to/script.sh" + +[[steps]] +id = "do-something" +description = """ +What this step does, in enough detail for a new contributor to understand. +""" + +[[steps]] +id = "collect" +description = "Assemble metrics into evidence/{category}/{date}.json." +output = "evidence/{category}/{date}.json" + +[[steps]] +id = "deliver" +description = "Commit evidence file and post summary comment to issue." + +[products.evidence_file] +path = "evidence/{category}/{date}.json" +delivery = "commit to main" +schema = "evidence/README.md" + +[resources] +profile = "light" # or "heavy" +concurrency = "safe to run in parallel" # or "exclusive" +``` + +## How to Add a New Formula + +1. **Pick a name.** File goes in `formulas/run-{name}.toml`. The `[formula] id` must match: `run-{name}`. + +2. **Decide sense vs act.** If your formula only reads state and writes evidence → `sense`. If it creates PRs, commits code, or modifies contracts → `act`. + +3. **Write the TOML.** Follow the skeleton above. Key sections: + - `[formula]` — id, name, description, type. + - `[inputs.*]` — every tuneable parameter the script accepts. + - `[execution]` — script path and full invocation with `{input}` interpolation. + - `[[steps]]` — ordered list of logical steps. Always end with `collect` and `deliver`. 
+ - `[products.*]` — what the formula produces (evidence file, PR, issue comment). + - `[resources]` — profile (`light` / `heavy`), concurrency constraints. + +4. **Write or wire the backing script.** The `[execution] script` must exist and be executable. Most scripts live in `scripts/harb-evaluator/` or `tools/`. Exit codes: `0` = success, `1` = gate failed, `2` = infra error. + +5. **Define the evidence schema.** If your formula writes `evidence/{category}/{date}.json`, add the schema to `evidence/README.md`. + +6. **Update this file.** Add your formula to the "Current Formulas" table above. + +7. **Test locally.** Run the backing script with the required inputs and verify the evidence file is well-formed JSON. + +## Resource Profiles + +| Profile | Meaning | Can run in parallel? | +|---------|---------|---------------------| +| `light` | Shell commands only (df, curl, cast). No Docker, no Anvil. | Yes — safe to run alongside anything. | +| `heavy` | Needs Anvil on port 8545, Docker containers, or long-running agents. | No — exclusive. Heavy formulas share port bindings and cannot overlap. | + +## Evaluator Integration + +Formula execution is dispatched by the orchestrator to each formula's backing script, most of which live in +`scripts/harb-evaluator/`. See [scripts/harb-evaluator/AGENTS.md](../scripts/harb-evaluator/AGENTS.md) +for details on the evaluator runtime: stack lifecycle, scenario execution, +evidence collection, and the adversarial agent harness. diff --git a/scripts/harb-evaluator/AGENTS.md b/scripts/harb-evaluator/AGENTS.md new file mode 100644 index 0000000..20f9d8b --- /dev/null +++ b/scripts/harb-evaluator/AGENTS.md @@ -0,0 +1,61 @@ + +# Agent Brief: harb-evaluator + +The evaluator runtime executes formula-defined pipelines. Scripts in this +directory handle stack lifecycle, scenario execution, evidence collection, +and the adversarial agent harness. 
+ +## Directory Layout + +| File | Purpose | +|------|---------| +| `evaluate.sh` | Holdout gate: worktree checkout → Docker stack → Playwright scenarios → teardown | +| `red-team.sh` | Adversarial agent runner: Anvil bootstrap → attack suite → Claude agent → evidence | +| `run-protocol.sh` | On-chain health snapshot (TVL, fees, positions, rebalances) via cast/forge | +| `run-resources.sh` | Infrastructure snapshot (disk, RAM, API budget, CI queue) via shell commands | +| `bootstrap-light.sh` | Lightweight Anvil bootstrap with contract deployment (used by red-team.sh) | +| `promote-attacks.sh` | Deduplicate and PR novel attack vectors discovered by the red-team agent | +| `export-attacks.py` | Extract cast send commands from agent stream log into `.jsonl` attack files | +| `red-team-program.md` | System prompt for the adversarial Claude agent | +| `holdout.config.ts` | Playwright config for holdout scenario execution | +| `helpers/` | TypeScript helpers: RPC, assertions, swap, stake, floor, market, reporting | +| `scenarios/` | Holdout scenario scripts and the passive-confidence suite | + +## Exit Code Convention + +All evaluator scripts follow the same three-code contract: + +| Code | Meaning | +|------|---------| +| `0` | Success / gate passed | +| `1` | Gate failed (scenario or attack found a problem) | +| `2` | Infrastructure error (stack down, missing dependency, RPC unreachable) | + +Formulas and the orchestrator rely on these codes for routing — do not +introduce additional exit codes without updating the formula TOML. + +## Stack Lifecycle + +**Heavy formulas** (`run-holdout`, `run-red-team`, `run-evolution`) need a running +Anvil or full Docker stack. Port 8545 is shared — these formulas are mutually +exclusive and must not run concurrently. + +- `evaluate.sh` manages Docker compose (`harb-eval-{pr}` project) with full + teardown via shell trap. +- `red-team.sh` uses `bootstrap-light.sh` for a lightweight Anvil-only stack + (no Docker). 
Cleanup is also trap-registered. +- `run-protocol.sh` and `run-resources.sh` are lightweight — no Anvil, no Docker. + +## Evidence Output + +Every script writes its evidence file to `evidence/{category}/{date}.json` +conforming to the schema in `evidence/README.md`. The `deliver` step in each +formula handles committing and posting an issue comment. + +## Adding a New Evaluator Script + +1. Place the script in this directory. Use `#!/usr/bin/env bash` and `set -euo pipefail`. +2. Follow the exit code convention (0 / 1 / 2). +3. Accept configuration via environment variables, not positional args (except `evaluate.sh` which takes a PR number). +4. Write evidence to `evidence/{category}/{date}.json`. +5. Wire it into a formula TOML in `formulas/` — see [formulas/AGENTS.md](../../formulas/AGENTS.md) for the full walkthrough.