From 708a00a2f471f25ff608cf3a448579ca12d12c87 Mon Sep 17 00:00:00 2001
From: openhands
Date: Thu, 19 Mar 2026 11:25:01 +0000
Subject: [PATCH] fix: Formula: run-evolution (optimizer pipeline) (#975)

Co-Authored-By: Claude Sonnet 4.6
---
 formulas/run-evolution.toml | 295 ++++++++++++++++++++++++++++++++++++
 1 file changed, 295 insertions(+)
 create mode 100644 formulas/run-evolution.toml

diff --git a/formulas/run-evolution.toml b/formulas/run-evolution.toml
new file mode 100644
index 0000000..db4ce45
--- /dev/null
+++ b/formulas/run-evolution.toml
@@ -0,0 +1,295 @@
+# formulas/run-evolution.toml
+#
+# Push3 optimizer evolution pipeline — evaluate seed pool, evolve a population
+# of candidates, admit survivors back to the pool, deliver champions via PR.
+#
+# Type: act. Produces git artifacts (new .push3 champions + updated
+# manifest.jsonl via PR to main; evidence file committed to main).
+#
+# Depends on: #973 (evidence/evolution/ directory structure); mirrored by formula.depends_on below.
+
+[formula]
+id = "run-evolution"
+name = "Push3 Optimizer Evolution"
+description = "Evaluate seed pool, evolve Push3 optimizer population, admit survivors, deliver champions via PR."
+type = "act"
+# "sense" → read-only, produces metrics only
+# "act"   → produces git artifacts (this formula; cf. run-red-team)
+depends_on = [973]
+
+# ── Inputs ─────────────────────────────────────────────────────────────────────
+
+[inputs.seed]
+type = "string"
+required = false
+default = "tools/push3-evolution/seeds/optimizer_v3.push3"
+description = "Starting seed .push3 file (passed as --seed to evolve.sh). Serves as the fallback mutation source when the pool does not fill the full population."
+
+[inputs.population]
+type = "integer"
+required = false
+default = 10
+description = "Number of candidates per generation (--population)."
+
+[inputs.generations]
+type = "integer"
+required = false
+default = 5
+description = "Number of evolution generations to run (--generations)."
+
+[inputs.mutation_rate]
+type = "integer"
+required = false
+default = 2
+description = "Mutations applied per candidate per generation (--mutation-rate)."
+
+[inputs.elites]
+type = "integer"
+required = false
+default = 2
+description = "Top-scoring candidates carried forward unchanged each generation (--elites)."
+
+[inputs.base_rpc_url]
+type = "string"
+required = true
+description = """
+Base network RPC endpoint forwarded as BASE_RPC_URL to both evaluate-seeds.sh
+and evolve.sh. Required for the revm evaluator (default EVAL_MODE).
+Example: https://mainnet.base.org or a fork URL from a running Anvil instance.
+"""
+
+[inputs.run_id]  # NOTE(review): execution.invocation has no --run-id / {run_id} placeholder, so this input looks unused — confirm
+type = "integer"
+required = false
+description = """
+Override the run ID used when naming candidates admitted to the seed pool
+(e.g. run009_gen2_c005.push3). Auto-incremented from the highest existing
+run in manifest.jsonl when omitted (recommended).
+"""
+
+[inputs.attack_dir]
+type = "string"
+required = false
+default = "onchain/script/backtesting/attacks"
+description = """
+Directory of .jsonl adversarial attack scenarios. Intended as an adversarial
+fitness input — candidates scored against these patterns in addition to the
+revm fitness metric. Not yet forwarded to evolve.sh; documented here as a
+forward spec.
+"""
+status = "planned"  # forward spec only: not yet passed to evolve.sh
+
+# ── Execution ──────────────────────────────────────────────────────────────────
+#
+# Step 0 — evaluate-seeds.sh — runs before the main evolution loop.
+#   Scores any manifest.jsonl entries with fitness: null so the pool
+#   sampler has real fitness values when selecting gen_0 candidates.
+#
+# Steps 1-5 — evolve.sh — owns the full evolution lifecycle:
+#   1. Initialise population: random sample from seed pool (--diverse-seeds).
+#   2. Score candidates via revm batch evaluator (batch-eval.sh).
+#   3. Tournament-select survivors; apply elitism + mutation / crossover.
+#   4. Repeat for N generations; track global best.
+#   5. Admit candidates above threshold (6e21 wei) into seeds/; rewrite manifest.
+#
+# evolve.sh always passes --diverse-seeds so gen_0 inherits pool diversity.
+# --run-id is omitted to let evolve.sh auto-increment from manifest.jsonl.
+
+[execution]  # {name} placeholders below are presumably filled from [inputs.*] by the formula runner; TOML itself does no interpolation
+pre_script = "tools/push3-evolution/evaluate-seeds.sh"
+pre_invocation = "BASE_RPC_URL={base_rpc_url} bash tools/push3-evolution/evaluate-seeds.sh"
+script = "tools/push3-evolution/evolve.sh"
+invocation = "BASE_RPC_URL={base_rpc_url} bash tools/push3-evolution/evolve.sh --seed {seed} --population {population} --generations {generations} --mutation-rate {mutation_rate} --elites {elites} --output tmp/evolution --diverse-seeds"
+
+# Exit codes propagated by evolve.sh (the evaluate-seeds step description lists the same 0 / 2 codes):
+#   0  evolution complete; best candidate found and pool admission attempted
+#   2  infrastructure error (RPC unreachable, missing tool, revm eval failed)
+
+# ── Steps ──────────────────────────────────────────────────────────────────────
+
+[[steps]]
+id = "evaluate-seeds"
+description = """
+Score manifest entries with fitness: null before the evolution loop begins.
+tools/push3-evolution/evaluate-seeds.sh:
+  - Reads tools/push3-evolution/seeds/manifest.jsonl.
+  - For every entry where fitness is null, runs fitness.sh against the
+    corresponding .push3 file and records the numeric score.
+  - Rewrites manifest.jsonl atomically (temp-file rename).
+  - Exits 0 when nothing to do (idempotent; safe to re-run).
+  - Exits 2 on infrastructure error (eval stack unreachable).
+Primary targets: LLM-generated seeds (origin=llm) and evolved entries whose
+fitness was nulled due to scoring inflation (fitness_flags: token_value_inflation,
+processExecIf_fix). Real fitness values allow --diverse-seeds to weight the
+gen_0 sample correctly.
+"""
+script = "tools/push3-evolution/evaluate-seeds.sh"
+
+[[steps]]
+id = "evolve"
+description = """
+Run the outer evolutionary loop via tools/push3-evolution/evolve.sh.
+
+Initialisation (gen_0):
+  A random sample of up to {population} candidates is drawn from the seed pool
+  (tools/push3-evolution/seeds/); any shortfall is filled by mutating {seed}.
+  Seeds with unevaluated fitness (null) are included in the sample with equal
+  probability — evaluate-seeds (step 0) should have resolved most of these.
+
+Per-generation loop ({generations} iterations):
+  a. Score all candidates in a single forge test invocation via
+     tools/push3-evolution/revm-evaluator/batch-eval.sh (EVAL_MODE=revm).
+     Falls back to per-candidate fitness.sh (EVAL_MODE=anvil) if revm is
+     unavailable.
+  b. Log generation stats: min / max / mean fitness, best candidate file.
+  c. Tournament-select survivors (k = population / 2).
+  d. Elitism: carry the top {elites} candidates forward unchanged.
+  e. Fill remaining slots: mutate random survivors (first half) and apply
+     pairwise crossover (second half); fall back to copy on failure.
+
+Output per run (tmp/evolution/run_NNN/):
+  generation_0.jsonl … generation_N.jsonl   per-candidate fitness records
+  best.push3                                global champion
+  diff.txt                                  constant delta vs seed
+  evolution.log                             full run transcript
+
+Pool admission (after final generation):
+  Candidates scoring above 6e21 wei are deduplicated by content hash and
+  admitted to tools/push3-evolution/seeds/, named run{NNN}_gen{G}_c{C}.push3.
+  manifest.jsonl is rewritten atomically; the evolved pool is capped at 100
+  entries by fitness rank (hand-written / LLM seeds are always pinned).
+"""
+script = "tools/push3-evolution/evolve.sh"
+output_dir = "tmp/evolution"
+
+[[steps]]
+id = "score-attacks"
+description = """
+[Planned] Score the champion against known adversarial attack scenarios in
+{attack_dir}/*.jsonl via onchain/script/backtesting/AttackRunner.s.sol.
+For each attack file:
+  - Replay the op sequence against a fresh Anvil snapshot.
+  - Record LM total ETH before and after.
+  - Emit one fitness adjustment: penalise the candidate's score if the
+    attack succeeds (floor broken), reward if the floor holds.
+Results feed back into the adversarial fitness component — candidates that
+survive all known attacks rank higher in the evidence record.
+Skipped when {attack_dir} is empty or AttackRunner is unavailable.
+"""
+status = "planned"  # step is specified but not implemented yet (see notes.adversarial_fitness)
+attack_source = "{attack_dir}/*.jsonl"
+forge_script = "onchain/script/backtesting/AttackRunner.s.sol"
+
+[[steps]]
+id = "collect"
+description = """
+Aggregate evolve.sh outputs into evidence/evolution/{date}.json.
+Reads:
+  - tmp/evolution/run_NNN/generation_N.jsonl    per-generation fitness records
+  - tmp/evolution/run_NNN/best.push3            champion file
+  - tools/push3-evolution/seeds/manifest.jsonl  admission results
+Writes evidence/evolution/{date}.json conforming to the schema in
+evidence/README.md ## Schema: evolution/YYYY-MM-DD.json.
+Verdict: "improved" if best_fitness > best seed fitness in manifest before
+the run; "no_improvement" otherwise.
+"""
+output = "evidence/evolution/{date}.json"
+schema = "evidence/README.md"
+
+[[steps]]
+id = "deliver"
+description = """
+Commit evidence and champion files, open PR, post summary comment.
+
+1. Commit evidence/evolution/{date}.json to main.
+
+2. Open a Codeberg PR targeting main containing:
+   - tools/push3-evolution/seeds/evo_run{NNN}_champion.push3
+     (copied from tmp/evolution/run_NNN/best.push3)
+   - Updated tools/push3-evolution/seeds/manifest.jsonl
+     (with newly admitted candidates and fresh fitness scores)
+   PR title: "evo: run{NNN} champion — fitness={best_fitness}"
+   PR body: generation-by-generation table (gen, best, mean, worst fitness),
+            top-3 admitted candidates with fitness scores, constant diff vs
+            seed (from diff.txt), link to evidence file.
+
+3. Post summary comment to originating issue:
+   - Verdict (improved / no_improvement).
+   - Best fitness achieved and which generation it was found in.
+   - Admission count: N candidates added to seed pool.
+   - Link to champion PR (if new best found) and evidence file.
+   - If no_improvement: include best fitness achieved and seed pool size.
+"""
+
+# ── Products ───────────────────────────────────────────────────────────────────
+
+[products.evidence_file]
+path = "evidence/evolution/{date}.json"
+delivery = "commit to main"
+schema = "evidence/README.md"  # see ## Schema: evolution/YYYY-MM-DD.json
+
+[products.champion_files]
+path = "tools/push3-evolution/seeds/evo_run{NNN}_champion.push3"
+# {NNN} is the auto-incremented run ID assigned by evolve.sh at runtime.
+delivery = "PR to main"
+note = "Only created when at least one candidate exceeds the admission threshold (6e21 wei)."
+
+[products.manifest]
+path = "tools/push3-evolution/seeds/manifest.jsonl"
+delivery = "PR to main (same PR as champion_files)"
+note = "Updated with newly admitted entries and fitness scores from evaluate-seeds."
+
+[products.issue_comment]
+delivery = "post to originating issue"
+content = "verdict (improved/no_improvement), best fitness, generation found, admission count, link to champion PR and evidence file"
+on_failure = "include best fitness achieved, last generation completed, full log available in tmp/evolution/run_NNN/evolution.log"
+
+# ── Resources ──────────────────────────────────────────────────────────────────
+
+[resources]
+profile = "heavy"
+compute = "CPU + RAM intensive — transpile + compile + deploy + revm eval per candidate"
+rpc = "Base network RPC (BASE_RPC_URL) for revm fork; or Anvil (EVAL_MODE=anvil)"
+concurrency = "exclusive — revm evaluator and optional Anvil share port 8545 with run-holdout and run-red-team"
+
+# ── Notes ──────────────────────────────────────────────────────────────────────
+
+[notes]
+no_uups_deployment = """
+The evolution pipeline produces Push3 candidate files only — no UUPS proxy
+deployment step is wired. Candidates are scored in simulation (revm or Anvil)
+and admitted to the seed pool for future runs. Deployment to a live chain is
+out of scope until the champion passes holdout and red-team gates.
+"""
+
+eval_mode = """
+Default EVAL_MODE is revm (batch-eval.sh): all candidates in a generation are
+scored in a single forge test invocation against a Base fork, 10-100× faster
+than per-candidate Anvil. Set EVAL_MODE=anvil to fall back to fitness.sh
+(slower, but does not require BASE_RPC_URL if Anvil is already running).
+Gas limit: revm evaluator runs at ~25 candidates × 100 trades per batch.
+For larger populations, increase the batch budget in batch-eval.sh.
+"""
+
+adversarial_fitness = """
+Adversarial fitness against attack scenarios ({attack_dir}/*.jsonl) is planned
+but not yet implemented (score-attacks step is status=planned). Currently the
+only fitness signal is the revm/Anvil metric from batch-eval.sh / fitness.sh.
+When implemented, attack survival will penalise candidates whose floor breaks
+under known attack patterns, biasing the population toward safer programs.
+"""
+
+fee_fitness = """
+Fee optimization against in-market pool data is planned as a second fitness
+dimension. Not yet implemented; tracked as a follow-up issue.
+"""
+
+pool_cap = """
+The evolved seed pool is capped at 100 entries by fitness rank. Hand-written
+(origin=hand-written) and LLM-generated (origin=llm) seeds are always pinned
+regardless of fitness. Evolved entries below the pool floor are evicted when
+new higher-scoring candidates are admitted. Raw fitness values are only
+comparable within the same evaluation run; entries with fitness_flags
+(token_value_inflation, processExecIf_fix) are ranked as fitness=0 for
+admission and eviction purposes.
+"""