From 708a00a2f471f25ff608cf3a448579ca12d12c87 Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Thu, 19 Mar 2026 11:25:01 +0000
Subject: [PATCH] fix: Formula: run-evolution (optimizer pipeline) (#975)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 formulas/run-evolution.toml | 295 ++++++++++++++++++++++++++++++++++++
 1 file changed, 295 insertions(+)
 create mode 100644 formulas/run-evolution.toml

diff --git a/formulas/run-evolution.toml b/formulas/run-evolution.toml
new file mode 100644
index 0000000..db4ce45
--- /dev/null
+++ b/formulas/run-evolution.toml
@@ -0,0 +1,295 @@
+# formulas/run-evolution.toml
+#
+# Push3 optimizer evolution pipeline — evaluate seed pool, evolve a population
+# of candidates, admit survivors back to the pool, deliver champions via PR.
+#
+# Type: act.  Produces git artifacts (new .push3 champions + updated
+#              manifest.jsonl via PR to main; evidence file committed to main).
+#
+# Depends on: #973 (evidence/evolution/ directory structure)
+
+[formula]
+id          = "run-evolution"
+name        = "Push3 Optimizer Evolution"
+description = "Evaluate seed pool, evolve Push3 optimizer population, admit survivors, deliver champions via PR."
+type        = "act"
+# "sense"  → read-only, produces metrics only
+# "act"    → produces git artifacts (this formula; cf. run-red-team)
+depends_on  = [973]
+
+# ── Inputs ─────────────────────────────────────────────────────────────────────
+
+[inputs.seed]
+type        = "string"
+required    = false
+default     = "tools/push3-evolution/seeds/optimizer_v3.push3"
+description = "Starting seed .push3 file (passed as --seed to evolve.sh).  Serves as the fallback mutation source when the pool does not fill the full population."
+
+[inputs.population]
+type        = "integer"
+required    = false
+default     = 10
+description = "Number of candidates per generation (--population)."
+
+[inputs.generations]
+type        = "integer"
+required    = false
+default     = 5
+description = "Number of evolution generations to run (--generations)."
+
+[inputs.mutation_rate]
+type        = "integer"
+required    = false
+default     = 2
+description = "Mutations applied per candidate per generation (--mutation-rate)."
+
+[inputs.elites]
+type        = "integer"
+required    = false
+default     = 2
+description = "Top-scoring candidates carried forward unchanged each generation (--elites)."
+
+[inputs.base_rpc_url]
+type        = "string"
+required    = true
+description = """
+Base network RPC endpoint forwarded as BASE_RPC_URL to both evaluate-seeds.sh
+and evolve.sh.  Required for the revm evaluator (default EVAL_MODE).
+Example: https://mainnet.base.org or a fork URL from a running Anvil instance.
+"""
+
+[inputs.run_id]
+type        = "integer"
+required    = false
+description = """
+Override the run ID used to name admitted candidates (e.g. run009_gen2_c005.push3).
+When omitted (recommended), it is auto-incremented from the highest existing run
+in manifest.jsonl.  Note: [execution].invocation does not currently pass --run-id.
+"""
+
+[inputs.attack_dir]
+type        = "string"
+required    = false
+default     = "onchain/script/backtesting/attacks"
+description = """
+Directory of .jsonl adversarial attack scenarios.  Intended as an adversarial
+fitness input — candidates scored against these patterns in addition to the
+revm fitness metric.  Not yet forwarded to evolve.sh; documented here as a
+forward spec.
+"""
+status      = "planned"
+
+# ── Execution ──────────────────────────────────────────────────────────────────
+#
+# Step 0 — evaluate-seeds.sh — runs before the main evolution loop.
+#   Scores any manifest.jsonl entries with fitness: null so the pool
+#   sampler has real fitness values when selecting gen_0 candidates.
+#
+# Steps 1-5 — phases inside evolve.sh, which owns the full evolution lifecycle:
+#   1. Initialise population: random sample from seed pool (--diverse-seeds).
+#   2. Score candidates via revm batch evaluator (batch-eval.sh).
+#   3. Tournament-select survivors; apply elitism + mutation / crossover.
+#   4. Repeat for N generations; track global best.
+#   5. Admit candidates above threshold (6e21 wei) into seeds/; rewrite manifest.
+#
+# The invocation below always passes --diverse-seeds to evolve.sh so gen_0 inherits pool diversity.
+# --run-id is omitted (inputs.run_id is not forwarded) so evolve.sh auto-increments from manifest.jsonl.
+
+[execution]
+pre_script     = "tools/push3-evolution/evaluate-seeds.sh"
+pre_invocation = "BASE_RPC_URL={base_rpc_url} bash tools/push3-evolution/evaluate-seeds.sh"
+script         = "tools/push3-evolution/evolve.sh"
+invocation     = "BASE_RPC_URL={base_rpc_url} bash tools/push3-evolution/evolve.sh --seed {seed} --population {population} --generations {generations} --mutation-rate {mutation_rate} --elites {elites} --output tmp/evolution --diverse-seeds"
+
+# Exit codes propagated by evolve.sh:
+#   0  evolution complete; best candidate found and pool admission attempted
+#   2  infrastructure error (RPC unreachable, missing tool, revm eval failed)
+
+# ── Steps ──────────────────────────────────────────────────────────────────────
+
+[[steps]]
+id          = "evaluate-seeds"
+description = """
+Score manifest entries with fitness: null before the evolution loop begins.
+tools/push3-evolution/evaluate-seeds.sh:
+  - Reads tools/push3-evolution/seeds/manifest.jsonl.
+  - For every entry where fitness is null, runs fitness.sh against the
+    corresponding .push3 file and records the numeric score.
+  - Rewrites manifest.jsonl atomically (temp-file rename).
+  - Exits 0 when nothing to do (idempotent; safe to re-run).
+  - Exits 2 on infrastructure error (eval stack unreachable).
+Primary targets: LLM-generated seeds (origin=llm) and evolved entries whose
+fitness was nulled due to scoring inflation (fitness_flags: token_value_inflation,
+processExecIf_fix).  Real fitness values allow --diverse-seeds to weight the
+gen_0 sample correctly.
+"""
+script      = "tools/push3-evolution/evaluate-seeds.sh"
+
+[[steps]]
+id          = "evolve"
+description = """
+Run the outer evolutionary loop via tools/push3-evolution/evolve.sh.
+
+Initialisation (gen_0):
+  A random sample of up to {population} candidates is drawn from the seed pool
+  (tools/push3-evolution/seeds/); any shortfall is filled by mutating {seed}.
+  Seeds with unevaluated fitness (null) are included in the sample with equal
+  probability — evaluate-seeds (step 0) should have resolved most of these.
+
+Per-generation loop ({generations} iterations):
+  a. Score all candidates in a single forge test invocation via
+     tools/push3-evolution/revm-evaluator/batch-eval.sh (EVAL_MODE=revm).
+     Falls back to per-candidate fitness.sh (EVAL_MODE=anvil) if revm is
+     unavailable.
+  b. Log generation stats: min / max / mean fitness, best candidate file.
+  c. Tournament-select survivors (k = population / 2).
+  d. Elitism: carry the top {elites} candidates forward unchanged.
+  e. Fill remaining slots: mutate random survivors (first half) and apply
+     pairwise crossover (second half); fall back to copy on failure.
+
+Output per run (tmp/evolution/run_NNN/):
+  generation_0.jsonl … generation_N.jsonl  per-candidate fitness records
+  best.push3                                global champion
+  diff.txt                                  constant delta vs seed
+  evolution.log                             full run transcript
+
+Pool admission (after final generation):
+  Candidates scoring above 6e21 wei are deduplicated by content hash and
+  admitted to tools/push3-evolution/seeds/, named run{NNN}_gen{G}_c{C}.push3.
+  manifest.jsonl is rewritten atomically; the evolved pool is capped at 100
+  entries by fitness rank (hand-written / LLM seeds are always pinned).
+"""
+script     = "tools/push3-evolution/evolve.sh"
+output_dir = "tmp/evolution"
+
+[[steps]]
+id          = "score-attacks"
+description = """
+[Planned] Score the champion against known adversarial attack scenarios in
+{attack_dir}/*.jsonl via onchain/script/backtesting/AttackRunner.s.sol.
+For each attack file:
+  - Replay the op sequence against a fresh Anvil snapshot.
+  - Record LM total ETH before and after.
+  - Emit one fitness adjustment: penalise the candidate's score if the
+    attack succeeds (floor broken), reward if the floor holds.
+Results feed back into the adversarial fitness component — candidates that
+survive all known attacks rank higher in the evidence record.
+Skipped when {attack_dir} is empty or AttackRunner is unavailable.
+"""
+status        = "planned"
+attack_source = "{attack_dir}/*.jsonl"
+forge_script  = "onchain/script/backtesting/AttackRunner.s.sol"
+
+[[steps]]
+id          = "collect"
+description = """
+Aggregate evolve.sh outputs into evidence/evolution/{date}.json.
+Reads:
+  - tmp/evolution/run_NNN/generation_N.jsonl   per-generation fitness records
+  - tmp/evolution/run_NNN/best.push3           champion file
+  - tools/push3-evolution/seeds/manifest.jsonl admission results
+Writes evidence/evolution/{date}.json conforming to the schema documented in
+evidence/README.md (section: ## Schema: evolution/YYYY-MM-DD.json).
+Verdict: "improved" if best_fitness > best seed fitness in manifest before
+the run; "no_improvement" otherwise.
+"""
+output = "evidence/evolution/{date}.json"
+schema = "evidence/README.md"
+
+[[steps]]
+id          = "deliver"
+description = """
+Commit evidence and champion files, open PR, post summary comment.
+
+1. Commit evidence/evolution/{date}.json to main.
+
+2. Open a Codeberg PR targeting main containing:
+     - tools/push3-evolution/seeds/evo_run{NNN}_champion.push3
+       (copied from tmp/evolution/run_NNN/best.push3)
+     - Updated tools/push3-evolution/seeds/manifest.jsonl
+       (with newly admitted candidates and fresh fitness scores)
+   PR title: "evo: run{NNN} champion — fitness={best_fitness}"
+   PR body: generation-by-generation table (gen, best, mean, worst fitness),
+            top-3 admitted candidates with fitness scores, constant diff vs
+            seed (from diff.txt), link to evidence file.
+
+3. Post summary comment to originating issue:
+     - Verdict (improved / no_improvement).
+     - Best fitness achieved and which generation it was found in.
+     - Admission count: N candidates added to seed pool.
+     - Link to champion PR (if new best found) and evidence file.
+     - If no_improvement: include best fitness achieved and seed pool size.
+"""
+
+# ── Products ───────────────────────────────────────────────────────────────────
+
+[products.evidence_file]
+path     = "evidence/evolution/{date}.json"
+delivery = "commit to main"
+schema   = "evidence/README.md"   # see ## Schema: evolution/YYYY-MM-DD.json
+
+[products.champion_files]
+path     = "tools/push3-evolution/seeds/evo_run{NNN}_champion.push3"
+# {NNN} is the auto-incremented run ID assigned by evolve.sh at runtime.
+delivery = "PR to main"
+note     = "Only created when at least one candidate exceeds the admission threshold (6e21 wei)."
+
+[products.manifest]
+path     = "tools/push3-evolution/seeds/manifest.jsonl"
+delivery = "PR to main (same PR as champion_files)"
+note     = "Updated with newly admitted entries and fitness scores from evaluate-seeds."
+
+[products.issue_comment]
+delivery   = "post to originating issue"
+content    = "verdict (improved/no_improvement), best fitness, generation found, admission count, link to champion PR and evidence file"
+on_failure = "include best fitness achieved, last generation completed, full log available in tmp/evolution/run_NNN/evolution.log"
+
+# ── Resources ──────────────────────────────────────────────────────────────────
+
+[resources]
+profile     = "heavy"
+compute     = "CPU + RAM intensive — transpile + compile + deploy + revm eval per candidate"
+rpc         = "Base network RPC (BASE_RPC_URL) for revm fork; or Anvil (EVAL_MODE=anvil)"
+concurrency = "exclusive — revm evaluator and optional Anvil share port 8545 with run-holdout and run-red-team"
+
+# ── Notes ──────────────────────────────────────────────────────────────────────
+
+[notes]
+no_uups_deployment = """
+The evolution pipeline produces Push3 candidate files only — no UUPS proxy
+deployment step is wired.  Candidates are scored in simulation (revm or Anvil)
+and admitted to the seed pool for future runs.  Deployment to a live chain is
+out of scope until the champion passes holdout and red-team gates.
+"""
+
+eval_mode = """
+Default EVAL_MODE is revm (batch-eval.sh): all candidates in a generation are
+scored in a single forge test invocation against a Base fork, 10-100× faster
+than per-candidate Anvil.  Set EVAL_MODE=anvil to fall back to fitness.sh
+(slower, but does not require BASE_RPC_URL if Anvil is already running).
+Gas limit: revm evaluator runs at ~25 candidates × 100 trades per batch.
+For larger populations, increase the batch budget in batch-eval.sh.
+"""
+
+adversarial_fitness = """
+Adversarial fitness against attack scenarios ({attack_dir}/*.jsonl) is planned
+but not yet implemented (score-attacks step is status=planned).  Currently the
+only fitness signal is the revm/Anvil metric from batch-eval.sh / fitness.sh.
+When implemented, attack survival will penalise candidates whose floor breaks
+under known attack patterns, biasing the population toward safer programs.
+"""
+
+fee_fitness = """
+Fee optimization against in-market pool data is planned as a second fitness
+dimension.  Not yet implemented; tracked as a follow-up issue.
+"""
+
+pool_cap = """
+The evolved seed pool is capped at 100 entries by fitness rank.  Hand-written
+(origin=hand-written) and LLM-generated (origin=llm) seeds are always pinned
+regardless of fitness.  Evolved entries below the pool floor are evicted when
+new higher-scoring candidates are admitted.  Raw fitness values are only
+comparable within the same evaluation run; entries with fitness_flags
+(token_value_inflation, processExecIf_fix) are ranked as fitness=0 for
+admission and eviction purposes.
+"""