# formulas/run-evolution.toml
#
# Push3 optimizer evolution pipeline — evaluate seed pool, evolve a population
# of candidates, admit survivors back to the pool, deliver champions via PR.
#
# Type: act. Produces git artifacts (new .push3 champions + updated
# manifest.jsonl via PR to main; evidence file committed to main).
#
# Depends on: #973 (evidence/evolution/ directory structure)

[formula]
id = "run-evolution"
name = "Push3 Optimizer Evolution"
description = "Evaluate seed pool, evolve Push3 optimizer population, admit survivors, deliver champions via PR."
# Formula types:
#   "sense" → read-only, produces metrics only
#   "act"   → produces git artifacts (cf. run-red-team, run-evolution)
type = "act"
# Issue numbers that must be resolved before this formula can run.
depends_on = [973]

# ── Inputs ─────────────────────────────────────────────────────────────────────
#
# Each [inputs.<name>] table declares one parameter. The {name} placeholders
# used in [execution] and in step descriptions refer to these tables.

[inputs.seed]
type = "string"
required = false
default = "tools/push3-evolution/seeds/optimizer_v3.push3"
description = "Starting seed .push3 file (passed as --seed to evolve.sh). Serves as the fallback mutation source when the pool does not fill the full population."

[inputs.population]
type = "integer"
required = false
default = 10
description = "Number of candidates per generation (--population)."

[inputs.generations]
type = "integer"
required = false
default = 5
description = "Number of evolution generations to run (--generations)."

[inputs.mutation_rate]
type = "integer"
required = false
default = 2
description = "Mutations applied per candidate per generation (--mutation-rate)."

[inputs.elites]
type = "integer"
required = false
default = 2
description = "Top-scoring candidates carried forward unchanged each generation (--elites)."

# The only required input: no default is provided.
[inputs.base_rpc_url]
type = "string"
required = true
description = """
Base network RPC endpoint forwarded as BASE_RPC_URL to both evaluate-seeds.sh
and evolve.sh. Required for the revm evaluator (default EVAL_MODE).
Example: https://mainnet.base.org or a fork URL from a running Anvil instance.
"""

# NOTE(review): [execution].invocation does not pass --run-id, so a supplied
# run_id has no visible route to evolve.sh — confirm whether the runner appends
# it, or mark this input as planned like attack_dir.
[inputs.run_id]
type = "integer"
required = false
description = """
Override the run ID used when naming candidates admitted to the seed pool
(e.g. run009_gen2_c005.push3). Auto-incremented from the highest existing
run in manifest.jsonl when omitted (recommended).
"""

# Forward spec only: declared here but not consumed by [execution] yet.
[inputs.attack_dir]
type = "string"
required = false
default = "onchain/script/backtesting/attacks"
description = """
Directory of .jsonl adversarial attack scenarios. Intended as an adversarial
fitness input — candidates scored against these patterns in addition to the
revm fitness metric. Not yet forwarded to evolve.sh; documented here as a
forward spec.
"""
status = "planned"

# ── Execution ──────────────────────────────────────────────────────────────────
#
# Step 0 — evaluate-seeds.sh — runs before the main evolution loop.
#   Scores any manifest.jsonl entries with fitness: null so the pool
#   sampler has real fitness values when selecting gen_0 candidates.
#
# Steps 1-5 — evolve.sh — owns the full evolution lifecycle:
#   1. Initialise population: random sample from seed pool (--diverse-seeds).
#   2. Score candidates via revm batch evaluator (batch-eval.sh).
#   3. Tournament-select survivors; apply elitism + mutation / crossover.
#   4. Repeat for N generations; track global best.
#   5. Admit candidates above threshold (6e21 wei) into seeds/; rewrite manifest.
#
# The invocation always passes --diverse-seeds to evolve.sh so gen_0 inherits
# pool diversity.
# --run-id is omitted to let evolve.sh auto-increment from manifest.jsonl.
# Command templates for the two scripts that make up a run. TOML performs no
# interpolation: the {base_rpc_url}, {seed}, … placeholders are literal text
# here and are presumably substituted from [inputs] by the formula runner.
# Not forwarded: run_id (evolve.sh auto-increments) and attack_dir (planned).
[execution]
pre_script = "tools/push3-evolution/evaluate-seeds.sh"
pre_invocation = "BASE_RPC_URL={base_rpc_url} bash tools/push3-evolution/evaluate-seeds.sh"
script = "tools/push3-evolution/evolve.sh"
invocation = "BASE_RPC_URL={base_rpc_url} bash tools/push3-evolution/evolve.sh --seed {seed} --population {population} --generations {generations} --mutation-rate {mutation_rate} --elites {elites} --output tmp/evolution --diverse-seeds"

# Exit codes propagated by evolve.sh:
#   0  evolution complete; best candidate found and pool admission attempted
#   2  infrastructure error (RPC unreachable, missing tool, revm eval failed)

# ── Steps ──────────────────────────────────────────────────────────────────────
#
# Ordered list of pipeline stages. Steps without a `script` key (collect,
# deliver) are described in prose only; score-attacks is status = "planned".

[[steps]]
id = "evaluate-seeds"
description = """
Score manifest entries with fitness: null before the evolution loop begins.

tools/push3-evolution/evaluate-seeds.sh:
  - Reads tools/push3-evolution/seeds/manifest.jsonl.
  - For every entry where fitness is null, runs fitness.sh against the
    corresponding .push3 file and records the numeric score.
  - Rewrites manifest.jsonl atomically (temp-file rename).
  - Exits 0 when nothing to do (idempotent; safe to re-run).
  - Exits 2 on infrastructure error (eval stack unreachable).

Primary targets: LLM-generated seeds (origin=llm) and evolved entries whose
fitness was nulled due to scoring inflation (fitness_flags:
token_value_inflation, processExecIf_fix). Real fitness values allow
--diverse-seeds to weight the gen_0 sample correctly.
"""
script = "tools/push3-evolution/evaluate-seeds.sh"

[[steps]]
id = "evolve"
description = """
Run the outer evolutionary loop via tools/push3-evolution/evolve.sh.

Initialisation (gen_0):
  A random sample of up to {population} candidates is drawn from the seed
  pool (tools/push3-evolution/seeds/); any shortfall is filled by mutating
  {seed}. Seeds with unevaluated fitness (null) are included in the sample
  with equal probability — evaluate-seeds (step 0) should have resolved most
  of these.

Per-generation loop ({generations} iterations):
  a. Score all candidates in a single forge test invocation via
     tools/push3-evolution/revm-evaluator/batch-eval.sh (EVAL_MODE=revm).
     Falls back to per-candidate fitness.sh (EVAL_MODE=anvil) if revm is
     unavailable.
  b. Log generation stats: min / max / mean fitness, best candidate file.
  c. Tournament-select survivors (k = population / 2).
  d. Elitism: carry the top {elites} candidates forward unchanged.
  e. Fill remaining slots: mutate random survivors (first half) and apply
     pairwise crossover (second half); fall back to copy on failure.

Output per run (tmp/evolution/run_NNN/):
  generation_0.jsonl … generation_N.jsonl   per-candidate fitness records
  best.push3                                global champion
  diff.txt                                  constant delta vs seed
  evolution.log                             full run transcript

Pool admission (after final generation):
  Candidates scoring above 6e21 wei are deduplicated by content hash and
  admitted to tools/push3-evolution/seeds/, named run{NNN}_gen{G}_c{C}.push3.
  manifest.jsonl is rewritten atomically; the evolved pool is capped at 100
  entries by fitness rank (hand-written / LLM seeds are always pinned).
"""
script = "tools/push3-evolution/evolve.sh"
output_dir = "tmp/evolution"

# Not implemented yet; consumes the planned attack_dir input.
[[steps]]
id = "score-attacks"
description = """
[Planned] Score the champion against known adversarial attack scenarios in
{attack_dir}/*.jsonl via onchain/script/backtesting/AttackRunner.s.sol.

For each attack file:
  - Replay the op sequence against a fresh Anvil snapshot.
  - Record LM total ETH before and after.
  - Emit one fitness adjustment: penalise the candidate's score if the
    attack succeeds (floor broken), reward if the floor holds.

Results feed back into the adversarial fitness component — candidates that
survive all known attacks rank higher in the evidence record.

Skipped when {attack_dir} is empty or AttackRunner is unavailable.
"""
status = "planned"
attack_source = "{attack_dir}/*.jsonl"
forge_script = "onchain/script/backtesting/AttackRunner.s.sol"

[[steps]]
id = "collect"
description = """
Aggregate evolve.sh outputs into evidence/evolution/{date}.json.

Reads:
  - tmp/evolution/run_NNN/generation_N.jsonl   per-generation fitness records
  - tmp/evolution/run_NNN/best.push3           champion file
  - tools/push3-evolution/seeds/manifest.jsonl admission results

Writes evidence/evolution/{date}.json conforming to the schema in
evidence/README.md ## Schema: evolution/YYYY-MM-DD.json.

Verdict: "improved" if best_fitness > best seed fitness in manifest before
the run; "no_improvement" otherwise.
"""
output = "evidence/evolution/{date}.json"
schema = "evidence/README.md"

[[steps]]
id = "deliver"
description = """
Commit evidence and champion files, open PR, post summary comment.

1. Commit evidence/evolution/{date}.json to main.

2. Open a Codeberg PR targeting main containing:
   - tools/push3-evolution/seeds/evo_run{NNN}_champion.push3
     (copied from tmp/evolution/run_NNN/best.push3)
   - Updated tools/push3-evolution/seeds/manifest.jsonl
     (with newly admitted candidates and fresh fitness scores)
   PR title: "evo: run{NNN} champion — fitness={best_fitness}"
   PR body: generation-by-generation table (gen, best, mean, worst fitness),
   top-3 admitted candidates with fitness scores, constant diff vs seed
   (from diff.txt), link to evidence file.

3. Post summary comment to originating issue:
   - Verdict (improved / no_improvement).
   - Best fitness achieved and which generation it was found in.
   - Admission count: N candidates added to seed pool.
   - Link to champion PR (if new best found) and evidence file.
   - If no_improvement: include best fitness achieved and seed pool size.
"""

# ── Products ───────────────────────────────────────────────────────────────────
#
# Artifacts a run leaves behind and how each one is delivered.

[products.evidence_file]
path = "evidence/evolution/{date}.json"
delivery = "commit to main"
schema = "evidence/README.md"  # see ## Schema: evolution/YYYY-MM-DD.json

[products.champion_files]
# {NNN} is the auto-incremented run ID assigned by evolve.sh at runtime.
path = "tools/push3-evolution/seeds/evo_run{NNN}_champion.push3"
delivery = "PR to main"
note = "Only created when at least one candidate exceeds the admission threshold (6e21 wei)."

[products.manifest]
path = "tools/push3-evolution/seeds/manifest.jsonl"
delivery = "PR to main (same PR as champion_files)"
note = "Updated with newly admitted entries and fitness scores from evaluate-seeds."

# No path: this product is a comment on the issue, not a file.
[products.issue_comment]
delivery = "post to originating issue"
content = "verdict (improved/no_improvement), best fitness, generation found, admission count, link to champion PR and evidence file"
on_failure = "include best fitness achieved, last generation completed, full log available in tmp/evolution/run_NNN/evolution.log"

# ── Resources ──────────────────────────────────────────────────────────────────

[resources]
profile = "heavy"
compute = "CPU + RAM intensive — transpile + compile + deploy + revm eval per candidate"
rpc = "Base network RPC (BASE_RPC_URL) for revm fork; or Anvil (EVAL_MODE=anvil)"
concurrency = "exclusive — revm evaluator and optional Anvil share port 8545 with run-holdout and run-red-team"

# ── Notes ──────────────────────────────────────────────────────────────────────
#
# Free-form background for operators; each key is an independent note.

[notes]
no_uups_deployment = """
The evolution pipeline produces Push3 candidate files only — no UUPS proxy
deployment step is wired. Candidates are scored in simulation (revm or Anvil)
and admitted to the seed pool for future runs. Deployment to a live chain is
out of scope until the champion passes holdout and red-team gates.
"""
eval_mode = """
Default EVAL_MODE is revm (batch-eval.sh): all candidates in a generation are
scored in a single forge test invocation against a Base fork, 10-100× faster
than per-candidate Anvil. Set EVAL_MODE=anvil to fall back to fitness.sh
(slower, but does not require BASE_RPC_URL if Anvil is already running).
Gas limit: revm evaluator runs at ~25 candidates × 100 trades per batch.
For larger populations, increase the batch budget in batch-eval.sh.
"""
adversarial_fitness = """
Adversarial fitness against attack scenarios ({attack_dir}/*.jsonl) is planned
but not yet implemented (score-attacks step is status=planned). Currently the
only fitness signal is the revm/Anvil metric from batch-eval.sh / fitness.sh.
When implemented, attack survival will penalise candidates whose floor breaks
under known attack patterns, biasing the population toward safer programs.
"""
fee_fitness = """
Fee optimization against in-market pool data is planned as a second fitness
dimension. Not yet implemented; tracked as a follow-up issue.
"""
pool_cap = """
The evolved seed pool is capped at 100 entries by fitness rank. Hand-written
(origin=hand-written) and LLM-generated (origin=llm) seeds are always pinned
regardless of fitness. Evolved entries below the pool floor are evicted when
new higher-scoring candidates are admitted. Raw fitness values are only
comparable within the same evaluation run; entries with fitness_flags
(token_value_inflation, processExecIf_fix) are ranked as fitness=0 for
admission and eviction purposes.
"""