harb/formulas/run-evolution.toml
openhands 708a00a2f4 fix: Formula: run-evolution (optimizer pipeline) (#975)
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-19 11:25:01 +00:00

295 lines
14 KiB
TOML
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# formulas/run-evolution.toml
#
# Push3 optimizer evolution pipeline — evaluate seed pool, evolve a population
# of candidates, admit survivors back to the pool, deliver champions via PR.
#
# Type: act. Produces git artifacts (new .push3 champions + updated
# manifest.jsonl via PR to main; evidence file committed to main).
#
# Depends on: #973 (evidence/evolution/ directory structure)
[formula]
# Identity and classification of this formula. The id matches the file name
# (formulas/run-evolution.toml).
id = "run-evolution"
name = "Push3 Optimizer Evolution"
description = "Evaluate seed pool, evolve Push3 optimizer population, admit survivors, deliver champions via PR."
# "act" because the run yields git artifacts: the [products.*] tables in this
# file list an evidence file committed to main and a PR with champion files.
type = "act"
# "sense" → read-only, produces metrics only
# "act" → produces git artifacts (cf. run-red-team, run-evolution)
# Issue numbers this formula depends on; 973 is described in the file header
# as the evidence/evolution/ directory structure.
depends_on = [973]
# ── Inputs ─────────────────────────────────────────────────────────────────────
[inputs.seed]
# Optional path to a .push3 seed; substituted for {seed} in the evolve.sh
# command line ([execution].invocation) as the --seed argument.
type = "string"
required = false
# The default lives in the same directory as the seed pool
# (tools/push3-evolution/seeds/).
default = "tools/push3-evolution/seeds/optimizer_v3.push3"
description = "Starting seed .push3 file (passed as --seed to evolve.sh). Serves as the fallback mutation source when the pool does not fill the full population."
[inputs.population]
# Optional; substituted for {population} in [execution].invocation.
# [notes].eval_mode states the revm evaluator is sized for roughly 25
# candidates per batch, so much larger values may need the batch budget in
# batch-eval.sh raised.
type = "integer"
required = false
default = 10
description = "Number of candidates per generation (--population)."
[inputs.generations]
# Optional; substituted for {generations} in [execution].invocation and
# referenced as the iteration count in the "evolve" step description.
type = "integer"
required = false
default = 5
description = "Number of evolution generations to run (--generations)."
[inputs.mutation_rate]
# Optional; substituted for {mutation_rate} in [execution].invocation.
# Despite the name this is an integer count of mutations per candidate,
# not a probability.
type = "integer"
required = false
default = 2
description = "Mutations applied per candidate per generation (--mutation-rate)."
[inputs.elites]
# Optional; substituted for {elites} in [execution].invocation.
# NOTE(review): nothing in this file constrains elites relative to
# population; presumably evolve.sh validates elites <= population - confirm.
type = "integer"
required = false
default = 2
description = "Top-scoring candidates carried forward unchanged each generation (--elites)."
[inputs.base_rpc_url]
# The only input declared here with required = true, and therefore the only
# one without a default. It is substituted for {base_rpc_url} in both
# [execution].pre_invocation and [execution].invocation, where it is set as
# the BASE_RPC_URL environment variable for the script being launched.
type = "string"
required = true
description = """
Base network RPC endpoint forwarded as BASE_RPC_URL to both evaluate-seeds.sh
and evolve.sh. Required for the revm evaluator (default EVAL_MODE).
Example: https://mainnet.base.org or a fork URL from a running Anvil instance.
"""
[inputs.run_id]
# Optional run-number override; no default is declared because the value is
# auto-incremented when omitted (see description).
# NOTE(review): [execution].invocation in this file contains no --run-id
# argument and no {run_id} placeholder, and the comment above [execution]
# says --run-id is omitted on purpose. As written, a value supplied for this
# input does not appear to reach evolve.sh - confirm whether the runner
# appends it or whether this input is not yet wired.
type = "integer"
required = false
description = """
Override the run ID used when naming candidates admitted to the seed pool
(e.g. run009_gen2_c005.push3). Auto-incremented from the highest existing
run in manifest.jsonl when omitted (recommended).
"""
[inputs.attack_dir]
# Forward-spec input: it is not part of either command line in [execution].
# Within this file it is only referenced by the planned "score-attacks" step
# (attack_source) and by [notes].adversarial_fitness.
type = "string"
required = false
default = "onchain/script/backtesting/attacks"
description = """
Directory of .jsonl adversarial attack scenarios. Intended as an adversarial
fitness input — candidates scored against these patterns in addition to the
revm fitness metric. Not yet forwarded to evolve.sh; documented here as a
forward spec.
"""
# Marks the input as declared but not yet implemented; the same marker is
# used on the "score-attacks" step.
status = "planned"
# ── Execution ──────────────────────────────────────────────────────────────────
#
# Step 0 — evaluate-seeds.sh — runs before the main evolution loop.
# Scores any manifest.jsonl entries with fitness: null so the pool
# sampler has real fitness values when selecting gen_0 candidates.
#
# Steps 1-5 — evolve.sh — owns the full evolution lifecycle:
# 1. Initialise population: random sample from seed pool (--diverse-seeds).
# 2. Score candidates via revm batch evaluator (batch-eval.sh).
# 3. Tournament-select survivors; apply elitism + mutation / crossover.
# 4. Repeat for N generations; track global best.
# 5. Admit candidates above threshold (6e21 wei) into seeds/; rewrite manifest.
#
# evolve.sh is always invoked with --diverse-seeds so gen_0 inherits pool diversity.
# --run-id is omitted to let evolve.sh auto-increment from manifest.jsonl.
[execution]
# Two command lines make up a run: pre_invocation (evaluate-seeds.sh, "Step 0")
# followed by invocation (evolve.sh). pre_script / script name the scripts
# themselves. {placeholders} carry the names of the [inputs.*] tables and are
# presumably substituted by the formula runner before the shell sees the line.
#
# The string-typed inputs ({base_rpc_url}, {seed}) are wrapped in double
# quotes so that an RPC URL carrying a query string (?key=...&...) or a seed
# path containing spaces reaches the script as one shell word instead of being
# globbed, split, or cut at an unquoted "&". This guards against accidental
# breakage only; "$" and backticks are still expanded inside double quotes.
# TOML literal strings ('...') are used so the embedded double quotes need no
# escaping. Integer inputs are left unquoted.
pre_script = "tools/push3-evolution/evaluate-seeds.sh"
pre_invocation = 'BASE_RPC_URL="{base_rpc_url}" bash tools/push3-evolution/evaluate-seeds.sh'
script = "tools/push3-evolution/evolve.sh"
# --run-id is intentionally absent so evolve.sh auto-increments the run number;
# as a consequence the optional run_id input is not forwarded by this line.
# --output matches output_dir of the "evolve" step (tmp/evolution).
invocation = 'BASE_RPC_URL="{base_rpc_url}" bash tools/push3-evolution/evolve.sh --seed "{seed}" --population {population} --generations {generations} --mutation-rate {mutation_rate} --elites {elites} --output tmp/evolution --diverse-seeds'
# Exit codes propagated by evolve.sh:
# 0 evolution complete; best candidate found and pool admission attempted
# 2 infrastructure error (RPC unreachable, missing tool, revm eval failed)
# The "evaluate-seeds" step description lists the same 0 / 2 convention for
# evaluate-seeds.sh.
# ── Steps ──────────────────────────────────────────────────────────────────────
[[steps]]
# First pipeline step ("Step 0"): fills in missing fitness values in the seed
# manifest before evolution starts. Its script is the same file named by
# [execution].pre_script.
id = "evaluate-seeds"
description = """
Score manifest entries with fitness: null before the evolution loop begins.
tools/push3-evolution/evaluate-seeds.sh:
- Reads tools/push3-evolution/seeds/manifest.jsonl.
- For every entry where fitness is null, runs fitness.sh against the
corresponding .push3 file and records the numeric score.
- Rewrites manifest.jsonl atomically (temp-file rename).
- Exits 0 when nothing to do (idempotent; safe to re-run).
- Exits 2 on infrastructure error (eval stack unreachable).
Primary targets: LLM-generated seeds (origin=llm) and evolved entries whose
fitness was nulled due to scoring inflation (fitness_flags: token_value_inflation,
processExecIf_fix). Real fitness values allow --diverse-seeds to weight the
gen_0 sample correctly.
"""
script = "tools/push3-evolution/evaluate-seeds.sh"
[[steps]]
# Main evolution step. Its script is the same file named by [execution].script
# and output_dir equals the --output value in [execution].invocation.
# {population}, {seed}, {generations} and {elites} in the description refer to
# the [inputs.*] tables of the same names.
# NOTE(review): the description below says null-fitness seeds are sampled
# "with equal probability", while the "evaluate-seeds" step description says
# real fitness values let --diverse-seeds "weight" the gen_0 sample. Confirm
# against evolve.sh whether sampling is uniform or fitness-weighted.
id = "evolve"
description = """
Run the outer evolutionary loop via tools/push3-evolution/evolve.sh.
Initialisation (gen_0):
A random sample of up to {population} candidates is drawn from the seed pool
(tools/push3-evolution/seeds/); any shortfall is filled by mutating {seed}.
Seeds with unevaluated fitness (null) are included in the sample with equal
probability — evaluate-seeds (step 0) should have resolved most of these.
Per-generation loop ({generations} iterations):
a. Score all candidates in a single forge test invocation via
tools/push3-evolution/revm-evaluator/batch-eval.sh (EVAL_MODE=revm).
Falls back to per-candidate fitness.sh (EVAL_MODE=anvil) if revm is
unavailable.
b. Log generation stats: min / max / mean fitness, best candidate file.
c. Tournament-select survivors (k = population / 2).
d. Elitism: carry the top {elites} candidates forward unchanged.
e. Fill remaining slots: mutate random survivors (first half) and apply
pairwise crossover (second half); fall back to copy on failure.
Output per run (tmp/evolution/run_NNN/):
generation_0.jsonl … generation_N.jsonl per-candidate fitness records
best.push3 global champion
diff.txt constant delta vs seed
evolution.log full run transcript
Pool admission (after final generation):
Candidates scoring above 6e21 wei are deduplicated by content hash and
admitted to tools/push3-evolution/seeds/, named run{NNN}_gen{G}_c{C}.push3.
manifest.jsonl is rewritten atomically; the evolved pool is capped at 100
entries by fitness rank (hand-written / LLM seeds are always pinned).
"""
script = "tools/push3-evolution/evolve.sh"
# Parent directory of the per-run run_NNN/ folders listed in the description.
output_dir = "tmp/evolution"
[[steps]]
# Planned step, not yet implemented: it carries status = "planned" and,
# unlike the evaluate-seeds and evolve steps, declares no script key.
# [notes].adversarial_fitness states the same.
id = "score-attacks"
description = """
[Planned] Score the champion against known adversarial attack scenarios in
{attack_dir}/*.jsonl via onchain/script/backtesting/AttackRunner.s.sol.
For each attack file:
- Replay the op sequence against a fresh Anvil snapshot.
- Record LM total ETH before and after.
- Emit one fitness adjustment: penalise the candidate's score if the
attack succeeds (floor broken), reward if the floor holds.
Results feed back into the adversarial fitness component — candidates that
survive all known attacks rank higher in the evidence record.
Skipped when {attack_dir} is empty or AttackRunner is unavailable.
"""
status = "planned"
# Glob over the directory given by the attack_dir input.
attack_source = "{attack_dir}/*.jsonl"
# Same AttackRunner path that the description names as the replay harness.
forge_script = "onchain/script/backtesting/AttackRunner.s.sol"
[[steps]]
# Aggregation step: turns the evolve outputs under tmp/evolution/run_NNN/ and
# the seed manifest into a single evidence JSON file. No script key is
# declared for this step.
# NOTE(review): {date} is not one of the [inputs.*] tables in this file;
# presumably the runner supplies it as YYYY-MM-DD (the schema heading quoted
# in the description uses that form) - confirm.
id = "collect"
description = """
Aggregate evolve.sh outputs into evidence/evolution/{date}.json.
Reads:
- tmp/evolution/run_NNN/generation_N.jsonl per-generation fitness records
- tmp/evolution/run_NNN/best.push3 champion file
- tools/push3-evolution/seeds/manifest.jsonl admission results
Writes evidence/evolution/{date}.json conforming to the schema in
evidence/README.md ## Schema: evolution/YYYY-MM-DD.json.
Verdict: "improved" if best_fitness > best seed fitness in manifest before
the run; "no_improvement" otherwise.
"""
# Same path as [products.evidence_file].path.
output = "evidence/evolution/{date}.json"
# File that contains the schema section referenced in the description.
schema = "evidence/README.md"
[[steps]]
# Final step: publishes the results. It declares only id and description; the
# artifacts it describes are the ones listed in the [products.*] tables
# (evidence file committed to main, champion + manifest via PR, issue comment).
# The champion file name used here (evo_run{NNN}_champion.push3) is distinct
# from the run{NNN}_gen{G}_c{C}.push3 names that the "evolve" step gives to
# candidates admitted to the seed pool.
id = "deliver"
description = """
Commit evidence and champion files, open PR, post summary comment.
1. Commit evidence/evolution/{date}.json to main.
2. Open a Codeberg PR targeting main containing:
- tools/push3-evolution/seeds/evo_run{NNN}_champion.push3
(copied from tmp/evolution/run_NNN/best.push3)
- Updated tools/push3-evolution/seeds/manifest.jsonl
(with newly admitted candidates and fresh fitness scores)
PR title: "evo: run{NNN} champion fitness={best_fitness}"
PR body: generation-by-generation table (gen, best, mean, worst fitness),
top-3 admitted candidates with fitness scores, constant diff vs
seed (from diff.txt), link to evidence file.
3. Post summary comment to originating issue:
- Verdict (improved / no_improvement).
- Best fitness achieved and which generation it was found in.
- Admission count: N candidates added to seed pool.
- Link to champion PR (if new best found) and evidence file.
- If no_improvement: include best fitness achieved and seed pool size.
"""
# ── Products ───────────────────────────────────────────────────────────────────
[products.evidence_file]
# The JSON evidence record written by the "collect" step (same path as that
# step's output key). Unlike the other file products it is committed directly
# to main rather than delivered through the PR.
path = "evidence/evolution/{date}.json"
delivery = "commit to main"
schema = "evidence/README.md" # see ## Schema: evolution/YYYY-MM-DD.json
[products.champion_files]
# Champion program delivered through the PR opened by the "deliver" step,
# which describes it as a copy of tmp/evolution/run_NNN/best.push3.
# NOTE(review): the note below gates creation on a candidate exceeding the
# admission threshold, whereas the "evolve" step lists best.push3 as a per-run
# output without such a condition - confirm which condition the deliver step
# actually applies.
path = "tools/push3-evolution/seeds/evo_run{NNN}_champion.push3"
# {NNN} is the auto-incremented run ID assigned by evolve.sh at runtime.
delivery = "PR to main"
note = "Only created when at least one candidate exceeds the admission threshold (6e21 wei)."
[products.manifest]
# Seed-pool manifest. Per the step descriptions it is rewritten by both
# evaluate-seeds.sh (fitness scores) and evolve.sh (pool admission), and the
# result travels in the same PR as the champion file.
path = "tools/push3-evolution/seeds/manifest.jsonl"
delivery = "PR to main (same PR as champion_files)"
note = "Updated with newly admitted entries and fitness scores from evaluate-seeds."
[products.issue_comment]
# Summary comment posted by the "deliver" step. It is not a file, so it has no
# path key. content describes the normal report; on_failure describes what to
# report instead and points at the evolution.log written under the run
# directory.
delivery = "post to originating issue"
content = "verdict (improved/no_improvement), best fitness, generation found, admission count, link to champion PR and evidence file"
on_failure = "include best fitness achieved, last generation completed, full log available in tmp/evolution/run_NNN/evolution.log"
# ── Resources ──────────────────────────────────────────────────────────────────
[resources]
# Scheduling hints for whoever runs this formula. All values are free-form
# descriptive strings rather than machine-checked limits.
profile = "heavy"
compute = "CPU + RAM intensive — transpile + compile + deploy + revm eval per candidate"
# BASE_RPC_URL is supplied through the base_rpc_url input.
rpc = "Base network RPC (BASE_RPC_URL) for revm fork; or Anvil (EVAL_MODE=anvil)"
# Must not run concurrently with run-holdout or run-red-team because of the
# shared port named below.
concurrency = "exclusive — revm evaluator and optional Anvil share port 8545 with run-holdout and run-red-team"
# ── Notes ──────────────────────────────────────────────────────────────────────
[notes]
# Free-form background notes, one multi-line string per topic. Nothing in the
# [execution] table references these keys.

# Scope boundary: this formula stops at simulated scoring and pool admission.
no_uups_deployment = """
The evolution pipeline produces Push3 candidate files only — no UUPS proxy
deployment step is wired. Candidates are scored in simulation (revm or Anvil)
and admitted to the seed pool for future runs. Deployment to a live chain is
out of scope until the champion passes holdout and red-team gates.
"""
# Explains the two evaluator back ends. EVAL_MODE is an environment variable
# that the command lines in [execution] do not set, so those command lines
# rely on the default (revm) described here.
eval_mode = """
Default EVAL_MODE is revm (batch-eval.sh): all candidates in a generation are
scored in a single forge test invocation against a Base fork, 10-100× faster
than per-candidate Anvil. Set EVAL_MODE=anvil to fall back to fitness.sh
(slower, but does not require BASE_RPC_URL if Anvil is already running).
Gas limit: revm evaluator runs at ~25 candidates × 100 trades per batch.
For larger populations, increase the batch budget in batch-eval.sh.
"""
# Status of the planned "score-attacks" step and the attack_dir input.
adversarial_fitness = """
Adversarial fitness against attack scenarios ({attack_dir}/*.jsonl) is planned
but not yet implemented (score-attacks step is status=planned). Currently the
only fitness signal is the revm/Anvil metric from batch-eval.sh / fitness.sh.
When implemented, attack survival will penalise candidates whose floor breaks
under known attack patterns, biasing the population toward safer programs.
"""
# A second planned fitness dimension; no input or step in this file
# corresponds to it yet.
fee_fitness = """
Fee optimization against in-market pool data is planned as a second fitness
dimension. Not yet implemented; tracked as a follow-up issue.
"""
# Eviction policy for the seed pool; expands on the 100-entry cap mentioned in
# the "evolve" step description.
pool_cap = """
The evolved seed pool is capped at 100 entries by fitness rank. Hand-written
(origin=hand-written) and LLM-generated (origin=llm) seeds are always pinned
regardless of fitness. Evolved entries below the pool floor are evicted when
new higher-scoring candidates are admitted. Raw fitness values are only
comparable within the same evaluation run; entries with fitness_flags
(token_value_inflation, processExecIf_fix) are ranked as fitness=0 for
admission and eviction purposes.
"""