- Add `cleanup` step: removes per-generation candidate files and generation_*.jsonl records after they are aggregated into the evidence file, preventing disk exhaustion (cf. run #1025 at 91% usage). - Rewrite `deliver` step with mandatory ordering: 1. `git checkout -- .` to discard unrelated working-tree modifications before staging result files (evidence JSON, champion .push3, manifest). 2. Commit to branch `evidence/evolution-run-{run_id}` (not directly to main). 3. Push and create PR — if this fails, post an error comment and leave the issue OPEN; do not proceed to step 4. 4. Post summary comment only after PR URL is confirmed, with mandatory link to the PR. - Update `products.evidence_file` delivery to PR branch (was "commit to main"). - Update `products.issue_comment` to enforce ordering and non-close-on-failure. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
344 lines
16 KiB
TOML
344 lines
16 KiB
TOML
# formulas/run-evolution.toml
|
||
#
|
||
# Push3 optimizer evolution pipeline — evaluate seed pool, evolve a population
|
||
# of candidates, admit survivors back to the pool, deliver champions via PR.
|
||
#
|
||
# Type: act. Produces git artifacts (new .push3 champions + updated
|
||
# manifest.jsonl via PR to main; evidence file delivered in the same PR).
|
||
#
|
||
# Depends on: #973 (evidence/evolution/ directory structure)
|
||
|
||
[formula]
|
||
id = "run-evolution"
|
||
name = "Push3 Optimizer Evolution"
|
||
description = "Evaluate seed pool, evolve Push3 optimizer population, admit survivors, deliver champions via PR."
|
||
type = "act"
|
||
# "sense" → read-only, produces metrics only
|
||
# "act" → produces git artifacts (cf. run-red-team, run-evolution)
|
||
depends_on = [973]
|
||
|
||
# ── Inputs ─────────────────────────────────────────────────────────────────────
|
||
|
||
[inputs.seed]
|
||
type = "string"
|
||
required = false
|
||
default = "tools/push3-evolution/seeds/optimizer_v3.push3"
|
||
description = "Starting seed .push3 file (passed as --seed to evolve.sh). Serves as the fallback mutation source when the pool does not fill the full population."
|
||
|
||
[inputs.population]
|
||
type = "integer"
|
||
required = false
|
||
default = 10
|
||
description = "Number of candidates per generation (--population)."
|
||
|
||
[inputs.generations]
|
||
type = "integer"
|
||
required = false
|
||
default = 5
|
||
description = "Number of evolution generations to run (--generations)."
|
||
|
||
[inputs.mutation_rate]
|
||
type = "integer"
|
||
required = false
|
||
default = 2
|
||
description = "Mutations applied per candidate per generation (--mutation-rate)."
|
||
|
||
[inputs.elites]
|
||
type = "integer"
|
||
required = false
|
||
default = 2
|
||
description = "Top-scoring candidates carried forward unchanged each generation (--elites)."
|
||
|
||
[inputs.base_rpc_url]
|
||
type = "string"
|
||
required = true
|
||
description = """
|
||
Base network RPC endpoint forwarded as BASE_RPC_URL to both evaluate-seeds.sh
|
||
and evolve.sh. Required for the revm evaluator (default EVAL_MODE).
|
||
Example: https://mainnet.base.org or a fork URL from a running Anvil instance.
|
||
"""
|
||
|
||
[inputs.run_id]
|
||
type = "integer"
|
||
required = false
|
||
description = """
|
||
Override the run ID used when naming candidates admitted to the seed pool
|
||
(e.g. run009_gen2_c005.push3). Auto-incremented from the highest existing
|
||
run in manifest.jsonl when omitted (recommended).
|
||
"""
|
||
|
||
[inputs.attack_dir]
|
||
type = "string"
|
||
required = false
|
||
default = "onchain/script/backtesting/attacks"
|
||
description = """
|
||
Directory of .jsonl adversarial attack scenarios. Intended as an adversarial
|
||
fitness input — candidates scored against these patterns in addition to the
|
||
revm fitness metric. Not yet forwarded to evolve.sh; documented here as a
|
||
forward spec.
|
||
"""
|
||
status = "planned"
|
||
|
||
# ── Execution ──────────────────────────────────────────────────────────────────
|
||
#
|
||
# Step 0 — evaluate-seeds.sh — runs before the main evolution loop.
|
||
# Scores any manifest.jsonl entries with fitness: null so the pool
|
||
# sampler has real fitness values when selecting gen_0 candidates.
|
||
#
|
||
# Steps 1-5 — evolve.sh — owns the full evolution lifecycle:
|
||
# 1. Initialise population: random sample from seed pool (--diverse-seeds).
|
||
# 2. Score candidates via revm batch evaluator (batch-eval.sh).
|
||
# 3. Tournament-select survivors; apply elitism + mutation / crossover.
|
||
# 4. Repeat for N generations; track global best.
|
||
# 5. Admit candidates above threshold (6e21 wei) into seeds/; rewrite manifest.
|
||
#
|
||
# evolve.sh always passes --diverse-seeds so gen_0 inherits pool diversity.
|
||
# --run-id is omitted to let evolve.sh auto-increment from manifest.jsonl;
# the optional run_id input is therefore not forwarded by the invocation below.
|
||
|
||
[execution]
|
||
pre_script = "tools/push3-evolution/evaluate-seeds.sh"
|
||
pre_invocation = "BASE_RPC_URL={base_rpc_url} bash tools/push3-evolution/evaluate-seeds.sh"
|
||
script = "tools/push3-evolution/evolve.sh"
|
||
invocation = "BASE_RPC_URL={base_rpc_url} bash tools/push3-evolution/evolve.sh --seed {seed} --population {population} --generations {generations} --mutation-rate {mutation_rate} --elites {elites} --output tmp/evolution --diverse-seeds"
|
||
|
||
# Exit codes propagated by evolve.sh:
|
||
# 0 evolution complete; best candidate found and pool admission attempted
|
||
# 2 infrastructure error (RPC unreachable, missing tool, revm eval failed)
|
||
|
||
# ── Steps ──────────────────────────────────────────────────────────────────────
|
||
|
||
[[steps]]
|
||
id = "evaluate-seeds"
|
||
description = """
|
||
Score manifest entries with fitness: null before the evolution loop begins.
|
||
tools/push3-evolution/evaluate-seeds.sh:
|
||
- Reads tools/push3-evolution/seeds/manifest.jsonl.
|
||
- For every entry where fitness is null, runs fitness.sh against the
|
||
corresponding .push3 file and records the numeric score.
|
||
- Rewrites manifest.jsonl atomically (temp-file rename).
|
||
- Exits 0 when nothing to do (idempotent; safe to re-run).
|
||
- Exits 2 on infrastructure error (eval stack unreachable).
|
||
Primary targets: LLM-generated seeds (origin=llm) and evolved entries whose
|
||
fitness was nulled due to scoring inflation (fitness_flags: token_value_inflation,
|
||
processExecIf_fix). Real fitness values allow --diverse-seeds to weight the
|
||
gen_0 sample correctly.
|
||
"""
|
||
script = "tools/push3-evolution/evaluate-seeds.sh"
|
||
|
||
[[steps]]
|
||
id = "evolve"
|
||
description = """
|
||
Run the outer evolutionary loop via tools/push3-evolution/evolve.sh.
|
||
|
||
Initialisation (gen_0):
|
||
A random sample of up to {population} candidates is drawn from the seed pool
|
||
(tools/push3-evolution/seeds/); any shortfall is filled by mutating {seed}.
|
||
Seeds with unevaluated fitness (null) are included in the sample with equal
|
||
probability — evaluate-seeds (step 0) should have resolved most of these.
|
||
|
||
Per-generation loop ({generations} iterations):
|
||
a. Score all candidates in a single forge test invocation via
|
||
tools/push3-evolution/revm-evaluator/batch-eval.sh (EVAL_MODE=revm).
|
||
Falls back to per-candidate fitness.sh (EVAL_MODE=anvil) if revm is
|
||
unavailable.
|
||
b. Log generation stats: min / max / mean fitness, best candidate file.
|
||
c. Tournament-select survivors (k = population / 2).
|
||
d. Elitism: carry the top {elites} candidates forward unchanged.
|
||
e. Fill remaining slots: mutate random survivors (first half) and apply
|
||
pairwise crossover (second half); fall back to copy on failure.
|
||
|
||
Output per run (tmp/evolution/run_NNN/):
|
||
generation_0.jsonl … generation_N.jsonl per-candidate fitness records
|
||
best.push3 global champion
|
||
diff.txt constant delta vs seed
|
||
evolution.log full run transcript
|
||
|
||
Pool admission (after final generation):
|
||
Candidates scoring above 6e21 wei are deduplicated by content hash and
|
||
admitted to tools/push3-evolution/seeds/, named run{NNN}_gen{G}_c{C}.push3.
|
||
manifest.jsonl is rewritten atomically; the evolved pool is capped at 100
|
||
entries by fitness rank (hand-written / LLM seeds are always pinned).
|
||
"""
|
||
script = "tools/push3-evolution/evolve.sh"
|
||
output_dir = "tmp/evolution"
|
||
|
||
[[steps]]
|
||
id = "score-attacks"
|
||
description = """
|
||
[Planned] Score the champion against known adversarial attack scenarios in
|
||
{attack_dir}/*.jsonl via onchain/script/backtesting/AttackRunner.s.sol.
|
||
For each attack file:
|
||
- Replay the op sequence against a fresh Anvil snapshot.
|
||
- Record LM total ETH before and after.
|
||
- Emit one fitness adjustment: penalise the candidate's score if the
|
||
attack succeeds (floor broken), reward if the floor holds.
|
||
Results feed back into the adversarial fitness component — candidates that
|
||
survive all known attacks rank higher in the evidence record.
|
||
Skipped when {attack_dir} is empty or AttackRunner is unavailable.
|
||
"""
|
||
status = "planned"
|
||
attack_source = "{attack_dir}/*.jsonl"
|
||
forge_script = "onchain/script/backtesting/AttackRunner.s.sol"
|
||
|
||
[[steps]]
|
||
id = "collect"
|
||
description = """
|
||
Aggregate evolve.sh outputs into evidence/evolution/{date}.json.
|
||
Reads:
|
||
- tmp/evolution/run_NNN/generation_N.jsonl per-generation fitness records
|
||
- tmp/evolution/run_NNN/best.push3 champion file
|
||
- tools/push3-evolution/seeds/manifest.jsonl admission results
|
||
Writes evidence/evolution/{date}.json conforming to the schema in
|
||
evidence/README.md ## Schema: evolution/YYYY-MM-DD.json.
|
||
Verdict: "improved" if best_fitness > best seed fitness in manifest before
|
||
the run; "no_improvement" otherwise.
|
||
"""
|
||
output = "evidence/evolution/{date}.json"
|
||
schema = "evidence/README.md"
|
||
|
||
[[steps]]
id = "cleanup"
# Housekeeping step: frees disk space under tmp/evolution/run_NNN/ once the
# collect step has aggregated the per-generation records. It declares no
# script or output key; the description below is the whole specification.
# The removal is explicitly conditional on the evidence file existing, because
# generation_*.jsonl is the only source of the per-candidate fitness records
# until collect has written them into the evidence file.
description = """
Remove intermediate per-generation candidate files that are not part of the
final results.

Precondition: run this step only after the collect step has written
evidence/evolution/{date}.json. If that file is missing or empty, skip the
removal and report the problem, so the per-generation records are not lost.

Only the following files are retained after this step:

  tmp/evolution/run_NNN/best.push3          global champion
  tmp/evolution/run_NNN/diff.txt            constant delta vs seed
  tmp/evolution/run_NNN/evolution.log       full run transcript
  tools/push3-evolution/seeds/run{NNN}_*.push3
                                            top-N newly admitted seeds
                                            (≤ elites per generation)

Files removed:
  tmp/evolution/run_NNN/generation_*.jsonl  per-candidate fitness records
                                            (already aggregated into evidence)
  tmp/evolution/run_NNN/candidate_*.push3   intermediate per-generation
                                            candidates that are not elites

Rationale: the evolution box reached 91% disk utilisation in run #1025 because
these intermediate files were never cleaned up. Aggregated fitness data is
preserved in evidence/evolution/{date}.json; the per-candidate .push3 files for
non-elite generations are not needed once the evidence file is written.
"""
|
||
|
||
[[steps]]
id = "deliver"
# Hand-off step: commits the run's result files to a dedicated branch, opens a
# PR, and only then comments on the originating issue. It declares no script
# key; the description below is the whole specification.
#
# Why staging comes before `git checkout -- .`: that command restores tracked
# paths from the index. manifest.jsonl is rewritten in place by the earlier
# steps, so discarding first would revert that rewrite. Once the result files
# are staged, the index already holds their new content and the checkout only
# touches unrelated, unstaged modifications.
#
# NOTE(review): this step names the base branch "master" while the
# [products.*] delivery strings say "PR to main" — confirm the repository's
# default branch and align the two.
# NOTE(review): manifest.jsonl presumably references the admitted
# run{NNN}_gen{G}_c{C}.push3 seed files, which are not in the staged list —
# confirm whether they must be committed alongside it.
description = """
Commit results to a branch, push, open PR, then post summary comment.
ORDERING IS MANDATORY — each sub-step must complete before the next begins.
Do NOT post to the issue before the PR URL is available.

1. CLEAN GIT STATE
   First stage ONLY the files that belong to this run (`git add <path>`):
     - evidence/evolution/{date}.json
     - tools/push3-evolution/seeds/evo_run{NNN}_champion.push3
       (only when it exists — it is created only if at least one candidate
       exceeded the admission threshold)
     - tools/push3-evolution/seeds/manifest.jsonl
   Then run `git checkout -- .` to discard any remaining working-tree
   modifications that are NOT part of the evolution results (e.g. .sol files
   left over from a prior session, scratch files). Staged result files are
   kept, because the command restores files from the index. Do NOT run it
   before staging: that would revert the updated manifest.jsonl.
   Verify `git diff --cached --check` passes before committing.

2. COMMIT TO BRANCH
   Create branch evidence/evolution-run-{run_id} from master.
   When the run_id input was not supplied, use the run number assigned by
   evolve.sh (the NNN in tmp/evolution/run_NNN/).
   Commit the staged result files with message:
     "evo: run{NNN} results — fitness={best_fitness}"
   The commit MUST include the evidence file and manifest.jsonl, plus the
   champion file whenever one was created.

3. PUSH AND CREATE PR
   Push the branch to origin.
   Open a Codeberg PR targeting master:
     Title: "evo: run{NNN} champion — fitness={best_fitness}"
     Body: generation-by-generation table (gen, best, mean, worst fitness),
           top-3 admitted candidates with fitness scores, constant diff vs
           seed (from diff.txt), link to evidence file.
   If `git push` or PR creation fails:
     a. Post an error comment to the originating issue with the failure reason
        and the path of the local evidence file.
     b. Leave the issue OPEN.
     c. Exit with a non-zero status — do NOT proceed to step 4.

4. POST SUMMARY COMMENT (only after PR URL is confirmed)
   Post a comment to the originating issue containing:
     - Verdict (improved / no_improvement).
     - Best fitness achieved and which generation it was found in.
     - Admission count: N candidates added to seed pool.
     - Link to the champion PR (required — do not post without it).
     - Link to evidence file committed in the PR.
     - If no_improvement: best fitness achieved and seed pool size.
   Do NOT close the issue in this step; closing is the orchestrator's
   responsibility once the PR is merged.
"""
|
||
|
||
# ── Products ───────────────────────────────────────────────────────────────────
|
||
|
||
[products.evidence_file]
|
||
path = "evidence/evolution/{date}.json"
|
||
delivery = "PR to main (same PR as champion_files, on branch evidence/evolution-run-{run_id})"
|
||
schema = "evidence/README.md" # see ## Schema: evolution/YYYY-MM-DD.json
|
||
|
||
[products.champion_files]
|
||
path = "tools/push3-evolution/seeds/evo_run{NNN}_champion.push3"
|
||
# {NNN} is the auto-incremented run ID assigned by evolve.sh at runtime.
|
||
delivery = "PR to main"
|
||
note = "Only created when at least one candidate exceeds the admission threshold (6e21 wei)."
|
||
|
||
[products.manifest]
|
||
path = "tools/push3-evolution/seeds/manifest.jsonl"
|
||
delivery = "PR to main (same PR as champion_files)"
|
||
note = "Updated with newly admitted entries and fitness scores from evaluate-seeds."
|
||
|
||
[products.issue_comment]
|
||
delivery = "post to originating issue AFTER PR is created and URL is confirmed"
|
||
content = "verdict (improved/no_improvement), best fitness, generation found, admission count, link to champion PR (mandatory), link to evidence file"
|
||
on_pr_failure = "post error comment with failure reason and local evidence path; leave issue OPEN; do not close"
|
||
on_run_failure = "include best fitness achieved, last generation completed, full log available in tmp/evolution/run_NNN/evolution.log; do not close issue"
|
||
ordering_note = "The comment MUST NOT be posted before the PR URL exists. Closing the issue is the orchestrator's responsibility after PR merge, not this formula's."
|
||
|
||
# ── Resources ──────────────────────────────────────────────────────────────────
|
||
|
||
[resources]
|
||
profile = "heavy"
|
||
compute = "CPU + RAM intensive — transpile + compile + deploy + revm eval per candidate"
|
||
rpc = "Base network RPC (BASE_RPC_URL) for revm fork; or Anvil (EVAL_MODE=anvil)"
|
||
concurrency = "exclusive — revm evaluator and optional Anvil share port 8545 with run-holdout and run-red-team"
|
||
|
||
# ── Notes ──────────────────────────────────────────────────────────────────────
|
||
|
||
[notes]
|
||
no_uups_deployment = """
|
||
The evolution pipeline produces Push3 candidate files only — no UUPS proxy
|
||
deployment step is wired. Candidates are scored in simulation (revm or Anvil)
|
||
and admitted to the seed pool for future runs. Deployment to a live chain is
|
||
out of scope until the champion passes holdout and red-team gates.
|
||
"""
|
||
|
||
eval_mode = """
|
||
Default EVAL_MODE is revm (batch-eval.sh): all candidates in a generation are
|
||
scored in a single forge test invocation against a Base fork, 10-100× faster
|
||
than per-candidate Anvil. Set EVAL_MODE=anvil to fall back to fitness.sh
|
||
(slower, but does not require BASE_RPC_URL if Anvil is already running).
|
||
Gas limit: revm evaluator runs at ~25 candidates × 100 trades per batch.
|
||
For larger populations, increase the batch budget in batch-eval.sh.
|
||
"""
|
||
|
||
adversarial_fitness = """
|
||
Adversarial fitness against attack scenarios ({attack_dir}/*.jsonl) is planned
|
||
but not yet implemented (score-attacks step is status=planned). Currently the
|
||
only fitness signal is the revm/Anvil metric from batch-eval.sh / fitness.sh.
|
||
When implemented, attack survival will penalise candidates whose floor breaks
|
||
under known attack patterns, biasing the population toward safer programs.
|
||
"""
|
||
|
||
fee_fitness = """
|
||
Fee optimization against in-market pool data is planned as a second fitness
|
||
dimension. Not yet implemented; tracked as a follow-up issue.
|
||
"""
|
||
|
||
pool_cap = """
|
||
The evolved seed pool is capped at 100 entries by fitness rank. Hand-written
|
||
(origin=hand-written) and LLM-generated (origin=llm) seeds are always pinned
|
||
regardless of fitness. Evolved entries below the pool floor are evicted when
|
||
new higher-scoring candidates are admitted. Raw fitness values are only
|
||
comparable within the same evaluation run; entries with fitness_flags
|
||
(token_value_inflation, processExecIf_fix) are ranked as fitness=0 for
|
||
admission and eviction purposes.
|
||
"""
|