harb/formulas/run-evolution.toml

345 lines
16 KiB
TOML
Raw Normal View History

# formulas/run-evolution.toml
#
# Push3 optimizer evolution pipeline — evaluate seed pool, evolve a population
# of candidates, admit survivors back to the pool, deliver champions via PR.
#
# Type: act. Produces git artifacts (new .push3 champions, updated
# manifest.jsonl and the evidence file, delivered together in one PR to main).
#
# Depends on: #973 (evidence/evolution/ directory structure)
[formula]
# Identity of this formula.
id = "run-evolution"
name = "Push3 Optimizer Evolution"
# Formula kind:
#   "sense" → read-only, produces metrics only
#   "act"   → produces git artifacts (cf. run-red-team, run-evolution)
type = "act"
# #973 provides the evidence/evolution/ directory structure.
depends_on = [973]
description = "Evaluate seed pool, evolve Push3 optimizer population, admit survivors, deliver champions via PR."
# ── Inputs ─────────────────────────────────────────────────────────────────────
[inputs.seed]
# Optional; substituted into the evolve.sh invocation as --seed {seed}.
description = "Starting seed .push3 file (passed as --seed to evolve.sh). Serves as the fallback mutation source when the pool does not fill the full population."
type = "string"
default = "tools/push3-evolution/seeds/optimizer_v3.push3"
required = false
[inputs.population]
# Optional; substituted into the evolve.sh invocation as --population {population}.
description = "Number of candidates per generation (--population)."
type = "integer"
default = 10
required = false
[inputs.generations]
# Optional; substituted into the evolve.sh invocation as --generations {generations}.
description = "Number of evolution generations to run (--generations)."
type = "integer"
default = 5
required = false
[inputs.mutation_rate]
# Optional; substituted into the evolve.sh invocation as --mutation-rate {mutation_rate}.
description = "Mutations applied per candidate per generation (--mutation-rate)."
type = "integer"
default = 2
required = false
[inputs.elites]
# Optional; substituted into the evolve.sh invocation as --elites {elites}.
description = "Top-scoring candidates carried forward unchanged each generation (--elites)."
type = "integer"
default = 2
required = false
[inputs.base_rpc_url]
# The only mandatory input: no default endpoint is declared.
required = true
type = "string"
description = """
Base network RPC endpoint forwarded as BASE_RPC_URL to both evaluate-seeds.sh
and evolve.sh. Required for the revm evaluator (default EVAL_MODE).
Example: https://mainnet.base.org or a fork URL from a running Anvil instance.
"""
[inputs.run_id]
# Optional and without a default: when absent, evolve.sh picks the next run ID.
# NOTE(review): [execution].invocation does not pass --run-id, so a value
# supplied here is not forwarded to evolve.sh as written — confirm the wiring.
required = false
type = "integer"
description = """
Override the run ID used when naming candidates admitted to the seed pool
(e.g. run009_gen2_c005.push3). Auto-incremented from the highest existing
run in manifest.jsonl when omitted (recommended).
"""
[inputs.attack_dir]
# Forward spec only: status = "planned" and the value is not passed to
# evolve.sh by [execution].invocation.
type = "string"
required = false
default = "onchain/script/backtesting/attacks"
description = """
Directory of .jsonl adversarial attack scenarios. Intended as an adversarial
fitness input — candidates scored against these patterns in addition to the
revm fitness metric. Not yet forwarded to evolve.sh; documented here as a
forward spec.
"""
status = "planned"
# ── Execution ──────────────────────────────────────────────────────────────────
#
# Step 0 — evaluate-seeds.sh — runs before the main evolution loop.
# Scores any manifest.jsonl entries with fitness: null so the pool
# sampler has real fitness values when selecting gen_0 candidates.
#
# Steps 1-5 — evolve.sh — owns the full evolution lifecycle:
# 1. Initialise population: random sample from seed pool (--diverse-seeds).
# 2. Score candidates via revm batch evaluator (batch-eval.sh).
# 3. Tournament-select survivors; apply elitism + mutation / crossover.
# 4. Repeat for N generations; track global best.
# 5. Admit candidates above threshold (6e21 wei) into seeds/; rewrite manifest.
#
# evolve.sh always passes --diverse-seeds so gen_0 inherits pool diversity.
# --run-id is omitted to let evolve.sh auto-increment from manifest.jsonl.
[execution]
# Pre-phase: score manifest entries that still have fitness: null.
pre_script = "tools/push3-evolution/evaluate-seeds.sh"
pre_invocation = "BASE_RPC_URL={base_rpc_url} bash tools/push3-evolution/evaluate-seeds.sh"
# Main phase: the evolution run. The command is folded with line-ending
# backslashes for readability; it parses to a single line with one space
# between arguments.
script = "tools/push3-evolution/evolve.sh"
invocation = """\
BASE_RPC_URL={base_rpc_url} bash tools/push3-evolution/evolve.sh \
--seed {seed} \
--population {population} \
--generations {generations} \
--mutation-rate {mutation_rate} \
--elites {elites} \
--output tmp/evolution \
--diverse-seeds\
"""
# Exit codes propagated by evolve.sh:
#   0  evolution complete; best candidate found and pool admission attempted
#   2  infrastructure error (RPC unreachable, missing tool, revm eval failed)
# ── Steps ──────────────────────────────────────────────────────────────────────
[[steps]]
# Step 0: runs before the evolution loop so the pool sampler sees real scores.
id = "evaluate-seeds"
script = "tools/push3-evolution/evaluate-seeds.sh"
description = """
Score manifest entries with fitness: null before the evolution loop begins.
tools/push3-evolution/evaluate-seeds.sh:
- Reads tools/push3-evolution/seeds/manifest.jsonl.
- For every entry where fitness is null, runs fitness.sh against the
corresponding .push3 file and records the numeric score.
- Rewrites manifest.jsonl atomically (temp-file rename).
- Exits 0 when nothing to do (idempotent; safe to re-run).
- Exits 2 on infrastructure error (eval stack unreachable).
Primary targets: LLM-generated seeds (origin=llm) and evolved entries whose
fitness was nulled due to scoring inflation (fitness_flags: token_value_inflation,
processExecIf_fix). Real fitness values allow --diverse-seeds to weight the
gen_0 sample correctly.
"""
[[steps]]
# Main evolution step; evolve.sh writes its per-run output under output_dir.
id = "evolve"
description = """
Run the outer evolutionary loop via tools/push3-evolution/evolve.sh.
Initialisation (gen_0):
A random sample of up to {population} candidates is drawn from the seed pool
(tools/push3-evolution/seeds/); any shortfall is filled by mutating {seed}.
Seeds with unevaluated fitness (null) are included in the sample with equal
probability — evaluate-seeds (step 0) should have resolved most of these.
Per-generation loop ({generations} iterations):
a. Score all candidates in a single forge test invocation via
tools/push3-evolution/revm-evaluator/batch-eval.sh (EVAL_MODE=revm).
Falls back to per-candidate fitness.sh (EVAL_MODE=anvil) if revm is
unavailable.
b. Log generation stats: min / max / mean fitness, best candidate file.
c. Tournament-select survivors (k = population / 2).
d. Elitism: carry the top {elites} candidates forward unchanged.
e. Fill remaining slots: mutate random survivors (first half) and apply
pairwise crossover (second half); fall back to copy on failure.
Output per run (tmp/evolution/run_NNN/):
generation_0.jsonl … generation_N.jsonl per-candidate fitness records
best.push3 global champion
diff.txt constant delta vs seed
evolution.log full run transcript
Pool admission (after final generation):
Candidates scoring above 6e21 wei are deduplicated by content hash and
admitted to tools/push3-evolution/seeds/, named run{NNN}_gen{G}_c{C}.push3.
manifest.jsonl is rewritten atomically; the evolved pool is capped at 100
entries by fitness rank (hand-written / LLM seeds are always pinned).
"""
script = "tools/push3-evolution/evolve.sh"
output_dir = "tmp/evolution"
[[steps]]
# Not implemented yet (status = "planned"); kept as a forward spec.
id = "score-attacks"
description = """
[Planned] Score the champion against known adversarial attack scenarios in
{attack_dir}/*.jsonl via onchain/script/backtesting/AttackRunner.s.sol.
For each attack file:
- Replay the op sequence against a fresh Anvil snapshot.
- Record LM total ETH before and after.
- Emit one fitness adjustment: penalise the candidate's score if the
attack succeeds (floor broken), reward if the floor holds.
Results feed back into the adversarial fitness component — candidates that
survive all known attacks rank higher in the evidence record.
Skipped when {attack_dir} is empty or AttackRunner is unavailable.
"""
status = "planned"
attack_source = "{attack_dir}/*.jsonl"
forge_script = "onchain/script/backtesting/AttackRunner.s.sol"
[[steps]]
# Aggregation step: turns evolve.sh outputs into the dated evidence file.
id = "collect"
output = "evidence/evolution/{date}.json"
schema = "evidence/README.md"
description = """
Aggregate evolve.sh outputs into evidence/evolution/{date}.json.
Reads:
- tmp/evolution/run_NNN/generation_N.jsonl per-generation fitness records
- tmp/evolution/run_NNN/best.push3 champion file
- tools/push3-evolution/seeds/manifest.jsonl admission results
Writes evidence/evolution/{date}.json conforming to the schema in
evidence/README.md ## Schema: evolution/YYYY-MM-DD.json.
Verdict: "improved" if best_fitness > best seed fitness in manifest before
the run; "no_improvement" otherwise.
"""
[[steps]]
# Disk-hygiene step: deletes per-generation intermediates under
# tmp/evolution/run_NNN/ once their data is in the evidence file. The
# description treats generation_*.jsonl as already aggregated, so this step
# is expected to run after "collect".
# NOTE(review): the retained-files list below contains "( elites per
# generation)", which looks like it lost a leading symbol (possibly "≤") —
# confirm the intended wording before relying on it.
id = "cleanup"
description = """
Remove intermediate per-generation candidate files that are not part of the
final results. Only the following files are retained after this step:
tmp/evolution/run_NNN/best.push3 global champion
tmp/evolution/run_NNN/diff.txt constant delta vs seed
tmp/evolution/run_NNN/evolution.log full run transcript
tools/push3-evolution/seeds/run{NNN}_*.push3
top-N newly admitted seeds
( elites per generation)
Files removed:
tmp/evolution/run_NNN/generation_*.jsonl per-candidate fitness records
(already aggregated into evidence)
tmp/evolution/run_NNN/candidate_*.push3 intermediate per-generation
candidates that are not elites
Rationale: the evolution box reached 91% disk utilisation in run #1025 because
these intermediate files were never cleaned up. Aggregated fitness data is
preserved in evidence/evolution/{date}.json; the per-candidate .push3 files for
non-elite generations are not needed once the evidence file is written.
"""
[[steps]]
# Delivery step: stage -> commit on a branch -> push + PR -> issue comment.
# Result files are staged BEFORE `git checkout -- .` because that command
# restores tracked files from the index; run first, it would revert the
# manifest.jsonl update produced by this run.
# NOTE(review): this step branches from / targets `master`, while the
# [products.*] delivery fields say "PR to main" — confirm the default branch.
# NOTE(review): the champion path below (evo_run{NNN}_champion.push3) differs
# from the run{NNN}_gen{G}_c{C}.push3 naming in the evolve step — confirm.
id = "deliver"
description = """
Commit results to a branch, push, open PR, then post summary comment.
ORDERING IS MANDATORY — each sub-step must complete before the next begins.
Do NOT post to the issue before the PR URL is available.
1. CLEAN GIT STATE
First stage ONLY the files that belong to this run:
- evidence/evolution/{date}.json
- tools/push3-evolution/seeds/manifest.jsonl
- tools/push3-evolution/seeds/evo_run{NNN}_champion.push3
(only when at least one candidate was admitted; otherwise it does not exist)
Only AFTER staging, run `git checkout -- .` to discard the remaining unstaged
working-tree modifications that are NOT part of the evolution results
(e.g. .sol files left over from a prior session). Running it before staging
would revert this run's manifest.jsonl update. Untracked scratch files are
not touched by `git checkout`; never stage them.
Verify `git diff --cached --check` passes before committing.
2. COMMIT TO BRANCH
Create branch evidence/evolution-run-{run_id} from master.
Commit the staged result files with message:
"evo: run{NNN} results — fitness={best_fitness}"
The commit MUST include the evidence file and manifest.jsonl, plus the
champion file whenever one was produced.
3. PUSH AND CREATE PR
Push the branch to origin.
Open a Codeberg PR targeting master:
Title: "evo: run{NNN} champion — fitness={best_fitness}"
Body: generation-by-generation table (gen, best, mean, worst fitness),
top-3 admitted candidates with fitness scores, constant diff vs
seed (from diff.txt), link to evidence file.
If `git push` or PR creation fails:
a. Post an error comment to the originating issue with the failure reason
and the path of the local evidence file.
b. Leave the issue OPEN.
c. Exit with a non-zero status — do NOT proceed to step 4.
4. POST SUMMARY COMMENT (only after PR URL is confirmed)
Post a comment to the originating issue containing:
- Verdict (improved / no_improvement).
- Best fitness achieved and which generation it was found in.
- Admission count: N candidates added to seed pool.
- Link to the champion PR (required — do not post without it).
- Link to evidence file committed in the PR.
- If no_improvement: best fitness achieved and seed pool size.
Do NOT close the issue in this step; closing is the orchestrator's
responsibility once the PR is merged.
"""
# ── Products ───────────────────────────────────────────────────────────────────
[products.evidence_file]
# Format is defined in evidence/README.md under "## Schema: evolution/YYYY-MM-DD.json".
schema = "evidence/README.md"
path = "evidence/evolution/{date}.json"
delivery = "PR to main (same PR as champion_files, on branch evidence/evolution-run-{run_id})"
[products.champion_files]
# {NNN} in the path is the run ID that evolve.sh auto-increments at runtime.
# NOTE(review): the evolve step names admitted seeds run{NNN}_gen{G}_c{C}.push3,
# which does not match this path pattern — confirm which naming is current.
path = "tools/push3-evolution/seeds/evo_run{NNN}_champion.push3"
note = "Only created when at least one candidate exceeds the admission threshold (6e21 wei)."
delivery = "PR to main"
[products.manifest]
# Seed-pool index; ships in the same PR as the champion files.
path = "tools/push3-evolution/seeds/manifest.jsonl"
note = "Updated with newly admitted entries and fitness scores from evaluate-seeds."
delivery = "PR to main (same PR as champion_files)"
[products.issue_comment]
# Summary comment on the originating issue; must never precede the PR URL.
ordering_note = "The comment MUST NOT be posted before the PR URL exists. Closing the issue is the orchestrator's responsibility after PR merge, not this formula's."
delivery = "post to originating issue AFTER PR is created and URL is confirmed"
content = "verdict (improved/no_improvement), best fitness, generation found, admission count, link to champion PR (mandatory), link to evidence file"
# Failure paths: in both cases the issue stays open.
on_pr_failure = "post error comment with failure reason and local evidence path; leave issue OPEN; do not close"
on_run_failure = "include best fitness achieved, last generation completed, full log available in tmp/evolution/run_NNN/evolution.log; do not close issue"
# ── Resources ──────────────────────────────────────────────────────────────────
[resources]
# Scheduling hints: a heavy job that needs exclusive use of port 8545.
profile = "heavy"
concurrency = "exclusive — revm evaluator and optional Anvil share port 8545 with run-holdout and run-red-team"
compute = "CPU + RAM intensive — transpile + compile + deploy + revm eval per candidate"
rpc = "Base network RPC (BASE_RPC_URL) for revm fork; or Anvil (EVAL_MODE=anvil)"
# ── Notes ──────────────────────────────────────────────────────────────────────
[notes]
# Free-form background for operators: scope limits, evaluator modes, planned
# fitness dimensions and seed-pool retention rules.
no_uups_deployment = """
The evolution pipeline produces Push3 candidate files only — no UUPS proxy
deployment step is wired. Candidates are scored in simulation (revm or Anvil)
and admitted to the seed pool for future runs. Deployment to a live chain is
out of scope until the champion passes holdout and red-team gates.
"""
eval_mode = """
Default EVAL_MODE is revm (batch-eval.sh): all candidates in a generation are
scored in a single forge test invocation against a Base fork, 10-100× faster
than per-candidate Anvil. Set EVAL_MODE=anvil to fall back to fitness.sh
(slower, but does not require BASE_RPC_URL if Anvil is already running).
Gas limit: revm evaluator runs at ~25 candidates × 100 trades per batch.
For larger populations, increase the batch budget in batch-eval.sh.
"""
adversarial_fitness = """
Adversarial fitness against attack scenarios ({attack_dir}/*.jsonl) is planned
but not yet implemented (score-attacks step is status=planned). Currently the
only fitness signal is the revm/Anvil metric from batch-eval.sh / fitness.sh.
When implemented, attack survival will penalise candidates whose floor breaks
under known attack patterns, biasing the population toward safer programs.
"""
fee_fitness = """
Fee optimization against in-market pool data is planned as a second fitness
dimension. Not yet implemented; tracked as a follow-up issue.
"""
pool_cap = """
The evolved seed pool is capped at 100 entries by fitness rank. Hand-written
(origin=hand-written) and LLM-generated (origin=llm) seeds are always pinned
regardless of fitness. Evolved entries below the pool floor are evicted when
new higher-scoring candidates are admitted. Raw fitness values are only
comparable within the same evaluation run; entries with fitness_flags
(token_value_inflation, processExecIf_fix) are ranked as fitness=0 for
admission and eviction purposes.
"""