# Path in repository: harb/formulas/run-holdout.toml
#
# (Web-viewer residue, not part of the formula: "139 lines", "5.9 KiB",
# "TOML", "Raw Normal View History".)

# formulas/run-holdout.toml
#
# Holdout quality gate — deploy a PR branch, run blind holdout scenarios,
# report pass/fail.
#
# Type: sense-only. Produces metrics and a gate decision.
# Does NOT commit code, open PRs, or modify contracts.
#
# Depends on: #973 (evidence/holdout/ directory structure)
# Identity and classification of this formula.
[formula]
id = "run-holdout"
name = "Holdout Quality Gate"
description = "Deploy PR branch, run blind holdout scenarios, report pass/fail."
# Formula kind. The two values described in this file:
#   "sense" → read-only, produces metrics only
#   "act"   → produces git artifacts (cf. run-evolution, run-red-team)
# NOTE(review): the "deliver" step commits an evidence file to main, which is
# itself a git artifact — confirm that is permitted for a "sense" formula.
type = "sense"
# ── Inputs ─────────────────────────────────────────────────────────────────────
# The pull request under evaluation. Mandatory. It appears as the {pr_number}
# placeholder in the invocation, the docker compose project name, the evidence
# file path and the issue-comment target.
[inputs.pr_number]
type = "integer"
required = true
description = "PR number to evaluate"
# Location of the private holdout-scenario repository. Optional
# (required = false); a default SSH URL is supplied.
[inputs.holdout_repo]
type = "string"
required = false
default = "ssh://git@codeberg.org/johba/harb-holdout-scenarios.git"
# Human-readable summary of the access model: the repo is fetched only at
# evaluation time and is never part of the harb tree.
description = """
Holdout scenarios repo. Dev-agent has no read access — cloned at runtime
by evaluate.sh into the ephemeral worktree, never checked in to harb.
"""
# ── Execution ──────────────────────────────────────────────────────────────────
#
# The orchestrator invokes evaluate.sh, which owns the full lifecycle:
# checkout → build → boot stack → clone holdout repo → playwright → teardown.
[execution]
# Path of the evaluator script. The same path is repeated inside `invocation`,
# so both keys have to change together if the script moves.
script = "scripts/harb-evaluator/evaluate.sh"
# Command-line template; {pr_number} is presumably filled from inputs.pr_number.
invocation = "bash scripts/harb-evaluator/evaluate.sh {pr_number}"
# Exit codes propagated by evaluate.sh:
# 0 gate passed (≥90% of scenarios achieved 2/3 majority)
# 1 gate failed (fewer than 90% did; with 8 scenarios in total, a single
#   scenario missing the 2/3 threshold is enough)
# 2 infra error (stack failed to start, missing dependency, etc.)
# ── Steps ──────────────────────────────────────────────────────────────────────
# Each [[steps]] entry documents one phase of the run. The steps carry no
# separate command field; per the execution notes in this file, evaluate.sh
# performs the lifecycle up to and including teardown.
[[steps]]
id = "boot-stack"
# Phase: prepare an isolated worktree for the PR and bring the docker stack up.
# The compose project name embeds the PR number (harb-eval-{pr_number}).
description = """
Spin up full docker stack from PR branch.
evaluate.sh creates an isolated git worktree, builds kraiken-lib,
installs npm deps, installs Playwright browser binaries, then runs:
docker compose -p harb-eval-{pr_number} up -d
Waits for anvil (healthy), bootstrap (exited 0), ponder (healthy + /ready).
"""
# Phase: fetch the blind scenarios into the worktree and point the Playwright
# config at them via HOLDOUT_SCENARIOS_DIR.
# NOTE(review): presumably the URL cloned here is inputs.holdout_repo —
# confirm evaluate.sh reads that input.
[[steps]]
id = "clone-holdout"
description = """
Clone harb-holdout-scenarios into .holdout-scenarios/ inside the worktree.
Sets HOLDOUT_SCENARIOS_DIR for holdout.config.ts.
The dev-agent never sees this repo; the wall is enforced by separate
repository access control on Codeberg.
"""
# Phase: execute the holdout specs. The keys after `description` restate its
# counts as data: 4 surfaces × 2 scenarios per surface = 8 scenarios in total.
# The same totals (8 scenarios, 3 runs, 2 passes) are repeated under [gate];
# keep the two places in sync.
[[steps]]
id = "run-scenarios"
description = """
Run 8 Playwright specs via holdout.config.ts (workers=1, headless chromium).
4 surfaces: contracts, graphql, landing, webapp.
Each scenario is executed up to 3 times; 2/3 runs must pass.
"""
surfaces = ["contracts", "graphql", "landing", "webapp"]
scenarios_per_surface = 2
scenarios_total = 8
runs_per_scenario = 3
# 2-of-3 majority required for a scenario to count as passed
pass_per_scenario = 2
# Phase: cleanup. Takes down the per-PR compose project and removes the git
# worktree created for the run.
[[steps]]
id = "teardown"
description = """
docker compose -p harb-eval-{pr_number} down -v --remove-orphans
git worktree remove --force {worktree_dir}
Always runs — cleanup is registered as a shell trap in evaluate.sh.
"""
# Phase: publish results — an evidence file plus a verdict comment; the same
# two outputs are declared under [products.*].
# NOTE(review): this step is listed after teardown, which force-removes the
# worktree; confirm test-results/holdout-reports/ lives outside the worktree
# or is collected before cleanup.
[[steps]]
id = "deliver"
description = """
Collect per-scenario results from test-results/holdout-reports/.
Write evidence/holdout/{date}-pr{pr_number}.json and commit to main.
Post gate verdict to issue #{pr_number}.
On failure: include one-line reason per failed scenario.
Scenario text is never exposed to the dev-agent.
"""
# ── Gate ───────────────────────────────────────────────────────────────────────
# Pass criteria for the whole run. The scenario and run counts repeat the
# values given on the "run-scenarios" step.
[gate]
# Size of the scenario set the percentage is taken over.
scenarios_total = 8
# Share of scenarios that must pass (≥90%). 8 * 0.9 = 7.2, and 7 of 8 is only
# 87.5%, so in practice all 8 scenarios must pass.
pass_threshold_pct = 90
# 2-of-3 majority per scenario.
per_scenario_runs = 3
per_scenario_pass = 2
# ── Products ───────────────────────────────────────────────────────────────────
# Output 1: the evidence record for this run, keyed by date and PR number.
[products.evidence_file]
path = "evidence/holdout/{date}-pr{pr_number}.json"
# see §Schema: holdout/YYYY-MM-DD-prNNN.json
schema = "evidence/README.md"
delivery = "commit to main"

# Output 2: the verdict comment. On failure it carries only one-line reasons,
# never the scenario text.
[products.issue_comment]
delivery = "post to issue #{pr_number}"
content = "gate verdict (pass/fail), scenarios_passed/scenarios_total, link to evidence file"
on_failure = "one-line failure reason per failing scenario; scenario text never revealed"
# ── Resources ──────────────────────────────────────────────────────────────────
# Host footprint of one evaluation run.
[resources]
profile = "heavy"
# One run per host at a time: the ports listed below are bound by the stack.
concurrency = "exclusive — port bindings prevent parallel runs on the same host"
ports = ["8545", "42069", "5173", "8081", "5100"]
# anvil, bootstrap, ponder, webapp, (caddy if needed)
containers = "5+"
browser = "chromium (Playwright)"
# ── Notes ──────────────────────────────────────────────────────────────────────
# Free-form rationale. `wall` records why the dev-agent cannot see the holdout
# scenarios and where that separation is enforced.
[notes]
wall = """
The holdout-specs repo (harb-holdout-scenarios) is intentionally inaccessible
to the dev-agent. The agent receives only pass/fail and one-line failure reasons —
never the scenario text. This is enforced by Codeberg repo permissions, not
by runtime filtering.
"""