diff --git a/formulas/run-holdout.toml b/formulas/run-holdout.toml
new file mode 100644
index 0000000..eac2875
--- /dev/null
+++ b/formulas/run-holdout.toml
@@ -0,0 +1,138 @@
+# formulas/run-holdout.toml
+#
+# Holdout quality gate — deploy a PR branch, run blind holdout scenarios,
+# report pass/fail.
+#
+# Type: sense-only. Produces metrics and a gate decision.
+# Does NOT commit code, open PRs, or modify contracts (it does commit an evidence JSON to main; see the "deliver" step).
+#
+# Depends on: #973 (evidence/holdout/ directory structure)
+
+[formula]
+id = "run-holdout"
+name = "Holdout Quality Gate"
+description = "Deploy PR branch, run blind holdout scenarios, report pass/fail."
+type = "sense"
+# "sense" → read-only, produces metrics only (NOTE(review): the evidence commit to main is still a write — confirm "sense" allows it)
+# "act" → produces git artifacts (cf. run-evolution, run-red-team)
+
+# ── Inputs ─────────────────────────────────────────────────────────────────────
+
+[inputs.pr_number] # referenced as {pr_number} in the invocation, step text and product paths
+type = "integer"
+required = true
+description = "PR number to evaluate"
+
+[inputs.holdout_repo]
+type = "string"
+required = false
+default = "ssh://git@codeberg.org/johba/harb-holdout-scenarios.git"
+description = """
+Holdout scenarios repo. Dev-agent has no read access — cloned at runtime
+by evaluate.sh into the ephemeral worktree, never checked in to harb.
+"""
+
+# ── Execution ──────────────────────────────────────────────────────────────────
+# The orchestrator invokes evaluate.sh, which owns the full lifecycle:
+# checkout → build → boot stack → clone holdout repo → playwright → teardown.
+# NOTE(review): only {pr_number} is passed below; how evaluate.sh receives inputs.holdout_repo is not shown — confirm.
+
+[execution]
+script = "scripts/harb-evaluator/evaluate.sh"
+invocation = "bash scripts/harb-evaluator/evaluate.sh {pr_number}"
+
+# Exit codes propagated by evaluate.sh:
+#   0  gate passed (≥90% of scenarios achieved 2/3 majority)
+#   1  gate failed (<90% of scenarios reached the 2/3 threshold; with 8 scenarios one failure is enough)
+#   2  infra error (stack failed to start, missing dependency, etc.)
+
+# ── Steps ──────────────────────────────────────────────────────────────────────
+
+[[steps]]
+id = "boot-stack"
+description = """
+Spin up full docker stack from PR branch.
+evaluate.sh creates an isolated git worktree, builds kraiken-lib,
+installs npm deps, installs Playwright browser binaries, then runs:
+ docker compose -p harb-eval-{pr_number} up -d
+Waits for anvil (healthy), bootstrap (exited 0), ponder (healthy + /ready).
+"""
+
+[[steps]]
+id = "clone-holdout"
+description = """
+Clone harb-holdout-scenarios into .holdout-scenarios/ inside the worktree.
+Sets HOLDOUT_SCENARIOS_DIR for holdout.config.ts.
+The dev-agent never sees this repo; the wall is enforced by separate
+repository access control on Codeberg.
+"""
+
+[[steps]]
+id = "run-scenarios"
+description = """
+Run 8 Playwright specs via holdout.config.ts (workers=1, headless chromium).
+4 surfaces: contracts, graphql, landing, webapp.
+Each scenario is executed up to 3 times; 2/3 runs must pass.
+"""
+surfaces = ["contracts", "graphql", "landing", "webapp"]
+scenarios_per_surface = 2 # 4 surfaces × 2 = scenarios_total
+scenarios_total = 8 # duplicated in [gate].scenarios_total — keep the two in sync
+runs_per_scenario = 3 # same value as [gate].per_scenario_runs (note the different key name)
+pass_per_scenario = 2 # 2-of-3 majority required for a scenario to count as passed; same value as [gate].per_scenario_pass
+
+[[steps]]
+id = "teardown" # NOTE(review): {worktree_dir} below is not a declared input — presumably set by evaluate.sh
+description = """
+docker compose -p harb-eval-{pr_number} down -v --remove-orphans
+git worktree remove --force {worktree_dir}
+Always runs — cleanup is registered as a shell trap in evaluate.sh.
+"""
+
+[[steps]] # NOTE(review): not in the evaluate.sh lifecycle listed under Execution (which ends at teardown) — confirm who performs this step
+id = "deliver" # NOTE(review): listed after teardown, which removes the worktree — confirm test-results/holdout-reports/ is still readable
+description = """
+Collect per-scenario results from test-results/holdout-reports/.
+Write evidence/holdout/{date}-pr{pr_number}.json and commit to main.
+Post gate verdict to issue #{pr_number}.
+On failure: include one-line reason per failed scenario.
+Scenario text is never exposed to the dev-agent.
+"""
+
+# ── Gate ───────────────────────────────────────────────────────────────────────
+
+[gate]
+pass_threshold_pct = 90 # ≥90% of scenarios must pass
+scenarios_total = 8 # 8 × 0.9 = 7.2 → rounds up to 8: all 8 must pass (7/8 = 87.5% < 90%)
+per_scenario_runs = 3 # same value as runs_per_scenario in the run-scenarios step
+per_scenario_pass = 2 # 2-of-3 majority per scenario
+
+# ── Products ───────────────────────────────────────────────────────────────────
+
+[products.evidence_file]
+path = "evidence/holdout/{date}-pr{pr_number}.json"
+delivery = "commit to main" # data-only commit; the file header states that no code is committed
+schema = "evidence/README.md" # see §Schema: holdout/YYYY-MM-DD-prNNN.json
+
+[products.issue_comment]
+delivery = "post to issue #{pr_number}"
+content = "gate verdict (pass/fail), scenarios_passed/scenarios_total, link to evidence file"
+on_failure = "one-line failure reason per failing scenario; scenario text never revealed"
+
+# ── Resources ──────────────────────────────────────────────────────────────────
+
+[resources]
+profile = "heavy"
+containers = "5+" # anvil, bootstrap, ponder, webapp, (caddy if needed) — NOTE(review): only 4 (+ optional caddy) are named; confirm the count
+browser = "chromium (Playwright)"
+ports = ["8545", "42069", "5173", "8081", "5100"] # host port bindings; the reason runs are exclusive (see concurrency)
+concurrency = "exclusive — port bindings prevent parallel runs on the same host"
+
+# ── Notes ──────────────────────────────────────────────────────────────────────
+
+[notes]
+wall = """
+The holdout-specs repo (harb-holdout-scenarios) is intentionally inaccessible
+to the dev-agent. The agent receives only pass/fail and one-line failure reasons
+— never the scenario text. This is enforced by Codeberg repo permissions, not
+by runtime filtering.
+"""