# formulas/run-holdout.toml
#
# Holdout quality gate — deploy a PR branch, run blind holdout scenarios,
# report pass/fail.
#
# Type: sense-only. Produces metrics and a gate decision.
# Does NOT commit code, open PRs, or modify contracts.
# NOTE(review): the "deliver" step below commits an evidence JSON file to
# main. That is the only git write described in this formula — confirm it is
# within the "sense" contract.
#
# Depends on: #973 (evidence/holdout/ directory structure)

[formula]
id = "run-holdout"
name = "Holdout Quality Gate"
description = "Deploy PR branch, run blind holdout scenarios, report pass/fail."
type = "sense"
# "sense" → read-only, produces metrics only
# "act"   → produces git artifacts (cf. run-evolution, run-red-team)

# ── Inputs ─────────────────────────────────────────────────────────────────────

[inputs.pr_number]
type = "integer"
required = true
description = "PR number to evaluate"

# NOTE(review): the invocation string in [execution] only passes {pr_number};
# how (or whether) this input reaches evaluate.sh is not visible here — confirm.
[inputs.holdout_repo]
type = "string"
required = false
default = "ssh://git@codeberg.org/johba/harb-holdout-scenarios.git"
description = """
Holdout scenarios repo. Dev-agent has no read access — cloned at runtime
by evaluate.sh into the ephemeral worktree, never checked in to harb.
"""

# ── Execution ──────────────────────────────────────────────────────────────────
#
# The orchestrator invokes evaluate.sh, which owns the full lifecycle:
#   checkout → build → boot stack → clone holdout repo → playwright → teardown.

[execution]
script = "scripts/harb-evaluator/evaluate.sh"
invocation = "bash scripts/harb-evaluator/evaluate.sh {pr_number}"

# Exit codes propagated by evaluate.sh:
#   0  gate passed  (≥90% of scenarios achieved 2/3 majority)
#   1  gate failed  (at least one scenario failed the 2/3 threshold; with
#                    8 scenarios a single failure is 7/8 = 87.5%, below 90%)
#   2  infra error  (stack failed to start, missing dependency, etc.)

# ── Steps ──────────────────────────────────────────────────────────────────────

[[steps]]
id = "boot-stack"
description = """
Spin up full docker stack from PR branch.
evaluate.sh creates an isolated git worktree, builds kraiken-lib, installs
npm deps, installs Playwright browser binaries, then runs:
  docker compose -p harb-eval-{pr_number} up -d
Waits for anvil (healthy), bootstrap (exited 0), ponder (healthy + /ready).
"""

[[steps]]
id = "clone-holdout"
description = """
Clone harb-holdout-scenarios into .holdout-scenarios/ inside the worktree.
Sets HOLDOUT_SCENARIOS_DIR for holdout.config.ts.
The dev-agent never sees this repo; the wall is enforced by separate
repository access control on Codeberg.
"""

[[steps]]
id = "run-scenarios"
description = """
Run 8 Playwright specs via holdout.config.ts (workers=1, headless chromium).
4 surfaces: contracts, graphql, landing, webapp.
Each scenario is executed up to 3 times; 2/3 runs must pass.
"""
surfaces = ["contracts", "graphql", "landing", "webapp"]
# 4 surfaces × 2 scenarios each = 8 scenarios in total.
# The counts below are repeated in [gate]; keep both places in sync.
scenarios_per_surface = 2
scenarios_total = 8
runs_per_scenario = 3
pass_per_scenario = 2  # 2-of-3 majority required for a scenario to count as passed

[[steps]]
id = "teardown"
description = """
docker compose -p harb-eval-{pr_number} down -v --remove-orphans
git worktree remove --force {worktree_dir}
Always runs — cleanup is registered as a shell trap in evaluate.sh.
"""

[[steps]]
id = "deliver"
description = """
Collect per-scenario results from test-results/holdout-reports/.
Write evidence/holdout/{date}-pr{pr_number}.json and commit to main.
Post gate verdict to issue #{pr_number}.
On failure: include one-line reason per failed scenario.
Scenario text is never exposed to the dev-agent.
"""

# ── Gate ───────────────────────────────────────────────────────────────────────
#
# These values repeat the counts declared on the "run-scenarios" step; keep
# both places in sync.

[gate]
pass_threshold_pct = 90  # ≥90% of scenarios must pass
# ceil(8 * 0.9) = ceil(7.2) = 8 → all 8 scenarios must pass to clear 90%
# (7 of 8 is only 87.5%).
scenarios_total = 8
per_scenario_runs = 3
per_scenario_pass = 2  # 2-of-3 majority per scenario

# ── Products ───────────────────────────────────────────────────────────────────

[products.evidence_file]
path = "evidence/holdout/{date}-pr{pr_number}.json"
delivery = "commit to main"
schema = "evidence/README.md"  # see §Schema: holdout/YYYY-MM-DD-prNNN.json

[products.issue_comment]
delivery = "post to issue #{pr_number}"
content = "gate verdict (pass/fail), scenarios_passed/scenarios_total, link to evidence file"
on_failure = "one-line failure reason per failing scenario; scenario text never revealed"

# ── Resources ──────────────────────────────────────────────────────────────────

[resources]
profile = "heavy"
containers = "5+"  # anvil, bootstrap, ponder, webapp, (caddy if needed)
browser = "chromium (Playwright)"
ports = ["8545", "42069", "5173", "8081", "5100"]
concurrency = "exclusive — port bindings prevent parallel runs on the same host"

# ── Notes ──────────────────────────────────────────────────────────────────────

[notes]
wall = """
The holdout-specs repo (harb-holdout-scenarios) is intentionally inaccessible
to the dev-agent. The agent receives only pass/fail and one-line failure
reasons — never the scenario text. This is enforced by Codeberg repo
permissions, not by runtime filtering.
"""