# harb/formulas/run-holdout.toml

# formulas/run-holdout.toml
#
# Holdout quality gate — deploy a PR branch, run blind holdout scenarios,
# report pass/fail.
#
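# Type: sense-only.  Produces metrics and a gate decision.
# Does NOT commit code, open PRs, or modify contracts; its only writes are the
# evidence JSON committed to main and the verdict comment on the issue
# (see [products]).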
#
# Depends on: #973 (evidence/holdout/ directory structure)

# Identity and classification of this formula.
[formula]
id          = "run-holdout"
name        = "Holdout Quality Gate"
description = "Deploy PR branch, run blind holdout scenarios, report pass/fail."
# Meaning of the `type` values mentioned in this file:
#   "sense"  → read-only, produces metrics only
#   "act"    → produces git artifacts (cf. run-evolution, run-red-team)
# NOTE(review): the "deliver" step and [products.evidence_file] commit an
# evidence JSON to main, which is itself a git artifact — confirm that
# evidence commits are acceptable for a "sense" formula.
type        = "sense"

# ── Inputs ─────────────────────────────────────────────────────────────────────

# Required input. The {pr_number} placeholder appears in
# [execution].invocation, the docker compose project name
# (harb-eval-{pr_number}), the evidence file path, and the issue the verdict
# is posted to.
[inputs.pr_number]
type        = "integer"
required    = true
description = "PR number to evaluate"

# Optional input: git URL of the access-restricted holdout scenarios repo.
# NOTE(review): [execution].invocation passes only {pr_number}; how a
# non-default holdout_repo value reaches evaluate.sh is not visible in this
# file — confirm (environment variable? extra argument?).
[inputs.holdout_repo]
type        = "string"
required    = false
default     = "ssh://git@codeberg.org/johba/harb-holdout-scenarios.git"
description = """
Holdout scenarios repo. Dev-agent has no read access — cloned at runtime
by evaluate.sh into the ephemeral worktree, never checked in to harb.
"""

# ── Execution ──────────────────────────────────────────────────────────────────
#
# The orchestrator invokes evaluate.sh, which owns the full lifecycle:
# checkout → build → boot stack → clone holdout repo → playwright → teardown.

# Entry point for the formula. `script` is the path of the evaluator script;
# `invocation` is the command line, with {pr_number} standing for the
# pr_number input.
[execution]
script     = "scripts/harb-evaluator/evaluate.sh"
invocation = "bash scripts/harb-evaluator/evaluate.sh {pr_number}"

# Exit codes propagated by evaluate.sh:
#   0  gate passed (≥90% of scenarios achieved 2/3 majority)
#   1  gate failed (fewer than 90% of scenarios achieved 2/3 majority;
#      with 8 scenarios a single failing scenario is enough — see [gate])
#   2  infra error (stack failed to start, missing dependency, etc.)

# ── Steps ──────────────────────────────────────────────────────────────────────

# Step: build the PR branch in an isolated git worktree and bring up the
# docker stack under a per-PR compose project name (harb-eval-{pr_number}).
[[steps]]
id          = "boot-stack"
description = """
Spin up full docker stack from PR branch.
evaluate.sh creates an isolated git worktree, builds kraiken-lib,
installs npm deps, installs Playwright browser binaries, then runs:
  docker compose -p harb-eval-{pr_number} up -d
Waits for anvil (healthy), bootstrap (exited 0), ponder (healthy + /ready).
"""

# Step: fetch the blind scenarios into the worktree at runtime and point
# holdout.config.ts at them via HOLDOUT_SCENARIOS_DIR.
[[steps]]
id          = "clone-holdout"
description = """
Clone harb-holdout-scenarios into .holdout-scenarios/ inside the worktree.
Sets HOLDOUT_SCENARIOS_DIR for holdout.config.ts.
The dev-agent never sees this repo; the wall is enforced by separate
repository access control on Codeberg.
"""

# Step: execute the holdout scenarios with Playwright.
# The numeric keys restate the figures in the description:
# 4 surfaces × 2 scenarios per surface = 8 scenarios, 3 runs each.
# NOTE(review): scenarios_total, runs_per_scenario and pass_per_scenario
# repeat [gate].scenarios_total, [gate].per_scenario_runs and
# [gate].per_scenario_pass under different key names — keep the two tables
# in sync when changing either.
[[steps]]
id          = "run-scenarios"
description = """
Run 8 Playwright specs via holdout.config.ts (workers=1, headless chromium).
4 surfaces: contracts, graphql, landing, webapp.
Each scenario is executed up to 3 times; 2/3 runs must pass.
"""
surfaces              = ["contracts", "graphql", "landing", "webapp"]
scenarios_per_surface = 2
scenarios_total       = 8
runs_per_scenario     = 3
pass_per_scenario     = 2   # 2-of-3 majority required for a scenario to count as passed

# Step: cleanup of the compose project (including volumes and orphans) and
# of the worktree. Per the description this is a shell trap in evaluate.sh
# that always runs, rather than an ordinary sequential step.
[[steps]]
id          = "teardown"
description = """
docker compose -p harb-eval-{pr_number} down -v --remove-orphans
git worktree remove --force {worktree_dir}
Always runs — cleanup is registered as a shell trap in evaluate.sh.
"""

# Step: publish results — an evidence JSON committed to main plus a verdict
# comment on the issue (see [products]).
# NOTE(review): this step is listed after "teardown", which force-removes
# the worktree. If test-results/holdout-reports/ lives inside that worktree,
# the reports must be collected before cleanup — confirm the actual ordering
# in evaluate.sh.
[[steps]]
id          = "deliver"
description = """
Collect per-scenario results from test-results/holdout-reports/.
Write evidence/holdout/{date}-pr{pr_number}.json and commit to main.
Post gate verdict to issue #{pr_number}.
On failure: include one-line reason per failed scenario.
Scenario text is never exposed to the dev-agent.
"""

# ── Gate ───────────────────────────────────────────────────────────────────────

# Pass criteria. A scenario passes when per_scenario_pass of its
# per_scenario_runs runs succeed; the gate passes when at least
# pass_threshold_pct percent of the scenarios_total scenarios pass.
[gate]
pass_threshold_pct = 90   # ≥90% of scenarios must pass
scenarios_total    = 8    # 8 * 0.9 = 7.2 → rounds up to 8, i.e. all 8 must pass to clear 90%
per_scenario_runs  = 3
per_scenario_pass  = 2    # 2-of-3 majority per scenario

# ── Products ───────────────────────────────────────────────────────────────────

# Evidence artifact written by the "deliver" step.
# {date} presumably expands to YYYY-MM-DD, matching the schema section named
# in the `schema` comment — confirm against evidence/README.md.
[products.evidence_file]
path     = "evidence/holdout/{date}-pr{pr_number}.json"
delivery = "commit to main"
schema   = "evidence/README.md"  # see §Schema: holdout/YYYY-MM-DD-prNNN.json

# Verdict comment posted by the "deliver" step. `content` describes what the
# comment contains; `on_failure` describes what is added when the gate fails.
# Scenario text is never included in either case.
[products.issue_comment]
delivery   = "post to issue #{pr_number}"
content    = "gate verdict (pass/fail), scenarios_passed/scenarios_total, link to evidence file"
on_failure = "one-line failure reason per failing scenario; scenario text never revealed"

# ── Resources ──────────────────────────────────────────────────────────────────

# Host resources a run needs. Runs are exclusive per host because the listed
# ports are bound by the stack.
# NOTE(review): `ports` holds quoted strings rather than integers — confirm
# the consumer expects strings before changing the type.
# NOTE(review): `containers` says "5+" while its comment names four
# containers plus an optional fifth — confirm which is intended.
[resources]
profile     = "heavy"
containers  = "5+"    # anvil, bootstrap, ponder, webapp, (caddy if needed)
browser     = "chromium (Playwright)"
ports       = ["8545", "42069", "5173", "8081", "5100"]
concurrency = "exclusive — port bindings prevent parallel runs on the same host"

# ── Notes ──────────────────────────────────────────────────────────────────────

# Free-form rationale. `wall` explains why the dev-agent is kept away from
# the holdout scenarios and where that separation is enforced.
[notes]
wall = """
The holdout-specs repo (harb-holdout-scenarios) is intentionally inaccessible
to the dev-agent. The agent receives only pass/fail and one-line failure reasons
— never the scenario text. This is enforced by Codeberg repo permissions, not
by runtime filtering.
"""
# History: fix: Formula: run-holdout (PR quality gate) (#977)
# Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
# 2026-03-19 09:50:22 +00:00