Merge pull request 'fix: Formula: run-holdout (PR quality gate) (#977)' (#996) from fix/issue-977 into master
Reviewed-on: https://codeberg.org/johba/harb/pulls/996
This commit is contained in:
commit
bab3a6751d
1 changed files with 138 additions and 0 deletions
138
formulas/run-holdout.toml
Normal file
138
formulas/run-holdout.toml
Normal file
|
|
@ -0,0 +1,138 @@
|
|||
# formulas/run-holdout.toml
|
||||
#
|
||||
# Holdout quality gate — deploy a PR branch, run blind holdout scenarios,
|
||||
# report pass/fail.
|
||||
#
|
||||
# Type: sense-only. Produces metrics and a gate decision.
# Does NOT commit code, open PRs, or modify contracts; the "deliver" step
# does commit an evidence JSON file.
|
||||
#
|
||||
# Depends on: #973 (evidence/holdout/ directory structure)
|
||||
|
||||
# Identity and classification of this formula.
[formula]
id = "run-holdout"
name = "Holdout Quality Gate"
description = "Deploy PR branch, run blind holdout scenarios, report pass/fail."
# Formula class:
#   "sense" → read-only, produces metrics only
#   "act"   → produces git artifacts (cf. run-evolution, run-red-team)
type = "sense"
|
||||
|
||||
# ── Inputs ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
# Required input. Appears as the {pr_number} placeholder in the invocation
# string, the docker compose project name, and the evidence file path.
[inputs.pr_number]
type = "integer"
required = true
description = "PR number to evaluate"
|
||||
|
||||
# Optional input (required = false) with a default git URL.
# NOTE(review): execution.invocation passes only {pr_number}; confirm how
# evaluate.sh receives a non-default value for this input.
[inputs.holdout_repo]
type = "string"
required = false
default = "ssh://git@codeberg.org/johba/harb-holdout-scenarios.git"
description = """
Holdout scenarios repo. Dev-agent has no read access — cloned at runtime
by evaluate.sh into the ephemeral worktree, never checked in to harb.
"""
|
||||
|
||||
# ── Execution ──────────────────────────────────────────────────────────────────
|
||||
#
|
||||
# The orchestrator invokes evaluate.sh, which owns the full lifecycle:
|
||||
# checkout → build → boot stack → clone holdout repo → playwright → teardown.
|
||||
|
||||
# Entry point run by the orchestrator. The {pr_number} placeholder
# corresponds to the pr_number input.
[execution]
script = "scripts/harb-evaluator/evaluate.sh"
invocation = "bash scripts/harb-evaluator/evaluate.sh {pr_number}"

# Exit codes propagated by evaluate.sh:
#   0  gate passed (≥90% of scenarios achieved 2/3 majority)
#   1  gate failed (<90% of scenarios achieved 2/3 majority; with 8 scenarios,
#      a single failing scenario is enough: 7/8 = 87.5%)
#   2  infra error (stack failed to start, missing dependency, etc.)
|
||||
|
||||
# ── Steps ──────────────────────────────────────────────────────────────────────
|
||||
|
||||
# Descriptive step: brings up the stack under test. The work itself is
# performed by evaluate.sh, as the description states.
[[steps]]
id = "boot-stack"
description = """
Spin up full docker stack from PR branch.
evaluate.sh creates an isolated git worktree, builds kraiken-lib,
installs npm deps, installs Playwright browser binaries, then runs:
docker compose -p harb-eval-{pr_number} up -d
Waits for anvil (healthy), bootstrap (exited 0), ponder (healthy + /ready).
"""
|
||||
|
||||
# Descriptive step: fetches the blind scenarios into the worktree at runtime.
[[steps]]
id = "clone-holdout"
description = """
Clone harb-holdout-scenarios into .holdout-scenarios/ inside the worktree.
Sets HOLDOUT_SCENARIOS_DIR for holdout.config.ts.
The dev-agent never sees this repo; the wall is enforced by separate
repository access control on Codeberg.
"""
|
||||
|
||||
# Descriptive step: executes the holdout scenarios.
# 4 surfaces × 2 scenarios per surface = 8 scenarios in total; the counts
# below repeat the figures in [gate] and should be kept in sync with it.
[[steps]]
id = "run-scenarios"
description = """
Run 8 Playwright specs via holdout.config.ts (workers=1, headless chromium).
4 surfaces: contracts, graphql, landing, webapp.
Each scenario is executed up to 3 times; 2/3 runs must pass.
"""
surfaces = ["contracts", "graphql", "landing", "webapp"]
scenarios_per_surface = 2
scenarios_total = 8
runs_per_scenario = 3
pass_per_scenario = 2 # 2-of-3 majority required for a scenario to count as passed
|
||||
|
||||
# Descriptive step: removes the compose project and the temporary worktree.
[[steps]]
id = "teardown"
description = """
docker compose -p harb-eval-{pr_number} down -v --remove-orphans
git worktree remove --force {worktree_dir}
Always runs — cleanup is registered as a shell trap in evaluate.sh.
"""
|
||||
|
||||
# Descriptive step: publishes the gate result (evidence file + issue comment).
# NOTE(review): the text below says "commit to main", while the pull request
# that added this file was merged into "master" — confirm the branch name.
[[steps]]
id = "deliver"
description = """
Collect per-scenario results from test-results/holdout-reports/.
Write evidence/holdout/{date}-pr{pr_number}.json and commit to main.
Post gate verdict to issue #{pr_number}.
On failure: include one-line reason per failed scenario.
Scenario text is never exposed to the dev-agent.
"""
|
||||
|
||||
# ── Gate ───────────────────────────────────────────────────────────────────────
|
||||
|
||||
# Gate decision parameters. The per-scenario figures repeat runs_per_scenario
# and pass_per_scenario from the "run-scenarios" step; keep them in sync.
[gate]
pass_threshold_pct = 90 # ≥90% of scenarios must pass
scenarios_total = 8 # 7 of 8 is 87.5% (<90%) → all 8 must pass
per_scenario_runs = 3
per_scenario_pass = 2 # 2-of-3 majority per scenario
|
||||
|
||||
# ── Products ───────────────────────────────────────────────────────────────────
|
||||
|
||||
# Evidence record written for each evaluated PR.
# NOTE(review): delivery names "main", while the pull request that added this
# file was merged into "master" — confirm the branch name.
[products.evidence_file]
path = "evidence/holdout/{date}-pr{pr_number}.json"
delivery = "commit to main"
schema = "evidence/README.md" # see §Schema: holdout/YYYY-MM-DD-prNNN.json
|
||||
|
||||
# Verdict comment posted for the evaluated PR; failure details are limited
# to one line per scenario so that scenario text stays hidden.
[products.issue_comment]
delivery = "post to issue #{pr_number}"
content = "gate verdict (pass/fail), scenarios_passed/scenarios_total, link to evidence file"
on_failure = "one-line failure reason per failing scenario; scenario text never revealed"
|
||||
|
||||
# ── Resources ──────────────────────────────────────────────────────────────────
|
||||
|
||||
# Host footprint of one run. The listed ports are bound on the host, which
# is why the concurrency field declares runs exclusive per host.
[resources]
profile = "heavy"
containers = "5+" # anvil, bootstrap, ponder, webapp, (caddy if needed)
browser = "chromium (Playwright)"
ports = ["8545", "42069", "5173", "8081", "5100"]
concurrency = "exclusive — port bindings prevent parallel runs on the same host"
|
||||
|
||||
# ── Notes ──────────────────────────────────────────────────────────────────────
|
||||
|
||||
# Free-form design notes for maintainers.
[notes]
wall = """
The holdout-specs repo (harb-holdout-scenarios) is intentionally inaccessible
to the dev-agent. The agent receives only pass/fail and one-line failure reasons
— never the scenario text. This is enforced by Codeberg repo permissions, not
by runtime filtering.
"""
|
||||
Loading…
Add table
Add a link
Reference in a new issue