fix: Holdout evaluator: fresh containerised stack per run (#380)

Adds scripts/harb-evaluator/evaluate.sh which:
- Accepts a PR number, resolves the branch via Codeberg API or git remote scan
- Checks out that branch into an isolated git worktree
- Boots a fresh docker compose stack with a unique COMPOSE_PROJECT name
- Waits for anvil healthy, bootstrap complete, ponder healthy + indexed
- Sources contract addresses from tmp/containers/contracts.env (never hardcoded)
- Exports EVAL_* env vars and runs any *.sh scripts under scenarios/
- Always tears down the stack and removes the worktree on exit (pass or fail)
- Returns 0 (gate passed), 1 (gate failed), or 2 (infra error)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
openhands 2026-03-01 09:04:33 +00:00
parent 703185ea48
commit 43c8d79afd

View file

@@ -0,0 +1,270 @@
#!/usr/bin/env bash
# evaluate.sh — Spin up a fresh containerised harb stack from a PR branch,
# run holdout scenario scripts against it, then tear it down.
#
# Usage: evaluate.sh <pr-number>
#
# Exit codes:
# 0 gate passed (all scenarios succeeded, or no scenarios found)
# 1 gate failed (one or more scenario scripts returned non-zero)
# 2 infra error (stack failed to start, prerequisite missing, etc.)
#
# Environment overrides:
# HARB_REPO_REMOTE git remote to fetch from (default: origin)
# CODEBERG_REPO Gitea/Codeberg repo path (default: johba/harb)
set -euo pipefail
# ── Constants ──────────────────────────────────────────────────────────
readonly REPO_REMOTE="${HARB_REPO_REMOTE:-origin}"
readonly CODEBERG_REPO="${CODEBERG_REPO:-johba/harb}"
# Assign first, mark readonly second: 'readonly X="$(cmd)"' always
# returns 0 and so masks a failed cd/pwd even under set -e (SC2155).
REPO_ROOT="$(cd "$(dirname "$0")/../.." && pwd)"
readonly REPO_ROOT
EVALUATOR_DIR="$(cd "$(dirname "$0")" && pwd)"
readonly EVALUATOR_DIR
readonly SCENARIOS_DIR="$EVALUATOR_DIR/scenarios"
readonly ANVIL_TIMEOUT=120 # seconds to wait for anvil healthy
readonly BOOTSTRAP_TIMEOUT=180 # seconds to wait for bootstrap container exit
readonly PONDER_TIMEOUT=300 # seconds to wait for ponder /health
readonly PONDER_READY_TIMEOUT=360 # seconds to wait for ponder /ready (fully indexed)
readonly POLL_INTERVAL=5 # seconds between poll attempts
# ── Logging helpers ────────────────────────────────────────────────────
# All three print a "[eval]"-prefixed message; the latter two also exit
# with the script's documented status codes (2 = infra, 1 = gate).
log()         { printf '[eval] %s\n' "$*"; }
infra_error() { printf '[eval] INFRA ERROR: %s\n' "$*" >&2; exit 2; }
gate_fail()   { printf '[eval] GATE FAILED: %s\n' "$*" >&2; exit 1; }
# Print usage to stderr, then exit with the infra-error status (2) —
# a malformed invocation is an operator problem, not a gate verdict.
usage() {
  {
    printf 'Usage: %s <pr-number>\n' "$0"
    printf 'Checks out the PR branch into an isolated git worktree, boots a fresh\n'
    printf 'docker compose stack, runs scenario scripts, then tears down.\n'
    printf 'Exit codes:\n'
    printf '0 gate passed\n'
    printf '1 gate failed\n'
    printf '2 infra error\n'
  } >&2
  exit 2
}
# ── Argument parsing ───────────────────────────────────────────────────
# Exactly one positional argument is required: the numeric PR id.
[[ $# -lt 1 ]] && usage
PR_NUMBER="$1"
# Digits-only guard: PR_NUMBER is later interpolated into paths, the
# compose project name, and a grep pattern, so reject anything else.
[[ "$PR_NUMBER" =~ ^[0-9]+$ ]] || infra_error "Invalid PR number: '$PR_NUMBER'"
# ── Prerequisites ──────────────────────────────────────────────────────
# Prefer the compose v2 plugin ("docker compose"); fall back to the
# standalone v1 binary ("docker-compose") when the plugin is absent.
# (Dropped the redundant '2>&1' after '&>/dev/null' — '&>' already
# redirects both streams; this also matches the sibling check below.)
if docker compose version &>/dev/null; then
COMPOSE_CMD="docker compose"
elif command -v docker-compose &>/dev/null; then
COMPOSE_CMD="docker-compose"
else
infra_error "docker compose not found. Install Docker with the compose plugin."
fi
command -v git &>/dev/null || infra_error "git not found"
command -v curl &>/dev/null || infra_error "curl not found"
# ── Fetch PR branch name ───────────────────────────────────────────────
# Try the Codeberg REST API first (requires ~/.netrc with credentials).
PR_BRANCH=""
if [[ -f "$HOME/.netrc" ]]; then
log "Resolving PR #$PR_NUMBER branch via Codeberg API..."
api_json="$(curl --netrc --silent --max-time 10 \
"https://codeberg.org/api/v1/repos/$CODEBERG_REPO/pulls/$PR_NUMBER" 2>/dev/null)" || true
if [[ -n "$api_json" ]]; then
if command -v jq &>/dev/null; then
PR_BRANCH="$(echo "$api_json" | jq -r '.head.ref // empty' 2>/dev/null)" || true
else
# jq not available — extract with python3, falling back to grep+sed
PR_BRANCH="$(echo "$api_json" | \
python3 -c "import json,sys; print(json.load(sys.stdin)['head']['ref'])" 2>/dev/null)" || \
PR_BRANCH="$(echo "$api_json" | grep -o '"ref":"[^"]*"' | head -n1 | sed 's/"ref":"//;s/"//')" || true
fi
fi
fi
# Fall back: fetch all remote refs and match common harb branch patterns.
if [[ -z "$PR_BRANCH" ]]; then
log "API lookup skipped or failed; scanning remote branches..."
cd "$REPO_ROOT"
git fetch "$REPO_REMOTE" --prune 2>/dev/null || infra_error "git fetch $REPO_REMOTE failed"
# Strip only the leading "<remote>/" prefix. The previous 's|.*/||'
# removed everything up to the LAST slash, mangling slashed branch
# names (fix/foo-123 -> foo-123), which then failed to fetch.
PR_BRANCH="$(git branch -r 2>/dev/null | tr -d ' ' | \
grep -E "(fix|feat|chore|refactor|hotfix)/.*[-/]${PR_NUMBER}[^0-9]?$|issue-${PR_NUMBER}$" | \
head -n1 | sed "s|^${REPO_REMOTE}/||")" || true
fi
[[ -n "$PR_BRANCH" ]] || infra_error "Could not determine branch for PR #$PR_NUMBER"
log "PR #$PR_NUMBER => branch: $PR_BRANCH"
# ── Create isolated worktree ───────────────────────────────────────────
WORKTREE_DIR="$(mktemp -d "/tmp/harb-eval-${PR_NUMBER}-XXXXXX")"
# Use a project name that is unique per PR and safe for Docker labels.
COMPOSE_PROJECT="harb-eval-${PR_NUMBER}"
# Always tear down the stack and remove the worktree, whatever the exit
# path (success, gate failure, infra error, or signal).
cleanup() {
local rc=$?
log "--- cleanup (exit $rc) ---"
if [[ -d "$WORKTREE_DIR" ]]; then
log "Tearing down stack (project: $COMPOSE_PROJECT)..."
(cd "$WORKTREE_DIR" && $COMPOSE_CMD -p "$COMPOSE_PROJECT" down -v --remove-orphans 2>/dev/null) || true
fi
log "Removing worktree $WORKTREE_DIR..."
# Don't let a failed cd abort the trap under 'set -e'; the rm -rf
# fallback below works from any directory.
cd "$REPO_ROOT" 2>/dev/null || true
# ':?' aborts rather than run 'rm -rf' on an empty/unset path.
git worktree remove --force "$WORKTREE_DIR" 2>/dev/null || rm -rf -- "${WORKTREE_DIR:?}" || true
exit "$rc"
}
trap cleanup EXIT INT TERM
# Ensure the branch is locally available before adding the worktree.
cd "$REPO_ROOT"
git fetch "$REPO_REMOTE" "$PR_BRANCH" 2>/dev/null || \
infra_error "Could not fetch branch '$PR_BRANCH' from $REPO_REMOTE"
log "Creating worktree at $WORKTREE_DIR (branch: $PR_BRANCH)..."
# Check out the remote-tracking ref directly (detached HEAD) so no
# local branch is created or clobbered by the evaluation run.
git worktree add "$WORKTREE_DIR" "remotes/$REPO_REMOTE/$PR_BRANCH" \
|| infra_error "git worktree add failed for branch $PR_BRANCH"
# ── Build kraiken-lib in the worktree ─────────────────────────────────
# Built inside the PR's own checkout so the stack runs the PR's code.
log "Building kraiken-lib..."
(cd "$WORKTREE_DIR" && ./scripts/build-kraiken-lib.sh) \
|| infra_error "kraiken-lib build failed"
# ── Boot the stack ─────────────────────────────────────────────────────
cd "$WORKTREE_DIR"
log "Starting containerised stack (project: $COMPOSE_PROJECT)..."
# -p gives this run its own project namespace (containers, networks,
# volumes), so concurrent evaluations of different PRs don't collide.
$COMPOSE_CMD -p "$COMPOSE_PROJECT" up -d \
|| infra_error "docker compose up failed"
# Map a compose service name to its default container name
# ("<project>-<service>-1", compose v2 naming, single replica assumed).
container_name() {
  local service="$1"
  printf '%s-%s-1\n' "$COMPOSE_PROJECT" "$service"
}
# Poll `docker inspect` until the service's healthcheck reports
# "healthy"; on timeout, dump the tail of the container's logs and
# abort with an infra error (exit 2).
wait_healthy() {
  local svc="$1" limit="$2"
  local cname state
  cname="$(container_name "$svc")"
  log "Waiting for $svc to be healthy (${limit}s)..."
  local stop_at=$((SECONDS + limit))
  while (( SECONDS < stop_at )); do
    # "missing" covers both a not-yet-created container and an inspect error.
    state="$(docker inspect --format='{{.State.Health.Status}}' "$cname" 2>/dev/null || echo "missing")"
    if [[ "$state" == "healthy" ]]; then
      log "  $svc healthy"
      return 0
    fi
    sleep "$POLL_INTERVAL"
  done
  docker logs "$cname" 2>&1 | tail -20 || true
  infra_error "$svc did not become healthy within ${limit}s"
}
# Poll until a one-shot container (e.g. bootstrap) reaches Status
# "exited". A non-zero exit code, or not finishing within the timeout,
# is treated as an infra error (exit 2) after dumping recent logs.
wait_exited() {
local service="$1" timeout="$2"
local container
container="$(container_name "$service")"
log "Waiting for $service container to complete (${timeout}s)..."
local deadline=$((SECONDS + timeout))
while (( SECONDS < deadline )); do
local status
# "missing" covers both a not-yet-created container and an inspect error.
status="$(docker inspect --format='{{.State.Status}}' "$container" 2>/dev/null || echo "missing")"
if [[ "$status" == "exited" ]]; then
local exit_code
# Default to "1" if inspect fails so a vanished container counts as failure.
exit_code="$(docker inspect --format='{{.State.ExitCode}}' "$container" 2>/dev/null || echo "1")"
if [[ "$exit_code" != "0" ]]; then
docker logs "$container" 2>&1 | tail -30 || true
infra_error "$service container exited with code $exit_code"
fi
log "  $service completed successfully"
return 0
fi
sleep "$POLL_INTERVAL"
done
docker logs "$container" 2>&1 | tail -20 || true
infra_error "$service did not complete within ${timeout}s"
}
# Phase 1: base services
wait_healthy anvil "$ANVIL_TIMEOUT"
# Phase 2: bootstrap (deploys contracts, writes contracts.env)
wait_exited bootstrap "$BOOTSTRAP_TIMEOUT"
# ── Extract contract addresses ─────────────────────────────────────────
CONTRACTS_ENV="$WORKTREE_DIR/tmp/containers/contracts.env"
[[ -f "$CONTRACTS_ENV" ]] \
|| infra_error "contracts.env not found at $CONTRACTS_ENV"
log "Reading contract addresses from contracts.env..."
# shellcheck source=/dev/null
source "$CONTRACTS_ENV"
# Report an incomplete contracts.env as a clear infra error (exit 2);
# under 'set -u' a bare "$KRAIKEN" reference would otherwise abort
# with an unhelpful unbound-variable error (exit 1, i.e. "gate failed").
for required_var in KRAIKEN STAKE LIQUIDITY_MANAGER; do
[[ -n "${!required_var:-}" ]] || infra_error "contracts.env did not define $required_var"
done
log "  KRAIKEN=$KRAIKEN"
log "  STAKE=$STAKE"
log "  LIQUIDITY_MANAGER=$LIQUIDITY_MANAGER"
# Phase 3: ponder must be healthy and fully indexed before running scenarios
wait_healthy ponder "$PONDER_TIMEOUT"
log "Waiting for Ponder to finish historical indexing (${PONDER_READY_TIMEOUT}s)..."
ponder_ready=false
ponder_deadline=$((SECONDS + PONDER_READY_TIMEOUT))
while (( SECONDS < ponder_deadline )); do
# NOTE(review): on a non-200 response 'curl -f' fails after -w has
# already printed the code, so '|| echo "000"' appends a second line;
# harmless here because only an exact "200" terminates the loop.
http_code="$(curl -sf -o /dev/null -w '%{http_code}' --max-time 3 \
http://127.0.0.1:42069/ready 2>/dev/null || echo "000")"
if [[ "$http_code" == "200" ]]; then
log "  Ponder fully indexed"
ponder_ready=true
break
fi
sleep "$POLL_INTERVAL"
done
# Indexing lag is tolerated (warn only): the gate should fail on
# scenario results, not on indexing speed.
if [[ "$ponder_ready" != "true" ]]; then
log "WARNING: Ponder not fully indexed after ${PONDER_READY_TIMEOUT}s — continuing anyway"
fi
# ── Export stack endpoints for scenario scripts ────────────────────────
# EVAL_* is the contract between the evaluator and scenario scripts:
# scenarios read these instead of hardcoding ports or addresses.
export EVAL_PR_NUMBER="$PR_NUMBER"
export EVAL_BRANCH="$PR_BRANCH"
export EVAL_WORKTREE="$WORKTREE_DIR"
export EVAL_RPC_URL="http://127.0.0.1:8545"
export EVAL_GRAPHQL_URL="http://127.0.0.1:42069/graphql"
export EVAL_WEBAPP_URL="http://127.0.0.1:5173"
# Contract addresses come from contracts.env, sourced above — never hardcoded.
export EVAL_KRAIKEN="$KRAIKEN"
export EVAL_STAKE="$STAKE"
export EVAL_LIQUIDITY_MANAGER="$LIQUIDITY_MANAGER"
log "Stack ready. Endpoints:"
log "  RPC:     $EVAL_RPC_URL"
log "  GraphQL: $EVAL_GRAPHQL_URL"
log "  WebApp:  $EVAL_WEBAPP_URL"
# ── Run scenario scripts ───────────────────────────────────────────────
# Collect scenarios with nullglob so an empty directory yields an empty
# array rather than the literal "*.sh" pattern.
shopt -s nullglob
scenario_scripts=("$SCENARIOS_DIR"/*.sh)
shopt -u nullglob
if (( ${#scenario_scripts[@]} == 0 )); then
log "No scenario scripts found in $SCENARIOS_DIR"
log "Gate PASSED (no scenarios)"
exit 0
fi
# Run every scenario even after a failure so the log shows the full
# pass/fail picture; the gate verdict is decided afterwards.
failures=0
for scenario_path in "${scenario_scripts[@]}"; do
[[ -f "$scenario_path" ]] || continue
name="$(basename "$scenario_path")"
log "--- Running scenario: $name ---"
if bash "$scenario_path"; then
log "  PASSED: $name"
else
log "  FAILED: $name"
failures=$(( failures + 1 ))
fi
done
if (( failures > 0 )); then
gate_fail "$failures of ${#scenario_scripts[@]} scenario(s) failed"
fi
log "Gate PASSED (${#scenario_scripts[@]} scenario(s))"
exit 0