From 43c8d79afd166c83d32ffcd1742c0b0815667880 Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Sun, 1 Mar 2026 09:04:33 +0000
Subject: [PATCH 1/2] fix: Holdout evaluator: fresh containerised stack per run
 (#380)

Adds scripts/harb-evaluator/evaluate.sh which:
- Accepts a PR number, resolves the branch via Codeberg API or git remote scan
- Checks out that branch into an isolated git worktree
- Boots a fresh docker compose stack with a unique COMPOSE_PROJECT name
- Waits for anvil healthy, bootstrap complete, ponder healthy + indexed
- Sources contract addresses from tmp/containers/contracts.env (never hardcoded)
- Exports EVAL_* env vars and runs any *.sh scripts under scenarios/
- Always tears down the stack and removes the worktree on exit (pass or fail)
- Returns 0 (gate passed), 1 (gate failed), or 2 (infra error)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 scripts/harb-evaluator/evaluate.sh | 270 +++++++++++++++++++++++++++++
 1 file changed, 270 insertions(+)
 create mode 100755 scripts/harb-evaluator/evaluate.sh
diff --git a/scripts/harb-evaluator/evaluate.sh b/scripts/harb-evaluator/evaluate.sh
new file mode 100755
index 0000000..456fda8
--- /dev/null
+++ b/scripts/harb-evaluator/evaluate.sh
@@ -0,0 +1,270 @@
+#!/usr/bin/env bash
+# evaluate.sh — Spin up a fresh containerised harb stack from a PR branch,
+# run holdout scenario scripts against it, then tear it down.
+#
+# Usage: evaluate.sh <pr-number>
+#
+# Exit codes:
+#   0  gate passed  (all scenarios succeeded, or no scenarios found)
+#   1  gate failed  (one or more scenario scripts returned non-zero)
+#   2  infra error  (stack failed to start, prerequisite missing, etc.)
+#
+# Environment overrides:
+#   HARB_REPO_REMOTE   git remote to fetch from (default: origin)
+#   CODEBERG_REPO      Gitea/Codeberg repo path  (default: johba/harb)
+
+set -euo pipefail
+
+# ── Constants ──────────────────────────────────────────────────────────
+readonly REPO_REMOTE="${HARB_REPO_REMOTE:-origin}"
+readonly CODEBERG_REPO="${CODEBERG_REPO:-johba/harb}"
+readonly REPO_ROOT="$(cd "$(dirname "$0")/../.." && pwd)"
+readonly EVALUATOR_DIR="$(cd "$(dirname "$0")" && pwd)"
+readonly SCENARIOS_DIR="$EVALUATOR_DIR/scenarios"
+
+readonly ANVIL_TIMEOUT=120       # seconds to wait for anvil healthy
+readonly BOOTSTRAP_TIMEOUT=180   # seconds to wait for bootstrap container exit
+readonly PONDER_TIMEOUT=300      # seconds to wait for ponder /health
+readonly PONDER_READY_TIMEOUT=360 # seconds to wait for ponder /ready (fully indexed)
+readonly POLL_INTERVAL=5
+
+# ── Logging helpers ────────────────────────────────────────────────────
+log()         { printf '[eval] %s\n' "$*"; }
+infra_error() { printf '[eval] INFRA ERROR: %s\n' "$*" >&2; exit 2; }
+gate_fail()   { printf '[eval] GATE FAILED: %s\n' "$*" >&2; exit 1; }
+
+usage() {  # help text goes to stderr; always exits 2
+  {
+    printf 'Usage: %s <pr-number>\n\n' "$0"
+    printf '%s\n' \
+      'Checks out the PR branch into an isolated git worktree, boots a fresh' \
+      'docker compose stack, runs scenario scripts, then tears down.' \
+      '' \
+      'Exit codes:' \
+      '  0  gate passed' \
+      '  1  gate failed' \
+      '  2  infra error'
+  } >&2
+  exit 2
+}
+
+# ── Argument parsing ───────────────────────────────────────────────────
+[[ $# -lt 1 ]] && usage
+PR_NUMBER="$1"
+[[ "$PR_NUMBER" =~ ^[0-9]+$ ]] || infra_error "Invalid PR number: '$PR_NUMBER'"
+
+# ── Prerequisites ──────────────────────────────────────────────────────
+if docker compose version &>/dev/null 2>&1; then
+  COMPOSE_CMD="docker compose"
+elif command -v docker-compose &>/dev/null; then
+  COMPOSE_CMD="docker-compose"
+else
+  infra_error "docker compose not found. Install Docker with the compose plugin."
+fi
+
+command -v git  &>/dev/null || infra_error "git not found"
+command -v curl &>/dev/null || infra_error "curl not found"
+
+# ── Fetch PR branch name ───────────────────────────────────────────────
+# Try the Codeberg REST API first (requires ~/.netrc with credentials).
+PR_BRANCH=""
+if [[ -f "$HOME/.netrc" ]]; then
+  log "Resolving PR #$PR_NUMBER branch via Codeberg API..."
+  api_json="$(curl --netrc --silent --max-time 10 \
+    "https://codeberg.org/api/v1/repos/$CODEBERG_REPO/pulls/$PR_NUMBER" 2>/dev/null)" || true
+
+  if [[ -n "$api_json" ]]; then
+    if command -v jq &>/dev/null; then
+      PR_BRANCH="$(echo "$api_json" | jq -r '.head.ref // empty' 2>/dev/null)" || true
+    else
+      # jq not available — extract with python3 or grep+sed
+      PR_BRANCH="$(echo "$api_json" | \
+        python3 -c "import json,sys; print(json.load(sys.stdin)['head']['ref'])" 2>/dev/null)" || \
+        PR_BRANCH="$(echo "$api_json" | grep -o '"ref":"[^"]*"' | head -n1 | sed 's/"ref":"//;s/"//')" || true
+    fi
+  fi
+fi
+
+# Fall back: fetch all remote refs and match common harb branch patterns.
+if [[ -z "$PR_BRANCH" ]]; then
+  log "API lookup skipped or failed; scanning remote branches..."
+  cd "$REPO_ROOT"
+  git fetch "$REPO_REMOTE" --prune 2>/dev/null || infra_error "git fetch $REPO_REMOTE failed"
+  PR_BRANCH="$(git branch -r 2>/dev/null | \
+    grep -E "(fix|feat|chore|refactor|hotfix)/.*[-/]${PR_NUMBER}[^0-9]?$|issue-${PR_NUMBER}$" | \
+    head -n1 | sed 's|.*/||' | tr -d ' ')" || true
+fi
+
+[[ -n "$PR_BRANCH" ]] || infra_error "Could not determine branch for PR #$PR_NUMBER"
+log "PR #$PR_NUMBER => branch: $PR_BRANCH"
+
+# ── Create isolated worktree ───────────────────────────────────────────
+WORKTREE_DIR="$(mktemp -d /tmp/harb-eval-${PR_NUMBER}-XXXXXX)"
+# Use a project name that is unique per PR and safe for Docker labels.
+COMPOSE_PROJECT="harb-eval-${PR_NUMBER}"
+
+cleanup() {  # EXIT/INT/TERM handler: tear down the stack and worktree, keep the exit status
+  local rc=$?; trap - EXIT INT TERM  # disarm first: the final exit must not re-run cleanup
+  log "--- cleanup (exit $rc) ---"
+  if [[ -d "$WORKTREE_DIR" ]]; then
+    log "Tearing down stack (project: $COMPOSE_PROJECT)..."
+    (cd "$WORKTREE_DIR" && $COMPOSE_CMD -p "$COMPOSE_PROJECT" down -v --remove-orphans 2>/dev/null) || true
+  fi
+  log "Removing worktree $WORKTREE_DIR..."
+  cd "$REPO_ROOT"
+  git worktree remove --force "$WORKTREE_DIR" 2>/dev/null || rm -rf "$WORKTREE_DIR" || true
+  exit "$rc"
+}
+trap cleanup EXIT INT TERM
+
+# Ensure the branch is locally available before adding the worktree.
+cd "$REPO_ROOT"
+git fetch "$REPO_REMOTE" "$PR_BRANCH" 2>/dev/null || \
+  infra_error "Could not fetch branch '$PR_BRANCH' from $REPO_REMOTE"
+
+log "Creating worktree at $WORKTREE_DIR (branch: $PR_BRANCH)..."
+git worktree add "$WORKTREE_DIR" "remotes/$REPO_REMOTE/$PR_BRANCH" \
+  || infra_error "git worktree add failed for branch $PR_BRANCH"
+
+# ── Build kraiken-lib in the worktree ─────────────────────────────────
+log "Building kraiken-lib..."
+(cd "$WORKTREE_DIR" && ./scripts/build-kraiken-lib.sh) \
+  || infra_error "kraiken-lib build failed"
+
+# ── Boot the stack ─────────────────────────────────────────────────────
+cd "$WORKTREE_DIR"
+log "Starting containerised stack (project: $COMPOSE_PROJECT)..."
+$COMPOSE_CMD -p "$COMPOSE_PROJECT" up -d \
+  || infra_error "docker compose up failed"
+
+# Helper: get status/health of a named container in this project.
+container_name() { echo "${COMPOSE_PROJECT}-$1-1"; }
+
+wait_healthy() {
+  local service="$1" timeout="$2"
+  local container
+  container="$(container_name "$service")"
+  log "Waiting for $service to be healthy (${timeout}s)..."
+  local deadline=$((SECONDS + timeout))
+  while (( SECONDS < deadline )); do
+    local status
+    status="$(docker inspect --format='{{.State.Health.Status}}' "$container" 2>/dev/null || echo "missing")"
+    if [[ "$status" == "healthy" ]]; then
+      log "  $service healthy"
+      return 0
+    fi
+    sleep "$POLL_INTERVAL"
+  done
+  docker logs "$container" 2>&1 | tail -20 || true
+  infra_error "$service did not become healthy within ${timeout}s"
+}
+
+wait_exited() {
+  local service="$1" timeout="$2"
+  local container
+  container="$(container_name "$service")"
+  log "Waiting for $service container to complete (${timeout}s)..."
+  local deadline=$((SECONDS + timeout))
+  while (( SECONDS < deadline )); do
+    local status
+    status="$(docker inspect --format='{{.State.Status}}' "$container" 2>/dev/null || echo "missing")"
+    if [[ "$status" == "exited" ]]; then
+      local exit_code
+      exit_code="$(docker inspect --format='{{.State.ExitCode}}' "$container" 2>/dev/null || echo "1")"
+      if [[ "$exit_code" != "0" ]]; then
+        docker logs "$container" 2>&1 | tail -30 || true
+        infra_error "$service container exited with code $exit_code"
+      fi
+      log "  $service completed successfully"
+      return 0
+    fi
+    sleep "$POLL_INTERVAL"
+  done
+  docker logs "$container" 2>&1 | tail -20 || true
+  infra_error "$service did not complete within ${timeout}s"
+}
+
+# Phase 1: base services
+wait_healthy anvil    "$ANVIL_TIMEOUT"
+
+# Phase 2: bootstrap (deploys contracts, writes contracts.env)
+wait_exited  bootstrap "$BOOTSTRAP_TIMEOUT"
+
+# ── Extract contract addresses ─────────────────────────────────────────
+CONTRACTS_ENV="$WORKTREE_DIR/tmp/containers/contracts.env"
+[[ -f "$CONTRACTS_ENV" ]] \
+  || infra_error "contracts.env not found at $CONTRACTS_ENV"
+
+log "Reading contract addresses from contracts.env..."
+# shellcheck source=/dev/null
+source "$CONTRACTS_ENV"
+log "  KRAIKEN=$KRAIKEN"
+log "  STAKE=$STAKE"
+log "  LIQUIDITY_MANAGER=$LIQUIDITY_MANAGER"
+
+# Phase 3: ponder must be healthy and fully indexed before running scenarios
+wait_healthy ponder "$PONDER_TIMEOUT"
+
+log "Waiting for Ponder to finish historical indexing (${PONDER_READY_TIMEOUT}s)..."
+ponder_ready=false
+ponder_deadline=$((SECONDS + PONDER_READY_TIMEOUT))
+while (( SECONDS < ponder_deadline )); do
+  http_code="$(curl -sf -o /dev/null -w '%{http_code}' --max-time 3 \
+    http://127.0.0.1:42069/ready 2>/dev/null || echo "000")"
+  if [[ "$http_code" == "200" ]]; then
+    log "  Ponder fully indexed"
+    ponder_ready=true
+    break
+  fi
+  sleep "$POLL_INTERVAL"
+done
+if [[ "$ponder_ready" != "true" ]]; then
+  log "WARNING: Ponder not fully indexed after ${PONDER_READY_TIMEOUT}s — continuing anyway"
+fi
+
+# ── Export stack endpoints for scenario scripts ────────────────────────
+export EVAL_PR_NUMBER="$PR_NUMBER"
+export EVAL_BRANCH="$PR_BRANCH"
+export EVAL_WORKTREE="$WORKTREE_DIR"
+export EVAL_RPC_URL="http://127.0.0.1:8545"
+export EVAL_GRAPHQL_URL="http://127.0.0.1:42069/graphql"
+export EVAL_WEBAPP_URL="http://127.0.0.1:5173"
+export EVAL_KRAIKEN="$KRAIKEN"
+export EVAL_STAKE="$STAKE"
+export EVAL_LIQUIDITY_MANAGER="$LIQUIDITY_MANAGER"
+
+log "Stack ready. Endpoints:"
+log "  RPC:     $EVAL_RPC_URL"
+log "  GraphQL: $EVAL_GRAPHQL_URL"
+log "  WebApp:  $EVAL_WEBAPP_URL"
+
+# ── Run scenario scripts ───────────────────────────────────────────────
+# nullglob is on only for this expansion: no matches => empty array.
+shopt -s nullglob
+scenario_scripts=("$SCENARIOS_DIR"/*.sh)
+shopt -u nullglob
+
+if (( ${#scenario_scripts[@]} == 0 )); then
+  log "No scenario scripts found in $SCENARIOS_DIR"
+  log "Gate PASSED (no scenarios)"
+  exit 0
+fi
+
+# Run every scenario under bash and tally failures instead of stopping early.
+failed_count=0
+for script_path in "${scenario_scripts[@]}"; do
+  [[ -f "$script_path" ]] || continue
+  script_name="${script_path##*/}"
+  log "--- Running scenario: $script_name ---"
+  if ! bash "$script_path"; then
+    log "  FAILED: $script_name"
+    failed_count=$((failed_count + 1))
+    continue
+  fi
+  log "  PASSED: $script_name"
+done
+
+(( failed_count == 0 )) \
+  || gate_fail "$failed_count of ${#scenario_scripts[@]} scenario(s) failed"
+log "Gate PASSED (${#scenario_scripts[@]} scenario(s))"
+exit 0

From 1d6dd8c628bdf867fd5d1e745b20cbbdacc53dd4 Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Sun, 1 Mar 2026 10:12:34 +0000
Subject: [PATCH 2/2] fix: address review feedback on evaluate.sh (#380)

- container_name(): derive separator from compose version (_COMPOSE_SEP),
  so v1 (underscores) and v2 (hyphens) both resolve the right container name
- Drop buggy grep fallback for JSON branch extraction; python3 path is
  sufficient and safe; grep cannot reliably target only head.ref in the
  nested Gitea response
- Validate KRAIKEN/STAKE/LIQUIDITY_MANAGER after sourcing contracts.env to
  catch renamed variables with a clear infra_error instead of a cryptic
  'unbound variable' abort
- wait_exited: handle Docker 'dead' state alongside 'exited' to fail fast
  instead of polling the full timeout
- Ponder /ready timeout is now infra_error (was a logged warning); scenarios
  must not run against a partially-indexed chain
- Avoid double git fetch: only fetch the specific branch if the earlier
  --prune fetch (fallback path) has not already retrieved all remote refs
- Use mktemp -u so git worktree add creates the directory itself, avoiding
  failures on older git versions that reject a pre-existing target path

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 scripts/harb-evaluator/evaluate.sh | 49 +++++++++++++++++++++++-------
 1 file changed, 38 insertions(+), 11 deletions(-)

diff --git a/scripts/harb-evaluator/evaluate.sh b/scripts/harb-evaluator/evaluate.sh
index 456fda8..40e48f6 100755
--- a/scripts/harb-evaluator/evaluate.sh
+++ b/scripts/harb-evaluator/evaluate.sh
@@ -12,6 +12,10 @@
 # Environment overrides:
 #   HARB_REPO_REMOTE   git remote to fetch from (default: origin)
 #   CODEBERG_REPO      Gitea/Codeberg repo path  (default: johba/harb)
+#
+# NOTE: host port isolation — docker-compose.yml binds fixed host ports
+# (8545, 42069, 5173, 8081, 5100). Concurrent evaluation runs on the same
+# host will collide on those ports. This script is designed for sequential use.
 
 set -euo pipefail
 
@@ -56,8 +60,12 @@ PR_NUMBER="$1"
 # ── Prerequisites ──────────────────────────────────────────────────────
 if docker compose version &>/dev/null 2>&1; then
   COMPOSE_CMD="docker compose"
+  # Compose v2 uses hyphens in container names: PROJECT-SERVICE-1
+  _COMPOSE_SEP="-"
 elif command -v docker-compose &>/dev/null; then
   COMPOSE_CMD="docker-compose"
+  # Compose v1 uses underscores in container names: PROJECT_SERVICE_1
+  _COMPOSE_SEP="_"
 else
   infra_error "docker compose not found. Install Docker with the compose plugin."
 fi
@@ -68,6 +76,7 @@ command -v curl &>/dev/null || infra_error "curl not found"
 # ── Fetch PR branch name ───────────────────────────────────────────────
 # Try the Codeberg REST API first (requires ~/.netrc with credentials).
 PR_BRANCH=""
+_FETCHED_ALL=false
 if [[ -f "$HOME/.netrc" ]]; then
   log "Resolving PR #$PR_NUMBER branch via Codeberg API..."
   api_json="$(curl --netrc --silent --max-time 10 \
@@ -77,10 +86,11 @@ if [[ -f "$HOME/.netrc" ]]; then
     if command -v jq &>/dev/null; then
       PR_BRANCH="$(echo "$api_json" | jq -r '.head.ref // empty' 2>/dev/null)" || true
     else
-      # jq not available — extract with python3 or grep+sed
+      # jq not available — use python3 for reliable nested key extraction.
+      # grep+sed is intentionally omitted: the Gitea response has multiple "ref"
+      # keys in nested objects and grep cannot safely target only head.ref.
       PR_BRANCH="$(echo "$api_json" | \
-        python3 -c "import json,sys; print(json.load(sys.stdin)['head']['ref'])" 2>/dev/null)" || \
-        PR_BRANCH="$(echo "$api_json" | grep -o '"ref":"[^"]*"' | head -n1 | sed 's/"ref":"//;s/"//')" || true
+        python3 -c "import json,sys; print(json.load(sys.stdin)['head']['ref'])" 2>/dev/null)" || true
     fi
   fi
 fi
@@ -90,6 +100,7 @@ if [[ -z "$PR_BRANCH" ]]; then
   log "API lookup skipped or failed; scanning remote branches..."
   cd "$REPO_ROOT"
   git fetch "$REPO_REMOTE" --prune 2>/dev/null || infra_error "git fetch $REPO_REMOTE failed"
+  _FETCHED_ALL=true
   PR_BRANCH="$(git branch -r 2>/dev/null | \
     grep -E "(fix|feat|chore|refactor|hotfix)/.*[-/]${PR_NUMBER}[^0-9]?$|issue-${PR_NUMBER}$" | \
-    head -n1 | sed 's|.*/||' | tr -d ' ')" || true
+    head -n1 | sed "s|^[[:space:]]*${REPO_REMOTE}/||" | tr -d ' ')" || true  # strip only the remote name; keep the fix/... prefix
@@ -99,7 +110,10 @@ fi
 log "PR #$PR_NUMBER => branch: $PR_BRANCH"
 
 # ── Create isolated worktree ───────────────────────────────────────────
-WORKTREE_DIR="$(mktemp -d /tmp/harb-eval-${PR_NUMBER}-XXXXXX)"
+# Use mktemp -u to generate a unique path without creating the directory;
+# git worktree add creates it. This avoids failures on older git that
+# rejects a pre-existing (even empty) target directory.
+WORKTREE_DIR="$(mktemp -u /tmp/harb-eval-${PR_NUMBER}-XXXXXX)"
 # Use a project name that is unique per PR and safe for Docker labels.
 COMPOSE_PROJECT="harb-eval-${PR_NUMBER}"
 
@@ -117,10 +131,12 @@ cleanup() {
 }
 trap cleanup EXIT INT TERM
 
-# Ensure the branch is locally available before adding the worktree.
+# Fetch the specific branch only if we haven't already fetched everything above.
 cd "$REPO_ROOT"
-git fetch "$REPO_REMOTE" "$PR_BRANCH" 2>/dev/null || \
-  infra_error "Could not fetch branch '$PR_BRANCH' from $REPO_REMOTE"
+if [[ "$_FETCHED_ALL" != "true" ]]; then
+  git fetch "$REPO_REMOTE" "$PR_BRANCH" 2>/dev/null || \
+    infra_error "Could not fetch branch '$PR_BRANCH' from $REPO_REMOTE"
+fi
 
 log "Creating worktree at $WORKTREE_DIR (branch: $PR_BRANCH)..."
 git worktree add "$WORKTREE_DIR" "remotes/$REPO_REMOTE/$PR_BRANCH" \
@@ -137,8 +153,9 @@ log "Starting containerised stack (project: $COMPOSE_PROJECT)..."
 $COMPOSE_CMD -p "$COMPOSE_PROJECT" up -d \
   || infra_error "docker compose up failed"
 
-# Helper: get status/health of a named container in this project.
-container_name() { echo "${COMPOSE_PROJECT}-$1-1"; }
+# Helper: resolve the container name for a service in this project.
+# Compose v2 uses hyphens (PROJECT-SERVICE-1); v1 uses underscores (PROJECT_SERVICE_1).
+container_name() { echo "${COMPOSE_PROJECT}${_COMPOSE_SEP}$1${_COMPOSE_SEP}1"; }
 
 wait_healthy() {
   local service="$1" timeout="$2"
@@ -168,7 +185,7 @@ wait_exited() {
   while (( SECONDS < deadline )); do
     local status
     status="$(docker inspect --format='{{.State.Status}}' "$container" 2>/dev/null || echo "missing")"
-    if [[ "$status" == "exited" ]]; then
+    if [[ "$status" == "exited" || "$status" == "dead" ]]; then
       local exit_code
       exit_code="$(docker inspect --format='{{.State.ExitCode}}' "$container" 2>/dev/null || echo "1")"
       if [[ "$exit_code" != "0" ]]; then
@@ -198,6 +215,16 @@ CONTRACTS_ENV="$WORKTREE_DIR/tmp/containers/contracts.env"
 log "Reading contract addresses from contracts.env..."
 # shellcheck source=/dev/null
 source "$CONTRACTS_ENV"
+
+# Validate expected variables after sourcing — guards against set -u crashes
+# if a future bootstrap refactor renames any of these.
+KRAIKEN="${KRAIKEN:-}"
+STAKE="${STAKE:-}"
+LIQUIDITY_MANAGER="${LIQUIDITY_MANAGER:-}"
+[[ -n "$KRAIKEN" ]]           || infra_error "KRAIKEN not set in contracts.env"
+[[ -n "$STAKE" ]]             || infra_error "STAKE not set in contracts.env"
+[[ -n "$LIQUIDITY_MANAGER" ]] || infra_error "LIQUIDITY_MANAGER not set in contracts.env"
+
 log "  KRAIKEN=$KRAIKEN"
 log "  STAKE=$STAKE"
 log "  LIQUIDITY_MANAGER=$LIQUIDITY_MANAGER"
@@ -219,7 +246,7 @@ while (( SECONDS < ponder_deadline )); do
   sleep "$POLL_INTERVAL"
 done
 if [[ "$ponder_ready" != "true" ]]; then
-  log "WARNING: Ponder not fully indexed after ${PONDER_READY_TIMEOUT}s — continuing anyway"
+  infra_error "Ponder did not finish indexing within ${PONDER_READY_TIMEOUT}s"
 fi
 
 # ── Export stack endpoints for scenario scripts ────────────────────────