From 69f6a87e2007949ad0b15150f54517a5d56ec823 Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Tue, 3 Mar 2026 19:57:34 +0000
Subject: [PATCH 1/4] Move holdout scenarios to separate repo

- Updated holdout.config.ts to use HOLDOUT_SCENARIOS_DIR env var
- Modified evaluate.sh to clone harb-holdout-scenarios repo at runtime
- Deleted scripts/harb-evaluator/scenarios/ directory
- Added .holdout-scenarios/ to .gitignore
- Holdout scenarios are now cloned into .holdout-scenarios/ during evaluation
- This prevents the dev-agent from seeing the holdout test set
---
 .gitignore                                    |  3 +
 scripts/harb-evaluator/evaluate.sh            | 12 +++
 scripts/harb-evaluator/holdout.config.ts      | 14 ++--
 .../sovereign-exit/always-leave.spec.ts       | 80 -------------------
 4 files changed, 23 insertions(+), 86 deletions(-)
 delete mode 100644 scripts/harb-evaluator/scenarios/sovereign-exit/always-leave.spec.ts

diff --git a/.gitignore b/.gitignore
index 0a28df3..5fece8b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -36,3 +36,6 @@ services/ponder/.ponder/
 # Temporary files
 /tmp/
 logs/
+
+# Holdout scenarios (cloned at runtime by evaluate.sh)
+.holdout-scenarios/
diff --git a/scripts/harb-evaluator/evaluate.sh b/scripts/harb-evaluator/evaluate.sh
index 52509ff..e691637 100755
--- a/scripts/harb-evaluator/evaluate.sh
+++ b/scripts/harb-evaluator/evaluate.sh
@@ -163,6 +163,18 @@ log "Installing Playwright browser binaries..."
 (cd "$WORKTREE_DIR" && npx playwright install chromium) \
   || infra_error "playwright install chromium failed"
 
+# ── Clone holdout scenarios ────────────────────────────────────────────
+# The holdout scenarios live in a separate repo so the dev-agent cannot
+# see them. Clone into .holdout-scenarios/ inside the worktree.
+readonly HOLDOUT_REPO="ssh://git@codeberg.org/johba/harb-holdout-scenarios.git"
+readonly HOLDOUT_DIR="$WORKTREE_DIR/.holdout-scenarios"
+log "Cloning holdout scenarios from $HOLDOUT_REPO..."
+git clone --quiet "$HOLDOUT_REPO" "$HOLDOUT_DIR" \
+  || infra_error "Failed to clone holdout scenarios repo"
+
+# Export the scenarios directory for holdout.config.ts
+export HOLDOUT_SCENARIOS_DIR=".holdout-scenarios/scenarios"
+
 # ── Boot the stack ─────────────────────────────────────────────────────
 cd "$WORKTREE_DIR"
 log "Starting containerised stack (project: $COMPOSE_PROJECT)..."
diff --git a/scripts/harb-evaluator/holdout.config.ts b/scripts/harb-evaluator/holdout.config.ts
index e393c52..1409390 100644
--- a/scripts/harb-evaluator/holdout.config.ts
+++ b/scripts/harb-evaluator/holdout.config.ts
@@ -3,19 +3,21 @@ import { defineConfig, devices } from '@playwright/test';
 /**
  * Playwright config for holdout scenarios.
  *
- * Holdout specs live under scripts/harb-evaluator/scenarios/ and reuse the
- * existing tests/setup/ infrastructure (wallet-provider, stack, navigate).
+ * Holdout specs are cloned from the separate harb-holdout-scenarios repo
+ * into .holdout-scenarios/ by evaluate.sh and reuse the existing tests/setup/
+ * infrastructure (wallet-provider, stack, navigate).
  *
  * The evaluator boots the stack first, then runs:
  *   npx playwright test --config scripts/harb-evaluator/holdout.config.ts
  *
  * Required env vars (set by evaluate.sh):
- *   STACK_RPC_URL     – Anvil JSON-RPC endpoint
- *   STACK_WEBAPP_URL  – Vite dev server URL
- *   STACK_GRAPHQL_URL – Ponder GraphQL endpoint
+ *   STACK_RPC_URL            – Anvil JSON-RPC endpoint
+ *   STACK_WEBAPP_URL         – Vite dev server URL
+ *   STACK_GRAPHQL_URL        – Ponder GraphQL endpoint
+ *   HOLDOUT_SCENARIOS_DIR    – Path to cloned scenarios (default: scripts/harb-evaluator/scenarios)
  */
 export default defineConfig({
-  testDir: './scenarios',
+  testDir: process.env.HOLDOUT_SCENARIOS_DIR ?? './scenarios',
   fullyParallel: false,
   // evaluate.sh sets CI=true before invoking playwright, so forbidOnly is always
   // active in the evaluator context. Accidental test.only() in any scenario file
diff --git a/scripts/harb-evaluator/scenarios/sovereign-exit/always-leave.spec.ts b/scripts/harb-evaluator/scenarios/sovereign-exit/always-leave.spec.ts
deleted file mode 100644
index 0351b8c..0000000
--- a/scripts/harb-evaluator/scenarios/sovereign-exit/always-leave.spec.ts
+++ /dev/null
@@ -1,80 +0,0 @@
-/**
- * Holdout scenario: sovereign-exit / always-leave
- *
- * Verifies the core protocol invariant: a user can ALWAYS exit their position
- * by buying KRK through the in-app swap widget and then selling it back.
- *
- * Reuses tests/setup/ infrastructure and the shared helpers in
- * scripts/harb-evaluator/helpers/ — no inline wallet, swap, or balance logic.
- *
- * Account 0 from the Anvil test mnemonic is used (same as e2e tests).
- * Deploy scripts also use Account 0, but each test run gets a fresh Anvil stack,
- * so no collision occurs.
- */
-import { expect, test } from '@playwright/test';
-import { parseEther, Wallet } from 'ethers';
-import { createWalletContext } from '../../../../tests/setup/wallet-provider';
-import { getStackConfig } from '../../../../tests/setup/stack';
-import { connectWallet, getKrkBalance } from '../../helpers/wallet';
-import { buyKrk, sellAllKrk } from '../../helpers/swap';
-
-// Anvil account 0 — same as e2e tests (deploy uses it but state is reset per stack)
-const PK = '0xac0974bec39a17e36ba4a6b4d238ff944bacb478cbed5efcae784d7bf4f2ff80';
-const ACCOUNT_ADDRESS = new Wallet(PK).address;
-
-test('I can always leave', async ({ browser }) => {
-  const config = getStackConfig();
-  const ctx = await createWalletContext(browser, {
-    privateKey: PK,
-    rpcUrl: config.rpcUrl,
-  });
-  const page = await ctx.newPage();
-
-  page.on('console', msg => console.log(`[BROWSER] ${msg.type()}: ${msg.text()}`));
-  page.on('pageerror', err => console.log(`[BROWSER ERROR] ${err.message}`));
-
-  try {
-    // ── 1. Load the web app ──────────────────────────────────────────────
-    console.log('[TEST] Loading web app...');
-    await page.goto(`${config.webAppUrl}/app/`, { waitUntil: 'domcontentloaded' });
-    await expect(page.locator('.navbar-title').first()).toBeVisible({ timeout: 30_000 });
-
-    // ── 2. Connect wallet via the UI ─────────────────────────────────────
-    console.log('[TEST] Connecting wallet...');
-    await connectWallet(page);
-
-    // ── 3. Buy KRK via the get-krk page swap widget ───────────────────────
-    const krkBefore = await getKrkBalance(config.rpcUrl, config.contracts.Kraiken, ACCOUNT_ADDRESS);
-    console.log(`[TEST] KRK balance before buy: ${krkBefore}`);
-
-    await buyKrk(page, '0.1');
-
-    const krkAfterBuy = await getKrkBalance(config.rpcUrl, config.contracts.Kraiken, ACCOUNT_ADDRESS);
-    console.log(`[TEST] KRK balance after buy: ${krkAfterBuy}`);
-    expect(krkAfterBuy).toBeGreaterThan(krkBefore);
-    console.log('[TEST] ✅ KRK received');
-
-    // ── 4. Sell all KRK back (sovereign exit) ────────────────────────────
-    const wethReceived = await sellAllKrk(page, {
-      rpcUrl: config.rpcUrl,
-      krkAddress: config.contracts.Kraiken,
-      accountAddress: ACCOUNT_ADDRESS,
-    });
-
-    // ── 5. Assert KRK was sold ────────────────────────────────────────────
-    const krkAfterSell = await getKrkBalance(config.rpcUrl, config.contracts.Kraiken, ACCOUNT_ADDRESS);
-    console.log(`[TEST] KRK balance after sell: ${krkAfterSell}`);
-    expect(krkAfterSell).toBeLessThan(krkAfterBuy);
-    console.log('[TEST] ✅ Sovereign exit confirmed: KRK sold back to WETH');
-
-    // ── 6. Assert reasonable slippage (at least 90% of ETH spent) ─────────
-    const ethSpent = parseEther('0.1');
-    const minExpected = parseEther('0.09'); // 90% of 0.1 ETH
-    expect(wethReceived).toBeGreaterThanOrEqual(minExpected);
-    const slippagePercent = ((Number(wethReceived) / Number(ethSpent)) * 100).toFixed(2);
-    console.log(`[TEST] ✅ Reasonable slippage: received ${wethReceived} WETH for 0.1 ETH spent (${slippagePercent}%)`);
-
-  } finally {
-    await ctx.close();
-  }
-});

From f6fe37dcc01eab0a151405d89848e501b2c91d40 Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Tue, 3 Mar 2026 20:59:32 +0000
Subject: [PATCH 2/4] fix: address PR #438 review findings

- Export HOLDOUT_SCENARIOS_DIR as an absolute path (Playwright resolves a relative testDir against the config file's directory, not the working directory)
- Remove dead SCENARIOS_DIR variable
- Replace fallback with explicit error in holdout.config.ts
- Add SSH key requirement comment
---
 scripts/harb-evaluator/evaluate.sh       |  5 +++--
 scripts/harb-evaluator/holdout.config.ts | 10 ++++++++--
 2 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/scripts/harb-evaluator/evaluate.sh b/scripts/harb-evaluator/evaluate.sh
index e691637..303c75e 100755
--- a/scripts/harb-evaluator/evaluate.sh
+++ b/scripts/harb-evaluator/evaluate.sh
@@ -24,7 +24,6 @@ readonly REPO_REMOTE="${HARB_REPO_REMOTE:-origin}"
 readonly CODEBERG_REPO="${CODEBERG_REPO:-johba/harb}"
 readonly REPO_ROOT="$(cd "$(dirname "$0")/../.." && pwd)"
 readonly EVALUATOR_DIR="$(cd "$(dirname "$0")" && pwd)"
-readonly SCENARIOS_DIR="$EVALUATOR_DIR/scenarios"
 
 readonly ANVIL_TIMEOUT=120       # seconds to wait for anvil healthy
 readonly BOOTSTRAP_TIMEOUT=180   # seconds to wait for bootstrap container exit
@@ -166,6 +165,7 @@ log "Installing Playwright browser binaries..."
 # ── Clone holdout scenarios ────────────────────────────────────────────
 # The holdout scenarios live in a separate repo so the dev-agent cannot
 # see them. Clone into .holdout-scenarios/ inside the worktree.
+# NOTE: Requires an SSH key with read access to johba/harb-holdout-scenarios on Codeberg.
 readonly HOLDOUT_REPO="ssh://git@codeberg.org/johba/harb-holdout-scenarios.git"
 readonly HOLDOUT_DIR="$WORKTREE_DIR/.holdout-scenarios"
 log "Cloning holdout scenarios from $HOLDOUT_REPO..."
@@ -173,7 +173,8 @@ git clone --quiet "$HOLDOUT_REPO" "$HOLDOUT_DIR" \
   || infra_error "Failed to clone holdout scenarios repo"
 
 # Export the scenarios directory for holdout.config.ts
-export HOLDOUT_SCENARIOS_DIR=".holdout-scenarios/scenarios"
+# Must be an absolute path — Playwright resolves a relative testDir against the config file's directory.
+export HOLDOUT_SCENARIOS_DIR="$HOLDOUT_DIR/scenarios"
 
 # ── Boot the stack ─────────────────────────────────────────────────────
 cd "$WORKTREE_DIR"
diff --git a/scripts/harb-evaluator/holdout.config.ts b/scripts/harb-evaluator/holdout.config.ts
index 1409390..562fef1 100644
--- a/scripts/harb-evaluator/holdout.config.ts
+++ b/scripts/harb-evaluator/holdout.config.ts
@@ -14,10 +14,16 @@ import { defineConfig, devices } from '@playwright/test';
  *   STACK_RPC_URL            – Anvil JSON-RPC endpoint
  *   STACK_WEBAPP_URL         – Vite dev server URL
  *   STACK_GRAPHQL_URL        – Ponder GraphQL endpoint
- *   HOLDOUT_SCENARIOS_DIR    – Path to cloned scenarios (default: scripts/harb-evaluator/scenarios)
+ *   HOLDOUT_SCENARIOS_DIR    – Path to cloned scenarios
  */
+
+const scenariosDir = process.env.HOLDOUT_SCENARIOS_DIR;
+if (!scenariosDir) {
+  throw new Error('HOLDOUT_SCENARIOS_DIR env var required — run via evaluate.sh');
+}
+
 export default defineConfig({
-  testDir: process.env.HOLDOUT_SCENARIOS_DIR ?? './scenarios',
+  testDir: scenariosDir,
   fullyParallel: false,
   // evaluate.sh sets CI=true before invoking playwright, so forbidOnly is always
   // active in the evaluator context. Accidental test.only() in any scenario file

From 106521af2ec15d39c3006a384a78a4a54db0671c Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Wed, 4 Mar 2026 06:16:16 +0000
Subject: [PATCH 3/4] ci: retrigger after Codeberg OAuth refresh


From 7fc47d739a82fbaa1d28e95eb4e658688b86e3f6 Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Wed, 4 Mar 2026 08:09:26 +0000
Subject: [PATCH 4/4] ci: retrigger