Move holdout scenarios to separate repo

- Updated holdout.config.ts to use HOLDOUT_SCENARIOS_DIR env var
- Modified evaluate.sh to clone harb-holdout-scenarios repo at runtime
- Deleted scripts/harb-evaluator/scenarios/ directory
- Added .holdout-scenarios/ to .gitignore
- Holdout scenarios are now cloned into .holdout-scenarios/ during evaluation
- This prevents dev-agent from seeing the holdout test set
This commit is contained in:
openhands 2026-03-03 19:57:34 +00:00
parent b2594a28b3
commit 69f6a87e20
4 changed files with 23 additions and 86 deletions

View file

@ -163,6 +163,18 @@ log "Installing Playwright browser binaries..."
(cd "$WORKTREE_DIR" && npx playwright install chromium) \
|| infra_error "playwright install chromium failed"
# ── Clone holdout scenarios ────────────────────────────────────────────
# The holdout scenarios live in a separate repo so the dev-agent cannot
# see them. Clone into .holdout-scenarios/ inside the worktree.
readonly HOLDOUT_REPO="ssh://git@codeberg.org/johba/harb-holdout-scenarios.git"
readonly HOLDOUT_DIR="$WORKTREE_DIR/.holdout-scenarios"
log "Cloning holdout scenarios from $HOLDOUT_REPO..."
git clone --quiet "$HOLDOUT_REPO" "$HOLDOUT_DIR" \
  || infra_error "Failed to clone holdout scenarios repo"
# Fail early with an infra error (rather than a confusing "no tests found"
# from Playwright) if the cloned repo does not have the expected layout.
[ -d "$HOLDOUT_DIR/scenarios" ] \
  || infra_error "Holdout repo has no scenarios/ directory: $HOLDOUT_DIR/scenarios"
# Export the scenarios directory for holdout.config.ts.
# Playwright resolves a relative testDir against the directory that contains
# the config file (scripts/harb-evaluator/), NOT the current working
# directory, so a relative ".holdout-scenarios/scenarios" would point at a
# path that does not exist. Export the full path under the worktree instead.
# NOTE(review): assumes WORKTREE_DIR is an absolute path — confirm.
export HOLDOUT_SCENARIOS_DIR="$HOLDOUT_DIR/scenarios"
# ── Boot the stack ─────────────────────────────────────────────────────
cd "$WORKTREE_DIR"
log "Starting containerised stack (project: $COMPOSE_PROJECT)..."

View file

@ -3,19 +3,21 @@ import { defineConfig, devices } from '@playwright/test';
/**
* Playwright config for holdout scenarios.
*
* Holdout specs live under scripts/harb-evaluator/scenarios/ and reuse the
* existing tests/setup/ infrastructure (wallet-provider, stack, navigate).
* Holdout specs are cloned from the separate harb-holdout-scenarios repo
* into .holdout-scenarios/ by evaluate.sh and reuse the existing tests/setup/
* infrastructure (wallet-provider, stack, navigate).
*
* The evaluator boots the stack first, then runs:
* npx playwright test --config scripts/harb-evaluator/holdout.config.ts
*
* Required env vars (set by evaluate.sh):
* STACK_RPC_URL Anvil JSON-RPC endpoint
* STACK_WEBAPP_URL Vite dev server URL
* STACK_GRAPHQL_URL Ponder GraphQL endpoint
* STACK_RPC_URL Anvil JSON-RPC endpoint
* STACK_WEBAPP_URL Vite dev server URL
* STACK_GRAPHQL_URL Ponder GraphQL endpoint
* HOLDOUT_SCENARIOS_DIR Path to cloned scenarios; a relative path is resolved against this config file's directory (default: ./scenarios, i.e. scripts/harb-evaluator/scenarios)
*/
export default defineConfig({
testDir: './scenarios',
testDir: process.env.HOLDOUT_SCENARIOS_DIR ?? './scenarios',
fullyParallel: false,
// evaluate.sh sets CI=true before invoking playwright, so forbidOnly is always
// active in the evaluator context. Accidental test.only() in any scenario file

View file

@ -1,80 +0,0 @@
/**
* Holdout scenario: sovereign-exit / always-leave
*
* Verifies the core protocol invariant: a user can ALWAYS exit their position
* by buying KRK through the in-app swap widget and then selling it back.
*
* Reuses tests/setup/ infrastructure and the shared helpers in
* scripts/harb-evaluator/helpers/ — no inline wallet, swap, or balance logic.
*
* Account 0 from the Anvil test mnemonic is used (same as e2e tests).
* Deploy scripts also use Account 0, but each test run gets a fresh Anvil stack,
* so no collision occurs.
*/
import { expect, test } from '@playwright/test';
import { parseEther, Wallet } from 'ethers';
import { createWalletContext } from '../../../../tests/setup/wallet-provider';
import { getStackConfig } from '../../../../tests/setup/stack';
import { connectWallet, getKrkBalance } from '../../helpers/wallet';
import { buyKrk, sellAllKrk } from '../../helpers/swap';
// Private key of Anvil test account 0 — the same account the e2e tests use.
// Deploy scripts use it as well, but state is reset for every stack.
const PK = '0xac0974bec39a17e36ba4a6b4d238ff944bacb478cbed5efcae784d7bf4f2ff80';
// Address derived from PK; used for the on-chain balance lookups below.
const { address: ACCOUNT_ADDRESS } = new Wallet(PK);
/**
 * Sovereign-exit scenario: buy KRK through the app, sell it all back, and
 * check that the KRK balance rose after the buy, fell after the sell, and
 * that the WETH received is at least 90% of the 0.1 ETH spent.
 *
 * The wallet browser context is always closed, even when an assertion fails.
 */
test('I can always leave', async ({ browser }) => {
  const stack = getStackConfig();
  const walletContext = await createWalletContext(browser, {
    privateKey: PK,
    rpcUrl: stack.rpcUrl,
  });
  const appPage = await walletContext.newPage();

  // Mirror browser console output and uncaught page errors into the test log.
  appPage.on('console', (entry) => {
    console.log(`[BROWSER] ${entry.type()}: ${entry.text()}`);
  });
  appPage.on('pageerror', (failure) => {
    console.log(`[BROWSER ERROR] ${failure.message}`);
  });

  // Reads the KRK balance of ACCOUNT_ADDRESS. Stack fields are looked up on
  // every call so that nothing is dereferenced before the try block starts.
  const readKrkBalance = () =>
    getKrkBalance(stack.rpcUrl, stack.contracts.Kraiken, ACCOUNT_ADDRESS);

  try {
    // Step 1: open the web app and wait for the navbar title to appear.
    console.log('[TEST] Loading web app...');
    await appPage.goto(`${stack.webAppUrl}/app/`, { waitUntil: 'domcontentloaded' });
    const navbarTitle = appPage.locator('.navbar-title').first();
    await expect(navbarTitle).toBeVisible({ timeout: 30_000 });

    // Step 2: connect the wallet through the UI.
    console.log('[TEST] Connecting wallet...');
    await connectWallet(appPage);

    // Step 3: buy KRK and confirm the balance went up.
    const krkBefore = await readKrkBalance();
    console.log(`[TEST] KRK balance before buy: ${krkBefore}`);

    await buyKrk(appPage, '0.1');

    const krkAfterBuy = await readKrkBalance();
    console.log(`[TEST] KRK balance after buy: ${krkAfterBuy}`);
    expect(krkAfterBuy).toBeGreaterThan(krkBefore);
    console.log('[TEST] ✅ KRK received');

    // Step 4: sell everything back — the sovereign exit itself.
    const wethReceived = await sellAllKrk(appPage, {
      rpcUrl: stack.rpcUrl,
      krkAddress: stack.contracts.Kraiken,
      accountAddress: ACCOUNT_ADDRESS,
    });

    // Step 5: the KRK balance must have dropped relative to the post-buy level.
    const krkAfterSell = await readKrkBalance();
    console.log(`[TEST] KRK balance after sell: ${krkAfterSell}`);
    expect(krkAfterSell).toBeLessThan(krkAfterBuy);
    console.log('[TEST] ✅ Sovereign exit confirmed: KRK sold back to WETH');

    // Step 6: slippage bound — at least 90% of the 0.1 ETH must come back.
    const ethSpent = parseEther('0.1');
    const minExpected = parseEther('0.09'); // 90% of 0.1 ETH
    expect(wethReceived).toBeGreaterThanOrEqual(minExpected);

    // Number() conversion is for the log line only.
    const recoveredRatio = Number(wethReceived) / Number(ethSpent);
    const slippagePercent = (recoveredRatio * 100).toFixed(2);
    console.log(`[TEST] ✅ Reasonable slippage: received ${wethReceived} WETH for 0.1 ETH spent (${slippagePercent}%)`);
  } finally {
    await walletContext.close();
  }
});