harb/scripts/harb-evaluator/helpers/report.ts
johba a937f1cb4c docs: scope engineering principles to infra/tests, not frontend polling (#470)
Clarifies that the event-driven engineering principles apply to infrastructure (Docker, scripts, startup/teardown) and test/scenario execution — NOT frontend HTTP API polling.

Frontend polling (e.g. LiveStats → Ponder GraphQL every 30s) is fine. The scalability solution is caching at the proxy layer (`Cache-Control` headers via Caddy), not WebSocket subscriptions.

Relates to #447

Co-authored-by: openhands <openhands@all-hands.dev>
Reviewed-on: https://codeberg.org/johba/harb/pulls/470
2026-03-06 11:17:50 +01:00

177 lines
5.4 KiB
TypeScript

/**
* Holdout scenario reporting helper.
*
* Each scenario calls `recordMetric()` during execution, then `writeReport()`
* at the end. On pass, the report is informational. On fail, it captures
* exactly what went wrong with full numbers for post-mortem analysis.
*
* Reports are written to `test-results/holdout-reports/` as JSON + markdown.
*/
import { writeFileSync, mkdirSync, readFileSync, existsSync } from 'fs';
import { join, dirname } from 'path';
export interface ScenarioMetric {
label: string;
value: string | number | bigint;
unit?: string;
}
export interface ScenarioReport {
scenario: string;
domain: string;
timestamp: string;
durationMs: number;
passed: boolean;
/** The invariant being tested (one sentence) */
invariant: string;
/** Metrics collected during the run */
metrics: ScenarioMetric[];
/** If failed: what went wrong */
failureReason?: string;
/** Possible causes from the scenario spec's "Why this might fail" */
possibleCauses?: string[];
/** Screenshot paths captured during the run */
screenshots?: string[];
}
const REPORT_DIR = 'test-results/holdout-reports';
/**
* Create a report builder for a scenario. Call methods to add metrics,
* then call `.write()` to persist.
*/
export function createReportBuilder(scenario: string, domain: string, invariant: string) {
const startTime = Date.now();
const metrics: ScenarioMetric[] = [];
const screenshots: string[] = [];
let possibleCauses: string[] = [];
return {
/** Record a metric during test execution */
metric(label: string, value: string | number | bigint, unit?: string) {
const displayValue = typeof value === 'bigint' ? value.toString() : value;
metrics.push({ label, value: displayValue, unit });
console.log(`[METRIC] ${label}: ${displayValue}${unit ? ' ' + unit : ''}`);
},
/** Add a screenshot path */
screenshot(path: string) {
screenshots.push(path);
},
/** Set possible failure causes (from scenario .md) */
setCauses(causes: string[]) {
possibleCauses = causes;
},
/** Write the report (call in finally block) */
write(passed: boolean, failureReason?: string): ScenarioReport {
const report: ScenarioReport = {
scenario,
domain,
timestamp: new Date().toISOString(),
durationMs: Date.now() - startTime,
passed,
invariant,
metrics: metrics.map(m => ({
...m,
value: typeof m.value === 'bigint' ? m.value.toString() : m.value,
})),
failureReason,
possibleCauses: passed ? undefined : possibleCauses,
screenshots,
};
mkdirSync(REPORT_DIR, { recursive: true });
// JSON report
const safeName = scenario.replace(/\//g, '-');
const jsonPath = join(REPORT_DIR, `${safeName}.json`);
writeFileSync(jsonPath, JSON.stringify(report, null, 2));
// Markdown report (human-readable)
const mdPath = join(REPORT_DIR, `${safeName}.md`);
writeFileSync(mdPath, formatMarkdown(report));
// Append to aggregate results
appendToAggregate(report);
console.log(`[REPORT] Written to ${jsonPath}`);
return report;
},
};
}
function formatMarkdown(r: ScenarioReport): string {
const status = r.passed ? '✅ PASSED' : '❌ FAILED';
const lines = [
`# ${r.scenario}${status}`,
'',
`**Domain:** ${r.domain}`,
`**Invariant:** ${r.invariant}`,
`**Duration:** ${(r.durationMs / 1000).toFixed(1)}s`,
`**Timestamp:** ${r.timestamp}`,
'',
'## Metrics',
'',
'| Metric | Value | Unit |',
'|--------|-------|------|',
];
for (const m of r.metrics) {
const val = typeof m.value === 'bigint' ? m.value.toString() : String(m.value);
lines.push(`| ${m.label} | ${val} | ${m.unit ?? ''} |`);
}
if (!r.passed) {
lines.push('', '## Failure', '', `**Reason:** ${r.failureReason ?? 'Unknown'}`);
if (r.possibleCauses?.length) {
lines.push('', '### Possible Causes (from scenario spec)', '');
for (const c of r.possibleCauses) {
lines.push(`- ${c}`);
}
}
}
if (r.screenshots?.length) {
lines.push('', '## Screenshots', '');
for (const s of r.screenshots) {
lines.push(`- ${s}`);
}
}
return lines.join('\n') + '\n';
}
function appendToAggregate(r: ScenarioReport): void {
const aggPath = join(REPORT_DIR, 'RESULTS.md');
const status = r.passed ? '✅' : '❌';
const duration = (r.durationMs / 1000).toFixed(1);
// Find key metrics to show in summary
const profitMetric = r.metrics.find(m =>
m.label.toLowerCase().includes('profit') ||
m.label.toLowerCase().includes('loss') ||
m.label.toLowerCase().includes('net'),
);
const summary = profitMetric
? `${profitMetric.label}: ${profitMetric.value}${profitMetric.unit ? ' ' + profitMetric.unit : ''}`
: '';
const line = `| ${status} | ${r.scenario} | ${duration}s | ${r.invariant}${summary} |`;
if (!existsSync(aggPath)) {
writeFileSync(aggPath, [
`# Holdout Scenario Results — ${new Date().toISOString().slice(0, 10)}`,
'',
'| Status | Scenario | Time | Summary |',
'|--------|----------|------|---------|',
line,
'',
].join('\n'));
} else {
const content = readFileSync(aggPath, 'utf-8');
// Insert before the trailing newline
writeFileSync(aggPath, content.trimEnd() + '\n' + line + '\n');
}
}