harb/scripts/harb-evaluator/helpers/report.ts

/**
 * Holdout scenario reporting helper.
 *
 * Each scenario calls `recordMetric()` during execution, then `writeReport()`
 * at the end. On pass, the report is informational. On fail, it captures
 * exactly what went wrong with full numbers for post-mortem analysis.
 *
 * Reports are written to `test-results/holdout-reports/` as JSON + markdown.
 */
import { writeFileSync, mkdirSync, readFileSync, existsSync } from 'fs';
import { join, dirname } from 'path';

export interface ScenarioMetric {
  label: string;
  value: string | number | bigint;
  unit?: string;
}

export interface ScenarioReport {
  scenario: string;
  domain: string;
  timestamp: string;
  durationMs: number;
  passed: boolean;
  /** The invariant being tested (one sentence) */
  invariant: string;
  /** Metrics collected during the run */
  metrics: ScenarioMetric[];
  /** If failed: what went wrong */
  failureReason?: string;
  /** Possible causes from the scenario spec's "Why this might fail" */
  possibleCauses?: string[];
  /** Screenshot paths captured during the run */
  screenshots?: string[];
}

const REPORT_DIR = 'test-results/holdout-reports';

/**
 * Create a report builder for a scenario. Call methods to add metrics,
 * then call `.write()` to persist.
 */
export function createReportBuilder(scenario: string, domain: string, invariant: string) {
  const startTime = Date.now();
  const metrics: ScenarioMetric[] = [];
  const screenshots: string[] = [];
  let possibleCauses: string[] = [];

  return {
    /** Record a metric during test execution */
    metric(label: string, value: string | number | bigint, unit?: string) {
      const displayValue = typeof value === 'bigint' ? value.toString() : value;
      metrics.push({ label, value: displayValue, unit });
      console.log(`[METRIC] ${label}: ${displayValue}${unit ? ' ' + unit : ''}`);
    },

    /** Add a screenshot path */
    screenshot(path: string) {
      screenshots.push(path);
    },

    /** Set possible failure causes (from scenario .md) */
    setCauses(causes: string[]) {
      possibleCauses = causes;
    },

    /** Write the report (call in finally block) */
    write(passed: boolean, failureReason?: string): ScenarioReport {
      const report: ScenarioReport = {
        scenario,
        domain,
        timestamp: new Date().toISOString(),
        durationMs: Date.now() - startTime,
        passed,
        invariant,
        metrics: metrics.map(m => ({
          ...m,
          value: typeof m.value === 'bigint' ? m.value.toString() : m.value,
        })),
        failureReason,
        possibleCauses: passed ? undefined : possibleCauses,
        screenshots,
      };

      mkdirSync(REPORT_DIR, { recursive: true });

      // JSON report
      const safeName = scenario.replace(/\//g, '-');
      const jsonPath = join(REPORT_DIR, `${safeName}.json`);
      writeFileSync(jsonPath, JSON.stringify(report, null, 2));

      // Markdown report (human-readable)
      const mdPath = join(REPORT_DIR, `${safeName}.md`);
      writeFileSync(mdPath, formatMarkdown(report));

      // Append to aggregate results
      appendToAggregate(report);

      console.log(`[REPORT] Written to ${jsonPath}`);
      return report;
    },
  };
}

function formatMarkdown(r: ScenarioReport): string {
  const status = r.passed ? '✅ PASSED' : '❌ FAILED';
  const lines = [
    `# ${r.scenario} — ${status}`,
    '',
    `**Domain:** ${r.domain}`,
    `**Invariant:** ${r.invariant}`,
    `**Duration:** ${(r.durationMs / 1000).toFixed(1)}s`,
    `**Timestamp:** ${r.timestamp}`,
    '',
    '## Metrics',
    '',
    '| Metric | Value | Unit |',
    '|--------|-------|------|',
  ];

  for (const m of r.metrics) {
    const val = typeof m.value === 'bigint' ? m.value.toString() : String(m.value);
    lines.push(`| ${m.label} | ${val} | ${m.unit ?? ''} |`);
  }

  if (!r.passed) {
    lines.push('', '## Failure', '', `**Reason:** ${r.failureReason ?? 'Unknown'}`);
    if (r.possibleCauses?.length) {
      lines.push('', '### Possible Causes (from scenario spec)', '');
      for (const c of r.possibleCauses) {
        lines.push(`- ${c}`);
      }
    }
  }

  if (r.screenshots?.length) {
    lines.push('', '## Screenshots', '');
    for (const s of r.screenshots) {
      lines.push(`- ${s}`);
    }
  }

  return lines.join('\n') + '\n';
}

function appendToAggregate(r: ScenarioReport): void {
  const aggPath = join(REPORT_DIR, 'RESULTS.md');
  const status = r.passed ? '✅' : '❌';
  const duration = (r.durationMs / 1000).toFixed(1);

  // Find key metrics to show in summary
  const profitMetric = r.metrics.find(m =>
    m.label.toLowerCase().includes('profit') ||
    m.label.toLowerCase().includes('loss') ||
    m.label.toLowerCase().includes('net'),
  );
  const summary = profitMetric
    ? ` — ${profitMetric.label}: ${profitMetric.value}${profitMetric.unit ? ' ' + profitMetric.unit : ''}`
    : '';

  const line = `| ${status} | ${r.scenario} | ${duration}s | ${r.invariant}${summary} |`;

  if (!existsSync(aggPath)) {
    writeFileSync(aggPath, [
      `# Holdout Scenario Results — ${new Date().toISOString().slice(0, 10)}`,
      '',
      '| Status | Scenario | Time | Summary |',
      '|--------|----------|------|---------|',
      line,
      '',
    ].join('\n'));
  } else {
    const content = readFileSync(aggPath, 'utf-8');
    // Insert before the trailing newline
    writeFileSync(aggPath, content.trimEnd() + '\n' + line + '\n');
  }
}