diff --git a/STATE.md b/STATE.md
index 8ed2b90..2b001dd 100644
--- a/STATE.md
+++ b/STATE.md
@@ -38,3 +38,4 @@
 - [2026-03-15] add evolution run 8 champion to seed pool (#781)
 - [2026-03-15] fix FitnessEvaluator.t.sol broken on Base mainnet fork (#780)
 - [2026-03-15] No generic flag dispatch: only `token_value_inflation` is ever zero-rated (#723)
+- [2026-03-15] `llm`-origin entries in manifest have null fitness and no evaluation path (#724): evaluate-seeds.sh scores null-fitness seeds and writes results back to manifest.jsonl
diff --git a/tools/push3-evolution/evaluate-seeds.sh b/tools/push3-evolution/evaluate-seeds.sh
new file mode 100755
--- /dev/null
+++ b/tools/push3-evolution/evaluate-seeds.sh
@@ -0,0 +1,233 @@
+#!/usr/bin/env bash
+# =============================================================================
+# evaluate-seeds.sh — Score null-fitness manifest entries via fitness.sh
+#
+# Reads manifest.jsonl, finds every entry with fitness: null, runs fitness.sh
+# against the corresponding seed file, and writes the result back into
+# manifest.jsonl (atomic temp-file rename).
+#
+# Usage:
+#   ./tools/push3-evolution/evaluate-seeds.sh [--dry-run]
+#
+# Options:
+#   --dry-run   Print which seeds would be evaluated without running fitness.sh
+#
+# Environment:
+#   ANVIL_FORK_URL   Passed through to fitness.sh when Anvil is not already
+#                    running. Must point to a Base RPC endpoint.
+#
+# Exit codes:
+#   0  All null-fitness entries evaluated (or nothing to do).
+#   1  One or more evaluations failed (partial results may have been written).
+#   2  Infrastructure error (missing tool, manifest not found, etc.).
+# =============================================================================
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+SEEDS_DIR="$SCRIPT_DIR/seeds"
+MANIFEST="$SEEDS_DIR/manifest.jsonl"
+FITNESS_SH="$SCRIPT_DIR/fitness.sh"
+
+DRY_RUN=false
+
+# Temp files; both are removed by cleanup() on every exit path.
+SCORES_FILE=""
+MANIFEST_TMP=""
+
+# =============================================================================
+# Argument parsing
+# =============================================================================
+
+while [ $# -gt 0 ]; do
+  case "$1" in
+    --dry-run) DRY_RUN=true; shift ;;
+    *) echo "Usage: $0 [--dry-run]" >&2; exit 2 ;;
+  esac
+done
+
+# =============================================================================
+# Helpers
+# =============================================================================
+
+log() { echo " [evaluate-seeds] $*" >&2; }
+fail() { echo " [evaluate-seeds] ERROR: $*" >&2; exit 2; }
+
+# Remove whichever temp files currently exist (empty names are skipped).
+cleanup() {
+  [ -z "$SCORES_FILE" ] || rm -f -- "$SCORES_FILE"
+  [ -z "$MANIFEST_TMP" ] || rm -f -- "$MANIFEST_TMP"
+}
+trap cleanup EXIT
+
+# =============================================================================
+# Pre-flight checks
+# =============================================================================
+
+[ -f "$MANIFEST" ] || fail "manifest.jsonl not found at $MANIFEST"
+[ -f "$FITNESS_SH" ] || fail "fitness.sh not found at $FITNESS_SH"
+command -v python3 &>/dev/null || fail "python3 not found in PATH"
+
+# =============================================================================
+# Find null-fitness entries
+# =============================================================================
+
+# One seed filename per line, de-duplicated in manifest order. Entries without
+# a usable "file" value are skipped because there is nothing to evaluate. A
+# Python failure here (e.g. unreadable manifest) is reported as exit code 2.
+NULL_ENTRIES="$(python3 - "$MANIFEST" <<'PYEOF'
+import json, sys
+manifest_path = sys.argv[1]
+seen = set()
+with open(manifest_path) as f:
+    for line in f:
+        line = line.strip()
+        if not line:
+            continue
+        try:
+            d = json.loads(line)
+        except json.JSONDecodeError:
+            continue
+        if not isinstance(d, dict) or d.get('fitness') is not None:
+            continue
+        fname = d.get('file')
+        if isinstance(fname, str) and fname and fname not in seen:
+            seen.add(fname)
+            print(fname)
+PYEOF
+)" || fail "could not read null-fitness entries from $MANIFEST"
+
+if [ -z "$NULL_ENTRIES" ]; then
+  log "No null-fitness entries in manifest — nothing to do."
+  exit 0
+fi
+
+NULL_COUNT=$(printf '%s\n' "$NULL_ENTRIES" | grep -c '.')
+log "Found $NULL_COUNT null-fitness entry/entries: $(printf '%s\n' "$NULL_ENTRIES" | tr '\n' ' ')"
+
+if $DRY_RUN; then
+  echo "Dry run — would evaluate:"
+  while IFS= read -r fname; do
+    echo " $fname"
+  done <<< "$NULL_ENTRIES"
+  exit 0
+fi
+
+# =============================================================================
+# Evaluate each null-fitness seed and collect results
+# =============================================================================
+
+FAILURES=0
+
+# Scores are accumulated in a temp file as tab-separated "filename\tscore"
+# lines. Using a file (rather than a shell associative array embedded in a
+# heredoc) avoids injecting values into Python source code.
+SCORES_FILE="$(mktemp)"
+
+while IFS= read -r FNAME; do
+  [ -n "$FNAME" ] || continue
+  SEED_FILE="$SEEDS_DIR/$FNAME"
+  if [ ! -f "$SEED_FILE" ]; then
+    log "WARNING: seed file not found: $SEED_FILE — skipping"
+    FAILURES=$((FAILURES + 1))
+    continue
+  fi
+
+  log "Evaluating $FNAME …"
+  SCORE=""
+  FITNESS_EC=0
+  # stdin is redirected from /dev/null so that fitness.sh (or any tool it
+  # spawns) cannot consume the remaining filenames feeding this loop.
+  SCORE=$("$FITNESS_SH" "$SEED_FILE" </dev/null) || FITNESS_EC=$?
+
+  if [ "$FITNESS_EC" -eq 2 ]; then
+    # Exit code 2 = infra error (Anvil down, missing tool, etc.).
+    # All subsequent evaluations will fail for the same reason; abort early.
+    log "ERROR: fitness.sh reported infra failure (exit 2) for $FNAME — aborting"
+    exit 2
+  fi
+
+  if [ "$FITNESS_EC" -ne 0 ] || [ -z "$SCORE" ]; then
+    log "WARNING: fitness.sh failed for $FNAME (exit $FITNESS_EC) — skipping"
+    FAILURES=$((FAILURES + 1))
+    continue
+  fi
+
+  # The write-back step parses every score with int(); reject anything else
+  # here so one malformed output cannot discard all the other results.
+  if ! [[ "$SCORE" =~ ^[+-]?[0-9]+$ ]]; then
+    log "WARNING: fitness.sh printed a non-integer score for $FNAME — skipping"
+    FAILURES=$((FAILURES + 1))
+    continue
+  fi
+
+  log " $FNAME → fitness=$SCORE"
+  printf '%s\t%s\n' "$FNAME" "$SCORE" >> "$SCORES_FILE"
+done <<< "$NULL_ENTRIES"
+
+if [ ! -s "$SCORES_FILE" ]; then
+  log "No seeds were successfully evaluated."
+  exit 1
+fi
+
+# =============================================================================
+# Write results back to manifest.jsonl (atomic temp-file rename)
+# =============================================================================
+
+# Created next to the manifest so the final mv is a same-filesystem rename.
+MANIFEST_TMP="$(mktemp "${MANIFEST}.XXXXXX")"
+
+python3 - "$MANIFEST" "$MANIFEST_TMP" "$SCORES_FILE" <<'PYEOF' || fail "could not write updated manifest to $MANIFEST_TMP"
+import json, shutil, sys
+
+manifest_path = sys.argv[1]
+tmp_path = sys.argv[2]
+scores_path = sys.argv[3]
+
+# Load scores from the tab-separated file written by the shell loop.
+# Values are plain integers produced by fitness.sh — no shell expansion here.
+scores = {}
+with open(scores_path) as sf:
+    for line in sf:
+        line = line.rstrip('\n')
+        if '\t' in line:
+            fname, score = line.split('\t', 1)
+            scores[fname] = int(score)
+
+lines_out = []
+with open(manifest_path) as f:
+    for line in f:
+        stripped = line.rstrip('\n')
+        if not stripped:
+            continue
+        try:
+            d = json.loads(stripped)
+        except json.JSONDecodeError:
+            # Keep unparseable lines verbatim rather than dropping them.
+            lines_out.append(stripped)
+            continue
+        fname = d.get('file') if isinstance(d, dict) else None
+        if isinstance(fname, str) and fname in scores and d.get('fitness') is None:
+            d['fitness'] = scores[fname]
+        lines_out.append(json.dumps(d, separators=(',', ':')))
+
+with open(tmp_path, 'w') as f:
+    for line in lines_out:
+        f.write(line + '\n')
+
+# mktemp creates the file with mode 0600; carry over the manifest's mode so
+# the rename does not silently tighten its permissions.
+shutil.copymode(manifest_path, tmp_path)
+PYEOF
+
+mv -- "$MANIFEST_TMP" "$MANIFEST"
+MANIFEST_TMP=""   # renamed into place; nothing left for cleanup() to remove
+
+EVALUATED=$(wc -l < "$SCORES_FILE" | tr -d ' ')
+log "Done. Evaluated $EVALUATED seed(s); $FAILURES failure(s)."
+log "Results written to $MANIFEST"
+
+if [ "$FAILURES" -gt 0 ]; then
+  exit 1
+fi
+exit 0
diff --git a/tools/push3-evolution/evolve.sh b/tools/push3-evolution/evolve.sh
index 856954b..4937c09 100755
--- a/tools/push3-evolution/evolve.sh
+++ b/tools/push3-evolution/evolve.sh
@@ -342,6 +342,12 @@ if [ "$DIVERSE_SEEDS" = "true" ]; then
 
   # Build a random sample list from the pool in one pass (also determines if
   # the pool has any usable entries, avoiding a second manifest parse).
+  #
+  # NOTE: seeds with fitness: null (e.g. llm-origin entries not yet evaluated)
+  # are included in the sample with equal probability to any other seed — the
+  # pool sampler does a flat random shuffle and does not read or weight by
+  # fitness at all. Run evaluate-seeds.sh to score them and write real fitness
+  # values back to manifest.jsonl.
   POOL_SAMPLE_LIST="$WORK_DIR/pool_sample.txt"
   POOL_COUNT=0
   if [ -f "$POOL_MANIFEST" ]; then