harb/tools/push3-evolution/evaluate-seeds.sh
openhands cb6e6708b6 fix: `llm`-origin entries in manifest have null fitness and no evaluation path (#724)
- Add evaluate-seeds.sh: standalone script that reads manifest.jsonl,
  finds every entry with fitness: null, runs fitness.sh against each
  seed file, and atomically writes results back to manifest.jsonl.
  Supports --dry-run to preview without evaluating.
- Add comment to --diverse-seeds sampling in evolve.sh documenting that
  null-fitness seeds are included with effective_fitness=0 and that
  evaluate-seeds.sh should be run to score them.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-15 03:08:29 +00:00

187 lines
5.7 KiB
Bash
Executable file

#!/usr/bin/env bash
# =============================================================================
# evaluate-seeds.sh — Score null-fitness manifest entries via fitness.sh
#
# Reads manifest.jsonl, finds every entry with fitness: null, runs fitness.sh
# against the corresponding seed file, and writes the result back into
# manifest.jsonl (atomic temp-file rename).
#
# Usage:
# ./tools/push3-evolution/evaluate-seeds.sh [--dry-run]
#
# Options:
# --dry-run Print which seeds would be evaluated without running fitness.sh
#
# Environment:
# ANVIL_FORK_URL Passed through to fitness.sh when Anvil is not already
# running. Must point to a Base RPC endpoint.
#
# Exit codes:
# 0 All null-fitness entries evaluated (or nothing to do).
# 1 One or more evaluations failed (partial results may have been written).
# 2 Infrastructure error (missing tool, manifest not found, etc.).
# =============================================================================
# Strict mode: stop on command failure, on use of an unset variable, and on a
# failure in any stage of a pipeline.
set -o errexit -o nounset -o pipefail

# All paths are anchored at the directory that holds this script, so the
# caller's working directory does not matter.
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
FITNESS_SH="${SCRIPT_DIR}/fitness.sh"
SEEDS_DIR="${SCRIPT_DIR}/seeds"
MANIFEST="${SEEDS_DIR}/manifest.jsonl"

# Becomes "true" when --dry-run is given on the command line.
DRY_RUN=false
# =============================================================================
# Argument parsing
# =============================================================================
# Consume the command line one argument at a time. --dry-run is the only
# recognised option; anything else prints the usage line on stderr and exits
# with status 2.
until (( $# == 0 )); do
  if [[ "$1" == "--dry-run" ]]; then
    DRY_RUN=true
    shift
  else
    echo "Usage: $0 [--dry-run]" >&2
    exit 2
  fi
done
# =============================================================================
# Helpers
# =============================================================================
# log MESSAGE... — write a tagged progress line to stderr; stdout is untouched.
log() {
  printf ' [evaluate-seeds] %s\n' "$*" >&2
}
# fail MESSAGE... — write a tagged error line to stderr and terminate the
# script with exit status 2.
fail() {
  printf ' [evaluate-seeds] ERROR: %s\n' "$*" >&2
  exit 2
}
# =============================================================================
# Pre-flight checks
# =============================================================================
# The manifest and the evaluator script must both exist as regular files.
if [[ ! -f "$MANIFEST" ]]; then
  fail "manifest.jsonl not found at $MANIFEST"
fi
if [[ ! -f "$FITNESS_SH" ]]; then
  fail "fitness.sh not found at $FITNESS_SH"
fi
# python3 runs the inline scripts that read and rewrite the manifest.
if ! command -v python3 >/dev/null 2>&1; then
  fail "python3 not found in PATH"
fi
# =============================================================================
# Find null-fitness entries
# =============================================================================
# Collect the seed file name of every manifest record whose "fitness" is JSON
# null or missing, one name per line, in manifest order.
#
# Lines that cannot be evaluated and recorded are left out:
#   - lines that are not valid JSON (the write-back step keeps them verbatim),
#   - valid JSON that is not an object (a bare list/number/string has no keys),
#   - records whose "file" is missing, empty, or not a string, because the
#     write-back matches records by that string,
#   - repeated file names: scores are keyed by file name, so evaluating the
#     same seed twice would only repeat work.
NULL_ENTRIES="$(python3 - "$MANIFEST" <<'PYEOF'
import json
import sys

manifest_path = sys.argv[1]
seen = set()
with open(manifest_path) as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        try:
            d = json.loads(line)
        except json.JSONDecodeError:
            continue
        if not isinstance(d, dict):
            continue
        if d.get('fitness') is not None:
            continue
        fname = d.get('file')
        if not isinstance(fname, str) or not fname or fname in seen:
            continue
        seen.add(fname)
        print(fname)
PYEOF
)"

if [[ -z "$NULL_ENTRIES" ]]; then
  log "No null-fitness entries in manifest — nothing to do."
  exit 0
fi

# NULL_ENTRIES is non-empty here, so grep -c always finds at least one line
# and cannot trip errexit with a "no match" status.
NULL_COUNT=$(printf '%s\n' "$NULL_ENTRIES" | grep -c '.')
log "Found $NULL_COUNT null-fitness entry/entries: $(printf '%s\n' "$NULL_ENTRIES" | tr '\n' ' ')"

# --dry-run: list the seeds on stdout and stop before running fitness.sh.
if [[ "$DRY_RUN" == true ]]; then
  echo "Dry run — would evaluate:"
  while IFS= read -r fname; do
    echo " $fname"
  done <<< "$NULL_ENTRIES"
  exit 0
fi
# =============================================================================
# Evaluate each null-fitness seed and collect results
# =============================================================================
# Number of seeds that could not be scored (missing file, fitness.sh failure,
# or unusable output).
FAILURES=0
# Map of seed file name -> integer fitness score (associative array, bash 4+).
declare -A SCORES=()

# Run fitness.sh once per seed name listed in NULL_ENTRIES (one per line).
# A failure for one seed is logged and counted; the loop carries on so the
# remaining seeds still get scored.
while IFS= read -r FNAME; do
  if [[ -z "$FNAME" ]]; then
    continue
  fi

  SEED_FILE="$SEEDS_DIR/$FNAME"
  if [[ ! -f "$SEED_FILE" ]]; then
    log "WARNING: seed file not found: $SEED_FILE — skipping"
    FAILURES=$((FAILURES + 1))
    continue
  fi

  log "Evaluating $FNAME"
  SCORE=""
  FITNESS_EC=0
  # stdin is redirected from /dev/null: the loop itself reads the seed list
  # from stdin, and a child that read from it would swallow the remaining
  # names. "|| FITNESS_EC=$?" records the status without tripping errexit.
  SCORE=$("$FITNESS_SH" "$SEED_FILE" </dev/null) || FITNESS_EC=$?

  # Trim surrounding whitespace so output such as "42 " is still accepted.
  SCORE="${SCORE#"${SCORE%%[![:space:]]*}"}"
  SCORE="${SCORE%"${SCORE##*[![:space:]]}"}"

  if [[ "$FITNESS_EC" -ne 0 || -z "$SCORE" ]]; then
    log "WARNING: fitness.sh failed for $FNAME (exit $FITNESS_EC) — skipping"
    FAILURES=$((FAILURES + 1))
    continue
  fi

  # The manifest stores fitness as an integer. Reject anything else here, per
  # seed, so one bad output cannot abort the write-back of every other score.
  if [[ ! "$SCORE" =~ ^[+-]?[0-9]+$ ]]; then
    log "WARNING: fitness.sh printed a non-integer score for $FNAME — skipping"
    FAILURES=$((FAILURES + 1))
    continue
  fi

  log " $FNAME → fitness=$SCORE"
  SCORES["$FNAME"]="$SCORE"
done <<< "$NULL_ENTRIES"

if [[ "${#SCORES[@]}" -eq 0 ]]; then
  log "No seeds were successfully evaluated."
  exit 1
fi
# =============================================================================
# Write results back to manifest.jsonl (atomic temp-file rename)
# =============================================================================
# The temp file is created next to the manifest so the final mv is a rename
# within one directory. The EXIT trap removes it if anything fails before then.
MANIFEST_TMP="$(mktemp "${MANIFEST}.XXXXXX")"
trap 'rm -f "$MANIFEST_TMP"' EXIT

# Flatten SCORES into "name score name score ..." argument pairs. Passing the
# data as argv (with a quoted heredoc below) keeps file names and scores out
# of the Python source text, so quotes, backslashes, tabs or "$" in a name
# cannot break or alter the program.
SCORE_ARGS=()
for SCORE_KEY in "${!SCORES[@]}"; do
  SCORE_ARGS+=("$SCORE_KEY" "${SCORES[$SCORE_KEY]}")
done

python3 - "$MANIFEST" "$MANIFEST_TMP" "${SCORE_ARGS[@]}" <<'PYEOF'
import json
import shutil
import sys

manifest_path, tmp_path = sys.argv[1], sys.argv[2]
pairs = sys.argv[3:]

# argv[3:] alternates file name, score.
scores = {}
for fname, raw in zip(pairs[0::2], pairs[1::2]):
    try:
        scores[fname] = int(raw.strip())
    except ValueError:
        # Exits with status 1 and a one-line message instead of a traceback.
        sys.exit("evaluate-seeds: score for {!r} is not an integer: {!r}".format(fname, raw))

lines_out = []
with open(manifest_path) as f:
    for line in f:
        stripped = line.rstrip('\n')
        if not stripped:
            continue
        try:
            d = json.loads(stripped)
        except json.JSONDecodeError:
            # Keep lines that are not valid JSON exactly as they were.
            lines_out.append(stripped)
            continue
        if not isinstance(d, dict):
            # Valid JSON but not a record (list, number, ...): keep verbatim.
            lines_out.append(stripped)
            continue
        fname = d.get('file', '')
        # Only fill in records that are still unscored; existing scores stay.
        if isinstance(fname, str) and fname in scores and d.get('fitness') is None:
            d['fitness'] = scores[fname]
        lines_out.append(json.dumps(d, separators=(',', ':')))

with open(tmp_path, 'w') as f:
    for line in lines_out:
        f.write(line + '\n')

# mktemp creates the file with mode 0600; carry over the manifest's permission
# bits so replacing it does not silently change who can read it.
shutil.copymode(manifest_path, tmp_path)
PYEOF

mv "$MANIFEST_TMP" "$MANIFEST"
# The temp file no longer exists under its own name; drop the cleanup trap.
trap - EXIT

EVALUATED="${#SCORES[@]}"
log "Done. Evaluated $EVALUATED seed(s); $FAILURES failure(s)."
log "Results written to $MANIFEST"

# Exit 1 when any seed could not be scored, even though the successful scores
# have been written.
if [[ "$FAILURES" -gt 0 ]]; then
  exit 1
fi
exit 0