From cb6e6708b65b9c2999ed82dd3064dedfac5aac37 Mon Sep 17 00:00:00 2001 From: openhands Date: Sun, 15 Mar 2026 03:08:29 +0000 Subject: [PATCH 1/2] fix: \`llm\`-origin entries in manifest have null fitness and no evaluation path (#724) - Add evaluate-seeds.sh: standalone script that reads manifest.jsonl, finds every entry with fitness: null, runs fitness.sh against each seed file, and atomically writes results back to manifest.jsonl. Supports --dry-run to preview without evaluating. - Add comment to --diverse-seeds sampling in evolve.sh documenting that null-fitness seeds are included with effective_fitness=0 and that evaluate-seeds.sh should be run to score them. Co-Authored-By: Claude Sonnet 4.6 --- STATE.md | 1 + tools/push3-evolution/evaluate-seeds.sh | 187 ++++++++++++++++++++++++ tools/push3-evolution/evolve.sh | 5 + 3 files changed, 193 insertions(+) create mode 100755 tools/push3-evolution/evaluate-seeds.sh diff --git a/STATE.md b/STATE.md index 8ed2b90..2b001dd 100644 --- a/STATE.md +++ b/STATE.md @@ -38,3 +38,4 @@ - [2026-03-15] add evolution run 8 champion to seed pool (#781) - [2026-03-15] fix FitnessEvaluator.t.sol broken on Base mainnet fork (#780) - [2026-03-15] No generic flag dispatch: only `token_value_inflation` is ever zero-rated (#723) +- [2026-03-15] `llm`-origin entries in manifest have null fitness and no evaluation path (#724): evaluate-seeds.sh scores null-fitness seeds and writes results back to manifest.jsonl diff --git a/tools/push3-evolution/evaluate-seeds.sh b/tools/push3-evolution/evaluate-seeds.sh new file mode 100755 index 0000000..feeeb2a --- /dev/null +++ b/tools/push3-evolution/evaluate-seeds.sh @@ -0,0 +1,187 @@ +#!/usr/bin/env bash +# ============================================================================= +# evaluate-seeds.sh — Score null-fitness manifest entries via fitness.sh +# +# Reads manifest.jsonl, finds every entry with fitness: null, runs fitness.sh +# against the corresponding seed file, and writes the result back into +# manifest.jsonl (atomic temp-file rename). +# +# Usage: +# ./tools/push3-evolution/evaluate-seeds.sh [--dry-run] +# +# Options: +# --dry-run Print which seeds would be evaluated without running fitness.sh +# +# Environment: +# ANVIL_FORK_URL Passed through to fitness.sh when Anvil is not already +# running. Must point to a Base RPC endpoint. +# +# Exit codes: +# 0 All null-fitness entries evaluated (or nothing to do). +# 1 One or more evaluations failed (partial results may have been written). +# 2 Infrastructure error (missing tool, manifest not found, etc.). +# ============================================================================= + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +SEEDS_DIR="$SCRIPT_DIR/seeds" +MANIFEST="$SEEDS_DIR/manifest.jsonl" +FITNESS_SH="$SCRIPT_DIR/fitness.sh" + +DRY_RUN=false + +# ============================================================================= +# Argument parsing +# ============================================================================= + +while [ $# -gt 0 ]; do + case "$1" in + --dry-run) DRY_RUN=true; shift ;; + *) echo "Usage: $0 [--dry-run]" >&2; exit 2 ;; + esac +done + +# ============================================================================= +# Helpers +# ============================================================================= + +log() { echo " [evaluate-seeds] $*" >&2; } +fail() { echo " [evaluate-seeds] ERROR: $*" >&2; exit 2; } + +# ============================================================================= +# Pre-flight checks +# ============================================================================= + +[ -f "$MANIFEST" ] || fail "manifest.jsonl not found at $MANIFEST" +[ -f "$FITNESS_SH" ] || fail "fitness.sh not found at $FITNESS_SH" +command -v python3 &>/dev/null || fail "python3 not found in PATH" + +# ============================================================================= +# Find null-fitness entries +# ============================================================================= + +NULL_ENTRIES="$(python3 - "$MANIFEST" <<'PYEOF' +import json, sys +manifest_path = sys.argv[1] +with open(manifest_path) as f: + for line in f: + line = line.strip() + if not line: + continue + try: + d = json.loads(line) + if d.get('fitness') is None: + print(d.get('file', '')) + except json.JSONDecodeError: + pass +PYEOF +)" + +if [ -z "$NULL_ENTRIES" ]; then + log "No null-fitness entries in manifest — nothing to do." + exit 0 +fi + +NULL_COUNT=$(printf '%s\n' "$NULL_ENTRIES" | grep -c '.') +log "Found $NULL_COUNT null-fitness entry/entries: $(printf '%s\n' "$NULL_ENTRIES" | tr '\n' ' ')" + +if $DRY_RUN; then + echo "Dry run — would evaluate:" + printf '%s\n' "$NULL_ENTRIES" | while IFS= read -r fname; do + echo " $fname" + done + exit 0 +fi + +# ============================================================================= +# Evaluate each null-fitness seed and collect results +# ============================================================================= + +FAILURES=0 + +# scores: associative array file -> score (bash 4+) +declare -A SCORES + +while IFS= read -r FNAME; do + [ -z "$FNAME" ] && continue + SEED_FILE="$SEEDS_DIR/$FNAME" + if [ ! -f "$SEED_FILE" ]; then + log "WARNING: seed file not found: $SEED_FILE — skipping" + FAILURES=$((FAILURES + 1)) + continue + fi + + log "Evaluating $FNAME …" + SCORE="" + FITNESS_EC=0 + SCORE=$("$FITNESS_SH" "$SEED_FILE") || FITNESS_EC=$? + + if [ "$FITNESS_EC" -ne 0 ] || [ -z "$SCORE" ]; then + log "WARNING: fitness.sh failed for $FNAME (exit $FITNESS_EC) — skipping" + FAILURES=$((FAILURES + 1)) + continue + fi + + log " $FNAME → fitness=$SCORE" + SCORES["$FNAME"]="$SCORE" +done <<< "$NULL_ENTRIES" + +if [ "${#SCORES[@]}" -eq 0 ]; then + log "No seeds were successfully evaluated." + exit 1 +fi + +# ============================================================================= +# Write results back to manifest.jsonl (atomic temp-file rename) +# ============================================================================= + +MANIFEST_TMP="$(mktemp "${MANIFEST}.XXXXXX")" +trap 'rm -f "$MANIFEST_TMP"' EXIT + +python3 - "$MANIFEST" "$MANIFEST_TMP" < Date: Sun, 15 Mar 2026 03:29:47 +0000 Subject: [PATCH 2/2] fix: address review findings for evaluate-seeds.sh (#724) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Replace unquoted heredoc (shell-injection path) with a temp file: the shell loop now appends tab-separated filename/score lines to a temp file, which is passed as a plain path argument to the Python manifest- rewrite block. Python reads only file contents, never executes shell- expanded strings. - Add early abort on fitness.sh exit code 2 (infra error: Anvil down, missing tool). Iterating past an infra failure produces no useful results; aborting immediately surfaces the real problem. - Remove unused `os` import from the manifest-rewrite Python block. - Fix inaccurate comment in evolve.sh --diverse-seeds sampling: the pool sampler does a flat random shuffle with no fitness weighting; null- fitness seeds are not "treated as 0" — they are sampled with equal probability to any other seed. Co-Authored-By: Claude Sonnet 4.6 --- tools/push3-evolution/evaluate-seeds.sh | 45 ++++++++++++++++--------- tools/push3-evolution/evolve.sh | 7 ++-- 2 files changed, 33 insertions(+), 19 deletions(-) diff --git a/tools/push3-evolution/evaluate-seeds.sh b/tools/push3-evolution/evaluate-seeds.sh index feeeb2a..ece31dc 100755 --- a/tools/push3-evolution/evaluate-seeds.sh +++ b/tools/push3-evolution/evaluate-seeds.sh @@ -100,8 +100,11 @@ fi FAILURES=0 -# scores: associative array file -> score (bash 4+) -declare -A SCORES +# Scores are accumulated in a temp file as tab-separated "filename\tscore" +# lines. Using a file (rather than a shell associative array embedded in a +# heredoc) avoids injecting values into Python source code. +SCORES_FILE="$(mktemp)" +trap 'rm -f "$SCORES_FILE"' EXIT while IFS= read -r FNAME; do [ -z "$FNAME" ] && continue @@ -117,6 +120,13 @@ while IFS= read -r FNAME; do FITNESS_EC=0 SCORE=$("$FITNESS_SH" "$SEED_FILE") || FITNESS_EC=$? + if [ "$FITNESS_EC" -eq 2 ]; then + # Exit code 2 = infra error (Anvil down, missing tool, etc.). + # All subsequent evaluations will fail for the same reason; abort early. + log "ERROR: fitness.sh reported infra failure (exit 2) for $FNAME — aborting" + exit 2 + fi + if [ "$FITNESS_EC" -ne 0 ] || [ -z "$SCORE" ]; then log "WARNING: fitness.sh failed for $FNAME (exit $FITNESS_EC) — skipping" FAILURES=$((FAILURES + 1)) @@ -124,10 +134,10 @@ while IFS= read -r FNAME; do fi log " $FNAME → fitness=$SCORE" - SCORES["$FNAME"]="$SCORE" + printf '%s\t%s\n' "$FNAME" "$SCORE" >> "$SCORES_FILE" done <<< "$NULL_ENTRIES" -if [ "${#SCORES[@]}" -eq 0 ]; then +if [ ! -s "$SCORES_FILE" ]; then log "No seeds were successfully evaluated." exit 1 fi @@ -137,22 +147,25 @@ fi # ============================================================================= MANIFEST_TMP="$(mktemp "${MANIFEST}.XXXXXX")" -trap 'rm -f "$MANIFEST_TMP"' EXIT +# Update trap to clean up both temp files. +trap 'rm -f "$SCORES_FILE" "$MANIFEST_TMP"' EXIT -python3 - "$MANIFEST" "$MANIFEST_TMP" <