From cb6e6708b65b9c2999ed82dd3064dedfac5aac37 Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Sun, 15 Mar 2026 03:08:29 +0000
Subject: [PATCH 1/2] fix: `llm`-origin entries in manifest have null fitness
 and no evaluation path (#724)

- Add evaluate-seeds.sh: standalone script that reads manifest.jsonl,
  finds every entry with fitness: null, runs fitness.sh against each
  seed file, and atomically writes results back to manifest.jsonl.
  Supports --dry-run to preview without evaluating.
- Add comment to --diverse-seeds sampling in evolve.sh documenting that
  null-fitness seeds are included with effective_fitness=0 and that
  evaluate-seeds.sh should be run to score them.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 STATE.md                                |   1 +
 tools/push3-evolution/evaluate-seeds.sh | 187 ++++++++++++++++++++++++
 tools/push3-evolution/evolve.sh         |   5 +
 3 files changed, 193 insertions(+)
 create mode 100755 tools/push3-evolution/evaluate-seeds.sh

diff --git a/STATE.md b/STATE.md
index 8ed2b90..2b001dd 100644
--- a/STATE.md
+++ b/STATE.md
@@ -38,3 +38,4 @@
 - [2026-03-15] add evolution run 8 champion to seed pool (#781)
 - [2026-03-15] fix FitnessEvaluator.t.sol broken on Base mainnet fork (#780)
 - [2026-03-15] No generic flag dispatch: only `token_value_inflation` is ever zero-rated (#723)
+- [2026-03-15] `llm`-origin entries in manifest have null fitness and no evaluation path (#724): evaluate-seeds.sh scores null-fitness seeds and writes results back to manifest.jsonl
diff --git a/tools/push3-evolution/evaluate-seeds.sh b/tools/push3-evolution/evaluate-seeds.sh
new file mode 100755
index 0000000..feeeb2a
--- /dev/null
+++ b/tools/push3-evolution/evaluate-seeds.sh
@@ -0,0 +1,187 @@
+#!/usr/bin/env bash
+# =============================================================================
+# evaluate-seeds.sh — Score null-fitness manifest entries via fitness.sh
+#
+# Reads manifest.jsonl, finds every entry with fitness: null, runs fitness.sh
+# against the corresponding seed file, and writes the result back into
+# manifest.jsonl (atomic temp-file rename).
+#
+# Usage:
+#   ./tools/push3-evolution/evaluate-seeds.sh [--dry-run]
+#
+# Options:
+#   --dry-run   Print which seeds would be evaluated without running fitness.sh
+#
+# Environment:
+#   ANVIL_FORK_URL   Passed through to fitness.sh when Anvil is not already
+#                    running.  Must point to a Base RPC endpoint.
+#
+# Exit codes:
+#   0  All null-fitness entries evaluated (or nothing to do).
+#   1  One or more evaluations failed (partial results may have been written).
+#   2  Infrastructure error (missing tool, manifest not found, etc.).
+# =============================================================================
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+SEEDS_DIR="$SCRIPT_DIR/seeds"
+MANIFEST="$SEEDS_DIR/manifest.jsonl"
+FITNESS_SH="$SCRIPT_DIR/fitness.sh"
+
+DRY_RUN=false
+
+# =============================================================================
+# Argument parsing
+# =============================================================================
+
+until [[ $# -eq 0 ]]; do
+    if [[ "$1" != "--dry-run" ]]; then
+        echo "Usage: $0 [--dry-run]" >&2; exit 2   # any other argument is a usage error
+    fi
+    DRY_RUN=true; shift
+done
+
+# =============================================================================
+# Helpers
+# =============================================================================
+
+log()   { printf '  [evaluate-seeds]  %s\n' "$*" >&2; }   # prefixed diagnostic on stderr
+fail()  { log "ERROR: $*"; exit 2; }                      # report, then exit with code 2
+
+# =============================================================================
+# Pre-flight checks
+# =============================================================================
+
+[[ -f "$MANIFEST" ]]   || fail "manifest.jsonl not found at $MANIFEST"
+[[ -f "$FITNESS_SH" ]] || fail "fitness.sh not found at $FITNESS_SH"
+command -v python3 >/dev/null 2>&1 || fail "python3 not found in PATH"
+
+# =============================================================================
+# Find null-fitness entries
+# =============================================================================
+
+NULL_ENTRIES="$(python3 - "$MANIFEST" <<'PYEOF'
+import json, sys
+with open(sys.argv[1]) as f:  # argv[1] is the manifest path
+    for lineno, line in enumerate(f, 1):
+        line = line.strip()
+        if not line:
+            continue
+        try:
+            d = json.loads(line)
+        except json.JSONDecodeError as exc:  # surface corrupt lines instead of hiding them
+            print(f"  [evaluate-seeds]  WARNING: manifest line {lineno} is not valid JSON ({exc})", file=sys.stderr)
+            continue
+        if isinstance(d, dict) and d.get('fitness') is None and d.get('file'):  # one filename per line
+            print(d['file'])
+PYEOF
+)"
+
+if [[ -z "$NULL_ENTRIES" ]]; then
+    log "No null-fitness entries in manifest — nothing to do."
+    exit 0
+fi
+
+NULL_COUNT=$(grep -c '.' <<< "$NULL_ENTRIES")   # number of non-empty lines
+log "Found $NULL_COUNT null-fitness entry/entries: $(tr '\n' ' ' <<< "$NULL_ENTRIES")"
+
+if [[ "$DRY_RUN" == true ]]; then
+    echo "Dry run — would evaluate:"
+    while IFS= read -r fname; do
+        printf '  %s\n' "$fname"
+    done <<< "$NULL_ENTRIES"
+    exit 0
+fi
+
+# =============================================================================
+# Evaluate each null-fitness seed and collect results
+# =============================================================================
+
+FAILURES=0
+
+# scores: associative array  file -> score (bash 4+)
+declare -A SCORES
+
+while IFS= read -r FNAME; do
+    [ -z "$FNAME" ] && continue
+    SEED_FILE="$SEEDS_DIR/$FNAME"
+    if [ ! -f "$SEED_FILE" ]; then
+        log "WARNING: seed file not found: $SEED_FILE — skipping"
+        FAILURES=$((FAILURES + 1))
+        continue
+    fi
+
+    log "Evaluating $FNAME …"
+    SCORE=""
+    FITNESS_EC=0
+    SCORE=$("$FITNESS_SH" "$SEED_FILE") || FITNESS_EC=$?
+
+    if [ "$FITNESS_EC" -ne 0 ] || [ -z "$SCORE" ]; then
+        log "WARNING: fitness.sh failed for $FNAME (exit $FITNESS_EC) — skipping"
+        FAILURES=$((FAILURES + 1))
+        continue
+    fi
+
+    log "  $FNAME → fitness=$SCORE"
+    SCORES["$FNAME"]="$SCORE"
+done <<< "$NULL_ENTRIES"
+
+if [ "${#SCORES[@]}" -eq 0 ]; then
+    log "No seeds were successfully evaluated."
+    exit 1
+fi
+
+# =============================================================================
+# Write results back to manifest.jsonl (atomic temp-file rename)
+# =============================================================================
+
+MANIFEST_TMP="$(mktemp "${MANIFEST}.XXXXXX")"
+trap 'rm -f "$MANIFEST_TMP"' EXIT
+
+python3 - "$MANIFEST" "$MANIFEST_TMP" <<PYEOF
+import json, sys, os
+
+manifest_path = sys.argv[1]
+tmp_path      = sys.argv[2]
+
+# scores injected from shell (format: "file1\tscore1\nfile2\tscore2\n…")
+scores_raw = """$(for k in "${!SCORES[@]}"; do printf '%s\t%s\n' "$k" "${SCORES[$k]}"; done)"""
+
+scores = {}
+for line in scores_raw.strip().splitlines():
+    if '\t' in line:
+        fname, score = line.split('\t', 1)
+        scores[fname.strip()] = int(score.strip())
+
+lines_out = []
+with open(manifest_path) as f:
+    for line in f:
+        stripped = line.rstrip('\n')
+        if not stripped:
+            continue
+        try:
+            d = json.loads(stripped)
+        except json.JSONDecodeError:
+            d = None  # not JSON: pass the raw line through untouched
+        if isinstance(d, dict) and d.get('fitness') is None and d.get('file', '') in scores:
+            d['fitness'] = scores[d['file']]
+        lines_out.append(stripped if d is None else json.dumps(d, separators=(',', ':')))
+
+import shutil  # local import: only needed to carry over the manifest mode
+with open(tmp_path, 'w') as f:
+    f.writelines(entry + '\n' for entry in lines_out)
+shutil.copymode(manifest_path, tmp_path)  # mktemp files are owner-only; keep the original mode after mv
+PYEOF
+
+mv "$MANIFEST_TMP" "$MANIFEST"
+trap - EXIT
+
+EVALUATED="${#SCORES[@]}"
+log "Done. Evaluated $EVALUATED seed(s); $FAILURES failure(s)."
+log "Results written to $MANIFEST"
+
+# Non-zero exit when any seed could not be scored; the successful scores
+# have already been written to the manifest at this point.
+(( FAILURES > 0 )) && exit 1
+exit 0
diff --git a/tools/push3-evolution/evolve.sh b/tools/push3-evolution/evolve.sh
index 856954b..361f39f 100755
--- a/tools/push3-evolution/evolve.sh
+++ b/tools/push3-evolution/evolve.sh
@@ -342,6 +342,11 @@ if [ "$DIVERSE_SEEDS" = "true" ]; then
 
   # Build a random sample list from the pool in one pass (also determines if
   # the pool has any usable entries, avoiding a second manifest parse).
+  #
+  # NOTE: seeds with fitness: null (e.g. llm-origin entries not yet evaluated)
+  # are included in the sample with no filtering — their effective_fitness is
+  # treated as 0 by the pool-admission logic.  Run evaluate-seeds.sh to score
+  # them and write real fitness values back to manifest.jsonl.
   POOL_SAMPLE_LIST="$WORK_DIR/pool_sample.txt"
   POOL_COUNT=0
   if [ -f "$POOL_MANIFEST" ]; then

From c508efa31fc09e16af5a8202e313c6f162f4be59 Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Sun, 15 Mar 2026 03:29:47 +0000
Subject: [PATCH 2/2] fix: address review findings for evaluate-seeds.sh (#724)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Replace unquoted heredoc (shell-injection path) with a temp file: the
  shell loop now appends tab-separated filename/score lines to a temp
  file, which is passed as a plain path argument to the Python manifest-
  rewrite block.  Python reads only file contents, never executes shell-
  expanded strings.
- Add early abort on fitness.sh exit code 2 (infra error: Anvil down,
  missing tool).  Iterating past an infra failure produces no useful
  results; aborting immediately surfaces the real problem.
- Remove unused `os` import from the manifest-rewrite Python block.
- Fix inaccurate comment in evolve.sh --diverse-seeds sampling: the pool
  sampler does a flat random shuffle with no fitness weighting; null-
  fitness seeds are not "treated as 0" — they are sampled with equal
  probability to any other seed.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 tools/push3-evolution/evaluate-seeds.sh | 45 ++++++++++++++++---------
 tools/push3-evolution/evolve.sh         |  7 ++--
 2 files changed, 33 insertions(+), 19 deletions(-)

diff --git a/tools/push3-evolution/evaluate-seeds.sh b/tools/push3-evolution/evaluate-seeds.sh
index feeeb2a..ece31dc 100755
--- a/tools/push3-evolution/evaluate-seeds.sh
+++ b/tools/push3-evolution/evaluate-seeds.sh
@@ -100,8 +100,11 @@ fi
 
 FAILURES=0
 
-# scores: associative array  file -> score (bash 4+)
-declare -A SCORES
+# Scores are accumulated in a temp file as tab-separated "filename\tscore"
+# lines.  Using a file (rather than a shell associative array embedded in a
+# heredoc) avoids injecting values into Python source code.
+SCORES_FILE="$(mktemp)"
+trap 'rm -f "$SCORES_FILE"' EXIT
 
 while IFS= read -r FNAME; do
     [ -z "$FNAME" ] && continue
@@ -117,6 +120,13 @@ while IFS= read -r FNAME; do
     FITNESS_EC=0
     SCORE=$("$FITNESS_SH" "$SEED_FILE") || FITNESS_EC=$?
 
+    if [ "$FITNESS_EC" -eq 2 ]; then
+        # Exit code 2 = infra error (Anvil down, missing tool, etc.).
+        # All subsequent evaluations will fail for the same reason; abort early.
+        log "ERROR: fitness.sh reported infra failure (exit 2) for $FNAME — aborting"
+        exit 2
+    fi
+
     if [ "$FITNESS_EC" -ne 0 ] || [ -z "$SCORE" ]; then
         log "WARNING: fitness.sh failed for $FNAME (exit $FITNESS_EC) — skipping"
         FAILURES=$((FAILURES + 1))
@@ -124,10 +134,10 @@ while IFS= read -r FNAME; do
     fi
 
     log "  $FNAME → fitness=$SCORE"
-    SCORES["$FNAME"]="$SCORE"
+    printf '%s\t%s\n' "$FNAME" "$SCORE" >> "$SCORES_FILE"
 done <<< "$NULL_ENTRIES"
 
-if [ "${#SCORES[@]}" -eq 0 ]; then
+if [ ! -s "$SCORES_FILE" ]; then
     log "No seeds were successfully evaluated."
     exit 1
 fi
@@ -137,22 +147,25 @@ fi
 # =============================================================================
 
 MANIFEST_TMP="$(mktemp "${MANIFEST}.XXXXXX")"
-trap 'rm -f "$MANIFEST_TMP"' EXIT
+# Update trap to clean up both temp files.
+trap 'rm -f "$SCORES_FILE" "$MANIFEST_TMP"' EXIT
 
-python3 - "$MANIFEST" "$MANIFEST_TMP" <<PYEOF
-import json, sys, os
+python3 - "$MANIFEST" "$MANIFEST_TMP" "$SCORES_FILE" <<'PYEOF'
+import json, sys
 
 manifest_path = sys.argv[1]
 tmp_path      = sys.argv[2]
+scores_path   = sys.argv[3]
 
-# scores injected from shell (format: "file1\tscore1\nfile2\tscore2\n…")
-scores_raw = """$(for k in "${!SCORES[@]}"; do printf '%s\t%s\n' "$k" "${SCORES[$k]}"; done)"""
-
+# Load scores from the tab-separated file written by the shell loop.
+# Values are plain integers produced by fitness.sh — no shell expansion here.
 scores = {}
-for line in scores_raw.strip().splitlines():
-    if '\t' in line:
-        fname, score = line.split('\t', 1)
-        scores[fname.strip()] = int(score.strip())
+with open(scores_path) as sf:
+    for line in sf:
+        line = line.rstrip('\n')
+        if '\t' in line:
+            fname, score = line.split('\t', 1)
+            scores[fname.strip()] = int(score.strip())
 
 lines_out = []
 with open(manifest_path) as f:
@@ -175,9 +188,9 @@ with open(tmp_path, 'w') as f:
 PYEOF
 
 mv "$MANIFEST_TMP" "$MANIFEST"
-trap - EXIT
+trap 'rm -f "$SCORES_FILE"' EXIT
 
-EVALUATED="${#SCORES[@]}"
+EVALUATED=$(wc -l < "$SCORES_FILE" | tr -d ' ')
 log "Done. Evaluated $EVALUATED seed(s); $FAILURES failure(s)."
 log "Results written to $MANIFEST"
 
diff --git a/tools/push3-evolution/evolve.sh b/tools/push3-evolution/evolve.sh
index 361f39f..4937c09 100755
--- a/tools/push3-evolution/evolve.sh
+++ b/tools/push3-evolution/evolve.sh
@@ -344,9 +344,10 @@ if [ "$DIVERSE_SEEDS" = "true" ]; then
   # the pool has any usable entries, avoiding a second manifest parse).
   #
   # NOTE: seeds with fitness: null (e.g. llm-origin entries not yet evaluated)
-  # are included in the sample with no filtering — their effective_fitness is
-  # treated as 0 by the pool-admission logic.  Run evaluate-seeds.sh to score
-  # them and write real fitness values back to manifest.jsonl.
+  # are included in the sample with equal probability to any other seed — the
+  # pool sampler does a flat random shuffle and does not read or weight by
+  # fitness at all.  Run evaluate-seeds.sh to score them and write real fitness
+  # values back to manifest.jsonl.
   POOL_SAMPLE_LIST="$WORK_DIR/pool_sample.txt"
   POOL_COUNT=0
   if [ -f "$POOL_MANIFEST" ]; then