fix: address review findings for evaluate-seeds.sh (#724)

- Replace unquoted heredoc (shell-injection path) with a temp file: the shell loop now appends tab-separated filename/score lines to a temp file, which is passed as a plain path argument to the Python manifest- rewrite block. Python reads only file contents, never executes shell- expanded strings. - Add early abort on fitness.sh exit code 2 (infra error: Anvil down, missing tool). Iterating past an infra failure produces no useful results; aborting immediately surfaces the real problem. - Remove unused `os` import from the manifest-rewrite Python block. - Fix inaccurate comment in evolve.sh --diverse-seeds sampling: the pool sampler does a flat random shuffle with no fitness weighting; null- fitness seeds are not "treated as 0" — they are sampled with equal probability to any other seed. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-15 03:29:47 +00:00 · 2026-03-15 03:29:47 +00:00 · c508efa31f
commit c508efa31f
parent cb6e6708b6
2 changed files with 33 additions and 19 deletions
--- a/tools/push3-evolution/evaluate-seeds.sh
+++ b/tools/push3-evolution/evaluate-seeds.sh
@ -100,8 +100,11 @@ fi

 FAILURES=0

-# scores: associative array  file -> score (bash 4+)
-declare -A SCORES
+# Scores are accumulated in a temp file as tab-separated "filename\tscore"
+# lines.  Using a file (rather than a shell associative array embedded in a
+# heredoc) avoids injecting values into Python source code.
+SCORES_FILE="$(mktemp)"
+trap 'rm -f "$SCORES_FILE"' EXIT

 while IFS= read -r FNAME; do
    [ -z "$FNAME" ] && continue
@ -117,6 +120,13 @@ while IFS= read -r FNAME; do
    FITNESS_EC=0
    SCORE=$("$FITNESS_SH" "$SEED_FILE") || FITNESS_EC=$?

+    if [ "$FITNESS_EC" -eq 2 ]; then
+        # Exit code 2 = infra error (Anvil down, missing tool, etc.).
+        # All subsequent evaluations will fail for the same reason; abort early.
+        log "ERROR: fitness.sh reported infra failure (exit 2) for $FNAME — aborting"
+        exit 2
+    fi
+
    if [ "$FITNESS_EC" -ne 0 ] || [ -z "$SCORE" ]; then
        log "WARNING: fitness.sh failed for $FNAME (exit $FITNESS_EC) — skipping"
        FAILURES=$((FAILURES + 1))
@ -124,10 +134,10 @@ while IFS= read -r FNAME; do
    fi

    log "  $FNAME → fitness=$SCORE"
-    SCORES["$FNAME"]="$SCORE"
+    printf '%s\t%s\n' "$FNAME" "$SCORE" >> "$SCORES_FILE"
 done <<< "$NULL_ENTRIES"

-if [ "${#SCORES[@]}" -eq 0 ]; then
+if [ ! -s "$SCORES_FILE" ]; then
    log "No seeds were successfully evaluated."
    exit 1
 fi
@ -137,22 +147,25 @@ fi
 # =============================================================================

 MANIFEST_TMP="$(mktemp "${MANIFEST}.XXXXXX")"
-trap 'rm -f "$MANIFEST_TMP"' EXIT
+# Update trap to clean up both temp files.
+trap 'rm -f "$SCORES_FILE" "$MANIFEST_TMP"' EXIT

-python3 - "$MANIFEST" "$MANIFEST_TMP" <<PYEOF
-import json, sys, os
+python3 - "$MANIFEST" "$MANIFEST_TMP" "$SCORES_FILE" <<'PYEOF'
+import json, sys

 manifest_path = sys.argv[1]
 tmp_path      = sys.argv[2]
+scores_path   = sys.argv[3]

-# scores injected from shell (format: "file1\tscore1\nfile2\tscore2\n…")
-scores_raw = """$(for k in "${!SCORES[@]}"; do printf '%s\t%s\n' "$k" "${SCORES[$k]}"; done)"""
-
+# Load scores from the tab-separated file written by the shell loop.
+# Values are plain integers produced by fitness.sh — no shell expansion here.
 scores = {}
-for line in scores_raw.strip().splitlines():
-    if '\t' in line:
-        fname, score = line.split('\t', 1)
-        scores[fname.strip()] = int(score.strip())
+with open(scores_path) as sf:
+    for line in sf:
+        line = line.rstrip('\n')
+        if '\t' in line:
+            fname, score = line.split('\t', 1)
+            scores[fname.strip()] = int(score.strip())

 lines_out = []
 with open(manifest_path) as f:
@ -175,9 +188,9 @@ with open(tmp_path, 'w') as f:
 PYEOF

 mv "$MANIFEST_TMP" "$MANIFEST"
-trap - EXIT
+trap 'rm -f "$SCORES_FILE"' EXIT

-EVALUATED="${#SCORES[@]}"
+EVALUATED=$(wc -l < "$SCORES_FILE" | tr -d ' ')
 log "Done. Evaluated $EVALUATED seed(s); $FAILURES failure(s)."
 log "Results written to $MANIFEST"

--- a/tools/push3-evolution/evolve.sh
+++ b/tools/push3-evolution/evolve.sh
@ -344,9 +344,10 @@ if [ "$DIVERSE_SEEDS" = "true" ]; then
  # the pool has any usable entries, avoiding a second manifest parse).
  #
  # NOTE: seeds with fitness: null (e.g. llm-origin entries not yet evaluated)
-  # are included in the sample with no filtering — their effective_fitness is
-  # treated as 0 by the pool-admission logic.  Run evaluate-seeds.sh to score
-  # them and write real fitness values back to manifest.jsonl.
+  # are included in the sample with equal probability to any other seed — the
+  # pool sampler does a flat random shuffle and does not read or weight by
+  # fitness at all.  Run evaluate-seeds.sh to score them and write real fitness
+  # values back to manifest.jsonl.
  POOL_SAMPLE_LIST="$WORK_DIR/pool_sample.txt"
  POOL_COUNT=0
  if [ -f "$POOL_MANIFEST" ]; then