fix: address review findings for evaluate-seeds.sh (#724)

- Replace unquoted heredoc (shell-injection path) with a temp file: the
  shell loop now appends tab-separated filename/score lines to a temp
  file, which is passed as a plain path argument to the Python manifest-
  rewrite block.  Python reads only file contents, never executes shell-
  expanded strings.
- Add early abort on fitness.sh exit code 2 (infra error: Anvil down,
  missing tool).  Iterating past an infra failure produces no useful
  results; aborting immediately surfaces the real problem.
- Remove unused `os` import from the manifest-rewrite Python block.
- Fix inaccurate comment in evolve.sh --diverse-seeds sampling: the pool
  sampler does a flat random shuffle with no fitness weighting; null-
  fitness seeds are not "treated as 0" — they are sampled with equal
  probability to any other seed.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
openhands 2026-03-15 03:29:47 +00:00
parent cb6e6708b6
commit c508efa31f
2 changed files with 33 additions and 19 deletions

View file

@ -100,8 +100,11 @@ fi
FAILURES=0
# scores: associative array file -> score (bash 4+)
declare -A SCORES
# Scores are accumulated in a temp file as tab-separated "filename\tscore"
# lines. Using a file (rather than a shell associative array embedded in a
# heredoc) avoids injecting values into Python source code.
SCORES_FILE="$(mktemp)"
trap 'rm -f "$SCORES_FILE"' EXIT
while IFS= read -r FNAME; do
[ -z "$FNAME" ] && continue
@ -117,6 +120,13 @@ while IFS= read -r FNAME; do
FITNESS_EC=0
SCORE=$("$FITNESS_SH" "$SEED_FILE") || FITNESS_EC=$?
if [ "$FITNESS_EC" -eq 2 ]; then
# Exit code 2 = infra error (Anvil down, missing tool, etc.).
# All subsequent evaluations will fail for the same reason; abort early.
log "ERROR: fitness.sh reported infra failure (exit 2) for $FNAME — aborting"
exit 2
fi
if [ "$FITNESS_EC" -ne 0 ] || [ -z "$SCORE" ]; then
log "WARNING: fitness.sh failed for $FNAME (exit $FITNESS_EC) — skipping"
FAILURES=$((FAILURES + 1))
@ -124,10 +134,10 @@ while IFS= read -r FNAME; do
fi
log " $FNAME → fitness=$SCORE"
SCORES["$FNAME"]="$SCORE"
printf '%s\t%s\n' "$FNAME" "$SCORE" >> "$SCORES_FILE"
done <<< "$NULL_ENTRIES"
if [ "${#SCORES[@]}" -eq 0 ]; then
if [ ! -s "$SCORES_FILE" ]; then
log "No seeds were successfully evaluated."
exit 1
fi
@ -137,22 +147,25 @@ fi
# =============================================================================
MANIFEST_TMP="$(mktemp "${MANIFEST}.XXXXXX")"
trap 'rm -f "$MANIFEST_TMP"' EXIT
# Update trap to clean up both temp files.
trap 'rm -f "$SCORES_FILE" "$MANIFEST_TMP"' EXIT
python3 - "$MANIFEST" "$MANIFEST_TMP" <<PYEOF
import json, sys, os
python3 - "$MANIFEST" "$MANIFEST_TMP" "$SCORES_FILE" <<'PYEOF'
import json, sys
manifest_path = sys.argv[1]
tmp_path = sys.argv[2]
scores_path = sys.argv[3]
# scores injected from shell (format: "file1\tscore1\nfile2\tscore2\n…")
scores_raw = """$(for k in "${!SCORES[@]}"; do printf '%s\t%s\n' "$k" "${SCORES[$k]}"; done)"""
# Load scores from the tab-separated file written by the shell loop.
# Values are plain integers produced by fitness.sh — no shell expansion here.
scores = {}
for line in scores_raw.strip().splitlines():
if '\t' in line:
fname, score = line.split('\t', 1)
scores[fname.strip()] = int(score.strip())
with open(scores_path) as sf:
for line in sf:
line = line.rstrip('\n')
if '\t' in line:
fname, score = line.split('\t', 1)
scores[fname.strip()] = int(score.strip())
lines_out = []
with open(manifest_path) as f:
@ -175,9 +188,9 @@ with open(tmp_path, 'w') as f:
PYEOF
mv "$MANIFEST_TMP" "$MANIFEST"
trap - EXIT
trap 'rm -f "$SCORES_FILE"' EXIT
EVALUATED="${#SCORES[@]}"
EVALUATED=$(wc -l < "$SCORES_FILE" | tr -d ' ')
log "Done. Evaluated $EVALUATED seed(s); $FAILURES failure(s)."
log "Results written to $MANIFEST"