#!/usr/bin/env bash set -euo pipefail cd "$(dirname "$0")/.." # Timeout constants (in seconds) readonly ANVIL_TIMEOUT=60 # Anvil starts fast (increased for first-time setup) readonly POSTGRES_TIMEOUT=30 # Database init is quick readonly BOOTSTRAP_TIMEOUT=120 # Contract deployment + seeding readonly PONDER_TIMEOUT=120 # Must index bootstrap events readonly WEBAPP_TIMEOUT=120 # npm install + Vite startup readonly CADDY_TIMEOUT=20 # Proxy starts instantly readonly POLL_INTERVAL=2 # Check health every N seconds readonly MAX_DOCKER_DISK_GB=20 # Maximum Docker disk usage in GB PID_FILE=/tmp/kraiken-watcher.pid PROJECT_NAME=${COMPOSE_PROJECT_NAME:-$(basename "$PWD")} # Detect container runtime if command -v docker compose &> /dev/null; then COMPOSE_CMD="docker compose" RUNTIME_CMD="docker" elif command -v docker-compose &> /dev/null; then COMPOSE_CMD="docker-compose" RUNTIME_CMD="docker" else echo "Error: docker/docker-compose not found. Please install Docker." echo "" echo "Installation instructions:" echo " Linux: https://docs.docker.com/engine/install/" echo " Mac: brew install colima docker docker-compose && colima start" exit 1 fi container_name() { local service="$1" echo "${PROJECT_NAME}_${service}_1" } # Check Docker disk usage and warn if approaching limits check_docker_disk_usage() { if ! command -v docker &> /dev/null; then return 0 # Skip if Docker not available fi # Get total Docker disk usage in GB (works on Linux and macOS) local total_size_bytes total_size_bytes=$(docker system df --format '{{.Size}}' 2>/dev/null | \ sed 's/[^0-9.]//g' | awk '{sum+=$1} END {print sum}' || echo "0") # Parse the actual usage more accurately local docker_df_output docker_df_output=$(docker system df 2>/dev/null || echo "") if [[ -z "$docker_df_output" ]]; then return 0 # Docker not running fi # Extract total reclaimable space (more accurate than parsing Size) local total_gb total_gb=$(echo "$docker_df_output" | tail -n 1 | awk '{print $NF}' | sed 's/GB//; s/MB/\/1024/; s/KB/\/1048576/' | bc -l 2>/dev/null || echo "0") # Alternative: sum up all TYPE sizes (column 3 has the SIZE) local images_size containers_size volumes_size build_cache_size images_size=$(echo "$docker_df_output" | grep "Images" | awk '{print $3}' | sed 's/GB$//; s/MB$/\/1024/; s/KB$/\/1048576/; s/B$/\/1073741824/' | sed 's/^$/0/' | bc -l 2>/dev/null || echo "0") containers_size=$(echo "$docker_df_output" | grep "Containers" | awk '{print $3}' | sed 's/GB$//; s/MB$/\/1024/; s/KB$/\/1048576/; s/B$/\/1073741824/' | sed 's/^$/0/' | bc -l 2>/dev/null || echo "0") volumes_size=$(echo "$docker_df_output" | grep "Local Volumes" | awk '{print $3}' | sed 's/GB$//; s/MB$/\/1024/; s/KB$/\/1048576/; s/B$/\/1073741824/' | sed 's/^$/0/' | bc -l 2>/dev/null || echo "0") build_cache_size=$(echo "$docker_df_output" | grep "Build Cache" | awk '{print $3}' | sed 's/GB$//; s/MB$/\/1024/; s/KB$/\/1048576/; s/B$/\/1073741824/' | sed 's/^$/0/' | bc -l 2>/dev/null || echo "0") total_gb=$(echo "$images_size + $containers_size + $volumes_size + $build_cache_size" | bc -l 2>/dev/null || echo "0") # Round to 1 decimal place total_gb=$(printf "%.1f" "$total_gb" 2>/dev/null || echo "0") echo " Docker disk usage: ${total_gb}GB / ${MAX_DOCKER_DISK_GB}GB limit" # Warn if approaching 80% of limit (16GB) if (( $(echo "$total_gb > 16" | bc -l 2>/dev/null || echo "0") )); then echo " [!!] WARNING: Docker disk usage is high!" echo " [!!] Run './scripts/cleanup-disk.sh' to free up space" fi # Hard stop if over limit if (( $(echo "$total_gb > $MAX_DOCKER_DISK_GB" | bc -l 2>/dev/null || echo "0") )); then echo "" echo "ERROR: Docker disk usage exceeds ${MAX_DOCKER_DISK_GB}GB limit!" echo "Run './scripts/cleanup-disk.sh' to free up space, then try again." exit 1 fi } cleanup_existing() { # Kill any existing watch scripts pkill -f "watch-kraiken-lib.sh" 2>/dev/null || true pkill -f "inotifywait.*$(pwd)/kraiken-lib" 2>/dev/null || true # Remove PID file rm -f "$PID_FILE" # Kill zombie container processes pkill -9 -f "${RUNTIME_CMD} wait.*${PROJECT_NAME}_" 2>/dev/null || true # Remove any existing containers (suppress errors if they don't exist) echo " Cleaning up existing containers..." ${RUNTIME_CMD} ps -a --filter "label=com.docker.compose.project=${PROJECT_NAME}" --format "{{.Names}}" 2>/dev/null | \ xargs -r ${RUNTIME_CMD} rm -f 2>&1 | grep -v "Error.*no container" || true } # Wait for container to be healthy (via healthcheck) wait_for_healthy() { local container=$1 local timeout_sec=$2 local max_attempts=$((timeout_sec / POLL_INTERVAL)) local start_time=$(date +%s) for i in $(seq 1 "$max_attempts"); do # Docker doesn't have a standalone healthcheck command, check via inspect local health_status health_status=$(${RUNTIME_CMD} inspect --format='{{.State.Health.Status}}' "$container" 2>/dev/null || echo "unknown") if [[ "$health_status" == "healthy" ]]; then local elapsed=$(($(date +%s) - start_time)) echo " ✓ $container ready (${elapsed}s)" return 0 fi sleep "$POLL_INTERVAL" done echo "ERROR: $container failed to become healthy after ${timeout_sec}s" return 1 } # Wait for container to exit (used for bootstrap) wait_for_exited() { local container=$1 local timeout_sec=$2 local max_attempts=$((timeout_sec / POLL_INTERVAL)) local start_time=$(date +%s) for i in $(seq 1 "$max_attempts"); do local status status=$(${RUNTIME_CMD} inspect "$container" --format='{{.State.Status}}' 2>/dev/null || echo "unknown") if [[ "$status" == "exited" ]]; then local elapsed=$(($(date +%s) - start_time)) echo " ✓ $container completed (${elapsed}s)" return 0 fi sleep "$POLL_INTERVAL" done echo "ERROR: $container failed to complete after ${timeout_sec}s" return 1 } start_stack() { local stack_start_time=$(date +%s) # Check Docker disk usage before starting check_docker_disk_usage # Clean up any existing processes first cleanup_existing # Show branch if set if [[ -n "${GIT_BRANCH:-}" ]]; then echo "Branch: $GIT_BRANCH" fi echo "Building kraiken-lib..." ./scripts/build-kraiken-lib.sh echo "Starting stack..." # Phase 1: Start base services (no dependencies) echo " Starting anvil & postgres..." ${COMPOSE_CMD} up -d anvil postgres 2>&1 | grep -v "STEP\|Copying\|Writing\|Getting\|fetch\|Installing\|Executing" || true wait_for_healthy "$(container_name anvil)" "$ANVIL_TIMEOUT" || exit 1 wait_for_healthy "$(container_name postgres)" "$POSTGRES_TIMEOUT" || exit 1 # Phase 2: Start bootstrap (depends on anvil & postgres healthy) echo " Starting bootstrap..." ${COMPOSE_CMD} up -d bootstrap >/dev/null 2>&1 wait_for_exited "$(container_name bootstrap)" "$BOOTSTRAP_TIMEOUT" || exit 1 # Phase 3: Start ponder (depends on bootstrap completed) echo " Starting ponder..." ${COMPOSE_CMD} up -d ponder >/dev/null 2>&1 wait_for_healthy "$(container_name ponder)" "$PONDER_TIMEOUT" || exit 1 # Phase 4: Start frontend services (depend on ponder healthy) echo " Starting webapp, landing, txn-bot..." ${COMPOSE_CMD} up -d webapp landing txn-bot >/dev/null 2>&1 wait_for_healthy "$(container_name webapp)" "$WEBAPP_TIMEOUT" || exit 1 # Phase 5: Start caddy (depends on frontend services) echo " Starting caddy..." ${COMPOSE_CMD} up -d caddy >/dev/null 2>&1 wait_for_healthy "$(container_name caddy)" "$CADDY_TIMEOUT" || exit 1 if [[ -z "${SKIP_WATCH:-}" ]]; then echo "Watching for kraiken-lib changes..." ./scripts/watch-kraiken-lib.sh & echo $! > "$PID_FILE" fi local total_time=$(($(date +%s) - stack_start_time)) echo "" echo "[ok] Stack started in ${total_time}s" echo " Web App: http://localhost:8081/app/" echo " RPC Proxy: http://localhost:8081/api/rpc" echo " GraphQL: http://localhost:8081/api/graphql" } stop_stack() { cleanup_existing ${COMPOSE_CMD} down # Aggressive pruning to prevent disk bloat echo " Pruning Docker resources (images, containers, volumes, build cache)..." # Prune build cache aggressively (this is usually the biggest culprit) ${RUNTIME_CMD} builder prune -af 2>&1 | grep -E "Total|deleted" || true # Prune all unused data (containers, networks, images, volumes) ${RUNTIME_CMD} system prune -af --volumes 2>&1 | grep -E "Total reclaimed|deleted" || true echo "[ok] Stack stopped and cleaned" } check_health() { echo "Checking health..." local services=(anvil postgres ponder webapp landing txn-bot caddy) for service in "${services[@]}"; do local container container=$(${RUNTIME_CMD} ps --all \ --filter "label=com.docker.compose.project=${PROJECT_NAME}" \ --filter "label=com.docker.compose.service=${service}" \ --format '{{.Names}}' | head -n1) if [[ -z "$container" ]]; then echo " [??] $service (not created)" continue fi local health_status health_status=$(${RUNTIME_CMD} inspect --format='{{.State.Health.Status}}' "$container" 2>/dev/null || echo "unknown") if [[ "$health_status" == "healthy" ]]; then echo " [ok] $service" else echo " [!!] $service" fi done } restart_light() { echo "Light restart: webapp + txn-bot only..." echo " Preserving Anvil state (contracts remain deployed)" local webapp_container txnbot_container webapp_container=$(${RUNTIME_CMD} ps --all \ --filter "label=com.docker.compose.project=${PROJECT_NAME}" \ --filter "label=com.docker.compose.service=webapp" \ --format '{{.Names}}' | head -n1) txnbot_container=$(${RUNTIME_CMD} ps --all \ --filter "label=com.docker.compose.project=${PROJECT_NAME}" \ --filter "label=com.docker.compose.service=txn-bot" \ --format '{{.Names}}' | head -n1) if [[ -z "$webapp_container" ]]; then echo "[!!] webapp container not found - run './scripts/dev.sh start' first" exit 1 fi local start_time=$(date +%s) echo " Restarting containers..." ${RUNTIME_CMD} restart "$webapp_container" >/dev/null [[ -n "$txnbot_container" ]] && ${RUNTIME_CMD} restart "$txnbot_container" >/dev/null echo " Waiting for webapp to be ready..." local max_attempts=30 local attempt=0 while ((attempt < max_attempts)); do if curl -s -f -o /dev/null http://localhost:5173/app/ 2>/dev/null; then local end_time=$(date +%s) local duration=$((end_time - start_time)) echo "[ok] Light restart complete (~${duration}s)" echo " Web App: http://localhost:8081/app/" return 0 fi sleep 2 ((attempt++)) done echo "[!!] Webapp failed to respond after ${max_attempts} attempts" exit 1 } restart_full() { echo "Full restart: all containers + bootstrap..." stop_stack start_stack echo "[ok] Full restart complete" } usage() { cat <