forgejo-mcp-broker/internal/supervisor/stress_test.go

package supervisor_test

import (
	"os"
	"runtime"
	"syscall"
	"testing"
	"time"

	"kode.naiv.no/olemd/forgejo-mcp-broker/internal/supervisor"
)

// TestStress_NoLeaksAcross1000Cycles spawns and reaps a thousand children
// in sequence and asserts that the parent process leaks neither file
// descriptors, goroutines, nor zombie children.
//
// Each piece is checked separately so a failure points clearly at the
// culprit:
//   - FD count via /proc/self/fd (Linux-only; the test skips elsewhere).
//   - goroutine count via runtime.NumGoroutine() with a small fudge for
//     runtime-internal goroutines that wax and wane.
//   - zombies via repeated syscall.Wait4(-1, ..., WNOHANG, ...): every pid
//     it hands back is an exited child nobody reaped, and ECHILD means no
//     children remain at all.
//
// Bypass with -short for the cheap unit tests during inner-loop development.
func TestStress_NoLeaksAcross1000Cycles(t *testing.T) {
	if testing.Short() {
		t.Skip("stress test (~5s); rerun without -short")
	}
	if _, err := os.Stat("/proc/self/fd"); err != nil {
		t.Skipf("FD probe requires /proc/self/fd (Linux): %v", err)
	}

	// Use /bin/true rather than the test-binary helper-process: spawning
	// 1000 copies of the test binary itself drowns the test in re-entrant
	// startup cost and (on some kernels) fd-inheritance edge cases. /bin/true
	// is the canonical "exits immediately, does no IO" binary.
	if _, err := os.Stat("/bin/true"); err != nil {
		t.Skipf("/bin/true required: %v", err)
	}
	cmd := []string{"/bin/true"}
	var env []string

	// Warm-up: GC, allow runtime goroutines to settle, take baselines.
	runtime.GC()
	time.Sleep(50 * time.Millisecond)
	fdsBefore := countOpenFDs(t)
	gosBefore := runtime.NumGoroutine()

	const cycles = 1000
	start := time.Now()
	for i := 0; i < cycles; i++ {
		c, err := supervisor.Start(t.Context(), supervisor.Config{Cmd: cmd, Env: env})
		if err != nil {
			t.Fatalf("Start cycle %d: %v", i, err)
		}
		// Each cycle must finish on its own; a hung child aborts the run
		// instead of stalling the whole suite.
		select {
		case <-c.Done():
		case <-time.After(2 * time.Second):
			t.Fatalf("cycle %d: child did not exit within 2s", i)
		}
	}
	elapsed := time.Since(start)
	t.Logf("%d cycles in %s (%.1f spawn/s)", cycles, elapsed, float64(cycles)/elapsed.Seconds())

	// Let goroutines wind down before sampling.
	runtime.GC()
	time.Sleep(100 * time.Millisecond)

	fdsAfter := countOpenFDs(t)
	gosAfter := runtime.NumGoroutine()

	// Allow a small slack — Go's runtime can spawn or shed background
	// goroutines/FDs unrelated to our test. A real leak shows up as
	// hundreds, not single digits.
	const slack = 5
	if delta := fdsAfter - fdsBefore; delta > slack {
		t.Errorf("FD leak: %d before → %d after (Δ=%d, slack=%d)",
			fdsBefore, fdsAfter, delta, slack)
	}
	if delta := gosAfter - gosBefore; delta > slack {
		t.Errorf("goroutine leak: %d before → %d after (Δ=%d, slack=%d)",
			gosBefore, gosAfter, delta, slack)
	}

	// Zombie check. With everything reaped, Wait4(-1) returns ECHILD on the
	// first call. Keep draining while it returns a pid so that a systematic
	// leak is reported with its real size rather than as a single process.
	var (
		zombies     int
		firstPID    int
		firstStatus syscall.WaitStatus
	)
	for {
		var status syscall.WaitStatus
		pid, err := syscall.Wait4(-1, &status, syscall.WNOHANG, nil)
		if pid > 0 {
			if zombies == 0 {
				firstPID, firstStatus = pid, status
			}
			zombies++
			continue
		}
		switch {
		case err == syscall.ECHILD:
			// Expected: no children left to wait for.
		case err == nil:
			// With WNOHANG, pid=0 means at least one child exists but has
			// not changed state, i.e. it is still running. That is not a
			// zombie, so note it without failing.
			t.Logf("Wait4 returned pid=0 (a child is still running, not a zombie)")
		default:
			t.Errorf("unexpected Wait4 error: %v", err)
		}
		break
	}
	if zombies > 0 {
		t.Errorf("zombie process leaked: %d unreaped, first pid=%d status=%v",
			zombies, firstPID, firstStatus)
	}
}

// TestStress_StopMidLifecycle runs 200 cycles that exercise the Stop path
// (rather than letting the child exit on its own), to catch leaks specific
// to the SIGTERM/SIGKILL path.
//
// After the cycles it compares the /proc/self/fd entry count and
// runtime.NumGoroutine() against baselines taken up front, and drains
// syscall.Wait4(-1, ..., WNOHANG, ...) to detect children that exited but
// were never reaped.
//
// Bypass with -short; the test also skips when /proc/self/fd or /bin/cat is
// unavailable.
func TestStress_StopMidLifecycle(t *testing.T) {
	if testing.Short() {
		t.Skip("stress test (~3s); rerun without -short")
	}
	if _, err := os.Stat("/proc/self/fd"); err != nil {
		t.Skipf("FD probe requires /proc/self/fd (Linux): %v", err)
	}

	// /bin/cat acts as an echo-to-EOF helper: it reads stdin and writes
	// stdout until stdin closes. Lighter than re-execing the test binary.
	if _, err := os.Stat("/bin/cat"); err != nil {
		t.Skipf("/bin/cat required: %v", err)
	}
	cmd := []string{"/bin/cat"}
	var env []string

	// Warm-up: GC, allow runtime goroutines to settle, take baselines.
	runtime.GC()
	time.Sleep(50 * time.Millisecond)
	fdsBefore := countOpenFDs(t)
	gosBefore := runtime.NumGoroutine()

	const (
		cycles = 200
		// slack absorbs background goroutines/FDs the runtime may spawn or
		// shed; a real leak grows with the cycle count.
		slack = 5
	)
	for i := 0; i < cycles; i++ {
		c, err := supervisor.Start(t.Context(), supervisor.Config{Cmd: cmd, Env: env})
		if err != nil {
			t.Fatalf("Start cycle %d: %v", i, err)
		}
		// Echo helper exits cleanly when its stdin is closed; Stop drives
		// that via stdin.Close + SIGTERM.
		if err := c.Stop(t.Context()); err != nil {
			// Some helpers may report a SIGTERM exit; not a failure.
			t.Logf("cycle %d Stop: %v", i, err)
		}
	}

	// Let goroutines wind down before sampling.
	runtime.GC()
	time.Sleep(100 * time.Millisecond)

	if delta := countOpenFDs(t) - fdsBefore; delta > slack {
		t.Errorf("FD leak across %d Stop cycles: Δ=%d", cycles, delta)
	}
	if delta := runtime.NumGoroutine() - gosBefore; delta > slack {
		t.Errorf("goroutine leak across %d Stop cycles: Δ=%d", cycles, delta)
	}

	// Zombie check: a stopped child that nobody waited on stays behind as a
	// zombie. Drain Wait4(-1) so the report carries the full count; ECHILD
	// means there are no children left at all.
	var (
		zombies     int
		firstPID    int
		firstStatus syscall.WaitStatus
	)
	for {
		var status syscall.WaitStatus
		pid, err := syscall.Wait4(-1, &status, syscall.WNOHANG, nil)
		if pid > 0 {
			if zombies == 0 {
				firstPID, firstStatus = pid, status
			}
			zombies++
			continue
		}
		switch {
		case err == syscall.ECHILD:
			// Expected: no children left to wait for.
		case err == nil:
			// With WNOHANG, pid=0 means a child exists but is still
			// running; that is not a zombie, so only note it.
			t.Logf("Wait4 returned pid=0 (a child is still running, not a zombie)")
		default:
			t.Errorf("unexpected Wait4 error: %v", err)
		}
		break
	}
	if zombies > 0 {
		t.Errorf("zombie process leaked across %d Stop cycles: %d unreaped, first pid=%d status=%v",
			cycles, zombies, firstPID, firstStatus)
	}
}

// countOpenFDs reports how many entries /proc/self/fd lists for the current
// process. It aborts the calling test via t.Fatalf when the directory cannot
// be read. The stress tests call it before and after their cycles and compare
// the two readings.
func countOpenFDs(t *testing.T) int {
	t.Helper()

	const fdDir = "/proc/self/fd"
	fds, readErr := os.ReadDir(fdDir)
	if readErr == nil {
		return len(fds)
	}
	t.Fatalf("/proc/self/fd: %v", readErr)
	return 0 // not reached: Fatalf stops the test goroutine
}