Adds two stress tests: TestStress_NoLeaksAcross1000Cycles — spawns and reaps 1000 children in sequence, asserts FD count, goroutine count, and zombie status are all stable. TestStress_StopMidLifecycle — 200 cycles that exercise the Stop path (SIGTERM via Close+Signal) rather than relying on natural exit. Both are skipped under -short for the unit-test inner loop. Notable findings: * Using the helper-process pattern at this scale was a dead end. Each spawn re-execs the test binary, which inherits the parent's open FDs and runs Go's `testing` package init. Past a few hundred cycles the inner test binaries delay delivery of EOF on their inherited stderr pipe ends, leaving drainStderr goroutines blocked in bufio.ReadString even after Wait returned. Replacing the helper with /bin/true (for quick-exit) and /bin/cat (for echo-loop) sidesteps the recursion and is closer to the production case anyway: the broker spawns forgejo-mcp, not itself. * Defensively close stdout/stderr handles in supervisor's reap goroutine after cmd.Wait returns. cmd.StderrPipe is supposed to be closed by Wait, but under load the kernel doesn't always deliver EOF promptly through Go 1.26's pidfd-based wait path; an explicit Close ensures drainStderr exits and FDs aren't held longer than needed. Tests pass under -race with FD/goroutine deltas in single digits across 1000+200 cycles, and Wait4(-1) confirms no zombie children. Closes forgejo-mcp-broker-31t. Phase 3 complete. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
160 lines
5.1 KiB
Go
160 lines
5.1 KiB
Go
package supervisor_test
|
|
|
|
import (
|
|
"os"
|
|
"runtime"
|
|
"syscall"
|
|
"testing"
|
|
"time"
|
|
|
|
"kode.naiv.no/olemd/forgejo-mcp-broker/internal/supervisor"
|
|
)
|
|
|
|
// TestStress_NoLeaksAcross1000Cycles spawns and reaps a thousand children
|
|
// in sequence and asserts that the parent process leaks neither file
|
|
// descriptors, goroutines, nor zombie children.
|
|
//
|
|
// Each piece is checked separately so a failure points clearly at the
|
|
// culprit:
|
|
// - FD count via /proc/self/fd (Linux-only; the test skips elsewhere).
|
|
// - goroutine count via runtime.NumGoroutine() with a small fudge for
|
|
// runtime-internal goroutines that wax and wane.
|
|
// - zombies via syscall.Wait4(-1, ...) returning ECHILD when no
|
|
// waitable children remain.
|
|
//
|
|
// Bypass with -short for the cheap unit tests during inner-loop development.
|
|
func TestStress_NoLeaksAcross1000Cycles(t *testing.T) {
|
|
if testing.Short() {
|
|
t.Skip("stress test (~5s); rerun without -short")
|
|
}
|
|
if _, err := os.Stat("/proc/self/fd"); err != nil {
|
|
t.Skipf("FD probe requires /proc/self/fd (Linux): %v", err)
|
|
}
|
|
|
|
// Use /bin/true rather than the test-binary helper-process: spawning
|
|
// 1000 copies of the test binary itself drowns the test in re-entrant
|
|
// startup cost and (on some kernels) fd-inheritance edge cases. /bin/true
|
|
// is the canonical "exits immediately, does no IO" binary.
|
|
if _, err := os.Stat("/bin/true"); err != nil {
|
|
t.Skipf("/bin/true required: %v", err)
|
|
}
|
|
cmd := []string{"/bin/true"}
|
|
var env []string
|
|
|
|
// Warm-up: GC, allow runtime goroutines to settle, take baselines.
|
|
runtime.GC()
|
|
time.Sleep(50 * time.Millisecond)
|
|
fdsBefore := countOpenFDs(t)
|
|
gosBefore := runtime.NumGoroutine()
|
|
|
|
const cycles = 1000
|
|
start := time.Now()
|
|
for i := 0; i < cycles; i++ {
|
|
c, err := supervisor.Start(t.Context(), supervisor.Config{Cmd: cmd, Env: env})
|
|
if err != nil {
|
|
t.Fatalf("Start cycle %d: %v", i, err)
|
|
}
|
|
select {
|
|
case <-c.Done():
|
|
case <-time.After(2 * time.Second):
|
|
t.Fatalf("cycle %d: child did not exit within 2s", i)
|
|
}
|
|
}
|
|
elapsed := time.Since(start)
|
|
t.Logf("%d cycles in %s (%.1f spawn/s)", cycles, elapsed, float64(cycles)/elapsed.Seconds())
|
|
|
|
// Let goroutines wind down before sampling.
|
|
runtime.GC()
|
|
time.Sleep(100 * time.Millisecond)
|
|
|
|
fdsAfter := countOpenFDs(t)
|
|
gosAfter := runtime.NumGoroutine()
|
|
|
|
// Allow a small slack — Go's runtime can spawn or shed background
|
|
// goroutines/FDs unrelated to our test. A real leak shows up as
|
|
// hundreds, not single digits.
|
|
const slack = 5
|
|
if delta := fdsAfter - fdsBefore; delta > slack {
|
|
t.Errorf("FD leak: %d before → %d after (Δ=%d, slack=%d)",
|
|
fdsBefore, fdsAfter, delta, slack)
|
|
}
|
|
if delta := gosAfter - gosBefore; delta > slack {
|
|
t.Errorf("goroutine leak: %d before → %d after (Δ=%d, slack=%d)",
|
|
gosBefore, gosAfter, delta, slack)
|
|
}
|
|
|
|
// Zombie check. With everything reaped, Wait4(-1) should return ECHILD.
|
|
var status syscall.WaitStatus
|
|
pid, err := syscall.Wait4(-1, &status, syscall.WNOHANG, nil)
|
|
switch {
|
|
case pid > 0:
|
|
t.Errorf("zombie process leaked: pid=%d status=%v", pid, status)
|
|
case err == nil:
|
|
// Wait4 succeeded with pid=0 (which can also indicate "child
|
|
// exists but not yet exited" with WNOHANG); not strictly a
|
|
// zombie. Note it but don't fail.
|
|
t.Logf("Wait4 returned pid=0 (no waitable child; ok)")
|
|
case err == syscall.ECHILD:
|
|
// Expected: no children to wait for.
|
|
default:
|
|
t.Errorf("unexpected Wait4 error: %v", err)
|
|
}
|
|
}
|
|
|
|
// TestStress_StopMidLifecycle runs 200 cycles that exercise the Stop path
|
|
// (rather than letting the child exit on its own), to catch leaks specific
|
|
// to the SIGTERM/SIGKILL path.
|
|
func TestStress_StopMidLifecycle(t *testing.T) {
|
|
if testing.Short() {
|
|
t.Skip("stress test (~3s); rerun without -short")
|
|
}
|
|
if _, err := os.Stat("/proc/self/fd"); err != nil {
|
|
t.Skipf("FD probe requires /proc/self/fd (Linux): %v", err)
|
|
}
|
|
|
|
// /bin/cat acts as an echo-to-EOF helper: it reads stdin and writes
|
|
// stdout until stdin closes. Lighter than re-execing the test binary.
|
|
if _, err := os.Stat("/bin/cat"); err != nil {
|
|
t.Skipf("/bin/cat required: %v", err)
|
|
}
|
|
cmd := []string{"/bin/cat"}
|
|
var env []string
|
|
|
|
runtime.GC()
|
|
time.Sleep(50 * time.Millisecond)
|
|
fdsBefore := countOpenFDs(t)
|
|
gosBefore := runtime.NumGoroutine()
|
|
|
|
const cycles = 200
|
|
for i := 0; i < cycles; i++ {
|
|
c, err := supervisor.Start(t.Context(), supervisor.Config{Cmd: cmd, Env: env})
|
|
if err != nil {
|
|
t.Fatalf("Start cycle %d: %v", i, err)
|
|
}
|
|
// Echo helper exits cleanly when its stdin is closed; Stop drives
|
|
// that via stdin.Close + SIGTERM.
|
|
if err := c.Stop(t.Context()); err != nil {
|
|
// Some helpers may report a SIGTERM exit; not a failure.
|
|
t.Logf("cycle %d Stop: %v", i, err)
|
|
}
|
|
}
|
|
|
|
runtime.GC()
|
|
time.Sleep(100 * time.Millisecond)
|
|
|
|
if delta := countOpenFDs(t) - fdsBefore; delta > 5 {
|
|
t.Errorf("FD leak across %d Stop cycles: Δ=%d", cycles, delta)
|
|
}
|
|
if delta := runtime.NumGoroutine() - gosBefore; delta > 5 {
|
|
t.Errorf("goroutine leak across %d Stop cycles: Δ=%d", cycles, delta)
|
|
}
|
|
}
|
|
|
|
// countOpenFDs reports how many entries /proc/self/fd currently lists for
// this process. The callers use it for before/after comparison. If the
// directory cannot be read the calling test is aborted via t.Fatalf.
func countOpenFDs(t *testing.T) int {
	t.Helper()

	fds, readErr := os.ReadDir("/proc/self/fd")
	if readErr != nil {
		t.Fatalf("/proc/self/fd: %v", readErr)
	}

	n := len(fds)
	return n
}
|