package supervisor_test

import (
	"os"
	"runtime"
	"syscall"
	"testing"
	"time"

	"kode.naiv.no/olemd/forgejo-mcp-broker/internal/supervisor"
)

// TestStress_NoLeaksAcross1000Cycles spawns and reaps a thousand children
// in sequence and asserts that the parent process leaks neither file
// descriptors, goroutines, nor zombie children.
//
// Each piece is checked separately so a failure points clearly at the
// culprit:
//   - FD count via /proc/self/fd (Linux-only; the test skips elsewhere).
//   - goroutine count via runtime.NumGoroutine() with a small fudge for
//     runtime-internal goroutines that wax and wane.
//   - zombies via a non-blocking syscall.Wait4(-1, ...): ECHILD means the
//     process has no children left at all, a positive pid means an exited
//     child had not been reaped, and pid 0 means a child exists but is
//     still running.
//
// Bypass with -short for the cheap unit tests during inner-loop development.
func TestStress_NoLeaksAcross1000Cycles(t *testing.T) {
	if testing.Short() {
		t.Skip("stress test (~5s); rerun without -short")
	}
	if _, err := os.Stat("/proc/self/fd"); err != nil {
		t.Skipf("FD probe requires /proc/self/fd (Linux): %v", err)
	}
	// Use /bin/true rather than the test-binary helper-process: spawning
	// 1000 copies of the test binary itself drowns the test in re-entrant
	// startup cost and (on some kernels) fd-inheritance edge cases. /bin/true
	// is the canonical "exits immediately, does no IO" binary.
	if _, err := os.Stat("/bin/true"); err != nil {
		t.Skipf("/bin/true required: %v", err)
	}
	cmd := []string{"/bin/true"}
	var env []string

	// Warm-up: GC, allow runtime goroutines to settle, take baselines.
	runtime.GC()
	time.Sleep(50 * time.Millisecond)
	fdsBefore := countOpenFDs(t)
	gosBefore := runtime.NumGoroutine()

	const cycles = 1000
	start := time.Now()
	for i := 0; i < cycles; i++ {
		c, err := supervisor.Start(t.Context(), supervisor.Config{Cmd: cmd, Env: env})
		if err != nil {
			t.Fatalf("Start cycle %d: %v", i, err)
		}
		// Bound each cycle so a child that never exits fails the test
		// instead of hanging it.
		select {
		case <-c.Done():
		case <-time.After(2 * time.Second):
			t.Fatalf("cycle %d: child did not exit within 2s", i)
		}
	}
	elapsed := time.Since(start)
	t.Logf("%d cycles in %s (%.1f spawn/s)", cycles, elapsed, float64(cycles)/elapsed.Seconds())

	// Let goroutines wind down before sampling.
	runtime.GC()
	time.Sleep(100 * time.Millisecond)
	fdsAfter := countOpenFDs(t)
	gosAfter := runtime.NumGoroutine()

	// Allow a small slack — Go's runtime can spawn or shed background
	// goroutines/FDs unrelated to our test. A real leak shows up as
	// hundreds, not single digits.
	const slack = 5
	if delta := fdsAfter - fdsBefore; delta > slack {
		t.Errorf("FD leak: %d before → %d after (Δ=%d, slack=%d)", fdsBefore, fdsAfter, delta, slack)
	}
	if delta := gosAfter - gosBefore; delta > slack {
		t.Errorf("goroutine leak: %d before → %d after (Δ=%d, slack=%d)", gosBefore, gosAfter, delta, slack)
	}

	// Zombie check. With everything reaped, Wait4(-1) should return ECHILD.
	// Note that a positive pid means this call itself just reaped the child.
	var status syscall.WaitStatus
	pid, err := syscall.Wait4(-1, &status, syscall.WNOHANG, nil)
	switch {
	case pid > 0:
		t.Errorf("zombie process leaked: pid=%d status=%v", pid, status)
	case err == nil:
		// pid == 0 under WNOHANG means at least one child exists but none
		// has changed state yet, i.e. something is still running. That is
		// not a zombie, so it is reported without failing the test.
		t.Log("Wait4 returned pid=0: a child process exists but has not exited (not a zombie; not failing)")
	case err == syscall.ECHILD:
		// Expected: no children to wait for.
	default:
		t.Errorf("unexpected Wait4 error: %v", err)
	}
}

// TestStress_StopMidLifecycle runs 200 cycles that exercise the Stop path
// (rather than letting the child exit on its own), to catch leaks specific
// to the SIGTERM/SIGKILL path.
func TestStress_StopMidLifecycle(t *testing.T) { if testing.Short() { t.Skip("stress test (~3s); rerun without -short") } if _, err := os.Stat("/proc/self/fd"); err != nil { t.Skipf("FD probe requires /proc/self/fd (Linux): %v", err) } // /bin/cat acts as an echo-to-EOF helper: it reads stdin and writes // stdout until stdin closes. Lighter than re-execing the test binary. if _, err := os.Stat("/bin/cat"); err != nil { t.Skipf("/bin/cat required: %v", err) } cmd := []string{"/bin/cat"} var env []string runtime.GC() time.Sleep(50 * time.Millisecond) fdsBefore := countOpenFDs(t) gosBefore := runtime.NumGoroutine() const cycles = 200 for i := 0; i < cycles; i++ { c, err := supervisor.Start(t.Context(), supervisor.Config{Cmd: cmd, Env: env}) if err != nil { t.Fatalf("Start cycle %d: %v", i, err) } // Echo helper exits cleanly when its stdin is closed; Stop drives // that via stdin.Close + SIGTERM. if err := c.Stop(t.Context()); err != nil { // Some helpers may report a SIGTERM exit; not a failure. t.Logf("cycle %d Stop: %v", i, err) } } runtime.GC() time.Sleep(100 * time.Millisecond) if delta := countOpenFDs(t) - fdsBefore; delta > 5 { t.Errorf("FD leak across %d Stop cycles: Δ=%d", cycles, delta) } if delta := runtime.NumGoroutine() - gosBefore; delta > 5 { t.Errorf("goroutine leak across %d Stop cycles: Δ=%d", cycles, delta) } } func countOpenFDs(t *testing.T) int { t.Helper() entries, err := os.ReadDir("/proc/self/fd") if err != nil { t.Fatalf("/proc/self/fd: %v", err) } return len(entries) }