160 lines
5.1 KiB
Go
160 lines
5.1 KiB
Go
|
|
package supervisor_test
|
||
|
|
|
||
|
|
import (
|
||
|
|
"os"
|
||
|
|
"runtime"
|
||
|
|
"syscall"
|
||
|
|
"testing"
|
||
|
|
"time"
|
||
|
|
|
||
|
|
"kode.naiv.no/olemd/forgejo-mcp-broker/internal/supervisor"
|
||
|
|
)
|
||
|
|
|
||
|
|
// TestStress_NoLeaksAcross1000Cycles spawns and reaps a thousand children
|
||
|
|
// in sequence and asserts that the parent process leaks neither file
|
||
|
|
// descriptors, goroutines, nor zombie children.
|
||
|
|
//
|
||
|
|
// Each piece is checked separately so a failure points clearly at the
|
||
|
|
// culprit:
|
||
|
|
// - FD count via /proc/self/fd (Linux-only; the test skips elsewhere).
|
||
|
|
// - goroutine count via runtime.NumGoroutine() with a small fudge for
|
||
|
|
// runtime-internal goroutines that wax and wane.
|
||
|
|
// - zombies via syscall.Wait4(-1, ...) returning ECHILD when no
|
||
|
|
// waitable children remain.
|
||
|
|
//
|
||
|
|
// Bypass with -short for the cheap unit tests during inner-loop development.
|
||
|
|
func TestStress_NoLeaksAcross1000Cycles(t *testing.T) {
|
||
|
|
if testing.Short() {
|
||
|
|
t.Skip("stress test (~5s); rerun without -short")
|
||
|
|
}
|
||
|
|
if _, err := os.Stat("/proc/self/fd"); err != nil {
|
||
|
|
t.Skipf("FD probe requires /proc/self/fd (Linux): %v", err)
|
||
|
|
}
|
||
|
|
|
||
|
|
// Use /bin/true rather than the test-binary helper-process: spawning
|
||
|
|
// 1000 copies of the test binary itself drowns the test in re-entrant
|
||
|
|
// startup cost and (on some kernels) fd-inheritance edge cases. /bin/true
|
||
|
|
// is the canonical "exits immediately, does no IO" binary.
|
||
|
|
if _, err := os.Stat("/bin/true"); err != nil {
|
||
|
|
t.Skipf("/bin/true required: %v", err)
|
||
|
|
}
|
||
|
|
cmd := []string{"/bin/true"}
|
||
|
|
var env []string
|
||
|
|
|
||
|
|
// Warm-up: GC, allow runtime goroutines to settle, take baselines.
|
||
|
|
runtime.GC()
|
||
|
|
time.Sleep(50 * time.Millisecond)
|
||
|
|
fdsBefore := countOpenFDs(t)
|
||
|
|
gosBefore := runtime.NumGoroutine()
|
||
|
|
|
||
|
|
const cycles = 1000
|
||
|
|
start := time.Now()
|
||
|
|
for i := 0; i < cycles; i++ {
|
||
|
|
c, err := supervisor.Start(t.Context(), supervisor.Config{Cmd: cmd, Env: env})
|
||
|
|
if err != nil {
|
||
|
|
t.Fatalf("Start cycle %d: %v", i, err)
|
||
|
|
}
|
||
|
|
select {
|
||
|
|
case <-c.Done():
|
||
|
|
case <-time.After(2 * time.Second):
|
||
|
|
t.Fatalf("cycle %d: child did not exit within 2s", i)
|
||
|
|
}
|
||
|
|
}
|
||
|
|
elapsed := time.Since(start)
|
||
|
|
t.Logf("%d cycles in %s (%.1f spawn/s)", cycles, elapsed, float64(cycles)/elapsed.Seconds())
|
||
|
|
|
||
|
|
// Let goroutines wind down before sampling.
|
||
|
|
runtime.GC()
|
||
|
|
time.Sleep(100 * time.Millisecond)
|
||
|
|
|
||
|
|
fdsAfter := countOpenFDs(t)
|
||
|
|
gosAfter := runtime.NumGoroutine()
|
||
|
|
|
||
|
|
// Allow a small slack — Go's runtime can spawn or shed background
|
||
|
|
// goroutines/FDs unrelated to our test. A real leak shows up as
|
||
|
|
// hundreds, not single digits.
|
||
|
|
const slack = 5
|
||
|
|
if delta := fdsAfter - fdsBefore; delta > slack {
|
||
|
|
t.Errorf("FD leak: %d before → %d after (Δ=%d, slack=%d)",
|
||
|
|
fdsBefore, fdsAfter, delta, slack)
|
||
|
|
}
|
||
|
|
if delta := gosAfter - gosBefore; delta > slack {
|
||
|
|
t.Errorf("goroutine leak: %d before → %d after (Δ=%d, slack=%d)",
|
||
|
|
gosBefore, gosAfter, delta, slack)
|
||
|
|
}
|
||
|
|
|
||
|
|
// Zombie check. With everything reaped, Wait4(-1) should return ECHILD.
|
||
|
|
var status syscall.WaitStatus
|
||
|
|
pid, err := syscall.Wait4(-1, &status, syscall.WNOHANG, nil)
|
||
|
|
switch {
|
||
|
|
case pid > 0:
|
||
|
|
t.Errorf("zombie process leaked: pid=%d status=%v", pid, status)
|
||
|
|
case err == nil:
|
||
|
|
// Wait4 succeeded with pid=0 (which can also indicate "child
|
||
|
|
// exists but not yet exited" with WNOHANG); not strictly a
|
||
|
|
// zombie. Note it but don't fail.
|
||
|
|
t.Logf("Wait4 returned pid=0 (no waitable child; ok)")
|
||
|
|
case err == syscall.ECHILD:
|
||
|
|
// Expected: no children to wait for.
|
||
|
|
default:
|
||
|
|
t.Errorf("unexpected Wait4 error: %v", err)
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
// TestStress_StopMidLifecycle runs 200 cycles that exercise the Stop path
|
||
|
|
// (rather than letting the child exit on its own), to catch leaks specific
|
||
|
|
// to the SIGTERM/SIGKILL path.
|
||
|
|
func TestStress_StopMidLifecycle(t *testing.T) {
|
||
|
|
if testing.Short() {
|
||
|
|
t.Skip("stress test (~3s); rerun without -short")
|
||
|
|
}
|
||
|
|
if _, err := os.Stat("/proc/self/fd"); err != nil {
|
||
|
|
t.Skipf("FD probe requires /proc/self/fd (Linux): %v", err)
|
||
|
|
}
|
||
|
|
|
||
|
|
// /bin/cat acts as an echo-to-EOF helper: it reads stdin and writes
|
||
|
|
// stdout until stdin closes. Lighter than re-execing the test binary.
|
||
|
|
if _, err := os.Stat("/bin/cat"); err != nil {
|
||
|
|
t.Skipf("/bin/cat required: %v", err)
|
||
|
|
}
|
||
|
|
cmd := []string{"/bin/cat"}
|
||
|
|
var env []string
|
||
|
|
|
||
|
|
runtime.GC()
|
||
|
|
time.Sleep(50 * time.Millisecond)
|
||
|
|
fdsBefore := countOpenFDs(t)
|
||
|
|
gosBefore := runtime.NumGoroutine()
|
||
|
|
|
||
|
|
const cycles = 200
|
||
|
|
for i := 0; i < cycles; i++ {
|
||
|
|
c, err := supervisor.Start(t.Context(), supervisor.Config{Cmd: cmd, Env: env})
|
||
|
|
if err != nil {
|
||
|
|
t.Fatalf("Start cycle %d: %v", i, err)
|
||
|
|
}
|
||
|
|
// Echo helper exits cleanly when its stdin is closed; Stop drives
|
||
|
|
// that via stdin.Close + SIGTERM.
|
||
|
|
if err := c.Stop(t.Context()); err != nil {
|
||
|
|
// Some helpers may report a SIGTERM exit; not a failure.
|
||
|
|
t.Logf("cycle %d Stop: %v", i, err)
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
runtime.GC()
|
||
|
|
time.Sleep(100 * time.Millisecond)
|
||
|
|
|
||
|
|
if delta := countOpenFDs(t) - fdsBefore; delta > 5 {
|
||
|
|
t.Errorf("FD leak across %d Stop cycles: Δ=%d", cycles, delta)
|
||
|
|
}
|
||
|
|
if delta := runtime.NumGoroutine() - gosBefore; delta > 5 {
|
||
|
|
t.Errorf("goroutine leak across %d Stop cycles: Δ=%d", cycles, delta)
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
// countOpenFDs returns the number of entries listed under /proc/self/fd,
// which the stress tests use as the count of file descriptors currently open
// in the test process. It aborts the calling test via t.Fatalf if the
// directory cannot be read.
func countOpenFDs(t *testing.T) int {
	t.Helper()
	const fdDir = "/proc/self/fd"
	listing, readErr := os.ReadDir(fdDir)
	if readErr != nil {
		t.Fatalf("/proc/self/fd: %v", readErr)
	}
	return len(listing)
}
|