test(supervisor): stress tests for FD/goroutine/zombie leaks (forgejo-mcp-broker-31t)

Adds two stress tests:

  TestStress_NoLeaksAcross1000Cycles — spawns and reaps 1000 children
  in sequence, asserts FD count, goroutine count, and zombie status are
  all stable.

  TestStress_StopMidLifecycle — 200 cycles that exercise the Stop path
  (SIGTERM via Close+Signal) rather than relying on natural exit.

Bypassed by -short for the unit-test inner loop.

Notable findings:

* Using the helper-process pattern at this scale was a dead end. Each
  spawn re-execs the test binary, which inherits the parent's open FDs
  and runs Go's `testing` package init. Past a few hundred cycles the
  inner test binaries drag delivery of EOF on their inherited stderr
  pipe ends, leaving drainStderr goroutines blocked in bufio.ReadString
  even after Wait returned. Replacing the helper with /bin/true (for
  quick-exit) and /bin/cat (for echo-loop) sidesteps the recursion and
  is closer to the production case anyway: the broker spawns
  forgejo-mcp, not itself.

* Defensively close stdout/stderr handles in supervisor's reap goroutine
  after cmd.Wait returns. cmd.StderrPipe is supposed to be closed by
  Wait, but under load the kernel doesn't always deliver EOF promptly
  through Go 1.26's pidfd-based wait path; an explicit Close ensures
  drainStderr exits and FDs aren't held longer than needed.

Tests pass under -race with FD/goroutine deltas in single digits across
1000+200 cycles, and Wait4(-1) confirms no zombie children.

Closes forgejo-mcp-broker-31t. Phase 3 complete.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Ole-Morten Duesund 2026-04-27 16:04:34 +02:00
commit 03f67786be
3 changed files with 179 additions and 3 deletions

View file

@ -2,8 +2,8 @@
{"id":"forgejo-mcp-broker-ytw","title":"Phase 5b: bearer-token middleware on /mcp","description":"Middleware reads Authorization: Bearer \u003cmcp_token\u003e, resolves via store, attaches Forgejo access token to request context. 401 for missing/expired/revoked.","acceptance_criteria":"Table-driven tests: missing header, malformed, unknown token, expired, revoked, valid. Valid-token path puts Forgejo token on ctx via typed key.","status":"open","priority":1,"issue_type":"task","owner":"olemd@glemt.net","created_at":"2026-04-24T15:45:18Z","created_by":"Ole-Morten Duesund","updated_at":"2026-04-24T15:45:18Z","dependencies":[{"issue_id":"forgejo-mcp-broker-ytw","depends_on_id":"forgejo-mcp-broker-pur","type":"blocks","created_at":"2026-04-24T17:45:30Z","created_by":"Ole-Morten Duesund","metadata":"{}"}],"dependency_count":1,"dependent_count":1,"comment_count":0} {"id":"forgejo-mcp-broker-ytw","title":"Phase 5b: bearer-token middleware on /mcp","description":"Middleware reads Authorization: Bearer \u003cmcp_token\u003e, resolves via store, attaches Forgejo access token to request context. 401 for missing/expired/revoked.","acceptance_criteria":"Table-driven tests: missing header, malformed, unknown token, expired, revoked, valid. Valid-token path puts Forgejo token on ctx via typed key.","status":"open","priority":1,"issue_type":"task","owner":"olemd@glemt.net","created_at":"2026-04-24T15:45:18Z","created_by":"Ole-Morten Duesund","updated_at":"2026-04-24T15:45:18Z","dependencies":[{"issue_id":"forgejo-mcp-broker-ytw","depends_on_id":"forgejo-mcp-broker-pur","type":"blocks","created_at":"2026-04-24T17:45:30Z","created_by":"Ole-Morten Duesund","metadata":"{}"}],"dependency_count":1,"dependent_count":1,"comment_count":0}
{"id":"forgejo-mcp-broker-t81","title":"Phase 5a: session registry + spawn-on-initialize","description":"Map Mcp-Session-Id -\u003e supervisor.Child + user metadata. On first initialize for unknown sid, spawn forgejo-mcp with user's Forgejo token in env, bind to bridge. LastActive bumped per request.","acceptance_criteria":"Tests with fake supervisor + fake bridge cover: spawn-on-initialize, reuse for subsequent messages, unknown-sid returns 410, max-sessions cap enforced.","status":"open","priority":1,"issue_type":"task","owner":"olemd@glemt.net","created_at":"2026-04-24T15:45:17Z","created_by":"Ole-Morten Duesund","updated_at":"2026-04-24T15:45:17Z","dependencies":[{"issue_id":"forgejo-mcp-broker-t81","depends_on_id":"forgejo-mcp-broker-am1","type":"blocks","created_at":"2026-04-24T17:45:29Z","created_by":"Ole-Morten Duesund","metadata":"{}"},{"issue_id":"forgejo-mcp-broker-t81","depends_on_id":"forgejo-mcp-broker-pur","type":"blocks","created_at":"2026-04-24T17:45:30Z","created_by":"Ole-Morten Duesund","metadata":"{}"},{"issue_id":"forgejo-mcp-broker-t81","depends_on_id":"forgejo-mcp-broker-zuq","type":"blocks","created_at":"2026-04-24T17:45:28Z","created_by":"Ole-Morten Duesund","metadata":"{}"}],"dependency_count":3,"dependent_count":2,"comment_count":0} {"id":"forgejo-mcp-broker-t81","title":"Phase 5a: session registry + spawn-on-initialize","description":"Map Mcp-Session-Id -\u003e supervisor.Child + user metadata. On first initialize for unknown sid, spawn forgejo-mcp with user's Forgejo token in env, bind to bridge. LastActive bumped per request.","acceptance_criteria":"Tests with fake supervisor + fake bridge cover: spawn-on-initialize, reuse for subsequent messages, unknown-sid returns 410, max-sessions cap enforced.","status":"open","priority":1,"issue_type":"task","owner":"olemd@glemt.net","created_at":"2026-04-24T15:45:17Z","created_by":"Ole-Morten Duesund","updated_at":"2026-04-24T15:45:17Z","dependencies":[{"issue_id":"forgejo-mcp-broker-t81","depends_on_id":"forgejo-mcp-broker-am1","type":"blocks","created_at":"2026-04-24T17:45:29Z","created_by":"Ole-Morten Duesund","metadata":"{}"},{"issue_id":"forgejo-mcp-broker-t81","depends_on_id":"forgejo-mcp-broker-pur","type":"blocks","created_at":"2026-04-24T17:45:30Z","created_by":"Ole-Morten Duesund","metadata":"{}"},{"issue_id":"forgejo-mcp-broker-t81","depends_on_id":"forgejo-mcp-broker-zuq","type":"blocks","created_at":"2026-04-24T17:45:28Z","created_by":"Ole-Morten Duesund","metadata":"{}"}],"dependency_count":3,"dependent_count":2,"comment_count":0}
{"id":"forgejo-mcp-broker-xot","title":"Phase 4b: bridge integration test against real forgejo-mcp","description":"Drive the bridge with initialize -\u003e tools/list -\u003e tools/call get_forgejo_mcp_server_version against a real forgejo-mcp subprocess. Validates the opaque-pipe assumption.","acceptance_criteria":"Full handshake, tools/list returns expected set, tools/call returns a version string. Tagged as integration test if runtime exceeds 2s.","status":"open","priority":1,"issue_type":"task","owner":"olemd@glemt.net","created_at":"2026-04-24T15:45:16Z","created_by":"Ole-Morten Duesund","updated_at":"2026-04-24T15:45:16Z","dependencies":[{"issue_id":"forgejo-mcp-broker-xot","depends_on_id":"forgejo-mcp-broker-am1","type":"blocks","created_at":"2026-04-24T17:45:28Z","created_by":"Ole-Morten Duesund","metadata":"{}"}],"dependency_count":1,"dependent_count":0,"comment_count":0} {"id":"forgejo-mcp-broker-xot","title":"Phase 4b: bridge integration test against real forgejo-mcp","description":"Drive the bridge with initialize -\u003e tools/list -\u003e tools/call get_forgejo_mcp_server_version against a real forgejo-mcp subprocess. Validates the opaque-pipe assumption.","acceptance_criteria":"Full handshake, tools/list returns expected set, tools/call returns a version string. Tagged as integration test if runtime exceeds 2s.","status":"open","priority":1,"issue_type":"task","owner":"olemd@glemt.net","created_at":"2026-04-24T15:45:16Z","created_by":"Ole-Morten Duesund","updated_at":"2026-04-24T15:45:16Z","dependencies":[{"issue_id":"forgejo-mcp-broker-xot","depends_on_id":"forgejo-mcp-broker-am1","type":"blocks","created_at":"2026-04-24T17:45:28Z","created_by":"Ole-Morten Duesund","metadata":"{}"}],"dependency_count":1,"dependent_count":0,"comment_count":0}
{"id":"forgejo-mcp-broker-31t","title":"Phase 3b: supervisor stress tests (FD/goroutine/zombie leak detection)","description":"1000 spawn/stop cycles under -race. Verify no FD leak, no goroutine leak (go.uber.org/goleak), no zombies (wait4 returns ECHILD when idle).","acceptance_criteria":"Cycle test passes under -race. FD count stable within a small constant. goleak detects no extra goroutines after test.","status":"open","priority":1,"issue_type":"task","owner":"olemd@glemt.net","created_at":"2026-04-24T15:45:15Z","created_by":"Ole-Morten Duesund","updated_at":"2026-04-24T15:45:15Z","dependencies":[{"issue_id":"forgejo-mcp-broker-31t","depends_on_id":"forgejo-mcp-broker-zuq","type":"blocks","created_at":"2026-04-24T17:45:26Z","created_by":"Ole-Morten Duesund","metadata":"{}"}],"dependency_count":1,"dependent_count":0,"comment_count":0} {"id":"forgejo-mcp-broker-31t","title":"Phase 3b: supervisor stress tests (FD/goroutine/zombie leak detection)","description":"1000 spawn/stop cycles under -race. Verify no FD leak, no goroutine leak (go.uber.org/goleak), no zombies (wait4 returns ECHILD when idle).","acceptance_criteria":"Cycle test passes under -race. FD count stable within a small constant. goleak detects no extra goroutines after test.","status":"in_progress","priority":1,"issue_type":"task","assignee":"Ole-Morten Duesund","owner":"olemd@glemt.net","created_at":"2026-04-24T15:45:15Z","created_by":"Ole-Morten Duesund","updated_at":"2026-04-27T12:00:32Z","started_at":"2026-04-27T12:00:32Z","dependencies":[{"issue_id":"forgejo-mcp-broker-31t","depends_on_id":"forgejo-mcp-broker-zuq","type":"blocks","created_at":"2026-04-24T17:45:26Z","created_by":"Ole-Morten Duesund","metadata":"{}"}],"dependency_count":1,"dependent_count":0,"comment_count":0}
{"id":"forgejo-mcp-broker-am1","title":"Phase 4a: internal/bridge JSON-RPC pipe + SSE writer","description":"Given a supervisor.Child: inbound HTTP JSON -\u003e newline-framed stdin; stdout lines -\u003e SSE frames. Handle client disconnect without killing the child.","acceptance_criteria":"Unit tests with mock Child that echoes: request/response round trip, multiple concurrent requests with correct id routing, client disconnect mid-stream.","status":"in_progress","priority":1,"issue_type":"task","assignee":"Ole-Morten Duesund","owner":"olemd@glemt.net","created_at":"2026-04-24T15:45:15Z","created_by":"Ole-Morten Duesund","updated_at":"2026-04-27T11:56:15Z","started_at":"2026-04-27T11:56:15Z","dependencies":[{"issue_id":"forgejo-mcp-broker-am1","depends_on_id":"forgejo-mcp-broker-zuq","type":"blocks","created_at":"2026-04-24T17:45:27Z","created_by":"Ole-Morten Duesund","metadata":"{}"}],"dependency_count":1,"dependent_count":2,"comment_count":0} {"id":"forgejo-mcp-broker-am1","title":"Phase 4a: internal/bridge JSON-RPC pipe + SSE writer","description":"Given a supervisor.Child: inbound HTTP JSON -\u003e newline-framed stdin; stdout lines -\u003e SSE frames. Handle client disconnect without killing the child.","acceptance_criteria":"Unit tests with mock Child that echoes: request/response round trip, multiple concurrent requests with correct id routing, client disconnect mid-stream.","status":"closed","priority":1,"issue_type":"task","assignee":"Ole-Morten Duesund","owner":"olemd@glemt.net","created_at":"2026-04-24T15:45:15Z","created_by":"Ole-Morten Duesund","updated_at":"2026-04-27T11:59:35Z","started_at":"2026-04-27T11:56:15Z","closed_at":"2026-04-27T11:59:35Z","close_reason":"Bridge shipped: per-id routing, SSE responses for request/reply messages, 204 for notifications, structured 4xx/5xx for malformed input. Decoupled from supervisor (takes pipes directly) for clean testing via io.Pipe. 90.0% coverage.","dependencies":[{"issue_id":"forgejo-mcp-broker-am1","depends_on_id":"forgejo-mcp-broker-zuq","type":"blocks","created_at":"2026-04-24T17:45:27Z","created_by":"Ole-Morten Duesund","metadata":"{}"}],"dependency_count":1,"dependent_count":2,"comment_count":0}
{"id":"forgejo-mcp-broker-wgo","title":"Phase 2e: OAuth security review + attack-path tests","description":"Phase 2 exit gate. Review every handler for classic OAuth vulns (open redirect, code replay, mix-up, token leak in logs, host spoofing). Add at least one test per attack class. Update design.md §8 with findings.","acceptance_criteria":"Review checklist documented. Tests added for: PKCE mismatch, stale code, token absent from log attributes, bad redirect_uri, mismatched state, replay of used code.","status":"open","priority":1,"issue_type":"task","owner":"olemd@glemt.net","created_at":"2026-04-24T15:45:14Z","created_by":"Ole-Morten Duesund","updated_at":"2026-04-24T15:45:14Z","dependencies":[{"issue_id":"forgejo-mcp-broker-wgo","depends_on_id":"forgejo-mcp-broker-b2o","type":"blocks","created_at":"2026-04-24T17:45:26Z","created_by":"Ole-Morten Duesund","metadata":"{}"},{"issue_id":"forgejo-mcp-broker-wgo","depends_on_id":"forgejo-mcp-broker-pur","type":"blocks","created_at":"2026-04-24T17:45:25Z","created_by":"Ole-Morten Duesund","metadata":"{}"}],"dependency_count":2,"dependent_count":0,"comment_count":0} {"id":"forgejo-mcp-broker-wgo","title":"Phase 2e: OAuth security review + attack-path tests","description":"Phase 2 exit gate. Review every handler for classic OAuth vulns (open redirect, code replay, mix-up, token leak in logs, host spoofing). Add at least one test per attack class. Update design.md §8 with findings.","acceptance_criteria":"Review checklist documented. Tests added for: PKCE mismatch, stale code, token absent from log attributes, bad redirect_uri, mismatched state, replay of used code.","status":"open","priority":1,"issue_type":"task","owner":"olemd@glemt.net","created_at":"2026-04-24T15:45:14Z","created_by":"Ole-Morten Duesund","updated_at":"2026-04-24T15:45:14Z","dependencies":[{"issue_id":"forgejo-mcp-broker-wgo","depends_on_id":"forgejo-mcp-broker-b2o","type":"blocks","created_at":"2026-04-24T17:45:26Z","created_by":"Ole-Morten Duesund","metadata":"{}"},{"issue_id":"forgejo-mcp-broker-wgo","depends_on_id":"forgejo-mcp-broker-pur","type":"blocks","created_at":"2026-04-24T17:45:25Z","created_by":"Ole-Morten Duesund","metadata":"{}"}],"dependency_count":2,"dependent_count":0,"comment_count":0}
{"id":"forgejo-mcp-broker-zuq","title":"Phase 3a: internal/supervisor managed stdio subprocess","description":"Child type: Start, Stop(ctx) with SIGTERM -\u003e grace -\u003e SIGKILL, Wait+reap goroutine (no zombies), stderr drainer with prefix. Protocol-agnostic.","acceptance_criteria":"Unit tests against an echo-loop helper: round trip, graceful stop, kill-after-grace, child-exits-on-own detection, stderr capture. Manual spawn of real forgejo-mcp --transport stdio works.","status":"closed","priority":1,"issue_type":"task","assignee":"Ole-Morten Duesund","owner":"olemd@glemt.net","created_at":"2026-04-24T15:45:14Z","created_by":"Ole-Morten Duesund","updated_at":"2026-04-27T11:41:07Z","started_at":"2026-04-27T11:32:54Z","closed_at":"2026-04-27T11:41:07Z","close_reason":"internal/supervisor shipped: Start/Stop/Done/ExitErr/Pid, SIGTERM-\u003egrace-\u003eSIGKILL escalation, mandatory wait-and-reap. Test uses TestMain helper-process pattern; coverage 89.6% on the testable surface.","dependency_count":0,"dependent_count":3,"comment_count":0} {"id":"forgejo-mcp-broker-zuq","title":"Phase 3a: internal/supervisor managed stdio subprocess","description":"Child type: Start, Stop(ctx) with SIGTERM -\u003e grace -\u003e SIGKILL, Wait+reap goroutine (no zombies), stderr drainer with prefix. Protocol-agnostic.","acceptance_criteria":"Unit tests against an echo-loop helper: round trip, graceful stop, kill-after-grace, child-exits-on-own detection, stderr capture. Manual spawn of real forgejo-mcp --transport stdio works.","status":"closed","priority":1,"issue_type":"task","assignee":"Ole-Morten Duesund","owner":"olemd@glemt.net","created_at":"2026-04-24T15:45:14Z","created_by":"Ole-Morten Duesund","updated_at":"2026-04-27T11:41:07Z","started_at":"2026-04-27T11:32:54Z","closed_at":"2026-04-27T11:41:07Z","close_reason":"internal/supervisor shipped: Start/Stop/Done/ExitErr/Pid, SIGTERM-\u003egrace-\u003eSIGKILL escalation, mandatory wait-and-reap. Test uses TestMain helper-process pattern; coverage 89.6% on the testable surface.","dependency_count":0,"dependent_count":3,"comment_count":0}
{"id":"forgejo-mcp-broker-b2o","title":"Phase 2d: OAuth discovery endpoints (/.well-known/*)","description":"GET /.well-known/oauth-protected-resource and /.well-known/oauth-authorization-server. Issuer URLs MUST derive from cfg.PublicURL, never inbound headers (host-header attack defense per design.md §8).","acceptance_criteria":"Responses validate against RFC 8414/9728 shapes. Issuer URL sourced from config only. supported_scopes matches cfg.ForgejoOAuthScopes.","status":"open","priority":1,"issue_type":"task","owner":"olemd@glemt.net","created_at":"2026-04-24T15:45:13Z","created_by":"Ole-Morten Duesund","updated_at":"2026-04-24T15:45:13Z","dependencies":[{"issue_id":"forgejo-mcp-broker-b2o","depends_on_id":"forgejo-mcp-broker-pur","type":"blocks","created_at":"2026-04-24T17:45:25Z","created_by":"Ole-Morten Duesund","metadata":"{}"}],"dependency_count":1,"dependent_count":1,"comment_count":0} {"id":"forgejo-mcp-broker-b2o","title":"Phase 2d: OAuth discovery endpoints (/.well-known/*)","description":"GET /.well-known/oauth-protected-resource and /.well-known/oauth-authorization-server. Issuer URLs MUST derive from cfg.PublicURL, never inbound headers (host-header attack defense per design.md §8).","acceptance_criteria":"Responses validate against RFC 8414/9728 shapes. Issuer URL sourced from config only. supported_scopes matches cfg.ForgejoOAuthScopes.","status":"open","priority":1,"issue_type":"task","owner":"olemd@glemt.net","created_at":"2026-04-24T15:45:13Z","created_by":"Ole-Morten Duesund","updated_at":"2026-04-24T15:45:13Z","dependencies":[{"issue_id":"forgejo-mcp-broker-b2o","depends_on_id":"forgejo-mcp-broker-pur","type":"blocks","created_at":"2026-04-24T17:45:25Z","created_by":"Ole-Morten Duesund","metadata":"{}"}],"dependency_count":1,"dependent_count":1,"comment_count":0}

View file

@ -0,0 +1,160 @@
package supervisor_test
import (
"os"
"runtime"
"syscall"
"testing"
"time"
"kode.naiv.no/olemd/forgejo-mcp-broker/internal/supervisor"
)
// TestStress_NoLeaksAcross1000Cycles spawns and reaps a thousand children
// in sequence and asserts that the parent process leaks neither file
// descriptors, goroutines, nor zombie children.
//
// Each piece is checked separately so a failure points clearly at the
// culprit:
// - FD count via /proc/self/fd (Linux-only; the test skips elsewhere).
// - goroutine count via runtime.NumGoroutine() with a small fudge for
// runtime-internal goroutines that wax and wane.
// - zombies via syscall.Wait4(-1, ...) returning ECHILD when no
// waitable children remain.
//
// Bypass with -short for the cheap unit tests during inner-loop development.
func TestStress_NoLeaksAcross1000Cycles(t *testing.T) {
if testing.Short() {
t.Skip("stress test (~5s); rerun without -short")
}
if _, err := os.Stat("/proc/self/fd"); err != nil {
t.Skipf("FD probe requires /proc/self/fd (Linux): %v", err)
}
// Use /bin/true rather than the test-binary helper-process: spawning
// 1000 copies of the test binary itself drowns the test in re-entrant
// startup cost and (on some kernels) fd-inheritance edge cases. /bin/true
// is the canonical "exits immediately, does no IO" binary.
if _, err := os.Stat("/bin/true"); err != nil {
t.Skipf("/bin/true required: %v", err)
}
cmd := []string{"/bin/true"}
var env []string
// Warm-up: GC, allow runtime goroutines to settle, take baselines.
runtime.GC()
time.Sleep(50 * time.Millisecond)
fdsBefore := countOpenFDs(t)
gosBefore := runtime.NumGoroutine()
const cycles = 1000
start := time.Now()
for i := 0; i < cycles; i++ {
c, err := supervisor.Start(t.Context(), supervisor.Config{Cmd: cmd, Env: env})
if err != nil {
t.Fatalf("Start cycle %d: %v", i, err)
}
select {
case <-c.Done():
case <-time.After(2 * time.Second):
t.Fatalf("cycle %d: child did not exit within 2s", i)
}
}
elapsed := time.Since(start)
t.Logf("%d cycles in %s (%.1f spawn/s)", cycles, elapsed, float64(cycles)/elapsed.Seconds())
// Let goroutines wind down before sampling.
runtime.GC()
time.Sleep(100 * time.Millisecond)
fdsAfter := countOpenFDs(t)
gosAfter := runtime.NumGoroutine()
// Allow a small slack — Go's runtime can spawn or shed background
// goroutines/FDs unrelated to our test. A real leak shows up as
// hundreds, not single digits.
const slack = 5
if delta := fdsAfter - fdsBefore; delta > slack {
t.Errorf("FD leak: %d before → %d after (Δ=%d, slack=%d)",
fdsBefore, fdsAfter, delta, slack)
}
if delta := gosAfter - gosBefore; delta > slack {
t.Errorf("goroutine leak: %d before → %d after (Δ=%d, slack=%d)",
gosBefore, gosAfter, delta, slack)
}
// Zombie check. With everything reaped, Wait4(-1) should return ECHILD.
var status syscall.WaitStatus
pid, err := syscall.Wait4(-1, &status, syscall.WNOHANG, nil)
switch {
case pid > 0:
t.Errorf("zombie process leaked: pid=%d status=%v", pid, status)
case err == nil:
// Wait4 succeeded with pid=0 (which can also indicate "child
// exists but not yet exited" with WNOHANG); not strictly a
// zombie. Note it but don't fail.
t.Logf("Wait4 returned pid=0 (no waitable child; ok)")
case err == syscall.ECHILD:
// Expected: no children to wait for.
default:
t.Errorf("unexpected Wait4 error: %v", err)
}
}
// TestStress_StopMidLifecycle runs 200 cycles that exercise the Stop path
// (rather than letting the child exit on its own), to catch leaks specific
// to the SIGTERM/SIGKILL path.
func TestStress_StopMidLifecycle(t *testing.T) {
if testing.Short() {
t.Skip("stress test (~3s); rerun without -short")
}
if _, err := os.Stat("/proc/self/fd"); err != nil {
t.Skipf("FD probe requires /proc/self/fd (Linux): %v", err)
}
// /bin/cat acts as an echo-to-EOF helper: it reads stdin and writes
// stdout until stdin closes. Lighter than re-execing the test binary.
if _, err := os.Stat("/bin/cat"); err != nil {
t.Skipf("/bin/cat required: %v", err)
}
cmd := []string{"/bin/cat"}
var env []string
runtime.GC()
time.Sleep(50 * time.Millisecond)
fdsBefore := countOpenFDs(t)
gosBefore := runtime.NumGoroutine()
const cycles = 200
for i := 0; i < cycles; i++ {
c, err := supervisor.Start(t.Context(), supervisor.Config{Cmd: cmd, Env: env})
if err != nil {
t.Fatalf("Start cycle %d: %v", i, err)
}
// Echo helper exits cleanly when its stdin is closed; Stop drives
// that via stdin.Close + SIGTERM.
if err := c.Stop(t.Context()); err != nil {
// Some helpers may report a SIGTERM exit; not a failure.
t.Logf("cycle %d Stop: %v", i, err)
}
}
runtime.GC()
time.Sleep(100 * time.Millisecond)
if delta := countOpenFDs(t) - fdsBefore; delta > 5 {
t.Errorf("FD leak across %d Stop cycles: Δ=%d", cycles, delta)
}
if delta := runtime.NumGoroutine() - gosBefore; delta > 5 {
t.Errorf("goroutine leak across %d Stop cycles: Δ=%d", cycles, delta)
}
}
func countOpenFDs(t *testing.T) int {
t.Helper()
entries, err := os.ReadDir("/proc/self/fd")
if err != nil {
t.Fatalf("/proc/self/fd: %v", err)
}
return len(entries)
}

View file

@ -92,6 +92,13 @@ func Start(ctx context.Context, cfg Config) (*Child, error) {
_ = stdin.Close() _ = stdin.Close()
return nil, fmt.Errorf("supervisor: start %q: %w", cfg.Cmd[0], err) return nil, fmt.Errorf("supervisor: start %q: %w", cfg.Cmd[0], err)
} }
// Close-handles we may need to clean up explicitly post-Wait. Some
// kernels don't deliver EOF to drainStderr promptly under load when
// using cmd.StderrPipe; an explicit Close after Wait ensures
// drainStderr exits and FDs aren't leaked across high-frequency
// spawn/reap cycles.
stdoutCloser, _ := stdout.(io.Closer)
stderrCloser, _ := stderr.(io.Closer)
c := &Child{ c := &Child{
Stdin: stdin, Stdin: stdin,
@ -114,9 +121,18 @@ func Start(ctx context.Context, cfg Config) (*Child, error) {
go drainStderr(stderr, onStderr) go drainStderr(stderr, onStderr)
// Reap the child. cmd.Wait must be called exactly once; do it here so // Reap the child. cmd.Wait must be called exactly once; do it here so
// nobody else has to remember. // nobody else has to remember. After Wait, defensively close the
// stdout/stderr handles so drainStderr definitely exits and our
// reference count drops to zero — important under stress loads where
// the kernel doesn't always deliver EOF promptly.
go func() { go func() {
err := cmd.Wait() err := cmd.Wait()
if stdoutCloser != nil {
_ = stdoutCloser.Close()
}
if stderrCloser != nil {
_ = stderrCloser.Close()
}
c.mu.Lock() c.mu.Lock()
c.exitErr = err c.exitErr = err
c.mu.Unlock() c.mu.Unlock()