diff --git a/.beads/issues.jsonl b/.beads/issues.jsonl index 97c4f7f..32488d5 100644 --- a/.beads/issues.jsonl +++ b/.beads/issues.jsonl @@ -2,8 +2,8 @@ {"id":"forgejo-mcp-broker-ytw","title":"Phase 5b: bearer-token middleware on /mcp","description":"Middleware reads Authorization: Bearer \u003cmcp_token\u003e, resolves via store, attaches Forgejo access token to request context. 401 for missing/expired/revoked.","acceptance_criteria":"Table-driven tests: missing header, malformed, unknown token, expired, revoked, valid. Valid-token path puts Forgejo token on ctx via typed key.","status":"open","priority":1,"issue_type":"task","owner":"olemd@glemt.net","created_at":"2026-04-24T15:45:18Z","created_by":"Ole-Morten Duesund","updated_at":"2026-04-24T15:45:18Z","dependencies":[{"issue_id":"forgejo-mcp-broker-ytw","depends_on_id":"forgejo-mcp-broker-pur","type":"blocks","created_at":"2026-04-24T17:45:30Z","created_by":"Ole-Morten Duesund","metadata":"{}"}],"dependency_count":1,"dependent_count":1,"comment_count":0} {"id":"forgejo-mcp-broker-t81","title":"Phase 5a: session registry + spawn-on-initialize","description":"Map Mcp-Session-Id -\u003e supervisor.Child + user metadata. On first initialize for unknown sid, spawn forgejo-mcp with user's Forgejo token in env, bind to bridge. LastActive bumped per request.","acceptance_criteria":"Tests with fake supervisor + fake bridge cover: spawn-on-initialize, reuse for subsequent messages, unknown-sid returns 410, max-sessions cap enforced.","status":"open","priority":1,"issue_type":"task","owner":"olemd@glemt.net","created_at":"2026-04-24T15:45:17Z","created_by":"Ole-Morten Duesund","updated_at":"2026-04-24T15:45:17Z","dependencies":[{"issue_id":"forgejo-mcp-broker-t81","depends_on_id":"forgejo-mcp-broker-am1","type":"blocks","created_at":"2026-04-24T17:45:29Z","created_by":"Ole-Morten Duesund","metadata":"{}"},{"issue_id":"forgejo-mcp-broker-t81","depends_on_id":"forgejo-mcp-broker-pur","type":"blocks","created_at":"2026-04-24T17:45:30Z","created_by":"Ole-Morten Duesund","metadata":"{}"},{"issue_id":"forgejo-mcp-broker-t81","depends_on_id":"forgejo-mcp-broker-zuq","type":"blocks","created_at":"2026-04-24T17:45:28Z","created_by":"Ole-Morten Duesund","metadata":"{}"}],"dependency_count":3,"dependent_count":2,"comment_count":0} {"id":"forgejo-mcp-broker-xot","title":"Phase 4b: bridge integration test against real forgejo-mcp","description":"Drive the bridge with initialize -\u003e tools/list -\u003e tools/call get_forgejo_mcp_server_version against a real forgejo-mcp subprocess. Validates the opaque-pipe assumption.","acceptance_criteria":"Full handshake, tools/list returns expected set, tools/call returns a version string. Tagged as integration test if runtime exceeds 2s.","status":"open","priority":1,"issue_type":"task","owner":"olemd@glemt.net","created_at":"2026-04-24T15:45:16Z","created_by":"Ole-Morten Duesund","updated_at":"2026-04-24T15:45:16Z","dependencies":[{"issue_id":"forgejo-mcp-broker-xot","depends_on_id":"forgejo-mcp-broker-am1","type":"blocks","created_at":"2026-04-24T17:45:28Z","created_by":"Ole-Morten Duesund","metadata":"{}"}],"dependency_count":1,"dependent_count":0,"comment_count":0} -{"id":"forgejo-mcp-broker-31t","title":"Phase 3b: supervisor stress tests (FD/goroutine/zombie leak detection)","description":"1000 spawn/stop cycles under -race. Verify no FD leak, no goroutine leak (go.uber.org/goleak), no zombies (wait4 returns ECHILD when idle).","acceptance_criteria":"Cycle test passes under -race. FD count stable within a small constant. goleak detects no extra goroutines after test.","status":"open","priority":1,"issue_type":"task","owner":"olemd@glemt.net","created_at":"2026-04-24T15:45:15Z","created_by":"Ole-Morten Duesund","updated_at":"2026-04-24T15:45:15Z","dependencies":[{"issue_id":"forgejo-mcp-broker-31t","depends_on_id":"forgejo-mcp-broker-zuq","type":"blocks","created_at":"2026-04-24T17:45:26Z","created_by":"Ole-Morten Duesund","metadata":"{}"}],"dependency_count":1,"dependent_count":0,"comment_count":0} -{"id":"forgejo-mcp-broker-am1","title":"Phase 4a: internal/bridge JSON-RPC pipe + SSE writer","description":"Given a supervisor.Child: inbound HTTP JSON -\u003e newline-framed stdin; stdout lines -\u003e SSE frames. Handle client disconnect without killing the child.","acceptance_criteria":"Unit tests with mock Child that echoes: request/response round trip, multiple concurrent requests with correct id routing, client disconnect mid-stream.","status":"in_progress","priority":1,"issue_type":"task","assignee":"Ole-Morten Duesund","owner":"olemd@glemt.net","created_at":"2026-04-24T15:45:15Z","created_by":"Ole-Morten Duesund","updated_at":"2026-04-27T11:56:15Z","started_at":"2026-04-27T11:56:15Z","dependencies":[{"issue_id":"forgejo-mcp-broker-am1","depends_on_id":"forgejo-mcp-broker-zuq","type":"blocks","created_at":"2026-04-24T17:45:27Z","created_by":"Ole-Morten Duesund","metadata":"{}"}],"dependency_count":1,"dependent_count":2,"comment_count":0} +{"id":"forgejo-mcp-broker-31t","title":"Phase 3b: supervisor stress tests (FD/goroutine/zombie leak detection)","description":"1000 spawn/stop cycles under -race. Verify no FD leak, no goroutine leak (go.uber.org/goleak), no zombies (wait4 returns ECHILD when idle).","acceptance_criteria":"Cycle test passes under -race. FD count stable within a small constant. goleak detects no extra goroutines after test.","status":"in_progress","priority":1,"issue_type":"task","assignee":"Ole-Morten Duesund","owner":"olemd@glemt.net","created_at":"2026-04-24T15:45:15Z","created_by":"Ole-Morten Duesund","updated_at":"2026-04-27T12:00:32Z","started_at":"2026-04-27T12:00:32Z","dependencies":[{"issue_id":"forgejo-mcp-broker-31t","depends_on_id":"forgejo-mcp-broker-zuq","type":"blocks","created_at":"2026-04-24T17:45:26Z","created_by":"Ole-Morten Duesund","metadata":"{}"}],"dependency_count":1,"dependent_count":0,"comment_count":0} +{"id":"forgejo-mcp-broker-am1","title":"Phase 4a: internal/bridge JSON-RPC pipe + SSE writer","description":"Given a supervisor.Child: inbound HTTP JSON -\u003e newline-framed stdin; stdout lines -\u003e SSE frames. Handle client disconnect without killing the child.","acceptance_criteria":"Unit tests with mock Child that echoes: request/response round trip, multiple concurrent requests with correct id routing, client disconnect mid-stream.","status":"closed","priority":1,"issue_type":"task","assignee":"Ole-Morten Duesund","owner":"olemd@glemt.net","created_at":"2026-04-24T15:45:15Z","created_by":"Ole-Morten Duesund","updated_at":"2026-04-27T11:59:35Z","started_at":"2026-04-27T11:56:15Z","closed_at":"2026-04-27T11:59:35Z","close_reason":"Bridge shipped: per-id routing, SSE responses for request/reply messages, 204 for notifications, structured 4xx/5xx for malformed input. Decoupled from supervisor (takes pipes directly) for clean testing via io.Pipe. 90.0% coverage.","dependencies":[{"issue_id":"forgejo-mcp-broker-am1","depends_on_id":"forgejo-mcp-broker-zuq","type":"blocks","created_at":"2026-04-24T17:45:27Z","created_by":"Ole-Morten Duesund","metadata":"{}"}],"dependency_count":1,"dependent_count":2,"comment_count":0} {"id":"forgejo-mcp-broker-wgo","title":"Phase 2e: OAuth security review + attack-path tests","description":"Phase 2 exit gate. Review every handler for classic OAuth vulns (open redirect, code replay, mix-up, token leak in logs, host spoofing). Add at least one test per attack class. Update design.md §8 with findings.","acceptance_criteria":"Review checklist documented. Tests added for: PKCE mismatch, stale code, token absent from log attributes, bad redirect_uri, mismatched state, replay of used code.","status":"open","priority":1,"issue_type":"task","owner":"olemd@glemt.net","created_at":"2026-04-24T15:45:14Z","created_by":"Ole-Morten Duesund","updated_at":"2026-04-24T15:45:14Z","dependencies":[{"issue_id":"forgejo-mcp-broker-wgo","depends_on_id":"forgejo-mcp-broker-b2o","type":"blocks","created_at":"2026-04-24T17:45:26Z","created_by":"Ole-Morten Duesund","metadata":"{}"},{"issue_id":"forgejo-mcp-broker-wgo","depends_on_id":"forgejo-mcp-broker-pur","type":"blocks","created_at":"2026-04-24T17:45:25Z","created_by":"Ole-Morten Duesund","metadata":"{}"}],"dependency_count":2,"dependent_count":0,"comment_count":0} {"id":"forgejo-mcp-broker-zuq","title":"Phase 3a: internal/supervisor managed stdio subprocess","description":"Child type: Start, Stop(ctx) with SIGTERM -\u003e grace -\u003e SIGKILL, Wait+reap goroutine (no zombies), stderr drainer with prefix. Protocol-agnostic.","acceptance_criteria":"Unit tests against an echo-loop helper: round trip, graceful stop, kill-after-grace, child-exits-on-own detection, stderr capture. Manual spawn of real forgejo-mcp --transport stdio works.","status":"closed","priority":1,"issue_type":"task","assignee":"Ole-Morten Duesund","owner":"olemd@glemt.net","created_at":"2026-04-24T15:45:14Z","created_by":"Ole-Morten Duesund","updated_at":"2026-04-27T11:41:07Z","started_at":"2026-04-27T11:32:54Z","closed_at":"2026-04-27T11:41:07Z","close_reason":"internal/supervisor shipped: Start/Stop/Done/ExitErr/Pid, SIGTERM-\u003egrace-\u003eSIGKILL escalation, mandatory wait-and-reap. Test uses TestMain helper-process pattern; coverage 89.6% on the testable surface.","dependency_count":0,"dependent_count":3,"comment_count":0} {"id":"forgejo-mcp-broker-b2o","title":"Phase 2d: OAuth discovery endpoints (/.well-known/*)","description":"GET /.well-known/oauth-protected-resource and /.well-known/oauth-authorization-server. Issuer URLs MUST derive from cfg.PublicURL, never inbound headers (host-header attack defense per design.md §8).","acceptance_criteria":"Responses validate against RFC 8414/9728 shapes. Issuer URL sourced from config only. supported_scopes matches cfg.ForgejoOAuthScopes.","status":"open","priority":1,"issue_type":"task","owner":"olemd@glemt.net","created_at":"2026-04-24T15:45:13Z","created_by":"Ole-Morten Duesund","updated_at":"2026-04-24T15:45:13Z","dependencies":[{"issue_id":"forgejo-mcp-broker-b2o","depends_on_id":"forgejo-mcp-broker-pur","type":"blocks","created_at":"2026-04-24T17:45:25Z","created_by":"Ole-Morten Duesund","metadata":"{}"}],"dependency_count":1,"dependent_count":1,"comment_count":0} diff --git a/internal/supervisor/stress_test.go b/internal/supervisor/stress_test.go new file mode 100644 index 0000000..82e6802 --- /dev/null +++ b/internal/supervisor/stress_test.go @@ -0,0 +1,160 @@ +package supervisor_test + +import ( + "os" + "runtime" + "syscall" + "testing" + "time" + + "kode.naiv.no/olemd/forgejo-mcp-broker/internal/supervisor" +) + +// TestStress_NoLeaksAcross1000Cycles spawns and reaps a thousand children +// in sequence and asserts that the parent process leaks neither file +// descriptors, goroutines, nor zombie children. +// +// Each piece is checked separately so a failure points clearly at the +// culprit: +// - FD count via /proc/self/fd (Linux-only; the test skips elsewhere). +// - goroutine count via runtime.NumGoroutine() with a small fudge for +// runtime-internal goroutines that wax and wane. +// - zombies via syscall.Wait4(-1, ...) returning ECHILD when no +// waitable children remain. +// +// Bypass with -short for the cheap unit tests during inner-loop development. +func TestStress_NoLeaksAcross1000Cycles(t *testing.T) { + if testing.Short() { + t.Skip("stress test (~5s); rerun without -short") + } + if _, err := os.Stat("/proc/self/fd"); err != nil { + t.Skipf("FD probe requires /proc/self/fd (Linux): %v", err) + } + + // Use /bin/true rather than the test-binary helper-process: spawning + // 1000 copies of the test binary itself drowns the test in re-entrant + // startup cost and (on some kernels) fd-inheritance edge cases. /bin/true + // is the canonical "exits immediately, does no IO" binary. + if _, err := os.Stat("/bin/true"); err != nil { + t.Skipf("/bin/true required: %v", err) + } + cmd := []string{"/bin/true"} + var env []string + + // Warm-up: GC, allow runtime goroutines to settle, take baselines. + runtime.GC() + time.Sleep(50 * time.Millisecond) + fdsBefore := countOpenFDs(t) + gosBefore := runtime.NumGoroutine() + + const cycles = 1000 + start := time.Now() + for i := 0; i < cycles; i++ { + c, err := supervisor.Start(t.Context(), supervisor.Config{Cmd: cmd, Env: env}) + if err != nil { + t.Fatalf("Start cycle %d: %v", i, err) + } + select { + case <-c.Done(): + case <-time.After(2 * time.Second): + t.Fatalf("cycle %d: child did not exit within 2s", i) + } + } + elapsed := time.Since(start) + t.Logf("%d cycles in %s (%.1f spawn/s)", cycles, elapsed, float64(cycles)/elapsed.Seconds()) + + // Let goroutines wind down before sampling. + runtime.GC() + time.Sleep(100 * time.Millisecond) + + fdsAfter := countOpenFDs(t) + gosAfter := runtime.NumGoroutine() + + // Allow a small slack — Go's runtime can spawn or shed background + // goroutines/FDs unrelated to our test. A real leak shows up as + // hundreds, not single digits. + const slack = 5 + if delta := fdsAfter - fdsBefore; delta > slack { + t.Errorf("FD leak: %d before → %d after (Δ=%d, slack=%d)", + fdsBefore, fdsAfter, delta, slack) + } + if delta := gosAfter - gosBefore; delta > slack { + t.Errorf("goroutine leak: %d before → %d after (Δ=%d, slack=%d)", + gosBefore, gosAfter, delta, slack) + } + + // Zombie check. With everything reaped, Wait4(-1) should return ECHILD. + var status syscall.WaitStatus + pid, err := syscall.Wait4(-1, &status, syscall.WNOHANG, nil) + switch { + case pid > 0: + t.Errorf("zombie process leaked: pid=%d status=%v", pid, status) + case err == nil: + // Wait4 succeeded with pid=0 (which can also indicate "child + // exists but not yet exited" with WNOHANG); not strictly a + // zombie. Note it but don't fail. + t.Logf("Wait4 returned pid=0 (no waitable child; ok)") + case err == syscall.ECHILD: + // Expected: no children to wait for. + default: + t.Errorf("unexpected Wait4 error: %v", err) + } +} + +// TestStress_StopMidLifecycle runs 200 cycles that exercise the Stop path +// (rather than letting the child exit on its own), to catch leaks specific +// to the SIGTERM/SIGKILL path. +func TestStress_StopMidLifecycle(t *testing.T) { + if testing.Short() { + t.Skip("stress test (~3s); rerun without -short") + } + if _, err := os.Stat("/proc/self/fd"); err != nil { + t.Skipf("FD probe requires /proc/self/fd (Linux): %v", err) + } + + // /bin/cat acts as an echo-to-EOF helper: it reads stdin and writes + // stdout until stdin closes. Lighter than re-execing the test binary. + if _, err := os.Stat("/bin/cat"); err != nil { + t.Skipf("/bin/cat required: %v", err) + } + cmd := []string{"/bin/cat"} + var env []string + + runtime.GC() + time.Sleep(50 * time.Millisecond) + fdsBefore := countOpenFDs(t) + gosBefore := runtime.NumGoroutine() + + const cycles = 200 + for i := 0; i < cycles; i++ { + c, err := supervisor.Start(t.Context(), supervisor.Config{Cmd: cmd, Env: env}) + if err != nil { + t.Fatalf("Start cycle %d: %v", i, err) + } + // Echo helper exits cleanly when its stdin is closed; Stop drives + // that via stdin.Close + SIGTERM. + if err := c.Stop(t.Context()); err != nil { + // Some helpers may report a SIGTERM exit; not a failure. + t.Logf("cycle %d Stop: %v", i, err) + } + } + + runtime.GC() + time.Sleep(100 * time.Millisecond) + + if delta := countOpenFDs(t) - fdsBefore; delta > 5 { + t.Errorf("FD leak across %d Stop cycles: Δ=%d", cycles, delta) + } + if delta := runtime.NumGoroutine() - gosBefore; delta > 5 { + t.Errorf("goroutine leak across %d Stop cycles: Δ=%d", cycles, delta) + } +} + +func countOpenFDs(t *testing.T) int { + t.Helper() + entries, err := os.ReadDir("/proc/self/fd") + if err != nil { + t.Fatalf("/proc/self/fd: %v", err) + } + return len(entries) +} diff --git a/internal/supervisor/supervisor.go b/internal/supervisor/supervisor.go index 02167b9..5e0f40f 100644 --- a/internal/supervisor/supervisor.go +++ b/internal/supervisor/supervisor.go @@ -92,6 +92,13 @@ func Start(ctx context.Context, cfg Config) (*Child, error) { _ = stdin.Close() return nil, fmt.Errorf("supervisor: start %q: %w", cfg.Cmd[0], err) } + // Close-handles we may need to clean up explicitly post-Wait. Some + // kernels don't deliver EOF to drainStderr promptly under load when + // using cmd.StderrPipe; an explicit Close after Wait ensures + // drainStderr exits and FDs aren't leaked across high-frequency + // spawn/reap cycles. + stdoutCloser, _ := stdout.(io.Closer) + stderrCloser, _ := stderr.(io.Closer) c := &Child{ Stdin: stdin, @@ -114,9 +121,18 @@ func Start(ctx context.Context, cfg Config) (*Child, error) { go drainStderr(stderr, onStderr) // Reap the child. cmd.Wait must be called exactly once; do it here so - // nobody else has to remember. + // nobody else has to remember. After Wait, defensively close the + // stdout/stderr handles so drainStderr definitely exits and our + // reference count drops to zero — important under stress loads where + // the kernel doesn't always deliver EOF promptly. go func() { err := cmd.Wait() + if stdoutCloser != nil { + _ = stdoutCloser.Close() + } + if stderrCloser != nil { + _ = stderrCloser.Close() + } c.mu.Lock() c.exitErr = err c.mu.Unlock()