forgejo-mcp-broker/internal/supervisor/supervisor_test.go
Ole-Morten Duesund 7be7f5e199 feat(supervisor): managed stdio subprocess (forgejo-mcp-broker-zuq)
Adds internal/supervisor: a thin wrapper around os/exec that handles the
zombie/leak/escalation concerns once, so phase-4 (bridge) and phase-5
(session glue) don't each have to re-derive them.

Lifecycle (Stop):
  1. Close stdin — well-behaved stdio servers exit on EOF
  2. Send SIGTERM
  3. Wait up to StopGrace (default 5s) for exit
  4. SIGKILL if still alive

Reaping is mandatory: a goroutine calls cmd.Wait so the kernel actually
collects the child. Without it you accumulate zombies under N concurrent
sessions. Tests exercise this via the helper-process pattern (TestMain
re-execs the test binary in helper mode) — no shell or external binary
dependency.

Tests cover: empty Cmd validation, missing-binary error, echo round
trip via stdin/stdout, stderr drainer collecting lines, SIGTERM-friendly
graceful stop, SIGTERM-ignoring child escalating to SIGKILL (with a
ready-on-stdout sync barrier so the test isn't racing the helper's
signal.Notify), idempotent Stop, clean exit detection, non-zero exit
detection, env override propagation. 89.6% coverage; remaining gap is
unreachable-from-public-API defensive branches (pipe-creation failures
under FD exhaustion, post-release Pid).

Manual smoke test against a real `forgejo-mcp --transport stdio` is
deferred to phase 4b's integration test (where it adds the most value).

Closes forgejo-mcp-broker-zuq.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-27 13:41:00 +02:00

323 lines
8.2 KiB
Go

package supervisor_test
import (
"bufio"
"errors"
"fmt"
"io"
"os"
"os/exec"
"os/signal"
"strconv"
"strings"
"sync"
"syscall"
"testing"
"time"
"kode.naiv.no/olemd/forgejo-mcp-broker/internal/supervisor"
)
// TestMain doubles as the entry point for helper children (the
// helper-process pattern). If FJMCP_SUPERVISOR_HELPER is set in the
// environment, the binary behaves as the helper named by that variable
// instead of running the test suite, so no separate helper binary or shell
// is required.
func TestMain(m *testing.M) {
	helperMode := os.Getenv("FJMCP_SUPERVISOR_HELPER")
	if helperMode == "" {
		// Ordinary invocation: run the tests and propagate their status.
		os.Exit(m.Run())
	}
	runHelper(helperMode)
}
// runHelper is the child-side half of the helper-process pattern: it acts out
// the behaviour selected by mode and then terminates the process with
// os.Exit, so it never returns to the caller.
//
// Supported modes:
//   - "echo": copy stdin lines to stdout until EOF, then exit 0.
//   - "stderr_at_startup": write FJMCP_HELPER_N lines (default 3) to stderr,
//     then behave like "echo".
//   - "ignore_term": swallow SIGTERM, print "ready" on stdout, sleep 60s.
//   - "exit_zero": exit immediately with status 0.
//   - "exit_nonzero": exit immediately with status 7.
//
// Any other mode prints a diagnostic to stderr and exits with status 2.
func runHelper(mode string) {
	switch mode {
	case "echo":
		helperEchoLoop()
		os.Exit(0)
	case "stderr_at_startup":
		// The Atoi error is ignored on purpose: an unset or unparsable
		// FJMCP_HELPER_N yields 0, which selects the default below.
		n, _ := strconv.Atoi(os.Getenv("FJMCP_HELPER_N"))
		if n == 0 {
			n = 3
		}
		for i := 1; i <= n; i++ {
			fmt.Fprintf(os.Stderr, "stderr line %d\n", i)
		}
		helperEchoLoop()
		os.Exit(0)
	case "ignore_term":
		// Install a SIGTERM handler that swallows the signal, announce
		// readiness on stdout (so tests have a sync barrier — the parent
		// must not send SIGTERM before the handler is in place), then
		// sleep until SIGKILL.
		sig := make(chan os.Signal, 1)
		signal.Notify(sig, syscall.SIGTERM)
		go func() {
			for range sig { /* ignore */
			}
		}()
		fmt.Println("ready")
		time.Sleep(60 * time.Second)
		os.Exit(0)
	case "exit_zero":
		os.Exit(0)
	case "exit_nonzero":
		os.Exit(7)
	default:
		fmt.Fprintf(os.Stderr, "unknown helper mode %q\n", mode)
		os.Exit(2)
	}
}

// helperEchoLoop echoes each stdin line back to stdout until stdin reaches
// EOF. A scanner failure (read error or over-long line) is reported on stderr
// and ends the process with status 1 instead of being mistaken for a clean
// EOF.
func helperEchoLoop() {
	s := bufio.NewScanner(os.Stdin)
	for s.Scan() {
		fmt.Println(s.Text())
	}
	if err := s.Err(); err != nil {
		fmt.Fprintf(os.Stderr, "helper: reading stdin: %v\n", err)
		os.Exit(1)
	}
}
// helperCmd builds the argv and environment entries needed to re-exec the
// running test binary as a helper in the given mode. The "-test.run=^$" flag
// matches no test, so the child only reaches runHelper. The mode variable is
// always the first environment entry, followed by extraEnv in order.
func helperCmd(mode string, extraEnv ...string) ([]string, []string) {
	env := make([]string, 0, 1+len(extraEnv))
	env = append(env, "FJMCP_SUPERVISOR_HELPER="+mode)
	env = append(env, extraEnv...)
	return []string{os.Args[0], "-test.run=^$"}, env
}
// TestStart_RequiresCmd checks that Start rejects a zero-value Config with an
// error whose text mentions that Cmd is required.
func TestStart_RequiresCmd(t *testing.T) {
	_, err := supervisor.Start(t.Context(), supervisor.Config{})
	if err != nil && strings.Contains(err.Error(), "Cmd is required") {
		return
	}
	t.Errorf("want Cmd-required error, got %v", err)
}
// TestStart_BadBinary checks that Start reports an error when Cmd names a
// binary that does not exist.
func TestStart_BadBinary(t *testing.T) {
	cfg := supervisor.Config{Cmd: []string{"/this/path/does/not/exist"}}
	if _, err := supervisor.Start(t.Context(), cfg); err == nil {
		t.Fatal("expected error for missing binary")
	}
}
// TestEcho_RoundTrip writes one line to the echo helper's stdin, expects the
// same line back on stdout, and checks that Pid is non-zero while the child
// is still running.
func TestEcho_RoundTrip(t *testing.T) {
	args, environ := helperCmd("echo")
	child, err := supervisor.Start(t.Context(), supervisor.Config{Cmd: args, Env: environ})
	if err != nil {
		t.Fatalf("Start: %v", err)
	}
	defer child.Stop(t.Context())

	_, err = io.WriteString(child.Stdin, "hello-world\n")
	if err != nil {
		t.Fatalf("write: %v", err)
	}

	reply, err := child.Stdout.ReadString('\n')
	if err != nil {
		t.Fatalf("read: %v", err)
	}
	got := strings.TrimRight(reply, "\n")
	if got != "hello-world" {
		t.Errorf("read %q, want hello-world", got)
	}

	if pid := child.Pid(); pid == 0 {
		t.Error("Pid should be non-zero while child is running")
	}
}
// TestStderr_LinesDelivered checks that every stderr line written by the
// child reaches the OnStderr callback, in order. The helper is asked for four
// lines via FJMCP_HELPER_N.
func TestStderr_LinesDelivered(t *testing.T) {
	var (
		mu    sync.Mutex
		lines []string
	)
	// The callback appends under mu; the assertions below read under the
	// same lock.
	record := func(line string) {
		mu.Lock()
		defer mu.Unlock()
		lines = append(lines, line)
	}

	args, environ := helperCmd("stderr_at_startup", "FJMCP_HELPER_N=4")
	child, err := supervisor.Start(t.Context(), supervisor.Config{
		Cmd:      args,
		Env:      environ,
		OnStderr: record,
	})
	if err != nil {
		t.Fatalf("Start: %v", err)
	}
	defer child.Stop(t.Context())

	// The helper emits its stderr lines at startup and then echoes stdin;
	// closing stdin ends the echo loop so the child exits.
	_ = child.Stdin.Close()
	deadline := time.After(3 * time.Second)
	select {
	case <-deadline:
		t.Fatal("child did not exit after stdin close")
	case <-child.Done():
	}

	mu.Lock()
	defer mu.Unlock()
	if got := len(lines); got != 4 {
		t.Fatalf("collected %d stderr lines, want 4: %v", got, lines)
	}
	expected := [...]string{"stderr line 1", "stderr line 2", "stderr line 3", "stderr line 4"}
	for i, got := range lines {
		if got != expected[i] {
			t.Errorf("stderr[%d] = %q, want %q", i, got, expected[i])
		}
	}
}
// TestStop_GracefulOnSIGTERM checks that stopping a cooperative child (the
// echo helper) completes within two seconds and leaves Done closed.
func TestStop_GracefulOnSIGTERM(t *testing.T) {
	args, environ := helperCmd("echo")
	child, err := supervisor.Start(t.Context(), supervisor.Config{Cmd: args, Env: environ})
	if err != nil {
		t.Fatalf("Start: %v", err)
	}

	began := time.Now()
	stopErr := child.Stop(t.Context())
	if stopErr != nil {
		// The echo helper exits cleanly on stdin close, so a nil error is
		// the usual outcome. Some platforms report SIGTERM as an error
		// when the signal beats the stdin EOF — both results are accepted.
		t.Logf("Stop returned: %v (acceptable)", stopErr)
	}
	if took := time.Since(began); took > 2*time.Second {
		t.Errorf("Stop took %s, want <2s for a SIGTERM-friendly child", took)
	}

	// Stop has returned, so Done must already be closed.
	select {
	case <-child.Done():
	default:
		t.Error("Done should be closed after Stop returns")
	}
}
// TestStop_EscalatesToSIGKILL checks that Stop falls back to SIGKILL for a
// child that ignores SIGTERM, and that the escalation happens after the
// configured StopGrace rather than before it or long after it.
func TestStop_EscalatesToSIGKILL(t *testing.T) {
	cmd, env := helperCmd("ignore_term")
	c, err := supervisor.Start(t.Context(), supervisor.Config{
		Cmd:       cmd,
		Env:       env,
		StopGrace: 200 * time.Millisecond,
	})
	if err != nil {
		t.Fatalf("Start: %v", err)
	}

	// Wait for the helper to confirm its SIGTERM handler is installed.
	// Without this, SIGTERM races signal.Notify and kills the process
	// outright — the test would then mis-conclude that escalation worked
	// when actually graceful exit happened.
	line, err := c.Stdout.ReadString('\n')
	if err != nil || strings.TrimSpace(line) != "ready" {
		// Stop the helper before failing: t.Fatalf ends the test here, and
		// the helper would otherwise be left in its 60s sleep. The Stop
		// result is irrelevant on this path.
		_ = c.Stop(t.Context())
		t.Fatalf("helper readiness sync failed: line=%q err=%v", line, err)
	}

	start := time.Now()
	err = c.Stop(t.Context())
	elapsed := time.Since(start)

	// SIGKILL'd processes report a non-nil exit error.
	if err == nil {
		t.Error("expected non-nil exit error after SIGKILL escalation")
	}
	// Stop should return a tick or two after the grace period — not, say, 60s.
	if elapsed > 2*time.Second {
		t.Errorf("Stop took %s, want fast escalation past grace", elapsed)
	}
	// Returning well inside the 200ms grace means SIGKILL was sent too early.
	if elapsed < 150*time.Millisecond {
		t.Errorf("Stop took only %s, escalated before grace?", elapsed)
	}
}
// TestStop_IsIdempotent checks that calling Stop a second time on an already
// stopped child neither panics nor blocks for more than a second.
func TestStop_IsIdempotent(t *testing.T) {
	args, environ := helperCmd("echo")
	child, err := supervisor.Start(t.Context(), supervisor.Config{Cmd: args, Env: environ})
	if err != nil {
		t.Fatalf("Start: %v", err)
	}

	if firstErr := child.Stop(t.Context()); firstErr != nil {
		t.Logf("first Stop: %v", firstErr)
	}

	// Run the second Stop in its own goroutine so a hang shows up as a
	// timeout instead of blocking the test.
	finished := make(chan struct{})
	go func() {
		defer close(finished)
		_ = child.Stop(t.Context())
	}()

	select {
	case <-time.After(time.Second):
		t.Error("second Stop hung")
	case <-finished:
	}
}
// TestDone_ChildExitsCleanly checks that Done closes once a child exits with
// status 0 on its own, and that ExitErr is nil afterwards.
func TestDone_ChildExitsCleanly(t *testing.T) {
	args, environ := helperCmd("exit_zero")
	child, err := supervisor.Start(t.Context(), supervisor.Config{Cmd: args, Env: environ})
	if err != nil {
		t.Fatalf("Start: %v", err)
	}

	deadline := time.After(3 * time.Second)
	select {
	case <-deadline:
		t.Fatal("Done did not close")
	case <-child.Done():
	}

	exitErr := child.ExitErr()
	if exitErr != nil {
		t.Errorf("ExitErr = %v, want nil for clean exit", exitErr)
	}
}
// TestDone_ChildExitsBadly checks that a child exiting with status 7 closes
// Done and surfaces an *exec.ExitError carrying that exit code via ExitErr.
func TestDone_ChildExitsBadly(t *testing.T) {
	args, environ := helperCmd("exit_nonzero")
	child, err := supervisor.Start(t.Context(), supervisor.Config{Cmd: args, Env: environ})
	if err != nil {
		t.Fatalf("Start: %v", err)
	}

	deadline := time.After(3 * time.Second)
	select {
	case <-deadline:
		t.Fatal("Done did not close")
	case <-child.Done():
	}

	err = child.ExitErr()
	if err == nil {
		t.Fatal("ExitErr = nil, want exit error for non-zero exit")
	}

	var exitErr *exec.ExitError
	switch {
	case !errors.As(err, &exitErr):
		t.Errorf("ExitErr = %v, want *exec.ExitError", err)
	case exitErr.ExitCode() != 7:
		t.Errorf("ExitCode = %d, want 7", exitErr.ExitCode())
	}
}
// TestStart_AppliesEnvOverrides verifies that cfg.Env actually reaches the
// child: the stderr_at_startup helper reads FJMCP_HELPER_N, so setting it to
// 2 must produce exactly two stderr lines.
func TestStart_AppliesEnvOverrides(t *testing.T) {
	cmd, env := helperCmd("stderr_at_startup", "FJMCP_HELPER_N=2")

	var (
		mu        sync.Mutex
		collected []string
	)
	c, err := supervisor.Start(t.Context(), supervisor.Config{
		Cmd: cmd,
		Env: env,
		OnStderr: func(line string) {
			mu.Lock()
			collected = append(collected, line)
			mu.Unlock()
		},
	})
	if err != nil {
		t.Fatalf("Start: %v", err)
	}
	// Make sure the child is stopped even if an assertion below fails.
	defer c.Stop(t.Context())

	// Closing stdin ends the helper's echo loop so it exits; the close
	// result is irrelevant because the bounded wait below detects a child
	// that stays alive.
	_ = c.Stdin.Close()
	select {
	case <-c.Done():
	case <-time.After(3 * time.Second):
		// Bound the wait so a stuck child fails this test instead of
		// hanging the whole test binary.
		t.Fatal("child did not exit after stdin close")
	}

	mu.Lock()
	defer mu.Unlock()
	if len(collected) != 2 {
		t.Errorf("got %d stderr lines, want 2 (env should set N=2): %v", len(collected), collected)
	}
}