// forgejo-mcp-broker/internal/supervisor/supervisor_test.go

package supervisor_test

import (
	"bufio"
	"errors"
	"fmt"
	"io"
	"os"
	"os/exec"
	"os/signal"
	"strconv"
	"strings"
	"sync"
	"syscall"
	"testing"
	"time"

	"kode.naiv.no/olemd/forgejo-mcp-broker/internal/supervisor"
)

// TestMain doubles as the entry point for helper child processes. When the
// FJMCP_SUPERVISOR_HELPER environment variable is non-empty, the test binary
// behaves as the requested helper (see runHelper) and runs no tests;
// otherwise the regular test suite executes. Re-using the test binary this
// way means no separate helper executable or shell is needed.
func TestMain(m *testing.M) {
	mode := os.Getenv("FJMCP_SUPERVISOR_HELPER")
	if mode == "" {
		// Normal invocation: run the suite and propagate its status.
		os.Exit(m.Run())
	}
	runHelper(mode)
}

// runHelper executes the helper behaviour selected by mode and then
// terminates the process; every branch ends in os.Exit.
//
// Modes:
//   - "echo": copy each stdin line to stdout until EOF, then exit 0.
//   - "stderr_at_startup": write FJMCP_HELPER_N lines (default 3) to stderr,
//     then behave like "echo".
//   - "ignore_term": swallow SIGTERM, print "ready" on stdout, sleep 60s,
//     then exit 0.
//   - "exit_zero": exit 0 immediately.
//   - "exit_nonzero": exit 7 immediately.
//
// Any other mode prints a diagnostic on stderr and exits with status 2.
func runHelper(mode string) {
	// echoStdin copies stdin to stdout line by line until EOF and exits 0.
	// A read failure (as opposed to a clean EOF) is reported on stderr and
	// turned into exit status 1 so it cannot masquerade as a clean exit.
	echoStdin := func() {
		s := bufio.NewScanner(os.Stdin)
		for s.Scan() {
			fmt.Println(s.Text())
		}
		if err := s.Err(); err != nil {
			fmt.Fprintf(os.Stderr, "reading stdin: %v\n", err)
			os.Exit(1)
		}
		os.Exit(0)
	}

	switch mode {
	case "echo":
		echoStdin()

	case "stderr_at_startup":
		// Print N lines to stderr at startup, then echo loop. An unset,
		// unparsable, or zero FJMCP_HELPER_N falls back to the default.
		n, err := strconv.Atoi(os.Getenv("FJMCP_HELPER_N"))
		if err != nil || n == 0 {
			n = 3
		}
		for i := 1; i <= n; i++ {
			fmt.Fprintf(os.Stderr, "stderr line %d\n", i)
		}
		echoStdin()

	case "ignore_term":
		// Install SIGTERM handler that swallows the signal, announce
		// readiness on stdout (so tests have a sync barrier — the parent
		// must not send SIGTERM before the handler is in place), then
		// sleep until SIGKILL.
		sig := make(chan os.Signal, 1)
		signal.Notify(sig, syscall.SIGTERM)
		go func() {
			for range sig {
				// Discard: ignoring SIGTERM is the point of this mode.
			}
		}()
		fmt.Println("ready")
		time.Sleep(60 * time.Second)
		os.Exit(0)

	case "exit_zero":
		os.Exit(0)

	case "exit_nonzero":
		os.Exit(7)

	default:
		fmt.Fprintf(os.Stderr, "unknown helper mode %q\n", mode)
		os.Exit(2)
	}
}

// helperCmd builds the argv and environment entries needed to re-execute
// the current test binary as a helper running in the given mode. The
// returned argv passes -test.run=^$ so that no tests are selected; the
// returned env starts with FJMCP_SUPERVISOR_HELPER=<mode> followed by
// extraEnv in the order given.
func helperCmd(mode string, extraEnv ...string) ([]string, []string) {
	env := make([]string, 0, 1+len(extraEnv))
	env = append(env, "FJMCP_SUPERVISOR_HELPER="+mode)
	env = append(env, extraEnv...)

	argv := []string{os.Args[0], "-test.run=^$"}
	return argv, env
}

// TestStart_RequiresCmd checks that Start rejects a zero-value Config with
// an error whose message mentions that Cmd is required.
func TestStart_RequiresCmd(t *testing.T) {
	var cfg supervisor.Config
	_, err := supervisor.Start(t.Context(), cfg)
	if err != nil && strings.Contains(err.Error(), "Cmd is required") {
		return
	}
	t.Errorf("want Cmd-required error, got %v", err)
}

// TestStart_BadBinary checks that Start returns an error when Cmd names an
// executable path that does not exist.
func TestStart_BadBinary(t *testing.T) {
	cfg := supervisor.Config{Cmd: []string{"/this/path/does/not/exist"}}
	if _, err := supervisor.Start(t.Context(), cfg); err == nil {
		t.Fatal("expected error for missing binary")
	}
}

// TestEcho_RoundTrip starts the "echo" helper, writes a single line to its
// stdin and expects the same line back on stdout. It also checks that Pid
// is non-zero while the child is still running.
func TestEcho_RoundTrip(t *testing.T) {
	const msg = "hello-world"

	argv, childEnv := helperCmd("echo")
	child, err := supervisor.Start(t.Context(), supervisor.Config{Cmd: argv, Env: childEnv})
	if err != nil {
		t.Fatalf("Start: %v", err)
	}
	defer child.Stop(t.Context())

	_, err = io.WriteString(child.Stdin, msg+"\n")
	if err != nil {
		t.Fatalf("write: %v", err)
	}

	reply, err := child.Stdout.ReadString('\n')
	if err != nil {
		t.Fatalf("read: %v", err)
	}
	got := strings.TrimRight(reply, "\n")
	if got != msg {
		t.Errorf("read %q, want hello-world", got)
	}

	if child.Pid() == 0 {
		t.Error("Pid should be non-zero while child is running")
	}
}

// TestStderr_LinesDelivered runs the "stderr_at_startup" helper with
// FJMCP_HELPER_N=4 and checks that the OnStderr callback receives exactly
// the four expected lines, in order, by the time the child has exited.
func TestStderr_LinesDelivered(t *testing.T) {
	want := []string{"stderr line 1", "stderr line 2", "stderr line 3", "stderr line 4"}

	var (
		mu  sync.Mutex
		got []string
	)
	// record is invoked by the supervisor for each stderr line; the mutex
	// guards got against concurrent access from this test goroutine.
	record := func(line string) {
		mu.Lock()
		defer mu.Unlock()
		got = append(got, line)
	}

	argv, childEnv := helperCmd("stderr_at_startup", "FJMCP_HELPER_N=4")
	child, err := supervisor.Start(t.Context(), supervisor.Config{
		Cmd:      argv,
		Env:      childEnv,
		OnStderr: record,
	})
	if err != nil {
		t.Fatalf("Start: %v", err)
	}
	defer child.Stop(t.Context())

	// The helper emits its stderr lines first and then echoes stdin, so
	// closing stdin is what lets it exit.
	_ = child.Stdin.Close()
	select {
	case <-time.After(3 * time.Second):
		t.Fatal("child did not exit after stdin close")
	case <-child.Done():
	}

	mu.Lock()
	defer mu.Unlock()
	if len(got) != len(want) {
		t.Fatalf("collected %d stderr lines, want 4: %v", len(got), got)
	}
	for i := range want {
		if got[i] != want[i] {
			t.Errorf("stderr[%d] = %q, want %q", i, got[i], want[i])
		}
	}
}

// TestStop_GracefulOnSIGTERM stops a cooperative "echo" child and checks
// that Stop returns within two seconds and that Done is already closed when
// it does. The error returned by Stop is only logged, never asserted on.
func TestStop_GracefulOnSIGTERM(t *testing.T) {
	argv, childEnv := helperCmd("echo")
	child, err := supervisor.Start(t.Context(), supervisor.Config{Cmd: argv, Env: childEnv})
	if err != nil {
		t.Fatalf("Start: %v", err)
	}

	began := time.Now()
	stopErr := child.Stop(t.Context())
	if stopErr != nil {
		// Echo exits cleanly on stdin close, so a nil error is the usual
		// outcome. Some platforms report SIGTERM as an error when the
		// signal reaches the helper before stdin EOF does; both outcomes
		// are accepted.
		t.Logf("Stop returned: %v (acceptable)", stopErr)
	}
	took := time.Since(began)
	if took > 2*time.Second {
		t.Errorf("Stop took %s, want <2s for a SIGTERM-friendly child", took)
	}

	// Non-blocking probe: Done must already be closed at this point.
	select {
	case <-child.Done():
	default:
		t.Error("Done should be closed after Stop returns")
	}
}

// TestStop_EscalatesToSIGKILL runs the "ignore_term" helper, which swallows
// SIGTERM, with a 200ms StopGrace. It expects Stop to return a non-nil
// error, no sooner than 150ms and no later than 2s after being called.
func TestStop_EscalatesToSIGKILL(t *testing.T) {
	argv, childEnv := helperCmd("ignore_term")
	cfg := supervisor.Config{
		Cmd:       argv,
		Env:       childEnv,
		StopGrace: 200 * time.Millisecond,
	}
	child, err := supervisor.Start(t.Context(), cfg)
	if err != nil {
		t.Fatalf("Start: %v", err)
	}

	// Block until the helper announces that its SIGTERM handler is in
	// place. Sending SIGTERM any earlier would race signal.Notify and kill
	// the process outright, so the test could pass without escalation ever
	// having been exercised.
	line, err := child.Stdout.ReadString('\n')
	if ready := err == nil && strings.TrimSpace(line) == "ready"; !ready {
		t.Fatalf("helper readiness sync failed: line=%q err=%v", line, err)
	}

	began := time.Now()
	stopErr := child.Stop(t.Context())
	took := time.Since(began)

	// A process terminated by SIGKILL reports a non-nil exit error.
	if stopErr == nil {
		t.Error("expected non-nil exit error after SIGKILL escalation")
	}

	// Stop should return shortly after the grace period: not before it,
	// and nowhere near the helper's 60s sleep.
	switch {
	case took > 2*time.Second:
		t.Errorf("Stop took %s, want fast escalation past grace", took)
	case took < 150*time.Millisecond:
		t.Errorf("Stop took only %s, escalated before grace?", took)
	}
}

// TestStop_IsIdempotent calls Stop twice on the same child and requires the
// second call to return within one second. The error from the first call is
// only logged; the error from the second is ignored.
func TestStop_IsIdempotent(t *testing.T) {
	argv, childEnv := helperCmd("echo")
	child, err := supervisor.Start(t.Context(), supervisor.Config{Cmd: argv, Env: childEnv})
	if err != nil {
		t.Fatalf("Start: %v", err)
	}

	if firstErr := child.Stop(t.Context()); firstErr != nil {
		t.Logf("first Stop: %v", firstErr)
	}

	// Run the second Stop on its own goroutine so that a hang shows up as
	// a timeout below rather than blocking the test itself.
	finished := make(chan struct{})
	go func() {
		defer close(finished)
		_ = child.Stop(t.Context())
	}()

	select {
	case <-time.After(time.Second):
		t.Error("second Stop hung")
	case <-finished:
	}
}

// TestDone_ChildExitsCleanly runs the "exit_zero" helper and expects Done to
// close within three seconds with ExitErr reporting nil.
func TestDone_ChildExitsCleanly(t *testing.T) {
	argv, childEnv := helperCmd("exit_zero")
	child, err := supervisor.Start(t.Context(), supervisor.Config{Cmd: argv, Env: childEnv})
	if err != nil {
		t.Fatalf("Start: %v", err)
	}

	select {
	case <-time.After(3 * time.Second):
		t.Fatal("Done did not close")
	case <-child.Done():
	}

	exitErr := child.ExitErr()
	if exitErr != nil {
		t.Errorf("ExitErr = %v, want nil for clean exit", exitErr)
	}
}

// TestDone_ChildExitsBadly runs the "exit_nonzero" helper (which exits with
// status 7) and expects Done to close within three seconds, with ExitErr
// returning an *exec.ExitError that carries exit code 7.
func TestDone_ChildExitsBadly(t *testing.T) {
	argv, childEnv := helperCmd("exit_nonzero")
	child, err := supervisor.Start(t.Context(), supervisor.Config{Cmd: argv, Env: childEnv})
	if err != nil {
		t.Fatalf("Start: %v", err)
	}

	select {
	case <-time.After(3 * time.Second):
		t.Fatal("Done did not close")
	case <-child.Done():
	}

	err = child.ExitErr()
	if err == nil {
		t.Fatal("ExitErr = nil, want exit error for non-zero exit")
	}

	var exitErr *exec.ExitError
	switch {
	case !errors.As(err, &exitErr):
		t.Errorf("ExitErr = %v, want *exec.ExitError", err)
	case exitErr.ExitCode() != 7:
		t.Errorf("ExitCode = %d, want 7", exitErr.ExitCode())
	}
}

// TestStart_AppliesEnvOverrides verifies that cfg.Env actually reaches the
// child. The "stderr_at_startup" helper prints FJMCP_HELPER_N lines to
// stderr (3 when the variable is unset), so passing FJMCP_HELPER_N=2 through
// Env must produce exactly two OnStderr callbacks.
func TestStart_AppliesEnvOverrides(t *testing.T) {
	cmd, env := helperCmd("stderr_at_startup", "FJMCP_HELPER_N=2")

	var (
		mu        sync.Mutex
		collected []string
	)
	c, err := supervisor.Start(t.Context(), supervisor.Config{
		Cmd: cmd,
		Env: env,
		OnStderr: func(line string) {
			mu.Lock()
			collected = append(collected, line)
			mu.Unlock()
		},
	})
	if err != nil {
		t.Fatalf("Start: %v", err)
	}
	// Make sure the child is stopped even if the test aborts below.
	defer c.Stop(t.Context())

	// Closing stdin ends the helper's echo loop so it exits on its own.
	_ = c.Stdin.Close()
	// Bound the wait: a child that never exits should fail this test
	// rather than hang the suite until go test's global timeout.
	select {
	case <-c.Done():
	case <-time.After(3 * time.Second):
		t.Fatal("child did not exit after stdin close")
	}

	mu.Lock()
	defer mu.Unlock()
	if len(collected) != 2 {
		t.Errorf("got %d stderr lines, want 2 (env should set N=2): %v", len(collected), collected)
	}
}