-
Notifications
You must be signed in to change notification settings - Fork 32
fix(lifecycle): drain StreamManager goroutines in tests #1227
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -38,7 +38,18 @@ type Client struct { | |
| // WebSocket connections for streaming | ||
| agentStreamConn *websocket.Conn | ||
| workspaceStreamConn *websocket.Conn | ||
| mu sync.RWMutex | ||
| // workspaceStream is the most-recent workspace stream returned by | ||
| // StreamWorkspace, retained so Client.Close can wait for its read/write | ||
| // goroutines to drain. Cleared by readWorkspaceStream's defer once the | ||
| // stream tears down. | ||
| workspaceStream *WorkspaceStream | ||
| // closed flips to true on Client.Close and prevents new StreamWorkspace | ||
| // dials from leaking goroutines past the close barrier. Agent (updates) | ||
| // stream is not gated on this flag because the cascade flow legitimately | ||
| // stops + restarts the agent stream on the same client; gating it would | ||
| // strand workflow step transitions on a closed client. | ||
| closed bool | ||
| mu sync.RWMutex | ||
|
|
||
| // Shared write mutex for agent stream (used by StreamUpdates and sendStreamRequest) | ||
| streamWriteMu sync.Mutex | ||
|
|
@@ -656,10 +667,32 @@ type ( | |
| ProcessStatusUpdate = types.ProcessStatusUpdate | ||
| ) | ||
|
|
||
| // Close closes all connections and releases resources | ||
| // Close closes all connections and releases resources. It is a drain | ||
| // barrier for workspace stream goroutines: when Close returns, the workspace | ||
| // read/write loops have fully exited and future StreamWorkspace calls return | ||
| // immediately with an error. The agent (updates) stream is closed but not | ||
| // drained synchronously — the cascade flow legitimately calls Close on a | ||
| // client whose updates stream is still mid-event, and blocking would stall | ||
| // workflow step transitions. | ||
| func (c *Client) Close() { | ||
| c.mu.Lock() | ||
| c.closed = true | ||
| ws := c.workspaceStream | ||
| c.mu.Unlock() | ||
|
|
||
| c.CloseUpdatesStream() | ||
| // CloseWorkspaceStream closes the raw conn to wake the blocked read loop. | ||
| // ws.Close (below) is needed to close the writeLoop's closeCh; closeOnce | ||
| // makes ws.Close idempotent so the duplicate conn.Close it issues just | ||
| // logs at Debug. Both calls together wake both goroutines deterministically. | ||
| c.CloseWorkspaceStream() | ||
|
|
||
| // Wait for the workspace stream's read/write goroutines to fully unwind. | ||
| if ws != nil { | ||
| ws.Close() | ||
| ws.Wait() | ||
| } | ||
|
|
||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
This note applies to The Commit 3 intentionally removed the synchronous drain barrier for the agent stream — meaning the old goroutine can outlive
The fix is one extra if-guard in c.mu.Lock()
if c.agentStreamConn == conn {
c.agentStreamConn = nil
}
c.mu.Unlock()Greptile flagged this as P1 in their review. The response indicated it would be fixed but it landed only in |
||
| if c.httpClient != nil { | ||
| c.httpClient.CloseIdleConnections() | ||
| } | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,126 @@ | ||
| package client | ||
|
|
||
| import ( | ||
| "context" | ||
| "fmt" | ||
| "net/http" | ||
| "net/http/httptest" | ||
| "strings" | ||
| "sync" | ||
| "testing" | ||
| "time" | ||
|
|
||
| "github.com/gorilla/websocket" | ||
| "github.com/kandev/kandev/internal/common/logger" | ||
| ) | ||
|
|
||
| // closeBarrierMockServer is a minimal agentctl mock that exposes the two | ||
| // WebSocket endpoints needed for Close/drain regression coverage. Handlers | ||
| // stay open until the client tears down, mirroring the behaviour real | ||
| // agentctl exhibits when the manager hasn't asked it to exit. | ||
| type closeBarrierMockServer struct { | ||
| server *httptest.Server | ||
|
|
||
| mu sync.Mutex | ||
| wsConns []*websocket.Conn | ||
| connected chan struct{} | ||
| once sync.Once | ||
| } | ||
|
|
||
| func newCloseBarrierMockServer(t *testing.T) *closeBarrierMockServer { | ||
| t.Helper() | ||
| m := &closeBarrierMockServer{connected: make(chan struct{})} | ||
| upgrader := websocket.Upgrader{CheckOrigin: func(*http.Request) bool { return true }} | ||
|
|
||
| handler := func(w http.ResponseWriter, r *http.Request) { | ||
| conn, err := upgrader.Upgrade(w, r, nil) | ||
| if err != nil { | ||
| return | ||
| } | ||
| m.mu.Lock() | ||
| m.wsConns = append(m.wsConns, conn) | ||
| m.mu.Unlock() | ||
| m.once.Do(func() { close(m.connected) }) | ||
| // Block until client closes. | ||
| for { | ||
| if _, _, err := conn.ReadMessage(); err != nil { | ||
| _ = conn.Close() | ||
| return | ||
| } | ||
| } | ||
| } | ||
|
|
||
| mux := http.NewServeMux() | ||
| mux.HandleFunc("/api/v1/agent/stream", handler) | ||
| mux.HandleFunc("/api/v1/workspace/stream", handler) | ||
| m.server = httptest.NewServer(mux) | ||
| t.Cleanup(func() { | ||
| m.mu.Lock() | ||
| for _, c := range m.wsConns { | ||
| _ = c.Close() | ||
| } | ||
| m.mu.Unlock() | ||
| m.server.Close() | ||
| }) | ||
| return m | ||
| } | ||
|
|
||
| func newCloseBarrierTestClient(t *testing.T, serverURL string) *Client { | ||
| t.Helper() | ||
| url := strings.TrimPrefix(serverURL, "http://") | ||
| parts := strings.SplitN(url, ":", 2) | ||
| host := parts[0] | ||
| var port int | ||
| _, _ = fmt.Sscanf(parts[1], "%d", &port) | ||
| log, _ := logger.NewLogger(logger.LoggingConfig{Level: "error", Format: "json"}) | ||
| return NewClient(host, port, log) | ||
| } | ||
|
|
||
| // TestClientClose_DrainsWorkspaceStream is the regression test for the | ||
| // CI-only goleak flake around StreamManager and WorkspaceStream goroutines | ||
| // surviving Close. After Close returns, the workspace read/write loops must | ||
| // have fully unwound — otherwise tests with `defer client.Close()` see | ||
| // lingering goroutines and goleak.VerifyTestMain fails. The agent (updates) | ||
| // stream is closed but not drained synchronously: the cascade flow legitimately | ||
| // stops + restarts the updates stream on the same client. | ||
| func TestClientClose_DrainsWorkspaceStream(t *testing.T) { | ||
| mock := newCloseBarrierMockServer(t) | ||
| client := newCloseBarrierTestClient(t, mock.server.URL) | ||
|
|
||
| ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) | ||
| defer cancel() | ||
|
|
||
| ws, err := client.StreamWorkspace(ctx, WorkspaceStreamCallbacks{}) | ||
| if err != nil { | ||
| t.Fatalf("StreamWorkspace failed: %v", err) | ||
| } | ||
| if ws == nil { | ||
| t.Fatal("nil WorkspaceStream") | ||
| } | ||
|
|
||
| select { | ||
| case <-mock.connected: | ||
| case <-time.After(2 * time.Second): | ||
| t.Fatal("mock server never observed a WS connection") | ||
| } | ||
|
|
||
| // Close must return promptly and have drained the workspace stream. A hung | ||
| // goroutine here would block Close forever (or, pre-fix, return early and | ||
| // leave the goroutine running past goleak's check). | ||
| done := make(chan struct{}) | ||
| go func() { | ||
|
jcfs marked this conversation as resolved.
|
||
| client.Close() | ||
| close(done) | ||
| }() | ||
| select { | ||
| case <-done: | ||
| case <-time.After(2 * time.Second): | ||
| t.Fatal("Client.Close did not return within 2s — workspace drain is stuck") | ||
| } | ||
|
|
||
| // Post-Close, StreamWorkspace must error so a racy second close path | ||
| // doesn't strand a new dial past the barrier. | ||
| if _, err := client.StreamWorkspace(context.Background(), WorkspaceStreamCallbacks{}); err == nil { | ||
| t.Error("StreamWorkspace after Close should return error, got nil") | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
P1: Agent stream restart can clobber live connection pointer. Old read goroutine sets
agentStreamConn=nilafter new stream starts. Guard cleanup by connection identity.Prompt for AI agents