Skip to content

Commit 36b93df

Browse files
fix: add robust process exit detection for child processes
Port robust process exit detection from PR anomalyco#15757 to fix zombie/stuck child processes in containers where Bun fails to deliver exit events.

- Add a polling watchdog to the bash tool and Process.spawn that detects process exit via kill(pid, 0) when event-loop events are missed.
- Add a process registry (active map) with stale/reap exports so a server-level watchdog can detect and clean up stuck bash processes.
- Improve Shell.killTree with an alive() helper and proper SIGKILL escalation after the SIGTERM timeout.
- Add a session-level watchdog interval in the prompt loop to periodically reap stale bash processes.

Based on the work in anomalyco#15757.

Co-Authored-By: Nacho F. Lizaur <NachoFLizaur@users.noreply.github.com>
1 parent b395d19 commit 36b93df

File tree

4 files changed

+166
-19
lines changed

4 files changed

+166
-19
lines changed

packages/opencode/src/session/prompt.ts

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ import { LLM } from "./llm"
4545
import { iife } from "@/util/iife"
4646
import { Shell } from "@/shell/shell"
4747
import { Truncate } from "@/tool/truncation"
48+
import { stale, reap } from "@/tool/bash"
4849

4950
// @ts-ignore
5051
globalThis.AI_SDK_LOG_WARNINGS = false
@@ -289,6 +290,13 @@ export namespace SessionPrompt {
289290

290291
using _ = defer(() => cancel(sessionID))
291292

293+
const watchdog = setInterval(() => {
294+
for (const id of stale()) {
295+
reap(id)
296+
}
297+
}, 5000)
298+
using _watchdog = defer(() => clearInterval(watchdog))
299+
292300
// Structured output state
293301
// Note: On session resumption, state is reset but outputFormat is preserved
294302
// on the user message and will be retrieved from lastUser below

packages/opencode/src/shell/shell.ts

Lines changed: 25 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,15 @@ import { setTimeout as sleep } from "node:timers/promises"
99
const SIGKILL_TIMEOUT_MS = 200
1010

1111
export namespace Shell {
12+
/**
 * Reports whether a process with the given pid currently exists.
 *
 * Sends signal 0 via `process.kill`, which performs the existence and
 * permission checks without delivering a signal.
 *
 * @param pid - Process id to probe.
 * @returns `true` if the probe succeeds, or fails with `EPERM` (the process
 *   exists but this process may not signal it); `false` for any other
 *   failure, such as `ESRCH`.
 */
function alive(pid: number): boolean {
  try {
    process.kill(pid, 0)
    return true
  } catch (err: unknown) {
    // EPERM means the target exists but we lack permission to signal it,
    // so it must still be treated as alive.
    if (typeof err === "object" && err !== null && "code" in err) {
      return (err as { code?: unknown }).code === "EPERM"
    }
    return false
  }
}
20+
1221
export async function killTree(proc: ChildProcess, opts?: { exited?: () => boolean }): Promise<void> {
1322
const pid = proc.pid
1423
if (!pid || opts?.exited?.()) return
@@ -24,17 +33,24 @@ export namespace Shell {
2433

2534
try {
2635
process.kill(-pid, "SIGTERM")
27-
await sleep(SIGKILL_TIMEOUT_MS)
28-
if (!opts?.exited?.()) {
29-
process.kill(-pid, "SIGKILL")
30-
}
31-
} catch (_e) {
32-
proc.kill("SIGTERM")
33-
await sleep(SIGKILL_TIMEOUT_MS)
34-
if (!opts?.exited?.()) {
36+
} catch {
37+
try {
38+
proc.kill("SIGTERM")
39+
} catch {}
40+
}
41+
42+
await sleep(SIGKILL_TIMEOUT_MS)
43+
44+
if (opts?.exited?.() || !alive(pid)) return
45+
try {
46+
process.kill(-pid, "SIGKILL")
47+
} catch {
48+
try {
3549
proc.kill("SIGKILL")
36-
}
50+
} catch {}
3751
}
52+
53+
await sleep(SIGKILL_TIMEOUT_MS)
3854
}
3955
const BLACKLIST = new Set(["fish", "nu"])
4056

packages/opencode/src/tool/bash.ts

Lines changed: 93 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,40 @@ const DEFAULT_TIMEOUT = Flag.OPENCODE_EXPERIMENTAL_BASH_DEFAULT_TIMEOUT_MS || 2
2323

2424
export const log = Log.create({ service: "bash-tool" })
2525

26+
// Registry for active bash processes — enables server-level watchdog
27+
const active = new Map<
28+
string,
29+
{
30+
pid: number
31+
timeout: number
32+
started: number
33+
kill: () => void
34+
done: () => void
35+
}
36+
>()
37+
38+
export function stale() {
39+
const result: string[] = []
40+
const now = Date.now()
41+
for (const [id, entry] of active) {
42+
if (now - entry.started > entry.timeout + 5000) result.push(id)
43+
}
44+
return result
45+
}
46+
47+
/**
 * Forcibly cleans up a registered bash process: logs it, invokes its kill
 * callback, invokes its done callback, and removes it from the registry.
 * Does nothing when no entry is registered under `id`.
 *
 * @param id - Registry key of the process to reap (logged as `callID`).
 */
export function reap(id: string) {
  const entry = active.get(id)
  if (!entry) return
  // Unregister first so that a throwing callback cannot leave the entry in
  // the registry, where it would be reported stale and reaped again.
  active.delete(id)
  log.info("reaping stuck process", {
    callID: id,
    pid: entry.pid,
    age: Date.now() - entry.started,
  })
  try {
    entry.kill()
  } finally {
    // Always run the done callback, even if the kill attempt threw.
    entry.done()
  }
}
59+
2660
const resolveWasm = (asset: string) => {
2761
if (asset.startsWith("file://")) return fileURLToPath(asset)
2862
if (asset.startsWith("/") || /^[a-z]:/i.test(asset)) return asset
@@ -180,6 +214,14 @@ export const BashTool = Tool.define("bash", async () => {
180214
detached: process.platform !== "win32",
181215
})
182216

217+
if (!proc.pid) {
218+
if (proc.exitCode !== null) {
219+
log.info("process exited before pid could be read", { exitCode: proc.exitCode })
220+
} else {
221+
throw new Error(`Failed to spawn process: pid is undefined for command "${params.command}"`)
222+
}
223+
}
224+
183225
const MAX_OUTPUT_BYTES = 10 * 1024 * 1024 // 10 MB cap
184226
const outputChunks: Buffer[] = []
185227
let outputLen = 0
@@ -236,25 +278,72 @@ export const BashTool = Tool.define("bash", async () => {
236278
void kill()
237279
}, timeout + 100)
238280

281+
const callID = ctx.callID
282+
if (callID) {
283+
active.set(callID, {
284+
pid: proc.pid!,
285+
timeout,
286+
started: Date.now(),
287+
kill: () => Shell.killTree(proc, { exited: () => exited }),
288+
done: () => {},
289+
})
290+
}
291+
239292
await new Promise<void>((resolve, reject) => {
293+
let resolved = false
294+
240295
const cleanup = () => {
296+
if (resolved) return
297+
resolved = true
241298
clearTimeout(timeoutTimer)
299+
clearInterval(poll)
242300
ctx.abort.removeEventListener("abort", abortHandler)
243301
}
244302

245-
proc.once("exit", () => {
303+
const done = () => {
304+
if (resolved) return
246305
exited = true
247306
cleanup()
248307
resolve()
249-
})
308+
}
250309

251-
proc.once("error", (error) => {
310+
// Update the active entry with the real done callback
311+
if (callID) {
312+
const entry = active.get(callID)
313+
if (entry) entry.done = done
314+
}
315+
316+
const fail = (error: Error) => {
317+
if (resolved) return
252318
exited = true
253319
cleanup()
254320
reject(error)
255-
})
321+
}
322+
323+
proc.once("exit", done)
324+
proc.once("close", done)
325+
proc.once("error", fail)
326+
327+
// Polling watchdog: detect process exit when Bun's event loop
328+
// fails to deliver the "exit" event (confirmed Bun bug in containers)
329+
const poll = setInterval(() => {
330+
if (proc.exitCode !== null || proc.signalCode !== null) {
331+
done()
332+
return
333+
}
334+
if (proc.pid && process.platform !== "win32") {
335+
try {
336+
process.kill(proc.pid, 0)
337+
} catch {
338+
done()
339+
return
340+
}
341+
}
342+
}, 1000)
256343
})
257344

345+
if (callID) active.delete(callID)
346+
258347
let output = Buffer.concat(outputChunks).toString()
259348
// Free the chunks array
260349
outputChunks.length = 0

packages/opencode/src/util/process.ts

Lines changed: 40 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -74,20 +74,54 @@ export namespace Process {
7474
}
7575

7676
const exited = new Promise<number>((resolve, reject) => {
77-
const done = () => {
77+
let resolved = false
78+
79+
const cleanup = () => {
80+
if (resolved) return
81+
resolved = true
7882
opts.abort?.removeEventListener("abort", abort)
7983
if (timer) clearTimeout(timer)
84+
clearInterval(poll)
85+
}
86+
87+
const finish = (code: number) => {
88+
if (resolved) return
89+
cleanup()
90+
resolve(code)
91+
}
92+
93+
const fail = (error: Error) => {
94+
if (resolved) return
95+
cleanup()
96+
reject(error)
8097
}
8198

8299
proc.once("exit", (code, signal) => {
83-
done()
84-
resolve(code ?? (signal ? 1 : 0))
100+
finish(code ?? (signal ? 1 : 0))
85101
})
86102

87-
proc.once("error", (error) => {
88-
done()
89-
reject(error)
103+
proc.once("close", (code, signal) => {
104+
finish(code ?? (signal ? 1 : 0))
90105
})
106+
107+
proc.once("error", fail)
108+
109+
// Polling watchdog: detect process exit when Bun's event loop
110+
// fails to deliver the "exit" event (confirmed Bun bug in containers)
111+
const poll = setInterval(() => {
112+
if (proc.exitCode !== null || proc.signalCode !== null) {
113+
finish(proc.exitCode ?? (proc.signalCode ? 1 : 0))
114+
return
115+
}
116+
if (proc.pid && process.platform !== "win32") {
117+
try {
118+
process.kill(proc.pid, 0)
119+
} catch {
120+
finish(proc.exitCode ?? 1)
121+
return
122+
}
123+
}
124+
}, 1000)
91125
})
92126

93127
if (opts.abort) {

0 commit comments

Comments
 (0)