Skip to content

Commit 69dba08

Browse files
mansursclaude
andcommitted
feat(worker): add per-request worker_timeout (hard request timeout)
Add an experimental `worker_timeout` worker option: a hard per-request timeout for worker mode, the equivalent of PHP-FPM's request_terminate_timeout. When a worker request runs longer than the timeout it is aborted with a "Worker request timeout of N second(s) exceeded" fatal and the worker restarts cleanly for the next request. Unlike max_execution_time, this also covers time spent blocked in an external call. A signal/EINTR alone cannot abort such a call (PHP retries EINTR, and mysqlnd even drops its socket from EG(regular_list)), so on Linux the watchdog inspects what the thread is parked in via /proc/self/task/<tid>/syscall and shuts down the socket(s) involved: - read/recvfrom/recvmsg/connect: fd is the syscall's first argument; - poll/ppoll: the pollfd array is read from the process's own memory with process_vm_readv(2) (PHP's stream layer, and Redis/HTTP/DB clients on it, always poll before reading). Both syscalls are matched: glibc and musl implement poll() via the dedicated poll syscall on arches that have one (e.g. amd64) and via ppoll only elsewhere (e.g. arm64); - epoll_wait/epoll_pwait: watched fds are enumerated from /proc/self/fdinfo/<epfd> (covers curl_multi, gRPC). Every fd is confirmed to be a socket, and after recovering a pointer/table-derived fd the thread's syscall is re-read to confirm it is still parked there before shutdown, so a stale pointer or reused fd cannot close an unrelated descriptor. The watchdog body runs under the same mutex as its cancellation, so a watchdog racing request completion can never interrupt the wrong request. A long sleep() is woken by the realtime kill signal (Linux/FreeBSD). The fatal is raised at the next opcode via a custom zend_interrupt_function (guarded against double installation across embedded Init/Shutdown cycles). On macOS/Windows only the VM-interrupt flag is set (CPU-bound overruns are caught; a blocking syscall already in progress cannot be unblocked). 
Configurable per worker via the Caddyfile `worker_timeout` directive and the WithWorkerTimeout API; defaults to 0 (disabled). Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com> Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
1 parent edaffab commit 69dba08

26 files changed

Lines changed: 1243 additions & 3 deletions

caddy/app.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -164,6 +164,7 @@ func (f *FrankenPHPApp) Start() error {
164164
frankenphp.WithWorkerWatchMode(w.Watch),
165165
frankenphp.WithWorkerMaxFailures(w.MaxConsecutiveFailures),
166166
frankenphp.WithWorkerMaxThreads(w.MaxThreads),
167+
frankenphp.WithWorkerTimeout(w.WorkerTimeout),
167168
frankenphp.WithWorkerRequestOptions(w.requestOptions...),
168169
)
169170

caddy/config_test.go

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ package caddy
22

33
import (
44
"testing"
5+
"time"
56

67
"github.com/caddyserver/caddy/v2/caddyconfig/caddyfile"
78
"github.com/stretchr/testify/require"
@@ -35,6 +36,64 @@ func TestModuleWorkerDuplicateFilenamesFail(t *testing.T) {
3536
require.Contains(t, err.Error(), "must not have duplicate filenames", "Error message should mention duplicate filenames")
3637
}
3738

39+
func TestModuleWorkerTimeoutParses(t *testing.T) {
40+
config := `
41+
{
42+
php {
43+
worker {
44+
file ../testdata/worker-with-env.php
45+
num 1
46+
worker_timeout 30s
47+
}
48+
}
49+
}`
50+
51+
d := caddyfile.NewTestDispenser(config)
52+
module := &FrankenPHPModule{}
53+
54+
require.NoError(t, module.UnmarshalCaddyfile(d))
55+
require.Len(t, module.Workers, 1)
56+
require.Equal(t, 30*time.Second, module.Workers[0].WorkerTimeout)
57+
}
58+
59+
func TestModuleWorkerTimeoutDefaultsToZero(t *testing.T) {
60+
config := `
61+
{
62+
php {
63+
worker {
64+
file ../testdata/worker-with-env.php
65+
num 1
66+
}
67+
}
68+
}`
69+
70+
d := caddyfile.NewTestDispenser(config)
71+
module := &FrankenPHPModule{}
72+
73+
require.NoError(t, module.UnmarshalCaddyfile(d))
74+
require.Len(t, module.Workers, 1)
75+
require.Zero(t, module.Workers[0].WorkerTimeout)
76+
}
77+
78+
func TestModuleWorkerTimeoutInvalidDurationFails(t *testing.T) {
79+
config := `
80+
{
81+
php {
82+
worker {
83+
file ../testdata/worker-with-env.php
84+
worker_timeout not-a-duration
85+
}
86+
}
87+
}`
88+
89+
d := caddyfile.NewTestDispenser(config)
90+
module := &FrankenPHPModule{}
91+
92+
err := module.UnmarshalCaddyfile(d)
93+
require.Error(t, err)
94+
require.Contains(t, err.Error(), "worker_timeout must be a valid duration")
95+
}
96+
3897
func TestModuleWorkersWithDifferentFilenames(t *testing.T) {
3998
// Create a test configuration with different worker filenames
4099
configWithDifferentFilenames := `

caddy/workerconfig.go

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ import (
55
"path"
66
"path/filepath"
77
"strconv"
8+
"time"
89

910
"github.com/caddyserver/caddy/v2"
1011
"github.com/caddyserver/caddy/v2/caddyconfig/caddyfile"
@@ -41,6 +42,8 @@ type workerConfig struct {
4142
MatchPath []string `json:"match_path,omitempty"`
4243
// MaxConsecutiveFailures sets the maximum number of consecutive failures before panicking (defaults to 6, set to -1 to never panic)
4344
MaxConsecutiveFailures int `json:"max_consecutive_failures,omitempty"`
45+
// WorkerTimeout sets a hard per-request timeout (e.g. 30s). A worker request running longer is interrupted so the thread can be reclaimed. 0 (default) disables it.
46+
WorkerTimeout time.Duration `json:"worker_timeout,omitempty"`
4447

4548
options []frankenphp.WorkerOption
4649
requestOptions []frankenphp.RequestOption
@@ -145,8 +148,22 @@ func unmarshalWorker(d *caddyfile.Dispenser) (workerConfig, error) {
145148
}
146149

147150
wc.MaxConsecutiveFailures = v
151+
case "worker_timeout":
152+
if !d.NextArg() {
153+
return wc, d.ArgErr()
154+
}
155+
156+
v, err := time.ParseDuration(d.Val())
157+
if err != nil {
158+
return wc, d.Errf("worker_timeout must be a valid duration (example: 30s): %v", err)
159+
}
160+
if v < 0 {
161+
return wc, d.Err("worker_timeout must be >= 0")
162+
}
163+
164+
wc.WorkerTimeout = v
148165
default:
149-
return wc, wrongSubDirectiveError("worker", "name, file, num, env, watch, match, max_consecutive_failures, max_threads", v)
166+
return wc, wrongSubDirectiveError("worker", "name, file, num, env, watch, match, max_consecutive_failures, max_threads, worker_timeout", v)
150167
}
151168
}
152169

docs/config.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,7 @@ You can also explicitly configure FrankenPHP using the [global option](https://c
111111
watch <path> # Sets the path to watch for file changes. Can be specified more than once for multiple paths.
112112
name <name> # Sets the name of the worker, used in logs and metrics. Default: absolute path of worker file
113113
max_consecutive_failures <num> # Sets the maximum number of consecutive failures before the worker is considered unhealthy, -1 means the worker will always restart. Default: 6.
114+
worker_timeout <duration> # (experimental) Hard per-request timeout (e.g. 30s). A request running longer is interrupted so the worker thread can be reclaimed. Default: 0 (disabled).
114115
}
115116
}
116117
}

docs/worker.md

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -151,6 +151,74 @@ frankenphp {
151151
}
152152
```
153153

154+
### Request timeout (experimental)
155+
156+
By default a worker thread blocked on a slow external call (a hung MySQL query, a
157+
stuck HTTP client, a Redis call, a long `sleep()`) holds that thread until the call
158+
returns on its own. The `worker_timeout` option sets a hard per-request timeout —
159+
the worker-mode equivalent of PHP-FPM's `request_terminate_timeout` — after which
160+
FrankenPHP interrupts the PHP thread so the request bails out and the worker is
161+
reclaimed:
162+
163+
```caddyfile
164+
frankenphp {
165+
worker {
166+
# ...
167+
worker_timeout 30s
168+
}
169+
}
170+
```
171+
172+
When the timeout elapses, the request is aborted with a fatal error whose message
173+
is `Worker request timeout of N second(s) exceeded`. The worker script then
174+
restarts cleanly and serves the next request — no special userland code is
175+
required. Note that `max_execution_time` does **not** count time spent inside a
176+
blocking call such as a database query, which is exactly the case `worker_timeout`
177+
is designed to cover.
178+
179+
How it works (and its limits):
180+
181+
- A blocking syscall (a stuck database query, a hung Redis/Elasticsearch/HTTP
182+
read, a black-holed `connect()`) cannot be aborted by PHP's timeout flag
183+
alone, because PHP retries the interrupted read. On **Linux**, FrankenPHP
184+
inspects what the worker thread is blocked on and shuts down the socket(s)
185+
involved, so the read fails and the request unwinds. Only sockets are
186+
aborted this way (a read blocked on a file or pipe is not). It recognises:
187+
- `read`/`recvfrom`/`recvmsg` and a blocking `connect` — the descriptor is the
188+
syscall's first argument;
189+
- `poll`/`ppoll` — the descriptors are read out of the poll set (PHP's stream
190+
layer, and thus most Redis/HTTP/DB clients built on it, always poll before
191+
reading). This is what lets a stuck `SELECT SLEEP(30)` actually stop at the
192+
timeout instead of running to completion;
193+
- `epoll_wait`/`epoll_pwait` — the watched descriptors are enumerated from the
194+
epoll instance (covers clients running their own event loop, such as
195+
`curl_multi` and gRPC).
196+
197+
Every descriptor is confirmed to be a socket before it is shut down.
198+
- A long `sleep()`/`usleep()` (no socket) is interrupted by a realtime signal on
199+
**Linux and FreeBSD**.
200+
- On **macOS** and **Windows**, and for a tight CPU loop inside a C extension that
201+
swallows `EINTR`, only PHP's VM-interrupt flag is set: a CPU-bound overrun is
202+
still caught at the next opcode boundary, but a blocking syscall already in
203+
progress cannot be unblocked. A client blocked in a `select`-based loop (rare on
204+
Linux, where `poll` is preferred) is likewise not aborted.
205+
- The socket abort needs no extra privilege (all inspection is of the process
206+
itself), but it relies on `/proc` and — for poll-based waits, the common case —
207+
on [`process_vm_readv(2)`](https://man7.org/linux/man-pages/man2/process_vm_readv.2.html).
208+
Docker's default seccomp profile allows this syscall on kernels ≥ 4.8
209+
([moby#42083](https://github.com/moby/moby/pull/42083)); under an older or
210+
stricter policy (gVisor, custom profiles) the call fails closed: FrankenPHP
211+
logs a warning once and a request blocked in a poll-based socket read can then
212+
not be aborted (sleeps and CPU-bound overruns still are).
213+
- `worker_timeout` aborts the request hard, like `request_terminate_timeout`
214+
does in PHP-FPM. The database server rolls back an open transaction when its
215+
connection is shut down, and PHP's request shutdown still runs (sessions are
216+
released as usual). But application-level sequences are not rolled back: an
217+
e-mail already sent, a file already written or an external lock with a TTL
218+
stay as they are. Set the timeout comfortably above your slowest legitimate
219+
request.
220+
- `worker_timeout` defaults to `0` (disabled).
221+
154222
## Superglobals behavior
155223

156224
[PHP superglobals](https://www.php.net/manual/language.variables.superglobals.php) (`$_SERVER`, `$_ENV`, `$_GET`...)

frankenphp.c

Lines changed: 135 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -139,8 +139,35 @@ static void frankenphp_register_atfork(void) {
139139
/* APC routine that intentionally does nothing. It is the callback handed to
 * QueueUserAPC by frankenphp_wake_worker_thread; presumably the queued APC
 * itself (not its body) is what wakes an alertable wait - TODO confirm. */
static void CALLBACK frankenphp_noop_apc(ULONG_PTR param) { (void)param; }
140140
#endif
141141

142+
/* ===== Worker request timeout (per-request hard timeout) =====
143+
*
144+
* A blocking syscall (a stuck SELECT SLEEP(), a hung HTTP read, ...) cannot be
145+
* aborted by the VM-interrupt flag alone: PHP's network layer retries EINTR, so
146+
* the read just resumes, and a driver like mysqlnd even removes its socket from
147+
* EG(regular_list) so it can't be found by walking the resource list. To cut
148+
* such a request short the Go watchdog shuts down the fd the thread is blocked
149+
* on (found via /proc/<tid>/syscall); a kill signal's EINTR wakes sleep-style waits. Once
150+
* the thread runs PHP again, this custom zend_interrupt_function raises a clear
151+
* "Worker request timeout" fatal.
152+
*
153+
* Per-thread state is indexed by thread_index and allocated once max_threads is
154+
* known (frankenphp_init_worker_timeout). */
155+
/* Per-thread "timeout fired" flags, indexed by thread index. Set by
 * frankenphp_arm_worker_timeout; cleared by the interrupt hook and by
 * frankenphp_clear_worker_timeout. NULL until frankenphp_init_worker_timeout
 * runs and again after frankenphp_destroy_worker_timeout. */
static zend_atomic_bool *worker_timeout_pending = NULL;
156+
/* Per-thread timeout limit in seconds, recorded when the timeout is armed and
 * printed in the "Worker request timeout" fatal message. */
static double *worker_timeout_seconds = NULL;
157+
/* Thread count passed to frankenphp_init_worker_timeout; used to bounds-check
 * thread indexes before touching the two arrays above. */
static int worker_timeout_max_threads = 0;
158+
/* Saved to chain PHP's own interrupt handler (fibers, pcntl, ...). */
159+
static void (*frankenphp_original_interrupt)(zend_execute_data *) = NULL;
160+
161+
static bool frankenphp_worker_timeout_is_pending(uintptr_t idx) {
162+
return worker_timeout_pending != NULL &&
163+
idx < (uintptr_t)worker_timeout_max_threads &&
164+
zend_atomic_bool_load(&worker_timeout_pending[idx]);
165+
}
166+
142167
#ifdef FRANKENPHP_HAS_KILL_SIGNAL
143-
/* No-op: delivery itself is what unblocks the syscall via EINTR. */
168+
/* Signal handler with an empty body: the delivery of the signal is what makes
 * an EINTR-abortable wait return. Blocking reads that get retried are not
 * handled here; the Go side aborts those by shutting down the blocked fd. */
static void frankenphp_kill_signal_handler(int sig) {
  (void)sig; /* unused */
}
145172

146173
static pthread_once_t kill_signal_handler_installed = PTHREAD_ONCE_INIT;
@@ -222,6 +249,104 @@ void frankenphp_release_thread_for_kill(force_kill_slot slot) {
222249
#endif
223250
}
224251

252+
/* zend_interrupt_function hook: when a worker timeout is pending for this
253+
* thread, raise a fatal that unwinds the request with a clear message. Any
254+
* exception left over from the aborted I/O (e.g. a mysqli connection error
255+
* caused by the socket shutdown) is dropped so our message is what surfaces.
256+
* E_ERROR triggers a bailout, so the original handler is not chained in that
257+
* case; otherwise we chain it. */
258+
static void frankenphp_timeout_interrupt(zend_execute_data *execute_data) {
259+
if (is_worker_thread && frankenphp_worker_timeout_is_pending(thread_index)) {
260+
zend_atomic_bool_store(&worker_timeout_pending[thread_index], false);
261+
if (EG(exception)) {
262+
zend_clear_exception();
263+
}
264+
zend_error_noreturn(E_ERROR, "Worker request timeout of %g second(s) exceeded",
265+
worker_timeout_seconds[thread_index]);
266+
}
267+
268+
if (frankenphp_original_interrupt != NULL) {
269+
frankenphp_original_interrupt(execute_data);
270+
}
271+
}
272+
273+
/* Installed on the main thread after SAPI startup. php_main can run more than
274+
* once per process (Init/Shutdown cycles when embedding, and in the test
275+
* suite) and zend_interrupt_function survives a SAPI shutdown, so guard
276+
* against saving ourselves as the "original" handler - the chain call would
277+
* recurse forever the first time the hook fired without a pending timeout. */
278+
static void frankenphp_install_timeout_interrupt(void) {
279+
if (zend_interrupt_function == frankenphp_timeout_interrupt) {
280+
return;
281+
}
282+
frankenphp_original_interrupt = zend_interrupt_function;
283+
zend_interrupt_function = frankenphp_timeout_interrupt;
284+
}
285+
286+
/* Allocate per-thread timeout state once max_threads is known. Called from Go
287+
* alongside frankenphp_init_thread_metrics. */
288+
void frankenphp_init_worker_timeout(int max_threads) {
289+
worker_timeout_max_threads = max_threads;
290+
worker_timeout_pending = calloc(max_threads, sizeof(zend_atomic_bool));
291+
worker_timeout_seconds = calloc(max_threads, sizeof(double));
292+
}
293+
294+
void frankenphp_destroy_worker_timeout(void) {
295+
free(worker_timeout_pending);
296+
worker_timeout_pending = NULL;
297+
free(worker_timeout_seconds);
298+
worker_timeout_seconds = NULL;
299+
worker_timeout_max_threads = 0;
300+
}
301+
302+
/* Arm the timeout for a thread that has overrun its worker_timeout: record the
303+
* limit (for the message) and set the per-thread flag + VM interrupt so the
304+
* interrupt hook fires the moment the thread next runs PHP. No wakeup yet - the
305+
* caller first shuts down the blocked fd (so the message isn't pre-empted by the
306+
* driver's own connection error), then calls frankenphp_wake_worker_thread. */
307+
void frankenphp_arm_worker_timeout(uintptr_t thread_index_arg, force_kill_slot slot,
308+
double timeout_seconds) {
309+
if (slot.vm_interrupt == NULL ||
310+
thread_index_arg >= (uintptr_t)worker_timeout_max_threads ||
311+
worker_timeout_pending == NULL || worker_timeout_seconds == NULL) {
312+
return;
313+
}
314+
315+
worker_timeout_seconds[thread_index_arg] = timeout_seconds;
316+
zend_atomic_bool_store(&worker_timeout_pending[thread_index_arg], true);
317+
zend_atomic_bool_store(slot.vm_interrupt, true);
318+
}
319+
320+
/* Wake a thread parked in an EINTR-abortable wait (sleep, usleep) so it returns
321+
* and reaches the VM interrupt. Socket reads are handled by the fd shutdown done
322+
* before this call; this is the fallback for waits that have no fd. Safe on a
323+
* thread that has already gone away (zeroed slot). */
324+
void frankenphp_wake_worker_thread(force_kill_slot slot) {
325+
if (slot.vm_interrupt == NULL) {
326+
return;
327+
}
328+
#ifdef FRANKENPHP_HAS_KILL_SIGNAL
329+
if (zend_atomic_bool_load(&kill_signal_handler_active)) {
330+
pthread_kill(slot.tid, FRANKENPHP_KILL_SIGNAL);
331+
}
332+
#elif defined(PHP_WIN32)
333+
if (slot.thread_handle != NULL) {
334+
CancelSynchronousIo(slot.thread_handle);
335+
QueueUserAPC((PAPCFUNC)frankenphp_noop_apc, slot.thread_handle, 0);
336+
}
337+
#endif
338+
}
339+
340+
/* Clear a (possibly stale) pending flag at the start of a worker request so a
341+
* watchdog that raced request completion cannot abort the next request. */
342+
void frankenphp_clear_worker_timeout(uintptr_t thread_index_arg) {
343+
if (worker_timeout_pending == NULL ||
344+
thread_index_arg >= (uintptr_t)worker_timeout_max_threads) {
345+
return;
346+
}
347+
zend_atomic_bool_store(&worker_timeout_pending[thread_index_arg], false);
348+
}
349+
225350
void frankenphp_update_local_thread_context(bool is_worker) {
226351
is_worker_thread = is_worker;
227352

@@ -1269,6 +1394,12 @@ static void *php_thread(void *arg) {
12691394
* grace period can wake it from a busy PHP loop or blocking syscall. */
12701395
frankenphp_register_thread_for_kill(thread_index);
12711396

1397+
#ifdef __linux__
1398+
/* Publish the kernel thread id so the worker-timeout watchdog can locate the
1399+
* fd this thread blocks on (via /proc/<tid>/syscall) and shut it down. */
1400+
go_frankenphp_store_thread_tid(thread_index, (int)gettid());
1401+
#endif
1402+
12721403
bool thread_is_healthy = true;
12731404
bool has_attempted_shutdown = false;
12741405

@@ -1467,6 +1598,9 @@ static void *php_main(void *arg) {
14671598

14681599
frankenphp_sapi_module.startup(&frankenphp_sapi_module);
14691600

1601+
/* Hook the VM interrupt so worker_timeout can raise its own fatal. */
1602+
frankenphp_install_timeout_interrupt();
1603+
14701604
/* check if a default filter is set in php.ini and only filter if
14711605
* it is, this is deprecated and will be removed in PHP 9 */
14721606
char *default_filter;

frankenphp.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,8 @@ import (
3737
"unsafe"
3838
// debug on Linux
3939
//_ "github.com/ianlancetaylor/cgosymbolizer"
40+
41+
"github.com/dunglas/frankenphp/internal/blockio"
4042
)
4143

4244
type contextKeyStruct struct{}
@@ -269,6 +271,10 @@ func Init(options ...Option) error {
269271
opt.logger = nil
270272
}
271273

274+
// Let the worker-timeout watchdog report (once) when the platform denies
275+
// the syscalls it needs to abort a blocked socket read.
276+
blockio.SetLogger(globalLogger)
277+
272278
globalMu.Unlock()
273279

274280
if opt.metrics != nil {

0 commit comments

Comments
 (0)